icuSources/test/intltest/rbbitst.cpp

   1 /********************************************************************
   2  * COPYRIGHT:
   3  * Copyright (c) 1999-2008, International Business Machines Corporation and
   4  * others. All Rights Reserved.
   5  ********************************************************************/
   6 /************************************************************************
   7 *   Date        Name        Description
   8 *   12/15/99    Madhu        Creation.
   9 *   01/12/2000  Madhu        Updated for changed API and added new tests
  10 ************************************************************************/
  11
  12 #include "unicode/utypes.h"
  13
  14 #if !UCONFIG_NO_BREAK_ITERATION
  15
  16 #include "unicode/utypes.h"
  17 #include "unicode/brkiter.h"
  18 #include "unicode/rbbi.h"
  19 #include "unicode/uchar.h"
  20 #include "unicode/utf16.h"
  21 #include "unicode/ucnv.h"
  22 #include "unicode/schriter.h"
  23 #include "unicode/uniset.h"
  24 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
  25 #include "unicode/ustring.h"
  26 #include "unicode/utext.h"
  27 #include "intltest.h"
  28 #include "rbbitst.h"
  29 #include <string.h>
  30 #include "uvector.h"
  31 #include "uvectr32.h"
  32 #include "triedict.h"
  33 #include <string.h>
  34 #include <stdio.h>
  35 #include <stdlib.h>
  36
  37 #define TEST_ASSERT(x) {if (!(x)) { \
  38     errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
  39
  40 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
  41     errln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
  42
  43
  44 //---------------------------------------------
  45 // runIndexedTest
  46 //---------------------------------------------
  47
  48 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
  49 {
  50     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
  51
  52     switch (index) {
  53         case 0: name = "TestBug4153072";
  54             if(exec) TestBug4153072();                         break;
  55         case 1: name = "TestJapaneseLineBreak";
  56             if(exec) TestJapaneseLineBreak();                  break;
  57         case 2: name = "TestStatusReturn";
  58             if(exec) TestStatusReturn();                       break;
  59         case 3: name = "TestUnicodeFiles";
  60             if(exec) TestUnicodeFiles();                       break;
  61         case 4: name = "TestEmptyString";
  62             if(exec) TestEmptyString();                        break;
  63
  64         case 5: name = "TestGetAvailableLocales";
  65             if(exec) TestGetAvailableLocales();                break;
  66
  67         case 6: name = "TestGetDisplayName";
  68             if(exec) TestGetDisplayName();                     break;
  69
  70         case 7: name = "TestEndBehaviour";
  71             if(exec) TestEndBehaviour();                       break;
  72         case 8: name = "TestMixedThaiLineBreak";
  73              if(exec) TestMixedThaiLineBreak();                break;
  74         case 9: name = "TestThaiLineBreak";
  75              if(exec) TestThaiLineBreak();                     break;
  76         case 10: name = "TestMaiyamok";
  77              if(exec) TestMaiyamok();                          break;
  78         case 11: name = "TestWordBreaks";
  79              if(exec) TestWordBreaks();                        break;
  80         case 12: name = "TestWordBoundary";
  81              if(exec) TestWordBoundary();                      break;
  82         case 13: name = "TestLineBreaks";
  83              if(exec) TestLineBreaks();                        break;
  84         case 14: name = "TestSentBreaks";
  85              if(exec) TestSentBreaks();                        break;
  86         case 15: name = "TestExtended";
  87              if(exec) TestExtended();                          break;
  88         case 16: name = "TestMonkey";
  89              if(exec) {
  90  #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  91                TestMonkey(params);
  92  #else
  93                logln("skipping TestMonkey (UCONFIG_NO_REGULAR_EXPRESSIONS)");
  94  #endif
  95              }
  96                                                                break;
  97         case 17: name = "TestBug3818";
  98             if(exec) TestBug3818();                            break;
  99         case 18: name = "TestJapaneseWordBreak";
 100             if(exec) TestJapaneseWordBreak();                  break;
 101         case 19: name = "TestDebug";
 102             if(exec) TestDebug();                              break;
 103         case 20: name = "TestTrieDict";
 104             if(exec) TestTrieDict();                           break;
 105         case 21: name = "TestBug5775";
 106             if (exec) TestBug5775();                        break;
 107         case 22: name = "TestThaiBreaks";
 108             if (exec) TestThaiBreaks();                        break;
 109
 110         default: name = ""; break; //needed to end loop
 111     }
 112 }
 113
 114
 115 //---------------------------------------------------------------------------
 116 //
 117 //   class BITestData   Holds a set of Break iterator test data and results
 118 //                      Includes
 119 //                         - the string data to be broken
 120 //                         - a vector of the expected break positions.
 121 //                         - a vector of source line numbers for the data,
 122 //                               (to help see where errors occured.)
 123 //                         - The expected break tag values.
 124 //                         - Vectors of actual break positions and tag values.
 125 //                         - Functions for comparing actual with expected and
 126 //                            reporting errors.
 127 //
 128 //----------------------------------------------------------------------------
 129 class BITestData {
 130 public:
 131     UnicodeString    fDataToBreak;
 132     UVector          fExpectedBreakPositions;
 133     UVector          fExpectedTags;
 134     UVector          fLineNum;
 135     UVector          fActualBreakPositions;   // Test Results.
 136     UVector          fActualTags;
 137
 138     BITestData(UErrorCode &status);
 139     void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
 140     void             checkResults(const char *heading, RBBITest *test);
 141     void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
 142     void             clearResults();
 143 };
 144
 145 //
 146 // Constructor.
 147 //
 148 BITestData::BITestData(UErrorCode &status)
 149 : fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
 150   fActualTags(status)
 151 {
 152 }
 153
 154 //
 155 // addDataChunk.   Add a section (non-breaking) piece if data to the test data.
 156 //                 The macro form collects the line number, which is helpful
 157 //                 when tracking down failures.
 158 //
 159 //                 A null data item is inserted at the start of each test's data
 160 //                  to put the starting zero into the data list.  The position saved for
 161 //                  each non-null item is its ending position.
 162 //
 163 #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
 164 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
 165     if (U_FAILURE(status)) {return;}
 166     if (data != NULL) {
 167         fDataToBreak.append(CharsToUnicodeString(data));
 168     }
 169     fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
 170     fExpectedTags.addElement(tag, status);
 171     fLineNum.addElement(lineNum, status);
 172 }
 173
 174
 175 //
 176 //  checkResults.   Compare the actual and expected break positions, report any differences.
 177 //
 178 void BITestData::checkResults(const char *heading, RBBITest *test) {
 179     int32_t   expectedIndex = 0;
 180     int32_t   actualIndex = 0;
 181
 182     for (;;) {
 183         // If we've run through both the expected and actual results vectors, we're done.
 184         //   break out of the loop.
 185         if (expectedIndex >= fExpectedBreakPositions.size() &&
 186             actualIndex   >= fActualBreakPositions.size()) {
 187             break;
 188         }
 189
 190
 191         if (expectedIndex >= fExpectedBreakPositions.size()) {
 192             err(heading, test, expectedIndex-1, actualIndex);
 193             actualIndex++;
 194             continue;
 195         }
 196
 197         if (actualIndex >= fActualBreakPositions.size()) {
 198             err(heading, test, expectedIndex, actualIndex-1);
 199             expectedIndex++;
 200             continue;
 201         }
 202
 203         if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
 204             err(heading, test, expectedIndex, actualIndex);
 205             // Try to resync the positions of the indices, to avoid a rash of spurious erros.
 206             if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
 207                 actualIndex++;
 208             } else {
 209                 expectedIndex++;
 210             }
 211             continue;
 212         }
 213
 214         if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
 215             test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
 216                 heading, fLineNum.elementAt(expectedIndex),
 217                 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
 218         }
 219
 220         actualIndex++;
 221         expectedIndex++;
 222     }
 223 }
 224
 225 //
 226 //  err   -  An error was found.  Report it, along with information about where the
 227 //                                incorrectly broken test data appeared in the source file.
 228 //
 229 void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
 230 {
 231     int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
 232     int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
 233     int32_t   o        = 0;
 234     int32_t   line     = fLineNum.elementAti(expectedIdx);
 235     if (expectedIdx > 0) {
 236         // The line numbers are off by one because a premature break occurs somewhere
 237         //    within the previous item, rather than at the start of the current (expected) item.
 238         //    We want to report the offset of the unexpected break from the start of
 239         //      this previous item.
 240         o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
 241     }
 242     if (actual < expected) {
 243         test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
 244     } else {
 245         test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
 246     }
 247 }
 248
 249
 250 void BITestData::clearResults() {
 251     fActualBreakPositions.removeAllElements();
 252     fActualTags.removeAllElements();
 253 }
 254
 255
 256 //-----------------------------------------------------------------------------------
 257 //
 258 //    Cannned Test Characters
 259 //
 260 //-----------------------------------------------------------------------------------
 261
 262 static const UChar cannedTestArray[] = {
 263     0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,
 264     0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,
 265     0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,
 266     0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,
 267     0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,
 268     0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,
 269     0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,
 270     0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000
 271 };
 272
 273 static UnicodeString* cannedTestChars = 0;
 274
 275 #define  halfNA     "\\u0928\\u094d\\u200d"
 276 #define  halfSA     "\\u0938\\u094d\\u200d"
 277 #define  halfCHA    "\\u091a\\u094d\\u200d"
 278 #define  halfKA     "\\u0915\\u094d\\u200d"
 279 #define  deadTA     "\\u0924\\u094d"
 280
 281 //--------------------------------------------------------------------------------------
 282 //
 283 //    RBBITest    constructor and destructor
 284 //
 285 //--------------------------------------------------------------------------------------
 286
 287 RBBITest::RBBITest() {
 288     UnicodeString temp(cannedTestArray);
 289     cannedTestChars = new UnicodeString();
 290     *cannedTestChars += (UChar)0x0000;
 291     *cannedTestChars += temp;
 292 }
 293
 294
 295 RBBITest::~RBBITest() {
 296     delete cannedTestChars;
 297 }
 298
 299
 300 static const int T_NUMBER = 100;
 301 static const int T_LETTER = 200;
 302 static const int T_H_OR_K = 300;
 303 static const int T_IDEO   = 400;
 304
 305
 306
 307
 308
 309
 310 //--------------------------------------------------------------------
 311 //Testing the BreakIterator for devanagari script
 312 //--------------------------------------------------------------------
 313
 314 #define deadRA   "\\u0930\\u094d"         /*deadform RA = devanagari RA + virama*/
 315 #define deadPHA  "\\u092b\\u094d"         /*deadform PHA = devanagari PHA + virama*/
 316 #define deadTTHA "\\u0920\\u094d"
 317 #define deadPA   "\\u092a\\u094d"
 318 #define deadSA   "\\u0938\\u094d"
 319 #define visarga  "\\u0903"                /*devanagari visarga looks like a english colon*/
 320
 321
 322
 323
 324
 325
 326 //-----------------------------------------------------------------------------------
 327 //
 328 //   Test for status {tag} return value from break rules.
 329 //        TODO:  a more thorough test.
 330 //
 331 //-----------------------------------------------------------------------------------
 332 void RBBITest::TestStatusReturn() {
 333      UnicodeString rulesString1("$Letters = [:L:];\n"
 334                                   "$Numbers = [:N:];\n"
 335                                   "$Letters+{1};\n"
 336                                   "$Numbers+{2};\n"
 337                                   "Help\\ {4}/me\\!;\n"
 338                                   "[^$Letters $Numbers];\n"
 339                                   "!.*;\n", -1, US_INV);
 340      UnicodeString testString1  = "abc123..abc Help me Help me!";
 341                                 // 01234567890123456789012345678
 342      int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
 343      int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
 344
 345      UErrorCode status=U_ZERO_ERROR;
 346      UParseError    parseError;
 347
 348      RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
 349      if(U_FAILURE(status)) {
 350          errln("FAIL : in construction");
 351      } else {
 352          int32_t  pos;
 353          int32_t  i = 0;
 354          bi->setText(testString1);
 355          for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
 356              if (pos != bounds1[i]) {
 357                  errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
 358                  break;
 359              }
 360
 361              int tag = bi->getRuleStatus();
 362              if (tag != brkStatus[i]) {
 363                  errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
 364                  break;
 365              }
 366              i++;
 367          }
 368      }
 369      delete bi;
 370 }
 371
 372
 373 static void printStringBreaks(UnicodeString ustr, int expected[],
 374                               int expectedcount)
 375 {
 376     UErrorCode status = U_ZERO_ERROR;
 377     char name[100];
 378     printf("code    alpha extend alphanum type word sent line name\n");
 379     int j;
 380     for (j = 0; j < ustr.length(); j ++) {
 381         if (expectedcount > 0) {
 382             int k;
 383             for (k = 0; k < expectedcount; k ++) {
 384                 if (j == expected[k]) {
 385                     printf("------------------------------------------------ %d\n",
 386                            j);
 387                 }
 388             }
 389         }
 390         UChar32 c = ustr.char32At(j);
 391         if (c > 0xffff) {
 392             j ++;
 393         }
 394         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
 395         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
 396                            u_isUAlphabetic(c),
 397                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
 398                            u_isalnum(c),
 399                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
 400                                                   u_charType(c),
 401                                                   U_SHORT_PROPERTY_NAME),
 402                            u_getPropertyValueName(UCHAR_WORD_BREAK,
 403                                                   u_getIntPropertyValue(c,
 404                                                           UCHAR_WORD_BREAK),
 405                                                   U_SHORT_PROPERTY_NAME),
 406                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
 407                                    u_getIntPropertyValue(c,
 408                                            UCHAR_SENTENCE_BREAK),
 409                                    U_SHORT_PROPERTY_NAME),
 410                            u_getPropertyValueName(UCHAR_LINE_BREAK,
 411                                    u_getIntPropertyValue(c,
 412                                            UCHAR_LINE_BREAK),
 413                                    U_SHORT_PROPERTY_NAME),
 414                            name);
 415     }
 416 }
 417
 418 void RBBITest::TestThaiLineBreak() {
 419     UErrorCode status = U_ZERO_ERROR;
 420     BITestData thaiLineSelection(status);
 421
 422     // \u0e2f-- the Thai paiyannoi character-- isn't a letter.  It's a symbol that
 423     // represents elided letters at the end of a long word.  It should be bound to
 424     // the end of the word and not treated as an independent punctuation mark.
 425
 426
 427     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
 428     ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status);
 429     ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status);
 430     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status);
 431     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status);
 432 //        ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status);
 433 //        ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
 434     ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status);
 435     // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us
 436     ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status);
 437     ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status);
 438     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status);
 439     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status);
 440     ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status);
 441     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status);
 442
 443     // the one time where the paiyannoi occurs somewhere other than at the end
 444     // of a word is in the Thai abbrevation for "etc.", which both begins and
 445     // ends with a paiyannoi
 446     ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status);
 447     ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
 448     ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status);
 449
 450     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
 451         Locale("th"), status);
 452     if (U_FAILURE(status))
 453     {
 454         errln("Failed to create the BreakIterator for Thai locale in TestThaiLineBreak.\n");
 455         return;
 456     }
 457
 458     generalIteratorTest(*e, thaiLineSelection);
 459     delete e;
 460 }
 461
 462
 463
 464 void RBBITest::TestMixedThaiLineBreak()
 465 {
 466     UErrorCode   status = U_ZERO_ERROR;
 467     BITestData   thaiLineSelection(status);
 468
 469     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
 470
 471
 472     // @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters
 473     // start
 474
 475     ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
 476     ADD_DATACHUNK(thaiLineSelection, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status);
 477     ADD_DATACHUNK(thaiLineSelection, "2545 ", 0, status);
 478     ADD_DATACHUNK(thaiLineSelection, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status);
 479     ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
 480     ADD_DATACHUNK(thaiLineSelection, "\\u0E09\\u0E25\\u0E2D\\u0E07", 0, status);
 481     ADD_DATACHUNK(thaiLineSelection, "\\u0E04\\u0E23\\u0E1A", 0, status);
 482     ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E2D\\u0E1A ", 0, status);
 483     ADD_DATACHUNK(thaiLineSelection, "\"\\u0E52\\u0E52\\u0E50 ", 0, status);
 484     ADD_DATACHUNK(thaiLineSelection, "\\u0E1b\\u0E35\" ", 0, status);
 485     ADD_DATACHUNK(thaiLineSelection, "\\u0E02\\u0E2d\\u0E07", 0, status);
 486     ADD_DATACHUNK(thaiLineSelection, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status);
 487     ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status);
 488     ADD_DATACHUNK(thaiLineSelection, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status);
 489     ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status);
 490     ADD_DATACHUNK(thaiLineSelection, "Bangkok)", 0, status);
 491
 492     // @suwit - end of changes
 493
 494
 495     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status);
 496     if (U_FAILURE(status))
 497     {
 498         errln("Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak.\n");
 499         return;
 500     }
 501
 502
 503     generalIteratorTest(*e, thaiLineSelection);
 504     delete e;
 505 }
 506
 507
 508 void RBBITest::TestMaiyamok()
 509 {
 510     UErrorCode status = U_ZERO_ERROR;
 511     BITestData   thaiLineSelection(status);
 512     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
 513     // the Thai maiyamok character is a shorthand symbol that means "repeat the previous
 514     // word".  Instead of appearing as a word unto itself, however, it's kept together
 515     // with the word before it
 516     ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status);
 517     ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status);
 518     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status);
 519     ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status);
 520     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e1e", 0, status);
 521     ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status);
 522     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35", 0, status);
 523     ADD_DATACHUNK(thaiLineSelection, "\\u0e22\\u0e07", 0, status);
 524     ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
 525
 526     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
 527         Locale("th"), status);
 528
 529     if (U_FAILURE(status))
 530     {
 531         errln("Failed to create the BreakIterator for Thai locale in TestMaiyamok.\n");
 532         return;
 533     }
 534     generalIteratorTest(*e, thaiLineSelection);
 535     delete e;
 536 }
 537
 538
 539
 540 void RBBITest::TestBug3818() {
 541     UErrorCode  status = U_ZERO_ERROR;
 542
 543     // Four Thai words...
 544     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
 545                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
 546     UnicodeString  thaiStr(thaiWordData);
 547
 548     RuleBasedBreakIterator* bi =
 549         (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
 550     if (U_FAILURE(status) || bi == NULL) {
 551         errln("Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
 552         return;
 553     }
 554     bi->setText(thaiStr);
 555
 556     int32_t  startOfSecondWord = bi->following(1);
 557     if (startOfSecondWord != 4) {
 558         errln("Fail at file %s, line %d expected start of word at 4, got %d",
 559             __FILE__, __LINE__, startOfSecondWord);
 560     }
 561     startOfSecondWord = bi->following(0);
 562     if (startOfSecondWord != 4) {
 563         errln("Fail at file %s, line %d expected start of word at 4, got %d",
 564             __FILE__, __LINE__, startOfSecondWord);
 565     }
 566     delete bi;
 567 }
 568
 569
 570 void RBBITest::TestJapaneseWordBreak() {
 571     UErrorCode status = U_ZERO_ERROR;
 572     BITestData   japaneseWordSelection(status);
 573
 574     ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status);           // Break at start of data
 575     ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2
 576     ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5
 577     ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7
 578     ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10
 579     ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11
 580     ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12
 581
 582     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(
 583         Locale("ja"), status);
 584     if (U_FAILURE(status))
 585     {
 586         errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");
 587         return;
 588     }
 589
 590     generalIteratorTest(*e, japaneseWordSelection);
 591     delete e;
 592 }
 593
 594 void RBBITest::TestTrieDict() {
 595     UErrorCode      status  = U_ZERO_ERROR;
 596
 597     //
 598     //  Open and read the test data file.
 599     //
 600     const char *testDataDirectory = IntlTest::getSourceTestData(status);
 601     char testFileName[1000];
 602     if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen("riwords.txt") + 10 >= sizeof(testFileName)) {
 603         errln("Can't open test data.  Path too long.");
 604         return;
 605     }
 606     strcpy(testFileName, testDataDirectory);
 607     strcat(testFileName, "riwords.txt");
 608
 609     // Items needing deleting at the end
 610     MutableTrieDictionary *mutableDict = NULL;
 611     CompactTrieDictionary *compactDict = NULL;
 612     UnicodeSet            *breaks      = NULL;
 613     UChar                 *testFile    = NULL;
 614     StringEnumeration     *enumer1     = NULL;
 615     StringEnumeration     *enumer2     = NULL;
 616     MutableTrieDictionary *mutable2    = NULL;
 617     StringEnumeration     *cloneEnum   = NULL;
 618     CompactTrieDictionary *compact2    = NULL;
 619
 620
 621     const UnicodeString *originalWord = NULL;
 622     const UnicodeString *cloneWord    = NULL;
 623     UChar *current;
 624     UChar *word;
 625     UChar uc;
 626     int32_t wordLen;
 627     int32_t wordCount;
 628     int32_t testCount;
 629
 630     int    len;
 631     testFile = ReadAndConvertFile(testFileName, len, NULL, status);
 632     if (U_FAILURE(status)) {
 633         goto cleanup; /* something went wrong, error already output */
 634     }
 635
 636     mutableDict = new MutableTrieDictionary(0x0E1C, status);
 637     if (U_FAILURE(status)) {
 638         errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
 639         goto cleanup;
 640     }
 641
 642     breaks = new UnicodeSet;
 643     breaks->add(0x000A);     // Line Feed
 644     breaks->add(0x000D);     // Carriage Return
 645     breaks->add(0x2028);     // Line Separator
 646     breaks->add(0x2029);     // Paragraph Separator
 647
 648     // Now add each non-comment line of the file as a word.
 649     current = testFile;
 650     word = current;
 651     uc = *current++;
 652     wordLen = 0;
 653     wordCount = 0;
 654
 655     while (uc) {
 656         if (uc == 0x0023) {     // #comment line, skip
 657             while (uc && !breaks->contains(uc)) {
 658                 uc = *current++;
 659             }
 660         }
 661         else while (uc && !breaks->contains(uc)) {
 662             ++wordLen;
 663             uc = *current++;
 664         }
 665         if (wordLen > 0) {
 666             mutableDict->addWord(word, wordLen, status);
 667             if (U_FAILURE(status)) {
 668                 errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
 669                 goto cleanup;
 670             }
 671             wordCount += 1;
 672         }
 673
 674         // Find beginning of next line
 675         while (uc && breaks->contains(uc)) {
 676             uc = *current++;
 677         }
 678         word = current-1;
 679         wordLen = 0;
 680     }
 681
 682     if (wordCount < 50) {
 683         errln("Word count (%d) unreasonably small\n", wordCount);
 684         goto cleanup;
 685     }
 686
 687     enumer1 = mutableDict->openWords(status);
 688     if (U_FAILURE(status)) {
 689         errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
 690         goto cleanup;
 691     }
 692
 693     testCount = 0;
 694     if (wordCount != (testCount = enumer1->count(status))) {
 695         errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
 696             testCount, wordCount, u_errorName(status));
 697         goto cleanup;
 698     }
 699
 700     // Now compact it
 701     compactDict = new CompactTrieDictionary(*mutableDict, status);
 702     if (U_FAILURE(status)) {
 703         errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
 704         goto cleanup;
 705     }
 706
 707     enumer2 = compactDict->openWords(status);
 708     if (U_FAILURE(status)) {
 709         errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
 710         goto cleanup;
 711     }
 712
 713     if (wordCount != (testCount = enumer2->count(status))) {
 714         errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
 715             testCount, wordCount, u_errorName(status));
 716         goto cleanup;
 717     }
 718
 719     if (enumer1->getDynamicClassID() == enumer2->getDynamicClassID()) {
 720         errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are the same");
 721     }
 722     delete enumer1;
 723     enumer1 = NULL;
 724     delete enumer2;
 725     enumer2 = NULL;
 726
 727     // Now un-compact it
 728     mutable2 = compactDict->cloneMutable(status);
 729     if (U_FAILURE(status)) {
 730         errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
 731         goto cleanup;
 732     }
 733
 734     cloneEnum = mutable2->openWords(status);
 735     if (U_FAILURE(status)) {
 736         errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
 737         goto cleanup;
 738     }
 739
 740     if (wordCount != (testCount = cloneEnum->count(status))) {
 741         errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
 742             testCount, wordCount, u_errorName(status));
 743         goto cleanup;
 744     }
 745
 746     // Compact original dictionary to clone. Note that we can only compare the same kind of
 747     // dictionary as the order of the enumerators is not guaranteed to be the same between
 748     // different kinds
 749     enumer1 = mutableDict->openWords(status);
 750     if (U_FAILURE(status)) {
 751         errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
 752         goto cleanup;
 753      }
 754
 755     originalWord = enumer1->snext(status);
 756     cloneWord = cloneEnum->snext(status);
 757     while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
 758         if (*originalWord != *cloneWord) {
 759             errln("Original and cloned MutableTrieDictionary word mismatch\n");
 760             goto cleanup;
 761         }
 762         originalWord = enumer1->snext(status);
 763         cloneWord = cloneEnum->snext(status);
 764     }
 765
 766     if (U_FAILURE(status)) {
 767         errln("Enumeration failed: %s\n", u_errorName(status));
 768         goto cleanup;
 769     }
 770
 771     if (originalWord != cloneWord) {
 772         errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
 773         goto cleanup;
 774     }
 775
 776     // Test the data copying constructor for CompactTrieDict, and the data access APIs.
 777     compact2 = new CompactTrieDictionary(compactDict->data(), status);
 778     if (U_FAILURE(status)) {
 779         errln("CompactTrieDictionary(const void *,...) failed\n");
 780         goto cleanup;
 781     }
 782
 783     if (compact2->dataSize() == 0) {
 784         errln("CompactTrieDictionary->dataSize() == 0\n");
 785         goto cleanup;
 786     }
 787
 788     // Now count the words via the second dictionary
 789     delete enumer1;
 790     enumer1 = compact2->openWords(status);
 791     if (U_FAILURE(status)) {
 792         errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
 793         goto cleanup;
 794     }
 795
 796     if (wordCount != (testCount = enumer1->count(status))) {
 797         errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
 798             testCount, wordCount, u_errorName(status));
 799         goto cleanup;
 800     }
 801
 802 cleanup:
 803     delete compactDict;
 804     delete mutableDict;
 805     delete breaks;
 806     delete[] testFile;
 807     delete enumer1;
 808     delete mutable2;
 809     delete cloneEnum;
 810     delete compact2;
 811 }
 812
 813
 814 //----------------------------------------------------------------------------
 815 //
 816 // generalIteratorTest      Given a break iterator and a set of test data,
 817 //                          Run the tests and report the results.
 818 //
 819 //----------------------------------------------------------------------------
 820 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
 821 {
 822
 823     bi.setText(td.fDataToBreak);
 824
 825     testFirstAndNext(bi, td);
 826
 827     testLastAndPrevious(bi, td);
 828
 829     testFollowing(bi, td);
 830     testPreceding(bi, td);
 831     testIsBoundary(bi, td);
 832     doMultipleSelectionTest(bi, td);
 833 }
 834
 835
 836 //
 837 //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
 838 //                       kind of loop.
 839 //
 840 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
 841 {
 842     UErrorCode  status = U_ZERO_ERROR;
 843     int32_t     p;
 844     int32_t     lastP = -1;
 845     int32_t     tag;
 846
 847     logln("Test first and next");
 848     bi.setText(td.fDataToBreak);
 849     td.clearResults();
 850
 851     for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
 852         td.fActualBreakPositions.addElement(p, status);  // Save result.
 853         tag = bi.getRuleStatus();
 854         td.fActualTags.addElement(tag, status);
 855         if (p <= lastP) {
 856             // If the iterator is not making forward progress, stop.
 857             //  No need to raise an error here, it'll be detected in the normal check of results.
 858             break;
 859         }
 860         lastP = p;
 861     }
 862     td.checkResults("testFirstAndNext", this);
 863 }
 864
 865
 866 //
 867 //  TestLastAndPrevious.   Run the iterator backwards, starting with last().
 868 //
 869 void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
 870 {
 871     UErrorCode  status = U_ZERO_ERROR;
 872     int32_t     p;
 873     int32_t     lastP  = 0x7ffffffe;
 874     int32_t     tag;
 875
 876     logln("Test last and previous");
 877     bi.setText(td.fDataToBreak);
 878     td.clearResults();
 879
 880     for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
 881         // Save break position.  Insert it at start of vector of results, shoving
 882         //    already-saved results further towards the end.
 883         td.fActualBreakPositions.insertElementAt(p, 0, status);
 884         // bi.previous();   // TODO:  Why does this fix things up????
 885         // bi.next();
 886         tag = bi.getRuleStatus();
 887         td.fActualTags.insertElementAt(tag, 0, status);
 888         if (p >= lastP) {
 889             // If the iterator is not making progress, stop.
 890             //  No need to raise an error here, it'll be detected in the normal check of results.
 891             break;
 892         }
 893         lastP = p;
 894     }
 895     td.checkResults("testLastAndPrevious", this);
 896 }
 897
 898
 899 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
 900 {
 901     UErrorCode  status = U_ZERO_ERROR;
 902     int32_t     p;
 903     int32_t     tag;
 904     int32_t     lastP  = -2;     // A value that will never be returned as a break position.
 905                                  //   cannot be -1; that is returned for DONE.
 906     int         i;
 907
 908     logln("testFollowing():");
 909     bi.setText(td.fDataToBreak);
 910     td.clearResults();
 911
 912     // Save the starting point, since we won't get that out of following.
 913     p = bi.first();
 914     td.fActualBreakPositions.addElement(p, status);  // Save result.
 915     tag = bi.getRuleStatus();
 916     td.fActualTags.addElement(tag, status);
 917
 918     for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
 919         p = bi.following(i);
 920         if (p != lastP) {
 921             if (p == RuleBasedBreakIterator::DONE) {
 922                 break;
 923             }
 924             // We've reached a new break position.  Save it.
 925             td.fActualBreakPositions.addElement(p, status);  // Save result.
 926             tag = bi.getRuleStatus();
 927             td.fActualTags.addElement(tag, status);
 928             lastP = p;
 929         }
 930     }
 931     // The loop normally exits by means of the break in the middle.
 932     // Make sure that the index was at the correct position for the break iterator to have
 933     //   returned DONE.
 934     if (i != td.fDataToBreak.length()) {
 935         errln("testFollowing():  iterator returned DONE prematurely.");
 936     }
 937
 938     // Full check of all results.
 939     td.checkResults("testFollowing", this);
 940 }
 941
 942
 943
 944 void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
 945     UErrorCode  status = U_ZERO_ERROR;
 946     int32_t     p;
 947     int32_t     tag;
 948     int32_t     lastP  = 0x7ffffffe;
 949     int         i;
 950
 951     logln("testPreceding():");
 952     bi.setText(td.fDataToBreak);
 953     td.clearResults();
 954
 955     p = bi.last();
 956     td.fActualBreakPositions.addElement(p, status);
 957     tag = bi.getRuleStatus();
 958     td.fActualTags.addElement(tag, status);
 959
 960     for (i = td.fDataToBreak.length(); i>=-1; i--) {
 961         p = bi.preceding(i);
 962         if (p != lastP) {
 963             if (p == RuleBasedBreakIterator::DONE) {
 964                 break;
 965             }
 966             // We've reached a new break position.  Save it.
 967             td.fActualBreakPositions.insertElementAt(p, 0, status);
 968             lastP = p;
 969             tag = bi.getRuleStatus();
 970             td.fActualTags.insertElementAt(tag, 0, status);
 971         }
 972     }
 973     // The loop normally exits by means of the break in the middle.
 974     // Make sure that the index was at the correct position for the break iterator to have
 975     //   returned DONE.
 976     if (i != 0) {
 977         errln("testPreceding():  iterator returned DONE prematurely.");
 978     }
 979
 980     // Full check of all results.
 981     td.checkResults("testPreceding", this);
 982 }
 983
 984
 985
 986 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
 987     UErrorCode  status = U_ZERO_ERROR;
 988     int         i;
 989     int32_t     tag;
 990
 991     logln("testIsBoundary():");
 992     bi.setText(td.fDataToBreak);
 993     td.clearResults();
 994
 995     for (i = 0; i <= td.fDataToBreak.length(); i++) {
 996         if (bi.isBoundary(i)) {
 997             td.fActualBreakPositions.addElement(i, status);  // Save result.
 998             tag = bi.getRuleStatus();
 999             td.fActualTags.addElement(tag, status);
1000         }
1001     }
1002     td.checkResults("testIsBoundary: ", this);
1003 }
1004
1005
1006
1007 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
1008 {
1009     iterator.setText(td.fDataToBreak);
1010
1011     RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
1012     int32_t offset = iterator.first();
1013     int32_t testOffset;
1014     int32_t count = 0;
1015
1016     logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
1017
1018     if (*testIterator != iterator)
1019         errln("clone() or operator!= failed: two clones compared unequal");
1020
1021     do {
1022         testOffset = testIterator->first();
1023         testOffset = testIterator->next(count);
1024         if (offset != testOffset)
1025             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
1026
1027         if (offset != RuleBasedBreakIterator::DONE) {
1028             count++;
1029             offset = iterator.next();
1030
1031             if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
1032                 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
1033                 if (count > 10000 || offset == -1) {
1034                     errln("operator== failed too many times. Stopping test.");
1035                     if (offset == -1) {
1036                         errln("Does (RuleBasedBreakIterator::DONE == -1)?");
1037                     }
1038                     return;
1039                 }
1040             }
1041         }
1042     } while (offset != RuleBasedBreakIterator::DONE);
1043
1044     // now do it backwards...
1045     offset = iterator.last();
1046     count = 0;
1047
1048     do {
1049         testOffset = testIterator->last();
1050         testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
1051         if (offset != testOffset)
1052             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
1053
1054         if (offset != RuleBasedBreakIterator::DONE) {
1055             count--;
1056             offset = iterator.previous();
1057         }
1058     } while (offset != RuleBasedBreakIterator::DONE);
1059
1060     delete testIterator;
1061 }
1062
1063
1064 //---------------------------------------------
1065 //
1066 //     other tests
1067 //
1068 //---------------------------------------------
1069 void RBBITest::TestEmptyString()
1070 {
1071     UnicodeString text = "";
1072     UErrorCode status = U_ZERO_ERROR;
1073
1074     BITestData x(status);
1075     ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
1076     RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
1077     if (U_FAILURE(status))
1078     {
1079         errln("Failed to create the BreakIterator for default locale in TestEmptyString.\n");
1080         return;
1081     }
1082     generalIteratorTest(*bi, x);
1083     delete bi;
1084 }
1085
1086 void RBBITest::TestGetAvailableLocales()
1087 {
1088     int32_t locCount = 0;
1089     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
1090
1091     if (locCount == 0)
1092         errln("getAvailableLocales() returned an empty list!");
1093     // Just make sure that it's returning good memory.
1094     int32_t i;
1095     for (i = 0; i < locCount; ++i) {
1096         logln(locList[i].getName());
1097     }
1098 }
1099
1100 //Testing the BreakIterator::getDisplayName() function
1101 void RBBITest::TestGetDisplayName()
1102 {
1103     UnicodeString   result;
1104
1105     BreakIterator::getDisplayName(Locale::getUS(), result);
1106     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
1107         errln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
1108                 + result);
1109
1110     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
1111     if (result != "French (France)")
1112         errln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
1113                 + result);
1114 }
1115 /**
1116  * Test End Behaviour
1117  * @bug 4068137
1118  */
1119 void RBBITest::TestEndBehaviour()
1120 {
1121     UErrorCode status = U_ZERO_ERROR;
1122     UnicodeString testString("boo.");
1123     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
1124     if (U_FAILURE(status))
1125     {
1126         errln("Failed to create the BreakIterator for default locale in TestEndBehaviour.\n");
1127         return;
1128     }
1129     wb->setText(testString);
1130
1131     if (wb->first() != 0)
1132         errln("Didn't get break at beginning of string.");
1133     if (wb->next() != 3)
1134         errln("Didn't get break before period in \"boo.\"");
1135     if (wb->current() != 4 && wb->next() != 4)
1136         errln("Didn't get break at end of string.");
1137     delete wb;
1138 }
1139 /*
1140  * @bug 4153072
1141  */
1142 void RBBITest::TestBug4153072() {
1143     UErrorCode status = U_ZERO_ERROR;
1144     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
1145     if (U_FAILURE(status))
1146     {
1147         errln("Failed to create the BreakIterator for default locale in TestBug4153072\n");
1148         return;
1149     }
1150     UnicodeString str("...Hello, World!...");
1151     int32_t begin = 3;
1152     int32_t end = str.length() - 3;
1153     UBool onBoundary;
1154
1155     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
1156     iter->adoptText(textIterator);
1157     int index;
1158     // Note: with the switch to UText, there is no way to restrict the
1159     //       iteration range to begin at an index other than zero.
1160     //       String character iterators created with a non-zero bound are
1161     //         treated by RBBI as being empty.
1162     for (index = -1; index < begin + 1; ++index) {
1163         onBoundary = iter->isBoundary(index);
1164         if (index == 0?  !onBoundary : onBoundary) {
1165             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
1166                             " and begin index = " + begin);
1167         }
1168     }
1169     delete iter;
1170 }
1171
1172
1173 //
1174 // Test for problem reported by Ashok Matoria on 9 July 2007
1175 //    One.<kSoftHyphen><kSpace>Two.
1176 //
1177 //    Sentence break at start (0) and then on calling next() it breaks at
1178 //   'T' of "Two". Now, at this point if I do next() and
1179 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
1180 //
1181 void RBBITest::TestBug5775() {
1182     UErrorCode status = U_ZERO_ERROR;
1183     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1184     TEST_ASSERT_SUCCESS(status);
1185     TEST_ASSERT(bi != NULL);
1186
1187     if (U_FAILURE(status) || bi == NULL) {
1188         // TEST_ASSERT already printed error message.
1189         return;
1190     }
1191
1192     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
1193     //               01234      56789
1194     s = s.unescape();
1195     bi->setText(s);
1196     int pos = bi->next();
1197     TEST_ASSERT(pos == 6);
1198     pos = bi->next();
1199     TEST_ASSERT(pos == 10);
1200     pos = bi->previous();
1201     TEST_ASSERT(pos == 6);
1202     delete bi;
1203 }
1204
1205
1206
1207 /**
1208  * Test Japanese Line Break
1209  * @bug 4095322
1210  */
1211 void RBBITest::TestJapaneseLineBreak()
1212 {
1213 #if 0
1214     // Test needs updating some more...   Dump it for now.
1215
1216
1217     // Change for Unicode TR 14:  Punctuation characters with categories Pi and Pf do not count
1218     //        as opening and closing punctuation for line breaking.
1219     //        Also, \u30fc and \u30fe are not counted as hyphens.   Remove these chars
1220     //        from these tests.    6-13-2002
1221     //
1222     UErrorCode status = U_ZERO_ERROR;
1223     UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c");
1224     UnicodeString precedingChars = CharsToUnicodeString(
1225         //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
1226         "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");
1227     UnicodeString followingChars = CharsToUnicodeString(
1228         // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"
1229         ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"
1230         // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"
1231         ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"
1232         "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");
1233     BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status);
1234
1235     int32_t i;
1236     if (U_FAILURE(status))
1237     {
1238         errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");
1239         return;
1240     }
1241
1242     for (i = 0; i < precedingChars.length(); i++) {
1243         testString.setCharAt(1, precedingChars[i]);
1244         iter->setText(testString);
1245         int32_t j = iter->first();
1246         if (j != 0)
1247             errln("ja line break failure: failed to start at 0");
1248         j = iter->next();
1249         if (j != 1)
1250             errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i])
1251                         + "' (" + ((int)(precedingChars[i])) + ")");
1252         j = iter->next();
1253         if (j != 3)
1254             errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i])
1255                         + "' (" + ((int)(precedingChars[i])) + ")");
1256     }
1257
1258     for (i = 0; i < followingChars.length(); i++) {
1259         testString.setCharAt(1, followingChars[i]);
1260         iter->setText(testString);
1261         int j = iter->first();
1262         if (j != 0)
1263             errln("ja line break failure: failed to start at 0");
1264         j = iter->next();
1265         if (j != 2)
1266             errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i])
1267                         + "' (" + ((int)(followingChars[i])) + ")");
1268         j = iter->next();
1269         if (j != 3)
1270             errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i])
1271                         + "' (" + ((int)(followingChars[i])) + ")");
1272     }
1273     delete iter;
1274 #endif
1275 }
1276
1277
1278 //------------------------------------------------------------------------------
1279 //
1280 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
1281 //
1282 //------------------------------------------------------------------------------
1283
1284 struct TestParams {
1285     BreakIterator   *bi;
1286     UnicodeString    dataToBreak;
1287     UVector32       *expectedBreaks;
1288     UVector32       *srcLine;
1289     UVector32       *srcCol;
1290 };
1291
1292 void RBBITest::executeTest(TestParams *t) {
1293     int32_t    bp;
1294     int32_t    prevBP;
1295     int32_t    i;
1296
1297     if (t->bi == NULL) {
1298         return;
1299     }
1300
1301     t->bi->setText(t->dataToBreak);
1302     //
1303     //  Run the iterator forward
1304     //
1305     prevBP = -1;
1306     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
1307         if (prevBP ==  bp) {
1308             // Fail for lack of forward progress.
1309             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
1310                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1311             break;
1312         }
1313
1314         // Check that there were we didn't miss an expected break between the last one
1315         //  and this one.
1316         for (i=prevBP+1; i<bp; i++) {
1317             if (t->expectedBreaks->elementAti(i) != 0) {
1318                 int expected[] = {0, i};
1319                 printStringBreaks(t->dataToBreak, expected, 2);
1320                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1321                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1322             }
1323         }
1324
1325         // Check that the break we did find was expected
1326         if (t->expectedBreaks->elementAti(bp) == 0) {
1327             int expected[] = {0, bp};
1328             printStringBreaks(t->dataToBreak, expected, 2);
1329             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1330                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1331         } else {
1332             // The break was expected.
1333             //   Check that the {nnn} tag value is correct.
1334             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
1335             if (expectedTagVal == -1) {
1336                 expectedTagVal = 0;
1337             }
1338             int32_t line = t->srcLine->elementAti(bp);
1339             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1340             if (rs != expectedTagVal) {
1341                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
1342                       "          Actual, Expected status = %4d, %4d",
1343                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
1344             }
1345         }
1346
1347
1348         prevBP = bp;
1349     }
1350
1351     // Verify that there were no missed expected breaks after the last one found
1352     for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
1353         if (t->expectedBreaks->elementAti(i) != 0) {
1354             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1355                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1356         }
1357     }
1358
1359     //
1360     //  Run the iterator backwards, verify that the same breaks are found.
1361     //
1362     prevBP = t->dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
1363     for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
1364         if (prevBP ==  bp) {
1365             // Fail for lack of progress.
1366             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
1367                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1368             break;
1369         }
1370
1371         // Check that there were we didn't miss an expected break between the last one
1372         //  and this one.  (UVector returns zeros for index out of bounds.)
1373         for (i=prevBP-1; i>bp; i--) {
1374             if (t->expectedBreaks->elementAti(i) != 0) {
1375                 errln("Reverse Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1376                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1377             }
1378         }
1379
1380         // Check that the break we did find was expected
1381         if (t->expectedBreaks->elementAti(bp) == 0) {
1382             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1383                    bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1384         } else {
1385             // The break was expected.
1386             //   Check that the {nnn} tag value is correct.
1387             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
1388             if (expectedTagVal == -1) {
1389                 expectedTagVal = 0;
1390             }
1391             int line = t->srcLine->elementAti(bp);
1392             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1393             if (rs != expectedTagVal) {
1394                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
1395                       "          Actual, Expected status = %4d, %4d",
1396                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
1397             }
1398         }
1399
1400         prevBP = bp;
1401     }
1402
1403     // Verify that there were no missed breaks prior to the last one found
1404     for (i=prevBP-1; i>=0; i--) {
1405         if (t->expectedBreaks->elementAti(i) != 0) {
1406             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1407                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1408         }
1409     }
1410 }
1411
1412
1413 void RBBITest::TestExtended() {
1414 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1415     UErrorCode      status  = U_ZERO_ERROR;
1416     Locale          locale("");
1417
1418     UnicodeString       rules;
1419     TestParams          tp;
1420     tp.bi             = NULL;
1421     tp.expectedBreaks = new UVector32(status);
1422     tp.srcLine        = new UVector32(status);
1423     tp.srcCol         = new UVector32(status);
1424
1425     RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
1426     TEST_ASSERT_SUCCESS(status);
1427
1428
1429     //
1430     //  Open and read the test data file.
1431     //
1432     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1433     char testFileName[1000];
1434     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1435         errln("Can't open test data.  Path too long.");
1436         return;
1437     }
1438     strcpy(testFileName, testDataDirectory);
1439     strcat(testFileName, "rbbitst.txt");
1440
1441     int    len;
1442     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1443     if (U_FAILURE(status)) {
1444         return; /* something went wrong, error already output */
1445     }
1446
1447
1448
1449
1450     //
1451     //  Put the test data into a UnicodeString
1452     //
1453     UnicodeString testString(FALSE, testFile, len);
1454
1455     enum EParseState{
1456         PARSE_COMMENT,
1457         PARSE_TAG,
1458         PARSE_DATA,
1459         PARSE_NUM
1460     }
1461     parseState = PARSE_TAG;
1462
1463     EParseState savedState = PARSE_TAG;
1464
1465     static const UChar CH_LF        = 0x0a;
1466     static const UChar CH_CR        = 0x0d;
1467     static const UChar CH_HASH      = 0x23;
1468     /*static const UChar CH_PERIOD    = 0x2e;*/
1469     static const UChar CH_LT        = 0x3c;
1470     static const UChar CH_GT        = 0x3e;
1471     static const UChar CH_BACKSLASH = 0x5c;
1472     static const UChar CH_BULLET    = 0x2022;
1473
1474     int32_t    lineNum  = 1;
1475     int32_t    colStart = 0;
1476     int32_t    column   = 0;
1477     int32_t    charIdx  = 0;
1478
1479     int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
1480
1481     for (charIdx = 0; charIdx < len; ) {
1482         status = U_ZERO_ERROR;
1483         UChar  c = testString.charAt(charIdx);
1484         charIdx++;
1485         if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1486             // treat CRLF as a unit
1487             c = CH_LF;
1488             charIdx++;
1489         }
1490         if (c == CH_LF || c == CH_CR) {
1491             lineNum++;
1492             colStart = charIdx;
1493         }
1494         column = charIdx - colStart + 1;
1495
1496         switch (parseState) {
1497         case PARSE_COMMENT:
1498             if (c == 0x0a || c == 0x0d) {
1499                 parseState = savedState;
1500             }
1501             break;
1502
1503         case PARSE_TAG:
1504             {
1505             if (c == CH_HASH) {
1506                 parseState = PARSE_COMMENT;
1507                 savedState = PARSE_TAG;
1508                 break;
1509             }
1510             if (u_isUWhiteSpace(c)) {
1511                 break;
1512             }
1513             if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1514                 delete tp.bi;
1515                 tp.bi = BreakIterator::createWordInstance(locale,  status);
1516                 charIdx += 5;
1517                 break;
1518             }
1519             if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1520                 delete tp.bi;
1521                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
1522                 charIdx += 5;
1523                 break;
1524             }
1525             if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1526                 delete tp.bi;
1527                 tp.bi = BreakIterator::createLineInstance(locale,  status);
1528                 charIdx += 5;
1529                 break;
1530             }
1531             if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1532                 delete tp.bi;
1533                 tp.bi = NULL;
1534                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
1535                 charIdx += 5;
1536                 break;
1537             }
1538             if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1539                 delete tp.bi;
1540                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
1541                 charIdx += 6;
1542                 break;
1543             }
1544
1545             // <locale  loc_name>
1546             localeMatcher.reset(testString);
1547             if (localeMatcher.lookingAt(charIdx-1, status)) {
1548                 UnicodeString localeName = localeMatcher.group(1, status);
1549                 char localeName8[100];
1550                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1551                 locale = Locale::createFromName(localeName8);
1552                 charIdx += localeMatcher.group(0, status).length();
1553                 TEST_ASSERT_SUCCESS(status);
1554                 break;
1555             }
1556             if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1557                 parseState = PARSE_DATA;
1558                 charIdx += 5;
1559                 tp.dataToBreak = "";
1560                 tp.expectedBreaks->removeAllElements();
1561                 tp.srcCol ->removeAllElements();
1562                 tp.srcLine->removeAllElements();
1563                 break;
1564             }
1565
1566             errln("line %d: Tag expected in test file.", lineNum);
1567             parseState = PARSE_COMMENT;
1568             savedState = PARSE_DATA;
1569             goto end_test; // Stop the test.
1570             }
1571             break;
1572
1573         case PARSE_DATA:
1574             if (c == CH_BULLET) {
1575                 int32_t  breakIdx = tp.dataToBreak.length();
1576                 tp.expectedBreaks->setSize(breakIdx+1);
1577                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1578                 tp.srcLine->setSize(breakIdx+1);
1579                 tp.srcLine->setElementAt(lineNum, breakIdx);
1580                 tp.srcCol ->setSize(breakIdx+1);
1581                 tp.srcCol ->setElementAt(column, breakIdx);
1582                 break;
1583             }
1584
1585             if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1586                 // Add final entry to mappings from break location to source file position.
1587                 //  Need one extra because last break position returned is after the
1588                 //    last char in the data, not at the last char.
1589                 tp.srcLine->addElement(lineNum, status);
1590                 tp.srcCol ->addElement(column, status);
1591
1592                 parseState = PARSE_TAG;
1593                 charIdx += 6;
1594
1595                 // RUN THE TEST!
1596                 executeTest(&tp);
1597                 break;
1598             }
1599
1600             if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1601                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1602                 // Get the code point from the name and insert it into the test data.
1603                 //   (Damn, no API takes names in Unicode  !!!
1604                 //    we've got to take it back to char *)
1605                 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1606                 int32_t nameLength = nameEndIdx - (charIdx+2);
1607                 char charNameBuf[200];
1608                 UChar32 theChar = -1;
1609                 if (nameEndIdx != -1) {
1610                     UErrorCode status = U_ZERO_ERROR;
1611                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1612                     charNameBuf[sizeof(charNameBuf)-1] = 0;
1613                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1614                     if (U_FAILURE(status)) {
1615                         theChar = -1;
1616                     }
1617                 }
1618                 if (theChar == -1) {
1619                     errln("Error in named character in test file at line %d, col %d",
1620                         lineNum, column);
1621                 } else {
1622                     // Named code point was recognized.  Insert it
1623                     //   into the test data.
1624                     tp.dataToBreak.append(theChar);
1625                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1626                         tp.srcLine->addElement(lineNum, status);
1627                         tp.srcCol ->addElement(column, status);
1628                     }
1629                 }
1630                 if (nameEndIdx > charIdx) {
1631                     charIdx = nameEndIdx+1;
1632
1633                 }
1634                 break;
1635             }
1636
1637
1638
1639
1640             if (testString.compare(charIdx-1, 2, "<>") == 0) {
1641                 charIdx++;
1642                 int32_t  breakIdx = tp.dataToBreak.length();
1643                 tp.expectedBreaks->setSize(breakIdx+1);
1644                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1645                 tp.srcLine->setSize(breakIdx+1);
1646                 tp.srcLine->setElementAt(lineNum, breakIdx);
1647                 tp.srcCol ->setSize(breakIdx+1);
1648                 tp.srcCol ->setElementAt(column, breakIdx);
1649                 break;
1650             }
1651
1652             if (c == CH_LT) {
1653                 tagValue   = 0;
1654                 parseState = PARSE_NUM;
1655                 break;
1656             }
1657
1658             if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
1659                 parseState = PARSE_COMMENT;
1660                 savedState = PARSE_DATA;
1661                 break;
1662             }
1663
1664             if (c == CH_BACKSLASH) {
1665                 // Check for \ at end of line, a line continuation.
1666                 //     Advance over (discard) the newline
1667                 UChar32 cp = testString.char32At(charIdx);
1668                 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1669                     // We have a CR LF
1670                     //  Need an extra increment of the input ptr to move over both of them
1671                     charIdx++;
1672                 }
1673                 if (cp == CH_LF || cp == CH_CR) {
1674                     lineNum++;
1675                     colStart = charIdx;
1676                     charIdx++;
1677                     break;
1678                 }
1679
1680                 // Let unescape handle the back slash.
1681                 cp = testString.unescapeAt(charIdx);
1682                 if (cp != -1) {
1683                     // Escape sequence was recognized.  Insert the char
1684                     //   into the test data.
1685                     tp.dataToBreak.append(cp);
1686                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1687                         tp.srcLine->addElement(lineNum, status);
1688                         tp.srcCol ->addElement(column, status);
1689                     }
1690                     break;
1691                 }
1692
1693
1694                 // Not a recognized backslash escape sequence.
1695                 // Take the next char as a literal.
1696                 //  TODO:  Should this be an error?
1697                 c = testString.charAt(charIdx);
1698                 charIdx = testString.moveIndex32(charIdx, 1);
1699             }
1700
1701             // Normal, non-escaped data char.
1702             tp.dataToBreak.append(c);
1703
1704             // Save the mapping from offset in the data to line/column numbers in
1705             //   the original input file.  Will be used for better error messages only.
1706             //   If there's an expected break before this char, the slot in the mapping
1707             //     vector will already be set for this char; don't overwrite it.
1708             if (tp.dataToBreak.length() > tp.srcLine->size()) {
1709                 tp.srcLine->addElement(lineNum, status);
1710                 tp.srcCol ->addElement(column, status);
1711             }
1712             break;
1713
1714
1715         case PARSE_NUM:
1716             // We are parsing an expected numeric tag value, like <1234>,
1717             //   within a chunk of data.
1718             if (u_isUWhiteSpace(c)) {
1719                 break;
1720             }
1721
1722             if (c == CH_GT) {
1723                 // Finished the number.  Add the info to the expected break data,
1724                 //   and switch parse state back to doing plain data.
1725                 parseState = PARSE_DATA;
1726                 if (tagValue == 0) {
1727                     tagValue = -1;
1728                 }
1729                 int32_t  breakIdx = tp.dataToBreak.length();
1730                 tp.expectedBreaks->setSize(breakIdx+1);
1731                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1732                 tp.srcLine->setSize(breakIdx+1);
1733                 tp.srcLine->setElementAt(lineNum, breakIdx);
1734                 tp.srcCol ->setSize(breakIdx+1);
1735                 tp.srcCol ->setElementAt(column, breakIdx);
1736                 break;
1737             }
1738
1739             if (u_isdigit(c)) {
1740                 tagValue = tagValue*10 + u_charDigitValue(c);
1741                 break;
1742             }
1743
1744             errln("Syntax Error in test file at line %d, col %d",
1745                 lineNum, column);
1746             parseState = PARSE_COMMENT;
1747             goto end_test; // Stop the test
1748             break;
1749         }
1750
1751
1752         if (U_FAILURE(status)) {
1753             errln("ICU Error %s while parsing test file at line %d.",
1754                 u_errorName(status), lineNum);
1755             status = U_ZERO_ERROR;
1756             goto end_test; // Stop the test
1757         }
1758
1759     }
1760
1761 end_test:
1762     delete tp.bi;
1763     delete tp.expectedBreaks;
1764     delete tp.srcLine;
1765     delete tp.srcCol;
1766     delete [] testFile;
1767 #endif
1768 }
1769
1770 void RBBITest::TestThaiBreaks() {
1771     UErrorCode status=U_ZERO_ERROR;
1772     BreakIterator* b;
1773     Locale locale = Locale("th");
1774     int32_t p, index;
1775     UChar c[]= {
1776             0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B,
1777             0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19,
1778             0x0E16, 0x0E49, 0x0E33
1779     };
1780     int32_t expectedWordResult[] = {
1781             2, 3, 6, 10, 11, 15, 17, 20, 22
1782     };
1783     int32_t expectedLineResult[] = {
1784             3, 6, 11, 15, 17, 20, 22
1785     };
1786     int32_t size = sizeof(c)/sizeof(UChar);
1787     UnicodeString text=UnicodeString(c);
1788
1789     b = BreakIterator::createWordInstance(locale, status);
1790     if (U_FAILURE(status)) {
1791         errln("Unable to create thai word break iterator.\n");
1792         return;
1793     }
1794     b->setText(text);
1795     p = index = 0;
1796     while ((p=b->next())!=BreakIterator::DONE && p < size) {
1797         if (p != expectedWordResult[index++]) {
1798             errln("Incorrect break given by thai word break iterator. Expected: %d  Got: %d", expectedWordResult[index-1], p);
1799         }
1800     }
1801     delete b;
1802
1803     b = BreakIterator::createLineInstance(locale, status);
1804     if (U_FAILURE(status)) {
1805         printf("Unable to create thai line break iterator.\n");
1806         return;
1807     }
1808     b->setText(text);
1809     p = index = 0;
1810     while ((p=b->next())!=BreakIterator::DONE && p < size) {
1811         if (p != expectedLineResult[index++]) {
1812             errln("Incorrect break given by thai line break iterator. Expected: %d  Got: %d", expectedLineResult[index-1], p);
1813         }
1814     }
1815
1816     delete b;
1817 }
1818
1819
1820 //-------------------------------------------------------------------------------
1821 //
1822 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
1823 //    return the datain one big UChar * buffer, which the caller must delete.
1824 //
1825 //    parameters:
1826 //          fileName:   the name of the file, with no directory part.  The test data directory
1827 //                      is assumed.
1828 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
1829 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
1830 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
1831 //                      Pass NULL for the system default encoding.
1832 //          status
1833 //    returns:
1834 //                      The file data, converted to UChar.
1835 //                      The caller must delete this when done with
1836 //                           delete [] theBuffer;
1837 //
1838 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
1839 //           Move this function to some common place.
1840 //
1841 //--------------------------------------------------------------------------------
1842 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1843     UChar       *retPtr  = NULL;
1844     char        *fileBuf = NULL;
1845     UConverter* conv     = NULL;
1846     FILE        *f       = NULL;
1847
1848     ulen = 0;
1849     if (U_FAILURE(status)) {
1850         return retPtr;
1851     }
1852
1853     //
1854     //  Open the file.
1855     //
1856     f = fopen(fileName, "rb");
1857     if (f == 0) {
1858         dataerrln("[DATA] Error opening test data file %s\n", fileName);
1859         status = U_FILE_ACCESS_ERROR;
1860         return NULL;
1861     }
1862     //
1863     //  Read it in
1864     //
1865     int   fileSize;
1866     int   amt_read;
1867
1868     fseek( f, 0, SEEK_END);
1869     fileSize = ftell(f);
1870     fileBuf = new char[fileSize];
1871     fseek(f, 0, SEEK_SET);
1872     amt_read = fread(fileBuf, 1, fileSize, f);
1873     if (amt_read != fileSize || fileSize <= 0) {
1874         errln("Error reading test data file.");
1875         goto cleanUpAndReturn;
1876     }
1877
1878     //
1879     // Look for a Unicode Signature (BOM) on the data just read
1880     //
1881     int32_t        signatureLength;
1882     const char *   fileBufC;
1883     const char*    bomEncoding;
1884
1885     fileBufC = fileBuf;
1886     bomEncoding = ucnv_detectUnicodeSignature(
1887         fileBuf, fileSize, &signatureLength, &status);
1888     if(bomEncoding!=NULL ){
1889         fileBufC  += signatureLength;
1890         fileSize  -= signatureLength;
1891         encoding = bomEncoding;
1892     }
1893
1894     //
1895     // Open a converter to take the rule file to UTF-16
1896     //
1897     conv = ucnv_open(encoding, &status);
1898     if (U_FAILURE(status)) {
1899         goto cleanUpAndReturn;
1900     }
1901
1902     //
1903     // Convert the rules to UChar.
1904     //  Preflight first to determine required buffer size.
1905     //
1906     ulen = ucnv_toUChars(conv,
1907         NULL,           //  dest,
1908         0,              //  destCapacity,
1909         fileBufC,
1910         fileSize,
1911         &status);
1912     if (status == U_BUFFER_OVERFLOW_ERROR) {
1913         // Buffer Overflow is expected from the preflight operation.
1914         status = U_ZERO_ERROR;
1915
1916         retPtr = new UChar[ulen+1];
1917         ucnv_toUChars(conv,
1918             retPtr,       //  dest,
1919             ulen+1,
1920             fileBufC,
1921             fileSize,
1922             &status);
1923     }
1924
1925 cleanUpAndReturn:
1926     fclose(f);
1927     delete []fileBuf;
1928     ucnv_close(conv);
1929     if (U_FAILURE(status)) {
1930         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1931         delete retPtr;
1932         retPtr = 0;
1933         ulen   = 0;
1934     };
1935     return retPtr;
1936 }
1937
1938
1939
1940 //--------------------------------------------------------------------------------------------
1941 //
1942 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1943 //
1944 //-------------------------------------------------------------------------------------------
1945 void RBBITest::TestUnicodeFiles() {
1946     RuleBasedBreakIterator  *bi;
1947     UErrorCode               status = U_ZERO_ERROR;
1948
1949     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getDefault(), status);
1950     TEST_ASSERT_SUCCESS(status);
1951     if (U_SUCCESS(status)) {
1952         runUnicodeTestData("GraphemeBreakTest.txt", bi);
1953     }
1954     delete bi;
1955
1956     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getDefault(), status);
1957     TEST_ASSERT_SUCCESS(status);
1958     if (U_SUCCESS(status)) {
1959         runUnicodeTestData("WordBreakTest.txt", bi);
1960     }
1961     delete bi;
1962
1963     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
1964     TEST_ASSERT_SUCCESS(status);
1965     if (U_SUCCESS(status)) {
1966         runUnicodeTestData("SentenceBreakTest.txt", bi);
1967     }
1968     delete bi;
1969
1970     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
1971     TEST_ASSERT_SUCCESS(status);
1972     if (U_SUCCESS(status)) {
1973         runUnicodeTestData("LineBreakTest.txt", bi);
1974     }
1975     delete bi;
1976 }
1977
1978
1979 //--------------------------------------------------------------------------------------------
1980 //
1981 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1982 //
1983 //-------------------------------------------------------------------------------------------
1984 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1985 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1986     UErrorCode  status = U_ZERO_ERROR;
1987
1988     //
1989     //  Open and read the test data file, put it into a UnicodeString.
1990     //
1991     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1992     char testFileName[1000];
1993     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1994         dataerrln("[DATA] Can't open test data.  Path too long.");
1995         return;
1996     }
1997     strcpy(testFileName, testDataDirectory);
1998     strcat(testFileName, fileName);
1999
2000     logln("Opening data file %s\n", fileName);
2001
2002     int    len;
2003     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
2004     if (status != U_FILE_ACCESS_ERROR) {
2005         TEST_ASSERT_SUCCESS(status);
2006         TEST_ASSERT(testFile != NULL);
2007     }
2008     if (U_FAILURE(status) || testFile == NULL) {
2009         return; /* something went wrong, error already output */
2010     }
2011     UnicodeString testFileAsString(TRUE, testFile, len);
2012
2013     //
2014     //  Parse the test data file using a regular expression.
2015     //  Each kind of token is recognized in its own capture group; what type of item was scanned
2016     //     is identified by which group had a match.
2017     //
2018     //    Caputure Group #                  1          2            3            4           5
2019     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
2020     //
2021     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
2022     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
2023     UnicodeString   testString;
2024     UVector32       breakPositions(status);
2025     int             lineNumber = 1;
2026     TEST_ASSERT_SUCCESS(status);
2027     if (U_FAILURE(status)) {
2028         return;
2029     }
2030
2031     //
2032     //  Scan through each test case, building up the string to be broken in testString,
2033     //   and the positions that should be boundaries in the breakPositions vector.
2034     //
2035     while (tokenMatcher.find()) {
2036         if (tokenMatcher.start(1, status) >= 0) {
2037             // Scanned a divide sign, indicating a break position in the test data.
2038             if (testString.length()>0) {
2039                 breakPositions.addElement(testString.length(), status);
2040             }
2041         }
2042         else if (tokenMatcher.start(2, status) >= 0) {
2043             // Scanned an 'x', meaning no break at this position in the test data
2044             //   Nothing to be done here.
2045             }
2046         else if (tokenMatcher.start(3, status) >= 0) {
2047             // Scanned Hex digits.  Convert them to binary, append to the character data string.
2048             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
2049             int length = hexNumber.length();
2050             if (length<=8) {
2051                 char buf[10];
2052                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
2053                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
2054                 if (c<=0x10ffff) {
2055                     testString.append(c);
2056                 } else {
2057                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
2058                        fileName, lineNumber);
2059                 }
2060             } else {
2061                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
2062                        fileName, lineNumber);
2063              }
2064         }
2065         else if (tokenMatcher.start(4, status) >= 0) {
2066             // Scanned to end of a line, possibly skipping over a comment in the process.
2067             //   If the line from the file contained test data, run the test now.
2068             //
2069             if (testString.length() > 0) {
2070                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
2071             }
2072
2073             // Clear out this test case.
2074             //    The string and breakPositions vector will be refilled as the next
2075             //       test case is parsed.
2076             testString.remove();
2077             breakPositions.removeAllElements();
2078             lineNumber++;
2079         } else {
2080             // Scanner catchall.  Something unrecognized appeared on the line.
2081             char token[16];
2082             UnicodeString uToken = tokenMatcher.group(0, status);
2083             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
2084             token[sizeof(token)-1] = 0;
2085             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
2086
2087             // Clean up, in preparation for continuing with the next line.
2088             testString.remove();
2089             breakPositions.removeAllElements();
2090             lineNumber++;
2091         }
2092         TEST_ASSERT_SUCCESS(status);
2093         if (U_FAILURE(status)) {
2094             break;
2095         }
2096     }
2097
2098     delete [] testFile;
2099  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
2100 }
2101
2102 //--------------------------------------------------------------------------------------------
2103 //
2104 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
2105 //                            test data files.  Do only a simple, forward-only check -
2106 //                            this test is mostly to check that ICU and the Unicode
2107 //                            data agree with each other.
2108 //
2109 //--------------------------------------------------------------------------------------------
2110 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
2111                          const UnicodeString &testString,   // Text data to be broken
2112                          UVector32 *breakPositions,         // Positions where breaks should be found.
2113                          RuleBasedBreakIterator *bi) {
2114     int32_t pos;                 // Break Position in the test string
2115     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
2116     int32_t expectedPos;         // Expected break position (index into test string)
2117
2118     bi->setText(testString);
2119     pos = bi->first();
2120     pos = bi->next();
2121
2122     while (pos != BreakIterator::DONE) {
2123         if (expectedI >= breakPositions->size()) {
2124             errln("Test file \"%s\", line %d, unexpected break found at position %d",
2125                 testFileName, lineNumber, pos);
2126             break;
2127         }
2128         expectedPos = breakPositions->elementAti(expectedI);
2129         if (pos < expectedPos) {
2130             errln("Test file \"%s\", line %d, unexpected break found at position %d",
2131                 testFileName, lineNumber, pos);
2132             break;
2133         }
2134         if (pos > expectedPos) {
2135             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
2136                 testFileName, lineNumber, expectedPos);
2137             break;
2138         }
2139         pos = bi->next();
2140         expectedI++;
2141     }
2142
2143     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
2144         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
2145             testFileName, lineNumber, breakPositions->elementAti(expectedI));
2146     }
2147 }
2148
2149
2150
2151 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
2152 //---------------------------------------------------------------------------------------
2153 //
2154 //   classs RBBIMonkeyKind
2155 //
2156 //      Monkey Test for Break Iteration
2157 //      Abstract interface class.   Concrete derived classes independently
2158 //      implement the break rules for different iterator types.
2159 //
2160 //      The Monkey Test itself uses doesn't know which type of break iterator it is
2161 //      testing, but works purely in terms of the interface defined here.
2162 //
2163 //---------------------------------------------------------------------------------------
2164 class RBBIMonkeyKind {
2165 public:
2166     // Return a UVector of UnicodeSets, representing the character classes used
2167     //   for this type of iterator.
2168     virtual  UVector  *charClasses() = 0;
2169
2170     // Set the test text on which subsequent calls to next() will operate
2171     virtual  void      setText(const UnicodeString &s) = 0;
2172
2173     // Find the next break postion, starting from the prev break position, or from zero.
2174     // Return -1 after reaching end of string.
2175     virtual  int32_t   next(int32_t i) = 0;
2176
2177     virtual ~RBBIMonkeyKind();
2178     UErrorCode       deferredStatus;
2179
2180
2181 protected:
2182     RBBIMonkeyKind();
2183
2184 private:
2185 };
2186
2187 RBBIMonkeyKind::RBBIMonkeyKind() {
2188     deferredStatus = U_ZERO_ERROR;
2189 }
2190
2191 RBBIMonkeyKind::~RBBIMonkeyKind() {
2192 }
2193
2194
2195 //----------------------------------------------------------------------------------------
2196 //
2197 //   Random Numbers.  Similar to standard lib rand() and srand()
2198 //                    Not using library to
2199 //                      1.  Get same results on all platforms.
2200 //                      2.  Get access to current seed, to more easily reproduce failures.
2201 //
2202 //---------------------------------------------------------------------------------------
// Current generator state; assign to it to reseed.
static uint32_t m_seed = 1;

// Advance the linear congruential generator one step and return a
//   value in the range 0..32767 taken from bits 16..30 of the new state.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;

    m_seed = m_seed * kMultiplier + kIncrement;
    return (m_seed >> 16) & 0x7fff;
}
2210
2211
2212 //------------------------------------------------------------------------------------------
2213 //
2214 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
2215 //                             of RBBIMonkeyKind.
2216 //
2217 //------------------------------------------------------------------------------------------
2218 class RBBICharMonkey: public RBBIMonkeyKind {
2219 public:
2220     RBBICharMonkey();
2221     virtual          ~RBBICharMonkey();
2222     virtual  UVector *charClasses();
2223     virtual  void     setText(const UnicodeString &s);
2224     virtual  int32_t  next(int32_t i);
2225 private:
2226     UVector   *fSets;
2227
2228     UnicodeSet  *fCRLFSet;
2229     UnicodeSet  *fControlSet;
2230     UnicodeSet  *fExtendSet;
2231     UnicodeSet  *fPrependSet;
2232     UnicodeSet  *fSpacingSet;
2233     UnicodeSet  *fLSet;
2234     UnicodeSet  *fVSet;
2235     UnicodeSet  *fTSet;
2236     UnicodeSet  *fLVSet;
2237     UnicodeSet  *fLVTSet;
2238     UnicodeSet  *fHangulSet;
2239     UnicodeSet  *fAnySet;
2240
2241     const UnicodeString *fText;
2242 };
2243
2244
2245 RBBICharMonkey::RBBICharMonkey() {
2246     UErrorCode  status = U_ZERO_ERROR;
2247
2248     fText = NULL;
2249
2250     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
2251     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
2252     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
2253     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
2254     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
2255     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
2256     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
2257     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
2258     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
2259     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
2260     fHangulSet  = new UnicodeSet();
2261     fHangulSet->addAll(*fLSet);
2262     fHangulSet->addAll(*fVSet);
2263     fHangulSet->addAll(*fTSet);
2264     fHangulSet->addAll(*fLVSet);
2265     fHangulSet->addAll(*fLVTSet);
2266     fAnySet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0000-\\U0010ffff]"), status);
2267
2268     fSets       = new UVector(status);
2269     fSets->addElement(fCRLFSet,    status);
2270     fSets->addElement(fControlSet, status);
2271     fSets->addElement(fExtendSet,  status);
2272     fSets->addElement(fPrependSet, status);
2273     fSets->addElement(fSpacingSet, status);
2274     fSets->addElement(fHangulSet,  status);
2275     fSets->addElement(fAnySet,     status);
2276     if (U_FAILURE(status)) {
2277         deferredStatus = status;
2278     }
2279 }
2280
2281
2282 void RBBICharMonkey::setText(const UnicodeString &s) {
2283     fText = &s;
2284 }
2285
2286
2287
2288 int32_t RBBICharMonkey::next(int32_t prevPos) {
2289     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2290                               //   break position being tested.  The candidate break
2291                               //   location is before p2.
2292
2293     int     breakPos = -1;
2294
2295     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2296
2297     if (U_FAILURE(deferredStatus)) {
2298         return -1;
2299     }
2300
2301     // Previous break at end of string.  return DONE.
2302     if (prevPos >= fText->length()) {
2303         return -1;
2304     }
2305     p0 = p1 = p2 = p3 = prevPos;
2306     c3 =  fText->char32At(prevPos);
2307     c0 = c1 = c2 = 0;
2308
2309     // Loop runs once per "significant" character position in the input text.
2310     for (;;) {
2311         // Move all of the positions forward in the input string.
2312         p0 = p1;  c0 = c1;
2313         p1 = p2;  c1 = c2;
2314         p2 = p3;  c2 = c3;
2315
2316         // Advancd p3 by one codepoint
2317         p3 = fText->moveIndex32(p3, 1);
2318         c3 = fText->char32At(p3);
2319
2320         if (p1 == p2) {
2321             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2322             continue;
2323         }
2324         if (p2 == fText->length()) {
2325             // Reached end of string.  Always a break position.
2326             break;
2327         }
2328
2329         // Rule  GB3   CR x LF
2330         //     No Extend or Format characters may appear between the CR and LF,
2331         //     which requires the additional check for p2 immediately following p1.
2332         //
2333         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
2334             continue;
2335         }
2336
2337         // Rule (GB4).   ( Control | CR | LF ) <break>
2338         if (fControlSet->contains(c1) ||
2339             c1 == 0x0D ||
2340             c1 == 0x0A)  {
2341             break;
2342         }
2343
2344         // Rule (GB5)    <break>  ( Control | CR | LF )
2345         //
2346         if (fControlSet->contains(c2) ||
2347             c2 == 0x0D ||
2348             c2 == 0x0A)  {
2349             break;
2350         }
2351
2352
2353         // Rule (GB6)  L x ( L | V | LV | LVT )
2354         if (fLSet->contains(c1) &&
2355                (fLSet->contains(c2)  ||
2356                 fVSet->contains(c2)  ||
2357                 fLVSet->contains(c2) ||
2358                 fLVTSet->contains(c2))) {
2359             continue;
2360         }
2361
2362         // Rule (GB7)    ( LV | V )  x  ( V | T )
2363         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
2364             (fVSet->contains(c2) || fTSet->contains(c2)))  {
2365             continue;
2366         }
2367
2368         // Rule (GB8)    ( LVT | T)  x T
2369         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
2370             fTSet->contains(c2))  {
2371             continue;
2372         }
2373
2374         // Rule (GB9)    Numeric x ALetter
2375         if (fExtendSet->contains(c2))  {
2376             continue;
2377         }
2378
2379         // Rule (GB9a)   x  SpacingMark
2380         if (fSpacingSet->contains(c2)) {
2381             continue;
2382         }
2383
2384         // Rule (GB9b)   Prepend x
2385         if (fPrependSet->contains(c1)) {
2386             continue;
2387         }
2388
2389         // Rule (GB10)  Any  <break>  Any
2390         break;
2391     }
2392
2393     breakPos = p2;
2394     return breakPos;
2395 }
2396
2397
2398
2399 UVector  *RBBICharMonkey::charClasses() {
2400     return fSets;
2401 }
2402
2403
2404 RBBICharMonkey::~RBBICharMonkey() {
2405     delete fSets;
2406     delete fCRLFSet;
2407     delete fControlSet;
2408     delete fExtendSet;
2409     delete fPrependSet;
2410     delete fSpacingSet;
2411     delete fLSet;
2412     delete fVSet;
2413     delete fTSet;
2414     delete fLVSet;
2415     delete fLVTSet;
2416     delete fHangulSet;
2417     delete fAnySet;
2418 }
2419
2420 //------------------------------------------------------------------------------------------
2421 //
2422 //   class RBBIWordMonkey      Word Break specific implementation
2423 //                             of RBBIMonkeyKind.
2424 //
2425 //------------------------------------------------------------------------------------------
2426 class RBBIWordMonkey: public RBBIMonkeyKind {
2427 public:
2428     RBBIWordMonkey();
2429     virtual          ~RBBIWordMonkey();
2430     virtual  UVector *charClasses();
2431     virtual  void     setText(const UnicodeString &s);
2432     virtual int32_t   next(int32_t i);
2433 private:
2434     UVector      *fSets;
2435
2436     UnicodeSet  *fCRSet;
2437     UnicodeSet  *fLFSet;
2438     UnicodeSet  *fNewlineSet;
2439     UnicodeSet  *fKatakanaSet;
2440     UnicodeSet  *fALetterSet;
2441     UnicodeSet  *fMidNumLetSet;
2442     UnicodeSet  *fMidLetterSet;
2443     UnicodeSet  *fMidNumSet;
2444     UnicodeSet  *fNumericSet;
2445     UnicodeSet  *fFormatSet;
2446     UnicodeSet  *fOtherSet;
2447     UnicodeSet  *fExtendSet;
2448     UnicodeSet  *fExtendNumLetSet;
2449
2450     RegexMatcher  *fMatcher;
2451
2452     const UnicodeString  *fText;
2453 };
2454
2455
2456 RBBIWordMonkey::RBBIWordMonkey()
2457 {
2458     UErrorCode  status = U_ZERO_ERROR;
2459
2460     fSets            = new UVector(status);
2461
2462     fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
2463     fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
2464     fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
2465     fALetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"),      status);
2466     fKatakanaSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
2467     fMidNumLetSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
2468     fMidLetterSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
2469     fMidNumSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
2470     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
2471     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
2472     fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2473     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
2474
2475     fOtherSet        = new UnicodeSet();
2476     if(U_FAILURE(status)) {
2477       deferredStatus = status;
2478       return;
2479     }
2480
2481     fOtherSet->complement();
2482     fOtherSet->removeAll(*fCRSet);
2483     fOtherSet->removeAll(*fLFSet);
2484     fOtherSet->removeAll(*fNewlineSet);
2485     fOtherSet->removeAll(*fKatakanaSet);
2486     fOtherSet->removeAll(*fALetterSet);
2487     fOtherSet->removeAll(*fMidLetterSet);
2488     fOtherSet->removeAll(*fMidNumSet);
2489     fOtherSet->removeAll(*fNumericSet);
2490     fOtherSet->removeAll(*fExtendNumLetSet);
2491     fOtherSet->removeAll(*fFormatSet);
2492     fOtherSet->removeAll(*fExtendSet);
2493     // Inhibit dictionary characters from being tested at all.
2494     fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
2495
2496     fSets->addElement(fCRSet,        status);
2497     fSets->addElement(fLFSet,        status);
2498     fSets->addElement(fNewlineSet,   status);
2499     fSets->addElement(fALetterSet,   status);
2500     fSets->addElement(fKatakanaSet,  status);
2501     fSets->addElement(fMidLetterSet, status);
2502     fSets->addElement(fMidNumLetSet, status);
2503     fSets->addElement(fMidNumSet,    status);
2504     fSets->addElement(fNumericSet,   status);
2505     fSets->addElement(fFormatSet,    status);
2506     fSets->addElement(fExtendSet,    status);
2507     fSets->addElement(fOtherSet,     status);
2508     fSets->addElement(fExtendNumLetSet, status);
2509
2510     if (U_FAILURE(status)) {
2511         deferredStatus = status;
2512     }
2513 }
2514
2515 void RBBIWordMonkey::setText(const UnicodeString &s) {
2516     fText       = &s;
2517 }
2518
2519
2520 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2521     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2522                               //   break position being tested.  The candidate break
2523                               //   location is before p2.
2524
2525     int     breakPos = -1;
2526
2527     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2528
2529     if (U_FAILURE(deferredStatus)) {
2530         return -1;
2531     }
2532
2533     // Prev break at end of string.  return DONE.
2534     if (prevPos >= fText->length()) {
2535         return -1;
2536     }
2537     p0 = p1 = p2 = p3 = prevPos;
2538     c3 =  fText->char32At(prevPos);
2539     c0 = c1 = c2 = 0;
2540
2541     // Loop runs once per "significant" character position in the input text.
2542     for (;;) {
2543         // Move all of the positions forward in the input string.
2544         p0 = p1;  c0 = c1;
2545         p1 = p2;  c1 = c2;
2546         p2 = p3;  c2 = c3;
2547
2548         // Advancd p3 by    X(Extend | Format)*   Rule 4
2549         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2550         do {
2551             p3 = fText->moveIndex32(p3, 1);
2552             c3 = fText->char32At(p3);
2553             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2554                break;
2555             };
2556         }
2557         while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
2558
2559
2560         if (p1 == p2) {
2561             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2562             continue;
2563         }
2564         if (p2 == fText->length()) {
2565             // Reached end of string.  Always a break position.
2566             break;
2567         }
2568
2569         // Rule  (3)   CR x LF
2570         //     No Extend or Format characters may appear between the CR and LF,
2571         //     which requires the additional check for p2 immediately following p1.
2572         //
2573         if (c1==0x0D && c2==0x0A) {
2574             continue;
2575         }
2576
2577         // Rule (3a)  Break before and after newlines (including CR and LF)
2578         //
2579         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2580             break;
2581         };
2582         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2583             break;
2584         };
2585
2586         // Rule (5).   ALetter x ALetter
2587         if (fALetterSet->contains(c1) &&
2588             fALetterSet->contains(c2))  {
2589             continue;
2590         }
2591
2592         // Rule (6)  ALetter  x  (MidLetter | MidNumLet) ALetter
2593         //
2594         if ( fALetterSet->contains(c1)   &&
2595              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
2596              fALetterSet->contains(c3)) {
2597             continue;
2598         }
2599
2600
2601         // Rule (7)  ALetter (MidLetter | MidNumLet)  x  ALetter
2602         if (fALetterSet->contains(c0) &&
2603             (fMidLetterSet->contains(c1) ||  fMidNumLetSet->contains(c1)) &&
2604             fALetterSet->contains(c2)) {
2605             continue;
2606         }
2607
2608         // Rule (8)    Numeric x Numeric
2609         if (fNumericSet->contains(c1) &&
2610             fNumericSet->contains(c2))  {
2611             continue;
2612         }
2613
2614         // Rule (9)    ALetter x Numeric
2615         if (fALetterSet->contains(c1) &&
2616             fNumericSet->contains(c2))  {
2617             continue;
2618         }
2619
2620         // Rule (10)    Numeric x ALetter
2621         if (fNumericSet->contains(c1) &&
2622             fALetterSet->contains(c2))  {
2623             continue;
2624         }
2625
2626         // Rule (11)   Numeric (MidNum | MidNumLet)  x  Numeric
2627         if (fNumericSet->contains(c0) &&
2628             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1))  &&
2629             fNumericSet->contains(c2)) {
2630             continue;
2631         }
2632
2633         // Rule (12)  Numeric x (MidNum | MidNumLet) Numeric
2634         if (fNumericSet->contains(c1) &&
2635             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2))  &&
2636             fNumericSet->contains(c3)) {
2637             continue;
2638         }
2639
2640         // Rule (13)  Katakana x Katakana
2641         if (fKatakanaSet->contains(c1) &&
2642             fKatakanaSet->contains(c2))  {
2643             continue;
2644         }
2645
2646         // Rule 13a
2647         if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
2648              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2649              fExtendNumLetSet->contains(c2)) {
2650                 continue;
2651              }
2652
2653         // Rule 13b
2654         if (fExtendNumLetSet->contains(c1) &&
2655                 (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
2656                 fKatakanaSet->contains(c2)))  {
2657                 continue;
2658              }
2659
2660         // Rule 14.  Break found here.
2661         break;
2662     }
2663
2664     breakPos = p2;
2665     return breakPos;
2666 }
2667
2668
2669 UVector  *RBBIWordMonkey::charClasses() {
2670     return fSets;
2671 }
2672
2673
2674 RBBIWordMonkey::~RBBIWordMonkey() {
2675     delete fSets;
2676     delete fCRSet;
2677     delete fLFSet;
2678     delete fNewlineSet;
2679     delete fKatakanaSet;
2680     delete fALetterSet;
2681     delete fMidNumLetSet;
2682     delete fMidLetterSet;
2683     delete fMidNumSet;
2684     delete fNumericSet;
2685     delete fFormatSet;
2686     delete fExtendSet;
2687     delete fExtendNumLetSet;
2688     delete fOtherSet;
2689 }
2690
2691
2692
2693
2694 //------------------------------------------------------------------------------------------
2695 //
2696 //   class RBBISentMonkey      Sentence Break specific implementation
2697 //                             of RBBIMonkeyKind.
2698 //
2699 //------------------------------------------------------------------------------------------
2700 class RBBISentMonkey: public RBBIMonkeyKind {
2701 public:
2702     RBBISentMonkey();
2703     virtual          ~RBBISentMonkey();
2704     virtual  UVector *charClasses();
2705     virtual  void     setText(const UnicodeString &s);
2706     virtual int32_t   next(int32_t i);
2707 private:
2708     int               moveBack(int posFrom);
2709     int               moveForward(int posFrom);
2710     UChar32           cAt(int pos);
2711
2712     UVector      *fSets;
2713
2714     UnicodeSet  *fSepSet;
2715     UnicodeSet  *fFormatSet;
2716     UnicodeSet  *fSpSet;
2717     UnicodeSet  *fLowerSet;
2718     UnicodeSet  *fUpperSet;
2719     UnicodeSet  *fOLetterSet;
2720     UnicodeSet  *fNumericSet;
2721     UnicodeSet  *fATermSet;
2722     UnicodeSet  *fSContinueSet;
2723     UnicodeSet  *fSTermSet;
2724     UnicodeSet  *fCloseSet;
2725     UnicodeSet  *fOtherSet;
2726     UnicodeSet  *fExtendSet;
2727
2728     const UnicodeString  *fText;
2729
2730 };
2731
2732 RBBISentMonkey::RBBISentMonkey()
2733 {
2734     UErrorCode  status = U_ZERO_ERROR;
2735
2736     fSets            = new UVector(status);
2737
2738     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2739     //                       set and made into character classes of their own.  For the monkey impl,
2740     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2741     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2742     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2743     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2744     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2745     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2746     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2747     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2748     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2749     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2750     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2751     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2752     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2753     fOtherSet        = new UnicodeSet();
2754
2755     if(U_FAILURE(status)) {
2756       deferredStatus = status;
2757       return;
2758     }
2759
2760     fOtherSet->complement();
2761     fOtherSet->removeAll(*fSepSet);
2762     fOtherSet->removeAll(*fFormatSet);
2763     fOtherSet->removeAll(*fSpSet);
2764     fOtherSet->removeAll(*fLowerSet);
2765     fOtherSet->removeAll(*fUpperSet);
2766     fOtherSet->removeAll(*fOLetterSet);
2767     fOtherSet->removeAll(*fNumericSet);
2768     fOtherSet->removeAll(*fATermSet);
2769     fOtherSet->removeAll(*fSContinueSet);
2770     fOtherSet->removeAll(*fSTermSet);
2771     fOtherSet->removeAll(*fCloseSet);
2772     fOtherSet->removeAll(*fExtendSet);
2773
2774     fSets->addElement(fSepSet,       status);
2775     fSets->addElement(fFormatSet,    status);
2776     fSets->addElement(fSpSet,        status);
2777     fSets->addElement(fLowerSet,     status);
2778     fSets->addElement(fUpperSet,     status);
2779     fSets->addElement(fOLetterSet,   status);
2780     fSets->addElement(fNumericSet,   status);
2781     fSets->addElement(fATermSet,     status);
2782     fSets->addElement(fSContinueSet, status);
2783     fSets->addElement(fSTermSet,     status);
2784     fSets->addElement(fCloseSet,     status);
2785     fSets->addElement(fOtherSet,     status);
2786     fSets->addElement(fExtendSet,    status);
2787
2788     if (U_FAILURE(status)) {
2789         deferredStatus = status;
2790     }
2791 }
2792
2793
2794
2795 void RBBISentMonkey::setText(const UnicodeString &s) {
2796     fText       = &s;
2797 }
2798
2799 UVector  *RBBISentMonkey::charClasses() {
2800     return fSets;
2801 }
2802
2803
2804 //  moveBack()   Find the "significant" code point preceding the index i.
2805 //               Skips over ($Extend | $Format)* .
2806 //
2807 int RBBISentMonkey::moveBack(int i) {
2808     if (i <= 0) {
2809         return -1;
2810     }
2811     UChar32   c;
2812     int32_t   j = i;
2813     do {
2814         j = fText->moveIndex32(j, -1);
2815         c = fText->char32At(j);
2816     }
2817     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2818     return j;
2819
2820  }
2821
2822
2823 int RBBISentMonkey::moveForward(int i) {
2824     if (i>=fText->length()) {
2825         return fText->length();
2826     }
2827     UChar32   c;
2828     int32_t   j = i;
2829     do {
2830         j = fText->moveIndex32(j, 1);
2831         c = cAt(j);
2832     }
2833     while (fFormatSet->contains(c) || fExtendSet->contains(c));
2834     return j;
2835 }
2836
2837 UChar32 RBBISentMonkey::cAt(int pos) {
2838     if (pos<0 || pos>=fText->length()) {
2839         return -1;
2840     } else {
2841         return fText->char32At(pos);
2842     }
2843 }
2844
2845 int32_t RBBISentMonkey::next(int32_t prevPos) {
2846     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2847                               //   break position being tested.  The candidate break
2848                               //   location is before p2.
2849
2850     int     breakPos = -1;
2851
2852     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2853     UChar32 c;
2854
2855     if (U_FAILURE(deferredStatus)) {
2856         return -1;
2857     }
2858
2859     // Prev break at end of string.  return DONE.
2860     if (prevPos >= fText->length()) {
2861         return -1;
2862     }
2863     p0 = p1 = p2 = p3 = prevPos;
2864     c3 =  fText->char32At(prevPos);
2865     c0 = c1 = c2 = 0;
2866
2867     // Loop runs once per "significant" character position in the input text.
2868     for (;;) {
2869         // Move all of the positions forward in the input string.
2870         p0 = p1;  c0 = c1;
2871         p1 = p2;  c1 = c2;
2872         p2 = p3;  c2 = c3;
2873
2874         // Advancd p3 by    X(Extend | Format)*   Rule 4
2875         p3 = moveForward(p3);
2876         c3 = cAt(p3);
2877
2878         // Rule (3)  CR x LF
2879         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2880             continue;
2881         }
2882
2883         // Rule (4).   Sep  <break>
2884         if (fSepSet->contains(c1)) {
2885             p2 = p1+1;   // Separators don't combine with Extend or Format.
2886             break;
2887         }
2888
2889         if (p2 >= fText->length()) {
2890             // Reached end of string.  Always a break position.
2891             break;
2892         }
2893
2894         if (p2 == prevPos) {
2895             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2896             continue;
2897         }
2898
2899         // Rule (6).   ATerm x Numeric
2900         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
2901             continue;
2902         }
2903
2904         // Rule (7).  Upper ATerm  x  Uppper
2905         if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2906             continue;
2907         }
2908
2909         // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
2910         //           Note:  STerm | ATerm are added to the negated part of the expression by a
2911         //                  note to the Unicode 5.0 documents.
2912         int p8 = p1;
2913         while (fSpSet->contains(cAt(p8))) {
2914             p8 = moveBack(p8);
2915         }
2916         while (fCloseSet->contains(cAt(p8))) {
2917             p8 = moveBack(p8);
2918         }
2919         if (fATermSet->contains(cAt(p8))) {
2920             p8=p2;
2921             for (;;) {
2922                 c = cAt(p8);
2923                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2924                     fLowerSet->contains(c) || fSepSet->contains(c) ||
2925                     fATermSet->contains(c) || fSTermSet->contains(c))  {
2926                     break;
2927                 }
2928                 p8 = moveForward(p8);
2929             }
2930             if (fLowerSet->contains(cAt(p8))) {
2931                 continue;
2932             }
2933         }
2934
2935         // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2936         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2937             p8 = p1;
2938             while (fSpSet->contains(cAt(p8))) {
2939                 p8 = moveBack(p8);
2940             }
2941             while (fCloseSet->contains(cAt(p8))) {
2942                 p8 = moveBack(p8);
2943             }
2944             c = cAt(p8);
2945             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2946                 continue;
2947             }
2948         }
2949
2950         // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
2951         int p9 = p1;
2952         while (fCloseSet->contains(cAt(p9))) {
2953             p9 = moveBack(p9);
2954         }
2955         c = cAt(p9);
2956         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2957             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2958                 continue;
2959             }
2960         }
2961
2962         // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
2963         int p10 = p1;
2964         while (fSpSet->contains(cAt(p10))) {
2965             p10 = moveBack(p10);
2966         }
2967         while (fCloseSet->contains(cAt(p10))) {
2968             p10 = moveBack(p10);
2969         }
2970         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2971             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2972                 continue;
2973             }
2974         }
2975
2976         // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
2977         int p11 = p1;
2978         if (fSepSet->contains(cAt(p11))) {
2979             p11 = moveBack(p11);
2980         }
2981         while (fSpSet->contains(cAt(p11))) {
2982             p11 = moveBack(p11);
2983         }
2984         while (fCloseSet->contains(cAt(p11))) {
2985             p11 = moveBack(p11);
2986         }
2987         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2988             break;
2989         }
2990
2991         //  Rule (12)  Any x Any
2992         continue;
2993     }
2994     breakPos = p2;
2995     return breakPos;
2996 }
2997
2998 RBBISentMonkey::~RBBISentMonkey() {
2999     delete fSets;
3000     delete fSepSet;
3001     delete fFormatSet;
3002     delete fSpSet;
3003     delete fLowerSet;
3004     delete fUpperSet;
3005     delete fOLetterSet;
3006     delete fNumericSet;
3007     delete fATermSet;
3008     delete fSContinueSet;
3009     delete fSTermSet;
3010     delete fCloseSet;
3011     delete fOtherSet;
3012     delete fExtendSet;
3013 }
3014
3015
3016
3017 //-------------------------------------------------------------------------------------------
3018 //
3019 //  RBBILineMonkey
3020 //
3021 //-------------------------------------------------------------------------------------------
3022
3023 class RBBILineMonkey: public RBBIMonkeyKind {
3024 public:
3025     RBBILineMonkey();
3026     virtual          ~RBBILineMonkey();
3027     virtual  UVector *charClasses();
3028     virtual  void     setText(const UnicodeString &s);
3029     virtual  int32_t  next(int32_t i);
3030     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
3031 private:
3032     UVector      *fSets;
3033
3034     UnicodeSet  *fBK;
3035     UnicodeSet  *fCR;
3036     UnicodeSet  *fLF;
3037     UnicodeSet  *fCM;
3038     UnicodeSet  *fNL;
3039     UnicodeSet  *fSG;
3040     UnicodeSet  *fWJ;
3041     UnicodeSet  *fZW;
3042     UnicodeSet  *fGL;
3043     UnicodeSet  *fCB;
3044     UnicodeSet  *fSP;
3045     UnicodeSet  *fB2;
3046     UnicodeSet  *fBA;
3047     UnicodeSet  *fBB;
3048     UnicodeSet  *fHY;
3049     UnicodeSet  *fH2;
3050     UnicodeSet  *fH3;
3051     UnicodeSet  *fCL;
3052     UnicodeSet  *fEX;
3053     UnicodeSet  *fIN;
3054     UnicodeSet  *fJL;
3055     UnicodeSet  *fJV;
3056     UnicodeSet  *fJT;
3057     UnicodeSet  *fNS;
3058     UnicodeSet  *fOP;
3059     UnicodeSet  *fQU;
3060     UnicodeSet  *fIS;
3061     UnicodeSet  *fNU;
3062     UnicodeSet  *fPO;
3063     UnicodeSet  *fPR;
3064     UnicodeSet  *fSY;
3065     UnicodeSet  *fAI;
3066     UnicodeSet  *fAL;
3067     UnicodeSet  *fID;
3068     UnicodeSet  *fSA;
3069     UnicodeSet  *fXX;
3070
3071     BreakIterator  *fCharBI;
3072
3073     const UnicodeString  *fText;
3074     int32_t              *fOrigPositions;
3075
3076     RegexMatcher         *fNumberMatcher;
3077     RegexMatcher         *fLB11Matcher;
3078 };
3079
3080
3081 RBBILineMonkey::RBBILineMonkey()
3082 {
3083     UErrorCode  status = U_ZERO_ERROR;
3084
3085     fSets  = new UVector(status);
3086
3087     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
3088     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
3089     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
3090     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
3091     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
3092     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
3093     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
3094     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
3095     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
3096     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
3097     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
3098     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
3099     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
3100     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
3101     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
3102     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
3103     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
3104     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
3105     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
3106     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
3107     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
3108     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
3109     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
3110     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
3111     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
3112     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
3113     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
3114     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
3115     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
3116     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
3117     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
3118     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
3119     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
3120     fSA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
3121     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
3122     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
3123
3124     if (U_FAILURE(status)) {
3125         deferredStatus = status;
3126         fCharBI = NULL;
3127         fNumberMatcher = NULL;
3128         return;
3129     }
3130
3131     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
3132     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
3133     fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
3134     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
3135
3136     fSets->addElement(fBK, status);
3137     fSets->addElement(fCR, status);
3138     fSets->addElement(fLF, status);
3139     fSets->addElement(fCM, status);
3140     fSets->addElement(fNL, status);
3141     fSets->addElement(fWJ, status);
3142     fSets->addElement(fZW, status);
3143     fSets->addElement(fGL, status);
3144     fSets->addElement(fCB, status);
3145     fSets->addElement(fSP, status);
3146     fSets->addElement(fB2, status);
3147     fSets->addElement(fBA, status);
3148     fSets->addElement(fBB, status);
3149     fSets->addElement(fHY, status);
3150     fSets->addElement(fH2, status);
3151     fSets->addElement(fH3, status);
3152     fSets->addElement(fCL, status);
3153     fSets->addElement(fEX, status);
3154     fSets->addElement(fIN, status);
3155     fSets->addElement(fJL, status);
3156     fSets->addElement(fJT, status);
3157     fSets->addElement(fJV, status);
3158     fSets->addElement(fNS, status);
3159     fSets->addElement(fOP, status);
3160     fSets->addElement(fQU, status);
3161     fSets->addElement(fIS, status);
3162     fSets->addElement(fNU, status);
3163     fSets->addElement(fPO, status);
3164     fSets->addElement(fPR, status);
3165     fSets->addElement(fSY, status);
3166     fSets->addElement(fAI, status);
3167     fSets->addElement(fAL, status);
3168     fSets->addElement(fID, status);
3169     fSets->addElement(fWJ, status);
3170     fSets->addElement(fSA, status);
3171     fSets->addElement(fSG, status);
3172
3173     const char *rules =
3174             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
3175             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
3176             "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
3177             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
3178             "(\\p{Line_Break=CL}\\p{Line_Break=CM}*)?"
3179             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
3180
3181     fNumberMatcher = new RegexMatcher(
3182         UnicodeString(rules, -1, US_INV), 0, status);
3183
3184     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
3185
3186     if (U_FAILURE(status)) {
3187         deferredStatus = status;
3188     }
3189 }
3190
3191
3192 void RBBILineMonkey::setText(const UnicodeString &s) {
3193     fText       = &s;
3194     fCharBI->setText(s);
3195     fNumberMatcher->reset(s);
3196 }
3197
3198 //
3199 //  rule9Adjust
3200 //     Line Break TR rules 9 and 10 implementation.
3201 //     This deals with combining marks and other sequences that
3202 //     that must be treated as if they were something other than what they actually are.
3203 //
3204 //     This is factored out into a separate function because it must be applied twice for
3205 //     each potential break, once to the chars before the position being checked, then
3206 //     again to the text following the possible break.
3207 //
3208 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
3209     if (pos == -1) {
3210         // Invalid initial position.  Happens during the warmup iteration of the
3211         //   main loop in next().
3212         return;
3213     }
3214
3215     int32_t  nPos = *nextPos;
3216
3217     // LB 9  Keep combining sequences together.
3218     //  advance over any CM class chars.  Note that Line Break CM is different
3219     //  from the normal Grapheme Extend property.
3220     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
3221           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
3222         for (;;) {
3223             *nextChar = fText->char32At(nPos);
3224             if (!fCM->contains(*nextChar)) {
3225                 break;
3226             }
3227             nPos = fText->moveIndex32(nPos, 1);
3228         }
3229     }
3230
3231
3232     // LB 9 Treat X CM* as if it were x.
3233     //       No explicit action required.
3234
3235     // LB 10  Treat any remaining combining mark as AL
3236     if (fCM->contains(*posChar)) {
3237         *posChar = 0x41;   // thisChar = 'A';
3238     }
3239
3240     // Push the updated nextPos and nextChar back to our caller.
3241     // This only makes a difference if posChar got bigger by consuming a
3242     // combining sequence.
3243     *nextPos  = nPos;
3244     *nextChar = fText->char32At(nPos);
3245 }
3246
3247
3248
3249 int32_t RBBILineMonkey::next(int32_t startPos) {
3250     UErrorCode status = U_ZERO_ERROR;
3251     int32_t    pos;       //  Index of the char following a potential break position
3252     UChar32    thisChar;  //  Character at above position "pos"
3253
3254     int32_t    prevPos;   //  Index of the char preceding a potential break position
3255     UChar32    prevChar;  //  Character at above position.  Note that prevChar
3256                           //   and thisChar may not be adjacent because combining
3257                           //   characters between them will be ignored.
3258
3259     int32_t    nextPos;   //  Index of the next character following pos.
3260                           //     Usually skips over combining marks.
3261     int32_t    nextCPPos; //  Index of the code point following "pos."
3262                           //     May point to a combining mark.
3263     int32_t    tPos;      //  temp value.
3264     UChar32    c;
3265
3266     if (U_FAILURE(deferredStatus)) {
3267         return -1;
3268     }
3269
3270     if (startPos >= fText->length()) {
3271         return -1;
3272     }
3273
3274
3275     // Initial values for loop.  Loop will run the first time without finding breaks,
3276     //                           while the invalid values shift out and the "this" and
3277     //                           "prev" positions are filled in with good values.
3278     pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.
3279     thisChar = prevChar  = 0;
3280     nextPos  = nextCPPos = startPos;
3281
3282
3283     // Loop runs once per position in the test text, until a break position
3284     //  is found.
3285     for (;;) {
3286         prevPos   = pos;
3287         prevChar  = thisChar;
3288
3289         pos       = nextPos;
3290         thisChar  = fText->char32At(pos);
3291
3292         nextCPPos = fText->moveIndex32(pos, 1);
3293         nextPos   = nextCPPos;
3294
3295         // Rule LB2 - Break at end of text.
3296         if (pos >= fText->length()) {
3297             break;
3298         }
3299
3300         // Rule LB 9 - adjust for combining sequences.
3301         //             We do this one out-of-order because the adjustment does not change anything
3302         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3303         //             be applied.
3304         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
3305         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3306         c = fText->char32At(nextPos);
3307         rule9Adjust(pos,     &thisChar, &nextPos, &c);
3308
3309         // If the loop is still warming up - if we haven't shifted the initial
3310         //   -1 positions out of prevPos yet - loop back to advance the
3311         //    position in the input without any further looking for breaks.
3312         if (prevPos == -1) {
3313             continue;
3314         }
3315
3316         // LB 4  Always break after hard line breaks,
3317         if (fBK->contains(prevChar)) {
3318             break;
3319         }
3320
3321         // LB 5  Break after CR, LF, NL, but not inside CR LF
3322         if (prevChar == 0x0d && thisChar == 0x0a) {
3323             continue;
3324         }
3325         if (prevChar == 0x0d ||
3326             prevChar == 0x0a ||
3327             prevChar == 0x85)  {
3328             break;
3329         }
3330
3331         // LB 6  Don't break before hard line breaks
3332         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3333             fBK->contains(thisChar)) {
3334                 continue;
3335         }
3336
3337
3338         // LB 7  Don't break before spaces or zero-width space.
3339         if (fSP->contains(thisChar)) {
3340             continue;
3341         }
3342
3343         if (fZW->contains(thisChar)) {
3344             continue;
3345         }
3346
3347         // LB 8  Break after zero width space
3348         if (fZW->contains(prevChar)) {
3349             break;
3350         }
3351
3352         // LB 9, 10  Already done, at top of loop.
3353         //
3354
3355
3356         // LB 11  Do not break before or after WORD JOINER and related characters.
3357         //    x  WJ
3358         //    WJ  x
3359         //
3360         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3361             continue;
3362         }
3363
3364         // LB 12
3365         //    GL  x
3366         if (fGL->contains(prevChar)) {
3367             continue;
3368         }
3369
3370         // LB 12a
3371         //    [^SP BA HY] x GL
3372         if (!(fSP->contains(prevChar) ||
3373               fBA->contains(prevChar) ||
3374               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
3375             continue;
3376         }
3377
3378
3379
3380         // LB 13  Don't break before closings.
3381         //        NU x CL  and NU x IS are not matched here so that they will
3382         //        fall into LB 17 and the more general number regular expression.
3383         //
3384         if (!fNU->contains(prevChar) && fCL->contains(thisChar) ||
3385                                         fEX->contains(thisChar) ||
3386             !fNU->contains(prevChar) && fIS->contains(thisChar) ||
3387             !fNU->contains(prevChar) && fSY->contains(thisChar))    {
3388             continue;
3389         }
3390
3391         // LB 14 Don't break after OP SP*
3392         //       Scan backwards, checking for this sequence.
3393         //       The OP char could include combining marks, so we actually check for
3394         //           OP CM* SP*
3395         //       Another Twist: The Rule 67 fixes may have changed a SP CM
3396         //       sequence into a ID char, so before scanning back through spaces,
3397         //       verify that prevChar is indeed a space.  The prevChar variable
3398         //       may differ from fText[prevPos]
3399         tPos = prevPos;
3400         if (fSP->contains(prevChar)) {
3401             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3402                 tPos=fText->moveIndex32(tPos, -1);
3403             }
3404         }
3405         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3406             tPos=fText->moveIndex32(tPos, -1);
3407         }
3408         if (fOP->contains(fText->char32At(tPos))) {
3409             continue;
3410         }
3411
3412
3413         // LB 15    QU SP* x OP
3414         if (fOP->contains(thisChar)) {
3415             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3416             int tPos = prevPos;
3417             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3418                 tPos = fText->moveIndex32(tPos, -1);
3419             }
3420             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3421                 tPos = fText->moveIndex32(tPos, -1);
3422             }
3423             if (fQU->contains(fText->char32At(tPos))) {
3424                 continue;
3425             }
3426         }
3427
3428
3429
3430         // LB 16   CL SP* x NS
3431         //    Scan backwards for SP* CM* CL
3432         if (fNS->contains(thisChar)) {
3433             int tPos = prevPos;
3434             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3435                 tPos = fText->moveIndex32(tPos, -1);
3436             }
3437             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3438                 tPos = fText->moveIndex32(tPos, -1);
3439             }
3440             if (fCL->contains(fText->char32At(tPos))) {
3441                 continue;
3442             }
3443         }
3444
3445
3446         // LB 17        B2 SP* x B2
3447         if (fB2->contains(thisChar)) {
3448             //  Scan backwards, checking for the B2 CM* SP* sequence.
3449             tPos = prevPos;
3450             if (fSP->contains(prevChar)) {
3451                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3452                     tPos=fText->moveIndex32(tPos, -1);
3453                 }
3454             }
3455             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3456                 tPos=fText->moveIndex32(tPos, -1);
3457             }
3458             if (fB2->contains(fText->char32At(tPos))) {
3459                 continue;
3460             }
3461         }
3462
3463
3464         // LB 18    break after space
3465         if (fSP->contains(prevChar)) {
3466             break;
3467         }
3468
3469         // LB 19
3470         //    x   QU
3471         //    QU  x
3472         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3473             continue;
3474         }
3475
3476         // LB 20  Break around a CB
3477         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3478             break;
3479         }
3480
3481         // LB 21
3482         if (fBA->contains(thisChar) ||
3483             fHY->contains(thisChar) ||
3484             fNS->contains(thisChar) ||
3485             fBB->contains(prevChar) )   {
3486             continue;
3487         }
3488
3489         // LB 22
3490         if (fAL->contains(prevChar) && fIN->contains(thisChar) ||
3491             fID->contains(prevChar) && fIN->contains(thisChar) ||
3492             fIN->contains(prevChar) && fIN->contains(thisChar) ||
3493             fNU->contains(prevChar) && fIN->contains(thisChar) )   {
3494             continue;
3495         }
3496
3497
3498         // LB 23    ID x PO
3499         //          AL x NU
3500         //          NU x AL
3501         if (fID->contains(prevChar) && fPO->contains(thisChar) ||
3502             fAL->contains(prevChar) && fNU->contains(thisChar) ||
3503             fNU->contains(prevChar) && fAL->contains(thisChar) )   {
3504             continue;
3505         }
3506
3507         // LB 24  Do not break between prefix and letters or ideographs.
3508         //        PR x ID
3509         //        PR x AL
3510         //        PO x AL
3511         if (fPR->contains(prevChar) && fID->contains(thisChar) ||
3512             fPR->contains(prevChar) && fAL->contains(thisChar) ||
3513             fPO->contains(prevChar) && fAL->contains(thisChar) )   {
3514             continue;
3515         }
3516
3517
3518
3519         // LB 25    Numbers
3520         if (fNumberMatcher->lookingAt(prevPos, status)) {
3521             if (U_FAILURE(status)) {
3522                 break;
3523             }
3524             // Matched a number.  But could have been just a single digit, which would
3525             //    not represent a "no break here" between prevChar and thisChar
3526             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
3527             if (numEndIdx > pos) {
3528                 // Number match includes at least our two chars being checked
3529                 if (numEndIdx > nextPos) {
3530                     // Number match includes additional chars.  Update pos and nextPos
3531                     //   so that next loop iteration will continue at the end of the number,
3532                     //   checking for breaks between last char in number & whatever follows.
3533                     pos = nextPos = numEndIdx;
3534                     do {
3535                         pos = fText->moveIndex32(pos, -1);
3536                         thisChar = fText->char32At(pos);
3537                     } while (fCM->contains(thisChar));
3538                 }
3539                 continue;
3540             }
3541         }
3542
3543
3544         // LB 26 Do not break a Korean syllable.
3545         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3546                                         fJV->contains(thisChar) ||
3547                                         fH2->contains(thisChar) ||
3548                                         fH3->contains(thisChar))) {
3549                                             continue;
3550                                         }
3551
3552         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3553             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3554                 continue;
3555         }
3556
3557         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3558             fJT->contains(thisChar)) {
3559                 continue;
3560         }
3561
3562         // LB 27 Treat a Korean Syllable Block the same as ID.
3563         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3564             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3565             fIN->contains(thisChar)) {
3566                 continue;
3567             }
3568         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3569             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3570             fPO->contains(thisChar)) {
3571                 continue;
3572             }
3573         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3574             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3575                 continue;
3576             }
3577
3578
3579
3580         // LB 28  Do not break between alphabetics ("at").
3581         if (fAL->contains(prevChar) && fAL->contains(thisChar)) {
3582             continue;
3583         }
3584
3585         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
3586         if (fIS->contains(prevChar) && fAL->contains(thisChar)) {
3587             continue;
3588         }
3589
3590         // LB 31    Break everywhere else
3591         break;
3592
3593     }
3594
3595     return pos;
3596 }
3597
3598
3599 UVector  *RBBILineMonkey::charClasses() {
3600     return fSets;
3601 }
3602
3603
3604 RBBILineMonkey::~RBBILineMonkey() {
3605     delete fSets;
3606
3607     delete fBK;
3608     delete fCR;
3609     delete fLF;
3610     delete fCM;
3611     delete fNL;
3612     delete fWJ;
3613     delete fZW;
3614     delete fGL;
3615     delete fCB;
3616     delete fSP;
3617     delete fB2;
3618     delete fBA;
3619     delete fBB;
3620     delete fHY;
3621     delete fH2;
3622     delete fH3;
3623     delete fCL;
3624     delete fEX;
3625     delete fIN;
3626     delete fJL;
3627     delete fJV;
3628     delete fJT;
3629     delete fNS;
3630     delete fOP;
3631     delete fQU;
3632     delete fIS;
3633     delete fNU;
3634     delete fPO;
3635     delete fPR;
3636     delete fSY;
3637     delete fAI;
3638     delete fAL;
3639     delete fID;
3640     delete fSA;
3641     delete fSG;
3642     delete fXX;
3643
3644     delete fCharBI;
3645     delete fNumberMatcher;
3646 }
3647
3648
3649 //-------------------------------------------------------------------------------------------
3650 //
3651 //   TestMonkey
3652 //
3653 //     params
3654 //       seed=nnnnn        Random number starting seed.
3655 //                         Setting the seed allows errors to be reproduced.
3656 //       loop=nnn          Looping count.  Controls running time.
3657 //                         -1:  run forever.
3658 //                          0 or greater:  run length.
3659 //
3660 //       type = char | word | line | sent | title
3661 //
3662 //-------------------------------------------------------------------------------------------
3663
3664 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3665     int32_t val = defaultVal;
3666     name.append(" *= *(-?\\d+)");
3667     UErrorCode status = U_ZERO_ERROR;
3668     RegexMatcher m(name, params, 0, status);
3669     if (m.find()) {
3670         // The param exists.  Convert the string to an int.
3671         char valString[100];
3672         int32_t paramLength = m.end(1, status) - m.start(1, status);
3673         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3674             paramLength = (int32_t)(sizeof(valString)-2);
3675         }
3676         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3677         val = strtol(valString,  NULL, 10);
3678
3679         // Delete this parameter from the params string.
3680         m.reset();
3681         params = m.replaceFirst("", status);
3682     }
3683     U_ASSERT(U_SUCCESS(status));
3684     return val;
3685 }
3686 #endif
3687
3688 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3689                                     BreakIterator *bi,
3690                                     int expected[],
3691                                     int expectedcount)
3692 {
3693     int count = 0;
3694     int i = 0;
3695     int forward[50];
3696     bi->setText(ustr);
3697     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3698         forward[count] = i;
3699         if (count < expectedcount && expected[count] != i) {
3700             test->errln("break forward test failed: expected %d but got %d",
3701                         expected[count], i);
3702             break;
3703         }
3704         count ++;
3705     }
3706     if (count != expectedcount) {
3707         printStringBreaks(ustr, expected, expectedcount);
3708         test->errln("break forward test failed: missed %d match",
3709                     expectedcount - count);
3710         return;
3711     }
3712     // testing boundaries
3713     for (i = 1; i < expectedcount; i ++) {
3714         int j = expected[i - 1];
3715         if (!bi->isBoundary(j)) {
3716             printStringBreaks(ustr, expected, expectedcount);
3717             test->errln("isBoundary() failed.  Expected boundary at position %d", j);
3718             return;
3719         }
3720         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3721             if (bi->isBoundary(j)) {
3722                 printStringBreaks(ustr, expected, expectedcount);
3723                 test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
3724                 return;
3725             }
3726         }
3727     }
3728
3729     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3730         count --;
3731         if (forward[count] != i) {
3732             test->errln("happy break test previous() failed: expected %d but got %d",
3733                         forward[count], i);
3734             break;
3735         }
3736     }
3737     if (count != 0) {
3738         printStringBreaks(ustr, expected, expectedcount);
3739         test->errln("break test previous() failed: missed a match");
3740         return;
3741     }
3742
3743     // testing preceding
3744     for (i = 0; i < expectedcount - 1; i ++) {
3745         // int j = expected[i] + 1;
3746         int j = ustr.moveIndex32(expected[i], 1);
3747         for (; j <= expected[i + 1]; j ++) {
3748             if (bi->preceding(j) != expected[i]) {
3749                 printStringBreaks(ustr, expected, expectedcount);
3750                 test->errln("preceding(): Not expecting boundary at position %d", j);
3751                 return;
3752             }
3753         }
3754     }
3755 }
3756
3757 void RBBITest::TestWordBreaks(void)
3758 {
3759 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3760
3761     Locale        locale("en");
3762     UErrorCode    status = U_ZERO_ERROR;
3763     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3764     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3765     static const char *strlist[] =
3766     {
3767     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3768     "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
3769     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3770     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3771     "\\u90ca\\u3588\\u009c\\u0953\\u194b",
3772     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3773     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3774     "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
3775     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3776     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3777     "\\u2027\\U000e0067\\u0a47\\u00b7",
3778     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3779     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3780     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3781     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3782     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3783     "\\u0027\\u11af\\U000e0057\\u0602",
3784     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3785     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3786     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3787     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3788     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3789     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3790     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3791     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3792     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3793     "\\u58f4\\U000e0049\\u20e7\\u2027",
3794     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3795     "\\ua183\\u102d\\u0bec\\u003a",
3796     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3797     "\\u003a\\u0e57\\u0fad\\u002e",
3798     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3799     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3800     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3801     "\\u003a\\u0664\\u00b7\\u1fba",
3802     "\\u003b\\u0027\\u00b7\\u47a3",
3803     "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
3804     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3805     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3806     };
3807     int loop;
3808     if (U_FAILURE(status)) {
3809         errln("Creation of break iterator failed %s", u_errorName(status));
3810         return;
3811     }
3812     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3813         // printf("looping %d\n", loop);
3814         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3815         // RBBICharMonkey monkey;
3816         RBBIWordMonkey monkey;
3817
3818         int expected[50];
3819         int expectedcount = 0;
3820
3821         monkey.setText(ustr);
3822         int i;
3823         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3824             expected[expectedcount ++] = i;
3825         }
3826
3827         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3828     }
3829     delete bi;
3830 #endif
3831 }
3832
3833 void RBBITest::TestWordBoundary(void)
3834 {
3835     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3836     Locale        locale("en");
3837     UErrorCode    status = U_ZERO_ERROR;
3838     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3839     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3840     UChar         str[50];
3841     static const char *strlist[] =
3842     {
3843     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3844     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3845     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3846     "\\u2027\\U000e0067\\u0a47\\u00b7",
3847     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3848     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3849     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3850     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3851     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3852     "\\u0027\\u11af\\U000e0057\\u0602",
3853     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3854     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3855     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3856     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3857     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3858     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3859     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3860     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3861     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3862     "\\u58f4\\U000e0049\\u20e7\\u2027",
3863     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3864     "\\ua183\\u102d\\u0bec\\u003a",
3865     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3866     "\\u003a\\u0e57\\u0fad\\u002e",
3867     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3868     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3869     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3870     "\\u003a\\u0664\\u00b7\\u1fba",
3871     "\\u003b\\u0027\\u00b7\\u47a3",
3872     };
3873     int loop;
3874     if (U_FAILURE(status)) {
3875         errln("Creation of break iterator failed %s", u_errorName(status));
3876         return;
3877     }
3878     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3879         // printf("looping %d\n", loop);
3880         u_unescape(strlist[loop], str, 20);
3881         UnicodeString ustr(str);
3882         int forward[50];
3883         int count = 0;
3884
3885         bi->setText(ustr);
3886         int prev = 0;
3887         int i;
3888         for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3889             forward[count ++] = i;
3890             if (i > prev) {
3891                 int j;
3892                 for (j = prev + 1; j < i; j ++) {
3893                     if (bi->isBoundary(j)) {
3894                         printStringBreaks(ustr, forward, count);
3895                         errln("happy boundary test failed: expected %d not a boundary",
3896                                j);
3897                         return;
3898                     }
3899                 }
3900             }
3901             if (!bi->isBoundary(i)) {
3902                 printStringBreaks(ustr, forward, count);
3903                 errln("happy boundary test failed: expected %d a boundary",
3904                        i);
3905                 return;
3906             }
3907             prev = i;
3908         }
3909     }
3910     delete bi;
3911 }
3912
3913 void RBBITest::TestLineBreaks(void)
3914 {
3915 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3916     Locale        locale("en");
3917     UErrorCode    status = U_ZERO_ERROR;
3918     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3919     const int32_t  STRSIZE = 50;
3920     UChar         str[STRSIZE];
3921     static const char *strlist[] =
3922     {
3923      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3924      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3925              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3926      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3927              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3928      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3929      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3930      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3931      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3932      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3933      "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
3934      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3935      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3936      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3937      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3938      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3939      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3940      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3941      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3942      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3943      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3944      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3945      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3946      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3947      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3948      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3949      "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
3950      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3951      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3952      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3953      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3954      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3955      "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
3956      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3957      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3958      "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
3959      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3960      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3961      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3962      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3963      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3964      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3965          "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
3966          "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
3967          "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
3968      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3969          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3970     };
3971     int loop;
3972     TEST_ASSERT_SUCCESS(status);
3973     if (U_FAILURE(status)) {
3974         return;
3975     }
3976     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3977         // printf("looping %d\n", loop);
3978         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3979         if (t >= STRSIZE) {
3980             TEST_ASSERT(FALSE);
3981             continue;
3982         }
3983
3984
3985         UnicodeString ustr(str);
3986         RBBILineMonkey monkey;
3987         if (U_FAILURE(monkey.deferredStatus)) {
3988             continue;
3989         }
3990
3991         const int EXPECTEDSIZE = 50;
3992         int expected[EXPECTEDSIZE];
3993         int expectedcount = 0;
3994
3995         monkey.setText(ustr);
3996         int i;
3997         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3998             if (expectedcount >= EXPECTEDSIZE) {
3999                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4000                 return;
4001             }
4002             expected[expectedcount ++] = i;
4003         }
4004
4005         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4006     }
4007     delete bi;
4008 #endif
4009 }
4010
4011 void RBBITest::TestSentBreaks(void)
4012 {
4013 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4014     Locale        locale("en");
4015     UErrorCode    status = U_ZERO_ERROR;
4016     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4017     UChar         str[200];
4018     static const char *strlist[] =
4019     {
4020      "Now\ris\nthe\r\ntime\n\rfor\r\r",
4021      "This\n",
4022      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
4023      "\"Sentence ending with a quote.\" Bye.",
4024      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
4025      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
4026      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
4027      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
4028      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
4029      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
4030      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
4031              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
4032              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
4033              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
4034      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
4035              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
4036              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
4037              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
4038              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
4039              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
4040     };
4041     int loop;
4042     if (U_FAILURE(status)) {
4043         errln("Creation of break iterator failed %s", u_errorName(status));
4044         return;
4045     }
4046     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
4047         u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
4048         UnicodeString ustr(str);
4049
4050         RBBISentMonkey monkey;
4051         if (U_FAILURE(monkey.deferredStatus)) {
4052             continue;
4053         }
4054
4055         const int EXPECTEDSIZE = 50;
4056         int expected[EXPECTEDSIZE];
4057         int expectedcount = 0;
4058
4059         monkey.setText(ustr);
4060         int i;
4061         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4062             if (expectedcount >= EXPECTEDSIZE) {
4063                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4064                 return;
4065             }
4066             expected[expectedcount ++] = i;
4067         }
4068
4069         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4070     }
4071     delete bi;
4072 #endif
4073 }
4074
4075 void RBBITest::TestMonkey(char *params) {
4076 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4077
4078     UErrorCode     status    = U_ZERO_ERROR;
4079     int32_t        loopCount = 500;
4080     int32_t        seed      = 1;
4081     UnicodeString  breakType = "all";
4082     Locale         locale("en");
4083     UBool          useUText  = FALSE;
4084
4085     if (quick == FALSE) {
4086         loopCount = 10000;
4087     }
4088
4089     if (params) {
4090         UnicodeString p(params);
4091         loopCount = getIntParam("loop", p, loopCount);
4092         seed      = getIntParam("seed", p, seed);
4093
4094         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
4095         if (m.find()) {
4096             breakType = m.group(1, status);
4097             m.reset();
4098             p = m.replaceFirst("", status);
4099         }
4100
4101         RegexMatcher u(" *utext", p, 0, status);
4102         if (u.find()) {
4103             useUText = TRUE;
4104             u.reset();
4105             p = u.replaceFirst("", status);
4106         }
4107
4108
4109         // m.reset(p);
4110         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
4111             // Each option is stripped out of the option string as it is processed.
4112             // All options have been checked.  The option string should have been completely emptied..
4113             char buf[100];
4114             p.extract(buf, sizeof(buf), NULL, status);
4115             buf[sizeof(buf)-1] = 0;
4116             errln("Unrecognized or extra parameter:  %s\n", buf);
4117             return;
4118         }
4119
4120     }
4121
4122     if (breakType == "char" || breakType == "all") {
4123         RBBICharMonkey  m;
4124         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
4125         if (U_SUCCESS(status)) {
4126             RunMonkey(bi, m, "char", seed, loopCount, useUText);
4127             if (breakType == "all" && useUText==FALSE) {
4128                 // Also run a quick test with UText when "all" is specified
4129                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
4130             }
4131         }
4132         else {
4133             errln("Creation of character break iterator failed %s", u_errorName(status));
4134         }
4135         delete bi;
4136     }
4137
4138     if (breakType == "word" || breakType == "all") {
4139         logln("Word Break Monkey Test");
4140         RBBIWordMonkey  m;
4141         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
4142         if (U_SUCCESS(status)) {
4143             RunMonkey(bi, m, "word", seed, loopCount, useUText);
4144         }
4145         else {
4146             errln("Creation of word break iterator failed %s", u_errorName(status));
4147         }
4148         delete bi;
4149     }
4150
4151     if (breakType == "line" || breakType == "all") {
4152         logln("Line Break Monkey Test");
4153         RBBILineMonkey  m;
4154         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
4155         if (loopCount >= 10) {
4156             loopCount = loopCount / 5;   // Line break runs slower than the others.
4157         }
4158         if (U_SUCCESS(status)) {
4159             RunMonkey(bi, m, "line", seed, loopCount, useUText);
4160         }
4161         else {
4162             errln("Creation of line break iterator failed %s", u_errorName(status));
4163         }
4164         delete bi;
4165     }
4166
4167     if (breakType == "sent" || breakType == "all"  ) {
4168         logln("Sentence Break Monkey Test");
4169         RBBISentMonkey  m;
4170         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
4171         if (loopCount >= 10) {
4172             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
4173         }
4174         if (U_SUCCESS(status)) {
4175             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
4176         }
4177         else {
4178             errln("Creation of line break iterator failed %s", u_errorName(status));
4179         }
4180         delete bi;
4181     }
4182
4183 #endif
4184 }
4185
4186 //
4187 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
4188 //    Parameters:
4189 //       bi      - the break iterator to use
4190 //       mk      - MonkeyKind, abstraction for obtaining expected results
4191 //       name    - Name of test (char, word, etc.) for use in error messages
4192 //       seed    - Seed for starting random number generator (parameter from user)
4193 //       numIterations
4194 //
4195 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
4196                          int32_t numIterations, UBool useUText) {
4197
4198 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4199
4200     const int32_t    TESTSTRINGLEN = 500;
4201     UnicodeString    testText;
4202     int32_t          numCharClasses;
4203     UVector          *chClasses;
4204     int              expected[TESTSTRINGLEN*2 + 1];
4205     int              expectedCount = 0;
4206     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
4207     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
4208     char             reverseBreaks[TESTSTRINGLEN*2+1];
4209     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
4210     char             followingBreaks[TESTSTRINGLEN*2+1];
4211     char             precedingBreaks[TESTSTRINGLEN*2+1];
4212     int              i;
4213     int              loopCount = 0;
4214
4215     m_seed = seed;
4216
4217     numCharClasses = mk.charClasses()->size();
4218     chClasses      = mk.charClasses();
4219
4220     // Check for errors that occured during the construction of the MonkeyKind object.
4221     //  Can't report them where they occured because errln() is a method coming from intlTest,
4222     //  and is not visible outside of RBBITest :-(
4223     if (U_FAILURE(mk.deferredStatus)) {
4224         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4225         return;
4226     }
4227
4228     // Verify that the character classes all have at least one member.
4229     for (i=0; i<numCharClasses; i++) {
4230         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4231         if (s == NULL || s->size() == 0) {
4232             errln("Character Class #%d is null or of zero size.", i);
4233             return;
4234         }
4235     }
4236
4237     while (loopCount < numIterations || numIterations == -1) {
4238         if (numIterations == -1 && loopCount % 10 == 0) {
4239             // If test is running in an infinite loop, display a periodic tic so
4240             //   we can tell that it is making progress.
4241             fprintf(stderr, ".");
4242         }
4243         // Save current random number seed, so that we can recreate the random numbers
4244         //   for this loop iteration in event of an error.
4245         seed = m_seed;
4246
4247         // Populate a test string with data.
4248         testText.truncate(0);
4249         for (i=0; i<TESTSTRINGLEN; i++) {
4250             int32_t  aClassNum = m_rand() % numCharClasses;
4251             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4252             int32_t   charIdx = m_rand() % classSet->size();
4253             UChar32   c = classSet->charAt(charIdx);
4254             if (c < 0) {   // TODO:  deal with sets containing strings.
4255                 errln("c < 0");
4256                 break;
4257             }
4258             testText.append(c);
4259         }
4260
4261         // Calculate the expected results for this test string.
4262         mk.setText(testText);
4263         memset(expectedBreaks, 0, sizeof(expectedBreaks));
4264         expectedBreaks[0] = 1;
4265         int32_t breakPos = 0;
4266         expectedCount = 0;
4267         for (;;) {
4268             breakPos = mk.next(breakPos);
4269             if (breakPos == -1) {
4270                 break;
4271             }
4272             if (breakPos > testText.length()) {
4273                 errln("breakPos > testText.length()");
4274             }
4275             expectedBreaks[breakPos] = 1;
4276             U_ASSERT(expectedCount<testText.length());
4277             expected[expectedCount ++] = breakPos;
4278         }
4279
4280         // Find the break positions using forward iteration
4281         memset(forwardBreaks, 0, sizeof(forwardBreaks));
4282         if (useUText) {
4283             UErrorCode status = U_ZERO_ERROR;
4284             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4285             // testUText = utext_openUnicodeString(testUText, &testText, &status);
4286             bi->setText(testUText, status);
4287             TEST_ASSERT_SUCCESS(status);
4288             utext_close(testUText);   // The break iterator does a shallow clone of the UText
4289                                       //  This UText can be closed immediately, so long as the
4290                                       //  testText string continues to exist.
4291         } else {
4292             bi->setText(testText);
4293         }
4294
4295         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4296             if (i < 0 || i > testText.length()) {
4297                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4298                 break;
4299             }
4300             forwardBreaks[i] = 1;
4301         }
4302
4303         // Find the break positions using reverse iteration
4304         memset(reverseBreaks, 0, sizeof(reverseBreaks));
4305         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4306             if (i < 0 || i > testText.length()) {
4307                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4308                 break;
4309             }
4310             reverseBreaks[i] = 1;
4311         }
4312
4313         // Find the break positions using isBoundary() tests.
4314         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4315         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4316         for (i=0; i<=testText.length(); i++) {
4317             isBoundaryBreaks[i] = bi->isBoundary(i);
4318         }
4319
4320
4321         // Find the break positions using the following() function.
4322         // printf(".");
4323         memset(followingBreaks, 0, sizeof(followingBreaks));
4324         int32_t   lastBreakPos = 0;
4325         followingBreaks[0] = 1;
4326         for (i=0; i<testText.length(); i++) {
4327             breakPos = bi->following(i);
4328             if (breakPos <= i ||
4329                 breakPos < lastBreakPos ||
4330                 breakPos > testText.length() ||
4331                 breakPos > lastBreakPos && lastBreakPos > i ) {
4332                 errln("%s break monkey test: "
4333                     "Out of range value returned by BreakIterator::following().\n"
4334                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4335                          name, seed, i, breakPos, lastBreakPos);
4336                 break;
4337             }
4338             followingBreaks[breakPos] = 1;
4339             lastBreakPos = breakPos;
4340         }
4341
4342         // Find the break positions using the preceding() function.
4343         memset(precedingBreaks, 0, sizeof(precedingBreaks));
4344         lastBreakPos = testText.length();
4345         precedingBreaks[testText.length()] = 1;
4346         for (i=testText.length(); i>0; i--) {
4347             breakPos = bi->preceding(i);
4348             if (breakPos >= i ||
4349                 breakPos > lastBreakPos ||
4350                 breakPos < 0 && testText.getChar32Start(i)>0 ||
4351                 breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i) ) {
4352                 errln("%s break monkey test: "
4353                     "Out of range value returned by BreakIterator::preceding().\n"
4354                     "index=%d;  prev returned %d; lastBreak=%d" ,
4355                     name,  i, breakPos, lastBreakPos);
4356                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4357                     precedingBreaks[i] = 2;   // Forces an error.
4358                 }
4359             } else {
4360                 if (breakPos >= 0) {
4361                     precedingBreaks[breakPos] = 1;
4362                 }
4363                 lastBreakPos = breakPos;
4364             }
4365         }
4366
4367         // Compare the expected and actual results.
4368         for (i=0; i<=testText.length(); i++) {
4369             const char *errorType = NULL;
4370             if  (forwardBreaks[i] != expectedBreaks[i]) {
4371                 errorType = "next()";
4372             } else if (reverseBreaks[i] != forwardBreaks[i]) {
4373                 errorType = "previous()";
4374             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4375                 errorType = "isBoundary()";
4376             } else if (followingBreaks[i] != expectedBreaks[i]) {
4377                 errorType = "following()";
4378             } else if (precedingBreaks[i] != expectedBreaks[i]) {
4379                 errorType = "preceding()";
4380             }
4381
4382
4383             if (errorType != NULL) {
4384                 // Format a range of the test text that includes the failure as
4385                 //  a data item that can be included in the rbbi test data file.
4386
4387                 // Start of the range is the last point where expected and actual results
4388                 //   both agreed that there was a break position.
4389                 int startContext = i;
4390                 int32_t count = 0;
4391                 for (;;) {
4392                     if (startContext==0) { break; }
4393                     startContext --;
4394                     if (expectedBreaks[startContext] != 0) {
4395                         if (count == 2) break;
4396                         count ++;
4397                     }
4398                 }
4399
4400                 // End of range is two expected breaks past the start position.
4401                 int endContext = i + 1;
4402                 int ci;
4403                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4404                     for (;;) {
4405                         if (endContext >= testText.length()) {break;}
4406                         if (expectedBreaks[endContext-1] != 0) {
4407                             if (count == 0) break;
4408                             count --;
4409                         }
4410                         endContext ++;
4411                     }
4412                 }
4413
4414                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4415                 UnicodeString errorText = "<data>";
4416                 /***if (strcmp(errorType, "next()") == 0) {
4417                     startContext = 0;
4418                     endContext = testText.length();
4419
4420                     printStringBreaks(testText, expected, expectedCount);
4421                 }***/
4422
4423                 for (ci=startContext; ci<endContext;) {
4424                     UnicodeString hexChars("0123456789abcdef");
4425                     UChar32  c;
4426                     int      bn;
4427                     c = testText.char32At(ci);
4428                     if (ci == i) {
4429                         // This is the location of the error.
4430                         errorText.append("<?>");
4431                     } else if (expectedBreaks[ci] != 0) {
4432                         // This a non-error expected break position.
4433                         errorText.append("\\");
4434                     }
4435                     if (c < 0x10000) {
4436                         errorText.append("\\u");
4437                         for (bn=12; bn>=0; bn-=4) {
4438                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4439                         }
4440                     } else {
4441                         errorText.append("\\U");
4442                         for (bn=28; bn>=0; bn-=4) {
4443                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4444                         }
4445                     }
4446                     ci = testText.moveIndex32(ci, 1);
4447                 }
4448                 errorText.append("\\");
4449                 errorText.append("</data>\n");
4450
4451                 // Output the error
4452                 char  charErrorTxt[500];
4453                 UErrorCode status = U_ZERO_ERROR;
4454                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4455                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4456                 errln("%s break monkey test error.  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
4457                     name, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4458                     errorType, seed, i, charErrorTxt);
4459                 break;
4460             }
4461         }
4462
4463         loopCount++;
4464     }
4465 #endif
4466 }
4467
4468 //
4469 //  TestDebug    -  A place-holder test for debugging purposes.
4470 //                  For putting in fragments of other tests that can be invoked
4471 //                  for tracing  without a lot of unwanted extra stuff happening.
4472 //
4473 void RBBITest::TestDebug(void) {
4474 #if 0
4475     UErrorCode   status = U_ZERO_ERROR;
4476     int pos = 0;
4477     int ruleStatus = 0;
4478
4479     RuleBasedBreakIterator* bi =
4480        // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4481        // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4482        (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
4483     UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4484     // UnicodeString s("Aaa.  Bcd");
4485     s = s.unescape();
4486     bi->setText(s);
4487     UBool r = bi->isBoundary(8);
4488     printf("%s", r?"true":"false");
4489     return;
4490     pos = bi->last();
4491     do {
4492         // ruleStatus = bi->getRuleStatus();
4493         printf("%d\t%d\n", pos, ruleStatus);
4494         pos = bi->previous();
4495     } while (pos != BreakIterator::DONE);
4496 #endif
4497 }
4498
4499 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */