icuSources/test/intltest/rbbitst.cpp

   1 /********************************************************************
   2  * COPYRIGHT:
   3  * Copyright (c) 1999-2006, International Business Machines Corporation and
   4  * others. All Rights Reserved.
   5  ********************************************************************/
   6 /************************************************************************
   7 *   Date        Name        Description
   8 *   12/15/99    Madhu        Creation.
   9 *   01/12/2000  Madhu        Updated for changed API and added new tests
  10 ************************************************************************/
  11
  12 #include "unicode/utypes.h"
  13
  14 #if !UCONFIG_NO_BREAK_ITERATION
  15
  16 #include "unicode/utypes.h"
  17 #include "unicode/brkiter.h"
  18 #include "unicode/rbbi.h"
  19 #include "unicode/uchar.h"
  20 #include "unicode/utf16.h"
  21 #include "unicode/ucnv.h"
  22 #include "unicode/schriter.h"
  23 #include "unicode/uniset.h"
  24 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
  25 #include "unicode/ustring.h"
  26 #include "unicode/utext.h"
  27 #include "intltest.h"
  28 #include "rbbitst.h"
  29 #include <string.h>
  30 #include "uvector.h"
  31 #include "uvectr32.h"
  32 #include "triedict.h"
  33 #include <string.h>
  34 #include <stdio.h>
  35 #include <stdlib.h>
  36
  37 #define TEST_ASSERT(x) {if (!(x)) { \
  38     errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
  39
  40 #define TEST_ASSERT_SUCCESS(errcode) {if (U_FAILURE(errcode)) { \
  41     errln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
  42
  43
  44 //---------------------------------------------------------------------------
  45 //
  46 //   class BITestData   Holds a set of Break iterator test data and results
  47 //                      Includes
  48 //                         - the string data to be broken
  49 //                         - a vector of the expected break positions.
  50 //                         - a vector of source line numbers for the data,
  51 //                               (to help see where errors occured.)
  52 //                         - The expected break tag values.
  53 //                         - Vectors of actual break positions and tag values.
  54 //                         - Functions for comparing actual with expected and
  55 //                            reporting errors.
  56 //
  57 //----------------------------------------------------------------------------
  58 class BITestData {
  59 public:
  60     UnicodeString    fDataToBreak;
  61     UVector          fExpectedBreakPositions;
  62     UVector          fExpectedTags;
  63     UVector          fLineNum;
  64     UVector          fActualBreakPositions;   // Test Results.
  65     UVector          fActualTags;
  66
  67     BITestData(UErrorCode &status);
  68     void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
  69     void             checkResults(const char *heading, RBBITest *test);
  70     void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
  71     void             clearResults();
  72 };
  73
  74 //
  75 // Constructor.
  76 //
  77 BITestData::BITestData(UErrorCode &status)
  78 : fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
  79   fActualTags(status)
  80 {
  81 }
  82
  83 //
  84 // addDataChunk.   Add a section (non-breaking) piece if data to the test data.
  85 //                 The macro form collects the line number, which is helpful
  86 //                 when tracking down failures.
  87 //
  88 //                 A null data item is inserted at the start of each test's data
  89 //                  to put the starting zero into the data list.  The position saved for
  90 //                  each non-null item is its ending position.
  91 //
  92 #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
  93 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
  94     if (U_FAILURE(status)) {return;}
  95     if (data != NULL) {
  96         fDataToBreak.append(CharsToUnicodeString(data));
  97     }
  98     fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
  99     fExpectedTags.addElement(tag, status);
 100     fLineNum.addElement(lineNum, status);
 101 }
 102
 103
 104 //
 105 //  checkResults.   Compare the actual and expected break positions, report any differences.
 106 //
 107 void BITestData::checkResults(const char *heading, RBBITest *test) {
 108     int32_t   expectedIndex = 0;
 109     int32_t   actualIndex = 0;
 110
 111     for (;;) {
 112         // If we've run through both the expected and actual results vectors, we're done.
 113         //   break out of the loop.
 114         if (expectedIndex >= fExpectedBreakPositions.size() &&
 115             actualIndex   >= fActualBreakPositions.size()) {
 116             break;
 117         }
 118
 119
 120         if (expectedIndex >= fExpectedBreakPositions.size()) {
 121             err(heading, test, expectedIndex-1, actualIndex);
 122             actualIndex++;
 123             continue;
 124         }
 125
 126         if (actualIndex >= fActualBreakPositions.size()) {
 127             err(heading, test, expectedIndex, actualIndex-1);
 128             expectedIndex++;
 129             continue;
 130         }
 131
 132         if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
 133             err(heading, test, expectedIndex, actualIndex);
 134             // Try to resync the positions of the indices, to avoid a rash of spurious erros.
 135             if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
 136                 actualIndex++;
 137             } else {
 138                 expectedIndex++;
 139             }
 140             continue;
 141         }
 142
 143         if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
 144             test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
 145                 heading, fLineNum.elementAt(expectedIndex),
 146                 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
 147         }
 148
 149         actualIndex++;
 150         expectedIndex++;
 151     }
 152 }
 153
 154 //
 155 //  err   -  An error was found.  Report it, along with information about where the
 156 //                                incorrectly broken test data appeared in the source file.
 157 //
 158 void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
 159 {
 160     int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
 161     int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
 162     int32_t   o        = 0;
 163     int32_t   line     = fLineNum.elementAti(expectedIdx);
 164     if (expectedIdx > 0) {
 165         // The line numbers are off by one because a premature break occurs somewhere
 166         //    within the previous item, rather than at the start of the current (expected) item.
 167         //    We want to report the offset of the unexpected break from the start of
 168         //      this previous item.
 169         o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
 170     }
 171     if (actual < expected) {
 172         test->errln("%s unexpected break at offset %d in test item from line %d", heading, o, line);
 173     } else {
 174         test->errln("%s Failed to find break at end of item from line %d", heading, line);
 175     }
 176 }
 177
 178
 179 void BITestData::clearResults() {
 180     fActualBreakPositions.removeAllElements();
 181     fActualTags.removeAllElements();
 182 }
 183
 184
 185 //-----------------------------------------------------------------------------------
 186 //
 187 //    Cannned Test Characters
 188 //
 189 //-----------------------------------------------------------------------------------
 190
 191 static const UChar cannedTestArray[] = {
 192     0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,
 193     0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,
 194     0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,
 195     0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,
 196     0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,
 197     0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,
 198     0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,
 199     0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000
 200 };
 201
 202 static UnicodeString* cannedTestChars = 0;
 203
 204 #define  halfNA     "\\u0928\\u094d\\u200d"
 205 #define  halfSA     "\\u0938\\u094d\\u200d"
 206 #define  halfCHA    "\\u091a\\u094d\\u200d"
 207 #define  halfKA     "\\u0915\\u094d\\u200d"
 208 #define  deadTA     "\\u0924\\u094d"
 209
 210 //--------------------------------------------------------------------------------------
 211 //
 212 //    RBBITest    constructor and destructor
 213 //
 214 //--------------------------------------------------------------------------------------
 215
 216 RBBITest::RBBITest() {
 217     UnicodeString temp(cannedTestArray);
 218     cannedTestChars = new UnicodeString();
 219     *cannedTestChars += (UChar)0x0000;
 220     *cannedTestChars += temp;
 221 }
 222
 223
 224 RBBITest::~RBBITest() {
 225     delete cannedTestChars;
 226 }
 227
 228
 229 static const int T_NUMBER = 100;
 230 static const int T_LETTER = 200;
 231 static const int T_H_OR_K = 300;
 232 static const int T_IDEO   = 400;
 233
 234
 235
 236
 237
 238
 239 //--------------------------------------------------------------------
 240 //Testing the BreakIterator for devanagari script
 241 //--------------------------------------------------------------------
 242
 243 #define deadRA   "\\u0930\\u094d"         /*deadform RA = devanagari RA + virama*/
 244 #define deadPHA  "\\u092b\\u094d"         /*deadform PHA = devanagari PHA + virama*/
 245 #define deadTTHA "\\u0920\\u094d"
 246 #define deadPA   "\\u092a\\u094d"
 247 #define deadSA   "\\u0938\\u094d"
 248 #define visarga  "\\u0903"                /*devanagari visarga looks like a english colon*/
 249
 250
 251
 252
 253
 254
 255 //-----------------------------------------------------------------------------------
 256 //
 257 //   Test for status {tag} return value from break rules.
 258 //        TODO:  a more thorough test.
 259 //
 260 //-----------------------------------------------------------------------------------
 261 void RBBITest::TestStatusReturn() {
 262      UnicodeString rulesString1 = "$Letters = [:L:];\n"
 263                                   "$Numbers = [:N:];\n"
 264                                   "$Letters+{1};\n"
 265                                   "$Numbers+{2};\n"
 266                                   "Help\\ {4}/me\\!;\n"
 267                                   "[^$Letters $Numbers];\n"
 268                                   "!.*;\n";
 269      UnicodeString testString1  = "abc123..abc Help me Help me!";
 270                                 // 01234567890123456789012345678
 271      int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
 272      int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
 273
 274      UErrorCode status=U_ZERO_ERROR;
 275      UParseError    parseError;
 276
 277      RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
 278      if(U_FAILURE(status)) {
 279          errln("FAIL : in construction");
 280      } else {
 281          int32_t  pos;
 282          int32_t  i = 0;
 283          bi->setText(testString1);
 284          for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
 285              if (pos != bounds1[i]) {
 286                  errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
 287                  break;
 288              }
 289
 290              int tag = bi->getRuleStatus();
 291              if (tag != brkStatus[i]) {
 292                  errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
 293                  break;
 294              }
 295              i++;
 296          }
 297      }
 298      delete bi;
 299 }
 300
 301
 302 static void printStringBreaks(UnicodeString ustr, int expected[],
 303                               int expectedcount)
 304 {
 305     UErrorCode status = U_ZERO_ERROR;
 306     char name[100];
 307     printf("code    alpha extend alphanum type word sent line name\n");
 308     int j;
 309     for (j = 0; j < ustr.length(); j ++) {
 310         if (expectedcount > 0) {
 311             int k;
 312             for (k = 0; k < expectedcount; k ++) {
 313                 if (j == expected[k]) {
 314                     printf("------------------------------------------------ %d\n",
 315                            j);
 316                 }
 317             }
 318         }
 319         UChar32 c = ustr.char32At(j);
 320         if (c > 0xffff) {
 321             j ++;
 322         }
 323         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
 324         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
 325                            u_isUAlphabetic(c),
 326                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
 327                            u_isalnum(c),
 328                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
 329                                                   u_charType(c),
 330                                                   U_SHORT_PROPERTY_NAME),
 331                            u_getPropertyValueName(UCHAR_WORD_BREAK,
 332                                                   u_getIntPropertyValue(c,
 333                                                           UCHAR_WORD_BREAK),
 334                                                   U_SHORT_PROPERTY_NAME),
 335                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
 336                                    u_getIntPropertyValue(c,
 337                                            UCHAR_SENTENCE_BREAK),
 338                                    U_SHORT_PROPERTY_NAME),
 339                            u_getPropertyValueName(UCHAR_LINE_BREAK,
 340                                    u_getIntPropertyValue(c,
 341                                            UCHAR_LINE_BREAK),
 342                                    U_SHORT_PROPERTY_NAME),
 343                            name);
 344     }
 345 }
 346
 347 void RBBITest::TestThaiLineBreak() {
 348     UErrorCode status = U_ZERO_ERROR;
 349     BITestData thaiLineSelection(status);
 350
 351     // \u0e2f-- the Thai paiyannoi character-- isn't a letter.  It's a symbol that
 352     // represents elided letters at the end of a long word.  It should be bound to
 353     // the end of the word and not treated as an independent punctuation mark.
 354
 355
 356     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
 357     ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status);
 358     ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status);
 359     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status);
 360     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status);
 361 //        ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status);
 362 //        ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
 363     ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status);
 364     // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us
 365     ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status);
 366     ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status);
 367     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status);
 368     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status);
 369     ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status);
 370     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status);
 371
 372     // the one time where the paiyannoi occurs somewhere other than at the end
 373     // of a word is in the Thai abbrevation for "etc.", which both begins and
 374     // ends with a paiyannoi
 375     ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status);
 376     ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
 377     ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status);
 378
 379     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
 380         Locale("th"), status);
 381     if (U_FAILURE(status))
 382     {
 383         errln("Failed to create the BreakIterator for Thai locale in TestThaiLineBreak.\n");
 384         return;
 385     }
 386
 387     generalIteratorTest(*e, thaiLineSelection);
 388     delete e;
 389 }
 390
 391
 392
 393 void RBBITest::TestMixedThaiLineBreak()
 394 {
 395     UErrorCode   status = U_ZERO_ERROR;
 396     BITestData   thaiLineSelection(status);
 397
 398     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
 399
 400
 401     // @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters
 402     // start
 403
 404     ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
 405     ADD_DATACHUNK(thaiLineSelection, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status);
 406     ADD_DATACHUNK(thaiLineSelection, "2545 ", 0, status);
 407     ADD_DATACHUNK(thaiLineSelection, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status);
 408     ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
 409     ADD_DATACHUNK(thaiLineSelection, "\\u0E09\\u0E25\\u0E2D\\u0E07", 0, status);
 410     ADD_DATACHUNK(thaiLineSelection, "\\u0E04\\u0E23\\u0E1A", 0, status);
 411     ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E2D\\u0E1A ", 0, status);
 412     ADD_DATACHUNK(thaiLineSelection, "\"\\u0E52\\u0E52\\u0E50 ", 0, status);
 413     ADD_DATACHUNK(thaiLineSelection, "\\u0E1b\\u0E35\" ", 0, status);
 414     ADD_DATACHUNK(thaiLineSelection, "\\u0E02\\u0E2d\\u0E07", 0, status);
 415     ADD_DATACHUNK(thaiLineSelection, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status);
 416     ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status);
 417     ADD_DATACHUNK(thaiLineSelection, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status);
 418     ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status);
 419     ADD_DATACHUNK(thaiLineSelection, "Bangkok)", 0, status);
 420
 421     // @suwit - end of changes
 422
 423
 424     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status);
 425     if (U_FAILURE(status))
 426     {
 427         errln("Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak.\n");
 428         return;
 429     }
 430
 431
 432     generalIteratorTest(*e, thaiLineSelection);
 433     delete e;
 434 }
 435
 436
 437 void RBBITest::TestMaiyamok()
 438 {
 439     UErrorCode status = U_ZERO_ERROR;
 440     BITestData   thaiLineSelection(status);
 441     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
 442     // the Thai maiyamok character is a shorthand symbol that means "repeat the previous
 443     // word".  Instead of appearing as a word unto itself, however, it's kept together
 444     // with the word before it
 445     ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status);
 446     ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status);
 447     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status);
 448     ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status);
 449     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e1e", 0, status);
 450     ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status);
 451     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35", 0, status);
 452     ADD_DATACHUNK(thaiLineSelection, "\\u0e22\\u0e07", 0, status);
 453     ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
 454
 455     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
 456         Locale("th"), status);
 457
 458     if (U_FAILURE(status))
 459     {
 460         errln("Failed to create the BreakIterator for Thai locale in TestMaiyamok.\n");
 461         return;
 462     }
 463     generalIteratorTest(*e, thaiLineSelection);
 464     delete e;
 465 }
 466
 467
 468
 469 void RBBITest::TestBug3818() {
 470     UErrorCode  status = U_ZERO_ERROR;
 471
 472     // Four Thai words...
 473     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
 474                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
 475     UnicodeString  thaiStr(thaiWordData);
 476
 477     RuleBasedBreakIterator* bi =
 478         (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
 479     if (U_FAILURE(status) || bi == NULL) {
 480         errln("Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
 481         return;
 482     }
 483     bi->setText(thaiStr);
 484
 485     int32_t  startOfSecondWord = bi->following(1);
 486     if (startOfSecondWord != 4) {
 487         errln("Fail at file %s, line %d expected start of word at 4, got %d",
 488             __FILE__, __LINE__, startOfSecondWord);
 489     }
 490     startOfSecondWord = bi->following(0);
 491     if (startOfSecondWord != 4) {
 492         errln("Fail at file %s, line %d expected start of word at 4, got %d",
 493             __FILE__, __LINE__, startOfSecondWord);
 494     }
 495     delete bi;
 496 }
 497
 498
 499 void RBBITest::TestJapaneseWordBreak() {
 500     UErrorCode status = U_ZERO_ERROR;
 501     BITestData   japaneseWordSelection(status);
 502
 503     ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status);           // Break at start of data
 504     ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2
 505     ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5
 506     ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7
 507     ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10
 508     ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11
 509     ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12
 510
 511     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(
 512         Locale("ja"), status);
 513     if (U_FAILURE(status))
 514     {
 515         errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");
 516         return;
 517     }
 518
 519     generalIteratorTest(*e, japaneseWordSelection);
 520     delete e;
 521 }
 522
 523 void RBBITest::TestTrieDict() {
 524     UErrorCode      status  = U_ZERO_ERROR;
 525
 526     //
 527     //  Open and read the test data file.
 528     //
 529     const char *testDataDirectory = IntlTest::getSourceTestData(status);
 530     char testFileName[1000];
 531     if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen("riwords.txt") + 10 >= sizeof(testFileName)) {
 532         errln("Can't open test data.  Path too long.");
 533         return;
 534     }
 535     strcpy(testFileName, testDataDirectory);
 536     strcat(testFileName, "riwords.txt");
 537
 538     // Items needing deleting at the end
 539     MutableTrieDictionary *mutableDict = NULL;
 540     CompactTrieDictionary *compactDict = NULL;
 541     UnicodeSet            *breaks      = NULL;
 542     UChar                 *testFile    = NULL;
 543     StringEnumeration     *enumer      = NULL;
 544     MutableTrieDictionary *mutable2    = NULL;
 545     StringEnumeration     *cloneEnum   = NULL;
 546     CompactTrieDictionary *compact2    = NULL;
 547
 548
 549     const UnicodeString *originalWord = NULL;
 550     const UnicodeString *cloneWord    = NULL;
 551     UChar *current;
 552     UChar *word;
 553     UChar uc;
 554     int32_t wordLen;
 555     int32_t wordCount;
 556     int32_t testCount;
 557
 558     int    len;
 559     testFile = ReadAndConvertFile(testFileName, len, status);
 560     if (U_FAILURE(status)) {
 561         goto cleanup; /* something went wrong, error already output */
 562     }
 563
 564     mutableDict = new MutableTrieDictionary(0x0E1C, status);
 565     if (U_FAILURE(status)) {
 566         errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
 567         goto cleanup;
 568     }
 569
 570     breaks = new UnicodeSet;
 571     breaks->add(0x000A);     // Line Feed
 572     breaks->add(0x000D);     // Carriage Return
 573     breaks->add(0x2028);     // Line Separator
 574     breaks->add(0x2029);     // Paragraph Separator
 575
 576     // Now add each non-comment line of the file as a word.
 577     current = testFile;
 578     word = current;
 579     uc = *current++;
 580     wordLen = 0;
 581     wordCount = 0;
 582
 583     while (uc) {
 584         if (uc == 0x0023) {     // #comment line, skip
 585             while (uc && !breaks->contains(uc)) {
 586                 uc = *current++;
 587             }
 588         }
 589         else while (uc && !breaks->contains(uc)) {
 590             ++wordLen;
 591             uc = *current++;
 592         }
 593         if (wordLen > 0) {
 594             mutableDict->addWord(word, wordLen, status);
 595             if (U_FAILURE(status)) {
 596                 errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
 597                 goto cleanup;
 598             }
 599             wordCount += 1;
 600         }
 601
 602         // Find beginning of next line
 603         while (uc && breaks->contains(uc)) {
 604             uc = *current++;
 605         }
 606         word = current-1;
 607         wordLen = 0;
 608     }
 609
 610     if (wordCount < 50) {
 611         errln("Word count (%d) unreasonably small\n", wordCount);
 612         goto cleanup;
 613     }
 614
 615     enumer = mutableDict->openWords(status);
 616     if (U_FAILURE(status)) {
 617         errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
 618         goto cleanup;
 619     }
 620
 621     testCount = 0;
 622     if (wordCount != (testCount = enumer->count(status))) {
 623         errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
 624             testCount, wordCount, u_errorName(status));
 625         goto cleanup;
 626     }
 627
 628     delete enumer;
 629     enumer = NULL;
 630
 631     // Now compact it
 632     compactDict = new CompactTrieDictionary(*mutableDict, status);
 633     if (U_FAILURE(status)) {
 634         errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
 635         goto cleanup;
 636     }
 637
 638     enumer = compactDict->openWords(status);
 639     if (U_FAILURE(status)) {
 640         errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
 641         goto cleanup;
 642     }
 643
 644     if (wordCount != (testCount = enumer->count(status))) {
 645         errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
 646             testCount, wordCount, u_errorName(status));
 647         goto cleanup;
 648     }
 649
 650     delete enumer;
 651     enumer = NULL;
 652
 653     // Now un-compact it
 654     mutable2 = compactDict->cloneMutable(status);
 655     if (U_FAILURE(status)) {
 656         errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
 657         goto cleanup;
 658     }
 659
 660     cloneEnum = mutable2->openWords(status);
 661     if (U_FAILURE(status)) {
 662         errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
 663         goto cleanup;
 664     }
 665
 666     if (wordCount != (testCount = cloneEnum->count(status))) {
 667         errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
 668             testCount, wordCount, u_errorName(status));
 669         goto cleanup;
 670     }
 671
 672     // Compact original dictionary to clone. Note that we can only compare the same kind of
 673     // dictionary as the order of the enumerators is not guaranteed to be the same between
 674     // different kinds
 675     enumer = mutableDict->openWords(status);
 676     if (U_FAILURE(status)) {
 677         errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
 678         goto cleanup;
 679      }
 680
 681     originalWord = enumer->snext(status);
 682     cloneWord = cloneEnum->snext(status);
 683     while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
 684         if (*originalWord != *cloneWord) {
 685             errln("Original and cloned MutableTrieDictionary word mismatch\n");
 686             goto cleanup;
 687         }
 688         originalWord = enumer->snext(status);
 689         cloneWord = cloneEnum->snext(status);
 690     }
 691
 692     if (U_FAILURE(status)) {
 693         errln("Enumeration failed: %s\n", u_errorName(status));
 694         goto cleanup;
 695     }
 696
 697     if (originalWord != cloneWord) {
 698         errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
 699         goto cleanup;
 700     }
 701
 702     // Test the data copying constructor for CompactTrieDict, and the data access APIs.
 703     compact2 = new CompactTrieDictionary(compactDict->data(), status);
 704     if (U_FAILURE(status)) {
 705         errln("CompactTrieDictionary(const void *,...) failed\n");
 706         goto cleanup;
 707     }
 708
 709     if (compact2->dataSize() == 0) {
 710         errln("CompactTrieDictionary->dataSize() == 0\n");
 711         goto cleanup;
 712     }
 713
 714     // Now count the words via the second dictionary
 715     delete enumer;
 716     enumer = compact2->openWords(status);
 717     if (U_FAILURE(status)) {
 718         errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
 719         goto cleanup;
 720     }
 721
 722     if (wordCount != (testCount = enumer->count(status))) {
 723         errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
 724             testCount, wordCount, u_errorName(status));
 725         goto cleanup;
 726     }
 727
 728 cleanup:
 729     delete compactDict;
 730     delete mutableDict;
 731     delete breaks;
 732     delete[] testFile;
 733     delete enumer;
 734     delete mutable2;
 735     delete cloneEnum;
 736     delete compact2;
 737 }
 738
 739 //---------------------------------------------
 740 // runIndexedTest
 741 //---------------------------------------------
 742
 743 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
 744 {
 745     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
 746
 747     switch (index) {
 748         case 0: name = "TestBug4153072";
 749             if(exec) TestBug4153072();                         break;
 750         case 1: name = "TestJapaneseLineBreak";
 751             if(exec) TestJapaneseLineBreak();                 break;
 752         case 2: name = "TestStatusReturn";
 753             if(exec) TestStatusReturn();                       break;
 754
 755         case 3: name = "TestLineBreakData";
 756             if(exec) TestLineBreakData();                      break;
 757         case 4: name = "TestEmptyString";
 758             if(exec) TestEmptyString();                        break;
 759
 760         case 5: name = "TestGetAvailableLocales";
 761             if(exec) TestGetAvailableLocales();                break;
 762
 763         case 6: name = "TestGetDisplayName";
 764             if(exec) TestGetDisplayName();                     break;
 765
 766         case 7: name = "TestEndBehaviour";
 767             if(exec) TestEndBehaviour();                       break;
 768         case 8: name = "TestMixedThaiLineBreak";
 769              if(exec) TestMixedThaiLineBreak();                break;
 770         case 9: name = "TestThaiLineBreak";
 771              if(exec) TestThaiLineBreak();                     break;
 772         case 10: name = "TestMaiyamok";
 773              if(exec) TestMaiyamok();                          break;
 774         case 11: name = "TestWordBreaks";
 775              if(exec) TestWordBreaks();                        break;
 776         case 12: name = "TestWordBoundary";
 777              if(exec) TestWordBoundary();                      break;
 778         case 13: name = "TestLineBreaks";
 779              if(exec) TestLineBreaks();                        break;
 780         case 14: name = "TestSentBreaks";
 781              if(exec) TestSentBreaks();                        break;
 782         case 15: name = "TestExtended";
 783              if(exec) TestExtended();                          break;
 784         case 16: name = "TestMonkey";
 785              if(exec) {
 786  #if !UCONFIG_NO_REGULAR_EXPRESSIONS
 787                TestMonkey(params);
 788  #else
 789                logln("skipping TestMonkey (UCONFIG_NO_REGULAR_EXPRESSIONS)");
 790  #endif
 791              }
 792                                                                break;
 793         case 17: name = "TestBug3818";
 794             if(exec) TestBug3818();                            break;
 795         case 18: name = "TestJapaneseWordBreak";
 796             if(exec) TestJapaneseWordBreak();                  break;
 797         case 19: name = "TestDebug";
 798             if(exec) TestDebug();                              break;
 799         case 20: name = "TestTrieDict";
 800             if(exec) TestTrieDict();                           break;
 801
 802         default: name = ""; break; //needed to end loop
 803     }
 804 }
 805
 806
 807 //----------------------------------------------------------------------------
 808 //
 809 // generalIteratorTest      Given a break iterator and a set of test data,
 810 //                          Run the tests and report the results.
 811 //
 812 //----------------------------------------------------------------------------
 813 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
 814 {
 815
 816     bi.setText(td.fDataToBreak);
 817
 818     testFirstAndNext(bi, td);
 819
 820     testLastAndPrevious(bi, td);
 821
 822     testFollowing(bi, td);
 823     testPreceding(bi, td);
 824     testIsBoundary(bi, td);
 825     doMultipleSelectionTest(bi, td);
 826 }
 827
 828
 829 //
 830 //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
 831 //                       kind of loop.
 832 //
 833 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
 834 {
 835     UErrorCode  status = U_ZERO_ERROR;
 836     int32_t     p;
 837     int32_t     lastP = -1;
 838     int32_t     tag;
 839
 840     logln("Test first and next");
 841     bi.setText(td.fDataToBreak);
 842     td.clearResults();
 843
 844     for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
 845         td.fActualBreakPositions.addElement(p, status);  // Save result.
 846         tag = bi.getRuleStatus();
 847         td.fActualTags.addElement(tag, status);
 848         if (p <= lastP) {
 849             // If the iterator is not making forward progress, stop.
 850             //  No need to raise an error here, it'll be detected in the normal check of results.
 851             break;
 852         }
 853         lastP = p;
 854     }
 855     td.checkResults("testFirstAndNext", this);
 856 }
 857
 858
 859 //
 860 //  TestLastAndPrevious.   Run the iterator backwards, starting with last().
 861 //
 862 void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
 863 {
 864     UErrorCode  status = U_ZERO_ERROR;
 865     int32_t     p;
 866     int32_t     lastP  = 0x7ffffffe;
 867     int32_t     tag;
 868
 869     logln("Test first and next");
 870     bi.setText(td.fDataToBreak);
 871     td.clearResults();
 872
 873     for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
 874         // Save break position.  Insert it at start of vector of results, shoving
 875         //    already-saved results further towards the end.
 876         td.fActualBreakPositions.insertElementAt(p, 0, status);
 877         // bi.previous();   // TODO:  Why does this fix things up????
 878         // bi.next();
 879         tag = bi.getRuleStatus();
 880         td.fActualTags.insertElementAt(tag, 0, status);
 881         if (p >= lastP) {
 882             // If the iterator is not making progress, stop.
 883             //  No need to raise an error here, it'll be detected in the normal check of results.
 884             break;
 885         }
 886         lastP = p;
 887     }
 888     td.checkResults("testLastAndPrevious", this);
 889 }
 890
 891
 892 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
 893 {
 894     UErrorCode  status = U_ZERO_ERROR;
 895     int32_t     p;
 896     int32_t     tag;
 897     int32_t     lastP  = -2;     // A value that will never be returned as a break position.
 898                                  //   cannot be -1; that is returned for DONE.
 899     int         i;
 900
 901     logln("testFollowing():");
 902     bi.setText(td.fDataToBreak);
 903     td.clearResults();
 904
 905     // Save the starting point, since we won't get that out of following.
 906     p = bi.first();
 907     td.fActualBreakPositions.addElement(p, status);  // Save result.
 908     tag = bi.getRuleStatus();
 909     td.fActualTags.addElement(tag, status);
 910
 911     for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
 912         p = bi.following(i);
 913         if (p != lastP) {
 914             if (p == RuleBasedBreakIterator::DONE) {
 915                 break;
 916             }
 917             // We've reached a new break position.  Save it.
 918             td.fActualBreakPositions.addElement(p, status);  // Save result.
 919             tag = bi.getRuleStatus();
 920             td.fActualTags.addElement(tag, status);
 921             lastP = p;
 922         }
 923     }
 924     // The loop normally exits by means of the break in the middle.
 925     // Make sure that the index was at the correct position for the break iterator to have
 926     //   returned DONE.
 927     if (i != td.fDataToBreak.length()) {
 928         errln("testFollowing():  iterator returned DONE prematurely.");
 929     }
 930
 931     // Full check of all results.
 932     td.checkResults("testFollowing", this);
 933 }
 934
 935
 936
 937 void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
 938     UErrorCode  status = U_ZERO_ERROR;
 939     int32_t     p;
 940     int32_t     tag;
 941     int32_t     lastP  = 0x7ffffffe;
 942     int         i;
 943
 944     logln("testPreceding():");
 945     bi.setText(td.fDataToBreak);
 946     td.clearResults();
 947
 948     p = bi.last();
 949     td.fActualBreakPositions.addElement(p, status);
 950     tag = bi.getRuleStatus();
 951     td.fActualTags.addElement(tag, status);
 952
 953     for (i = td.fDataToBreak.length(); i>=-1; i--) {
 954         p = bi.preceding(i);
 955         if (p != lastP) {
 956             if (p == RuleBasedBreakIterator::DONE) {
 957                 break;
 958             }
 959             // We've reached a new break position.  Save it.
 960             td.fActualBreakPositions.insertElementAt(p, 0, status);
 961             lastP = p;
 962             tag = bi.getRuleStatus();
 963             td.fActualTags.insertElementAt(tag, 0, status);
 964         }
 965     }
 966     // The loop normally exits by means of the break in the middle.
 967     // Make sure that the index was at the correct position for the break iterator to have
 968     //   returned DONE.
 969     if (i != 0) {
 970         errln("testPreceding():  iterator returned DONE prematurely.");
 971     }
 972
 973     // Full check of all results.
 974     td.checkResults("testPreceding", this);
 975 }
 976
 977
 978
 979 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
 980     UErrorCode  status = U_ZERO_ERROR;
 981     int         i;
 982     int32_t     tag;
 983
 984     logln("testIsBoundary():");
 985     bi.setText(td.fDataToBreak);
 986     td.clearResults();
 987
 988     for (i = 0; i <= td.fDataToBreak.length(); i++) {
 989         if (bi.isBoundary(i)) {
 990             td.fActualBreakPositions.addElement(i, status);  // Save result.
 991             tag = bi.getRuleStatus();
 992             td.fActualTags.addElement(tag, status);
 993         }
 994     }
 995     td.checkResults("testIsBoundary: ", this);
 996 }
 997
 998
 999
1000 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
1001 {
1002     iterator.setText(td.fDataToBreak);
1003
1004     RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
1005     int32_t offset = iterator.first();
1006     int32_t testOffset;
1007     int32_t count = 0;
1008
1009     logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
1010
1011     if (*testIterator != iterator)
1012         errln("clone() or operator!= failed: two clones compared unequal");
1013
1014     do {
1015         testOffset = testIterator->first();
1016         testOffset = testIterator->next(count);
1017         if (offset != testOffset)
1018             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
1019
1020         if (offset != RuleBasedBreakIterator::DONE) {
1021             count++;
1022             offset = iterator.next();
1023
1024             if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
1025                 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
1026                 if (count > 10000 || offset == -1) {
1027                     errln("operator== failed too many times. Stopping test.");
1028                     if (offset == -1) {
1029                         errln("Does (RuleBasedBreakIterator::DONE == -1)?");
1030                     }
1031                     return;
1032                 }
1033             }
1034         }
1035     } while (offset != RuleBasedBreakIterator::DONE);
1036
1037     // now do it backwards...
1038     offset = iterator.last();
1039     count = 0;
1040
1041     do {
1042         testOffset = testIterator->last();
1043         testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
1044         if (offset != testOffset)
1045             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
1046
1047         if (offset != RuleBasedBreakIterator::DONE) {
1048             count--;
1049             offset = iterator.previous();
1050         }
1051     } while (offset != RuleBasedBreakIterator::DONE);
1052
1053     delete testIterator;
1054 }
1055
1056
1057 //---------------------------------------------
1058 //
1059 //     other tests
1060 //
1061 //---------------------------------------------
1062 void RBBITest::TestEmptyString()
1063 {
1064     UnicodeString text = "";
1065     UErrorCode status = U_ZERO_ERROR;
1066
1067     BITestData x(status);
1068     ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
1069     RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
1070     if (U_FAILURE(status))
1071     {
1072         errln("Failed to create the BreakIterator for default locale in TestEmptyString.\n");
1073         return;
1074     }
1075     generalIteratorTest(*bi, x);
1076     delete bi;
1077 }
1078
1079 void RBBITest::TestGetAvailableLocales()
1080 {
1081     int32_t locCount = 0;
1082     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
1083
1084     if (locCount == 0)
1085         errln("getAvailableLocales() returned an empty list!");
1086     // Just make sure that it's returning good memory.
1087     int32_t i;
1088     for (i = 0; i < locCount; ++i) {
1089         logln(locList[i].getName());
1090     }
1091 }
1092
1093 //Testing the BreakIterator::getDisplayName() function
1094 void RBBITest::TestGetDisplayName()
1095 {
1096     UnicodeString   result;
1097
1098     BreakIterator::getDisplayName(Locale::getUS(), result);
1099     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
1100         errln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
1101                 + result);
1102
1103     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
1104     if (result != "French (France)")
1105         errln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
1106                 + result);
1107 }
1108 /**
1109  * Test End Behaviour
1110  * @bug 4068137
1111  */
1112 void RBBITest::TestEndBehaviour()
1113 {
1114     UErrorCode status = U_ZERO_ERROR;
1115     UnicodeString testString("boo.");
1116     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
1117     if (U_FAILURE(status))
1118     {
1119         errln("Failed to create the BreakIterator for default locale in TestEndBehaviour.\n");
1120         return;
1121     }
1122     wb->setText(testString);
1123
1124     if (wb->first() != 0)
1125         errln("Didn't get break at beginning of string.");
1126     if (wb->next() != 3)
1127         errln("Didn't get break before period in \"boo.\"");
1128     if (wb->current() != 4 && wb->next() != 4)
1129         errln("Didn't get break at end of string.");
1130     delete wb;
1131 }
1132 /*
1133  * @bug 4153072
1134  */
1135 void RBBITest::TestBug4153072() {
1136     UErrorCode status = U_ZERO_ERROR;
1137     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
1138     if (U_FAILURE(status))
1139     {
1140         errln("Failed to create the BreakIterator for default locale in TestBug4153072\n");
1141         return;
1142     }
1143     UnicodeString str("...Hello, World!...");
1144     int32_t begin = 3;
1145     int32_t end = str.length() - 3;
1146     UBool onBoundary;
1147
1148     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
1149     iter->adoptText(textIterator);
1150     int index;
1151     // Note: with the switch to UText, there is no way to restrict the
1152     //       iteration range to begin at an index other than zero.
1153     //       String character iterators created with a non-zero bound are
1154     //         treated by RBBI as being empty.
1155     for (index = -1; index < begin + 1; ++index) {
1156         onBoundary = iter->isBoundary(index);
1157         if (index == 0?  !onBoundary : onBoundary) {
1158             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
1159                             " and begin index = " + begin);
1160         }
1161     }
1162     delete iter;
1163 }
1164
1165
1166 /**
1167  * Test Japanese Line Break
1168  * @bug 4095322
1169  */
1170 void RBBITest::TestJapaneseLineBreak()
1171 {
1172 #if 0
1173     // Test needs updating some more...   Dump it for now.
1174
1175
1176     // Change for Unicode TR 14:  Punctuation characters with categories Pi and Pf do not count
1177     //        as opening and closing punctuation for line breaking.
1178     //        Also, \u30fc and \u30fe are not counted as hyphens.   Remove these chars
1179     //        from these tests.    6-13-2002
1180     //
1181     UErrorCode status = U_ZERO_ERROR;
1182     UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c");
1183     UnicodeString precedingChars = CharsToUnicodeString(
1184         //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
1185         "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");
1186     UnicodeString followingChars = CharsToUnicodeString(
1187         // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"
1188         ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"
1189         // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"
1190         ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"
1191         "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");
1192     BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status);
1193
1194     int32_t i;
1195     if (U_FAILURE(status))
1196     {
1197         errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");
1198         return;
1199     }
1200
1201     for (i = 0; i < precedingChars.length(); i++) {
1202         testString.setCharAt(1, precedingChars[i]);
1203         iter->setText(testString);
1204         int32_t j = iter->first();
1205         if (j != 0)
1206             errln("ja line break failure: failed to start at 0");
1207         j = iter->next();
1208         if (j != 1)
1209             errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i])
1210                         + "' (" + ((int)(precedingChars[i])) + ")");
1211         j = iter->next();
1212         if (j != 3)
1213             errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i])
1214                         + "' (" + ((int)(precedingChars[i])) + ")");
1215     }
1216
1217     for (i = 0; i < followingChars.length(); i++) {
1218         testString.setCharAt(1, followingChars[i]);
1219         iter->setText(testString);
1220         int j = iter->first();
1221         if (j != 0)
1222             errln("ja line break failure: failed to start at 0");
1223         j = iter->next();
1224         if (j != 2)
1225             errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i])
1226                         + "' (" + ((int)(followingChars[i])) + ")");
1227         j = iter->next();
1228         if (j != 3)
1229             errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i])
1230                         + "' (" + ((int)(followingChars[i])) + ")");
1231     }
1232     delete iter;
1233 #endif
1234 }
1235
1236
1237 //------------------------------------------------------------------------------
1238 //
1239 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
1240 //
1241 //------------------------------------------------------------------------------
1242
1243 struct TestParams {
1244     BreakIterator   *bi;
1245     UnicodeString    dataToBreak;
1246     UVector32       *expectedBreaks;
1247     UVector32       *srcLine;
1248     UVector32       *srcCol;
1249 };
1250
1251 void RBBITest::executeTest(TestParams *t) {
1252     int32_t    bp;
1253     int32_t    prevBP;
1254     int32_t    i;
1255
1256     if (t->bi == NULL) {
1257         return;
1258     }
1259
1260     t->bi->setText(t->dataToBreak);
1261     //
1262     //  Run the iterator forward
1263     //
1264     prevBP = -1;
1265     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
1266         if (prevBP ==  bp) {
1267             // Fail for lack of forward progress.
1268             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
1269                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1270             break;
1271         }
1272
1273         // Check that there were we didn't miss an expected break between the last one
1274         //  and this one.
1275         for (i=prevBP+1; i<bp; i++) {
1276             if (t->expectedBreaks->elementAti(i) != 0) {
1277                 int expected[] = {0, i};
1278                 printStringBreaks(t->dataToBreak, expected, 2);
1279                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1280                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1281             }
1282         }
1283
1284         // Check that the break we did find was expected
1285         if (t->expectedBreaks->elementAti(bp) == 0) {
1286             int expected[] = {0, bp};
1287             printStringBreaks(t->dataToBreak, expected, 2);
1288             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1289                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1290         } else {
1291             // The break was expected.
1292             //   Check that the {nnn} tag value is correct.
1293             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
1294             if (expectedTagVal == -1) {
1295                 expectedTagVal = 0;
1296             }
1297             int32_t line = t->srcLine->elementAti(bp);
1298             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1299             if (rs != expectedTagVal) {
1300                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
1301                       "          Actual, Expected status = %4d, %4d",
1302                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
1303             }
1304         }
1305
1306
1307         prevBP = bp;
1308     }
1309
1310     // Verify that there were no missed expected breaks after the last one found
1311     for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
1312         if (t->expectedBreaks->elementAti(i) != 0) {
1313             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1314                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1315         }
1316     }
1317
1318     //
1319     //  Run the iterator backwards, verify that the same breaks are found.
1320     //
1321     prevBP = t->dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
1322     for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
1323         if (prevBP ==  bp) {
1324             // Fail for lack of progress.
1325             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
1326                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1327             break;
1328         }
1329
1330         // Check that there were we didn't miss an expected break between the last one
1331         //  and this one.  (UVector returns zeros for index out of bounds.)
1332         for (i=prevBP-1; i>bp; i--) {
1333             if (t->expectedBreaks->elementAti(i) != 0) {
1334                 errln("Reverse Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1335                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1336             }
1337         }
1338
1339         // Check that the break we did find was expected
1340         if (t->expectedBreaks->elementAti(bp) == 0) {
1341             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1342                    bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1343         } else {
1344             // The break was expected.
1345             //   Check that the {nnn} tag value is correct.
1346             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
1347             if (expectedTagVal == -1) {
1348                 expectedTagVal = 0;
1349             }
1350             int line = t->srcLine->elementAti(bp);
1351             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1352             if (rs != expectedTagVal) {
1353                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
1354                       "          Actual, Expected status = %4d, %4d",
1355                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
1356             }
1357         }
1358
1359         prevBP = bp;
1360     }
1361
1362     // Verify that there were no missed breaks prior to the last one found
1363     for (i=prevBP-1; i>=0; i--) {
1364         if (t->expectedBreaks->elementAti(i) != 0) {
1365             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1366                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1367         }
1368     }
1369 }
1370
1371
1372 void RBBITest::TestExtended() {
1373 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1374     UErrorCode      status  = U_ZERO_ERROR;
1375     Locale          locale("");
1376
1377     UnicodeString       rules;
1378     TestParams          tp;
1379     tp.bi             = NULL;
1380     tp.expectedBreaks = new UVector32(status);
1381     tp.srcLine        = new UVector32(status);
1382     tp.srcCol         = new UVector32(status);
1383
1384     RegexMatcher      localeMatcher("<locale *([\\p{L}\\p{Nd}_]*) *>", 0, status);
1385     TEST_ASSERT_SUCCESS(status);
1386
1387
1388     //
1389     //  Open and read the test data file.
1390     //
1391     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1392     char testFileName[1000];
1393     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1394         errln("Can't open test data.  Path too long.");
1395         return;
1396     }
1397     strcpy(testFileName, testDataDirectory);
1398     strcat(testFileName, "rbbitst.txt");
1399
1400     int    len;
1401     UChar *testFile = ReadAndConvertFile(testFileName, len, status);
1402     if (U_FAILURE(status)) {
1403         return; /* something went wrong, error already output */
1404     }
1405
1406
1407
1408     //
1409     //  Put the test data into a UnicodeString
1410     //
1411     UnicodeString testString(FALSE, testFile, len);
1412
1413     enum EParseState{
1414         PARSE_COMMENT,
1415         PARSE_TAG,
1416         PARSE_DATA,
1417         PARSE_NUM
1418     }
1419     parseState = PARSE_TAG;
1420
1421     EParseState savedState = PARSE_TAG;
1422
1423     static const UChar CH_LF        = 0x0a;
1424     static const UChar CH_CR        = 0x0d;
1425     static const UChar CH_HASH      = 0x23;
1426     /*static const UChar CH_PERIOD    = 0x2e;*/
1427     static const UChar CH_LT        = 0x3c;
1428     static const UChar CH_GT        = 0x3e;
1429     static const UChar CH_BACKSLASH = 0x5c;
1430     static const UChar CH_BULLET    = 0x2022;
1431
1432     int32_t    lineNum  = 1;
1433     int32_t    colStart = 0;
1434     int32_t    column   = 0;
1435     int32_t    charIdx  = 0;
1436
1437     int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
1438
1439     for (charIdx = 0; charIdx < len; ) {
1440         status = U_ZERO_ERROR;
1441         UChar  c = testString.charAt(charIdx);
1442         charIdx++;
1443         if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1444             // treat CRLF as a unit
1445             c = CH_LF;
1446             charIdx++;
1447         }
1448         if (c == CH_LF || c == CH_CR) {
1449             lineNum++;
1450             colStart = charIdx;
1451         }
1452         column = charIdx - colStart + 1;
1453
1454         switch (parseState) {
1455         case PARSE_COMMENT:
1456             if (c == 0x0a || c == 0x0d) {
1457                 parseState = savedState;
1458             }
1459             break;
1460
1461         case PARSE_TAG:
1462             {
1463             if (c == CH_HASH) {
1464                 parseState = PARSE_COMMENT;
1465                 savedState = PARSE_TAG;
1466                 break;
1467             }
1468             if (u_isUWhiteSpace(c)) {
1469                 break;
1470             }
1471             if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1472                 delete tp.bi;
1473                 tp.bi = BreakIterator::createWordInstance(locale,  status);
1474                 charIdx += 5;
1475                 break;
1476             }
1477             if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1478                 delete tp.bi;
1479                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
1480                 charIdx += 5;
1481                 break;
1482             }
1483             if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1484                 delete tp.bi;
1485                 tp.bi = BreakIterator::createLineInstance(locale,  status);
1486                 charIdx += 5;
1487                 break;
1488             }
1489             if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1490                 delete tp.bi;
1491                 tp.bi = NULL;
1492                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
1493                 charIdx += 5;
1494                 break;
1495             }
1496             if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1497                 delete tp.bi;
1498                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
1499                 charIdx += 6;
1500                 break;
1501             }
1502             // <locale  loc_name>
1503             localeMatcher.reset(testString);
1504             if (localeMatcher.lookingAt(charIdx-1, status)) {
1505                 UnicodeString localeName = localeMatcher.group(1, status);
1506                 char localeName8[100];
1507                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1508                 locale = Locale::createFromName(localeName8);
1509                 charIdx += localeMatcher.group(0, status).length();
1510                 TEST_ASSERT_SUCCESS(status);
1511                 break;
1512             }
1513             if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1514                 parseState = PARSE_DATA;
1515                 charIdx += 5;
1516                 tp.dataToBreak = "";
1517                 tp.expectedBreaks->removeAllElements();
1518                 tp.srcCol ->removeAllElements();
1519                 tp.srcLine->removeAllElements();
1520                 break;
1521             }
1522
1523             errln("line %d: Tag expected in test file.", lineNum);
1524             goto end_test;
1525             parseState = PARSE_COMMENT;
1526             savedState = PARSE_DATA;
1527             }
1528             break;
1529
1530         case PARSE_DATA:
1531             if (c == CH_BULLET) {
1532                 int32_t  breakIdx = tp.dataToBreak.length();
1533                 tp.expectedBreaks->setSize(breakIdx+1);
1534                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1535                 tp.srcLine->setSize(breakIdx+1);
1536                 tp.srcLine->setElementAt(lineNum, breakIdx);
1537                 tp.srcCol ->setSize(breakIdx+1);
1538                 tp.srcCol ->setElementAt(column, breakIdx);
1539                 break;
1540             }
1541
1542             if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1543                 // Add final entry to mappings from break location to source file position.
1544                 //  Need one extra because last break position returned is after the
1545                 //    last char in the data, not at the last char.
1546                 tp.srcLine->addElement(lineNum, status);
1547                 tp.srcCol ->addElement(column, status);
1548
1549                 parseState = PARSE_TAG;
1550                 charIdx += 6;
1551
1552                 // RUN THE TEST!
1553                 executeTest(&tp);
1554                 break;
1555             }
1556
1557             if (testString.compare(charIdx-1, 3, "\\N{") == 0) {
1558                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1559                 // Get the code point from the name and insert it into the test data.
1560                 //   (Damn, no API takes names in Unicode  !!!
1561                 //    we've got to take it back to char *)
1562                 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1563                 int32_t nameLength = nameEndIdx - (charIdx+2);
1564                 char charNameBuf[200];
1565                 UChar32 theChar = -1;
1566                 if (nameEndIdx != -1) {
1567                     UErrorCode status = U_ZERO_ERROR;
1568                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1569                     charNameBuf[sizeof(charNameBuf)-1] = 0;
1570                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1571                     if (U_FAILURE(status)) {
1572                         theChar = -1;
1573                     }
1574                 }
1575                 if (theChar == -1) {
1576                     errln("Error in named character in test file at line %d, col %d",
1577                         lineNum, column);
1578                 } else {
1579                     // Named code point was recognized.  Insert it
1580                     //   into the test data.
1581                     tp.dataToBreak.append(theChar);
1582                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1583                         tp.srcLine->addElement(lineNum, status);
1584                         tp.srcCol ->addElement(column, status);
1585                     }
1586                 }
1587                 if (nameEndIdx > charIdx) {
1588                     charIdx = nameEndIdx+1;
1589
1590                 }
1591                 break;
1592             }
1593
1594
1595
1596
1597             if (testString.compare(charIdx-1, 2, "<>") == 0) {
1598                 charIdx++;
1599                 int32_t  breakIdx = tp.dataToBreak.length();
1600                 tp.expectedBreaks->setSize(breakIdx+1);
1601                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1602                 tp.srcLine->setSize(breakIdx+1);
1603                 tp.srcLine->setElementAt(lineNum, breakIdx);
1604                 tp.srcCol ->setSize(breakIdx+1);
1605                 tp.srcCol ->setElementAt(column, breakIdx);
1606                 break;
1607             }
1608
1609             if (c == CH_LT) {
1610                 tagValue   = 0;
1611                 parseState = PARSE_NUM;
1612                 break;
1613             }
1614
1615             if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
1616                 parseState = PARSE_COMMENT;
1617                 savedState = PARSE_DATA;
1618                 break;
1619             }
1620
1621             if (c == CH_BACKSLASH) {
1622                 // Check for \ at end of line, a line continuation.
1623                 //     Advance over (discard) the newline
1624                 UChar32 cp = testString.char32At(charIdx);
1625                 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1626                     // We have a CR LF
1627                     //  Need an extra increment of the input ptr to move over both of them
1628                     charIdx++;
1629                 }
1630                 if (cp == CH_LF || cp == CH_CR) {
1631                     lineNum++;
1632                     colStart = charIdx;
1633                     charIdx++;
1634                     break;
1635                 }
1636
1637                 // Let unescape handle the back slash.
1638                 cp = testString.unescapeAt(charIdx);
1639                 if (cp != -1) {
1640                     // Escape sequence was recognized.  Insert the char
1641                     //   into the test data.
1642                     tp.dataToBreak.append(cp);
1643                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1644                         tp.srcLine->addElement(lineNum, status);
1645                         tp.srcCol ->addElement(column, status);
1646                     }
1647                     break;
1648                 }
1649
1650
1651                 // Not a recognized backslash escape sequence.
1652                 // Take the next char as a literal.
1653                 //  TODO:  Should this be an error?
1654                 c = testString.charAt(charIdx);
1655                 charIdx = testString.moveIndex32(charIdx, 1);
1656             }
1657
1658             // Normal, non-escaped data char.
1659             tp.dataToBreak.append(c);
1660
1661             // Save the mapping from offset in the data to line/column numbers in
1662             //   the original input file.  Will be used for better error messages only.
1663             //   If there's an expected break before this char, the slot in the mapping
1664             //     vector will already be set for this char; don't overwrite it.
1665             if (tp.dataToBreak.length() > tp.srcLine->size()) {
1666                 tp.srcLine->addElement(lineNum, status);
1667                 tp.srcCol ->addElement(column, status);
1668             }
1669             break;
1670
1671
1672         case PARSE_NUM:
1673             // We are parsing an expected numeric tag value, like <1234>,
1674             //   within a chunk of data.
1675             if (u_isUWhiteSpace(c)) {
1676                 break;
1677             }
1678
1679             if (c == CH_GT) {
1680                 // Finished the number.  Add the info to the expected break data,
1681                 //   and switch parse state back to doing plain data.
1682                 parseState = PARSE_DATA;
1683                 if (tagValue == 0) {
1684                     tagValue = -1;
1685                 }
1686                 int32_t  breakIdx = tp.dataToBreak.length();
1687                 tp.expectedBreaks->setSize(breakIdx+1);
1688                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1689                 tp.srcLine->setSize(breakIdx+1);
1690                 tp.srcLine->setElementAt(lineNum, breakIdx);
1691                 tp.srcCol ->setSize(breakIdx+1);
1692                 tp.srcCol ->setElementAt(column, breakIdx);
1693                 break;
1694             }
1695
1696             if (u_isdigit(c)) {
1697                 tagValue = tagValue*10 + u_charDigitValue(c);
1698                 break;
1699             }
1700
1701             errln("Syntax Error in test file at line %d, col %d",
1702                 lineNum, column);
1703             goto end_test;
1704             parseState = PARSE_COMMENT;
1705             break;
1706         }
1707
1708
1709         if (U_FAILURE(status)) {
1710             errln("ICU Error %s while parsing test file at line %d.",
1711                 u_errorName(status), lineNum);
1712             goto end_test;
1713             status = U_ZERO_ERROR;
1714         }
1715
1716     }
1717
1718 end_test:
1719     delete tp.bi;
1720     delete tp.expectedBreaks;
1721     delete tp.srcLine;
1722     delete tp.srcCol;
1723     delete [] testFile;
1724 #endif
1725 }
1726
1727
1728 //-------------------------------------------------------------------------------
1729 //
1730 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
//    return the data in one big UChar * buffer, which the caller must delete.
1732 //
1733 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
1734 //           Move this function to some common place.
1735 //
1736 //--------------------------------------------------------------------------------
1737 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, UErrorCode &status) {
1738     UChar       *retPtr  = NULL;
1739     char        *fileBuf = NULL;
1740     UConverter* conv     = NULL;
1741     FILE        *f       = NULL;
1742
1743     ulen = 0;
1744     if (U_FAILURE(status)) {
1745         return retPtr;
1746     }
1747
1748     //
1749     //  Open the file.
1750     //
1751     f = fopen(fileName, "rb");
1752     if (f == 0) {
1753         errln("Error opening test data file %s\n", fileName);
1754         status = U_FILE_ACCESS_ERROR;
1755         return NULL;
1756     }
1757     //
1758     //  Read it in
1759     //
1760     int   fileSize;
1761     int   amt_read;
1762
1763     fseek( f, 0, SEEK_END);
1764     fileSize = ftell(f);
1765     fileBuf = new char[fileSize];
1766     fseek(f, 0, SEEK_SET);
1767     amt_read = fread(fileBuf, 1, fileSize, f);
1768     if (amt_read != fileSize || fileSize <= 0) {
1769         errln("Error reading test data file.");
1770         goto cleanUpAndReturn;
1771     }
1772
1773     //
1774     // Look for a Unicode Signature (BOM) on the data just read
1775     //
1776     int32_t        signatureLength;
1777     const char *   fileBufC;
1778     const char*    encoding;
1779
1780     fileBufC = fileBuf;
1781     encoding = ucnv_detectUnicodeSignature(
1782         fileBuf, fileSize, &signatureLength, &status);
1783     if(encoding!=NULL ){
1784         fileBufC  += signatureLength;
1785         fileSize  -= signatureLength;
1786     }
1787
1788     //
1789     // Open a converter to take the rule file to UTF-16
1790     //
1791     conv = ucnv_open(encoding, &status);
1792     if (U_FAILURE(status)) {
1793         goto cleanUpAndReturn;
1794     }
1795
1796     //
1797     // Convert the rules to UChar.
1798     //  Preflight first to determine required buffer size.
1799     //
1800     ulen = ucnv_toUChars(conv,
1801         NULL,           //  dest,
1802         0,              //  destCapacity,
1803         fileBufC,
1804         fileSize,
1805         &status);
1806     if (status == U_BUFFER_OVERFLOW_ERROR) {
1807         // Buffer Overflow is expected from the preflight operation.
1808         status = U_ZERO_ERROR;
1809
1810         retPtr = new UChar[ulen+1];
1811         ucnv_toUChars(conv,
1812             retPtr,       //  dest,
1813             ulen+1,
1814             fileBufC,
1815             fileSize,
1816             &status);
1817     }
1818
1819 cleanUpAndReturn:
1820     fclose(f);
1821     delete []fileBuf;
1822     ucnv_close(conv);
1823     if (U_FAILURE(status)) {
1824         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1825         delete retPtr;
1826         retPtr = 0;
1827         ulen   = 0;
1828     };
1829     return retPtr;
1830 }
1831
1832
1833 //--------------------------------------------------------------------------------------------
1834 //
1835 //     Exhaustive Tests, using Unicode Data Files.
1836 //
1837 //--------------------------------------------------------------------------------------------
1838
1839 //
1840 //  Token level scanner for the Unicode Line Break Test Data file.
1841 //      Return the next token, as follows:
1842 //          >= 0:       a UChar32 character, scanned from hex in the file.
1843 //          -1:         a break position, a division sign in the file.
1844 //          -2:         end of rule.  A new line in the file.
1845 //          -3:         end of file.  No more rules.
1846 //          -4:         Error
1847 //
1848 //   The scanner
1849 //       strips comments, ('#' to end of line)
1850 //       Recognizes CR, CR/LF and LF as new lines.
1851 //       Skips over spaces and  Xs (don't break here) in the data.
1852 //
1853 struct ScanState {
1854     int32_t     fPeekChar;
1855     UBool       fPeeked;
1856     int32_t     fLineNum;
1857     FILE        *fFile;
1858     ScanState() :fPeeked(FALSE), fLineNum(0), fFile(NULL) {};
1859 };
1860
1861 //  Literal characters that are of interest.  In hex to keep EBCDIC based machines happy.
1862 //  The data itself is latin-1 on all platforms.
static const int32_t
    chTab    = 0x09,     // horizontal tab
    chLF     = 0x0A,     // line feed
    chCR     = 0x0D,     // carriage return
    chSpace  = 0x20,     // space
    chHash   = 0x23,     // '#', starts a comment
    chMult   = 0xD7,     // multiplication sign, marks a no-break position
    chDivide = 0xF7;     // division sign, marks an expected break position
1870
1871 static int32_t   nextLBDToken(ScanState *s) {
1872     int32_t     c;
1873
1874     // Read  characters from the input file until we get something interesting
1875     //   to return.  The file is in latin-1 encoding.
1876     for (;;) {
1877         // Get the next character to look at,
1878         if (s->fPeeked) {
1879             c = s->fPeekChar;
1880             s->fPeeked = FALSE;
1881         } else {
1882             c = getc(s->fFile);
1883         }
1884
1885         // EOF.  Return immediately.
1886         if (c == EOF) {
1887             return -3;
1888         }
1889
1890         // Spaces.  Treat the multiply sign as a space - it indicates a no-break position
1891         //          in the data, and the test program doesn't want to see them.
1892         //          Continue the next char loop, looking for something significant.
1893         if (c == chSpace || c == chTab || c == chMult) {
1894             continue;
1895         }
1896
1897         //  Divide sign.  Indicates an expected break position.
1898         if (c == chDivide) {
1899             return -1;
1900         }
1901
1902         // New Line Handling.  Keep track of line number in the file, which in turn
1903         //   requires keeping track of CR/LF as a single new line.
1904         if (c == chCR) {
1905             s->fLineNum++;
1906             s->fPeekChar = getc(s->fFile);
1907             if (s->fPeekChar != chLF) {s->fPeeked = TRUE;};
1908             return -2;
1909         }
1910         if (c == chLF) {
1911             s->fLineNum++;
1912             return -2;
1913         }
1914
1915         // Comments.  Consume everything up to the next new line.
1916         if (c == chHash) {
1917             do {
1918                 c = getc(s->fFile);
1919             } while (!(c == EOF || c == chCR || c == chLF));
1920             s->fPeekChar = c;
1921             s->fPeeked = TRUE;
1922             return nextLBDToken(s);
1923         }
1924
1925         // Scan a hex character (UChar32) value.
1926         if (u_digit(c, 16) >= 0) {
1927             int32_t   v = u_digit(c, 16);
1928             for (;;) {
1929                 c = getc(s->fFile);
1930                 if (u_digit(c, 16) < 0) {break;};
1931                 v <<= 4;
1932                 v += u_digit(c, 16);
1933             }
1934             s->fPeekChar = c;
1935             s->fPeeked   = TRUE;
1936             return v;
1937         }
1938
1939         // Error.  Character was something unexpected.
1940         return -4;
1941     }
1942 }
1943
1944
1945
1946 void RBBITest::TestLineBreakData() {
1947
1948     UErrorCode      status = U_ZERO_ERROR;
1949     UnicodeString   testString;
1950     UVector         expectedBreaks(status);
1951     ScanState       ss;
1952     int32_t         tok;
1953
1954     BreakIterator *bi = BreakIterator::createLineInstance(Locale::getDefault(), status);
1955     if (U_FAILURE(status)) {
1956         errln("Failure creating break iterator");
1957         return;
1958     }
1959
1960     const char *    lbdfName = "LBTest.txt";
1961
1962     // Open the test data file.
1963     //   TODO:  a proper way to handle this data.
1964     ss.fFile = fopen(lbdfName, "rb");
1965     if (ss.fFile == NULL) {
1966         logln("Unable to open Line Break Test Data file.  Skipping test.");
1967         delete bi;
1968         return;
1969     }
1970
1971     // Loop once per line from the test data file.
1972     for (;;) {
1973         // Zero out test data from previous line.
1974         testString.truncate(0);
1975         expectedBreaks.removeAllElements();
1976
1977         // Read one test's (line's) worth of data from the file.
1978         //   Loop once per token on the input file line.
1979         for(;;)  {
1980             tok = nextLBDToken(&ss);
1981
1982             // If we scanned a character number in the file.
1983             //   save it in the test data array.
1984             if (tok >= 0) {
1985                 testString.append((UChar32)tok);
1986                 continue;
1987             }
1988
1989             // If we scanned a break position in the data, record it.
1990             if (tok == -1) {
1991                 expectedBreaks.addElement(testString.length(), status);
1992                 continue;
1993             }
1994
1995             // If we scanned a new line, or EOF
1996             //    drop out of scan loop and run the test case.
1997             if (tok == -2 || tok == -3) {break;};
1998
1999             // None of above.  Error.
2000             errln("Failure:  Unrecognized data format,  test file line %d", ss.fLineNum);
2001             break;
2002         }
2003
2004         // If this line from the test data file actually contained test data,
2005         //   run the test.
2006         if (testString.length() > 0) {
2007             int32_t pos;                 // Break Position in the test string
2008             int32_t expectedI = 0;       // Index of expected break position in vector of same.
2009             int32_t expectedPos;         // Expected break position (index into test string)
2010
2011             bi->setText(testString);
2012             pos = bi->first();
2013             pos = bi->next();
2014
2015             for (; pos != BreakIterator::DONE; ) {
2016                 expectedPos = expectedBreaks.elementAti(expectedI);
2017                 if (pos < expectedPos) {
2018                     errln("Failure: Test file line %d, unexpected break found at position %d",
2019                         ss.fLineNum, pos);
2020                     break;
2021                 }
2022                 if (pos > expectedPos) {
2023                     errln("Failure: Test file line %d, failed to find break at position %d",
2024                         ss.fLineNum, expectedPos);
2025                     break;
2026                 }
2027                 pos = bi->next();
2028                 expectedI++;
2029             }
2030         }
2031
2032         // If we've hit EOF on the input file, we're done.
2033         if (tok == -3) {
2034             break;
2035         }
2036
2037     }
2038
2039     fclose(ss.fFile);
2040     delete bi;
2041
2042 }
2043
2044 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
2045
2046 //---------------------------------------------------------------------------------------
2047 //
//   class RBBIMonkeyKind
//
//      Monkey Test for Break Iteration
//      Abstract interface class.   Concrete derived classes independently
//      implement the break rules for different iterator types.
//
//      The Monkey Test itself doesn't know which type of break iterator it is
//      testing, but works purely in terms of the interface defined here.
2056 //
2057 //---------------------------------------------------------------------------------------
2058 class RBBIMonkeyKind {
2059 public:
2060     // Return a UVector of UnicodeSets, representing the character classes used
2061     //   for this type of iterator.
2062     virtual  UVector  *charClasses() = 0;
2063
2064     // Set the test text on which subsequent calls to next() will operate
2065     virtual  void      setText(const UnicodeString &s) = 0;
2066
2067     // Find the next break postion, starting from the prev break position, or from zero.
2068     // Return -1 after reaching end of string.
2069     virtual  int32_t   next(int32_t i) = 0;
2070
2071     virtual ~RBBIMonkeyKind();
2072     UErrorCode       deferredStatus;
2073
2074
2075 protected:
2076     RBBIMonkeyKind();
2077
2078 private:
2079 };
2080
2081 RBBIMonkeyKind::RBBIMonkeyKind() {
2082     deferredStatus = U_ZERO_ERROR;
2083 }
2084
2085 RBBIMonkeyKind::~RBBIMonkeyKind() {
2086 }
2087
2088
2089 //----------------------------------------------------------------------------------------
2090 //
2091 //   Random Numbers.  Similar to standard lib rand() and srand()
2092 //                    Not using library to
2093 //                      1.  Get same results on all platforms.
2094 //                      2.  Get access to current seed, to more easily reproduce failures.
2095 //
2096 //---------------------------------------------------------------------------------------
// Current generator state.  Set it directly to replay a sequence.
static uint32_t m_seed = 1;

// Advance the generator one step and return a value in the range 0..32767,
//   taken from bits 16 through 30 of the new state.
static uint32_t m_rand()
{
    const uint32_t multiplier = 1103515245u;
    const uint32_t increment  = 12345u;

    m_seed = m_seed * multiplier + increment;
    return (m_seed >> 16) & 0x7FFFu;
}
2104
2105
2106 //------------------------------------------------------------------------------------------
2107 //
2108 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
2109 //                             of RBBIMonkeyKind.
2110 //
2111 //------------------------------------------------------------------------------------------
2112 class RBBICharMonkey: public RBBIMonkeyKind {
2113 public:
2114     RBBICharMonkey();
2115     virtual          ~RBBICharMonkey();
2116     virtual  UVector *charClasses();
2117     virtual  void     setText(const UnicodeString &s);
2118     virtual  int32_t  next(int32_t i);
2119 private:
2120     UVector   *fSets;
2121
2122     UnicodeSet  *fCRLFSet;
2123     UnicodeSet  *fControlSet;
2124     UnicodeSet  *fExtendSet;
2125     UnicodeSet  *fHangulSet;
2126     UnicodeSet  *fAnySet;
2127
2128     RegexMatcher  *fMatcher;
2129     const UnicodeString *fText;
2130 };
2131
2132
2133 RBBICharMonkey::RBBICharMonkey() {
2134     UErrorCode  status = U_ZERO_ERROR;
2135
2136     fText = NULL;
2137     fMatcher = new RegexMatcher("\\X", 0, status);     // Pattern to match a grampheme cluster
2138
2139     fCRLFSet    = new UnicodeSet("[\\r\\n]", status);
2140     fControlSet = new UnicodeSet("[[\\p{Zl}\\p{Zp}\\p{Cc}\\p{Cf}]-[\\n]-[\\r]-\\p{Grapheme_Extend}]", status);
2141     fExtendSet  = new UnicodeSet("[\\p{Grapheme_Extend}]", status);
2142     fHangulSet  = new UnicodeSet(
2143         "[\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=T}"
2144          "\\p{Hangul_Syllable_Type=LV}\\p{Hangul_Syllable_Type=LVT}]", status);
2145     fAnySet     = new UnicodeSet("[\\u0000-\\U0010ffff]", status);
2146
2147     fSets       = new UVector(status);
2148     fSets->addElement(fCRLFSet,    status);
2149     fSets->addElement(fControlSet, status);
2150     fSets->addElement(fExtendSet,  status);
2151     fSets->addElement(fHangulSet,  status);
2152     fSets->addElement(fAnySet,     status);
2153     if (U_FAILURE(status)) {
2154         deferredStatus = status;
2155     }
2156 }
2157
2158
2159 void RBBICharMonkey::setText(const UnicodeString &s) {
2160     fText = &s;
2161     fMatcher->reset(s);
2162 }
2163
2164
2165 int32_t RBBICharMonkey::next(int32_t i) {
2166     UErrorCode status = U_ZERO_ERROR;
2167     int32_t  retVal = -1;
2168
2169     if (fMatcher->find(i, status)) {
2170         retVal = fMatcher->end(status);
2171     }
2172     if (U_FAILURE(status)){
2173         retVal = -1;
2174     }
2175     return retVal;
2176 }
2177
2178
2179 UVector  *RBBICharMonkey::charClasses() {
2180     return fSets;
2181 }
2182
2183
2184 RBBICharMonkey::~RBBICharMonkey() {
2185     delete fSets;
2186     delete fCRLFSet;
2187     delete fControlSet;
2188     delete fExtendSet;
2189     delete fHangulSet;
2190     delete fAnySet;
2191
2192     delete fMatcher;
2193 }
2194
2195 //------------------------------------------------------------------------------------------
2196 //
2197 //   class RBBIWordMonkey      Word Break specific implementation
2198 //                             of RBBIMonkeyKind.
2199 //
2200 //------------------------------------------------------------------------------------------
2201 class RBBIWordMonkey: public RBBIMonkeyKind {
2202 public:
2203     RBBIWordMonkey();
2204     virtual          ~RBBIWordMonkey();
2205     virtual  UVector *charClasses();
2206     virtual  void     setText(const UnicodeString &s);
2207     virtual int32_t   next(int32_t i);
2208 private:
2209     UVector      *fSets;
2210
2211     UnicodeSet  *fKatakanaSet;
2212     UnicodeSet  *fALetterSet;
2213     UnicodeSet  *fMidLetterSet;
2214     UnicodeSet  *fMidNumSet;
2215     UnicodeSet  *fNumericSet;
2216     UnicodeSet  *fFormatSet;
2217     UnicodeSet  *fOtherSet;
2218     UnicodeSet  *fExtendSet;
2219     UnicodeSet  *fExtendNumLetSet;
2220
2221     RegexMatcher  *fMatcher;
2222
2223     const UnicodeString  *fText;
2224 };
2225
2226
2227 RBBIWordMonkey::RBBIWordMonkey()
2228 {
2229     UErrorCode  status = U_ZERO_ERROR;
2230
2231
2232     fSets            = new UVector(status);
2233
2234     fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}"
2235                          "[\\p{Line_Break = Complex_Context}"
2236                          "-\\p{Grapheme_Cluster_Break = Extend}"
2237                          "-\\p{Grapheme_Cluster_Break = Control}]]",      status);
2238     //fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}]",      status);
2239     fKatakanaSet     = new UnicodeSet("[\\p{Word_Break = Katakana}-[\\uff9e\\uff9f]]",     status);
2240     fMidLetterSet    = new UnicodeSet("[\\p{Word_Break = MidLetter}]",    status);
2241     fMidNumSet       = new UnicodeSet("[\\p{Word_Break = MidNum}]",       status);
2242     fNumericSet      = new UnicodeSet("[\\p{Word_Break = Numeric}]",      status);
2243     fFormatSet       = new UnicodeSet("[\\p{Word_Break = Format}]",       status);
2244     fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]", status);
2245     //fExtendSet       = new UnicodeSet("[\\p{Word_Break = Extend}]", status);
2246     fExtendSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}\\uff9e\\uff9f]", status);
2247
2248     fOtherSet        = new UnicodeSet();
2249     if(U_FAILURE(status)) {
2250       deferredStatus = status;
2251       return;
2252     }
2253
2254     fOtherSet->complement();
2255     fOtherSet->removeAll(*fKatakanaSet);
2256     fOtherSet->removeAll(*fALetterSet);
2257     fOtherSet->removeAll(*fMidLetterSet);
2258     fOtherSet->removeAll(*fMidNumSet);
2259     fOtherSet->removeAll(*fNumericSet);
2260     fOtherSet->removeAll(*fExtendNumLetSet);
2261     fOtherSet->removeAll(*fFormatSet);
2262     fOtherSet->removeAll(*fExtendSet);
2263
2264     fSets->addElement(fALetterSet,   status);
2265     fSets->addElement(fKatakanaSet,  status);
2266     fSets->addElement(fMidLetterSet, status);
2267     fSets->addElement(fMidNumSet,    status);
2268     fSets->addElement(fNumericSet,   status);
2269     fSets->addElement(fFormatSet,    status);
2270     fSets->addElement(fExtendSet,    status);
2271     fSets->addElement(fOtherSet,     status);
2272     fSets->addElement(fExtendNumLetSet, status);
2273
2274
2275     if (U_FAILURE(status)) {
2276         deferredStatus = status;
2277     }
2278 }
2279
2280 void RBBIWordMonkey::setText(const UnicodeString &s) {
2281     fText       = &s;
2282 }
2283
2284
2285 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2286     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2287                               //   break position being tested.  The candidate break
2288                               //   location is before p2.
2289
2290     int     breakPos = -1;
2291
2292     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2293
2294     // Prev break at end of string.  return DONE.
2295     if (prevPos >= fText->length()) {
2296         return -1;
2297     }
2298     p0 = p1 = p2 = p3 = prevPos;
2299     c3 =  fText->char32At(prevPos);
2300     c0 = c1 = c2 = 0;
2301
2302     // Loop runs once per "significant" character position in the input text.
2303     for (;;) {
2304         // Move all of the positions forward in the input string.
2305         p0 = p1;  c0 = c1;
2306         p1 = p2;  c1 = c2;
2307         p2 = p3;  c2 = c3;
2308
2309         // Advancd p3 by    X(Extend | Format)*   Rule 4
2310         do {
2311             p3 = fText->moveIndex32(p3, 1);
2312             c3 = fText->char32At(p3);
2313         }
2314         while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
2315
2316
2317         if (p1 == p2) {
2318             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2319             continue;
2320         }
2321         if (p2 == fText->length()) {
2322             // Reached end of string.  Always a break position.
2323             break;
2324         }
2325
2326         // Rule  (3)   CR x LF
2327         //     No Extend or Format characters may appear between the CR and LF,
2328         //     which requires the additional check for p2 immediately following p1.
2329         //
2330         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
2331             continue;
2332         }
2333
2334         // Rule (5).   ALetter x ALetter
2335         if (fALetterSet->contains(c1) &&
2336             fALetterSet->contains(c2))  {
2337             continue;
2338         }
2339
2340         // Rule (6)  ALetter  x  (MidLetter | MidNumLet) ALetter
2341         //
2342         //    Also incorporates rule 7 by skipping pos ahead to position of the
2343         //    terminating ALetter.
2344         if ( fALetterSet->contains(c1)   &&
2345              fMidLetterSet->contains(c2) &&
2346              fALetterSet->contains(c3)) {
2347             continue;
2348         }
2349
2350
2351         // Rule (7)  ALetter (MidLetter | MidNumLet)  x  ALetter
2352         if (fALetterSet->contains(c0) &&
2353             (fMidLetterSet->contains(c1)  ) &&
2354             fALetterSet->contains(c2)) {
2355             continue;
2356         }
2357
2358         // Rule (8)    Numeric x Numeric
2359         if (fNumericSet->contains(c1) &&
2360             fNumericSet->contains(c2))  {
2361             continue;
2362         }
2363
2364         // Rule (9)    ALetter x Numeric
2365         if (fALetterSet->contains(c1) &&
2366             fNumericSet->contains(c2))  {
2367             continue;
2368         }
2369
2370         // Rule (10)    Numeric x ALetter
2371         if (fNumericSet->contains(c1) &&
2372             fALetterSet->contains(c2))  {
2373             continue;
2374         }
2375
2376         // Rule (11)   Numeric (MidNum | MidNumLet)  x  Numeric
2377         if ( fNumericSet->contains(c0) &&
2378              fMidNumSet->contains(c1)  &&
2379             fNumericSet->contains(c2)) {
2380             continue;
2381         }
2382
2383         // Rule (12)  Numeric x (MidNum | MidNumLet) Numeric
2384         if (fNumericSet->contains(c1) &&
2385             fMidNumSet->contains(c2)  &&
2386             fNumericSet->contains(c3)) {
2387             continue;
2388         }
2389
2390         // Rule (13)  Katakana x Katakana
2391         if (fKatakanaSet->contains(c1) &&
2392             fKatakanaSet->contains(c2))  {
2393             continue;
2394         }
2395
2396         // Rule 13a
2397         if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
2398              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2399              fExtendNumLetSet->contains(c2)) {
2400                 continue;
2401              }
2402
2403         // Rule 13b
2404         if (fExtendNumLetSet->contains(c1) &&
2405                 (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
2406                 fKatakanaSet->contains(c2)))  {
2407                 continue;
2408              }
2409
2410         // Rule 14.  Break found here.
2411         break;
2412     }
2413
2414     breakPos = p2;
2415     return breakPos;
2416 }
2417
2418
2419 UVector  *RBBIWordMonkey::charClasses() {
2420     return fSets;
2421 }
2422
2423
2424 RBBIWordMonkey::~RBBIWordMonkey() {
2425     delete fSets;
2426     delete fKatakanaSet;
2427     delete fALetterSet;
2428     delete fMidLetterSet;
2429     delete fMidNumSet;
2430     delete fNumericSet;
2431     delete fFormatSet;
2432     delete fExtendSet;
2433     delete fExtendNumLetSet;
2434     delete fOtherSet;
2435 }
2436
2437
2438
2439
2440 //------------------------------------------------------------------------------------------
2441 //
2442 //   class RBBISentMonkey      Sentence Break specific implementation
2443 //                             of RBBIMonkeyKind.
2444 //
2445 //------------------------------------------------------------------------------------------
2446 class RBBISentMonkey: public RBBIMonkeyKind {
2447 public:
2448     RBBISentMonkey();
2449     virtual          ~RBBISentMonkey();
2450     virtual  UVector *charClasses();
2451     virtual  void     setText(const UnicodeString &s);
2452     virtual int32_t   next(int32_t i);
2453 private:
2454     int               moveBack(int posFrom);
2455     int               moveForward(int posFrom);
2456     UChar32           cAt(int pos);
2457
2458     UVector      *fSets;
2459
2460     UnicodeSet  *fSepSet;
2461     UnicodeSet  *fFormatSet;
2462     UnicodeSet  *fSpSet;
2463     UnicodeSet  *fLowerSet;
2464     UnicodeSet  *fUpperSet;
2465     UnicodeSet  *fOLetterSet;
2466     UnicodeSet  *fNumericSet;
2467     UnicodeSet  *fATermSet;
2468     UnicodeSet  *fSTermSet;
2469     UnicodeSet  *fCloseSet;
2470     UnicodeSet  *fOtherSet;
2471     UnicodeSet  *fExtendSet;
2472
2473     const UnicodeString  *fText;
2474
2475 };
2476
2477 RBBISentMonkey::RBBISentMonkey()
2478 {
2479     UErrorCode  status = U_ZERO_ERROR;
2480
2481     fSets            = new UVector(status);
2482
2483     fSepSet          = new UnicodeSet("[\\p{Sentence_Break = Sep}]",     status);
2484     fFormatSet       = new UnicodeSet("[\\p{Sentence_Break = Format}]",  status);
2485     fSpSet           = new UnicodeSet("[\\p{Sentence_Break = Sp}]",      status);
2486     fLowerSet        = new UnicodeSet("[\\p{Sentence_Break = Lower}]",   status);
2487     fUpperSet        = new UnicodeSet("[\\p{Sentence_Break = Upper}]",   status);
2488     fOLetterSet      = new UnicodeSet("[\\p{Sentence_Break = OLetter}-[\\uff9e\\uff9f]]", status);
2489     fNumericSet      = new UnicodeSet("[\\p{Sentence_Break = Numeric}]", status);
2490     fATermSet        = new UnicodeSet("[\\p{Sentence_Break = ATerm}]",   status);
2491     fSTermSet        = new UnicodeSet("[\\p{Sentence_Break = STerm}]",   status);
2492     fCloseSet        = new UnicodeSet("[\\p{Sentence_Break = Close}]",   status);
2493     fExtendSet       = new UnicodeSet("[\\p{Grapheme_Extend}\\uff9e\\uff9f]", status);
2494     fOtherSet        = new UnicodeSet();
2495
2496     if(U_FAILURE(status)) {
2497       deferredStatus = status;
2498       return;
2499     }
2500
2501     fOtherSet->complement();
2502     fOtherSet->removeAll(*fSepSet);
2503     fOtherSet->removeAll(*fFormatSet);
2504     fOtherSet->removeAll(*fSpSet);
2505     fOtherSet->removeAll(*fLowerSet);
2506     fOtherSet->removeAll(*fUpperSet);
2507     fOtherSet->removeAll(*fOLetterSet);
2508     fOtherSet->removeAll(*fNumericSet);
2509     fOtherSet->removeAll(*fATermSet);
2510     fOtherSet->removeAll(*fSTermSet);
2511     fOtherSet->removeAll(*fCloseSet);
2512     fOtherSet->removeAll(*fExtendSet);
2513
2514     fSets->addElement(fSepSet,     status);
2515     fSets->addElement(fFormatSet,  status);
2516
2517     fSets->addElement(fSpSet,      status);
2518     fSets->addElement(fLowerSet,   status);
2519     fSets->addElement(fUpperSet,   status);
2520     fSets->addElement(fOLetterSet, status);
2521     fSets->addElement(fNumericSet, status);
2522     fSets->addElement(fATermSet,   status);
2523     fSets->addElement(fSTermSet,   status);
2524     fSets->addElement(fCloseSet,   status);
2525     fSets->addElement(fOtherSet,   status);
2526     fSets->addElement(fExtendSet,  status);
2527
2528     if (U_FAILURE(status)) {
2529         deferredStatus = status;
2530     }
2531 }
2532
2533
2534
2535 void RBBISentMonkey::setText(const UnicodeString &s) {
2536     fText       = &s;
2537 }
2538
2539 UVector  *RBBISentMonkey::charClasses() {
2540     return fSets;
2541 }
2542
2543
2544 //  moveBack()   Find the "significant" code point preceding the index i.
2545 //               Skips over ($Extend | $Format)* .
2546 //
2547 int RBBISentMonkey::moveBack(int i) {
2548     if (i <= 0) {
2549         return -1;
2550     }
2551     UChar32   c;
2552     int32_t   j = i;
2553     do {
2554         j = fText->moveIndex32(j, -1);
2555         c = fText->char32At(j);
2556     }
2557     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2558     return j;
2559
2560  }
2561
2562
2563 int RBBISentMonkey::moveForward(int i) {
2564     if (i>=fText->length()) {
2565         return fText->length();
2566     }
2567     UChar32   c;
2568     int32_t   j = i;
2569     do {
2570         j = fText->moveIndex32(j, 1);
2571         c = cAt(j);
2572     }
2573     while (fFormatSet->contains(c) || fExtendSet->contains(c));
2574     return j;
2575 }
2576
2577 UChar32 RBBISentMonkey::cAt(int pos) {
2578     if (pos<0 || pos>=fText->length()) {
2579         return -1;
2580     } else {
2581         return fText->char32At(pos);
2582     }
2583 }
2584
2585 int32_t RBBISentMonkey::next(int32_t prevPos) {
2586     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2587                               //   break position being tested.  The candidate break
2588                               //   location is before p2.
2589
2590     int     breakPos = -1;
2591
2592     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2593     UChar32 c;
2594
2595     // Prev break at end of string.  return DONE.
2596     if (prevPos >= fText->length()) {
2597         return -1;
2598     }
2599     p0 = p1 = p2 = p3 = prevPos;
2600     c3 =  fText->char32At(prevPos);
2601     c0 = c1 = c2 = 0;
2602
2603     // Loop runs once per "significant" character position in the input text.
2604     for (;;) {
2605         // Move all of the positions forward in the input string.
2606         p0 = p1;  c0 = c1;
2607         p1 = p2;  c1 = c2;
2608         p2 = p3;  c2 = c3;
2609
2610         // Advancd p3 by    X(Extend | Format)*   Rule 4
2611         p3 = moveForward(p3);
2612         c3 = cAt(p3);
2613
2614         // Rule (3)  CR x LF
2615         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2616             continue;
2617         }
2618
2619         // Rule (4).   Sep  <break>
2620         if (fSepSet->contains(c1)) {
2621             p2 = p1+1;   // Separators don't combine with Extend or Format.
2622             break;
2623         }
2624
2625         if (p2 >= fText->length()) {
2626             // Reached end of string.  Always a break position.
2627             break;
2628         }
2629
2630         if (p2 == prevPos) {
2631             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2632             continue;
2633         }
2634
2635         // Rule (6).   ATerm x Numeric
2636         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
2637             continue;
2638         }
2639
2640         // Rule (7).  Upper ATerm  x  Uppper
2641         if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2642             continue;
2643         }
2644
2645         // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
2646         //           Note:  STerm | ATerm are added to the negated part of the expression by a
2647         //                  note to the Unicode 5.0 documents.
2648         int p8 = p1;
2649         while (fSpSet->contains(cAt(p8))) {
2650             p8 = moveBack(p8);
2651         }
2652         while (fCloseSet->contains(cAt(p8))) {
2653             p8 = moveBack(p8);
2654         }
2655         if (fATermSet->contains(cAt(p8))) {
2656             p8=p2;
2657             for (;;) {
2658                 c = cAt(p8);
2659                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2660                     fLowerSet->contains(c) || fSepSet->contains(c) ||
2661                     fATermSet->contains(c) || fSTermSet->contains(c))  {
2662                     break;
2663                 }
2664                 p8 = moveForward(p8);
2665             }
2666             if (fLowerSet->contains(cAt(p8))) {
2667                 continue;
2668             }
2669         }
2670
2671         // Rule 8a   (STerm | ATerm) Close* Sp* x (STerm | ATerm);
2672         if (fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2673             p8 = p1;
2674             while (fSpSet->contains(cAt(p8))) {
2675                 p8 = moveBack(p8);
2676             }
2677             while (fCloseSet->contains(cAt(p8))) {
2678                 p8 = moveBack(p8);
2679             }
2680             c = cAt(p8);
2681             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2682                 continue;
2683             }
2684         }
2685
2686         // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep)
2687         int p9 = p1;
2688         while (fCloseSet->contains(cAt(p9))) {
2689             p9 = moveBack(p9);
2690         }
2691         c = cAt(p9);
2692         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2693             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2694                 continue;
2695             }
2696         }
2697
2698         // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep)
2699         int p10 = p1;
2700         while (fSpSet->contains(cAt(p10))) {
2701             p10 = moveBack(p10);
2702         }
2703         while (fCloseSet->contains(cAt(p10))) {
2704             p10 = moveBack(p10);
2705         }
2706         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2707             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2708                 continue;
2709             }
2710         }
2711
2712         // Rule (11)  (STerm | ATerm) Close* Sp*   <break>
2713         int p11 = p1;
2714         while (fSpSet->contains(cAt(p11))) {
2715             p11 = moveBack(p11);
2716         }
2717         while (fCloseSet->contains(cAt(p11))) {
2718             p11 = moveBack(p11);
2719         }
2720         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2721             break;
2722         }
2723
2724         //  Rule (12)  Any x Any
2725         continue;
2726     }
2727     breakPos = p2;
2728     return breakPos;
2729 }
2730
2731 RBBISentMonkey::~RBBISentMonkey() {
2732     delete fSets;
2733     delete fSepSet;
2734     delete fFormatSet;
2735     delete fSpSet;
2736     delete fLowerSet;
2737     delete fUpperSet;
2738     delete fOLetterSet;
2739     delete fNumericSet;
2740     delete fATermSet;
2741     delete fSTermSet;
2742     delete fCloseSet;
2743     delete fOtherSet;
2744     delete fExtendSet;
2745 }
2746
2747
2748
2749 //-------------------------------------------------------------------------------------------
2750 //
2751 //  RBBILineMonkey
2752 //
2753 //-------------------------------------------------------------------------------------------
2754
2755 class RBBILineMonkey: public RBBIMonkeyKind {
2756 public:
2757     RBBILineMonkey();
2758     virtual          ~RBBILineMonkey();
2759     virtual  UVector *charClasses();
2760     virtual  void     setText(const UnicodeString &s);
2761     virtual  int32_t  next(int32_t i);
2762     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2763 private:
2764     UVector      *fSets;
2765
2766     UnicodeSet  *fBK;
2767     UnicodeSet  *fCR;
2768     UnicodeSet  *fLF;
2769     UnicodeSet  *fCM;
2770     UnicodeSet  *fNL;
2771     UnicodeSet  *fSG;
2772     UnicodeSet  *fWJ;
2773     UnicodeSet  *fZW;
2774     UnicodeSet  *fGL;
2775     UnicodeSet  *fCB;
2776     UnicodeSet  *fSP;
2777     UnicodeSet  *fB2;
2778     UnicodeSet  *fBA;
2779     UnicodeSet  *fBB;
2780     UnicodeSet  *fHY;
2781     UnicodeSet  *fH2;
2782     UnicodeSet  *fH3;
2783     UnicodeSet  *fCL;
2784     UnicodeSet  *fEX;
2785     UnicodeSet  *fIN;
2786     UnicodeSet  *fJL;
2787     UnicodeSet  *fJV;
2788     UnicodeSet  *fJT;
2789     UnicodeSet  *fNS;
2790     UnicodeSet  *fOP;
2791     UnicodeSet  *fQU;
2792     UnicodeSet  *fIS;
2793     UnicodeSet  *fNU;
2794     UnicodeSet  *fPO;
2795     UnicodeSet  *fPR;
2796     UnicodeSet  *fSY;
2797     UnicodeSet  *fAI;
2798     UnicodeSet  *fAL;
2799     UnicodeSet  *fID;
2800     UnicodeSet  *fSA;
2801     UnicodeSet  *fXX;
2802
2803     BreakIterator  *fCharBI;
2804
2805     const UnicodeString  *fText;
2806     int32_t              *fOrigPositions;
2807
2808     RegexMatcher         *fNumberMatcher;
2809     RegexMatcher         *fLB11Matcher;
2810 };
2811
2812
2813 RBBILineMonkey::RBBILineMonkey()
2814 {
2815     UErrorCode  status = U_ZERO_ERROR;
2816
2817     fSets  = new UVector(status);
2818
2819     fBK    = new UnicodeSet("[\\p{Line_Break=BK}]", status);
2820     fCR    = new UnicodeSet("[\\p{Line_break=CR}]", status);
2821     fLF    = new UnicodeSet("[\\p{Line_break=LF}]", status);
2822     fCM    = new UnicodeSet("[\\p{Line_break=CM}]", status);
2823     fNL    = new UnicodeSet("[\\p{Line_break=NL}]", status);
2824     fWJ    = new UnicodeSet("[\\p{Line_break=WJ}]", status);
2825     fZW    = new UnicodeSet("[\\p{Line_break=ZW}]", status);
2826     fGL    = new UnicodeSet("[\\p{Line_break=GL}]", status);
2827     fCB    = new UnicodeSet("[\\p{Line_break=CB}]", status);
2828     fSP    = new UnicodeSet("[\\p{Line_break=SP}]", status);
2829     fB2    = new UnicodeSet("[\\p{Line_break=B2}]", status);
2830     fBA    = new UnicodeSet("[\\p{Line_break=BA}]", status);
2831     fBB    = new UnicodeSet("[\\p{Line_break=BB}]", status);
2832     fHY    = new UnicodeSet("[\\p{Line_break=HY}]", status);
2833     fH2    = new UnicodeSet("[\\p{Line_break=H2}]", status);
2834     fH3    = new UnicodeSet("[\\p{Line_break=H3}]", status);
2835     fCL    = new UnicodeSet("[\\p{Line_break=CL}]", status);
2836     fEX    = new UnicodeSet("[\\p{Line_break=EX}]", status);
2837     fIN    = new UnicodeSet("[\\p{Line_break=IN}]", status);
2838     fJL    = new UnicodeSet("[\\p{Line_break=JL}]", status);
2839     fJV    = new UnicodeSet("[\\p{Line_break=JV}]", status);
2840     fJT    = new UnicodeSet("[\\p{Line_break=JT}]", status);
2841     fNS    = new UnicodeSet("[\\p{Line_break=NS}]", status);
2842     fOP    = new UnicodeSet("[\\p{Line_break=OP}]", status);
2843     fQU    = new UnicodeSet("[\\p{Line_break=QU}]", status);
2844     fIS    = new UnicodeSet("[\\p{Line_break=IS}]", status);
2845     fNU    = new UnicodeSet("[\\p{Line_break=NU}]", status);
2846     fPO    = new UnicodeSet("[\\p{Line_break=PO}]", status);
2847     fPR    = new UnicodeSet("[\\p{Line_break=PR}]", status);
2848     fSY    = new UnicodeSet("[\\p{Line_break=SY}]", status);
2849     fAI    = new UnicodeSet("[\\p{Line_break=AI}]", status);
2850     fAL    = new UnicodeSet("[\\p{Line_break=AL}]", status);
2851     fID    = new UnicodeSet("[\\p{Line_break=ID}]", status);
2852     fSA    = new UnicodeSet("[\\p{Line_break=SA}]", status);
2853     fSG    = new UnicodeSet("[\\ud800-\\udfff]", status);
2854     fXX    = new UnicodeSet("[\\p{Line_break=XX}]", status);
2855
2856     if (U_FAILURE(status)) {
2857         deferredStatus = status;
2858         fCharBI = NULL;
2859         fNumberMatcher = NULL;
2860         return;
2861     }
2862
2863     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
2864     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
2865     fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
2866     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
2867
2868     fSets->addElement(fBK, status);
2869     fSets->addElement(fCR, status);
2870     fSets->addElement(fLF, status);
2871     fSets->addElement(fCM, status);
2872     fSets->addElement(fNL, status);
2873     fSets->addElement(fWJ, status);
2874     fSets->addElement(fZW, status);
2875     fSets->addElement(fGL, status);
2876     fSets->addElement(fCB, status);
2877     fSets->addElement(fSP, status);
2878     fSets->addElement(fB2, status);
2879     fSets->addElement(fBA, status);
2880     fSets->addElement(fBB, status);
2881     fSets->addElement(fHY, status);
2882     fSets->addElement(fH2, status);
2883     fSets->addElement(fH3, status);
2884     fSets->addElement(fCL, status);
2885     fSets->addElement(fEX, status);
2886     fSets->addElement(fIN, status);
2887     fSets->addElement(fJL, status);
2888     fSets->addElement(fJT, status);
2889     fSets->addElement(fJV, status);
2890     fSets->addElement(fNS, status);
2891     fSets->addElement(fOP, status);
2892     fSets->addElement(fQU, status);
2893     fSets->addElement(fIS, status);
2894     fSets->addElement(fNU, status);
2895     fSets->addElement(fPO, status);
2896     fSets->addElement(fPR, status);
2897     fSets->addElement(fSY, status);
2898     fSets->addElement(fAI, status);
2899     fSets->addElement(fAL, status);
2900     fSets->addElement(fID, status);
2901     fSets->addElement(fWJ, status);
2902     fSets->addElement(fSA, status);
2903     fSets->addElement(fSG, status);
2904
2905     fNumberMatcher = new RegexMatcher(
2906         "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
2907         "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
2908         "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
2909         "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
2910         "(\\p{Line_Break=CL}\\p{Line_Break=CM}*)?"
2911         "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?",
2912         0, status);
2913
2914     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2915
2916     if (U_FAILURE(status)) {
2917         deferredStatus = status;
2918     }
2919 }
2920
2921
2922 void RBBILineMonkey::setText(const UnicodeString &s) {
2923     fText       = &s;
2924     fCharBI->setText(s);
2925     fNumberMatcher->reset(s);
2926 }
2927
2928 //
2929 //  rule9Adjust
2930 //     Line Break TR rules 9 and 10 implementation.
2931 //     This deals with combining marks and other sequences that
2932 //     that must be treated as if they were something other than what they actually are.
2933 //
2934 //     This is factored out into a separate function because it must be applied twice for
2935 //     each potential break, once to the chars before the position being checked, then
2936 //     again to the text following the possible break.
2937 //
2938 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2939     if (pos == -1) {
2940         // Invalid initial position.  Happens during the warmup iteration of the
2941         //   main loop in next().
2942         return;
2943     }
2944
2945     int32_t  nPos = *nextPos;
2946
2947     // LB 9  Keep combining sequences together.
2948     //  advance over any CM class chars.  Note that Line Break CM is different
2949     //  from the normal Grapheme Extend property.
2950     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2951           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2952         for (;;) {
2953             *nextChar = fText->char32At(nPos);
2954             if (!fCM->contains(*nextChar)) {
2955                 break;
2956             }
2957             nPos = fText->moveIndex32(nPos, 1);
2958         }
2959     }
2960
2961
2962     // LB 9 Treat X CM* as if it were x.
2963     //       No explicit action required.
2964
2965     // LB 10  Treat any remaining combining mark as AL
2966     if (fCM->contains(*posChar)) {
2967         *posChar = 0x41;   // thisChar = 'A';
2968     }
2969
2970     // Push the updated nextPos and nextChar back to our caller.
2971     // This only makes a difference if posChar got bigger by consuming a
2972     // combining sequence.
2973     *nextPos  = nPos;
2974     *nextChar = fText->char32At(nPos);
2975 }
2976
2977
2978
2979 int32_t RBBILineMonkey::next(int32_t startPos) {
2980     UErrorCode status = U_ZERO_ERROR;
2981     int32_t    pos;       //  Index of the char following a potential break position
2982     UChar32    thisChar;  //  Character at above position "pos"
2983
2984     int32_t    prevPos;   //  Index of the char preceding a potential break position
2985     UChar32    prevChar;  //  Character at above position.  Note that prevChar
2986                           //   and thisChar may not be adjacent because combining
2987                           //   characters between them will be ignored.
2988
2989     int32_t    nextPos;   //  Index of the next character following pos.
2990                           //     Usually skips over combining marks.
2991     int32_t    nextCPPos; //  Index of the code point following "pos."
2992                           //     May point to a combining mark.
2993     int32_t    tPos;      //  temp value.
2994     UChar32    c;
2995
2996     if (startPos >= fText->length()) {
2997         return -1;
2998     }
2999
3000
3001     // Initial values for loop.  Loop will run the first time without finding breaks,
3002     //                           while the invalid values shift out and the "this" and
3003     //                           "prev" positions are filled in with good values.
3004     pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.
3005     thisChar = prevChar  = 0;
3006     nextPos  = nextCPPos = startPos;
3007
3008
3009     // Loop runs once per position in the test text, until a break position
3010     //  is found.
3011     for (;;) {
3012         prevPos   = pos;
3013         prevChar  = thisChar;
3014
3015         pos       = nextPos;
3016         thisChar  = fText->char32At(pos);
3017
3018         nextCPPos = fText->moveIndex32(pos, 1);
3019         nextPos   = nextCPPos;
3020
3021         // Rule LB2 - Break at end of text.
3022         if (pos >= fText->length()) {
3023             break;
3024         }
3025
3026         // Rule LB 9 - adjust for combining sequences.
3027         //             We do this one out-of-order because the adjustment does not change anything
3028         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3029         //             be applied.
3030         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
3031         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3032         c = fText->char32At(nextPos);
3033         rule9Adjust(pos,     &thisChar, &nextPos, &c);
3034
3035         // If the loop is still warming up - if we haven't shifted the initial
3036         //   -1 positions out of prevPos yet - loop back to advance the
3037         //    position in the input without any further looking for breaks.
3038         if (prevPos == -1) {
3039             continue;
3040         }
3041
3042         // LB 4  Always break after hard line breaks,
3043         if (fBK->contains(prevChar)) {
3044             break;
3045         }
3046
3047         // LB 5  Break after CR, LF, NL, but not inside CR LF
3048         if (prevChar == 0x0d && thisChar == 0x0a) {
3049             continue;
3050         }
3051         if (prevChar == 0x0d ||
3052             prevChar == 0x0a ||
3053             prevChar == 0x85)  {
3054             break;
3055         }
3056
3057         // LB 6  Don't break before hard line breaks
3058         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3059             fBK->contains(thisChar)) {
3060                 continue;
3061         }
3062
3063
3064         // LB 7  Don't break before spaces or zero-width space.
3065         if (fSP->contains(thisChar)) {
3066             continue;
3067         }
3068
3069         if (fZW->contains(thisChar)) {
3070             continue;
3071         }
3072
3073         // LB 8  Break after zero width space
3074         if (fZW->contains(prevChar)) {
3075             break;
3076         }
3077
3078         // LB 9, 10  Already done, at top of loop.
3079         //
3080
3081
3082         // LB 11  Do not break before or after WORD JOINER and related characters.
3083         //    x  WJ
3084         //    WJ  x
3085         //
3086         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3087             continue;
3088         }
3089
3090         // LB 12
3091         //    (!SP) x  GL
3092         //    GL  x
3093         if ((!fSP->contains(prevChar)) && fGL->contains(thisChar) ||
3094              fGL->contains(prevChar)) {
3095             continue;
3096         }
3097
3098
3099
3100         // LB 13  Don't break before closings.
3101         //        NU x CL  and NU x IS are not matched here so that they will
3102         //        fall into LB 17 and the more general number regular expression.
3103         //
3104         if (!fNU->contains(prevChar) && fCL->contains(thisChar) ||
3105                                         fEX->contains(thisChar) ||
3106             !fNU->contains(prevChar) && fIS->contains(thisChar) ||
3107             !fNU->contains(prevChar) && fSY->contains(thisChar))    {
3108             continue;
3109         }
3110
3111         // LB 14 Don't break after OP SP*
3112         //       Scan backwards, checking for this sequence.
3113         //       The OP char could include combining marks, so we actually check for
3114         //           OP CM* SP*
3115         //       Another Twist: The Rule 67 fixes may have changed a SP CM
3116         //       sequence into a ID char, so before scanning back through spaces,
3117         //       verify that prevChar is indeed a space.  The prevChar variable
3118         //       may differ from fText[prevPos]
3119         tPos = prevPos;
3120         if (fSP->contains(prevChar)) {
3121             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3122                 tPos=fText->moveIndex32(tPos, -1);
3123             }
3124         }
3125         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3126             tPos=fText->moveIndex32(tPos, -1);
3127         }
3128         if (fOP->contains(fText->char32At(tPos))) {
3129             continue;
3130         }
3131
3132
3133         // LB 15    QU SP* x OP
3134         if (fOP->contains(thisChar)) {
3135             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3136             int tPos = prevPos;
3137             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3138                 tPos = fText->moveIndex32(tPos, -1);
3139             }
3140             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3141                 tPos = fText->moveIndex32(tPos, -1);
3142             }
3143             if (fQU->contains(fText->char32At(tPos))) {
3144                 continue;
3145             }
3146         }
3147
3148
3149
3150         // LB 16   CL SP* x NS
3151         //    Scan backwards for SP* CM* CL
3152         if (fNS->contains(thisChar)) {
3153             int tPos = prevPos;
3154             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3155                 tPos = fText->moveIndex32(tPos, -1);
3156             }
3157             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3158                 tPos = fText->moveIndex32(tPos, -1);
3159             }
3160             if (fCL->contains(fText->char32At(tPos))) {
3161                 continue;
3162             }
3163         }
3164
3165
3166         // LB 17        B2 SP* x B2
3167         if (fB2->contains(thisChar)) {
3168             //  Scan backwards, checking for the B2 CM* SP* sequence.
3169             tPos = prevPos;
3170             if (fSP->contains(prevChar)) {
3171                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3172                     tPos=fText->moveIndex32(tPos, -1);
3173                 }
3174             }
3175             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3176                 tPos=fText->moveIndex32(tPos, -1);
3177             }
3178             if (fB2->contains(fText->char32At(tPos))) {
3179                 continue;
3180             }
3181         }
3182
3183
3184         // LB 18    break after space
3185         if (fSP->contains(prevChar)) {
3186             break;
3187         }
3188
3189         // LB 19
3190         //    x   QU
3191         //    QU  x
3192         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3193             continue;
3194         }
3195
3196         // LB 20  Break around a CB
3197         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3198             break;
3199         }
3200
3201         // LB 21
3202         if (fBA->contains(thisChar) ||
3203             fHY->contains(thisChar) ||
3204             fNS->contains(thisChar) ||
3205             fBB->contains(prevChar) )   {
3206             continue;
3207         }
3208
3209         // LB 22
3210         if (fAL->contains(prevChar) && fIN->contains(thisChar) ||
3211             fID->contains(prevChar) && fIN->contains(thisChar) ||
3212             fIN->contains(prevChar) && fIN->contains(thisChar) ||
3213             fNU->contains(prevChar) && fIN->contains(thisChar) )   {
3214             continue;
3215         }
3216
3217
3218         // LB 23    ID x PO
3219         //          AL x NU
3220         //          NU x AL
3221         if (fID->contains(prevChar) && fPO->contains(thisChar) ||
3222             fAL->contains(prevChar) && fNU->contains(thisChar) ||
3223             fNU->contains(prevChar) && fAL->contains(thisChar) )   {
3224             continue;
3225         }
3226
3227         // LB 24  Do not break between prefix and letters or ideographs.
3228         //        PR x ID
3229         //        PR x AL
3230         //        PO x AL
3231         if (fPR->contains(prevChar) && fID->contains(thisChar) ||
3232             fPR->contains(prevChar) && fAL->contains(thisChar) ||
3233             fPO->contains(prevChar) && fAL->contains(thisChar) )   {
3234             continue;
3235         }
3236
3237
3238
3239         // LB 25    Numbers
3240         if (fNumberMatcher->lookingAt(prevPos, status)) {
3241             if (U_FAILURE(status)) {
3242                 break;
3243             }
3244             // Matched a number.  But could have been just a single digit, which would
3245             //    not represent a "no break here" between prevChar and thisChar
3246             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
3247             if (numEndIdx > pos) {
3248                 // Number match includes at least our two chars being checked
3249                 if (numEndIdx > nextPos) {
3250                     // Number match includes additional chars.  Update pos and nextPos
3251                     //   so that next loop iteration will continue at the end of the number,
3252                     //   checking for breaks between last char in number & whatever follows.
3253                     pos = nextPos = numEndIdx;
3254                     do {
3255                         pos = fText->moveIndex32(pos, -1);
3256                         thisChar = fText->char32At(pos);
3257                     } while (fCM->contains(thisChar));
3258                 }
3259                 continue;
3260             }
3261         }
3262
3263
3264         // LB 26 Do not break a Korean syllable.
3265         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3266                                         fJV->contains(thisChar) ||
3267                                         fH2->contains(thisChar) ||
3268                                         fH3->contains(thisChar))) {
3269                                             continue;
3270                                         }
3271
3272         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3273             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3274                 continue;
3275         }
3276
3277         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3278             fJT->contains(thisChar)) {
3279                 continue;
3280         }
3281
3282         // LB 27 Treat a Korean Syllable Block the same as ID.
3283         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3284             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3285             fIN->contains(thisChar)) {
3286                 continue;
3287             }
3288         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3289             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3290             fPO->contains(thisChar)) {
3291                 continue;
3292             }
3293         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3294             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3295                 continue;
3296             }
3297
3298
3299
3300         // LB 28  Do not break between alphabetics (“at”).
3301         if (fAL->contains(prevChar) && fAL->contains(thisChar)) {
3302             continue;
3303         }
3304
3305         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
3306         if (fIS->contains(prevChar) && fAL->contains(thisChar)) {
3307             continue;
3308         }
3309
3310         //LB 30 Do not break between letters, numbers or ordinary symbols and opening or closing punctuation
3311         //      (AL | NU) x OP
3312         //       CL x (AL | NU)
3313         if ((fAL->contains(prevChar) || fNU->contains(prevChar)) &&
3314               fOP->contains(thisChar)) {
3315             continue;
3316         }
3317         if (fCL->contains(prevChar) &&
3318             (fAL->contains(thisChar) || fNU->contains(thisChar))) {
3319             continue;
3320         }
3321
3322
3323         // LB 31    Break everywhere else
3324         break;
3325
3326     }
3327
3328     return pos;
3329 }
3330
3331
3332 UVector  *RBBILineMonkey::charClasses() {
3333     return fSets;
3334 }
3335
3336
// Destructor.  Deletes every object held through this monkey's pointer members:
//   the vector returned by charClasses(), the individual character sets, and
//   the two helper objects.
RBBILineMonkey::~RBBILineMonkey() {
    // The vector handed out by charClasses().
    // NOTE(review): it is deleted before the sets below, so it presumably holds
    //   non-owning pointers to them - confirm no element deleter is installed.
    delete fSets;

    // The per-class character sets.  next() tests characters against these with
    //   contains() while applying the line break rules; the two-letter suffixes
    //   presumably are the line break class names - confirm against the header.
    delete fBK;
    delete fCR;
    delete fLF;
    delete fCM;
    delete fNL;
    delete fWJ;
    delete fZW;
    delete fGL;
    delete fCB;
    delete fSP;
    delete fB2;
    delete fBA;
    delete fBB;
    delete fHY;
    delete fH2;
    delete fH3;
    delete fCL;
    delete fEX;
    delete fIN;
    delete fJL;
    delete fJV;
    delete fJT;
    delete fNS;
    delete fOP;
    delete fQU;
    delete fIS;
    delete fNU;
    delete fPO;
    delete fPR;
    delete fSY;
    delete fAI;
    delete fAL;
    delete fID;
    delete fSA;
    delete fSG;
    delete fXX;

    // Helper objects.  fNumberMatcher is the matcher next() runs for rule LB 25;
    //   fCharBI is presumably a character break iterator - confirm.
    delete fCharBI;
    delete fNumberMatcher;
}
3380
3381
//-------------------------------------------------------------------------------------------
//
//   TestMonkey
//
//     params
//       seed=nnnnn        Random number starting seed.
//                         Setting the seed allows errors to be reproduced.
//       loop=nnn          Looping count.  Controls running time.
//                         -1:  run forever.
//                          0 or greater:  run length.
//
//       type = char | word | line | sent | title
//                         Break iterator type to test.  When no type is given,
//                         char, word, line and sent are all run.
//                         (title is accepted by the option parser, but no test
//                         is currently run for it.)
//
//       utext             Run the tests with the useUText flag set.
//
//-------------------------------------------------------------------------------------------
3396
3397 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3398     int32_t val = defaultVal;
3399     name.append(" *= *(-?\\d+)");
3400     UErrorCode status = U_ZERO_ERROR;
3401     RegexMatcher m(name, params, 0, status);
3402     if (m.find()) {
3403         // The param exists.  Convert the string to an int.
3404         char valString[100];
3405         int32_t paramLength = m.end(1, status) - m.start(1, status);
3406         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3407             paramLength = (int32_t)(sizeof(valString)-2);
3408         }
3409         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3410         val = strtol(valString,  NULL, 10);
3411
3412         // Delete this parameter from the params string.
3413         m.reset();
3414         params = m.replaceFirst("", status);
3415     }
3416     U_ASSERT(U_SUCCESS(status));
3417     return val;
3418 }
3419 #endif
3420
3421 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3422                                     BreakIterator *bi,
3423                                     int expected[],
3424                                     int expectedcount)
3425 {
3426     int count = 0;
3427     int i = 0;
3428     int forward[50];
3429     bi->setText(ustr);
3430     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3431         forward[count] = i;
3432         if (count < expectedcount && expected[count] != i) {
3433             test->errln("break forward test failed: expected %d but got %d",
3434                         expected[count], i);
3435             break;
3436         }
3437         count ++;
3438     }
3439     if (count != expectedcount) {
3440         printStringBreaks(ustr, expected, expectedcount);
3441         test->errln("break forward test failed: missed %d match",
3442                     expectedcount - count);
3443         return;
3444     }
3445     // testing boundaries
3446     for (i = 1; i < expectedcount; i ++) {
3447         int j = expected[i - 1];
3448         if (!bi->isBoundary(j)) {
3449             printStringBreaks(ustr, expected, expectedcount);
3450             test->errln("isBoundary() failed.  Expected boundary at position %d", j);
3451             return;
3452         }
3453         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3454             if (bi->isBoundary(j)) {
3455                 printStringBreaks(ustr, expected, expectedcount);
3456                 test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
3457                 return;
3458             }
3459         }
3460     }
3461
3462     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3463         count --;
3464         if (forward[count] != i) {
3465             test->errln("happy break test previous() failed: expected %d but got %d",
3466                         forward[count], i);
3467             break;
3468         }
3469     }
3470     if (count != 0) {
3471         printStringBreaks(ustr, expected, expectedcount);
3472         test->errln("break test previous() failed: missed a match");
3473         return;
3474     }
3475
3476     // testing preceding
3477     for (i = 0; i < expectedcount - 1; i ++) {
3478         // int j = expected[i] + 1;
3479         int j = ustr.moveIndex32(expected[i], 1);
3480         for (; j <= expected[i + 1]; j ++) {
3481             if (bi->preceding(j) != expected[i]) {
3482                 printStringBreaks(ustr, expected, expectedcount);
3483                 test->errln("preceding(): Not expecting boundary at position %d", j);
3484                 return;
3485             }
3486         }
3487     }
3488 }
3489
3490 void RBBITest::TestWordBreaks(void)
3491 {
3492 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3493
3494     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3495     Locale        locale("en");
3496     UErrorCode    status = U_ZERO_ERROR;
3497     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3498     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3499     UChar         str[300];
3500     static const char *strlist[] =
3501     {
3502     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3503     "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
3504     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u179c\\u0027\\U000e0061\\u003a",
3505     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3506     "\\u90ca\\u3588\\u009c\\u0953\\u194b",
3507     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3508     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3509     "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
3510     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3511     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3512     "\\u2027\\U000e0067\\u0a47\\u00b7",
3513     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3514     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3515     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3516     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3517     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3518     "\\u0027\\u11af\\U000e0057\\u0602",
3519     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3520     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3521     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3522     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3523     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3524     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3525     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3526     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3527     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3528     "\\u58f4\\U000e0049\\u20e7\\u2027",
3529     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3530     "\\ua183\\u102d\\u0bec\\u003a",
3531     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3532     "\\u003a\\u0e57\\u0fad\\u002e",
3533     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3534     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3535     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3536     "\\u003a\\u0664\\u00b7\\u1fba",
3537     "\\u003b\\u0027\\u00b7\\u47a3",
3538     "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
3539     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3540     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3541     };
3542     int loop;
3543     if (U_FAILURE(status)) {
3544         errln("Creation of break iterator failed %s", u_errorName(status));
3545         return;
3546     }
3547     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3548         // printf("looping %d\n", loop);
3549         u_unescape(strlist[loop], str, 25);
3550         UnicodeString ustr(str);
3551         // RBBICharMonkey monkey;
3552         RBBIWordMonkey monkey;
3553
3554         int expected[50];
3555         int expectedcount = 0;
3556
3557         monkey.setText(ustr);
3558         int i;
3559         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3560             expected[expectedcount ++] = i;
3561         }
3562
3563         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3564     }
3565     delete bi;
3566 #endif
3567 }
3568
3569 void RBBITest::TestWordBoundary(void)
3570 {
3571     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3572     Locale        locale("en");
3573     UErrorCode    status = U_ZERO_ERROR;
3574     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3575     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3576     UChar         str[50];
3577     static const char *strlist[] =
3578     {
3579     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3580     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3581     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3582     "\\u2027\\U000e0067\\u0a47\\u00b7",
3583     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3584     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3585     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3586     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3587     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3588     "\\u0027\\u11af\\U000e0057\\u0602",
3589     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3590     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3591     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3592     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3593     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3594     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3595     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3596     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3597     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3598     "\\u58f4\\U000e0049\\u20e7\\u2027",
3599     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3600     "\\ua183\\u102d\\u0bec\\u003a",
3601     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3602     "\\u003a\\u0e57\\u0fad\\u002e",
3603     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3604     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3605     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3606     "\\u003a\\u0664\\u00b7\\u1fba",
3607     "\\u003b\\u0027\\u00b7\\u47a3",
3608     };
3609     int loop;
3610     if (U_FAILURE(status)) {
3611         errln("Creation of break iterator failed %s", u_errorName(status));
3612         return;
3613     }
3614     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3615         // printf("looping %d\n", loop);
3616         u_unescape(strlist[loop], str, 20);
3617         UnicodeString ustr(str);
3618         int forward[50];
3619         int count = 0;
3620
3621         bi->setText(ustr);
3622         int prev = 0;
3623         int i;
3624         for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3625             forward[count ++] = i;
3626             if (i > prev) {
3627                 int j;
3628                 for (j = prev + 1; j < i; j ++) {
3629                     if (bi->isBoundary(j)) {
3630                         printStringBreaks(ustr, forward, count);
3631                         errln("happy boundary test failed: expected %d not a boundary",
3632                                j);
3633                         return;
3634                     }
3635                 }
3636             }
3637             if (!bi->isBoundary(i)) {
3638                 printStringBreaks(ustr, forward, count);
3639                 errln("happy boundary test failed: expected %d a boundary",
3640                        i);
3641                 return;
3642             }
3643             prev = i;
3644         }
3645     }
3646     delete bi;
3647 }
3648
3649 void RBBITest::TestLineBreaks(void)
3650 {
3651 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3652     Locale        locale("en");
3653     UErrorCode    status = U_ZERO_ERROR;
3654     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3655     const int32_t  STRSIZE = 50;
3656     UChar         str[STRSIZE];
3657     static const char *strlist[] =
3658     {
3659      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3660      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3661              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3662      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3663              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3664      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3665      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3666      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3667      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3668      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3669      "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
3670      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3671      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3672      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3673      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3674      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3675      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3676      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3677      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3678      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3679      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3680      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3681      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3682      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3683      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3684      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3685      "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
3686      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3687      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3688      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3689      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3690      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3691      "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
3692      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3693      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3694      "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
3695      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3696      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3697      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3698      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3699      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3700      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3701          "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
3702          "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
3703          "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
3704      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3705          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3706     };
3707     int loop;
3708     TEST_ASSERT_SUCCESS(status);
3709     if (U_FAILURE(status)) {
3710         return;
3711     }
3712     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3713         // printf("looping %d\n", loop);
3714         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3715         if (t >= STRSIZE) {
3716             TEST_ASSERT(FALSE);
3717             continue;
3718         }
3719
3720
3721         UnicodeString ustr(str);
3722         RBBILineMonkey monkey;
3723         if (U_FAILURE(monkey.deferredStatus)) {
3724             continue;
3725         }
3726
3727         const int EXPECTEDSIZE = 50;
3728         int expected[EXPECTEDSIZE];
3729         int expectedcount = 0;
3730
3731         monkey.setText(ustr);
3732         int i;
3733         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3734             if (expectedcount >= EXPECTEDSIZE) {
3735                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3736                 return;
3737             }
3738             expected[expectedcount ++] = i;
3739         }
3740
3741         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3742     }
3743     delete bi;
3744 #endif
3745 }
3746
3747 void RBBITest::TestSentBreaks(void)
3748 {
3749 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3750     Locale        locale("en");
3751     UErrorCode    status = U_ZERO_ERROR;
3752     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3753     UChar         str[200];
3754     static const char *strlist[] =
3755     {
3756      "Now\ris\nthe\r\ntime\n\rfor\r\r",
3757      "This\n",
3758      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3759      "\"Sentence ending with a quote.\" Bye.",
3760      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
3761      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3762      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3763      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3764      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3765      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3766      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3767              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3768              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3769              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3770      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3771              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3772              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3773              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3774              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3775              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3776     };
3777     int loop;
3778     if (U_FAILURE(status)) {
3779         errln("Creation of break iterator failed %s", u_errorName(status));
3780         return;
3781     }
3782     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3783         u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
3784         UnicodeString ustr(str);
3785
3786         RBBISentMonkey monkey;
3787         if (U_FAILURE(monkey.deferredStatus)) {
3788             continue;
3789         }
3790
3791         const int EXPECTEDSIZE = 50;
3792         int expected[EXPECTEDSIZE];
3793         int expectedcount = 0;
3794
3795         monkey.setText(ustr);
3796         int i;
3797         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3798             if (expectedcount >= EXPECTEDSIZE) {
3799                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3800                 return;
3801             }
3802             expected[expectedcount ++] = i;
3803         }
3804
3805         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3806     }
3807     delete bi;
3808 #endif
3809 }
3810
3811 void RBBITest::TestMonkey(char *params) {
3812 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3813
3814     UErrorCode     status    = U_ZERO_ERROR;
3815     int32_t        loopCount = 500;
3816     int32_t        seed      = 1;
3817     UnicodeString  breakType = "all";
3818     Locale         locale("en");
3819     UBool          useUText  = FALSE;
3820
3821     if (quick == FALSE) {
3822         loopCount = 10000;
3823     }
3824
3825     if (params) {
3826         UnicodeString p(params);
3827         loopCount = getIntParam("loop", p, loopCount);
3828         seed      = getIntParam("seed", p, seed);
3829
3830         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3831         if (m.find()) {
3832             breakType = m.group(1, status);
3833             m.reset();
3834             p = m.replaceFirst("", status);
3835         }
3836
3837         RegexMatcher u(" *utext", p, 0, status);
3838         if (u.find()) {
3839             useUText = TRUE;
3840             u.reset();
3841             p = u.replaceFirst("", status);
3842         }
3843
3844
3845         // m.reset(p);
3846         if (RegexMatcher("\\S", p, 0, status).find()) {
3847             // Each option is stripped out of the option string as it is processed.
3848             // All options have been checked.  The option string should have been completely emptied..
3849             char buf[100];
3850             p.extract(buf, sizeof(buf), NULL, status);
3851             buf[sizeof(buf)-1] = 0;
3852             errln("Unrecognized or extra parameter:  %s\n", buf);
3853             return;
3854         }
3855
3856     }
3857
3858     if (breakType == "char" || breakType == "all") {
3859         RBBICharMonkey  m;
3860         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3861         if (U_SUCCESS(status)) {
3862             RunMonkey(bi, m, "char", seed, loopCount, useUText);
3863             if (breakType == "all" && useUText==FALSE) {
3864                 // Also run a quick test with UText when "all" is specified
3865                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3866             }
3867         }
3868         else {
3869             errln("Creation of character break iterator failed %s", u_errorName(status));
3870         }
3871         delete bi;
3872     }
3873
3874     if (breakType == "word" || breakType == "all") {
3875         logln("Word Break Monkey Test");
3876         RBBIWordMonkey  m;
3877         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
3878         if (U_SUCCESS(status)) {
3879             RunMonkey(bi, m, "word", seed, loopCount, useUText);
3880         }
3881         else {
3882             errln("Creation of word break iterator failed %s", u_errorName(status));
3883         }
3884         delete bi;
3885     }
3886
3887     if (breakType == "line" || breakType == "all") {
3888         logln("Line Break Monkey Test");
3889         RBBILineMonkey  m;
3890         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
3891         if (loopCount >= 10) {
3892             loopCount = loopCount / 5;   // Line break runs slower than the others.
3893         }
3894         if (U_SUCCESS(status)) {
3895             RunMonkey(bi, m, "line", seed, loopCount, useUText);
3896         }
3897         else {
3898             errln("Creation of line break iterator failed %s", u_errorName(status));
3899         }
3900         delete bi;
3901     }
3902
3903     if (breakType == "sent" || breakType == "all"  ) {
3904         logln("Sentence Break Monkey Test");
3905         RBBISentMonkey  m;
3906         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
3907         if (loopCount >= 10) {
3908             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
3909         }
3910         if (U_SUCCESS(status)) {
3911             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
3912         }
3913         else {
3914             errln("Creation of line break iterator failed %s", u_errorName(status));
3915         }
3916         delete bi;
3917     }
3918
3919 #endif
3920 }
3921
//
//  Run a RBBI monkey test.  Common routine, for all break iterator types.
//    Parameters:
//       bi      - the break iterator to use
//       mk      - MonkeyKind, abstraction for obtaining expected results
//       name    - Name of test (char, word, etc.) for use in error messages
//       seed    - Seed for starting random number generator (parameter from user)
//       numIterations
//               - Number of random test strings to generate and check.
//                 -1 means run forever.
//       useUText
//               - Flag passed through from TestMonkey; TRUE when the "utext"
//                 option was given.
//
3931 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
3932                          int32_t numIterations, UBool useUText) {
3933
3934 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3935
3936     const int32_t    TESTSTRINGLEN = 500;
3937     UnicodeString    testText;
3938     int32_t          numCharClasses;
3939     UVector          *chClasses;
3940     int              expected[TESTSTRINGLEN*2 + 1];
3941     int              expectedCount = 0;
3942     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
3943     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
3944     char             reverseBreaks[TESTSTRINGLEN*2+1];
3945     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
3946     char             followingBreaks[TESTSTRINGLEN*2+1];
3947     char             precedingBreaks[TESTSTRINGLEN*2+1];
3948     int              i;
3949     int              loopCount = 0;
3950
3951     m_seed = seed;
3952
3953     numCharClasses = mk.charClasses()->size();
3954     chClasses      = mk.charClasses();
3955
3956     // Check for errors that occured during the construction of the MonkeyKind object.
3957     //  Can't report them where they occured because errln() is a method coming from intlTest,
3958     //  and is not visible outside of RBBITest :-(
3959     if (U_FAILURE(mk.deferredStatus)) {
3960         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
3961         return;
3962     }
3963
3964     // Verify that the character classes all have at least one member.
3965     for (i=0; i<numCharClasses; i++) {
3966         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
3967         if (s == NULL || s->size() == 0) {
3968             errln("Character Class #%d is null or of zero size.", i);
3969             return;
3970         }
3971     }
3972
3973     while (loopCount < numIterations || numIterations == -1) {
3974         if (numIterations == -1 && loopCount % 10 == 0) {
3975             // If test is running in an infinite loop, display a periodic tic so
3976             //   we can tell that it is making progress.
3977             fprintf(stderr, ".");
3978         }
3979         // Save current random number seed, so that we can recreate the random numbers
3980         //   for this loop iteration in event of an error.
3981         seed = m_seed;
3982
3983         // Populate a test string with data.
3984         testText.truncate(0);
3985         for (i=0; i<TESTSTRINGLEN; i++) {
3986             int32_t  aClassNum = m_rand() % numCharClasses;
3987             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
3988             int32_t   charIdx = m_rand() % classSet->size();
3989             UChar32   c = classSet->charAt(charIdx);
3990             if (c < 0) {   // TODO:  deal with sets containing strings.
3991                 errln("c < 0");
3992                 break;
3993             }
3994             testText.append(c);
3995         }
3996
3997         // Calculate the expected results for this test string.
3998         mk.setText(testText);
3999         memset(expectedBreaks, 0, sizeof(expectedBreaks));
4000         expectedBreaks[0] = 1;
4001         int32_t breakPos = 0;
4002         expectedCount = 0;
4003         for (;;) {
4004             breakPos = mk.next(breakPos);
4005             if (breakPos == -1) {
4006                 break;
4007             }
4008             if (breakPos > testText.length()) {
4009                 errln("breakPos > testText.length()");
4010             }
4011             expectedBreaks[breakPos] = 1;
4012             U_ASSERT(expectedCount<testText.length());
4013             expected[expectedCount ++] = breakPos;
4014         }
4015
4016         // Find the break positions using forward iteration
4017         memset(forwardBreaks, 0, sizeof(forwardBreaks));
4018         if (useUText) {
4019             UErrorCode status = U_ZERO_ERROR;
4020             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4021             // testUText = utext_openUnicodeString(testUText, &testText, &status);
4022             bi->setText(testUText, status);
4023             TEST_ASSERT_SUCCESS(status);
4024             utext_close(testUText);   // The break iterator does a shallow clone of the UText
4025                                       //  This UText can be closed immediately, so long as the
4026                                       //  testText string continues to exist.
4027         } else {
4028             bi->setText(testText);
4029         }
4030
4031         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4032             if (i < 0 || i > testText.length()) {
4033                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4034                 break;
4035             }
4036             forwardBreaks[i] = 1;
4037         }
4038
4039         // Find the break positions using reverse iteration
4040         memset(reverseBreaks, 0, sizeof(reverseBreaks));
4041         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4042             if (i < 0 || i > testText.length()) {
4043                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4044                 break;
4045             }
4046             reverseBreaks[i] = 1;
4047         }
4048
4049         // Find the break positions using isBoundary() tests.
4050         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4051         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4052         for (i=0; i<=testText.length(); i++) {
4053             isBoundaryBreaks[i] = bi->isBoundary(i);
4054         }
4055
4056
4057         // Find the break positions using the following() function.
4058         // printf(".");
4059         memset(followingBreaks, 0, sizeof(followingBreaks));
4060         int32_t   lastBreakPos = 0;
4061         followingBreaks[0] = 1;
4062         for (i=0; i<testText.length(); i++) {
4063             breakPos = bi->following(i);
4064             if (breakPos <= i ||
4065                 breakPos < lastBreakPos ||
4066                 breakPos > testText.length() ||
4067                 breakPos > lastBreakPos && lastBreakPos > i ) {
4068                 errln("%s break monkey test: "
4069                     "Out of range value returned by BreakIterator::following().\n"
4070                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4071                          name, seed, i, breakPos, lastBreakPos);
4072                 break;
4073             }
4074             followingBreaks[breakPos] = 1;
4075             lastBreakPos = breakPos;
4076         }
4077
4078         // Find the break positions using the preceding() function.
4079         memset(precedingBreaks, 0, sizeof(followingBreaks));
4080         lastBreakPos = testText.length();
4081         precedingBreaks[testText.length()] = 1;
4082         for (i=testText.length(); i>0; i--) {
4083             breakPos = bi->preceding(i);
4084             if (breakPos >= i ||
4085                 breakPos > lastBreakPos ||
4086                 breakPos < 0 && testText.getChar32Start(i)>0 ||
4087                 breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i) ) {
4088                 errln("%s break monkey test: "
4089                     "Out of range value returned by BreakIterator::preceding().\n"
4090                     "index=%d;  prev returned %d; lastBreak=%d" ,
4091                     name,  i, breakPos, lastBreakPos);
4092                 precedingBreaks[i] = 2;   // Forces an error.
4093             } else {
4094                 precedingBreaks[breakPos] = 1;
4095                 lastBreakPos = breakPos;
4096             }
4097         }
4098
4099         // Compare the expected and actual results.
4100         for (i=0; i<=testText.length(); i++) {
4101             const char *errorType = NULL;
4102             if  (forwardBreaks[i] != expectedBreaks[i]) {
4103                 errorType = "next()";
4104             } else if (reverseBreaks[i] != forwardBreaks[i]) {
4105                 errorType = "previous()";
4106             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4107                 errorType = "isBoundary()";
4108             } else if (followingBreaks[i] != expectedBreaks[i]) {
4109                 errorType = "following()";
4110             } else if (precedingBreaks[i] != expectedBreaks[i]) {
4111                 errorType = "preceding()";
4112             }
4113
4114
4115             if (errorType != NULL) {
4116                 // Format a range of the test text that includes the failure as
4117                 //  a data item that can be included in the rbbi test data file.
4118
4119                 // Start of the range is the last point where expected and actual results
4120                 //   both agreed that there was a break position.
4121                 int startContext = i;
4122                 int32_t count = 0;
4123                 for (;;) {
4124                     if (startContext==0) { break; }
4125                     startContext --;
4126                     if (expectedBreaks[startContext] != 0) {
4127                         if (count == 2) break;
4128                         count ++;
4129                     }
4130                 }
4131
4132                 // End of range is two expected breaks past the start position.
4133                 int endContext = i + 1;
4134                 int ci;
4135                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4136                     for (;;) {
4137                         if (endContext >= testText.length()) {break;}
4138                         if (expectedBreaks[endContext-1] != 0) {
4139                             if (count == 0) break;
4140                             count --;
4141                         }
4142                         endContext ++;
4143                     }
4144                 }
4145
4146                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4147                 UnicodeString errorText = "<data>";
4148                 /***if (strcmp(errorType, "next()") == 0) {
4149                     startContext = 0;
4150                     endContext = testText.length();
4151
4152                     printStringBreaks(testText, expected, expectedCount);
4153                 }***/
4154
4155                 for (ci=startContext; ci<endContext;) {
4156                     UnicodeString hexChars("0123456789abcdef");
4157                     UChar32  c;
4158                     int      bn;
4159                     c = testText.char32At(ci);
4160                     if (ci == i) {
4161                         // This is the location of the error.
4162                         errorText.append("<?>");
4163                     } else if (expectedBreaks[ci] != 0) {
4164                         // This a non-error expected break position.
4165                         errorText.append("\\");
4166                     }
4167                     if (c < 0x10000) {
4168                         errorText.append("\\u");
4169                         for (bn=12; bn>=0; bn-=4) {
4170                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4171                         }
4172                     } else {
4173                         errorText.append("\\U");
4174                         for (bn=28; bn>=0; bn-=4) {
4175                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4176                         }
4177                     }
4178                     ci = testText.moveIndex32(ci, 1);
4179                 }
4180                 errorText.append("\\");
4181                 errorText.append("</data>\n");
4182
4183                 // Output the error
4184                 char  charErrorTxt[500];
4185                 UErrorCode status = U_ZERO_ERROR;
4186                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4187                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4188                 errln("%s break monkey test error.  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
4189                     name, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4190                     errorType, seed, i, charErrorTxt);
4191                 break;
4192             }
4193         }
4194
4195         loopCount++;
4196     }
4197 #endif
4198 }
4199
4200 //
4201 //  TestDebug    -  A place-holder test for debugging purposes.
4202 //                  For putting in fragments of other tests that can be invoked
4203 //                  for tracing  without a lot of unwanted extra stuff happening.
4204 //
4205 void RBBITest::TestDebug(void) {
4206 #if 0
4207     UErrorCode   status = U_ZERO_ERROR;
4208     int pos = 0;
4209     int ruleStatus = 0;
4210
4211     RuleBasedBreakIterator* bi =
4212        // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4213        // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4214        (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
4215     UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4216     // UnicodeString s("Aaa.  Bcd");
4217     s = s.unescape();
4218     bi->setText(s);
4219     UBool r = bi->isBoundary(8);
4220     printf("%s", r?"true":"false");
4221     return;
4222     pos = bi->last();
4223     do {
4224         // ruleStatus = bi->getRuleStatus();
4225         printf("%d\t%d\n", pos, ruleStatus);
4226         pos = bi->previous();
4227     } while (pos != BreakIterator::DONE);
4228 #endif
4229 }
4230
4231 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */