icuSources/test/intltest/rbbitst.cpp

   1 /********************************************************************
   2  * COPYRIGHT:
   3  * Copyright (c) 1999-2004, International Business Machines Corporation and
   4  * others. All Rights Reserved.
   5  ********************************************************************/
   6 /************************************************************************
   7 *   Date        Name        Description
   8 *   12/15/99    Madhu        Creation.
   9 *   01/12/2000  Madhu        Updated for changed API and added new tests
  10 ************************************************************************/
  11
  12 #include "unicode/utypes.h"
  13
  14 #if !UCONFIG_NO_BREAK_ITERATION
  15
  16 #include "unicode/utypes.h"
  17 #include "unicode/brkiter.h"
  18 #include "unicode/rbbi.h"
  19 #include "unicode/uchar.h"
  20 #include "unicode/utf16.h"
  21 #include "unicode/ucnv.h"
  22 #include "unicode/schriter.h"
  23 #include "unicode/uniset.h"
  24 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
  25 #include "unicode/ustring.h"
  26
  27 #include "intltest.h"
  28 #include "rbbitst.h"
  29 #include <string.h>
  30 #include "uvector.h"
  31 #include "uvectr32.h"
  32 #include <string.h>
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35
  36
  37
  38 //---------------------------------------------------------------------------
  39 //
  40 //   class BITestData   Holds a set of Break iterator test data and results
  41 //                      Includes
  42 //                         - the string data to be broken
  43 //                         - a vector of the expected break positions.
  44 //                         - a vector of source line numbers for the data,
//                               (to help see where errors occurred.)
  46 //                         - The expected break tag values.
  47 //                         - Vectors of actual break positions and tag values.
  48 //                         - Functions for comparing actual with expected and
  49 //                            reporting errors.
  50 //
  51 //----------------------------------------------------------------------------
  52 class BITestData {
  53 public:
  54     UnicodeString    fDataToBreak;
  55     UVector          fExpectedBreakPositions;
  56     UVector          fExpectedTags;
  57     UVector          fLineNum;
  58     UVector          fActualBreakPositions;   // Test Results.
  59     UVector          fActualTags;
  60
  61     BITestData(UErrorCode &status);
  62     void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
  63     void             checkResults(const char *heading, RBBITest *test);
  64     void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
  65     void             clearResults();
  66 };
  67
  68 //
  69 // Constructor.
  70 //
  71 BITestData::BITestData(UErrorCode &status)
  72 : fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
  73   fActualTags(status)
  74 {
  75 };
  76
  77 //
// addDataChunk.   Add a section (non-breaking) piece of data to the test data.
  79 //                 The macro form collects the line number, which is helpful
  80 //                 when tracking down failures.
  81 //
  82 //                 A null data item is inserted at the start of each test's data
  83 //                  to put the starting zero into the data list.  The position saved for
  84 //                  each non-null item is its ending position.
  85 //
  86 #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
  87 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
  88     if (U_FAILURE(status)) {return;}
  89     if (data != NULL) {
  90         fDataToBreak.append(CharsToUnicodeString(data));
  91     }
  92     fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
  93     fExpectedTags.addElement(tag, status);
  94     fLineNum.addElement(lineNum, status);
  95 };
  96
  97
  98 //
  99 //  checkResults.   Compare the actual and expected break positions, report any differences.
 100 //
 101 void BITestData::checkResults(const char *heading, RBBITest *test) {
 102     int32_t   expectedIndex = 0;
 103     int32_t   actualIndex = 0;
 104
 105     for (;;) {
 106         // If we've run through both the expected and actual results vectors, we're done.
 107         //   break out of the loop.
 108         if (expectedIndex >= fExpectedBreakPositions.size() &&
 109             actualIndex   >= fActualBreakPositions.size()) {
 110             break;
 111         }
 112
 113
 114         if (expectedIndex >= fExpectedBreakPositions.size()) {
 115             err(heading, test, expectedIndex-1, actualIndex);
 116             actualIndex++;
 117             continue;
 118         }
 119
 120         if (actualIndex >= fActualBreakPositions.size()) {
 121             err(heading, test, expectedIndex, actualIndex-1);
 122             expectedIndex++;
 123             continue;
 124         }
 125
 126         if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
 127             err(heading, test, expectedIndex, actualIndex);
 128             // Try to resync the positions of the indices, to avoid a rash of spurious erros.
 129             if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
 130                 actualIndex++;
 131             } else {
 132                 expectedIndex++;
 133             }
 134             continue;
 135         }
 136
 137         if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
 138             test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
 139                 heading, fLineNum.elementAt(expectedIndex),
 140                 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
 141         }
 142
 143         actualIndex++;
 144         expectedIndex++;
 145     }
 146 }
 147
 148 //
 149 //  err   -  An error was found.  Report it, along with information about where the
 150 //                                incorrectly broken test data appeared in the source file.
 151 //
 152 void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
 153 {
 154     int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
 155     int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
 156     int32_t   o        = 0;
 157     int32_t   line     = fLineNum.elementAti(expectedIdx);
 158     if (expectedIdx > 0) {
 159         // The line numbers are off by one because a premature break occurs somewhere
 160         //    within the previous item, rather than at the start of the current (expected) item.
 161         //    We want to report the offset of the unexpected break from the start of
 162         //      this previous item.
 163         o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
 164     }
 165     if (actual < expected) {
 166         test->errln("%s unexpected break at offset %d in test item from line %d", heading, o, line);
 167     } else {
 168         test->errln("%s Failed to find break at end of item from line %d", heading, line);
 169     }
 170 }
 171
 172
 173 void BITestData::clearResults() {
 174     fActualBreakPositions.removeAllElements();
 175     fActualTags.removeAllElements();
 176 }
 177
 178
 179 //-----------------------------------------------------------------------------------
 180 //
//    Canned Test Characters
 182 //
 183 //-----------------------------------------------------------------------------------
 184
 185 static const UChar cannedTestArray[] = {
 186     0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,
 187     0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,
 188     0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,
 189     0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,
 190     0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,
 191     0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,
 192     0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,
 193     0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000
 194 };
 195
 196 static UnicodeString* cannedTestChars = 0;
 197
 198 #define  halfNA     "\\u0928\\u094d\\u200d"
 199 #define  halfSA     "\\u0938\\u094d\\u200d"
 200 #define  halfCHA    "\\u091a\\u094d\\u200d"
 201 #define  halfKA     "\\u0915\\u094d\\u200d"
 202 #define  deadTA     "\\u0924\\u094d"
 203
 204 //--------------------------------------------------------------------------------------
 205 //
 206 //    RBBITest    constructor and destructor
 207 //
 208 //--------------------------------------------------------------------------------------
 209
 210 RBBITest::RBBITest() {
 211     UnicodeString temp(cannedTestArray);
 212     cannedTestChars = new UnicodeString();
 213     *cannedTestChars += (UChar)0x0000;
 214     *cannedTestChars += temp;
 215 }
 216
 217
 218 RBBITest::~RBBITest() {
 219     delete cannedTestChars;
 220 }
 221
 222
 223 static const int T_NUMBER = 100;
 224 static const int T_LETTER = 200;
 225 static const int T_H_OR_K = 300;
 226 static const int T_IDEO   = 400;
 227
 228
 229
 230
 231
 232
 233 //--------------------------------------------------------------------
 234 //Testing the BreakIterator for devanagari script
 235 //--------------------------------------------------------------------
 236
 237 #define deadRA   "\\u0930\\u094d"         /*deadform RA = devanagari RA + virama*/
 238 #define deadPHA  "\\u092b\\u094d"         /*deadform PHA = devanagari PHA + virama*/
 239 #define deadTTHA "\\u0920\\u094d"
 240 #define deadPA   "\\u092a\\u094d"
 241 #define deadSA   "\\u0938\\u094d"
 242 #define visarga  "\\u0903"                /*devanagari visarga looks like a english colon*/
 243
 244
 245
 246
 247
 248
 249 //-----------------------------------------------------------------------------------
 250 //
 251 //   Test for status {tag} return value from break rules.
 252 //        TODO:  a more thorough test.
 253 //
 254 //-----------------------------------------------------------------------------------
 255 void RBBITest::TestStatusReturn() {
 256      UnicodeString rulesString1 = "$Letters = [:L:];\n"
 257                                   "$Numbers = [:N:];\n"
 258                                   "$Letters+{1};\n"
 259                                   "$Numbers+{2};\n"
 260                                   "Help\\ {4}/me\\!;\n"
 261                                   "[^$Letters $Numbers];\n"
 262                                   "!.*;\n";
 263      UnicodeString testString1  = "abc123..abc Help me Help me!";
 264                                 // 01234567890123456789012345678
 265      int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
 266      int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
 267
 268      UErrorCode status=U_ZERO_ERROR;
 269      UParseError    parseError;
 270
 271      RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
 272      if(U_FAILURE(status)) {
 273          errln("FAIL : in construction");
 274      } else {
 275          int32_t  pos;
 276          int32_t  i = 0;
 277          bi->setText(testString1);
 278          for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
 279              if (pos != bounds1[i]) {
 280                  errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
 281                  break;
 282              }
 283
 284              int tag = bi->getRuleStatus();
 285              if (tag != brkStatus[i]) {
 286                  errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
 287                  break;
 288              }
 289              i++;
 290          }
 291      }
 292      delete bi;
 293 }
 294
 295
 296 static void printStringBreaks(UnicodeString ustr, int expected[],
 297                               int expectedcount)
 298 {
 299     UErrorCode status = U_ZERO_ERROR;
 300     char name[100];
 301     printf("code    alpha extend alphanum type line name\n");
 302     int j;
 303     for (j = 0; j < ustr.length(); j ++) {
 304         if (expectedcount > 0) {
 305             int k;
 306             for (k = 0; k < expectedcount; k ++) {
 307                 if (j == expected[k]) {
 308                     printf("------------------------------------------------ %d\n",
 309                            j);
 310                 }
 311             }
 312         }
 313         UChar32 c = ustr.char32At(j);
 314         if (c > 0xffff) {
 315             j ++;
 316         }
 317         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
 318         printf("%7x %5d %6d %8d %4s %4s %s\n", (int)c,
 319                            u_isUAlphabetic(c),
 320                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
 321                            u_isalnum(c),
 322                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
 323                                                   u_charType(c),
 324                                                   U_SHORT_PROPERTY_NAME),
 325                            u_getPropertyValueName(UCHAR_LINE_BREAK,
 326                                                   u_getIntPropertyValue(c,
 327                                                              UCHAR_LINE_BREAK),
 328                                                   U_SHORT_PROPERTY_NAME),
 329                            name);
 330     }
 331 }
 332
 333 void RBBITest::TestThaiLineBreak() {
 334     UErrorCode status = U_ZERO_ERROR;
 335     BITestData thaiLineSelection(status);
 336
 337     // \u0e2f-- the Thai paiyannoi character-- isn't a letter.  It's a symbol that
 338     // represents elided letters at the end of a long word.  It should be bound to
 339     // the end of the word and not treated as an independent punctuation mark.
 340
 341
 342     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
 343     ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status);
 344     ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status);
 345     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status);
 346     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status);
 347 //        ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status);
 348 //        ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
 349     ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status);
 350     // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us
 351     ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status);
 352     ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status);
 353     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status);
 354     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status);
 355     ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status);
 356     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status);
 357
 358     // the one time where the paiyannoi occurs somewhere other than at the end
 359     // of a word is in the Thai abbrevation for "etc.", which both begins and
 360     // ends with a paiyannoi
 361     ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status);
 362     ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
 363     ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status);
 364
 365     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
 366         Locale("th"), status);
 367     if (U_FAILURE(status))
 368     {
 369         errln("Failed to create the BreakIterator for Thai locale in TestThaiLineBreak.\n");
 370         return;
 371     }
 372
 373     generalIteratorTest(*e, thaiLineSelection);
 374     delete e;
 375 }
 376
 377
 378
 379 void RBBITest::TestMixedThaiLineBreak()
 380 {
 381     UErrorCode   status = U_ZERO_ERROR;
 382     BITestData   thaiLineSelection(status);
 383
 384     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
 385
 386     // Arabic numerals should always be separated from surrounding Thai text
 387 /*
 388         ADD_DATACHUNK(thaiLineSelection, "\\u0e04\\u0e48\\u0e32", 0, status);
 389         ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e07\\u0e34\\u0e19", 0, status);
 390         ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17", 0, status);
 391         ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e15\\u0e30", 0, status);
 392         ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e31\\u0e1a", 0, status);
 393         thaiLineSelection->addElement("39");
 394         ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17 ", 0, status);
 395
 396         // words in non-Thai scripts should always be separated from surrounding Thai text
 397         ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e14", 0, status);
 398         ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e2d\\u0e1a", 0, status);
 399         thaiLineSelection->addElement("Java");
 400         ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e19", 0, status);
 401         ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e04\\u0e23\\u0e37\\u0e48\\u0e2d\\u0e07", 0, status);
 402         ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e2d\\u0e1a\\u0e35\\u0e40\\u0e2d\\u0e47\\u0e21 ", 0, status);
 403
 404         // Thai numerals should always be separated from the text surrounding them
 405         ADD_DATACHUNK(thaiLineSelection, "\\u0e04\\u0e48\\u0e32", 0, status);
 406         ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e07\\u0e34\\u0e19", 0, status);
 407         ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17", 0, status);
 408         ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e15\\u0e30", 0, status);
 409         ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e31\\u0e1a", 0, status);
 410         ADD_DATACHUNK(thaiLineSelection, "\\u0e53\\u0e59", 0, status);
 411         ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17 ", 0, status);
 412
 413         // Thai text should interact correctly with punctuation and symbols
 414         ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e2d\\u0e1a\\u0e35\\u0e40\\u0e2d\\u0e47\\u0e21", 0, status);
 415 //        ADD_DATACHUNK(thaiLineSelection, "(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28", 0, status);
 416 //        ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e17\\u0e22)", 0, status);
 417 ADD_DATACHUNK(thaiLineSelection, "(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28\\u0e44\\u0e17\\u0e22)", 0, status);
 418 // I believe the commented-out reading above to be the correct one, but this is what passes with our current dictionary
 419         ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e33\\u0e01\\u0e31\\u0e14", 0, status);
 420         ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e1b\\u0e34\\u0e14", 0, status);
 421         ADD_DATACHUNK(thaiLineSelection, "\\u0e15\\u0e31\\u0e27\"", 0, status);
 422 */
 423
 424     // The Unicode Linebreak TR says do not break before or after quotes.
 425     //    So this test is changed ot not break around the quote.
 426     //    TODO:  should Thai break around the around the quotes, like the original behavior here?
 427 //    ADD_DATACHUNK(thaiLineSelection, "\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\"", 0, status);
 428 //    ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e38\\u0e48\\u0e19", 0, status);
 429       ADD_DATACHUNK(thaiLineSelection, "\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\""
 430                                                          "\\u0e23\\u0e38\\u0e48\\u0e19", 0, status);
 431
 432     ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
 433     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e14\\u0e37\\u0e2d\\u0e19\\u0e21\\u0e34.", 0, status);
 434     ADD_DATACHUNK(thaiLineSelection, "\\u0e22.", 0, status);
 435     ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e35\\u0e49", 0, status);
 436     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e32\\u0e04\\u0e32", 0, status);
 437     ADD_DATACHUNK(thaiLineSelection, "$200", 0, status);
 438     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e48\\u0e32", 0, status);
 439     ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19 ", 0, status);
 440     ADD_DATACHUNK(thaiLineSelection, "(\"\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\").", 0, status);
 441
 442     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status);
 443     if (U_FAILURE(status))
 444     {
 445         errln("Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak.\n");
 446         return;
 447     }
 448
 449
 450     generalIteratorTest(*e, thaiLineSelection);
 451     delete e;
 452 }
 453
 454
 455 void RBBITest::TestMaiyamok()
 456 {
 457     UErrorCode status = U_ZERO_ERROR;
 458     BITestData   thaiLineSelection(status);
 459     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
 460     // the Thai maiyamok character is a shorthand symbol that means "repeat the previous
 461     // word".  Instead of appearing as a word unto itself, however, it's kept together
 462     // with the word before it
 463     ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status);
 464     ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status);
 465     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status);
 466     ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07\\u0e40\\u0e17\\u0e1e", 0, status);
 467     ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status);
 468     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35\\u0e22\\u0e07", 0, status);
 469     ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
 470
 471     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
 472         Locale("th"), status);
 473
 474     if (U_FAILURE(status))
 475     {
 476         errln("Failed to create the BreakIterator for Thai locale in TestMaiyamok.\n");
 477         return;
 478     }
 479     generalIteratorTest(*e, thaiLineSelection);
 480     delete e;
 481 }
 482
 483 void RBBITest::TestThaiWordBreak() {
 484     UErrorCode status = U_ZERO_ERROR;
 485     BITestData   thaiWordSelection(status);
 486
 487     ADD_DATACHUNK(thaiWordSelection, NULL, 0, status);           // Break at start of data
 488     ADD_DATACHUNK(thaiWordSelection, "\\u0E1A\\u0E17", 0, status); //2
 489     ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E35\\u0E48", 0, status); //5
 490     ADD_DATACHUNK(thaiWordSelection, "\\u0E51", 0, status); //6
 491     ADD_DATACHUNK(thaiWordSelection, "\\u0E1E\\u0E32\\u0E22\\u0E38", 0, status); //10
 492     ADD_DATACHUNK(thaiWordSelection, "\\u0E44\\u0E0B\\u0E42\\u0E04\\u0E25\\u0E19", 0, status); //16
 493     ADD_DATACHUNK(thaiWordSelection, "\\u000D\\u000A", 0, status); //18
 494
 495     // This is the correct result
 496     //ADD_DATACHUNK(thaiWordSelection, "\\u0E42\\u0E14\\u0E42\\u0E23\\u0E18\\u0E35", 0, status); //24
 497     //ADD_DATACHUNK(thaiWordSelection, "\\u0E2D\\u0E32\\u0E28\\u0E31\\u0E22", 0, status); //29
 498
 499     // and this is what the dictionary does...
 500     ADD_DATACHUNK(thaiWordSelection, "\\u0E42\\u0E14", 0, status); // 20
 501     ADD_DATACHUNK(thaiWordSelection, "\\u0E42\\u0E23\\u0E18\\u0E35\\u0E2D\\u0E32\\u0E28\\u0E31\\u0E22", 0, status); //29
 502
 503     ADD_DATACHUNK(thaiWordSelection, "\\u0E2D\\u0E22\\u0E39\\u0E48", 0, status); //33
 504
 505     // This is the correct result
 506     //ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E48\\u0E32\\u0E21", 0, status); //37
 507     //ADD_DATACHUNK(thaiWordSelection, "\\u0E01\\u0E25\\u0E32\\u0E07", 0, status); //41
 508
 509     // and this is what the dictionary does
 510     ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E48\\u0E32\\u0E21\\u0E01\\u0E25\\u0E32\\u0E07", 0, status); //41
 511
 512     ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E38\\u0E48\\u0E07", 0, status); //45
 513     ADD_DATACHUNK(thaiWordSelection, "\\u0E43\\u0E2B\\u0E0D\\u0E48", 0, status); //49
 514     ADD_DATACHUNK(thaiWordSelection, "\\u0E43\\u0E19", 0, status); //51
 515
 516     // This is the correct result
 517     //ADD_DATACHUNK(thaiWordSelection, "\\u0E41\\u0E04\\u0E19\\u0E0B\\u0E31\\u0E2A", 0, status); //57
 518     //ADD_DATACHUNK(thaiWordSelection, "\\u0E01\\u0E31\\u0E1A", 0, status); //60
 519
 520     // and this is what the dictionary does
 521     ADD_DATACHUNK(thaiWordSelection, "\\u0E41\\u0E04\\u0E19", 0, status); // 54
 522     ADD_DATACHUNK(thaiWordSelection, "\\u0E0B\\u0E31\\u0E2A\\u0E01\\u0E31\\u0E1A", 0, status); //60
 523
 524     ADD_DATACHUNK(thaiWordSelection, "\\u0E25\\u0E38\\u0E07", 0, status); //63
 525
 526     // This is the correct result
 527     //ADD_DATACHUNK(thaiWordSelection, "\\u0E40\\u0E2E\\u0E19\\u0E23\\u0E35", 0, status); //68
 528     //ADD_DATACHUNK(thaiWordSelection, "\\u0E0A\\u0E32\\u0E27", 0, status); //71
 529     //ADD_DATACHUNK(thaiWordSelection, "\\u0E44\\u0E23\\u0E48", 0, status); //74
 530     //ADD_DATACHUNK(thaiWordSelection, "\\u0E41\\u0E25\\u0E30", 0, status); //77
 531
 532     // and this is what the dictionary does
 533     ADD_DATACHUNK(thaiWordSelection, "\\u0E40\\u0E2E", 0, status); // 65
 534     ADD_DATACHUNK(thaiWordSelection, "\\u0E19\\u0E23\\u0E35\\u0E0A\\u0E32\\u0E27\\u0E44\\u0E23\\u0E48\\u0E41\\u0E25\\u0E30", 0, status); //77
 535
 536     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(
 537         Locale("th"), status);
 538     if (U_FAILURE(status))
 539     {
 540         errln("Failed to create the BreakIterator for Thai locale in TestThaiWordBreak.\n");
 541         return;
 542     }
 543
 544     generalIteratorTest(*e, thaiWordSelection);
 545     delete e;
 546 }
 547
 548
 549 void RBBITest::TestBug3818() {
 550     UErrorCode  status = U_ZERO_ERROR;
 551
 552     // Four Thai words...
 553     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
 554                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
 555     UnicodeString  thaiStr(thaiWordData);
 556
 557     RuleBasedBreakIterator* bi =
 558         (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
 559     if (U_FAILURE(status) || bi == NULL) {
 560         errln("Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
 561         return;
 562     }
 563     bi->setText(thaiStr);
 564
 565     int32_t  startOfSecondWord = bi->following(1);
 566     if (startOfSecondWord != 4) {
 567         errln("Fail at file %s, line %d expected start of word at 4, got %d",
 568             __FILE__, __LINE__, startOfSecondWord);
 569     }
 570     startOfSecondWord = bi->following(0);
 571     if (startOfSecondWord != 4) {
 572         errln("Fail at file %s, line %d expected start of word at 4, got %d",
 573             __FILE__, __LINE__, startOfSecondWord);
 574     }
 575     delete bi;
 576 }
 577
 578
 579 void RBBITest::TestJapaneseWordBreak() {
 580     UErrorCode status = U_ZERO_ERROR;
 581     BITestData   japaneseWordSelection(status);
 582
 583     ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status);           // Break at start of data
 584     ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2
 585     ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5
 586     ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7
 587     ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10
 588     ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11
 589     ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12
 590
 591     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(
 592         Locale("ja"), status);
 593     if (U_FAILURE(status))
 594     {
 595         errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");
 596         return;
 597     }
 598
 599     generalIteratorTest(*e, japaneseWordSelection);
 600     delete e;
 601 }
 602
 603 //---------------------------------------------
 604 // runIndexedTest
 605 //---------------------------------------------
 606
 607 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
 608 {
 609     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
 610
 611     switch (index) {
 612         case 0: name = "TestBug4153072";
 613             if(exec) TestBug4153072();                         break;
 614         case 1: name = "TestJapaneseLineBreak";
 615             if(exec) TestJapaneseLineBreak();                 break;
 616         case 2: name = "TestStatusReturn";
 617             if(exec) TestStatusReturn();                       break;
 618
 619         case 3: name = "TestLineBreakData";
 620             if(exec) TestLineBreakData();                      break;
 621         case 4: name = "TestEmptyString";
 622             if(exec) TestEmptyString();                        break;
 623
 624         case 5: name = "TestGetAvailableLocales";
 625             if(exec) TestGetAvailableLocales();                break;
 626
 627         case 6: name = "TestGetDisplayName";
 628             if(exec) TestGetDisplayName();                     break;
 629
 630         case 7: name = "TestEndBehaviour";
 631             if(exec) TestEndBehaviour();                       break;
 632         case 8: name = "TestMixedThaiLineBreak";
 633              if(exec) TestMixedThaiLineBreak();                break;
 634         case 9: name = "TestThaiWordBreak";
 635              if(exec) TestThaiWordBreak();                     break;
 636         case 10: name = "TestThaiLineBreak";
 637              if(exec) TestThaiLineBreak();                     break;
 638         case 11: name = "TestMaiyamok";
 639              if(exec) TestMaiyamok();                          break;
 640         case 12: name = "TestWordBreaks";
 641              if(exec) TestWordBreaks();                        break;
 642         case 13: name = "TestWordBoundary";
 643              if(exec) TestWordBoundary();                      break;
 644         case 14: name = "TestLineBreaks";
 645              if(exec) TestLineBreaks();                        break;
 646         case 15: name = "TestSentBreaks";
 647              if(exec) TestSentBreaks();                        break;
 648         case 16: name = "TestExtended";
 649              if(exec) TestExtended();                          break;
 650         case 17: name = "TestMonkey";
 651              if(exec) {
 652  #if !UCONFIG_NO_REGULAR_EXPRESSIONS
 653                TestMonkey(params);
 654  #else
 655                logln("skipping TestMonkey (UCONFIG_NO_REGULAR_EXPRESSIONS)");
 656  #endif
 657              }
 658                                                                break;
 659         case 18: name = "TestBug3818";
 660             if(exec) TestBug3818();                            break;
 661         case 19: name = "TestJapaneseWordBreak";
 662             if(exec) TestJapaneseWordBreak();                  break;
 663
 664         default: name = ""; break; //needed to end loop
 665     }
 666 }
 667
 668
 669 //----------------------------------------------------------------------------
 670 //
 671 // generalIteratorTest      Given a break iterator and a set of test data,
 672 //                          Run the tests and report the results.
 673 //
 674 //----------------------------------------------------------------------------
 675 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
 676 {
 677
 678     bi.setText(td.fDataToBreak);
 679
 680     testFirstAndNext(bi, td);
 681
 682     testLastAndPrevious(bi, td);
 683
 684     testFollowing(bi, td);
 685     testPreceding(bi, td);
 686     testIsBoundary(bi, td);
 687     doMultipleSelectionTest(bi, td);
 688 }
 689
 690
 691 //
 692 //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
 693 //                       kind of loop.
 694 //
 695 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
 696 {
 697     UErrorCode  status = U_ZERO_ERROR;
 698     int32_t     p;
 699     int32_t     lastP = -1;
 700     int32_t     tag;
 701
 702     logln("Test first and next");
 703     bi.setText(td.fDataToBreak);
 704     td.clearResults();
 705
 706     for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
 707         td.fActualBreakPositions.addElement(p, status);  // Save result.
 708         tag = bi.getRuleStatus();
 709         td.fActualTags.addElement(tag, status);
 710         if (p <= lastP) {
 711             // If the iterator is not making forward progress, stop.
 712             //  No need to raise an error here, it'll be detected in the normal check of results.
 713             break;
 714         }
 715         lastP = p;
 716     }
 717     td.checkResults("testFirstAndNext", this);
 718 }
 719
 720
 721 //
 722 //  TestLastAndPrevious.   Run the iterator backwards, starting with last().
 723 //
 724 void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
 725 {
 726     UErrorCode  status = U_ZERO_ERROR;
 727     int32_t     p;
 728     int32_t     lastP  = 0x7ffffffe;
 729     int32_t     tag;
 730
 731     logln("Test first and next");
 732     bi.setText(td.fDataToBreak);
 733     td.clearResults();
 734
 735     for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
 736         // Save break position.  Insert it at start of vector of results, shoving
 737         //    already-saved results further towards the end.
 738         td.fActualBreakPositions.insertElementAt(p, 0, status);
 739         // bi.previous();   // TODO:  Why does this fix things up????
 740         // bi.next();
 741         tag = bi.getRuleStatus();
 742         td.fActualTags.insertElementAt(tag, 0, status);
 743         if (p >= lastP) {
 744             // If the iterator is not making progress, stop.
 745             //  No need to raise an error here, it'll be detected in the normal check of results.
 746             break;
 747         }
 748         lastP = p;
 749     }
 750     td.checkResults("testLastAndPrevious", this);
 751 }
 752
 753
 754 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
 755 {
 756     UErrorCode  status = U_ZERO_ERROR;
 757     int32_t     p;
 758     int32_t     tag;
 759     int32_t     lastP  = -2;     // A value that will never be returned as a break position.
 760                                  //   cannot be -1; that is returned for DONE.
 761     int         i;
 762
 763     logln("testFollowing():");
 764     bi.setText(td.fDataToBreak);
 765     td.clearResults();
 766
 767     // Save the starting point, since we won't get that out of following.
 768     p = bi.first();
 769     td.fActualBreakPositions.addElement(p, status);  // Save result.
 770     tag = bi.getRuleStatus();
 771     td.fActualTags.addElement(tag, status);
 772
 773     for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
 774         p = bi.following(i);
 775         if (p != lastP) {
 776             if (p == RuleBasedBreakIterator::DONE) {
 777                 break;
 778             }
 779             // We've reached a new break position.  Save it.
 780             td.fActualBreakPositions.addElement(p, status);  // Save result.
 781             tag = bi.getRuleStatus();
 782             td.fActualTags.addElement(tag, status);
 783             lastP = p;
 784         }
 785     }
 786     // The loop normally exits by means of the break in the middle.
 787     // Make sure that the index was at the correct position for the break iterator to have
 788     //   returned DONE.
 789     if (i != td.fDataToBreak.length()) {
 790         errln("testFollowing():  iterator returned DONE prematurely.");
 791     }
 792
 793     // Full check of all results.
 794     td.checkResults("testFollowing", this);
 795 }
 796
 797
 798
 799 void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
 800     UErrorCode  status = U_ZERO_ERROR;
 801     int32_t     p;
 802     int32_t     tag;
 803     int32_t     lastP  = 0x7ffffffe;
 804     int         i;
 805
 806     logln("testPreceding():");
 807     bi.setText(td.fDataToBreak);
 808     td.clearResults();
 809
 810     p = bi.last();
 811     td.fActualBreakPositions.addElement(p, status);
 812     tag = bi.getRuleStatus();
 813     td.fActualTags.addElement(tag, status);
 814
 815     for (i = td.fDataToBreak.length(); i>=-1; i--) {
 816         p = bi.preceding(i);
 817         if (p != lastP) {
 818             if (p == RuleBasedBreakIterator::DONE) {
 819                 break;
 820             }
 821             // We've reached a new break position.  Save it.
 822             td.fActualBreakPositions.insertElementAt(p, 0, status);
 823             lastP = p;
 824             tag = bi.getRuleStatus();
 825             td.fActualTags.insertElementAt(tag, 0, status);
 826         }
 827     }
 828     // The loop normally exits by means of the break in the middle.
 829     // Make sure that the index was at the correct position for the break iterator to have
 830     //   returned DONE.
 831     if (i != 0) {
 832         errln("testPreceding():  iterator returned DONE prematurely.");
 833     }
 834
 835     // Full check of all results.
 836     td.checkResults("testPreceding", this);
 837 }
 838
 839
 840
 841 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
 842     UErrorCode  status = U_ZERO_ERROR;
 843     int         i;
 844     int32_t     tag;
 845
 846     logln("testIsBoundary():");
 847     bi.setText(td.fDataToBreak);
 848     td.clearResults();
 849
 850     for (i = 0; i <= td.fDataToBreak.length(); i++) {
 851         if (bi.isBoundary(i)) {
 852             td.fActualBreakPositions.addElement(i, status);  // Save result.
 853             tag = bi.getRuleStatus();
 854             td.fActualTags.addElement(tag, status);
 855         }
 856     }
 857     td.checkResults("testIsBoundary: ", this);
 858 }
 859
 860
 861
 862 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
 863 {
 864     iterator.setText(td.fDataToBreak);
 865
 866     RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
 867     int32_t offset = iterator.first();
 868     int32_t testOffset;
 869     int32_t count = 0;
 870
 871     logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
 872
 873     if (*testIterator != iterator)
 874         errln("clone() or operator!= failed: two clones compared unequal");
 875
 876     do {
 877         testOffset = testIterator->first();
 878         testOffset = testIterator->next(count);
 879         if (offset != testOffset)
 880             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
 881
 882         if (offset != RuleBasedBreakIterator::DONE) {
 883             count++;
 884             offset = iterator.next();
 885
 886             if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
 887                 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
 888                 if (count > 10000 || offset == -1) {
 889                     errln("operator== failed too many times. Stopping test.");
 890                     if (offset == -1) {
 891                         errln("Does (RuleBasedBreakIterator::DONE == -1)?");
 892                     }
 893                     return;
 894                 }
 895             }
 896         }
 897     } while (offset != RuleBasedBreakIterator::DONE);
 898
 899     // now do it backwards...
 900     offset = iterator.last();
 901     count = 0;
 902
 903     do {
 904         testOffset = testIterator->last();
 905         testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
 906         if (offset != testOffset)
 907             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
 908
 909         if (offset != RuleBasedBreakIterator::DONE) {
 910             count--;
 911             offset = iterator.previous();
 912         }
 913     } while (offset != RuleBasedBreakIterator::DONE);
 914
 915     delete testIterator;
 916 }
 917
 918
 919
 920 //--------------------------------------------------------------------------------------------
 921 //
 922 //    Break Iterator Invariants Tests
 923 //
 924 //--------------------------------------------------------------------------------------------
 925
 926 void RBBITest::TestCharacterInvariants()
 927 {
 928     UErrorCode status = U_ZERO_ERROR;
 929     BreakIterator *e = BreakIterator::createCharacterInstance(Locale::getDefault(), status);
 930     if (U_FAILURE(status))
 931     {
 932         errln("Failed to create the BreakIterator for default locale in TestCharacterInvariants.\n");
 933         return;
 934     }
 935     UnicodeString s = *cannedTestChars + CharsToUnicodeString("\\u1100\\u1101\\u1102\\u1160\\u1161\\u1162\\u11a8\\u11a9\\u11aa");
 936     doBreakInvariantTest(*e, s);
 937     s = *cannedTestChars + CharsToUnicodeString("\\u1100\\u1101\\u1102\\u1160\\u1161\\u1162\\u11a8\\u11a9\\u11aa");
 938     doOtherInvariantTest(*e, s);
 939     delete e;
 940 }
 941
 942
 943 void RBBITest::TestWordInvariants()
 944 {
 945     UErrorCode status = U_ZERO_ERROR;
 946     BreakIterator *e = BreakIterator::createWordInstance(Locale::getDefault(), status);
 947     if (U_FAILURE(status))
 948     {
 949         errln("Failed to create the BreakIterator for default locale in TestWordInvariants.\n");
 950         return;
 951     }
 952     UnicodeString s = *cannedTestChars + CharsToUnicodeString("\',.\\u3041\\u3042\\u3043\\u309b\\u309c\\u30a1\\u30a2\\u30a3\\u4e00\\u4e01\\u4e02");
 953     doBreakInvariantTest(*e, s);
 954     s = *cannedTestChars + CharsToUnicodeString("\',.\\u3041\\u3042\\u3043\\u309b\\u309c\\u30a1\\u30a2\\u30a3\\u4e00\\u4e01\\u4e02");
 955     doOtherInvariantTest(*e, s);
 956     delete e;
 957 }
 958
 959
 960 void RBBITest::TestSentenceInvariants()
 961 {
 962     UErrorCode status = U_ZERO_ERROR;
 963     BreakIterator *e = BreakIterator::createSentenceInstance(Locale::getDefault(), status);
 964     if (U_FAILURE(status))
 965     {
 966         errln("Failed to create the BreakIterator for default locale in TestSentenceInvariant.\n");
 967         return;
 968     }
 969     UnicodeString s = *cannedTestChars + CharsToUnicodeString(".,\\u3001\\u3002\\u3041\\u3042\\u3043\\ufeff");
 970     doOtherInvariantTest(*e, s);
 971     delete e;
 972 }
 973
 974
 975
 976
 977 void RBBITest::doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars)
 978 {
 979     UnicodeString work("aaa");
 980     int32_t errCount = 0, testCharsLen = testChars.length(), breaksLen;
 981
 982     // a break should always occur after CR (unless followed by LF), LF, PS, and LS
 983     UnicodeString breaks = CharsToUnicodeString("\r\n\\u2029\\u2028");
 984     int32_t i, j;
 985
 986     breaksLen = breaks.length();
 987     for (i = 0; i < breaksLen; i++) {
 988         UChar c1 = breaks[i];
 989         work.setCharAt(1, c1);
 990         for (j = 0; j < testCharsLen; j++) {
 991             UChar c0 = testChars[j];
 992             work.setCharAt(0, c0);
 993             int k;
 994             for (k = 0; k < testCharsLen; k++) {
 995                 UChar c2 = testChars[k];
 996                 work.setCharAt(2, c2);
 997
 998                 // if a cr is followed by lf, ps, ls or etx, don't do the check (that's
 999                 // not supposed to work)
1000                 if (c1 == '\r' && (c2 == '\n' || c2 == 0x2029
1001                         || c2 == 0x2028 || c2 == 0x0003))
1002                     continue;
1003
1004                 if (u_charType(c1) == U_CONTROL_CHAR &&
1005                     (u_charType(c2) == U_NON_SPACING_MARK ||
1006                      u_charType(c2) == U_ENCLOSING_MARK ||
1007                      u_charType(c2) == U_COMBINING_SPACING_MARK)
1008                     ) {
1009                     // Combining marks don't combine with controls.
1010                     //  TODO:  enhance test to verify that the break actually occurs,
1011                     //         not just ignore the case.
1012                     continue;
1013                 }
1014
1015
1016                 tb.setText(work);
1017                 UBool seen2 = FALSE;
1018                 int l;
1019                 for (l = tb.first(); l != BreakIterator::DONE; l = tb.next()) {
1020                     if (l == 2) {
1021                         seen2 = TRUE;
1022                         break;
1023                     }
1024                 }
1025                 if (!seen2) {
1026                     printStringBreaks(work, NULL, 0);
1027                     errln("No Break between \\U%04x and \\U%04x", c1, c2);
1028                     errCount++;
1029                     if (errCount >= 75)
1030                         return;
1031                 }
1032             }
1033         }
1034     }
1035 }
1036
1037
1038
1039 void RBBITest::doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars)
1040 {
1041     UnicodeString work("a\r\na");
1042     int32_t errCount = 0, testCharsLen = testChars.length();
1043     int32_t i, j;
1044     int8_t type;
1045
1046     // a break should never occur between CR and LF
1047     for (i = 0; i < testCharsLen; i++) {
1048         work.setCharAt(0, testChars[i]);
1049         for (j = 0; j < testCharsLen; j++) {
1050             work.setCharAt(3, testChars[j]);
1051             tb.setText(work);
1052             int32_t k;
1053             for (k = tb.first(); k != BreakIterator::DONE; k = tb.next())
1054                 if (k == 2) {
1055                     errln("Break between CR and LF in string U\\%04x U\\%04x U\\%04x U\\%04x",
1056                         work[0], work[1], work[2], work[3]);
1057                     errCount++;
1058                     if (errCount >= 75)
1059                         return;
1060                 }
1061         }
1062     }
1063
1064     // a break should never occur before a non-spacing mark, unless the preceding
1065     // character is CR, LF, PS, or LS
1066     //   Or the general category == Control.
1067     work.remove();
1068     work += "aaaa";
1069     for (i = 0; i < testCharsLen; i++) {
1070         UChar c1 = testChars[i];
1071         if (c1 == '\n' || c1 == '\r' || c1 == 0x2029 || c1 == 0x2028 || c1 == 0x0003 ||
1072             u_charType(c1) == U_CONTROL_CHAR  ||  u_charType(c1) == U_FORMAT_CHAR) {
1073             continue;
1074         }
1075         work.setCharAt(1, c1);
1076         for (j = 0; j < testCharsLen; j++) {
1077             UChar c2 = testChars[j];
1078             type = u_charType(c2);
1079             if ((type != U_NON_SPACING_MARK) &&
1080                 (type != U_ENCLOSING_MARK)) {
1081                 continue;
1082             }
1083             work.setCharAt(2, c2);
1084             tb.setText(work);
1085             int k;
1086             for (k = tb.first(); k != BreakIterator::DONE; k = tb.next())
1087                 if (k == 2) {
1088                     //errln("Break between U+" + UCharToUnicodeString(work[1])
1089                     //        + " and U+" + UCharToUnicodeString(work[2]));
1090                     errln("Unexpected Break between %6x and %6x", c1, c2);
1091                     errCount++;
1092                     if (errCount >= 75)
1093                         return;
1094                 }
1095         }
1096     }
1097 }
1098
1099
1100
1101
1102 //---------------------------------------------
1103 //
1104 //     other tests
1105 //
1106 //---------------------------------------------
1107 void RBBITest::TestEmptyString()
1108 {
1109     UnicodeString text = "";
1110     UErrorCode status = U_ZERO_ERROR;
1111
1112     BITestData x(status);
1113     ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
1114     RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
1115     if (U_FAILURE(status))
1116     {
1117         errln("Failed to create the BreakIterator for default locale in TestEmptyString.\n");
1118         return;
1119     }
1120     generalIteratorTest(*bi, x);
1121     delete bi;
1122 }
1123
1124 void RBBITest::TestGetAvailableLocales()
1125 {
1126     int32_t locCount = 0;
1127     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
1128
1129     if (locCount == 0)
1130         errln("getAvailableLocales() returned an empty list!");
1131     // Just make sure that it's returning good memory.
1132     int32_t i;
1133     for (i = 0; i < locCount; ++i) {
1134         logln(locList[i].getName());
1135     }
1136 }
1137
1138 //Testing the BreakIterator::getDisplayName() function
1139 void RBBITest::TestGetDisplayName()
1140 {
1141     UnicodeString   result;
1142
1143     BreakIterator::getDisplayName(Locale::getUS(), result);
1144     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
1145         errln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
1146                 + result);
1147
1148     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
1149     if (result != "French (France)")
1150         errln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
1151                 + result);
1152 }
1153 /**
1154  * Test End Behaviour
1155  * @bug 4068137
1156  */
1157 void RBBITest::TestEndBehaviour()
1158 {
1159     UErrorCode status = U_ZERO_ERROR;
1160     UnicodeString testString("boo.");
1161     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
1162     if (U_FAILURE(status))
1163     {
1164         errln("Failed to create the BreakIterator for default locale in TestEndBehaviour.\n");
1165         return;
1166     }
1167     wb->setText(testString);
1168
1169     if (wb->first() != 0)
1170         errln("Didn't get break at beginning of string.");
1171     if (wb->next() != 3)
1172         errln("Didn't get break before period in \"boo.\"");
1173     if (wb->current() != 4 && wb->next() != 4)
1174         errln("Didn't get break at end of string.");
1175     delete wb;
1176 }
1177 /*
1178  * @bug 4153072
1179  */
1180 void RBBITest::TestBug4153072() {
1181     UErrorCode status = U_ZERO_ERROR;
1182     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
1183     if (U_FAILURE(status))
1184     {
1185         errln("Failed to create the BreakIterator for default locale in TestBug4153072\n");
1186         return;
1187     }
1188     UnicodeString str("...Hello, World!...");
1189     int32_t begin = 3;
1190     int32_t end = str.length() - 3;
1191     UBool dummy;
1192
1193     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
1194     iter->adoptText(textIterator);
1195     int index;
1196     for (index = -1; index < begin + 1; ++index) {
1197         dummy = iter->isBoundary(index);
1198         if (index < begin && dummy == TRUE) {
1199             errln((UnicodeString)"Didn't handle preceeding correctly with offset = " + index +
1200                             " and begin index = " + begin);
1201         }
1202     }
1203     delete iter;
1204 }
1205
1206
1207 /**
1208  * Test Japanese Line Break
1209  * @bug 4095322
1210  */
1211 void RBBITest::TestJapaneseLineBreak()
1212 {
1213 #if 0
1214     // Test needs updating some more...   Dump it for now.
1215
1216
1217     // Change for Unicode TR 14:  Punctuation characters with categories Pi and Pf do not count
1218     //        as opening and closing punctuation for line breaking.
1219     //        Also, \u30fc and \u30fe are not counted as hyphens.   Remove these chars
1220     //        from these tests.    6-13-2002
1221     //
1222     UErrorCode status = U_ZERO_ERROR;
1223     UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c");
1224     UnicodeString precedingChars = CharsToUnicodeString(
1225         //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
1226         "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");
1227     UnicodeString followingChars = CharsToUnicodeString(
1228         // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"
1229         ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"
1230         // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"
1231         ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"
1232         "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");
1233     BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status);
1234
1235     int32_t i;
1236     if (U_FAILURE(status))
1237     {
1238         errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");
1239         return;
1240     }
1241
1242     for (i = 0; i < precedingChars.length(); i++) {
1243         testString.setCharAt(1, precedingChars[i]);
1244         iter->setText(testString);
1245         int32_t j = iter->first();
1246         if (j != 0)
1247             errln("ja line break failure: failed to start at 0");
1248         j = iter->next();
1249         if (j != 1)
1250             errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i])
1251                         + "' (" + ((int)(precedingChars[i])) + ")");
1252         j = iter->next();
1253         if (j != 3)
1254             errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i])
1255                         + "' (" + ((int)(precedingChars[i])) + ")");
1256     }
1257
1258     for (i = 0; i < followingChars.length(); i++) {
1259         testString.setCharAt(1, followingChars[i]);
1260         iter->setText(testString);
1261         int j = iter->first();
1262         if (j != 0)
1263             errln("ja line break failure: failed to start at 0");
1264         j = iter->next();
1265         if (j != 2)
1266             errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i])
1267                         + "' (" + ((int)(followingChars[i])) + ")");
1268         j = iter->next();
1269         if (j != 3)
1270             errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i])
1271                         + "' (" + ((int)(followingChars[i])) + ")");
1272     }
1273     delete iter;
1274 #endif
1275 }
1276
1277
1278 //------------------------------------------------------------------------------
1279 //
1280 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
1281 //
1282 //------------------------------------------------------------------------------
1283
1284 struct TestParams {
1285     BreakIterator   *bi;
1286     UnicodeString    dataToBreak;
1287     UVector32       *expectedBreaks;
1288     UVector32       *srcLine;
1289     UVector32       *srcCol;
1290 };
1291
1292 void RBBITest::executeTest(TestParams *t) {
1293     int32_t    bp;
1294     int32_t    prevBP;
1295     int32_t    i;
1296
1297     t->bi->setText(t->dataToBreak);
1298     //
1299     //  Run the iterator forward
1300     //
1301     prevBP = -1;
1302     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
1303         if (prevBP ==  bp) {
1304             // Fail for lack of forward progress.
1305             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
1306                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1307             break;
1308         }
1309
1310         // Check that there were we didn't miss an expected break between the last one
1311         //  and this one.
1312         for (i=prevBP+1; i<bp; i++) {
1313             if (t->expectedBreaks->elementAti(i) != 0) {
1314                 int expected[] = {0, i};
1315                 printStringBreaks(t->dataToBreak, expected, 2);
1316                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1317                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1318             }
1319         }
1320
1321         // Check that the break we did find was expected
1322         if (t->expectedBreaks->elementAti(bp) == 0) {
1323             int expected[] = {0, bp};
1324             printStringBreaks(t->dataToBreak, expected, 2);
1325             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1326                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1327         } else {
1328             // The break was expected.
1329             //   Check that the {nnn} tag value is correct.
1330             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
1331             if (expectedTagVal == -1) {
1332                 expectedTagVal = 0;
1333             }
1334             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1335             if (rs != expectedTagVal) {
1336                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
1337                       "          Actual, Expected status = %4d, %4d",
1338                     bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp), rs, expectedTagVal);
1339             }
1340         }
1341
1342
1343         prevBP = bp;
1344     }
1345
1346     // Verify that there were no missed expected breaks after the last one found
1347     for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
1348         if (t->expectedBreaks->elementAti(i) != 0) {
1349             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1350                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1351         }
1352     }
1353
1354     //
1355     //  Run the iterator backwards, verify that the same breaks are found.
1356     //
1357     prevBP = t->dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
1358     for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
1359         if (prevBP ==  bp) {
1360             // Fail for lack of progress.
1361             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
1362                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1363             break;
1364         }
1365
1366         // Check that there were we didn't miss an expected break between the last one
1367         //  and this one.  (UVector returns zeros for index out of bounds.)
1368         for (i=prevBP-1; i>bp; i--) {
1369             if (t->expectedBreaks->elementAti(i) != 0) {
1370                 errln("Reverse Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1371                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1372             }
1373         }
1374
1375         // Check that the break we did find was expected
1376         if (t->expectedBreaks->elementAti(bp) == 0) {
1377             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1378                    bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1379         } else {
1380             // The break was expected.
1381             //   Check that the {nnn} tag value is correct.
1382             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
1383             if (expectedTagVal == -1) {
1384                 expectedTagVal = 0;
1385             }
1386             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1387             if (rs != expectedTagVal) {
1388                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
1389                       "          Actual, Expected status = %4d, %4d",
1390                     bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp), rs, expectedTagVal);
1391             }
1392         }
1393
1394         prevBP = bp;
1395     }
1396
1397     // Verify that there were no missed breaks prior to the last one found
1398     for (i=prevBP-1; i>=0; i--) {
1399         if (t->expectedBreaks->elementAti(i) != 0) {
1400             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1401                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1402         }
1403     }
1404 }
1405
1406
1407 void RBBITest::TestExtended() {
1408     UErrorCode      status  = U_ZERO_ERROR;
1409     Locale          locale   = Locale::getDefault();
1410
1411     UnicodeString       rules;
1412     TestParams          tp;
1413     tp.bi             = NULL;
1414     tp.expectedBreaks = new UVector32(status);
1415     tp.srcLine        = new UVector32(status);
1416     tp.srcCol         = new UVector32(status);
1417
1418
1419     //
1420     //  Open and read the test data file.
1421     //
1422     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1423     char testFileName[1000];
1424     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1425         errln("Can't open test data.  Path too long.");
1426         return;
1427     }
1428     strcpy(testFileName, testDataDirectory);
1429     strcat(testFileName, "rbbitst.txt");
1430
1431     int    len;
1432     UChar *testFile = ReadAndConvertFile(testFileName, len, status);
1433     if (U_FAILURE(status)) {
1434         return; /* something went wrong, error already output */
1435     }
1436
1437
1438
1439     //
1440     //  Put the test data into a UnicodeString
1441     //
1442     UnicodeString testString(FALSE, testFile, len);
1443
1444     enum EParseState{
1445         PARSE_COMMENT,
1446         PARSE_TAG,
1447         PARSE_DATA,
1448         PARSE_NUM
1449     }
1450     parseState = PARSE_TAG;
1451
1452     EParseState savedState = PARSE_TAG;
1453
1454     static const UChar CH_LF        = 0x0a;
1455     static const UChar CH_CR        = 0x0d;
1456     static const UChar CH_HASH      = 0x23;
1457     /*static const UChar CH_PERIOD    = 0x2e;*/
1458     static const UChar CH_LT        = 0x3c;
1459     static const UChar CH_GT        = 0x3e;
1460     static const UChar CH_BACKSLASH = 0x5c;
1461     static const UChar CH_BULLET    = 0x2022;
1462
1463     int32_t    lineNum  = 1;
1464     int32_t    colStart = 0;
1465     int32_t    column   = 0;
1466     int32_t    charIdx  = 0;
1467
1468     int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
1469
1470     for (charIdx = 0; charIdx < len; ) {
1471         UChar  c = testString.charAt(charIdx);
1472         charIdx++;
1473         if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1474             // treat CRLF as a unit
1475             c = CH_LF;
1476             charIdx++;
1477         }
1478         if (c == CH_LF || c == CH_CR) {
1479             lineNum++;
1480             colStart = charIdx;
1481         }
1482         column = charIdx - colStart + 1;
1483
1484         switch (parseState) {
1485         case PARSE_COMMENT:
1486             if (c == 0x0a || c == 0x0d) {
1487                 parseState = savedState;
1488             }
1489             break;
1490
1491         case PARSE_TAG:
1492             {
1493             if (c == CH_HASH) {
1494                 parseState = PARSE_COMMENT;
1495                 savedState = PARSE_TAG;
1496                 break;
1497             }
1498             if (u_isUWhiteSpace(c)) {
1499                 break;
1500             }
1501             if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1502                 delete tp.bi;
1503                 tp.bi = BreakIterator::createWordInstance(locale,  status);
1504                 charIdx += 5;
1505                 break;
1506             }
1507             if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1508                 delete tp.bi;
1509                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
1510                 charIdx += 5;
1511                 break;
1512             }
1513             if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1514                 delete tp.bi;
1515                 tp.bi = BreakIterator::createLineInstance(locale,  status);
1516                 charIdx += 5;
1517                 break;
1518             }
1519             if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1520                 delete tp.bi;
1521                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
1522                 charIdx += 5;
1523                 break;
1524             }
1525             if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1526                 delete tp.bi;
1527                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
1528                 charIdx += 6;
1529                 break;
1530             }
1531             if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1532                 parseState = PARSE_DATA;
1533                 charIdx += 5;
1534                 tp.dataToBreak = "";
1535                 tp.expectedBreaks->removeAllElements();
1536                 tp.srcCol ->removeAllElements();
1537                 tp.srcLine->removeAllElements();
1538                 break;
1539             }
1540
1541             errln("line %d: Tag expected in test file.", lineNum);
1542             goto end_test;
1543             parseState = PARSE_COMMENT;
1544             savedState = PARSE_DATA;
1545             }
1546             break;
1547
1548         case PARSE_DATA:
1549             if (c == CH_BULLET) {
1550                 int32_t  breakIdx = tp.dataToBreak.length();
1551                 tp.expectedBreaks->setSize(breakIdx+1);
1552                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1553                 tp.srcLine->setSize(breakIdx+1);
1554                 tp.srcLine->setElementAt(lineNum, breakIdx);
1555                 tp.srcCol ->setSize(breakIdx+1);
1556                 tp.srcCol ->setElementAt(column, breakIdx);
1557                 break;
1558             }
1559
1560             if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1561                 // Add final entry to mappings from break location to source file position.
1562                 //  Need one extra because last break position returned is after the
1563                 //    last char in the data, not at the last char.
1564                 tp.srcLine->addElement(lineNum, status);
1565                 tp.srcCol ->addElement(column, status);
1566
1567                 parseState = PARSE_TAG;
1568                 charIdx += 7;
1569
1570                 // RUN THE TEST!
1571                 executeTest(&tp);
1572                 break;
1573             }
1574
1575             if (testString.compare(charIdx-1, 3, "\\N{") == 0) {
1576                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1577                 // Get the code point from the name and insert it into the test data.
1578                 //   (Damn, no API takes names in Unicode  !!!
1579                 //    we've got to take it back to char *)
1580                 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1581                 int32_t nameLength = nameEndIdx - (charIdx+2);
1582                 char charNameBuf[200];
1583                 UChar32 theChar = -1;
1584                 if (nameEndIdx != -1) {
1585                     UErrorCode status = U_ZERO_ERROR;
1586                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1587                     charNameBuf[sizeof(charNameBuf)-1] = 0;
1588                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1589                     if (U_FAILURE(status)) {
1590                         theChar = -1;
1591                     }
1592                 }
1593                 if (theChar == -1) {
1594                     errln("Error in named character in test file at line %d, col %d",
1595                         lineNum, column);
1596                 } else {
1597                     // Named code point was recognized.  Insert it
1598                     //   into the test data.
1599                     tp.dataToBreak.append(theChar);
1600                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1601                         tp.srcLine->addElement(lineNum, status);
1602                         tp.srcCol ->addElement(column, status);
1603                     }
1604                 }
1605                 if (nameEndIdx > charIdx) {
1606                     charIdx = nameEndIdx+1;
1607                 }
1608                 break;
1609             }
1610
1611
1612
1613
1614             if (testString.compare(charIdx-1, 2, "<>") == 0) {
1615                 charIdx++;
1616                 int32_t  breakIdx = tp.dataToBreak.length();
1617                 tp.expectedBreaks->setSize(breakIdx+1);
1618                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1619                 tp.srcLine->setSize(breakIdx+1);
1620                 tp.srcLine->setElementAt(lineNum, breakIdx);
1621                 tp.srcCol ->setSize(breakIdx+1);
1622                 tp.srcCol ->setElementAt(column, breakIdx);
1623                 break;
1624             }
1625
1626             if (c == CH_LT) {
1627                 tagValue   = 0;
1628                 parseState = PARSE_NUM;
1629                 break;
1630             }
1631
1632             if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
1633                 parseState = PARSE_COMMENT;
1634                 savedState = PARSE_DATA;
1635                 break;
1636             }
1637
1638             if (c == CH_BACKSLASH) {
1639                 // Check for \ at end of line, a line continuation.
1640                 //     Advance over (discard) the newline
1641                 UChar32 cp = testString.char32At(charIdx);
1642                 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1643                     // We have a CR LF
1644                     //  Need an extra increment of the input ptr to move over both of them
1645                     charIdx++;
1646                 }
1647                 if (cp == CH_LF || cp == CH_CR) {
1648                     lineNum++;
1649                     colStart = charIdx;
1650                     charIdx++;
1651                     break;
1652                 }
1653
1654                 // Let unescape handle the back slash.
1655                 cp = testString.unescapeAt(charIdx);
1656                 if (cp != -1) {
1657                     // Escape sequence was recognized.  Insert the char
1658                     //   into the test data.
1659                     tp.dataToBreak.append(cp);
1660                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1661                         tp.srcLine->addElement(lineNum, status);
1662                         tp.srcCol ->addElement(column, status);
1663                     }
1664                     break;
1665                 }
1666
1667
1668                 // Not a recognized backslash escape sequence.
1669                 // Take the next char as a literal.
1670                 //  TODO:  Should this be an error?
1671                 c = testString.charAt(charIdx);
1672                 charIdx = testString.moveIndex32(charIdx, 1);
1673             }
1674
1675             // Normal, non-escaped data char.
1676             tp.dataToBreak.append(c);
1677
1678             // Save the mapping from offset in the data to line/column numbers in
1679             //   the original input file.  Will be used for better error messages only.
1680             //   If there's an expected break before this char, the slot in the mapping
1681             //     vector will already be set for this char; don't overwrite it.
1682             if (tp.dataToBreak.length() > tp.srcLine->size()) {
1683                 tp.srcLine->addElement(lineNum, status);
1684                 tp.srcCol ->addElement(column, status);
1685             }
1686             break;
1687
1688
1689         case PARSE_NUM:
1690             // We are parsing an expected numeric tag value, like <1234>,
1691             //   within a chunk of data.
1692             if (u_isUWhiteSpace(c)) {
1693                 break;
1694             }
1695
1696             if (c == CH_GT) {
1697                 // Finished the number.  Add the info to the expected break data,
1698                 //   and switch parse state back to doing plain data.
1699                 parseState = PARSE_DATA;
1700                 if (tagValue == 0) {
1701                     tagValue = -1;
1702                 }
1703                 int32_t  breakIdx = tp.dataToBreak.length();
1704                 tp.expectedBreaks->setSize(breakIdx+1);
1705                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1706                 tp.srcLine->setSize(breakIdx+1);
1707                 tp.srcLine->setElementAt(lineNum, breakIdx);
1708                 tp.srcCol ->setSize(breakIdx+1);
1709                 tp.srcCol ->setElementAt(column, breakIdx);
1710                 break;
1711             }
1712
1713             if (u_isdigit(c)) {
1714                 tagValue = tagValue*10 + u_charDigitValue(c);
1715                 break;
1716             }
1717
1718             errln("Syntax Error in test file at line %d, col %d",
1719                 lineNum, column);
1720             goto end_test;
1721             parseState = PARSE_COMMENT;
1722             break;
1723         }
1724
1725
1726         if (U_FAILURE(status)) {
1727             errln("ICU Error %s while parsing test file at line %d.",
1728                 u_errorName(status), lineNum);
1729             goto end_test;
1730             status = U_ZERO_ERROR;
1731         }
1732
1733     }
1734
1735 end_test:
1736     delete tp.bi;
1737     delete tp.expectedBreaks;
1738     delete tp.srcLine;
1739     delete tp.srcCol;
1740     delete [] testFile;
1741 }
1742
1743
1744 //-------------------------------------------------------------------------------
1745 //
1746 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
1747 //    return the datain one big UChar * buffer, which the caller must delete.
1748 //
1749 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
1750 //           Move this function to some common place.
1751 //
1752 //--------------------------------------------------------------------------------
1753 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, UErrorCode &status) {
1754     UChar       *retPtr  = NULL;
1755     char        *fileBuf = NULL;
1756     UConverter* conv     = NULL;
1757     FILE        *f       = NULL;
1758
1759     ulen = 0;
1760     if (U_FAILURE(status)) {
1761         return retPtr;
1762     }
1763
1764     //
1765     //  Open the file.
1766     //
1767     f = fopen(fileName, "rb");
1768     if (f == 0) {
1769         errln("Error opening test data file %s\n", fileName);
1770         status = U_FILE_ACCESS_ERROR;
1771         return NULL;
1772     }
1773     //
1774     //  Read it in
1775     //
1776     int   fileSize;
1777     int   amt_read;
1778
1779     fseek( f, 0, SEEK_END);
1780     fileSize = ftell(f);
1781     fileBuf = new char[fileSize];
1782     fseek(f, 0, SEEK_SET);
1783     amt_read = fread(fileBuf, 1, fileSize, f);
1784     if (amt_read != fileSize || fileSize <= 0) {
1785         errln("Error reading test data file.");
1786         goto cleanUpAndReturn;
1787     }
1788
1789     //
1790     // Look for a Unicode Signature (BOM) on the data just read
1791     //
1792     int32_t        signatureLength;
1793     const char *   fileBufC;
1794     const char*    encoding;
1795
1796     fileBufC = fileBuf;
1797     encoding = ucnv_detectUnicodeSignature(
1798         fileBuf, fileSize, &signatureLength, &status);
1799     if(encoding!=NULL ){
1800         fileBufC  += signatureLength;
1801         fileSize  -= signatureLength;
1802     }
1803
1804     //
1805     // Open a converter to take the rule file to UTF-16
1806     //
1807     conv = ucnv_open(encoding, &status);
1808     if (U_FAILURE(status)) {
1809         goto cleanUpAndReturn;
1810     }
1811
1812     //
1813     // Convert the rules to UChar.
1814     //  Preflight first to determine required buffer size.
1815     //
1816     ulen = ucnv_toUChars(conv,
1817         NULL,           //  dest,
1818         0,              //  destCapacity,
1819         fileBufC,
1820         fileSize,
1821         &status);
1822     if (status == U_BUFFER_OVERFLOW_ERROR) {
1823         // Buffer Overflow is expected from the preflight operation.
1824         status = U_ZERO_ERROR;
1825
1826         retPtr = new UChar[ulen+1];
1827         ucnv_toUChars(conv,
1828             retPtr,       //  dest,
1829             ulen+1,
1830             fileBufC,
1831             fileSize,
1832             &status);
1833     }
1834
1835 cleanUpAndReturn:
1836     fclose(f);
1837     delete fileBuf;
1838     ucnv_close(conv);
1839     if (U_FAILURE(status)) {
1840         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1841         delete retPtr;
1842         retPtr = 0;
1843         ulen   = 0;
1844     };
1845     return retPtr;
1846 }
1847
1848
1849 //--------------------------------------------------------------------------------------------
1850 //
1851 //     Exhaustive Tests, using Unicode Data Files.
1852 //
1853 //--------------------------------------------------------------------------------------------
1854
1855 //
1856 //  Token level scanner for the Unicode Line Break Test Data file.
1857 //      Return the next token, as follows:
1858 //          >= 0:       a UChar32 character, scanned from hex in the file.
1859 //          -1:         a break position, a division sign in the file.
1860 //          -2:         end of rule.  A new line in the file.
1861 //          -3:         end of file.  No more rules.
1862 //          -4:         Error
1863 //
1864 //   The scanner
1865 //       strips comments, ('#' to end of line)
1866 //       Recognizes CR, CR/LF and LF as new lines.
1867 //       Skips over spaces and  Xs (don't break here) in the data.
1868 //
1869 struct ScanState {
1870     int32_t     fPeekChar;
1871     UBool       fPeeked;
1872     int32_t     fLineNum;
1873     FILE        *fFile;
1874     ScanState() :fPeeked(FALSE), fLineNum(0), fFile(NULL) {};
1875 };
1876
//  The characters that the Line Break Test Data scanner treats specially,
//  listed in code point order.  They are written as hex values rather than as
//  character literals so that EBCDIC based machines get the right values; the
//  data file itself is latin-1 on all platforms.
static const int32_t chTab    = 0x09;    // horizontal tab
static const int32_t chLF     = 0x0A;    // line feed
static const int32_t chCR     = 0x0D;    // carriage return
static const int32_t chSpace  = 0x20;    // space
static const int32_t chHash   = 0x23;    // '#', number sign
static const int32_t chMult   = 0xD7;    // multiplication sign
static const int32_t chDivide = 0xF7;    // division sign
1886
1887 static int32_t   nextLBDToken(ScanState *s) {
1888     int32_t     c;
1889
1890     // Read  characters from the input file until we get something interesting
1891     //   to return.  The file is in latin-1 encoding.
1892     for (;;) {
1893         // Get the next character to look at,
1894         if (s->fPeeked) {
1895             c = s->fPeekChar;
1896             s->fPeeked = FALSE;
1897         } else {
1898             c = getc(s->fFile);
1899         }
1900
1901         // EOF.  Return immediately.
1902         if (c == EOF) {
1903             return -3;
1904         }
1905
1906         // Spaces.  Treat the multiply sign as a space - it indicates a no-break position
1907         //          in the data, and the test program doesn't want to see them.
1908         //          Continue the next char loop, looking for something significant.
1909         if (c == chSpace || c == chTab || c == chMult) {
1910             continue;
1911         }
1912
1913         //  Divide sign.  Indicates an expected break position.
1914         if (c == chDivide) {
1915             return -1;
1916         }
1917
1918         // New Line Handling.  Keep track of line number in the file, which in turn
1919         //   requires keeping track of CR/LF as a single new line.
1920         if (c == chCR) {
1921             s->fLineNum++;
1922             s->fPeekChar = getc(s->fFile);
1923             if (s->fPeekChar != chLF) {s->fPeeked = TRUE;};
1924             return -2;
1925         }
1926         if (c == chLF) {
1927             s->fLineNum++;
1928             return -2;
1929         }
1930
1931         // Comments.  Consume everything up to the next new line.
1932         if (c == chHash) {
1933             do {
1934                 c = getc(s->fFile);
1935             } while (!(c == EOF || c == chCR || c == chLF));
1936             s->fPeekChar = c;
1937             s->fPeeked = TRUE;
1938             return nextLBDToken(s);
1939         }
1940
1941         // Scan a hex character (UChar32) value.
1942         if (u_digit(c, 16) >= 0) {
1943             int32_t   v = u_digit(c, 16);
1944             for (;;) {
1945                 c = getc(s->fFile);
1946                 if (u_digit(c, 16) < 0) {break;};
1947                 v <<= 4;
1948                 v += u_digit(c, 16);
1949             }
1950             s->fPeekChar = c;
1951             s->fPeeked   = TRUE;
1952             return v;
1953         }
1954
1955         // Error.  Character was something unexpected.
1956         return -4;
1957     }
1958 }
1959
1960
1961
1962 void RBBITest::TestLineBreakData() {
1963
1964     UErrorCode      status = U_ZERO_ERROR;
1965     UnicodeString   testString;
1966     UVector         expectedBreaks(status);
1967     ScanState       ss;
1968     int32_t         tok;
1969
1970     BreakIterator *bi = BreakIterator::createLineInstance(Locale::getDefault(), status);
1971     if (U_FAILURE(status)) {
1972         errln("Failure creating break iterator");
1973         return;
1974     }
1975
1976     const char *    lbdfName = "LBTest.txt";
1977
1978     // Open the test data file.
1979     //   TODO:  a proper way to handle this data.
1980     ss.fFile = fopen(lbdfName, "rb");
1981     if (ss.fFile == NULL) {
1982         logln("Unable to open Line Break Test Data file.  Skipping test.");
1983         delete bi;
1984         return;
1985     }
1986
1987     // Loop once per line from the test data file.
1988     for (;;) {
1989         // Zero out test data from previous line.
1990         testString.truncate(0);
1991         expectedBreaks.removeAllElements();
1992
1993         // Read one test's (line's) worth of data from the file.
1994         //   Loop once per token on the input file line.
1995         for(;;)  {
1996             tok = nextLBDToken(&ss);
1997
1998             // If we scanned a character number in the file.
1999             //   save it in the test data array.
2000             if (tok >= 0) {
2001                 testString.append((UChar32)tok);
2002                 continue;
2003             }
2004
2005             // If we scanned a break position in the data, record it.
2006             if (tok == -1) {
2007                 expectedBreaks.addElement(testString.length(), status);
2008                 continue;
2009             }
2010
2011             // If we scanned a new line, or EOF
2012             //    drop out of scan loop and run the test case.
2013             if (tok == -2 || tok == -3) {break;};
2014
2015             // None of above.  Error.
2016             errln("Failure:  Unrecognized data format,  test file line %d", ss.fLineNum);
2017             break;
2018         }
2019
2020         // If this line from the test data file actually contained test data,
2021         //   run the test.
2022         if (testString.length() > 0) {
2023             int32_t pos;                 // Break Position in the test string
2024             int32_t expectedI = 0;       // Index of expected break position in vector of same.
2025             int32_t expectedPos;         // Expected break position (index into test string)
2026
2027             bi->setText(testString);
2028             pos = bi->first();       // TODO:  break iterators always return a match at pos 0.
2029             pos = bi->next();        //        Line Break TR says no match at position 0.
2030                                      //        Resolve.
2031
2032             for (; pos != BreakIterator::DONE; ) {
2033                 expectedPos = expectedBreaks.elementAti(expectedI);
2034                 if (pos < expectedPos) {
2035                     errln("Failure: Test file line %d, unexpected break found at position %d",
2036                         ss.fLineNum, pos);
2037                     break;
2038                 }
2039                 if (pos > expectedPos) {
2040                     errln("Failure: Test file line %d, failed to find break at position %d",
2041                         ss.fLineNum, expectedPos);
2042                     break;
2043                 }
2044                 pos = bi->next();
2045                 expectedI++;
2046             }
2047         }
2048
2049         // If we've hit EOF on the input file, we're done.
2050         if (tok == -3) {
2051             break;
2052         }
2053
2054     }
2055
2056     fclose(ss.fFile);
2057     delete bi;
2058
2059 }
2060
2061 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
2062
2063 //---------------------------------------------------------------------------------------
2064 //
2065 //   classs RBBIMonkeyKind
2066 //
2067 //      Monkey Test for Break Iteration
2068 //      Abstract interface class.   Concrete derived classes independently
2069 //      implement the break rules for different iterator types.
2070 //
2071 //      The Monkey Test itself uses doesn't know which type of break iterator it is
2072 //      testing, but works purely in terms of the interface defined here.
2073 //
2074 //---------------------------------------------------------------------------------------
2075 class RBBIMonkeyKind {
2076 public:
2077     // Return a UVector of UnicodeSets, representing the character classes used
2078     //   for this type of iterator.
2079     virtual  UVector  *charClasses() = 0;
2080
2081     // Set the test text on which subsequent calls to next() will operate
2082     virtual  void      setText(const UnicodeString &s) = 0;
2083
2084     // Find the next break postion, starting from the prev break position, or from zero.
2085     // Return -1 after reaching end of string.
2086     virtual  int32_t   next(int32_t i) = 0;
2087
2088     virtual ~RBBIMonkeyKind();
2089     UErrorCode       deferredStatus;
2090
2091
2092 protected:
2093     RBBIMonkeyKind();
2094
2095 private:
2096 };
2097
2098 RBBIMonkeyKind::RBBIMonkeyKind() {
2099     deferredStatus = U_ZERO_ERROR;
2100 }
2101
2102 RBBIMonkeyKind::~RBBIMonkeyKind() {
2103 }
2104
2105
//----------------------------------------------------------------------------------------
//
//   Random Numbers.  Stand-ins for the standard library rand() and srand().
//                    The library functions are not used, in order to
//                      1.  Get same results on all platforms.
//                      2.  Get access to current seed, to more easily reproduce failures.
//
//                    m_seed is the generator state; assign to it to (re)seed.
//                    m_rand() advances the state and returns a value in the
//                    range 0 to 32767.
//
//---------------------------------------------------------------------------------------
static uint32_t m_seed = 1;

static uint32_t m_rand()
{
    const uint32_t  multiplier = 1103515245;
    const uint32_t  increment  = 12345;

    // Linear congruential step; the arithmetic wraps at 32 bits.
    m_seed = m_seed * multiplier + increment;

    // The result is 15 bits of the state, starting at bit 16.
    return (m_seed >> 16) & 0x7fff;
}
2121
2122
2123 //------------------------------------------------------------------------------------------
2124 //
2125 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
2126 //                             of RBBIMonkeyKind.
2127 //
2128 //------------------------------------------------------------------------------------------
2129 class RBBICharMonkey: public RBBIMonkeyKind {
2130 public:
2131     RBBICharMonkey();
2132     virtual          ~RBBICharMonkey();
2133     virtual  UVector *charClasses();
2134     virtual  void     setText(const UnicodeString &s);
2135     virtual  int32_t  next(int32_t i);
2136 private:
2137     UVector   *fSets;
2138
2139     UnicodeSet  *fCRLFSet;
2140     UnicodeSet  *fControlSet;
2141     UnicodeSet  *fExtendSet;
2142     UnicodeSet  *fHangulSet;
2143     UnicodeSet  *fAnySet;
2144
2145     RegexMatcher  *fMatcher;
2146     const UnicodeString *fText;
2147 };
2148
2149
2150 RBBICharMonkey::RBBICharMonkey() {
2151     UErrorCode  status = U_ZERO_ERROR;
2152
2153     fText = NULL;
2154     fMatcher = new RegexMatcher("\\X", 0, status);     // Pattern to match a grampheme cluster
2155
2156     fCRLFSet    = new UnicodeSet("[\\r\\n]", status);
2157     fControlSet = new UnicodeSet("[[\\p{Zl}\\p{Zp}\\p{Cc}\\p{Cf}]-[\\n]-[\\r]-\\p{Grapheme_Extend}]", status);
2158     fExtendSet  = new UnicodeSet("[\\p{Grapheme_Extend}]", status);
2159     fHangulSet  = new UnicodeSet(
2160         "[\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=T}"
2161          "\\p{Hangul_Syllable_Type=LV}\\p{Hangul_Syllable_Type=LVT}]", status);
2162     fAnySet     = new UnicodeSet("[\\u0000-\\U0010ffff]", status);
2163
2164     fSets       = new UVector(status);
2165     fSets->addElement(fCRLFSet,    status);
2166     fSets->addElement(fControlSet, status);
2167     fSets->addElement(fExtendSet,  status);
2168     fSets->addElement(fHangulSet,  status);
2169     fSets->addElement(fAnySet,     status);
2170     if (U_FAILURE(status)) {
2171         deferredStatus = status;
2172     }
2173 };
2174
2175
2176 void RBBICharMonkey::setText(const UnicodeString &s) {
2177     fText = &s;
2178     fMatcher->reset(s);
2179 }
2180
2181
2182 int32_t RBBICharMonkey::next(int32_t i) {
2183     UErrorCode status = U_ZERO_ERROR;
2184     int32_t  retVal = -1;
2185
2186     if (fMatcher->find(i, status)) {
2187         retVal = fMatcher->end(status);
2188     }
2189     if (U_FAILURE(status)){
2190         retVal = -1;
2191     }
2192     return retVal;
2193 }
2194
2195
2196 UVector  *RBBICharMonkey::charClasses() {
2197     return fSets;
2198 }
2199
2200
2201 RBBICharMonkey::~RBBICharMonkey() {
2202     delete fSets;
2203     delete fCRLFSet;
2204     delete fControlSet;
2205     delete fExtendSet;
2206     delete fHangulSet;
2207     delete fAnySet;
2208
2209     delete fMatcher;
2210 }
2211
2212 //------------------------------------------------------------------------------------------
2213 //
2214 //   class RBBIWordMonkey      Word Break specific implementation
2215 //                             of RBBIMonkeyKind.
2216 //
2217 //------------------------------------------------------------------------------------------
2218 class RBBIWordMonkey: public RBBIMonkeyKind {
2219 public:
2220     RBBIWordMonkey();
2221     virtual          ~RBBIWordMonkey();
2222     virtual  UVector *charClasses();
2223     virtual  void     setText(const UnicodeString &s);
2224     virtual int32_t   next(int32_t i);
2225 private:
2226     UVector      *fSets;
2227
2228     UnicodeSet  *fKatakanaSet;
2229     UnicodeSet  *fALetterSet;
2230     UnicodeSet  *fMidLetterSet;
2231     UnicodeSet  *fMidNumSet;
2232     UnicodeSet  *fNumericSet;
2233     UnicodeSet  *fFormatSet;
2234     UnicodeSet  *fOtherSet;
2235     UnicodeSet  *fExtendSet;
2236     UnicodeSet  *fExtendNumLetSet;
2237
2238     RegexMatcher  *fMatcher;
2239
2240     const UnicodeString  *fText;
2241
2242     RegexMatcher         *fGCFMatcher;
2243     RegexMatcher         *fGCMatcher;
2244
2245 };
2246
2247
2248 RBBIWordMonkey::RBBIWordMonkey() : fGCFMatcher(0),
2249                                    fGCMatcher(0)
2250 {
2251     UErrorCode  status = U_ZERO_ERROR;
2252
2253     fSets          = new UVector(status);
2254
2255     fKatakanaSet   = new UnicodeSet("[\\p{script=KATAKANA}"
2256         "\\u3031-\\u3035\\u309b\\u309c\\u30a0"
2257         "\\u30fc\\uff70\\uff9e\\uff9f]", status);
2258
2259     const UnicodeString ALetterStr( "[[\\p{Alphabetic}"
2260                                         "\\u00a0"         // NBSP
2261                                         "\\u05f3]"        // Hebrew punct Geresh
2262                                         "-[\\p{Ideographic}]"
2263                                         "-[\\p{Script=Lao}]"
2264                                         "-[\\p{Script=Hiragana}]"
2265                                         "-[\\p{Grapheme_Extend}]]");
2266     fALetterSet    = new UnicodeSet(ALetterStr, status);
2267     fALetterSet->removeAll(*fKatakanaSet);
2268
2269     fMidLetterSet  = new UnicodeSet("[\\u0027\\u00b7\\u05f4\\u2019\\u2027\\u003a]", status);
2270     fMidNumSet     = new UnicodeSet("[[\\p{Line_Break=Infix_Numeric}]-[\\u003a]]", status);
2271     fNumericSet    = new UnicodeSet("[\\p{Nd}\\u066b\\u066c]", status);
2272     fFormatSet     = new UnicodeSet("[\\p{Format}-[\\u200c\\u200d]]", status);
2273     fExtendSet     = new UnicodeSet("[\\p{Grapheme_Extend}]", status);
2274     fExtendNumLetSet = new UnicodeSet("[\\p{Pc}-[\\u30fb\\uff65]]", status);
2275     fOtherSet      = new UnicodeSet();
2276     if(U_FAILURE(status)) {
2277       deferredStatus = status;
2278       return;
2279     }
2280
2281     fOtherSet->complement();
2282     fOtherSet->removeAll(*fKatakanaSet);
2283     fOtherSet->removeAll(*fALetterSet);
2284     fOtherSet->removeAll(*fMidLetterSet);
2285     fOtherSet->removeAll(*fMidNumSet);
2286     fOtherSet->removeAll(*fNumericSet);
2287     fOtherSet->removeAll(*fExtendNumLetSet);
2288
2289     fSets->addElement(fALetterSet,   status);
2290     fSets->addElement(fKatakanaSet,  status);
2291     fSets->addElement(fMidLetterSet, status);
2292     fSets->addElement(fMidNumSet,    status);
2293     fSets->addElement(fNumericSet,   status);
2294     fSets->addElement(fFormatSet,    status);
2295     fSets->addElement(fOtherSet,     status);
2296     fSets->addElement(fExtendNumLetSet, status);
2297
2298
2299     fGCFMatcher = new RegexMatcher("\\X(?:[\\p{Format}-\\p{Grapheme_Extend}])*", 0, status);
2300     fGCMatcher  = new RegexMatcher("\\X", 0, status);
2301
2302     if (U_FAILURE(status)) {
2303         deferredStatus = status;
2304     }
2305 };
2306
2307 void RBBIWordMonkey::setText(const UnicodeString &s) {
2308     fText       = &s;
2309     fGCMatcher->reset(*fText);
2310     fGCFMatcher->reset(*fText);
2311 }
2312
2313
2314 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2315     UErrorCode status = U_ZERO_ERROR;
2316
2317     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2318                               //   break position being tested.  The candidate break
2319                               //   location is before p2.
2320
2321     int     breakPos = -1;
2322
2323     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2324
2325     // Prev break at end of string.  return DONE.
2326     if (prevPos >= fText->length()) {
2327         return -1;
2328     }
2329     p0 = p1 = p2 = p3 = prevPos;
2330     c3 =  fText->char32At(prevPos);
2331     c0 = c1 = c2 = 0;
2332
2333
2334     // Format char after prev break?  Special case, see last Note for Word Boundaries TR.
2335     //    break immdiately after the format char.
2336     if (fFormatSet->contains(c3)) {
2337         breakPos = fText->moveIndex32(prevPos, 1);
2338         return breakPos;
2339     }
2340
2341
2342     // Loop runs once per "significant" character position in the input text.
2343     for (;;) {
2344         // Move all of the positions forward in the input string.
2345         p0 = p1;  c0 = c1;
2346         p1 = p2;  c1 = c2;
2347         p2 = p3;  c2 = c3;
2348         // Advancd p3 by    (GC Format*)   Rules 3, 4
2349         status = U_ZERO_ERROR;
2350         if  (fGCFMatcher->find(p3, status) == FALSE) {
2351             p3 = fText->length();
2352             c3 = 0;
2353         } else {
2354             p3 = fGCFMatcher->end(0, status);
2355             U_ASSERT(U_SUCCESS(status));
2356             c3 = fText->char32At(p3);
2357         }
2358
2359         if (p1 == p2) {
2360             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2361             continue;
2362         }
2363         if (p2 == fText->length()) {
2364             // Reached end of string.  Always a break position.
2365             break;
2366         }
2367
2368         // Rule (5).   ALetter x ALetter
2369         if (fALetterSet->contains(c1) &&
2370             fALetterSet->contains(c2))  {
2371             continue;
2372         }
2373
2374         // Rule (6)  ALetter  x  (MidLetter | MidNumLet) ALetter
2375         //
2376         //    Also incorporates rule 7 by skipping pos ahead to position of the
2377         //    terminating ALetter.
2378         if ( fALetterSet->contains(c1)   &&
2379              fMidLetterSet->contains(c2) &&
2380              fALetterSet->contains(c3)) {
2381             continue;
2382         }
2383
2384
2385         // Rule (7)  ALetter (MidLetter | MidNumLet)  x  ALetter
2386         if (fALetterSet->contains(c0) &&
2387             (fMidLetterSet->contains(c1)  ) &&
2388             fALetterSet->contains(c2)) {
2389             continue;
2390         }
2391
2392         // Rule (8)    Numeric x Numeric
2393         if (fNumericSet->contains(c1) &&
2394             fNumericSet->contains(c2))  {
2395             continue;
2396         }
2397
2398         // Rule (9)    ALetter x Numeric
2399         if (fALetterSet->contains(c1) &&
2400             fNumericSet->contains(c2))  {
2401             continue;
2402         }
2403
2404         // Rule (10)    Numeric x ALetter
2405         if (fNumericSet->contains(c1) &&
2406             fALetterSet->contains(c2))  {
2407             continue;
2408         }
2409
2410         // Rule (11)   Numeric (MidNum | MidNumLet)  x  Numeric
2411         if ( fNumericSet->contains(c0) &&
2412              fMidNumSet->contains(c1)  &&
2413             fNumericSet->contains(c2)) {
2414             continue;
2415         }
2416
2417         // Rule (12)  Numeric x (MidNum | MidNumLet) Numeric
2418         if (fNumericSet->contains(c1) &&
2419             fMidNumSet->contains(c2)  &&
2420             fNumericSet->contains(c3)) {
2421             continue;
2422         }
2423
2424         // Rule (13)  Katakana x Katakana
2425         if (fKatakanaSet->contains(c1) &&
2426             fKatakanaSet->contains(c2))  {
2427             continue;
2428         }
2429
2430         // Rule 13a
2431         if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
2432              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2433              fExtendNumLetSet->contains(c2)) {
2434                 continue;
2435              }
2436
2437         // Rule 13b
2438         if (fExtendNumLetSet->contains(c1) &&
2439                 (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
2440                 fKatakanaSet->contains(c2)))  {
2441                 continue;
2442              }
2443
2444
2445         // Rule 14.  Break found here.
2446         break;
2447     }
2448
2449
2450     //  Rule 4 fixup,  back up before any trailing
2451     //                 format characters at the end of the word.
2452     breakPos = p2;
2453     status = U_ZERO_ERROR;
2454     if  (fGCMatcher->find(p1, status)) {
2455         breakPos = fGCMatcher->end(0, status);
2456         U_ASSERT(U_SUCCESS(status));
2457     }
2458     return breakPos;
2459 }
2460
2461
2462 UVector  *RBBIWordMonkey::charClasses() {
2463     return fSets;
2464 }
2465
2466
2467 RBBIWordMonkey::~RBBIWordMonkey() {
2468     delete fSets;
2469     delete fKatakanaSet;
2470     delete fALetterSet;
2471     delete fMidLetterSet;
2472     delete fMidNumSet;
2473     delete fNumericSet;
2474     delete fFormatSet;
2475     delete fExtendSet;
2476     delete fOtherSet;
2477
2478     delete fGCFMatcher;
2479     delete fGCMatcher;
2480 }
2481
2482
2483
2484
2485 //-------------------------------------------------------------------------------------------
2486 //
2487 //  RBBILineMonkey
2488 //
2489 //-------------------------------------------------------------------------------------------
2490
2491 class RBBILineMonkey: public RBBIMonkeyKind {
2492 public:
2493     RBBILineMonkey();
2494     virtual          ~RBBILineMonkey();
2495     virtual  UVector *charClasses();
2496     virtual  void     setText(const UnicodeString &s);
2497     virtual  int32_t  next(int32_t i);
2498     virtual  void     rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2499 private:
2500     UVector      *fSets;
2501
2502     UnicodeSet  *fBK;
2503     UnicodeSet  *fCR;
2504     UnicodeSet  *fLF;
2505     UnicodeSet  *fCM;
2506     UnicodeSet  *fNL;
2507     UnicodeSet  *fSG;
2508     UnicodeSet  *fWJ;
2509     UnicodeSet  *fZW;
2510     UnicodeSet  *fGL;
2511     UnicodeSet  *fCB;
2512     UnicodeSet  *fSP;
2513     UnicodeSet  *fB2;
2514     UnicodeSet  *fBA;
2515     UnicodeSet  *fBB;
2516     UnicodeSet  *fHY;
2517     UnicodeSet  *fCL;
2518     UnicodeSet  *fEX;
2519     UnicodeSet  *fIN;
2520     UnicodeSet  *fNS;
2521     UnicodeSet  *fOP;
2522     UnicodeSet  *fQU;
2523     UnicodeSet  *fIS;
2524     UnicodeSet  *fNU;
2525     UnicodeSet  *fPO;
2526     UnicodeSet  *fPR;
2527     UnicodeSet  *fSY;
2528     UnicodeSet  *fAI;
2529     UnicodeSet  *fAL;
2530     UnicodeSet  *fID;
2531     UnicodeSet  *fSA;
2532     UnicodeSet  *fXX;
2533
2534     BreakIterator  *fCharBI;
2535
2536     const UnicodeString  *fText;
2537     int32_t              *fOrigPositions;
2538
2539     RegexMatcher         *fNumberMatcher;
2540     RegexMatcher         *fLB10Matcher;
2541     RegexMatcher         *fLB11Matcher;
2542 };
2543
2544
2545 RBBILineMonkey::RBBILineMonkey()
2546 {
2547     UErrorCode  status = U_ZERO_ERROR;
2548
2549     fSets    = new UVector(status);
2550
2551     fBK    = new UnicodeSet("[\\p{Line_Break=BK}]", status);
2552     fCR    = new UnicodeSet("[\\p{Line_break=CR}]", status);
2553     fLF    = new UnicodeSet("[\\p{Line_break=LF}]", status);
2554     fCM    = new UnicodeSet("[\\p{Line_break=CM}]", status);
2555     fNL    = new UnicodeSet("[\\p{Line_break=NL}]", status);
2556     fWJ    = new UnicodeSet("[\\p{Line_break=WJ}]", status);
2557     fZW    = new UnicodeSet("[\\p{Line_break=ZW}]", status);
2558     fGL    = new UnicodeSet("[\\p{Line_break=GL}]", status);
2559     fCB    = new UnicodeSet("[\\p{Line_break=CB}]", status);
2560     fSP    = new UnicodeSet("[\\p{Line_break=SP}]", status);
2561     fB2    = new UnicodeSet("[\\p{Line_break=B2}]", status);
2562     fBA    = new UnicodeSet("[\\p{Line_break=BA}]", status);
2563     fBB    = new UnicodeSet("[\\p{Line_break=BB}]", status);
2564     fHY    = new UnicodeSet("[\\p{Line_break=HY}]", status);
2565     fCL    = new UnicodeSet("[\\p{Line_break=CL}]", status);
2566     fEX    = new UnicodeSet("[\\p{Line_break=EX}]", status);
2567     fIN    = new UnicodeSet("[\\p{Line_break=IN}]", status);
2568     fNS    = new UnicodeSet("[\\p{Line_break=NS}]", status);
2569     fOP    = new UnicodeSet("[\\p{Line_break=OP}]", status);
2570     fQU    = new UnicodeSet("[\\p{Line_break=QU}]", status);
2571     fIS    = new UnicodeSet("[\\p{Line_break=IS}]", status);
2572     fNU    = new UnicodeSet("[\\p{Line_break=NU}]", status);
2573     fPO    = new UnicodeSet("[\\p{Line_break=PO}]", status);
2574     fPR    = new UnicodeSet("[\\p{Line_break=PR}]", status);
2575     fSY    = new UnicodeSet("[\\p{Line_break=SY}]", status);
2576     fAI    = new UnicodeSet("[\\p{Line_break=AI}]", status);
2577     fAL    = new UnicodeSet("[\\p{Line_break=AL}]", status);
2578     fID    = new UnicodeSet("[\\p{Line_break=ID}]", status);
2579     fSA    = new UnicodeSet("[\\p{Line_break=SA}]", status);
2580     fXX    = new UnicodeSet("[\\p{Line_break=XX}]", status);
2581
2582     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
2583     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
2584     fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
2585
2586
2587
2588     fSets->addElement(fBK, status);
2589     fSets->addElement(fCR, status);
2590     fSets->addElement(fLF, status);
2591     fSets->addElement(fCM, status);
2592     fSets->addElement(fNL, status);
2593     fSets->addElement(fWJ, status);
2594     fSets->addElement(fZW, status);
2595     fSets->addElement(fGL, status);
2596     fSets->addElement(fCB, status);
2597     fSets->addElement(fSP, status);
2598     fSets->addElement(fB2, status);
2599     fSets->addElement(fBA, status);
2600     fSets->addElement(fBB, status);
2601     fSets->addElement(fHY, status);
2602     fSets->addElement(fCL, status);
2603     fSets->addElement(fEX, status);
2604     fSets->addElement(fIN, status);
2605     fSets->addElement(fNS, status);
2606     fSets->addElement(fOP, status);
2607     fSets->addElement(fQU, status);
2608     fSets->addElement(fIS, status);
2609     fSets->addElement(fNU, status);
2610     fSets->addElement(fPO, status);
2611     fSets->addElement(fPR, status);
2612     fSets->addElement(fSY, status);
2613     fSets->addElement(fAI, status);
2614     fSets->addElement(fAL, status);
2615     fSets->addElement(fID, status);
2616     fSets->addElement(fWJ, status);
2617     fSets->addElement(fSA, status);
2618     // fSets->addElement(fXX, status);
2619
2620
2621
2622     fNumberMatcher = new RegexMatcher(
2623         "(\\p{Line_Break=PR}\\p{Line_Break=CM}*)?"
2624         "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
2625         "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
2626         "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
2627         "(\\p{Line_Break=CL}\\p{Line_Break=CM}*)?"
2628         "(\\p{Line_Break=PO}\\p{Line_Break=CM}*)?",
2629         0, status);
2630
2631     fLB10Matcher = new RegexMatcher(
2632         "\\p{Line_Break=QU}\\p{Line_Break=CM}*"
2633         "\\p{Line_Break=SP}*"
2634         "(\\p{Line_Break=OP})\\p{Line_Break=CM}*",
2635         0, status);
2636
2637     fLB11Matcher = new RegexMatcher(
2638         "\\p{Line_Break=CL}\\p{Line_Break=CM}*"
2639         "\\p{Line_Break=SP}*"
2640         "(\\p{Line_Break=NS})\\p{Line_Break=CM}*",
2641         0, status);
2642
2643     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2644
2645     if (U_FAILURE(status)) {
2646         deferredStatus = status;
2647     }
2648 };
2649
2650
2651 void RBBILineMonkey::setText(const UnicodeString &s) {
2652     fText       = &s;
2653     fCharBI->setText(s);
2654     fNumberMatcher->reset(s);
2655 }
2656
2657 //
2658 //  rule67Adjust
2659 //     Line Break TR rules 6 and 7 implementation.
2660 //     This deals with combining marks, Hangul Syllables, and other sequences that
2661 //     that must be treated as if they were something other than what they actually are.
2662 //
2663 //     This is factored out into a separate function because it must be applied twice for
2664 //     each potential break, once to the chars before the position being checked, then
2665 //     again to the text following the possible break.
2666 //
2667 void RBBILineMonkey::rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2668     if (pos == -1) {
2669         // Invalid initial position.  Happens during the warmup iteration of the
2670         //   main loop in next().
2671         return;
2672     }
2673
2674     int32_t  nPos = *nextPos;
2675
2676     // LB 6  Treat Korean Syllables as a single unit
2677     int32_t  hangultype = u_getIntPropertyValue(*posChar, UCHAR_HANGUL_SYLLABLE_TYPE);
2678     if (hangultype != U_HST_NOT_APPLICABLE) {
2679         nPos = fCharBI->following(pos);   // Advance by grapheme cluster, which
2680                                           //  contains the logic to locate Hangul syllables.
2681         // Grapheme Cluster Ugliness: some Grapheme_Extend chars, which are absorbed
2682         //   into a grapheme cluster, are NOT Line Break CM. (Some are GL, for example.)
2683         //   We don't want consume any of these.  The Approach is
2684         //      1.  Back nPos up, undoing the consumption of any
2685         //          Grapheme_Extend chars by the char break iterator.
2686         //      2.  Let the LB 7b logic below reconsume any Line Break CM chars.
2687         for (;;) {
2688             nPos = fText->moveIndex32(nPos, -1);
2689             UChar32 possiblyExtendChar = fText->char32At(nPos);
2690             if (fID->contains(possiblyExtendChar)) {
2691                 // We hit into the Hangul Syllable itself, class is ID.
2692                 nPos = fText->moveIndex32(nPos, +1);
2693                 break;
2694             }
2695         }
2696     }
2697
2698     // LB 7b  Keep combining sequences together.
2699     //  advance over any CM class chars.  (Line Break CM class is different from
2700     //    grapheme cluster CM, so we need to do this even for HangulSyllables.
2701     //    Line Break may eat additional stuff as combining, beyond what graphem cluster did.
2702     if (!(fBK->contains(*posChar) || fZW->contains(*posChar) || *posChar==0x0a
2703         || *posChar==0x0d || *posChar==0x85)) {
2704         for (;;) {
2705             *nextChar = fText->char32At(nPos);
2706             if (!fCM->contains(*nextChar)) {
2707                 break;
2708             }
2709             nPos = fText->moveIndex32(nPos, 1);
2710         }
2711     }
2712
2713
2714     // LB 7a In a SP CM* sequence, treat the SP as an ID
2715     if (nPos != *nextPos && fSP->contains(*posChar)) {
2716         *posChar = 0x4e00;   // 0x4e00 is a CJK Ideograph, linebreak type is ID.
2717     }
2718
2719     // LB 7b Treat X CM* as if it were x.
2720     //       No explicit action required.
2721
2722     // LB 7c  Treat any remaining combining mark as AL
2723     if (fCM->contains(*posChar)) {
2724         *posChar = 0x41;   // thisChar = 'A';
2725     }
2726
2727     // Push the updated nextPos and nextChar back to our caller.
2728     // This only makes a difference if posChar got bigger, by slurping up a
2729     // combining sequence or Hangul syllable.
2730     *nextPos  = nPos;
2731     *nextChar = fText->char32At(nPos);
2732 }
2733
2734
2735
2736 int32_t RBBILineMonkey::next(int32_t startPos) {
2737     UErrorCode status = U_ZERO_ERROR;
2738     int32_t    pos;       //  Index of the char following a potential break position
2739     UChar32    thisChar;  //  Character at above position "pos"
2740
2741     int32_t    prevPos;   //  Index of the char preceding a potential break position
2742     UChar32    prevChar;  //  Character at above position.  Note that prevChar
2743                           //   and thisChar may not be adjacent because combining
2744                           //   characters between them will be ignored.
2745
2746     int32_t    nextPos;   //  Index of the next character following pos.
2747                           //     Usually skips over combining marks.
2748     int32_t    nextCPPos; //  Index of the code point following "pos."
2749                           //     May point to a combining mark.
2750     int32_t    tPos;      //  temp value.
2751     UChar32    c;
2752
2753     if (startPos >= fText->length()) {
2754         return -1;
2755     }
2756
2757
2758     // Initial values for loop.  Loop will run the first time without finding breaks,
2759     //                           while the invalid values shift out and the "this" and
2760     //                           "prev" positions are filled in with good values.
2761     pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.
2762     thisChar = prevChar  = 0;
2763     nextPos  = nextCPPos = startPos;
2764
2765
2766     // Loop runs once per position in the test text, until a break position
2767     //  is found.
2768     for (;;) {
2769         prevPos   = pos;
2770         prevChar  = thisChar;
2771
2772         pos       = nextPos;
2773         thisChar  = fText->char32At(pos);
2774
2775         nextCPPos = fText->moveIndex32(pos, 1);
2776         nextPos   = nextCPPos;
2777
2778         // Break at end of text.
2779         if (pos >= fText->length()) {
2780             break;
2781         }
2782
2783         // LB 3a  Always break after hard line breaks,
2784         if (fBK->contains(prevChar)) {
2785             break;
2786         }
2787
2788         // LB 3b  Break after CR, LF, NL, but not inside CR LF
2789         if (prevChar == 0x0d && thisChar == 0x0a) {
2790             continue;
2791         }
2792         if (prevChar == 0x0d ||
2793             prevChar == 0x0a ||
2794             prevChar == 0x85)  {
2795             break;
2796         }
2797
2798         // LB 3c  Don't break before hard line breaks
2799         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
2800             fBK->contains(thisChar)) {
2801                 continue;
2802         }
2803
2804         // LB 10    QU SP* x OP
2805         if (prevPos >= 0) {
2806             UnicodeString  subStr10(*fText, prevPos);
2807             fLB10Matcher->reset(subStr10);
2808             status = U_ZERO_ERROR;
2809             if (fLB10Matcher->lookingAt(status)) {  //   /QU CM* SP* (OP) CM*/;
2810                 // TODO:  Check status codes
2811                 pos      = prevPos + fLB10Matcher->start(1, status);
2812                 nextPos  = prevPos + fLB10Matcher->end(0, status);
2813                 thisChar = fText->char32At(pos);
2814                 continue;
2815             }
2816         }
2817
2818         // LB 11   CL SP* x NS
2819         if (prevPos >= 0) {
2820             UnicodeString  subStr11(*fText, prevPos);
2821             fLB11Matcher->reset(subStr11);
2822             status = U_ZERO_ERROR;
2823             if (fLB11Matcher->lookingAt(status)) {  //   /QU CM* SP* (OP) CM*/;
2824                 // TODO:  Check status codes
2825                 pos      = prevPos + fLB11Matcher->start(1, status);
2826                 nextPos  = prevPos + fLB11Matcher->end(0, status);
2827                 thisChar = fText->char32At(pos);
2828                 continue;
2829             }
2830         }
2831
2832         // LB 4  Don't break before spaces or zero-width space.
2833         if (fSP->contains(thisChar)) {
2834             continue;
2835         }
2836
2837         if (fZW->contains(thisChar)) {
2838             continue;
2839         }
2840
2841         // LB 5  Break after zero width space
2842         if (fZW->contains(prevChar)) {
2843             break;
2844         }
2845
2846         // LB 6, LB 7
2847         /*int32_t oldpos = pos;*/
2848         rule67Adjust(prevPos, &prevChar, &pos,     &thisChar);
2849
2850         nextCPPos = fText->moveIndex32(pos, 1);
2851         nextPos   = nextCPPos;
2852         c = fText->char32At(nextPos);
2853         // another percularity of LB 4 - Dont break before space
2854         if (fSP->contains(thisChar)) {
2855             continue;
2856         }
2857         rule67Adjust(pos,     &thisChar, &nextPos, &c);
2858
2859         // If the loop is still warming up - if we haven't shifted the initial
2860         //   -1 positions out of prevPos yet - loop back to advance the
2861         //    position in the input without any further looking for breaks.
2862         if (prevPos == -1) {
2863             continue;
2864         }
2865
2866         // Re-apply rules 3c, 4 because these could be affected by having
2867         //                      a new thisChar from doing rule 6 or 7.
2868         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||   // 3c
2869             fBK->contains(thisChar)) {
2870                 continue;
2871         }
2872         if (fSP->contains(thisChar)) {    // LB 4
2873             continue;
2874         }
2875         if (fZW->contains(thisChar)) {    // LB 4
2876             continue;
2877         }
2878
2879
2880         // LB 8  Don't break before closings.
2881         //       NU x CL  and NU x IS are not matched here so that they will
2882         //       fall into LB 17 and the more general number regular expression.
2883         //
2884         if (!fNU->contains(prevChar) && fCL->contains(thisChar) ||
2885                                         fEX->contains(thisChar) ||
2886             !fNU->contains(prevChar) && fIS->contains(thisChar) ||
2887             !fNU->contains(prevChar) && fSY->contains(thisChar))    {
2888             continue;
2889         }
2890
2891         // LB 9  Don't break after OP SP*
2892         //       Scan backwards, checking for this sequence.
2893         //       The OP char could include combining marks, so we acually check for
2894         //           OP CM* SP*
2895         //       Another Twist: The Rule 67 fixes may have changed a CP CM
2896         //       sequence into a ID char, so before scanning back through spaces,
2897         //       verify that prevChar is indeed a space.  The prevChar variable
2898         //       may differ from fText[prevPos]
2899         tPos = prevPos;
2900         if (fSP->contains(prevChar)) {
2901             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
2902                 tPos=fText->moveIndex32(tPos, -1);
2903             }
2904         }
2905         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
2906             tPos=fText->moveIndex32(tPos, -1);
2907         }
2908         if (fOP->contains(fText->char32At(tPos))) {
2909             continue;
2910         }
2911
2912
2913         // LB 11a        B2 x B2
2914         if (fB2->contains(thisChar) && fB2->contains(prevChar)) {
2915             continue;
2916         }
2917
2918         // LB 11b
2919         //    x  GL
2920         //    GL  x
2921         if (fGL->contains(thisChar) || fGL->contains(prevChar)) {
2922             continue;
2923         }
2924         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
2925             continue;
2926         }
2927
2928         // LB 12    break after space
2929         if (fSP->contains(prevChar)) {
2930             break;
2931         }
2932
2933         // LB 14
2934         //    x   QU
2935         //    QU  x
2936         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
2937             continue;
2938         }
2939
2940         // LB 14a  Break around a CB
2941         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
2942             break;
2943         }
2944
2945         // LB 15
2946         if (fBA->contains(thisChar) ||
2947             fHY->contains(thisChar) ||
2948             fNS->contains(thisChar) ||
2949             fBB->contains(prevChar) )   {
2950             continue;
2951         }
2952
2953         // LB 16
2954         if (fAL->contains(prevChar) && fIN->contains(thisChar) ||
2955             fID->contains(prevChar) && fIN->contains(thisChar) ||
2956             fIN->contains(prevChar) && fIN->contains(thisChar) ||
2957             fNU->contains(prevChar) && fIN->contains(thisChar) )   {
2958             continue;
2959         }
2960
2961
2962         // LB 17    ID x PO    (Note:  Leading CM behaves like ID)
2963         //          AL x NU
2964         //          NU x AL
2965         if (fID->contains(prevChar) && fPO->contains(thisChar) ||
2966             fCM->contains(prevChar) && fPO->contains(thisChar) ||
2967             fAL->contains(prevChar) && fNU->contains(thisChar) ||
2968             fNU->contains(prevChar) && fAL->contains(thisChar) )   {
2969             continue;
2970         }
2971
2972         // LB 18    Numbers
2973         UnicodeString  subStr18(*fText, prevPos);
2974         fNumberMatcher->reset(subStr18);
2975         if (fNumberMatcher->lookingAt(status)) {
2976             // TODO:  Check status codes
2977             // Matched a number.  But could have been just a single digit, which would
2978             //    not represent a "no break here" between prevChar and thisChar
2979             int32_t numEndIdx = prevPos + fNumberMatcher->end(status);  // idx of first char following num
2980             if (numEndIdx > pos) {
2981                 // Number match includes at least our two chars being checked
2982                 if (numEndIdx > nextPos) {
2983                     // Number match includes additional chars.  Update pos and nextPos
2984                     //   so that next loop iteration will continue at the end of the number,
2985                     //   checking for breaks between last char in number & whatever follows.
2986                     nextPos = numEndIdx;
2987                     pos = fCharBI->preceding(numEndIdx);
2988                     thisChar = fText->char32At(pos);
2989                     while (fCM->contains(thisChar)) {
2990                         pos = fCharBI->preceding(pos);
2991                         thisChar = fText->char32At(pos);
2992                     }
2993                 }
2994                 continue;
2995             }
2996         }
2997
2998         if (fPR->contains(prevChar) && fAL->contains(thisChar)) {
2999             continue;
3000         }
3001
3002         if (fPR->contains(prevChar) && fID->contains(thisChar)) {
3003             continue;
3004         }
3005
3006         // LB 18b
3007         if (fHY->contains(prevChar) || fBB->contains(thisChar)) {
3008             break;
3009         }
3010
3011         // LB 19
3012         if (fAL->contains(prevChar) && fAL->contains(thisChar)) {
3013             continue;
3014         }
3015
3016         // LB 19b
3017         if (fIS->contains(prevChar) && fAL->contains(thisChar)) {
3018             continue;
3019         }
3020
3021         // LB 20    Break everywhere else
3022         break;
3023
3024     }
3025
3026     return pos;
3027 }
3028
3029
3030 UVector  *RBBILineMonkey::charClasses() {
3031     return fSets;
3032 }
3033
3034
3035 RBBILineMonkey::~RBBILineMonkey() {
3036     delete fSets;
3037
3038     delete fBK;
3039     delete fCR;
3040     delete fLF;
3041     delete fCM;
3042     delete fNL;
3043     delete fWJ;
3044     delete fZW;
3045     delete fGL;
3046     delete fCB;
3047     delete fSP;
3048     delete fB2;
3049     delete fBA;
3050     delete fBB;
3051     delete fHY;
3052     delete fCL;
3053     delete fEX;
3054     delete fIN;
3055     delete fNS;
3056     delete fOP;
3057     delete fQU;
3058     delete fIS;
3059     delete fNU;
3060     delete fPO;
3061     delete fPR;
3062     delete fSY;
3063     delete fAI;
3064     delete fAL;
3065     delete fID;
3066     delete fSA;
3067     delete fXX;
3068
3069     delete fCharBI;
3070     delete fNumberMatcher;
3071     delete fLB10Matcher;
3072     delete fLB11Matcher;
3073 }
3074
3075
3076 //-------------------------------------------------------------------------------------------
3077 //
3078 //   TestMonkey
3079 //
3080 //     params
3081 //       seed=nnnnn        Random number starting seed.
3082 //                         Setting the seed allows errors to be reproduced.
3083 //       loop=nnn          Looping count.  Controls running time.
3084 //                         -1:  run forever.
3085 //                          0 or greater:  run length.
3086 //
3087 //       type = char | word | line | sent | title
3088 //
3089 //-------------------------------------------------------------------------------------------
3090
3091 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3092     int32_t val = defaultVal;
3093     name.append(" *= *(-?\\d+)");
3094     UErrorCode status = U_ZERO_ERROR;
3095     RegexMatcher m(name, params, 0, status);
3096     if (m.find()) {
3097         // The param exists.  Convert the string to an int.
3098         char valString[100];
3099         int32_t paramLength = m.end(1, status) - m.start(1, status);
3100         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3101             paramLength = (int32_t)(sizeof(valString)-2);
3102         }
3103         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3104         val = strtol(valString,  NULL, 10);
3105
3106         // Delete this parameter from the params string.
3107         m.reset();
3108         params = m.replaceFirst("", status);
3109     }
3110     U_ASSERT(U_SUCCESS(status));
3111     return val;
3112 }
3113 #endif
3114
3115 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3116                                     BreakIterator *bi,
3117                                     int expected[],
3118                                     int expectedcount)
3119 {
3120     int count = 0;
3121     int i = 0;
3122     int forward[50];
3123     bi->setText(ustr);
3124     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3125         forward[count] = i;
3126         if (count < expectedcount && expected[count] != i) {
3127             test->errln("break forward test failed: expected %d but got %d",
3128                         expected[count], i);
3129             break;
3130         }
3131         count ++;
3132     }
3133     if (count != expectedcount) {
3134         printStringBreaks(ustr, expected, expectedcount);
3135         test->errln("break test failed: missed %d match",
3136                     expectedcount - count);
3137         return;
3138     }
3139     // testing boundaries
3140     for (i = 1; i < expectedcount; i ++) {
3141         int j = expected[i - 1];
3142         if (!bi->isBoundary(j)) {
3143             printStringBreaks(ustr, expected, expectedcount);
3144             test->errln("Expected boundary at position %d", j);
3145             return;
3146         }
3147         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3148             if (bi->isBoundary(j)) {
3149                 printStringBreaks(ustr, expected, expectedcount);
3150                 test->errln("Not expecting boundary at position %d", j);
3151                 return;
3152             }
3153         }
3154     }
3155
3156     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3157         count --;
3158         if (forward[count] != i) {
3159             test->errln("happy break test reverse failed: expected %d but got %d",
3160                         forward[count], i);
3161             break;
3162         }
3163     }
3164     if (count != 0) {
3165         printStringBreaks(ustr, expected, expectedcount);
3166         test->errln("happy break test failed: missed a match");
3167         return;
3168     }
3169
3170     // testing preceding
3171     for (i = 0; i < expectedcount - 1; i ++) {
3172         int j = expected[i] + 1;
3173         for (; j <= expected[i + 1]; j ++) {
3174             if (bi->preceding(j) != expected[i]) {
3175                 printStringBreaks(ustr, expected, expectedcount);
3176                 test->errln("Not expecting backwards boundary at position %d", j);
3177                 return;
3178             }
3179         }
3180     }
3181 }
3182
3183 void RBBITest::TestWordBreaks(void)
3184 {
3185 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3186
3187     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3188     Locale        locale("en");
3189     UErrorCode    status = U_ZERO_ERROR;
3190     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3191     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3192     UChar         str[300];
3193     static const char *strlist[] =
3194     {
3195     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3196     "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
3197     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u179c\\u0027\\U000e0061\\u003a",
3198     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3199     "\\u90ca\\u3588\\u009c\\u0953\\u194b",
3200     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3201     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3202     "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
3203     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3204     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3205     "\\u2027\\U000e0067\\u0a47\\u00b7",
3206     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3207     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3208     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3209     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3210     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3211     "\\u0027\\u11af\\U000e0057\\u0602",
3212     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3213     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3214     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3215     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3216     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3217     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3218     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3219     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3220     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3221     "\\u58f4\\U000e0049\\u20e7\\u2027",
3222     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3223     "\\ua183\\u102d\\u0bec\\u003a",
3224     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3225     "\\u003a\\u0e57\\u0fad\\u002e",
3226     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3227     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3228     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3229     "\\u003a\\u0664\\u00b7\\u1fba",
3230     "\\u003b\\u0027\\u00b7\\u47a3",
3231     "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
3232     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3233     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3234     };
3235     int loop;
3236     if (U_FAILURE(status)) {
3237         errln("Creation of break iterator failed %s", u_errorName(status));
3238         return;
3239     }
3240     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3241         // printf("looping %d\n", loop);
3242         u_unescape(strlist[loop], str, 25);
3243         UnicodeString ustr(str);
3244         // RBBICharMonkey monkey;
3245         RBBIWordMonkey monkey;
3246
3247         int expected[50];
3248         int expectedcount = 0;
3249
3250         monkey.setText(ustr);
3251         int i;
3252         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3253             expected[expectedcount ++] = i;
3254         }
3255
3256         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3257     }
3258     delete bi;
3259 #endif
3260 }
3261
3262 void RBBITest::TestWordBoundary(void)
3263 {
3264     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3265     Locale        locale("en");
3266     UErrorCode    status = U_ZERO_ERROR;
3267     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3268     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3269     UChar         str[50];
3270     static const char *strlist[] =
3271     {
3272     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3273     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3274     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3275     "\\u2027\\U000e0067\\u0a47\\u00b7",
3276     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3277     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3278     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3279     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3280     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3281     "\\u0027\\u11af\\U000e0057\\u0602",
3282     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3283     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3284     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3285     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3286     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3287     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3288     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3289     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3290     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3291     "\\u58f4\\U000e0049\\u20e7\\u2027",
3292     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3293     "\\ua183\\u102d\\u0bec\\u003a",
3294     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3295     "\\u003a\\u0e57\\u0fad\\u002e",
3296     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3297     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3298     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3299     "\\u003a\\u0664\\u00b7\\u1fba",
3300     "\\u003b\\u0027\\u00b7\\u47a3",
3301     };
3302     int loop;
3303     if (U_FAILURE(status)) {
3304         errln("Creation of break iterator failed %s", u_errorName(status));
3305         return;
3306     }
3307     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3308         // printf("looping %d\n", loop);
3309         u_unescape(strlist[loop], str, 20);
3310         UnicodeString ustr(str);
3311         int forward[50];
3312         int count = 0;
3313
3314         bi->setText(ustr);
3315         int prev = 0;
3316         int i;
3317         for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3318             forward[count ++] = i;
3319             if (i > prev) {
3320                 int j;
3321                 for (j = prev + 1; j < i; j ++) {
3322                     if (bi->isBoundary(j)) {
3323                         printStringBreaks(ustr, forward, count);
3324                         errln("happy boundary test failed: expected %d not a boundary",
3325                                j);
3326                         return;
3327                     }
3328                 }
3329             }
3330             if (!bi->isBoundary(i)) {
3331                 printStringBreaks(ustr, forward, count);
3332                 errln("happy boundary test failed: expected %d a boundary",
3333                        i);
3334                 return;
3335             }
3336             prev = i;
3337         }
3338     }
3339     delete bi;
3340 }
3341
3342 void RBBITest::TestLineBreaks(void)
3343 {
3344 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3345     Locale        locale("en");
3346     UErrorCode    status = U_ZERO_ERROR;
3347     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3348     UChar         str[50];
3349     static const char *strlist[] =
3350     {
3351      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3352      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3353      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3354      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3355      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3356      "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
3357      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3358      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3359      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3360      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3361      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3362      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3363      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3364      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3365      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3366      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3367      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3368      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3369      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3370      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3371      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3372      "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
3373      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3374      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3375      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3376      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3377      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3378      "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
3379      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3380      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3381      "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
3382      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3383      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3384      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3385      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3386      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3387      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3388     };
3389     int loop;
3390     if (U_FAILURE(status)) {
3391         errln("Creation of break iterator failed %s", u_errorName(status));
3392         return;
3393     }
3394     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3395         // printf("looping %d\n", loop);
3396         u_unescape(strlist[loop], str, 20);
3397         UnicodeString ustr(str);
3398         RBBILineMonkey monkey;
3399
3400         int expected[50];
3401         int expectedcount = 0;
3402
3403         monkey.setText(ustr);
3404         int i;
3405         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3406             expected[expectedcount ++] = i;
3407         }
3408
3409         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3410     }
3411     delete bi;
3412 #endif
3413 }
3414
3415 void RBBITest::TestSentBreaks(void)
3416 {
3417     Locale        locale("en");
3418     UErrorCode    status = U_ZERO_ERROR;
3419     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3420     UChar         str[100];
3421     static const char *strlist[] =
3422     {
3423      "Now\ris\nthe\r\ntime\n\rfor\r\r",
3424      "This\n",
3425      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3426      "\"Sentence ending with a quote.\" Bye.",
3427      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
3428      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3429      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3430      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3431      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3432      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3433     };
3434     int loop;
3435     int forward[100];
3436     if (U_FAILURE(status)) {
3437         errln("Creation of break iterator failed %s", u_errorName(status));
3438         return;
3439     }
3440     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3441         u_unescape(strlist[loop], str, 100);
3442         UnicodeString ustr(str);
3443
3444         int count = 0;
3445         bi->setText(ustr);
3446         int i;
3447         for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3448             forward[count ++] = i;
3449         }
3450         testBreakBoundPreceding(this, ustr, bi, forward, count);
3451     }
3452     delete bi;
3453 }
3454
3455 void RBBITest::TestMonkey(char *params) {
3456 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3457
3458     UErrorCode     status    = U_ZERO_ERROR;
3459     int32_t        loopCount = 500;
3460     int32_t        seed      = 1;
3461     UnicodeString  breakType = "all";
3462     Locale         locale("en");
3463
3464     if (quick == FALSE) {
3465         loopCount = 10000;
3466     }
3467
3468     if (params) {
3469         UnicodeString p(params);
3470         loopCount = getIntParam("loop", p, loopCount);
3471         seed      = getIntParam("seed", p, seed);
3472
3473         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3474         if (m.find()) {
3475             breakType = m.group(1, status);
3476             m.reset();
3477             p = m.replaceFirst("", status);
3478         }
3479
3480         m.reset(p);
3481         if (RegexMatcher("\\S", p, 0, status).find()) {
3482             // Each option is stripped out of the option string as it is processed.
3483             // All options have been checked.  The option string should have been completely emptied..
3484             char buf[100];
3485             p.extract(buf, sizeof(buf), NULL, status);
3486             buf[sizeof(buf)-1] = 0;
3487             errln("Unrecognized or extra parameter:  %s\n", buf);
3488             return;
3489         }
3490
3491     }
3492
3493     if (breakType == "char" || breakType == "all") {
3494         RBBICharMonkey  m;
3495         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3496         if (U_SUCCESS(status)) {
3497             RunMonkey(bi, m, "char", seed, loopCount);
3498         }
3499         else {
3500             errln("Creation of character break iterator failed %s", u_errorName(status));
3501         }
3502         delete bi;
3503     }
3504
3505     if (breakType == "word" || breakType == "all") {
3506         logln("Word Break Monkey Test");
3507         RBBIWordMonkey  m;
3508         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
3509         if (U_SUCCESS(status)) {
3510             RunMonkey(bi, m, "word", seed, loopCount);
3511         }
3512         else {
3513             errln("Creation of word break iterator failed %s", u_errorName(status));
3514         }
3515         delete bi;
3516     }
3517
3518     if (breakType == "line" || breakType == "all") {
3519         logln("Line Break Monkey Test");
3520         RBBILineMonkey  m;
3521         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
3522         if (params == NULL) {
3523             loopCount = 50;
3524         }
3525         if (U_SUCCESS(status)) {
3526             RunMonkey(bi, m, "line", seed, loopCount);
3527         }
3528         else {
3529             errln("Creation of line break iterator failed %s", u_errorName(status));
3530         }
3531         delete bi;
3532     }
3533
3534
3535 #endif
3536 }
3537
3538 //
3539 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
3540 //    Parameters:
3541 //       bi      - the break iterator to use
3542 //       mk      - MonkeyKind, abstraction for obtaining expected results
3543 //       name    - Name of test (char, word, etc.) for use in error messages
3544 //       seed    - Seed for starting random number generator (parameter from user)
3545 //       numIterations
3546 //
3547 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed, int32_t numIterations) {
3548
3549 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3550
3551     const int32_t    TESTSTRINGLEN = 500;
3552     UnicodeString    testText;
3553     int32_t          numCharClasses;
3554     UVector          *chClasses;
3555     int              expected[TESTSTRINGLEN*2 + 1];
3556     int              expectedCount = 0;
3557     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
3558     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
3559     char             reverseBreaks[TESTSTRINGLEN*2+1];
3560     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
3561     char             followingBreaks[TESTSTRINGLEN*2+1];
3562     char             precedingBreaks[TESTSTRINGLEN*2+1];
3563     int              i;
3564     int              loopCount = 0;
3565
3566     m_seed = seed;
3567
3568     numCharClasses = mk.charClasses()->size();
3569     chClasses      = mk.charClasses();
3570
3571     // Check for errors that occured during the construction of the MonkeyKind object.
3572     //  Can't report them where they occured because errln() is a method coming from intlTest,
3573     //  and is not visible outside of RBBITest :-(
3574     if (U_FAILURE(mk.deferredStatus)) {
3575         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
3576         return;
3577     }
3578
3579     // Verify that the character classes all have at least one member.
3580     for (i=0; i<numCharClasses; i++) {
3581         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
3582         if (s == NULL || s->size() == 0) {
3583             errln("Character Class #%d is null or of zero size.", i);
3584             return;
3585         }
3586     }
3587
3588     while (loopCount < numIterations || numIterations == -1) {
3589         if (numIterations == -1 && loopCount % 10 == 0) {
3590             // If test is running in an infinite loop, display a periodic tic so
3591             //   we can tell that it is making progress.
3592             fprintf(stderr, ".");
3593         }
3594         // Save current random number seed, so that we can recreate the random numbers
3595         //   for this loop iteration in event of an error.
3596         seed = m_seed;
3597
3598         // Populate a test string with data.
3599         testText.truncate(0);
3600         for (i=0; i<TESTSTRINGLEN; i++) {
3601             int32_t  aClassNum = m_rand() % numCharClasses;
3602             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
3603             int32_t   charIdx = m_rand() % classSet->size();
3604             UChar32   c = classSet->charAt(charIdx);
3605             if (c < 0) {   // TODO:  deal with sets containing strings.
3606                 errln("c < 0");
3607             }
3608             testText.append(c);
3609         }
3610
3611         // Calculate the expected results for this test string.
3612         mk.setText(testText);
3613         memset(expectedBreaks, 0, sizeof(expectedBreaks));
3614         expectedBreaks[0] = 1;
3615         int32_t breakPos = 0;
3616         expectedCount = 0;
3617         for (;;) {
3618             breakPos = mk.next(breakPos);
3619             if (breakPos == -1) {
3620                 break;
3621             }
3622             if (breakPos > testText.length()) {
3623                 errln("breakPos > testText.length()");
3624             }
3625             expectedBreaks[breakPos] = 1;
3626             expected[expectedCount ++] = breakPos;
3627         }
3628
3629         // Find the break positions using forward iteration
3630         memset(forwardBreaks, 0, sizeof(forwardBreaks));
3631         bi->setText(testText);
3632         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
3633             if (i < 0 || i > testText.length()) {
3634                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
3635                 break;
3636             }
3637             forwardBreaks[i] = 1;
3638         }
3639
3640         // Find the break positions using reverse iteration
3641         memset(reverseBreaks, 0, sizeof(reverseBreaks));
3642         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
3643             if (i < 0 || i > testText.length()) {
3644                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
3645                 break;
3646             }
3647             reverseBreaks[i] = 1;
3648         }
3649
3650         // Find the break positions using isBoundary() tests.
3651         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
3652         U_ASSERT(sizeof(isBoundaryBreaks) > testText.length());
3653         for (i=0; i<=testText.length(); i++) {
3654             isBoundaryBreaks[i] = bi->isBoundary(i);
3655         }
3656
3657
3658         // Find the break positions using the following() function.
3659         // printf(".");
3660         memset(followingBreaks, 0, sizeof(followingBreaks));
3661         int32_t   lastBreakPos = 0;
3662         followingBreaks[0] = 1;
3663         for (i=0; i<testText.length(); i++) {
3664             breakPos = bi->following(i);
3665             if (breakPos <= i ||
3666                 breakPos < lastBreakPos ||
3667                 breakPos > testText.length() ||
3668                 breakPos > lastBreakPos && lastBreakPos > i ) {
3669                 errln("%s break monkey test: "
3670                     "Out of range value returned by BreakIterator::following().\n"
3671                     "Random seed=%d",  name, seed);
3672                 break;
3673             }
3674             followingBreaks[breakPos] = 1;
3675             lastBreakPos = breakPos;
3676         }
3677
3678         // Find the break positions using the preceding() function.
3679         memset(precedingBreaks, 0, sizeof(followingBreaks));
3680         lastBreakPos = testText.length();
3681         precedingBreaks[testText.length()] = 1;
3682         for (i=testText.length(); i>0; i--) {
3683             breakPos = bi->preceding(i);
3684             if (breakPos >= i ||
3685                 breakPos > lastBreakPos ||
3686                 breakPos < 0 ||
3687                 breakPos < lastBreakPos && lastBreakPos < i ) {
3688                 errln("%s break monkey test: "
3689                     "Out of range value returned by BreakIterator::preceding().\n"
3690                     "index=%d;  prev returned %d; lastBreak=%d" ,
3691                     name,  i, breakPos, lastBreakPos);
3692                 precedingBreaks[i] = 2;   // Forces an error.
3693             } else {
3694                 precedingBreaks[breakPos] = 1;
3695                 lastBreakPos = breakPos;
3696             }
3697         }
3698
3699         // Compare the expected and actual results.
3700         for (i=0; i<=testText.length(); i++) {
3701             const char *errorType = NULL;
3702             if  (forwardBreaks[i] != expectedBreaks[i]) {
3703                 errorType = "next()";
3704             } else if (reverseBreaks[i] != forwardBreaks[i]) {
3705                 errorType = "previous()";
3706             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
3707                 errorType = "isBoundary()";
3708             } else if (followingBreaks[i] != expectedBreaks[i]) {
3709                 errorType = "following()";
3710             } else if (precedingBreaks[i] != expectedBreaks[i]) {
3711                 errorType = "preceding()";
3712             }
3713
3714
3715             if (errorType != NULL) {
3716                 // Format a range of the test text that includes the failure as
3717                 //  a data item that can be included in the rbbi test data file.
3718
3719                 // Start of the range is the last point where expected and actual results
3720                 //   both agreed that there was a break position.
3721                 int startContext = i;
3722                 int32_t count = 0;
3723                 for (;;) {
3724                     if (startContext==0) { break; }
3725                     startContext --;
3726                     if (expectedBreaks[startContext] != 0) {
3727                         if (count == 2) break;
3728                         count ++;
3729                     }
3730                 }
3731
3732                 // End of range is two expected breaks past the start position.
3733                 int endContext = i + 1;
3734                 int ci;
3735                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
3736                     for (;;) {
3737                         if (endContext >= testText.length()) {break;}
3738                         if (expectedBreaks[endContext-1] != 0) {
3739                             if (count == 0) break;
3740                             count --;
3741                         }
3742                         endContext ++;
3743                     }
3744                 }
3745
3746                 // Format looks like   "<data><>\uabcd\uabcd<>\U0001abcd...</data>"
3747                 UnicodeString errorText = "<data>";
3748                 /***if (strcmp(errorType, "next()") == 0) {
3749                     startContext = 0;
3750                     endContext = testText.length();
3751
3752                     printStringBreaks(testText, expected, expectedCount);
3753                 }***/
3754
3755                 for (ci=startContext; ci<endContext;) {
3756                     UnicodeString hexChars("0123456789abcdef");
3757                     UChar32  c;
3758                     int      bn;
3759                     c = testText.char32At(ci);
3760                     if (ci == i) {
3761                         // This is the location of the error.
3762                         errorText.append("<?>");
3763                     } else if (expectedBreaks[ci] != 0) {
3764                         // This a non-error expected break position.
3765                         errorText.append("<>");
3766                     }
3767                     if (c < 0x10000) {
3768                         errorText.append("\\u");
3769                         for (bn=12; bn>=0; bn-=4) {
3770                             errorText.append(hexChars.charAt((c>>bn)&0xf));
3771                         }
3772                     } else {
3773                         errorText.append("\\U");
3774                         for (bn=28; bn>=0; bn-=4) {
3775                             errorText.append(hexChars.charAt((c>>bn)&0xf));
3776                         }
3777                     }
3778                     ci = testText.moveIndex32(ci, 1);
3779                 }
3780                 errorText.append("<>");
3781                 errorText.append("</data>\n");
3782
3783                 // Output the error
3784                 char  charErrorTxt[500];
3785                 UErrorCode status = U_ZERO_ERROR;
3786                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
3787                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
3788                 errln("%s break monkey test error.  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
3789                     name, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
3790                     errorType, seed, i, charErrorTxt);
3791                 break;
3792             }
3793         }
3794
3795         loopCount++;
3796     }
3797 #endif
3798 }
3799
3800
3801 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */