1 /******************************************************************** 
   3  * Copyright (c) 1999-2015, International Business Machines Corporation and 
   4  * others. All Rights Reserved. 
   5  ********************************************************************/ 
   6 /************************************************************************ 
   7 *   Date        Name        Description 
   8 *   12/15/99    Madhu        Creation. 
   9 *   01/12/2000  Madhu        Updated for changed API and added new tests 
  10 ************************************************************************/ 
  12 #include "utypeinfo.h"  // for 'typeid' to work 
  14 #include "unicode/utypes.h" 
  16 #if !UCONFIG_NO_BREAK_ITERATION 
  18 #include "unicode/utypes.h" 
  19 #include "unicode/brkiter.h" 
  20 #include "unicode/rbbi.h" 
  21 #include "unicode/uchar.h" 
  22 #include "unicode/utf16.h" 
  23 #include "unicode/ucnv.h" 
  24 #include "unicode/schriter.h" 
  25 #include "unicode/uniset.h" 
  26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 
  27 #include "unicode/regex.h" 
  29 #include "unicode/ustring.h" 
  30 #include "unicode/utext.h" 
  39 #include "unicode/numfmt.h" 
  40 #include "unicode/uscript.h" 
  // TEST_ASSERT(x): when the condition x is false, report a failure through errln(),
  //   tagged with the file and line of the use site. The macro contains no return,
  //   so execution continues after the report.
  43 #define TEST_ASSERT(x) {if (!(x)) { \ 
  44     errln("Failure in file %s, line %d", __FILE__, __LINE__);}} 
  // TEST_ASSERT_SUCCESS(errcode): when errcode is a failure per U_FAILURE(), report it
  //   through errcheckln() with the file, line and u_errorName() text of the status.
  //   Like TEST_ASSERT, it only reports; it does not leave the enclosing function.
  46 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \ 
  47     errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}} 
  50 //--------------------------------------------- 
  52 //--------------------------------------------- 
  55 //  Note:  Before adding new tests to this file, check whether the desired test data can  
  56 //         simply be added to the file testdata/rbbitest.txt.  In most cases it can, 
  57 //         it's much less work than writing a new test, diagnostic output in the event of failures 
  58 //         is good, and the test data file is shared with ICU4J, so eventually the test 
  59 //         will run there as well, without additional effort. 
  // Test dispatcher: stores the test name for slot `index` in `name` and, when `exec`
  //   is true, also runs that test. Slots named "skip" run nothing; the default slot
  //   sets name to "" to end the caller's loop.
  // NOTE(review): some case labels occur twice below (0, 3, 4, 7-15, 17, 21). Presumably
  //   each pair sits in opposite branches of a preprocessor conditional whose
  //   #else/#endif lines are not visible here -- confirm against the full file.
  61 void RBBITest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* params 
) 
  63     if (exec
) logln("TestSuite RuleBasedBreakIterator: "); 
  66 #if !UCONFIG_NO_FILE_IO 
  67         case 0: name 
= "TestBug4153072"; 
  68             if(exec
) TestBug4153072();                         break; 
  70         case 0: name 
= "skip"; 
  74         case 1: name 
= "skip"; 
  76         case 2: name 
= "TestStatusReturn"; 
  77             if(exec
) TestStatusReturn();                       break; 
  79 #if !UCONFIG_NO_FILE_IO 
  80         case 3: name 
= "TestUnicodeFiles"; 
  81             if(exec
) TestUnicodeFiles();                       break; 
  82         case 4: name 
= "TestEmptyString"; 
  83             if(exec
) TestEmptyString();                        break; 
  85         case 3: case 4: name 
= "skip"; 
  89         case 5: name 
= "TestGetAvailableLocales"; 
  90             if(exec
) TestGetAvailableLocales();                break; 
  92         case 6: name 
= "TestGetDisplayName"; 
  93             if(exec
) TestGetDisplayName();                     break; 
  95 #if !UCONFIG_NO_FILE_IO 
  96         case 7: name 
= "TestEndBehaviour"; 
  97             if(exec
) TestEndBehaviour();                       break; 
  98         case 8: case 9: case 10: name 
= "skip"; 
 100         case 11: name 
= "TestWordBreaks"; 
 101              if(exec
) TestWordBreaks();                        break; 
 102         case 12: name 
= "TestWordBoundary"; 
 103              if(exec
) TestWordBoundary();                      break; 
 104         case 13: name 
= "TestLineBreaks"; 
 105              if(exec
) TestLineBreaks();                        break; 
 106         case 14: name 
= "TestSentBreaks"; 
 107              if(exec
) TestSentBreaks();                        break; 
 108         case 15: name 
= "TestExtended"; 
 109              if(exec
) TestExtended();                          break; 
 111         case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name 
= "skip"; 
 // TestMonkey is the only test here that receives `params`.
 115 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO 
 117             name 
= "TestMonkey"; if(exec
)  TestMonkey(params
); break; 
 120              name 
= "skip";                                    break; 
 123 #if !UCONFIG_NO_FILE_IO 
 124         case 17: name 
= "TestBug3818"; 
 125             if(exec
) TestBug3818();                            break; 
 127         case 17: name 
= "skip"; 
 131         case 18: name 
= "skip"; 
 133         case 19: name 
= "TestDebug"; 
 134             if(exec
) TestDebug();                              break; 
 135         case 20: name 
= "skip"; 
 138 #if !UCONFIG_NO_FILE_IO 
 139         case 21: name 
= "TestBug5775"; 
 140             if (exec
) TestBug5775();                           break; 
 142         case 21: name 
= "skip"; 
 146         case 22: name 
= "TestBug9983"; 
 147             if (exec
) TestBug9983();                           break; 
 148         case 23: name 
= "TestDictRules"; 
 149             if (exec
) TestDictRules();                         break; 
 150         case 24: name 
= "TestBug5532"; 
 151             if (exec
) TestBug5532();                           break; 
 152         default: name 
= ""; break; //needed to end loop 
 157 //--------------------------------------------------------------------------- 
 159 //   class BITestData   Holds a set of Break iterator test data and results 
 161 //                         - the string data to be broken 
 162 //                         - a vector of the expected break positions. 
 163 //                         - a vector of source line numbers for the data, 
 164 //                               (to help see where errors occurred.) 
 165 //                         - The expected break tag values. 
 166 //                         - Vectors of actual break positions and tag values. 
 167 //                         - Functions for comparing actual with expected and 
 170 //---------------------------------------------------------------------------- 
 // Members of BITestData. The expected-value vectors are filled by addDataChunk();
 //   fActualBreakPositions is filled by the individual iterator tests.
 // Text to be broken; addDataChunk() appends each chunk to it.
 173     UnicodeString    fDataToBreak
; 
 // One entry per chunk: the length of fDataToBreak after that chunk was appended.
 174     UVector          fExpectedBreakPositions
; 
 // One entry per chunk: the tag value passed to addDataChunk().
 175     UVector          fExpectedTags
; 
 177     UVector          fActualBreakPositions
;   // Test Results. 
 180     BITestData(UErrorCode 
&status
); 
 // NOTE(review): status is received by value, so the caller's status variable is not
 //   updated by anything that happens inside addDataChunk().
 181     void             addDataChunk(const char *data
, int32_t tag
, int32_t lineNum
, UErrorCode status
); 
 // Compares actual against expected results; problems are reported through `test`.
 182     void             checkResults(const char *heading
, RBBITest 
*test
); 
 // Reports one position mismatch, identified by indices into the expected/actual vectors.
 183     void             err(const char *heading
, RBBITest 
*test
, int32_t expectedIdx
, int32_t actualIdx
); 
 // Constructor: passes the caller's status on to each UVector member it initializes.
 190 BITestData::BITestData(UErrorCode 
&status
) 
 191 : fExpectedBreakPositions(status
), fExpectedTags(status
),  fLineNum(status
), fActualBreakPositions(status
), 
 197 // addDataChunk.   Add a section (non-breaking) piece of data to the test data. 
 198 //                 The macro form collects the line number, which is helpful 
 199 //                 when tracking down failures. 
 201 //                 A null data item is inserted at the start of each test's data 
 202 //                  to put the starting zero into the data list.  The position saved for 
 203 //                  each non-null item is its ending position. 
 // ADD_DATACHUNK: calls td.addDataChunk(), supplying __LINE__ as the line number.
 //   The expansion already ends in ';', so a ';' written after a use adds an empty
 //   statement.
 205 #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status); 
 // Appends one chunk of text and records the expected break at its end.
 //   data    - chunk text; converted with CharsToUnicodeString() before appending
 //   tag     - expected tag value stored for this chunk
 //   lineNum - source line number stored for this chunk (used when reporting errors)
 //   status  - nothing is done when this is already a failure; received by value
 206 void BITestData::addDataChunk(const char *data
, int32_t tag
, int32_t lineNum
, UErrorCode status
) { 
 207     if (U_FAILURE(status
)) {return;} 
 209         fDataToBreak
.append(CharsToUnicodeString(data
)); 
 // The expected break position is the total text length after this chunk.
 211     fExpectedBreakPositions
.addElement(fDataToBreak
.length(), status
); 
 212     fExpectedTags
.addElement(tag
, status
); 
 213     fLineNum
.addElement(lineNum
, status
); 
 218 //  checkResults.   Compare the actual and expected break positions, report any differences. 
 // Walks the expected and actual result vectors side by side using two indices.
 //   Position mismatches are reported through err(); tag mismatches through test->errln().
 //   heading - text placed at the start of each report
 //   test    - test object that receives the reports
 220 void BITestData::checkResults(const char *heading
, RBBITest 
*test
) { 
 221     int32_t   expectedIndex 
= 0; 
 222     int32_t   actualIndex 
= 0; 
 225         // If we've run through both the expected and actual results vectors, we're done. 
 226         //   break out of the loop. 
 227         if (expectedIndex 
>= fExpectedBreakPositions
.size() && 
 228             actualIndex   
>= fActualBreakPositions
.size()) { 
 // Expected positions are used up but actual ones remain: report against the
 //   last expected entry.
 233         if (expectedIndex 
>= fExpectedBreakPositions
.size()) { 
 234             err(heading
, test
, expectedIndex
-1, actualIndex
); 
 // Actual positions are used up but expected ones remain: report against the
 //   last actual entry.
 239         if (actualIndex 
>= fActualBreakPositions
.size()) { 
 240             err(heading
, test
, expectedIndex
, actualIndex
-1); 
 // Both indices are in range: compare the break positions they select.
 245         if (fActualBreakPositions
.elementAti(actualIndex
) != fExpectedBreakPositions
.elementAti(expectedIndex
)) { 
 246             err(heading
, test
, expectedIndex
, actualIndex
); 
 247             // Try to resync the positions of the indices, to avoid a rash of spurious errors. 
 248             if (fActualBreakPositions
.elementAti(actualIndex
) < fExpectedBreakPositions
.elementAti(expectedIndex
)) { 
 // Compare the tag values selected by the same pair of indices.
 256         if (fActualTags
.elementAti(actualIndex
) != fExpectedTags
.elementAti(expectedIndex
)) { 
 // NOTE(review): the line number below is read with elementAt(), while every other
 //   access in this function uses elementAti(); confirm the value matches the %d
 //   conversion in the format string.
 257             test
->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d", 
 258                 heading
, fLineNum
.elementAt(expectedIndex
), 
 259                 fExpectedTags
.elementAti(expectedIndex
), fActualTags
.elementAti(actualIndex
)); 
 268 //  err   -  An error was found.  Report it, along with information about where the 
 269 //                                incorrectly broken test data appeared in the source file. 
 // Reports a single break-position mismatch through test->errln().
 //   heading     - text placed at the start of the message
 //   test        - test object that receives the message
 //   expectedIdx - index into fExpectedBreakPositions (and fLineNum)
 //   actualIdx   - index into fActualBreakPositions
 271 void    BITestData::err(const char *heading
, RBBITest 
*test
, int32_t expectedIdx
, int32_t actualIdx
) 
 273     int32_t   expected 
= fExpectedBreakPositions
.elementAti(expectedIdx
); 
 274     int32_t   actual   
= fActualBreakPositions
.elementAti(actualIdx
); 
 276     int32_t   line     
= fLineNum
.elementAti(expectedIdx
); 
 277     if (expectedIdx 
> 0) { 
 278         // The line numbers are off by one because a premature break occurs somewhere 
 279         //    within the previous item, rather than at the start of the current (expected) item. 
 280         //    We want to report the offset of the unexpected break from the start of 
 281         //      this previous item. 
 282         o    
= actual 
- fExpectedBreakPositions
.elementAti(expectedIdx
-1); 
 // A break that came earlier than expected is reported as "unexpected break";
 //   the second message covers the remaining case.
 284     if (actual 
< expected
) { 
 285         test
->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading
, o
, line
, actual
, expected
); 
 287         test
->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading
, line
, actual
, expected
); 
 // Empties the actual-results vectors (positions and tags). The expected data is
 //   not touched.
 292 void BITestData::clearResults() { 
 293     fActualBreakPositions
.removeAllElements(); 
 294     fActualTags
.removeAllElements(); 
 298 //-------------------------------------------------------------------------------------- 
 300 //    RBBITest    constructor and destructor 
 302 //-------------------------------------------------------------------------------------- 
 304 RBBITest::RBBITest() { 
 308 RBBITest::~RBBITest() { 
 311 //----------------------------------------------------------------------------------- 
 313 //   Test for status {tag} return value from break rules. 
 314 //        TODO:  a more thorough test. 
 316 //----------------------------------------------------------------------------------- 
 // Builds a RuleBasedBreakIterator from an inline rule string and checks, for each
 //   boundary found in testString1, both its offset and the value of getRuleStatus().
 317 void RBBITest::TestStatusReturn() { 
 318      UnicodeString 
rulesString1("$Letters = [:L:];\n" 
 319                                   "$Numbers = [:N:];\n" 
 322                                   "Help\\ {4}/me\\!;\n" 
 323                                   "[^$Letters $Numbers];\n" 
 324                                   "!.*;\n", -1, US_INV
); 
 325      UnicodeString testString1  
= "abc123..abc Help me Help me!"; 
 326                                 // 01234567890123456789012345678 
 // Expected boundary offsets in testString1 and the expected rule status at each
 //   one. The two arrays are indexed together; both end with -1.
 327      int32_t bounds1
[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1}; 
 328      int32_t brkStatus
[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1}; 
 330      UErrorCode status
=U_ZERO_ERROR
; 
 331      UParseError    parseError
; 
 333      BreakIterator 
*bi 
= new RuleBasedBreakIterator(rulesString1
, parseError
, status
); 
 // A construction failure is reported with dataerrln() rather than errln().
 334      if(U_FAILURE(status
)) { 
 335          dataerrln("FAIL : in construction - %s", u_errorName(status
)); 
 339          bi
->setText(testString1
); 
 // Visit every boundary from first() until next() returns DONE.
 340          for (pos
=bi
->first(); pos
!= BreakIterator::DONE
; pos
=bi
->next()) { 
 341              if (pos 
!= bounds1
[i
]) { 
 342                  errln("FAIL:  expected break at %d, got %d\n", bounds1
[i
], pos
); 
 346              int tag 
= bi
->getRuleStatus(); 
 347              if (tag 
!= brkStatus
[i
]) { 
 348                  errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos
, brkStatus
[i
], tag
); 
 // Debugging aid: prints to stdout a table with one row per code point of tstr,
 //   giving the code point in hex plus several property values and the character name.
 //   A dashed separator row is printed when the current index reaches the next
 //   entry of expected[].
 //   tstr          - text to dump; its native index is reset to 0 first
 //   expected      - indices at which separator rows are wanted
 //   expectedCount - number of entries in expected
 358 static void printStringBreaks(UText 
*tstr
, int expected
[], int expectedCount
) { 
 359     UErrorCode status 
= U_ZERO_ERROR
; 
 361     printf("code    alpha extend alphanum type word sent line name\n"); 
 362     int nextExpectedIndex 
= 0; 
 363     utext_setNativeIndex(tstr
, 0); 
 // j is re-read from the UText after each row rather than incremented.
 364     for (int j 
= 0; j 
< utext_nativeLength(tstr
); j
=utext_getNativeIndex(tstr
)) { 
 365         if (nextExpectedIndex 
< expectedCount 
&& j 
>= expected
[nextExpectedIndex
] ) { 
 366             printf("------------------------------------------------ %d\n", j
); 
 // utext_next32() returns the code point and advances the UText position.
 370         UChar32 c 
= utext_next32(tstr
); 
 371         u_charName(c
, U_UNICODE_CHAR_NAME
, name
, 100, &status
); 
 372         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c
, 
 374                            u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
), 
 376                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY
, 
 378                                                   U_SHORT_PROPERTY_NAME
), 
 379                            u_getPropertyValueName(UCHAR_WORD_BREAK
, 
 380                                                   u_getIntPropertyValue(c
, 
 382                                                   U_SHORT_PROPERTY_NAME
), 
 383                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK
, 
 384                                    u_getIntPropertyValue(c
, 
 385                                            UCHAR_SENTENCE_BREAK
), 
 386                                    U_SHORT_PROPERTY_NAME
), 
 387                            u_getPropertyValueName(UCHAR_LINE_BREAK
, 
 388                                    u_getIntPropertyValue(c
, 
 390                                    U_SHORT_PROPERTY_NAME
), 
 // UnicodeString overload: opens a read-only UText over ustr and passes it to the
 //   UText overload of printStringBreaks(). An open failure is reported with printf().
 396 static void printStringBreaks(const UnicodeString 
&ustr
, int expected
[], int expectedCount
) { 
 397    UErrorCode status 
= U_ZERO_ERROR
; 
 399    tstr 
= utext_openConstUnicodeString(NULL
, &ustr
, &status
); 
 400    if (U_FAILURE(status
)) { 
 401        printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status
)); 
 404    printStringBreaks(tstr
, expected
, expectedCount
); 
 // With a Thai ("th") word break iterator over four repetitions of a four-unit Thai
 //   word, checks that following(1) and following(0) both return 4.
 409 void RBBITest::TestBug3818() { 
 410     UErrorCode  status 
= U_ZERO_ERROR
; 
 412     // Four Thai words... 
 413     static const UChar thaiWordData
[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 
 414                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 }; 
 415     UnicodeString  
thaiStr(thaiWordData
); 
 417     BreakIterator
* bi 
= BreakIterator::createWordInstance(Locale("th"), status
); 
 // Either a failure status or a NULL iterator counts as a creation failure.
 418     if (U_FAILURE(status
) || bi 
== NULL
) { 
 419         errcheckln(status
, "Fail at file %s, line %d, status = %s", __FILE__
, __LINE__
, u_errorName(status
)); 
 422     bi
->setText(thaiStr
); 
 424     int32_t  startOfSecondWord 
= bi
->following(1); 
 425     if (startOfSecondWord 
!= 4) { 
 426         errln("Fail at file %s, line %d expected start of word at 4, got %d", 
 427             __FILE__
, __LINE__
, startOfSecondWord
); 
 429     startOfSecondWord 
= bi
->following(0); 
 430     if (startOfSecondWord 
!= 4) { 
 431         errln("Fail at file %s, line %d expected start of word at 4, got %d", 
 432             __FILE__
, __LINE__
, startOfSecondWord
); 
 437 //---------------------------------------------------------------------------- 
 439 // generalIteratorTest      Given a break iterator and a set of test data, 
 440 //                          Run the tests and report the results. 
 442 //---------------------------------------------------------------------------- 
 // Sets the iterator's text to td.fDataToBreak and then runs each of the traversal
 //   checks below, in this order, against the same iterator and test data.
 443 void RBBITest::generalIteratorTest(RuleBasedBreakIterator
& bi
, BITestData 
&td
) 
 446     bi
.setText(td
.fDataToBreak
); 
 448     testFirstAndNext(bi
, td
); 
 450     testLastAndPrevious(bi
, td
); 
 452     testFollowing(bi
, td
); 
 453     testPreceding(bi
, td
); 
 454     testIsBoundary(bi
, td
); 
 455     doMultipleSelectionTest(bi
, td
); 
 460 //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next() 
 // Iterates forward with first()/next() until DONE, storing each position and its
 //   rule status in td's actual-results vectors, then has td compare them with the
 //   expected values.
 463 void RBBITest::testFirstAndNext(RuleBasedBreakIterator
& bi
, BITestData 
&td
) 
 465     UErrorCode  status 
= U_ZERO_ERROR
; 
 470     logln("Test first and next"); 
 471     bi
.setText(td
.fDataToBreak
); 
 474     for (p
=bi
.first(); p
!=RuleBasedBreakIterator::DONE
; p
=bi
.next()) { 
 475         td
.fActualBreakPositions
.addElement(p
, status
);  // Save result. 
 476         tag 
= bi
.getRuleStatus(); 
 477         td
.fActualTags
.addElement(tag
, status
); 
 479             // If the iterator is not making forward progress, stop. 
 480             //  No need to raise an error here, it'll be detected in the normal check of results. 
 485     td
.checkResults("testFirstAndNext", this); 
 490 //  testLastAndPrevious.   Run the iterator backwards, starting with last(). 
 // Iterates backward with last()/previous() until DONE. Each position and tag is
 //   inserted at index 0 of td's actual-results vectors, so the vectors are filled in
 //   the reverse of visiting order before td.checkResults() compares them.
 492 void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator
& bi
,  BITestData 
&td
) 
 494     UErrorCode  status 
= U_ZERO_ERROR
; 
 496     int32_t     lastP  
= 0x7ffffffe; 
 499     logln("Test last and previous"); 
 500     bi
.setText(td
.fDataToBreak
); 
 503     for (p
=bi
.last(); p
!=RuleBasedBreakIterator::DONE
; p
=bi
.previous()) { 
 504         // Save break position.  Insert it at start of vector of results, shoving 
 505         //    already-saved results further towards the end. 
 506         td
.fActualBreakPositions
.insertElementAt(p
, 0, status
); 
 507         // bi.previous();   // TODO:  Why does this fix things up???? 
 509         tag 
= bi
.getRuleStatus(); 
 510         td
.fActualTags
.insertElementAt(tag
, 0, status
); 
 512             // If the iterator is not making progress, stop. 
 513             //  No need to raise an error here, it'll be detected in the normal check of results. 
 518     td
.checkResults("testLastAndPrevious", this); 
 // Collects break positions and tags for the following()-based traversal into td's
 //   actual-results vectors and has td compare them with the expected values. Also
 //   reports an error if the loop index is not at the text length when the loop ends.
 522 void RBBITest::testFollowing(RuleBasedBreakIterator
& bi
, BITestData 
&td
) 
 524     UErrorCode  status 
= U_ZERO_ERROR
; 
 527     int32_t     lastP  
= -2;     // A value that will never be returned as a break position. 
 528                                  //   cannot be -1; that is returned for DONE. 
 531     logln("testFollowing():"); 
 532     bi
.setText(td
.fDataToBreak
); 
 535     // Save the starting point, since we won't get that out of following. 
 537     td
.fActualBreakPositions
.addElement(p
, status
);  // Save result. 
 538     tag 
= bi
.getRuleStatus(); 
 539     td
.fActualTags
.addElement(tag
, status
); 
 // The index deliberately runs up to length()+1, i.e. past the end of the text.
 541     for (i 
= 0; i 
<= td
.fDataToBreak
.length()+1; i
++) { 
 544             if (p 
== RuleBasedBreakIterator::DONE
) { 
 547             // We've reached a new break position.  Save it. 
 548             td
.fActualBreakPositions
.addElement(p
, status
);  // Save result. 
 549             tag 
= bi
.getRuleStatus(); 
 550             td
.fActualTags
.addElement(tag
, status
); 
 554     // The loop normally exits by means of the break in the middle. 
 555     // Make sure that the index was at the correct position for the break iterator to have 
 557     if (i 
!= td
.fDataToBreak
.length()) { 
 558         errln("testFollowing():  iterator returned DONE prematurely."); 
 561     // Full check of all results. 
 562     td
.checkResults("testFollowing", this); 
 // Backward counterpart of testFollowing(): the index runs from the text length down
 //   to -1, new positions and tags are inserted at index 0 of td's actual-results
 //   vectors, and td.checkResults() compares them with the expected values.
 567 void RBBITest::testPreceding(RuleBasedBreakIterator
& bi
,  BITestData 
&td
) { 
 568     UErrorCode  status 
= U_ZERO_ERROR
; 
 571     int32_t     lastP  
= 0x7ffffffe; 
 574     logln("testPreceding():"); 
 575     bi
.setText(td
.fDataToBreak
); 
 // The first position and its tag are appended before the loop starts.
 579     td
.fActualBreakPositions
.addElement(p
, status
); 
 580     tag 
= bi
.getRuleStatus(); 
 581     td
.fActualTags
.addElement(tag
, status
); 
 583     for (i 
= td
.fDataToBreak
.length(); i
>=-1; i
--) { 
 586             if (p 
== RuleBasedBreakIterator::DONE
) { 
 589             // We've reached a new break position.  Save it. 
 590             td
.fActualBreakPositions
.insertElementAt(p
, 0, status
); 
 592             tag 
= bi
.getRuleStatus(); 
 593             td
.fActualTags
.insertElementAt(tag
, 0, status
); 
 596     // The loop normally exits by means of the break in the middle. 
 597     // Make sure that the index was at the correct position for the break iterator to have 
 600         errln("testPreceding():  iterator returned DONE prematurely."); 
 603     // Full check of all results. 
 604     td
.checkResults("testPreceding", this); 
 // Calls isBoundary() on every index from 0 through the text length (inclusive).
 //   Each index reported as a boundary is stored, with its rule status, in td's
 //   actual-results vectors, which td.checkResults() then compares with the expected
 //   values.
 609 void RBBITest::testIsBoundary(RuleBasedBreakIterator
& bi
,  BITestData 
&td
) { 
 610     UErrorCode  status 
= U_ZERO_ERROR
; 
 614     logln("testIsBoundary():"); 
 615     bi
.setText(td
.fDataToBreak
); 
 618     for (i 
= 0; i 
<= td
.fDataToBreak
.length(); i
++) { 
 619         if (bi
.isBoundary(i
)) { 
 620             td
.fActualBreakPositions
.addElement(i
, status
);  // Save result. 
 621             tag 
= bi
.getRuleStatus(); 
 622             td
.fActualTags
.addElement(tag
, status
); 
 625     td
.checkResults("testIsBoundary: ", this); 
 // Cross-checks next(n) against repeated single steps, using a clone of `iterator`.
 //   Forward pass:  the clone does first() then next(count); the result must equal the
 //                  offset the original iterator reached by calling next().
 //   Backward pass: the clone does last() then next(count); the result must equal the
 //                  offset the original iterator reached by calling previous().
 //   The clone is also compared with the original using operator!= and operator==.
 630 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator
& iterator
, BITestData 
&td
) 
 632     iterator
.setText(td
.fDataToBreak
); 
 634     RuleBasedBreakIterator
* testIterator 
=(RuleBasedBreakIterator
*)iterator
.clone(); 
 635     int32_t offset 
= iterator
.first(); 
 639     logln("doMultipleSelectionTest text of length: %d", td
.fDataToBreak
.length()); 
 // A fresh clone is expected to compare equal to its source.
 641     if (*testIterator 
!= iterator
) 
 642         errln("clone() or operator!= failed: two clones compared unequal"); 
 645         testOffset 
= testIterator
->first(); 
 646         testOffset 
= testIterator
->next(count
); 
 647         if (offset 
!= testOffset
) 
 648             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count 
+ ", next(n) returned " + testOffset 
+ " and next() had " + offset
); 
 650         if (offset 
!= RuleBasedBreakIterator::DONE
) { 
 652             offset 
= iterator
.next(); 
 // After the original has moved on, the two iterators should no longer compare equal.
 654             if (offset 
!= RuleBasedBreakIterator::DONE 
&& *testIterator 
== iterator
) { 
 655                 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count
, offset
); 
 656                 if (count 
> 10000 || offset 
== -1) { 
 657                     errln("operator== failed too many times. Stopping test."); 
 659                         errln("Does (RuleBasedBreakIterator::DONE == -1)?"); 
 665     } while (offset 
!= RuleBasedBreakIterator::DONE
); 
 667     // now do it backwards... 
 668     offset 
= iterator
.last(); 
 672         testOffset 
= testIterator
->last(); 
 673         testOffset 
= testIterator
->next(count
);   // next() with a negative arg is same as previous 
 674         if (offset 
!= testOffset
) 
 675             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count 
+ ", next(n) returned " + testOffset 
+ " and next() had " + offset
); 
 677         if (offset 
!= RuleBasedBreakIterator::DONE
) { 
 679             offset 
= iterator
.previous(); 
 681     } while (offset 
!= RuleBasedBreakIterator::DONE
); 
 687 //--------------------------------------------- 
 691 //--------------------------------------------- 
 // Runs generalIteratorTest() with a line break iterator for the default locale on
 //   test data made of a single empty chunk.
 692 void RBBITest::TestEmptyString() 
 694     UnicodeString text 
= ""; 
 695     UErrorCode status 
= U_ZERO_ERROR
; 
 697     BITestData 
x(status
); 
 698     ADD_DATACHUNK(x
, "", 0, status
);           // Break at start of data 
 699     RuleBasedBreakIterator
* bi 
= (RuleBasedBreakIterator 
*)BreakIterator::createLineInstance(Locale::getDefault(), status
); 
 700     if (U_FAILURE(status
)) 
 702         errcheckln(status
, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status
)); 
 705     generalIteratorTest(*bi
, x
); 
 // Fetches the locale list from BreakIterator::getAvailableLocales() and logs the
 //   name of every entry; an empty list is reported with dataerrln().
 709 void RBBITest::TestGetAvailableLocales() 
 711     int32_t locCount 
= 0; 
 // locCount is filled in by the call.
 712     const Locale
* locList 
= BreakIterator::getAvailableLocales(locCount
); 
 715         dataerrln("getAvailableLocales() returned an empty list!"); 
 716     // Just make sure that it's returning good memory. 
 718     for (i 
= 0; i 
< locCount
; ++i
) { 
 719         logln(locList
[i
].getName()); 
 723 //Testing the BreakIterator::getDisplayName() function 
 // Checks the text produced by both forms of BreakIterator::getDisplayName().
 724 void RBBITest::TestGetDisplayName() 
 726     UnicodeString   result
; 
 // Form without an explicit display locale: the expected English text is only
 //   checked when the default locale is US.
 728     BreakIterator::getDisplayName(Locale::getUS(), result
); 
 729     if (Locale::getDefault() == Locale::getUS() && result 
!= "English (United States)") 
 730         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \"" 
 // Form with US given as the display locale: checked unconditionally.
 733     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result
); 
 734     if (result 
!= "French (France)") 
 735         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \"" 
 // Word-break check on the string "boo." with a default-locale word iterator:
 //   a break is expected at the start, before the period, and at the end (offset 4).
 742 void RBBITest::TestEndBehaviour() 
 744     UErrorCode status 
= U_ZERO_ERROR
; 
 745     UnicodeString 
testString("boo."); 
 746     BreakIterator 
*wb 
= BreakIterator::createWordInstance(Locale::getDefault(), status
); 
 747     if (U_FAILURE(status
)) 
 749         errcheckln(status
, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status
)); 
 752     wb
->setText(testString
); 
 754     if (wb
->first() != 0) 
 755         errln("Didn't get break at beginning of string."); 
 757         errln("Didn't get break before period in \"boo.\""); 
 // Because of &&, next() is only called when current() is not already 4.
 758     if (wb
->current() != 4 && wb
->next() != 4) 
 759         errln("Didn't get break at end of string."); 
 // Gives a word break iterator a StringCharacterIterator restricted to a sub-range
 //   of "...Hello, World!..." and checks isBoundary() for indices from -1 up to
 //   `begin`: only index 0 is expected to be reported as a boundary.
 765 void RBBITest::TestBug4153072() { 
 766     UErrorCode status 
= U_ZERO_ERROR
; 
 767     BreakIterator 
*iter 
= BreakIterator::createWordInstance(Locale::getDefault(), status
); 
 768     if (U_FAILURE(status
)) 
 770         errcheckln(status
, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status
)); 
 773     UnicodeString 
str("...Hello, World!..."); 
 775     int32_t end 
= str
.length() - 3; 
 778     StringCharacterIterator
* textIterator 
= new StringCharacterIterator(str
, begin
, end
, begin
); 
 // textIterator is handed to the break iterator through adoptText().
 779     iter
->adoptText(textIterator
); 
 781     // Note: with the switch to UText, there is no way to restrict the 
 782     //       iteration range to begin at an index other than zero. 
 783     //       String character iterators created with a non-zero bound are 
 784     //         treated by RBBI as being empty. 
 785     for (index 
= -1; index 
< begin 
+ 1; ++index
) { 
 786         onBoundary 
= iter
->isBoundary(index
); 
 // Error when index 0 is not a boundary, or when any other index is one.
 787         if (index 
== 0?  !onBoundary 
: onBoundary
) { 
 788             errln((UnicodeString
)"Didn't handle isBoundary correctly with offset = " + index 
+ 
 789                             " and begin index = " + begin
); 
 797 // Test for problem reported by Ashok Matoria on 9 July 2007 
 798 //    One.<kSoftHyphen><kSpace>Two. 
 800 //    Sentence break at start (0) and then on calling next() it breaks at 
 801 //   'T' of "Two". Now, at this point if I do next() and 
 802 //    then previous(), it breaks at <kSoftHyphen> instead of 'T' of "Two". 
 // Regression test with an English sentence break iterator on "One.<U+00AD> Two.":
 //   asserts break positions 6 and 10 while moving forward, and 6 again after
 //   previous().
 804 void RBBITest::TestBug5775() { 
 805     UErrorCode status 
= U_ZERO_ERROR
; 
 806     BreakIterator 
*bi 
= BreakIterator::createSentenceInstance(Locale::getEnglish(), status
); 
 807     TEST_ASSERT_SUCCESS(status
); 
 808     if (U_FAILURE(status
)) { 
 811 // Check for status first for better handling of no data errors. 
 812     TEST_ASSERT(bi 
!= NULL
); 
 // The doubled backslash keeps \u00ad as literal text in the C string; the
 //   conversion to an actual U+00AD character is not shown in this function's
 //   visible lines.
 817     UnicodeString 
s("One.\\u00ad Two.", -1, US_INV
); 
 821     int pos 
= bi
->next(); 
 822     TEST_ASSERT(pos 
== 6); 
 824     TEST_ASSERT(pos 
== 10); 
 825     pos 
= bi
->previous(); 
 826     TEST_ASSERT(pos 
== 6); 
 832 //------------------------------------------------------------------------------ 
 834 //   RBBITest::Extended    Run  RBBI Tests from an external test data file 
 836 //------------------------------------------------------------------------------ 
 // Members of TestParams: the state for one data-driven break iterator test.
 839     BreakIterator   
*bi
;                   // Break iterator is set while parsing test source. 
 840                                            //   Changed out whenever test data changes break type. 
 842     UnicodeString    dataToBreak
;          // Data that is built up while parsing the test. 
 843     UVector32       
*expectedBreaks
;       // Expected break positions, matches dataToBreak UnicodeString. 
 844     UVector32       
*srcLine
;              // Positions in source file, indexed same as dataToBreak. 
 847     UText           
*textToBreak
;          // UText, could be UTF8 or UTF16. 
 848     UVector32       
*textMap
;              // Indexed by UText native offset; holds the matching UTF-16 dataToBreak
                                            //   offset, or -1 for offsets that setUTF16()/setUTF8() do not map.
 849     CharString       utf8String
;           // UTF-8 form of text to break. 
 // The constructor allocates the UVector32 members, passing status to each.
 851     TestParams(UErrorCode 
&status
) : dataToBreak() { 
 853         expectedBreaks   
= new UVector32(status
); 
 854         srcLine          
= new UVector32(status
); 
 855         srcCol           
= new UVector32(status
); 
 857         textMap          
= new UVector32(status
); 
 862         delete expectedBreaks
; 
 865         utext_close(textToBreak
); 
 // Lookups by break position bp; each goes through textMap (see the definitions).
 869     int32_t getSrcLine(int32_t bp
); 
 870     int32_t getExpectedBreak(int32_t bp
); 
 871     int32_t getSrcCol(int32_t bp
); 
 // Rebuild textToBreak and textMap from dataToBreak, as UTF-16 or UTF-8 text.
 873     void setUTF16(UErrorCode 
&status
); 
 874     void setUTF8(UErrorCode 
&status
); 
 877 // Append a UnicodeString to a CharString with UTF-8 encoding. 
 878 // Substitute any invalid chars. 
 879 //   Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted. 
 // Two-pass conversion: the first u_strToUTF8WithSub() call has a NULL output buffer
 //   and only measures the UTF-8 length; the second writes into a buffer obtained
 //   from dest.getAppendBuffer(), and the bytes are then appended to dest.
 //   dest   - receives the UTF-8 bytes
 //   src    - UTF-16 text to convert; 0xfffd is used as the substitution character
 //   status - nothing is done when this is already a failure
 880 static void CharStringAppend(CharString 
&dest
, const UnicodeString 
&src
, UErrorCode 
&status
) { 
 881     if (U_FAILURE(status
)) { 
 885     u_strToUTF8WithSub(NULL
, 0, &utf8Length
,            // Output Buffer, NULL for preflight. 
 886                        src
.getBuffer(), src
.length(),   // UTF-16 data 
 887                        0xfffd, NULL
,                    // Substitution char, number of subs. 
 // U_BUFFER_OVERFLOW_ERROR is tolerated after the measuring call; status is then reset.
 889     if (U_FAILURE(status
) && status 
!= U_BUFFER_OVERFLOW_ERROR
) { 
 892     status 
= U_ZERO_ERROR
; 
 894     char *buffer 
= dest
.getAppendBuffer(utf8Length
, utf8Length
, capacity
, status
); 
 895     u_strToUTF8WithSub(buffer
, utf8Length
, NULL
, 
 896                        src
.getBuffer(), src
.length(), 
 897                        0xfffd, NULL
, &status
); 
 898     dest
.append(buffer
, utf8Length
, status
); 
 // Opens textToBreak as a UText over dataToBreak and rebuilds textMap with one entry
 //   per UTF-16 index plus a final entry for the end of the text.
 902 void TestParams::setUTF16(UErrorCode 
&status
) { 
 // The existing textToBreak is passed in as the UText to reuse.
 903     textToBreak 
= utext_openUnicodeString(textToBreak
, &dataToBreak
, &status
); 
 904     textMap
->removeAllElements(); 
 905     for (int32_t i
=0; i
<dataToBreak
.length(); i
++) { 
 // An index that starts a code point is stored as itself; -1 is presumably the
 //   alternative branch for the other indices (the branch line is not shown here).
 906         if (i 
== dataToBreak
.getChar32Start(i
)) { 
 907             textMap
->addElement(i
, status
); 
 909             textMap
->addElement(-1, status
); 
 912     textMap
->addElement(dataToBreak
.length(), status
); 
 913     U_ASSERT(dataToBreak
.length() + 1 == textMap
->size()); 
 917 void TestParams::setUTF8(UErrorCode 
&status
) { 
 918     if (U_FAILURE(status
)) { 
 922     CharStringAppend(utf8String
, dataToBreak
, status
); 
 923     textToBreak 
= utext_openUTF8(textToBreak
, utf8String
.data(), utf8String
.length(), &status
); 
 924     if (U_FAILURE(status
)) { 
 928     textMap
->removeAllElements(); 
 929     int32_t utf16Index 
= 0; 
 931         textMap
->addElement(utf16Index
, status
); 
 932         UChar32 c32 
= utext_current32(textToBreak
); 
 936         utf16Index 
+= U16_LENGTH(c32
); 
 937         utext_next32(textToBreak
); 
 938         while (textMap
->size() < utext_getNativeIndex(textToBreak
)) { 
 939             textMap
->addElement(-1, status
); 
 942     U_ASSERT(utext_nativeLength(textToBreak
) + 1 == textMap
->size()); 
 946 int32_t TestParams::getSrcLine(int bp
) { 
 947     if (bp 
>= textMap
->size()) { 
 948         bp 
= textMap
->size() - 1; 
 951     for(; bp 
>= 0 ; --bp
) { 
 952         // Move to a character boundary if we are not on one already. 
 953         i 
= textMap
->elementAti(bp
); 
 958     return srcLine
->elementAti(i
); 
 962 int32_t TestParams::getExpectedBreak(int bp
) { 
 963     if (bp 
>= textMap
->size()) { 
 966     int32_t i 
= textMap
->elementAti(bp
); 
 969         retVal 
= expectedBreaks
->elementAti(i
); 
 975 int32_t TestParams::getSrcCol(int bp
) { 
 976     if (bp 
>= textMap
->size()) { 
 977         bp 
= textMap
->size() - 1; 
 980     for(; bp 
>= 0; --bp
) { 
 981         // Move bp to a character boundary if we are not on one already. 
 982         i 
= textMap
->elementAti(bp
); 
 987     return srcCol
->elementAti(i
); 
 991 void RBBITest::executeTest(TestParams 
*t
, UErrorCode 
&status
) { 
 996     TEST_ASSERT_SUCCESS(status
); 
 997     if (U_FAILURE(status
)) { 
1001     if (t
->bi 
== NULL
) { 
1005     t
->bi
->setText(t
->textToBreak
, status
); 
1007     //  Run the iterator forward 
1010     for (bp 
= t
->bi
->first(); bp 
!= BreakIterator::DONE
; bp 
= t
->bi
->next()) { 
1012             // Fail for lack of forward progress. 
1013             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d", 
1014                 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
)); 
1018         // Check that there we didn't miss an expected break between the last one 
1020         for (i
=prevBP
+1; i
<bp
; i
++) { 
1021             if (t
->getExpectedBreak(i
) != 0) { 
1022                 int expected
[] = {0, i
}; 
1023                 printStringBreaks(t
->dataToBreak
, expected
, 2); 
1024                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d", 
1025                       i
, t
->getSrcLine(i
), t
->getSrcCol(i
)); 
1029         // Check that the break we did find was expected 
1030         if (t
->getExpectedBreak(bp
) == 0) { 
1031             int expected
[] = {0, bp
}; 
1032             printStringBreaks(t
->textToBreak
, expected
, 2); 
1033             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d", 
1034                 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
)); 
1036             // The break was expected. 
1037             //   Check that the {nnn} tag value is correct. 
1038             int32_t expectedTagVal 
= t
->getExpectedBreak(bp
); 
1039             if (expectedTagVal 
== -1) { 
1042             int32_t line 
= t
->getSrcLine(bp
); 
1043             int32_t rs 
= ((RuleBasedBreakIterator 
*)t
->bi
)->getRuleStatus(); 
1044             if (rs 
!= expectedTagVal
) { 
1045                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n" 
1046                       "          Actual, Expected status = %4d, %4d", 
1047                     bp
, line
, t
->getSrcCol(bp
), rs
, expectedTagVal
); 
1054     // Verify that there were no missed expected breaks after the last one found 
1055     for (i
=prevBP
+1; i
<utext_nativeLength(t
->textToBreak
); i
++) { 
1056         if (t
->getExpectedBreak(i
) != 0) { 
1057             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d", 
1058                       i
, t
->getSrcLine(i
), t
->getSrcCol(i
)); 
1063     //  Run the iterator backwards, verify that the same breaks are found. 
1065     prevBP 
= utext_nativeLength(t
->textToBreak
)+2;  // start with a phony value for the last break pos seen. 
1066     for (bp 
= t
->bi
->last(); bp 
!= BreakIterator::DONE
; bp 
= t
->bi
->previous()) { 
1068             // Fail for lack of progress. 
1069             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d", 
1070                 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
)); 
1074         // Check that we didn't miss an expected break between the last one 
1075         //  and this one.  (UVector returns zeros for index out of bounds.) 
1076         for (i
=prevBP
-1; i
>bp
; i
--) { 
1077             if (t
->getExpectedBreak(i
) != 0) { 
1078                 errln("Reverse Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d", 
1079                       i
, t
->getSrcLine(i
), t
->getSrcCol(i
)); 
1083         // Check that the break we did find was expected 
1084         if (t
->getExpectedBreak(bp
) == 0) { 
1085             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d", 
1086                    bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
)); 
1088             // The break was expected. 
1089             //   Check that the {nnn} tag value is correct. 
1090             int32_t expectedTagVal 
= t
->getExpectedBreak(bp
); 
1091             if (expectedTagVal 
== -1) { 
1094             int line 
= t
->getSrcLine(bp
); 
1095             int32_t rs 
= t
->bi
->getRuleStatus(); 
1096             if (rs 
!= expectedTagVal
) { 
1097                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n" 
1098                       "          Actual, Expected status = %4d, %4d", 
1099                     bp
, line
, t
->getSrcCol(bp
), rs
, expectedTagVal
); 
1106     // Verify that there were no missed breaks prior to the last one found 
1107     for (i
=prevBP
-1; i
>=0; i
--) { 
1108         if (t
->getExpectedBreak(i
) != 0) { 
1109             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d", 
1110                       i
, t
->getSrcLine(i
), t
->getSrcCol(i
)); 
1114     // Check isBoundary() 
1115     for (i
=0; i 
< utext_nativeLength(t
->textToBreak
); i
++) { 
1116         UBool boundaryExpected 
= (t
->getExpectedBreak(i
) != 0); 
1117         UBool boundaryFound    
= t
->bi
->isBoundary(i
); 
1118         if (boundaryExpected 
!= boundaryFound
) { 
1119             errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n" 
1120                   "        Expected, Actual= %s, %s", 
1121                   i
, t
->getSrcLine(i
), t
->getSrcCol(i
), 
1122                   boundaryExpected 
? "true":"false", boundaryFound
? "true" : "false"); 
1126     // Check following() 
1127     for (i
=0; i 
< utext_nativeLength(t
->textToBreak
); i
++) { 
1128         int32_t actualBreak 
= t
->bi
->following(i
); 
1129         int32_t expectedBreak 
= BreakIterator::DONE
; 
1130         for (int32_t j
=i
+1; j 
<= utext_nativeLength(t
->textToBreak
); j
++) { 
1131             if (t
->getExpectedBreak(j
) != 0) { 
1136         if (expectedBreak 
!= actualBreak
) { 
1137             errln("following(%d) incorrect. File line,col= %4d,%4d\n" 
1138                   "        Expected, Actual= %d, %d", 
1139                   i
, t
->getSrcLine(i
), t
->getSrcCol(i
), expectedBreak
, actualBreak
); 
1143     // Check preceding() 
1144     for (i
=utext_nativeLength(t
->textToBreak
); i
>=0; i
--) { 
1145         int32_t actualBreak 
= t
->bi
->preceding(i
); 
1146         int32_t expectedBreak 
= BreakIterator::DONE
; 
1148         // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent. 
1149         // preceding(trailing byte) will return the index of some preceding code point, 
1150         // not the lead byte of the current code point, even though that has a smaller index. 
1151         // Therefore, start looking at the expected break data not at i-1, but at 
1152         // the start of code point index - 1. 
1153         utext_setNativeIndex(t
->textToBreak
, i
); 
1154         int32_t j 
= utext_getNativeIndex(t
->textToBreak
) - 1; 
1155         for (; j 
>= 0; j
--) { 
1156             if (t
->getExpectedBreak(j
) != 0) { 
1161         if (expectedBreak 
!= actualBreak
) { 
1162             errln("preceding(%d) incorrect. File line,col= %4d,%4d\n" 
1163                   "        Expected, Actual= %d, %d", 
1164                   i
, t
->getSrcLine(i
), t
->getSrcCol(i
), expectedBreak
, actualBreak
); 
1170 void RBBITest::TestExtended() { 
1171 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 
1172     UErrorCode      status  
= U_ZERO_ERROR
; 
1175     UnicodeString       rules
; 
1176     TestParams          
tp(status
); 
1178     RegexMatcher      
localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@=-]*) *>"), 0, status
); 
1179     if (U_FAILURE(status
)) { 
1180         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__
, __LINE__
, u_errorName(status
)); 
1185     //  Open and read the test data file. 
1187     const char *testDataDirectory 
= IntlTest::getSourceTestData(status
); 
1188     char testFileName
[1000]; 
1189     if (testDataDirectory 
== NULL 
|| strlen(testDataDirectory
) >= sizeof(testFileName
)) { 
1190         errln("Can't open test data.  Path too long."); 
1193     strcpy(testFileName
, testDataDirectory
); 
1194     strcat(testFileName
, "rbbitst.txt"); 
1197     UChar 
*testFile 
= ReadAndConvertFile(testFileName
, len
, "UTF-8", status
); 
1198     if (U_FAILURE(status
)) { 
1199         return; /* something went wrong, error already output */ 
1206     //  Put the test data into a UnicodeString 
1208     UnicodeString 
testString(FALSE
, testFile
, len
); 
1216     parseState 
= PARSE_TAG
; 
1218     EParseState savedState 
= PARSE_TAG
; 
1220     static const UChar CH_LF        
= 0x0a; 
1221     static const UChar CH_CR        
= 0x0d; 
1222     static const UChar CH_HASH      
= 0x23; 
1223     /*static const UChar CH_PERIOD    = 0x2e;*/ 
1224     static const UChar CH_LT        
= 0x3c; 
1225     static const UChar CH_GT        
= 0x3e; 
1226     static const UChar CH_BACKSLASH 
= 0x5c; 
1227     static const UChar CH_BULLET    
= 0x2022; 
1229     int32_t    lineNum  
= 1; 
1230     int32_t    colStart 
= 0; 
1232     int32_t    charIdx  
= 0; 
1234     int32_t    tagValue 
= 0;       // The numeric value of a <nnn> tag. 
1236     for (charIdx 
= 0; charIdx 
< len
; ) { 
1237         status 
= U_ZERO_ERROR
; 
1238         UChar  c 
= testString
.charAt(charIdx
); 
1240         if (c 
== CH_CR 
&& charIdx
<len 
&& testString
.charAt(charIdx
) == CH_LF
) { 
1241             // treat CRLF as a unit 
1245         if (c 
== CH_LF 
|| c 
== CH_CR
) { 
1249         column 
= charIdx 
- colStart 
+ 1; 
1251         switch (parseState
) { 
1253             if (c 
== 0x0a || c 
== 0x0d) { 
1254                 parseState 
= savedState
; 
1261                 parseState 
= PARSE_COMMENT
; 
1262                 savedState 
= PARSE_TAG
; 
1265             if (u_isUWhiteSpace(c
)) { 
1268             if (testString
.compare(charIdx
-1, 6, "<word>") == 0) { 
1270                 tp
.bi 
= BreakIterator::createWordInstance(locale
,  status
); 
1274             if (testString
.compare(charIdx
-1, 6, "<char>") == 0) { 
1276                 tp
.bi 
= BreakIterator::createCharacterInstance(locale
,  status
); 
1280             if (testString
.compare(charIdx
-1, 6, "<line>") == 0) { 
1282                 tp
.bi 
= BreakIterator::createLineInstance(locale
,  status
); 
1286             if (testString
.compare(charIdx
-1, 6, "<sent>") == 0) { 
1289                 tp
.bi 
= BreakIterator::createSentenceInstance(locale
,  status
); 
1293             if (testString
.compare(charIdx
-1, 7, "<title>") == 0) { 
1295                 tp
.bi 
= BreakIterator::createTitleInstance(locale
,  status
); 
1300             // <locale  loc_name> 
1301             localeMatcher
.reset(testString
); 
1302             if (localeMatcher
.lookingAt(charIdx
-1, status
)) { 
1303                 UnicodeString localeName 
= localeMatcher
.group(1, status
); 
1304                 char localeName8
[100]; 
1305                 localeName
.extract(0, localeName
.length(), localeName8
, sizeof(localeName8
), 0); 
1306                 locale 
= Locale::createFromName(localeName8
); 
1307                 charIdx 
+= localeMatcher
.group(0, status
).length() - 1; 
1308                 TEST_ASSERT_SUCCESS(status
); 
1311             if (testString
.compare(charIdx
-1, 6, "<data>") == 0) { 
1312                 parseState 
= PARSE_DATA
; 
1314                 tp
.dataToBreak 
= ""; 
1315                 tp
.expectedBreaks
->removeAllElements(); 
1316                 tp
.srcCol 
->removeAllElements(); 
1317                 tp
.srcLine
->removeAllElements(); 
1321             errln("line %d: Tag expected in test file.", lineNum
); 
1322             parseState 
= PARSE_COMMENT
; 
1323             savedState 
= PARSE_DATA
; 
1324             goto end_test
; // Stop the test. 
1329             if (c 
== CH_BULLET
) { 
1330                 int32_t  breakIdx 
= tp
.dataToBreak
.length(); 
1331                 tp
.expectedBreaks
->setSize(breakIdx
+1); 
1332                 tp
.expectedBreaks
->setElementAt(-1, breakIdx
); 
1333                 tp
.srcLine
->setSize(breakIdx
+1); 
1334                 tp
.srcLine
->setElementAt(lineNum
, breakIdx
); 
1335                 tp
.srcCol 
->setSize(breakIdx
+1); 
1336                 tp
.srcCol 
->setElementAt(column
, breakIdx
); 
1340             if (testString
.compare(charIdx
-1, 7, "</data>") == 0) { 
1341                 // Add final entry to mappings from break location to source file position. 
1342                 //  Need one extra because last break position returned is after the 
1343                 //    last char in the data, not at the last char. 
1344                 tp
.srcLine
->addElement(lineNum
, status
); 
1345                 tp
.srcCol 
->addElement(column
, status
); 
1347                 parseState 
= PARSE_TAG
; 
1351                 status 
= U_ZERO_ERROR
; 
1352                 tp
.setUTF16(status
); 
1353                 executeTest(&tp
, status
); 
1354                 TEST_ASSERT_SUCCESS(status
); 
1356                 // Run again, this time with UTF-8 text wrapped in a UText. 
1357                 status 
= U_ZERO_ERROR
; 
1359                 TEST_ASSERT_SUCCESS(status
); 
1360                 executeTest(&tp
, status
); 
1364             if (testString
.compare(charIdx
-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) { 
1365                 // Named character, e.g. \N{COMBINING GRAVE ACCENT} 
1366                 // Get the code point from the name and insert it into the test data. 
1367                 //   (Damn, no API takes names in Unicode  !!! 
1368                 //    we've got to take it back to char *) 
1369                 int32_t nameEndIdx 
= testString
.indexOf((UChar
)0x7d/*'}'*/, charIdx
); 
1370                 int32_t nameLength 
= nameEndIdx 
- (charIdx
+2); 
1371                 char charNameBuf
[200]; 
1372                 UChar32 theChar 
= -1; 
1373                 if (nameEndIdx 
!= -1) { 
1374                     UErrorCode status 
= U_ZERO_ERROR
; 
1375                     testString
.extract(charIdx
+2, nameLength
, charNameBuf
, sizeof(charNameBuf
)); 
1376                     charNameBuf
[sizeof(charNameBuf
)-1] = 0; 
1377                     theChar 
= u_charFromName(U_UNICODE_CHAR_NAME
, charNameBuf
, &status
); 
1378                     if (U_FAILURE(status
)) { 
1382                 if (theChar 
== -1) { 
1383                     errln("Error in named character in test file at line %d, col %d", 
1386                     // Named code point was recognized.  Insert it 
1387                     //   into the test data. 
1388                     tp
.dataToBreak
.append(theChar
); 
1389                     while (tp
.dataToBreak
.length() > tp
.srcLine
->size()) { 
1390                         tp
.srcLine
->addElement(lineNum
, status
); 
1391                         tp
.srcCol 
->addElement(column
, status
); 
1394                 if (nameEndIdx 
> charIdx
) { 
1395                     charIdx 
= nameEndIdx
+1; 
1404             if (testString
.compare(charIdx
-1, 2, "<>") == 0) { 
1406                 int32_t  breakIdx 
= tp
.dataToBreak
.length(); 
1407                 tp
.expectedBreaks
->setSize(breakIdx
+1); 
1408                 tp
.expectedBreaks
->setElementAt(-1, breakIdx
); 
1409                 tp
.srcLine
->setSize(breakIdx
+1); 
1410                 tp
.srcLine
->setElementAt(lineNum
, breakIdx
); 
1411                 tp
.srcCol 
->setSize(breakIdx
+1); 
1412                 tp
.srcCol 
->setElementAt(column
, breakIdx
); 
1418                 parseState 
= PARSE_NUM
; 
1422             if (c 
== CH_HASH 
&& column
==3) {   // TODO:  why is column off so far? 
1423                 parseState 
= PARSE_COMMENT
; 
1424                 savedState 
= PARSE_DATA
; 
1428             if (c 
== CH_BACKSLASH
) { 
1429                 // Check for \ at end of line, a line continuation. 
1430                 //     Advance over (discard) the newline 
1431                 UChar32 cp 
= testString
.char32At(charIdx
); 
1432                 if (cp 
== CH_CR 
&& charIdx
<len 
&& testString
.charAt(charIdx
+1) == CH_LF
) { 
1434                     //  Need an extra increment of the input ptr to move over both of them 
1437                 if (cp 
== CH_LF 
|| cp 
== CH_CR
) { 
1444                 // Let unescape handle the back slash. 
1445                 cp 
= testString
.unescapeAt(charIdx
); 
1447                     // Escape sequence was recognized.  Insert the char 
1448                     //   into the test data. 
1449                     tp
.dataToBreak
.append(cp
); 
1450                     while (tp
.dataToBreak
.length() > tp
.srcLine
->size()) { 
1451                         tp
.srcLine
->addElement(lineNum
, status
); 
1452                         tp
.srcCol 
->addElement(column
, status
); 
1458                 // Not a recognized backslash escape sequence. 
1459                 // Take the next char as a literal. 
1460                 //  TODO:  Should this be an error? 
1461                 c 
= testString
.charAt(charIdx
); 
1462                 charIdx 
= testString
.moveIndex32(charIdx
, 1); 
1465             // Normal, non-escaped data char. 
1466             tp
.dataToBreak
.append(c
); 
1468             // Save the mapping from offset in the data to line/column numbers in 
1469             //   the original input file.  Will be used for better error messages only. 
1470             //   If there's an expected break before this char, the slot in the mapping 
1471             //     vector will already be set for this char; don't overwrite it. 
1472             if (tp
.dataToBreak
.length() > tp
.srcLine
->size()) { 
1473                 tp
.srcLine
->addElement(lineNum
, status
); 
1474                 tp
.srcCol 
->addElement(column
, status
); 
1480             // We are parsing an expected numeric tag value, like <1234>, 
1481             //   within a chunk of data. 
1482             if (u_isUWhiteSpace(c
)) { 
1487                 // Finished the number.  Add the info to the expected break data, 
1488                 //   and switch parse state back to doing plain data. 
1489                 parseState 
= PARSE_DATA
; 
1490                 if (tagValue 
== 0) { 
1493                 int32_t  breakIdx 
= tp
.dataToBreak
.length(); 
1494                 tp
.expectedBreaks
->setSize(breakIdx
+1); 
1495                 tp
.expectedBreaks
->setElementAt(tagValue
, breakIdx
); 
1496                 tp
.srcLine
->setSize(breakIdx
+1); 
1497                 tp
.srcLine
->setElementAt(lineNum
, breakIdx
); 
1498                 tp
.srcCol 
->setSize(breakIdx
+1); 
1499                 tp
.srcCol 
->setElementAt(column
, breakIdx
); 
1504                 tagValue 
= tagValue
*10 + u_charDigitValue(c
); 
1508             errln("Syntax Error in test file at line %d, col %d", 
1510             parseState 
= PARSE_COMMENT
; 
1511             goto end_test
; // Stop the test 
1516         if (U_FAILURE(status
)) { 
1517             dataerrln("ICU Error %s while parsing test file at line %d.", 
1518                 u_errorName(status
), lineNum
); 
1519             status 
= U_ZERO_ERROR
; 
1520             goto end_test
; // Stop the test 
1531 //------------------------------------------------------------------------------- 
1533 //  TestDictRules   create a break iterator from source rules that includes a 
1534 //                  dictionary range.   Regression for bug #7130.  Source rules 
1535 //                  do not declare a break iterator type (word, line, sentence, etc. 
1536 //                  but the dictionary code, without a type, would loop. 
1538 //------------------------------------------------------------------------------- 
1539 void RBBITest::TestDictRules() { 
1540     const char *rules 
=  "$dictionary = [a-z]; \n" 
1542                          "$dictionary $dictionary; \n" 
1544                          "$dictionary $dictionary; \n"; 
1545     const char *text 
= "aa"; 
1546     UErrorCode status 
= U_ZERO_ERROR
; 
1547     UParseError parseError
; 
1549     RuleBasedBreakIterator 
bi(rules
, parseError
, status
); 
1550     if (U_SUCCESS(status
)) { 
1551         UnicodeString utext 
= text
; 
1555         for (loops 
= 0; loops
<10; loops
++) { 
1556             position 
= bi
.next(); 
1557             if (position 
== RuleBasedBreakIterator::DONE
) { 
1561         TEST_ASSERT(loops 
== 1); 
1563         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status
)); 
1569 //------------------------------------------------------------------------------- 
1571 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and 
1572 //    return the datain one big UChar * buffer, which the caller must delete. 
1575 //          fileName:   the name of the file, with no directory part.  The test data directory 
1577 //          ulen        an out parameter, receives the actual length (in UChars) of the file data. 
1578 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding 
1579 //                      specified here.  The BOM, if it exists, will be stripped from the returned data. 
1580 //                      Pass NULL for the system default encoding. 
1583 //                      The file data, converted to UChar. 
1584 //                      The caller must delete this when done with 
1585 //                           delete [] theBuffer; 
1587 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile. 
1588 //           Move this function to some common place. 
1590 //-------------------------------------------------------------------------------- 
1591 UChar 
*RBBITest::ReadAndConvertFile(const char *fileName
, int &ulen
, const char *encoding
, UErrorCode 
&status
) { 
1592     UChar       
*retPtr  
= NULL
; 
1593     char        *fileBuf 
= NULL
; 
1594     UConverter
* conv     
= NULL
; 
1598     if (U_FAILURE(status
)) { 
1605     f 
= fopen(fileName
, "rb"); 
1607         dataerrln("Error opening test data file %s\n", fileName
); 
1608         status 
= U_FILE_ACCESS_ERROR
; 
1617     fseek( f
, 0, SEEK_END
); 
1618     fileSize 
= ftell(f
); 
1619     fileBuf 
= new char[fileSize
]; 
1620     fseek(f
, 0, SEEK_SET
); 
1621     amt_read 
= fread(fileBuf
, 1, fileSize
, f
); 
1622     if (amt_read 
!= fileSize 
|| fileSize 
<= 0) { 
1623         errln("Error reading test data file."); 
1624         goto cleanUpAndReturn
; 
1628     // Look for a Unicode Signature (BOM) on the data just read 
1630     int32_t        signatureLength
; 
1631     const char *   fileBufC
; 
1632     const char*    bomEncoding
; 
1635     bomEncoding 
= ucnv_detectUnicodeSignature( 
1636         fileBuf
, fileSize
, &signatureLength
, &status
); 
1637     if(bomEncoding
!=NULL 
){ 
1638         fileBufC  
+= signatureLength
; 
1639         fileSize  
-= signatureLength
; 
1640         encoding 
= bomEncoding
; 
1644     // Open a converter to take the rule file to UTF-16 
1646     conv 
= ucnv_open(encoding
, &status
); 
1647     if (U_FAILURE(status
)) { 
1648         goto cleanUpAndReturn
; 
1652     // Convert the rules to UChar. 
1653     //  Preflight first to determine required buffer size. 
1655     ulen 
= ucnv_toUChars(conv
, 
1661     if (status 
== U_BUFFER_OVERFLOW_ERROR
) { 
1662         // Buffer Overflow is expected from the preflight operation. 
1663         status 
= U_ZERO_ERROR
; 
1665         retPtr 
= new UChar
[ulen
+1]; 
1678     if (U_FAILURE(status
)) { 
1679         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
)); 
1689 //-------------------------------------------------------------------------------------------- 
1691 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium 
1693 //------------------------------------------------------------------------------------------- 
1694 void RBBITest::TestUnicodeFiles() { 
1695     RuleBasedBreakIterator  
*bi
; 
1696     UErrorCode               status 
= U_ZERO_ERROR
; 
1698     bi 
=  (RuleBasedBreakIterator 
*)BreakIterator::createCharacterInstance(Locale::getEnglish(), status
); 
1699     TEST_ASSERT_SUCCESS(status
); 
1700     if (U_SUCCESS(status
)) { 
1701         runUnicodeTestData("GraphemeBreakTest.txt", bi
); 
1705     bi 
=  (RuleBasedBreakIterator 
*)BreakIterator::createWordInstance(Locale::getEnglish(), status
); 
1706     TEST_ASSERT_SUCCESS(status
); 
1707     if (U_SUCCESS(status
)) { 
1708         runUnicodeTestData("WordBreakTest.txt", bi
); 
1712     bi 
=  (RuleBasedBreakIterator 
*)BreakIterator::createSentenceInstance(Locale::getEnglish(), status
); 
1713     TEST_ASSERT_SUCCESS(status
); 
1714     if (U_SUCCESS(status
)) { 
1715         runUnicodeTestData("SentenceBreakTest.txt", bi
); 
1719     bi 
=  (RuleBasedBreakIterator 
*)BreakIterator::createLineInstance(Locale::getEnglish(), status
); 
1720     TEST_ASSERT_SUCCESS(status
); 
1721     if (U_SUCCESS(status
)) { 
1722         runUnicodeTestData("LineBreakTest.txt", bi
); 
1728 // Check for test cases from the Unicode test data files that are known to fail 
1729 // and should be skipped because ICU is not yet able to fully implement the spec. 
1730 // See ticket #7270. 
1732 UBool 
RBBITest::testCaseIsKnownIssue(const UnicodeString 
&testCase
, const char *fileName
) { 
1733     static const UChar badTestCases
[][4] = {                     // Line Numbers from Unicode 7.0.0 file. 
1734         {(UChar
)0x200B, (UChar
)0x0020, (UChar
)0x007D, (UChar
)0x0000},   // Line 5198 
1735         {(UChar
)0x200B, (UChar
)0x0020, (UChar
)0x0029, (UChar
)0x0000},   // Line 5202 
1736         {(UChar
)0x200B, (UChar
)0x0020, (UChar
)0x0021, (UChar
)0x0000},   // Line 5214 
1737         {(UChar
)0x200B, (UChar
)0x0020, (UChar
)0x002c, (UChar
)0x0000},   // Line 5246 
1738         {(UChar
)0x200B, (UChar
)0x0020, (UChar
)0x002f, (UChar
)0x0000},   // Line 5298 
1739         {(UChar
)0x200B, (UChar
)0x0020, (UChar
)0x2060, (UChar
)0x0000}    // Line 5302 
1741     if (strcmp(fileName
, "LineBreakTest.txt") != 0) { 
1745     for (int i
=0; i
<UPRV_LENGTHOF(badTestCases
); i
++) { 
1746         if (testCase 
== UnicodeString(badTestCases
[i
])) { 
1747             return logKnownIssue("7270"); 
1754 //-------------------------------------------------------------------------------------------- 
1756 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium 
1758 //------------------------------------------------------------------------------------------- 
1759 void RBBITest::runUnicodeTestData(const char *fileName
, RuleBasedBreakIterator 
*bi
) { 
1760 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 
1761     UErrorCode  status 
= U_ZERO_ERROR
; 
1764     //  Open and read the test data file, put it into a UnicodeString. 
1766     const char *testDataDirectory 
= IntlTest::getSourceTestData(status
); 
1767     char testFileName
[1000]; 
1768     if (testDataDirectory 
== NULL 
|| strlen(testDataDirectory
) >= sizeof(testFileName
)) { 
1769         dataerrln("Can't open test data.  Path too long."); 
1772     strcpy(testFileName
, testDataDirectory
); 
1773     strcat(testFileName
, fileName
); 
1775     logln("Opening data file %s\n", fileName
); 
1778     UChar 
*testFile 
= ReadAndConvertFile(testFileName
, len
, "UTF-8", status
); 
1779     if (status 
!= U_FILE_ACCESS_ERROR
) { 
1780         TEST_ASSERT_SUCCESS(status
); 
1781         TEST_ASSERT(testFile 
!= NULL
); 
1783     if (U_FAILURE(status
) || testFile 
== NULL
) { 
1784         return; /* something went wrong, error already output */ 
1786     UnicodeString 
testFileAsString(TRUE
, testFile
, len
); 
1789     //  Parse the test data file using a regular expression. 
1790     //  Each kind of token is recognized in its own capture group; what type of item was scanned 
1791     //     is identified by which group had a match. 
1793     //    Caputure Group #                  1          2            3            4           5 
1794     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n 
1796     UnicodeString 
tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV
); 
1797     RegexMatcher    
tokenMatcher(tokenExpr
, testFileAsString
, UREGEX_MULTILINE 
| UREGEX_DOTALL
, status
); 
1798     UnicodeString   testString
; 
1799     UVector32       
breakPositions(status
); 
1801     TEST_ASSERT_SUCCESS(status
); 
1802     if (U_FAILURE(status
)) { 
1807     //  Scan through each test case, building up the string to be broken in testString, 
1808     //   and the positions that should be boundaries in the breakPositions vector. 
1811     while (tokenMatcher
.find()) { 
1812         if(tokenMatcher
.hitEnd()) { 
1813           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for. 
1814              This occurred when the text file was corrupt (wasn't marked as UTF-8) 
1815              and caused an infinite loop here on EBCDIC systems! 
1817           fprintf(stderr
,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName
, ++spin
); 
1820         if (tokenMatcher
.start(1, status
) >= 0) { 
1821             // Scanned a divide sign, indicating a break position in the test data. 
1822             if (testString
.length()>0) { 
1823                 breakPositions
.addElement(testString
.length(), status
); 
1826         else if (tokenMatcher
.start(2, status
) >= 0) { 
1827             // Scanned an 'x', meaning no break at this position in the test data 
1828             //   Nothing to be done here. 
1830         else if (tokenMatcher
.start(3, status
) >= 0) { 
1831             // Scanned Hex digits.  Convert them to binary, append to the character data string. 
1832             const UnicodeString 
&hexNumber 
= tokenMatcher
.group(3, status
); 
1833             int length 
= hexNumber
.length(); 
1836                 hexNumber
.extract (0, length
, buf
, sizeof(buf
), US_INV
); 
1837                 UChar32 c 
= (UChar32
)strtol(buf
, NULL
, 16); 
1839                     testString
.append(c
); 
1841                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n", 
1842                        fileName
, lineNumber
); 
1845                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n", 
1846                        fileName
, lineNumber
); 
1849         else if (tokenMatcher
.start(4, status
) >= 0) { 
1850             // Scanned to end of a line, possibly skipping over a comment in the process. 
1851             //   If the line from the file contained test data, run the test now. 
1852             if (testString
.length() > 0 && !testCaseIsKnownIssue(testString
, fileName
)) {   
1853                 checkUnicodeTestCase(fileName
, lineNumber
, testString
, &breakPositions
, bi
); 
1856             // Clear out this test case. 
1857             //    The string and breakPositions vector will be refilled as the next 
1858             //       test case is parsed. 
1859             testString
.remove(); 
1860             breakPositions
.removeAllElements(); 
1863             // Scanner catchall.  Something unrecognized appeared on the line. 
1865             UnicodeString uToken 
= tokenMatcher
.group(0, status
); 
1866             uToken
.extract(0, uToken
.length(), token
, (uint32_t)sizeof(token
)); 
1867             token
[sizeof(token
)-1] = 0; 
1868             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName
, lineNumber
, token
); 
1870             // Clean up, in preparation for continuing with the next line. 
1871             testString
.remove(); 
1872             breakPositions
.removeAllElements(); 
1875         TEST_ASSERT_SUCCESS(status
); 
1876         if (U_FAILURE(status
)) { 
1882  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS 
1885 //-------------------------------------------------------------------------------------------- 
1887 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium 
1888 //                            test data files.  Do only a simple, forward-only check - 
1889 //                            this test is mostly to check that ICU and the Unicode 
1890 //                            data agree with each other. 
//
//   Parameters:
//     testFileName, lineNumber   Used only to label the messages reported through errln().
//     testString                 The text handed to bi->setText().
//     breakPositions             Expected boundary offsets, read in order with
//                                elementAti(expectedI).
//     bi                         The break iterator being checked.
//
//   Mismatches are reported with errln(); the function itself returns nothing.
//   NOTE(review): the statements that initialize and advance 'pos' and 'expectedI' are
//   not visible in this listing - presumably the iterator is stepped forward from its
//   first boundary; confirm against the full source.
1892 //-------------------------------------------------------------------------------------------- 
1893 void RBBITest::checkUnicodeTestCase(const char *testFileName
, int lineNumber
, 
1894                          const UnicodeString 
&testString
,   // Text data to be broken 
1895                          UVector32 
*breakPositions
,         // Positions where breaks should be found. 
1896                          RuleBasedBreakIterator 
*bi
) { 
1897     int32_t pos
;                 // Break Position in the test string 
1898     int32_t expectedI 
= 0;       // Index of expected break position in the vector of expected results. 
1899     int32_t expectedPos
;         // Expected break position (index into test string) 
1901     bi
->setText(testString
); 
1905     while (pos 
!= BreakIterator::DONE
) { 
        // All expected positions are already used up, so this boundary is an extra one.
1906         if (expectedI 
>= breakPositions
->size()) { 
1907             errln("Test file \"%s\", line %d, unexpected break found at position %d", 
1908                 testFileName
, lineNumber
, pos
); 
1911         expectedPos 
= breakPositions
->elementAti(expectedI
); 
        // The iterator stopped before the next expected position: an extra boundary.
1912         if (pos 
< expectedPos
) { 
1913             errln("Test file \"%s\", line %d, unexpected break found at position %d", 
1914                 testFileName
, lineNumber
, pos
); 
        // The iterator went past an expected position: a missing boundary.
1917         if (pos 
> expectedPos
) { 
1918             errln("Test file \"%s\", line %d, failed to find expected break at position %d", 
1919                 testFileName
, lineNumber
, expectedPos
); 
    // The iterator is exhausted while expected positions remain unmatched.
1926     if (pos
==BreakIterator::DONE 
&& expectedI
<breakPositions
->size()) { 
1927         errln("Test file \"%s\", line %d, failed to find expected break at position %d", 
1928             testFileName
, lineNumber
, breakPositions
->elementAti(expectedI
)); 
1934 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 
1935 //--------------------------------------------------------------------------------------- 
1937 //   class RBBIMonkeyKind 
1939 //      Monkey Test for Break Iteration 
1940 //      Abstract interface class.   Concrete derived classes independently 
1941 //      implement the break rules for different iterator types. 
1943 //      The Monkey Test itself doesn't know which type of break iterator it is 
1944 //      testing, but works purely in terms of the interface defined here. 
1946 //--------------------------------------------------------------------------------------- 
1947 class RBBIMonkeyKind 
{ 
1949     // Return a UVector of UnicodeSets, representing the character classes used 
1950     //   for this type of iterator. 
1951     virtual  UVector  
*charClasses() = 0; 
1953     // Set the test text on which subsequent calls to next() will operate 
1954     virtual  void      setText(const UnicodeString 
&s
) = 0; 
1956     // Find the next break position, starting from the prev break position, or from zero. 
1957     // Return -1 after reaching end of string. 
1958     virtual  int32_t   next(int32_t i
) = 0; 
    // Virtual destructor, so derived monkeys can be destroyed through this interface.
1960     virtual ~RBBIMonkeyKind(); 
    // Error status saved by the constructors in this file when building their sets fails;
    //   the derived next() implementations test it with U_FAILURE() before doing any work.
1961     UErrorCode       deferredStatus
; 
// Base-class constructor: starts every monkey with a clean deferred status.
//   Derived constructors overwrite deferredStatus when their own setup fails.
1970 RBBIMonkeyKind::RBBIMonkeyKind() { 
1971     deferredStatus 
= U_ZERO_ERROR
; 
// Out-of-line definition of the virtual destructor declared in the class.
//   NOTE(review): the body is not visible in this listing - presumably empty; confirm.
1974 RBBIMonkeyKind::~RBBIMonkeyKind() { 
//----------------------------------------------------------------------------------------
//
//   Random Numbers.  Similar to standard lib rand() and srand()
//                    Not using library to
//                      1.  Get same results on all platforms.
//                      2.  Get access to current seed, to more easily reproduce failures.
//
//---------------------------------------------------------------------------------------

// Current generator state.  Assign to it to reseed; read it to record the seed
//   needed to reproduce a run.
static uint32_t m_seed = 1;

// Advance the generator one step and return the next pseudo-random value.
//   The state update is a linear congruential step; the unsigned arithmetic wraps
//   modulo 2^32.  The result is taken from the upper part of the state and is
//   always in the range 0 .. 32767.
static uint32_t m_rand()
{
    m_seed = m_seed * 1103515245 + 12345;
    return (m_seed / 65536) % 32768;
}
1995 //------------------------------------------------------------------------------------------ 
1997 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation 
1998 //                             of RBBIMonkeyKind. 
//
//                             The constructor builds one UnicodeSet per
//                             Grapheme_Cluster_Break property value and next() applies
//                             the grapheme cluster rules to the text set by setText().
//                             NOTE(review): some member declarations (e.g. fSets, fLSet,
//                             fVSet, fTSet, fLVSet) are used elsewhere in the file but are
//                             not visible in this listing.
2000 //------------------------------------------------------------------------------------------ 
2001 class RBBICharMonkey
: public RBBIMonkeyKind 
{ 
    // Overrides of the RBBIMonkeyKind interface.
2004     virtual          ~RBBICharMonkey(); 
2005     virtual  UVector 
*charClasses(); 
2006     virtual  void     setText(const UnicodeString 
&s
); 
2007     virtual  int32_t  next(int32_t i
); 
    // Character classes; each one is allocated with new in the constructor.
2011     UnicodeSet  
*fCRLFSet
; 
2012     UnicodeSet  
*fControlSet
; 
2013     UnicodeSet  
*fExtendSet
; 
2014     UnicodeSet  
*fRegionalIndicatorSet
; 
2015     UnicodeSet  
*fPrependSet
; 
2016     UnicodeSet  
*fSpacingSet
; 
2021     UnicodeSet  
*fLVTSet
; 
    // Union of the L, V, T, LV and LVT sets (filled with addAll() in the constructor).
2022     UnicodeSet  
*fHangulSet
; 
    // The full code point range 0 .. 0x10ffff.
2023     UnicodeSet  
*fAnySet
; 
    // Text being scanned by next(); a pointer, so the string is not copied here.
2025     const UnicodeString 
*fText
; 
// Constructor.  Builds the grapheme-cluster character classes from Unicode property
//   patterns, collects the ones used for test data generation in fSets, and records
//   any failure in deferredStatus instead of reporting it immediately.
2029 RBBICharMonkey::RBBICharMonkey() { 
2030     UErrorCode  status 
= U_ZERO_ERROR
; 
    // One set per Grapheme_Cluster_Break value; all share the same status variable.
2034     fCRLFSet    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status
); 
2035     fControlSet 
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status
); 
2036     fExtendSet  
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status
); 
2037     fRegionalIndicatorSet 
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status
); 
2038     fPrependSet 
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status
); 
2039     fSpacingSet 
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status
); 
2040     fLSet       
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status
); 
2041     fVSet       
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status
); 
2042     fTSet       
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status
); 
2043     fLVSet      
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status
); 
2044     fLVTSet     
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status
); 
    // fHangulSet is the union of the five Hangul syllable-type sets built above.
2045     fHangulSet  
= new UnicodeSet(); 
2046     fHangulSet
->addAll(*fLSet
); 
2047     fHangulSet
->addAll(*fVSet
); 
2048     fHangulSet
->addAll(*fTSet
); 
2049     fHangulSet
->addAll(*fLVSet
); 
2050     fHangulSet
->addAll(*fLVTSet
); 
2051     fAnySet     
= new UnicodeSet(0, 0x10ffff); 
    // fSets is the list of classes handed out by charClasses().  The individual
    //   L/V/T/LV/LVT sets are represented only through fHangulSet.
2053     fSets       
= new UVector(status
); 
2054     fSets
->addElement(fCRLFSet
,    status
); 
2055     fSets
->addElement(fControlSet
, status
); 
2056     fSets
->addElement(fExtendSet
,  status
); 
2057     fSets
->addElement(fRegionalIndicatorSet
, status
); 
    // The Prepend class is only offered when it actually contains characters.
2058     if (!fPrependSet
->isEmpty()) { 
2059         fSets
->addElement(fPrependSet
, status
); 
2061     fSets
->addElement(fSpacingSet
, status
); 
2062     fSets
->addElement(fHangulSet
,  status
); 
2063     fSets
->addElement(fAnySet
,     status
); 
    // Remember any failure; next() checks deferredStatus before scanning.
2064     if (U_FAILURE(status
)) { 
2065         deferredStatus 
= status
; 
// Implements RBBIMonkeyKind::setText() for the grapheme cluster monkey.
//   NOTE(review): the body is not visible in this listing - presumably it stores the
//   address of s in fText, which next() reads; confirm against the full source.
2070 void RBBICharMonkey::setText(const UnicodeString 
&s
) { 
// Find the next grapheme cluster boundary after prevPos in the text referenced by fText,
//   by testing the rules below, in order, at successive candidate positions.
//   The base class documents the result as the break position, or -1 at end of string.
//   NOTE(review): the loop header and the action taken when each rule matches
//   (keep scanning vs. stop at this position) are not visible in this listing.
2076 int32_t RBBICharMonkey::next(int32_t prevPos
) { 
2077     int    p0
, p1
, p2
, p3
;    // Indices of the significant code points around the 
2078                               //   break position being tested.  The candidate break 
2079                               //   location is before p2. 
2083     UChar32 c0
, c1
, c2
, c3
;   // The code points at p0, p1, p2 & p3. 
    // A failure recorded while the sets were being built makes the rules unusable.
2085     if (U_FAILURE(deferredStatus
)) { 
2089     // Previous break at end of string.  return DONE. 
2090     if (prevPos 
>= fText
->length()) { 
2093     p0 
= p1 
= p2 
= p3 
= prevPos
; 
2094     c3 
=  fText
->char32At(prevPos
); 
2096     (void)p0
;   // suppress set but not used warning. 
2099     // Loop runs once per "significant" character position in the input text. 
2101         // Move all of the positions forward in the input string. 
2106         // Advance p3 by one codepoint 
2107         p3 
= fText
->moveIndex32(p3
, 1); 
2108         c3 
= fText
->char32At(p3
); 
2111             // Still warming up the loop.  (won't work with zero length strings, but we don't care) 
2114         if (p2 
== fText
->length()) { 
2115             // Reached end of string.  Always a break position. 
2120         //     No Extend or Format characters may appear between the CR and LF, 
2121         //     which requires the additional check for p2 immediately following p1. 
2123         if (c1
==0x0D && c2
==0x0A && p1
==(p2
-1)) { 
2127         // Rule (GB4).   ( Control | CR | LF ) <break> 
2128         if (fControlSet
->contains(c1
) || 
2134         // Rule (GB5)    <break>  ( Control | CR | LF ) 
2136         if (fControlSet
->contains(c2
) || 
2143         // Rule (GB6)  L x ( L | V | LV | LVT ) 
2144         if (fLSet
->contains(c1
) && 
2145                (fLSet
->contains(c2
)  || 
2146                 fVSet
->contains(c2
)  || 
2147                 fLVSet
->contains(c2
) || 
2148                 fLVTSet
->contains(c2
))) { 
2152         // Rule (GB7)    ( LV | V )  x  ( V | T ) 
2153         if ((fLVSet
->contains(c1
) || fVSet
->contains(c1
)) && 
2154             (fVSet
->contains(c2
) || fTSet
->contains(c2
)))  { 
2158         // Rule (GB8)    ( LVT | T)  x T 
2159         if ((fLVTSet
->contains(c1
) || fTSet
->contains(c1
)) && 
2160             fTSet
->contains(c2
))  { 
2164         // Just adding extra Apple rule here does not work, behavior depends on arbitrary context 
2166         // Rule (GB8a)    Regional_Indicator x Regional_Indicator 
2167         if (fRegionalIndicatorSet
->contains(c1
) && fRegionalIndicatorSet
->contains(c2
)) { 
2171         // Rule (GB9)    x Extend 
2172         if (fExtendSet
->contains(c2
))  { 
2176         // Rule (GB9a)   x  SpacingMark 
2177         if (fSpacingSet
->contains(c2
)) { 
2181         // Rule (GB9b)   Prepend x 
2182         if (fPrependSet
->contains(c1
)) { 
2186         // Rule (GB10)  Any  <break>  Any 
// Implements RBBIMonkeyKind::charClasses() for the grapheme cluster monkey.
//   NOTE(review): the body is not visible in this listing - presumably it returns the
//   fSets vector filled by the constructor; confirm.
2196 UVector  
*RBBICharMonkey::charClasses() { 
// Destructor.  Releases objects allocated with new in the constructor.
//   NOTE(review): only one delete statement is visible in this listing; verify that every
//   set created in the constructor (and the fSets vector) is released exactly once.
2201 RBBICharMonkey::~RBBICharMonkey() { 
2206     delete fRegionalIndicatorSet
; 
2218 //------------------------------------------------------------------------------------------ 
2220 //   class RBBIWordMonkey      Word Break specific implementation 
2221 //                             of RBBIMonkeyKind. 
//
//                             The constructor builds one UnicodeSet per Word_Break
//                             property value (plus a few helper sets) and next() applies
//                             the word boundary rules to the text set by setText().
//                             NOTE(review): some member declarations (e.g. fSets, fCRSet,
//                             fLFSet) are used elsewhere in the file but are not visible
//                             in this listing.
2223 //------------------------------------------------------------------------------------------ 
2224 class RBBIWordMonkey
: public RBBIMonkeyKind 
{ 
    // Overrides of the RBBIMonkeyKind interface.
2227     virtual          ~RBBIWordMonkey(); 
2228     virtual  UVector 
*charClasses(); 
2229     virtual  void     setText(const UnicodeString 
&s
); 
2230     virtual int32_t   next(int32_t i
); 
    // Character classes; each one is allocated with new in the constructor.
2236     UnicodeSet  
*fNewlineSet
; 
2237     UnicodeSet  
*fRegionalIndicatorSet
; 
2238     UnicodeSet  
*fKatakanaSet
; 
2239     UnicodeSet  
*fHebrew_LetterSet
; 
2240     UnicodeSet  
*fALetterSet
; 
2241     // TODO(jungshik): Do we still need this change?  
2242     // UnicodeSet  *fALetterSet; // matches ALetterPlus in word.txt 
2243     UnicodeSet  
*fSingle_QuoteSet
; 
2244     UnicodeSet  
*fDouble_QuoteSet
; 
2245     UnicodeSet  
*fMidNumLetSet
; 
2246     UnicodeSet  
*fMidLetterSet
; 
2247     UnicodeSet  
*fMidNumSet
; 
2248     UnicodeSet  
*fNumericSet
; 
2249     UnicodeSet  
*fFormatSet
; 
    // Everything left after the constructor subtracts the other classes from the full range.
2250     UnicodeSet  
*fOtherSet
; 
2251     UnicodeSet  
*fExtendSet
; 
2252     UnicodeSet  
*fExtendNumLetSet
; 
    // Hangul syllables, Han, Hiragana and Katakana; the constructor removes these from
    //   fALetterSet and fOtherSet.
2253     UnicodeSet  
*fDictionaryCjkSet
; 
    // Text being scanned by next(); a pointer, so the string is not copied here.
2255     const UnicodeString  
*fText
; 
// Constructor.  Builds the Word_Break character classes from Unicode property patterns,
//   derives fOtherSet as "everything not in another class", collects the classes used
//   for test data generation in fSets, and records any failure in deferredStatus.
2259 RBBIWordMonkey::RBBIWordMonkey() 
2261     UErrorCode  status 
= U_ZERO_ERROR
; 
2263     fSets            
= new UVector(status
); 
2265     fCRSet           
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status
); 
2266     fLFSet           
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status
); 
2267     fNewlineSet      
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status
); 
2268     fDictionaryCjkSet
= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status
); 
2269     // Exclude Hangul syllables from ALetterSet during testing. 
2270     // Leave CJK dictionary characters out from the monkey tests! 
    // NOTE(review): fALetterSet is assigned here and again further down.  The lines around
    //   this statement are not visible in this listing - presumably it is disabled by a
    //   preprocessor guard; if it is live, the set created here is leaked.  Confirm.
2272     fALetterSet      
= new UnicodeSet("[\\p{Word_Break = ALetter}" 
2273                                       "[\\p{Line_Break = Complex_Context}" 
2274                                       "-\\p{Grapheme_Cluster_Break = Extend}" 
2275                                       "-\\p{Grapheme_Cluster_Break = Control}" 
2279     fRegionalIndicatorSet 
=  new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status
); 
2280     fKatakanaSet      
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status
); 
2281     fHebrew_LetterSet 
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status
); 
2282     fALetterSet       
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status
); 
    // Dictionary-handled characters are taken out of ALetter (see the comment above).
2283     fALetterSet
->removeAll(*fDictionaryCjkSet
); 
2284     fSingle_QuoteSet  
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"),    status
); 
2285     fDouble_QuoteSet  
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"),    status
); 
2286     fMidNumLetSet     
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status
); 
2287     fMidLetterSet     
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status
); 
2288     fMidNumSet        
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status
); 
2289     // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test 
2290     // we should figure out why 
2291     fNumericSet       
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status
); 
2292     fFormatSet        
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status
); 
2293     fExtendNumLetSet  
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status
); 
2294     fExtendSet        
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status
); 
2296     fOtherSet        
= new UnicodeSet(); 
    // Remember a failure from building the sets above; next() checks deferredStatus.
2297     if(U_FAILURE(status
)) { 
2298       deferredStatus 
= status
; 
    // fOtherSet starts empty, so complement() makes it the full range; every class
    //   built above is then subtracted from it.
    // NOTE(review): fMidNumLetSet is not subtracted here although it is added to fSets
    //   below, so its characters remain in fOtherSet as well - confirm whether intended.
2302     fOtherSet
->complement(); 
2303     fOtherSet
->removeAll(*fCRSet
); 
2304     fOtherSet
->removeAll(*fLFSet
); 
2305     fOtherSet
->removeAll(*fNewlineSet
); 
2306     fOtherSet
->removeAll(*fKatakanaSet
); 
2307     fOtherSet
->removeAll(*fHebrew_LetterSet
); 
2308     fOtherSet
->removeAll(*fALetterSet
); 
2309     fOtherSet
->removeAll(*fSingle_QuoteSet
); 
2310     fOtherSet
->removeAll(*fDouble_QuoteSet
); 
2311     fOtherSet
->removeAll(*fMidLetterSet
); 
2312     fOtherSet
->removeAll(*fMidNumSet
); 
2313     fOtherSet
->removeAll(*fNumericSet
); 
2314     fOtherSet
->removeAll(*fExtendNumLetSet
); 
2315     fOtherSet
->removeAll(*fFormatSet
); 
2316     fOtherSet
->removeAll(*fExtendSet
); 
2317     fOtherSet
->removeAll(*fRegionalIndicatorSet
); 
2318     // Inhibit dictionary characters from being tested at all. 
2319     fOtherSet
->removeAll(*fDictionaryCjkSet
); 
2320     fOtherSet
->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status
)); 
    // Classes offered to the test data generator through charClasses().
    //   fKatakanaSet and fDictionaryCjkSet are deliberately not added.
2322     fSets
->addElement(fCRSet
,                status
); 
2323     fSets
->addElement(fLFSet
,                status
); 
2324     fSets
->addElement(fNewlineSet
,           status
); 
2325     fSets
->addElement(fRegionalIndicatorSet
, status
); 
2326     fSets
->addElement(fHebrew_LetterSet
,     status
); 
2327     fSets
->addElement(fALetterSet
,           status
); 
2328     fSets
->addElement(fSingle_QuoteSet
,      status
); 
2329     fSets
->addElement(fDouble_QuoteSet
,      status
); 
2330     //fSets->addElement(fKatakanaSet,          status); //TODO: work out how to test katakana 
2331     fSets
->addElement(fMidLetterSet
,         status
); 
2332     fSets
->addElement(fMidNumLetSet
,         status
); 
2333     fSets
->addElement(fMidNumSet
,            status
); 
2334     fSets
->addElement(fNumericSet
,           status
); 
2335     fSets
->addElement(fFormatSet
,            status
); 
2336     fSets
->addElement(fExtendSet
,            status
); 
2337     fSets
->addElement(fOtherSet
,             status
); 
2338     fSets
->addElement(fExtendNumLetSet
,      status
); 
    // Remember any failure from the steps above as well.
2340     if (U_FAILURE(status
)) { 
2341         deferredStatus 
= status
; 
// Implements RBBIMonkeyKind::setText() for the word monkey.
//   NOTE(review): the body is not visible in this listing - presumably it stores the
//   address of s in fText, which next() reads; confirm against the full source.
2345 void RBBIWordMonkey::setText(const UnicodeString 
&s
) { 
// Find the next word boundary after prevPos in the text referenced by fText, by testing
//   the rules below, in order, at successive candidate positions.
//   The base class documents the result as the break position, or -1 at end of string.
//   NOTE(review): the loop header and the action taken when each rule matches
//   (keep scanning vs. stop at this position) are not visible in this listing.
2350 int32_t RBBIWordMonkey::next(int32_t prevPos
) { 
2351     int    p0
, p1
, p2
, p3
;    // Indices of the significant code points around the 
2352                               //   break position being tested.  The candidate break 
2353                               //   location is before p2. 
2357     UChar32 c0
, c1
, c2
, c3
;   // The code points at p0, p1, p2 & p3. 
    // A failure recorded while the sets were being built makes the rules unusable.
2359     if (U_FAILURE(deferredStatus
)) { 
2363     // Prev break at end of string.  return DONE. 
2364     if (prevPos 
>= fText
->length()) { 
2367     p0 
= p1 
= p2 
= p3 
= prevPos
; 
2368     c3 
=  fText
->char32At(prevPos
); 
2370     (void)p0
;       // Suppress set but not used warning. 
2372     // Loop runs once per "significant" character position in the input text. 
2374         // Move all of the positions forward in the input string. 
2379         // Advance p3 by    X(Extend | Format)*   Rule 4 
2380         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change) 
2382             p3 
= fText
->moveIndex32(p3
, 1); 
2383             c3 
= fText
->char32At(p3
); 
2384             if (fCRSet
->contains(c2
) || fLFSet
->contains(c2
) || fNewlineSet
->contains(c2
)) { 
2388         while (fFormatSet
->contains(c3
) || fExtendSet
->contains(c3
)); 
2392             // Still warming up the loop.  (won't work with zero length strings, but we don't care) 
2395         if (p2 
== fText
->length()) { 
2396             // Reached end of string.  Always a break position. 
2401         //     No Extend or Format characters may appear between the CR and LF, 
2402         //     which requires the additional check for p2 immediately following p1. 
        //     NOTE(review): the condition below contains no explicit p1/p2 adjacency test.
        //     Presumably the p3 advance above, which special-cases CR / LF / Newline,
        //     already guarantees adjacency here - confirm.
2404         if (c1
==0x0D && c2
==0x0A) { 
2408         // Rule (3a)  Break before and after newlines (including CR and LF) 
2410         if (fCRSet
->contains(c1
) || fLFSet
->contains(c1
) || fNewlineSet
->contains(c1
)) { 
2413         if (fCRSet
->contains(c2
) || fLFSet
->contains(c2
) || fNewlineSet
->contains(c2
)) { 
2417         // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter) 
2418         if ((fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
)) && 
2419             (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
)))  { 
2423         // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter) 
2425         if ( (fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
))   && 
2426              (fMidLetterSet
->contains(c2
) || fMidNumLetSet
->contains(c2
) || fSingle_QuoteSet
->contains(c2
)) && 
2427              (fALetterSet
->contains(c3
) || fHebrew_LetterSet
->contains(c3
))) { 
2431         // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter) 
2432         if ((fALetterSet
->contains(c0
) || fHebrew_LetterSet
->contains(c0
)) && 
2433             (fMidLetterSet
->contains(c1
) || fMidNumLetSet
->contains(c1
) || fSingle_QuoteSet
->contains(c1
)) && 
2434             (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
))) { 
2438         // Rule (7a)     Hebrew_Letter x Single_Quote 
2439         if (fHebrew_LetterSet
->contains(c1
) && fSingle_QuoteSet
->contains(c2
)) { 
2443         // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter 
2444         if (fHebrew_LetterSet
->contains(c1
) && fDouble_QuoteSet
->contains(c2
) && fHebrew_LetterSet
->contains(c3
)) { 
2448         // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter 
2449         if (fHebrew_LetterSet
->contains(c0
) && fDouble_QuoteSet
->contains(c1
) && fHebrew_LetterSet
->contains(c2
)) { 
2453         // Rule (8)    Numeric x Numeric 
2454         if (fNumericSet
->contains(c1
) && 
2455             fNumericSet
->contains(c2
))  { 
2459         // Rule (9)    (ALetter | Hebrew_Letter) x Numeric 
2460         if ((fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
)) && 
2461             fNumericSet
->contains(c2
))  { 
2465         // Rule (10)    Numeric x (ALetter | Hebrew_Letter) 
2466         if (fNumericSet
->contains(c1
) && 
2467             (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
)))  { 
2471         // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric 
2472         if (fNumericSet
->contains(c0
) && 
2473             (fMidNumSet
->contains(c1
) || fMidNumLetSet
->contains(c1
) || fSingle_QuoteSet
->contains(c1
))  && 
2474             fNumericSet
->contains(c2
)) { 
2478         // Rule (12)  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric 
2479         if (fNumericSet
->contains(c1
) && 
2480             (fMidNumSet
->contains(c2
) || fMidNumLetSet
->contains(c2
) || fSingle_QuoteSet
->contains(c2
))  && 
2481             fNumericSet
->contains(c3
)) { 
2485         // Rule (13)  Katakana x Katakana 
2486         if (fKatakanaSet
->contains(c1
) && 
2487             fKatakanaSet
->contains(c2
))  { 
2491         // Rule 13a    (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet 
2492         if ((fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
) ||fNumericSet
->contains(c1
) || 
2493              fKatakanaSet
->contains(c1
) || fExtendNumLetSet
->contains(c1
)) && 
2494              fExtendNumLetSet
->contains(c2
)) { 
2498         // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana) 
2499         if (fExtendNumLetSet
->contains(c1
) && 
2500                 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
) || 
2501                  fNumericSet
->contains(c2
) || fKatakanaSet
->contains(c2
)))  { 
        // Regional_Indicator x Regional_Indicator
2506         if (fRegionalIndicatorSet
->contains(c1
) && fRegionalIndicatorSet
->contains(c2
)) { 
2510         // Rule 14.  Break found here. 
// Implements RBBIMonkeyKind::charClasses() for the word monkey.
//   NOTE(review): the body is not visible in this listing - presumably it returns the
//   fSets vector filled by the constructor; confirm.
2519 UVector  
*RBBIWordMonkey::charClasses() { 
// Destructor.  Releases the sets allocated with new in the constructor.
//   NOTE(review): only part of the delete statements are visible in this listing; verify
//   that every set created in the constructor (and the fSets vector) is released exactly once.
2524 RBBIWordMonkey::~RBBIWordMonkey() { 
2529     delete fKatakanaSet
; 
2530     delete fHebrew_LetterSet
; 
2532     delete fSingle_QuoteSet
; 
2533     delete fDouble_QuoteSet
; 
2534     delete fMidNumLetSet
; 
2535     delete fMidLetterSet
; 
2540     delete fExtendNumLetSet
; 
2541     delete fRegionalIndicatorSet
; 
2542     delete fDictionaryCjkSet
; 
2549 //------------------------------------------------------------------------------------------ 
2551 //   class RBBISentMonkey      Sentence Break specific implementation 
2552 //                             of RBBIMonkeyKind. 
//
//                             The constructor builds one UnicodeSet per Sentence_Break
//                             property value and next() applies the sentence boundary
//                             rules to the text set by setText().
//                             NOTE(review): some member declarations (e.g. fSets, fSpSet)
//                             are used elsewhere in the file but are not visible in
//                             this listing.
2554 //------------------------------------------------------------------------------------------ 
2555 class RBBISentMonkey
: public RBBIMonkeyKind 
{ 
    // Overrides of the RBBIMonkeyKind interface.
2558     virtual          ~RBBISentMonkey(); 
2559     virtual  UVector 
*charClasses(); 
2560     virtual  void     setText(const UnicodeString 
&s
); 
2561     virtual int32_t   next(int32_t i
); 
    // Helpers used by next(): step an index backward / forward over the text while
    //   skipping Format and Extend characters, and fetch the code point at an index
    //   with a range check.
2563     int               moveBack(int posFrom
); 
2564     int               moveForward(int posFrom
); 
2565     UChar32           
cAt(int pos
); 
    // Character classes; each one is allocated with new in the constructor.
2569     UnicodeSet  
*fSepSet
; 
2570     UnicodeSet  
*fFormatSet
; 
2572     UnicodeSet  
*fLowerSet
; 
2573     UnicodeSet  
*fUpperSet
; 
2574     UnicodeSet  
*fOLetterSet
; 
2575     UnicodeSet  
*fNumericSet
; 
2576     UnicodeSet  
*fATermSet
; 
2577     UnicodeSet  
*fSContinueSet
; 
2578     UnicodeSet  
*fSTermSet
; 
2579     UnicodeSet  
*fCloseSet
; 
    // Everything left after the constructor subtracts the other classes from the full range.
2580     UnicodeSet  
*fOtherSet
; 
2581     UnicodeSet  
*fExtendSet
; 
    // Text being scanned; a pointer, so the string is not copied here.
2583     const UnicodeString  
*fText
; 
// Constructor.  Builds the Sentence_Break character classes from Unicode property
//   patterns, derives fOtherSet as "everything not in another class", collects all
//   classes in fSets, and records any failure in deferredStatus.
2587 RBBISentMonkey::RBBISentMonkey() 
2589     UErrorCode  status 
= U_ZERO_ERROR
; 
2591     fSets            
= new UVector(status
); 
2593     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator 
2594     //                       set and made into character classes of their own.  For the monkey impl, 
2595     //                       they remain in SEP, since Sep always appears with CR and LF in the rules. 
2596     fSepSet          
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status
); 
2597     fFormatSet       
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status
); 
2598     fSpSet           
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status
); 
2599     fLowerSet        
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status
); 
2600     fUpperSet        
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status
); 
2601     fOLetterSet      
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status
); 
2602     fNumericSet      
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status
); 
2603     fATermSet        
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status
); 
2604     fSContinueSet    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status
); 
2605     fSTermSet        
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status
); 
2606     fCloseSet        
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status
); 
2607     fExtendSet       
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status
); 
2608     fOtherSet        
= new UnicodeSet(); 
    // Remember a failure from building the sets above; next() checks deferredStatus.
2610     if(U_FAILURE(status
)) { 
2611       deferredStatus 
= status
; 
    // fOtherSet starts empty, so complement() makes it the full range; every class
    //   built above is then subtracted from it.
2615     fOtherSet
->complement(); 
2616     fOtherSet
->removeAll(*fSepSet
); 
2617     fOtherSet
->removeAll(*fFormatSet
); 
2618     fOtherSet
->removeAll(*fSpSet
); 
2619     fOtherSet
->removeAll(*fLowerSet
); 
2620     fOtherSet
->removeAll(*fUpperSet
); 
2621     fOtherSet
->removeAll(*fOLetterSet
); 
2622     fOtherSet
->removeAll(*fNumericSet
); 
2623     fOtherSet
->removeAll(*fATermSet
); 
2624     fOtherSet
->removeAll(*fSContinueSet
); 
2625     fOtherSet
->removeAll(*fSTermSet
); 
2626     fOtherSet
->removeAll(*fCloseSet
); 
2627     fOtherSet
->removeAll(*fExtendSet
); 
    // Classes offered to the test data generator through charClasses().
2629     fSets
->addElement(fSepSet
,       status
); 
2630     fSets
->addElement(fFormatSet
,    status
); 
2631     fSets
->addElement(fSpSet
,        status
); 
2632     fSets
->addElement(fLowerSet
,     status
); 
2633     fSets
->addElement(fUpperSet
,     status
); 
2634     fSets
->addElement(fOLetterSet
,   status
); 
2635     fSets
->addElement(fNumericSet
,   status
); 
2636     fSets
->addElement(fATermSet
,     status
); 
2637     fSets
->addElement(fSContinueSet
, status
); 
2638     fSets
->addElement(fSTermSet
,     status
); 
2639     fSets
->addElement(fCloseSet
,     status
); 
2640     fSets
->addElement(fOtherSet
,     status
); 
2641     fSets
->addElement(fExtendSet
,    status
); 
    // Remember any failure from the steps above as well.
2643     if (U_FAILURE(status
)) { 
2644         deferredStatus 
= status
; 
// Implements RBBIMonkeyKind::setText() for the sentence monkey.
//   NOTE(review): the body is not visible in this listing - presumably it stores the
//   address of s in fText, which the other members read; confirm against the full source.
2650 void RBBISentMonkey::setText(const UnicodeString 
&s
) { 
// Implements RBBIMonkeyKind::charClasses() for the sentence monkey.
//   NOTE(review): the body is not visible in this listing - presumably it returns the
//   fSets vector filled by the constructor; confirm.
2654 UVector  
*RBBISentMonkey::charClasses() { 
2659 //  moveBack()   Find the "significant" code point preceding the index i. 
2660 //               Skips over ($Extend | $Format)* . 
//               Steps backward one code point at a time with moveIndex32(j, -1) and
//               keeps going while j is still above zero and the code point there is
//               in fFormatSet or fExtendSet.
//               NOTE(review): the initialization of j, any early exit and the return
//               statement are not visible in this listing.
2662 int RBBISentMonkey::moveBack(int i
) { 
2669         j 
= fText
->moveIndex32(j
, -1); 
2670         c 
= fText
->char32At(j
); 
2672     while (j
>0 &&(fFormatSet
->contains(c
) || fExtendSet
->contains(c
))); 
//  moveForward()   Forward counterpart of moveBack(): advance from index i one code point
//                  at a time with moveIndex32(j, 1), and keep going while the code point
//                  reached is in fFormatSet or fExtendSet.
//                  An index at or past the end of the text yields the text length.
//                  NOTE(review): the initialization of j, the assignment of c and the
//                  final return statement are not visible in this listing.
2678 int RBBISentMonkey::moveForward(int i
) { 
2679     if (i
>=fText
->length()) { 
2680         return fText
->length(); 
2685         j 
= fText
->moveIndex32(j
, 1); 
2688     while (fFormatSet
->contains(c
) || fExtendSet
->contains(c
)); 
//  cAt()   Range-checked access to the text: returns fText->char32At(pos) when pos is a
//          valid index.
//          NOTE(review): the value returned for an out-of-range pos is not visible in this
//          listing; next() compares results against -1, which suggests -1 - confirm.
2692 UChar32 
RBBISentMonkey::cAt(int pos
) { 
2693     if (pos
<0 || pos
>=fText
->length()) { 
2696         return fText
->char32At(pos
); 
2700 int32_t RBBISentMonkey::next(int32_t prevPos
) { 
2701     int    p0
, p1
, p2
, p3
;    // Indices of the significant code points around the 
2702                               //   break position being tested.  The candidate break 
2703                               //   location is before p2. 
2707     UChar32 c0
, c1
, c2
, c3
;   // The code points at p0, p1, p2 & p3. 
2710     if (U_FAILURE(deferredStatus
)) { 
2714     // Prev break at end of string.  return DONE. 
2715     if (prevPos 
>= fText
->length()) { 
2718     p0 
= p1 
= p2 
= p3 
= prevPos
; 
2719     c3 
=  fText
->char32At(prevPos
); 
2721     (void)p0
;     // Suppress set but not used warning. 
2723     // Loop runs once per "significant" character position in the input text. 
2725         // Move all of the positions forward in the input string. 
2730         // Advance p3 by    X(Extend | Format)*   Rule 4 
2731         p3 
= moveForward(p3
); 
2735         if (c1
==0x0d && c2
==0x0a && p2
==(p1
+1)) { 
2739         // Rule (4).   Sep  <break> 
2740         if (fSepSet
->contains(c1
)) { 
2741             p2 
= p1
+1;   // Separators don't combine with Extend or Format. 
2745         if (p2 
>= fText
->length()) { 
2746             // Reached end of string.  Always a break position. 
2750         if (p2 
== prevPos
) { 
2751             // Still warming up the loop.  (won't work with zero length strings, but we don't care) 
2755         // Rule (6).   ATerm x Numeric 
2756         if (fATermSet
->contains(c1
) &&  fNumericSet
->contains(c2
))  { 
2760         // Rule (7).  Upper ATerm  x  Upper 
2761         if (fUpperSet
->contains(c0
) && fATermSet
->contains(c1
) && fUpperSet
->contains(c2
)) { 
2765         // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower 
2766         //           Note:  STerm | ATerm are added to the negated part of the expression by a 
2767         //                  note to the Unicode 5.0 documents. 
2769         while (fSpSet
->contains(cAt(p8
))) { 
2772         while (fCloseSet
->contains(cAt(p8
))) { 
2775         if (fATermSet
->contains(cAt(p8
))) { 
2779                 if (c
==-1 || fOLetterSet
->contains(c
) || fUpperSet
->contains(c
) || 
2780                     fLowerSet
->contains(c
) || fSepSet
->contains(c
) || 
2781                     fATermSet
->contains(c
) || fSTermSet
->contains(c
))  { 
2784                 p8 
= moveForward(p8
); 
2786             if (fLowerSet
->contains(cAt(p8
))) { 
2791         // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm); 
2792         if (fSContinueSet
->contains(c2
) || fSTermSet
->contains(c2
) || fATermSet
->contains(c2
)) { 
2794             while (fSpSet
->contains(cAt(p8
))) { 
2797             while (fCloseSet
->contains(cAt(p8
))) { 
2801             if (fSTermSet
->contains(c
) || fATermSet
->contains(c
)) { 
2806         // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF) 
2808         while (fCloseSet
->contains(cAt(p9
))) { 
2812         if ((fSTermSet
->contains(c
) || fATermSet
->contains(c
))) { 
2813             if (fCloseSet
->contains(c2
) || fSpSet
->contains(c2
) || fSepSet
->contains(c2
)) { 
2818         // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF) 
2820         while (fSpSet
->contains(cAt(p10
))) { 
2821             p10 
= moveBack(p10
); 
2823         while (fCloseSet
->contains(cAt(p10
))) { 
2824             p10 
= moveBack(p10
); 
2826         if (fSTermSet
->contains(cAt(p10
)) || fATermSet
->contains(cAt(p10
))) { 
2827             if (fSpSet
->contains(c2
) || fSepSet
->contains(c2
)) { 
2832         // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break> 
2834         if (fSepSet
->contains(cAt(p11
))) { 
2835             p11 
= moveBack(p11
); 
2837         while (fSpSet
->contains(cAt(p11
))) { 
2838             p11 
= moveBack(p11
); 
2840         while (fCloseSet
->contains(cAt(p11
))) { 
2841             p11 
= moveBack(p11
); 
2843         if (fSTermSet
->contains(cAt(p11
)) || fATermSet
->contains(cAt(p11
))) { 
2847         //  Rule (12)  Any x Any 
// Destructor.  Releases fSContinueSet.
// NOTE(review): other delete statements of this destructor are not visible in
// this listing.
2854 RBBISentMonkey::~RBBISentMonkey() { 
2864     delete fSContinueSet
; 
2873 //------------------------------------------------------------------------------------------- 
2877 //------------------------------------------------------------------------------------------- 
// Line-break variant of RBBIMonkeyKind.  Overrides charClasses(), setText() and
// next(), and adds rule9Adjust(), the helper for Line Break rules 9 and 10.
// NOTE(review): the access specifiers, the constructor declaration and most of
// the data members are not visible in this listing.
2879 class RBBILineMonkey
: public RBBIMonkeyKind 
{ 
2882     virtual          ~RBBILineMonkey(); 
2883     virtual  UVector 
*charClasses(); 
2884     virtual  void     setText(const UnicodeString 
&s
); 
2885     virtual  int32_t  next(int32_t i
); 
2886     virtual  void     rule9Adjust(int32_t pos
, UChar32 
*posChar
, int32_t *nextPos
, UChar32 
*nextChar
); 
// Character break iterator; created in the constructor, re-targeted in setText().
2931     BreakIterator        
*fCharBI
; 
// Text under test; read by next() and rule9Adjust().
2932     const UnicodeString  
*fText
; 
// Regular expression matcher for numbers; reset onto the text in setText().
2933     RegexMatcher         
*fNumberMatcher
; 
// Constructor.  Builds one UnicodeSet per Line_Break property value (fSG is
// built from the surrogate range instead), folds XX, AI, SA and SG into fAL and
// CJ into fNS, registers the sets in fSets, compiles the number regular
// expression into fNumberMatcher and creates the character break iterator
// fCharBI.  A failing status is recorded in deferredStatus.
2937 RBBILineMonkey::RBBILineMonkey() 
2939     UErrorCode  status 
= U_ZERO_ERROR
; 
2941     fSets  
= new UVector(status
); 
2943     fBK    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status
); 
2944     fCR    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status
); 
2945     fLF    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status
); 
2946     fCM    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status
); 
2947     fNL    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status
); 
2948     fWJ    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status
); 
2949     fZW    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status
); 
2950     fGL    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status
); 
2951     fCB    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status
); 
2952     fSP    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status
); 
2953     fB2    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status
); 
2954     fBA    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status
); 
2955     fBB    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status
); 
2956     fHY    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status
); 
2957     fH2    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status
); 
2958     fH3    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status
); 
2959     fCL    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status
); 
2960     fCP    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status
); 
2961     fEX    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status
); 
2962     fIN    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status
); 
2963     fJL    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status
); 
2964     fJV    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status
); 
2965     fJT    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status
); 
2966     fNS    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status
); 
2967     fOP    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status
); 
2968     fQU    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status
); 
2969     fIS    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status
); 
2970     fNU    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status
); 
2971     fPO    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status
); 
2972     fPR    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status
); 
2973     fSY    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status
); 
2974     fAI    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status
); 
2975     fAL    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status
); 
2976     fCJ    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status
); 
2977     fHL    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status
); 
2978     fID    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status
); 
2979     fRI    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status
); 
2980     fSA    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status
); 
2981     fSG    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status
); 
2982     fXX    
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status
); 
// If any set failed to build, remember the error and leave the matcher unset.
2984     if (U_FAILURE(status
)) { 
2985         deferredStatus 
= status
; 
2987         fNumberMatcher 
= NULL
; 
2991     fAL
->addAll(*fXX
);     // Default behavior for XX is identical to AL 
2992     fAL
->addAll(*fAI
);     // Default behavior for AI is identical to AL 
2993     fAL
->addAll(*fSA
);     // Default behavior for SA is XX, which defaults to AL 
2994     fAL
->addAll(*fSG
);     // Default behavior for SG is identical to AL. 
2996     fNS
->addAll(*fCJ
);     // Default behavior for CJ is identical to NS. 
// Register the character classes in fSets.
2998     fSets
->addElement(fBK
, status
); 
2999     fSets
->addElement(fCR
, status
); 
3000     fSets
->addElement(fLF
, status
); 
3001     fSets
->addElement(fCM
, status
); 
3002     fSets
->addElement(fNL
, status
); 
3003     fSets
->addElement(fWJ
, status
); 
3004     fSets
->addElement(fZW
, status
); 
3005     fSets
->addElement(fGL
, status
); 
3006     fSets
->addElement(fCB
, status
); 
3007     fSets
->addElement(fSP
, status
); 
3008     fSets
->addElement(fB2
, status
); 
3009     fSets
->addElement(fBA
, status
); 
3010     fSets
->addElement(fBB
, status
); 
3011     fSets
->addElement(fHY
, status
); 
3012     fSets
->addElement(fH2
, status
); 
3013     fSets
->addElement(fH3
, status
); 
3014     fSets
->addElement(fCL
, status
); 
3015     fSets
->addElement(fCP
, status
); 
3016     fSets
->addElement(fEX
, status
); 
3017     fSets
->addElement(fIN
, status
); 
3018     fSets
->addElement(fJL
, status
); 
3019     fSets
->addElement(fJT
, status
); 
3020     fSets
->addElement(fJV
, status
); 
3021     fSets
->addElement(fNS
, status
); 
3022     fSets
->addElement(fOP
, status
); 
3023     fSets
->addElement(fQU
, status
); 
3024     fSets
->addElement(fIS
, status
); 
3025     fSets
->addElement(fNU
, status
); 
3026     fSets
->addElement(fPO
, status
); 
3027     fSets
->addElement(fPR
, status
); 
3028     fSets
->addElement(fSY
, status
); 
3029     fSets
->addElement(fAI
, status
); 
3030     fSets
->addElement(fAL
, status
); 
3031     fSets
->addElement(fHL
, status
); 
3032     fSets
->addElement(fID
, status
); 
// NOTE(review): fWJ was already added to fSets above, so it is registered
// twice -- confirm whether the duplicate entry is intentional.
3033     fSets
->addElement(fWJ
, status
); 
3034     fSets
->addElement(fRI
, status
); 
3035     fSets
->addElement(fSA
, status
); 
3036     fSets
->addElement(fSG
, status
); 
// Number pattern, each element optionally followed by CM*:
//   (PR|PO)? (OP|HY)? NU (NU|IS|SY)* (CL|CP)? (PR|PO)?
3039             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?" 
3040             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?" 
3041             "\\p{Line_Break=NU}\\p{Line_Break=CM}*" 
3042             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*" 
3043             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?" 
3044             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"; 
3046     fNumberMatcher 
= new RegexMatcher( 
3047         UnicodeString(rules
, -1, US_INV
), 0, status
); 
3049     fCharBI 
= BreakIterator::createCharacterInstance(Locale::getEnglish(), status
); 
3051     if (U_FAILURE(status
)) { 
3052         deferredStatus 
= status
; 
// Point the character break iterator and the number matcher at the new test
// text s.
// NOTE(review): the assignment of fText is presumably on a line not visible in
// this listing -- confirm.
3057 void RBBILineMonkey::setText(const UnicodeString 
&s
) { 
3059     fCharBI
->setText(s
); 
3060     fNumberMatcher
->reset(s
); 
3065 //     Line Break TR rules 9 and 10 implementation. 
3066 //     This deals with combining marks and other sequences that 
3067 //     must be treated as if they were something other than what they actually are. 
3069 //     This is factored out into a separate function because it must be applied twice for 
3070 //     each potential break, once to the chars before the position being checked, then 
3071 //     again to the text following the possible break. 
// Apply Line Break rules 9 and 10 to the character at pos.
//   pos       index of the character being adjusted.
//   posChar   in/out: the character at pos; replaced by 'A' (0x41) when it is
//             itself a combining mark (LB 10).
//   nextPos   in/out: index of the following character; read into nPos here
//             and, per the comment near the end, pushed back to the caller.
//   nextChar  out: the character at the (possibly advanced) next position.
// NOTE(review): the guard on an invalid pos, the loop head and the write-back
// of *nextPos are not visible in this listing -- confirm.
3073 void RBBILineMonkey::rule9Adjust(int32_t pos
, UChar32 
*posChar
, int32_t *nextPos
, UChar32 
*nextChar
) { 
3075         // Invalid initial position.  Happens during the warmup iteration of the 
3076         //   main loop in next(). 
3080     int32_t  nPos 
= *nextPos
; 
3082     // LB 9  Keep combining sequences together. 
3083     //  advance over any CM class chars.  Note that Line Break CM is different 
3084     //  from the normal Grapheme Extend property. 
3085     if (!(fSP
->contains(*posChar
) || fBK
->contains(*posChar
) || *posChar
==0x0d || 
3086           *posChar
==0x0a ||fNL
->contains(*posChar
) || fZW
->contains(*posChar
))) { 
3088             *nextChar 
= fText
->char32At(nPos
); 
3089             if (!fCM
->contains(*nextChar
)) { 
3092             nPos 
= fText
->moveIndex32(nPos
, 1); 
3097     // LB 9 Treat X CM* as if it were x. 
3098     //       No explicit action required. 
3100     // LB 10  Treat any remaining combining mark as AL 
3101     if (fCM
->contains(*posChar
)) { 
3102         *posChar 
= 0x41;   // thisChar = 'A'; 
3105     // Push the updated nextPos and nextChar back to our caller. 
3106     // This only makes a difference if posChar got bigger by consuming a 
3107     // combining sequence. 
3109     *nextChar 
= fText
->char32At(nPos
); 
// Find the next line-break position after startPos.  The loop slides a window
// (prevCharX2, prevChar, thisChar) over fText, applies rule9Adjust() to fold
// combining sequences, and then tests the LB rules in the order given by the
// comments below.
// NOTE(review): the loop head, the break/continue statements attached to each
// rule, the declarations of prevCharX2 and c, the initial assignments of tPos
// and the final return are not visible in this listing -- confirm the control
// flow against the complete source.
3114 int32_t RBBILineMonkey::next(int32_t startPos
) { 
3115     UErrorCode status 
= U_ZERO_ERROR
; 
3116     int32_t    pos
;       //  Index of the char following a potential break position 
3117     UChar32    thisChar
;  //  Character at above position "pos" 
3119     int32_t    prevPos
;   //  Index of the char preceding a potential break position 
3120     UChar32    prevChar
;  //  Character at above position.  Note that prevChar 
3121                           //   and thisChar may not be adjacent because combining 
3122                           //   characters between them will be ignored. 
3124     int32_t    prevPosX2
; //  Second previous character.  Wider context for LB21a. 
3127     int32_t    nextPos
;   //  Index of the next character following pos. 
3128                           //     Usually skips over combining marks. 
3129     int32_t    nextCPPos
; //  Index of the code point following "pos." 
3130                           //     May point to a combining mark. 
3131     int32_t    tPos
;      //  temp value. 
3134     if (U_FAILURE(deferredStatus
)) { 
3138     if (startPos 
>= fText
->length()) { 
3143     // Initial values for loop.  Loop will run the first time without finding breaks, 
3144     //                           while the invalid values shift out and the "this" and 
3145     //                           "prev" positions are filled in with good values. 
3146     pos      
= prevPos   
= prevPosX2  
= -1;    // Invalid value, serves as flag for initial loop iteration. 
3147     thisChar 
= prevChar  
= prevCharX2 
= 0; 
3148     nextPos  
= nextCPPos 
= startPos
; 
3151     // Loop runs once per position in the test text, until a break position 
3154         prevPosX2 
= prevPos
; 
3155         prevCharX2 
= prevChar
; 
3158         prevChar  
= thisChar
; 
3161         thisChar  
= fText
->char32At(pos
); 
3163         nextCPPos 
= fText
->moveIndex32(pos
, 1); 
3164         nextPos   
= nextCPPos
; 
3166         // Rule LB2 - Break at end of text. 
3167         if (pos 
>= fText
->length()) { 
3171         // Rule LB 9 - adjust for combining sequences. 
3172         //             We do this one out-of-order because the adjustment does not change anything 
3173         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to 
3175         rule9Adjust(prevPos
, &prevChar
, &pos
,     &thisChar
); 
3176         nextCPPos 
= nextPos 
= fText
->moveIndex32(pos
, 1); 
3177         c 
= fText
->char32At(nextPos
); 
3178         rule9Adjust(pos
,     &thisChar
, &nextPos
, &c
); 
3180         // If the loop is still warming up - if we haven't shifted the initial 
3181         //   -1 positions out of prevPos yet - loop back to advance the 
3182         //    position in the input without any further looking for breaks. 
3183         if (prevPos 
== -1) { 
3187         // LB 4  Always break after hard line breaks, 
3188         if (fBK
->contains(prevChar
)) { 
3192         // LB 5  Break after CR, LF, NL, but not inside CR LF 
3193         if (prevChar 
== 0x0d && thisChar 
== 0x0a) { 
3196         if (prevChar 
== 0x0d || 
3202         // LB 6  Don't break before hard line breaks 
3203         if (thisChar 
== 0x0d || thisChar 
== 0x0a || thisChar 
== 0x85 || 
3204             fBK
->contains(thisChar
)) { 
3209         // LB 7  Don't break before spaces or zero-width space. 
3210         if (fSP
->contains(thisChar
)) { 
3214         if (fZW
->contains(thisChar
)) { 
3218         // LB 8  Break after zero width space 
3219         if (fZW
->contains(prevChar
)) { 
3223         // LB 9, 10  Already done, at top of loop. 
3227         // LB 11  Do not break before or after WORD JOINER and related characters. 
3231         if (fWJ
->contains(thisChar
) || fWJ
->contains(prevChar
)) { 
3237         if (fGL
->contains(prevChar
)) { 
3243         if (!(fSP
->contains(prevChar
) || 
3244               fBA
->contains(prevChar
) || 
3245               fHY
->contains(prevChar
)     ) && fGL
->contains(thisChar
)) { 
3251         // LB 13  Don't break before closings. 
3252         //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will 
3253         //        fall into LB 17 and the more general number regular expression. 
3255         if ((!fNU
->contains(prevChar
) && fCL
->contains(thisChar
)) || 
3256             (!fNU
->contains(prevChar
) && fCP
->contains(thisChar
)) || 
3257                                          fEX
->contains(thisChar
)  || 
3258             (!fNU
->contains(prevChar
) && fIS
->contains(thisChar
)) || 
3259             (!fNU
->contains(prevChar
) && fSY
->contains(thisChar
)))    { 
3263         // LB 14 Don't break after OP SP* 
3264         //       Scan backwards, checking for this sequence. 
3265         //       The OP char could include combining marks, so we actually check for 
3267         //       Another Twist: The Rule 67 fixes may have changed a SP CM 
3268         //       sequence into a ID char, so before scanning back through spaces, 
3269         //       verify that prevChar is indeed a space.  The prevChar variable 
3270         //       may differ from fText[prevPos] 
3272         if (fSP
->contains(prevChar
)) { 
3273             while (tPos 
> 0 && fSP
->contains(fText
->char32At(tPos
))) { 
3274                 tPos
=fText
->moveIndex32(tPos
, -1); 
3277         while (tPos 
> 0 && fCM
->contains(fText
->char32At(tPos
))) { 
3278             tPos
=fText
->moveIndex32(tPos
, -1); 
3280         if (fOP
->contains(fText
->char32At(tPos
))) { 
3285         // LB 15    QU SP* x OP 
3286         if (fOP
->contains(thisChar
)) { 
3287             // Scan backwards from prevChar to see if it is preceded by QU CM* SP* 
3289             while (tPos
>0 && fSP
->contains(fText
->char32At(tPos
))) { 
3290                 tPos 
= fText
->moveIndex32(tPos
, -1); 
3292             while (tPos
>0 && fCM
->contains(fText
->char32At(tPos
))) { 
3293                 tPos 
= fText
->moveIndex32(tPos
, -1); 
3295             if (fQU
->contains(fText
->char32At(tPos
))) { 
3302         // LB 16   (CL | CP) SP* x NS 
3303         //    Scan backwards for SP* CM* (CL | CP) 
3304         if (fNS
->contains(thisChar
)) { 
3306             while (tPos
>0 && fSP
->contains(fText
->char32At(tPos
))) { 
3307                 tPos 
= fText
->moveIndex32(tPos
, -1); 
3309             while (tPos
>0 && fCM
->contains(fText
->char32At(tPos
))) { 
3310                 tPos 
= fText
->moveIndex32(tPos
, -1); 
3312             if (fCL
->contains(fText
->char32At(tPos
)) || fCP
->contains(fText
->char32At(tPos
))) { 
3318         // LB 17        B2 SP* x B2 
3319         if (fB2
->contains(thisChar
)) { 
3320             //  Scan backwards, checking for the B2 CM* SP* sequence. 
3322             if (fSP
->contains(prevChar
)) { 
3323                 while (tPos 
> 0 && fSP
->contains(fText
->char32At(tPos
))) { 
3324                     tPos
=fText
->moveIndex32(tPos
, -1); 
3327             while (tPos 
> 0 && fCM
->contains(fText
->char32At(tPos
))) { 
3328                 tPos
=fText
->moveIndex32(tPos
, -1); 
3330             if (fB2
->contains(fText
->char32At(tPos
))) { 
3336         // LB 18    break after space 
3337         if (fSP
->contains(prevChar
)) { 
3344         if (fQU
->contains(thisChar
) || fQU
->contains(prevChar
)) { 
3348         // LB 20  Break around a CB 
3349         if (fCB
->contains(thisChar
) || fCB
->contains(prevChar
)) { 
3354         if (fBA
->contains(thisChar
) || 
3355             fHY
->contains(thisChar
) || 
3356             fNS
->contains(thisChar
) || 
3357             fBB
->contains(prevChar
) )   { 
3363         if (fHL
->contains(prevCharX2
) &&  
3364                 (fHY
->contains(prevChar
) || fBA
->contains(prevChar
))) { 
3370         if (fSY
->contains(prevChar
) && fHL
->contains(thisChar
)) { 
3375         if ((fAL
->contains(prevChar
) && fIN
->contains(thisChar
)) || 
3376             (fHL
->contains(prevChar
) && fIN
->contains(thisChar
)) || 
3377             (fID
->contains(prevChar
) && fIN
->contains(thisChar
)) || 
3378             (fIN
->contains(prevChar
) && fIN
->contains(thisChar
)) || 
3379             (fNU
->contains(prevChar
) && fIN
->contains(thisChar
)) )   { 
3388         if ((fID
->contains(prevChar
) && fPO
->contains(thisChar
)) || 
3389             (fAL
->contains(prevChar
) && fNU
->contains(thisChar
)) || 
3390             (fHL
->contains(prevChar
) && fNU
->contains(thisChar
)) || 
3391             (fNU
->contains(prevChar
) && fAL
->contains(thisChar
)) || 
3392             (fNU
->contains(prevChar
) && fHL
->contains(thisChar
)) )   { 
3396         // LB 24  Do not break between prefix and letters or ideographs. 
3400         if ((fPR
->contains(prevChar
) && fID
->contains(thisChar
)) || 
3401             (fPR
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) || 
3402             (fPO
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))))  { 
// Try to match the number regular expression starting at prevPos.
3409         if (fNumberMatcher
->lookingAt(prevPos
, status
)) { 
3410             if (U_FAILURE(status
)) { 
3413             // Matched a number.  But could have been just a single digit, which would 
3414             //    not represent a "no break here" between prevChar and thisChar 
3415             int32_t numEndIdx 
= fNumberMatcher
->end(status
);  // idx of first char following num 
3416             if (numEndIdx 
> pos
) { 
3417                 // Number match includes at least our two chars being checked 
3418                 if (numEndIdx 
> nextPos
) { 
3419                     // Number match includes additional chars.  Update pos and nextPos 
3420                     //   so that next loop iteration will continue at the end of the number, 
3421                     //   checking for breaks between last char in number & whatever follows. 
3422                     pos 
= nextPos 
= numEndIdx
; 
3424                         pos 
= fText
->moveIndex32(pos
, -1); 
3425                         thisChar 
= fText
->char32At(pos
); 
3426                     } while (fCM
->contains(thisChar
)); 
3433         // LB 26 Do not break a Korean syllable. 
3434         if (fJL
->contains(prevChar
) && (fJL
->contains(thisChar
) || 
3435                                         fJV
->contains(thisChar
) || 
3436                                         fH2
->contains(thisChar
) || 
3437                                         fH3
->contains(thisChar
))) { 
3441         if ((fJV
->contains(prevChar
) || fH2
->contains(prevChar
))  && 
3442             (fJV
->contains(thisChar
) || fJT
->contains(thisChar
))) { 
3446         if ((fJT
->contains(prevChar
) || fH3
->contains(prevChar
)) && 
3447             fJT
->contains(thisChar
)) { 
3451         // LB 27 Treat a Korean Syllable Block the same as ID. 
3452         if ((fJL
->contains(prevChar
) || fJV
->contains(prevChar
) || 
3453             fJT
->contains(prevChar
) || fH2
->contains(prevChar
) || fH3
->contains(prevChar
)) && 
3454             fIN
->contains(thisChar
)) { 
3457         if ((fJL
->contains(prevChar
) || fJV
->contains(prevChar
) || 
3458             fJT
->contains(prevChar
) || fH2
->contains(prevChar
) || fH3
->contains(prevChar
)) && 
3459             fPO
->contains(thisChar
)) { 
3462         if (fPR
->contains(prevChar
) && (fJL
->contains(thisChar
) || fJV
->contains(thisChar
) || 
3463             fJT
->contains(thisChar
) || fH2
->contains(thisChar
) || fH3
->contains(thisChar
))) { 
3469         // LB 28  Do not break between alphabetics ("at"). 
3470         if ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
)) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) { 
3474         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g."). 
3475         if (fIS
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) { 
3479         // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation. 
3482         if ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
) || fNU
->contains(prevChar
)) && fOP
->contains(thisChar
)) { 
3485         if (fCP
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
) || fNU
->contains(thisChar
))) { 
3489         // LB30a  Do not break between regional indicators. 
3491         if (fRI
->contains(prevChar
) && fRI
->contains(thisChar
)) { 
3495         // LB 31    Break everywhere else 
// Accessor for the character classes used by this monkey.
// NOTE(review): the body is not visible in this listing; presumably it returns
// fSets, the vector filled in by the constructor -- confirm.
3504 UVector  
*RBBILineMonkey::charClasses() { 
// Destructor.  Releases fNumberMatcher.
// NOTE(review): other delete statements of this destructor are not visible in
// this listing.
3509 RBBILineMonkey::~RBBILineMonkey() { 
3554     delete fNumberMatcher
; 
3558 //------------------------------------------------------------------------------------------- 
3563 //       seed=nnnnn        Random number starting seed. 
3564 //                         Setting the seed allows errors to be reproduced. 
3565 //       loop=nnn          Looping count.  Controls running time. 
3567 //                          0 or greater:  run length. 
3569 //       type = char | word | line | sent | title 
3571 //------------------------------------------------------------------------------------------- 
// Look up an integer parameter of the form "name = nnn" in params.
//   name        parameter name; the pattern " *= *(-?\d+)" is appended to it to
//               form the regular expression that is matched against params.
//   params      parameter string; the matched parameter is removed from it
//               (it is assigned the result of replaceFirst("")).
//   defaultVal  initial value of val, used when the parameter is absent.
// The digits captured by group 1 are copied into a 100-byte buffer (the length
// is clamped to fit) and converted with strtol().
// NOTE(review): the second parameter's name is garbled in this listing as
// "¶ms"; the body refers to it as `params`, so the declaration is presumably
// `UnicodeString &params` -- confirm.  The match test and the final return are
// also not visible here.
3573 static int32_t  getIntParam(UnicodeString name
, UnicodeString 
¶ms
, int32_t defaultVal
) { 
3574     int32_t val 
= defaultVal
; 
3575     name
.append(" *= *(-?\\d+)"); 
3576     UErrorCode status 
= U_ZERO_ERROR
; 
3577     RegexMatcher 
m(name
, params
, 0, status
); 
3579         // The param exists.  Convert the string to an int. 
3580         char valString
[100]; 
3581         int32_t paramLength 
= m
.end(1, status
) - m
.start(1, status
); 
3582         if (paramLength 
>= (int32_t)(sizeof(valString
)-1)) { 
3583             paramLength 
= (int32_t)(sizeof(valString
)-2); 
3585         params
.extract(m
.start(1, status
), paramLength
, valString
, sizeof(valString
)); 
3586         val 
= strtol(valString
,  NULL
, 10); 
3588         // Delete this parameter from the params string. 
3590         params 
= m
.replaceFirst("", status
); 
3592     U_ASSERT(U_SUCCESS(status
)); 
3597 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 
// Check a break iterator against a list of expected boundaries for ustr, in
// four passes: forward iteration with first()/next(), isBoundary() at and
// between the expected positions, backward iteration with last()/previous(),
// and preceding() for every index between consecutive expected boundaries.
// Failures are reported through test->errln(), after printStringBreaks().
// NOTE(review): the rest of the parameter list (bi, expected, expectedcount are
// used below) and the declarations of i, count and forward are not visible in
// this listing -- confirm.
3598 static void testBreakBoundPreceding(RBBITest 
*test
, UnicodeString ustr
, 
3607     for (i 
= bi
->first(); i 
!= BreakIterator::DONE
; i 
= bi
->next()) { 
3609         if (count 
< expectedcount 
&& expected
[count
] != i
) { 
3610             test
->errln("break forward test failed: expected %d but got %d", 
3611                         expected
[count
], i
); 
3616     if (count 
!= expectedcount
) { 
3617         printStringBreaks(ustr
, expected
, expectedcount
); 
3618         test
->errln("break forward test failed: missed %d match", 
3619                     expectedcount 
- count
); 
3622     // testing boundaries 
3623     for (i 
= 1; i 
< expectedcount
; i 
++) { 
3624         int j 
= expected
[i 
- 1]; 
3625         if (!bi
->isBoundary(j
)) { 
3626             printStringBreaks(ustr
, expected
, expectedcount
); 
3627             test
->errln("isBoundary() failed.  Expected boundary at position %d", j
); 
3630         for (j 
= expected
[i 
- 1] + 1; j 
< expected
[i
]; j 
++) { 
3631             if (bi
->isBoundary(j
)) { 
3632                 printStringBreaks(ustr
, expected
, expectedcount
); 
3633                 test
->errln("isBoundary() failed.  Not expecting boundary at position %d", j
); 
3639     for (i 
= bi
->last(); i 
!= BreakIterator::DONE
; i 
= bi
->previous()) { 
3641         if (forward
[count
] != i
) { 
3642             printStringBreaks(ustr
, expected
, expectedcount
); 
3643             test
->errln("happy break test previous() failed: expected %d but got %d", 
3649         printStringBreaks(ustr
, expected
, expectedcount
); 
3650         test
->errln("break test previous() failed: missed a match"); 
3654     // testing preceding 
3655     for (i 
= 0; i 
< expectedcount 
- 1; i 
++) { 
3656         // int j = expected[i] + 1; 
3657         int j 
= ustr
.moveIndex32(expected
[i
], 1); 
3658         for (; j 
<= expected
[i 
+ 1]; j 
++) { 
3659             if (bi
->preceding(j
) != expected
[i
]) { 
3660                 printStringBreaks(ustr
, expected
, expectedcount
); 
3661                 test
->errln("preceding(): Not expecting boundary at position %d", j
); 
// Word-break regression test.  Creates a word BreakIterator for locale "en",
// then for each escaped string in strlist: converts it with
// CharsToUnicodeString(), collects the boundaries computed by RBBIWordMonkey
// into expected[], and hands both to testBreakBoundPreceding().
// NOTE(review): the declarations of loop, i and expected, and the cleanup of
// bi, are not visible in this listing -- confirm.
3669 void RBBITest::TestWordBreaks(void) 
3671 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 
3673     Locale        
locale("en"); 
3674     UErrorCode    status 
= U_ZERO_ERROR
; 
3675     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status); 
3676     BreakIterator 
*bi 
= BreakIterator::createWordInstance(locale
, status
); 
3677     // Replaced any C+J characters in a row with a random sequence of characters 
3678     // of the same length to make our C+J segmentation not get in the way. 
3679     static const char *strlist
[] = 
3681     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d", 
3682     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b", 
3683     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a", 
3684     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622", 
3685     "\\uac00\\u3588\\u009c\\u0953\\u194b", 
3686     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 
3687     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e", 
3688     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e", 
3689     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 
3690     "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 
3691     "\\u2027\\U000e0067\\u0a47\\u00b7", 
3692     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 
3693     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 
3694     "\\u0589\\U000e006e\\u0a42\\U000104a5", 
3695     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a", 
3696     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 
3697     "\\u0027\\u11af\\U000e0057\\u0602", 
    // NOTE(review): the \U escape in the next entry has only 7 hex digits
    // (compare the entry two lines below) -- confirm whether this is intentional.
3698     "\\U0001d7f2\\U000e007\\u0004\\u0589", 
3699     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 
3700     "\\U0001d7f2\\U000e007d\\u0004\\u0589", 
3701     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 
3702     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 
3703     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", 
3704     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 
3705     "\\u0233\\U000e0020\\u0a69\\u0d6a", 
3706     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 
3707     "\\u18f4\\U000e0049\\u20e7\\u2027", 
3708     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 
3709     "\\ua183\\u102d\\u0bec\\u003a", 
3710     "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 
3711     "\\u003a\\u0e57\\u0fad\\u002e", 
3712     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 
3713     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 
3714     "\\U000e005d\\u2044\\u0731\\u0650\\u0061", 
3715     "\\u003a\\u0664\\u00b7\\u1fba", 
3716     "\\u003b\\u0027\\u00b7\\u47a3", 
3717     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b", 
3718     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673", 
3719     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c", 
3722     if (U_FAILURE(status
)) { 
3723         errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
)); 
3726     for (loop 
= 0; loop 
< (int)(sizeof(strlist
) / sizeof(char *)); loop 
++) { 
3727         // printf("looping %d\n", loop); 
3728         UnicodeString ustr 
= CharsToUnicodeString(strlist
[loop
]); 
3729         // RBBICharMonkey monkey; 
3730         RBBIWordMonkey monkey
; 
3733         int expectedcount 
= 0; 
3735         monkey
.setText(ustr
); 
        // Collect every boundary reported by the monkey, starting from 0.
3737         for (i 
= 0; i 
!= BreakIterator::DONE
; i 
= monkey
.next(i
)) { 
3738             expected
[expectedcount 
++] = i
; 
3741         testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
); 
// TestWordBoundary
//   For every escaped sample in strlist, walks an "en" word BreakIterator forward
//   with first()/next() and cross-checks each step against isBoundary():
//     - no index strictly between the previous boundary and the current one may
//       be reported as a boundary;
//     - the index returned by the iteration must itself be reported as a boundary.
//   On a mismatch the breaks collected so far are printed and errln() is called.
3747 void RBBITest::TestWordBoundary(void) 
3749     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data> 
3750     Locale        
locale("en"); 
3751     UErrorCode    status 
= U_ZERO_ERROR
; 
3752     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status); 
3753     BreakIterator 
*bi 
= BreakIterator::createWordInstance(locale
, status
); 
         // Test data.  The backslashes are doubled so that the \u / \U escapes reach
         // u_unescape() (called in the loop below) as text rather than being
         // interpreted by the compiler.
         // NOTE(review): the entry "\\U0001d7f2\\U000e007\\u0004\\u0589" has only seven
         // hex digits after \U, whereas a later entry spells the same sequence
         // with \U000e007d.  Confirm whether the malformed escape is intentional.
3755     static const char *strlist
[] = 
3757     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 
3758     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 
3759     "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 
3760     "\\u2027\\U000e0067\\u0a47\\u00b7", 
3761     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 
3762     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 
3763     "\\u0589\\U000e006e\\u0a42\\U000104a5", 
3764     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", 
3765     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 
3766     "\\u0027\\u11af\\U000e0057\\u0602", 
3767     "\\U0001d7f2\\U000e007\\u0004\\u0589", 
3768     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 
3769     "\\U0001d7f2\\U000e007d\\u0004\\u0589", 
3770     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 
3771     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 
3772     "\\U000e0065\\u302c\\u09ee\\U000e0068", 
3773     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 
3774     "\\u0233\\U000e0020\\u0a69\\u0d6a", 
3775     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 
3776     "\\u58f4\\U000e0049\\u20e7\\u2027", 
3777     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 
3778     "\\ua183\\u102d\\u0bec\\u003a", 
3779     "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 
3780     "\\u003a\\u0e57\\u0fad\\u002e", 
3781     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 
3782     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 
3783     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019", 
3784     "\\u003a\\u0664\\u00b7\\u1fba", 
3785     "\\u003b\\u0027\\u00b7\\u47a3", 
         // Nothing can be tested if the word iterator could not be created.
3788     if (U_FAILURE(status
)) { 
3789         errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
)); 
         // One pass per sample string (element count derived from the array size).
3792     for (loop 
= 0; loop 
< (int)(sizeof(strlist
) / sizeof(char *)); loop 
++) { 
3793         // printf("looping %d\n", loop); 
             // Decode the escapes into str, writing at most 20 UChars.
             // NOTE(review): the declaration of str is not visible in this listing;
             // confirm its capacity is at least 20 and that every sample decodes to
             // fewer than 20 code units so the result stays NUL-terminated.
3794         u_unescape(strlist
[loop
], str
, 20); 
3795         UnicodeString 
ustr(str
); 
             // Walk the boundaries front to back, remembering each one in forward[]
             // so that it can be printed if a check fails.
3802         for (i 
= bi
->first(); i 
!= BreakIterator::DONE
; i 
= bi
->next()) { 
3803             forward
[count 
++] = i
; 
                     // Indices strictly between prev and i must not be boundaries.
3806                 for (j 
= prev 
+ 1; j 
< i
; j 
++) { 
3807                     if (bi
->isBoundary(j
)) { 
3808                         printStringBreaks(ustr
, forward
, count
); 
3809                         errln("happy boundary test failed: expected %d not a boundary", 
                 // The index handed back by the iteration must be a boundary.
3815             if (!bi
->isBoundary(i
)) { 
3816                 printStringBreaks(ustr
, forward
, count
); 
3817                 errln("happy boundary test failed: expected %d a boundary", 
// TestLineBreaks
//   For every escaped sample in strlist, computes the expected line-break
//   positions with an RBBILineMonkey (repeated monkey.next(i) starting at 0) and
//   passes them, together with an "en" line BreakIterator, to
//   testBreakBoundPreceding().  The body is compiled only when regular
//   expressions are available (UCONFIG_NO_REGULAR_EXPRESSIONS is off).
3827 void RBBITest::TestLineBreaks(void) 
3829 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 
3830     Locale        
locale("en"); 
3831     UErrorCode    status 
= U_ZERO_ERROR
; 
3832     BreakIterator 
*bi 
= BreakIterator::createLineInstance(locale
, status
); 
         // Capacity, in UChars, handed to u_unescape() for each decoded sample.
3833     const int32_t  STRSIZE 
= 50; 
         // Test data with doubled backslashes; decoded at run time by u_unescape().
         // A few entries are written as adjacent string literals, and in two of them
         // the split falls between the backslash and the 'U' / 'u' of an escape
         // (a literal ending in "\\" followed by one starting with "U000112ed" or
         // "u2014"); literal concatenation joins them back into a single escape.
3835     static const char *strlist
[] = 
3837      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc", 
3838      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\" 
3839              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d", 
3840      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\" 
3841              "u2014\\U000e0105\\u118c\\u000a\\u07f8", 
3842      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f", 
3843      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 
3844      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4", 
3845      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 
3846      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060", 
3847      "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5", 
3848      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f", 
3849      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1", 
3850      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5", 
3851      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0", 
3852      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc", 
3853      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f", 
3854      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f", 
3855      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b", 
3856      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085", 
3857      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac", 
3858      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9", 
3859      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025", 
3860      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763", 
3861      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029", 
3862      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7", 
3863      "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc", 
3864      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a", 
3865      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945", 
3866      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014", 
3867      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b", 
3868      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0", 
3869      "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025", 
3870      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d", 
3871      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111", 
3872      "\\u2014\\u0020\\u000a\\u17c5\\u24fc", 
3873      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f", 
3874      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010", 
3875      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43", 
3876      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb", 
3877      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc", 
3878      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060", 
3879          "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d" 
3880          "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5" 
3881          "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b", 
3882      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0", 
3883          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07", 
         // Report (and act on) a failure to create the line break iterator.
3886     TEST_ASSERT_SUCCESS(status
); 
3887     if (U_FAILURE(status
)) { 
         // One pass per sample string.
3890     for (loop 
= 0; loop 
< (int)(sizeof(strlist
) / sizeof(char *)); loop 
++) { 
3891         // printf("looping %d\n", loop); 
             // Decode the escapes into str (at most STRSIZE UChars); t receives the
             // value returned by u_unescape().
             // NOTE(review): what is done with t is not visible in this listing.
3892         int32_t t 
= u_unescape(strlist
[loop
], str
, STRSIZE
); 
3899         UnicodeString 
ustr(str
); 
             // The monkey supplies the expected break positions; it records any
             // construction problem in deferredStatus, which is checked here.
3900         RBBILineMonkey monkey
; 
3901         if (U_FAILURE(monkey
.deferredStatus
)) { 
3905         const int EXPECTEDSIZE 
= 50; 
3906         int expected
[EXPECTEDSIZE
]; 
3907         int expectedcount 
= 0; 
3909         monkey
.setText(ustr
); 
             // Collect every position produced by the monkey, starting with 0, until
             // it returns BreakIterator::DONE.
3911         for (i 
= 0; i 
!= BreakIterator::DONE
; i 
= monkey
.next(i
)) { 
                 // Guard against writing past the end of expected[]; the assertion
                 // inside is false by construction, so it always reports a failure.
3912             if (expectedcount 
>= EXPECTEDSIZE
) { 
3913                 TEST_ASSERT(expectedcount 
< EXPECTEDSIZE
); 
3916             expected
[expectedcount 
++] = i
; 
             // Compare the real iterator against the monkey's expected positions.
3919         testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
); 
// TestSentBreaks
//   For every sample in strlist, computes the expected sentence-break positions
//   with an RBBISentMonkey (repeated monkey.next(i) starting at 0) and passes
//   them, together with an "en" sentence BreakIterator, to
//   testBreakBoundPreceding().  The body is compiled only when regular
//   expressions are available (UCONFIG_NO_REGULAR_EXPRESSIONS is off).
3925 void RBBITest::TestSentBreaks(void) 
3927 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 
3928     Locale        
locale("en"); 
3929     UErrorCode    status 
= U_ZERO_ERROR
; 
3930     BreakIterator 
*bi 
= BreakIterator::createSentenceInstance(locale
, status
); 
         // Test data.  The first entries are plain text (some with real control
         // characters such as \r and \n); the later ones use doubled-backslash
         // \u / \U escapes.  Every entry is passed through u_unescape() below.
3932     static const char *strlist
[] = 
3934      "Now\ris\nthe\r\ntime\n\rfor\r\r", 
3936      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.", 
3937      "\"Sentence ending with a quote.\" Bye.", 
3938      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"", 
3939      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"", 
3940      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ", 
3941      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ", 
3942      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ", 
3943      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!", 
3944      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52" 
3945              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a" 
3946              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f" 
3947              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352", 
3948      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171" 
3949              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030" 
3950              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b" 
3951              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b" 
3952              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05" 
3953              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4" 
         // Nothing can be tested if the sentence iterator could not be created.
3956     if (U_FAILURE(status
)) { 
3957         errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
)); 
         // One pass per sample string.
3960     for (loop 
= 0; loop 
< (int)(sizeof(strlist
) / sizeof(char *)); loop 
++) { 
             // Decode into str, using the full element count of str as the capacity.
3961         u_unescape(strlist
[loop
], str
, (int32_t)(sizeof(str
) / sizeof(str
[0]))); 
3962         UnicodeString 
ustr(str
); 
             // The monkey supplies the expected break positions; it records any
             // construction problem in deferredStatus, which is checked here.
3964         RBBISentMonkey monkey
; 
3965         if (U_FAILURE(monkey
.deferredStatus
)) { 
3969         const int EXPECTEDSIZE 
= 50; 
3970         int expected
[EXPECTEDSIZE
]; 
3971         int expectedcount 
= 0; 
3973         monkey
.setText(ustr
); 
             // Collect every position produced by the monkey, starting with 0, until
             // it returns BreakIterator::DONE.
3975         for (i 
= 0; i 
!= BreakIterator::DONE
; i 
= monkey
.next(i
)) { 
                 // Guard against writing past the end of expected[]; the assertion
                 // inside is false by construction, so it always reports a failure.
3976             if (expectedcount 
>= EXPECTEDSIZE
) { 
3977                 TEST_ASSERT(expectedcount 
< EXPECTEDSIZE
); 
3980             expected
[expectedcount 
++] = i
; 
             // Compare the real iterator against the monkey's expected positions.
3983         testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
); 
// TestMonkey
//   Entry point for the randomized ("monkey") break iterator tests.
//   params is an option string from which the following are read and then
//   stripped:
//       loop=<n>   number of iterations (parsed via getIntParam)
//       seed=<n>   random seed          (parsed via getIntParam)
//       type=char|word|line|sent|title   break type to run; default is "all"
//       utext      run the test through the UText interface
//   Anything non-blank left in the string afterwards is reported as an error.
//   For each selected break type an "en" BreakIterator is created and handed to
//   RunMonkey().  The body is compiled only when regular expressions are
//   available (UCONFIG_NO_REGULAR_EXPRESSIONS is off).
3989 void RBBITest::TestMonkey(char *params
) { 
3990 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 
3992     UErrorCode     status    
= U_ZERO_ERROR
; 
3993     int32_t        loopCount 
= 500; 
3995     UnicodeString  breakType 
= "all"; 
3996     Locale         
locale("en"); 
3997     UBool          useUText  
= FALSE
; 
         // A different setting applies when the test is not run in quick mode.
         // NOTE(review): the body of this branch is not visible in this listing.
3999     if (quick 
== FALSE
) { 
             // Work on a UnicodeString copy of the option text so that recognised
             // options can be removed as they are consumed.
4004         UnicodeString 
p(params
); 
4005         loopCount 
= getIntParam("loop", p
, loopCount
); 
4006         seed      
= getIntParam("seed", p
, seed
); 
             // "type=<kind>": capture group 1 becomes the break type, then the
             // matched text is removed from p.
4008         RegexMatcher 
m(" *type *= *(char|word|line|sent|title) *", p
, 0, status
); 
4010             breakType 
= m
.group(1, status
); 
4012             p 
= m
.replaceFirst("", status
); 
             // "utext": remove the matched text from p.
4015         RegexMatcher 
u(" *utext", p
, 0, status
); 
4019             p 
= u
.replaceFirst("", status
); 
4024         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p
, 0, status
).find()) { 
4025             // Each option is stripped out of the option string as it is processed. 
4026             // All options have been checked.  The option string should have been completely emptied. 
                 // Copy the leftover text into buf and force NUL termination before
                 // printing it with %s.
4028             p
.extract(buf
, sizeof(buf
), NULL
, status
); 
4029             buf
[sizeof(buf
)-1] = 0; 
4030             errln("Unrecognized or extra parameter:  %s\n", buf
); 
         // Character break monkey test.
         // NOTE(review): the m passed to RunMonkey() in the sections below is
         // presumably a monkey object declared on a line that is not visible in
         // this listing (not the RegexMatcher m used above) -- confirm.
4036     if (breakType 
== "char" || breakType 
== "all") { 
4038         BreakIterator  
*bi 
= BreakIterator::createCharacterInstance(locale
, status
); 
4039         if (U_SUCCESS(status
)) { 
4040             RunMonkey(bi
, m
, "char", seed
, loopCount
, useUText
); 
4041             if (breakType 
== "all" && useUText
==FALSE
) { 
4042                 // Also run a quick test with UText when "all" is specified 
4043                 RunMonkey(bi
, m
, "char", seed
, loopCount
, TRUE
); 
4047             errcheckln(status
, "Creation of character break iterator failed %s", u_errorName(status
)); 
         // Word break monkey test.
4052     if (breakType 
== "word" || breakType 
== "all") { 
4053         logln("Word Break Monkey Test"); 
4055         BreakIterator  
*bi 
= BreakIterator::createWordInstance(locale
, status
); 
4056         if (U_SUCCESS(status
)) { 
4057             RunMonkey(bi
, m
, "word", seed
, loopCount
, useUText
); 
4060             errcheckln(status
, "Creation of word break iterator failed %s", u_errorName(status
)); 
         // Line break monkey test.
4065     if (breakType 
== "line" || breakType 
== "all") { 
4066         logln("Line Break Monkey Test"); 
4068         BreakIterator  
*bi 
= BreakIterator::createLineInstance(locale
, status
); 
             // NOTE(review): loopCount is reduced in place here and again in the
             // sentence section below, so with type "all" the two reductions
             // compound -- confirm that is intended.
4069         if (loopCount 
>= 10) { 
4070             loopCount 
= loopCount 
/ 5;   // Line break runs slower than the others. 
4072         if (U_SUCCESS(status
)) { 
4073             RunMonkey(bi
, m
, "line", seed
, loopCount
, useUText
); 
4076             errcheckln(status
, "Creation of line break iterator failed %s", u_errorName(status
)); 
         // Sentence break monkey test.
4081     if (breakType 
== "sent" || breakType 
== "all"  ) { 
4082         logln("Sentence Break Monkey Test"); 
4084         BreakIterator  
*bi 
= BreakIterator::createSentenceInstance(locale
, status
); 
4085         if (loopCount 
>= 10) { 
4086             loopCount 
= loopCount 
/ 10;   // Sentence runs slower than the other break types 
4088         if (U_SUCCESS(status
)) { 
4089             RunMonkey(bi
, m
, "sentence", seed
, loopCount
, useUText
); 
                 // NOTE(review): the message below says "line break iterator" although
                 // this branch creates the sentence iterator; looks like a copy/paste
                 // slip in the diagnostic text.
4092             errcheckln(status
, "Creation of line break iterator failed %s", u_errorName(status
)); 
4101 //  Run a RBBI monkey test.  Common routine, for all break iterator types. 
4103 //       bi      - the break iterator to use 
4104 //       mk      - MonkeyKind, abstraction for obtaining expected results 
4105 //       name    - Name of test (char, word, etc.) for use in error messages 
4106 //       seed    - Seed for starting random number generator (parameter from user) 
//       numIterations - number of random strings to test; -1 keeps looping and
//                 prints a progress dot to stderr every tenth pass
//       useUText - selects between bi->setText(UText *, status) and
//                 bi->setText(UnicodeString) (the selecting condition itself is
//                 not visible in this listing -- presumably TRUE means UText)
//
//  Each pass builds a random string from the monkey's character classes, asks the
//  monkey for the expected break positions, and then collects the positions seen
//  through next(), previous(), isBoundary(), following() and preceding().  Every
//  result is stored as a per-index flag array and compared against the expected
//  flags; a mismatch is reported with a <data>...</data> excerpt of the text.
4109 void RBBITest::RunMonkey(BreakIterator 
*bi
, RBBIMonkeyKind 
&mk
, const char *name
, uint32_t  seed
, 
4110                          int32_t numIterations
, UBool useUText
) { 
4112 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 
         // Number of random characters appended per test string.  The flag arrays
         // below are sized TESTSTRINGLEN*2 + 1.
4114     const int32_t    TESTSTRINGLEN 
= 500; 
4115     UnicodeString    testText
; 
4116     int32_t          numCharClasses
; 
4118     int              expected
[TESTSTRINGLEN
*2 + 1]; 
4119     int              expectedCount 
= 0; 
4120     char             expectedBreaks
[TESTSTRINGLEN
*2 + 1]; 
4121     char             forwardBreaks
[TESTSTRINGLEN
*2 + 1]; 
4122     char             reverseBreaks
[TESTSTRINGLEN
*2+1]; 
4123     char             isBoundaryBreaks
[TESTSTRINGLEN
*2+1]; 
4124     char             followingBreaks
[TESTSTRINGLEN
*2+1]; 
4125     char             precedingBreaks
[TESTSTRINGLEN
*2+1]; 
4131     numCharClasses 
= mk
.charClasses()->size(); 
4132     chClasses      
= mk
.charClasses(); 
4134     // Check for errors that occurred during the construction of the MonkeyKind object. 
4135     //  Can't report them where they occurred because errln() is a method coming from intlTest, 
4136     //  and is not visible outside of RBBITest :-( 
4137     if (U_FAILURE(mk
.deferredStatus
)) { 
4138         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk
.deferredStatus
)); 
4142     // Verify that the character classes all have at least one member. 
4143     for (i
=0; i
<numCharClasses
; i
++) { 
4144         UnicodeSet 
*s 
= (UnicodeSet 
*)chClasses
->elementAt(i
); 
4145         if (s 
== NULL 
|| s
->size() == 0) { 
4146             errln("Character Class #%d is null or of zero size.", i
); 
         // Main loop: one random test string per pass.
4151     while (loopCount 
< numIterations 
|| numIterations 
== -1) { 
4152         if (numIterations 
== -1 && loopCount 
% 10 == 0) { 
4153             // If test is running in an infinite loop, display a periodic tic so 
4154             //   we can tell that it is making progress. 
4155             fprintf(stderr
, "."); 
4157         // Save current random number seed, so that we can recreate the random numbers 
4158         //   for this loop iteration in event of an error. 
4161         // Populate a test string with data. 
4162         testText
.truncate(0); 
4163         for (i
=0; i
<TESTSTRINGLEN
; i
++) { 
                 // Pick a random class, then a random member of that class.
4164             int32_t  aClassNum 
= m_rand() % numCharClasses
; 
4165             UnicodeSet 
*classSet 
= (UnicodeSet 
*)chClasses
->elementAt(aClassNum
); 
4166             int32_t   charIdx 
= m_rand() % classSet
->size(); 
4167             UChar32   c 
= classSet
->charAt(charIdx
); 
4168             if (c 
< 0) {   // TODO:  deal with sets containing strings. 
4175         // Calculate the expected results for this test string. 
4176         mk
.setText(testText
); 
4177         memset(expectedBreaks
, 0, sizeof(expectedBreaks
)); 
             // Index 0 is always flagged as an expected break.
4178         expectedBreaks
[0] = 1; 
4179         int32_t breakPos 
= 0; 
4182             breakPos 
= mk
.next(breakPos
); 
4183             if (breakPos 
== -1) { 
4186             if (breakPos 
> testText
.length()) { 
4187                 errln("breakPos > testText.length()"); 
4189             expectedBreaks
[breakPos
] = 1; 
4190             U_ASSERT(expectedCount
<testText
.length()); 
4191             expected
[expectedCount 
++] = breakPos
; 
4192             (void)expected
;   // Set but not used warning. 
4193                               // TODO (andy): check it out. 
4196         // Find the break positions using forward iteration 
4197         memset(forwardBreaks
, 0, sizeof(forwardBreaks
)); 
4199             UErrorCode status 
= U_ZERO_ERROR
; 
4200             UText 
*testUText 
= utext_openReplaceable(NULL
, &testText
, &status
); 
4201             // testUText = utext_openUnicodeString(testUText, &testText, &status); 
4202             bi
->setText(testUText
, status
); 
4203             TEST_ASSERT_SUCCESS(status
); 
4204             utext_close(testUText
);   // The break iterator does a shallow clone of the UText 
4205                                       //  This UText can be closed immediately, so long as the 
4206                                       //  testText string continues to exist. 
4208             bi
->setText(testText
); 
4211         for (i
=bi
->first(); i 
!= BreakIterator::DONE
; i
=bi
->next()) { 
4212             if (i 
< 0 || i 
> testText
.length()) { 
4213                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name
); 
4216             forwardBreaks
[i
] = 1; 
4219         // Find the break positions using reverse iteration 
4220         memset(reverseBreaks
, 0, sizeof(reverseBreaks
)); 
4221         for (i
=bi
->last(); i 
!= BreakIterator::DONE
; i
=bi
->previous()) { 
4222             if (i 
< 0 || i 
> testText
.length()) { 
                     // NOTE(review): the message names next() although this loop is
                     // driven by previous().
4223                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name
); 
4226             reverseBreaks
[i
] = 1; 
4229         // Find the break positions using isBoundary() tests. 
4230         memset(isBoundaryBreaks
, 0, sizeof(isBoundaryBreaks
)); 
4231         U_ASSERT((int32_t)sizeof(isBoundaryBreaks
) > testText
.length()); 
4232         for (i
=0; i
<=testText
.length(); i
++) { 
4233             isBoundaryBreaks
[i
] = bi
->isBoundary(i
); 
4237         // Find the break positions using the following() function. 
4239         memset(followingBreaks
, 0, sizeof(followingBreaks
)); 
4240         int32_t   lastBreakPos 
= 0; 
4241         followingBreaks
[0] = 1; 
4242         for (i
=0; i
<testText
.length(); i
++) { 
4243             breakPos 
= bi
->following(i
); 
                 // Sanity checks on the value returned by following(i): it must lie
                 // after i, must not be before the last break seen, must be within
                 // the text, and must not jump past a previously found break that
                 // still lies after i.
4244             if (breakPos 
<= i 
|| 
4245                 breakPos 
< lastBreakPos 
|| 
4246                 breakPos 
> testText
.length() || 
4247                 (breakPos 
> lastBreakPos 
&& lastBreakPos 
> i
)) { 
4248                 UChar32 brkChar 
= testText
.char32At(lastBreakPos
); 
                     // Report the failure unless this is a "char" or "word" run and the
                     // code point at lastBreakPos is in the range U+1F1E6..U+1F1FF.
4249                 if ((strcmp(name
, "char") != 0 && strcmp(name
, "word") != 0) || brkChar 
< 0x1F1E6 || brkChar 
> 0x1F1FF) { // Apple, skip RI char/word break monkey tests 
4250                 errln("%s break monkey test: " 
4251                     "Out of range value returned by BreakIterator::following().\n" 
4252                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d", 
4253                          name
, seed
, i
, breakPos
, lastBreakPos
); 
4257             followingBreaks
[breakPos
] = 1; 
4258             lastBreakPos 
= breakPos
; 
4261         // Find the break positions using the preceding() function. 
4262         memset(precedingBreaks
, 0, sizeof(precedingBreaks
)); 
4263         lastBreakPos 
= testText
.length(); 
4264         precedingBreaks
[testText
.length()] = 1; 
4265         for (i
=testText
.length(); i
>0; i
--) { 
4266             breakPos 
= bi
->preceding(i
); 
                 // Sanity checks on the value returned by preceding(i), mirroring the
                 // following() checks above but comparing against getChar32Start(i).
4267             if (breakPos 
>= i 
|| 
4268                 breakPos 
> lastBreakPos 
|| 
4269                 (breakPos 
< 0 && testText
.getChar32Start(i
)>0) || 
4270                 (breakPos 
< lastBreakPos 
&& lastBreakPos 
< testText
.getChar32Start(i
)) ) { 
                     // NOTE(review): breakPos can be negative on this path (one of the
                     // conditions above is breakPos < 0); confirm that char32At()
                     // tolerates an out-of-range index.
4271                 UChar32 brkChar 
= testText
.char32At(breakPos
); 
                     // Report the failure unless this is a "char" or "word" run and the
                     // code point at breakPos is in the range U+1F1E6..U+1F1FF.
4272                 if ((strcmp(name
, "char") != 0 && strcmp(name
, "word") != 0) || brkChar 
< 0x1F1E6 || brkChar 
> 0x1F1FF) { // Apple, skip RI char/word break monkey tests 
4273                 errln("%s break monkey test: " 
4274                     "Out of range value returned by BreakIterator::preceding().\n" 
4275                     "index=%d;  prev returned %d; lastBreak=%d" , 
4276                     name
,  i
, breakPos
, lastBreakPos
); 
                     // NOTE(review): the guard range-checks breakPos, but the element
                     // written is indexed by i -- confirm which index is intended.
4277                 if (breakPos 
>= 0 && breakPos 
< (int32_t)sizeof(precedingBreaks
)) { 
4278                     precedingBreaks
[i
] = 2;   // Forces an error. 
4282                 if (breakPos 
>= 0) { 
4283                     precedingBreaks
[breakPos
] = 1; 
4285                 lastBreakPos 
= breakPos
; 
4289         // Compare the expected and actual results. 
4290         for (i
=0; i
<=testText
.length(); i
++) { 
                 // errorType names the first operation whose flag at index i disagrees;
                 // previous() is compared against the forward result, the others
                 // against the expected result.
4291             const char *errorType 
= NULL
; 
4292             if  (forwardBreaks
[i
] != expectedBreaks
[i
]) { 
4293                 errorType 
= "next()"; 
4294             } else if (reverseBreaks
[i
] != forwardBreaks
[i
]) { 
4295                 errorType 
= "previous()"; 
4296             } else if (isBoundaryBreaks
[i
] != expectedBreaks
[i
]) { 
4297                 errorType 
= "isBoundary()"; 
4298             } else if (followingBreaks
[i
] != expectedBreaks
[i
]) { 
4299                 errorType 
= "following()"; 
4300             } else if (precedingBreaks
[i
] != expectedBreaks
[i
]) { 
4301                 errorType 
= "preceding()"; 
4305             if (errorType 
!= NULL
) { 
4306                 // Format a range of the test text that includes the failure as 
4307                 //  a data item that can be included in the rbbi test data file. 
4309                 // Start of the range is the last point where expected and actual results 
4310                 //   both agreed that there was a break position. 
4311                 int startContext 
= i
; 
4314                     if (startContext
==0) { break; } 
4316                     if (expectedBreaks
[startContext
] != 0) { 
4317                         if (count 
== 2) break; 
4322                 // End of range is two expected breaks past the start position. 
4323                 int endContext 
= i 
+ 1; 
4325                 for (ci
=0; ci
<2; ci
++) {  // Number of items to include in error text. 
4327                         if (endContext 
>= testText
.length()) {break;} 
4328                         if (expectedBreaks
[endContext
-1] != 0) { 
4329                             if (count 
== 0) break; 
4336                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>" 
4337                 UnicodeString errorText 
= "<data>"; 
4338                 /***if (strcmp(errorType, "next()") == 0) { 
4340                     endContext = testText.length(); 
4342                     printStringBreaks(testText, expected, expectedCount); 
4345                 for (ci
=startContext
; ci
<endContext
;) { 
4346                     UnicodeString 
hexChars("0123456789abcdef"); 
4349                     c 
= testText
.char32At(ci
); 
4351                         // This is the location of the error. 
4352                         errorText
.append("<?>"); 
4353                     } else if (expectedBreaks
[ci
] != 0) { 
4354                         // This is a non-error expected break position. 
4355                         errorText
.append("\\"); 
4358                         errorText
.append("\\u"); 
4359                         for (bn
=12; bn
>=0; bn
-=4) { 
4360                             errorText
.append(hexChars
.charAt((c
>>bn
)&0xf)); 
4363                         errorText
.append("\\U"); 
4364                         for (bn
=28; bn
>=0; bn
-=4) { 
4365                             errorText
.append(hexChars
.charAt((c
>>bn
)&0xf)); 
4368                     ci 
= testText
.moveIndex32(ci
, 1); 
4370                 errorText
.append("\\"); 
4371                 errorText
.append("</data>\n"); 
4374                 char  charErrorTxt
[500]; 
4375                 UErrorCode status 
= U_ZERO_ERROR
; 
4376                 errorText
.extract(charErrorTxt
, sizeof(charErrorTxt
), NULL
, status
); 
4377                 charErrorTxt
[sizeof(charErrorTxt
)-1] = 0; 
4378                 const char *badLocale 
= bi
->getLocaleID(ULOC_ACTUAL_LOCALE
, status
); 
4380                 UChar32 brkChar 
= testText
.char32At(i
); 
4381                 if ((strcmp(name
, "char") != 0 && strcmp(name
, "word") != 0) || brkChar 
< 0x1F1E6 || brkChar 
> 0x1F1FF) { // Apple, skip RI char/word break monkey tests 
4382                 errln("%s break monkey test error [%s].  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s", 
4383                     name
, badLocale
, (expectedBreaks
[i
]? "break expected but not found" : "break found but not expected"), 
4384                     errorType
, seed
, i
, charErrorTxt
); 
4396 //  Bug 5532.  UTF-8 based UText fails in dictionary code. 
4397 //             This test checks the initial patch, 
4398 //             which is to just keep it from crashing.  Correct word boundaries 
4399 //             await a proper fix to the dictionary code. 
//
//             The test opens a UText over a NUL-terminated UTF-8 byte array, hands
//             it to a "th" word break iterator, and walks it forward.  It asserts
//             only that each position is greater than the previous one and that
//             at least one break was found.
4401 void RBBITest::TestBug5532(void)  { 
4402    // Text includes a mixture of Thai and Latin. 
        // The trailing 0x00 terminates the data; utext_openUTF8() below is called
        // with a length of -1.
4403    const unsigned char utf8Data
[] = { 
4404            0xE0u
, 0xB8u
, 0x82u
, 0xE0u
, 0xB8u
, 0xB2u
, 0xE0u
, 0xB8u
, 0xA2u
, 0xE0u
, 
4405            0xB9u
, 0x80u
, 0xE0u
, 0xB8u
, 0x84u
, 0xE0u
, 0xB8u
, 0xA3u
, 0xE0u
, 0xB8u
,  
4406            0xB7u
, 0xE0u
, 0xB9u
, 0x88u
, 0xE0u
, 0xB8u
, 0xADu
, 0xE0u
, 0xB8u
, 0x87u
, 
4407            0xE0u
, 0xB9u
, 0x80u
, 0xE0u
, 0xB8u
, 0xA5u
, 0xE0u
, 0xB9u
, 0x88u
, 0xE0u
, 
4408            0xB8u
, 0x99u
, 0xE0u
, 0xB8u
, 0x8Bu
, 0xE0u
, 0xB8u
, 0xB5u
, 0xE0u
, 0xB8u
, 
4409            0x94u
, 0xE0u
, 0xB8u
, 0xB5u
, 0x20u
, 0x73u
, 0x69u
, 0x6Du
, 0x20u
, 0x61u
,  
4410            0x75u
, 0x64u
, 0x69u
, 0x6Fu
, 0x2Fu
, 0x20u
, 0x4Du
, 0x4Fu
, 0x4Fu
, 0x4Eu
,  
4411            0x20u
, 0x65u
, 0x63u
, 0x6Cu
, 0x69u
, 0x70u
, 0x73u
, 0x65u
, 0x20u
, 0xE0u
,  
4412            0xB8u
, 0xA3u
, 0xE0u
, 0xB8u
, 0xB2u
, 0xE0u
, 0xB8u
, 0x84u
, 0xE0u
, 0xB8u
,  
4413            0xB2u
, 0x20u
, 0x34u
, 0x37u
, 0x30u
, 0x30u
, 0x20u
, 0xE0u
, 0xB8u
, 0xA2u
,  
4414            0xE0u
, 0xB8u
, 0xB9u
, 0xE0u
, 0xB9u
, 0x82u
, 0xE0u
, 0xB8u
, 0xA3u
, 0x00}; 
4416     UErrorCode status 
= U_ZERO_ERROR
; 
         // The UText lives in a local, statically initialised struct; it is closed
         // with utext_close() at the end of the function.
4417     UText utext
=UTEXT_INITIALIZER
; 
4418     utext_openUTF8(&utext
, (const char *)utf8Data
, -1, &status
); 
4419     TEST_ASSERT_SUCCESS(status
); 
4421     BreakIterator 
*bi 
= BreakIterator::createWordInstance(Locale("th"), status
); 
4422     TEST_ASSERT_SUCCESS(status
); 
4423     if (U_SUCCESS(status
)) { 
4424         bi
->setText(&utext
, status
); 
4425         TEST_ASSERT_SUCCESS(status
); 
4427         int32_t breakCount 
= 0; 
4428         int32_t previousBreak 
= -1; 
             // Step through every break; next() is called in the loop condition and
             // the resulting position is then read back with current().
4429         for (bi
->first(); bi
->next() != BreakIterator::DONE
; breakCount
++) { 
4430             // For now, just make sure that the break iterator doesn't hang. 
4431             TEST_ASSERT(previousBreak 
< bi
->current()); 
4432             previousBreak 
= bi
->current(); 
4434         TEST_ASSERT(breakCount 
> 0); 
4437     utext_close(&utext
); 
// TestBug9983
//   Builds a short text from \u escapes, then iterates it backwards with
//   previous() using two word break iterators (root locale and en_US_POSIX),
//   reading getRuleStatus() at each step.  Each backward walk is expected to take
//   exactly 6 steps; a check on iterationCount >= 10 inside each loop guards
//   against a runaway iteration.
4441 void RBBITest::TestBug9983(void)  { 
4442     UnicodeString text 
= UnicodeString("\\u002A"  // * Other 
4444                                        "\\u309C"  //   Katakana 
4448                                        "\\u0000").unescape(); 
4450     UErrorCode status 
= U_ZERO_ERROR
; 
         // createWordInstance() returns a BreakIterator *; it is cast to
         // RuleBasedBreakIterator * and held in a LocalPointer.
4451     LocalPointer
<RuleBasedBreakIterator
> brkiter(static_cast<RuleBasedBreakIterator 
*>( 
4452         BreakIterator::createWordInstance(Locale::getRoot(), status
))); 
4453     TEST_ASSERT_SUCCESS(status
); 
4454     LocalPointer
<RuleBasedBreakIterator
> brkiterPOSIX(static_cast<RuleBasedBreakIterator 
*>( 
4455         BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status
))); 
4456     TEST_ASSERT_SUCCESS(status
); 
4457     if (U_FAILURE(status
)) { 
         // NOTE(review): the statements that initialise and advance iterationCount
         // are not visible in this listing; confirm it is reset before each loop.
4460     int32_t offset
, rstatus
, iterationCount
; 
         // First pass: root-locale iterator.
         // NOTE(review): a brkiter->last() call matching the brkiterPOSIX->last()
         // below is not visible in this listing -- presumably on an omitted line.
4462     brkiter
->setText(text
); 
4465     while ( (offset 
= brkiter
->previous()) != UBRK_DONE 
) { 
4467         rstatus 
= brkiter
->getRuleStatus(); 
4468         (void)rstatus
;     // Suppress set but not used warning. 
4469         if (iterationCount 
>= 10) { 
4473     TEST_ASSERT(iterationCount 
== 6); 
         // Second pass: en_US_POSIX iterator, started from the end of the text.
4475     brkiterPOSIX
->setText(text
); 
4476     brkiterPOSIX
->last(); 
4478     while ( (offset 
= brkiterPOSIX
->previous()) != UBRK_DONE 
) { 
4480         rstatus 
= brkiterPOSIX
->getRuleStatus(); 
4481         (void)rstatus
;     // Suppress set but not used warning. 
4482         if (iterationCount 
>= 10) { 
4486     TEST_ASSERT(iterationCount 
== 6); 
4491 //  TestDebug    -  A place-holder test for debugging purposes. 
4492 //                  For putting in fragments of other tests that can be invoked 
4493 //                  for tracing  without a lot of unwanted extra stuff happening. 
//
//                  As written it creates a sentence break iterator for the default
//                  locale, prints the result of isBoundary(8), and then prints
//                  position / rule status pairs while stepping backwards with
//                  previous().  Output goes to stdout through printf() rather
//                  than through the test framework's logging calls.
4495 void RBBITest::TestDebug(void) { 
4497     UErrorCode   status 
= U_ZERO_ERROR
; 
         // The commented-out lines are alternative iterators to switch to by hand.
4501     RuleBasedBreakIterator
* bi 
= 
4502        // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 
4503        // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status); 
4504        (RuleBasedBreakIterator 
*)BreakIterator::createSentenceInstance(Locale::getDefault(), status
); 
4505     UnicodeString 
s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e"); 
4506     // UnicodeString s("Aaa.  Bcd"); 
4509     UBool r 
= bi
->isBoundary(8); 
4510     printf("%s", r
?"true":"false"); 
4514         // ruleStatus = bi->getRuleStatus(); 
             // NOTE(review): with the assignment above commented out, the origin of
             // the ruleStatus value printed here is not visible in this listing.
4515         printf("%d\t%d\n", pos
, ruleStatus
); 
4516         pos 
= bi
->previous(); 
4517     } while (pos 
!= BreakIterator::DONE
); 
// TestProperties
//   Guards an assumption about the Unicode data: the set [:GCB=Prepend:] is
//   expected to be empty.  When it is not, the message below is emitted (the
//   reporting call itself is not visible in this listing), telling the
//   maintainer to update char.txt and invert this test.
4521 void RBBITest::TestProperties() { 
4522     UErrorCode errorCode 
= U_ZERO_ERROR
; 
4523     UnicodeSet 
prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode
); 
4524     if (!prependSet
.isEmpty()) { 
4526             "[:GCB=Prepend:] is not empty any more. " 
4527             "Uncomment relevant lines in source/data/brkitr/char.txt and " 
4528             "change this test to the opposite condition."); 
4532 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */