1 /********************************************************************
3 * Copyright (c) 1999-2016, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6 /************************************************************************
7 * Date Name Description
8 * 12/15/99 Madhu Creation.
9 * 01/12/2000 Madhu Updated for changed API and added new tests
10 ************************************************************************/
12 #include "unicode/utypes.h"
13 #if !UCONFIG_NO_BREAK_ITERATION
19 #include "unicode/brkiter.h"
20 #include "unicode/localpointer.h"
21 #include "unicode/numfmt.h"
22 #include "unicode/rbbi.h"
23 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
24 #include "unicode/regex.h"
26 #include "unicode/schriter.h"
27 #include "unicode/uchar.h"
28 #include "unicode/utf16.h"
29 #include "unicode/ucnv.h"
30 #include "unicode/uniset.h"
31 #include "unicode/uscript.h"
32 #include "unicode/ustring.h"
33 #include "unicode/utext.h"
39 #include "utypeinfo.h" // for 'typeid' to work
43 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
44 #include "unicode/filteredbrk.h"
45 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
// TEST_ASSERT(x): when the expression x evaluates to false, report a failure through
// errln() with the current __FILE__ and __LINE__. The macro expands to a brace-enclosed
// block and contains no return, so execution continues after a failed assertion.
47 #define TEST_ASSERT(x) {if (!(x)) { \
48 errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
// TEST_ASSERT_SUCCESS(errcode): when U_FAILURE(errcode) is true, report the failure through
// errcheckln(), including __FILE__, __LINE__ and the error's name from u_errorName().
// Like TEST_ASSERT, it expands to a brace-enclosed block and does not return from the
// enclosing function; callers that must stop on failure test the status again themselves.
50 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
51 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
54 //---------------------------------------------
56 //---------------------------------------------
59 // Note: Before adding new tests to this file, check whether the desired test data can
60 // simply be added to the file testdata/rbbitest.txt. In most cases it can,
61 // it's much less work than writing a new test, diagnostic output in the event of failures
62 // is good, and the test data file is shared with ICU4J, so eventually the test
63 // will run there as well, without additional effort.
65 void RBBITest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* params
)
67 if (exec
) logln("TestSuite RuleBasedBreakIterator: ");
70 #if !UCONFIG_NO_FILE_IO
71 case 0: name
= "TestBug4153072";
72 if(exec
) TestBug4153072(); break;
74 case 0: name
= "skip";
78 case 1: name
= "skip";
80 case 2: name
= "TestStatusReturn";
81 if(exec
) TestStatusReturn(); break;
83 #if !UCONFIG_NO_FILE_IO
84 case 3: name
= "TestUnicodeFiles";
85 if(exec
) TestUnicodeFiles(); break;
86 case 4: name
= "TestEmptyString";
87 if(exec
) TestEmptyString(); break;
89 case 3: case 4: name
= "skip";
93 case 5: name
= "TestGetAvailableLocales";
94 if(exec
) TestGetAvailableLocales(); break;
96 case 6: name
= "TestGetDisplayName";
97 if(exec
) TestGetDisplayName(); break;
99 #if !UCONFIG_NO_FILE_IO
100 case 7: name
= "TestEndBehaviour";
101 if(exec
) TestEndBehaviour(); break;
102 case 8: case 9: case 10: name
= "skip";
104 case 11: name
= "TestWordBreaks";
105 if(exec
) TestWordBreaks(); break;
106 case 12: name
= "TestWordBoundary";
107 if(exec
) TestWordBoundary(); break;
108 case 13: name
= "TestLineBreaks";
109 if(exec
) TestLineBreaks(); break;
110 case 14: name
= "TestSentBreaks";
111 if(exec
) TestSentBreaks(); break;
112 case 15: name
= "TestExtended";
113 if(exec
) TestExtended(); break;
115 case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name
= "skip";
119 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
121 name
= "TestMonkey"; if(exec
) TestMonkey(params
); break;
124 name
= "skip"; break;
127 #if !UCONFIG_NO_FILE_IO
128 case 17: name
= "TestBug3818";
129 if(exec
) TestBug3818(); break;
131 case 17: name
= "skip";
135 case 18: name
= "skip";
137 case 19: name
= "TestDebug";
138 if(exec
) TestDebug(); break;
139 case 20: name
= "skip";
142 #if !UCONFIG_NO_FILE_IO
143 case 21: name
= "TestBug5775";
144 if (exec
) TestBug5775(); break;
146 case 21: name
= "skip";
150 case 22: name
= "TestBug9983";
151 if (exec
) TestBug9983(); break;
152 case 23: name
= "TestDictRules";
153 if (exec
) TestDictRules(); break;
154 case 24: name
= "TestBug5532";
155 if (exec
) TestBug5532(); break;
156 default: name
= ""; break; //needed to end loop
161 //---------------------------------------------------------------------------
163 // class BITestData Holds a set of Break iterator test data and results
165 // - the string data to be broken
166 // - a vector of the expected break positions.
167 // - a vector of source line numbers for the data,
168 // (to help see where errors occurred.)
169 // - The expected break tag values.
170 // - Vectors of actual break positions and tag values.
171 // - Functions for comparing actual with expected and
174 //----------------------------------------------------------------------------
177 UnicodeString fDataToBreak
;
178 UVector fExpectedBreakPositions
;
179 UVector fExpectedTags
;
181 UVector fActualBreakPositions
; // Test Results.
184 BITestData(UErrorCode
&status
);
185 void addDataChunk(const char *data
, int32_t tag
, int32_t lineNum
, UErrorCode status
);
186 void checkResults(const char *heading
, RBBITest
*test
);
187 void err(const char *heading
, RBBITest
*test
, int32_t expectedIdx
, int32_t actualIdx
);
194 BITestData::BITestData(UErrorCode
&status
)
195 : fExpectedBreakPositions(status
), fExpectedTags(status
), fLineNum(status
), fActualBreakPositions(status
),
201 // addDataChunk. Add a section (non-breaking) piece of data to the test data.
202 // The macro form collects the line number, which is helpful
203 // when tracking down failures.
205 // A null data item is inserted at the start of each test's data
206 // to put the starting zero into the data list. The position saved for
207 // each non-null item is its ending position.
// ADD_DATACHUNK(td, data, tag, status): forwards to td.addDataChunk(), supplying __LINE__
// of the invocation as the lineNum argument. The expansion already ends with a semicolon.
209 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status);
210 void BITestData::addDataChunk(const char *data
, int32_t tag
, int32_t lineNum
, UErrorCode status
) {
211 if (U_FAILURE(status
)) {return;}
213 fDataToBreak
.append(CharsToUnicodeString(data
));
215 fExpectedBreakPositions
.addElement(fDataToBreak
.length(), status
);
216 fExpectedTags
.addElement(tag
, status
);
217 fLineNum
.addElement(lineNum
, status
);
222 // checkResults. Compare the actual and expected break positions, report any differences.
224 void BITestData::checkResults(const char *heading
, RBBITest
*test
) {
225 int32_t expectedIndex
= 0;
226 int32_t actualIndex
= 0;
229 // If we've run through both the expected and actual results vectors, we're done.
230 // break out of the loop.
231 if (expectedIndex
>= fExpectedBreakPositions
.size() &&
232 actualIndex
>= fActualBreakPositions
.size()) {
237 if (expectedIndex
>= fExpectedBreakPositions
.size()) {
238 err(heading
, test
, expectedIndex
-1, actualIndex
);
243 if (actualIndex
>= fActualBreakPositions
.size()) {
244 err(heading
, test
, expectedIndex
, actualIndex
-1);
249 if (fActualBreakPositions
.elementAti(actualIndex
) != fExpectedBreakPositions
.elementAti(expectedIndex
)) {
250 err(heading
, test
, expectedIndex
, actualIndex
);
251 // Try to resync the positions of the indices, to avoid a rash of spurious errors.
252 if (fActualBreakPositions
.elementAti(actualIndex
) < fExpectedBreakPositions
.elementAti(expectedIndex
)) {
260 if (fActualTags
.elementAti(actualIndex
) != fExpectedTags
.elementAti(expectedIndex
)) {
261 test
->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",
262 heading
, fLineNum
.elementAt(expectedIndex
),
263 fExpectedTags
.elementAti(expectedIndex
), fActualTags
.elementAti(actualIndex
));
272 // err - An error was found. Report it, along with information about where the
273 // incorrectly broken test data appeared in the source file.
275 void BITestData::err(const char *heading
, RBBITest
*test
, int32_t expectedIdx
, int32_t actualIdx
)
277 int32_t expected
= fExpectedBreakPositions
.elementAti(expectedIdx
);
278 int32_t actual
= fActualBreakPositions
.elementAti(actualIdx
);
280 int32_t line
= fLineNum
.elementAti(expectedIdx
);
281 if (expectedIdx
> 0) {
282 // The line numbers are off by one because a premature break occurs somewhere
283 // within the previous item, rather than at the start of the current (expected) item.
284 // We want to report the offset of the unexpected break from the start of
285 // this previous item.
286 o
= actual
- fExpectedBreakPositions
.elementAti(expectedIdx
-1);
288 if (actual
< expected
) {
289 test
->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading
, o
, line
, actual
, expected
);
291 test
->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading
, line
, actual
, expected
);
296 void BITestData::clearResults() {
297 fActualBreakPositions
.removeAllElements();
298 fActualTags
.removeAllElements();
302 //--------------------------------------------------------------------------------------
304 // RBBITest constructor and destructor
306 //--------------------------------------------------------------------------------------
308 RBBITest::RBBITest() {
312 RBBITest::~RBBITest() {
315 //-----------------------------------------------------------------------------------
317 // Test for status {tag} return value from break rules.
318 // TODO: a more thorough test.
320 //-----------------------------------------------------------------------------------
321 void RBBITest::TestStatusReturn() {
322 UnicodeString
rulesString1("$Letters = [:L:];\n"
323 "$Numbers = [:N:];\n"
326 "Help\\ /me\\!{4};\n"
327 "[^$Letters $Numbers];\n"
328 "!.*;\n", -1, US_INV
);
329 UnicodeString testString1
= "abc123..abc Help me Help me!";
330 // 01234567890123456789012345678
331 int32_t bounds1
[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
332 int32_t brkStatus
[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
334 UErrorCode status
=U_ZERO_ERROR
;
335 UParseError parseError
;
337 LocalPointer
<BreakIterator
> bi(new RuleBasedBreakIterator(rulesString1
, parseError
, status
));
338 if(U_FAILURE(status
)) {
339 dataerrln("%s:%d error in break iterator construction - %s", __FILE__
, __LINE__
, u_errorName(status
));
344 bi
->setText(testString1
);
345 for (pos
=bi
->first(); pos
!= BreakIterator::DONE
; pos
=bi
->next()) {
346 if (pos
!= bounds1
[i
]) {
347 errln("%s:%d expected break at %d, got %d\n", __FILE__
, __LINE__
, bounds1
[i
], pos
);
351 int tag
= bi
->getRuleStatus();
352 if (tag
!= brkStatus
[i
]) {
353 errln("%s:%d break at %d, expected tag %d, got tag %d\n", __FILE__
, __LINE__
, pos
, brkStatus
[i
], tag
);
361 static void printStringBreaks(UText
*tstr
, int expected
[], int expectedCount
) {
362 UErrorCode status
= U_ZERO_ERROR
;
364 printf("code alpha extend alphanum type word sent line name\n");
365 int nextExpectedIndex
= 0;
366 utext_setNativeIndex(tstr
, 0);
367 for (int j
= 0; j
< utext_nativeLength(tstr
); j
=utext_getNativeIndex(tstr
)) {
368 if (nextExpectedIndex
< expectedCount
&& j
>= expected
[nextExpectedIndex
] ) {
369 printf("------------------------------------------------ %d\n", j
);
373 UChar32 c
= utext_next32(tstr
);
374 u_charName(c
, U_UNICODE_CHAR_NAME
, name
, 100, &status
);
375 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c
,
377 u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
),
379 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY
,
381 U_SHORT_PROPERTY_NAME
),
382 u_getPropertyValueName(UCHAR_WORD_BREAK
,
383 u_getIntPropertyValue(c
,
385 U_SHORT_PROPERTY_NAME
),
386 u_getPropertyValueName(UCHAR_SENTENCE_BREAK
,
387 u_getIntPropertyValue(c
,
388 UCHAR_SENTENCE_BREAK
),
389 U_SHORT_PROPERTY_NAME
),
390 u_getPropertyValueName(UCHAR_LINE_BREAK
,
391 u_getIntPropertyValue(c
,
393 U_SHORT_PROPERTY_NAME
),
399 static void printStringBreaks(const UnicodeString
&ustr
, int expected
[], int expectedCount
) {
400 UErrorCode status
= U_ZERO_ERROR
;
402 tstr
= utext_openConstUnicodeString(NULL
, &ustr
, &status
);
403 if (U_FAILURE(status
)) {
404 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status
));
407 printStringBreaks(tstr
, expected
, expectedCount
);
412 void RBBITest::TestBug3818() {
413 UErrorCode status
= U_ZERO_ERROR
;
415 // Four Thai words...
416 static const UChar thaiWordData
[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
417 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
418 UnicodeString
thaiStr(thaiWordData
);
420 BreakIterator
* bi
= BreakIterator::createWordInstance(Locale("th"), status
);
421 if (U_FAILURE(status
) || bi
== NULL
) {
422 errcheckln(status
, "Fail at file %s, line %d, status = %s", __FILE__
, __LINE__
, u_errorName(status
));
425 bi
->setText(thaiStr
);
427 int32_t startOfSecondWord
= bi
->following(1);
428 if (startOfSecondWord
!= 4) {
429 errln("Fail at file %s, line %d expected start of word at 4, got %d",
430 __FILE__
, __LINE__
, startOfSecondWord
);
432 startOfSecondWord
= bi
->following(0);
433 if (startOfSecondWord
!= 4) {
434 errln("Fail at file %s, line %d expected start of word at 4, got %d",
435 __FILE__
, __LINE__
, startOfSecondWord
);
440 //----------------------------------------------------------------------------
442 // generalIteratorTest Given a break iterator and a set of test data,
443 // Run the tests and report the results.
445 //----------------------------------------------------------------------------
446 void RBBITest::generalIteratorTest(RuleBasedBreakIterator
& bi
, BITestData
&td
)
449 bi
.setText(td
.fDataToBreak
);
451 testFirstAndNext(bi
, td
);
453 testLastAndPrevious(bi
, td
);
455 testFollowing(bi
, td
);
456 testPreceding(bi
, td
);
457 testIsBoundary(bi
, td
);
458 doMultipleSelectionTest(bi
, td
);
463 // testFirstAndNext. Run the iterator forwards in the obvious first(), next()
466 void RBBITest::testFirstAndNext(RuleBasedBreakIterator
& bi
, BITestData
&td
)
468 UErrorCode status
= U_ZERO_ERROR
;
473 logln("Test first and next");
474 bi
.setText(td
.fDataToBreak
);
477 for (p
=bi
.first(); p
!=RuleBasedBreakIterator::DONE
; p
=bi
.next()) {
478 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
479 tag
= bi
.getRuleStatus();
480 td
.fActualTags
.addElement(tag
, status
);
482 // If the iterator is not making forward progress, stop.
483 // No need to raise an error here, it'll be detected in the normal check of results.
488 td
.checkResults("testFirstAndNext", this);
493 // testLastAndPrevious. Run the iterator backwards, starting with last().
495 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator
& bi
, BITestData
&td
)
497 UErrorCode status
= U_ZERO_ERROR
;
499 int32_t lastP
= 0x7ffffffe;
502 logln("Test last and previous");
503 bi
.setText(td
.fDataToBreak
);
506 for (p
=bi
.last(); p
!=RuleBasedBreakIterator::DONE
; p
=bi
.previous()) {
507 // Save break position. Insert it at start of vector of results, shoving
508 // already-saved results further towards the end.
509 td
.fActualBreakPositions
.insertElementAt(p
, 0, status
);
510 // bi.previous(); // TODO: Why does this fix things up????
512 tag
= bi
.getRuleStatus();
513 td
.fActualTags
.insertElementAt(tag
, 0, status
);
515 // If the iterator is not making progress, stop.
516 // No need to raise an error here, it'll be detected in the normal check of results.
521 td
.checkResults("testLastAndPrevious", this);
525 void RBBITest::testFollowing(RuleBasedBreakIterator
& bi
, BITestData
&td
)
527 UErrorCode status
= U_ZERO_ERROR
;
530 int32_t lastP
= -2; // A value that will never be returned as a break position.
531 // cannot be -1; that is returned for DONE.
534 logln("testFollowing():");
535 bi
.setText(td
.fDataToBreak
);
538 // Save the starting point, since we won't get that out of following.
540 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
541 tag
= bi
.getRuleStatus();
542 td
.fActualTags
.addElement(tag
, status
);
544 for (i
= 0; i
<= td
.fDataToBreak
.length()+1; i
++) {
547 if (p
== RuleBasedBreakIterator::DONE
) {
550 // We've reached a new break position. Save it.
551 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
552 tag
= bi
.getRuleStatus();
553 td
.fActualTags
.addElement(tag
, status
);
557 // The loop normally exits by means of the break in the middle.
558 // Make sure that the index was at the correct position for the break iterator to have
560 if (i
!= td
.fDataToBreak
.length()) {
561 errln("testFollowing(): iterator returned DONE prematurely.");
564 // Full check of all results.
565 td
.checkResults("testFollowing", this);
570 void RBBITest::testPreceding(RuleBasedBreakIterator
& bi
, BITestData
&td
) {
571 UErrorCode status
= U_ZERO_ERROR
;
574 int32_t lastP
= 0x7ffffffe;
577 logln("testPreceding():");
578 bi
.setText(td
.fDataToBreak
);
582 td
.fActualBreakPositions
.addElement(p
, status
);
583 tag
= bi
.getRuleStatus();
584 td
.fActualTags
.addElement(tag
, status
);
586 for (i
= td
.fDataToBreak
.length(); i
>=-1; i
--) {
589 if (p
== RuleBasedBreakIterator::DONE
) {
592 // We've reached a new break position. Save it.
593 td
.fActualBreakPositions
.insertElementAt(p
, 0, status
);
595 tag
= bi
.getRuleStatus();
596 td
.fActualTags
.insertElementAt(tag
, 0, status
);
599 // The loop normally exits by means of the break in the middle.
600 // Make sure that the index was at the correct position for the break iterator to have
603 errln("testPreceding(): iterator returned DONE prematurely.");
606 // Full check of all results.
607 td
.checkResults("testPreceding", this);
612 void RBBITest::testIsBoundary(RuleBasedBreakIterator
& bi
, BITestData
&td
) {
613 UErrorCode status
= U_ZERO_ERROR
;
617 logln("testIsBoundary():");
618 bi
.setText(td
.fDataToBreak
);
621 for (i
= 0; i
<= td
.fDataToBreak
.length(); i
++) {
622 if (bi
.isBoundary(i
)) {
623 td
.fActualBreakPositions
.addElement(i
, status
); // Save result.
624 tag
= bi
.getRuleStatus();
625 td
.fActualTags
.addElement(tag
, status
);
628 td
.checkResults("testIsBoundary: ", this);
633 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator
& iterator
, BITestData
&td
)
635 iterator
.setText(td
.fDataToBreak
);
637 RuleBasedBreakIterator
* testIterator
=(RuleBasedBreakIterator
*)iterator
.clone();
638 int32_t offset
= iterator
.first();
642 logln("doMultipleSelectionTest text of length: %d", td
.fDataToBreak
.length());
644 if (*testIterator
!= iterator
)
645 errln("clone() or operator!= failed: two clones compared unequal");
648 testOffset
= testIterator
->first();
649 testOffset
= testIterator
->next(count
);
650 if (offset
!= testOffset
)
651 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count
+ ", next(n) returned " + testOffset
+ " and next() had " + offset
);
653 if (offset
!= RuleBasedBreakIterator::DONE
) {
655 offset
= iterator
.next();
657 if (offset
!= RuleBasedBreakIterator::DONE
&& *testIterator
== iterator
) {
658 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count
, offset
);
659 if (count
> 10000 || offset
== -1) {
660 errln("operator== failed too many times. Stopping test.");
662 errln("Does (RuleBasedBreakIterator::DONE == -1)?");
668 } while (offset
!= RuleBasedBreakIterator::DONE
);
670 // now do it backwards...
671 offset
= iterator
.last();
675 testOffset
= testIterator
->last();
676 testOffset
= testIterator
->next(count
); // next() with a negative arg is same as previous
677 if (offset
!= testOffset
)
678 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count
+ ", next(n) returned " + testOffset
+ " and next() had " + offset
);
680 if (offset
!= RuleBasedBreakIterator::DONE
) {
682 offset
= iterator
.previous();
684 } while (offset
!= RuleBasedBreakIterator::DONE
);
690 //---------------------------------------------
694 //---------------------------------------------
695 void RBBITest::TestEmptyString()
697 UnicodeString text
= "";
698 UErrorCode status
= U_ZERO_ERROR
;
700 BITestData
x(status
);
701 ADD_DATACHUNK(x
, "", 0, status
); // Break at start of data
702 RuleBasedBreakIterator
* bi
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale::getDefault(), status
);
703 if (U_FAILURE(status
))
705 errcheckln(status
, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status
));
708 generalIteratorTest(*bi
, x
);
712 void RBBITest::TestGetAvailableLocales()
714 int32_t locCount
= 0;
715 const Locale
* locList
= BreakIterator::getAvailableLocales(locCount
);
718 dataerrln("getAvailableLocales() returned an empty list!");
719 // Just make sure that it's returning good memory.
721 for (i
= 0; i
< locCount
; ++i
) {
722 logln(locList
[i
].getName());
726 //Testing the BreakIterator::getDisplayName() function
727 void RBBITest::TestGetDisplayName()
729 UnicodeString result
;
731 BreakIterator::getDisplayName(Locale::getUS(), result
);
732 if (Locale::getDefault() == Locale::getUS() && result
!= "English (United States)")
733 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
736 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result
);
737 if (result
!= "French (France)")
738 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
745 void RBBITest::TestEndBehaviour()
747 UErrorCode status
= U_ZERO_ERROR
;
748 UnicodeString
testString("boo.");
749 BreakIterator
*wb
= BreakIterator::createWordInstance(Locale::getDefault(), status
);
750 if (U_FAILURE(status
))
752 errcheckln(status
, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status
));
755 wb
->setText(testString
);
757 if (wb
->first() != 0)
758 errln("Didn't get break at beginning of string.");
760 errln("Didn't get break before period in \"boo.\"");
761 if (wb
->current() != 4 && wb
->next() != 4)
762 errln("Didn't get break at end of string.");
768 void RBBITest::TestBug4153072() {
769 UErrorCode status
= U_ZERO_ERROR
;
770 BreakIterator
*iter
= BreakIterator::createWordInstance(Locale::getDefault(), status
);
771 if (U_FAILURE(status
))
773 errcheckln(status
, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status
));
776 UnicodeString
str("...Hello, World!...");
778 int32_t end
= str
.length() - 3;
781 StringCharacterIterator
* textIterator
= new StringCharacterIterator(str
, begin
, end
, begin
);
782 iter
->adoptText(textIterator
);
784 // Note: with the switch to UText, there is no way to restrict the
785 // iteration range to begin at an index other than zero.
786 // String character iterators created with a non-zero bound are
787 // treated by RBBI as being empty.
788 for (index
= -1; index
< begin
+ 1; ++index
) {
789 onBoundary
= iter
->isBoundary(index
);
790 if (index
== 0? !onBoundary
: onBoundary
) {
791 errln((UnicodeString
)"Didn't handle isBoundary correctly with offset = " + index
+
792 " and begin index = " + begin
);
800 // Test for problem reported by Ashok Matoria on 9 July 2007
801 // One.<kSoftHyphen><kSpace>Two.
803 // Sentence break at start (0) and then on calling next() it breaks at
804 // 'T' of "Two". Now, at this point if I do next() and
805 // then previous(), it breaks at <kSoftHyphen> instead of 'T' of "Two".
807 void RBBITest::TestBug5775() {
808 UErrorCode status
= U_ZERO_ERROR
;
809 BreakIterator
*bi
= BreakIterator::createSentenceInstance(Locale::getEnglish(), status
);
810 TEST_ASSERT_SUCCESS(status
);
811 if (U_FAILURE(status
)) {
814 // Check for status first for better handling of no data errors.
815 TEST_ASSERT(bi
!= NULL
);
820 UnicodeString
s("One.\\u00ad Two.", -1, US_INV
);
824 int pos
= bi
->next();
825 TEST_ASSERT(pos
== 6);
827 TEST_ASSERT(pos
== 10);
828 pos
= bi
->previous();
829 TEST_ASSERT(pos
== 6);
835 //------------------------------------------------------------------------------
837 // RBBITest::Extended Run RBBI Tests from an external test data file
839 //------------------------------------------------------------------------------
842 BreakIterator
*bi
; // Break iterator is set while parsing test source.
843 // Changed out whenever test data changes break type.
845 UnicodeString dataToBreak
; // Data that is built up while parsing the test.
846 UVector32
*expectedBreaks
; // Expected break positions, matches dataToBreak UnicodeString.
847 UVector32
*srcLine
; // Positions in source file, indexed same as dataToBreak.
850 UText
*textToBreak
; // UText, could be UTF8 or UTF16.
851 UVector32
*textMap
; // Map from UTF-16 dataToBreak offsets to UText offsets.
852 CharString utf8String
; // UTF-8 form of text to break.
854 TestParams(UErrorCode
&status
) : dataToBreak() {
856 expectedBreaks
= new UVector32(status
);
857 srcLine
= new UVector32(status
);
858 srcCol
= new UVector32(status
);
860 textMap
= new UVector32(status
);
865 delete expectedBreaks
;
868 utext_close(textToBreak
);
872 int32_t getSrcLine(int32_t bp
);
873 int32_t getExpectedBreak(int32_t bp
);
874 int32_t getSrcCol(int32_t bp
);
876 void setUTF16(UErrorCode
&status
);
877 void setUTF8(UErrorCode
&status
);
880 // Append a UnicodeString to a CharString with UTF-8 encoding.
881 // Substitute any invalid chars.
882 // Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
883 static void CharStringAppend(CharString
&dest
, const UnicodeString
&src
, UErrorCode
&status
) {
884 if (U_FAILURE(status
)) {
888 u_strToUTF8WithSub(NULL
, 0, &utf8Length
, // Output Buffer, NULL for preflight.
889 src
.getBuffer(), src
.length(), // UTF-16 data
890 0xfffd, NULL
, // Substitution char, number of subs.
892 if (U_FAILURE(status
) && status
!= U_BUFFER_OVERFLOW_ERROR
) {
895 status
= U_ZERO_ERROR
;
897 char *buffer
= dest
.getAppendBuffer(utf8Length
, utf8Length
, capacity
, status
);
898 u_strToUTF8WithSub(buffer
, utf8Length
, NULL
,
899 src
.getBuffer(), src
.length(),
900 0xfffd, NULL
, &status
);
901 dest
.append(buffer
, utf8Length
, status
);
905 void TestParams::setUTF16(UErrorCode
&status
) {
906 textToBreak
= utext_openUnicodeString(textToBreak
, &dataToBreak
, &status
);
907 textMap
->removeAllElements();
908 for (int32_t i
=0; i
<dataToBreak
.length(); i
++) {
909 if (i
== dataToBreak
.getChar32Start(i
)) {
910 textMap
->addElement(i
, status
);
912 textMap
->addElement(-1, status
);
915 textMap
->addElement(dataToBreak
.length(), status
);
916 U_ASSERT(dataToBreak
.length() + 1 == textMap
->size());
920 void TestParams::setUTF8(UErrorCode
&status
) {
921 if (U_FAILURE(status
)) {
925 CharStringAppend(utf8String
, dataToBreak
, status
);
926 textToBreak
= utext_openUTF8(textToBreak
, utf8String
.data(), utf8String
.length(), &status
);
927 if (U_FAILURE(status
)) {
931 textMap
->removeAllElements();
932 int32_t utf16Index
= 0;
934 textMap
->addElement(utf16Index
, status
);
935 UChar32 c32
= utext_current32(textToBreak
);
939 utf16Index
+= U16_LENGTH(c32
);
940 utext_next32(textToBreak
);
941 while (textMap
->size() < utext_getNativeIndex(textToBreak
)) {
942 textMap
->addElement(-1, status
);
945 U_ASSERT(utext_nativeLength(textToBreak
) + 1 == textMap
->size());
949 int32_t TestParams::getSrcLine(int bp
) {
950 if (bp
>= textMap
->size()) {
951 bp
= textMap
->size() - 1;
954 for(; bp
>= 0 ; --bp
) {
955 // Move to a character boundary if we are not on one already.
956 i
= textMap
->elementAti(bp
);
961 return srcLine
->elementAti(i
);
965 int32_t TestParams::getExpectedBreak(int bp
) {
966 if (bp
>= textMap
->size()) {
969 int32_t i
= textMap
->elementAti(bp
);
972 retVal
= expectedBreaks
->elementAti(i
);
978 int32_t TestParams::getSrcCol(int bp
) {
979 if (bp
>= textMap
->size()) {
980 bp
= textMap
->size() - 1;
983 for(; bp
>= 0; --bp
) {
984 // Move bp to a character boundary if we are not on one already.
985 i
= textMap
->elementAti(bp
);
990 return srcCol
->elementAti(i
);
994 void RBBITest::executeTest(TestParams
*t
, UErrorCode
&status
) {
999 TEST_ASSERT_SUCCESS(status
);
1000 if (U_FAILURE(status
)) {
1004 if (t
->bi
== NULL
) {
1008 t
->bi
->setText(t
->textToBreak
, status
);
1010 // Run the iterator forward
1013 for (bp
= t
->bi
->first(); bp
!= BreakIterator::DONE
; bp
= t
->bi
->next()) {
1015 // Fail for lack of forward progress.
1016 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
1017 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
));
1021 // Check that we didn't miss an expected break between the last one
1023 for (i
=prevBP
+1; i
<bp
; i
++) {
1024 if (t
->getExpectedBreak(i
) != 0) {
1025 int expected
[] = {0, i
};
1026 printStringBreaks(t
->dataToBreak
, expected
, 2);
1027 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1028 i
, t
->getSrcLine(i
), t
->getSrcCol(i
));
1032 // Check that the break we did find was expected
1033 if (t
->getExpectedBreak(bp
) == 0) {
1034 int expected
[] = {0, bp
};
1035 printStringBreaks(t
->textToBreak
, expected
, 2);
1036 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1037 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
));
1039 // The break was expected.
1040 // Check that the {nnn} tag value is correct.
1041 int32_t expectedTagVal
= t
->getExpectedBreak(bp
);
1042 if (expectedTagVal
== -1) {
1045 int32_t line
= t
->getSrcLine(bp
);
1046 int32_t rs
= ((RuleBasedBreakIterator
*)t
->bi
)->getRuleStatus();
1047 if (rs
!= expectedTagVal
) {
1048 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
1049 " Actual, Expected status = %4d, %4d",
1050 bp
, line
, t
->getSrcCol(bp
), rs
, expectedTagVal
);
1057 // Verify that there were no missed expected breaks after the last one found
1058 for (i
=prevBP
+1; i
<utext_nativeLength(t
->textToBreak
); i
++) {
1059 if (t
->getExpectedBreak(i
) != 0) {
1060 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1061 i
, t
->getSrcLine(i
), t
->getSrcCol(i
));
1066 // Run the iterator backwards, verify that the same breaks are found.
1068 prevBP
= utext_nativeLength(t
->textToBreak
)+2; // start with a phony value for the last break pos seen.
1069 for (bp
= t
->bi
->last(); bp
!= BreakIterator::DONE
; bp
= t
->bi
->previous()) {
1071 // Fail for lack of progress.
1072 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
1073 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
));
1077 // Check that we didn't miss an expected break between the last one
1078 // and this one. (UVector returns zeros for index out of bounds.)
1079 for (i
=prevBP
-1; i
>bp
; i
--) {
1080 if (t
->getExpectedBreak(i
) != 0) {
1081 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1082 i
, t
->getSrcLine(i
), t
->getSrcCol(i
));
1086 // Check that the break we did find was expected
1087 if (t
->getExpectedBreak(bp
) == 0) {
1088 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1089 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
));
1091 // The break was expected.
1092 // Check that the {nnn} tag value is correct.
1093 int32_t expectedTagVal
= t
->getExpectedBreak(bp
);
1094 if (expectedTagVal
== -1) {
1097 int line
= t
->getSrcLine(bp
);
1098 int32_t rs
= t
->bi
->getRuleStatus();
1099 if (rs
!= expectedTagVal
) {
1100 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
1101 " Actual, Expected status = %4d, %4d",
1102 bp
, line
, t
->getSrcCol(bp
), rs
, expectedTagVal
);
1109 // Verify that there were no missed breaks prior to the last one found
1110 for (i
=prevBP
-1; i
>=0; i
--) {
1111 if (t
->getExpectedBreak(i
) != 0) {
1112 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1113 i
, t
->getSrcLine(i
), t
->getSrcCol(i
));
1117 // Check isBoundary()
1118 for (i
=0; i
< utext_nativeLength(t
->textToBreak
); i
++) {
1119 UBool boundaryExpected
= (t
->getExpectedBreak(i
) != 0);
1120 UBool boundaryFound
= t
->bi
->isBoundary(i
);
1121 if (boundaryExpected
!= boundaryFound
) {
1122 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
1123 " Expected, Actual= %s, %s",
1124 i
, t
->getSrcLine(i
), t
->getSrcCol(i
),
1125 boundaryExpected
? "true":"false", boundaryFound
? "true" : "false");
1129 // Check following()
1130 for (i
=0; i
< utext_nativeLength(t
->textToBreak
); i
++) {
1131 int32_t actualBreak
= t
->bi
->following(i
);
1132 int32_t expectedBreak
= BreakIterator::DONE
;
1133 for (int32_t j
=i
+1; j
<= utext_nativeLength(t
->textToBreak
); j
++) {
1134 if (t
->getExpectedBreak(j
) != 0) {
1139 if (expectedBreak
!= actualBreak
) {
1140 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
1141 " Expected, Actual= %d, %d",
1142 i
, t
->getSrcLine(i
), t
->getSrcCol(i
), expectedBreak
, actualBreak
);
1146 // Check preceding()
1147 for (i
=utext_nativeLength(t
->textToBreak
); i
>=0; i
--) {
1148 int32_t actualBreak
= t
->bi
->preceding(i
);
1149 int32_t expectedBreak
= BreakIterator::DONE
;
1151 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
1152 // preceding(trailing byte) will return the index of some preceding code point,
1153 // not the lead byte of the current code point, even though that has a smaller index.
1154 // Therefore, start looking at the expected break data not at i-1, but at
1155 // the start of code point index - 1.
1156 utext_setNativeIndex(t
->textToBreak
, i
);
1157 int32_t j
= utext_getNativeIndex(t
->textToBreak
) - 1;
1158 for (; j
>= 0; j
--) {
1159 if (t
->getExpectedBreak(j
) != 0) {
1164 if (expectedBreak
!= actualBreak
) {
1165 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
1166 " Expected, Actual= %d, %d",
1167 i
, t
->getSrcLine(i
), t
->getSrcCol(i
), expectedBreak
, actualBreak
);
// RBBITest::TestExtended
//   Data driven break iterator test. Reads rbbitst.txt from the source test data
//   directory and scans it one character at a time with a small state machine
//   (PARSE_TAG, PARSE_DATA, PARSE_NUM, PARSE_COMMENT). Tags select the iterator
//   type and locale; each data section is collected into tp together with the
//   expected break positions and the file line/column of every data character,
//   and is then handed to executeTest().
//   Compiled only when regular expressions are configured in.
1173 void RBBITest::TestExtended() {
1174 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1175 UErrorCode status
= U_ZERO_ERROR
;
1178 UnicodeString rules
;
1179 TestParams
tp(status
);
// Recognizes a locale tag in the test file; capture group 1 is the locale name.
1181 RegexMatcher
localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status
);
1182 if (U_FAILURE(status
)) {
1183 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__
, __LINE__
, u_errorName(status
));
1188 // Open and read the test data file.
1190 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
1191 char testFileName
[1000];
// NOTE(review): this length check covers only the directory name. The file name
// appended by strcat() below is not counted, so a directory path close to the
// buffer size could still overflow testFileName - confirm and tighten.
1192 if (testDataDirectory
== NULL
|| strlen(testDataDirectory
) >= sizeof(testFileName
)) {
1193 errln("Can't open test data. Path too long.");
1196 strcpy(testFileName
, testDataDirectory
);
1197 strcat(testFileName
, "rbbitst.txt");
// Read the whole file, converting from UTF-8; len receives the length in UChars.
1200 UChar
*testFile
= ReadAndConvertFile(testFileName
, len
, "UTF-8", status
);
1201 if (U_FAILURE(status
)) {
1202 return; /* something went wrong, error already output */
1206 bool skipTest
= false; // Skip this test?
1209 // Put the test data into a UnicodeString
// (read-only aliasing constructor: testString refers to the testFile buffer)
1211 UnicodeString
testString(FALSE
, testFile
, len
);
// Parser state. savedState is the state restored when a comment ends.
1219 parseState
= PARSE_TAG
;
1221 EParseState savedState
= PARSE_TAG
;
// Characters with special meaning in the test file syntax.
1223 static const UChar CH_LF
= 0x0a;
1224 static const UChar CH_CR
= 0x0d;
1225 static const UChar CH_HASH
= 0x23;
1226 /*static const UChar CH_PERIOD = 0x2e;*/
1227 static const UChar CH_LT
= 0x3c;
1228 static const UChar CH_GT
= 0x3e;
1229 static const UChar CH_BACKSLASH
= 0x5c;
1230 static const UChar CH_BULLET
= 0x2022;
1232 int32_t lineNum
= 1;
1233 int32_t colStart
= 0;
1235 int32_t charIdx
= 0;
1237 int32_t tagValue
= 0; // The numeric value of a <nnn> tag.
// Main scan loop. The loop header has no increment expression; charIdx is
// advanced inside the body. status is reset at the top of every iteration.
1239 for (charIdx
= 0; charIdx
< len
; ) {
1240 status
= U_ZERO_ERROR
;
1241 UChar c
= testString
.charAt(charIdx
);
// Line ending handling: a CR followed by LF counts as a single line end.
1243 if (c
== CH_CR
&& charIdx
<len
&& testString
.charAt(charIdx
) == CH_LF
) {
1244 // treat CRLF as a unit
1248 if (c
== CH_LF
|| c
== CH_CR
) {
// Column reported in error messages and stored in the srcCol map.
1252 column
= charIdx
- colStart
+ 1;
1254 switch (parseState
) {
// Comment state: a line end returns to whichever state was saved on entry.
1256 if (c
== 0x0a || c
== 0x0d) {
1257 parseState
= savedState
;
// Enter comment state from tag parsing, remembering to return to PARSE_TAG.
1264 parseState
= PARSE_COMMENT
;
1265 savedState
= PARSE_TAG
;
1268 if (u_isUWhiteSpace(c
)) {
// Iterator type tags. Each comparison starts at charIdx-1, and on a match a break
// iterator of that kind is created for the current locale and stored in tp.bi.
1271 if (testString
.compare(charIdx
-1, 6, "<word>") == 0) {
1273 tp
.bi
= BreakIterator::createWordInstance(locale
, status
);
1278 if (testString
.compare(charIdx
-1, 6, "<char>") == 0) {
1280 tp
.bi
= BreakIterator::createCharacterInstance(locale
, status
);
1285 if (testString
.compare(charIdx
-1, 6, "<line>") == 0) {
1287 tp
.bi
= BreakIterator::createLineInstance(locale
, status
);
1292 if (testString
.compare(charIdx
-1, 6, "<sent>") == 0) {
1294 tp
.bi
= BreakIterator::createSentenceInstance(locale
, status
);
1299 if (testString
.compare(charIdx
-1, 7, "<title>") == 0) {
1301 tp
.bi
= BreakIterator::createTitleInstance(locale
, status
);
1306 // <locale loc_name>
1307 localeMatcher
.reset(testString
);
1308 if (localeMatcher
.lookingAt(charIdx
-1, status
)) {
1309 UnicodeString localeName
= localeMatcher
.group(1, status
);
1310 char localeName8
[100];
1311 localeName
.extract(0, localeName
.length(), localeName8
, sizeof(localeName8
), 0);
1312 locale
= Locale::createFromName(localeName8
);
// Step over the rest of the matched tag text.
1313 charIdx
+= localeMatcher
.group(0, status
).length() - 1;
1314 TEST_ASSERT_SUCCESS(status
);
// Start of a data section: switch to data parsing and discard the text,
// expected breaks and source position maps left from the previous section.
1317 if (testString
.compare(charIdx
-1, 6, "<data>") == 0) {
1318 parseState
= PARSE_DATA
;
1320 tp
.dataToBreak
= "";
1321 tp
.expectedBreaks
->removeAllElements();
1322 tp
.srcCol
->removeAllElements();
1323 tp
.srcLine
->removeAllElements();
// Nothing recognizable in tag state: report the line and abandon the file.
1327 errln("line %d: Tag expected in test file.", lineNum
);
1328 parseState
= PARSE_COMMENT
;
1329 savedState
= PARSE_DATA
;
1330 goto end_test
; // Stop the test.
// A bullet marks an expected break that carries no specific tag value. It is
// recorded as -1 at the index equal to the current length of the data.
1335 if (c
== CH_BULLET
) {
1336 int32_t breakIdx
= tp
.dataToBreak
.length();
1337 tp
.expectedBreaks
->setSize(breakIdx
+1);
1338 tp
.expectedBreaks
->setElementAt(-1, breakIdx
);
1339 tp
.srcLine
->setSize(breakIdx
+1);
1340 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1341 tp
.srcCol
->setSize(breakIdx
+1);
1342 tp
.srcCol
->setElementAt(column
, breakIdx
);
// End of the data section: finish the position maps and run the test case.
1346 if (testString
.compare(charIdx
-1, 7, "</data>") == 0) {
1347 // Add final entry to mappings from break location to source file position.
1348 // Need one extra because last break position returned is after the
1349 // last char in the data, not at the last char.
1350 tp
.srcLine
->addElement(lineNum
, status
);
1351 tp
.srcCol
->addElement(column
, status
);
1353 parseState
= PARSE_TAG
;
// First pass: run the collected test case over UTF-16 text.
1358 status
= U_ZERO_ERROR
;
1359 tp
.setUTF16(status
);
1360 executeTest(&tp
, status
);
1361 TEST_ASSERT_SUCCESS(status
);
1363 // Run again, this time with UTF-8 text wrapped in a UText.
1364 status
= U_ZERO_ERROR
;
1366 TEST_ASSERT_SUCCESS(status
);
1367 executeTest(&tp
, status
);
1372 if (testString
.compare(charIdx
-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1373 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1374 // Get the code point from the name and insert it into the test data.
1375 // (Damn, no API takes names in Unicode !!!
1376 // we've got to take it back to char *)
// The name runs from charIdx+2 up to, but not including, the closing brace.
1377 int32_t nameEndIdx
= testString
.indexOf((UChar
)0x7d/*'}'*/, charIdx
);
1378 int32_t nameLength
= nameEndIdx
- (charIdx
+2);
1379 char charNameBuf
[200];
1380 UChar32 theChar
= -1;
1381 if (nameEndIdx
!= -1) {
// NOTE(review): this status shadows the function level status, so a failed
// name lookup is not seen by the status check at the bottom of the loop.
1382 UErrorCode status
= U_ZERO_ERROR
;
1383 testString
.extract(charIdx
+2, nameLength
, charNameBuf
, sizeof(charNameBuf
));
1384 charNameBuf
[sizeof(charNameBuf
)-1] = 0;
1385 theChar
= u_charFromName(U_UNICODE_CHAR_NAME
, charNameBuf
, &status
);
1386 if (U_FAILURE(status
)) {
1390 if (theChar
== -1) {
1391 errln("Error in named character in test file at line %d, col %d",
1394 // Named code point was recognized. Insert it
1395 // into the test data.
1396 tp
.dataToBreak
.append(theChar
);
// Add position entries until the maps are as long as the data; a supplementary
// code point occupies two UChars and therefore gets two entries.
1397 while (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1398 tp
.srcLine
->addElement(lineNum
, status
);
1399 tp
.srcCol
->addElement(column
, status
);
// Continue scanning just after the closing brace of the name.
1402 if (nameEndIdx
> charIdx
) {
1403 charIdx
= nameEndIdx
+1;
// An empty tag is handled like a bullet: expected break recorded as -1.
1412 if (testString
.compare(charIdx
-1, 2, "<>") == 0) {
1414 int32_t breakIdx
= tp
.dataToBreak
.length();
1415 tp
.expectedBreaks
->setSize(breakIdx
+1);
1416 tp
.expectedBreaks
->setElementAt(-1, breakIdx
);
1417 tp
.srcLine
->setSize(breakIdx
+1);
1418 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1419 tp
.srcCol
->setSize(breakIdx
+1);
1420 tp
.srcCol
->setElementAt(column
, breakIdx
);
// Switch to collecting the digits of a numeric break tag.
1426 parseState
= PARSE_NUM
;
// A hash sign at this column starts a comment inside a data section; data
// parsing resumes at the end of the line.
1430 if (c
== CH_HASH
&& column
==3) { // TODO: why is column off so far?
1431 parseState
= PARSE_COMMENT
;
1432 savedState
= PARSE_DATA
;
1436 if (c
== CH_BACKSLASH
) {
1437 // Check for \ at end of line, a line continuation.
1438 // Advance over (discard) the newline
1439 UChar32 cp
= testString
.char32At(charIdx
);
1440 if (cp
== CH_CR
&& charIdx
<len
&& testString
.charAt(charIdx
+1) == CH_LF
) {
1442 // Need an extra increment of the input ptr to move over both of them
1445 if (cp
== CH_LF
|| cp
== CH_CR
) {
1452 // Let unescape handle the back slash.
1453 cp
= testString
.unescapeAt(charIdx
);
1455 // Escape sequence was recognized. Insert the char
1456 // into the test data.
1457 tp
.dataToBreak
.append(cp
);
1458 while (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1459 tp
.srcLine
->addElement(lineNum
, status
);
1460 tp
.srcCol
->addElement(column
, status
);
1466 // Not a recognized backslash escape sequence.
1467 // Take the next char as a literal.
1468 // TODO: Should this be an error?
1469 c
= testString
.charAt(charIdx
);
1470 charIdx
= testString
.moveIndex32(charIdx
, 1);
1473 // Normal, non-escaped data char.
1474 tp
.dataToBreak
.append(c
);
1476 // Save the mapping from offset in the data to line/column numbers in
1477 // the original input file. Will be used for better error messages only.
1478 // If there's an expected break before this char, the slot in the mapping
1479 // vector will already be set for this char; don't overwrite it.
1480 if (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1481 tp
.srcLine
->addElement(lineNum
, status
);
1482 tp
.srcCol
->addElement(column
, status
);
1488 // We are parsing an expected numeric tag value, like <1234>,
1489 // within a chunk of data.
1490 if (u_isUWhiteSpace(c
)) {
1495 // Finished the number. Add the info to the expected break data,
1496 // and switch parse state back to doing plain data.
1497 parseState
= PARSE_DATA
;
1498 if (tagValue
== 0) {
1501 int32_t breakIdx
= tp
.dataToBreak
.length();
1502 tp
.expectedBreaks
->setSize(breakIdx
+1);
1503 tp
.expectedBreaks
->setElementAt(tagValue
, breakIdx
);
1504 tp
.srcLine
->setSize(breakIdx
+1);
1505 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1506 tp
.srcCol
->setSize(breakIdx
+1);
1507 tp
.srcCol
->setElementAt(column
, breakIdx
);
// Accumulate the next decimal digit of the tag value.
1512 tagValue
= tagValue
*10 + u_charDigitValue(c
);
1516 errln("Syntax Error in test file at line %d, col %d",
1518 parseState
= PARSE_COMMENT
;
1519 goto end_test
; // Stop the test
// Any ICU failure recorded during this iteration ends the whole test.
1524 if (U_FAILURE(status
)) {
1525 dataerrln("ICU Error %s while parsing test file at line %d.",
1526 u_errorName(status
), lineNum
);
1527 status
= U_ZERO_ERROR
;
1528 goto end_test
; // Stop the test
1539 //-------------------------------------------------------------------------------
1541 // TestDictRules: create a break iterator from source rules that include a
1542 // dictionary range. Regression for bug #7130. The source rules
1543 // do not declare a break iterator type (word, line, sentence, etc.),
1544 // but the dictionary code, without a type, would loop.
1546 //-------------------------------------------------------------------------------
// Builds a RuleBasedBreakIterator from rules that define a $dictionary set, then
// calls next() at most ten times. The assertion after the loop requires the loop
// counter to be exactly 1 at that point. If the rules fail to build, the failure
// is reported through dataerrln().
1547 void RBBITest::TestDictRules() {
1548 const char *rules
= "$dictionary = [a-z]; \n"
1550 "$dictionary $dictionary; \n"
1552 "$dictionary $dictionary; \n";
1553 const char *text
= "aa";
1554 UErrorCode status
= U_ZERO_ERROR
;
1555 UParseError parseError
;
1557 RuleBasedBreakIterator
bi(rules
, parseError
, status
);
1558 if (U_SUCCESS(status
)) {
1559 UnicodeString utext
= text
;
// The bound of ten keeps the test finite even if the iterator never reports DONE.
1563 for (loops
= 0; loops
<10; loops
++) {
1564 position
= bi
.next();
1565 if (position
== RuleBasedBreakIterator::DONE
) {
1569 TEST_ASSERT(loops
== 1);
1571 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status
));
1577 //-------------------------------------------------------------------------------
1579 // ReadAndConvertFile Read a text data file, convert it to UChars, and
1580 // return the data in one big UChar * buffer, which the caller must delete.
1583 // fileName: the path of the file to read. It is passed to fopen() as given, so callers supply any directory part.
1585 // ulen an out parameter, receives the actual length (in UChars) of the file data.
1586 // encoding The file encoding. If the file contains a BOM, that will override the encoding
1587 // specified here. The BOM, if it exists, will be stripped from the returned data.
1588 // Pass NULL for the system default encoding.
1591 // The file data, converted to UChar.
1592 // The caller must delete this when done with
1593 // delete [] theBuffer;
1595 // TODO: This is a clone of RegexTest::ReadAndConvertFile.
1596 // Move this function to some common place.
1598 //--------------------------------------------------------------------------------
// Reads the named file in binary mode, strips a Unicode signature (BOM) if one is
// detected, and converts the bytes to UChars with an ICU converter.
//   fileName  passed unchanged to fopen().
//   ulen      receives the converted length reported by ucnv_toUChars().
//   encoding  converter name; replaced by the BOM encoding when a BOM is found.
//   status    nothing is done if it already holds a failure; set to
//             U_FILE_ACCESS_ERROR when the file cannot be opened.
// Returns a buffer allocated with new UChar[], or the initial NULL when an
// early exit is taken.
1599 UChar
*RBBITest::ReadAndConvertFile(const char *fileName
, int &ulen
, const char *encoding
, UErrorCode
&status
) {
1600 UChar
*retPtr
= NULL
;
1601 char *fileBuf
= NULL
;
1602 UConverter
* conv
= NULL
;
1606 if (U_FAILURE(status
)) {
// Binary mode, so the byte count matches the size measured with ftell() below.
1613 f
= fopen(fileName
, "rb");
1615 dataerrln("Error opening test data file %s\n", fileName
);
1616 status
= U_FILE_ACCESS_ERROR
;
// Measure the file by seeking to its end, then rewind and read it in one call.
// NOTE(review): the buffer is allocated before fileSize is validated; ftell()
// can return -1 on error - confirm that this cannot reach new char[].
1625 fseek( f
, 0, SEEK_END
);
1626 fileSize
= ftell(f
);
1627 fileBuf
= new char[fileSize
];
1628 fseek(f
, 0, SEEK_SET
);
1629 amt_read
= fread(fileBuf
, 1, fileSize
, f
);
// NOTE(review): status is not set to a failure on this path, so a caller that
// checks only status may not notice the short read - confirm against callers.
1630 if (amt_read
!= fileSize
|| fileSize
<= 0) {
1631 errln("Error reading test data file.");
1632 goto cleanUpAndReturn
;
1636 // Look for a Unicode Signature (BOM) on the data just read
1638 int32_t signatureLength
;
1639 const char * fileBufC
;
1640 const char* bomEncoding
;
1643 bomEncoding
= ucnv_detectUnicodeSignature(
1644 fileBuf
, fileSize
, &signatureLength
, &status
);
// When a BOM is present, skip over it and let it override the requested encoding.
1645 if(bomEncoding
!=NULL
){
1646 fileBufC
+= signatureLength
;
1647 fileSize
-= signatureLength
;
1648 encoding
= bomEncoding
;
1652 // Open a converter to take the rule file to UTF-16
1654 conv
= ucnv_open(encoding
, &status
);
1655 if (U_FAILURE(status
)) {
1656 goto cleanUpAndReturn
;
1660 // Convert the rules to UChar.
1661 // Preflight first to determine required buffer size.
1663 ulen
= ucnv_toUChars(conv
,
1669 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
1670 // Buffer Overflow is expected from the preflight operation.
1671 status
= U_ZERO_ERROR
;
// One UChar more than the preflighted length is allocated.
1673 retPtr
= new UChar
[ulen
+1];
1686 if (U_FAILURE(status
)) {
1687 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
1697 //--------------------------------------------------------------------------------------------
1699 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
1701 //-------------------------------------------------------------------------------------------
// Runs the four Unicode boundary test files, each against the matching kind of
// break iterator (character, word, sentence, line) created for the English locale.
// A file is run only when its iterator was created successfully.
1702 void RBBITest::TestUnicodeFiles() {
1703 RuleBasedBreakIterator
*bi
;
1704 UErrorCode status
= U_ZERO_ERROR
;
// The factory result is cast to RuleBasedBreakIterator because
// runUnicodeTestData() takes that type.
1706 bi
= (RuleBasedBreakIterator
*)BreakIterator::createCharacterInstance(Locale::getEnglish(), status
);
1707 TEST_ASSERT_SUCCESS(status
);
1708 if (U_SUCCESS(status
)) {
1709 runUnicodeTestData("GraphemeBreakTest.txt", bi
);
1713 bi
= (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getEnglish(), status
);
1714 TEST_ASSERT_SUCCESS(status
);
1715 if (U_SUCCESS(status
)) {
1716 runUnicodeTestData("WordBreakTest.txt", bi
);
1720 bi
= (RuleBasedBreakIterator
*)BreakIterator::createSentenceInstance(Locale::getEnglish(), status
);
1721 TEST_ASSERT_SUCCESS(status
);
1722 if (U_SUCCESS(status
)) {
1723 runUnicodeTestData("SentenceBreakTest.txt", bi
);
1727 bi
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale::getEnglish(), status
);
1728 TEST_ASSERT_SUCCESS(status
);
1729 if (U_SUCCESS(status
)) {
1730 runUnicodeTestData("LineBreakTest.txt", bi
);
1736 // Check for test cases from the Unicode test data files that are known to fail
1737 // and should be skipped because ICU is not yet able to fully implement the spec.
1738 // See ticket #7270.
// Compares testCase against a fixed table of strings and, on a match, returns the
// result of logKnownIssue for ticket 7270. The table is consulted only when
// fileName is LineBreakTest.txt.
1740 UBool
RBBITest::testCaseIsKnownIssue(const UnicodeString
&testCase
, const char *fileName
) {
// Each row is a zero terminated string of three UChars.
1741 static const UChar badTestCases
[][4] = { // Line Numbers from Unicode 7.0.0 file.
1742 {(UChar
)0x200B, (UChar
)0x0020, (UChar
)0x007D, (UChar
)0x0000}, // Line 5198
1743 {(UChar
)0x200B, (UChar
)0x0020, (UChar
)0x0029, (UChar
)0x0000}, // Line 5202
1744 {(UChar
)0x200B, (UChar
)0x0020, (UChar
)0x0021, (UChar
)0x0000}, // Line 5214
1745 {(UChar
)0x200B, (UChar
)0x0020, (UChar
)0x002c, (UChar
)0x0000}, // Line 5246
1746 {(UChar
)0x200B, (UChar
)0x0020, (UChar
)0x002f, (UChar
)0x0000}, // Line 5298
1747 {(UChar
)0x200B, (UChar
)0x0020, (UChar
)0x2060, (UChar
)0x0000} // Line 5302
// The known failures all come from the line break data file.
1749 if (strcmp(fileName
, "LineBreakTest.txt") != 0) {
1753 for (int i
=0; i
<UPRV_LENGTHOF(badTestCases
); i
++) {
1754 if (testCase
== UnicodeString(badTestCases
[i
])) {
1755 return logKnownIssue("7270");
1762 //--------------------------------------------------------------------------------------------
1764 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
1766 //-------------------------------------------------------------------------------------------
// Reads one Unicode Consortium boundary test file from the source test data
// directory, tokenizes it with a regular expression, and for every line that
// contained test data calls checkUnicodeTestCase() with the assembled string and
// its expected break positions. Test cases accepted by testCaseIsKnownIssue()
// are not run. Compiled only when regular expressions are configured in.
//   fileName  name of the data file, appended to the test data directory.
//   bi        the break iterator passed on to checkUnicodeTestCase().
1767 void RBBITest::runUnicodeTestData(const char *fileName
, RuleBasedBreakIterator
*bi
) {
1768 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1769 UErrorCode status
= U_ZERO_ERROR
;
1772 // Open and read the test data file, put it into a UnicodeString.
1774 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
1775 char testFileName
[1000];
// NOTE(review): this length check covers only the directory name. The fileName
// appended by strcat() below is not counted, so the buffer could still
// overflow for a directory path close to its size - confirm and tighten.
1776 if (testDataDirectory
== NULL
|| strlen(testDataDirectory
) >= sizeof(testFileName
)) {
1777 dataerrln("Can't open test data. Path too long.");
1780 strcpy(testFileName
, testDataDirectory
);
1781 strcat(testFileName
, fileName
);
1783 logln("Opening data file %s\n", fileName
);
1786 UChar
*testFile
= ReadAndConvertFile(testFileName
, len
, "UTF-8", status
);
// A file access error is excluded from these assertions and is handled by the
// quiet return that follows.
1787 if (status
!= U_FILE_ACCESS_ERROR
) {
1788 TEST_ASSERT_SUCCESS(status
);
1789 TEST_ASSERT(testFile
!= NULL
);
1791 if (U_FAILURE(status
) || testFile
== NULL
) {
1792 return; /* something went wrong, error already output */
1794 UnicodeString
testFileAsString(TRUE
, testFile
, len
);
1797 // Parse the test data file using a regular expression.
1798 // Each kind of token is recognized in its own capture group; what type of item was scanned
1799 // is identified by which group had a match.
1801 // Capture Group # 1 2 3 4 5
1802 // Parses this item: divide x hex digits comment \n unrecognized \n
1804 UnicodeString
tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV
);
1805 RegexMatcher
tokenMatcher(tokenExpr
, testFileAsString
, UREGEX_MULTILINE
| UREGEX_DOTALL
, status
);
1806 UnicodeString testString
;
1807 UVector32
breakPositions(status
);
1809 TEST_ASSERT_SUCCESS(status
);
1810 if (U_FAILURE(status
)) {
1815 // Scan through each test case, building up the string to be broken in testString,
1816 // and the positions that should be boundaries in the breakPositions vector.
1819 while (tokenMatcher
.find()) {
1820 if(tokenMatcher
.hitEnd()) {
1821 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for.
1822 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1823 and caused an infinite loop here on EBCDIC systems!
1825 fprintf(stderr
,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName
, ++spin
);
1828 if (tokenMatcher
.start(1, status
) >= 0) {
1829 // Scanned a divide sign, indicating a break position in the test data.
1830 if (testString
.length()>0) {
1831 breakPositions
.addElement(testString
.length(), status
);
1834 else if (tokenMatcher
.start(2, status
) >= 0) {
1835 // Scanned an 'x', meaning no break at this position in the test data
1836 // Nothing to be done here.
1838 else if (tokenMatcher
.start(3, status
) >= 0) {
1839 // Scanned Hex digits. Convert them to binary, append to the character data string.
1840 const UnicodeString
&hexNumber
= tokenMatcher
.group(3, status
);
1841 int length
= hexNumber
.length();
1844 hexNumber
.extract (0, length
, buf
, sizeof(buf
), US_INV
);
1845 UChar32 c
= (UChar32
)strtol(buf
, NULL
, 16);
1847 testString
.append(c
);
1849 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1850 fileName
, lineNumber
);
1853 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1854 fileName
, lineNumber
);
1857 else if (tokenMatcher
.start(4, status
) >= 0) {
1858 // Scanned to end of a line, possibly skipping over a comment in the process.
1859 // If the line from the file contained test data, run the test now.
1860 if (testString
.length() > 0 && !testCaseIsKnownIssue(testString
, fileName
)) {
1861 checkUnicodeTestCase(fileName
, lineNumber
, testString
, &breakPositions
, bi
);
1864 // Clear out this test case.
1865 // The string and breakPositions vector will be refilled as the next
1866 // test case is parsed.
1867 testString
.remove();
1868 breakPositions
.removeAllElements();
1871 // Scanner catchall. Something unrecognized appeared on the line.
1873 UnicodeString uToken
= tokenMatcher
.group(0, status
);
1874 uToken
.extract(0, uToken
.length(), token
, (uint32_t)sizeof(token
));
1875 token
[sizeof(token
)-1] = 0;
1876 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName
, lineNumber
, token
);
1878 // Clean up, in preparation for continuing with the next line.
1879 testString
.remove();
1880 breakPositions
.removeAllElements();
1883 TEST_ASSERT_SUCCESS(status
);
1884 if (U_FAILURE(status
)) {
1890 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1893 //--------------------------------------------------------------------------------------------
1895 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1896 // test data files. Do only a simple, forward-only check -
1897 // this test is mostly to check that ICU and the Unicode
1898 // data agree with each other.
1900 //--------------------------------------------------------------------------------------------
// Sets testString on the iterator and compares the break positions it reports
// with the expected positions, reporting each disagreement through errln().
//   testFileName, lineNumber  used only in the error messages.
//   breakPositions            expected break positions, read in index order.
1901 void RBBITest::checkUnicodeTestCase(const char *testFileName
, int lineNumber
,
1902 const UnicodeString
&testString
, // Text data to be broken
1903 UVector32
*breakPositions
, // Positions where breaks should be found.
1904 RuleBasedBreakIterator
*bi
) {
1905 int32_t pos
; // Break Position in the test string
1906 int32_t expectedI
= 0; // Index of expected break position in the vector of expected results.
1907 int32_t expectedPos
; // Expected break position (index into test string)
1909 bi
->setText(testString
);
1913 while (pos
!= BreakIterator::DONE
) {
// The iterator produced a break after the expected list was used up.
1914 if (expectedI
>= breakPositions
->size()) {
1915 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1916 testFileName
, lineNumber
, pos
);
1919 expectedPos
= breakPositions
->elementAti(expectedI
);
// A break earlier than the next expected one is an extra break.
1920 if (pos
< expectedPos
) {
1921 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1922 testFileName
, lineNumber
, pos
);
// A break later than the next expected one means the expected break was missed.
1925 if (pos
> expectedPos
) {
1926 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1927 testFileName
, lineNumber
, expectedPos
);
// The iterator finished while expected positions were still outstanding.
1934 if (pos
==BreakIterator::DONE
&& expectedI
<breakPositions
->size()) {
1935 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1936 testFileName
, lineNumber
, breakPositions
->elementAti(expectedI
));
1942 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1943 //---------------------------------------------------------------------------------------
1945 // class RBBIMonkeyKind
1947 // Monkey Test for Break Iteration
1948 // Abstract interface class. Concrete derived classes independently
1949 // implement the break rules for different iterator types.
1951 // The Monkey Test itself doesn't know which type of break iterator it is
1952 // testing, but works purely in terms of the interface defined here.
1954 //---------------------------------------------------------------------------------------
// Interface implemented once per break iterator type. The three pure virtual
// functions supply the character classes, accept the text, and compute the
// next break position.
1955 class RBBIMonkeyKind
{
1957 // Return a UVector of UnicodeSets, representing the character classes used
1958 // for this type of iterator.
1959 virtual UVector
*charClasses() = 0;
1961 // Set the test text on which subsequent calls to next() will operate
1962 virtual void setText(const UnicodeString
&s
) = 0;
1964 // Find the next break position, starting from the prev break position, or from zero.
1965 // Return -1 after reaching end of string.
1966 virtual int32_t next(int32_t i
) = 0;
1968 virtual ~RBBIMonkeyKind();
// Error code recorded by a derived class constructor; RBBICharMonkey::next()
// tests it with U_FAILURE before doing any work.
1969 UErrorCode deferredStatus
;
// Constructor: starts with no deferred error recorded.
1978 RBBIMonkeyKind::RBBIMonkeyKind() {
1979 deferredStatus
= U_ZERO_ERROR
;
// Out of line definition of the virtual destructor declared in the class.
1982 RBBIMonkeyKind::~RBBIMonkeyKind() {
1986 //----------------------------------------------------------------------------------------
1988 // Random Numbers. Similar to standard lib rand() and srand()
1989 // Not using library to
1990 // 1. Get same results on all platforms.
1991 // 2. Get access to current seed, to more easily reproduce failures.
1993 //---------------------------------------------------------------------------------------
// Current state of the generator. It is file scope so that the seed in use
// can be read and set by the tests.
1994 static uint32_t m_seed
= 1;
// Advances the seed with a linear congruential step (multiplier 1103515245,
// increment 12345, wrapping in 32 bits) and returns a value in 0..32767 taken
// from the seed after division by 65536.
1996 static uint32_t m_rand()
1998 m_seed
= m_seed
* 1103515245 + 12345;
1999 return (uint32_t)(m_seed
/65536) % 32768;
2003 //------------------------------------------------------------------------------------------
2005 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
2006 // of RBBIMonkeyKind.
2008 //------------------------------------------------------------------------------------------
// Grapheme cluster implementation of RBBIMonkeyKind. The UnicodeSet members
// hold the character classes tested by next(); they are allocated in the
// constructor.
2009 class RBBICharMonkey
: public RBBIMonkeyKind
{
2012 virtual ~RBBICharMonkey();
2013 virtual UVector
*charClasses();
2014 virtual void setText(const UnicodeString
&s
);
2015 virtual int32_t next(int32_t i
);
// Sets built from the Grapheme_Cluster_Break property.
2019 UnicodeSet
*fCRLFSet
;
2020 UnicodeSet
*fControlSet
;
2021 UnicodeSet
*fExtendSet
;
2022 UnicodeSet
*fRegionalIndicatorSet
;
2023 UnicodeSet
*fPrependSet
;
2024 UnicodeSet
*fSpacingSet
;
2029 UnicodeSet
*fLVTSet
;
// Union of the L, V, T, LV and LVT sets.
2030 UnicodeSet
*fHangulSet
;
// Every code point from 0 to 0x10ffff.
2031 UnicodeSet
*fAnySet
;
// Sets used by the emoji related rules.
2032 UnicodeSet
*fEmojiModifierSet
;
2033 UnicodeSet
*fEmojiBaseSet
;
2034 UnicodeSet
*fZWJSet
;
2035 UnicodeSet
*fGAZSet
;
// Text being scanned by next(); held by pointer.
2037 const UnicodeString
*fText
;
// Constructor: allocates every character class set and fills fSets with the
// classes in a fixed order. Failures are not reported here; a failing status is
// copied into deferredStatus.
2041 RBBICharMonkey::RBBICharMonkey() {
2042 UErrorCode status
= U_ZERO_ERROR
;
// Sets taken from the Grapheme_Cluster_Break property. The Tags block is removed
// from the Control set and added to the Extend set.
2046 fCRLFSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status
);
2047 fControlSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]-[:Block=Tags:]]"), status
);
2048 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}][:Block=Tags:]]"), status
);
2049 fRegionalIndicatorSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status
);
2050 fPrependSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status
);
2051 fSpacingSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status
);
2052 fLSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status
);
2053 fVSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status
);
2054 fTSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status
);
2055 fLVSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status
);
2056 fLVTSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status
);
// The Hangul set is the union of the five Hangul syllable type sets above.
2057 fHangulSet
= new UnicodeSet();
2058 fHangulSet
->addAll(*fLSet
);
2059 fHangulSet
->addAll(*fVSet
);
2060 fHangulSet
->addAll(*fTSet
);
2061 fHangulSet
->addAll(*fLVSet
);
2062 fHangulSet
->addAll(*fLVTSet
);
2063 fAnySet
= new UnicodeSet(0, 0x10ffff);
// Emoji related sets, given as explicit code point lists and ranges.
2067 fEmojiBaseSet
= new UnicodeSet(UnicodeString(
2068 "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C2-\\U0001F3C4\\U0001F3C7\\U0001F3CA-\\U0001F3CC"
2069 "\\U0001F442-\\U0001F443\\U0001F446-\\U0001F450\\U0001F466-\\U0001F478\\U0001F47C"
2070 "\\U0001F481-\\U0001F483\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F574-\\U0001F575\\U0001F57A\\U0001F590\\U0001F595-\\U0001F596"
2071 "\\U0001F645-\\U0001F647\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F6CC"
2072 "\\U0001F918-\\U0001F91E\\U0001F926\\U0001F930\\U0001F933-\\U0001F939\\U0001F93C-\\U0001F93E]"), status
);
2074 fEmojiModifierSet
= new UnicodeSet(0x0001F3FB, 0x0001F3FF);
2075 fZWJSet
= new UnicodeSet(0x200D, 0x200D);
2076 fGAZSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u2640\\u2642\\u2695-\\u2696\\u2708\\u2764"
2077 "\\U0001F308\\U0001F33E\\U0001F373\\U0001F393\\U0001F3A4\\U0001F3A8\\U0001F3EB\\U0001F3ED"
2078 "\\U0001F466-\\U0001F469\\U0001F48B\\U0001F4BB-\\U0001F4BC\\U0001F527\\U0001F52C\\U0001F5E8"
2079 "\\U0001F680\\U0001F692]"), status
);
// List of character classes. The individual L, V, T, LV and LVT sets are not
// added among the visible lines; the combined Hangul set is.
2081 fSets
= new UVector(status
);
2082 fSets
->addElement(fCRLFSet
, status
);
2083 fSets
->addElement(fControlSet
, status
);
2084 fSets
->addElement(fExtendSet
, status
);
2085 fSets
->addElement(fRegionalIndicatorSet
, status
);
// The Prepend class is listed only when it has members.
2086 if (!fPrependSet
->isEmpty()) {
2087 fSets
->addElement(fPrependSet
, status
);
2089 fSets
->addElement(fSpacingSet
, status
);
2090 fSets
->addElement(fHangulSet
, status
);
2091 fSets
->addElement(fAnySet
, status
);
2092 fSets
->addElement(fEmojiBaseSet
, status
);
2093 fSets
->addElement(fEmojiModifierSet
, status
);
2094 fSets
->addElement(fZWJSet
, status
);
2095 fSets
->addElement(fGAZSet
, status
);
// Keep any failure so that next() can refuse to run.
2096 if (U_FAILURE(status
)) {
2097 deferredStatus
= status
;
// Sets the text on which later calls to next() operate.
// NOTE(review): presumably records s in fText, which next() reads - confirm.
2102 void RBBICharMonkey::setText(const UnicodeString
&s
) {
// Finds the next grapheme cluster boundary after prevPos by applying the rules
// labelled GB4 through GB999 below, together with the CR LF check, to the code
// points on either side of each candidate position. The character class sets
// built in the constructor are used for every test.
//   prevPos  index of the previous break; at or past the end of the text means
//            there is nothing left to scan.
2108 int32_t RBBICharMonkey::next(int32_t prevPos
) {
2109 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2110 // break position being tested. The candidate break
2111 // location is before p2.
2115 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
2116 UChar32 cBase
; // for (X Extend*) patterns, the X character.
// Do nothing useful if the constructor recorded a failure.
2118 if (U_FAILURE(deferredStatus
)) {
2122 // Previous break at end of string. return DONE.
2123 if (prevPos
>= fText
->length()) {
// Start with all four positions at prevPos; only c3 holds a real code point yet.
2126 p0
= p1
= p2
= p3
= prevPos
;
2127 c3
= fText
->char32At(prevPos
);
2128 c0
= c1
= c2
= cBase
= 0;
2129 (void)p0
; // suppress set but not used warning.
2132 // Loop runs once per "significant" character position in the input text.
2134 // Move all of the positions forward in the input string.
2139 // Advance p3 by one codepoint
2140 p3
= fText
->moveIndex32(p3
, 1);
2141 c3
= fText
->char32At(p3
);
2144 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2147 if (p2
== fText
->length()) {
2148 // Reached end of string. Always a break position.
2153 // No Extend or Format characters may appear between the CR and LF,
2154 // which requires the additional check for p2 immediately following p1.
2156 if (c1
==0x0D && c2
==0x0A && p1
==(p2
-1)) {
2160 // Rule (GB4). ( Control | CR | LF ) <break>
2161 if (fControlSet
->contains(c1
) ||
2167 // Rule (GB5) <break> ( Control | CR | LF )
2169 if (fControlSet
->contains(c2
) ||
2176 // Rule (GB6) L x ( L | V | LV | LVT )
2177 if (fLSet
->contains(c1
) &&
2178 (fLSet
->contains(c2
) ||
2179 fVSet
->contains(c2
) ||
2180 fLVSet
->contains(c2
) ||
2181 fLVTSet
->contains(c2
))) {
2185 // Rule (GB7) ( LV | V ) x ( V | T )
2186 if ((fLVSet
->contains(c1
) || fVSet
->contains(c1
)) &&
2187 (fVSet
->contains(c2
) || fTSet
->contains(c2
))) {
2191 // Rule (GB8) ( LVT | T) x T
2192 if ((fLVTSet
->contains(c1
) || fTSet
->contains(c1
)) &&
2193 fTSet
->contains(c2
)) {
2197 // Rule (GB9) x (Extend | ZWJ)
2198 if (fExtendSet
->contains(c2
) || fZWJSet
->contains(c2
)) {
// Separate handling when the character before the Extend or ZWJ is not itself
// an Extend character.
2199 if (!fExtendSet
->contains(c1
)) {
2205 // Rule (GB9a) x SpacingMark
2206 if (fSpacingSet
->contains(c2
)) {
2210 // Rule (GB9b) Prepend x
2211 if (fPrependSet
->contains(c1
)) {
2215 // Rule (GB10) ($E_Base | $GAZ) $Extend* $E_Modifier;
2216 if ((fEmojiBaseSet
->contains(c1
) || fGAZSet
->contains(c1
)) && fEmojiModifierSet
->contains(c2
)) {
// Same rule with Extend characters in between: the base is taken from cBase.
2219 if ((fEmojiBaseSet
->contains(cBase
) || fGAZSet
->contains(cBase
)) &&
2220 fExtendSet
->contains(c1
) && fEmojiModifierSet
->contains(c2
)) {
2224 // Rule (GB11) ZWJ x Glue_After_Zwj
2225 if (fZWJSet
->contains(c1
) && fGAZSet
->contains(c2
)) {
2229 // Rule (GB12-13) Regional_Indicator x Regional_Indicator
2230 // Note: The first if condition is a little tricky. We only need to force
2231 // a break if there are three or more contiguous RIs. If there are
2232 // only two, a break following will occur via other rules, and will include
2233 // any trailing extend characters, which is needed behavior.
2234 if (fRegionalIndicatorSet
->contains(c0
) && fRegionalIndicatorSet
->contains(c1
)
2235 && fRegionalIndicatorSet
->contains(c2
)) {
2238 if (fRegionalIndicatorSet
->contains(c1
) && fRegionalIndicatorSet
->contains(c2
)) {
2242 // Rule (GB999) Any <break> Any
2252 UVector
*RBBICharMonkey::charClasses() {
2257 RBBICharMonkey::~RBBICharMonkey() {
2262 delete fRegionalIndicatorSet
;
2272 delete fEmojiBaseSet
;
2273 delete fEmojiModifierSet
;
2278 //------------------------------------------------------------------------------------------
2280 // class RBBIWordMonkey Word Break specific implementation
2281 // of RBBIMonkeyKind.
2283 //------------------------------------------------------------------------------------------
2284 class RBBIWordMonkey
: public RBBIMonkeyKind
{
2287 virtual ~RBBIWordMonkey();
2288 virtual UVector
*charClasses();
2289 virtual void setText(const UnicodeString
&s
);
2290 virtual int32_t next(int32_t i
);
2296 UnicodeSet
*fNewlineSet
;
2297 UnicodeSet
*fRegionalIndicatorSet
;
2298 UnicodeSet
*fKatakanaSet
;
2299 UnicodeSet
*fHebrew_LetterSet
;
2300 UnicodeSet
*fALetterSet
;
2301 // TODO(jungshik): Do we still need this change?
2302 // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt
2303 UnicodeSet
*fSingle_QuoteSet
;
2304 UnicodeSet
*fDouble_QuoteSet
;
2305 UnicodeSet
*fMidNumLetSet
;
2306 UnicodeSet
*fMidLetterSet
;
2307 UnicodeSet
*fMidNumSet
;
2308 UnicodeSet
*fNumericSet
;
2309 UnicodeSet
*fFormatSet
;
2310 UnicodeSet
*fOtherSet
;
2311 UnicodeSet
*fExtendSet
;
2312 UnicodeSet
*fExtendNumLetSet
;
2313 UnicodeSet
*fDictionaryCjkSet
;
2314 UnicodeSet
*fEBaseSet
;
2315 UnicodeSet
*fEModifierSet
;
2316 UnicodeSet
*fZWSSet
;
2317 UnicodeSet
*fGAZSet
;
2319 const UnicodeString
*fText
;
2323 RBBIWordMonkey::RBBIWordMonkey()
2325 UErrorCode status
= U_ZERO_ERROR
;
2327 fSets
= new UVector(status
);
2329 fCRSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status
);
2330 fLFSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status
);
2331 fNewlineSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status
);
2332 fDictionaryCjkSet
= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status
);
2333 // Exclude Hangul syllables from ALetterSet during testing.
2334 // Leave CJK dictionary characters out from the monkey tests!
2336 fALetterSet
= new UnicodeSet("[\\p{Word_Break = ALetter}"
2337 "[\\p{Line_Break = Complex_Context}"
2338 "-\\p{Grapheme_Cluster_Break = Extend}"
2339 "-\\p{Grapheme_Cluster_Break = Control}"
2343 fRegionalIndicatorSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status
);
2344 fKatakanaSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status
);
2345 fHebrew_LetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status
);
2346 fALetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status
);
2347 fALetterSet
->removeAll(*fDictionaryCjkSet
);
2348 fSingle_QuoteSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"), status
);
2349 fDouble_QuoteSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"), status
);
2350 fMidNumLetSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status
);
2351 fMidLetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter} - [\\:]]"), status
);
2352 fMidNumSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status
);
2353 // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
2354 // we should figure out why
2355 fNumericSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status
);
2356 fFormatSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status
);
2357 fExtendNumLetSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status
);
2358 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status
);
2360 fEBaseSet
= new UnicodeSet(UnicodeString(
2361 "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C2-\\U0001F3C4\\U0001F3C7\\U0001F3CA-\\U0001F3CC"
2362 "\\U0001F442-\\U0001F443\\U0001F446-\\U0001F450\\U0001F466-\\U0001F478\\U0001F47C"
2363 "\\U0001F481-\\U0001F483\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F574-\\U0001F575\\U0001F57A\\U0001F590\\U0001F595-\\U0001F596"
2364 "\\U0001F645-\\U0001F647\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F6CC"
2365 "\\U0001F918-\\U0001F91E\\U0001F926\\U0001F930\\U0001F933-\\U0001F939\\U0001F93C-\\U0001F93E]"), status
);
2367 fEModifierSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F3FB-\\U0001F3FF]"), status
);
2368 fZWSSet
= new UnicodeSet((UChar32
)0x200D, (UChar32
)0x200D);;
2369 fGAZSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u2640\\u2642\\u2695-\\u2696\\u2708\\u2764"
2370 "\\U0001F308\\U0001F33E\\U0001F373\\U0001F393\\U0001F3A4\\U0001F3A8\\U0001F3EB\\U0001F3ED"
2371 "\\U0001F466-\\U0001F469\\U0001F48B\\U0001F4BB-\\U0001F4BC\\U0001F527\\U0001F52C\\U0001F5E8"
2372 "\\U0001F680\\U0001F692]"), status
);
2373 fExtendSet
->removeAll(*fZWSSet
);
2376 fOtherSet
= new UnicodeSet();
2377 if(U_FAILURE(status
)) {
2378 deferredStatus
= status
;
2382 fOtherSet
->complement();
2383 fOtherSet
->removeAll(*fCRSet
);
2384 fOtherSet
->removeAll(*fLFSet
);
2385 fOtherSet
->removeAll(*fNewlineSet
);
2386 fOtherSet
->removeAll(*fKatakanaSet
);
2387 fOtherSet
->removeAll(*fHebrew_LetterSet
);
2388 fOtherSet
->removeAll(*fALetterSet
);
2389 fOtherSet
->removeAll(*fSingle_QuoteSet
);
2390 fOtherSet
->removeAll(*fDouble_QuoteSet
);
2391 fOtherSet
->removeAll(*fMidLetterSet
);
2392 fOtherSet
->removeAll(*fMidNumSet
);
2393 fOtherSet
->removeAll(*fNumericSet
);
2394 fOtherSet
->removeAll(*fExtendNumLetSet
);
2395 fOtherSet
->removeAll(*fFormatSet
);
2396 fOtherSet
->removeAll(*fExtendSet
);
2397 fOtherSet
->removeAll(*fRegionalIndicatorSet
);
2398 fOtherSet
->removeAll(*fEBaseSet
);
2399 fOtherSet
->removeAll(*fEModifierSet
);
2400 fOtherSet
->removeAll(*fZWSSet
);
2401 fOtherSet
->removeAll(*fGAZSet
);
2403 // Inhibit dictionary characters from being tested at all.
2404 fOtherSet
->removeAll(*fDictionaryCjkSet
);
2405 fOtherSet
->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status
));
2407 fSets
->addElement(fCRSet
, status
);
2408 fSets
->addElement(fLFSet
, status
);
2409 fSets
->addElement(fNewlineSet
, status
);
2410 fSets
->addElement(fRegionalIndicatorSet
, status
);
2411 fSets
->addElement(fHebrew_LetterSet
, status
);
2412 fSets
->addElement(fALetterSet
, status
);
2413 fSets
->addElement(fSingle_QuoteSet
, status
);
2414 fSets
->addElement(fDouble_QuoteSet
, status
);
2415 //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana
2416 fSets
->addElement(fMidLetterSet
, status
);
2417 fSets
->addElement(fMidNumLetSet
, status
);
2418 fSets
->addElement(fMidNumSet
, status
);
2419 fSets
->addElement(fNumericSet
, status
);
2420 fSets
->addElement(fFormatSet
, status
);
2421 fSets
->addElement(fExtendSet
, status
);
2422 fSets
->addElement(fOtherSet
, status
);
2423 fSets
->addElement(fExtendNumLetSet
, status
);
2425 fSets
->addElement(fEBaseSet
, status
);
2426 fSets
->addElement(fEModifierSet
, status
);
2427 fSets
->addElement(fZWSSet
, status
);
2428 fSets
->addElement(fGAZSet
, status
);
2430 if (U_FAILURE(status
)) {
2431 deferredStatus
= status
;
2435 void RBBIWordMonkey::setText(const UnicodeString
&s
) {
2440 int32_t RBBIWordMonkey::next(int32_t prevPos
) {
2441 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2442 // break position being tested. The candidate break
2443 // location is before p2.
2447 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
2449 if (U_FAILURE(deferredStatus
)) {
2453 // Prev break at end of string. return DONE.
2454 if (prevPos
>= fText
->length()) {
2457 p0
= p1
= p2
= p3
= prevPos
;
2458 c3
= fText
->char32At(prevPos
);
2460 (void)p0
; // Suppress set but not used warning.
2462 // Loop runs once per "significant" character position in the input text.
2464 // Move all of the positions forward in the input string.
2469 // Advance p3 by X(Extend | Format)* Rule 4
2470 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2472 p3
= fText
->moveIndex32(p3
, 1);
2473 c3
= fText
->char32At(p3
);
2474 if (fCRSet
->contains(c2
) || fLFSet
->contains(c2
) || fNewlineSet
->contains(c2
)) {
2478 while (fFormatSet
->contains(c3
) || fExtendSet
->contains(c3
) || fZWSSet
->contains(c3
));
2482 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2485 if (p2
== fText
->length()) {
2486 // Reached end of string. Always a break position.
2491 // No Extend or Format characters may appear between the CR and LF,
2492 // which requires the additional check for p2 immediately following p1.
2494 if (c1
==0x0D && c2
==0x0A) {
2498 // Rule (3a) Break before and after newlines (including CR and LF)
2500 if (fCRSet
->contains(c1
) || fLFSet
->contains(c1
) || fNewlineSet
->contains(c1
)) {
2503 if (fCRSet
->contains(c2
) || fLFSet
->contains(c2
) || fNewlineSet
->contains(c2
)) {
2507 // Rule (3c) ZWJ x GAZ (Glue after ZWJ).
2508 // Not ignoring extend chars, so peek into input text to
2509 // get the potential ZWJ, the character immediately preceding c2.
2510 // Sloppy UChar32 indexing: p2-1 may reference trail half
2511 // but char32At will get the full code point.
2512 if (fZWSSet
->contains(fText
->char32At(p2
-1)) && fGAZSet
->contains(c2
)) {
2516 // Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
2517 if ((fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
)) &&
2518 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
))) {
2522 // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
2524 if ( (fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
)) &&
2525 (fMidLetterSet
->contains(c2
) || fMidNumLetSet
->contains(c2
) || fSingle_QuoteSet
->contains(c2
)) &&
2526 (fALetterSet
->contains(c3
) || fHebrew_LetterSet
->contains(c3
))) {
2530 // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)
2531 if ((fALetterSet
->contains(c0
) || fHebrew_LetterSet
->contains(c0
)) &&
2532 (fMidLetterSet
->contains(c1
) || fMidNumLetSet
->contains(c1
) || fSingle_QuoteSet
->contains(c1
)) &&
2533 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
))) {
2537 // Rule (7a) Hebrew_Letter x Single_Quote
2538 if (fHebrew_LetterSet
->contains(c1
) && fSingle_QuoteSet
->contains(c2
)) {
2542 // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter
2543 if (fHebrew_LetterSet
->contains(c1
) && fDouble_QuoteSet
->contains(c2
) && fHebrew_LetterSet
->contains(c3
)) {
2547 // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter
2548 if (fHebrew_LetterSet
->contains(c0
) && fDouble_QuoteSet
->contains(c1
) && fHebrew_LetterSet
->contains(c2
)) {
2552 // Rule (8) Numeric x Numeric
2553 if (fNumericSet
->contains(c1
) &&
2554 fNumericSet
->contains(c2
)) {
2558 // Rule (9) (ALetter | Hebrew_Letter) x Numeric
2559 if ((fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
)) &&
2560 fNumericSet
->contains(c2
)) {
2564 // Rule (10) Numeric x (ALetter | Hebrew_Letter)
2565 if (fNumericSet
->contains(c1
) &&
2566 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
))) {
2570 // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric
2571 if (fNumericSet
->contains(c0
) &&
2572 (fMidNumSet
->contains(c1
) || fMidNumLetSet
->contains(c1
) || fSingle_QuoteSet
->contains(c1
)) &&
2573 fNumericSet
->contains(c2
)) {
2577 // Rule (12) Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
2578 if (fNumericSet
->contains(c1
) &&
2579 (fMidNumSet
->contains(c2
) || fMidNumLetSet
->contains(c2
) || fSingle_QuoteSet
->contains(c2
)) &&
2580 fNumericSet
->contains(c3
)) {
2584 // Rule (13) Katakana x Katakana
2585 if (fKatakanaSet
->contains(c1
) &&
2586 fKatakanaSet
->contains(c2
)) {
2590 // Rule 13a (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet
2591 if ((fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
) ||fNumericSet
->contains(c1
) ||
2592 fKatakanaSet
->contains(c1
) || fExtendNumLetSet
->contains(c1
)) &&
2593 fExtendNumLetSet
->contains(c2
)) {
2597 // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
2598 if (fExtendNumLetSet
->contains(c1
) &&
2599 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
) ||
2600 fNumericSet
->contains(c2
) || fKatakanaSet
->contains(c2
))) {
2605 if (fRegionalIndicatorSet
->contains(c0
) && fRegionalIndicatorSet
->contains(c1
)) {
2608 if (fRegionalIndicatorSet
->contains(c1
) && fRegionalIndicatorSet
->contains(c2
)) {
2613 if ((fEBaseSet
->contains(c1
) || fGAZSet
->contains(c1
)) && fEModifierSet
->contains(c2
)) {
2617 // Rule 14. Break found here.
2626 UVector
*RBBIWordMonkey::charClasses() {
2631 RBBIWordMonkey::~RBBIWordMonkey() {
2636 delete fKatakanaSet
;
2637 delete fHebrew_LetterSet
;
2639 delete fSingle_QuoteSet
;
2640 delete fDouble_QuoteSet
;
2641 delete fMidNumLetSet
;
2642 delete fMidLetterSet
;
2647 delete fExtendNumLetSet
;
2648 delete fRegionalIndicatorSet
;
2649 delete fDictionaryCjkSet
;
2652 delete fEModifierSet
;
2660 //------------------------------------------------------------------------------------------
2662 // class RBBISentMonkey Sentence Break specific implementation
2663 // of RBBIMonkeyKind.
2665 //------------------------------------------------------------------------------------------
2666 class RBBISentMonkey
: public RBBIMonkeyKind
{
2669 virtual ~RBBISentMonkey();
2670 virtual UVector
*charClasses();
2671 virtual void setText(const UnicodeString
&s
);
2672 virtual int32_t next(int32_t i
);
2674 int moveBack(int posFrom
);
2675 int moveForward(int posFrom
);
2676 UChar32
cAt(int pos
);
2680 UnicodeSet
*fSepSet
;
2681 UnicodeSet
*fFormatSet
;
2683 UnicodeSet
*fLowerSet
;
2684 UnicodeSet
*fUpperSet
;
2685 UnicodeSet
*fOLetterSet
;
2686 UnicodeSet
*fNumericSet
;
2687 UnicodeSet
*fATermSet
;
2688 UnicodeSet
*fSContinueSet
;
2689 UnicodeSet
*fSTermSet
;
2690 UnicodeSet
*fCloseSet
;
2691 UnicodeSet
*fOtherSet
;
2692 UnicodeSet
*fExtendSet
;
2694 const UnicodeString
*fText
;
2698 RBBISentMonkey::RBBISentMonkey()
2700 UErrorCode status
= U_ZERO_ERROR
;
2702 fSets
= new UVector(status
);
2704 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2705 // set and made into character classes of their own. For the monkey impl,
2706 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2707 fSepSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status
);
2708 fFormatSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status
);
2709 fSpSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status
);
2710 fLowerSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status
);
2711 fUpperSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status
);
2712 fOLetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status
);
2713 fNumericSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status
);
2714 fATermSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status
);
2715 fSContinueSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status
);
2716 fSTermSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status
);
2717 fCloseSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status
);
2718 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status
);
2719 fOtherSet
= new UnicodeSet();
2721 if(U_FAILURE(status
)) {
2722 deferredStatus
= status
;
2726 fOtherSet
->complement();
2727 fOtherSet
->removeAll(*fSepSet
);
2728 fOtherSet
->removeAll(*fFormatSet
);
2729 fOtherSet
->removeAll(*fSpSet
);
2730 fOtherSet
->removeAll(*fLowerSet
);
2731 fOtherSet
->removeAll(*fUpperSet
);
2732 fOtherSet
->removeAll(*fOLetterSet
);
2733 fOtherSet
->removeAll(*fNumericSet
);
2734 fOtherSet
->removeAll(*fATermSet
);
2735 fOtherSet
->removeAll(*fSContinueSet
);
2736 fOtherSet
->removeAll(*fSTermSet
);
2737 fOtherSet
->removeAll(*fCloseSet
);
2738 fOtherSet
->removeAll(*fExtendSet
);
2740 fSets
->addElement(fSepSet
, status
);
2741 fSets
->addElement(fFormatSet
, status
);
2742 fSets
->addElement(fSpSet
, status
);
2743 fSets
->addElement(fLowerSet
, status
);
2744 fSets
->addElement(fUpperSet
, status
);
2745 fSets
->addElement(fOLetterSet
, status
);
2746 fSets
->addElement(fNumericSet
, status
);
2747 fSets
->addElement(fATermSet
, status
);
2748 fSets
->addElement(fSContinueSet
, status
);
2749 fSets
->addElement(fSTermSet
, status
);
2750 fSets
->addElement(fCloseSet
, status
);
2751 fSets
->addElement(fOtherSet
, status
);
2752 fSets
->addElement(fExtendSet
, status
);
2754 if (U_FAILURE(status
)) {
2755 deferredStatus
= status
;
2761 void RBBISentMonkey::setText(const UnicodeString
&s
) {
2765 UVector
*RBBISentMonkey::charClasses() {
2770 // moveBack() Find the "significant" code point preceding the index i.
2771 // Skips over ($Extend | $Format)* .
2773 int RBBISentMonkey::moveBack(int i
) {
2780 j
= fText
->moveIndex32(j
, -1);
2781 c
= fText
->char32At(j
);
2783 while (j
>0 &&(fFormatSet
->contains(c
) || fExtendSet
->contains(c
)));
2789 int RBBISentMonkey::moveForward(int i
) {
2790 if (i
>=fText
->length()) {
2791 return fText
->length();
2796 j
= fText
->moveIndex32(j
, 1);
2799 while (fFormatSet
->contains(c
) || fExtendSet
->contains(c
));
2803 UChar32
RBBISentMonkey::cAt(int pos
) {
2804 if (pos
<0 || pos
>=fText
->length()) {
2807 return fText
->char32At(pos
);
2811 int32_t RBBISentMonkey::next(int32_t prevPos
) {
2812 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2813 // break position being tested. The candidate break
2814 // location is before p2.
2818 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
2821 if (U_FAILURE(deferredStatus
)) {
2825 // Prev break at end of string. return DONE.
2826 if (prevPos
>= fText
->length()) {
2829 p0
= p1
= p2
= p3
= prevPos
;
2830 c3
= fText
->char32At(prevPos
);
2832 (void)p0
; // Suppress set but not used warning.
2834 // Loop runs once per "significant" character position in the input text.
2836 // Move all of the positions forward in the input string.
2841 // Advance p3 by X(Extend | Format)* Rule 4
2842 p3
= moveForward(p3
);
2846 if (c1
==0x0d && c2
==0x0a && p2
==(p1
+1)) {
2850 // Rule (4). Sep <break>
2851 if (fSepSet
->contains(c1
)) {
2852 p2
= p1
+1; // Separators don't combine with Extend or Format.
2856 if (p2
>= fText
->length()) {
2857 // Reached end of string. Always a break position.
2861 if (p2
== prevPos
) {
2862 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2866 // Rule (6). ATerm x Numeric
2867 if (fATermSet
->contains(c1
) && fNumericSet
->contains(c2
)) {
2871 // Rule (7). (Upper | Lower) ATerm x Upper
2872 if ((fUpperSet
->contains(c0
) || fLowerSet
->contains(c0
)) &&
2873 fATermSet
->contains(c1
) && fUpperSet
->contains(c2
)) {
2877 // Rule (8) ATerm Close* Sp* x (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
2878 // Note: STerm | ATerm are added to the negated part of the expression by a
2879 // note to the Unicode 5.0 documents.
2881 while (fSpSet
->contains(cAt(p8
))) {
2884 while (fCloseSet
->contains(cAt(p8
))) {
2887 if (fATermSet
->contains(cAt(p8
))) {
2891 if (c
==-1 || fOLetterSet
->contains(c
) || fUpperSet
->contains(c
) ||
2892 fLowerSet
->contains(c
) || fSepSet
->contains(c
) ||
2893 fATermSet
->contains(c
) || fSTermSet
->contains(c
)) {
2896 p8
= moveForward(p8
);
2898 if (fLowerSet
->contains(cAt(p8
))) {
2903 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2904 if (fSContinueSet
->contains(c2
) || fSTermSet
->contains(c2
) || fATermSet
->contains(c2
)) {
2906 while (fSpSet
->contains(cAt(p8
))) {
2909 while (fCloseSet
->contains(cAt(p8
))) {
2913 if (fSTermSet
->contains(c
) || fATermSet
->contains(c
)) {
2918 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
2920 while (fCloseSet
->contains(cAt(p9
))) {
2924 if ((fSTermSet
->contains(c
) || fATermSet
->contains(c
))) {
2925 if (fCloseSet
->contains(c2
) || fSpSet
->contains(c2
) || fSepSet
->contains(c2
)) {
2930 // Rule (10) (STerm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
2932 while (fSpSet
->contains(cAt(p10
))) {
2933 p10
= moveBack(p10
);
2935 while (fCloseSet
->contains(cAt(p10
))) {
2936 p10
= moveBack(p10
);
2938 if (fSTermSet
->contains(cAt(p10
)) || fATermSet
->contains(cAt(p10
))) {
2939 if (fSpSet
->contains(c2
) || fSepSet
->contains(c2
)) {
2944 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>
2946 if (fSepSet
->contains(cAt(p11
))) {
2947 p11
= moveBack(p11
);
2949 while (fSpSet
->contains(cAt(p11
))) {
2950 p11
= moveBack(p11
);
2952 while (fCloseSet
->contains(cAt(p11
))) {
2953 p11
= moveBack(p11
);
2955 if (fSTermSet
->contains(cAt(p11
)) || fATermSet
->contains(cAt(p11
))) {
2959 // Rule (12) Any x Any
2966 RBBISentMonkey::~RBBISentMonkey() {
2976 delete fSContinueSet
;
2985 //-------------------------------------------------------------------------------------------
2989 //-------------------------------------------------------------------------------------------
2991 class RBBILineMonkey
: public RBBIMonkeyKind
{
2994 virtual ~RBBILineMonkey();
2995 virtual UVector
*charClasses();
2996 virtual void setText(const UnicodeString
&s
);
2997 virtual int32_t next(int32_t i
);
2998 virtual void rule9Adjust(int32_t pos
, UChar32
*posChar
, int32_t *nextPos
, UChar32
*nextChar
);
3045 BreakIterator
*fCharBI
;
3046 const UnicodeString
*fText
;
3047 RegexMatcher
*fNumberMatcher
;
3050 RBBILineMonkey::RBBILineMonkey() :
3056 fNumberMatcher(NULL
)
3059 if (U_FAILURE(deferredStatus
)) {
3063 UErrorCode status
= U_ZERO_ERROR
;
3065 fSets
= new UVector(status
);
3067 fBK
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status
);
3068 fCR
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status
);
3069 fLF
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status
);
3070 fCM
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status
);
3071 fNL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status
);
3072 fWJ
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status
);
3073 fZW
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status
);
3074 fGL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status
);
3075 fCB
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status
);
3076 fSP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status
);
3077 fB2
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status
);
3078 fBA
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status
);
3079 fBB
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status
);
3080 fHY
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status
);
3081 fH2
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status
);
3082 fH3
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status
);
3083 fCL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status
);
3084 fCP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status
);
3085 fEX
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status
);
3086 fIN
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status
);
3087 fJL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status
);
3088 fJV
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status
);
3089 fJT
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status
);
3090 fNS
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status
);
3091 fOP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status
);
3092 fQU
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status
);
3093 fIS
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status
);
3094 fNU
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status
);
3095 fPO
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status
);
3096 fPR
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status
);
3097 fSY
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status
);
3098 fAI
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status
);
3099 fAL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status
);
3100 fCJ
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status
);
3101 fHL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status
);
3102 fID
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status
);
3103 fRI
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status
);
3104 fSG
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status
);
3105 fXX
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status
);
3106 fEB
= new UnicodeSet(UnicodeString(
3107 "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C2-\\U0001F3C4\\U0001F3C7\\U0001F3CA-\\U0001F3CC"
3108 "\\U0001F442-\\U0001F443\\U0001F446-\\U0001F450\\U0001F466-\\U0001F478\\U0001F47C"
3109 "\\U0001F481-\\U0001F483\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F574-\\U0001F575\\U0001F57A\\U0001F590\\U0001F595-\\U0001F596"
3110 "\\U0001F645-\\U0001F647\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F6CC"
3111 "\\U0001F918-\\U0001F91E\\U0001F926\\U0001F930\\U0001F933-\\U0001F939\\U0001F93C-\\U0001F93E]"), status
);
3112 fEM
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F3FB-\\U0001F3FF]"), status
);
3113 fZJ
= new UnicodeSet((UChar32
)0x200D, (UChar32
)0x200D);
3115 if (U_FAILURE(status
)) {
3116 deferredStatus
= status
;
3120 fAL
->addAll(*fXX
); // Default behavior for XX is identical to AL
3121 fAL
->addAll(*fAI
); // Default behavior for AI is identical to AL
3122 fAL
->addAll(*fSG
); // Default behavior for SG is identical to AL.
3124 fNS
->addAll(*fCJ
); // Default behavior for CJ is identical to NS.
3126 fID
->addAll(*fEB
); // Emoji Base and Emoji Modifier behave as ID.
3128 fAL
->removeAll(*fEM
);
3131 fAL
->remove((UChar32
)0x2695); // move u2695 from Al to Id
3132 fAL
->remove((UChar32
)0x2696); // move u2696 from Al to Id
3133 fAL
->remove((UChar32
)0x2764); // Emoji Proposal: move u2764 from Al to Id
3134 fAI
->remove((UChar32
)0x2640); // new ZWJ seqs
3135 fAI
->remove((UChar32
)0x2642); // new ZWJ seqs
3136 fID
->add((UChar32
)0x2695);
3137 fID
->add((UChar32
)0x2696);
3138 fID
->add((UChar32
)0x2764);
3139 fID
->add((UChar32
)0x2640);
3140 fID
->add((UChar32
)0x2642);
3142 fSets
->addElement(fBK
, status
);
3143 fSets
->addElement(fCR
, status
);
3144 fSets
->addElement(fLF
, status
);
3145 fSets
->addElement(fCM
, status
);
3146 fSets
->addElement(fNL
, status
);
3147 fSets
->addElement(fWJ
, status
);
3148 fSets
->addElement(fZW
, status
);
3149 fSets
->addElement(fGL
, status
);
3150 fSets
->addElement(fCB
, status
);
3151 fSets
->addElement(fSP
, status
);
3152 fSets
->addElement(fB2
, status
);
3153 fSets
->addElement(fBA
, status
);
3154 fSets
->addElement(fBB
, status
);
3155 fSets
->addElement(fHY
, status
);
3156 fSets
->addElement(fH2
, status
);
3157 fSets
->addElement(fH3
, status
);
3158 fSets
->addElement(fCL
, status
);
3159 fSets
->addElement(fCP
, status
);
3160 fSets
->addElement(fEX
, status
);
3161 fSets
->addElement(fIN
, status
);
3162 fSets
->addElement(fJL
, status
);
3163 fSets
->addElement(fJT
, status
);
3164 fSets
->addElement(fJV
, status
);
3165 fSets
->addElement(fNS
, status
);
3166 fSets
->addElement(fOP
, status
);
3167 fSets
->addElement(fQU
, status
);
3168 fSets
->addElement(fIS
, status
);
3169 fSets
->addElement(fNU
, status
);
3170 fSets
->addElement(fPO
, status
);
3171 fSets
->addElement(fPR
, status
);
3172 fSets
->addElement(fSY
, status
);
3173 fSets
->addElement(fAI
, status
);
3174 fSets
->addElement(fAL
, status
);
3175 fSets
->addElement(fHL
, status
);
3176 fSets
->addElement(fID
, status
);
3177 fSets
->addElement(fWJ
, status
);
3178 fSets
->addElement(fRI
, status
);
3179 fSets
->addElement(fSG
, status
);
3180 fSets
->addElement(fEB
, status
);
3181 fSets
->addElement(fEM
, status
);
3182 fSets
->addElement(fZJ
, status
);
3185 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
3186 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
3187 "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
3188 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
3189 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
3190 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
3192 fNumberMatcher
= new RegexMatcher(
3193 UnicodeString(rules
, -1, US_INV
), 0, status
);
3195 fCharBI
= BreakIterator::createCharacterInstance(Locale::getEnglish(), status
);
3197 if (U_FAILURE(status
)) {
3198 deferredStatus
= status
;
3203 void RBBILineMonkey::setText(const UnicodeString
&s
) {
3205 fCharBI
->setText(s
);
3206 fNumberMatcher
->reset(s
);
3211 // Line Break TR rules 9 and 10 implementation.
3212 // This deals with combining marks and other sequences
3213 // that must be treated as if they were something other than what they actually are.
3215 // This is factored out into a separate function because it must be applied twice for
3216 // each potential break, once to the chars before the position being checked, then
3217 // again to the text following the possible break.
// Applies the LB 9 / LB 10 combining-mark adjustments around one character.
//   pos      - index of the character being adjusted (see the note on the
//              invalid initial position below).
//   posChar  - in/out: the character at pos; replaced by 0x41 ('A') when it
//              is itself a combining mark (LB 10).
//   nextPos  - in: index of the character after pos; read into nPos, which is
//              then advanced over any following CM characters.
//   nextChar - out: set to the character found at the final nPos.
3219 void RBBILineMonkey::rule9Adjust(int32_t pos
, UChar32
*posChar
, int32_t *nextPos
, UChar32
*nextChar
) {
3221 // Invalid initial position. Happens during the warmup iteration of the
3222 // main loop in next().
3226 int32_t nPos
= *nextPos
;
3228 // LB 9 Keep combining sequences together.
3229 // advance over any CM class chars. Note that Line Break CM is different
3230 // from the normal Grapheme Extend property.
// Combining marks are only absorbed when the base character is not one of
// SP, BK, CR (0x0d), LF (0x0a), NL or ZW.
3231 if (!(fSP
->contains(*posChar
) || fBK
->contains(*posChar
) || *posChar
==0x0d ||
3232 *posChar
==0x0a ||fNL
->contains(*posChar
) || fZW
->contains(*posChar
))) {
3234 *nextChar
= fText
->char32At(nPos
);
3235 if (!fCM
->contains(*nextChar
)) {
3238 nPos
= fText
->moveIndex32(nPos
, 1);
3243 // LB 9 Treat X CM* as if it were x.
3244 // No explicit action required.
3246 // LB 10 Treat any remaining combining mark as AL
3247 if (fCM
->contains(*posChar
)) {
3248 *posChar
= 0x41; // thisChar = 'A';
3251 // Push the updated nextPos and nextChar back to our caller.
3252 // This only makes a difference if posChar got bigger by consuming a
3253 // combining sequence.
3255 *nextChar
= fText
->char32At(nPos
);
// Reference ("monkey") implementation of line breaking: starting from
// startPos, walks the text one position at a time and tests the candidate
// break between prevChar and thisChar against the LB-numbered rules named in
// the comments below, in rule order.
// Callers in this file loop on the result until it equals
// BreakIterator::DONE (RunMonkey compares it against -1).
// The checks are order-dependent: an earlier rule that matches decides the
// outcome for that position before later rules are consulted.
3260 int32_t RBBILineMonkey::next(int32_t startPos
) {
3261 UErrorCode status
= U_ZERO_ERROR
;
3262 int32_t pos
; // Index of the char following a potential break position
3263 UChar32 thisChar
; // Character at above position "pos"
3265 int32_t prevPos
; // Index of the char preceding a potential break position
3266 UChar32 prevChar
; // Character at above position. Note that prevChar
3267 // and thisChar may not be adjacent because combining
3268 // characters between them will be ignored.
3270 int32_t prevPosX2
; // Second previous character. Wider context for LB21a.
3273 int32_t nextPos
; // Index of the next character following pos.
3274 // Usually skips over combining marks.
3275 int32_t nextCPPos
; // Index of the code point following "pos."
3276 // May point to a combining mark.
3277 int32_t tPos
; // temp value.
// Guard: an error recorded while the monkey was being constructed.
3280 if (U_FAILURE(deferredStatus
)) {
// Guard: the start position is at or beyond the end of the text.
3284 if (startPos
>= fText
->length()) {
3289 // Initial values for loop. Loop will run the first time without finding breaks,
3290 // while the invalid values shift out and the "this" and
3291 // "prev" positions are filled in with good values.
3292 pos
= prevPos
= prevPosX2
= -1; // Invalid value, serves as flag for initial loop iteration.
3293 thisChar
= prevChar
= prevCharX2
= 0;
3294 nextPos
= nextCPPos
= startPos
;
3297 // Loop runs once per position in the test text, until a break position
// Shift the sliding window: prev -> prevX2, this -> prev, then load the
// character at pos.
3300 prevPosX2
= prevPos
;
3301 prevCharX2
= prevChar
;
3304 prevChar
= thisChar
;
3307 thisChar
= fText
->char32At(pos
);
3309 nextCPPos
= fText
->moveIndex32(pos
, 1);
3310 nextPos
= nextCPPos
;
3312 // Rule LB2 - Break at end of text.
3313 if (pos
>= fText
->length()) {
3317 // Rule LB 9 - adjust for combining sequences.
3318 // We do this one out-of-order because the adjustment does not change anything
3319 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
// rule9Adjust is applied twice: once around prevChar/thisChar and once
// around thisChar and the character that follows it.
3321 rule9Adjust(prevPos
, &prevChar
, &pos
, &thisChar
);
3322 nextCPPos
= nextPos
= fText
->moveIndex32(pos
, 1);
3323 c
= fText
->char32At(nextPos
);
3324 rule9Adjust(pos
, &thisChar
, &nextPos
, &c
);
3326 // If the loop is still warming up - if we haven't shifted the initial
3327 // -1 positions out of prevPos yet - loop back to advance the
3328 // position in the input without any further looking for breaks.
3329 if (prevPos
== -1) {
3333 // LB 4 Always break after hard line breaks,
3334 if (fBK
->contains(prevChar
)) {
3338 // LB 5 Break after CR, LF, NL, but not inside CR LF
3339 if (prevChar
== 0x0d && thisChar
== 0x0a) {
3342 if (prevChar
== 0x0d ||
3348 // LB 6 Don't break before hard line breaks
3349 if (thisChar
== 0x0d || thisChar
== 0x0a || thisChar
== 0x85 ||
3350 fBK
->contains(thisChar
)) {
3355 // LB 7 Don't break before spaces or zero-width space.
3356 if (fSP
->contains(thisChar
)) {
3360 if (fZW
->contains(thisChar
)) {
3364 // LB 8 Break after zero width space
3365 if (fZW
->contains(prevChar
)) {
3370 // The monkey test's way of ignoring combining characters doesn't work
3371 // for this rule. ZJ is also a CM. Need to get the actual character
3372 // preceding "thisChar", not ignoring combining marks, possibly ZJ.
3374 int32_t prevIdx
= fText
->moveIndex32(pos
, -1);
3375 UChar32 prevC
= fText
->char32At(prevIdx
);
3376 if (fZJ
->contains(prevC
) && fID
->contains(thisChar
)) {
3381 // LB 9, 10 Already done, at top of loop.
3385 // LB 11 Do not break before or after WORD JOINER and related characters.
3389 if (fWJ
->contains(thisChar
) || fWJ
->contains(prevChar
)) {
// GL immediately before the candidate position
// (rule label not visible in this excerpt - presumably LB 12; confirm).
3395 if (fGL
->contains(prevChar
)) {
// GL after the candidate position, unless preceded by SP, BA or HY
// (presumably LB 12a; confirm).
3401 if (!(fSP
->contains(prevChar
) ||
3402 fBA
->contains(prevChar
) ||
3403 fHY
->contains(prevChar
) ) && fGL
->contains(thisChar
)) {
3409 // LB 13 Don't break before closings.
3410 // NU x CL, NU x CP and NU x IS are not matched here so that they will
3411 // fall into LB 17 and the more general number regular expression.
3413 if ((!fNU
->contains(prevChar
) && fCL
->contains(thisChar
)) ||
3414 (!fNU
->contains(prevChar
) && fCP
->contains(thisChar
)) ||
3415 fEX
->contains(thisChar
) ||
3416 (!fNU
->contains(prevChar
) && fIS
->contains(thisChar
)) ||
3417 (!fNU
->contains(prevChar
) && fSY
->contains(thisChar
))) {
3421 // LB 14 Don't break after OP SP*
3422 // Scan backwards, checking for this sequence.
3423 // The OP char could include combining marks, so we actually check for
3425 // Another Twist: The Rule 67 fixes may have changed a SP CM
3426 // sequence into a ID char, so before scanning back through spaces,
3427 // verify that prevChar is indeed a space. The prevChar variable
3428 // may differ from fText[prevPos]
3430 if (fSP
->contains(prevChar
)) {
3431 while (tPos
> 0 && fSP
->contains(fText
->char32At(tPos
))) {
3432 tPos
=fText
->moveIndex32(tPos
, -1);
3435 while (tPos
> 0 && fCM
->contains(fText
->char32At(tPos
))) {
3436 tPos
=fText
->moveIndex32(tPos
, -1);
3438 if (fOP
->contains(fText
->char32At(tPos
))) {
3443 // LB 15 QU SP* x OP
3444 if (fOP
->contains(thisChar
)) {
3445 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3447 while (tPos
>0 && fSP
->contains(fText
->char32At(tPos
))) {
3448 tPos
= fText
->moveIndex32(tPos
, -1);
3450 while (tPos
>0 && fCM
->contains(fText
->char32At(tPos
))) {
3451 tPos
= fText
->moveIndex32(tPos
, -1);
3453 if (fQU
->contains(fText
->char32At(tPos
))) {
3460 // LB 16 (CL | CP) SP* x NS
3461 // Scan backwards for SP* CM* (CL | CP)
3462 if (fNS
->contains(thisChar
)) {
3464 while (tPos
>0 && fSP
->contains(fText
->char32At(tPos
))) {
3465 tPos
= fText
->moveIndex32(tPos
, -1);
3467 while (tPos
>0 && fCM
->contains(fText
->char32At(tPos
))) {
3468 tPos
= fText
->moveIndex32(tPos
, -1);
3470 if (fCL
->contains(fText
->char32At(tPos
)) || fCP
->contains(fText
->char32At(tPos
))) {
3476 // LB 17 B2 SP* x B2
3477 if (fB2
->contains(thisChar
)) {
3478 // Scan backwards, checking for the B2 CM* SP* sequence.
3480 if (fSP
->contains(prevChar
)) {
3481 while (tPos
> 0 && fSP
->contains(fText
->char32At(tPos
))) {
3482 tPos
=fText
->moveIndex32(tPos
, -1);
3485 while (tPos
> 0 && fCM
->contains(fText
->char32At(tPos
))) {
3486 tPos
=fText
->moveIndex32(tPos
, -1);
3488 if (fB2
->contains(fText
->char32At(tPos
))) {
3494 // LB 18 break after space
3495 if (fSP
->contains(prevChar
)) {
// QU on either side of the candidate position
// (rule label not visible in this excerpt - presumably LB 19; confirm).
3502 if (fQU
->contains(thisChar
) || fQU
->contains(prevChar
)) {
3506 // LB 20 Break around a CB
3507 if (fCB
->contains(thisChar
) || fCB
->contains(prevChar
)) {
// BA, HY or NS after the position, or BB before it
// (presumably LB 21; confirm).
3512 if (fBA
->contains(thisChar
) ||
3513 fHY
->contains(thisChar
) ||
3514 fNS
->contains(thisChar
) ||
3515 fBB
->contains(prevChar
) ) {
// HL followed by HY or BA: uses the second-previous character
// (the "wider context for LB21a" mentioned at the prevPosX2 declaration).
3521 if (fHL
->contains(prevCharX2
) &&
3522 (fHY
->contains(prevChar
) || fBA
->contains(prevChar
))) {
3528 if (fSY
->contains(prevChar
) && fHL
->contains(thisChar
)) {
// (AL | EX | HL | ID | IN | NU) followed by IN.
3533 if ((fAL
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3534 (fEX
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3535 (fHL
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3536 (fID
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3537 (fIN
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3538 (fNU
->contains(prevChar
) && fIN
->contains(thisChar
)) ) {
// ID x PO, and letters (AL | HL) adjacent to digits (NU) in either order.
3547 if ((fID
->contains(prevChar
) && fPO
->contains(thisChar
)) ||
3548 (fAL
->contains(prevChar
) && fNU
->contains(thisChar
)) ||
3549 (fHL
->contains(prevChar
) && fNU
->contains(thisChar
)) ||
3550 (fNU
->contains(prevChar
) && fAL
->contains(thisChar
)) ||
3551 (fNU
->contains(prevChar
) && fHL
->contains(thisChar
)) ) {
3555 // LB 24 Do not break between prefix and letters or ideographs.
3559 // (AL | HL) x PR // Apple early addition
3560 // (AL | HL) x PO // Apple early addition
3561 if ((fPR
->contains(prevChar
) && fID
->contains(thisChar
)) ||
3562 (fPR
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) ||
3563 (fPO
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) ||
3564 ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
)) && fPR
->contains(thisChar
)) ||
3565 ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
)) && fPO
->contains(thisChar
)) ) {
// Number handling: try the number regex (fNumberMatcher) anchored at prevPos.
3572 if (fNumberMatcher
->lookingAt(prevPos
, status
)) {
3573 if (U_FAILURE(status
)) {
3576 // Matched a number. But could have been just a single digit, which would
3577 // not represent a "no break here" between prevChar and thisChar
3578 int32_t numEndIdx
= fNumberMatcher
->end(status
); // idx of first char following num
3579 if (numEndIdx
> pos
) {
3580 // Number match includes at least our two chars being checked
3581 if (numEndIdx
> nextPos
) {
3582 // Number match includes additional chars. Update pos and nextPos
3583 // so that next loop iteration will continue at the end of the number,
3584 // checking for breaks between last char in number & whatever follows.
3585 pos
= nextPos
= numEndIdx
;
// Step pos back over trailing combining marks so that thisChar is the
// last non-CM character of the matched number.
3587 pos
= fText
->moveIndex32(pos
, -1);
3588 thisChar
= fText
->char32At(pos
);
3589 } while (fCM
->contains(thisChar
));
3596 // LB 26 Do not break a Korean syllable.
3597 if (fJL
->contains(prevChar
) && (fJL
->contains(thisChar
) ||
3598 fJV
->contains(thisChar
) ||
3599 fH2
->contains(thisChar
) ||
3600 fH3
->contains(thisChar
))) {
3604 if ((fJV
->contains(prevChar
) || fH2
->contains(prevChar
)) &&
3605 (fJV
->contains(thisChar
) || fJT
->contains(thisChar
))) {
3609 if ((fJT
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3610 fJT
->contains(thisChar
)) {
3614 // LB 27 Treat a Korean Syllable Block the same as ID.
3615 if ((fJL
->contains(prevChar
) || fJV
->contains(prevChar
) ||
3616 fJT
->contains(prevChar
) || fH2
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3617 fIN
->contains(thisChar
)) {
3620 if ((fJL
->contains(prevChar
) || fJV
->contains(prevChar
) ||
3621 fJT
->contains(prevChar
) || fH2
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3622 fPO
->contains(thisChar
)) {
3625 if (fPR
->contains(prevChar
) && (fJL
->contains(thisChar
) || fJV
->contains(thisChar
) ||
3626 fJT
->contains(thisChar
) || fH2
->contains(thisChar
) || fH3
->contains(thisChar
))) {
3632 // LB 28 Do not break between alphabetics ("at").
3633 if ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
)) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) {
3637 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
3638 if (fIS
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) {
3642 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3645 if ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
) || fNU
->contains(prevChar
)) && fOP
->contains(thisChar
)) {
3648 if (fCP
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
) || fNU
->contains(thisChar
))) {
3652 // LB30a RI RI <break> RI
// The three-RI test must come before the two-RI test that follows it.
3654 if (fRI
->contains(prevCharX2
) && fRI
->contains(prevChar
) && fRI
->contains(thisChar
)) {
3657 if (fRI
->contains(prevChar
) && fRI
->contains(thisChar
)) {
3661 // LB30b Emoji Base x Emoji Modifier
3662 if (fEB
->contains(prevChar
) && fEM
->contains(thisChar
)) {
3666 // LB 31 Break everywhere else
// Returns a pointer to a UVector; the body is not visible in this excerpt.
// NOTE(review): RunMonkey treats the elements of a monkey's charClasses()
// result as UnicodeSet* - presumably that applies to this class too; confirm.
3675 UVector
*RBBILineMonkey::charClasses() {
// Destructor. The part visible here releases the number regex matcher
// (fNumberMatcher) that the constructor allocated with new.
3680 RBBILineMonkey::~RBBILineMonkey() {
3727 delete fNumberMatcher
;
3731 //-------------------------------------------------------------------------------------------
3736 // seed=nnnnn Random number starting seed.
3737 // Setting the seed allows errors to be reproduced.
3738 // loop=nnn Looping count. Controls running time.
3740 // 0 or greater: run length.
3742 // type = char | word | line | sent | title
3745 // intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3747 //-------------------------------------------------------------------------------------------
// Extracts an integer option of the form "name = value" from a parameter
// string.
//   name       - option name; the pattern " *= *(-?\d+)" is appended to it to
//                build the regular expression that is matched against params.
//   params     - the option string; the matched option text is removed from
//                it (replaceFirst with an empty string).
//   defaultVal - initial value of the result (val).
// The digits captured by group 1 are copied into a 100-byte buffer (the copy
// length is clamped to fit) and converted with strtol, base 10.
// NOTE(review): the second parameter's declarator reads "¶ms" in this copy of
// the file; the body uses `params`, so this is presumably a mis-encoded
// reference declarator for `params` - confirm against the upstream source.
3749 static int32_t getIntParam(UnicodeString name
, UnicodeString
¶ms
, int32_t defaultVal
) {
3750 int32_t val
= defaultVal
;
3751 name
.append(" *= *(-?\\d+)");
3752 UErrorCode status
= U_ZERO_ERROR
;
3753 RegexMatcher
m(name
, params
, 0, status
);
3755 // The param exists. Convert the string to an int.
3756 char valString
[100];
3757 int32_t paramLength
= m
.end(1, status
) - m
.start(1, status
);
// Clamp so that the digits plus a terminator fit in valString.
3758 if (paramLength
>= (int32_t)(sizeof(valString
)-1)) {
3759 paramLength
= (int32_t)(sizeof(valString
)-2);
3761 params
.extract(m
.start(1, status
), paramLength
, valString
, sizeof(valString
));
3762 val
= strtol(valString
, NULL
, 10);
3764 // Delete this parameter from the params string.
3766 params
= m
.replaceFirst("", status
);
3768 U_ASSERT(U_SUCCESS(status
));
3773 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
// Cross-checks a break iterator (bi) against a list of expected boundary
// positions (expected / expectedcount) for the string ustr, reporting
// failures through test->errln(). Four passes are visible:
//   1. forward iteration with first()/next() compared with expected[],
//   2. isBoundary() at each expected position and at every index in between,
//   3. backward iteration with last()/previous() compared with the positions
//      recorded in forward[],
//   4. preceding(j) for every j after expected[i] up to expected[i + 1].
// The remainder of the parameter list is not visible in this excerpt.
3774 static void testBreakBoundPreceding(RBBITest
*test
, UnicodeString ustr
,
3783 for (i
= bi
->first(); i
!= BreakIterator::DONE
; i
= bi
->next()) {
3785 if (count
< expectedcount
&& expected
[count
] != i
) {
3786 test
->errln("break forward test failed: expected %d but got %d",
3787 expected
[count
], i
);
3792 if (count
!= expectedcount
) {
3793 printStringBreaks(ustr
, expected
, expectedcount
);
3794 test
->errln("break forward test failed: missed %d match",
3795 expectedcount
- count
);
3798 // testing boundaries
3799 for (i
= 1; i
< expectedcount
; i
++) {
3800 int j
= expected
[i
- 1];
3801 if (!bi
->isBoundary(j
)) {
3802 printStringBreaks(ustr
, expected
, expectedcount
);
3803 test
->errln("isBoundary() failed. Expected boundary at position %d", j
);
// Every index strictly between two expected boundaries must not be a boundary.
3806 for (j
= expected
[i
- 1] + 1; j
< expected
[i
]; j
++) {
3807 if (bi
->isBoundary(j
)) {
3808 printStringBreaks(ustr
, expected
, expectedcount
);
3809 test
->errln("isBoundary() failed. Not expecting boundary at position %d", j
);
3815 for (i
= bi
->last(); i
!= BreakIterator::DONE
; i
= bi
->previous()) {
3817 if (forward
[count
] != i
) {
3818 printStringBreaks(ustr
, expected
, expectedcount
);
3819 test
->errln("happy break test previous() failed: expected %d but got %d",
3825 printStringBreaks(ustr
, expected
, expectedcount
);
3826 test
->errln("break test previous() failed: missed a match");
3830 // testing preceding
3831 for (i
= 0; i
< expectedcount
- 1; i
++) {
3832 // int j = expected[i] + 1;
// Start one code point (not one code unit) past expected[i].
3833 int j
= ustr
.moveIndex32(expected
[i
], 1);
3834 for (; j
<= expected
[i
+ 1]; j
++) {
3835 if (bi
->preceding(j
) != expected
[i
]) {
3836 printStringBreaks(ustr
, expected
, expectedcount
);
3837 test
->errln("preceding(): Not expecting boundary at position %d", j
);
// Word-break regression test over a fixed list of escaped strings.
// For each entry: the string is unescaped with CharsToUnicodeString(), the
// expected boundaries are produced by stepping an RBBIWordMonkey from 0 until
// BreakIterator::DONE, and the English word break iterator is then verified
// against them with testBreakBoundPreceding().
3845 void RBBITest::TestWordBreaks(void)
3847 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3849 Locale
locale("en");
3850 UErrorCode status
= U_ZERO_ERROR
;
3851 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3852 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
3853 // Replaced any C+J characters in a row with a random sequence of characters
3854 // of the same length to make our C+J segmentation not get in the way.
3855 static const char *strlist
[] =
3857 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3858 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3859 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3860 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3861 "\\uac00\\u3588\\u009c\\u0953\\u194b",
3862 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3863 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3864 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3865 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3866 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3867 "\\u2027\\U000e0067\\u0a47\\u00b7",
3868 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3869 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3870 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3871 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3872 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3873 "\\u0027\\u11af\\U000e0057\\u0602",
// NOTE(review): the second escape in the next entry (U000e007) has only seven
// hex digits before the following escape begins; the entry two lines below
// carries the eight-digit form U000e007d. Looks unintentional - confirm.
3874 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3875 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3876 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3877 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3878 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3879 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3880 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3881 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3882 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3883 "\\u18f4\\U000e0049\\u20e7\\u2027",
3884 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3885 "\\ua183\\u102d\\u0bec\\u003a",
3886 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3887 "\\u003a\\u0e57\\u0fad\\u002e",
3888 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3889 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3890 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3891 "\\u003a\\u0664\\u00b7\\u1fba",
3892 "\\u003b\\u0027\\u00b7\\u47a3",
3893 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3894 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3895 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
// Report a failure to create the word break iterator.
3898 if (U_FAILURE(status
)) {
3899 errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
));
3902 for (loop
= 0; loop
< UPRV_LENGTHOF(strlist
); loop
++) {
3903 // printf("looping %d\n", loop);
3904 UnicodeString ustr
= CharsToUnicodeString(strlist
[loop
]);
3905 // RBBICharMonkey monkey;
3906 RBBIWordMonkey monkey
;
3909 int expectedcount
= 0;
3911 monkey
.setText(ustr
);
// Collect the monkey's boundaries, starting with position 0.
3913 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
3914 expected
[expectedcount
++] = i
;
3917 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
// Self-consistency test of the English word break iterator over a fixed list
// of escaped strings: every position returned by first()/next() must be
// accepted by isBoundary(), and every index between two consecutive returned
// positions must be rejected by it.
3923 void RBBITest::TestWordBoundary(void)
3925 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3926 Locale
locale("en");
3927 UErrorCode status
= U_ZERO_ERROR
;
3928 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3929 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
3931 static const char *strlist
[] =
3933 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3934 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3935 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3936 "\\u2027\\U000e0067\\u0a47\\u00b7",
3937 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3938 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3939 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3940 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3941 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3942 "\\u0027\\u11af\\U000e0057\\u0602",
// NOTE(review): the second escape in the next entry (U000e007) has only seven
// hex digits before the following escape begins; the entry two lines below
// carries the eight-digit form U000e007d. Looks unintentional - confirm.
3943 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3944 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3945 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3946 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3947 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3948 "\\U000e0065\\u302c\\u09ee\\U000e0068",
3949 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3950 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3951 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3952 "\\u58f4\\U000e0049\\u20e7\\u2027",
3953 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3954 "\\ua183\\u102d\\u0bec\\u003a",
3955 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3956 "\\u003a\\u0e57\\u0fad\\u002e",
3957 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3958 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3959 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3960 "\\u003a\\u0664\\u00b7\\u1fba",
3961 "\\u003b\\u0027\\u00b7\\u47a3",
3964 if (U_FAILURE(status
)) {
3965 errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
));
3968 for (loop
= 0; loop
< UPRV_LENGTHOF(strlist
); loop
++) {
3969 // printf("looping %d\n", loop);
// NOTE(review): the destination capacity is the literal 20; the declaration
// of str is not visible here, so confirm that it matches the buffer size and
// that no entry needs more than 20 UChars including the terminator.
3970 u_unescape(strlist
[loop
], str
, 20);
3971 UnicodeString
ustr(str
);
3978 for (i
= bi
->first(); i
!= BreakIterator::DONE
; i
= bi
->next()) {
3979 forward
[count
++] = i
;
// No index strictly between the previous boundary and this one may be a boundary.
3982 for (j
= prev
+ 1; j
< i
; j
++) {
3983 if (bi
->isBoundary(j
)) {
3984 printStringBreaks(ustr
, forward
, count
);
3985 errln("happy boundary test failed: expected %d not a boundary",
// The position just returned by the iterator must itself be a boundary.
3991 if (!bi
->isBoundary(i
)) {
3992 printStringBreaks(ustr
, forward
, count
);
3993 errln("happy boundary test failed: expected %d a boundary",
// Line-break regression test over a fixed list of escaped strings.
// For each entry: the string is unescaped into str (capacity STRSIZE), the
// expected boundaries are produced by stepping an RBBILineMonkey from 0 until
// BreakIterator::DONE (at most EXPECTEDSIZE positions are stored), and the
// English line break iterator is verified against them with
// testBreakBoundPreceding().
4003 void RBBITest::TestLineBreaks(void)
4005 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4006 Locale
locale("en");
4007 UErrorCode status
= U_ZERO_ERROR
;
4008 BreakIterator
*bi
= BreakIterator::createLineInstance(locale
, status
);
4009 const int32_t STRSIZE
= 50;
4011 static const char *strlist
[] =
4013 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
4014 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
4015 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
4016 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
4017 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
4018 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
4019 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
4020 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
4021 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
4022 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
4023 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
4024 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
4025 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
4026 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
4027 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
4028 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
4029 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
4030 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
4031 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
4032 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
4033 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
4034 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
4035 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
4036 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
4037 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
4038 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
4039 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
4040 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
4041 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
4042 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
4043 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
4044 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
4045 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
4046 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
4047 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
4048 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
4049 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
4050 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
4051 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
4052 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
// Report a failure to create the line break iterator.
4055 TEST_ASSERT_SUCCESS(status
);
4056 if (U_FAILURE(status
)) {
4059 for (loop
= 0; loop
< UPRV_LENGTHOF(strlist
); loop
++) {
4060 // printf("looping %d\n", loop);
// t receives the return value of u_unescape; how it is used is not visible here.
4061 int32_t t
= u_unescape(strlist
[loop
], str
, STRSIZE
);
4068 UnicodeString
ustr(str
);
4069 RBBILineMonkey monkey
;
// The monkey records construction errors in deferredStatus.
4070 if (U_FAILURE(monkey
.deferredStatus
)) {
4074 const int EXPECTEDSIZE
= 50;
4075 int expected
[EXPECTEDSIZE
];
4076 int expectedcount
= 0;
4078 monkey
.setText(ustr
);
4080 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
// Guard against writing past the end of expected[]; the TEST_ASSERT
// reports the overflow.
4081 if (expectedcount
>= EXPECTEDSIZE
) {
4082 TEST_ASSERT(expectedcount
< EXPECTEDSIZE
);
4085 expected
[expectedcount
++] = i
;
4088 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
// Sentence-break regression test over a fixed list of strings.
// For each entry: the string is unescaped into str, the expected boundaries
// are produced by stepping an RBBISentMonkey from 0 until
// BreakIterator::DONE (at most EXPECTEDSIZE positions are stored), and the
// English sentence break iterator is verified against them with
// testBreakBoundPreceding().
4094 void RBBITest::TestSentBreaks(void)
4096 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4097 Locale
locale("en");
4098 UErrorCode status
= U_ZERO_ERROR
;
4099 BreakIterator
*bi
= BreakIterator::createSentenceInstance(locale
, status
);
4101 static const char *strlist
[] =
4103 "Now\ris\nthe\r\ntime\n\rfor\r\r",
4105 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
4106 "\"Sentence ending with a quote.\" Bye.",
4107 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
4108 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
4109 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
4110 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
4111 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
4112 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
4113 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
4114 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
4115 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
4116 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
4117 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
4118 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
4119 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
4120 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
4121 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
4122 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
// Report a failure to create the sentence break iterator.
4125 if (U_FAILURE(status
)) {
4126 errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
));
4129 for (loop
= 0; loop
< UPRV_LENGTHOF(strlist
); loop
++) {
// Capacity is taken from the buffer itself rather than a literal.
4130 u_unescape(strlist
[loop
], str
, UPRV_LENGTHOF(str
));
4131 UnicodeString
ustr(str
);
4133 RBBISentMonkey monkey
;
// The monkey records construction errors in deferredStatus.
4134 if (U_FAILURE(monkey
.deferredStatus
)) {
4138 const int EXPECTEDSIZE
= 50;
4139 int expected
[EXPECTEDSIZE
];
4140 int expectedcount
= 0;
4142 monkey
.setText(ustr
);
4144 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
// Guard against writing past the end of expected[]; the TEST_ASSERT
// reports the overflow.
4145 if (expectedcount
>= EXPECTEDSIZE
) {
4146 TEST_ASSERT(expectedcount
< EXPECTEDSIZE
);
4149 expected
[expectedcount
++] = i
;
4152 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
// Entry point for the randomized ("monkey") break iterator tests.
// params is an option string: "loop=" and "seed=" are read with
// getIntParam(), "type=" (char | word | line | sent | title) selects the
// break type, and "utext" is also recognized. Each recognized option is
// removed from the string as it is processed; anything non-blank that is
// left over is reported as an error.
// For each selected break type an English break iterator is created and
// handed to RunMonkey(); with the default type "all" every branch below runs.
4158 void RBBITest::TestMonkey(char *params
) {
4159 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4161 UErrorCode status
= U_ZERO_ERROR
;
4162 int32_t loopCount
= 500;
4164 UnicodeString breakType
= "all";
4165 Locale
locale("en");
4166 UBool useUText
= FALSE
;
4168 if (quick
== FALSE
) {
4173 UnicodeString
p(params
);
4174 loopCount
= getIntParam("loop", p
, loopCount
);
4175 seed
= getIntParam("seed", p
, seed
);
4177 RegexMatcher
m(" *type *= *(char|word|line|sent|title) *", p
, 0, status
);
4179 breakType
= m
.group(1, status
);
4181 p
= m
.replaceFirst("", status
);
4184 RegexMatcher
u(" *utext", p
, 0, status
);
4188 p
= u
.replaceFirst("", status
);
4193 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p
, 0, status
).find()) {
4194 // Each option is stripped out of the option string as it is processed.
4195 // All options have been checked. The option string should have been completely emptied.
4197 p
.extract(buf
, sizeof(buf
), NULL
, status
);
// Force termination in case the extracted text filled the buffer.
4198 buf
[sizeof(buf
)-1] = 0;
4199 errln("Unrecognized or extra parameter: %s\n", buf
);
4205 if (breakType
== "char" || breakType
== "all") {
4207 BreakIterator
*bi
= BreakIterator::createCharacterInstance(locale
, status
);
4208 if (U_SUCCESS(status
)) {
4209 RunMonkey(bi
, m
, "char", seed
, loopCount
, useUText
);
4210 if (breakType
== "all" && useUText
==FALSE
) {
4211 // Also run a quick test with UText when "all" is specified
4212 RunMonkey(bi
, m
, "char", seed
, loopCount
, TRUE
);
4216 errcheckln(status
, "Creation of character break iterator failed %s", u_errorName(status
));
4221 if (breakType
== "word" || breakType
== "all") {
4222 logln("Word Break Monkey Test");
4224 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
4225 if (U_SUCCESS(status
)) {
4226 RunMonkey(bi
, m
, "word", seed
, loopCount
, useUText
);
4229 errcheckln(status
, "Creation of word break iterator failed %s", u_errorName(status
));
4234 if (breakType
== "line" || breakType
== "all") {
4235 logln("Line Break Monkey Test");
4237 BreakIterator
*bi
= BreakIterator::createLineInstance(locale
, status
);
// NOTE(review): loopCount is modified in place, so when breakType is "all"
// this division and the one in the sentence branch below compound.
4238 if (loopCount
>= 10) {
4239 loopCount
= loopCount
/ 5; // Line break runs slower than the others.
4241 if (U_SUCCESS(status
)) {
4242 RunMonkey(bi
, m
, "line", seed
, loopCount
, useUText
);
4245 errcheckln(status
, "Creation of line break iterator failed %s", u_errorName(status
));
4250 if (breakType
== "sent" || breakType
== "all" ) {
4251 logln("Sentence Break Monkey Test");
4253 BreakIterator
*bi
= BreakIterator::createSentenceInstance(locale
, status
);
4254 if (loopCount
>= 10) {
4255 loopCount
= loopCount
/ 10; // Sentence runs slower than the other break types
4257 if (U_SUCCESS(status
)) {
4258 RunMonkey(bi
, m
, "sentence", seed
, loopCount
, useUText
);
// NOTE(review): this is the sentence branch, but the message below says
// "line break iterator" - looks like a copy/paste slip; confirm and correct
// the message text.
4261 errcheckln(status
, "Creation of line break iterator failed %s", u_errorName(status
));
4270 // Run a RBBI monkey test. Common routine, for all break iterator types.
4272 // bi - the break iterator to use
4273 // mk - MonkeyKind, abstraction for obtaining expected results
4274 // name - Name of test (char, word, etc.) for use in error messages
4275 // seed - Seed for starting random number generator (parameter from user)
4278 void RBBITest::RunMonkey(BreakIterator
*bi
, RBBIMonkeyKind
&mk
, const char *name
, uint32_t seed
,
4279 int32_t numIterations
, UBool useUText
) {
4281 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4283 const int32_t TESTSTRINGLEN
= 500;
4284 UnicodeString testText
;
4285 int32_t numCharClasses
;
4287 int expected
[TESTSTRINGLEN
*2 + 1];
4288 int expectedCount
= 0;
4289 char expectedBreaks
[TESTSTRINGLEN
*2 + 1];
4290 char forwardBreaks
[TESTSTRINGLEN
*2 + 1];
4291 char reverseBreaks
[TESTSTRINGLEN
*2+1];
4292 char isBoundaryBreaks
[TESTSTRINGLEN
*2+1];
4293 char followingBreaks
[TESTSTRINGLEN
*2+1];
4294 char precedingBreaks
[TESTSTRINGLEN
*2+1];
4300 numCharClasses
= mk
.charClasses()->size();
4301 chClasses
= mk
.charClasses();
4303 // Check for errors that occurred during the construction of the MonkeyKind object.
4304 // Can't report them where they occurred because errln() is a method coming from intlTest,
4305 // and is not visible outside of RBBITest :-(
4306 if (U_FAILURE(mk
.deferredStatus
)) {
4307 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk
.deferredStatus
));
4311 // Verify that the character classes all have at least one member.
4312 for (i
=0; i
<numCharClasses
; i
++) {
4313 UnicodeSet
*s
= (UnicodeSet
*)chClasses
->elementAt(i
);
4314 if (s
== NULL
|| s
->size() == 0) {
4315 errln("Character Class #%d is null or of zero size.", i
);
4320 while (loopCount
< numIterations
|| numIterations
== -1) {
4321 if (numIterations
== -1 && loopCount
% 10 == 0) {
4322 // If test is running in an infinite loop, display a periodic tic so
4323 // we can tell that it is making progress.
4324 fprintf(stderr
, ".");
4326 // Save current random number seed, so that we can recreate the random numbers
4327 // for this loop iteration in event of an error.
4330 // Populate a test string with data.
4331 testText
.truncate(0);
4332 for (i
=0; i
<TESTSTRINGLEN
; i
++) {
4333 int32_t aClassNum
= m_rand() % numCharClasses
;
4334 UnicodeSet
*classSet
= (UnicodeSet
*)chClasses
->elementAt(aClassNum
);
4335 int32_t charIdx
= m_rand() % classSet
->size();
4336 UChar32 c
= classSet
->charAt(charIdx
);
4337 if (c
< 0) { // TODO: deal with sets containing strings.
4338 errln("%s:%d c < 0", __FILE__
, __LINE__
);
4341 // Do not assemble a supplementary character from randomly generated separate surrogates.
4342 // (It could be a dictionary character)
4343 if (U16_IS_TRAIL(c
) && testText
.length() > 0 && U16_IS_LEAD(testText
.charAt(testText
.length()-1))) {
4350 // Calculate the expected results for this test string.
4351 mk
.setText(testText
);
4352 memset(expectedBreaks
, 0, sizeof(expectedBreaks
));
4353 expectedBreaks
[0] = 1;
4354 int32_t breakPos
= 0;
4357 breakPos
= mk
.next(breakPos
);
4358 if (breakPos
== -1) {
4361 if (breakPos
> testText
.length()) {
4362 errln("breakPos > testText.length()");
4364 expectedBreaks
[breakPos
] = 1;
4365 U_ASSERT(expectedCount
<testText
.length());
4366 expected
[expectedCount
++] = breakPos
;
4367 (void)expected
; // Set but not used warning.
4368 // TODO (andy): check it out.
4371 // Find the break positions using forward iteration
4372 memset(forwardBreaks
, 0, sizeof(forwardBreaks
));
4374 UErrorCode status
= U_ZERO_ERROR
;
4375 UText
*testUText
= utext_openReplaceable(NULL
, &testText
, &status
);
4376 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4377 bi
->setText(testUText
, status
);
4378 TEST_ASSERT_SUCCESS(status
);
4379 utext_close(testUText
); // The break iterator does a shallow clone of the UText
4380 // This UText can be closed immediately, so long as the
4381 // testText string continues to exist.
4383 bi
->setText(testText
);
4386 for (i
=bi
->first(); i
!= BreakIterator::DONE
; i
=bi
->next()) {
4387 if (i
< 0 || i
> testText
.length()) {
4388 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name
);
4391 forwardBreaks
[i
] = 1;
4394 // Find the break positions using reverse iteration
4395 memset(reverseBreaks
, 0, sizeof(reverseBreaks
));
4396 for (i
=bi
->last(); i
!= BreakIterator::DONE
; i
=bi
->previous()) {
4397 if (i
< 0 || i
> testText
.length()) {
4398 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name
);
4401 reverseBreaks
[i
] = 1;
4404 // Find the break positions using isBoundary() tests.
4405 memset(isBoundaryBreaks
, 0, sizeof(isBoundaryBreaks
));
4406 U_ASSERT((int32_t)sizeof(isBoundaryBreaks
) > testText
.length());
4407 for (i
=0; i
<=testText
.length(); i
++) {
4408 isBoundaryBreaks
[i
] = bi
->isBoundary(i
);
4412 // Find the break positions using the following() function.
4414 memset(followingBreaks
, 0, sizeof(followingBreaks
));
4415 int32_t lastBreakPos
= 0;
4416 followingBreaks
[0] = 1;
4417 for (i
=0; i
<testText
.length(); i
++) {
4418 breakPos
= bi
->following(i
);
4419 if (breakPos
<= i
||
4420 breakPos
< lastBreakPos
||
4421 breakPos
> testText
.length() ||
4422 (breakPos
> lastBreakPos
&& lastBreakPos
> i
)) {
4423 errln("%s break monkey test: "
4424 "Out of range value returned by BreakIterator::following().\n"
4425 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4426 name
, seed
, i
, breakPos
, lastBreakPos
);
4429 followingBreaks
[breakPos
] = 1;
4430 lastBreakPos
= breakPos
;
4433 // Find the break positions using the preceding() function.
4434 memset(precedingBreaks
, 0, sizeof(precedingBreaks
));
4435 lastBreakPos
= testText
.length();
4436 precedingBreaks
[testText
.length()] = 1;
4437 for (i
=testText
.length(); i
>0; i
--) {
4438 breakPos
= bi
->preceding(i
);
4439 if (breakPos
>= i
||
4440 breakPos
> lastBreakPos
||
4441 (breakPos
< 0 && testText
.getChar32Start(i
)>0) ||
4442 (breakPos
< lastBreakPos
&& lastBreakPos
< testText
.getChar32Start(i
)) ) {
4443 errln("%s break monkey test: "
4444 "Out of range value returned by BreakIterator::preceding().\n"
4445 "index=%d; prev returned %d; lastBreak=%d" ,
4446 name
, i
, breakPos
, lastBreakPos
);
4447 if (breakPos
>= 0 && breakPos
< (int32_t)sizeof(precedingBreaks
)) {
4448 precedingBreaks
[i
] = 2; // Forces an error.
4451 if (breakPos
>= 0) {
4452 precedingBreaks
[breakPos
] = 1;
4454 lastBreakPos
= breakPos
;
4458 // Compare the expected and actual results.
4459 for (i
=0; i
<=testText
.length(); i
++) {
4460 const char *errorType
= NULL
;
4461 if (forwardBreaks
[i
] != expectedBreaks
[i
]) {
4462 errorType
= "next()";
4463 } else if (reverseBreaks
[i
] != forwardBreaks
[i
]) {
4464 errorType
= "previous()";
4465 } else if (isBoundaryBreaks
[i
] != expectedBreaks
[i
]) {
4466 errorType
= "isBoundary()";
4467 } else if (followingBreaks
[i
] != expectedBreaks
[i
]) {
4468 errorType
= "following()";
4469 } else if (precedingBreaks
[i
] != expectedBreaks
[i
]) {
4470 errorType
= "preceding()";
4474 if (errorType
!= NULL
) {
4475 // Format a range of the test text that includes the failure as
4476 // a data item that can be included in the rbbi test data file.
4478 // Start of the range is the last point where expected and actual results
4479 // both agreed that there was a break position.
4480 int startContext
= i
;
4483 if (startContext
==0) { break; }
4485 if (expectedBreaks
[startContext
] != 0) {
4486 if (count
== 2) break;
4491 // End of range is two expected breaks past the start position.
4492 int endContext
= i
+ 1;
4494 for (ci
=0; ci
<2; ci
++) { // Number of items to include in error text.
4496 if (endContext
>= testText
.length()) {break;}
4497 if (expectedBreaks
[endContext
-1] != 0) {
4498 if (count
== 0) break;
4505 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4506 UnicodeString errorText
= "<data>";
4507 /***if (strcmp(errorType, "next()") == 0) {
4509 endContext = testText.length();
4511 printStringBreaks(testText, expected, expectedCount);
4514 for (ci
=startContext
; ci
<endContext
;) {
4515 UnicodeString
hexChars("0123456789abcdef");
4518 c
= testText
.char32At(ci
);
4520 // This is the location of the error.
4521 errorText
.append("<?>");
4522 } else if (expectedBreaks
[ci
] != 0) {
4523 // This a non-error expected break position.
4524 errorText
.append("\\");
4527 errorText
.append("\\u");
4528 for (bn
=12; bn
>=0; bn
-=4) {
4529 errorText
.append(hexChars
.charAt((c
>>bn
)&0xf));
4532 errorText
.append("\\U");
4533 for (bn
=28; bn
>=0; bn
-=4) {
4534 errorText
.append(hexChars
.charAt((c
>>bn
)&0xf));
4537 ci
= testText
.moveIndex32(ci
, 1);
4539 errorText
.append("\\");
4540 errorText
.append("</data>\n");
4543 char charErrorTxt
[500];
4544 UErrorCode status
= U_ZERO_ERROR
;
4545 errorText
.extract(charErrorTxt
, sizeof(charErrorTxt
), NULL
, status
);
4546 charErrorTxt
[sizeof(charErrorTxt
)-1] = 0;
4547 const char *badLocale
= bi
->getLocaleID(ULOC_ACTUAL_LOCALE
, status
);
4549 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
4550 name
, badLocale
, (expectedBreaks
[i
]? "break expected but not found" : "break found but not expected"),
4551 errorType
, seed
, i
, charErrorTxt
);
4562 // Bug 5532. UTF-8 based UText fails in dictionary code.
4563 // This test checks the initial patch,
4564 // which is to just keep it from crashing. Correct word boundaries
4565 // await a proper fix to the dictionary code.
4567 void RBBITest::TestBug5532(void) {
// Opens a UTF-8 UText over the bytes below, hands it to a Thai word break
// iterator, and walks it forward checking that the boundaries keep advancing.
4568 // Text includes a mixture of Thai and Latin.
4569 const unsigned char utf8Data
[] = {
4570 0xE0u
, 0xB8u
, 0x82u
, 0xE0u
, 0xB8u
, 0xB2u
, 0xE0u
, 0xB8u
, 0xA2u
, 0xE0u
,
4571 0xB9u
, 0x80u
, 0xE0u
, 0xB8u
, 0x84u
, 0xE0u
, 0xB8u
, 0xA3u
, 0xE0u
, 0xB8u
,
4572 0xB7u
, 0xE0u
, 0xB9u
, 0x88u
, 0xE0u
, 0xB8u
, 0xADu
, 0xE0u
, 0xB8u
, 0x87u
,
4573 0xE0u
, 0xB9u
, 0x80u
, 0xE0u
, 0xB8u
, 0xA5u
, 0xE0u
, 0xB9u
, 0x88u
, 0xE0u
,
4574 0xB8u
, 0x99u
, 0xE0u
, 0xB8u
, 0x8Bu
, 0xE0u
, 0xB8u
, 0xB5u
, 0xE0u
, 0xB8u
,
4575 0x94u
, 0xE0u
, 0xB8u
, 0xB5u
, 0x20u
, 0x73u
, 0x69u
, 0x6Du
, 0x20u
, 0x61u
,
4576 0x75u
, 0x64u
, 0x69u
, 0x6Fu
, 0x2Fu
, 0x20u
, 0x4Du
, 0x4Fu
, 0x4Fu
, 0x4Eu
,
4577 0x20u
, 0x65u
, 0x63u
, 0x6Cu
, 0x69u
, 0x70u
, 0x73u
, 0x65u
, 0x20u
, 0xE0u
,
4578 0xB8u
, 0xA3u
, 0xE0u
, 0xB8u
, 0xB2u
, 0xE0u
, 0xB8u
, 0x84u
, 0xE0u
, 0xB8u
,
4579 0xB2u
, 0x20u
, 0x34u
, 0x37u
, 0x30u
, 0x30u
, 0x20u
, 0xE0u
, 0xB8u
, 0xA2u
,
4580 0xE0u
, 0xB8u
, 0xB9u
, 0xE0u
, 0xB9u
, 0x82u
, 0xE0u
, 0xB8u
, 0xA3u
, 0x00};
// The data ends with a 0x00 byte and the length passed to utext_openUTF8()
// below is -1, i.e. the length is taken from the terminator.
4582 UErrorCode status
= U_ZERO_ERROR
;
4583 UText utext
=UTEXT_INITIALIZER
;
4584 utext_openUTF8(&utext
, (const char *)utf8Data
, -1, &status
);
4585 TEST_ASSERT_SUCCESS(status
);
// Word break iterator for the Thai locale.
4587 BreakIterator
*bi
= BreakIterator::createWordInstance(Locale("th"), status
);
4588 TEST_ASSERT_SUCCESS(status
);
4589 if (U_SUCCESS(status
)) {
4590 bi
->setText(&utext
, status
);
4591 TEST_ASSERT_SUCCESS(status
);
4593 int32_t breakCount
= 0;
4594 int32_t previousBreak
= -1;
4595 for (bi
->first(); bi
->next() != BreakIterator::DONE
; breakCount
++) {
4596 // For now, just make sure that the break iterator doesn't hang.
// Each boundary must be strictly greater than the one reported before it.
4597 TEST_ASSERT(previousBreak
< bi
->current());
4598 previousBreak
= bi
->current();
// At least one boundary must have been produced by next().
4600 TEST_ASSERT(breakCount
> 0);
4603 utext_close(&utext
);
4607 void RBBITest::TestBug9983(void) {
// Walks a short test string backwards with two word break iterators (root
// locale and en_US_POSIX) and expects the same iteration count from both.
// The literal below is written with escapes and converted by unescape().
4608 UnicodeString text
= UnicodeString("\\u002A" // * Other
4610 "\\u309C" // Katakana
4614 "\\u0000").unescape();
4616 UErrorCode status
= U_ZERO_ERROR
;
// Word break iterator for the root locale.
4617 LocalPointer
<RuleBasedBreakIterator
> brkiter(static_cast<RuleBasedBreakIterator
*>(
4618 BreakIterator::createWordInstance(Locale::getRoot(), status
)));
4619 TEST_ASSERT_SUCCESS(status
);
// Word break iterator for the en_US_POSIX locale.
4620 LocalPointer
<RuleBasedBreakIterator
> brkiterPOSIX(static_cast<RuleBasedBreakIterator
*>(
4621 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status
)));
4622 TEST_ASSERT_SUCCESS(status
);
4623 if (U_FAILURE(status
)) {
4626 int32_t offset
, rstatus
, iterationCount
;
4628 brkiter
->setText(text
);
// Step backwards with previous() until UBRK_DONE is returned.
4631 while ( (offset
= brkiter
->previous()) != UBRK_DONE
) {
4633 rstatus
= brkiter
->getRuleStatus();
4634 (void)rstatus
; // Suppress set but not used warning.
// NOTE(review): presumably a guard against a runaway loop; the body of this
// check is not visible here - confirm.
4635 if (iterationCount
>= 10) {
4639 TEST_ASSERT(iterationCount
== 6);
// Same backward walk with the en_US_POSIX iterator, starting from last().
4641 brkiterPOSIX
->setText(text
);
4642 brkiterPOSIX
->last();
4644 while ( (offset
= brkiterPOSIX
->previous()) != UBRK_DONE
) {
4646 rstatus
= brkiterPOSIX
->getRuleStatus();
4647 (void)rstatus
; // Suppress set but not used warning.
4648 if (iterationCount
>= 10) {
4652 TEST_ASSERT(iterationCount
== 6);
4657 // TestDebug - A place-holder test for debugging purposes.
4658 // For putting in fragments of other tests that can be invoked
4659 // for tracing without a lot of unwanted extra stuff happening.
4661 void RBBITest::TestDebug(void) {
// Scratch test: creates a sentence break iterator for the default locale,
// prints the result of isBoundary(8), then prints positions while stepping
// backwards with previous() until DONE. Output goes to stdout through printf.
4663 UErrorCode status
= U_ZERO_ERROR
;
// The commented-out alternatives below swap in a line or a Thai word iterator.
4667 RuleBasedBreakIterator
* bi
=
4668 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4669 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4670 (RuleBasedBreakIterator
*)BreakIterator::createSentenceInstance(Locale::getDefault(), status
);
4671 UnicodeString
s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4672 // UnicodeString s("Aaa. Bcd");
4675 UBool r
= bi
->isBoundary(8);
4676 printf("%s", r
?"true":"false");
4680 // ruleStatus = bi->getRuleStatus();
// NOTE(review): with the assignment above commented out, ruleStatus is printed
// with whatever value it was given where it is declared (not visible here).
4681 printf("%d\t%d\n", pos
, ruleStatus
);
4682 pos
= bi
->previous();
4683 } while (pos
!= BreakIterator::DONE
);
4687 void RBBITest::TestProperties() {
// Builds the set of characters matching the pattern [:GCB=Prepend:] and
// expects it to be empty; a non-empty set produces the message below, which
// asks for char.txt and this test to be updated.
4688 UErrorCode errorCode
= U_ZERO_ERROR
;
4689 UnicodeSet
prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode
);
4690 if (!prependSet
.isEmpty()) {
4692 "[:GCB=Prepend:] is not empty any more. "
4693 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4694 "change this test to the opposite condition.");
4698 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */