1 /********************************************************************
3 * Copyright (c) 1999-2016, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6 /************************************************************************
7 * Date Name Description
8 * 12/15/99 Madhu Creation.
9 * 01/12/2000 Madhu Updated for changed API and added new tests
10 ************************************************************************/
12 #include "unicode/utypes.h"
13 #if !UCONFIG_NO_BREAK_ITERATION
19 #include "unicode/brkiter.h"
20 #include "unicode/localpointer.h"
21 #include "unicode/numfmt.h"
22 #include "unicode/rbbi.h"
23 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
24 #include "unicode/regex.h"
26 #include "unicode/schriter.h"
27 #include "unicode/uchar.h"
28 #include "unicode/utf16.h"
29 #include "unicode/ucnv.h"
30 #include "unicode/uniset.h"
31 #include "unicode/uscript.h"
32 #include "unicode/ustring.h"
33 #include "unicode/utext.h"
39 #include "utypeinfo.h" // for 'typeid' to work
43 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
44 #include "unicode/filteredbrk.h"
45 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
47 #define TEST_ASSERT(x) {if (!(x)) { \
48 errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
50 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
51 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
54 //---------------------------------------------
56 //---------------------------------------------
59 // Note: Before adding new tests to this file, check whether the desired test data can
60 // simply be added to the file testdata/rbbitest.txt. In most cases it can,
61 // it's much less work than writing a new test, diagnostic output in the event of failures
62 // is good, and the test data file is shared with ICU4J, so eventually the test
63 // will run there as well, without additional effort.
65 void RBBITest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* params
)
67 if (exec
) logln("TestSuite RuleBasedBreakIterator: ");
70 #if !UCONFIG_NO_FILE_IO
71 case 0: name
= "TestBug4153072";
72 if(exec
) TestBug4153072(); break;
74 case 0: name
= "skip";
78 case 1: name
= "skip";
80 case 2: name
= "TestStatusReturn";
81 if(exec
) TestStatusReturn(); break;
83 #if !UCONFIG_NO_FILE_IO
84 case 3: name
= "TestUnicodeFiles";
85 if(exec
) TestUnicodeFiles(); break;
86 case 4: name
= "TestEmptyString";
87 if(exec
) TestEmptyString(); break;
89 case 3: case 4: name
= "skip";
93 case 5: name
= "TestGetAvailableLocales";
94 if(exec
) TestGetAvailableLocales(); break;
96 case 6: name
= "TestGetDisplayName";
97 if(exec
) TestGetDisplayName(); break;
99 #if !UCONFIG_NO_FILE_IO
100 case 7: name
= "TestEndBehaviour";
101 if(exec
) TestEndBehaviour(); break;
102 case 8: case 9: case 10: name
= "skip";
104 case 11: name
= "TestWordBreaks";
105 if(exec
) TestWordBreaks(); break;
106 case 12: name
= "TestWordBoundary";
107 if(exec
) TestWordBoundary(); break;
108 case 13: name
= "TestLineBreaks";
109 if(exec
) TestLineBreaks(); break;
110 case 14: name
= "TestSentBreaks";
111 if(exec
) TestSentBreaks(); break;
112 case 15: name
= "TestExtended";
113 if(exec
) TestExtended(); break;
115 case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name
= "skip";
119 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
121 name
= "TestMonkey"; if(exec
) TestMonkey(params
); break;
124 name
= "skip"; break;
127 #if !UCONFIG_NO_FILE_IO
128 case 17: name
= "TestBug3818";
129 if(exec
) TestBug3818(); break;
131 case 17: name
= "skip";
135 case 18: name
= "skip";
137 case 19: name
= "TestDebug";
138 if(exec
) TestDebug(); break;
139 case 20: name
= "skip";
142 #if !UCONFIG_NO_FILE_IO
143 case 21: name
= "TestBug5775";
144 if (exec
) TestBug5775(); break;
146 case 21: name
= "skip";
150 case 22: name
= "TestBug9983";
151 if (exec
) TestBug9983(); break;
152 case 23: name
= "TestDictRules";
153 if (exec
) TestDictRules(); break;
154 case 24: name
= "TestBug5532";
155 if (exec
) TestBug5532(); break;
156 default: name
= ""; break; //needed to end loop
161 //---------------------------------------------------------------------------
163 // class BITestData Holds a set of Break iterator test data and results
165 // - the string data to be broken
166 // - a vector of the expected break positions.
167 // - a vector of source line numbers for the data,
168 // (to help see where errors occurred.)
169 // - The expected break tag values.
170 // - Vectors of actual break positions and tag values.
171 // - Functions for comparing actual with expected and
174 //----------------------------------------------------------------------------
177 UnicodeString fDataToBreak
;
178 UVector fExpectedBreakPositions
;
179 UVector fExpectedTags
;
181 UVector fActualBreakPositions
; // Test Results.
184 BITestData(UErrorCode
&status
);
185 void addDataChunk(const char *data
, int32_t tag
, int32_t lineNum
, UErrorCode status
);
186 void checkResults(const char *heading
, RBBITest
*test
);
187 void err(const char *heading
, RBBITest
*test
, int32_t expectedIdx
, int32_t actualIdx
);
194 BITestData::BITestData(UErrorCode
&status
)
195 : fExpectedBreakPositions(status
), fExpectedTags(status
), fLineNum(status
), fActualBreakPositions(status
),
201 // addDataChunk. Add a section (non-breaking) piece of data to the test data.
202 // The macro form collects the line number, which is helpful
203 // when tracking down failures.
205 // A null data item is inserted at the start of each test's data
206 // to put the starting zero into the data list. The position saved for
207 // each non-null item is its ending position.
209 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status);
210 void BITestData::addDataChunk(const char *data
, int32_t tag
, int32_t lineNum
, UErrorCode status
) {
211 if (U_FAILURE(status
)) {return;}
213 fDataToBreak
.append(CharsToUnicodeString(data
));
215 fExpectedBreakPositions
.addElement(fDataToBreak
.length(), status
);
216 fExpectedTags
.addElement(tag
, status
);
217 fLineNum
.addElement(lineNum
, status
);
222 // checkResults. Compare the actual and expected break positions, report any differences.
224 void BITestData::checkResults(const char *heading
, RBBITest
*test
) {
225 int32_t expectedIndex
= 0;
226 int32_t actualIndex
= 0;
229 // If we've run through both the expected and actual results vectors, we're done.
230 // break out of the loop.
231 if (expectedIndex
>= fExpectedBreakPositions
.size() &&
232 actualIndex
>= fActualBreakPositions
.size()) {
237 if (expectedIndex
>= fExpectedBreakPositions
.size()) {
238 err(heading
, test
, expectedIndex
-1, actualIndex
);
243 if (actualIndex
>= fActualBreakPositions
.size()) {
244 err(heading
, test
, expectedIndex
, actualIndex
-1);
249 if (fActualBreakPositions
.elementAti(actualIndex
) != fExpectedBreakPositions
.elementAti(expectedIndex
)) {
250 err(heading
, test
, expectedIndex
, actualIndex
);
251 // Try to resync the positions of the indices, to avoid a rash of spurious errors.
252 if (fActualBreakPositions
.elementAti(actualIndex
) < fExpectedBreakPositions
.elementAti(expectedIndex
)) {
260 if (fActualTags
.elementAti(actualIndex
) != fExpectedTags
.elementAti(expectedIndex
)) {
261 test
->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",
262 heading
, fLineNum
.elementAt(expectedIndex
),
263 fExpectedTags
.elementAti(expectedIndex
), fActualTags
.elementAti(actualIndex
));
272 // err - An error was found. Report it, along with information about where the
273 // incorrectly broken test data appeared in the source file.
275 void BITestData::err(const char *heading
, RBBITest
*test
, int32_t expectedIdx
, int32_t actualIdx
)
277 int32_t expected
= fExpectedBreakPositions
.elementAti(expectedIdx
);
278 int32_t actual
= fActualBreakPositions
.elementAti(actualIdx
);
280 int32_t line
= fLineNum
.elementAti(expectedIdx
);
281 if (expectedIdx
> 0) {
282 // The line numbers are off by one because a premature break occurs somewhere
283 // within the previous item, rather than at the start of the current (expected) item.
284 // We want to report the offset of the unexpected break from the start of
285 // this previous item.
286 o
= actual
- fExpectedBreakPositions
.elementAti(expectedIdx
-1);
288 if (actual
< expected
) {
289 test
->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading
, o
, line
, actual
, expected
);
291 test
->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading
, line
, actual
, expected
);
296 void BITestData::clearResults() {
297 fActualBreakPositions
.removeAllElements();
298 fActualTags
.removeAllElements();
302 //--------------------------------------------------------------------------------------
304 // RBBITest constructor and destructor
306 //--------------------------------------------------------------------------------------
308 RBBITest::RBBITest() {
312 RBBITest::~RBBITest() {
315 //-----------------------------------------------------------------------------------
317 // Test for status {tag} return value from break rules.
318 // TODO: a more thorough test.
320 //-----------------------------------------------------------------------------------
321 void RBBITest::TestStatusReturn() {
322 UnicodeString
rulesString1("$Letters = [:L:];\n"
323 "$Numbers = [:N:];\n"
326 "Help\\ /me\\!{4};\n"
327 "[^$Letters $Numbers];\n"
328 "!.*;\n", -1, US_INV
);
329 UnicodeString testString1
= "abc123..abc Help me Help me!";
330 // 01234567890123456789012345678
331 int32_t bounds1
[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
332 int32_t brkStatus
[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
334 UErrorCode status
=U_ZERO_ERROR
;
335 UParseError parseError
;
337 LocalPointer
<BreakIterator
> bi(new RuleBasedBreakIterator(rulesString1
, parseError
, status
));
338 if(U_FAILURE(status
)) {
339 dataerrln("%s:%d error in break iterator construction - %s", __FILE__
, __LINE__
, u_errorName(status
));
344 bi
->setText(testString1
);
345 for (pos
=bi
->first(); pos
!= BreakIterator::DONE
; pos
=bi
->next()) {
346 if (pos
!= bounds1
[i
]) {
347 errln("%s:%d expected break at %d, got %d\n", __FILE__
, __LINE__
, bounds1
[i
], pos
);
351 int tag
= bi
->getRuleStatus();
352 if (tag
!= brkStatus
[i
]) {
353 errln("%s:%d break at %d, expected tag %d, got tag %d\n", __FILE__
, __LINE__
, pos
, brkStatus
[i
], tag
);
361 static void printStringBreaks(UText
*tstr
, int expected
[], int expectedCount
) {
362 UErrorCode status
= U_ZERO_ERROR
;
364 printf("code alpha extend alphanum type word sent line name\n");
365 int nextExpectedIndex
= 0;
366 utext_setNativeIndex(tstr
, 0);
367 for (int j
= 0; j
< utext_nativeLength(tstr
); j
=utext_getNativeIndex(tstr
)) {
368 if (nextExpectedIndex
< expectedCount
&& j
>= expected
[nextExpectedIndex
] ) {
369 printf("------------------------------------------------ %d\n", j
);
373 UChar32 c
= utext_next32(tstr
);
374 u_charName(c
, U_UNICODE_CHAR_NAME
, name
, 100, &status
);
375 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c
,
377 u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
),
379 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY
,
381 U_SHORT_PROPERTY_NAME
),
382 u_getPropertyValueName(UCHAR_WORD_BREAK
,
383 u_getIntPropertyValue(c
,
385 U_SHORT_PROPERTY_NAME
),
386 u_getPropertyValueName(UCHAR_SENTENCE_BREAK
,
387 u_getIntPropertyValue(c
,
388 UCHAR_SENTENCE_BREAK
),
389 U_SHORT_PROPERTY_NAME
),
390 u_getPropertyValueName(UCHAR_LINE_BREAK
,
391 u_getIntPropertyValue(c
,
393 U_SHORT_PROPERTY_NAME
),
399 static void printStringBreaks(const UnicodeString
&ustr
, int expected
[], int expectedCount
) {
400 UErrorCode status
= U_ZERO_ERROR
;
402 tstr
= utext_openConstUnicodeString(NULL
, &ustr
, &status
);
403 if (U_FAILURE(status
)) {
404 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status
));
407 printStringBreaks(tstr
, expected
, expectedCount
);
412 void RBBITest::TestBug3818() {
413 UErrorCode status
= U_ZERO_ERROR
;
415 // Four Thai words...
416 static const UChar thaiWordData
[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
417 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
418 UnicodeString
thaiStr(thaiWordData
);
420 BreakIterator
* bi
= BreakIterator::createWordInstance(Locale("th"), status
);
421 if (U_FAILURE(status
) || bi
== NULL
) {
422 errcheckln(status
, "Fail at file %s, line %d, status = %s", __FILE__
, __LINE__
, u_errorName(status
));
425 bi
->setText(thaiStr
);
427 int32_t startOfSecondWord
= bi
->following(1);
428 if (startOfSecondWord
!= 4) {
429 errln("Fail at file %s, line %d expected start of word at 4, got %d",
430 __FILE__
, __LINE__
, startOfSecondWord
);
432 startOfSecondWord
= bi
->following(0);
433 if (startOfSecondWord
!= 4) {
434 errln("Fail at file %s, line %d expected start of word at 4, got %d",
435 __FILE__
, __LINE__
, startOfSecondWord
);
440 //----------------------------------------------------------------------------
442 // generalIteratorTest Given a break iterator and a set of test data,
443 // Run the tests and report the results.
445 //----------------------------------------------------------------------------
446 void RBBITest::generalIteratorTest(RuleBasedBreakIterator
& bi
, BITestData
&td
)
449 bi
.setText(td
.fDataToBreak
);
451 testFirstAndNext(bi
, td
);
453 testLastAndPrevious(bi
, td
);
455 testFollowing(bi
, td
);
456 testPreceding(bi
, td
);
457 testIsBoundary(bi
, td
);
458 doMultipleSelectionTest(bi
, td
);
463 // testFirstAndNext. Run the iterator forwards in the obvious first(), next()
466 void RBBITest::testFirstAndNext(RuleBasedBreakIterator
& bi
, BITestData
&td
)
468 UErrorCode status
= U_ZERO_ERROR
;
473 logln("Test first and next");
474 bi
.setText(td
.fDataToBreak
);
477 for (p
=bi
.first(); p
!=RuleBasedBreakIterator::DONE
; p
=bi
.next()) {
478 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
479 tag
= bi
.getRuleStatus();
480 td
.fActualTags
.addElement(tag
, status
);
482 // If the iterator is not making forward progress, stop.
483 // No need to raise an error here, it'll be detected in the normal check of results.
488 td
.checkResults("testFirstAndNext", this);
493 // testLastAndPrevious. Run the iterator backwards, starting with last().
495 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator
& bi
, BITestData
&td
)
497 UErrorCode status
= U_ZERO_ERROR
;
499 int32_t lastP
= 0x7ffffffe;
502 logln("Test last and previous");
503 bi
.setText(td
.fDataToBreak
);
506 for (p
=bi
.last(); p
!=RuleBasedBreakIterator::DONE
; p
=bi
.previous()) {
507 // Save break position. Insert it at start of vector of results, shoving
508 // already-saved results further towards the end.
509 td
.fActualBreakPositions
.insertElementAt(p
, 0, status
);
510 // bi.previous(); // TODO: Why does this fix things up????
512 tag
= bi
.getRuleStatus();
513 td
.fActualTags
.insertElementAt(tag
, 0, status
);
515 // If the iterator is not making progress, stop.
516 // No need to raise an error here, it'll be detected in the normal check of results.
521 td
.checkResults("testLastAndPrevious", this);
525 void RBBITest::testFollowing(RuleBasedBreakIterator
& bi
, BITestData
&td
)
527 UErrorCode status
= U_ZERO_ERROR
;
530 int32_t lastP
= -2; // A value that will never be returned as a break position.
531 // cannot be -1; that is returned for DONE.
534 logln("testFollowing():");
535 bi
.setText(td
.fDataToBreak
);
538 // Save the starting point, since we won't get that out of following.
540 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
541 tag
= bi
.getRuleStatus();
542 td
.fActualTags
.addElement(tag
, status
);
544 for (i
= 0; i
<= td
.fDataToBreak
.length()+1; i
++) {
547 if (p
== RuleBasedBreakIterator::DONE
) {
550 // We've reached a new break position. Save it.
551 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
552 tag
= bi
.getRuleStatus();
553 td
.fActualTags
.addElement(tag
, status
);
557 // The loop normally exits by means of the break in the middle.
558 // Make sure that the index was at the correct position for the break iterator to have
560 if (i
!= td
.fDataToBreak
.length()) {
561 errln("testFollowing(): iterator returned DONE prematurely.");
564 // Full check of all results.
565 td
.checkResults("testFollowing", this);
570 void RBBITest::testPreceding(RuleBasedBreakIterator
& bi
, BITestData
&td
) {
571 UErrorCode status
= U_ZERO_ERROR
;
574 int32_t lastP
= 0x7ffffffe;
577 logln("testPreceding():");
578 bi
.setText(td
.fDataToBreak
);
582 td
.fActualBreakPositions
.addElement(p
, status
);
583 tag
= bi
.getRuleStatus();
584 td
.fActualTags
.addElement(tag
, status
);
586 for (i
= td
.fDataToBreak
.length(); i
>=-1; i
--) {
589 if (p
== RuleBasedBreakIterator::DONE
) {
592 // We've reached a new break position. Save it.
593 td
.fActualBreakPositions
.insertElementAt(p
, 0, status
);
595 tag
= bi
.getRuleStatus();
596 td
.fActualTags
.insertElementAt(tag
, 0, status
);
599 // The loop normally exits by means of the break in the middle.
600 // Make sure that the index was at the correct position for the break iterator to have
603 errln("testPreceding(): iterator returned DONE prematurely.");
606 // Full check of all results.
607 td
.checkResults("testPreceding", this);
612 void RBBITest::testIsBoundary(RuleBasedBreakIterator
& bi
, BITestData
&td
) {
613 UErrorCode status
= U_ZERO_ERROR
;
617 logln("testIsBoundary():");
618 bi
.setText(td
.fDataToBreak
);
621 for (i
= 0; i
<= td
.fDataToBreak
.length(); i
++) {
622 if (bi
.isBoundary(i
)) {
623 td
.fActualBreakPositions
.addElement(i
, status
); // Save result.
624 tag
= bi
.getRuleStatus();
625 td
.fActualTags
.addElement(tag
, status
);
628 td
.checkResults("testIsBoundary: ", this);
633 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator
& iterator
, BITestData
&td
)
635 iterator
.setText(td
.fDataToBreak
);
637 RuleBasedBreakIterator
* testIterator
=(RuleBasedBreakIterator
*)iterator
.clone();
638 int32_t offset
= iterator
.first();
642 logln("doMultipleSelectionTest text of length: %d", td
.fDataToBreak
.length());
644 if (*testIterator
!= iterator
)
645 errln("clone() or operator!= failed: two clones compared unequal");
648 testOffset
= testIterator
->first();
649 testOffset
= testIterator
->next(count
);
650 if (offset
!= testOffset
)
651 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count
+ ", next(n) returned " + testOffset
+ " and next() had " + offset
);
653 if (offset
!= RuleBasedBreakIterator::DONE
) {
655 offset
= iterator
.next();
657 if (offset
!= RuleBasedBreakIterator::DONE
&& *testIterator
== iterator
) {
658 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count
, offset
);
659 if (count
> 10000 || offset
== -1) {
660 errln("operator== failed too many times. Stopping test.");
662 errln("Does (RuleBasedBreakIterator::DONE == -1)?");
668 } while (offset
!= RuleBasedBreakIterator::DONE
);
670 // now do it backwards...
671 offset
= iterator
.last();
675 testOffset
= testIterator
->last();
676 testOffset
= testIterator
->next(count
); // next() with a negative arg is same as previous
677 if (offset
!= testOffset
)
678 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count
+ ", next(n) returned " + testOffset
+ " and next() had " + offset
);
680 if (offset
!= RuleBasedBreakIterator::DONE
) {
682 offset
= iterator
.previous();
684 } while (offset
!= RuleBasedBreakIterator::DONE
);
690 //---------------------------------------------
694 //---------------------------------------------
695 void RBBITest::TestEmptyString()
697 UnicodeString text
= "";
698 UErrorCode status
= U_ZERO_ERROR
;
700 BITestData
x(status
);
701 ADD_DATACHUNK(x
, "", 0, status
); // Break at start of data
702 RuleBasedBreakIterator
* bi
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale::getDefault(), status
);
703 if (U_FAILURE(status
))
705 errcheckln(status
, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status
));
708 generalIteratorTest(*bi
, x
);
712 void RBBITest::TestGetAvailableLocales()
714 int32_t locCount
= 0;
715 const Locale
* locList
= BreakIterator::getAvailableLocales(locCount
);
718 dataerrln("getAvailableLocales() returned an empty list!");
719 // Just make sure that it's returning good memory.
721 for (i
= 0; i
< locCount
; ++i
) {
722 logln(locList
[i
].getName());
726 //Testing the BreakIterator::getDisplayName() function
727 void RBBITest::TestGetDisplayName()
729 UnicodeString result
;
731 BreakIterator::getDisplayName(Locale::getUS(), result
);
732 if (Locale::getDefault() == Locale::getUS() && result
!= "English (United States)")
733 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
736 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result
);
737 if (result
!= "French (France)")
738 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
745 void RBBITest::TestEndBehaviour()
747 UErrorCode status
= U_ZERO_ERROR
;
748 UnicodeString
testString("boo.");
749 BreakIterator
*wb
= BreakIterator::createWordInstance(Locale::getDefault(), status
);
750 if (U_FAILURE(status
))
752 errcheckln(status
, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status
));
755 wb
->setText(testString
);
757 if (wb
->first() != 0)
758 errln("Didn't get break at beginning of string.");
760 errln("Didn't get break before period in \"boo.\"");
761 if (wb
->current() != 4 && wb
->next() != 4)
762 errln("Didn't get break at end of string.");
768 void RBBITest::TestBug4153072() {
769 UErrorCode status
= U_ZERO_ERROR
;
770 BreakIterator
*iter
= BreakIterator::createWordInstance(Locale::getDefault(), status
);
771 if (U_FAILURE(status
))
773 errcheckln(status
, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status
));
776 UnicodeString
str("...Hello, World!...");
778 int32_t end
= str
.length() - 3;
781 StringCharacterIterator
* textIterator
= new StringCharacterIterator(str
, begin
, end
, begin
);
782 iter
->adoptText(textIterator
);
784 // Note: with the switch to UText, there is no way to restrict the
785 // iteration range to begin at an index other than zero.
786 // String character iterators created with a non-zero bound are
787 // treated by RBBI as being empty.
788 for (index
= -1; index
< begin
+ 1; ++index
) {
789 onBoundary
= iter
->isBoundary(index
);
790 if (index
== 0? !onBoundary
: onBoundary
) {
791 errln((UnicodeString
)"Didn't handle isBoundary correctly with offset = " + index
+
792 " and begin index = " + begin
);
800 // Test for problem reported by Ashok Matoria on 9 July 2007
801 // One.<kSoftHyphen><kSpace>Two.
803 // Sentence break at start (0) and then on calling next() it breaks at
804 // 'T' of "Two". Now, at this point if I do next() and
805 // then previous(), it breaks at <kSoftHyphen> instead of 'T' of "Two".
807 void RBBITest::TestBug5775() {
808 UErrorCode status
= U_ZERO_ERROR
;
809 BreakIterator
*bi
= BreakIterator::createSentenceInstance(Locale::getEnglish(), status
);
810 TEST_ASSERT_SUCCESS(status
);
811 if (U_FAILURE(status
)) {
814 // Check for status first for better handling of no data errors.
815 TEST_ASSERT(bi
!= NULL
);
820 UnicodeString
s("One.\\u00ad Two.", -1, US_INV
);
824 int pos
= bi
->next();
825 TEST_ASSERT(pos
== 6);
827 TEST_ASSERT(pos
== 10);
828 pos
= bi
->previous();
829 TEST_ASSERT(pos
== 6);
835 //------------------------------------------------------------------------------
837 // RBBITest::TestExtended Run RBBI Tests from an external test data file
839 //------------------------------------------------------------------------------
842 BreakIterator
*bi
; // Break iterator is set while parsing test source.
843 // Changed out whenever test data changes break type.
845 UnicodeString dataToBreak
; // Data that is built up while parsing the test.
846 UVector32
*expectedBreaks
; // Expected break positions, matches dataToBreak UnicodeString.
847 UVector32
*srcLine
; // Positions in source file, indexed same as dataToBreak.
850 UText
*textToBreak
; // UText, could be UTF8 or UTF16.
851 UVector32
*textMap
; // Map from UTF-16 dataToBreak offsets to UText offsets.
852 CharString utf8String
; // UTF-8 form of text to break.
854 TestParams(UErrorCode
&status
) : dataToBreak() {
856 expectedBreaks
= new UVector32(status
);
857 srcLine
= new UVector32(status
);
858 srcCol
= new UVector32(status
);
860 textMap
= new UVector32(status
);
865 delete expectedBreaks
;
868 utext_close(textToBreak
);
872 int32_t getSrcLine(int32_t bp
);
873 int32_t getExpectedBreak(int32_t bp
);
874 int32_t getSrcCol(int32_t bp
);
876 void setUTF16(UErrorCode
&status
);
877 void setUTF8(UErrorCode
&status
);
880 // Append a UnicodeString to a CharString with UTF-8 encoding.
881 // Substitute any invalid chars.
882 // Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
883 static void CharStringAppend(CharString
&dest
, const UnicodeString
&src
, UErrorCode
&status
) {
884 if (U_FAILURE(status
)) {
888 u_strToUTF8WithSub(NULL
, 0, &utf8Length
, // Output Buffer, NULL for preflight.
889 src
.getBuffer(), src
.length(), // UTF-16 data
890 0xfffd, NULL
, // Substitution char, number of subs.
892 if (U_FAILURE(status
) && status
!= U_BUFFER_OVERFLOW_ERROR
) {
895 status
= U_ZERO_ERROR
;
897 char *buffer
= dest
.getAppendBuffer(utf8Length
, utf8Length
, capacity
, status
);
898 u_strToUTF8WithSub(buffer
, utf8Length
, NULL
,
899 src
.getBuffer(), src
.length(),
900 0xfffd, NULL
, &status
);
901 dest
.append(buffer
, utf8Length
, status
);
905 void TestParams::setUTF16(UErrorCode
&status
) {
906 textToBreak
= utext_openUnicodeString(textToBreak
, &dataToBreak
, &status
);
907 textMap
->removeAllElements();
908 for (int32_t i
=0; i
<dataToBreak
.length(); i
++) {
909 if (i
== dataToBreak
.getChar32Start(i
)) {
910 textMap
->addElement(i
, status
);
912 textMap
->addElement(-1, status
);
915 textMap
->addElement(dataToBreak
.length(), status
);
916 U_ASSERT(dataToBreak
.length() + 1 == textMap
->size());
920 void TestParams::setUTF8(UErrorCode
&status
) {
921 if (U_FAILURE(status
)) {
925 CharStringAppend(utf8String
, dataToBreak
, status
);
926 textToBreak
= utext_openUTF8(textToBreak
, utf8String
.data(), utf8String
.length(), &status
);
927 if (U_FAILURE(status
)) {
931 textMap
->removeAllElements();
932 int32_t utf16Index
= 0;
934 textMap
->addElement(utf16Index
, status
);
935 UChar32 c32
= utext_current32(textToBreak
);
939 utf16Index
+= U16_LENGTH(c32
);
940 utext_next32(textToBreak
);
941 while (textMap
->size() < utext_getNativeIndex(textToBreak
)) {
942 textMap
->addElement(-1, status
);
945 U_ASSERT(utext_nativeLength(textToBreak
) + 1 == textMap
->size());
949 int32_t TestParams::getSrcLine(int bp
) {
950 if (bp
>= textMap
->size()) {
951 bp
= textMap
->size() - 1;
954 for(; bp
>= 0 ; --bp
) {
955 // Move to a character boundary if we are not on one already.
956 i
= textMap
->elementAti(bp
);
961 return srcLine
->elementAti(i
);
965 int32_t TestParams::getExpectedBreak(int bp
) {
966 if (bp
>= textMap
->size()) {
969 int32_t i
= textMap
->elementAti(bp
);
972 retVal
= expectedBreaks
->elementAti(i
);
978 int32_t TestParams::getSrcCol(int bp
) {
979 if (bp
>= textMap
->size()) {
980 bp
= textMap
->size() - 1;
983 for(; bp
>= 0; --bp
) {
984 // Move bp to a character boundary if we are not on one already.
985 i
= textMap
->elementAti(bp
);
990 return srcCol
->elementAti(i
);
994 void RBBITest::executeTest(TestParams
*t
, UErrorCode
&status
) {
999 TEST_ASSERT_SUCCESS(status
);
1000 if (U_FAILURE(status
)) {
1004 if (t
->bi
== NULL
) {
1008 t
->bi
->setText(t
->textToBreak
, status
);
1010 // Run the iterator forward
1013 for (bp
= t
->bi
->first(); bp
!= BreakIterator::DONE
; bp
= t
->bi
->next()) {
1015 // Fail for lack of forward progress.
1016 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
1017 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
));
1021 // Check that we didn't miss an expected break between the last one
1023 for (i
=prevBP
+1; i
<bp
; i
++) {
1024 if (t
->getExpectedBreak(i
) != 0) {
1025 int expected
[] = {0, i
};
1026 printStringBreaks(t
->dataToBreak
, expected
, 2);
1027 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1028 i
, t
->getSrcLine(i
), t
->getSrcCol(i
));
1032 // Check that the break we did find was expected
1033 if (t
->getExpectedBreak(bp
) == 0) {
1034 int expected
[] = {0, bp
};
1035 printStringBreaks(t
->textToBreak
, expected
, 2);
1036 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1037 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
));
1039 // The break was expected.
1040 // Check that the {nnn} tag value is correct.
1041 int32_t expectedTagVal
= t
->getExpectedBreak(bp
);
1042 if (expectedTagVal
== -1) {
1045 int32_t line
= t
->getSrcLine(bp
);
1046 int32_t rs
= ((RuleBasedBreakIterator
*)t
->bi
)->getRuleStatus();
1047 if (rs
!= expectedTagVal
) {
1048 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
1049 " Actual, Expected status = %4d, %4d",
1050 bp
, line
, t
->getSrcCol(bp
), rs
, expectedTagVal
);
1057 // Verify that there were no missed expected breaks after the last one found
1058 for (i
=prevBP
+1; i
<utext_nativeLength(t
->textToBreak
); i
++) {
1059 if (t
->getExpectedBreak(i
) != 0) {
1060 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1061 i
, t
->getSrcLine(i
), t
->getSrcCol(i
));
1066 // Run the iterator backwards, verify that the same breaks are found.
1068 prevBP
= utext_nativeLength(t
->textToBreak
)+2; // start with a phony value for the last break pos seen.
1069 for (bp
= t
->bi
->last(); bp
!= BreakIterator::DONE
; bp
= t
->bi
->previous()) {
1071 // Fail for lack of progress.
1072 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
1073 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
));
1077 // Check that we didn't miss an expected break between the last one
1078 // and this one. (UVector returns zeros for index out of bounds.)
1079 for (i
=prevBP
-1; i
>bp
; i
--) {
1080 if (t
->getExpectedBreak(i
) != 0) {
1081 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1082 i
, t
->getSrcLine(i
), t
->getSrcCol(i
));
1086 // Check that the break we did find was expected
1087 if (t
->getExpectedBreak(bp
) == 0) {
1088 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1089 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
));
1091 // The break was expected.
1092 // Check that the {nnn} tag value is correct.
1093 int32_t expectedTagVal
= t
->getExpectedBreak(bp
);
1094 if (expectedTagVal
== -1) {
1097 int line
= t
->getSrcLine(bp
);
1098 int32_t rs
= t
->bi
->getRuleStatus();
1099 if (rs
!= expectedTagVal
) {
1100 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
1101 " Actual, Expected status = %4d, %4d",
1102 bp
, line
, t
->getSrcCol(bp
), rs
, expectedTagVal
);
1109 // Verify that there were no missed breaks prior to the last one found
1110 for (i
=prevBP
-1; i
>=0; i
--) {
1111 if (t
->getExpectedBreak(i
) != 0) {
1112 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1113 i
, t
->getSrcLine(i
), t
->getSrcCol(i
));
1117 // Check isBoundary()
1118 for (i
=0; i
< utext_nativeLength(t
->textToBreak
); i
++) {
1119 UBool boundaryExpected
= (t
->getExpectedBreak(i
) != 0);
1120 UBool boundaryFound
= t
->bi
->isBoundary(i
);
1121 if (boundaryExpected
!= boundaryFound
) {
1122 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
1123 " Expected, Actual= %s, %s",
1124 i
, t
->getSrcLine(i
), t
->getSrcCol(i
),
1125 boundaryExpected
? "true":"false", boundaryFound
? "true" : "false");
1129 // Check following()
1130 for (i
=0; i
< utext_nativeLength(t
->textToBreak
); i
++) {
1131 int32_t actualBreak
= t
->bi
->following(i
);
1132 int32_t expectedBreak
= BreakIterator::DONE
;
1133 for (int32_t j
=i
+1; j
<= utext_nativeLength(t
->textToBreak
); j
++) {
1134 if (t
->getExpectedBreak(j
) != 0) {
1139 if (expectedBreak
!= actualBreak
) {
1140 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
1141 " Expected, Actual= %d, %d",
1142 i
, t
->getSrcLine(i
), t
->getSrcCol(i
), expectedBreak
, actualBreak
);
1146 // Check preceding()
1147 for (i
=utext_nativeLength(t
->textToBreak
); i
>=0; i
--) {
1148 int32_t actualBreak
= t
->bi
->preceding(i
);
1149 int32_t expectedBreak
= BreakIterator::DONE
;
1151 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
1152 // preceding(trailing byte) will return the index of some preceding code point,
1153 // not the lead byte of the current code point, even though that has a smaller index.
1154 // Therefore, start looking at the expected break data not at i-1, but at
1155 // the start of code point index - 1.
1156 utext_setNativeIndex(t
->textToBreak
, i
);
1157 int32_t j
= utext_getNativeIndex(t
->textToBreak
) - 1;
1158 for (; j
>= 0; j
--) {
1159 if (t
->getExpectedBreak(j
) != 0) {
1164 if (expectedBreak
!= actualBreak
) {
1165 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
1166 " Expected, Actual= %d, %d",
1167 i
, t
->getSrcLine(i
), t
->getSrcCol(i
), expectedBreak
, actualBreak
);
// TestExtended: data-driven break iterator test.
// Reads rbbitst.txt from the source test data directory as UTF-8 and scans it one
// character at a time with a small state machine (PARSE_TAG, PARSE_DATA, PARSE_NUM,
// PARSE_COMMENT). Tags such as <word>, <char>, <line>, <sent>, <title> and
// <locale ...> select the break iterator to create. Text between <data> and </data>
// is accumulated in tp.dataToBreak, with expected break positions recorded from
// bullet, <> and <nnn> markers, and executeTest() is run on it when </data> is seen.
// NOTE(review): a number of lines of this function are not visible in this listing
// (for example the declarations of len, locale, column and parseState, and most
// closing braces) - confirm details against the complete source.
1173 void RBBITest::TestExtended() {
1174 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1175 UErrorCode status
= U_ZERO_ERROR
;
1178 UnicodeString rules
;
1179 TestParams
tp(status
);
// Matcher for <locale name> tags; capture group 1 holds the locale name.
1181 RegexMatcher
localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status
);
1182 if (U_FAILURE(status
)) {
1183 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__
, __LINE__
, u_errorName(status
));
1188 // Open and read the test data file.
1190 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
1191 char testFileName
[1000];
// NOTE(review): this check compares only the directory length with the buffer size;
// it does not reserve room for the file name appended by strcat below.
1192 if (testDataDirectory
== NULL
|| strlen(testDataDirectory
) >= sizeof(testFileName
)) {
1193 errln("Can't open test data. Path too long.");
1196 strcpy(testFileName
, testDataDirectory
);
1197 strcat(testFileName
, "rbbitst.txt");
1200 UChar
*testFile
= ReadAndConvertFile(testFileName
, len
, "UTF-8", status
);
1201 if (U_FAILURE(status
)) {
1202 return; /* something went wrong, error already output */
1206 bool skipTest
= false; // Skip this test?
1209 // Put the test data into a UnicodeString
1211 UnicodeString
testString(FALSE
, testFile
, len
);
1219 parseState
= PARSE_TAG
;
1221 EParseState savedState
= PARSE_TAG
;
1223 static const UChar CH_LF
= 0x0a;
1224 static const UChar CH_CR
= 0x0d;
1225 static const UChar CH_HASH
= 0x23;
1226 /*static const UChar CH_PERIOD = 0x2e;*/
1227 static const UChar CH_LT
= 0x3c;
1228 static const UChar CH_GT
= 0x3e;
1229 static const UChar CH_BACKSLASH
= 0x5c;
1230 static const UChar CH_BULLET
= 0x2022;
1232 int32_t lineNum
= 1;
1233 int32_t colStart
= 0;
1235 int32_t charIdx
= 0;
1237 int32_t tagValue
= 0; // The numeric value of a <nnn> tag.
// Main scan loop. The for statement has no increment clause, so charIdx is advanced
// inside the loop body.
1239 for (charIdx
= 0; charIdx
< len
; ) {
1240 status
= U_ZERO_ERROR
;
1241 UChar c
= testString
.charAt(charIdx
);
// NOTE(review): c was read from charIdx just above and the same index is tested for
// LF here; an intervening increment of charIdx is presumably among the lines not
// shown in this listing - confirm.
1243 if (c
== CH_CR
&& charIdx
<len
&& testString
.charAt(charIdx
) == CH_LF
) {
1244 // treat CRLF as a unit
1248 if (c
== CH_LF
|| c
== CH_CR
) {
1252 column
= charIdx
- colStart
+ 1;
1254 switch (parseState
) {
1256 if (c
== 0x0a || c
== 0x0d) {
1257 parseState
= savedState
;
1264 parseState
= PARSE_COMMENT
;
1265 savedState
= PARSE_TAG
;
1268 if (u_isUWhiteSpace(c
)) {
// Tag names are compared against the text beginning at charIdx-1.
1271 if (testString
.compare(charIdx
-1, 6, "<word>") == 0) {
1273 tp
.bi
= BreakIterator::createWordInstance(locale
, status
);
1278 if (testString
.compare(charIdx
-1, 6, "<char>") == 0) {
1280 tp
.bi
= BreakIterator::createCharacterInstance(locale
, status
);
1285 if (testString
.compare(charIdx
-1, 6, "<line>") == 0) {
1287 tp
.bi
= BreakIterator::createLineInstance(locale
, status
);
1292 if (testString
.compare(charIdx
-1, 6, "<sent>") == 0) {
1294 tp
.bi
= BreakIterator::createSentenceInstance(locale
, status
);
1299 if (testString
.compare(charIdx
-1, 7, "<title>") == 0) {
1301 tp
.bi
= BreakIterator::createTitleInstance(locale
, status
);
1306 // <locale loc_name>
1307 localeMatcher
.reset(testString
);
1308 if (localeMatcher
.lookingAt(charIdx
-1, status
)) {
1309 UnicodeString localeName
= localeMatcher
.group(1, status
);
1310 char localeName8
[100];
1311 localeName
.extract(0, localeName
.length(), localeName8
, sizeof(localeName8
), 0);
1312 locale
= Locale::createFromName(localeName8
);
// Advance past the remainder of the matched tag (group 0 is the whole match).
1313 charIdx
+= localeMatcher
.group(0, status
).length() - 1;
1314 TEST_ASSERT_SUCCESS(status
);
1317 if (testString
.compare(charIdx
-1, 6, "<data>") == 0) {
1318 parseState
= PARSE_DATA
;
1320 tp
.dataToBreak
= "";
1321 tp
.expectedBreaks
->removeAllElements();
1322 tp
.srcCol
->removeAllElements();
1323 tp
.srcLine
->removeAllElements();
1327 errln("line %d: Tag expected in test file.", lineNum
);
1328 parseState
= PARSE_COMMENT
;
1329 savedState
= PARSE_DATA
;
1330 goto end_test
; // Stop the test.
// Bullet: record an expected break (value -1) at the current end of the accumulated
// data, together with the source line and column it came from.
1335 if (c
== CH_BULLET
) {
1336 int32_t breakIdx
= tp
.dataToBreak
.length();
1337 tp
.expectedBreaks
->setSize(breakIdx
+1);
1338 tp
.expectedBreaks
->setElementAt(-1, breakIdx
);
1339 tp
.srcLine
->setSize(breakIdx
+1);
1340 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1341 tp
.srcCol
->setSize(breakIdx
+1);
1342 tp
.srcCol
->setElementAt(column
, breakIdx
);
1346 if (testString
.compare(charIdx
-1, 7, "</data>") == 0) {
1347 // Add final entry to mappings from break location to source file position.
1348 // Need one extra because last break position returned is after the
1349 // last char in the data, not at the last char.
1350 tp
.srcLine
->addElement(lineNum
, status
);
1351 tp
.srcCol
->addElement(column
, status
);
1353 parseState
= PARSE_TAG
;
// Run the collected test case on UTF-16 text first.
1358 status
= U_ZERO_ERROR
;
1359 tp
.setUTF16(status
);
1360 executeTest(&tp
, status
);
1361 TEST_ASSERT_SUCCESS(status
);
1363 // Run again, this time with UTF-8 text wrapped in a UText.
1364 status
= U_ZERO_ERROR
;
1366 TEST_ASSERT_SUCCESS(status
);
1367 executeTest(&tp
, status
);
1372 if (testString
.compare(charIdx
-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1373 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1374 // Get the code point from the name and insert it into the test data.
1375 // (Damn, no API takes names in Unicode !!!
1376 // we've got to take it back to char *)
1377 int32_t nameEndIdx
= testString
.indexOf((UChar
)0x7d/*'}'*/, charIdx
);
// The match began at charIdx-1 and the opener is three characters long, so the name
// starts at charIdx+2 and runs up to nameEndIdx.
1378 int32_t nameLength
= nameEndIdx
- (charIdx
+2);
1379 char charNameBuf
[200];
1380 UChar32 theChar
= -1;
1381 if (nameEndIdx
!= -1) {
// Note: this local status shadows the function-level status.
1382 UErrorCode status
= U_ZERO_ERROR
;
1383 testString
.extract(charIdx
+2, nameLength
, charNameBuf
, sizeof(charNameBuf
));
// Force NUL termination of the extracted name before the lookup.
1384 charNameBuf
[sizeof(charNameBuf
)-1] = 0;
1385 theChar
= u_charFromName(U_UNICODE_CHAR_NAME
, charNameBuf
, &status
);
1386 if (U_FAILURE(status
)) {
1390 if (theChar
== -1) {
1391 errln("Error in named character in test file at line %d, col %d",
1394 // Named code point was recognized. Insert it
1395 // into the test data.
1396 tp
.dataToBreak
.append(theChar
);
1397 while (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1398 tp
.srcLine
->addElement(lineNum
, status
);
1399 tp
.srcCol
->addElement(column
, status
);
// Resume scanning just past the closing brace of the named character.
1402 if (nameEndIdx
> charIdx
) {
1403 charIdx
= nameEndIdx
+1;
1412 if (testString
.compare(charIdx
-1, 2, "<>") == 0) {
1414 int32_t breakIdx
= tp
.dataToBreak
.length();
1415 tp
.expectedBreaks
->setSize(breakIdx
+1);
1416 tp
.expectedBreaks
->setElementAt(-1, breakIdx
);
1417 tp
.srcLine
->setSize(breakIdx
+1);
1418 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1419 tp
.srcCol
->setSize(breakIdx
+1);
1420 tp
.srcCol
->setElementAt(column
, breakIdx
);
1426 parseState
= PARSE_NUM
;
1430 if (c
== CH_HASH
&& column
==3) { // TODO: why is column off so far?
1431 parseState
= PARSE_COMMENT
;
1432 savedState
= PARSE_DATA
;
1436 if (c
== CH_BACKSLASH
) {
1437 // Check for \ at end of line, a line continuation.
1438 // Advance over (discard) the newline
1439 UChar32 cp
= testString
.char32At(charIdx
);
1440 if (cp
== CH_CR
&& charIdx
<len
&& testString
.charAt(charIdx
+1) == CH_LF
) {
1442 // Need an extra increment of the input ptr to move over both of them
1445 if (cp
== CH_LF
|| cp
== CH_CR
) {
1452 // Let unescape handle the back slash.
1453 cp
= testString
.unescapeAt(charIdx
);
1455 // Escape sequence was recognized. Insert the char
1456 // into the test data.
1457 tp
.dataToBreak
.append(cp
);
1458 while (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1459 tp
.srcLine
->addElement(lineNum
, status
);
1460 tp
.srcCol
->addElement(column
, status
);
1466 // Not a recognized backslash escape sequence.
1467 // Take the next char as a literal.
1468 // TODO: Should this be an error?
1469 c
= testString
.charAt(charIdx
);
1470 charIdx
= testString
.moveIndex32(charIdx
, 1);
1473 // Normal, non-escaped data char.
1474 tp
.dataToBreak
.append(c
);
1476 // Save the mapping from offset in the data to line/column numbers in
1477 // the original input file. Will be used for better error messages only.
1478 // If there's an expected break before this char, the slot in the mapping
1479 // vector will already be set for this char; don't overwrite it.
1480 if (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1481 tp
.srcLine
->addElement(lineNum
, status
);
1482 tp
.srcCol
->addElement(column
, status
);
1488 // We are parsing an expected numeric tag value, like <1234>,
1489 // within a chunk of data.
1490 if (u_isUWhiteSpace(c
)) {
1495 // Finished the number. Add the info to the expected break data,
1496 // and switch parse state back to doing plain data.
1497 parseState
= PARSE_DATA
;
1498 if (tagValue
== 0) {
1501 int32_t breakIdx
= tp
.dataToBreak
.length();
1502 tp
.expectedBreaks
->setSize(breakIdx
+1);
1503 tp
.expectedBreaks
->setElementAt(tagValue
, breakIdx
);
1504 tp
.srcLine
->setSize(breakIdx
+1);
1505 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1506 tp
.srcCol
->setSize(breakIdx
+1);
1507 tp
.srcCol
->setElementAt(column
, breakIdx
);
// Accumulate one more decimal digit of the tag value.
1512 tagValue
= tagValue
*10 + u_charDigitValue(c
);
1516 errln("Syntax Error in test file at line %d, col %d",
1518 parseState
= PARSE_COMMENT
;
1519 goto end_test
; // Stop the test
// Any ICU error raised while handling this character is reported and ends the test.
1524 if (U_FAILURE(status
)) {
1525 dataerrln("ICU Error %s while parsing test file at line %d.",
1526 u_errorName(status
), lineNum
);
1527 status
= U_ZERO_ERROR
;
1528 goto end_test
; // Stop the test
1539 //-------------------------------------------------------------------------------
1541 // TestDictRules create a break iterator from source rules that includes a
1542 // dictionary range. Regression for bug #7130. Source rules
1543 // do not declare a break iterator type (word, line, sentence, etc.),
1544 // but the dictionary code, without a type, would loop.
1546 //-------------------------------------------------------------------------------
// Builds a RuleBasedBreakIterator directly from rule source that uses a $dictionary
// set, then steps through the two-character text with next() for at most 10
// iterations and asserts on the final loop count. A failure to build the iterator is
// reported through dataerrln.
// NOTE(review): some lines of this function (declarations of position and loops, the
// call that attaches the text to the iterator, and the loop exit) are not visible in
// this listing.
1547 void RBBITest::TestDictRules() {
1548 const char *rules
= "$dictionary = [a-z]; \n"
1550 "$dictionary $dictionary; \n"
1552 "$dictionary $dictionary; \n";
1553 const char *text
= "aa";
1554 UErrorCode status
= U_ZERO_ERROR
;
1555 UParseError parseError
;
1557 RuleBasedBreakIterator
bi(rules
, parseError
, status
);
1558 if (U_SUCCESS(status
)) {
1559 UnicodeString utext
= text
;
// Bounded loop: the iteration cap keeps a looping iterator from hanging the test.
1563 for (loops
= 0; loops
<10; loops
++) {
1564 position
= bi
.next();
1565 if (position
== RuleBasedBreakIterator::DONE
) {
// The test expects loops to be exactly 1 once the loop has been left.
1569 TEST_ASSERT(loops
== 1);
1571 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status
));
1577 //-------------------------------------------------------------------------------
1579 // ReadAndConvertFile Read a text data file, convert it to UChars, and
1580 // return the data in one big UChar * buffer, which the caller must delete.
1583 // fileName: the name of the file, with no directory part. The test data directory
1585 // ulen an out parameter, receives the actual length (in UChars) of the file data.
1586 // encoding The file encoding. If the file contains a BOM, that will override the encoding
1587 // specified here. The BOM, if it exists, will be stripped from the returned data.
1588 // Pass NULL for the system default encoding.
1591 // The file data, converted to UChar.
1592 // The caller must delete this when done with
1593 // delete [] theBuffer;
1595 // TODO: This is a clone of RegexTest::ReadAndConvertFile.
1596 // Move this function to some common place.
1598 //--------------------------------------------------------------------------------
// Opens fileName in binary mode, determines its size with fseek/ftell, reads it into
// a temporary char buffer, lets a detected Unicode signature (BOM) override the
// requested encoding, and converts the bytes to UChars with ucnv_toUChars.
// ulen receives the preflighted UChar count; the result buffer is allocated with
// room for ulen+1 UChars.
// NOTE(review): several lines are not visible in this listing (for example the
// declarations of f, fileSize and amt_read, the cleanUpAndReturn label and the
// return statement) - confirm details against the complete source.
1599 UChar
*RBBITest::ReadAndConvertFile(const char *fileName
, int &ulen
, const char *encoding
, UErrorCode
&status
) {
1600 UChar
*retPtr
= NULL
;
1601 char *fileBuf
= NULL
;
1602 UConverter
* conv
= NULL
;
1606 if (U_FAILURE(status
)) {
1613 f
= fopen(fileName
, "rb");
1615 dataerrln("Error opening test data file %s\n", fileName
);
1616 status
= U_FILE_ACCESS_ERROR
;
1625 fseek( f
, 0, SEEK_END
);
1626 fileSize
= ftell(f
);
// NOTE(review): the buffer is allocated before fileSize is validated. ftell can
// return -1 on failure, and the fileSize <= 0 check only happens after the read.
1627 fileBuf
= new char[fileSize
];
1628 fseek(f
, 0, SEEK_SET
);
1629 amt_read
= fread(fileBuf
, 1, fileSize
, f
);
1630 if (amt_read
!= fileSize
|| fileSize
<= 0) {
1631 errln("Error reading test data file.");
1632 goto cleanUpAndReturn
;
1636 // Look for a Unicode Signature (BOM) on the data just read
1638 int32_t signatureLength
;
1639 const char * fileBufC
;
1640 const char* bomEncoding
;
1643 bomEncoding
= ucnv_detectUnicodeSignature(
1644 fileBuf
, fileSize
, &signatureLength
, &status
);
// A signature was found: skip its bytes and let the detected encoding replace the
// one passed in by the caller.
1645 if(bomEncoding
!=NULL
){
1646 fileBufC
+= signatureLength
;
1647 fileSize
-= signatureLength
;
1648 encoding
= bomEncoding
;
1652 // Open a converter to take the rule file to UTF-16
1654 conv
= ucnv_open(encoding
, &status
);
1655 if (U_FAILURE(status
)) {
1656 goto cleanUpAndReturn
;
1660 // Convert the rules to UChar.
1661 // Preflight first to determine required buffer size.
1663 ulen
= ucnv_toUChars(conv
,
1669 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
1670 // Buffer Overflow is expected from the preflight operation.
1671 status
= U_ZERO_ERROR
;
// One UChar more than the preflighted length - presumably room for a terminating NUL.
1673 retPtr
= new UChar
[ulen
+1];
1686 if (U_FAILURE(status
)) {
1687 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
1697 //--------------------------------------------------------------------------------------------
1699 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
1701 //-------------------------------------------------------------------------------------------
// Creates English character, word, sentence and line break iterators in turn and,
// when creation succeeds, runs the matching Unicode Consortium test data file
// through runUnicodeTestData().
// NOTE(review): each factory result is downcast to RuleBasedBreakIterator with a
// C-style cast, which assumes the factory really returns that type - confirm.
// NOTE(review): the statements that release bi between sections are not visible in
// this listing.
1702 void RBBITest::TestUnicodeFiles() {
1703 RuleBasedBreakIterator
*bi
;
1704 UErrorCode status
= U_ZERO_ERROR
;
// Grapheme cluster (character) boundaries.
1706 bi
= (RuleBasedBreakIterator
*)BreakIterator::createCharacterInstance(Locale::getEnglish(), status
);
1707 TEST_ASSERT_SUCCESS(status
);
1708 if (U_SUCCESS(status
)) {
1709 runUnicodeTestData("GraphemeBreakTest.txt", bi
);
// Word boundaries.
1713 bi
= (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getEnglish(), status
);
1714 TEST_ASSERT_SUCCESS(status
);
1715 if (U_SUCCESS(status
)) {
1716 runUnicodeTestData("WordBreakTest.txt", bi
);
// Sentence boundaries.
1720 bi
= (RuleBasedBreakIterator
*)BreakIterator::createSentenceInstance(Locale::getEnglish(), status
);
1721 TEST_ASSERT_SUCCESS(status
);
1722 if (U_SUCCESS(status
)) {
1723 runUnicodeTestData("SentenceBreakTest.txt", bi
);
// Line break opportunities.
1727 bi
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale::getEnglish(), status
);
1728 TEST_ASSERT_SUCCESS(status
);
1729 if (U_SUCCESS(status
)) {
1730 runUnicodeTestData("LineBreakTest.txt", bi
);
1736 // Check for test cases from the Unicode test data files that are known to fail
1737 // and should be skipped because ICU is not yet able to fully implement the spec.
1738 // See ticket #7270.
// Compares testCase with a fixed table of known-bad strings and returns the result
// of logKnownIssue("7270") on a match. The file name is checked against
// LineBreakTest.txt first.
// NOTE(review): the early exit for other file names and the final return are not
// visible in this listing - presumably both report that the case is not a known issue.
1740 UBool
RBBITest::testCaseIsKnownIssue(const UnicodeString
&testCase
, const char *fileName
) {
// Each row holds three UTF-16 code units followed by a terminating zero.
1741 static const UChar badTestCases
[][4] = { // Line Numbers from Unicode 7.0.0 file.
1742 {(UChar
)0x200B, (UChar
)0x0020, (UChar
)0x007D, (UChar
)0x0000}, // Line 5198
1743 {(UChar
)0x200B, (UChar
)0x0020, (UChar
)0x0029, (UChar
)0x0000}, // Line 5202
1744 {(UChar
)0x200B, (UChar
)0x0020, (UChar
)0x0021, (UChar
)0x0000}, // Line 5214
1745 {(UChar
)0x200B, (UChar
)0x0020, (UChar
)0x002c, (UChar
)0x0000}, // Line 5246
1746 {(UChar
)0x200B, (UChar
)0x0020, (UChar
)0x002f, (UChar
)0x0000}, // Line 5298
1747 {(UChar
)0x200B, (UChar
)0x0020, (UChar
)0x2060, (UChar
)0x0000} // Line 5302
1749 if (strcmp(fileName
, "LineBreakTest.txt") != 0) {
// Linear search of the table for an exact match with the test string.
1753 for (int i
=0; i
<UPRV_LENGTHOF(badTestCases
); i
++) {
1754 if (testCase
== UnicodeString(badTestCases
[i
])) {
1755 return logKnownIssue("7270");
1762 //--------------------------------------------------------------------------------------------
1764 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
1766 //-------------------------------------------------------------------------------------------
// Reads the named Unicode boundary test file from the source test data directory as
// UTF-8 and tokenizes it with a regular expression. Divide signs add a break
// position, hex numbers append a code point to the test string, and reaching the end
// of a line runs checkUnicodeTestCase() on the accumulated string (unless it is a
// known issue) and then clears the accumulated state.
// NOTE(review): several lines are not visible in this listing (for example the
// declarations of len, lineNumber, spin, buf and token, and most closing braces).
1767 void RBBITest::runUnicodeTestData(const char *fileName
, RuleBasedBreakIterator
*bi
) {
1768 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1769 UErrorCode status
= U_ZERO_ERROR
;
1772 // Open and read the test data file, put it into a UnicodeString.
1774 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
1775 char testFileName
[1000];
// NOTE(review): this check compares only the directory length with the buffer size;
// it does not reserve room for fileName, which strcat appends below.
1776 if (testDataDirectory
== NULL
|| strlen(testDataDirectory
) >= sizeof(testFileName
)) {
1777 dataerrln("Can't open test data. Path too long.");
1780 strcpy(testFileName
, testDataDirectory
);
1781 strcat(testFileName
, fileName
);
1783 logln("Opening data file %s\n", fileName
);
1786 UChar
*testFile
= ReadAndConvertFile(testFileName
, len
, "UTF-8", status
);
// U_FILE_ACCESS_ERROR is excluded from the two assertions below; any failure still
// returns at the check that follows them.
1787 if (status
!= U_FILE_ACCESS_ERROR
) {
1788 TEST_ASSERT_SUCCESS(status
);
1789 TEST_ASSERT(testFile
!= NULL
);
1791 if (U_FAILURE(status
) || testFile
== NULL
) {
1792 return; /* something went wrong, error already output */
1794 UnicodeString
testFileAsString(TRUE
, testFile
, len
);
1797 // Parse the test data file using a regular expression.
1798 // Each kind of token is recognized in its own capture group; what type of item was scanned
1799 // is identified by which group had a match.
1801 // Capture Group # 1 2 3 4 5
1802 // Parses this item: divide x hex digits comment \n unrecognized \n
1804 UnicodeString
tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV
);
1805 RegexMatcher
tokenMatcher(tokenExpr
, testFileAsString
, UREGEX_MULTILINE
| UREGEX_DOTALL
, status
);
1806 UnicodeString testString
;
1807 UVector32
breakPositions(status
);
1809 TEST_ASSERT_SUCCESS(status
);
1810 if (U_FAILURE(status
)) {
1815 // Scan through each test case, building up the string to be broken in testString,
1816 // and the positions that should be boundaries in the breakPositions vector.
1819 while (tokenMatcher
.find()) {
1820 if(tokenMatcher
.hitEnd()) {
1821 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for.
1822 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1823 and caused an infinite loop here on EBCDIC systems!
1825 fprintf(stderr
,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName
, ++spin
);
1828 if (tokenMatcher
.start(1, status
) >= 0) {
1829 // Scanned a divide sign, indicating a break position in the test data.
// A divide sign seen while the test string is still empty is not recorded.
1830 if (testString
.length()>0) {
1831 breakPositions
.addElement(testString
.length(), status
);
1834 else if (tokenMatcher
.start(2, status
) >= 0) {
1835 // Scanned an 'x', meaning no break at this position in the test data
1836 // Nothing to be done here.
1838 else if (tokenMatcher
.start(3, status
) >= 0) {
1839 // Scanned Hex digits. Convert them to binary, append to the character data string.
1840 const UnicodeString
&hexNumber
= tokenMatcher
.group(3, status
);
1841 int length
= hexNumber
.length();
1844 hexNumber
.extract (0, length
, buf
, sizeof(buf
), US_INV
);
// Parse the extracted hex text as a code point value.
1845 UChar32 c
= (UChar32
)strtol(buf
, NULL
, 16);
1847 testString
.append(c
);
1849 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1850 fileName
, lineNumber
);
1853 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1854 fileName
, lineNumber
);
1857 else if (tokenMatcher
.start(4, status
) >= 0) {
1858 // Scanned to end of a line, possibly skipping over a comment in the process.
1859 // If the line from the file contained test data, run the test now.
1860 if (testString
.length() > 0 && !testCaseIsKnownIssue(testString
, fileName
)) {
1861 checkUnicodeTestCase(fileName
, lineNumber
, testString
, &breakPositions
, bi
);
1864 // Clear out this test case.
1865 // The string and breakPositions vector will be refilled as the next
1866 // test case is parsed.
1867 testString
.remove();
1868 breakPositions
.removeAllElements();
1871 // Scanner catchall. Something unrecognized appeared on the line.
1873 UnicodeString uToken
= tokenMatcher
.group(0, status
);
// Copy the offending token into a char buffer, forcing NUL termination, so that it
// can be shown in the error message.
1874 uToken
.extract(0, uToken
.length(), token
, (uint32_t)sizeof(token
));
1875 token
[sizeof(token
)-1] = 0;
1876 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName
, lineNumber
, token
);
1878 // Clean up, in preparation for continuing with the next line.
1879 testString
.remove();
1880 breakPositions
.removeAllElements();
1883 TEST_ASSERT_SUCCESS(status
);
1884 if (U_FAILURE(status
)) {
1890 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1893 //--------------------------------------------------------------------------------------------
1895 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1896 // test data files. Do only a simple, forward-only check -
1897 // this test is mostly to check that ICU and the Unicode
1898 // data agree with each other.
1900 //--------------------------------------------------------------------------------------------
// Sets testString on the iterator and compares each break position it reports with
// the next entry of breakPositions, reporting unexpected breaks and expected breaks
// that were not found. testFileName and lineNumber are used only in the messages.
// NOTE(review): the statements that obtain pos from the iterator and advance
// expectedI are not visible in this listing.
1901 void RBBITest::checkUnicodeTestCase(const char *testFileName
, int lineNumber
,
1902 const UnicodeString
&testString
, // Text data to be broken
1903 UVector32
*breakPositions
, // Positions where breaks should be found.
1904 RuleBasedBreakIterator
*bi
) {
1905 int32_t pos
; // Break Position in the test string
1906 int32_t expectedI
= 0; // Index of expected break position in the vector of expected results.
1907 int32_t expectedPos
; // Expected break position (index into test string)
1909 bi
->setText(testString
);
1913 while (pos
!= BreakIterator::DONE
) {
// The iterator produced more breaks than the expected list contains.
1914 if (expectedI
>= breakPositions
->size()) {
1915 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1916 testFileName
, lineNumber
, pos
);
1919 expectedPos
= breakPositions
->elementAti(expectedI
);
// A break before the next expected position is an extra break.
1920 if (pos
< expectedPos
) {
1921 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1922 testFileName
, lineNumber
, pos
);
// A break beyond the next expected position means the expected one was skipped.
1925 if (pos
> expectedPos
) {
1926 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1927 testFileName
, lineNumber
, expectedPos
);
// Iterator finished: any expected position not yet consumed was missed.
1934 if (pos
==BreakIterator::DONE
&& expectedI
<breakPositions
->size()) {
1935 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1936 testFileName
, lineNumber
, breakPositions
->elementAti(expectedI
));
1942 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1943 //---------------------------------------------------------------------------------------
1945 // class RBBIMonkeyKind
1947 // Monkey Test for Break Iteration
1948 // Abstract interface class. Concrete derived classes independently
1949 // implement the break rules for different iterator types.
1951 // The Monkey Test itself doesn't know which type of break iterator it is
1952 // testing, but works purely in terms of the interface defined here.
1954 //---------------------------------------------------------------------------------------
// Interface for the monkey test: charClasses(), setText() and next() are pure
// virtual and are supplied by the per-iterator-type subclasses.
// NOTE(review): access specifiers, the constructor declaration and the closing of
// the class are not visible in this listing.
1955 class RBBIMonkeyKind
{
1957 // Return a UVector of UnicodeSets, representing the character classes used
1958 // for this type of iterator.
1959 virtual UVector
*charClasses() = 0;
1961 // Set the test text on which subsequent calls to next() will operate
1962 virtual void setText(const UnicodeString
&s
) = 0;
1964 // Find the next break position, starting from the prev break position, or from zero.
1965 // Return -1 after reaching end of string.
1966 virtual int32_t next(int32_t i
) = 0;
1968 virtual ~RBBIMonkeyKind();
// Error status recorded during construction: the base constructor sets it to
// U_ZERO_ERROR and the RBBICharMonkey constructor stores a failing status here.
1969 UErrorCode deferredStatus
;
// Base constructor: start with no deferred error.
1978 RBBIMonkeyKind::RBBIMonkeyKind() {
1979 deferredStatus
= U_ZERO_ERROR
;
// Out-of-line definition of the virtual destructor declared in the class.
1982 RBBIMonkeyKind::~RBBIMonkeyKind() {
1986 //----------------------------------------------------------------------------------------
1988 // Random Numbers. Similar to standard lib rand() and srand()
1989 // Not using library to
1990 // 1. Get same results on all platforms.
1991 // 2. Get access to current seed, to more easily reproduce failures.
1993 //---------------------------------------------------------------------------------------
// Current state (seed) of the test random number generator. It is a plain file-level
// variable so that the seed in use can be read back or assigned, which makes a
// failing run easy to reproduce.
static uint32_t m_seed = 1;

// Returns the next pseudo-random value in the range 0..32767.
// Advances m_seed with a linear congruential step (unsigned 32-bit arithmetic, so
// overflow wraps) and returns 15 bits taken from above the low 16 bits of the state.
// The sequence depends only on m_seed, so it is identical on every platform.
static uint32_t m_rand()
{
    m_seed = m_seed * 1103515245 + 12345;
    return (uint32_t)(m_seed/65536) % 32768;
}
2003 //------------------------------------------------------------------------------------------
2005 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
2006 // of RBBIMonkeyKind.
2008 //------------------------------------------------------------------------------------------
// Character (grapheme cluster) implementation of RBBIMonkeyKind. Holds one
// UnicodeSet pointer per character class consulted by next(), plus a pointer to the
// text being examined.
// NOTE(review): some members used by the constructor (fSets, fLSet, fVSet, fTSet,
// fLVSet), the access specifiers, the constructor declaration and the closing of the
// class are not visible in this listing.
2009 class RBBICharMonkey
: public RBBIMonkeyKind
{
2012 virtual ~RBBICharMonkey();
2013 virtual UVector
*charClasses();
2014 virtual void setText(const UnicodeString
&s
);
2015 virtual int32_t next(int32_t i
);
// Character class sets; each one is allocated in the constructor.
2019 UnicodeSet
*fCRLFSet
;
2020 UnicodeSet
*fControlSet
;
2021 UnicodeSet
*fExtendSet
;
2022 UnicodeSet
*fRegionalIndicatorSet
;
2023 UnicodeSet
*fPrependSet
;
2024 UnicodeSet
*fSpacingSet
;
2029 UnicodeSet
*fLVTSet
;
2030 UnicodeSet
*fHangulSet
;
2031 UnicodeSet
*fAnySet
;
2032 UnicodeSet
*fEmojiModifierSet
;
2033 UnicodeSet
*fEmojiBaseSet
;
2034 UnicodeSet
*fZWJSet
;
2035 UnicodeSet
*fGAZSet
;
// Text under test; read by next() through length(), char32At() and moveIndex32().
2037 const UnicodeString
*fText
;
// Builds the UnicodeSets for the character classes used by next(), mostly from
// Grapheme_Cluster_Break property patterns, and registers them in the fSets vector.
// If any step left status in a failure state, that status is saved in deferredStatus.
2041 RBBICharMonkey::RBBICharMonkey() {
2042 UErrorCode status
= U_ZERO_ERROR
;
2046 fCRLFSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status
);
// The Tags block is removed from the Control set and added to the Extend set.
2047 fControlSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]-[:Block=Tags:]]"), status
);
2048 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}][:Block=Tags:]]"), status
);
2049 fRegionalIndicatorSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status
);
2050 fPrependSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status
);
2051 fSpacingSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status
);
2052 fLSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status
);
2053 fVSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status
);
2054 fTSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status
);
2055 fLVSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status
);
2056 fLVTSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status
);
// fHangulSet is the union of the L, V, T, LV and LVT sets.
2057 fHangulSet
= new UnicodeSet();
2058 fHangulSet
->addAll(*fLSet
);
2059 fHangulSet
->addAll(*fVSet
);
2060 fHangulSet
->addAll(*fTSet
);
2061 fHangulSet
->addAll(*fLVSet
);
2062 fHangulSet
->addAll(*fLVTSet
);
// Every code point from 0 to 0x10ffff.
2063 fAnySet
= new UnicodeSet(0, 0x10ffff);
2067 fEmojiBaseSet
= new UnicodeSet(UnicodeString(
2068 "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443"
2069 "\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483"
2070 "\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647"
2071 "\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]"), status
);
2073 fEmojiModifierSet
= new UnicodeSet(0x0001F3FB, 0x0001F3FF);
2074 fZWJSet
= new UnicodeSet(0x200D, 0x200D);
2075 fGAZSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u2640\\u2642\\u2764\\U0001F308\\U0001F466-\\U0001F469\\U0001F48B\\U0001F5E8]"), status
);
// Collect the sets into fSets.
2077 fSets
= new UVector(status
);
2078 fSets
->addElement(fCRLFSet
, status
);
2079 fSets
->addElement(fControlSet
, status
);
2080 fSets
->addElement(fExtendSet
, status
);
2081 fSets
->addElement(fRegionalIndicatorSet
, status
);
// The Prepend set is only registered when it is non-empty.
2082 if (!fPrependSet
->isEmpty()) {
2083 fSets
->addElement(fPrependSet
, status
);
2085 fSets
->addElement(fSpacingSet
, status
);
2086 fSets
->addElement(fHangulSet
, status
);
2087 fSets
->addElement(fAnySet
, status
);
2088 fSets
->addElement(fEmojiBaseSet
, status
);
2089 fSets
->addElement(fEmojiModifierSet
, status
);
2090 fSets
->addElement(fZWJSet
, status
);
2091 fSets
->addElement(fGAZSet
, status
);
// Remember a failure so that next() can refuse to run.
2092 if (U_FAILURE(status
)) {
2093 deferredStatus
= status
;
// Sets the text on which subsequent calls to next() operate.
// NOTE(review): the body of this function is not visible in this listing.
2098 void RBBICharMonkey::setText(const UnicodeString
&s
) {
// Finds the next grapheme cluster boundary after prevPos by applying the GB rules
// listed below to the code points around each candidate position; c1 is the code
// point before the candidate break and c2 the one after it.
// NOTE(review): the loop statement, the break/continue statement attached to each
// rule and the return statements are not visible in this listing - confirm which
// rules break and which continue against the complete source.
2104 int32_t RBBICharMonkey::next(int32_t prevPos
) {
2105 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2106 // break position being tested. The candidate break
2107 // location is before p2.
2111 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
2112 UChar32 cBase
; // for (X Extend*) patterns, the X character.
// Do nothing useful if construction recorded an error.
2114 if (U_FAILURE(deferredStatus
)) {
2118 // Previous break at end of string. return DONE.
2119 if (prevPos
>= fText
->length()) {
// All four positions start at the previous break; only c3 is loaded from the text.
2122 p0
= p1
= p2
= p3
= prevPos
;
2123 c3
= fText
->char32At(prevPos
);
2124 c0
= c1
= c2
= cBase
= 0;
2125 (void)p0
; // suppress set but not used warning.
2128 // Loop runs once per "significant" character position in the input text.
2130 // Move all of the positions forward in the input string.
2135 // Advance p3 by one codepoint
2136 p3
= fText
->moveIndex32(p3
, 1);
2137 c3
= fText
->char32At(p3
);
2140 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2143 if (p2
== fText
->length()) {
2144 // Reached end of string. Always a break position.
2149 // No Extend or Format characters may appear between the CR and LF,
2150 // which requires the additional check for p2 immediately following p1.
2152 if (c1
==0x0D && c2
==0x0A && p1
==(p2
-1)) {
2156 // Rule (GB4). ( Control | CR | LF ) <break>
2157 if (fControlSet
->contains(c1
) ||
2163 // Rule (GB5) <break> ( Control | CR | LF )
2165 if (fControlSet
->contains(c2
) ||
2172 // Rule (GB6) L x ( L | V | LV | LVT )
2173 if (fLSet
->contains(c1
) &&
2174 (fLSet
->contains(c2
) ||
2175 fVSet
->contains(c2
) ||
2176 fLVSet
->contains(c2
) ||
2177 fLVTSet
->contains(c2
))) {
2181 // Rule (GB7) ( LV | V ) x ( V | T )
2182 if ((fLVSet
->contains(c1
) || fVSet
->contains(c1
)) &&
2183 (fVSet
->contains(c2
) || fTSet
->contains(c2
))) {
2187 // Rule (GB8) ( LVT | T) x T
2188 if ((fLVTSet
->contains(c1
) || fTSet
->contains(c1
)) &&
2189 fTSet
->contains(c2
)) {
2193 // Rule (GB9) x (Extend | ZWJ)
2194 if (fExtendSet
->contains(c2
) || fZWJSet
->contains(c2
)) {
2195 if (!fExtendSet
->contains(c1
)) {
2201 // Rule (GB9a) x SpacingMark
2202 if (fSpacingSet
->contains(c2
)) {
2206 // Rule (GB9b) Prepend x
2207 if (fPrependSet
->contains(c1
)) {
2211 // Rule (GB10) ($E_Base | $GAZ) $Extend* $E_Modifier;
2212 if ((fEmojiBaseSet
->contains(c1
) || fGAZSet
->contains(c1
)) && fEmojiModifierSet
->contains(c2
)) {
// Same rule with Extend characters in between: cBase stands for the base character.
2215 if ((fEmojiBaseSet
->contains(cBase
) || fGAZSet
->contains(cBase
)) &&
2216 fExtendSet
->contains(c1
) && fEmojiModifierSet
->contains(c2
)) {
2220 // Rule (GB11) ZWJ x Glue_After_Zwj
2221 if (fZWJSet
->contains(c1
) && fGAZSet
->contains(c2
)) {
2225 // Rule (GB12-13) Regional_Indicator x Regional_Indicator
2226 // Note: The first if condition is a little tricky. We only need to force
2227 // a break if there are three or more contiguous RIs. If there are
2228 // only two, a break following will occur via other rules, and will include
2229 // any trailing extend characters, which is needed behavior.
2230 if (fRegionalIndicatorSet
->contains(c0
) && fRegionalIndicatorSet
->contains(c1
)
2231 && fRegionalIndicatorSet
->contains(c2
)) {
2234 if (fRegionalIndicatorSet
->contains(c1
) && fRegionalIndicatorSet
->contains(c2
)) {
2238 // Rule (GB999) Any <break> Any
2248 UVector
*RBBICharMonkey::charClasses() {
2253 RBBICharMonkey::~RBBICharMonkey() {
2258 delete fRegionalIndicatorSet
;
2268 delete fEmojiBaseSet
;
2269 delete fEmojiModifierSet
;
2274 //------------------------------------------------------------------------------------------
2276 // class RBBIWordMonkey Word Break specific implementation
2277 // of RBBIMonkeyKind.
2279 //------------------------------------------------------------------------------------------
2280 class RBBIWordMonkey
: public RBBIMonkeyKind
{
2283 virtual ~RBBIWordMonkey();
2284 virtual UVector
*charClasses();
2285 virtual void setText(const UnicodeString
&s
);
2286 virtual int32_t next(int32_t i
);
2292 UnicodeSet
*fNewlineSet
;
2293 UnicodeSet
*fRegionalIndicatorSet
;
2294 UnicodeSet
*fKatakanaSet
;
2295 UnicodeSet
*fHebrew_LetterSet
;
2296 UnicodeSet
*fALetterSet
;
2297 // TODO(jungshik): Do we still need this change?
2298 // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt
2299 UnicodeSet
*fSingle_QuoteSet
;
2300 UnicodeSet
*fDouble_QuoteSet
;
2301 UnicodeSet
*fMidNumLetSet
;
2302 UnicodeSet
*fMidLetterSet
;
2303 UnicodeSet
*fMidNumSet
;
2304 UnicodeSet
*fNumericSet
;
2305 UnicodeSet
*fFormatSet
;
2306 UnicodeSet
*fOtherSet
;
2307 UnicodeSet
*fExtendSet
;
2308 UnicodeSet
*fExtendNumLetSet
;
2309 UnicodeSet
*fDictionaryCjkSet
;
2310 UnicodeSet
*fEBaseSet
;
2311 UnicodeSet
*fEModifierSet
;
2312 UnicodeSet
*fZWSSet
;
2313 UnicodeSet
*fGAZSet
;
2315 const UnicodeString
*fText
;
2319 RBBIWordMonkey::RBBIWordMonkey()
2321 UErrorCode status
= U_ZERO_ERROR
;
2323 fSets
= new UVector(status
);
2325 fCRSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status
);
2326 fLFSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status
);
2327 fNewlineSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status
);
2328 fDictionaryCjkSet
= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status
);
2329 // Exclude Hangul syllables from ALetterSet during testing.
2330 // Leave CJK dictionary characters out from the monkey tests!
2332 fALetterSet
= new UnicodeSet("[\\p{Word_Break = ALetter}"
2333 "[\\p{Line_Break = Complex_Context}"
2334 "-\\p{Grapheme_Cluster_Break = Extend}"
2335 "-\\p{Grapheme_Cluster_Break = Control}"
2339 fRegionalIndicatorSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status
);
2340 fKatakanaSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status
);
2341 fHebrew_LetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status
);
2342 fALetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status
);
2343 fALetterSet
->removeAll(*fDictionaryCjkSet
);
2344 fSingle_QuoteSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"), status
);
2345 fDouble_QuoteSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"), status
);
2346 fMidNumLetSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status
);
2347 fMidLetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter} - [\\:]]"), status
);
2348 fMidNumSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status
);
2349 // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
2350 // we should figure out why
2351 fNumericSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status
);
2352 fFormatSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status
);
2353 fExtendNumLetSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status
);
2354 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status
);
2356 fEBaseSet
= new UnicodeSet(UnicodeString(
2357 "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443"
2358 "\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483"
2359 "\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647"
2360 "\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]"), status
);
2362 fEModifierSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F3FB-\\U0001F3FF]"), status
);
2363 fZWSSet
= new UnicodeSet((UChar32
)0x200D, (UChar32
)0x200D);;
2364 fGAZSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u2640\\u2642\\u2764\\U0001F308\\U0001F466-\\U0001F469\\U0001F48B\\U0001F5E8]"), status
);
2365 fExtendSet
->removeAll(*fZWSSet
);
2368 fOtherSet
= new UnicodeSet();
2369 if(U_FAILURE(status
)) {
2370 deferredStatus
= status
;
2374 fOtherSet
->complement();
2375 fOtherSet
->removeAll(*fCRSet
);
2376 fOtherSet
->removeAll(*fLFSet
);
2377 fOtherSet
->removeAll(*fNewlineSet
);
2378 fOtherSet
->removeAll(*fKatakanaSet
);
2379 fOtherSet
->removeAll(*fHebrew_LetterSet
);
2380 fOtherSet
->removeAll(*fALetterSet
);
2381 fOtherSet
->removeAll(*fSingle_QuoteSet
);
2382 fOtherSet
->removeAll(*fDouble_QuoteSet
);
2383 fOtherSet
->removeAll(*fMidLetterSet
);
2384 fOtherSet
->removeAll(*fMidNumSet
);
2385 fOtherSet
->removeAll(*fNumericSet
);
2386 fOtherSet
->removeAll(*fExtendNumLetSet
);
2387 fOtherSet
->removeAll(*fFormatSet
);
2388 fOtherSet
->removeAll(*fExtendSet
);
2389 fOtherSet
->removeAll(*fRegionalIndicatorSet
);
2390 fOtherSet
->removeAll(*fEBaseSet
);
2391 fOtherSet
->removeAll(*fEModifierSet
);
2392 fOtherSet
->removeAll(*fZWSSet
);
2393 fOtherSet
->removeAll(*fGAZSet
);
2395 // Inhibit dictionary characters from being tested at all.
2396 fOtherSet
->removeAll(*fDictionaryCjkSet
);
2397 fOtherSet
->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status
));
2399 fSets
->addElement(fCRSet
, status
);
2400 fSets
->addElement(fLFSet
, status
);
2401 fSets
->addElement(fNewlineSet
, status
);
2402 fSets
->addElement(fRegionalIndicatorSet
, status
);
2403 fSets
->addElement(fHebrew_LetterSet
, status
);
2404 fSets
->addElement(fALetterSet
, status
);
2405 fSets
->addElement(fSingle_QuoteSet
, status
);
2406 fSets
->addElement(fDouble_QuoteSet
, status
);
2407 //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana
2408 fSets
->addElement(fMidLetterSet
, status
);
2409 fSets
->addElement(fMidNumLetSet
, status
);
2410 fSets
->addElement(fMidNumSet
, status
);
2411 fSets
->addElement(fNumericSet
, status
);
2412 fSets
->addElement(fFormatSet
, status
);
2413 fSets
->addElement(fExtendSet
, status
);
2414 fSets
->addElement(fOtherSet
, status
);
2415 fSets
->addElement(fExtendNumLetSet
, status
);
2417 fSets
->addElement(fEBaseSet
, status
);
2418 fSets
->addElement(fEModifierSet
, status
);
2419 fSets
->addElement(fZWSSet
, status
);
2420 fSets
->addElement(fGAZSet
, status
);
2422 if (U_FAILURE(status
)) {
2423 deferredStatus
= status
;
2427 void RBBIWordMonkey::setText(const UnicodeString
&s
) {
2432 int32_t RBBIWordMonkey::next(int32_t prevPos
) {
2433 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2434 // break position being tested. The candidate break
2435 // location is before p2.
2439 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
2441 if (U_FAILURE(deferredStatus
)) {
2445 // Prev break at end of string. return DONE.
2446 if (prevPos
>= fText
->length()) {
2449 p0
= p1
= p2
= p3
= prevPos
;
2450 c3
= fText
->char32At(prevPos
);
2452 (void)p0
; // Suppress set but not used warning.
2454 // Loop runs once per "significant" character position in the input text.
2456 // Move all of the positions forward in the input string.
2461 // Advance p3 by X(Extend | Format)* Rule 4
2462 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2464 p3
= fText
->moveIndex32(p3
, 1);
2465 c3
= fText
->char32At(p3
);
2466 if (fCRSet
->contains(c2
) || fLFSet
->contains(c2
) || fNewlineSet
->contains(c2
)) {
2470 while (fFormatSet
->contains(c3
) || fExtendSet
->contains(c3
) || fZWSSet
->contains(c3
));
2474 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2477 if (p2
== fText
->length()) {
2478 // Reached end of string. Always a break position.
2483 // No Extend or Format characters may appear between the CR and LF,
2484 // which requires the additional check for p2 immediately following p1.
2486 if (c1
==0x0D && c2
==0x0A) {
2490 // Rule (3a) Break before and after newlines (including CR and LF)
2492 if (fCRSet
->contains(c1
) || fLFSet
->contains(c1
) || fNewlineSet
->contains(c1
)) {
2495 if (fCRSet
->contains(c2
) || fLFSet
->contains(c2
) || fNewlineSet
->contains(c2
)) {
2499 // Rule (3c) ZWJ x GAZ (Glue after ZWJ).
2500 // Not ignoring extend chars, so peek into input text to
2501 // get the potential ZWJ, the character immediately preceding c2.
2502 // Sloppy UChar32 indexing: p2-1 may reference trail half
2503 // but char32At will get the full code point.
2504 if (fZWSSet
->contains(fText
->char32At(p2
-1)) && fGAZSet
->contains(c2
)) {
2508 // Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
2509 if ((fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
)) &&
2510 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
))) {
2514 // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
2516 if ( (fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
)) &&
2517 (fMidLetterSet
->contains(c2
) || fMidNumLetSet
->contains(c2
) || fSingle_QuoteSet
->contains(c2
)) &&
2518 (fALetterSet
->contains(c3
) || fHebrew_LetterSet
->contains(c3
))) {
2522 // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)
2523 if ((fALetterSet
->contains(c0
) || fHebrew_LetterSet
->contains(c0
)) &&
2524 (fMidLetterSet
->contains(c1
) || fMidNumLetSet
->contains(c1
) || fSingle_QuoteSet
->contains(c1
)) &&
2525 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
))) {
2529 // Rule (7a) Hebrew_Letter x Single_Quote
2530 if (fHebrew_LetterSet
->contains(c1
) && fSingle_QuoteSet
->contains(c2
)) {
2534 // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter
2535 if (fHebrew_LetterSet
->contains(c1
) && fDouble_QuoteSet
->contains(c2
) && fHebrew_LetterSet
->contains(c3
)) {
2539 // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter
2540 if (fHebrew_LetterSet
->contains(c0
) && fDouble_QuoteSet
->contains(c1
) && fHebrew_LetterSet
->contains(c2
)) {
2544 // Rule (8) Numeric x Numeric
2545 if (fNumericSet
->contains(c1
) &&
2546 fNumericSet
->contains(c2
)) {
2550 // Rule (9) (ALetter | Hebrew_Letter) x Numeric
2551 if ((fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
)) &&
2552 fNumericSet
->contains(c2
)) {
2556 // Rule (10) Numeric x (ALetter | Hebrew_Letter)
2557 if (fNumericSet
->contains(c1
) &&
2558 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
))) {
2562 // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric
2563 if (fNumericSet
->contains(c0
) &&
2564 (fMidNumSet
->contains(c1
) || fMidNumLetSet
->contains(c1
) || fSingle_QuoteSet
->contains(c1
)) &&
2565 fNumericSet
->contains(c2
)) {
2569 // Rule (12) Numeric x (MidNum | MidNumLet | Single_Quote) Numeric
2570 if (fNumericSet
->contains(c1
) &&
2571 (fMidNumSet
->contains(c2
) || fMidNumLetSet
->contains(c2
) || fSingle_QuoteSet
->contains(c2
)) &&
2572 fNumericSet
->contains(c3
)) {
2576 // Rule (13) Katakana x Katakana
2577 if (fKatakanaSet
->contains(c1
) &&
2578 fKatakanaSet
->contains(c2
)) {
2582 // Rule 13a (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet
2583 if ((fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
) ||fNumericSet
->contains(c1
) ||
2584 fKatakanaSet
->contains(c1
) || fExtendNumLetSet
->contains(c1
)) &&
2585 fExtendNumLetSet
->contains(c2
)) {
2589 // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
2590 if (fExtendNumLetSet
->contains(c1
) &&
2591 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
) ||
2592 fNumericSet
->contains(c2
) || fKatakanaSet
->contains(c2
))) {
2597 if (fRegionalIndicatorSet
->contains(c0
) && fRegionalIndicatorSet
->contains(c1
)) {
2600 if (fRegionalIndicatorSet
->contains(c1
) && fRegionalIndicatorSet
->contains(c2
)) {
2605 if ((fEBaseSet
->contains(c1
) || fGAZSet
->contains(c1
)) && fEModifierSet
->contains(c2
)) {
2609 // Rule 14. Break found here.
2618 UVector
*RBBIWordMonkey::charClasses() {
2623 RBBIWordMonkey::~RBBIWordMonkey() {
2628 delete fKatakanaSet
;
2629 delete fHebrew_LetterSet
;
2631 delete fSingle_QuoteSet
;
2632 delete fDouble_QuoteSet
;
2633 delete fMidNumLetSet
;
2634 delete fMidLetterSet
;
2639 delete fExtendNumLetSet
;
2640 delete fRegionalIndicatorSet
;
2641 delete fDictionaryCjkSet
;
2644 delete fEModifierSet
;
2652 //------------------------------------------------------------------------------------------
2654 // class RBBISentMonkey Sentence Break specific implementation
2655 // of RBBIMonkeyKind.
2657 //------------------------------------------------------------------------------------------
2658 class RBBISentMonkey
: public RBBIMonkeyKind
{
2661 virtual ~RBBISentMonkey();
2662 virtual UVector
*charClasses();
2663 virtual void setText(const UnicodeString
&s
);
2664 virtual int32_t next(int32_t i
);
2666 int moveBack(int posFrom
);
2667 int moveForward(int posFrom
);
2668 UChar32
cAt(int pos
);
2672 UnicodeSet
*fSepSet
;
2673 UnicodeSet
*fFormatSet
;
2675 UnicodeSet
*fLowerSet
;
2676 UnicodeSet
*fUpperSet
;
2677 UnicodeSet
*fOLetterSet
;
2678 UnicodeSet
*fNumericSet
;
2679 UnicodeSet
*fATermSet
;
2680 UnicodeSet
*fSContinueSet
;
2681 UnicodeSet
*fSTermSet
;
2682 UnicodeSet
*fCloseSet
;
2683 UnicodeSet
*fOtherSet
;
2684 UnicodeSet
*fExtendSet
;
2686 const UnicodeString
*fText
;
2690 RBBISentMonkey::RBBISentMonkey()
2692 UErrorCode status
= U_ZERO_ERROR
;
2694 fSets
= new UVector(status
);
2696 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2697 // set and made into character classes of their own. For the monkey impl,
2698 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2699 fSepSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status
);
2700 fFormatSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status
);
2701 fSpSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status
);
2702 fLowerSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status
);
2703 fUpperSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status
);
2704 fOLetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status
);
2705 fNumericSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status
);
2706 fATermSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status
);
2707 fSContinueSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status
);
2708 fSTermSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status
);
2709 fCloseSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status
);
2710 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status
);
2711 fOtherSet
= new UnicodeSet();
2713 if(U_FAILURE(status
)) {
2714 deferredStatus
= status
;
2718 fOtherSet
->complement();
2719 fOtherSet
->removeAll(*fSepSet
);
2720 fOtherSet
->removeAll(*fFormatSet
);
2721 fOtherSet
->removeAll(*fSpSet
);
2722 fOtherSet
->removeAll(*fLowerSet
);
2723 fOtherSet
->removeAll(*fUpperSet
);
2724 fOtherSet
->removeAll(*fOLetterSet
);
2725 fOtherSet
->removeAll(*fNumericSet
);
2726 fOtherSet
->removeAll(*fATermSet
);
2727 fOtherSet
->removeAll(*fSContinueSet
);
2728 fOtherSet
->removeAll(*fSTermSet
);
2729 fOtherSet
->removeAll(*fCloseSet
);
2730 fOtherSet
->removeAll(*fExtendSet
);
2732 fSets
->addElement(fSepSet
, status
);
2733 fSets
->addElement(fFormatSet
, status
);
2734 fSets
->addElement(fSpSet
, status
);
2735 fSets
->addElement(fLowerSet
, status
);
2736 fSets
->addElement(fUpperSet
, status
);
2737 fSets
->addElement(fOLetterSet
, status
);
2738 fSets
->addElement(fNumericSet
, status
);
2739 fSets
->addElement(fATermSet
, status
);
2740 fSets
->addElement(fSContinueSet
, status
);
2741 fSets
->addElement(fSTermSet
, status
);
2742 fSets
->addElement(fCloseSet
, status
);
2743 fSets
->addElement(fOtherSet
, status
);
2744 fSets
->addElement(fExtendSet
, status
);
2746 if (U_FAILURE(status
)) {
2747 deferredStatus
= status
;
2753 void RBBISentMonkey::setText(const UnicodeString
&s
) {
2757 UVector
*RBBISentMonkey::charClasses() {
2762 // moveBack() Find the "significant" code point preceding the index i.
2763 // Skips over ($Extend | $Format)* .
2765 int RBBISentMonkey::moveBack(int i
) {
2772 j
= fText
->moveIndex32(j
, -1);
2773 c
= fText
->char32At(j
);
2775 while (j
>0 &&(fFormatSet
->contains(c
) || fExtendSet
->contains(c
)));
2781 int RBBISentMonkey::moveForward(int i
) {
2782 if (i
>=fText
->length()) {
2783 return fText
->length();
2788 j
= fText
->moveIndex32(j
, 1);
2791 while (fFormatSet
->contains(c
) || fExtendSet
->contains(c
));
2795 UChar32
RBBISentMonkey::cAt(int pos
) {
2796 if (pos
<0 || pos
>=fText
->length()) {
2799 return fText
->char32At(pos
);
2803 int32_t RBBISentMonkey::next(int32_t prevPos
) {
2804 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2805 // break position being tested. The candidate break
2806 // location is before p2.
2810 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
2813 if (U_FAILURE(deferredStatus
)) {
2817 // Prev break at end of string. return DONE.
2818 if (prevPos
>= fText
->length()) {
2821 p0
= p1
= p2
= p3
= prevPos
;
2822 c3
= fText
->char32At(prevPos
);
2824 (void)p0
; // Suppress set but not used warning.
2826 // Loop runs once per "significant" character position in the input text.
2828 // Move all of the positions forward in the input string.
2833 // Advance p3 by X(Extend | Format)* Rule 4
2834 p3
= moveForward(p3
);
2838 if (c1
==0x0d && c2
==0x0a && p2
==(p1
+1)) {
2842 // Rule (4). Sep <break>
2843 if (fSepSet
->contains(c1
)) {
2844 p2
= p1
+1; // Separators don't combine with Extend or Format.
2848 if (p2
>= fText
->length()) {
2849 // Reached end of string. Always a break position.
2853 if (p2
== prevPos
) {
2854 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2858 // Rule (6). ATerm x Numeric
2859 if (fATermSet
->contains(c1
) && fNumericSet
->contains(c2
)) {
2863 // Rule (7). (Upper | Lower) ATerm x Upper
2864 if ((fUpperSet
->contains(c0
) || fLowerSet
->contains(c0
)) &&
2865 fATermSet
->contains(c1
) && fUpperSet
->contains(c2
)) {
2869 // Rule (8) ATerm Close* Sp* x (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
2870 // Note: STerm | ATerm are added to the negated part of the expression by a
2871 // note to the Unicode 5.0 documents.
2873 while (fSpSet
->contains(cAt(p8
))) {
2876 while (fCloseSet
->contains(cAt(p8
))) {
2879 if (fATermSet
->contains(cAt(p8
))) {
2883 if (c
==-1 || fOLetterSet
->contains(c
) || fUpperSet
->contains(c
) ||
2884 fLowerSet
->contains(c
) || fSepSet
->contains(c
) ||
2885 fATermSet
->contains(c
) || fSTermSet
->contains(c
)) {
2888 p8
= moveForward(p8
);
2890 if (fLowerSet
->contains(cAt(p8
))) {
2895 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2896 if (fSContinueSet
->contains(c2
) || fSTermSet
->contains(c2
) || fATermSet
->contains(c2
)) {
2898 while (fSpSet
->contains(cAt(p8
))) {
2901 while (fCloseSet
->contains(cAt(p8
))) {
2905 if (fSTermSet
->contains(c
) || fATermSet
->contains(c
)) {
2910 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
2912 while (fCloseSet
->contains(cAt(p9
))) {
2916 if ((fSTermSet
->contains(c
) || fATermSet
->contains(c
))) {
2917 if (fCloseSet
->contains(c2
) || fSpSet
->contains(c2
) || fSepSet
->contains(c2
)) {
2922 // Rule (10) (STerm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
2924 while (fSpSet
->contains(cAt(p10
))) {
2925 p10
= moveBack(p10
);
2927 while (fCloseSet
->contains(cAt(p10
))) {
2928 p10
= moveBack(p10
);
2930 if (fSTermSet
->contains(cAt(p10
)) || fATermSet
->contains(cAt(p10
))) {
2931 if (fSpSet
->contains(c2
) || fSepSet
->contains(c2
)) {
2936 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>
2938 if (fSepSet
->contains(cAt(p11
))) {
2939 p11
= moveBack(p11
);
2941 while (fSpSet
->contains(cAt(p11
))) {
2942 p11
= moveBack(p11
);
2944 while (fCloseSet
->contains(cAt(p11
))) {
2945 p11
= moveBack(p11
);
2947 if (fSTermSet
->contains(cAt(p11
)) || fATermSet
->contains(cAt(p11
))) {
2951 // Rule (12) Any x Any
2958 RBBISentMonkey::~RBBISentMonkey() {
2968 delete fSContinueSet
;
2977 //-------------------------------------------------------------------------------------------
2981 //-------------------------------------------------------------------------------------------
2983 class RBBILineMonkey
: public RBBIMonkeyKind
{
2986 virtual ~RBBILineMonkey();
2987 virtual UVector
*charClasses();
2988 virtual void setText(const UnicodeString
&s
);
2989 virtual int32_t next(int32_t i
);
2990 virtual void rule9Adjust(int32_t pos
, UChar32
*posChar
, int32_t *nextPos
, UChar32
*nextChar
);
3037 BreakIterator
*fCharBI
;
3038 const UnicodeString
*fText
;
3039 RegexMatcher
*fNumberMatcher
;
3042 RBBILineMonkey::RBBILineMonkey() :
3048 fNumberMatcher(NULL
)
3051 if (U_FAILURE(deferredStatus
)) {
3055 UErrorCode status
= U_ZERO_ERROR
;
3057 fSets
= new UVector(status
);
3059 fBK
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status
);
3060 fCR
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status
);
3061 fLF
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status
);
3062 fCM
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status
);
3063 fNL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status
);
3064 fWJ
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status
);
3065 fZW
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status
);
3066 fGL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status
);
3067 fCB
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status
);
3068 fSP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status
);
3069 fB2
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status
);
3070 fBA
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status
);
3071 fBB
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status
);
3072 fHY
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status
);
3073 fH2
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status
);
3074 fH3
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status
);
3075 fCL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status
);
3076 fCP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status
);
3077 fEX
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status
);
3078 fIN
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status
);
3079 fJL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status
);
3080 fJV
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status
);
3081 fJT
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status
);
3082 fNS
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status
);
3083 fOP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status
);
3084 fQU
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status
);
3085 fIS
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status
);
3086 fNU
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status
);
3087 fPO
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status
);
3088 fPR
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status
);
3089 fSY
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status
);
3090 fAI
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status
);
3091 fAL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status
);
3092 fCJ
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status
);
3093 fHL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status
);
3094 fID
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status
);
3095 fRI
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status
);
3096 fSG
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status
);
3097 fXX
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status
);
3098 fEB
= new UnicodeSet(UnicodeString(
3099 "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443"
3100 "\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483"
3101 "\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647"
3102 "\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]"), status
);
3103 fEM
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F3FB-\\U0001F3FF]"), status
);
3104 fZJ
= new UnicodeSet((UChar32
)0x200D, (UChar32
)0x200D);
3106 if (U_FAILURE(status
)) {
3107 deferredStatus
= status
;
3111 fAL
->addAll(*fXX
); // Default behavior for XX is identical to AL
3112 fAL
->addAll(*fAI
); // Default behavior for AI is identical to AL
3113 fAL
->addAll(*fSG
); // Default behavior for SG is identical to AL.
3115 fNS
->addAll(*fCJ
); // Default behavior for CJ is identical to NS.
3117 fID
->addAll(*fEB
); // Emoji Base and Emoji Modifier behave as ID.
3119 fAL
->removeAll(*fEM
);
3122 fAL
->remove((UChar32
)0x2764); // Emoji Proposal: move u2764 from Al to Id
3123 fAI
->remove((UChar32
)0x2640); // new ZWJ seqs
3124 fAI
->remove((UChar32
)0x2642); // new ZWJ seqs
3125 fID
->add((UChar32
)0x2764);
3126 fID
->add((UChar32
)0x2640);
3127 fID
->add((UChar32
)0x2642);
3129 fSets
->addElement(fBK
, status
);
3130 fSets
->addElement(fCR
, status
);
3131 fSets
->addElement(fLF
, status
);
3132 fSets
->addElement(fCM
, status
);
3133 fSets
->addElement(fNL
, status
);
3134 fSets
->addElement(fWJ
, status
);
3135 fSets
->addElement(fZW
, status
);
3136 fSets
->addElement(fGL
, status
);
3137 fSets
->addElement(fCB
, status
);
3138 fSets
->addElement(fSP
, status
);
3139 fSets
->addElement(fB2
, status
);
3140 fSets
->addElement(fBA
, status
);
3141 fSets
->addElement(fBB
, status
);
3142 fSets
->addElement(fHY
, status
);
3143 fSets
->addElement(fH2
, status
);
3144 fSets
->addElement(fH3
, status
);
3145 fSets
->addElement(fCL
, status
);
3146 fSets
->addElement(fCP
, status
);
3147 fSets
->addElement(fEX
, status
);
3148 fSets
->addElement(fIN
, status
);
3149 fSets
->addElement(fJL
, status
);
3150 fSets
->addElement(fJT
, status
);
3151 fSets
->addElement(fJV
, status
);
3152 fSets
->addElement(fNS
, status
);
3153 fSets
->addElement(fOP
, status
);
3154 fSets
->addElement(fQU
, status
);
3155 fSets
->addElement(fIS
, status
);
3156 fSets
->addElement(fNU
, status
);
3157 fSets
->addElement(fPO
, status
);
3158 fSets
->addElement(fPR
, status
);
3159 fSets
->addElement(fSY
, status
);
3160 fSets
->addElement(fAI
, status
);
3161 fSets
->addElement(fAL
, status
);
3162 fSets
->addElement(fHL
, status
);
3163 fSets
->addElement(fID
, status
);
3164 fSets
->addElement(fWJ
, status
);
3165 fSets
->addElement(fRI
, status
);
3166 fSets
->addElement(fSG
, status
);
3167 fSets
->addElement(fEB
, status
);
3168 fSets
->addElement(fEM
, status
);
3169 fSets
->addElement(fZJ
, status
);
3172 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
3173 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
3174 "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
3175 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
3176 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
3177 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
3179 fNumberMatcher
= new RegexMatcher(
3180 UnicodeString(rules
, -1, US_INV
), 0, status
);
3182 fCharBI
= BreakIterator::createCharacterInstance(Locale::getEnglish(), status
);
3184 if (U_FAILURE(status
)) {
3185 deferredStatus
= status
;
3190 void RBBILineMonkey::setText(const UnicodeString
&s
) {
3192 fCharBI
->setText(s
);
3193 fNumberMatcher
->reset(s
);
3198 // Line Break TR rules 9 and 10 implementation.
3199 // This deals with combining marks and other sequences that
3200 // must be treated as if they were something other than what they actually are.
3202 // This is factored out into a separate function because it must be applied twice for
3203 // each potential break, once to the chars before the position being checked, then
3204 // again to the text following the possible break.
3206 void RBBILineMonkey::rule9Adjust(int32_t pos
, UChar32
*posChar
, int32_t *nextPos
, UChar32
*nextChar
) {
3208 // Invalid initial position. Happens during the warmup iteration of the
3209 // main loop in next().
3213 int32_t nPos
= *nextPos
;
3215 // LB 9 Keep combining sequences together.
3216 // advance over any CM class chars. Note that Line Break CM is different
3217 // from the normal Grapheme Extend property.
3218 if (!(fSP
->contains(*posChar
) || fBK
->contains(*posChar
) || *posChar
==0x0d ||
3219 *posChar
==0x0a ||fNL
->contains(*posChar
) || fZW
->contains(*posChar
))) {
3221 *nextChar
= fText
->char32At(nPos
);
3222 if (!fCM
->contains(*nextChar
)) {
3225 nPos
= fText
->moveIndex32(nPos
, 1);
3230 // LB 9 Treat X CM* as if it were x.
3231 // No explicit action required.
3233 // LB 10 Treat any remaining combining mark as AL
3234 if (fCM
->contains(*posChar
)) {
3235 *posChar
= 0x41; // thisChar = 'A';
3238 // Push the updated nextPos and nextChar back to our caller.
3239 // This only makes a difference if posChar got bigger by consuming a
3240 // combining sequence.
3242 *nextChar
= fText
->char32At(nPos
);
3247 int32_t RBBILineMonkey::next(int32_t startPos
) {
3248 UErrorCode status
= U_ZERO_ERROR
;
3249 int32_t pos
; // Index of the char following a potential break position
3250 UChar32 thisChar
; // Character at above position "pos"
3252 int32_t prevPos
; // Index of the char preceding a potential break position
3253 UChar32 prevChar
; // Character at above position. Note that prevChar
3254 // and thisChar may not be adjacent because combining
3255 // characters between them will be ignored.
3257 int32_t prevPosX2
; // Second previous character. Wider context for LB21a.
3260 int32_t nextPos
; // Index of the next character following pos.
3261 // Usually skips over combining marks.
3262 int32_t nextCPPos
; // Index of the code point following "pos."
3263 // May point to a combining mark.
3264 int32_t tPos
; // temp value.
3267 if (U_FAILURE(deferredStatus
)) {
3271 if (startPos
>= fText
->length()) {
3276 // Initial values for loop. Loop will run the first time without finding breaks,
3277 // while the invalid values shift out and the "this" and
3278 // "prev" positions are filled in with good values.
3279 pos
= prevPos
= prevPosX2
= -1; // Invalid value, serves as flag for initial loop iteration.
3280 thisChar
= prevChar
= prevCharX2
= 0;
3281 nextPos
= nextCPPos
= startPos
;
3284 // Loop runs once per position in the test text, until a break position
3287 prevPosX2
= prevPos
;
3288 prevCharX2
= prevChar
;
3291 prevChar
= thisChar
;
3294 thisChar
= fText
->char32At(pos
);
3296 nextCPPos
= fText
->moveIndex32(pos
, 1);
3297 nextPos
= nextCPPos
;
3299 // Rule LB2 - Break at end of text.
3300 if (pos
>= fText
->length()) {
3304 // Rule LB 9 - adjust for combining sequences.
3305 // We do this one out-of-order because the adjustment does not change anything
3306 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3308 rule9Adjust(prevPos
, &prevChar
, &pos
, &thisChar
);
3309 nextCPPos
= nextPos
= fText
->moveIndex32(pos
, 1);
3310 c
= fText
->char32At(nextPos
);
3311 rule9Adjust(pos
, &thisChar
, &nextPos
, &c
);
3313 // If the loop is still warming up - if we haven't shifted the initial
3314 // -1 positions out of prevPos yet - loop back to advance the
3315 // position in the input without any further looking for breaks.
3316 if (prevPos
== -1) {
3320 // LB 4 Always break after hard line breaks,
3321 if (fBK
->contains(prevChar
)) {
3325 // LB 5 Break after CR, LF, NL, but not inside CR LF
3326 if (prevChar
== 0x0d && thisChar
== 0x0a) {
3329 if (prevChar
== 0x0d ||
3335 // LB 6 Don't break before hard line breaks
3336 if (thisChar
== 0x0d || thisChar
== 0x0a || thisChar
== 0x85 ||
3337 fBK
->contains(thisChar
)) {
3342 // LB 7 Don't break before spaces or zero-width space.
3343 if (fSP
->contains(thisChar
)) {
3347 if (fZW
->contains(thisChar
)) {
3351 // LB 8 Break after zero width space
3352 if (fZW
->contains(prevChar
)) {
3357 // The monkey test's way of ignoring combining characters doesn't work
3358 // for this rule. ZJ is also a CM. Need to get the actual character
3359 // preceding "thisChar", not ignoring combining marks, possibly ZJ.
3361 int32_t prevIdx
= fText
->moveIndex32(pos
, -1);
3362 UChar32 prevC
= fText
->char32At(prevIdx
);
3363 if (fZJ
->contains(prevC
) && fID
->contains(thisChar
)) {
3368 // LB 9, 10 Already done, at top of loop.
3372 // LB 11 Do not break before or after WORD JOINER and related characters.
3376 if (fWJ
->contains(thisChar
) || fWJ
->contains(prevChar
)) {
3382 if (fGL
->contains(prevChar
)) {
3388 if (!(fSP
->contains(prevChar
) ||
3389 fBA
->contains(prevChar
) ||
3390 fHY
->contains(prevChar
) ) && fGL
->contains(thisChar
)) {
3396 // LB 13 Don't break before closings.
3397 // NU x CL, NU x CP, NU x IS and NU x SY are not matched here so that they will
3398 // fall into LB 25 and the more general number regular expression.
3400 if ((!fNU
->contains(prevChar
) && fCL
->contains(thisChar
)) ||
3401 (!fNU
->contains(prevChar
) && fCP
->contains(thisChar
)) ||
3402 fEX
->contains(thisChar
) ||
3403 (!fNU
->contains(prevChar
) && fIS
->contains(thisChar
)) ||
3404 (!fNU
->contains(prevChar
) && fSY
->contains(thisChar
))) {
3408 // LB 14 Don't break after OP SP*
3409 // Scan backwards, checking for this sequence.
3410 // The OP char could include combining marks, so we actually check for
3412 // Another Twist: The Rule 67 fixes may have changed a SP CM
3413 // sequence into an ID char, so before scanning back through spaces,
3414 // verify that prevChar is indeed a space. The prevChar variable
3415 // may differ from fText[prevPos]
3417 if (fSP
->contains(prevChar
)) {
3418 while (tPos
> 0 && fSP
->contains(fText
->char32At(tPos
))) {
3419 tPos
=fText
->moveIndex32(tPos
, -1);
3422 while (tPos
> 0 && fCM
->contains(fText
->char32At(tPos
))) {
3423 tPos
=fText
->moveIndex32(tPos
, -1);
3425 if (fOP
->contains(fText
->char32At(tPos
))) {
3430 // LB 15 QU SP* x OP
3431 if (fOP
->contains(thisChar
)) {
3432 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3434 while (tPos
>0 && fSP
->contains(fText
->char32At(tPos
))) {
3435 tPos
= fText
->moveIndex32(tPos
, -1);
3437 while (tPos
>0 && fCM
->contains(fText
->char32At(tPos
))) {
3438 tPos
= fText
->moveIndex32(tPos
, -1);
3440 if (fQU
->contains(fText
->char32At(tPos
))) {
3447 // LB 16 (CL | CP) SP* x NS
3448 // Scan backwards for SP* CM* (CL | CP)
3449 if (fNS
->contains(thisChar
)) {
3451 while (tPos
>0 && fSP
->contains(fText
->char32At(tPos
))) {
3452 tPos
= fText
->moveIndex32(tPos
, -1);
3454 while (tPos
>0 && fCM
->contains(fText
->char32At(tPos
))) {
3455 tPos
= fText
->moveIndex32(tPos
, -1);
3457 if (fCL
->contains(fText
->char32At(tPos
)) || fCP
->contains(fText
->char32At(tPos
))) {
3463 // LB 17 B2 SP* x B2
3464 if (fB2
->contains(thisChar
)) {
3465 // Scan backwards, checking for the B2 CM* SP* sequence.
3467 if (fSP
->contains(prevChar
)) {
3468 while (tPos
> 0 && fSP
->contains(fText
->char32At(tPos
))) {
3469 tPos
=fText
->moveIndex32(tPos
, -1);
3472 while (tPos
> 0 && fCM
->contains(fText
->char32At(tPos
))) {
3473 tPos
=fText
->moveIndex32(tPos
, -1);
3475 if (fB2
->contains(fText
->char32At(tPos
))) {
3481 // LB 18 break after space
3482 if (fSP
->contains(prevChar
)) {
3489 if (fQU
->contains(thisChar
) || fQU
->contains(prevChar
)) {
3493 // LB 20 Break around a CB
3494 if (fCB
->contains(thisChar
) || fCB
->contains(prevChar
)) {
3499 if (fBA
->contains(thisChar
) ||
3500 fHY
->contains(thisChar
) ||
3501 fNS
->contains(thisChar
) ||
3502 fBB
->contains(prevChar
) ) {
3508 if (fHL
->contains(prevCharX2
) &&
3509 (fHY
->contains(prevChar
) || fBA
->contains(prevChar
))) {
3515 if (fSY
->contains(prevChar
) && fHL
->contains(thisChar
)) {
3520 if ((fAL
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3521 (fEX
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3522 (fHL
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3523 (fID
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3524 (fIN
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3525 (fNU
->contains(prevChar
) && fIN
->contains(thisChar
)) ) {
3534 if ((fID
->contains(prevChar
) && fPO
->contains(thisChar
)) ||
3535 (fAL
->contains(prevChar
) && fNU
->contains(thisChar
)) ||
3536 (fHL
->contains(prevChar
) && fNU
->contains(thisChar
)) ||
3537 (fNU
->contains(prevChar
) && fAL
->contains(thisChar
)) ||
3538 (fNU
->contains(prevChar
) && fHL
->contains(thisChar
)) ) {
3542 // LB 24 Do not break between prefix and letters or ideographs.
3546 // (AL | HL) x PR // Apple early addition
3547 // (AL | HL) x PO // Apple early addition
3548 if ((fPR
->contains(prevChar
) && fID
->contains(thisChar
)) ||
3549 (fPR
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) ||
3550 (fPO
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) ||
3551 ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
)) && fPR
->contains(thisChar
)) ||
3552 ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
)) && fPO
->contains(thisChar
)) ) {
3559 if (fNumberMatcher
->lookingAt(prevPos
, status
)) {
3560 if (U_FAILURE(status
)) {
3563 // Matched a number. But could have been just a single digit, which would
3564 // not represent a "no break here" between prevChar and thisChar
3565 int32_t numEndIdx
= fNumberMatcher
->end(status
); // idx of first char following num
3566 if (numEndIdx
> pos
) {
3567 // Number match includes at least our two chars being checked
3568 if (numEndIdx
> nextPos
) {
3569 // Number match includes additional chars. Update pos and nextPos
3570 // so that next loop iteration will continue at the end of the number,
3571 // checking for breaks between last char in number & whatever follows.
3572 pos
= nextPos
= numEndIdx
;
3574 pos
= fText
->moveIndex32(pos
, -1);
3575 thisChar
= fText
->char32At(pos
);
3576 } while (fCM
->contains(thisChar
));
3583 // LB 26 Do not break a Korean syllable.
3584 if (fJL
->contains(prevChar
) && (fJL
->contains(thisChar
) ||
3585 fJV
->contains(thisChar
) ||
3586 fH2
->contains(thisChar
) ||
3587 fH3
->contains(thisChar
))) {
3591 if ((fJV
->contains(prevChar
) || fH2
->contains(prevChar
)) &&
3592 (fJV
->contains(thisChar
) || fJT
->contains(thisChar
))) {
3596 if ((fJT
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3597 fJT
->contains(thisChar
)) {
3601 // LB 27 Treat a Korean Syllable Block the same as ID.
3602 if ((fJL
->contains(prevChar
) || fJV
->contains(prevChar
) ||
3603 fJT
->contains(prevChar
) || fH2
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3604 fIN
->contains(thisChar
)) {
3607 if ((fJL
->contains(prevChar
) || fJV
->contains(prevChar
) ||
3608 fJT
->contains(prevChar
) || fH2
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3609 fPO
->contains(thisChar
)) {
3612 if (fPR
->contains(prevChar
) && (fJL
->contains(thisChar
) || fJV
->contains(thisChar
) ||
3613 fJT
->contains(thisChar
) || fH2
->contains(thisChar
) || fH3
->contains(thisChar
))) {
3619 // LB 28 Do not break between alphabetics ("at").
3620 if ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
)) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) {
3624 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
3625 if (fIS
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) {
3629 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3632 if ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
) || fNU
->contains(prevChar
)) && fOP
->contains(thisChar
)) {
3635 if (fCP
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
) || fNU
->contains(thisChar
))) {
3639 // LB30a RI RI <break> RI
3641 if (fRI
->contains(prevCharX2
) && fRI
->contains(prevChar
) && fRI
->contains(thisChar
)) {
3644 if (fRI
->contains(prevChar
) && fRI
->contains(thisChar
)) {
3648 // LB30b Emoji Base x Emoji Modifier
3649 if (fEB
->contains(prevChar
) && fEM
->contains(thisChar
)) {
3653 // LB 31 Break everywhere else
3662 UVector
*RBBILineMonkey::charClasses() {
3667 RBBILineMonkey::~RBBILineMonkey() {
3714 delete fNumberMatcher
;
3718 //-------------------------------------------------------------------------------------------
3723 // seed=nnnnn Random number starting seed.
3724 // Setting the seed allows errors to be reproduced.
3725 // loop=nnn Looping count. Controls running time.
3727 // 0 or greater: run length.
3729 // type = char | word | line | sent | title
3732 // intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3734 //-------------------------------------------------------------------------------------------
3736 static int32_t getIntParam(UnicodeString name
, UnicodeString
&params
, int32_t defaultVal
) {
3737 int32_t val
= defaultVal
;
3738 name
.append(" *= *(-?\\d+)");
3739 UErrorCode status
= U_ZERO_ERROR
;
3740 RegexMatcher
m(name
, params
, 0, status
);
3742 // The param exists. Convert the string to an int.
3743 char valString
[100];
3744 int32_t paramLength
= m
.end(1, status
) - m
.start(1, status
);
3745 if (paramLength
>= (int32_t)(sizeof(valString
)-1)) {
3746 paramLength
= (int32_t)(sizeof(valString
)-2);
3748 params
.extract(m
.start(1, status
), paramLength
, valString
, sizeof(valString
));
3749 val
= strtol(valString
, NULL
, 10);
3751 // Delete this parameter from the params string.
3753 params
= m
.replaceFirst("", status
);
3755 U_ASSERT(U_SUCCESS(status
));
3760 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3761 static void testBreakBoundPreceding(RBBITest
*test
, UnicodeString ustr
,
3770 for (i
= bi
->first(); i
!= BreakIterator::DONE
; i
= bi
->next()) {
3772 if (count
< expectedcount
&& expected
[count
] != i
) {
3773 test
->errln("break forward test failed: expected %d but got %d",
3774 expected
[count
], i
);
3779 if (count
!= expectedcount
) {
3780 printStringBreaks(ustr
, expected
, expectedcount
);
3781 test
->errln("break forward test failed: missed %d match",
3782 expectedcount
- count
);
3785 // testing boundaries
3786 for (i
= 1; i
< expectedcount
; i
++) {
3787 int j
= expected
[i
- 1];
3788 if (!bi
->isBoundary(j
)) {
3789 printStringBreaks(ustr
, expected
, expectedcount
);
3790 test
->errln("isBoundary() failed. Expected boundary at position %d", j
);
3793 for (j
= expected
[i
- 1] + 1; j
< expected
[i
]; j
++) {
3794 if (bi
->isBoundary(j
)) {
3795 printStringBreaks(ustr
, expected
, expectedcount
);
3796 test
->errln("isBoundary() failed. Not expecting boundary at position %d", j
);
3802 for (i
= bi
->last(); i
!= BreakIterator::DONE
; i
= bi
->previous()) {
3804 if (forward
[count
] != i
) {
3805 printStringBreaks(ustr
, expected
, expectedcount
);
3806 test
->errln("happy break test previous() failed: expected %d but got %d",
3812 printStringBreaks(ustr
, expected
, expectedcount
);
3813 test
->errln("break test previous() failed: missed a match");
3817 // testing preceding
3818 for (i
= 0; i
< expectedcount
- 1; i
++) {
3819 // int j = expected[i] + 1;
3820 int j
= ustr
.moveIndex32(expected
[i
], 1);
3821 for (; j
<= expected
[i
+ 1]; j
++) {
3822 if (bi
->preceding(j
) != expected
[i
]) {
3823 printStringBreaks(ustr
, expected
, expectedcount
);
3824 test
->errln("preceding(): Not expecting boundary at position %d", j
);
3832 void RBBITest::TestWordBreaks(void)
3834 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3836 Locale
locale("en");
3837 UErrorCode status
= U_ZERO_ERROR
;
3838 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3839 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
3840 // Replaced any C+J characters in a row with a random sequence of characters
3841 // of the same length to make our C+J segmentation not get in the way.
3842 static const char *strlist
[] =
3844 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3845 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3846 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3847 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3848 "\\uac00\\u3588\\u009c\\u0953\\u194b",
3849 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3850 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3851 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3852 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3853 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3854 "\\u2027\\U000e0067\\u0a47\\u00b7",
3855 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3856 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3857 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3858 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3859 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3860 "\\u0027\\u11af\\U000e0057\\u0602",
3861 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3862 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3863 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3864 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3865 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3866 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3867 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3868 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3869 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3870 "\\u18f4\\U000e0049\\u20e7\\u2027",
3871 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3872 "\\ua183\\u102d\\u0bec\\u003a",
3873 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3874 "\\u003a\\u0e57\\u0fad\\u002e",
3875 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3876 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3877 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3878 "\\u003a\\u0664\\u00b7\\u1fba",
3879 "\\u003b\\u0027\\u00b7\\u47a3",
3880 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3881 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3882 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3885 if (U_FAILURE(status
)) {
3886 errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
));
3889 for (loop
= 0; loop
< UPRV_LENGTHOF(strlist
); loop
++) {
3890 // printf("looping %d\n", loop);
3891 UnicodeString ustr
= CharsToUnicodeString(strlist
[loop
]);
3892 // RBBICharMonkey monkey;
3893 RBBIWordMonkey monkey
;
3896 int expectedcount
= 0;
3898 monkey
.setText(ustr
);
3900 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
3901 expected
[expectedcount
++] = i
;
3904 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
3910 void RBBITest::TestWordBoundary(void)
3912 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3913 Locale
locale("en");
3914 UErrorCode status
= U_ZERO_ERROR
;
3915 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3916 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
3918 static const char *strlist
[] =
3920 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3921 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3922 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3923 "\\u2027\\U000e0067\\u0a47\\u00b7",
3924 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3925 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3926 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3927 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3928 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3929 "\\u0027\\u11af\\U000e0057\\u0602",
3930 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3931 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3932 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3933 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3934 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3935 "\\U000e0065\\u302c\\u09ee\\U000e0068",
3936 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3937 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3938 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3939 "\\u58f4\\U000e0049\\u20e7\\u2027",
3940 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3941 "\\ua183\\u102d\\u0bec\\u003a",
3942 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3943 "\\u003a\\u0e57\\u0fad\\u002e",
3944 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3945 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3946 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3947 "\\u003a\\u0664\\u00b7\\u1fba",
3948 "\\u003b\\u0027\\u00b7\\u47a3",
3951 if (U_FAILURE(status
)) {
3952 errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
));
3955 for (loop
= 0; loop
< UPRV_LENGTHOF(strlist
); loop
++) {
3956 // printf("looping %d\n", loop);
3957 u_unescape(strlist
[loop
], str
, 20);
3958 UnicodeString
ustr(str
);
3965 for (i
= bi
->first(); i
!= BreakIterator::DONE
; i
= bi
->next()) {
3966 forward
[count
++] = i
;
3969 for (j
= prev
+ 1; j
< i
; j
++) {
3970 if (bi
->isBoundary(j
)) {
3971 printStringBreaks(ustr
, forward
, count
);
3972 errln("happy boundary test failed: expected %d not a boundary",
3978 if (!bi
->isBoundary(i
)) {
3979 printStringBreaks(ustr
, forward
, count
);
3980 errln("happy boundary test failed: expected %d a boundary",
3990 void RBBITest::TestLineBreaks(void)
3992 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3993 Locale
locale("en");
3994 UErrorCode status
= U_ZERO_ERROR
;
3995 BreakIterator
*bi
= BreakIterator::createLineInstance(locale
, status
);
3996 const int32_t STRSIZE
= 50;
3998 static const char *strlist
[] =
4000 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
4001 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
4002 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
4003 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
4004 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
4005 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
4006 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
4007 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
4008 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
4009 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
4010 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
4011 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
4012 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
4013 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
4014 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
4015 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
4016 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
4017 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
4018 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
4019 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
4020 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
4021 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
4022 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
4023 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
4024 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
4025 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
4026 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
4027 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
4028 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
4029 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
4030 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
4031 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
4032 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
4033 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
4034 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
4035 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
4036 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
4037 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
4038 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
4039 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
4042 TEST_ASSERT_SUCCESS(status
);
4043 if (U_FAILURE(status
)) {
4046 for (loop
= 0; loop
< UPRV_LENGTHOF(strlist
); loop
++) {
4047 // printf("looping %d\n", loop);
4048 int32_t t
= u_unescape(strlist
[loop
], str
, STRSIZE
);
4055 UnicodeString
ustr(str
);
4056 RBBILineMonkey monkey
;
4057 if (U_FAILURE(monkey
.deferredStatus
)) {
4061 const int EXPECTEDSIZE
= 50;
4062 int expected
[EXPECTEDSIZE
];
4063 int expectedcount
= 0;
4065 monkey
.setText(ustr
);
4067 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
4068 if (expectedcount
>= EXPECTEDSIZE
) {
4069 TEST_ASSERT(expectedcount
< EXPECTEDSIZE
);
4072 expected
[expectedcount
++] = i
;
4075 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
4081 void RBBITest::TestSentBreaks(void)
4083 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4084 Locale
locale("en");
4085 UErrorCode status
= U_ZERO_ERROR
;
4086 BreakIterator
*bi
= BreakIterator::createSentenceInstance(locale
, status
);
4088 static const char *strlist
[] =
4090 "Now\ris\nthe\r\ntime\n\rfor\r\r",
4092 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
4093 "\"Sentence ending with a quote.\" Bye.",
4094 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
4095 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
4096 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
4097 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
4098 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
4099 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
4100 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
4101 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
4102 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
4103 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
4104 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
4105 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
4106 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
4107 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
4108 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
4109 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
4112 if (U_FAILURE(status
)) {
4113 errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
));
4116 for (loop
= 0; loop
< UPRV_LENGTHOF(strlist
); loop
++) {
4117 u_unescape(strlist
[loop
], str
, UPRV_LENGTHOF(str
));
4118 UnicodeString
ustr(str
);
4120 RBBISentMonkey monkey
;
4121 if (U_FAILURE(monkey
.deferredStatus
)) {
4125 const int EXPECTEDSIZE
= 50;
4126 int expected
[EXPECTEDSIZE
];
4127 int expectedcount
= 0;
4129 monkey
.setText(ustr
);
4131 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
4132 if (expectedcount
>= EXPECTEDSIZE
) {
4133 TEST_ASSERT(expectedcount
< EXPECTEDSIZE
);
4136 expected
[expectedcount
++] = i
;
4139 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
4145 void RBBITest::TestMonkey(char *params
) {
4146 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4148 UErrorCode status
= U_ZERO_ERROR
;
4149 int32_t loopCount
= 500;
4151 UnicodeString breakType
= "all";
4152 Locale
locale("en");
4153 UBool useUText
= FALSE
;
4155 if (quick
== FALSE
) {
4160 UnicodeString
p(params
);
4161 loopCount
= getIntParam("loop", p
, loopCount
);
4162 seed
= getIntParam("seed", p
, seed
);
4164 RegexMatcher
m(" *type *= *(char|word|line|sent|title) *", p
, 0, status
);
4166 breakType
= m
.group(1, status
);
4168 p
= m
.replaceFirst("", status
);
4171 RegexMatcher
u(" *utext", p
, 0, status
);
4175 p
= u
.replaceFirst("", status
);
4180 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p
, 0, status
).find()) {
4181 // Each option is stripped out of the option string as it is processed.
4182 // All options have been checked. The option string should have been completely emptied.
4184 p
.extract(buf
, sizeof(buf
), NULL
, status
);
4185 buf
[sizeof(buf
)-1] = 0;
4186 errln("Unrecognized or extra parameter: %s\n", buf
);
4192 if (breakType
== "char" || breakType
== "all") {
4194 BreakIterator
*bi
= BreakIterator::createCharacterInstance(locale
, status
);
4195 if (U_SUCCESS(status
)) {
4196 RunMonkey(bi
, m
, "char", seed
, loopCount
, useUText
);
4197 if (breakType
== "all" && useUText
==FALSE
) {
4198 // Also run a quick test with UText when "all" is specified
4199 RunMonkey(bi
, m
, "char", seed
, loopCount
, TRUE
);
4203 errcheckln(status
, "Creation of character break iterator failed %s", u_errorName(status
));
4208 if (breakType
== "word" || breakType
== "all") {
4209 logln("Word Break Monkey Test");
4211 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
4212 if (U_SUCCESS(status
)) {
4213 RunMonkey(bi
, m
, "word", seed
, loopCount
, useUText
);
4216 errcheckln(status
, "Creation of word break iterator failed %s", u_errorName(status
));
4221 if (breakType
== "line" || breakType
== "all") {
4222 logln("Line Break Monkey Test");
4224 BreakIterator
*bi
= BreakIterator::createLineInstance(locale
, status
);
4225 if (loopCount
>= 10) {
4226 loopCount
= loopCount
/ 5; // Line break runs slower than the others.
4228 if (U_SUCCESS(status
)) {
4229 RunMonkey(bi
, m
, "line", seed
, loopCount
, useUText
);
4232 errcheckln(status
, "Creation of line break iterator failed %s", u_errorName(status
));
4237 if (breakType
== "sent" || breakType
== "all" ) {
4238 logln("Sentence Break Monkey Test");
4240 BreakIterator
*bi
= BreakIterator::createSentenceInstance(locale
, status
);
4241 if (loopCount
>= 10) {
4242 loopCount
= loopCount
/ 10; // Sentence runs slower than the other break types
4244 if (U_SUCCESS(status
)) {
4245 RunMonkey(bi
, m
, "sentence", seed
, loopCount
, useUText
);
4248 errcheckln(status
, "Creation of line break iterator failed %s", u_errorName(status
));
4257 // Run a RBBI monkey test. Common routine, for all break iterator types.
4259 // bi - the break iterator to use
4260 // mk - MonkeyKind, abstraction for obtaining expected results
4261 // name - Name of test (char, word, etc.) for use in error messages
4262 // seed - Seed for starting random number generator (parameter from user)
4265 void RBBITest::RunMonkey(BreakIterator
*bi
, RBBIMonkeyKind
&mk
, const char *name
, uint32_t seed
,
4266 int32_t numIterations
, UBool useUText
) {
4268 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4270 const int32_t TESTSTRINGLEN
= 500;
4271 UnicodeString testText
;
4272 int32_t numCharClasses
;
4274 int expected
[TESTSTRINGLEN
*2 + 1];
4275 int expectedCount
= 0;
4276 char expectedBreaks
[TESTSTRINGLEN
*2 + 1];
4277 char forwardBreaks
[TESTSTRINGLEN
*2 + 1];
4278 char reverseBreaks
[TESTSTRINGLEN
*2+1];
4279 char isBoundaryBreaks
[TESTSTRINGLEN
*2+1];
4280 char followingBreaks
[TESTSTRINGLEN
*2+1];
4281 char precedingBreaks
[TESTSTRINGLEN
*2+1];
4287 numCharClasses
= mk
.charClasses()->size();
4288 chClasses
= mk
.charClasses();
4290 // Check for errors that occurred during the construction of the MonkeyKind object.
4291 // Can't report them where they occurred because errln() is a method coming from intlTest,
4292 // and is not visible outside of RBBITest :-(
4293 if (U_FAILURE(mk
.deferredStatus
)) {
4294 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk
.deferredStatus
));
4298 // Verify that the character classes all have at least one member.
4299 for (i
=0; i
<numCharClasses
; i
++) {
4300 UnicodeSet
*s
= (UnicodeSet
*)chClasses
->elementAt(i
);
4301 if (s
== NULL
|| s
->size() == 0) {
4302 errln("Character Class #%d is null or of zero size.", i
);
4307 while (loopCount
< numIterations
|| numIterations
== -1) {
4308 if (numIterations
== -1 && loopCount
% 10 == 0) {
4309 // If test is running in an infinite loop, display a periodic tic so
4310 // we can tell that it is making progress.
4311 fprintf(stderr
, ".");
4313 // Save current random number seed, so that we can recreate the random numbers
4314 // for this loop iteration in event of an error.
4317 // Populate a test string with data.
4318 testText
.truncate(0);
4319 for (i
=0; i
<TESTSTRINGLEN
; i
++) {
4320 int32_t aClassNum
= m_rand() % numCharClasses
;
4321 UnicodeSet
*classSet
= (UnicodeSet
*)chClasses
->elementAt(aClassNum
);
4322 int32_t charIdx
= m_rand() % classSet
->size();
4323 UChar32 c
= classSet
->charAt(charIdx
);
4324 if (c
< 0) { // TODO: deal with sets containing strings.
4325 errln("%s:%d c < 0", __FILE__
, __LINE__
);
4328 // Do not assemble a supplementary character from randomly generated separate surrogates.
4329 // (It could be a dictionary character)
4330 if (U16_IS_TRAIL(c
) && testText
.length() > 0 && U16_IS_LEAD(testText
.charAt(testText
.length()-1))) {
4337 // Calculate the expected results for this test string.
4338 mk
.setText(testText
);
4339 memset(expectedBreaks
, 0, sizeof(expectedBreaks
));
4340 expectedBreaks
[0] = 1;
4341 int32_t breakPos
= 0;
4344 breakPos
= mk
.next(breakPos
);
4345 if (breakPos
== -1) {
4348 if (breakPos
> testText
.length()) {
4349 errln("breakPos > testText.length()");
4351 expectedBreaks
[breakPos
] = 1;
4352 U_ASSERT(expectedCount
<testText
.length());
4353 expected
[expectedCount
++] = breakPos
;
4354 (void)expected
; // Set but not used warning.
4355 // TODO (andy): check it out.
4358 // Find the break positions using forward iteration
4359 memset(forwardBreaks
, 0, sizeof(forwardBreaks
));
4361 UErrorCode status
= U_ZERO_ERROR
;
4362 UText
*testUText
= utext_openReplaceable(NULL
, &testText
, &status
);
4363 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4364 bi
->setText(testUText
, status
);
4365 TEST_ASSERT_SUCCESS(status
);
4366 utext_close(testUText
); // The break iterator does a shallow clone of the UText
4367 // This UText can be closed immediately, so long as the
4368 // testText string continues to exist.
4370 bi
->setText(testText
);
4373 for (i
=bi
->first(); i
!= BreakIterator::DONE
; i
=bi
->next()) {
4374 if (i
< 0 || i
> testText
.length()) {
4375 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name
);
4378 forwardBreaks
[i
] = 1;
4381 // Find the break positions using reverse iteration
4382 memset(reverseBreaks
, 0, sizeof(reverseBreaks
));
4383 for (i
=bi
->last(); i
!= BreakIterator::DONE
; i
=bi
->previous()) {
4384 if (i
< 0 || i
> testText
.length()) {
4385 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name
);
4388 reverseBreaks
[i
] = 1;
4391 // Find the break positions using isBoundary() tests.
4392 memset(isBoundaryBreaks
, 0, sizeof(isBoundaryBreaks
));
4393 U_ASSERT((int32_t)sizeof(isBoundaryBreaks
) > testText
.length());
4394 for (i
=0; i
<=testText
.length(); i
++) {
4395 isBoundaryBreaks
[i
] = bi
->isBoundary(i
);
4399 // Find the break positions using the following() function.
4401 memset(followingBreaks
, 0, sizeof(followingBreaks
));
4402 int32_t lastBreakPos
= 0;
4403 followingBreaks
[0] = 1;
4404 for (i
=0; i
<testText
.length(); i
++) {
4405 breakPos
= bi
->following(i
);
4406 if (breakPos
<= i
||
4407 breakPos
< lastBreakPos
||
4408 breakPos
> testText
.length() ||
4409 (breakPos
> lastBreakPos
&& lastBreakPos
> i
)) {
4410 errln("%s break monkey test: "
4411 "Out of range value returned by BreakIterator::following().\n"
4412 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4413 name
, seed
, i
, breakPos
, lastBreakPos
);
4416 followingBreaks
[breakPos
] = 1;
4417 lastBreakPos
= breakPos
;
4420 // Find the break positions using the preceding() function.
4421 memset(precedingBreaks
, 0, sizeof(precedingBreaks
));
4422 lastBreakPos
= testText
.length();
4423 precedingBreaks
[testText
.length()] = 1;
4424 for (i
=testText
.length(); i
>0; i
--) {
4425 breakPos
= bi
->preceding(i
);
4426 if (breakPos
>= i
||
4427 breakPos
> lastBreakPos
||
4428 (breakPos
< 0 && testText
.getChar32Start(i
)>0) ||
4429 (breakPos
< lastBreakPos
&& lastBreakPos
< testText
.getChar32Start(i
)) ) {
4430 errln("%s break monkey test: "
4431 "Out of range value returned by BreakIterator::preceding().\n"
4432 "index=%d; prev returned %d; lastBreak=%d" ,
4433 name
, i
, breakPos
, lastBreakPos
);
4434 if (breakPos
>= 0 && breakPos
< (int32_t)sizeof(precedingBreaks
)) {
4435 precedingBreaks
[i
] = 2; // Forces an error.
4438 if (breakPos
>= 0) {
4439 precedingBreaks
[breakPos
] = 1;
4441 lastBreakPos
= breakPos
;
4445 // Compare the expected and actual results.
4446 for (i
=0; i
<=testText
.length(); i
++) {
4447 const char *errorType
= NULL
;
4448 if (forwardBreaks
[i
] != expectedBreaks
[i
]) {
4449 errorType
= "next()";
4450 } else if (reverseBreaks
[i
] != forwardBreaks
[i
]) {
4451 errorType
= "previous()";
4452 } else if (isBoundaryBreaks
[i
] != expectedBreaks
[i
]) {
4453 errorType
= "isBoundary()";
4454 } else if (followingBreaks
[i
] != expectedBreaks
[i
]) {
4455 errorType
= "following()";
4456 } else if (precedingBreaks
[i
] != expectedBreaks
[i
]) {
4457 errorType
= "preceding()";
4461 if (errorType
!= NULL
) {
4462 // Format a range of the test text that includes the failure as
4463 // a data item that can be included in the rbbi test data file.
4465 // Start of the range is the last point where expected and actual results
4466 // both agreed that there was a break position.
4467 int startContext
= i
;
4470 if (startContext
==0) { break; }
4472 if (expectedBreaks
[startContext
] != 0) {
4473 if (count
== 2) break;
4478 // End of range is two expected breaks past the start position.
4479 int endContext
= i
+ 1;
4481 for (ci
=0; ci
<2; ci
++) { // Number of items to include in error text.
4483 if (endContext
>= testText
.length()) {break;}
4484 if (expectedBreaks
[endContext
-1] != 0) {
4485 if (count
== 0) break;
4492 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4493 UnicodeString errorText
= "<data>";
4494 /***if (strcmp(errorType, "next()") == 0) {
4496 endContext = testText.length();
4498 printStringBreaks(testText, expected, expectedCount);
4501 for (ci
=startContext
; ci
<endContext
;) {
4502 UnicodeString
hexChars("0123456789abcdef");
4505 c
= testText
.char32At(ci
);
4507 // This is the location of the error.
4508 errorText
.append("<?>");
4509 } else if (expectedBreaks
[ci
] != 0) {
4510 // This is a non-error expected break position.
4511 errorText
.append("\\");
4514 errorText
.append("\\u");
4515 for (bn
=12; bn
>=0; bn
-=4) {
4516 errorText
.append(hexChars
.charAt((c
>>bn
)&0xf));
4519 errorText
.append("\\U");
4520 for (bn
=28; bn
>=0; bn
-=4) {
4521 errorText
.append(hexChars
.charAt((c
>>bn
)&0xf));
4524 ci
= testText
.moveIndex32(ci
, 1);
4526 errorText
.append("\\");
4527 errorText
.append("</data>\n");
4530 char charErrorTxt
[500];
4531 UErrorCode status
= U_ZERO_ERROR
;
4532 errorText
.extract(charErrorTxt
, sizeof(charErrorTxt
), NULL
, status
);
4533 charErrorTxt
[sizeof(charErrorTxt
)-1] = 0;
4534 const char *badLocale
= bi
->getLocaleID(ULOC_ACTUAL_LOCALE
, status
);
4536 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
4537 name
, badLocale
, (expectedBreaks
[i
]? "break expected but not found" : "break found but not expected"),
4538 errorType
, seed
, i
, charErrorTxt
);
4549 // Bug 5532. UTF-8 based UText fails in dictionary code.
4550 // This test checks the initial patch,
4551 // which is to just keep it from crashing. Correct word boundaries
4552 // await a proper fix to the dictionary code.
// Regression test for bug 5532: runs a Thai ("th") word break iterator over
// UTF-8 text supplied through a UText, and checks that forward iteration
// advances on every step and reports at least one break.
4554 void RBBITest::TestBug5532(void) {
4555 // Text includes a mixture of Thai and Latin.
// The byte array below is raw UTF-8 and ends with an explicit 0x00 byte.
4556 const unsigned char utf8Data
[] = {
4557 0xE0u
, 0xB8u
, 0x82u
, 0xE0u
, 0xB8u
, 0xB2u
, 0xE0u
, 0xB8u
, 0xA2u
, 0xE0u
,
4558 0xB9u
, 0x80u
, 0xE0u
, 0xB8u
, 0x84u
, 0xE0u
, 0xB8u
, 0xA3u
, 0xE0u
, 0xB8u
,
4559 0xB7u
, 0xE0u
, 0xB9u
, 0x88u
, 0xE0u
, 0xB8u
, 0xADu
, 0xE0u
, 0xB8u
, 0x87u
,
4560 0xE0u
, 0xB9u
, 0x80u
, 0xE0u
, 0xB8u
, 0xA5u
, 0xE0u
, 0xB9u
, 0x88u
, 0xE0u
,
4561 0xB8u
, 0x99u
, 0xE0u
, 0xB8u
, 0x8Bu
, 0xE0u
, 0xB8u
, 0xB5u
, 0xE0u
, 0xB8u
,
4562 0x94u
, 0xE0u
, 0xB8u
, 0xB5u
, 0x20u
, 0x73u
, 0x69u
, 0x6Du
, 0x20u
, 0x61u
,
4563 0x75u
, 0x64u
, 0x69u
, 0x6Fu
, 0x2Fu
, 0x20u
, 0x4Du
, 0x4Fu
, 0x4Fu
, 0x4Eu
,
4564 0x20u
, 0x65u
, 0x63u
, 0x6Cu
, 0x69u
, 0x70u
, 0x73u
, 0x65u
, 0x20u
, 0xE0u
,
4565 0xB8u
, 0xA3u
, 0xE0u
, 0xB8u
, 0xB2u
, 0xE0u
, 0xB8u
, 0x84u
, 0xE0u
, 0xB8u
,
4566 0xB2u
, 0x20u
, 0x34u
, 0x37u
, 0x30u
, 0x30u
, 0x20u
, 0xE0u
, 0xB8u
, 0xA2u
,
4567 0xE0u
, 0xB8u
, 0xB9u
, 0xE0u
, 0xB9u
, 0x82u
, 0xE0u
, 0xB8u
, 0xA3u
, 0x00};
4569 UErrorCode status
= U_ZERO_ERROR
;
// Open a UText over the raw bytes; the length argument is -1 (the array
// carries its own terminating 0x00 byte).
4570 UText utext
=UTEXT_INITIALIZER
;
4571 utext_openUTF8(&utext
, (const char *)utf8Data
, -1, &status
);
4572 TEST_ASSERT_SUCCESS(status
);
// Word break iterator for the Thai locale.
4574 BreakIterator
*bi
= BreakIterator::createWordInstance(Locale("th"), status
);
4575 TEST_ASSERT_SUCCESS(status
);
// The iterator is only exercised when status still reports success after
// both the UText open and the iterator creation.
4576 if (U_SUCCESS(status
)) {
4577 bi
->setText(&utext
, status
);
4578 TEST_ASSERT_SUCCESS(status
);
4580 int32_t breakCount
= 0;
4581 int32_t previousBreak
= -1;
// Step forward with next() until DONE, counting the boundaries returned.
4582 for (bi
->first(); bi
->next() != BreakIterator::DONE
; breakCount
++) {
4583 // For now, just make sure that the break iterator doesn't hang.
// Each boundary must lie strictly after the one seen on the previous step.
4584 TEST_ASSERT(previousBreak
< bi
->current());
4585 previousBreak
= bi
->current();
// At least one boundary must have been reported by the loop above.
4587 TEST_ASSERT(breakCount
> 0);
4590 utext_close(&utext
);
// Test for bug 9983: walks two word break iterators (root locale and
// en_US_POSIX) backward over the same text with previous(), reading
// getRuleStatus() at every stop, and asserts iterationCount == 6 for each.
4594 void RBBITest::TestBug9983(void) {
4595 UnicodeString text
= UnicodeString("\\u002A" // * Other
4597 "\\u309C" // Katakana
4601 "\\u0000").unescape();
4603 UErrorCode status
= U_ZERO_ERROR
;
// Both iterators are created as word instances, cast to
// RuleBasedBreakIterator, and held in LocalPointer objects.
4604 LocalPointer
<RuleBasedBreakIterator
> brkiter(static_cast<RuleBasedBreakIterator
*>(
4605 BreakIterator::createWordInstance(Locale::getRoot(), status
)));
4606 TEST_ASSERT_SUCCESS(status
);
4607 LocalPointer
<RuleBasedBreakIterator
> brkiterPOSIX(static_cast<RuleBasedBreakIterator
*>(
4608 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status
)));
4609 TEST_ASSERT_SUCCESS(status
);
4610 if (U_FAILURE(status
)) {
4613 int32_t offset
, rstatus
, iterationCount
;
// Backward pass with the root-locale iterator.
4615 brkiter
->setText(text
);
4618 while ( (offset
= brkiter
->previous()) != UBRK_DONE
) {
4620 rstatus
= brkiter
->getRuleStatus();
4621 (void)rstatus
; // Suppress set but not used warning.
// Guard on the iteration count; the total asserted after the loop is 6.
4622 if (iterationCount
>= 10) {
4626 TEST_ASSERT(iterationCount
== 6);
// Same backward pass with the en_US_POSIX iterator, positioned with last() first.
4628 brkiterPOSIX
->setText(text
);
4629 brkiterPOSIX
->last();
4631 while ( (offset
= brkiterPOSIX
->previous()) != UBRK_DONE
) {
4633 rstatus
= brkiterPOSIX
->getRuleStatus();
4634 (void)rstatus
; // Suppress set but not used warning.
// Guard on the iteration count; the total asserted after the loop is 6.
4635 if (iterationCount
>= 10) {
4639 TEST_ASSERT(iterationCount
== 6);
4644 // TestDebug - A place-holder test for debugging purposes.
4645 // For putting in fragments of other tests that can be invoked
4646 // for tracing without a lot of unwanted extra stuff happening.
// Scratch test: queries isBoundary(8) on a break iterator and prints positions
// together with ruleStatus while stepping backward with previous().
4648 void RBBITest::TestDebug(void) {
4650 UErrorCode status
= U_ZERO_ERROR
;
// Iterator under investigation: currently the sentence instance for the
// default locale; the line and word alternatives are kept commented out.
4654 RuleBasedBreakIterator
* bi
=
4655 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4656 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4657 (RuleBasedBreakIterator
*)BreakIterator::createSentenceInstance(Locale::getDefault(), status
);
// Input written with backslash escape sequences; a simpler alternative
// string is kept commented out below it.
4658 UnicodeString
s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4659 // UnicodeString s("Aaa. Bcd");
// Query and print whether offset 8 is a boundary.
4662 UBool r
= bi
->isBoundary(8);
4663 printf("%s", r
?"true":"false");
4667 // ruleStatus = bi->getRuleStatus();
// Print the current position and ruleStatus, then move to the previous
// boundary; the loop ends when previous() returns DONE.
4668 printf("%d\t%d\n", pos
, ruleStatus
);
4669 pos
= bi
->previous();
4670 } while (pos
!= BreakIterator::DONE
);
// Builds a UnicodeSet from the pattern [:GCB=Prepend:] and takes the reporting
// branch when that set is not empty.
4674 void RBBITest::TestProperties() {
4675 UErrorCode errorCode
= U_ZERO_ERROR
;
// NOTE(review): errorCode is not checked in the lines shown here - confirm
// whether a pattern/data failure should be reported separately.
4676 UnicodeSet
prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode
);
// A non-empty set means the assumption recorded in the message below no
// longer holds.
4677 if (!prependSet
.isEmpty()) {
4679 "[:GCB=Prepend:] is not empty any more. "
4680 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4681 "change this test to the opposite condition.");
4685 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */