1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
5 * Copyright (c) 1999-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8 /************************************************************************
9 * Date Name Description
10 * 12/15/99 Madhu Creation.
11 * 01/12/2000 Madhu Updated for changed API and added new tests
12 ************************************************************************/
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_BREAK_ITERATION
21 #include "unicode/brkiter.h"
22 #include "unicode/localpointer.h"
23 #include "unicode/numfmt.h"
24 #include "unicode/rbbi.h"
25 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
26 #include "unicode/regex.h"
28 #include "unicode/schriter.h"
29 #include "unicode/uchar.h"
30 #include "unicode/utf16.h"
31 #include "unicode/ucnv.h"
32 #include "unicode/uniset.h"
33 #include "unicode/uscript.h"
34 #include "unicode/ustring.h"
35 #include "unicode/utext.h"
42 #include "utypeinfo.h" // for 'typeid' to work
46 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
47 #include "unicode/filteredbrk.h"
48 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
// TEST_ASSERT(x): if the expression x evaluates to false, report a failure through
// errln(), tagged with the current source file and line. Execution continues afterwards.
50 #define TEST_ASSERT(x) {if (!(x)) { \
51 errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
// TEST_ASSERT_SUCCESS(errcode): if errcode is a failure according to U_FAILURE(), report it
// through errcheckln(), including the file, line and u_errorName() of the status.
53 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
54 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
57 //---------------------------------------------
59 //---------------------------------------------
62 // Note: Before adding new tests to this file, check whether the desired test data can
63 // simply be added to the file testdata/rbbitst.txt. In most cases it can;
64 // it's much less work than writing a new test, diagnostic output in the event of failures
65 // is good, and the test data file is shared with ICU4J, so eventually the test
66 // will run there as well, without additional effort.
// runIndexedTest: test-suite dispatcher for RBBITest.
// Logs the suite name when 'exec' is set, then lists every test case through
// TESTCASE_AUTO. Several of the entries sit behind UCONFIG_NO_FILE_IO /
// UCONFIG_NO_REGULAR_EXPRESSIONS preprocessor guards.
// NOTE(review): the matching #endif lines and the TESTCASE_AUTO_BEGIN/END markers are
// not visible in this listing, so which entries each guard covers cannot be confirmed here.
68 void RBBITest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* params
)
70 if (exec
) logln("TestSuite RuleBasedBreakIterator: ");
74 #if !UCONFIG_NO_FILE_IO
75 TESTCASE_AUTO(TestBug4153072
);
77 TESTCASE_AUTO(TestStatusReturn
);
78 #if !UCONFIG_NO_FILE_IO
79 TESTCASE_AUTO(TestUnicodeFiles
);
80 TESTCASE_AUTO(TestEmptyString
);
82 TESTCASE_AUTO(TestGetAvailableLocales
);
83 TESTCASE_AUTO(TestGetDisplayName
);
84 #if !UCONFIG_NO_FILE_IO
85 TESTCASE_AUTO(TestEndBehaviour
);
86 TESTCASE_AUTO(TestWordBreaks
);
87 TESTCASE_AUTO(TestWordBoundary
);
88 TESTCASE_AUTO(TestLineBreaks
);
89 TESTCASE_AUTO(TestSentBreaks
);
90 TESTCASE_AUTO(TestExtended
);
// TestMonkey is only listed when both regular expressions and file I/O are available.
92 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
93 TESTCASE_AUTO(TestMonkey
);
95 #if !UCONFIG_NO_FILE_IO
96 TESTCASE_AUTO(TestBug3818
);
98 TESTCASE_AUTO(TestDebug
);
99 #if !UCONFIG_NO_FILE_IO
100 TESTCASE_AUTO(TestBug5775
);
102 TESTCASE_AUTO(TestBug9983
);
103 TESTCASE_AUTO(TestDictRules
);
104 TESTCASE_AUTO(TestBug5532
);
105 TESTCASE_AUTO(TestBug7547
);
106 TESTCASE_AUTO(TestBug12797
);
107 TESTCASE_AUTO(TestBug12918
);
108 TESTCASE_AUTO(TestBug12932
);
109 TESTCASE_AUTO(TestEmoji
);
114 //---------------------------------------------------------------------------
116 // class BITestData Holds a set of Break iterator test data and results
118 // - the string data to be broken
119 // - a vector of the expected break positions.
120 // - a vector of source line numbers for the data,
121 // (to help see where errors occurred.)
122 // - The expected break tag values.
123 // - Vectors of actual break positions and tag values.
124 // - Functions for comparing actual with expected and
127 //----------------------------------------------------------------------------
// Members of BITestData (the class header line is not visible in this listing).
// The text that the break iterator under test is run over.
130 UnicodeString fDataToBreak
;
// Expected break positions; addDataChunk() appends the running text length here.
131 UVector fExpectedBreakPositions
;
// Expected tag value for each expected break position.
132 UVector fExpectedTags
;
// Break positions actually produced by the iterator during a test pass.
134 UVector fActualBreakPositions
; // Test Results.
// Constructs the data holder; 'status' is handed to the member vectors' constructors.
137 BITestData(UErrorCode
&status
);
// Appends one chunk of text together with its expected tag and source line number.
138 void addDataChunk(const char *data
, int32_t tag
, int32_t lineNum
, UErrorCode status
);
// Compares actual against expected results and reports differences through 'test'.
139 void checkResults(const char *heading
, RBBITest
*test
);
// Reports a single mismatch between the expected and actual entries at the given indices.
140 void err(const char *heading
, RBBITest
*test
, int32_t expectedIdx
, int32_t actualIdx
);
// Constructor: initializes each UVector member with the caller's status object.
// NOTE(review): the initializer list ends in a comma here; the remaining initializer(s)
// and the constructor body are not visible in this listing.
147 BITestData::BITestData(UErrorCode
&status
)
148 : fExpectedBreakPositions(status
), fExpectedTags(status
), fLineNum(status
), fActualBreakPositions(status
),
154 // addDataChunk. Add a section (non-breaking) piece of data to the test data.
155 // The macro form collects the line number, which is helpful
156 // when tracking down failures.
158 // A null data item is inserted at the start of each test's data
159 // to put the starting zero into the data list. The position saved for
160 // each non-null item is its ending position.
// ADD_DATACHUNK forwards to addDataChunk(), supplying __LINE__ as the line number so
// that failures can be traced back to the line that added the data.
162 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status);
// addDataChunk: append 'data' (converted with CharsToUnicodeString) to the text under test
// and record, for the break at the end of this chunk:
//   - its position (the text length after the append),
//   - its expected tag,
//   - the source line number it came from.
// Does nothing if 'status' already indicates failure.
// NOTE(review): 'status' is received by value, so any failure set by the addElement()
// calls below is not visible to the caller — confirm whether a reference was intended.
163 void BITestData::addDataChunk(const char *data
, int32_t tag
, int32_t lineNum
, UErrorCode status
) {
164 if (U_FAILURE(status
)) {return;}
166 fDataToBreak
.append(CharsToUnicodeString(data
));
// The break position saved for a chunk is its ending offset within the accumulated text.
168 fExpectedBreakPositions
.addElement(fDataToBreak
.length(), status
);
169 fExpectedTags
.addElement(tag
, status
);
170 fLineNum
.addElement(lineNum
, status
);
175 // checkResults. Compare the actual and expected break positions, report any differences.
// checkResults: walk the expected and actual break-position vectors side by side and
// report every difference through 'test'; 'heading' prefixes each message.
// NOTE(review): the enclosing loop, the index increments and the closing braces are not
// visible in this listing.
177 void BITestData::checkResults(const char *heading
, RBBITest
*test
) {
178 int32_t expectedIndex
= 0;
179 int32_t actualIndex
= 0;
182 // If we've run through both the expected and actual results vectors, we're done.
183 // break out of the loop.
184 if (expectedIndex
>= fExpectedBreakPositions
.size() &&
185 actualIndex
>= fActualBreakPositions
.size()) {
// Expected results are exhausted: report against the last expected entry.
190 if (expectedIndex
>= fExpectedBreakPositions
.size()) {
191 err(heading
, test
, expectedIndex
-1, actualIndex
);
// Actual results are exhausted: report against the last actual entry.
196 if (actualIndex
>= fActualBreakPositions
.size()) {
197 err(heading
, test
, expectedIndex
, actualIndex
-1);
// Both vectors still have entries: the positions must agree.
202 if (fActualBreakPositions
.elementAti(actualIndex
) != fExpectedBreakPositions
.elementAti(expectedIndex
)) {
203 err(heading
, test
, expectedIndex
, actualIndex
);
204 // Try to resync the positions of the indices, to avoid a rash of spurious errors.
205 if (fActualBreakPositions
.elementAti(actualIndex
) < fExpectedBreakPositions
.elementAti(expectedIndex
)) {
// The tag recorded for this break is compared against the expected tag as well.
213 if (fActualTags
.elementAti(actualIndex
) != fExpectedTags
.elementAti(expectedIndex
)) {
214 test
->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",
215 heading
, fLineNum
.elementAt(expectedIndex
),
216 fExpectedTags
.elementAti(expectedIndex
), fActualTags
.elementAti(actualIndex
));
225 // err - An error was found. Report it, along with information about where the
226 // incorrectly broken test data appeared in the source file.
// err: report one mismatch between expected and actual break positions.
//   heading      - text placed at the start of the message
//   test         - test object whose errln() receives the message
//   expectedIdx  - index into the expected-position / line-number vectors
//   actualIdx    - index into the actual-position vector
// Two messages exist: an unexpected (premature) break when actual < expected, and a
// "failed to find break" message for the other case.
// NOTE(review): the declaration of 'o', the else branches and the closing braces are not
// visible in this listing.
228 void BITestData::err(const char *heading
, RBBITest
*test
, int32_t expectedIdx
, int32_t actualIdx
)
230 int32_t expected
= fExpectedBreakPositions
.elementAti(expectedIdx
);
231 int32_t actual
= fActualBreakPositions
.elementAti(actualIdx
);
// Source line of the test item the expected break belongs to.
233 int32_t line
= fLineNum
.elementAti(expectedIdx
);
234 if (expectedIdx
> 0) {
235 // The line numbers are off by one because a premature break occurs somewhere
236 // within the previous item, rather than at the start of the current (expected) item.
237 // We want to report the offset of the unexpected break from the start of
238 // this previous item.
// 'o' becomes the actual break's offset relative to the previous expected break.
239 o
= actual
- fExpectedBreakPositions
.elementAti(expectedIdx
-1);
241 if (actual
< expected
) {
242 test
->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading
, o
, line
, actual
, expected
);
244 test
->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading
, line
, actual
, expected
);
// clearResults: discard the actual break positions and actual tags collected by a
// previous test pass. The expected data is left untouched.
249 void BITestData::clearResults() {
250 fActualBreakPositions
.removeAllElements();
251 fActualTags
.removeAllElements();
255 //--------------------------------------------------------------------------------------
257 // RBBITest constructor and destructor
259 //--------------------------------------------------------------------------------------
// Default constructor of the test class (body not visible in this listing).
261 RBBITest::RBBITest() {
// Destructor of the test class (body not visible in this listing).
266 RBBITest::~RBBITest() {
269 //-----------------------------------------------------------------------------------
271 // Test for status {tag} return value from break rules.
272 // TODO: a more thorough test.
274 //-----------------------------------------------------------------------------------
// TestStatusReturn: build a RuleBasedBreakIterator from a small custom rule set, iterate
// over a fixed test string, and check both the break positions (bounds1) and the value
// returned by getRuleStatus() at each break (brkStatus).
// NOTE(review): some rule lines, the declarations of 'pos' and 'i', the increment of 'i'
// and the early return after a construction failure are not visible in this listing.
275 void RBBITest::TestStatusReturn() {
276 UnicodeString
rulesString1("$Letters = [:L:];\n"
277 "$Numbers = [:N:];\n"
280 "Help\\ /me\\!{4};\n"
281 "[^$Letters $Numbers];\n"
282 "!.*;\n", -1, US_INV
);
283 UnicodeString testString1
= "abc123..abc Help me Help me!";
284 // 01234567890123456789012345678
// Expected break offsets and the expected status at each one; both lists end with -1.
285 int32_t bounds1
[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
286 int32_t brkStatus
[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
288 UErrorCode status
=U_ZERO_ERROR
;
289 UParseError parseError
;
// LocalPointer owns the iterator, so it is released when 'bi' goes out of scope.
291 LocalPointer
<BreakIterator
> bi(new RuleBasedBreakIterator(rulesString1
, parseError
, status
));
292 if(U_FAILURE(status
)) {
293 dataerrln("%s:%d error in break iterator construction - %s", __FILE__
, __LINE__
, u_errorName(status
));
298 bi
->setText(testString1
);
// Walk every boundary from first() until next() returns DONE.
299 for (pos
=bi
->first(); pos
!= BreakIterator::DONE
; pos
=bi
->next()) {
300 if (pos
!= bounds1
[i
]) {
301 errln("%s:%d expected break at %d, got %d\n", __FILE__
, __LINE__
, bounds1
[i
], pos
);
// The rule status of the boundary just reached must match the expected tag.
305 int tag
= bi
->getRuleStatus();
306 if (tag
!= brkStatus
[i
]) {
307 errln("%s:%d break at %d, expected tag %d, got tag %d\n", __FILE__
, __LINE__
, pos
, brkStatus
[i
], tag
);
// printStringBreaks (UText overload): diagnostic dump to stdout.
// Prints one row per code point of 'tstr' with its code, selected properties and
// character name, and prints a separator row when the current index reaches or passes
// the next entry of 'expected' (an array of 'expectedCount' positions).
// NOTE(review): the 'name' buffer declaration, the advance of nextExpectedIndex and
// several printf arguments are not visible in this listing.
315 static void printStringBreaks(UText
*tstr
, int expected
[], int expectedCount
) {
316 UErrorCode status
= U_ZERO_ERROR
;
318 printf("code alpha extend alphanum type word sent line name\n");
319 int nextExpectedIndex
= 0;
320 utext_setNativeIndex(tstr
, 0);
// The loop index is re-read from the UText each time, so it advances by whole code points.
321 for (int j
= 0; j
< utext_nativeLength(tstr
); j
=utext_getNativeIndex(tstr
)) {
322 if (nextExpectedIndex
< expectedCount
&& j
>= expected
[nextExpectedIndex
] ) {
323 printf("------------------------------------------------ %d\n", j
);
// utext_next32() returns the code point and moves the UText index past it.
327 UChar32 c
= utext_next32(tstr
);
328 u_charName(c
, U_UNICODE_CHAR_NAME
, name
, 100, &status
);
329 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c
,
331 u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
),
333 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY
,
335 U_SHORT_PROPERTY_NAME
),
336 u_getPropertyValueName(UCHAR_WORD_BREAK
,
337 u_getIntPropertyValue(c
,
339 U_SHORT_PROPERTY_NAME
),
340 u_getPropertyValueName(UCHAR_SENTENCE_BREAK
,
341 u_getIntPropertyValue(c
,
342 UCHAR_SENTENCE_BREAK
),
343 U_SHORT_PROPERTY_NAME
),
344 u_getPropertyValueName(UCHAR_LINE_BREAK
,
345 u_getIntPropertyValue(c
,
347 U_SHORT_PROPERTY_NAME
),
// printStringBreaks (UnicodeString overload): opens a read-only UText over 'ustr' and
// delegates to the UText overload. If the UText cannot be opened, the error name is
// printed instead.
// NOTE(review): the declaration of 'tstr', the early return on failure and the closing
// of the UText are not visible in this listing.
353 static void printStringBreaks(const UnicodeString
&ustr
, int expected
[], int expectedCount
) {
354 UErrorCode status
= U_ZERO_ERROR
;
356 tstr
= utext_openConstUnicodeString(NULL
, &ustr
, &status
);
357 if (U_FAILURE(status
)) {
358 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status
));
361 printStringBreaks(tstr
, expected
, expectedCount
);
// TestBug3818: with a Thai word break iterator over a string of four identical
// four-character Thai words, following(1) and following(0) are both expected to
// return 4, the start of the second word.
// NOTE(review): the early return after a creation failure and the deletion of 'bi' are
// not visible in this listing.
366 void RBBITest::TestBug3818() {
367 UErrorCode status
= U_ZERO_ERROR
;
369 // Four Thai words...
370 static const UChar thaiWordData
[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
371 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
372 UnicodeString
thaiStr(thaiWordData
);
374 BreakIterator
* bi
= BreakIterator::createWordInstance(Locale("th"), status
);
375 if (U_FAILURE(status
) || bi
== NULL
) {
376 errcheckln(status
, "Fail at file %s, line %d, status = %s", __FILE__
, __LINE__
, u_errorName(status
));
379 bi
->setText(thaiStr
);
// Starting from inside the first word.
381 int32_t startOfSecondWord
= bi
->following(1);
382 if (startOfSecondWord
!= 4) {
383 errln("Fail at file %s, line %d expected start of word at 4, got %d",
384 __FILE__
, __LINE__
, startOfSecondWord
);
// Starting from the very beginning of the text.
386 startOfSecondWord
= bi
->following(0);
387 if (startOfSecondWord
!= 4) {
388 errln("Fail at file %s, line %d expected start of word at 4, got %d",
389 __FILE__
, __LINE__
, startOfSecondWord
);
394 //----------------------------------------------------------------------------
396 // generalIteratorTest Given a break iterator and a set of test data,
397 // Run the tests and report the results.
399 //----------------------------------------------------------------------------
// generalIteratorTest: point 'bi' at the text held in 'td', then run the individual
// checks in turn: first/next, last/previous, following, preceding, isBoundary, and the
// multiple-selection (clone / next(n)) test.
400 void RBBITest::generalIteratorTest(RuleBasedBreakIterator
& bi
, BITestData
&td
)
403 bi
.setText(td
.fDataToBreak
);
405 testFirstAndNext(bi
, td
);
407 testLastAndPrevious(bi
, td
);
409 testFollowing(bi
, td
);
410 testPreceding(bi
, td
);
411 testIsBoundary(bi
, td
);
412 doMultipleSelectionTest(bi
, td
);
417 // testFirstAndNext. Run the iterator forwards in the obvious first(), next()
// testFirstAndNext: iterate forward with first()/next(), recording each break position
// and its rule status into td's actual-result vectors, then compare them with the
// expected results via td.checkResults().
// NOTE(review): the declarations of 'p' and 'tag', the clearing of earlier results and
// the no-progress check are not visible in this listing.
420 void RBBITest::testFirstAndNext(RuleBasedBreakIterator
& bi
, BITestData
&td
)
422 UErrorCode status
= U_ZERO_ERROR
;
427 logln("Test first and next");
428 bi
.setText(td
.fDataToBreak
);
431 for (p
=bi
.first(); p
!=RuleBasedBreakIterator::DONE
; p
=bi
.next()) {
432 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
433 tag
= bi
.getRuleStatus();
434 td
.fActualTags
.addElement(tag
, status
);
436 // If the iterator is not making forward progress, stop.
437 // No need to raise an error here, it'll be detected in the normal check of results.
442 td
.checkResults("testFirstAndNext", this);
447 // TestLastAndPrevious. Run the iterator backwards, starting with last().
// testLastAndPrevious: iterate backward with last()/previous(). Each position and its
// rule status is inserted at index 0 of td's actual-result vectors, so the vectors end
// up in ascending order for td.checkResults().
// NOTE(review): the declarations of 'p' and 'tag' and the use of 'lastP' in the
// no-progress check are not visible in this listing.
449 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator
& bi
, BITestData
&td
)
451 UErrorCode status
= U_ZERO_ERROR
;
// Starting value for the last position seen; presumably larger than any real break
// position — verify against the elided no-progress check.
453 int32_t lastP
= 0x7ffffffe;
456 logln("Test last and previous");
457 bi
.setText(td
.fDataToBreak
);
460 for (p
=bi
.last(); p
!=RuleBasedBreakIterator::DONE
; p
=bi
.previous()) {
461 // Save break position. Insert it at start of vector of results, shoving
462 // already-saved results further towards the end.
463 td
.fActualBreakPositions
.insertElementAt(p
, 0, status
);
464 // bi.previous(); // TODO: Why does this fix things up????
466 tag
= bi
.getRuleStatus();
467 td
.fActualTags
.insertElementAt(tag
, 0, status
);
469 // If the iterator is not making progress, stop.
470 // No need to raise an error here, it'll be detected in the normal check of results.
475 td
.checkResults("testLastAndPrevious", this);
// testFollowing: step an index through the text (0 .. length+1), record each new break
// position and its rule status, check that the loop index ended at the text length, and
// finally compare the collected results with the expected ones.
// NOTE(review): the declarations of 'p', 'tag' and 'i', the assignments to 'p' (the
// starting point and the per-iteration following() call) and the 'break' statement are
// not visible in this listing.
479 void RBBITest::testFollowing(RuleBasedBreakIterator
& bi
, BITestData
&td
)
481 UErrorCode status
= U_ZERO_ERROR
;
484 int32_t lastP
= -2; // A value that will never be returned as a break position.
485 // cannot be -1; that is returned for DONE.
488 logln("testFollowing():");
489 bi
.setText(td
.fDataToBreak
);
492 // Save the starting point, since we won't get that out of following.
494 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
495 tag
= bi
.getRuleStatus();
496 td
.fActualTags
.addElement(tag
, status
);
// The bound deliberately runs past the end of the text (length()+1).
498 for (i
= 0; i
<= td
.fDataToBreak
.length()+1; i
++) {
501 if (p
== RuleBasedBreakIterator::DONE
) {
504 // We've reached a new break position. Save it.
505 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
506 tag
= bi
.getRuleStatus();
507 td
.fActualTags
.addElement(tag
, status
);
511 // The loop normally exits by means of the break in the middle.
512 // Make sure that the index was at the correct position for the break iterator to have
514 if (i
!= td
.fDataToBreak
.length()) {
515 errln("testFollowing(): iterator returned DONE prematurely.");
518 // Full check of all results.
519 td
.checkResults("testFollowing", this);
// testPreceding: mirror image of testFollowing. Steps an index from the text length down
// to -1, inserts each new break position and its rule status at the front of td's
// actual-result vectors, then compares them with the expected results.
// NOTE(review): the declarations of 'p', 'tag' and 'i', the assignments to 'p', the
// 'break' statement and the condition guarding the "DONE prematurely" message are not
// visible in this listing.
524 void RBBITest::testPreceding(RuleBasedBreakIterator
& bi
, BITestData
&td
) {
525 UErrorCode status
= U_ZERO_ERROR
;
528 int32_t lastP
= 0x7ffffffe;
531 logln("testPreceding():");
532 bi
.setText(td
.fDataToBreak
);
// The starting position is appended; later positions are inserted in front of it.
536 td
.fActualBreakPositions
.addElement(p
, status
);
537 tag
= bi
.getRuleStatus();
538 td
.fActualTags
.addElement(tag
, status
);
// The bound deliberately runs one step below the start of the text (-1).
540 for (i
= td
.fDataToBreak
.length(); i
>=-1; i
--) {
543 if (p
== RuleBasedBreakIterator::DONE
) {
546 // We've reached a new break position. Save it.
547 td
.fActualBreakPositions
.insertElementAt(p
, 0, status
);
549 tag
= bi
.getRuleStatus();
550 td
.fActualTags
.insertElementAt(tag
, 0, status
);
553 // The loop normally exits by means of the break in the middle.
554 // Make sure that the index was at the correct position for the break iterator to have
557 errln("testPreceding(): iterator returned DONE prematurely.");
560 // Full check of all results.
561 td
.checkResults("testPreceding", this);
// testIsBoundary: call isBoundary() for every index from 0 through the text length
// (inclusive), record each index reported as a boundary together with its rule status,
// then compare the collected results with the expected ones.
// NOTE(review): the declarations of 'i' and 'tag' and the clearing of earlier results
// are not visible in this listing.
566 void RBBITest::testIsBoundary(RuleBasedBreakIterator
& bi
, BITestData
&td
) {
567 UErrorCode status
= U_ZERO_ERROR
;
571 logln("testIsBoundary():");
572 bi
.setText(td
.fDataToBreak
);
575 for (i
= 0; i
<= td
.fDataToBreak
.length(); i
++) {
576 if (bi
.isBoundary(i
)) {
577 td
.fActualBreakPositions
.addElement(i
, status
); // Save result.
578 tag
= bi
.getRuleStatus();
579 td
.fActualTags
.addElement(tag
, status
);
582 td
.checkResults("testIsBoundary: ", this);
// doMultipleSelectionTest: check that next(n) on a clone agrees with repeated
// next()/previous() calls on the original iterator, in both directions, and exercise
// operator== / operator!= between the clone and the original along the way.
// NOTE(review): the declarations of 'testOffset' and 'count', the 'do' keywords, the
// count updates and the deletion of the clone are not visible in this listing.
587 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator
& iterator
, BITestData
&td
)
589 iterator
.setText(td
.fDataToBreak
);
591 RuleBasedBreakIterator
* testIterator
=(RuleBasedBreakIterator
*)iterator
.clone();
592 int32_t offset
= iterator
.first();
596 logln("doMultipleSelectionTest text of length: %d", td
.fDataToBreak
.length());
// A fresh clone must compare equal to the iterator it was cloned from.
598 if (*testIterator
!= iterator
)
599 errln("clone() or operator!= failed: two clones compared unequal");
// Forward pass: restart the clone and jump 'count' boundaries in a single next(n) call.
602 testOffset
= testIterator
->first();
603 testOffset
= testIterator
->next(count
);
604 if (offset
!= testOffset
)
605 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count
+ ", next(n) returned " + testOffset
+ " and next() had " + offset
);
607 if (offset
!= RuleBasedBreakIterator::DONE
) {
609 offset
= iterator
.next();
// After the original has moved on, the two iterators are expected to compare unequal.
611 if (offset
!= RuleBasedBreakIterator::DONE
&& *testIterator
== iterator
) {
612 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count
, offset
);
// Safety valve against a runaway loop of failures.
613 if (count
> 10000 || offset
== -1) {
614 errln("operator== failed too many times. Stopping test.");
616 errln("Does (RuleBasedBreakIterator::DONE == -1)?");
622 } while (offset
!= RuleBasedBreakIterator::DONE
);
624 // now do it backwards...
625 offset
= iterator
.last();
629 testOffset
= testIterator
->last();
630 testOffset
= testIterator
->next(count
); // next() with a negative arg is same as previous
631 if (offset
!= testOffset
)
632 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count
+ ", next(n) returned " + testOffset
+ " and next() had " + offset
);
634 if (offset
!= RuleBasedBreakIterator::DONE
) {
636 offset
= iterator
.previous();
638 } while (offset
!= RuleBasedBreakIterator::DONE
);
644 //---------------------------------------------
648 //---------------------------------------------
// TestEmptyString: run the general iterator test over empty test data, using a line
// break iterator for the default locale.
// NOTE(review): the early return after a creation failure and the deletion of 'bi' are
// not visible in this listing.
649 void RBBITest::TestEmptyString()
651 UnicodeString text
= "";
652 UErrorCode status
= U_ZERO_ERROR
;
654 BITestData
x(status
);
655 ADD_DATACHUNK(x
, "", 0, status
); // Break at start of data
// The factory returns a BreakIterator*; it is cast to the rule-based type that
// generalIteratorTest() takes.
656 RuleBasedBreakIterator
* bi
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale::getDefault(), status
);
657 if (U_FAILURE(status
))
659 errcheckln(status
, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status
));
662 generalIteratorTest(*bi
, x
);
// TestGetAvailableLocales: fetch the list of locales for which break iterators are
// available, complain if it is empty, and log every locale name to verify that the
// returned memory is readable.
// NOTE(review): the condition guarding the "empty list" message and the declaration of
// 'i' are not visible in this listing.
666 void RBBITest::TestGetAvailableLocales()
668 int32_t locCount
= 0;
// locCount is filled in by the call.
669 const Locale
* locList
= BreakIterator::getAvailableLocales(locCount
);
672 dataerrln("getAvailableLocales() returned an empty list!");
673 // Just make sure that it's returning good memory.
675 for (i
= 0; i
< locCount
; ++i
) {
676 logln(locList
[i
].getName());
680 //Testing the BreakIterator::getDisplayName() function
// TestGetDisplayName: check BreakIterator::getDisplayName() for the US locale (only when
// the default locale is US) and for the France locale displayed in the US locale.
// NOTE(review): the tails of both dataerrln() expressions are not visible in this listing.
681 void RBBITest::TestGetDisplayName()
683 UnicodeString result
;
685 BreakIterator::getDisplayName(Locale::getUS(), result
);
// The two-argument form is only checked when the default locale is US.
686 if (Locale::getDefault() == Locale::getUS() && result
!= "English (United States)")
687 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
// Here the display locale is passed explicitly, so the check is unconditional.
690 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result
);
691 if (result
!= "French (France)")
692 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
// TestEndBehaviour: with a default-locale word break iterator over "boo.", expect breaks
// at the beginning of the string, before the period, and at the end of the string.
// NOTE(review): the early return after a creation failure, the condition guarding the
// "before period" message and the deletion of 'wb' are not visible in this listing.
699 void RBBITest::TestEndBehaviour()
701 UErrorCode status
= U_ZERO_ERROR
;
702 UnicodeString
testString("boo.");
703 BreakIterator
*wb
= BreakIterator::createWordInstance(Locale::getDefault(), status
);
704 if (U_FAILURE(status
))
706 errcheckln(status
, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status
));
709 wb
->setText(testString
);
711 if (wb
->first() != 0)
712 errln("Didn't get break at beginning of string.");
714 errln("Didn't get break before period in \"boo.\"");
// With &&, next() is only called when current() is not already 4; the error is
// reported only if neither call yields 4.
715 if (wb
->current() != 4 && wb
->next() != 4)
716 errln("Didn't get break at end of string.");
// TestBug4153072: give a word break iterator a StringCharacterIterator restricted to a
// sub-range of "...Hello, World!..." and check isBoundary() for indices from -1 up to
// 'begin'. Only index 0 is expected to be reported as a boundary.
// NOTE(review): the declarations of 'begin', 'index' and 'onBoundary', the early return
// after a creation failure and the deletion of 'iter' are not visible in this listing.
722 void RBBITest::TestBug4153072() {
723 UErrorCode status
= U_ZERO_ERROR
;
724 BreakIterator
*iter
= BreakIterator::createWordInstance(Locale::getDefault(), status
);
725 if (U_FAILURE(status
))
727 errcheckln(status
, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status
));
730 UnicodeString
str("...Hello, World!...");
// End of the restricted range: three characters short of the end of the string.
732 int32_t end
= str
.length() - 3;
735 StringCharacterIterator
* textIterator
= new StringCharacterIterator(str
, begin
, end
, begin
);
// adoptText() is given the heap-allocated iterator; presumably ownership passes to
// 'iter' — confirm against the BreakIterator API documentation.
736 iter
->adoptText(textIterator
);
738 // Note: with the switch to UText, there is no way to restrict the
739 // iteration range to begin at an index other than zero.
740 // String character iterators created with a non-zero bound are
741 // treated by RBBI as being empty.
742 for (index
= -1; index
< begin
+ 1; ++index
) {
743 onBoundary
= iter
->isBoundary(index
);
// Fails if index 0 is not a boundary, or if any other index is.
744 if (index
== 0? !onBoundary
: onBoundary
) {
745 errln((UnicodeString
)"Didn't handle isBoundary correctly with offset = " + index
+
746 " and begin index = " + begin
);
754 // Test for problem reported by Ashok Matoria on 9 July 2007
755 // One.<kSoftHyphen><kSpace>Two.
757 // Sentence break at start (0) and then on calling next() it breaks at
758 // 'T' of "Two". Now, at this point if I do next() and
759 // then previous(), it breaks at <kSoftHyphen> instead of 'T' of "Two".
// TestBug5775: sentence-break regression test. Over the text "One.<U+00AD> Two." the
// iterator is expected to report breaks at 6 and then 10 going forward, and 6 again
// after a previous() call.
// NOTE(review): the early returns, the unescape/setText steps and the second next()
// call are not visible in this listing.
761 void RBBITest::TestBug5775() {
762 UErrorCode status
= U_ZERO_ERROR
;
763 BreakIterator
*bi
= BreakIterator::createSentenceInstance(Locale::getEnglish(), status
);
764 TEST_ASSERT_SUCCESS(status
);
765 if (U_FAILURE(status
)) {
768 // Check for status first for better handling of no data errors.
769 TEST_ASSERT(bi
!= NULL
);
// The backslash-u escape is stored literally by this constructor; presumably it is
// unescaped on one of the elided lines — TODO confirm.
774 UnicodeString
s("One.\\u00ad Two.", -1, US_INV
);
778 int pos
= bi
->next();
779 TEST_ASSERT(pos
== 6);
781 TEST_ASSERT(pos
== 10);
782 pos
= bi
->previous();
783 TEST_ASSERT(pos
== 6);
789 //------------------------------------------------------------------------------
791 // RBBITest::Extended Run RBBI Tests from an external test data file
793 //------------------------------------------------------------------------------
// Members of TestParams, the parameter bundle for the data-driven test (the class header
// line is not visible in this listing).
796 BreakIterator
*bi
; // Break iterator is set while parsing test source.
797 // Changed out whenever test data changes break type.
799 UnicodeString dataToBreak
; // Data that is built up while parsing the test.
800 UVector32
*expectedBreaks
; // Expected break positions, matches dataToBreak UnicodeString.
801 UVector32
*srcLine
; // Positions in source file, indexed same as dataToBreak.
804 UText
*textToBreak
; // UText, could be UTF8 or UTF16.
805 UVector32
*textMap
; // Map from UTF-16 dataToBreak offsets to UText offsets.
806 CharString utf8String
; // UTF-8 form of text to break.
// Constructor: allocates the UVector32 members, passing 'status' to each.
// NOTE(review): initialization of 'bi' and 'textToBreak' is not visible in this listing.
808 TestParams(UErrorCode
&status
) : dataToBreak() {
810 expectedBreaks
= new UVector32(status
);
811 srcLine
= new UVector32(status
);
812 srcCol
= new UVector32(status
);
814 textMap
= new UVector32(status
);
// Destructor fragment: releases expectedBreaks and closes the UText.
// NOTE(review): the destructor header and the other deletes are not visible in this listing.
819 delete expectedBreaks
;
822 utext_close(textToBreak
);
// Lookups keyed by a break position 'bp' in the UText being broken.
826 int32_t getSrcLine(int32_t bp
);
827 int32_t getExpectedBreak(int32_t bp
);
828 int32_t getSrcCol(int32_t bp
);
// Select whether textToBreak is opened over the UTF-16 or the UTF-8 form of the data.
830 void setUTF16(UErrorCode
&status
);
831 void setUTF8(UErrorCode
&status
);
834 // Append a UnicodeString to a CharString with UTF-8 encoding.
835 // Substitute any invalid chars.
836 // Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
// CharStringAppend: append 'src' to 'dest' as UTF-8, replacing invalid input with U+FFFD.
// Works in two passes: a preflight call to learn the UTF-8 length, then a conversion
// into an append buffer obtained from 'dest'.
// Does nothing useful if 'status' is already a failure on entry.
// NOTE(review): the declarations of 'utf8Length' and 'capacity', the status argument of
// the preflight call and the early returns are not visible in this listing.
837 static void CharStringAppend(CharString
&dest
, const UnicodeString
&src
, UErrorCode
&status
) {
838 if (U_FAILURE(status
)) {
842 u_strToUTF8WithSub(NULL
, 0, &utf8Length
, // Output Buffer, NULL for preflight.
843 src
.getBuffer(), src
.length(), // UTF-16 data
844 0xfffd, NULL
, // Substitution char, number of subs.
// A buffer-overflow status is the expected outcome of preflighting; anything else is an error.
846 if (U_FAILURE(status
) && status
!= U_BUFFER_OVERFLOW_ERROR
) {
// Reset the status so the real conversion starts clean.
849 status
= U_ZERO_ERROR
;
851 char *buffer
= dest
.getAppendBuffer(utf8Length
, utf8Length
, capacity
, status
);
852 u_strToUTF8WithSub(buffer
, utf8Length
, NULL
,
853 src
.getBuffer(), src
.length(),
854 0xfffd, NULL
, &status
);
// Commit the converted bytes to 'dest'.
855 dest
.append(buffer
, utf8Length
, status
);
// setUTF16: open textToBreak directly over the UTF-16 dataToBreak and rebuild textMap.
// For each UTF-16 index the map holds the index itself when it starts a code point, and
// -1 is added for the other case; a final entry for the end of the text is appended.
// NOTE(review): the 'else' keyword and closing braces are not visible in this listing.
859 void TestParams::setUTF16(UErrorCode
&status
) {
// Passing the existing textToBreak lets utext_open reuse it rather than allocate anew.
860 textToBreak
= utext_openUnicodeString(textToBreak
, &dataToBreak
, &status
);
861 textMap
->removeAllElements();
862 for (int32_t i
=0; i
<dataToBreak
.length(); i
++) {
// getChar32Start(i) == i means index i is at the start of a code point.
863 if (i
== dataToBreak
.getChar32Start(i
)) {
864 textMap
->addElement(i
, status
);
866 textMap
->addElement(-1, status
);
869 textMap
->addElement(dataToBreak
.length(), status
);
// One map entry per code unit, plus one for the end position.
870 U_ASSERT(dataToBreak
.length() + 1 == textMap
->size());
// setUTF8: convert dataToBreak to UTF-8 (into utf8String), open textToBreak over those
// bytes, and rebuild textMap so that each UTF-8 byte index maps to the UTF-16 index of
// the code point starting there; -1 is stored for the remaining bytes of a code point.
// NOTE(review): the early returns, the clearing of utf8String, the loop header and its
// termination test are not visible in this listing.
874 void TestParams::setUTF8(UErrorCode
&status
) {
875 if (U_FAILURE(status
)) {
879 CharStringAppend(utf8String
, dataToBreak
, status
);
880 textToBreak
= utext_openUTF8(textToBreak
, utf8String
.data(), utf8String
.length(), &status
);
881 if (U_FAILURE(status
)) {
885 textMap
->removeAllElements();
886 int32_t utf16Index
= 0;
// Entry for the byte index where the current code point starts.
888 textMap
->addElement(utf16Index
, status
);
889 UChar32 c32
= utext_current32(textToBreak
);
// Advance the UTF-16 index by the number of UTF-16 code units this code point occupies.
893 utf16Index
+= U16_LENGTH(c32
);
894 utext_next32(textToBreak
);
// Pad with -1 until the map is as long as the UText's new native (byte) index.
895 while (textMap
->size() < utext_getNativeIndex(textToBreak
)) {
896 textMap
->addElement(-1, status
);
// One map entry per UTF-8 byte, plus one for the end position.
899 U_ASSERT(utext_nativeLength(textToBreak
) + 1 == textMap
->size());
// getSrcLine: return the entry of srcLine that corresponds to position 'bp' in the text
// being broken. 'bp' is clamped to the last textMap index, then walked backwards through
// textMap; the index 'i' found there is used to look up srcLine.
// NOTE(review): the declaration of 'i' and the test that ends the backward walk are not
// visible in this listing.
903 int32_t TestParams::getSrcLine(int32_t bp
) {
904 if (bp
>= textMap
->size()) {
905 bp
= textMap
->size() - 1;
908 for(; bp
>= 0 ; --bp
) {
909 // Move to a character boundary if we are not on one already.
910 i
= textMap
->elementAti(bp
);
915 return srcLine
->elementAti(i
);
// getExpectedBreak: look up the expectedBreaks entry for position 'bp' in the text being
// broken, going through textMap to translate the position.
// Callers in this file treat a result of 0 as "no break expected" and remap -1 to a tag
// value of 0.
// NOTE(review): the out-of-range return, the declaration of 'retVal', the handling of
// unmapped positions and the return statement are not visible in this listing.
919 int32_t TestParams::getExpectedBreak(int32_t bp
) {
920 if (bp
>= textMap
->size()) {
923 int32_t i
= textMap
->elementAti(bp
);
926 retVal
= expectedBreaks
->elementAti(i
);
// getSrcCol: return the entry of srcCol that corresponds to position 'bp' in the text
// being broken. Same shape as getSrcLine(): clamp 'bp' to the last textMap index, walk
// backwards through textMap, then index srcCol with the index found.
// NOTE(review): the declaration of 'i' and the test that ends the backward walk are not
// visible in this listing.
932 int32_t TestParams::getSrcCol(int32_t bp
) {
933 if (bp
>= textMap
->size()) {
934 bp
= textMap
->size() - 1;
937 for(; bp
>= 0; --bp
) {
938 // Move bp to a character boundary if we are not on one already.
939 i
= textMap
->elementAti(bp
);
944 return srcCol
->elementAti(i
);
// executeTest: run one parsed test case from the data-driven test.
// Using the break iterator and text held in 't', it checks, in order:
//   1. forward iteration (first()/next()): no missed breaks, no unexpected breaks,
//      correct rule-status tag at each break;
//   2. reverse iteration (last()/previous()): the same three checks;
//   3. isBoundary() at every index;
//   4. following() from every index;
//   5. preceding() from every index.
// Failures are reported with the source line/column of the test data.
// NOTE(review): the declarations of 'bp', 'prevBP' and 'i', the early returns, the
// no-progress conditions, the prevBP updates and many closing braces / else keywords are
// not visible in this listing.
948 void RBBITest::executeTest(TestParams
*t
, UErrorCode
&status
) {
953 TEST_ASSERT_SUCCESS(status
);
954 if (U_FAILURE(status
)) {
962 t
->bi
->setText(t
->textToBreak
, status
);
964 // Run the iterator forward
967 for (bp
= t
->bi
->first(); bp
!= BreakIterator::DONE
; bp
= t
->bi
->next()) {
969 // Fail for lack of forward progress.
970 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
971 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
));
975 // Check that we didn't miss an expected break between the last one
977 for (i
=prevBP
+1; i
<bp
; i
++) {
978 if (t
->getExpectedBreak(i
) != 0) {
979 int expected
[] = {0, i
};
980 printStringBreaks(t
->dataToBreak
, expected
, 2);
981 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
982 i
, t
->getSrcLine(i
), t
->getSrcCol(i
));
986 // Check that the break we did find was expected
987 if (t
->getExpectedBreak(bp
) == 0) {
988 int expected
[] = {0, bp
};
989 printStringBreaks(t
->textToBreak
, expected
, 2);
990 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
991 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
));
993 // The break was expected.
994 // Check that the {nnn} tag value is correct.
995 int32_t expectedTagVal
= t
->getExpectedBreak(bp
);
// An expected value of -1 gets special handling (the assignment is not visible here).
996 if (expectedTagVal
== -1) {
999 int32_t line
= t
->getSrcLine(bp
);
1000 int32_t rs
= t
->bi
->getRuleStatus();
1001 if (rs
!= expectedTagVal
) {
1002 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
1003 " Actual, Expected status = %4d, %4d",
1004 bp
, line
, t
->getSrcCol(bp
), rs
, expectedTagVal
);
1011 // Verify that there were no missed expected breaks after the last one found
1012 for (i
=prevBP
+1; i
<utext_nativeLength(t
->textToBreak
); i
++) {
1013 if (t
->getExpectedBreak(i
) != 0) {
1014 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1015 i
, t
->getSrcLine(i
), t
->getSrcCol(i
));
1020 // Run the iterator backwards, verify that the same breaks are found.
1022 prevBP
= utext_nativeLength(t
->textToBreak
)+2; // start with a phony value for the last break pos seen.
1023 for (bp
= t
->bi
->last(); bp
!= BreakIterator::DONE
; bp
= t
->bi
->previous()) {
1025 // Fail for lack of progress.
1026 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
1027 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
));
1031 // Check that we didn't miss an expected break between the last one
1032 // and this one. (UVector returns zeros for index out of bounds.)
1033 for (i
=prevBP
-1; i
>bp
; i
--) {
1034 if (t
->getExpectedBreak(i
) != 0) {
1035 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1036 i
, t
->getSrcLine(i
), t
->getSrcCol(i
));
1040 // Check that the break we did find was expected
1041 if (t
->getExpectedBreak(bp
) == 0) {
1042 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1043 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
));
1045 // The break was expected.
1046 // Check that the {nnn} tag value is correct.
1047 int32_t expectedTagVal
= t
->getExpectedBreak(bp
);
1048 if (expectedTagVal
== -1) {
1051 int line
= t
->getSrcLine(bp
);
1052 int32_t rs
= t
->bi
->getRuleStatus();
1053 if (rs
!= expectedTagVal
) {
1054 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
1055 " Actual, Expected status = %4d, %4d",
1056 bp
, line
, t
->getSrcCol(bp
), rs
, expectedTagVal
);
1063 // Verify that there were no missed breaks prior to the last one found
// NOTE(review): this check follows the reverse pass, yet the message below says
// "Forward" (and misspells "Iteration") — confirm whether "Reverse Iteration" was intended.
1064 for (i
=prevBP
-1; i
>=0; i
--) {
1065 if (t
->getExpectedBreak(i
) != 0) {
1066 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1067 i
, t
->getSrcLine(i
), t
->getSrcCol(i
));
1071 // Check isBoundary()
1072 for (i
=0; i
< utext_nativeLength(t
->textToBreak
); i
++) {
// Any nonzero expected-break entry means a boundary is expected at i.
1073 UBool boundaryExpected
= (t
->getExpectedBreak(i
) != 0);
1074 UBool boundaryFound
= t
->bi
->isBoundary(i
);
1075 if (boundaryExpected
!= boundaryFound
) {
1076 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
1077 " Expected, Actual= %s, %s",
1078 i
, t
->getSrcLine(i
), t
->getSrcCol(i
),
1079 boundaryExpected
? "true":"false", boundaryFound
? "true" : "false");
1083 // Check following()
1084 for (i
=0; i
< utext_nativeLength(t
->textToBreak
); i
++) {
1085 int32_t actualBreak
= t
->bi
->following(i
);
// DONE stays as the expectation unless a later expected break is found by the scan below.
1086 int32_t expectedBreak
= BreakIterator::DONE
;
1087 for (int32_t j
=i
+1; j
<= utext_nativeLength(t
->textToBreak
); j
++) {
1088 if (t
->getExpectedBreak(j
) != 0) {
1093 if (expectedBreak
!= actualBreak
) {
1094 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
1095 " Expected, Actual= %d, %d",
1096 i
, t
->getSrcLine(i
), t
->getSrcCol(i
), expectedBreak
, actualBreak
);
1100 // Check preceding()
1101 for (i
=utext_nativeLength(t
->textToBreak
); i
>=0; i
--) {
1102 int32_t actualBreak
= t
->bi
->preceding(i
);
1103 int32_t expectedBreak
= BreakIterator::DONE
;
1105 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
1106 // preceding(trailing byte) will return the index of some preceding code point,
1107 // not the lead byte of the current code point, even though that has a smaller index.
1108 // Therefore, start looking at the expected break data not at i-1, but at
1109 // the start of code point index - 1.
// Setting and then re-reading the native index is how the code-point start is obtained.
1110 utext_setNativeIndex(t
->textToBreak
, i
);
1111 int32_t j
= utext_getNativeIndex(t
->textToBreak
) - 1;
1112 for (; j
>= 0; j
--) {
1113 if (t
->getExpectedBreak(j
) != 0) {
1118 if (expectedBreak
!= actualBreak
) {
1119 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
1120 " Expected, Actual= %d, %d",
1121 i
, t
->getSrcLine(i
), t
->getSrcCol(i
), expectedBreak
, actualBreak
);
// TestExtended
//   Data-driven break iterator test. Reads the test data file "rbbitst.txt"
//   from the source test data directory, converts it from UTF-8, and walks it
//   one UChar at a time with a small state machine whose states are
//   PARSE_COMMENT, PARSE_TAG, PARSE_DATA and PARSE_NUM.
//     <word> <char> <line> <sent> <title>  create the matching kind of
//                                          BreakIterator into tp.bi
//     <locale name>                        sets the locale used for creation
//     <data> ... </data>                   brackets the text to be broken
//   Inside a data section a bullet (U+2022) or the two characters "<>" record
//   an expected break with the value -1, and a numeric tag such as <1234>
//   records an expected break with that number as its value.
//   When the closing data tag is reached the accumulated test is run through
//   executeTest(), first as UTF-16 and then a second time for UTF-8.
//   The body is compiled only when both regular expressions and filtered
//   break iteration are configured in.
1127 void RBBITest::TestExtended() {
1128 // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
1129 // data driven test closely entangles filtered and regular data.
1130 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
1131 UErrorCode status
= U_ZERO_ERROR
;
1134 UnicodeString rules
;
1135 TestParams
tp(status
);
// Matcher for the <locale name> tag. Capture group 1 is the locale name.
1137 RegexMatcher
localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status
);
1138 if (U_FAILURE(status
)) {
1139 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__
, __LINE__
, u_errorName(status
));
1144 // Open and read the test data file.
1146 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
1147 char testFileName
[1000];
// NOTE(review): this check only guarantees that the directory name alone fits
// in testFileName. It leaves no room for the "rbbitst.txt" that strcat appends
// below, so a directory of 989..999 chars would overflow the buffer. Confirm
// whether the bound should also account for the file name.
1148 if (testDataDirectory
== NULL
|| strlen(testDataDirectory
) >= sizeof(testFileName
)) {
1149 errln("Can't open test data. Path too long.");
1152 strcpy(testFileName
, testDataDirectory
);
1153 strcat(testFileName
, "rbbitst.txt");
// Read the whole file, converting from UTF-8. len receives the length of the
// converted data in UChars.
1156 UChar
*testFile
= ReadAndConvertFile(testFileName
, len
, "UTF-8", status
);
1157 if (U_FAILURE(status
)) {
1158 return; /* something went wrong, error already output */
1162 bool skipTest
= false; // Skip this test?
1165 // Put the test data into a UnicodeString
// NOTE(review): constructed directly over the testFile buffer with FALSE as the
// first argument; presumably the read-only aliasing constructor, in which case
// testFile must stay alive for as long as testString is used. Confirm.
1167 UnicodeString
testString(FALSE
, testFile
, len
);
// Parser state. savedState holds the state that is restored when a line end
// is seen in the branch below that assigns parseState = savedState.
1175 parseState
= PARSE_TAG
;
1177 EParseState savedState
= PARSE_TAG
;
// Position bookkeeping, used for error messages and for the srcLine and srcCol
// mappings stored in tp.
1179 int32_t lineNum
= 1;
1180 int32_t colStart
= 0;
1182 int32_t charIdx
= 0;
1184 int32_t tagValue
= 0; // The numeric value of a <nnn> tag.
// Main parse loop. The for header has no increment expression; charIdx is
// moved forward inside the body.
1186 for (charIdx
= 0; charIdx
< len
; ) {
1187 status
= U_ZERO_ERROR
;
1188 UChar c
= testString
.charAt(charIdx
);
1190 if (c
== u
'\r' && charIdx
<len
&& testString
.charAt(charIdx
) == u
'\n') {
1191 // treat CRLF as a unit
1195 if (c
== u
'\n' || c
== u
'\r') {
// Column of the current character relative to colStart; stored in tp.srcCol.
1199 column
= charIdx
- colStart
+ 1;
1201 switch (parseState
) {
// A line end restores the saved state. NOTE(review): the case labels are not
// visible here; presumably this is the PARSE_COMMENT case.
1203 if (c
== u
'\n' || c
== u
'\r') {
1204 parseState
= savedState
;
1211 parseState
= PARSE_COMMENT
;
1212 savedState
= PARSE_TAG
;
1215 if (u_isUWhiteSpace(c
)) {
// Iterator-type tags. Each comparison starts at charIdx-1. NOTE(review):
// presumably because charIdx has already been advanced past c; confirm.
1218 if (testString
.compare(charIdx
-1, 6, "<word>") == 0) {
1220 tp
.bi
= BreakIterator::createWordInstance(locale
, status
);
1225 if (testString
.compare(charIdx
-1, 6, "<char>") == 0) {
1227 tp
.bi
= BreakIterator::createCharacterInstance(locale
, status
);
1232 if (testString
.compare(charIdx
-1, 6, "<line>") == 0) {
1234 tp
.bi
= BreakIterator::createLineInstance(locale
, status
);
1239 if (testString
.compare(charIdx
-1, 6, "<sent>") == 0) {
1241 tp
.bi
= BreakIterator::createSentenceInstance(locale
, status
);
1246 if (testString
.compare(charIdx
-1, 7, "<title>") == 0) {
1248 tp
.bi
= BreakIterator::createTitleInstance(locale
, status
);
1253 // <locale loc_name>
1254 localeMatcher
.reset(testString
);
1255 if (localeMatcher
.lookingAt(charIdx
-1, status
)) {
1256 UnicodeString localeName
= localeMatcher
.group(1, status
);
// Convert the captured name to a char buffer for Locale::createFromName.
1257 char localeName8
[100];
1258 localeName
.extract(0, localeName
.length(), localeName8
, sizeof(localeName8
), 0);
1259 locale
= Locale::createFromName(localeName8
);
// Skip over the rest of the matched tag text.
1260 charIdx
+= localeMatcher
.group(0, status
).length() - 1;
1261 TEST_ASSERT_SUCCESS(status
);
// Opening data tag: switch to data parsing and clear the accumulated text and
// the expected-break, source-column and source-line vectors.
1264 if (testString
.compare(charIdx
-1, 6, "<data>") == 0) {
1265 parseState
= PARSE_DATA
;
1267 tp
.dataToBreak
= "";
1268 tp
.expectedBreaks
->removeAllElements();
1269 tp
.srcCol
->removeAllElements();
1270 tp
.srcLine
->removeAllElements();
// Nothing recognizable where a tag was required: report and abandon the test.
1274 errln("line %d: Tag expected in test file.", lineNum
);
1275 parseState
= PARSE_COMMENT
;
1276 savedState
= PARSE_DATA
;
1277 goto end_test
; // Stop the test.
// Bullet: record an expected break (value -1) at the current end of the data,
// together with the source line and column it came from.
1282 if (c
== u
'\u2022') { // u'•'
1283 int32_t breakIdx
= tp
.dataToBreak
.length();
1284 tp
.expectedBreaks
->setSize(breakIdx
+1);
1285 tp
.expectedBreaks
->setElementAt(-1, breakIdx
);
1286 tp
.srcLine
->setSize(breakIdx
+1);
1287 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1288 tp
.srcCol
->setSize(breakIdx
+1);
1289 tp
.srcCol
->setElementAt(column
, breakIdx
);
// Closing data tag: finish the position mappings and run the test.
1293 if (testString
.compare(charIdx
-1, 7, "</data>") == 0) {
1294 // Add final entry to mappings from break location to source file position.
1295 // Need one extra because last break position returned is after the
1296 // last char in the data, not at the last char.
1297 tp
.srcLine
->addElement(lineNum
, status
);
1298 tp
.srcCol
->addElement(column
, status
);
1300 parseState
= PARSE_TAG
;
// First run: the data as UTF-16.
1305 status
= U_ZERO_ERROR
;
1306 tp
.setUTF16(status
);
1307 executeTest(&tp
, status
);
1308 TEST_ASSERT_SUCCESS(status
);
1310 // Run again, this time with UTF-8 text wrapped in a UText.
1311 status
= U_ZERO_ERROR
;
1313 TEST_ASSERT_SUCCESS(status
);
1314 executeTest(&tp
, status
);
1319 if (testString
.compare(charIdx
-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1320 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1321 // Get the code point from the name and insert it into the test data.
1322 // (Damn, no API takes names in Unicode !!!
1323 // we've got to take it back to char *)
1324 int32_t nameEndIdx
= testString
.indexOf(u
'}', charIdx
);
1325 int32_t nameLength
= nameEndIdx
- (charIdx
+2);
1326 char charNameBuf
[200];
1327 UChar32 theChar
= -1;
1328 if (nameEndIdx
!= -1) {
// Local status for the name lookup; it shadows the function-level status.
1329 UErrorCode status
= U_ZERO_ERROR
;
1330 testString
.extract(charIdx
+2, nameLength
, charNameBuf
, sizeof(charNameBuf
));
// Force termination in case the name filled the whole buffer.
1331 charNameBuf
[sizeof(charNameBuf
)-1] = 0;
1332 theChar
= u_charFromName(U_UNICODE_CHAR_NAME
, charNameBuf
, &status
);
1333 if (U_FAILURE(status
)) {
1337 if (theChar
== -1) {
1338 errln("Error in named character in test file at line %d, col %d",
1341 // Named code point was recognized. Insert it
1342 // into the test data.
1343 tp
.dataToBreak
.append(theChar
);
// Add one line and column entry per appended code unit, so the mapping
// vectors stay as long as the data.
1344 while (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1345 tp
.srcLine
->addElement(lineNum
, status
);
1346 tp
.srcCol
->addElement(column
, status
);
1349 if (nameEndIdx
> charIdx
) {
1350 charIdx
= nameEndIdx
+1;
// The two characters "<>" record an expected break exactly as the bullet does.
1359 if (testString
.compare(charIdx
-1, 2, "<>") == 0) {
1361 int32_t breakIdx
= tp
.dataToBreak
.length();
1362 tp
.expectedBreaks
->setSize(breakIdx
+1);
1363 tp
.expectedBreaks
->setElementAt(-1, breakIdx
);
1364 tp
.srcLine
->setSize(breakIdx
+1);
1365 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1366 tp
.srcCol
->setSize(breakIdx
+1);
1367 tp
.srcCol
->setElementAt(column
, breakIdx
);
// NOTE(review): the condition guarding this transition is not visible here;
// presumably it is the start of a numeric tag such as <1234>.
1373 parseState
= PARSE_NUM
;
1377 if (c
== u
'#' && column
==3) { // TODO: why is column off so far?
1378 parseState
= PARSE_COMMENT
;
1379 savedState
= PARSE_DATA
;
1384 // Check for \ at end of line, a line continuation.
1385 // Advance over (discard) the newline
1386 UChar32 cp
= testString
.char32At(charIdx
);
1387 if (cp
== u
'\r' && charIdx
<len
&& testString
.charAt(charIdx
+1) == u
'\n') {
1389 // Need an extra increment of the input ptr to move over both of them
1392 if (cp
== u
'\n' || cp
== u
'\r') {
1399 // Let unescape handle the back slash.
1400 cp
= testString
.unescapeAt(charIdx
);
1402 // Escape sequence was recognized. Insert the char
1403 // into the test data.
1404 tp
.dataToBreak
.append(cp
);
1405 while (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1406 tp
.srcLine
->addElement(lineNum
, status
);
1407 tp
.srcCol
->addElement(column
, status
);
1413 // Not a recognized backslash escape sequence.
1414 // Take the next char as a literal.
1415 // TODO: Should this be an error?
1416 c
= testString
.charAt(charIdx
);
1417 charIdx
= testString
.moveIndex32(charIdx
, 1);
1420 // Normal, non-escaped data char.
1421 tp
.dataToBreak
.append(c
);
1423 // Save the mapping from offset in the data to line/column numbers in
1424 // the original input file. Will be used for better error messages only.
1425 // If there's an expected break before this char, the slot in the mapping
1426 // vector will already be set for this char; don't overwrite it.
1427 if (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1428 tp
.srcLine
->addElement(lineNum
, status
);
1429 tp
.srcCol
->addElement(column
, status
);
1435 // We are parsing an expected numeric tag value, like <1234>,
1436 // within a chunk of data.
1437 if (u_isUWhiteSpace(c
)) {
1442 // Finished the number. Add the info to the expected break data,
1443 // and switch parse state back to doing plain data.
1444 parseState
= PARSE_DATA
;
1445 if (tagValue
== 0) {
1448 int32_t breakIdx
= tp
.dataToBreak
.length();
1449 tp
.expectedBreaks
->setSize(breakIdx
+1);
1450 tp
.expectedBreaks
->setElementAt(tagValue
, breakIdx
);
1451 tp
.srcLine
->setSize(breakIdx
+1);
1452 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1453 tp
.srcCol
->setSize(breakIdx
+1);
1454 tp
.srcCol
->setElementAt(column
, breakIdx
);
// Accumulate one more decimal digit of the tag value.
1459 tagValue
= tagValue
*10 + u_charDigitValue(c
);
1463 errln("Syntax Error in test file at line %d, col %d",
1465 parseState
= PARSE_COMMENT
;
1466 goto end_test
; // Stop the test
// Any ICU failure left in status by the step above ends the whole test.
1471 if (U_FAILURE(status
)) {
1472 dataerrln("ICU Error %s while parsing test file at line %d.",
1473 u_errorName(status
), lineNum
);
1474 status
= U_ZERO_ERROR
;
1475 goto end_test
; // Stop the test
1486 //-------------------------------------------------------------------------------
1488 // TestDictRules create a break iterator from source rules that includes a
1489 // dictionary range. Regression for bug #7130. Source rules
1490 // do not declare a break iterator type (word, line, sentence, etc.),
1491 // but the dictionary code, without a type, would loop.
1493 //-------------------------------------------------------------------------------
1494 void RBBITest::TestDictRules() {
// Rule source: $dictionary is defined as [a-z] and is used twice in a row in
// the rules that follow.
1495 const char *rules
= "$dictionary = [a-z]; \n"
1497 "$dictionary $dictionary; \n"
1499 "$dictionary $dictionary; \n";
// Two characters, both inside the $dictionary set.
1500 const char *text
= "aa";
1501 UErrorCode status
= U_ZERO_ERROR
;
1502 UParseError parseError
;
// Build the iterator directly from the rule source.
1504 RuleBasedBreakIterator
bi(rules
, parseError
, status
);
1505 if (U_SUCCESS(status
)) {
1506 UnicodeString utext
= text
;
// At most 10 calls to next(), so an iterator that never reports DONE cannot
// hang the test.
1510 for (loops
= 0; loops
<10; loops
++) {
1511 position
= bi
.next();
1512 if (position
== RuleBasedBreakIterator::DONE
) {
// NOTE(review): presumably the loop is left as soon as DONE is seen, so
// loops == 1 means DONE came back from the second call to next(). Confirm
// against the statements that are not visible here.
1516 TEST_ASSERT(loops
== 1);
// Rule compilation failed: reported as a data error.
1518 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status
));
1524 //-------------------------------------------------------------------------------
1526 // ReadAndConvertFile Read a text data file, convert it to UChars, and
1527 // return the data in one big UChar * buffer, which the caller must delete.
1530 // fileName: the name of the file, with no directory part. The test data directory
1532 // ulen an out parameter, receives the actual length (in UChars) of the file data.
1533 // encoding The file encoding. If the file contains a BOM, that will override the encoding
1534 // specified here. The BOM, if it exists, will be stripped from the returned data.
1535 // Pass NULL for the system default encoding.
1538 // The file data, converted to UChar.
1539 // The caller must delete this when done with
1540 // delete [] theBuffer;
1542 // TODO: This is a clone of RegexTest::ReadAndConvertFile.
1543 // Move this function to some common place.
1545 //--------------------------------------------------------------------------------
// NOTE(review): fileName is handed to fopen() unchanged below, and the callers
// in this file (TestExtended, runUnicodeTestData) pass a path that already has
// the test data directory prepended. The description of fileName in the banner
// above looks out of date; confirm and reword.
// status is in/out: it is tested on entry, set to U_FILE_ACCESS_ERROR when the
// file cannot be opened, and receives converter errors.
1546 UChar
*RBBITest::ReadAndConvertFile(const char *fileName
, int &ulen
, const char *encoding
, UErrorCode
&status
) {
// Everything that the cleanup path may need starts out as NULL.
1547 UChar
*retPtr
= NULL
;
1548 char *fileBuf
= NULL
;
1549 UConverter
* conv
= NULL
;
1553 if (U_FAILURE(status
)) {
// Open in binary mode so the bytes reach the converter untouched.
1560 f
= fopen(fileName
, "rb");
1562 dataerrln("Error opening test data file %s\n", fileName
);
1563 status
= U_FILE_ACCESS_ERROR
;
// Find the file size by seeking to the end, then read the whole file into
// one buffer.
// NOTE(review): the buffer is allocated from the ftell() result before the
// fileSize <= 0 test further down, so a failed ftell() would reach new[] with
// a negative size. Confirm whether the check should come first.
1572 fseek( f
, 0, SEEK_END
);
1573 fileSize
= ftell(f
);
1574 fileBuf
= new char[fileSize
];
1575 fseek(f
, 0, SEEK_SET
);
1576 amt_read
= fread(fileBuf
, 1, fileSize
, f
);
// A short read and an empty file are both treated as errors.
1577 if (amt_read
!= fileSize
|| fileSize
<= 0) {
1578 errln("Error reading test data file.");
1579 goto cleanUpAndReturn
;
1583 // Look for a Unicode Signature (BOM) on the data just read
1585 int32_t signatureLength
;
1586 const char * fileBufC
;
1587 const char* bomEncoding
;
1590 bomEncoding
= ucnv_detectUnicodeSignature(
1591 fileBuf
, fileSize
, &signatureLength
, &status
);
// A signature was found: step over it and let it override the encoding that
// the caller asked for.
1592 if(bomEncoding
!=NULL
){
1593 fileBufC
+= signatureLength
;
1594 fileSize
-= signatureLength
;
1595 encoding
= bomEncoding
;
1599 // Open a converter to take the rule file to UTF-16
1601 conv
= ucnv_open(encoding
, &status
);
1602 if (U_FAILURE(status
)) {
1603 goto cleanUpAndReturn
;
1607 // Convert the rules to UChar.
1608 // Preflight first to determine required buffer size.
1610 ulen
= ucnv_toUChars(conv
,
1616 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
1617 // Buffer Overflow is expected from the preflight operation.
1618 status
= U_ZERO_ERROR
;
// One element more than the preflighted length. NOTE(review): presumably room
// for a terminating NUL; confirm against the conversion call.
1620 retPtr
= new UChar
[ulen
+1];
// Conversion failures are reported here.
1633 if (U_FAILURE(status
)) {
1634 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
1644 //--------------------------------------------------------------------------------------------
1646 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
//
//   Four passes, one per iterator kind. Each pass creates the iterator for the
//   English locale and, if creation succeeded, runs the matching data file
//   through runUnicodeTestData():
//       character -> GraphemeBreakTest.txt
//       word      -> WordBreakTest.txt
//       sentence  -> SentenceBreakTest.txt
//       line      -> LineBreakTest.txt
//   NOTE(review): the factory results are converted to RuleBasedBreakIterator
//   with a C-style cast, which assumes that the factories return that concrete
//   type for the English locale. Confirm.
1648 //-------------------------------------------------------------------------------------------
1649 void RBBITest::TestUnicodeFiles() {
1650 RuleBasedBreakIterator
*bi
;
// status is shared by all four passes and is not reset between them here.
1651 UErrorCode status
= U_ZERO_ERROR
;
// Grapheme clusters.
1653 bi
= (RuleBasedBreakIterator
*)BreakIterator::createCharacterInstance(Locale::getEnglish(), status
);
1654 TEST_ASSERT_SUCCESS(status
);
1655 if (U_SUCCESS(status
)) {
1656 runUnicodeTestData("GraphemeBreakTest.txt", bi
);
// Words.
1660 bi
= (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getEnglish(), status
);
1661 TEST_ASSERT_SUCCESS(status
);
1662 if (U_SUCCESS(status
)) {
1663 runUnicodeTestData("WordBreakTest.txt", bi
);
// Sentences.
1667 bi
= (RuleBasedBreakIterator
*)BreakIterator::createSentenceInstance(Locale::getEnglish(), status
);
1668 TEST_ASSERT_SUCCESS(status
);
1669 if (U_SUCCESS(status
)) {
1670 runUnicodeTestData("SentenceBreakTest.txt", bi
);
// Line breaks.
1674 bi
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale::getEnglish(), status
);
1675 TEST_ASSERT_SUCCESS(status
);
1676 if (U_SUCCESS(status
)) {
1677 runUnicodeTestData("LineBreakTest.txt", bi
);
1683 // Check for test cases from the Unicode test data files that are known to fail
1684 // and should be skipped because ICU is not yet able to fully implement the spec.
1685 // See ticket #7270.
//
//   testCase  the text of one test case, as built from a line of the data file
//   fileName  name of the data file the case came from; compared with strcmp()
//             against the file names in the table below
//   Returns the result of logKnownIssue("7270") when both the file name and
//   the text match a table entry. NOTE(review): the return for the no-match
//   path is not visible here; presumably false.
1687 UBool
RBBITest::testCaseIsKnownIssue(const UnicodeString
&testCase
, const char *fileName
) {
// Table of (data file name, test case text) pairs to skip.
1688 static struct TestCase
{
1689 const char *fFileName
;
1690 const UChar
*fString
;
1691 } badTestCases
[] = { // Line Numbers from Unicode 7.0.0 file.
1692 {"LineBreakTest.txt", u
"\u200B\u0020}"}, // Line 5198
1693 {"LineBreakTest.txt", u
"\u200B\u0020)"}, // Line 5202
1694 {"LineBreakTest.txt", u
"\u200B\u0020!"}, // Line 5214
1695 {"LineBreakTest.txt", u
"\u200B\u0020,"}, // Line 5246
1696 {"LineBreakTest.txt", u
"\u200B\u0020/"}, // Line 5298
1697 {"LineBreakTest.txt", u
"\u200B\u0020\u2060"}, // Line 5302
1698 // Line Numbers from pre-release version of GraphemeBreakTest-10.0.0.txt
1699 {"GraphemeBreakTest.txt", u
"\u200D\u2640"}, // Line 656, old GB 11 test ZWJ x GAZ
1700 {"GraphemeBreakTest.txt", u
"\u200D\U0001F466"}, // Line 658, old GB 11 test ZWJ x EBG
1701 {"GraphemeBreakTest.txt", u
"\u200D\U0001F466\U0001F3FB"}, // Line 842, old GB 11 test ZWJ x EBG x EModifier
1703 // Line Numbers from pre-release version of WordBreakTest-10.0.0.txt
1704 {"WordBreakTest.txt", u
"\u200D\u261D"}, // Line 1356, ZWJ x EmojiNRK
1705 {"WordBreakTest.txt", u
"\u200D\U0001F3FB"}, // Line 1358, ZWJ x EmojiNRK
// Linear scan of the table. An entry matches only when both the file name and
// the whole test string are equal.
1708 for (int n
=0; n
<UPRV_LENGTHOF(badTestCases
); n
++) {
1709 const TestCase
&badCase
= badTestCases
[n
];
1710 if (!strcmp(fileName
, badCase
.fFileName
) &&
1711 testCase
== UnicodeString(badCase
.fString
)) {
1712 return logKnownIssue("7270");
1719 //--------------------------------------------------------------------------------------------
1721 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
//
//   fileName  name of the data file; appended to the source test data directory
//   bi        the break iterator to check against the data
//
//   The file is read as UTF-8 and tokenized with one regular expression. Each
//   line yields the text of a test case plus the offsets where breaks are
//   expected; at the end of the line the case is handed to
//   checkUnicodeTestCase() unless testCaseIsKnownIssue() says to skip it.
//   The body is compiled only when regular expressions are configured in.
1723 //-------------------------------------------------------------------------------------------
1724 void RBBITest::runUnicodeTestData(const char *fileName
, RuleBasedBreakIterator
*bi
) {
1725 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1726 UErrorCode status
= U_ZERO_ERROR
;
1729 // Open and read the test data file, put it into a UnicodeString.
1731 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
1732 char testFileName
[1000];
// NOTE(review): only the directory length is compared with the buffer size;
// the length of fileName, appended by strcat below, is not included. Confirm
// whether the bound should cover both parts.
1733 if (testDataDirectory
== NULL
|| strlen(testDataDirectory
) >= sizeof(testFileName
)) {
1734 dataerrln("Can't open test data. Path too long.");
1737 strcpy(testFileName
, testDataDirectory
);
1738 strcat(testFileName
, fileName
);
1740 logln("Opening data file %s\n", fileName
);
1743 UChar
*testFile
= ReadAndConvertFile(testFileName
, len
, "UTF-8", status
);
// A missing file has already been reported by ReadAndConvertFile as a data
// error, so the assertions are applied only for other outcomes.
1744 if (status
!= U_FILE_ACCESS_ERROR
) {
1745 TEST_ASSERT_SUCCESS(status
);
1746 TEST_ASSERT(testFile
!= NULL
);
1748 if (U_FAILURE(status
) || testFile
== NULL
) {
1749 return; /* something went wrong, error already output */
// NOTE(review): constructed directly over the testFile buffer with TRUE as the
// first argument; presumably the read-only aliasing constructor for terminated
// text, so testFile must outlive testFileAsString. Confirm.
1751 UnicodeString
testFileAsString(TRUE
, testFile
, len
);
1754 // Parse the test data file using a regular expression.
1755 // Each kind of token is recognized in its own capture group; what type of item was scanned
1756 // is identified by which group had a match.
1758 // Capture Group # 1 2 3 4 5
1759 // Parses this item: divide x hex digits comment \n unrecognized \n
1761 UnicodeString
tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV
);
1762 RegexMatcher
tokenMatcher(tokenExpr
, testFileAsString
, UREGEX_MULTILINE
| UREGEX_DOTALL
, status
);
// Per test case accumulators: the text, and the offsets of expected breaks.
1763 UnicodeString testString
;
1764 UVector32
breakPositions(status
);
1766 TEST_ASSERT_SUCCESS(status
);
1767 if (U_FAILURE(status
)) {
1772 // Scan through each test case, building up the string to be broken in testString,
1773 // and the positions that should be boundaries in the breakPositions vector.
1776 while (tokenMatcher
.find()) {
1777 if(tokenMatcher
.hitEnd()) {
1778 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for.
1779 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1780 and caused an infinite loop here on EBCDIC systems!
1782 fprintf(stderr
,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName
, ++spin
);
1785 if (tokenMatcher
.start(1, status
) >= 0) {
1786 // Scanned a divide sign, indicating a break position in the test data.
// A divide sign seen before any text has been collected is not recorded.
1787 if (testString
.length()>0) {
1788 breakPositions
.addElement(testString
.length(), status
);
1791 else if (tokenMatcher
.start(2, status
) >= 0) {
1792 // Scanned an 'x', meaning no break at this position in the test data
1793 // Nothing to be done here.
1795 else if (tokenMatcher
.start(3, status
) >= 0) {
1796 // Scanned Hex digits. Convert them to binary, append to the character data string.
1797 const UnicodeString
&hexNumber
= tokenMatcher
.group(3, status
);
1798 int length
= hexNumber
.length();
1801 hexNumber
.extract (0, length
, buf
, sizeof(buf
), US_INV
);
1802 UChar32 c
= (UChar32
)strtol(buf
, NULL
, 16);
1804 testString
.append(c
);
1806 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1807 fileName
, lineNumber
);
1810 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1811 fileName
, lineNumber
);
1814 else if (tokenMatcher
.start(4, status
) >= 0) {
1815 // Scanned to end of a line, possibly skipping over a comment in the process.
1816 // If the line from the file contained test data, run the test now.
1817 if (testString
.length() > 0 && !testCaseIsKnownIssue(testString
, fileName
)) {
1818 checkUnicodeTestCase(fileName
, lineNumber
, testString
, &breakPositions
, bi
);
1821 // Clear out this test case.
1822 // The string and breakPositions vector will be refilled as the next
1823 // test case is parsed.
1824 testString
.remove();
1825 breakPositions
.removeAllElements();
1828 // Scanner catchall. Something unrecognized appeared on the line.
1830 UnicodeString uToken
= tokenMatcher
.group(0, status
);
1831 uToken
.extract(0, uToken
.length(), token
, (uint32_t)sizeof(token
));
// Force termination in case the token filled the whole buffer.
1832 token
[sizeof(token
)-1] = 0;
1833 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName
, lineNumber
, token
);
1835 // Clean up, in preparation for continuing with the next line.
1836 testString
.remove();
1837 breakPositions
.removeAllElements();
// Checked once per token.
1840 TEST_ASSERT_SUCCESS(status
);
1841 if (U_FAILURE(status
)) {
1847 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1850 //--------------------------------------------------------------------------------------------
1852 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1853 // test data files. Do only a simple, forward-only check -
1854 // this test is mostly to check that ICU and the Unicode
1855 // data agree with each other.
//
//   testFileName, lineNumber   used only in the failure messages
//   testString                 the text to be broken
//   breakPositions             offsets into testString where a break is expected
//                              (presumably in ascending order, as produced by
//                              runUnicodeTestData)
//   bi                         the iterator under test; its text is replaced
//                              with testString
1857 //--------------------------------------------------------------------------------------------
1858 void RBBITest::checkUnicodeTestCase(const char *testFileName
, int lineNumber
,
1859 const UnicodeString
&testString
, // Text data to be broken
1860 UVector32
*breakPositions
, // Positions where breaks should be found.
1861 RuleBasedBreakIterator
*bi
) {
1862 int32_t pos
; // Break Position in the test string
1863 int32_t expectedI
= 0; // Index of expected break position in the vector of expected results.
1864 int32_t expectedPos
; // Expected break position (index into test string)
1866 bi
->setText(testString
);
// Compare each break reported by bi with the next expected position.
1870 while (pos
!= BreakIterator::DONE
) {
// More breaks reported than the data lists.
1871 if (expectedI
>= breakPositions
->size()) {
1872 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1873 testFileName
, lineNumber
, pos
);
1876 expectedPos
= breakPositions
->elementAti(expectedI
);
// A break was reported ahead of the next expected one.
1877 if (pos
< expectedPos
) {
1878 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1879 testFileName
, lineNumber
, pos
);
// The iterator went past an expected position without reporting it.
1882 if (pos
> expectedPos
) {
1883 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1884 testFileName
, lineNumber
, expectedPos
);
// Iteration finished while expected positions were still outstanding.
1891 if (pos
==BreakIterator::DONE
&& expectedI
<breakPositions
->size()) {
1892 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1893 testFileName
, lineNumber
, breakPositions
->elementAti(expectedI
));
1899 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1900 //---------------------------------------------------------------------------------------
1902 // class RBBIMonkeyKind
1904 // Monkey Test for Break Iteration
1905 // Abstract interface class. Concrete derived classes independently
1906 // implement the break rules for different iterator types.
1908 // The Monkey Test itself doesn't know which type of break iterator it is
1909 // testing, but works purely in terms of the interface defined here.
1911 //---------------------------------------------------------------------------------------
1912 class RBBIMonkeyKind
{
1914 // Return a UVector of UnicodeSets, representing the character classes used
1915 // for this type of iterator.
1916 virtual UVector
*charClasses() = 0;
1918 // Set the test text on which subsequent calls to next() will operate
1919 virtual void setText(const UnicodeString
&s
) = 0;
1921 // Find the next break position, starting from the prev break position, or from zero.
1922 // Return -1 after reaching end of string.
1923 virtual int32_t next(int32_t i
) = 0;
1925 virtual ~RBBIMonkeyKind();
// Error recorded during construction of a derived class. RBBICharMonkey copies
// a failing status into it at the end of its constructor and tests it at the
// start of next().
1926 UErrorCode deferredStatus
;
// The constructor starts with no deferred error.
1935 RBBIMonkeyKind::RBBIMonkeyKind() {
1936 deferredStatus
= U_ZERO_ERROR
;
// Out-of-line definition of the virtual destructor.
1939 RBBIMonkeyKind::~RBBIMonkeyKind() {
1943 //----------------------------------------------------------------------------------------
1945 // Random Numbers. Similar to standard lib rand() and srand()
1946 // Not using library to
1947 // 1. Get same results on all platforms.
1948 // 2. Get access to current seed, to more easily reproduce failures.
1950 //---------------------------------------------------------------------------------------
// Current generator state. A file-level variable, so the seed can be read and
// set directly.
1951 static uint32_t m_seed
= 1;
// Advance the generator one step and return the next value.
//   The state is updated as seed * 1103515245 + 12345 in uint32_t arithmetic.
//   The result is (seed / 65536) % 32768, so it always lies in 0 .. 32767 and
//   is taken from the upper half of the state.
1953 static uint32_t m_rand()
1955 m_seed
= m_seed
* 1103515245 + 12345;
1956 return (uint32_t)(m_seed
/65536) % 32768;
1961 // Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
1963 static const char16_t *gExtended_Pict
= u
"["
1964 "\\U0001F774-\\U0001F77F\\U00002700-\\U00002701\\U00002703-\\U00002704\\U0000270E\\U00002710-\\U00002711\\U00002765-\\U00002767"
1965 "\\U0001F030-\\U0001F093\\U0001F094-\\U0001F09F\\U0001F10D-\\U0001F10F\\U0001F12F\\U0001F16C-\\U0001F16F\\U0001F1AD-\\U0001F1E5"
1966 "\\U0001F260-\\U0001F265\\U0001F203-\\U0001F20F\\U0001F23C-\\U0001F23F\\U0001F249-\\U0001F24F\\U0001F252-\\U0001F25F"
1967 "\\U0001F266-\\U0001F2FF\\U0001F7D5-\\U0001F7FF\\U0001F000-\\U0001F003\\U0001F005-\\U0001F02B\\U0001F02C-\\U0001F02F"
1968 "\\U0001F322-\\U0001F323\\U0001F394-\\U0001F395\\U0001F398\\U0001F39C-\\U0001F39D\\U0001F3F1-\\U0001F3F2\\U0001F3F6"
1969 "\\U0001F4FE\\U0001F53E-\\U0001F548\\U0001F54F\\U0001F568-\\U0001F56E\\U0001F571-\\U0001F572\\U0001F57B-\\U0001F586"
1970 "\\U0001F588-\\U0001F589\\U0001F58E-\\U0001F58F\\U0001F591-\\U0001F594\\U0001F597-\\U0001F5A3\\U0001F5A6-\\U0001F5A7"
1971 "\\U0001F5A9-\\U0001F5B0\\U0001F5B3-\\U0001F5BB\\U0001F5BD-\\U0001F5C1\\U0001F5C5-\\U0001F5D0\\U0001F5D4-\\U0001F5DB"
1972 "\\U0001F5DF-\\U0001F5E0\\U0001F5E2\\U0001F5E4-\\U0001F5E7\\U0001F5E9-\\U0001F5EE\\U0001F5F0-\\U0001F5F2\\U0001F5F4-\\U0001F5F9"
1973 "\\U00002605\\U00002607-\\U0000260D\\U0000260F-\\U00002610\\U00002612\\U00002616-\\U00002617\\U00002619-\\U0000261C"
1974 "\\U0000261E-\\U0000261F\\U00002621\\U00002624-\\U00002625\\U00002627-\\U00002629\\U0000262B-\\U0000262D\\U00002630-\\U00002637"
1975 "\\U0000263B-\\U00002647\\U00002654-\\U0000265F\\U00002661-\\U00002662\\U00002664\\U00002667\\U00002669-\\U0000267A"
1976 "\\U0000267C-\\U0000267E\\U00002680-\\U00002691\\U00002695\\U00002698\\U0000269A\\U0000269D-\\U0000269F\\U000026A2-\\U000026A9"
1977 "\\U000026AC-\\U000026AF\\U000026B2-\\U000026BC\\U000026BF-\\U000026C3\\U000026C6-\\U000026C7\\U000026C9-\\U000026CD"
1978 "\\U000026D0\\U000026D2\\U000026D5-\\U000026E8\\U000026EB-\\U000026EF\\U000026F6\\U000026FB-\\U000026FC\\U000026FE-\\U000026FF"
1979 "\\U00002388\\U0001FA00-\\U0001FFFD\\U0001F0A0-\\U0001F0AE\\U0001F0B1-\\U0001F0BF\\U0001F0C1-\\U0001F0CF\\U0001F0D1-\\U0001F0F5"
1980 "\\U0001F0AF-\\U0001F0B0\\U0001F0C0\\U0001F0D0\\U0001F0F6-\\U0001F0FF\\U0001F80C-\\U0001F80F\\U0001F848-\\U0001F84F"
1981 "\\U0001F85A-\\U0001F85F\\U0001F888-\\U0001F88F\\U0001F8AE-\\U0001F8FF\\U0001F900-\\U0001F90B\\U0001F91F\\U0001F928-\\U0001F92F"
1982 "\\U0001F931-\\U0001F932\\U0001F94C\\U0001F95F-\\U0001F96B\\U0001F992-\\U0001F997\\U0001F9D0-\\U0001F9E6\\U0001F90C-\\U0001F90F"
1983 "\\U0001F93F\\U0001F94D-\\U0001F94F\\U0001F96C-\\U0001F97F\\U0001F998-\\U0001F9BF\\U0001F9C1-\\U0001F9CF\\U0001F9E7-\\U0001F9FF"
1984 "\\U0001F6C6-\\U0001F6CA\\U0001F6D3-\\U0001F6D4\\U0001F6E6-\\U0001F6E8\\U0001F6EA\\U0001F6F1-\\U0001F6F2\\U0001F6F7-\\U0001F6F8"
1985 "\\U0001F6D5-\\U0001F6DF\\U0001F6ED-\\U0001F6EF\\U0001F6F9-\\U0001F6FF"
1988 //------------------------------------------------------------------------------------------
1990 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
1991 // of RBBIMonkeyKind.
1993 //------------------------------------------------------------------------------------------
1994 class RBBICharMonkey
: public RBBIMonkeyKind
{
// Overrides of the RBBIMonkeyKind interface.
1997 virtual ~RBBICharMonkey();
1998 virtual UVector
*charClasses();
1999 virtual void setText(const UnicodeString
&s
);
2000 virtual int32_t next(int32_t i
);
// Character class sets. Each one visible here is allocated with new in the
// constructor, most of them from a Grapheme_Cluster_Break property pattern.
2004 UnicodeSet
*fCRLFSet
;
2005 UnicodeSet
*fControlSet
;
2006 UnicodeSet
*fExtendSet
;
2007 UnicodeSet
*fZWJSet
;
2008 UnicodeSet
*fRegionalIndicatorSet
;
2009 UnicodeSet
*fPrependSet
;
2010 UnicodeSet
*fSpacingSet
;
2015 UnicodeSet
*fLVTSet
;
// Union of the L, V, T, LV and LVT sets (built in the constructor).
2016 UnicodeSet
*fHangulSet
;
2017 UnicodeSet
*fEmojiBaseSet
;
2018 UnicodeSet
*fEmojiModifierSet
;
// Built from the gExtended_Pict pattern.
2019 UnicodeSet
*fExtendedPictSet
;
2020 UnicodeSet
*fEBGSet
;
2021 UnicodeSet
*fEmojiNRKSet
;
// All code points, 0 through 0x10ffff.
2022 UnicodeSet
*fAnySet
;
// The text that next() scans.
2024 const UnicodeString
*fText
;
// Constructor: build the character class sets used by next(), collect a subset
// of them into fSets, and record any failure in deferredStatus.
// NOTE(review): every set is allocated with a plain new and stored in a raw
// pointer; the matching deletes are presumably in the destructor, which is not
// visible here.
2028 RBBICharMonkey::RBBICharMonkey() {
// One status is threaded through every set construction and vector insertion;
// it is looked at only once, at the end.
2029 UErrorCode status
= U_ZERO_ERROR
;
// Sets taken from the Grapheme_Cluster_Break property, plus CR and LF.
2033 fCRLFSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status
);
2034 fControlSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status
);
2035 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status
);
2036 fZWJSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status
);
2037 fRegionalIndicatorSet
=
2038 new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status
);
2039 fPrependSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status
);
2040 fSpacingSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status
);
// Hangul syllable types, kept individually for the Hangul rules in next().
2041 fLSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status
);
2042 fVSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status
);
2043 fTSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status
);
2044 fLVSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status
);
2045 fLVTSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status
);
// fHangulSet is the union of the five Hangul sets above.
2046 fHangulSet
= new UnicodeSet();
2047 fHangulSet
->addAll(*fLSet
);
2048 fHangulSet
->addAll(*fVSet
);
2049 fHangulSet
->addAll(*fTSet
);
2050 fHangulSet
->addAll(*fLVSet
);
2051 fHangulSet
->addAll(*fLVTSet
);
// Emoji related sets. The emoji base set is the EB property value plus a few
// code points listed explicitly in the pattern.
2053 fEmojiBaseSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]"), status
);
2054 fEmojiModifierSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EM}]"), status
);
2055 fExtendedPictSet
= new UnicodeSet(gExtended_Pict
, status
);
2056 fEBGSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EBG}]"), status
);
// Emoji, minus regional indicators and the characters listed in the pattern.
2057 fEmojiNRKSet
= new UnicodeSet(UNICODE_STRING_SIMPLE(
2058 "[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status
);
// Every code point.
2059 fAnySet
= new UnicodeSet(0, 0x10ffff);
// Collect the sets into fSets. The individual L, V, T, LV and LVT sets are not
// added; only their union, fHangulSet, is.
// NOTE(review): fAnySet is inserted in the middle of the list rather than at
// the end. Confirm whether the order matters to the users of this list.
2061 fSets
= new UVector(status
);
2062 fSets
->addElement(fCRLFSet
, status
);
2063 fSets
->addElement(fControlSet
, status
);
2064 fSets
->addElement(fExtendSet
, status
);
2065 fSets
->addElement(fRegionalIndicatorSet
, status
);
// The Prepend set is added only when it has members.
2066 if (!fPrependSet
->isEmpty()) {
2067 fSets
->addElement(fPrependSet
, status
);
2069 fSets
->addElement(fSpacingSet
, status
);
2070 fSets
->addElement(fHangulSet
, status
);
2071 fSets
->addElement(fAnySet
, status
);
2072 fSets
->addElement(fEmojiBaseSet
, status
);
2073 fSets
->addElement(fEmojiModifierSet
, status
);
2074 fSets
->addElement(fZWJSet
, status
);
2075 fSets
->addElement(fExtendedPictSet
, status
);
2076 fSets
->addElement(fEBGSet
, status
);
2077 fSets
->addElement(fEmojiNRKSet
,status
);
// A constructor cannot return a status, so a failure is parked in
// deferredStatus.
2078 if (U_FAILURE(status
)) {
2079 deferredStatus
= status
;
// setText: implementation of RBBIMonkeyKind::setText for the character monkey.
//   s  the text that later calls to next() operate on.
// NOTE(review): the body is not visible here; presumably it stores the address
// of s in fText, in which case s must outlive its use by next(). Confirm.
2084 void RBBICharMonkey::setText(const UnicodeString
&s
) {
2090 int32_t RBBICharMonkey::next(int32_t prevPos
) {
2091 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2092 // break position being tested. The candidate break
2093 // location is before p2.
2097 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
2098 UChar32 cBase
; // for (X Extend*) patterns, the X character.
2100 if (U_FAILURE(deferredStatus
)) {
2104 // Previous break at end of string. return DONE.
2105 if (prevPos
>= fText
->length()) {
2108 p0
= p1
= p2
= p3
= prevPos
;
2109 c3
= fText
->char32At(prevPos
);
2110 c0
= c1
= c2
= cBase
= 0;
2111 (void)p0
; // suppress set but not used warning.
2114 // Loop runs once per "significant" character position in the input text.
2116 // Move all of the positions forward in the input string.
2121 // Advance p3 by one codepoint
2122 p3
= fText
->moveIndex32(p3
, 1);
2123 c3
= fText
->char32At(p3
);
2126 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2129 if (p2
== fText
->length()) {
2130 // Reached end of string. Always a break position.
2135 // No Extend or Format characters may appear between the CR and LF,
2136 // which requires the additional check for p2 immediately following p1.
2138 if (c1
==0x0D && c2
==0x0A && p1
==(p2
-1)) {
2142 // Rule (GB4). ( Control | CR | LF ) <break>
2143 if (fControlSet
->contains(c1
) ||
2149 // Rule (GB5) <break> ( Control | CR | LF )
2151 if (fControlSet
->contains(c2
) ||
2158 // Rule (GB6) L x ( L | V | LV | LVT )
2159 if (fLSet
->contains(c1
) &&
2160 (fLSet
->contains(c2
) ||
2161 fVSet
->contains(c2
) ||
2162 fLVSet
->contains(c2
) ||
2163 fLVTSet
->contains(c2
))) {
2167 // Rule (GB7) ( LV | V ) x ( V | T )
2168 if ((fLVSet
->contains(c1
) || fVSet
->contains(c1
)) &&
2169 (fVSet
->contains(c2
) || fTSet
->contains(c2
))) {
2173 // Rule (GB8) ( LVT | T) x T
2174 if ((fLVTSet
->contains(c1
) || fTSet
->contains(c1
)) &&
2175 fTSet
->contains(c2
)) {
2179 // Rule (GB9) x (Extend | ZWJ)
2180 if (fExtendSet
->contains(c2
) || fZWJSet
->contains(c2
)) {
2181 if (!fExtendSet
->contains(c1
)) {
2187 // Rule (GB9a) x SpacingMark
2188 if (fSpacingSet
->contains(c2
)) {
2192 // Rule (GB9b) Prepend x
2193 if (fPrependSet
->contains(c1
)) {
2197 // Rule (GB10) (Emoji_Base | EBG) Extend * x Emoji_Modifier
2198 if ((fEmojiBaseSet
->contains(c1
) || fEBGSet
->contains(c1
)) && fEmojiModifierSet
->contains(c2
)) {
2201 if ((fEmojiBaseSet
->contains(cBase
) || fEBGSet
->contains(cBase
)) &&
2202 fExtendSet
->contains(c1
) && fEmojiModifierSet
->contains(c2
)) {
2206 // Rule (GB11) (Glue_After_ZWJ | Emoji) Extend * ZWJ x (Glue_After_ZWJ | Emoji)
2207 if ((fExtendedPictSet
->contains(c0
) || fEmojiNRKSet
->contains(c0
)) && fZWJSet
->contains(c1
) &&
2208 (fExtendedPictSet
->contains(c2
) || fEmojiNRKSet
->contains(c2
))) {
2211 if ((fExtendedPictSet
->contains(cBase
) || fEmojiNRKSet
->contains(cBase
)) && fExtendSet
->contains(c0
) && fZWJSet
->contains(c1
) &&
2212 (fExtendedPictSet
->contains(c2
) || fEmojiNRKSet
->contains(c2
))) {
2216 // Rule (GB12-13) Regional_Indicator x Regional_Indicator
2217 // Note: The first if condition is a little tricky. We only need to force
2218 // a break if there are three or more contiguous RIs. If there are
2219 // only two, a break following will occur via other rules, and will include
2220 // any trailing extend characters, which is needed behavior.
2221 if (fRegionalIndicatorSet
->contains(c0
) && fRegionalIndicatorSet
->contains(c1
)
2222 && fRegionalIndicatorSet
->contains(c2
)) {
2225 if (fRegionalIndicatorSet
->contains(c1
) && fRegionalIndicatorSet
->contains(c2
)) {
2229 // Rule (GB999) Any <break> Any
2239 UVector
*RBBICharMonkey::charClasses() {
2244 RBBICharMonkey::~RBBICharMonkey() {
2249 delete fRegionalIndicatorSet
;
2259 delete fEmojiBaseSet
;
2260 delete fEmojiModifierSet
;
2262 delete fExtendedPictSet
;
2264 delete fEmojiNRKSet
;
2267 //------------------------------------------------------------------------------------------
2269 // class RBBIWordMonkey Word Break specific implementation
2270 // of RBBIMonkeyKind.
2272 //------------------------------------------------------------------------------------------
2273 class RBBIWordMonkey
: public RBBIMonkeyKind
{
2276 virtual ~RBBIWordMonkey();
2277 virtual UVector
*charClasses();
2278 virtual void setText(const UnicodeString
&s
);
2279 virtual int32_t next(int32_t i
);
2285 UnicodeSet
*fNewlineSet
;
2286 UnicodeSet
*fRegionalIndicatorSet
;
2287 UnicodeSet
*fKatakanaSet
;
2288 UnicodeSet
*fHebrew_LetterSet
;
2289 UnicodeSet
*fALetterSet
;
2290 UnicodeSet
*fSingle_QuoteSet
;
2291 UnicodeSet
*fDouble_QuoteSet
;
2292 UnicodeSet
*fMidNumLetSet
;
2293 UnicodeSet
*fMidLetterSet
;
2294 UnicodeSet
*fMidNumSet
;
2295 UnicodeSet
*fNumericSet
;
2296 UnicodeSet
*fFormatSet
;
2297 UnicodeSet
*fOtherSet
;
2298 UnicodeSet
*fExtendSet
;
2299 UnicodeSet
*fExtendNumLetSet
;
2300 UnicodeSet
*fDictionarySet
;
2301 UnicodeSet
*fEBaseSet
;
2302 UnicodeSet
*fEBGSet
;
2303 UnicodeSet
*fEModifierSet
;
2304 UnicodeSet
*fZWJSet
;
2305 UnicodeSet
*fExtendedPictSet
;
2306 UnicodeSet
*fEmojiNRKSet
;
2308 const UnicodeString
*fText
;
2312 RBBIWordMonkey::RBBIWordMonkey()
2314 UErrorCode status
= U_ZERO_ERROR
;
2316 fSets
= new UVector(status
);
2318 fCRSet
= new UnicodeSet(u
"[\\p{Word_Break = CR}]", status
);
2319 fLFSet
= new UnicodeSet(u
"[\\p{Word_Break = LF}]", status
);
2320 fNewlineSet
= new UnicodeSet(u
"[\\p{Word_Break = Newline}]", status
);
2321 fKatakanaSet
= new UnicodeSet(u
"[\\p{Word_Break = Katakana}]", status
);
2322 fRegionalIndicatorSet
= new UnicodeSet(u
"[\\p{Word_Break = Regional_Indicator}]", status
);
2323 fHebrew_LetterSet
= new UnicodeSet(u
"[\\p{Word_Break = Hebrew_Letter}]", status
);
2324 fALetterSet
= new UnicodeSet(u
"[\\p{Word_Break = ALetter}]", status
);
2325 fSingle_QuoteSet
= new UnicodeSet(u
"[\\p{Word_Break = Single_Quote}]", status
);
2326 fDouble_QuoteSet
= new UnicodeSet(u
"[\\p{Word_Break = Double_Quote}]", status
);
2327 fMidNumLetSet
= new UnicodeSet(u
"[\\p{Word_Break = MidNumLet}]", status
);
2328 fMidLetterSet
= new UnicodeSet(u
"[\\p{Word_Break = MidLetter} - [\\:]]", status
);
2329 fMidNumSet
= new UnicodeSet(u
"[\\p{Word_Break = MidNum}]", status
);
2330 fNumericSet
= new UnicodeSet(u
"[\\p{Word_Break = Numeric}]", status
);
2331 fFormatSet
= new UnicodeSet(u
"[\\p{Word_Break = Format}]", status
);
2332 fExtendNumLetSet
= new UnicodeSet(u
"[\\p{Word_Break = ExtendNumLet}]", status
);
2333 fExtendSet
= new UnicodeSet(u
"[\\p{Word_Break = Extend}]", status
);
2335 fEBaseSet
= new UnicodeSet(u
"[[\\p{Word_Break = EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]", status
);
2336 fEBGSet
= new UnicodeSet(u
"[\\p{Word_Break = EBG}]", status
);
2337 fEModifierSet
= new UnicodeSet(u
"[\\p{Word_Break = EM}]", status
);
2338 fZWJSet
= new UnicodeSet(u
"[\\p{Word_Break = ZWJ}]", status
);
2339 fExtendedPictSet
= new UnicodeSet(gExtended_Pict
, status
);
2340 fEmojiNRKSet
= new UnicodeSet(
2341 u
"[[\\p{Emoji}]-[\\p{Word_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]", status
);
2343 fDictionarySet
= new UnicodeSet(u
"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status
);
2344 fDictionarySet
->addAll(*fKatakanaSet
);
2345 fDictionarySet
->addAll(UnicodeSet(u
"[\\p{LineBreak = Complex_Context}]", status
));
2347 fALetterSet
->removeAll(*fDictionarySet
);
2349 fOtherSet
= new UnicodeSet();
2350 if(U_FAILURE(status
)) {
2351 IntlTest::gTest
->errln("%s:%d %s", __FILE__
, __LINE__
, u_errorName(status
));
2352 deferredStatus
= status
;
2356 fOtherSet
->complement();
2357 fOtherSet
->removeAll(*fCRSet
);
2358 fOtherSet
->removeAll(*fLFSet
);
2359 fOtherSet
->removeAll(*fNewlineSet
);
2360 fOtherSet
->removeAll(*fKatakanaSet
);
2361 fOtherSet
->removeAll(*fHebrew_LetterSet
);
2362 fOtherSet
->removeAll(*fALetterSet
);
2363 fOtherSet
->removeAll(*fSingle_QuoteSet
);
2364 fOtherSet
->removeAll(*fDouble_QuoteSet
);
2365 fOtherSet
->removeAll(*fMidLetterSet
);
2366 fOtherSet
->removeAll(*fMidNumSet
);
2367 fOtherSet
->removeAll(*fNumericSet
);
2368 fOtherSet
->removeAll(*fExtendNumLetSet
);
2369 fOtherSet
->removeAll(*fFormatSet
);
2370 fOtherSet
->removeAll(*fExtendSet
);
2371 fOtherSet
->removeAll(*fRegionalIndicatorSet
);
2372 fOtherSet
->removeAll(*fEBaseSet
);
2373 fOtherSet
->removeAll(*fEBGSet
);
2374 fOtherSet
->removeAll(*fEModifierSet
);
2375 fOtherSet
->removeAll(*fZWJSet
);
2376 fOtherSet
->removeAll(*fExtendedPictSet
);
2377 fOtherSet
->removeAll(*fEmojiNRKSet
);
2379 // Inhibit dictionary characters from being tested at all.
2380 fOtherSet
->removeAll(*fDictionarySet
);
2382 fSets
->addElement(fCRSet
, status
);
2383 fSets
->addElement(fLFSet
, status
);
2384 fSets
->addElement(fNewlineSet
, status
);
2385 fSets
->addElement(fRegionalIndicatorSet
, status
);
2386 fSets
->addElement(fHebrew_LetterSet
, status
);
2387 fSets
->addElement(fALetterSet
, status
);
2388 fSets
->addElement(fSingle_QuoteSet
, status
);
2389 fSets
->addElement(fDouble_QuoteSet
, status
);
2390 //fSets->addElement(fKatakanaSet, status); // Omit Katakana from fSets, which omits Katakana characters
2391 // from the test data. They are all in the dictionary set,
2392 // which this (old, to be retired) monkey test cannot handle.
2393 fSets
->addElement(fMidLetterSet
, status
);
2394 fSets
->addElement(fMidNumLetSet
, status
);
2395 fSets
->addElement(fMidNumSet
, status
);
2396 fSets
->addElement(fNumericSet
, status
);
2397 fSets
->addElement(fFormatSet
, status
);
2398 fSets
->addElement(fExtendSet
, status
);
2399 fSets
->addElement(fOtherSet
, status
);
2400 fSets
->addElement(fExtendNumLetSet
, status
);
2402 fSets
->addElement(fEBaseSet
, status
);
2403 fSets
->addElement(fEBGSet
, status
);
2404 fSets
->addElement(fEModifierSet
, status
);
2405 fSets
->addElement(fZWJSet
, status
);
2406 fSets
->addElement(fExtendedPictSet
, status
);
2407 fSets
->addElement(fEmojiNRKSet
, status
);
2409 if (U_FAILURE(status
)) {
2410 deferredStatus
= status
;
2414 void RBBIWordMonkey::setText(const UnicodeString
&s
) {
2419 int32_t RBBIWordMonkey::next(int32_t prevPos
) {
2420 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2421 // break position being tested. The candidate break
2422 // location is before p2.
2426 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
2428 if (U_FAILURE(deferredStatus
)) {
2432 // Prev break at end of string. return DONE.
2433 if (prevPos
>= fText
->length()) {
2436 p0
= p1
= p2
= p3
= prevPos
;
2437 c3
= fText
->char32At(prevPos
);
2439 (void)p0
; // Suppress set but not used warning.
2441 // Loop runs once per "significant" character position in the input text.
2443 // Move all of the positions forward in the input string.
2448 // Advance p3 by X(Extend | Format)* Rule 4
2449 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2451 p3
= fText
->moveIndex32(p3
, 1);
2452 c3
= fText
->char32At(p3
);
2453 if (fCRSet
->contains(c2
) || fLFSet
->contains(c2
) || fNewlineSet
->contains(c2
)) {
2457 while (fFormatSet
->contains(c3
) || fExtendSet
->contains(c3
) || fZWJSet
->contains(c3
));
2461 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2464 if (p2
== fText
->length()) {
2465 // Reached end of string. Always a break position.
2470 // No Extend or Format characters may appear between the CR and LF.
2471 // Unlike the character monkey, no explicit p1/p2 adjacency test is made here; the advance loop above is documented as not skipping Extend & Format following a new line.
2473 if (c1
==0x0D && c2
==0x0A) {
2477 // Rule (3a) Break before and after newlines (including CR and LF)
2479 if (fCRSet
->contains(c1
) || fLFSet
->contains(c1
) || fNewlineSet
->contains(c1
)) {
2482 if (fCRSet
->contains(c2
) || fLFSet
->contains(c2
) || fNewlineSet
->contains(c2
)) {
2486 // Rule (3c) ZWJ x (Glue_after_ZWJ | EmojiNRK).
2487 // Not ignoring extend chars, so peek into input text to
2488 // get the potential ZWJ, the character immediately preceding c2.
2489 // Sloppy UChar32 indexing: p2-1 may reference trail half
2490 // but char32At will get the full code point.
2491 if (fZWJSet
->contains(fText
->char32At(p2
-1)) && (fExtendedPictSet
->contains(c2
) || fEmojiNRKSet
->contains(c2
))) {
2495 // Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
2496 if ((fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
)) &&
2497 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
))) {
2501 // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
2503 if ( (fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
)) &&
2504 (fMidLetterSet
->contains(c2
) || fMidNumLetSet
->contains(c2
) || fSingle_QuoteSet
->contains(c2
)) &&
2505 (fALetterSet
->contains(c3
) || fHebrew_LetterSet
->contains(c3
))) {
2509 // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)
2510 if ((fALetterSet
->contains(c0
) || fHebrew_LetterSet
->contains(c0
)) &&
2511 (fMidLetterSet
->contains(c1
) || fMidNumLetSet
->contains(c1
) || fSingle_QuoteSet
->contains(c1
)) &&
2512 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
))) {
2516 // Rule (7a) Hebrew_Letter x Single_Quote
2517 if (fHebrew_LetterSet
->contains(c1
) && fSingle_QuoteSet
->contains(c2
)) {
2521 // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter
2522 if (fHebrew_LetterSet
->contains(c1
) && fDouble_QuoteSet
->contains(c2
) && fHebrew_LetterSet
->contains(c3
)) {
2526 // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter
2527 if (fHebrew_LetterSet
->contains(c0
) && fDouble_QuoteSet
->contains(c1
) && fHebrew_LetterSet
->contains(c2
)) {
2531 // Rule (8) Numeric x Numeric
2532 if (fNumericSet
->contains(c1
) &&
2533 fNumericSet
->contains(c2
)) {
2537 // Rule (9) (ALetter | Hebrew_Letter) x Numeric
2538 if ((fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
)) &&
2539 fNumericSet
->contains(c2
)) {
2543 // Rule (10) Numeric x (ALetter | Hebrew_Letter)
2544 if (fNumericSet
->contains(c1
) &&
2545 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
))) {
2549 // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric
2550 if (fNumericSet
->contains(c0
) &&
2551 (fMidNumSet
->contains(c1
) || fMidNumLetSet
->contains(c1
) || fSingle_QuoteSet
->contains(c1
)) &&
2552 fNumericSet
->contains(c2
)) {
2556 // Rule (12) Numeric x (MidNum | MidNumLet | Single_Quote) Numeric
2557 if (fNumericSet
->contains(c1
) &&
2558 (fMidNumSet
->contains(c2
) || fMidNumLetSet
->contains(c2
) || fSingle_QuoteSet
->contains(c2
)) &&
2559 fNumericSet
->contains(c3
)) {
2563 // Rule (13) Katakana x Katakana
2564 // Note: matches UAX 29 rules, but doesn't come into play for ICU because
2565 // all Katakana are handled by the dictionary breaker.
2566 if (fKatakanaSet
->contains(c1
) &&
2567 fKatakanaSet
->contains(c2
)) {
2571 // Rule 13a (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet
2572 if ((fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
) ||fNumericSet
->contains(c1
) ||
2573 fKatakanaSet
->contains(c1
) || fExtendNumLetSet
->contains(c1
)) &&
2574 fExtendNumLetSet
->contains(c2
)) {
2578 // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
2579 if (fExtendNumLetSet
->contains(c1
) &&
2580 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
) ||
2581 fNumericSet
->contains(c2
) || fKatakanaSet
->contains(c2
))) {
2585 // WB 14 (E_Base | EBG) x E_Modifier
2586 if ((fEBaseSet
->contains(c1
) || fEBGSet
->contains(c1
)) && fEModifierSet
->contains(c2
)) {
2590 // Rule 15 - 17 Group pairs of Regional Indicators.
2591 if (fRegionalIndicatorSet
->contains(c0
) && fRegionalIndicatorSet
->contains(c1
)) {
2594 if (fRegionalIndicatorSet
->contains(c1
) && fRegionalIndicatorSet
->contains(c2
)) {
2598 // Rule 999. Break found here.
2607 UVector
*RBBIWordMonkey::charClasses() {
2612 RBBIWordMonkey::~RBBIWordMonkey() {
2617 delete fKatakanaSet
;
2618 delete fHebrew_LetterSet
;
2620 delete fSingle_QuoteSet
;
2621 delete fDouble_QuoteSet
;
2622 delete fMidNumLetSet
;
2623 delete fMidLetterSet
;
2628 delete fExtendNumLetSet
;
2629 delete fRegionalIndicatorSet
;
2630 delete fDictionarySet
;
2634 delete fEModifierSet
;
2636 delete fExtendedPictSet
;
2637 delete fEmojiNRKSet
;
2643 //------------------------------------------------------------------------------------------
2645 // class RBBISentMonkey Sentence Break specific implementation
2646 // of RBBIMonkeyKind.
2648 //------------------------------------------------------------------------------------------
2649 class RBBISentMonkey
: public RBBIMonkeyKind
{
2652 virtual ~RBBISentMonkey();
2653 virtual UVector
*charClasses();
2654 virtual void setText(const UnicodeString
&s
);
2655 virtual int32_t next(int32_t i
);
2657 int moveBack(int posFrom
);
2658 int moveForward(int posFrom
);
2659 UChar32
cAt(int pos
);
2663 UnicodeSet
*fSepSet
;
2664 UnicodeSet
*fFormatSet
;
2666 UnicodeSet
*fLowerSet
;
2667 UnicodeSet
*fUpperSet
;
2668 UnicodeSet
*fOLetterSet
;
2669 UnicodeSet
*fNumericSet
;
2670 UnicodeSet
*fATermSet
;
2671 UnicodeSet
*fSContinueSet
;
2672 UnicodeSet
*fSTermSet
;
2673 UnicodeSet
*fCloseSet
;
2674 UnicodeSet
*fOtherSet
;
2675 UnicodeSet
*fExtendSet
;
2677 const UnicodeString
*fText
;
2681 RBBISentMonkey::RBBISentMonkey()
2683 UErrorCode status
= U_ZERO_ERROR
;
2685 fSets
= new UVector(status
);
2687 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2688 // set and made into character classes of their own. For the monkey impl,
2689 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2690 fSepSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status
);
2691 fFormatSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status
);
2692 fSpSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status
);
2693 fLowerSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status
);
2694 fUpperSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status
);
2695 fOLetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status
);
2696 fNumericSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status
);
2697 fATermSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status
);
2698 fSContinueSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status
);
2699 fSTermSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status
);
2700 fCloseSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status
);
2701 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status
);
2702 fOtherSet
= new UnicodeSet();
2704 if(U_FAILURE(status
)) {
2705 deferredStatus
= status
;
2709 fOtherSet
->complement();
2710 fOtherSet
->removeAll(*fSepSet
);
2711 fOtherSet
->removeAll(*fFormatSet
);
2712 fOtherSet
->removeAll(*fSpSet
);
2713 fOtherSet
->removeAll(*fLowerSet
);
2714 fOtherSet
->removeAll(*fUpperSet
);
2715 fOtherSet
->removeAll(*fOLetterSet
);
2716 fOtherSet
->removeAll(*fNumericSet
);
2717 fOtherSet
->removeAll(*fATermSet
);
2718 fOtherSet
->removeAll(*fSContinueSet
);
2719 fOtherSet
->removeAll(*fSTermSet
);
2720 fOtherSet
->removeAll(*fCloseSet
);
2721 fOtherSet
->removeAll(*fExtendSet
);
2723 fSets
->addElement(fSepSet
, status
);
2724 fSets
->addElement(fFormatSet
, status
);
2725 fSets
->addElement(fSpSet
, status
);
2726 fSets
->addElement(fLowerSet
, status
);
2727 fSets
->addElement(fUpperSet
, status
);
2728 fSets
->addElement(fOLetterSet
, status
);
2729 fSets
->addElement(fNumericSet
, status
);
2730 fSets
->addElement(fATermSet
, status
);
2731 fSets
->addElement(fSContinueSet
, status
);
2732 fSets
->addElement(fSTermSet
, status
);
2733 fSets
->addElement(fCloseSet
, status
);
2734 fSets
->addElement(fOtherSet
, status
);
2735 fSets
->addElement(fExtendSet
, status
);
2737 if (U_FAILURE(status
)) {
2738 deferredStatus
= status
;
2744 void RBBISentMonkey::setText(const UnicodeString
&s
) {
2748 UVector
*RBBISentMonkey::charClasses() {
2753 // moveBack() Find the "significant" code point preceding the index i.
2754 // Skips over ($Extend | $Format)* .
2756 int RBBISentMonkey::moveBack(int i
) {
2763 j
= fText
->moveIndex32(j
, -1);
2764 c
= fText
->char32At(j
);
2766 while (j
>0 &&(fFormatSet
->contains(c
) || fExtendSet
->contains(c
)));
2772 int RBBISentMonkey::moveForward(int i
) {
2773 if (i
>=fText
->length()) {
2774 return fText
->length();
2779 j
= fText
->moveIndex32(j
, 1);
2782 while (fFormatSet
->contains(c
) || fExtendSet
->contains(c
));
2786 UChar32
RBBISentMonkey::cAt(int pos
) {
2787 if (pos
<0 || pos
>=fText
->length()) {
2790 return fText
->char32At(pos
);
2794 int32_t RBBISentMonkey::next(int32_t prevPos
) {
2795 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2796 // break position being tested. The candidate break
2797 // location is before p2.
2801 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
2804 if (U_FAILURE(deferredStatus
)) {
2808 // Prev break at end of string. return DONE.
2809 if (prevPos
>= fText
->length()) {
2812 p0
= p1
= p2
= p3
= prevPos
;
2813 c3
= fText
->char32At(prevPos
);
2815 (void)p0
; // Suppress set but not used warning.
2817 // Loop runs once per "significant" character position in the input text.
2819 // Move all of the positions forward in the input string.
2824 // Advance p3 by X(Extend | Format)* Rule 4
2825 p3
= moveForward(p3
);
2829 if (c1
==0x0d && c2
==0x0a && p2
==(p1
+1)) {
2833 // Rule (4). Sep <break>
2834 if (fSepSet
->contains(c1
)) {
2835 p2
= p1
+1; // Separators don't combine with Extend or Format.
2839 if (p2
>= fText
->length()) {
2840 // Reached end of string. Always a break position.
2844 if (p2
== prevPos
) {
2845 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2849 // Rule (6). ATerm x Numeric
2850 if (fATermSet
->contains(c1
) && fNumericSet
->contains(c2
)) {
2854 // Rule (7). (Upper | Lower) ATerm x Upper
2855 if ((fUpperSet
->contains(c0
) || fLowerSet
->contains(c0
)) &&
2856 fATermSet
->contains(c1
) && fUpperSet
->contains(c2
)) {
2860 // Rule (8) ATerm Close* Sp* x (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
2861 // Note: STerm | ATerm are added to the negated part of the expression by a
2862 // note to the Unicode 5.0 documents.
2864 while (fSpSet
->contains(cAt(p8
))) {
2867 while (fCloseSet
->contains(cAt(p8
))) {
2870 if (fATermSet
->contains(cAt(p8
))) {
2874 if (c
==-1 || fOLetterSet
->contains(c
) || fUpperSet
->contains(c
) ||
2875 fLowerSet
->contains(c
) || fSepSet
->contains(c
) ||
2876 fATermSet
->contains(c
) || fSTermSet
->contains(c
)) {
2879 p8
= moveForward(p8
);
2881 if (fLowerSet
->contains(cAt(p8
))) {
2886 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2887 if (fSContinueSet
->contains(c2
) || fSTermSet
->contains(c2
) || fATermSet
->contains(c2
)) {
2889 while (fSpSet
->contains(cAt(p8
))) {
2892 while (fCloseSet
->contains(cAt(p8
))) {
2896 if (fSTermSet
->contains(c
) || fATermSet
->contains(c
)) {
2901 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
2903 while (fCloseSet
->contains(cAt(p9
))) {
2907 if ((fSTermSet
->contains(c
) || fATermSet
->contains(c
))) {
2908 if (fCloseSet
->contains(c2
) || fSpSet
->contains(c2
) || fSepSet
->contains(c2
)) {
2913 // Rule (10) (STerm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
2915 while (fSpSet
->contains(cAt(p10
))) {
2916 p10
= moveBack(p10
);
2918 while (fCloseSet
->contains(cAt(p10
))) {
2919 p10
= moveBack(p10
);
2921 if (fSTermSet
->contains(cAt(p10
)) || fATermSet
->contains(cAt(p10
))) {
2922 if (fSpSet
->contains(c2
) || fSepSet
->contains(c2
)) {
2927 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>
2929 if (fSepSet
->contains(cAt(p11
))) {
2930 p11
= moveBack(p11
);
2932 while (fSpSet
->contains(cAt(p11
))) {
2933 p11
= moveBack(p11
);
2935 while (fCloseSet
->contains(cAt(p11
))) {
2936 p11
= moveBack(p11
);
2938 if (fSTermSet
->contains(cAt(p11
)) || fATermSet
->contains(cAt(p11
))) {
2942 // Rule (12) Any x Any
2949 RBBISentMonkey::~RBBISentMonkey() {
2959 delete fSContinueSet
;
2968 //-------------------------------------------------------------------------------------------
2972 //-------------------------------------------------------------------------------------------
2974 class RBBILineMonkey
: public RBBIMonkeyKind
{
2977 virtual ~RBBILineMonkey();
2978 virtual UVector
*charClasses();
2979 virtual void setText(const UnicodeString
&s
);
2980 virtual int32_t next(int32_t i
);
2981 virtual void rule9Adjust(int32_t pos
, UChar32
*posChar
, int32_t *nextPos
, UChar32
*nextChar
);
3027 UnicodeSet
*fExtendedPict
;
3028 UnicodeSet
*fEmojiNRK
;
3030 BreakIterator
*fCharBI
;
3031 const UnicodeString
*fText
;
3032 RegexMatcher
*fNumberMatcher
;
3035 RBBILineMonkey::RBBILineMonkey() :
3041 fNumberMatcher(NULL
)
3044 if (U_FAILURE(deferredStatus
)) {
3048 UErrorCode status
= U_ZERO_ERROR
;
3050 fSets
= new UVector(status
);
3052 fBK
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status
);
3053 fCR
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status
);
3054 fLF
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status
);
3055 fCM
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status
);
3056 fNL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status
);
3057 fWJ
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status
);
3058 fZW
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status
);
3059 fGL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status
);
3060 fCB
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status
);
3061 fSP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status
);
3062 fB2
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status
);
3063 fBA
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status
);
3064 fBB
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status
);
3065 fHY
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status
);
3066 fH2
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status
);
3067 fH3
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status
);
3068 fCL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status
);
3069 fCP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status
);
3070 fEX
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status
);
3071 fIN
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status
);
3072 fJL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status
);
3073 fJV
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status
);
3074 fJT
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status
);
3075 fNS
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status
);
3076 fOP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status
);
3077 fQU
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status
);
3078 fIS
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status
);
3079 fNU
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status
);
3080 fPO
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status
);
3081 fPR
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status
);
3082 fSY
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status
);
3083 fAI
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status
);
3084 fAL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status
);
3085 fCJ
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status
);
3086 fHL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status
);
3087 fID
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status
);
3088 fRI
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status
);
3089 fSG
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status
);
3090 fXX
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status
);
3091 fEB
= new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]"), status
);
3092 fEM
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status
);
3093 fZJ
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status
);
3094 fEmojiNRK
= new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status
);
3095 fExtendedPict
= new UnicodeSet(gExtended_Pict
, status
);
3097 if (U_FAILURE(status
)) {
3098 deferredStatus
= status
;
3102 fAL
->addAll(*fXX
); // Default behavior for XX is identical to AL
3103 fAL
->addAll(*fAI
); // Default behavior for AI is identical to AL
3104 fAL
->addAll(*fSG
); // Default behavior for SG is identical to AL.
3106 fNS
->addAll(*fCJ
); // Default behavior for CJ is identical to NS.
3107 fCM
->addAll(*fZJ
); // ZWJ behaves as a CM.
3109 fSets
->addElement(fBK
, status
);
3110 fSets
->addElement(fCR
, status
);
3111 fSets
->addElement(fLF
, status
);
3112 fSets
->addElement(fCM
, status
);
3113 fSets
->addElement(fNL
, status
);
3114 fSets
->addElement(fWJ
, status
);
3115 fSets
->addElement(fZW
, status
);
3116 fSets
->addElement(fGL
, status
);
3117 fSets
->addElement(fCB
, status
);
3118 fSets
->addElement(fSP
, status
);
3119 fSets
->addElement(fB2
, status
);
3120 fSets
->addElement(fBA
, status
);
3121 fSets
->addElement(fBB
, status
);
3122 fSets
->addElement(fHY
, status
);
3123 fSets
->addElement(fH2
, status
);
3124 fSets
->addElement(fH3
, status
);
3125 fSets
->addElement(fCL
, status
);
3126 fSets
->addElement(fCP
, status
);
3127 fSets
->addElement(fEX
, status
);
3128 fSets
->addElement(fIN
, status
);
3129 fSets
->addElement(fJL
, status
);
3130 fSets
->addElement(fJT
, status
);
3131 fSets
->addElement(fJV
, status
);
3132 fSets
->addElement(fNS
, status
);
3133 fSets
->addElement(fOP
, status
);
3134 fSets
->addElement(fQU
, status
);
3135 fSets
->addElement(fIS
, status
);
3136 fSets
->addElement(fNU
, status
);
3137 fSets
->addElement(fPO
, status
);
3138 fSets
->addElement(fPR
, status
);
3139 fSets
->addElement(fSY
, status
);
3140 fSets
->addElement(fAI
, status
);
3141 fSets
->addElement(fAL
, status
);
3142 fSets
->addElement(fHL
, status
);
3143 fSets
->addElement(fID
, status
);
3144 fSets
->addElement(fWJ
, status
);
3145 fSets
->addElement(fRI
, status
);
3146 fSets
->addElement(fSG
, status
);
3147 fSets
->addElement(fEB
, status
);
3148 fSets
->addElement(fEM
, status
);
3149 fSets
->addElement(fZJ
, status
);
3150 fSets
->addElement(fExtendedPict
, status
);
3151 fSets
->addElement(fEmojiNRK
, status
);
3155 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
3156 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
3157 "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
3158 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
3159 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
3160 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
3162 fNumberMatcher
= new RegexMatcher(
3163 UnicodeString(rules
, -1, US_INV
), 0, status
);
3165 fCharBI
= BreakIterator::createCharacterInstance(Locale::getEnglish(), status
);
3167 if (U_FAILURE(status
)) {
3168 deferredStatus
= status
;
3173 void RBBILineMonkey::setText(const UnicodeString
&s
) {
3175 fCharBI
->setText(s
);
3176 fNumberMatcher
->reset(s
);
3181 // Line Break TR rules 9 and 10 implementation.
3182 // This deals with combining marks and other sequences
3183 // that must be treated as if they were something other than what they actually are.
3185 // This is factored out into a separate function because it must be applied twice for
3186 // each potential break, once to the chars before the position being checked, then
3187 // again to the text following the possible break.
// Applies the LB 9 / LB 10 combining-mark handling around a single character.
//   pos      - index of the character being adjusted.
//   posChar  - pointer to the character at pos; it is tested against the
//              SP, BK, CR, LF, NL, ZW and CM classes below.
//   nextPos  - pointer to the index of the character following pos; read
//              into the local nPos, which is advanced past CM characters.
//   nextChar - pointer to the character at the "next" position; rewritten
//              from fText as nPos advances.
3189 void RBBILineMonkey::rule9Adjust(int32_t pos
, UChar32
*posChar
, int32_t *nextPos
, UChar32
*nextChar
) {
3191 // Invalid initial position. Happens during the warmup iteration of the
3192 // main loop in next().
3196 int32_t nPos
= *nextPos
;
3198 // LB 9 Keep combining sequences together.
3199 // advance over any CM class chars. Note that Line Break CM is different
3200 // from the normal Grapheme Extend property.
// Combining marks are only absorbed when the base character is not a space,
// hard break, CR, LF, NL or zero-width space.
3201 if (!(fSP
->contains(*posChar
) || fBK
->contains(*posChar
) || *posChar
==0x0d ||
3202 *posChar
==0x0a ||fNL
->contains(*posChar
) || fZW
->contains(*posChar
))) {
3204 *nextChar
= fText
->char32At(nPos
);
3205 if (!fCM
->contains(*nextChar
)) {
3208 nPos
= fText
->moveIndex32(nPos
, 1);
3213 // LB 9 Treat X CM* as if it were x.
3214 // No explicit action required.
3216 // LB 10 Treat any remaining combining mark as AL
3217 if (fCM
->contains(*posChar
)) {
3221 // Push the updated nextPos and nextChar back to our caller.
3222 // This only makes a difference if posChar got bigger by consuming a
3223 // combining sequence.
3225 *nextChar
= fText
->char32At(nPos
);
// Reference ("monkey") implementation of line-break detection. Starting from
// startPos, it steps through fText keeping a sliding window of characters
// (prevCharX2, prevChar, thisChar) and evaluates the line break rules in the
// order labelled by the comments below (LB 2, LB 9, LB 4 ... LB 31).
// The rule order matters: the first rule whose condition matches decides the
// outcome for the current position.
// NOTE(review): the return/break/continue statements are not visible in this
// excerpt. Callers in this file invoke next(i) repeatedly until the result
// equals BreakIterator::DONE.
3230 int32_t RBBILineMonkey::next(int32_t startPos
) {
3231 UErrorCode status
= U_ZERO_ERROR
;
3232 int32_t pos
; // Index of the char following a potential break position
3233 UChar32 thisChar
; // Character at above position "pos"
3235 int32_t prevPos
; // Index of the char preceding a potential break position
3236 UChar32 prevChar
; // Character at above position. Note that prevChar
3237 // and thisChar may not be adjacent because combining
3238 // characters between them will be ignored.
3240 int32_t prevPosX2
; // Second previous character. Wider context for LB21a.
3243 int32_t nextPos
; // Index of the next character following pos.
3244 // Usually skips over combining marks.
3245 int32_t nextCPPos
; // Index of the code point following "pos."
3246 // May point to a combining mark.
3247 int32_t tPos
; // temp value.
3250 if (U_FAILURE(deferredStatus
)) {
3254 if (startPos
>= fText
->length()) {
3259 // Initial values for loop. Loop will run the first time without finding breaks,
3260 // while the invalid values shift out and the "this" and
3261 // "prev" positions are filled in with good values.
3262 pos
= prevPos
= prevPosX2
= -1; // Invalid value, serves as flag for initial loop iteration.
3263 thisChar
= prevChar
= prevCharX2
= 0;
3264 nextPos
= nextCPPos
= startPos
;
3267 // Loop runs once per position in the test text, until a break position
// Shift the character window along by one position.
3270 prevPosX2
= prevPos
;
3271 prevCharX2
= prevChar
;
3274 prevChar
= thisChar
;
3277 thisChar
= fText
->char32At(pos
);
3279 nextCPPos
= fText
->moveIndex32(pos
, 1);
3280 nextPos
= nextCPPos
;
3282 // Rule LB2 - Break at end of text.
3283 if (pos
>= fText
->length()) {
3287 // Rule LB 9 - adjust for combining sequences.
3288 // We do this one out-of-order because the adjustment does not change anything
3289 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
// rule9Adjust is applied twice: once to the text before the candidate break
// and once to the text after it.
3291 rule9Adjust(prevPos
, &prevChar
, &pos
, &thisChar
);
3292 nextCPPos
= nextPos
= fText
->moveIndex32(pos
, 1);
3293 c
= fText
->char32At(nextPos
);
3294 rule9Adjust(pos
, &thisChar
, &nextPos
, &c
);
3296 // If the loop is still warming up - if we haven't shifted the initial
3297 // -1 positions out of prevPos yet - loop back to advance the
3298 // position in the input without any further looking for breaks.
3299 if (prevPos
== -1) {
3303 // LB 4 Always break after hard line breaks,
3304 if (fBK
->contains(prevChar
)) {
3308 // LB 5 Break after CR, LF, NL, but not inside CR LF
3309 if (prevChar
== 0x0d && thisChar
== 0x0a) {
3312 if (prevChar
== 0x0d ||
3318 // LB 6 Don't break before hard line breaks
3319 if (thisChar
== 0x0d || thisChar
== 0x0a || thisChar
== 0x85 ||
3320 fBK
->contains(thisChar
)) {
3325 // LB 7 Don't break before spaces or zero-width space.
3326 if (fSP
->contains(thisChar
)) {
3330 if (fZW
->contains(thisChar
)) {
3334 // LB 8 Break after zero width space
3335 if (fZW
->contains(prevChar
)) {
3339 // LB 8a ZWJ x (ID | ExtendedPict | Emoji)
3340 // The monkey test's way of ignoring combining characters doesn't work
3341 // for this rule. ZJ is also a CM. Need to get the actual character
3342 // preceding "thisChar", not ignoring combining marks, possibly ZJ.
3344 int32_t prevIdx
= fText
->moveIndex32(pos
, -1);
3345 UChar32 prevC
= fText
->char32At(prevIdx
);
3346 if (fZJ
->contains(prevC
) && (fID
->contains(thisChar
) || fExtendedPict
->contains(thisChar
) || fEmojiNRK
->contains(thisChar
))) {
3351 // LB 9, 10 Already done, at top of loop.
3355 // LB 11 Do not break before or after WORD JOINER and related characters.
3359 if (fWJ
->contains(thisChar
) || fWJ
->contains(prevChar
)) {
3365 if (fGL
->contains(prevChar
)) {
3371 if (!(fSP
->contains(prevChar
) ||
3372 fBA
->contains(prevChar
) ||
3373 fHY
->contains(prevChar
) ) && fGL
->contains(thisChar
)) {
3379 // LB 13 Don't break before closings.
3380 // NU x CL, NU x CP and NU x IS are not matched here so that they will
3381 // fall into LB 17 and the more general number regular expression.
3383 if ((!fNU
->contains(prevChar
) && fCL
->contains(thisChar
)) ||
3384 (!fNU
->contains(prevChar
) && fCP
->contains(thisChar
)) ||
3385 fEX
->contains(thisChar
) ||
3386 (!fNU
->contains(prevChar
) && fIS
->contains(thisChar
)) ||
3387 (!fNU
->contains(prevChar
) && fSY
->contains(thisChar
))) {
3391 // LB 14 Don't break after OP SP*
3392 // Scan backwards, checking for this sequence.
3393 // The OP char could include combining marks, so we actually check for
3395 // Another Twist: The Rule 67 fixes may have changed a SP CM
3396 // sequence into a ID char, so before scanning back through spaces,
3397 // verify that prevChar is indeed a space. The prevChar variable
3398 // may differ from fText[prevPos]
3400 if (fSP
->contains(prevChar
)) {
3401 while (tPos
> 0 && fSP
->contains(fText
->char32At(tPos
))) {
3402 tPos
=fText
->moveIndex32(tPos
, -1);
3405 while (tPos
> 0 && fCM
->contains(fText
->char32At(tPos
))) {
3406 tPos
=fText
->moveIndex32(tPos
, -1);
3408 if (fOP
->contains(fText
->char32At(tPos
))) {
3413 // LB 15 QU SP* x OP
3414 if (fOP
->contains(thisChar
)) {
3415 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3417 while (tPos
>0 && fSP
->contains(fText
->char32At(tPos
))) {
3418 tPos
= fText
->moveIndex32(tPos
, -1);
3420 while (tPos
>0 && fCM
->contains(fText
->char32At(tPos
))) {
3421 tPos
= fText
->moveIndex32(tPos
, -1);
3423 if (fQU
->contains(fText
->char32At(tPos
))) {
3430 // LB 16 (CL | CP) SP* x NS
3431 // Scan backwards for SP* CM* (CL | CP)
3432 if (fNS
->contains(thisChar
)) {
3434 while (tPos
>0 && fSP
->contains(fText
->char32At(tPos
))) {
3435 tPos
= fText
->moveIndex32(tPos
, -1);
3437 while (tPos
>0 && fCM
->contains(fText
->char32At(tPos
))) {
3438 tPos
= fText
->moveIndex32(tPos
, -1);
3440 if (fCL
->contains(fText
->char32At(tPos
)) || fCP
->contains(fText
->char32At(tPos
))) {
3446 // LB 17 B2 SP* x B2
3447 if (fB2
->contains(thisChar
)) {
3448 // Scan backwards, checking for the B2 CM* SP* sequence.
3450 if (fSP
->contains(prevChar
)) {
3451 while (tPos
> 0 && fSP
->contains(fText
->char32At(tPos
))) {
3452 tPos
=fText
->moveIndex32(tPos
, -1);
3455 while (tPos
> 0 && fCM
->contains(fText
->char32At(tPos
))) {
3456 tPos
=fText
->moveIndex32(tPos
, -1);
3458 if (fB2
->contains(fText
->char32At(tPos
))) {
3464 // LB 18 break after space
3465 if (fSP
->contains(prevChar
)) {
3472 if (fQU
->contains(thisChar
) || fQU
->contains(prevChar
)) {
3476 // LB 20 Break around a CB
3477 if (fCB
->contains(thisChar
) || fCB
->contains(prevChar
)) {
3482 if (fBA
->contains(thisChar
) ||
3483 fHY
->contains(thisChar
) ||
3484 fNS
->contains(thisChar
) ||
3485 fBB
->contains(prevChar
) ) {
3491 if (fHL
->contains(prevCharX2
) &&
3492 (fHY
->contains(prevChar
) || fBA
->contains(prevChar
))) {
3498 if (fSY
->contains(prevChar
) && fHL
->contains(thisChar
)) {
3503 if ((fAL
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3504 (fEX
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3505 (fHL
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3506 ((fID
->contains(prevChar
) || fEB
->contains(prevChar
) || fEM
->contains(prevChar
)) && fIN
->contains(thisChar
)) ||
3507 (fIN
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3508 (fNU
->contains(prevChar
) && fIN
->contains(thisChar
)) ) {
3513 // LB 23 (AL | HL) x NU
3515 if ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
)) && fNU
->contains(thisChar
)) {
3518 if (fNU
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) {
3522 // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3523 // PR x (ID | EB | EM)
3524 // (ID | EB | EM) x PO
3525 if (fPR
->contains(prevChar
) &&
3526 (fID
->contains(thisChar
) || fEB
->contains(thisChar
) || fEM
->contains(thisChar
))) {
3529 if ((fID
->contains(prevChar
) || fEB
->contains(prevChar
) || fEM
->contains(prevChar
)) &&
3530 fPO
->contains(thisChar
)) {
3534 // LB 24 Do not break between prefix and letters or ideographs.
3535 // (PR | PO) x (AL | HL)
3536 // (AL | HL) x (PR | PO)
3537 if ((fPR
->contains(prevChar
) || fPO
->contains(prevChar
)) &&
3538 (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) {
3541 if ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
)) &&
3542 (fPR
->contains(thisChar
) || fPO
->contains(thisChar
))) {
// Number handling: try the number regular expression (fNumberMatcher)
// anchored at prevPos.
3549 if (fNumberMatcher
->lookingAt(prevPos
, status
)) {
3550 if (U_FAILURE(status
)) {
3553 // Matched a number. But could have been just a single digit, which would
3554 // not represent a "no break here" between prevChar and thisChar
3555 int32_t numEndIdx
= fNumberMatcher
->end(status
); // idx of first char following num
3556 if (numEndIdx
> pos
) {
3557 // Number match includes at least our two chars being checked
3558 if (numEndIdx
> nextPos
) {
3559 // Number match includes additional chars. Update pos and nextPos
3560 // so that next loop iteration will continue at the end of the number,
3561 // checking for breaks between last char in number & whatever follows.
3562 pos
= nextPos
= numEndIdx
;
3564 pos
= fText
->moveIndex32(pos
, -1);
3565 thisChar
= fText
->char32At(pos
);
3566 } while (fCM
->contains(thisChar
));
3573 // LB 26 Do not break a Korean syllable.
3574 if (fJL
->contains(prevChar
) && (fJL
->contains(thisChar
) ||
3575 fJV
->contains(thisChar
) ||
3576 fH2
->contains(thisChar
) ||
3577 fH3
->contains(thisChar
))) {
3581 if ((fJV
->contains(prevChar
) || fH2
->contains(prevChar
)) &&
3582 (fJV
->contains(thisChar
) || fJT
->contains(thisChar
))) {
3586 if ((fJT
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3587 fJT
->contains(thisChar
)) {
3591 // LB 27 Treat a Korean Syllable Block the same as ID.
3592 if ((fJL
->contains(prevChar
) || fJV
->contains(prevChar
) ||
3593 fJT
->contains(prevChar
) || fH2
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3594 fIN
->contains(thisChar
)) {
3597 if ((fJL
->contains(prevChar
) || fJV
->contains(prevChar
) ||
3598 fJT
->contains(prevChar
) || fH2
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3599 fPO
->contains(thisChar
)) {
3602 if (fPR
->contains(prevChar
) && (fJL
->contains(thisChar
) || fJV
->contains(thisChar
) ||
3603 fJT
->contains(thisChar
) || fH2
->contains(thisChar
) || fH3
->contains(thisChar
))) {
3609 // LB 28 Do not break between alphabetics ("at").
3610 if ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
)) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) {
3614 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
3615 if (fIS
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) {
3619 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3622 if ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
) || fNU
->contains(prevChar
)) && fOP
->contains(thisChar
)) {
3625 if (fCP
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
) || fNU
->contains(thisChar
))) {
3629 // LB30a RI RI <break> RI
3631 if (fRI
->contains(prevCharX2
) && fRI
->contains(prevChar
) && fRI
->contains(thisChar
)) {
3634 if (fRI
->contains(prevChar
) && fRI
->contains(thisChar
)) {
3638 // LB30b Emoji Base x Emoji Modifier
3639 if (fEB
->contains(prevChar
) && fEM
->contains(thisChar
)) {
3643 // LB 31 Break everywhere else
// Accessor returning a UVector pointer of character classes.
// NOTE(review): the body is not visible in this excerpt; presumably it returns
// fSets, the vector the constructor fills with the line-break UnicodeSets -
// confirm against the full file.
3652 UVector
*RBBILineMonkey::charClasses() {
// Destructor: releases heap objects owned by this monkey. The statements
// visible here delete fExtendedPict and fNumberMatcher.
3657 RBBILineMonkey::~RBBILineMonkey() {
3702 delete fExtendedPict
;
3706 delete fNumberMatcher
;
3710 //-------------------------------------------------------------------------------------------
3715 // seed=nnnnn Random number starting seed.
3716 // Setting the seed allows errors to be reproduced.
3717 // loop=nnn Looping count. Controls running time.
3719 // 0 or greater: run length.
3721 // type = char | word | line | sent | title
3724 // intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3726 //-------------------------------------------------------------------------------------------
// Extracts an integer option of the form "name = value" from a parameter string.
//   name       - option name; the pattern " *= *(-?\d+)" is appended to it to
//                form the regular expression that is matched against params.
//   params     - in/out option string. The matched option text is removed from
//                it (replaceFirst with an empty string) once it has been read.
//   defaultVal - initial value of the result, used when nothing overwrites it.
// The digits of capture group 1 are copied into a fixed 100-byte buffer and
// converted with strtol (base 10).
// NOTE(review): the return statement is not visible in this excerpt;
// presumably `val` is returned - confirm against the full file.
3728 static int32_t getIntParam(UnicodeString name
, UnicodeString
&params
, int32_t defaultVal
) {
3729 int32_t val
= defaultVal
;
3730 name
.append(" *= *(-?\\d+)");
3731 UErrorCode status
= U_ZERO_ERROR
;
3732 RegexMatcher
m(name
, params
, 0, status
);
3734 // The param exists. Convert the string to an int.
3735 char valString
[100];
3736 int32_t paramLength
= m
.end(1, status
) - m
.start(1, status
);
// Clamp the copied length so the digits plus a terminator fit in valString.
3737 if (paramLength
>= (int32_t)(sizeof(valString
)-1)) {
3738 paramLength
= (int32_t)(sizeof(valString
)-2);
3740 params
.extract(m
.start(1, status
), paramLength
, valString
, sizeof(valString
));
3741 val
= strtol(valString
, NULL
, 10);
3743 // Delete this parameter from the params string.
3745 params
= m
.replaceFirst("", status
);
3747 U_ASSERT(U_SUCCESS(status
));
3752 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
// Cross-checks a break iterator (bi) against a list of expected boundaries
// for the string ustr, reporting failures through test->errln(). The visible
// checks are, in order:
//   1. forward iteration with first()/next() against expected[];
//   2. isBoundary() is true at each expected boundary and false at every
//      index strictly between two consecutive expected boundaries;
//   3. backward iteration with last()/previous() against forward[];
//   4. preceding(j) returns expected[i] for every j after expected[i] up to
//      and including expected[i + 1].
// NOTE(review): the remainder of the parameter list and the local
// declarations (i, count, forward) are not visible in this excerpt.
3753 static void testBreakBoundPreceding(RBBITest
*test
, UnicodeString ustr
,
3762 for (i
= bi
->first(); i
!= BreakIterator::DONE
; i
= bi
->next()) {
3764 if (count
< expectedcount
&& expected
[count
] != i
) {
3765 test
->errln("break forward test failed: expected %d but got %d",
3766 expected
[count
], i
);
3771 if (count
!= expectedcount
) {
3772 printStringBreaks(ustr
, expected
, expectedcount
);
3773 test
->errln("break forward test failed: missed %d match",
3774 expectedcount
- count
);
3777 // testing boundaries
3778 for (i
= 1; i
< expectedcount
; i
++) {
3779 int j
= expected
[i
- 1];
3780 if (!bi
->isBoundary(j
)) {
3781 printStringBreaks(ustr
, expected
, expectedcount
);
3782 test
->errln("isBoundary() failed. Expected boundary at position %d", j
);
// No index strictly between two expected boundaries may be a boundary.
3785 for (j
= expected
[i
- 1] + 1; j
< expected
[i
]; j
++) {
3786 if (bi
->isBoundary(j
)) {
3787 printStringBreaks(ustr
, expected
, expectedcount
);
3788 test
->errln("isBoundary() failed. Not expecting boundary at position %d", j
);
3794 for (i
= bi
->last(); i
!= BreakIterator::DONE
; i
= bi
->previous()) {
3796 if (forward
[count
] != i
) {
3797 printStringBreaks(ustr
, expected
, expectedcount
);
3798 test
->errln("happy break test previous() failed: expected %d but got %d",
3804 printStringBreaks(ustr
, expected
, expectedcount
);
3805 test
->errln("break test previous() failed: missed a match");
3809 // testing preceding
3810 for (i
= 0; i
< expectedcount
- 1; i
++) {
3811 // int j = expected[i] + 1;
// Start at the code point after expected[i], so j does not begin inside a
// supplementary character that starts at the boundary.
3812 int j
= ustr
.moveIndex32(expected
[i
], 1);
3813 for (; j
<= expected
[i
+ 1]; j
++) {
3814 if (bi
->preceding(j
) != expected
[i
]) {
3815 printStringBreaks(ustr
, expected
, expectedcount
);
3816 test
->errln("preceding(): Not expecting boundary at position %d", j
);
// Checks the "en" word break iterator against the RBBIWordMonkey reference
// implementation on a fixed list of escaped test strings. For each string the
// monkey's break positions are collected into expected[] and handed, together
// with the iterator, to testBreakBoundPreceding().
// NOTE(review): the declaration (and therefore the capacity) of expected[] is
// not visible in this excerpt, and the collection loop below has no visible
// bounds check - confirm against the full file.
3824 void RBBITest::TestWordBreaks(void)
3826 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3828 Locale
locale("en");
3829 UErrorCode status
= U_ZERO_ERROR
;
3830 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3831 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
3832 // Replaced any C+J characters in a row with a random sequence of characters
3833 // of the same length to make our C+J segmentation not get in the way.
3834 static const char *strlist
[] =
3836 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3837 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3838 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3839 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3840 "\\uac00\\u3588\\u009c\\u0953\\u194b",
3841 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3842 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3843 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3844 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3845 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3846 "\\u2027\\U000e0067\\u0a47\\u00b7",
3847 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3848 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3849 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3850 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3851 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3852 "\\u0027\\u11af\\U000e0057\\u0602",
3853 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3854 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3855 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3856 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3857 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3858 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3859 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3860 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3861 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3862 "\\u18f4\\U000e0049\\u20e7\\u2027",
3863 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3864 "\\ua183\\u102d\\u0bec\\u003a",
3865 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3866 "\\u003a\\u0e57\\u0fad\\u002e",
3867 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3868 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3869 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3870 "\\u003a\\u0664\\u00b7\\u1fba",
3871 "\\u003b\\u0027\\u00b7\\u47a3",
3872 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3873 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3874 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3877 if (U_FAILURE(status
)) {
3878 errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
));
3881 for (loop
= 0; loop
< UPRV_LENGTHOF(strlist
); loop
++) {
3882 // printf("looping %d\n", loop);
3883 UnicodeString ustr
= CharsToUnicodeString(strlist
[loop
]);
3884 // RBBICharMonkey monkey;
3885 RBBIWordMonkey monkey
;
3888 int expectedcount
= 0;
3890 monkey
.setText(ustr
);
// Collect the reference break positions until the monkey reports DONE.
3892 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
3893 expected
[expectedcount
++] = i
;
3896 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
// Self-consistency test for the "en" word break iterator on a fixed list of
// escaped strings. For each string it iterates forward with first()/next(),
// records each boundary in forward[], and checks that isBoundary() is false
// for every index between the previous boundary and the current one, and true
// at the current boundary itself.
3902 void RBBITest::TestWordBoundary(void)
3904 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3905 Locale
locale("en");
3906 UErrorCode status
= U_ZERO_ERROR
;
3907 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3908 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
3910 static const char *strlist
[] =
3912 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3913 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3914 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3915 "\\u2027\\U000e0067\\u0a47\\u00b7",
3916 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3917 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3918 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3919 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3920 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3921 "\\u0027\\u11af\\U000e0057\\u0602",
3922 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3923 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3924 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3925 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3926 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3927 "\\U000e0065\\u302c\\u09ee\\U000e0068",
3928 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3929 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3930 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3931 "\\u58f4\\U000e0049\\u20e7\\u2027",
3932 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3933 "\\ua183\\u102d\\u0bec\\u003a",
3934 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3935 "\\u003a\\u0e57\\u0fad\\u002e",
3936 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3937 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3938 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3939 "\\u003a\\u0664\\u00b7\\u1fba",
3940 "\\u003b\\u0027\\u00b7\\u47a3",
3943 if (U_FAILURE(status
)) {
3944 errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
));
3947 for (loop
= 0; loop
< UPRV_LENGTHOF(strlist
); loop
++) {
3948 // printf("looping %d\n", loop);
// NOTE(review): the destination capacity is hard-coded as 20 here; the
// declaration of str is not visible in this excerpt - confirm the two agree
// and that the unescaped text ends up NUL-terminated for the UnicodeString
// constructor below.
3949 u_unescape(strlist
[loop
], str
, 20);
3950 UnicodeString
ustr(str
);
3957 for (i
= bi
->first(); i
!= BreakIterator::DONE
; i
= bi
->next()) {
3958 forward
[count
++] = i
;
// Positions between the previous boundary and this one must not be boundaries.
3961 for (j
= prev
+ 1; j
< i
; j
++) {
3962 if (bi
->isBoundary(j
)) {
3963 printStringBreaks(ustr
, forward
, count
);
3964 errln("happy boundary test failed: expected %d not a boundary",
3970 if (!bi
->isBoundary(i
)) {
3971 printStringBreaks(ustr
, forward
, count
);
3972 errln("happy boundary test failed: expected %d a boundary",
// Checks the "en" line break iterator against the RBBILineMonkey reference
// implementation on a fixed list of escaped test strings. Each string is
// unescaped into str (capacity STRSIZE = 50), the monkey's break positions are
// collected into expected[] (capacity EXPECTEDSIZE = 50, guarded by a
// TEST_ASSERT), and the result is handed to testBreakBoundPreceding().
3982 void RBBITest::TestLineBreaks(void)
3984 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3985 Locale
locale("en");
3986 UErrorCode status
= U_ZERO_ERROR
;
3987 BreakIterator
*bi
= BreakIterator::createLineInstance(locale
, status
);
3988 const int32_t STRSIZE
= 50;
3990 static const char *strlist
[] =
3992 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3993 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3994 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3995 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3996 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3997 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3998 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3999 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
4000 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
4001 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
4002 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
4003 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
4004 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
4005 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
4006 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
4007 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
4008 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
4009 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
4010 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
4011 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
4012 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
4013 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
4014 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
4015 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
4016 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
4017 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
4018 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
4019 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
4020 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
4021 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
4022 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
4023 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
4024 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
4025 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
4026 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
4027 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
4028 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
4029 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
4030 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
4031 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
4034 TEST_ASSERT_SUCCESS(status
);
4035 if (U_FAILURE(status
)) {
4038 for (loop
= 0; loop
< UPRV_LENGTHOF(strlist
); loop
++) {
4039 // printf("looping %d\n", loop);
4040 int32_t t
= u_unescape(strlist
[loop
], str
, STRSIZE
);
4047 UnicodeString
ustr(str
);
4048 RBBILineMonkey monkey
;
// Monkey construction errors are recorded in deferredStatus and checked here.
4049 if (U_FAILURE(monkey
.deferredStatus
)) {
4053 const int EXPECTEDSIZE
= 50;
4054 int expected
[EXPECTEDSIZE
];
4055 int expectedcount
= 0;
4057 monkey
.setText(ustr
);
4059 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
// Guard against overrunning expected[]; the TEST_ASSERT reports the failure.
4060 if (expectedcount
>= EXPECTEDSIZE
) {
4061 TEST_ASSERT(expectedcount
< EXPECTEDSIZE
);
4064 expected
[expectedcount
++] = i
;
4067 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
// Checks the "en" sentence break iterator against the RBBISentMonkey
// reference implementation on a fixed list of test strings. Each string is
// unescaped into str, the monkey's break positions are collected into
// expected[] (capacity EXPECTEDSIZE = 50, guarded by a TEST_ASSERT), and the
// result is handed to testBreakBoundPreceding().
4073 void RBBITest::TestSentBreaks(void)
4075 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4076 Locale
locale("en");
4077 UErrorCode status
= U_ZERO_ERROR
;
4078 BreakIterator
*bi
= BreakIterator::createSentenceInstance(locale
, status
);
4080 static const char *strlist
[] =
4082 "Now\ris\nthe\r\ntime\n\rfor\r\r",
4084 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
4085 "\"Sentence ending with a quote.\" Bye.",
4086 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
4087 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
4088 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
4089 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
4090 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
4091 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
4092 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
4093 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
4094 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
4095 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
4096 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
4097 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
4098 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
4099 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
4100 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
4101 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
4104 if (U_FAILURE(status
)) {
4105 errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
));
4108 for (loop
= 0; loop
< UPRV_LENGTHOF(strlist
); loop
++) {
4109 u_unescape(strlist
[loop
], str
, UPRV_LENGTHOF(str
));
4110 UnicodeString
ustr(str
);
4112 RBBISentMonkey monkey
;
// Monkey construction errors are recorded in deferredStatus and checked here.
4113 if (U_FAILURE(monkey
.deferredStatus
)) {
4117 const int EXPECTEDSIZE
= 50;
4118 int expected
[EXPECTEDSIZE
];
4119 int expectedcount
= 0;
4121 monkey
.setText(ustr
);
4123 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
// Guard against overrunning expected[]; the TEST_ASSERT reports the failure.
4124 if (expectedcount
>= EXPECTEDSIZE
) {
4125 TEST_ASSERT(expectedcount
< EXPECTEDSIZE
);
4128 expected
[expectedcount
++] = i
;
4131 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
// Entry point for the randomized ("monkey") break iterator tests.
// Reads optional settings from fTestParams:
//   loop=nnn   iteration count (parsed with getIntParam, default loopCount)
//   seed=nnn   random seed (parsed with getIntParam)
//   type=char|word|line|sent|title   restricts the run to one break type
//   utext      selects the UText code path (useUText)
// Each recognised option is stripped from the option string; anything left
// over that is not whitespace is reported as an unrecognized parameter.
// It then creates the "en" break iterator for each selected type and calls
// RunMonkey() with it.
// Note: loopCount is reduced in place by the line branch (/5) and again by the
// sentence branch (/10), so with type "all" the sentence run uses the value
// already reduced by the line branch.
4137 void RBBITest::TestMonkey() {
4138 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4140 UErrorCode status
= U_ZERO_ERROR
;
4141 int32_t loopCount
= 500;
4143 UnicodeString breakType
= "all";
4144 Locale
locale("en");
4145 UBool useUText
= FALSE
;
4147 if (quick
== FALSE
) {
4152 UnicodeString
p(fTestParams
);
4153 loopCount
= getIntParam("loop", p
, loopCount
);
4154 seed
= getIntParam("seed", p
, seed
);
4156 RegexMatcher
m(" *type *= *(char|word|line|sent|title) *", p
, 0, status
);
4158 breakType
= m
.group(1, status
);
4160 p
= m
.replaceFirst("", status
);
4163 RegexMatcher
u(" *utext", p
, 0, status
);
4167 p
= u
.replaceFirst("", status
);
4172 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p
, 0, status
).find()) {
4173 // Each option is stripped out of the option string as it is processed.
4174 // All options have been checked. The option string should have been completely emptied.
4176 p
.extract(buf
, sizeof(buf
), NULL
, status
);
// Force NUL termination in case extract() filled the whole buffer.
4177 buf
[sizeof(buf
)-1] = 0;
4178 errln("Unrecognized or extra parameter: %s\n", buf
);
4184 if (breakType
== "char" || breakType
== "all") {
4186 BreakIterator
*bi
= BreakIterator::createCharacterInstance(locale
, status
);
4187 if (U_SUCCESS(status
)) {
4188 RunMonkey(bi
, m
, "char", seed
, loopCount
, useUText
);
4189 if (breakType
== "all" && useUText
==FALSE
) {
4190 // Also run a quick test with UText when "all" is specified
4191 RunMonkey(bi
, m
, "char", seed
, loopCount
, TRUE
);
4195 errcheckln(status
, "Creation of character break iterator failed %s", u_errorName(status
));
4200 if (breakType
== "word" || breakType
== "all") {
4201 logln("Word Break Monkey Test");
4203 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
4204 if (U_SUCCESS(status
)) {
4205 RunMonkey(bi
, m
, "word", seed
, loopCount
, useUText
);
4208 errcheckln(status
, "Creation of word break iterator failed %s", u_errorName(status
));
4213 if (breakType
== "line" || breakType
== "all") {
4214 logln("Line Break Monkey Test");
4216 BreakIterator
*bi
= BreakIterator::createLineInstance(locale
, status
);
4217 if (loopCount
>= 10) {
4218 loopCount
= loopCount
/ 5; // Line break runs slower than the others.
4220 if (U_SUCCESS(status
)) {
4221 RunMonkey(bi
, m
, "line", seed
, loopCount
, useUText
);
4224 errcheckln(status
, "Creation of line break iterator failed %s", u_errorName(status
));
4229 if (breakType
== "sent" || breakType
== "all" ) {
4230 logln("Sentence Break Monkey Test");
4232 BreakIterator
*bi
= BreakIterator::createSentenceInstance(locale
, status
);
4233 if (loopCount
>= 10) {
4234 loopCount
= loopCount
/ 10; // Sentence runs slower than the other break types
4236 if (U_SUCCESS(status
)) {
4237 RunMonkey(bi
, m
, "sentence", seed
, loopCount
, useUText
);
// Report the failure against the iterator type actually being created here.
4240 errcheckln(status
, "Creation of sentence break iterator failed %s", u_errorName(status
));
4249 // Run a RBBI monkey test. Common routine, for all break iterator types.
4251 // bi - the break iterator to use
4252 // mk - MonkeyKind, abstraction for obtaining expected results
4253 // name - Name of test (char, word, etc.) for use in error messages
4254 // seed - Seed for starting random number generator (parameter from user)
4257 void RBBITest::RunMonkey(BreakIterator
*bi
, RBBIMonkeyKind
&mk
, const char *name
, uint32_t seed
,
4258 int32_t numIterations
, UBool useUText
) {
4260 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4262 const int32_t TESTSTRINGLEN
= 500;
4263 UnicodeString testText
;
4264 int32_t numCharClasses
;
4266 int expected
[TESTSTRINGLEN
*2 + 1];
4267 int expectedCount
= 0;
4268 char expectedBreaks
[TESTSTRINGLEN
*2 + 1];
4269 char forwardBreaks
[TESTSTRINGLEN
*2 + 1];
4270 char reverseBreaks
[TESTSTRINGLEN
*2+1];
4271 char isBoundaryBreaks
[TESTSTRINGLEN
*2+1];
4272 char followingBreaks
[TESTSTRINGLEN
*2+1];
4273 char precedingBreaks
[TESTSTRINGLEN
*2+1];
4279 numCharClasses
= mk
.charClasses()->size();
4280 chClasses
= mk
.charClasses();
4282 // Check for errors that occured during the construction of the MonkeyKind object.
4283 // Can't report them where they occured because errln() is a method coming from intlTest,
4284 // and is not visible outside of RBBITest :-(
4285 if (U_FAILURE(mk
.deferredStatus
)) {
4286 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk
.deferredStatus
));
4290 // Verify that the character classes all have at least one member.
4291 for (i
=0; i
<numCharClasses
; i
++) {
4292 UnicodeSet
*s
= (UnicodeSet
*)chClasses
->elementAt(i
);
4293 if (s
== NULL
|| s
->size() == 0) {
4294 errln("Character Class #%d is null or of zero size.", i
);
4299 while (loopCount
< numIterations
|| numIterations
== -1) {
4300 if (numIterations
== -1 && loopCount
% 10 == 0) {
4301 // If test is running in an infinite loop, display a periodic tic so
4302 // we can tell that it is making progress.
4303 fprintf(stderr
, ".");
4305 // Save current random number seed, so that we can recreate the random numbers
4306 // for this loop iteration in event of an error.
4309 // Populate a test string with data.
4310 testText
.truncate(0);
4311 for (i
=0; i
<TESTSTRINGLEN
; i
++) {
4312 int32_t aClassNum
= m_rand() % numCharClasses
;
4313 UnicodeSet
*classSet
= (UnicodeSet
*)chClasses
->elementAt(aClassNum
);
4314 int32_t charIdx
= m_rand() % classSet
->size();
4315 UChar32 c
= classSet
->charAt(charIdx
);
4316 if (c
< 0) { // TODO: deal with sets containing strings.
4317 errln("%s:%d c < 0", __FILE__
, __LINE__
);
4320 // Do not assemble a supplementary character from randomly generated separate surrogates.
4321 // (It could be a dictionary character)
4322 if (U16_IS_TRAIL(c
) && testText
.length() > 0 && U16_IS_LEAD(testText
.charAt(testText
.length()-1))) {
4329 // Calculate the expected results for this test string.
4330 mk
.setText(testText
);
4331 memset(expectedBreaks
, 0, sizeof(expectedBreaks
));
4332 expectedBreaks
[0] = 1;
4333 int32_t breakPos
= 0;
4336 breakPos
= mk
.next(breakPos
);
4337 if (breakPos
== -1) {
4340 if (breakPos
> testText
.length()) {
4341 errln("breakPos > testText.length()");
4343 expectedBreaks
[breakPos
] = 1;
4344 U_ASSERT(expectedCount
<testText
.length());
4345 expected
[expectedCount
++] = breakPos
;
4346 (void)expected
; // Set but not used warning.
4347 // TODO (andy): check it out.
4350 // Find the break positions using forward iteration
4351 memset(forwardBreaks
, 0, sizeof(forwardBreaks
));
4353 UErrorCode status
= U_ZERO_ERROR
;
4354 UText
*testUText
= utext_openReplaceable(NULL
, &testText
, &status
);
4355 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4356 bi
->setText(testUText
, status
);
4357 TEST_ASSERT_SUCCESS(status
);
4358 utext_close(testUText
); // The break iterator does a shallow clone of the UText
4359 // This UText can be closed immediately, so long as the
4360 // testText string continues to exist.
4362 bi
->setText(testText
);
4365 for (i
=bi
->first(); i
!= BreakIterator::DONE
; i
=bi
->next()) {
4366 if (i
< 0 || i
> testText
.length()) {
4367 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name
);
4370 forwardBreaks
[i
] = 1;
4373 // Find the break positions using reverse iteration
4374 memset(reverseBreaks
, 0, sizeof(reverseBreaks
));
4375 for (i
=bi
->last(); i
!= BreakIterator::DONE
; i
=bi
->previous()) {
4376 if (i
< 0 || i
> testText
.length()) {
4377 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name
);
4380 reverseBreaks
[i
] = 1;
4383 // Find the break positions using isBoundary() tests.
4384 memset(isBoundaryBreaks
, 0, sizeof(isBoundaryBreaks
));
4385 U_ASSERT((int32_t)sizeof(isBoundaryBreaks
) > testText
.length());
4386 for (i
=0; i
<=testText
.length(); i
++) {
4387 isBoundaryBreaks
[i
] = bi
->isBoundary(i
);
4391 // Find the break positions using the following() function.
4393 memset(followingBreaks
, 0, sizeof(followingBreaks
));
4394 int32_t lastBreakPos
= 0;
4395 followingBreaks
[0] = 1;
4396 for (i
=0; i
<testText
.length(); i
++) {
4397 breakPos
= bi
->following(i
);
4398 if (breakPos
<= i
||
4399 breakPos
< lastBreakPos
||
4400 breakPos
> testText
.length() ||
4401 (breakPos
> lastBreakPos
&& lastBreakPos
> i
)) {
4402 errln("%s break monkey test: "
4403 "Out of range value returned by BreakIterator::following().\n"
4404 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4405 name
, seed
, i
, breakPos
, lastBreakPos
);
4408 followingBreaks
[breakPos
] = 1;
4409 lastBreakPos
= breakPos
;
4412 // Find the break positions using the preceding() function.
4413 memset(precedingBreaks
, 0, sizeof(precedingBreaks
));
4414 lastBreakPos
= testText
.length();
4415 precedingBreaks
[testText
.length()] = 1;
4416 for (i
=testText
.length(); i
>0; i
--) {
4417 breakPos
= bi
->preceding(i
);
4418 if (breakPos
>= i
||
4419 breakPos
> lastBreakPos
||
4420 (breakPos
< 0 && testText
.getChar32Start(i
)>0) ||
4421 (breakPos
< lastBreakPos
&& lastBreakPos
< testText
.getChar32Start(i
)) ) {
4422 errln("%s break monkey test: "
4423 "Out of range value returned by BreakIterator::preceding().\n"
4424 "index=%d; prev returned %d; lastBreak=%d" ,
4425 name
, i
, breakPos
, lastBreakPos
);
4426 if (breakPos
>= 0 && breakPos
< (int32_t)sizeof(precedingBreaks
)) {
4427 precedingBreaks
[i
] = 2; // Forces an error.
4430 if (breakPos
>= 0) {
4431 precedingBreaks
[breakPos
] = 1;
4433 lastBreakPos
= breakPos
;
4437 // Compare the expected and actual results.
4438 for (i
=0; i
<=testText
.length(); i
++) {
4439 const char *errorType
= NULL
;
4440 if (forwardBreaks
[i
] != expectedBreaks
[i
]) {
4441 errorType
= "next()";
4442 } else if (reverseBreaks
[i
] != forwardBreaks
[i
]) {
4443 errorType
= "previous()";
4444 } else if (isBoundaryBreaks
[i
] != expectedBreaks
[i
]) {
4445 errorType
= "isBoundary()";
4446 } else if (followingBreaks
[i
] != expectedBreaks
[i
]) {
4447 errorType
= "following()";
4448 } else if (precedingBreaks
[i
] != expectedBreaks
[i
]) {
4449 errorType
= "preceding()";
4453 if (errorType
!= NULL
) {
4454 // Format a range of the test text that includes the failure as
4455 // a data item that can be included in the rbbi test data file.
4457 // Start of the range is the last point where expected and actual results
4458 // both agreed that there was a break position.
4459 int startContext
= i
;
4462 if (startContext
==0) { break; }
4464 if (expectedBreaks
[startContext
] != 0) {
4465 if (count
== 2) break;
4470 // End of range is two expected breaks past the start position.
4471 int endContext
= i
+ 1;
4473 for (ci
=0; ci
<2; ci
++) { // Number of items to include in error text.
4475 if (endContext
>= testText
.length()) {break;}
4476 if (expectedBreaks
[endContext
-1] != 0) {
4477 if (count
== 0) break;
4484 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4485 UnicodeString errorText
= "<data>";
4486 /***if (strcmp(errorType, "next()") == 0) {
4488 endContext = testText.length();
4490 printStringBreaks(testText, expected, expectedCount);
4493 for (ci
=startContext
; ci
<endContext
;) {
4494 UnicodeString
hexChars("0123456789abcdef");
4497 c
= testText
.char32At(ci
);
4499 // This is the location of the error.
4500 errorText
.append("<?>");
4501 } else if (expectedBreaks
[ci
] != 0) {
4502 // This a non-error expected break position.
4503 errorText
.append("\\");
4506 errorText
.append("\\u");
4507 for (bn
=12; bn
>=0; bn
-=4) {
4508 errorText
.append(hexChars
.charAt((c
>>bn
)&0xf));
4511 errorText
.append("\\U");
4512 for (bn
=28; bn
>=0; bn
-=4) {
4513 errorText
.append(hexChars
.charAt((c
>>bn
)&0xf));
4516 ci
= testText
.moveIndex32(ci
, 1);
4518 errorText
.append("\\");
4519 errorText
.append("</data>\n");
4522 char charErrorTxt
[500];
4523 UErrorCode status
= U_ZERO_ERROR
;
4524 errorText
.extract(charErrorTxt
, sizeof(charErrorTxt
), NULL
, status
);
4525 charErrorTxt
[sizeof(charErrorTxt
)-1] = 0;
4526 const char *badLocale
= bi
->getLocaleID(ULOC_ACTUAL_LOCALE
, status
);
4528 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
4529 name
, badLocale
, (expectedBreaks
[i
]? "break expected but not found" : "break found but not expected"),
4530 errorType
, seed
, i
, charErrorTxt
);
4541 // Bug 5532. UTF-8 based UText fails in dictionary code.
4542 // This test checks the initial patch,
4543 // which is to just keep it from crashing. Correct word boundaries
4544 // await a proper fix to the dictionary code.
4546 void RBBITest::TestBug5532(void) {
4547 // Text includes a mixture of Thai and Latin.
4548 const unsigned char utf8Data
[] = {
4549 0xE0u
, 0xB8u
, 0x82u
, 0xE0u
, 0xB8u
, 0xB2u
, 0xE0u
, 0xB8u
, 0xA2u
, 0xE0u
,
4550 0xB9u
, 0x80u
, 0xE0u
, 0xB8u
, 0x84u
, 0xE0u
, 0xB8u
, 0xA3u
, 0xE0u
, 0xB8u
,
4551 0xB7u
, 0xE0u
, 0xB9u
, 0x88u
, 0xE0u
, 0xB8u
, 0xADu
, 0xE0u
, 0xB8u
, 0x87u
,
4552 0xE0u
, 0xB9u
, 0x80u
, 0xE0u
, 0xB8u
, 0xA5u
, 0xE0u
, 0xB9u
, 0x88u
, 0xE0u
,
4553 0xB8u
, 0x99u
, 0xE0u
, 0xB8u
, 0x8Bu
, 0xE0u
, 0xB8u
, 0xB5u
, 0xE0u
, 0xB8u
,
4554 0x94u
, 0xE0u
, 0xB8u
, 0xB5u
, 0x20u
, 0x73u
, 0x69u
, 0x6Du
, 0x20u
, 0x61u
,
4555 0x75u
, 0x64u
, 0x69u
, 0x6Fu
, 0x2Fu
, 0x20u
, 0x4Du
, 0x4Fu
, 0x4Fu
, 0x4Eu
,
4556 0x20u
, 0x65u
, 0x63u
, 0x6Cu
, 0x69u
, 0x70u
, 0x73u
, 0x65u
, 0x20u
, 0xE0u
,
4557 0xB8u
, 0xA3u
, 0xE0u
, 0xB8u
, 0xB2u
, 0xE0u
, 0xB8u
, 0x84u
, 0xE0u
, 0xB8u
,
4558 0xB2u
, 0x20u
, 0x34u
, 0x37u
, 0x30u
, 0x30u
, 0x20u
, 0xE0u
, 0xB8u
, 0xA2u
,
4559 0xE0u
, 0xB8u
, 0xB9u
, 0xE0u
, 0xB9u
, 0x82u
, 0xE0u
, 0xB8u
, 0xA3u
, 0x00};
4561 UErrorCode status
= U_ZERO_ERROR
;
4562 UText utext
=UTEXT_INITIALIZER
;
4563 utext_openUTF8(&utext
, (const char *)utf8Data
, -1, &status
);
4564 TEST_ASSERT_SUCCESS(status
);
4566 BreakIterator
*bi
= BreakIterator::createWordInstance(Locale("th"), status
);
4567 TEST_ASSERT_SUCCESS(status
);
4568 if (U_SUCCESS(status
)) {
4569 bi
->setText(&utext
, status
);
4570 TEST_ASSERT_SUCCESS(status
);
4572 int32_t breakCount
= 0;
4573 int32_t previousBreak
= -1;
4574 for (bi
->first(); bi
->next() != BreakIterator::DONE
; breakCount
++) {
4575 // For now, just make sure that the break iterator doesn't hang.
4576 TEST_ASSERT(previousBreak
< bi
->current());
4577 previousBreak
= bi
->current();
4579 TEST_ASSERT(breakCount
> 0);
4582 utext_close(&utext
);
// TestBug9983: walks a short text backwards with previous(), once with a word
//   break iterator for the root locale and once with one for en_US_POSIX,
//   reading getRuleStatus() at each boundary. Each pass asserts
//   iterationCount == 6.
//   NOTE(review): the `iterationCount >= 10` checks look like a guard against
//   reverse iteration that fails to terminate - confirm.
4586 void RBBITest::TestBug9983(void) {
4587 UnicodeString text
= UnicodeString("\\u002A" // * Other
4589 "\\u309C" // Katakana
4593 "\\u0000").unescape();
4595 UErrorCode status
= U_ZERO_ERROR
;
4596 LocalPointer
<RuleBasedBreakIterator
> brkiter(static_cast<RuleBasedBreakIterator
*>(
4597 BreakIterator::createWordInstance(Locale::getRoot(), status
)));
4598 TEST_ASSERT_SUCCESS(status
);
4599 LocalPointer
<RuleBasedBreakIterator
> brkiterPOSIX(static_cast<RuleBasedBreakIterator
*>(
4600 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status
)));
4601 TEST_ASSERT_SUCCESS(status
);
4602 if (U_FAILURE(status
)) {
4605 int32_t offset
, rstatus
, iterationCount
;
4607 brkiter
->setText(text
);
4610 while ( (offset
= brkiter
->previous()) != UBRK_DONE
) {
4612 rstatus
= brkiter
->getRuleStatus();
4613 (void)rstatus
; // Suppress set but not used warning.
4614 if (iterationCount
>= 10) {
4618 TEST_ASSERT(iterationCount
== 6);
4620 brkiterPOSIX
->setText(text
);
4621 brkiterPOSIX
->last();
4623 while ( (offset
= brkiterPOSIX
->previous()) != UBRK_DONE
) {
4625 rstatus
= brkiterPOSIX
->getRuleStatus();
4626 (void)rstatus
; // Suppress set but not used warning.
4627 if (iterationCount
>= 10) {
4631 TEST_ASSERT(iterationCount
== 6);
4634 // Bug 7547 - verify that building a break itereator from empty rules produces an error.
4636 void RBBITest::TestBug7547() {
4637 UnicodeString rules
;
4638 UErrorCode status
= U_ZERO_ERROR
;
4639 UParseError parseError
;
4640 RuleBasedBreakIterator
breakIterator(rules
, parseError
, status
);
4641 if (status
!= U_BRK_RULE_SYNTAX
) {
4642 errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__
, __LINE__
, u_errorName(status
));
4644 if (parseError
.line
!= 1 || parseError
.offset
!= 0) {
4645 errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError
.line
, parseError
.offset
);
4650 void RBBITest::TestBug12797() {
4651 UnicodeString rules
= "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4652 UErrorCode status
= U_ZERO_ERROR
;
4653 UParseError parseError
;
4654 RuleBasedBreakIterator
bi(rules
, parseError
, status
);
4655 if (U_FAILURE(status
)) {
4656 errln("%s:%s status = %s", __FILE__
, __LINE__
, u_errorName(status
));
4659 UnicodeString text
= "abc";
4662 int32_t boundary
= bi
.next();
4663 if (boundary
!= 3) {
4664 errln("%s:%d expected boundary==3, got %d", __FILE__
, __LINE__
, boundary
);
4668 void RBBITest::TestBug12918() {
4669 // This test triggers an assertion failure in dictbe.cpp
4670 const UChar
*crasherString
= u
"\u3325\u4a16";
4671 UErrorCode status
= U_ZERO_ERROR
;
4672 UBreakIterator
* iter
= ubrk_open(UBRK_WORD
, NULL
, crasherString
, -1, &status
);
4673 if (U_FAILURE(status
)) {
4674 dataerrln("%s:%d status = %s", __FILE__
, __LINE__
, u_errorName(status
));
4679 int32_t lastPos
= -1;
4680 while((pos
= ubrk_next(iter
)) != UBRK_DONE
) {
4681 if (pos
<= lastPos
) {
4682 errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__
, __LINE__
, pos
, lastPos
);
4689 void RBBITest::TestBug12932() {
4690 // Node Stack overflow in the RBBI rule parser caused a seg fault.
4691 UnicodeString
ruleStr(
4692 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4693 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4694 "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4695 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4696 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4697 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4699 UErrorCode status
= U_ZERO_ERROR
;
4700 UParseError parseError
;
4701 RuleBasedBreakIterator
rbbi(ruleStr
, parseError
, status
);
4702 if (status
!= U_BRK_RULE_SYNTAX
) {
4703 errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4704 __FILE__
, __LINE__
, u_errorName(status
));
4709 // Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
4710 // remain undevided by ICU char, word and line break.
4711 void RBBITest::TestEmoji() {
4712 UErrorCode status
= U_ZERO_ERROR
;
4714 CharString testFileName
;
4715 testFileName
.append(IntlTest::getSourceTestData(status
), status
);
4716 testFileName
.appendPathPart("emoji-test.txt", status
);
4717 if (U_FAILURE(status
)) {
4718 errln("%s:%s %s while opening emoji-test.txt", __FILE__
, __LINE__
, u_errorName(status
));
4721 logln("Opening data file %s\n", testFileName
.data());
4724 UChar
*testFile
= ReadAndConvertFile(testFileName
.data(), len
, "UTF-8", status
);
4725 if (U_FAILURE(status
) || testFile
== NULL
) {
4726 errln("%s:%s %s while opening emoji-test.txt", __FILE__
, __LINE__
, u_errorName(status
));
4729 UnicodeString
testFileAsString(testFile
, len
);
4732 RegexMatcher
lineMatcher(u
"^.*?$", testFileAsString
, UREGEX_MULTILINE
, status
);
4733 RegexMatcher
hexMatcher(u
"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE
, status
);
4734 // hexMatcher group(1) is a hex number, or empty string if no hex number present.
4735 int32_t lineNumber
= 0;
4737 LocalPointer
<BreakIterator
> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status
), status
);
4738 LocalPointer
<BreakIterator
> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status
), status
);
4739 LocalPointer
<BreakIterator
> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status
), status
);
4740 if (U_FAILURE(status
)) {
4741 dataerrln("%s:%d %s while opening break iterators", __FILE__
, __LINE__
, u_errorName(status
));
4745 while (lineMatcher
.find()) {
4747 UnicodeString line
= lineMatcher
.group(status
);
4748 hexMatcher
.reset(line
);
4749 UnicodeString testString
; // accumulates the emoji sequence.
4750 while (hexMatcher
.find() && hexMatcher
.group(1, status
).length() > 0) {
4751 UnicodeString hex
= hexMatcher
.group(1, status
);
4752 if (hex
.length() > 8) {
4753 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__
, __LINE__
, lineNumber
, CStr(hex
)());
4757 hex8
.appendInvariantChars(hex
, status
);
4758 UChar32 c
= (UChar32
)strtol(hex8
.data(), NULL
, 16);
4760 testString
.append(c
);
4762 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4763 __FILE__
, __LINE__
, lineNumber
, hex8
.data());
4768 if (testString
.length() > 1) {
4769 charBreaks
->setText(testString
);
4770 charBreaks
->first();
4771 int32_t firstBreak
= charBreaks
->next();
4772 if (testString
.length() != firstBreak
) {
4773 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4774 __FILE__
, __LINE__
, lineNumber
, firstBreak
);
4776 wordBreaks
->setText(testString
);
4777 wordBreaks
->first();
4778 firstBreak
= wordBreaks
->next();
4779 if (testString
.length() != firstBreak
) {
4780 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4781 __FILE__
, __LINE__
, lineNumber
, firstBreak
);
4783 lineBreaks
->setText(testString
);
4784 lineBreaks
->first();
4785 firstBreak
= lineBreaks
->next();
4786 if (testString
.length() != firstBreak
) {
4787 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4788 __FILE__
, __LINE__
, lineNumber
, firstBreak
);
4796 // TestDebug - A place-holder test for debugging purposes.
4797 // For putting in fragments of other tests that can be invoked
4798 // for tracing without a lot of unwanted extra stuff happening.
4800 void RBBITest::TestDebug(void) {
4803 void RBBITest::TestProperties() {
4804 UErrorCode errorCode
= U_ZERO_ERROR
;
4805 UnicodeSet
prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode
);
4806 if (!prependSet
.isEmpty()) {
4808 "[:GCB=Prepend:] is not empty any more. "
4809 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4810 "change this test to the opposite condition.");
4814 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */