1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
5 * Copyright (c) 1999-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8 /************************************************************************
9 * Date Name Description
10 * 12/15/99 Madhu Creation.
11 * 01/12/2000 Madhu Updated for changed API and added new tests
12 ************************************************************************/
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_BREAK_ITERATION
23 #include "unicode/brkiter.h"
24 #include "unicode/localpointer.h"
25 #include "unicode/numfmt.h"
26 #include "unicode/rbbi.h"
27 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
28 #include "unicode/regex.h"
30 #include "unicode/schriter.h"
31 #include "unicode/uchar.h"
32 #include "unicode/utf16.h"
33 #include "unicode/ucnv.h"
34 #include "unicode/uniset.h"
35 #include "unicode/uscript.h"
36 #include "unicode/ustring.h"
37 #include "unicode/utext.h"
45 #include "utypeinfo.h" // for 'typeid' to work
50 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
51 #include "unicode/filteredbrk.h"
52 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
// Report a test failure, tagged with the source file and line, when condition x is false.
// The body is wrapped in do { } while (0) so that "TEST_ASSERT(x);" expands to exactly one
// statement and can be used as the body of an unbraced if/else.
#define TEST_ASSERT(x) do { if (!(x)) { \
    errln("Failure in file %s, line %d", __FILE__, __LINE__); } } while (0)

// Report a test failure, tagged with the source file, line and ICU error name, when
// errcode is a failure status. Same single-statement wrapping as TEST_ASSERT.
#define TEST_ASSERT_SUCCESS(errcode) do { if (U_FAILURE(errcode)) { \
    errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); } } while (0)
60 //---------------------------------------------
62 //---------------------------------------------
65 // Note: Before adding new tests to this file, check whether the desired test data can
66 // simply be added to the file testdata/rbbitst.txt. In most cases it can;
67 // it's much less work than writing a new test, diagnostic output in the event of failures
68 // is good, and the test data file is shared with ICU4J, so eventually the test
69 // will run there as well, without additional effort.
// Test-suite dispatcher for RBBITest. Logs the suite name when exec is set, then lists
// the individual test cases through the TESTCASE_AUTO macro. Some entries follow
// UCONFIG_* feature guards (file I/O, regular expressions).
// NOTE(review): the matching #endif lines and the function's braces are not visible in
// this view, so which entries each guard covers must be confirmed against the full file.
71 void RBBITest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* params
)
// Announce the suite only when tests are actually being executed.
73 if (exec
) logln("TestSuite RuleBasedBreakIterator: ");
77 #if !UCONFIG_NO_FILE_IO
78 TESTCASE_AUTO(TestBug4153072
);
80 #if !UCONFIG_NO_FILE_IO
81 TESTCASE_AUTO(TestUnicodeFiles
);
83 TESTCASE_AUTO(TestGetAvailableLocales
);
84 TESTCASE_AUTO(TestGetDisplayName
);
85 #if !UCONFIG_NO_FILE_IO
86 TESTCASE_AUTO(TestEndBehaviour
);
87 TESTCASE_AUTO(TestWordBreaks
);
88 TESTCASE_AUTO(TestWordBoundary
);
89 TESTCASE_AUTO(TestLineBreaks
);
90 TESTCASE_AUTO(TestSentBreaks
);
91 TESTCASE_AUTO(TestExtended
);
// TestMonkey is listed after a guard that requires both regular expressions and file I/O.
93 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
94 TESTCASE_AUTO(TestMonkey
);
96 #if !UCONFIG_NO_FILE_IO
97 TESTCASE_AUTO(TestBug3818
);
99 TESTCASE_AUTO(TestDebug
);
100 #if !UCONFIG_NO_FILE_IO
101 TESTCASE_AUTO(TestBug5775
);
103 TESTCASE_AUTO(TestBug9983
);
104 TESTCASE_AUTO(TestDictRules
);
105 TESTCASE_AUTO(TestBug5532
);
106 TESTCASE_AUTO(TestBug7547
);
107 TESTCASE_AUTO(TestBug12797
);
108 TESTCASE_AUTO(TestBug12918
);
109 TESTCASE_AUTO(TestBug12932
);
110 TESTCASE_AUTO(TestEmoji
);
111 TESTCASE_AUTO(TestBug12519
);
112 TESTCASE_AUTO(TestBug12677
);
113 TESTCASE_AUTO(TestTableRedundancies
);
114 TESTCASE_AUTO(TestBug13447
);
115 TESTCASE_AUTO(TestReverse
);
116 TESTCASE_AUTO(TestBug13692
);
121 //--------------------------------------------------------------------------------------
123 // RBBITest constructor and destructor
125 //--------------------------------------------------------------------------------------
127 RBBITest::RBBITest() {
132 RBBITest::~RBBITest() {
// Diagnostic helper: prints to stdout a table with one row per code point of tstr,
// showing the code point, several Unicode property values and the character name.
//   tstr           the text to dump; its native index is reset to 0 first.
//   expected       array of expected break indexes.
//   expectedCount  number of entries in expected.
// A dashed separator row carrying the current index is printed once the index reaches
// the next expected break.
// NOTE(review): several lines of this function (the name buffer declaration, some
// printf arguments, closing braces) are not visible in this view.
136 static void printStringBreaks(UText
*tstr
, int expected
[], int expectedCount
) {
137 UErrorCode status
= U_ZERO_ERROR
;
// Column headings for the table.
139 printf("code alpha extend alphanum type word sent line name\n");
140 int nextExpectedIndex
= 0;
141 utext_setNativeIndex(tstr
, 0);
// Walk the text by code point; j is refreshed from the UText native index after each step.
142 for (int j
= 0; j
< utext_nativeLength(tstr
); j
=utext_getNativeIndex(tstr
)) {
// Mark the position of an expected break with a separator row.
143 if (nextExpectedIndex
< expectedCount
&& j
>= expected
[nextExpectedIndex
] ) {
144 printf("------------------------------------------------ %d\n", j
);
// Fetch the next code point (this advances the UText iteration position).
148 UChar32 c
= utext_next32(tstr
);
// Look up the Unicode character name; 100 is passed as the buffer capacity.
149 u_charName(c
, U_UNICODE_CHAR_NAME
, name
, 100, &status
);
// One table row: the code point in hex, followed by property values / short value names.
150 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c
,
152 u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
),
154 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY
,
156 U_SHORT_PROPERTY_NAME
),
157 u_getPropertyValueName(UCHAR_WORD_BREAK
,
158 u_getIntPropertyValue(c
,
160 U_SHORT_PROPERTY_NAME
),
161 u_getPropertyValueName(UCHAR_SENTENCE_BREAK
,
162 u_getIntPropertyValue(c
,
163 UCHAR_SENTENCE_BREAK
),
164 U_SHORT_PROPERTY_NAME
),
165 u_getPropertyValueName(UCHAR_LINE_BREAK
,
166 u_getIntPropertyValue(c
,
168 U_SHORT_PROPERTY_NAME
),
// Convenience overload: wraps ustr in a read-only UText and forwards to the UText
// version of printStringBreaks with the same expected / expectedCount arguments.
// If the UText cannot be opened, the ICU error name is printed to stdout.
// NOTE(review): the declaration of tstr, the early exit after the error message and the
// release of the UText are not visible in this view -- confirm in the full file.
174 static void printStringBreaks(const UnicodeString
&ustr
, int expected
[], int expectedCount
) {
175 UErrorCode status
= U_ZERO_ERROR
;
// Open a UText over the caller's string (no existing UText is reused: first argument NULL).
177 tstr
= utext_openConstUnicodeString(NULL
, &ustr
, &status
);
178 if (U_FAILURE(status
)) {
179 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status
));
182 printStringBreaks(tstr
, expected
, expectedCount
);
// Regression test (bug 3818, per the test name): with a Thai ("th") word break iterator
// over a string made of the same four-character sequence repeated four times,
// following(1) and following(0) must both report the boundary at index 4.
// NOTE(review): the early return after the creation failure and the disposal of bi are
// not visible in this view.
187 void RBBITest::TestBug3818() {
188 UErrorCode status
= U_ZERO_ERROR
;
190 // Four Thai words...
191 static const UChar thaiWordData
[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
192 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
193 UnicodeString
thaiStr(thaiWordData
);
195 BreakIterator
* bi
= BreakIterator::createWordInstance(Locale("th"), status
);
// Treat both an error status and a NULL iterator as a creation failure.
196 if (U_FAILURE(status
) || bi
== NULL
) {
197 errcheckln(status
, "Fail at file %s, line %d, status = %s", __FILE__
, __LINE__
, u_errorName(status
));
200 bi
->setText(thaiStr
);
// Starting from inside the first word, the next boundary is expected at index 4.
202 int32_t startOfSecondWord
= bi
->following(1);
203 if (startOfSecondWord
!= 4) {
204 errln("Fail at file %s, line %d expected start of word at 4, got %d",
205 __FILE__
, __LINE__
, startOfSecondWord
);
// Same expectation when starting from the very beginning of the text.
207 startOfSecondWord
= bi
->following(0);
208 if (startOfSecondWord
!= 4) {
209 errln("Fail at file %s, line %d expected start of word at 4, got %d",
210 __FILE__
, __LINE__
, startOfSecondWord
);
216 //---------------------------------------------
220 //---------------------------------------------
// Smoke test for BreakIterator::getAvailableLocales(): fetches the locale list and its
// count, reports a data error for an empty list, and logs the name of every entry.
// NOTE(review): the condition guarding the dataerrln() call and the declaration of i
// are not visible in this view.
222 void RBBITest::TestGetAvailableLocales()
224 int32_t locCount
= 0;
// locCount is passed to getAvailableLocales() and then used as the length of locList.
225 const Locale
* locList
= BreakIterator::getAvailableLocales(locCount
);
228 dataerrln("getAvailableLocales() returned an empty list!");
229 // Just make sure that it's returning good memory.
231 for (i
= 0; i
< locCount
; ++i
) {
232 logln(locList
[i
].getName());
236 // Tests the BreakIterator::getDisplayName() function.
// Two checks are made:
//   - the US locale's display name, compared only when the default locale is the US locale;
//   - the France locale's display name as rendered for the US locale.
// NOTE(review): the tails of both dataerrln() statements are not visible in this view.
237 void RBBITest::TestGetDisplayName()
239 UnicodeString result
;
241 BreakIterator::getDisplayName(Locale::getUS(), result
);
// The expected string is only checked when the default locale is the US locale.
242 if (Locale::getDefault() == Locale::getUS() && result
!= "English (United States)")
243 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
// Here the display locale is passed explicitly, so the check does not depend on the default locale.
246 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result
);
247 if (result
!= "French (France)")
248 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
// Checks word-break behaviour around the end of the text "boo.", using a word
// iterator for the default locale: a break at the start, a break before the period,
// and a break at the end of the string (index 4).
// NOTE(review): the failure-path return, the condition for the "before period" check
// and the disposal of wb are not visible in this view.
255 void RBBITest::TestEndBehaviour()
257 UErrorCode status
= U_ZERO_ERROR
;
258 UnicodeString
testString("boo.");
259 BreakIterator
*wb
= BreakIterator::createWordInstance(Locale::getDefault(), status
);
260 if (U_FAILURE(status
))
262 errcheckln(status
, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status
));
265 wb
->setText(testString
);
// first() must report the boundary at index 0.
267 if (wb
->first() != 0)
268 errln("Didn't get break at beginning of string.");
270 errln("Didn't get break before period in \"boo.\"");
// Fails only when the current position is not 4 and the next boundary is not 4 either.
271 if (wb
->current() != 4 && wb
->next() != 4)
272 errln("Didn't get break at end of string.");
// Regression test (bug 4153072, per the test name) for isBoundary() on a word iterator
// whose text is supplied through an adopted StringCharacterIterator restricted to a
// sub-range of "...Hello, World!...". For every probed offset, only offset 0 is
// expected to be reported as a boundary.
// NOTE(review): the declarations of begin, index and onBoundary, the failure-path
// return and the disposal of iter are not visible in this view.
278 void RBBITest::TestBug4153072() {
279 UErrorCode status
= U_ZERO_ERROR
;
280 BreakIterator
*iter
= BreakIterator::createWordInstance(Locale::getDefault(), status
);
281 if (U_FAILURE(status
))
283 errcheckln(status
, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status
));
286 UnicodeString
str("...Hello, World!...");
// The iteration range stops three code units before the end of the string.
288 int32_t end
= str
.length() - 3;
// Character iterator limited to [begin, end) and positioned at begin; it is handed to
// the break iterator through adoptText().
291 StringCharacterIterator
* textIterator
= new StringCharacterIterator(str
, begin
, end
, begin
);
292 iter
->adoptText(textIterator
);
294 // Note: with the switch to UText, there is no way to restrict the
295 // iteration range to begin at an index other than zero.
296 // String character iterators created with a non-zero bound are
297 // treated by RBBI as being empty.
// Probe offsets from -1 up to and including begin.
298 for (index
= -1; index
< begin
+ 1; ++index
) {
299 onBoundary
= iter
->isBoundary(index
);
// Error if offset 0 is not a boundary, or if any other offset is one.
300 if (index
== 0? !onBoundary
: onBoundary
) {
301 errln((UnicodeString
)"Didn't handle isBoundary correctly with offset = " + index
+
302 " and begin index = " + begin
);
310 // Test for problem reported by Ashok Matoria on 9 July 2007
311 // One.<kSoftHyphen><kSpace>Two.
313 // Sentence break at start (0) and then on calling next() it breaks at
314 // 'T' of "Two". Now, at this point if I do next() and
315 // then previous(), it breaks at <kSoftHyphen> instead of 'T' of "Two".
// The test uses an English sentence iterator and expects the boundary sequence
// next() == 6, a following position of 10, then previous() == 6.
// NOTE(review): the setText() call, the second next() call, the early returns and
// the disposal of bi are not visible in this view.
317 void RBBITest::TestBug5775() {
318 UErrorCode status
= U_ZERO_ERROR
;
319 BreakIterator
*bi
= BreakIterator::createSentenceInstance(Locale::getEnglish(), status
);
320 TEST_ASSERT_SUCCESS(status
);
321 if (U_FAILURE(status
)) {
324 // Check for status first for better handling of no data errors.
325 TEST_ASSERT(bi
!= NULL
);
// The string is built with the invariant-character constructor; the literal holds a
// backslash-u escape sequence for U+00AD.
// NOTE(review): the step that turns the escape into the actual character is not visible here.
330 UnicodeString
s("One.\\u00ad Two.", -1, US_INV
);
334 int pos
= bi
->next();
335 TEST_ASSERT(pos
== 6);
337 TEST_ASSERT(pos
== 10);
// Stepping back must return to the same boundary that next() reported (6).
338 pos
= bi
->previous();
339 TEST_ASSERT(pos
== 6);
345 //------------------------------------------------------------------------------
347 // RBBITest::Extended Run RBBI Tests from an external test data file
349 //------------------------------------------------------------------------------
// Members of TestParams, the bundle of state shared between the test-file parser
// (which fills in bi, dataToBreak, expectedBreaks, srcLine, srcCol) and executeTest()
// (which reads them).
// NOTE(review): the opening line of the type and several member lines are not visible
// in this view.
352 BreakIterator
*bi
; // Break iterator is set while parsing test source.
353 // Changed out whenever test data changes break type.
355 UnicodeString dataToBreak
; // Data that is built up while parsing the test.
356 UVector32
*expectedBreaks
; // Expected break positions, matches dataToBreak UnicodeString.
357 UVector32
*srcLine
; // Positions in source file, indexed same as dataToBreak.
360 UText
*textToBreak
; // UText, could be UTF8 or UTF16.
361 UVector32
*textMap
; // Indexed by UText native offset; value is the UTF-16 dataToBreak offset, or -1 inside a character.
362 CharString utf8String
; // UTF-8 form of text to break.
// Constructor: allocates the UVector32 members, passing status to each.
364 TestParams(UErrorCode
&status
) : dataToBreak() {
366 expectedBreaks
= new UVector32(status
);
367 srcLine
= new UVector32(status
);
368 srcCol
= new UVector32(status
);
370 textMap
= new UVector32(status
);
// Clean-up of owned members (the enclosing destructor line is not visible in this view).
375 delete expectedBreaks
;
378 utext_close(textToBreak
);
// Look-ups keyed by a break position bp, expressed as an index into textMap.
382 int32_t getSrcLine(int32_t bp
);
383 int32_t getExpectedBreak(int32_t bp
);
384 int32_t getSrcCol(int32_t bp
);
// Select the form of textToBreak (UTF-16 or UTF-8) and rebuild textMap to match.
386 void setUTF16(UErrorCode
&status
);
387 void setUTF8(UErrorCode
&status
);
390 // Append a UnicodeString to a CharString with UTF-8 encoding.
391 // Substitute any invalid chars.
392 // Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
// Parameters:
//   dest    CharString that receives the UTF-8 bytes (appended to its current contents).
//   src     UTF-16 source text.
//   status  ICU error code, in/out; nothing is done if it already indicates failure.
// The conversion is done in two passes: a preflight call with a NULL buffer to obtain
// the UTF-8 length, then the real conversion into dest's append buffer.
// NOTE(review): the declarations of utf8Length and capacity and the early returns are
// not visible in this view.
393 static void CharStringAppend(CharString
&dest
, const UnicodeString
&src
, UErrorCode
&status
) {
394 if (U_FAILURE(status
)) {
398 u_strToUTF8WithSub(NULL
, 0, &utf8Length
, // Output Buffer, NULL for preflight.
399 src
.getBuffer(), src
.length(), // UTF-16 data
400 0xfffd, NULL
, // Substitution char, number of subs.
// A buffer-overflow status is the expected outcome of preflighting; anything else is a real error.
402 if (U_FAILURE(status
) && status
!= U_BUFFER_OVERFLOW_ERROR
) {
// Clear the preflight status before the real conversion.
405 status
= U_ZERO_ERROR
;
// Ask dest for an append buffer of at least utf8Length bytes.
407 char *buffer
= dest
.getAppendBuffer(utf8Length
, utf8Length
, capacity
, status
);
408 u_strToUTF8WithSub(buffer
, utf8Length
, NULL
,
409 src
.getBuffer(), src
.length(),
410 0xfffd, NULL
, &status
);
// Commit the converted bytes to dest.
411 dest
.append(buffer
, utf8Length
, status
);
// Makes textToBreak a UText over the UTF-16 dataToBreak string and rebuilds textMap.
// For each UTF-16 index i, the map receives i when i is the start of a code point;
// -1 is also added to the map (NOTE(review): presumably on the else branch, for trail
// surrogates -- the else line itself is not visible in this view). A final entry equal
// to the string length is appended, so the map has length + 1 entries.
//   status  ICU error code, in/out.
415 void TestParams::setUTF16(UErrorCode
&status
) {
// Re-open (reusing the existing UText, if any) over dataToBreak.
416 textToBreak
= utext_openUnicodeString(textToBreak
, &dataToBreak
, &status
);
417 textMap
->removeAllElements();
418 for (int32_t i
=0; i
<dataToBreak
.length(); i
++) {
// i equals its own getChar32Start() exactly when i is the first code unit of a code point.
419 if (i
== dataToBreak
.getChar32Start(i
)) {
420 textMap
->addElement(i
, status
);
422 textMap
->addElement(-1, status
);
// Extra entry for the position just past the last code unit.
425 textMap
->addElement(dataToBreak
.length(), status
);
426 U_ASSERT(dataToBreak
.length() + 1 == textMap
->size());
// Converts dataToBreak to UTF-8 (into utf8String), makes textToBreak a UText over
// those bytes, and rebuilds textMap so that it is indexed by UTF-8 byte offset:
// the entry at the start of each code point is the corresponding UTF-16 index, and
// the entries for the remaining bytes of a multi-byte sequence are -1.
// On completion the map is asserted to hold nativeLength + 1 entries.
//   status  ICU error code, in/out; nothing is done if it already indicates failure.
// NOTE(review): the loop header, its termination test and the early returns are not
// visible in this view.
430 void TestParams::setUTF8(UErrorCode
&status
) {
431 if (U_FAILURE(status
)) {
435 CharStringAppend(utf8String
, dataToBreak
, status
);
// Re-open (reusing the existing UText, if any) over the UTF-8 bytes.
436 textToBreak
= utext_openUTF8(textToBreak
, utf8String
.data(), utf8String
.length(), &status
);
437 if (U_FAILURE(status
)) {
441 textMap
->removeAllElements();
442 int32_t utf16Index
= 0;
// Record the UTF-16 index for the code point starting at the current byte offset.
444 textMap
->addElement(utf16Index
, status
);
445 UChar32 c32
= utext_current32(textToBreak
);
// Advance the UTF-16 index by the number of code units this code point needs (1 or 2).
449 utf16Index
+= U16_LENGTH(c32
);
450 utext_next32(textToBreak
);
// Pad with -1 for the remaining bytes of the code point just consumed.
451 while (textMap
->size() < utext_getNativeIndex(textToBreak
)) {
452 textMap
->addElement(-1, status
);
455 U_ASSERT(utext_nativeLength(textToBreak
) + 1 == textMap
->size());
// Returns the test-file line number recorded for break position bp.
//   bp  index into textMap; values at or beyond the map size are clamped to the last entry.
// The position is mapped through textMap, scanning backwards from bp, and the
// resulting index is looked up in srcLine.
// NOTE(review): the declaration of i and the loop's exit test are not visible in this
// view; presumably the scan stops at the first non-negative map entry -- confirm.
459 int32_t TestParams::getSrcLine(int32_t bp
) {
460 if (bp
>= textMap
->size()) {
461 bp
= textMap
->size() - 1;
464 for(; bp
>= 0 ; --bp
) {
465 // Move to a character boundary if we are not on one already.
466 i
= textMap
->elementAti(bp
);
471 return srcLine
->elementAti(i
);
// Returns the expected-break entry for break position bp: bp is mapped through
// textMap and the resulting index is looked up in expectedBreaks.
//   bp  index into textMap.
// NOTE(review): the handling of bp beyond the map, the declaration of retVal, the
// guard around the lookup and the return statement are not visible in this view.
// Callers in this file treat a result of 0 as "no break expected".
475 int32_t TestParams::getExpectedBreak(int32_t bp
) {
476 if (bp
>= textMap
->size()) {
479 int32_t i
= textMap
->elementAti(bp
);
482 retVal
= expectedBreaks
->elementAti(i
);
// Returns the test-file column number recorded for break position bp.
//   bp  index into textMap; values at or beyond the map size are clamped to the last entry.
// The position is mapped through textMap, scanning backwards from bp, and the
// resulting index is looked up in srcCol.
// NOTE(review): the declaration of i and the loop's exit test are not visible in this
// view; presumably the scan stops at the first non-negative map entry -- confirm.
488 int32_t TestParams::getSrcCol(int32_t bp
) {
489 if (bp
>= textMap
->size()) {
490 bp
= textMap
->size() - 1;
493 for(; bp
>= 0; --bp
) {
494 // Move bp to a character boundary if we are not on one already.
495 i
= textMap
->elementAti(bp
);
500 return srcCol
->elementAti(i
);
504 void RBBITest::executeTest(TestParams
*t
, UErrorCode
&status
) {
509 TEST_ASSERT_SUCCESS(status
);
510 if (U_FAILURE(status
)) {
518 t
->bi
->setText(t
->textToBreak
, status
);
520 // Run the iterator forward
523 for (bp
= t
->bi
->first(); bp
!= BreakIterator::DONE
; bp
= t
->bi
->next()) {
525 // Fail for lack of forward progress.
526 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
527 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
));
531 // Check that we didn't miss an expected break between the last one
533 for (i
=prevBP
+1; i
<bp
; i
++) {
534 if (t
->getExpectedBreak(i
) != 0) {
535 int expected
[] = {0, i
};
536 printStringBreaks(t
->dataToBreak
, expected
, 2);
537 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
538 i
, t
->getSrcLine(i
), t
->getSrcCol(i
));
542 // Check that the break we did find was expected
543 if (t
->getExpectedBreak(bp
) == 0) {
544 int expected
[] = {0, bp
};
545 printStringBreaks(t
->textToBreak
, expected
, 2);
546 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
547 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
));
549 // The break was expected.
550 // Check that the {nnn} tag value is correct.
551 int32_t expectedTagVal
= t
->getExpectedBreak(bp
);
552 if (expectedTagVal
== -1) {
555 int32_t line
= t
->getSrcLine(bp
);
556 int32_t rs
= t
->bi
->getRuleStatus();
557 if (rs
!= expectedTagVal
) {
558 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
559 " Actual, Expected status = %4d, %4d",
560 bp
, line
, t
->getSrcCol(bp
), rs
, expectedTagVal
);
567 // Verify that there were no missed expected breaks after the last one found
568 for (i
=prevBP
+1; i
<utext_nativeLength(t
->textToBreak
); i
++) {
569 if (t
->getExpectedBreak(i
) != 0) {
570 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
571 i
, t
->getSrcLine(i
), t
->getSrcCol(i
));
576 // Run the iterator backwards, verify that the same breaks are found.
578 prevBP
= utext_nativeLength(t
->textToBreak
)+2; // start with a phony value for the last break pos seen.
580 while (bp
!= BreakIterator::DONE
) {
582 // Fail for lack of progress.
583 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
584 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
));
588 // Check that we didn't miss an expected break between the last one
589 // and this one. (UVector returns zeros for index out of bounds.)
590 for (i
=prevBP
-1; i
>bp
; i
--) {
591 if (t
->getExpectedBreak(i
) != 0) {
592 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
593 i
, t
->getSrcLine(i
), t
->getSrcCol(i
));
597 // Check that the break we did find was expected
598 if (t
->getExpectedBreak(bp
) == 0) {
599 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
600 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
));
602 // The break was expected.
603 // Check that the {nnn} tag value is correct.
604 int32_t expectedTagVal
= t
->getExpectedBreak(bp
);
605 if (expectedTagVal
== -1) {
608 int line
= t
->getSrcLine(bp
);
609 int32_t rs
= t
->bi
->getRuleStatus();
610 if (rs
!= expectedTagVal
) {
611 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
612 " Actual, Expected status = %4d, %4d",
613 bp
, line
, t
->getSrcCol(bp
), rs
, expectedTagVal
);
618 bp
= t
->bi
->previous();
621 // Verify that there were no missed breaks prior to the last one found
622 for (i
=prevBP
-1; i
>=0; i
--) {
623 if (t
->getExpectedBreak(i
) != 0) {
624 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
625 i
, t
->getSrcLine(i
), t
->getSrcCol(i
));
629 // Check isBoundary()
630 for (i
=0; i
< utext_nativeLength(t
->textToBreak
); i
++) {
631 UBool boundaryExpected
= (t
->getExpectedBreak(i
) != 0);
632 UBool boundaryFound
= t
->bi
->isBoundary(i
);
633 if (boundaryExpected
!= boundaryFound
) {
634 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
635 " Expected, Actual= %s, %s",
636 i
, t
->getSrcLine(i
), t
->getSrcCol(i
),
637 boundaryExpected
? "true":"false", boundaryFound
? "true" : "false");
642 for (i
=0; i
< utext_nativeLength(t
->textToBreak
); i
++) {
643 int32_t actualBreak
= t
->bi
->following(i
);
644 int32_t expectedBreak
= BreakIterator::DONE
;
645 for (int32_t j
=i
+1; j
<= utext_nativeLength(t
->textToBreak
); j
++) {
646 if (t
->getExpectedBreak(j
) != 0) {
651 if (expectedBreak
!= actualBreak
) {
652 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
653 " Expected, Actual= %d, %d",
654 i
, t
->getSrcLine(i
), t
->getSrcCol(i
), expectedBreak
, actualBreak
);
659 for (i
=utext_nativeLength(t
->textToBreak
); i
>=0; i
--) {
660 int32_t actualBreak
= t
->bi
->preceding(i
);
661 int32_t expectedBreak
= BreakIterator::DONE
;
663 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
664 // preceding(trailing byte) will return the index of some preceding code point,
665 // not the lead byte of the current code point, even though that has a smaller index.
666 // Therefore, start looking at the expected break data not at i-1, but at
667 // the start of code point index - 1.
668 utext_setNativeIndex(t
->textToBreak
, i
);
669 int32_t j
= utext_getNativeIndex(t
->textToBreak
) - 1;
670 for (; j
>= 0; j
--) {
671 if (t
->getExpectedBreak(j
) != 0) {
676 if (expectedBreak
!= actualBreak
) {
677 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
678 " Expected, Actual= %d, %d",
679 i
, t
->getSrcLine(i
), t
->getSrcCol(i
), expectedBreak
, actualBreak
);
685 void RBBITest::TestExtended() {
686 // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
687 // data driven test closely entangles filtered and regular data.
688 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
689 UErrorCode status
= U_ZERO_ERROR
;
692 TestParams
tp(status
);
694 RegexMatcher
localeMatcher(UnicodeString(u
"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status
);
695 if (U_FAILURE(status
)) {
696 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__
, __LINE__
, u_errorName(status
));
700 // Open and read the test data file.
702 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
703 CharString
testFileName(testDataDirectory
, -1, status
);
704 testFileName
.append("rbbitst.txt", -1, status
);
707 UChar
*testFile
= ReadAndConvertFile(testFileName
.data(), len
, "UTF-8", status
);
708 if (U_FAILURE(status
)) {
709 errln("%s:%d Error %s opening file rbbitst.txt", __FILE__
, __LINE__
, u_errorName(status
));
713 bool skipTest
= false; // Skip this test?
716 // Put the test data into a UnicodeString
718 UnicodeString
testString(FALSE
, testFile
, len
);
727 parseState
= PARSE_TAG
;
729 EParseState savedState
= PARSE_TAG
;
732 int32_t colStart
= 0;
736 int32_t tagValue
= 0; // The numeric value of a <nnn> tag.
738 UnicodeString rules
; // Holds rules from a <rules> ... </rules> block
739 int32_t rulesFirstLine
; // Line number of the start of current <rules> block
741 for (charIdx
= 0; charIdx
< len
; ) {
742 status
= U_ZERO_ERROR
;
743 UChar c
= testString
.charAt(charIdx
);
745 if (c
== u
'\r' && charIdx
<len
&& testString
.charAt(charIdx
) == u
'\n') {
746 // treat CRLF as a unit
750 if (c
== u
'\n' || c
== u
'\r') {
754 column
= charIdx
- colStart
+ 1;
756 switch (parseState
) {
758 if (c
== u
'\n' || c
== u
'\r') {
759 parseState
= savedState
;
766 parseState
= PARSE_COMMENT
;
767 savedState
= PARSE_TAG
;
770 if (u_isUWhiteSpace(c
)) {
773 if (testString
.compare(charIdx
-1, 6, u
"<word>") == 0) {
775 tp
.bi
= BreakIterator::createWordInstance(locale
, status
);
780 if (testString
.compare(charIdx
-1, 6, u
"<char>") == 0) {
782 tp
.bi
= BreakIterator::createCharacterInstance(locale
, status
);
787 if (testString
.compare(charIdx
-1, 6, u
"<line>") == 0) {
789 tp
.bi
= BreakIterator::createLineInstance(locale
, status
);
794 if (testString
.compare(charIdx
-1, 6, u
"<sent>") == 0) {
796 tp
.bi
= BreakIterator::createSentenceInstance(locale
, status
);
801 if (testString
.compare(charIdx
-1, 7, u
"<title>") == 0) {
803 tp
.bi
= BreakIterator::createTitleInstance(locale
, status
);
808 if (testString
.compare(charIdx
-1, 7, u
"<rules>") == 0 ||
809 testString
.compare(charIdx
-1, 10, u
"<badrules>") == 0) {
810 charIdx
= testString
.indexOf(u
'>', charIdx
) + 1;
811 parseState
= PARSE_RULES
;
813 rulesFirstLine
= lineNum
;
818 localeMatcher
.reset(testString
);
819 if (localeMatcher
.lookingAt(charIdx
-1, status
)) {
820 UnicodeString localeName
= localeMatcher
.group(1, status
);
821 char localeName8
[100];
822 localeName
.extract(0, localeName
.length(), localeName8
, sizeof(localeName8
), 0);
823 locale
= Locale::createFromName(localeName8
);
824 charIdx
+= localeMatcher
.group(0, status
).length() - 1;
825 TEST_ASSERT_SUCCESS(status
);
828 if (testString
.compare(charIdx
-1, 6, u
"<data>") == 0) {
829 parseState
= PARSE_DATA
;
832 tp
.expectedBreaks
->removeAllElements();
833 tp
.srcCol
->removeAllElements();
834 tp
.srcLine
->removeAllElements();
838 errln("line %d: Tag expected in test file.", lineNum
);
839 parseState
= PARSE_COMMENT
;
840 savedState
= PARSE_DATA
;
841 goto end_test
; // Stop the test.
846 if (testString
.compare(charIdx
-1, 8, u
"</rules>") == 0) {
848 parseState
= PARSE_TAG
;
851 tp
.bi
= new RuleBasedBreakIterator(rules
, pe
, status
);
852 skipTest
= U_FAILURE(status
);
853 if (U_FAILURE(status
)) {
854 errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
855 rulesFirstLine
+ pe
.line
- 1, u_errorName(status
));
857 } else if (testString
.compare(charIdx
-1, 11, u
"</badrules>") == 0) {
859 parseState
= PARSE_TAG
;
860 UErrorCode ec
= U_ZERO_ERROR
;
862 RuleBasedBreakIterator
bi(rules
, pe
, ec
);
864 errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
865 rulesFirstLine
+ pe
.line
- 1);
873 if (c
== u
'\u2022') { // u'•'
874 int32_t breakIdx
= tp
.dataToBreak
.length();
875 tp
.expectedBreaks
->setSize(breakIdx
+1);
876 tp
.expectedBreaks
->setElementAt(-1, breakIdx
);
877 tp
.srcLine
->setSize(breakIdx
+1);
878 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
879 tp
.srcCol
->setSize(breakIdx
+1);
880 tp
.srcCol
->setElementAt(column
, breakIdx
);
884 if (testString
.compare(charIdx
-1, 7, u
"</data>") == 0) {
885 // Add final entry to mappings from break location to source file position.
886 // Need one extra because last break position returned is after the
887 // last char in the data, not at the last char.
888 tp
.srcLine
->addElement(lineNum
, status
);
889 tp
.srcCol
->addElement(column
, status
);
891 parseState
= PARSE_TAG
;
896 status
= U_ZERO_ERROR
;
898 executeTest(&tp
, status
);
899 TEST_ASSERT_SUCCESS(status
);
901 // Run again, this time with UTF-8 text wrapped in a UText.
902 status
= U_ZERO_ERROR
;
904 TEST_ASSERT_SUCCESS(status
);
905 executeTest(&tp
, status
);
910 if (testString
.compare(charIdx
-1, 3, u
"\\N{") == 0) {
911 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
912 // Get the code point from the name and insert it into the test data.
913 // (No API takes the character name as a Unicode string,
914 // so it has to be converted back to char * first.)
915 int32_t nameEndIdx
= testString
.indexOf(u
'}', charIdx
);
916 int32_t nameLength
= nameEndIdx
- (charIdx
+2);
917 char charNameBuf
[200];
918 UChar32 theChar
= -1;
919 if (nameEndIdx
!= -1) {
920 UErrorCode status
= U_ZERO_ERROR
;
921 testString
.extract(charIdx
+2, nameLength
, charNameBuf
, sizeof(charNameBuf
));
922 charNameBuf
[sizeof(charNameBuf
)-1] = 0;
923 theChar
= u_charFromName(U_UNICODE_CHAR_NAME
, charNameBuf
, &status
);
924 if (U_FAILURE(status
)) {
929 errln("Error in named character in test file at line %d, col %d",
932 // Named code point was recognized. Insert it
933 // into the test data.
934 tp
.dataToBreak
.append(theChar
);
935 while (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
936 tp
.srcLine
->addElement(lineNum
, status
);
937 tp
.srcCol
->addElement(column
, status
);
940 if (nameEndIdx
> charIdx
) {
941 charIdx
= nameEndIdx
+1;
949 if (testString
.compare(charIdx
-1, 2, u
"<>") == 0) {
951 int32_t breakIdx
= tp
.dataToBreak
.length();
952 tp
.expectedBreaks
->setSize(breakIdx
+1);
953 tp
.expectedBreaks
->setElementAt(-1, breakIdx
);
954 tp
.srcLine
->setSize(breakIdx
+1);
955 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
956 tp
.srcCol
->setSize(breakIdx
+1);
957 tp
.srcCol
->setElementAt(column
, breakIdx
);
963 parseState
= PARSE_NUM
;
967 if (c
== u
'#' && column
==3) { // TODO: why is column off so far?
968 parseState
= PARSE_COMMENT
;
969 savedState
= PARSE_DATA
;
974 // Check for \ at end of line, a line continuation.
975 // Advance over (discard) the newline
976 UChar32 cp
= testString
.char32At(charIdx
);
977 if (cp
== u
'\r' && charIdx
<len
&& testString
.charAt(charIdx
+1) == u
'\n') {
979 // Need an extra increment of the input ptr to move over both of them
982 if (cp
== u
'\n' || cp
== u
'\r') {
989 // Let unescape handle the back slash.
990 cp
= testString
.unescapeAt(charIdx
);
992 // Escape sequence was recognized. Insert the char
993 // into the test data.
994 tp
.dataToBreak
.append(cp
);
995 while (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
996 tp
.srcLine
->addElement(lineNum
, status
);
997 tp
.srcCol
->addElement(column
, status
);
1003 // Not a recognized backslash escape sequence.
1004 // Take the next char as a literal.
1005 // TODO: Should this be an error?
1006 c
= testString
.charAt(charIdx
);
1007 charIdx
= testString
.moveIndex32(charIdx
, 1);
1010 // Normal, non-escaped data char.
1011 tp
.dataToBreak
.append(c
);
1013 // Save the mapping from offset in the data to line/column numbers in
1014 // the original input file. Will be used for better error messages only.
1015 // If there's an expected break before this char, the slot in the mapping
1016 // vector will already be set for this char; don't overwrite it.
1017 if (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1018 tp
.srcLine
->addElement(lineNum
, status
);
1019 tp
.srcCol
->addElement(column
, status
);
1025 // We are parsing an expected numeric tag value, like <1234>,
1026 // within a chunk of data.
1027 if (u_isUWhiteSpace(c
)) {
1032 // Finished the number. Add the info to the expected break data,
1033 // and switch parse state back to doing plain data.
1034 parseState
= PARSE_DATA
;
1035 if (tagValue
== 0) {
1038 int32_t breakIdx
= tp
.dataToBreak
.length();
1039 tp
.expectedBreaks
->setSize(breakIdx
+1);
1040 tp
.expectedBreaks
->setElementAt(tagValue
, breakIdx
);
1041 tp
.srcLine
->setSize(breakIdx
+1);
1042 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1043 tp
.srcCol
->setSize(breakIdx
+1);
1044 tp
.srcCol
->setElementAt(column
, breakIdx
);
1049 tagValue
= tagValue
*10 + u_charDigitValue(c
);
1053 errln("Syntax Error in test file at line %d, col %d",
1055 parseState
= PARSE_COMMENT
;
1056 goto end_test
; // Stop the test
1061 if (U_FAILURE(status
)) {
1062 dataerrln("ICU Error %s while parsing test file at line %d.",
1063 u_errorName(status
), lineNum
);
1064 status
= U_ZERO_ERROR
;
1065 goto end_test
; // Stop the test
1070 // Reached end of test file. Raise an error if parseState indicates that we are
1071 // within a block that should have been terminated.
1073 if (parseState
== PARSE_RULES
) {
1074 errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
1075 lineNum
, rulesFirstLine
);
1077 if (parseState
== PARSE_DATA
) {
1078 errln("rbbitst.txt:%d <data> block not closed.", lineNum
);
1088 //-------------------------------------------------------------------------------
1090 // TestDictRules create a break iterator from source rules that includes a
1091 // dictionary range. Regression for bug #7130. Source rules
1092 // do not declare a break iterator type (word, line, sentence, etc.),
1093 // but the dictionary code, without a type, would loop.
1095 //-------------------------------------------------------------------------------
// Builds a RuleBasedBreakIterator directly from rule source that defines a
// $dictionary set, then iterates with next() at most 10 times and asserts that the
// loop counter is 1 when iteration finishes, i.e. that the iterator terminates
// instead of looping. A failure to build the iterator is reported as a data error.
// NOTE(review): parts of the rule string, the setText() call, the declarations of
// loops / position and the loop exit are not visible in this view.
1096 void RBBITest::TestDictRules() {
1097 const char *rules
= "$dictionary = [a-z]; \n"
1099 "$dictionary $dictionary; \n"
1101 "$dictionary $dictionary; \n";
1102 const char *text
= "aa";
1103 UErrorCode status
= U_ZERO_ERROR
;
1104 UParseError parseError
;
1106 RuleBasedBreakIterator
bi(rules
, parseError
, status
);
1107 if (U_SUCCESS(status
)) {
1108 UnicodeString utext
= text
;
// Bounded loop: the cap of 10 keeps the test finite even if next() never reports DONE.
1112 for (loops
= 0; loops
<10; loops
++) {
1113 position
= bi
.next();
1114 if (position
== RuleBasedBreakIterator::DONE
) {
1118 TEST_ASSERT(loops
== 1);
1120 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status
));
1126 //-------------------------------------------------------------------------------
1128 // ReadAndConvertFile Read a text data file, convert it to UChars, and
1129 // return the data in one big UChar * buffer, which the caller must delete.
1132 // fileName: the path of the file; it is passed to fopen() unchanged.
1134 // ulen an out parameter, receives the actual length (in UChars) of the file data.
1135 // encoding The file encoding. If the file contains a BOM, that will override the encoding
1136 // specified here. The BOM, if it exists, will be stripped from the returned data.
1137 // Pass NULL for the system default encoding.
1140 // The file data, converted to UChar.
1141 // The caller must delete this when done with
1142 // delete [] theBuffer;
1144 // TODO: This is a clone of RegexTest::ReadAndConvertFile.
1145 // Move this function to some common place.
1147 //--------------------------------------------------------------------------------
// status is an in/out ICU error code; it is set to U_FILE_ACCESS_ERROR when the file
// cannot be opened.
// NOTE(review): several lines (declarations of f, fileSize and amt_read, the
// conversion-call arguments, the cleanUpAndReturn label and its clean-up) are not
// visible in this view.
1148 UChar
*RBBITest::ReadAndConvertFile(const char *fileName
, int &ulen
, const char *encoding
, UErrorCode
&status
) {
1149 UChar
*retPtr
= NULL
;
1150 char *fileBuf
= NULL
;
1151 UConverter
* conv
= NULL
;
1155 if (U_FAILURE(status
)) {
// Open in binary mode so the bytes reach the converter unmodified.
1162 f
= fopen(fileName
, "rb");
1164 dataerrln("Error opening test data file %s\n", fileName
);
1165 status
= U_FILE_ACCESS_ERROR
;
// Determine the file size by seeking to the end, then read the whole file in one call.
1174 fseek( f
, 0, SEEK_END
);
1175 fileSize
= ftell(f
);
// NOTE(review): the buffer is allocated before fileSize is validated; ftell() returns -1
// on failure, and the "fileSize <= 0" test only happens after the allocation and read.
1176 fileBuf
= new char[fileSize
];
1177 fseek(f
, 0, SEEK_SET
);
1178 amt_read
= fread(fileBuf
, 1, fileSize
, f
);
// NOTE(review): no assignment to status is visible on this failure path -- confirm that
// callers can distinguish a short/empty read from success.
1179 if (amt_read
!= fileSize
|| fileSize
<= 0) {
1180 errln("Error reading test data file.");
1181 goto cleanUpAndReturn
;
1185 // Look for a Unicode Signature (BOM) on the data just read
1187 int32_t signatureLength
;
1188 const char * fileBufC
;
1189 const char* bomEncoding
;
1192 bomEncoding
= ucnv_detectUnicodeSignature(
1193 fileBuf
, fileSize
, &signatureLength
, &status
);
// A detected BOM is skipped and its encoding replaces the caller-supplied one.
1194 if(bomEncoding
!=NULL
){
1195 fileBufC
+= signatureLength
;
1196 fileSize
-= signatureLength
;
1197 encoding
= bomEncoding
;
1201 // Open a converter to take the rule file to UTF-16
1203 conv
= ucnv_open(encoding
, &status
);
1204 if (U_FAILURE(status
)) {
1205 goto cleanUpAndReturn
;
1209 // Convert the rules to UChar.
1210 // Preflight first to determine required buffer size.
1212 ulen
= ucnv_toUChars(conv
,
1218 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
1219 // Buffer Overflow is expected from the preflight operation.
1220 status
= U_ZERO_ERROR
;
// One extra UChar beyond the preflighted length.
1222 retPtr
= new UChar
[ulen
+1];
// Any ICU failure reaching this point is reported with its error name.
1235 if (U_FAILURE(status
)) {
1236 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
1246 //--------------------------------------------------------------------------------------------
1248 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
1250 //-------------------------------------------------------------------------------------------
// For each of the four break types (character, word, sentence, line) this
// creates the English-locale break iterator, asserts that creation succeeded,
// and, on success, runs the matching Unicode test data file through
// runUnicodeTestData().
1251 void RBBITest::TestUnicodeFiles() {
1252 RuleBasedBreakIterator
*bi
;
1253 UErrorCode status
= U_ZERO_ERROR
;
// Grapheme cluster (character) boundaries. The factory result is cast to RuleBasedBreakIterator*.
1255 bi
= (RuleBasedBreakIterator
*)BreakIterator::createCharacterInstance(Locale::getEnglish(), status
);
1256 TEST_ASSERT_SUCCESS(status
);
1257 if (U_SUCCESS(status
)) {
1258 runUnicodeTestData("GraphemeBreakTest.txt", bi
);
// Word boundaries.
1262 bi
= (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getEnglish(), status
);
1263 TEST_ASSERT_SUCCESS(status
);
1264 if (U_SUCCESS(status
)) {
1265 runUnicodeTestData("WordBreakTest.txt", bi
);
// Sentence boundaries.
1269 bi
= (RuleBasedBreakIterator
*)BreakIterator::createSentenceInstance(Locale::getEnglish(), status
);
1270 TEST_ASSERT_SUCCESS(status
);
1271 if (U_SUCCESS(status
)) {
1272 runUnicodeTestData("SentenceBreakTest.txt", bi
);
// Line-break opportunities.
1276 bi
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale::getEnglish(), status
);
1277 TEST_ASSERT_SUCCESS(status
);
1278 if (U_SUCCESS(status
)) {
1279 runUnicodeTestData("LineBreakTest.txt", bi
);
1285 // Check for test cases from the Unicode test data files that are known to fail
1286 // and should be skipped because ICU is not yet able to fully implement the spec.
1287 // See ticket #7270.
// Looks up (fileName, testCase) in a static table of test cases that are known
// to fail. On a match, the function returns the result of logKnownIssue("7270").
//   testCase  the test string parsed from the data file.
//   fileName  name of the data file the test string came from; compared with strcmp().
// The return value for a non-matching test case is not visible in this excerpt.
1289 UBool
RBBITest::testCaseIsKnownIssue(const UnicodeString
&testCase
, const char *fileName
) {
// One table entry: the data file name plus the exact UTF-16 test string to skip.
1290 static struct TestCase
{
1291 const char *fFileName
;
1292 const UChar
*fString
;
1293 } badTestCases
[] = { // Line Numbers from Unicode 7.0.0 file.
1294 {"LineBreakTest.txt", u
"\u200B\u0020}"}, // Line 5198
1295 {"LineBreakTest.txt", u
"\u200B\u0020)"}, // Line 5202
1296 {"LineBreakTest.txt", u
"\u200B\u0020!"}, // Line 5214
1297 {"LineBreakTest.txt", u
"\u200B\u0020,"}, // Line 5246
1298 {"LineBreakTest.txt", u
"\u200B\u0020/"}, // Line 5298
1299 {"LineBreakTest.txt", u
"\u200B\u0020\u2060"}, // Line 5302
1300 // Line Numbers from pre-release version of GraphemeBreakTest-10.0.0.txt
1301 {"GraphemeBreakTest.txt", u
"\u200D\u2640"}, // Line 656, old GB 11 test ZWJ x GAZ
1302 {"GraphemeBreakTest.txt", u
"\u200D\U0001F466"}, // Line 658, old GB 11 test ZWJ x EBG
1303 {"GraphemeBreakTest.txt", u
"\u200D\U0001F466\U0001F3FB"}, // Line 842, old GB 11 test ZWJ x EBG x EModifier
1305 // Line Numbers from pre-release version of WordBreakTest-10.0.0.txt
1306 {"WordBreakTest.txt", u
"\u200D\u261D"}, // Line 1356, ZWJ x EmojiNRK
1307 {"WordBreakTest.txt", u
"\u200D\U0001F3FB"}, // Line 1358, ZWJ x EmojiNRK
// Linear scan of the table; an entry matches only when both the file name and the full string are equal.
1310 for (int n
=0; n
<UPRV_LENGTHOF(badTestCases
); n
++) {
1311 const TestCase
&badCase
= badTestCases
[n
];
1312 if (!strcmp(fileName
, badCase
.fFileName
) &&
1313 testCase
== UnicodeString(badCase
.fString
)) {
// Every entry in the table is logged against the same ticket number.
1314 return logKnownIssue("7270");
1321 //--------------------------------------------------------------------------------------------
1323 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
1325 //-------------------------------------------------------------------------------------------
// Reads one boundary test data file and runs each test case found in it.
//   fileName  name of the data file; it is appended to the directory returned
//             by IntlTest::getSourceTestData().
//   bi        break iterator handed on to checkUnicodeTestCase() for every case.
// The file is read as UTF-8 and tokenized with a regular expression, so the
// body is compiled only when regular expressions are configured in.
1326 void RBBITest::runUnicodeTestData(const char *fileName
, RuleBasedBreakIterator
*bi
) {
1327 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1328 UErrorCode status
= U_ZERO_ERROR
;
1331 // Open and read the test data file, put it into a UnicodeString.
1333 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
1334 char testFileName
[1000];
// NOTE(review): this guard compares only strlen(testDataDirectory) with
// sizeof(testFileName), while the strcat() below also appends fileName --
// confirm the combined path always fits in the buffer.
1335 if (testDataDirectory
== NULL
|| strlen(testDataDirectory
) >= sizeof(testFileName
)) {
1336 dataerrln("Can't open test data. Path too long.");
// Build the full path: directory followed by the file name.
1339 strcpy(testFileName
, testDataDirectory
);
1340 strcat(testFileName
, fileName
);
1342 logln("Opening data file %s\n", fileName
);
1345 UChar
*testFile
= ReadAndConvertFile(testFileName
, len
, "UTF-8", status
);
// The extra assertions are skipped for U_FILE_ACCESS_ERROR; any failure or a
// NULL buffer still leads to the early return below.
1346 if (status
!= U_FILE_ACCESS_ERROR
) {
1347 TEST_ASSERT_SUCCESS(status
);
1348 TEST_ASSERT(testFile
!= NULL
);
1350 if (U_FAILURE(status
) || testFile
== NULL
) {
1351 return; /* something went wrong, error already output */
// Wrap the converted buffer and its length in a UnicodeString (TRUE is the first constructor argument).
1353 UnicodeString
testFileAsString(TRUE
, testFile
, len
);
1356 // Parse the test data file using a regular expression.
1357 // Each kind of token is recognized in its own capture group; what type of item was scanned
1358 // is identified by which group had a match.
1360 // Capture Group # 1 2 3 4 5
1361 // Parses this item: divide x hex digits comment \n unrecognized \n
1363 UnicodeString
tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV
);
1364 RegexMatcher
tokenMatcher(tokenExpr
, testFileAsString
, UREGEX_MULTILINE
| UREGEX_DOTALL
, status
);
// Accumulators for the test case currently being parsed.
1365 UnicodeString testString
;
1366 UVector32
breakPositions(status
);
1368 TEST_ASSERT_SUCCESS(status
);
1369 if (U_FAILURE(status
)) {
1374 // Scan through each test case, building up the string to be broken in testString,
1375 // and the positions that should be boundaries in the breakPositions vector.
1378 while (tokenMatcher
.find()) {
1379 if(tokenMatcher
.hitEnd()) {
1380 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for.
1381 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1382 and caused an infinite loop here on EBCDIC systems!
1384 fprintf(stderr
,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName
, ++spin
);
1387 if (tokenMatcher
.start(1, status
) >= 0) {
1388 // Scanned a divide sign, indicating a break position in the test data.
1389 if (testString
.length()>0) {
1390 breakPositions
.addElement(testString
.length(), status
);
1393 else if (tokenMatcher
.start(2, status
) >= 0) {
1394 // Scanned an 'x', meaning no break at this position in the test data
1395 // Nothing to be done here.
1397 else if (tokenMatcher
.start(3, status
) >= 0) {
1398 // Scanned Hex digits. Convert them to binary, append to the character data string.
1399 const UnicodeString
&hexNumber
= tokenMatcher
.group(3, status
);
1400 int length
= hexNumber
.length();
1403 hexNumber
.extract (0, length
, buf
, sizeof(buf
), US_INV
);
1404 UChar32 c
= (UChar32
)strtol(buf
, NULL
, 16);
1406 testString
.append(c
);
1408 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1409 fileName
, lineNumber
);
1412 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1413 fileName
, lineNumber
);
1416 else if (tokenMatcher
.start(4, status
) >= 0) {
1417 // Scanned to end of a line, possibly skipping over a comment in the process.
1418 // If the line from the file contained test data, run the test now.
1419 if (testString
.length() > 0 && !testCaseIsKnownIssue(testString
, fileName
)) {
1420 checkUnicodeTestCase(fileName
, lineNumber
, testString
, &breakPositions
, bi
);
1423 // Clear out this test case.
1424 // The string and breakPositions vector will be refilled as the next
1425 // test case is parsed.
1426 testString
.remove();
1427 breakPositions
.removeAllElements();
1430 // Scanner catchall. Something unrecognized appeared on the line.
1432 UnicodeString uToken
= tokenMatcher
.group(0, status
);
1433 uToken
.extract(0, uToken
.length(), token
, (uint32_t)sizeof(token
));
1434 token
[sizeof(token
)-1] = 0;
1435 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName
, lineNumber
, token
);
1437 // Clean up, in preparation for continuing with the next line.
1438 testString
.remove();
1439 breakPositions
.removeAllElements();
1442 TEST_ASSERT_SUCCESS(status
);
1443 if (U_FAILURE(status
)) {
1449 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1452 //--------------------------------------------------------------------------------------------
1454 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1455 // test data files. Do only a simple, forward-only check -
1456 // this test is mostly to check that ICU and the Unicode
1457 // data agree with each other.
1459 //--------------------------------------------------------------------------------------------
// Compares the boundaries reported by bi for testString with the expected
// positions in breakPositions and reports every disagreement through errln().
//   testFileName, lineNumber  used only to label error messages.
//   testString                text handed to bi->setText().
//   breakPositions            expected boundary positions, read in order via elementAti().
//   bi                        iterator under test; its text is replaced by this call.
1460 void RBBITest::checkUnicodeTestCase(const char *testFileName
, int lineNumber
,
1461 const UnicodeString
&testString
, // Text data to be broken
1462 UVector32
*breakPositions
, // Positions where breaks should be found.
1463 RuleBasedBreakIterator
*bi
) {
1464 int32_t pos
; // Break Position in the test string
1465 int32_t expectedI
= 0; // Index of expected break position in the vector of expected results.
1466 int32_t expectedPos
; // Expected break position (index into test string)
1468 bi
->setText(testString
);
// Loop until the iterator reports DONE (the statements that obtain pos are not visible in this excerpt).
1472 while (pos
!= BreakIterator::DONE
) {
// The iterator produced a break after all expected positions were consumed.
1473 if (expectedI
>= breakPositions
->size()) {
1474 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1475 testFileName
, lineNumber
, pos
);
1478 expectedPos
= breakPositions
->elementAti(expectedI
);
// A break that comes before the next expected position was not expected.
1479 if (pos
< expectedPos
) {
1480 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1481 testFileName
, lineNumber
, pos
);
// A break that comes after the next expected position means the expected one was missed.
1484 if (pos
> expectedPos
) {
1485 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1486 testFileName
, lineNumber
, expectedPos
);
// The iterator finished while expected positions remain unconsumed.
1493 if (pos
==BreakIterator::DONE
&& expectedI
<breakPositions
->size()) {
1494 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1495 testFileName
, lineNumber
, breakPositions
->elementAti(expectedI
));
1501 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1502 //---------------------------------------------------------------------------------------
1504 // class RBBIMonkeyKind
1506 // Monkey Test for Break Iteration
1507 // Abstract interface class. Concrete derived classes independently
1508 // implement the break rules for different iterator types.
1510 // The Monkey Test itself doesn't know which type of break iterator it is
1511 // testing, but works purely in terms of the interface defined here.
1513 //---------------------------------------------------------------------------------------
// Interface of pure virtual operations (charClasses, setText, next) plus a
// deferredStatus error slot that the constructor resets to U_ZERO_ERROR.
1514 class RBBIMonkeyKind
{
1516 // Return a UVector of UnicodeSets, representing the character classes used
1517 // for this type of iterator.
1518 virtual UVector
*charClasses() = 0;
1520 // Set the test text on which subsequent calls to next() will operate
1521 virtual void setText(const UnicodeString
&s
) = 0;
1523 // Find the next break position, starting from the prev break position, or from zero.
1524 // Return -1 after reaching end of string.
1525 virtual int32_t next(int32_t i
) = 0;
1527 virtual ~RBBIMonkeyKind();
// Error status slot; initialized to U_ZERO_ERROR by the constructor below.
1528 UErrorCode deferredStatus
;
// Constructor: start with no error recorded.
1537 RBBIMonkeyKind::RBBIMonkeyKind() {
1538 deferredStatus
= U_ZERO_ERROR
;
// Out-of-line definition of the virtual destructor declared in the class.
1541 RBBIMonkeyKind::~RBBIMonkeyKind() {
1545 //----------------------------------------------------------------------------------------
1547 // Random Numbers. Similar to standard lib rand() and srand()
1548 // Not using library to
1549 // 1. Get same results on all platforms.
1550 // 2. Get access to current seed, to more easily reproduce failures.
1552 //---------------------------------------------------------------------------------------
// Generator state shared by all calls to m_rand(); the initial value is 1.
1553 static uint32_t m_seed
= 1;
// Advances m_seed by one step (seed * 1103515245 + 12345, in uint32_t arithmetic)
// and returns (seed / 65536) % 32768, i.e. a value in the range 0..32767.
1555 static uint32_t m_rand()
1557 m_seed
= m_seed
* 1103515245 + 12345;
1558 return (uint32_t)(m_seed
/65536) % 32768;
1562 //------------------------------------------------------------------------------------------
1564 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
1565 // of RBBIMonkeyKind.
1567 //------------------------------------------------------------------------------------------
1568 class RBBICharMonkey
: public RBBIMonkeyKind
{
1571 virtual ~RBBICharMonkey();
1572 virtual UVector
*charClasses();
1573 virtual void setText(const UnicodeString
&s
);
1574 virtual int32_t next(int32_t i
);
1578 UnicodeSet
*fCRLFSet
;
1579 UnicodeSet
*fControlSet
;
1580 UnicodeSet
*fExtendSet
;
1581 UnicodeSet
*fZWJSet
;
1582 UnicodeSet
*fRegionalIndicatorSet
;
1583 UnicodeSet
*fPrependSet
;
1584 UnicodeSet
*fSpacingSet
;
1589 UnicodeSet
*fLVTSet
;
1590 UnicodeSet
*fHangulSet
;
1591 UnicodeSet
*fExtendedPictSet
;
1592 UnicodeSet
*fAnySet
;
1594 const UnicodeString
*fText
;
1598 RBBICharMonkey::RBBICharMonkey() {
1599 UErrorCode status
= U_ZERO_ERROR
;
1603 fCRLFSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status
);
1604 fControlSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status
);
1605 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status
);
1606 fZWJSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status
);
1607 fRegionalIndicatorSet
=
1608 new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status
);
1609 fPrependSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status
);
1610 fSpacingSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status
);
1611 fLSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status
);
1612 fVSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status
);
1613 fTSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status
);
1614 fLVSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status
);
1615 fLVTSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status
);
1616 fHangulSet
= new UnicodeSet();
1617 fHangulSet
->addAll(*fLSet
);
1618 fHangulSet
->addAll(*fVSet
);
1619 fHangulSet
->addAll(*fTSet
);
1620 fHangulSet
->addAll(*fLVSet
);
1621 fHangulSet
->addAll(*fLVTSet
);
1623 fExtendedPictSet
= new UnicodeSet(u
"[:Extended_Pictographic:]", status
);
1624 fAnySet
= new UnicodeSet(0, 0x10ffff);
1626 fSets
= new UVector(status
);
1627 fSets
->addElement(fCRLFSet
, status
);
1628 fSets
->addElement(fControlSet
, status
);
1629 fSets
->addElement(fExtendSet
, status
);
1630 fSets
->addElement(fRegionalIndicatorSet
, status
);
1631 if (!fPrependSet
->isEmpty()) {
1632 fSets
->addElement(fPrependSet
, status
);
1634 fSets
->addElement(fSpacingSet
, status
);
1635 fSets
->addElement(fHangulSet
, status
);
1636 fSets
->addElement(fAnySet
, status
);
1637 fSets
->addElement(fZWJSet
, status
);
1638 fSets
->addElement(fExtendedPictSet
, status
);
1639 if (U_FAILURE(status
)) {
1640 deferredStatus
= status
;
1645 void RBBICharMonkey::setText(const UnicodeString
&s
) {
1651 int32_t RBBICharMonkey::next(int32_t prevPos
) {
1652 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
1653 // break position being tested. The candidate break
1654 // location is before p2.
1658 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
1659 UChar32 cBase
; // for (X Extend*) patterns, the X character.
1661 if (U_FAILURE(deferredStatus
)) {
1665 // Previous break at end of string. return DONE.
1666 if (prevPos
>= fText
->length()) {
1669 p0
= p1
= p2
= p3
= prevPos
;
1670 c3
= fText
->char32At(prevPos
);
1671 c0
= c1
= c2
= cBase
= 0;
1672 (void)p0
; // suppress set but not used warning.
1675 // Loop runs once per "significant" character position in the input text.
1677 // Move all of the positions forward in the input string.
1682 // Advance p3 by one codepoint
1683 p3
= fText
->moveIndex32(p3
, 1);
1684 c3
= fText
->char32At(p3
);
1687 // Still warming up the loop. (won't work with zero length strings, but we don't care)
1690 if (p2
== fText
->length()) {
1691 // Reached end of string. Always a break position.
1696 // No Extend or Format characters may appear between the CR and LF,
1697 // which requires the additional check for p2 immediately following p1.
1699 if (c1
==0x0D && c2
==0x0A && p1
==(p2
-1)) {
1703 // Rule (GB4). ( Control | CR | LF ) <break>
1704 if (fControlSet
->contains(c1
) ||
1710 // Rule (GB5) <break> ( Control | CR | LF )
1712 if (fControlSet
->contains(c2
) ||
1719 // Rule (GB6) L x ( L | V | LV | LVT )
1720 if (fLSet
->contains(c1
) &&
1721 (fLSet
->contains(c2
) ||
1722 fVSet
->contains(c2
) ||
1723 fLVSet
->contains(c2
) ||
1724 fLVTSet
->contains(c2
))) {
1728 // Rule (GB7) ( LV | V ) x ( V | T )
1729 if ((fLVSet
->contains(c1
) || fVSet
->contains(c1
)) &&
1730 (fVSet
->contains(c2
) || fTSet
->contains(c2
))) {
1734 // Rule (GB8) ( LVT | T) x T
1735 if ((fLVTSet
->contains(c1
) || fTSet
->contains(c1
)) &&
1736 fTSet
->contains(c2
)) {
1740 // Rule (GB9) x (Extend | ZWJ)
1741 if (fExtendSet
->contains(c2
) || fZWJSet
->contains(c2
)) {
1742 if (!fExtendSet
->contains(c1
)) {
1748 // Rule (GB9a) x SpacingMark
1749 if (fSpacingSet
->contains(c2
)) {
1753 // Rule (GB9b) Prepend x
1754 if (fPrependSet
->contains(c1
)) {
1758 // Rule (GB11) Extended_Pictographic Extend * ZWJ x Extended_Pictographic
1759 if (fExtendedPictSet
->contains(cBase
) && fZWJSet
->contains(c1
) && fExtendedPictSet
->contains(c2
)) {
1763 // Rule (GB12-13) Regional_Indicator x Regional_Indicator
1764 // Note: The first if condition is a little tricky. We only need to force
1765 // a break if there are three or more contiguous RIs. If there are
1766 // only two, a break following will occur via other rules, and will include
1767 // any trailing extend characters, which is needed behavior.
1768 if (fRegionalIndicatorSet
->contains(c0
) && fRegionalIndicatorSet
->contains(c1
)
1769 && fRegionalIndicatorSet
->contains(c2
)) {
1772 if (fRegionalIndicatorSet
->contains(c1
) && fRegionalIndicatorSet
->contains(c2
)) {
1776 // Rule (GB999) Any <break> Any
1786 UVector
*RBBICharMonkey::charClasses() {
1791 RBBICharMonkey::~RBBICharMonkey() {
1796 delete fRegionalIndicatorSet
;
1807 delete fExtendedPictSet
;
1810 //------------------------------------------------------------------------------------------
1812 // class RBBIWordMonkey Word Break specific implementation
1813 // of RBBIMonkeyKind.
1815 //------------------------------------------------------------------------------------------
1816 class RBBIWordMonkey
: public RBBIMonkeyKind
{
1819 virtual ~RBBIWordMonkey();
1820 virtual UVector
*charClasses();
1821 virtual void setText(const UnicodeString
&s
);
1822 virtual int32_t next(int32_t i
);
1828 UnicodeSet
*fNewlineSet
;
1829 UnicodeSet
*fRegionalIndicatorSet
;
1830 UnicodeSet
*fKatakanaSet
;
1831 UnicodeSet
*fHebrew_LetterSet
;
1832 UnicodeSet
*fALetterSet
;
1833 UnicodeSet
*fSingle_QuoteSet
;
1834 UnicodeSet
*fDouble_QuoteSet
;
1835 UnicodeSet
*fMidNumLetSet
;
1836 UnicodeSet
*fMidLetterSet
;
1837 UnicodeSet
*fMidNumSet
;
1838 UnicodeSet
*fNumericSet
;
1839 UnicodeSet
*fFormatSet
;
1840 UnicodeSet
*fOtherSet
;
1841 UnicodeSet
*fExtendSet
;
1842 UnicodeSet
*fExtendNumLetSet
;
1843 UnicodeSet
*fWSegSpaceSet
;
1844 UnicodeSet
*fDictionarySet
;
1845 UnicodeSet
*fZWJSet
;
1846 UnicodeSet
*fExtendedPictSet
;
1848 const UnicodeString
*fText
;
1852 RBBIWordMonkey::RBBIWordMonkey()
1854 UErrorCode status
= U_ZERO_ERROR
;
1856 fSets
= new UVector(status
);
1858 fCRSet
= new UnicodeSet(u
"[\\p{Word_Break = CR}]", status
);
1859 fLFSet
= new UnicodeSet(u
"[\\p{Word_Break = LF}]", status
);
1860 fNewlineSet
= new UnicodeSet(u
"[\\p{Word_Break = Newline}]", status
);
1861 fKatakanaSet
= new UnicodeSet(u
"[\\p{Word_Break = Katakana}]", status
);
1862 fRegionalIndicatorSet
= new UnicodeSet(u
"[\\p{Word_Break = Regional_Indicator}]", status
);
1863 fHebrew_LetterSet
= new UnicodeSet(u
"[\\p{Word_Break = Hebrew_Letter}]", status
);
1864 fALetterSet
= new UnicodeSet(u
"[\\p{Word_Break = ALetter}]", status
);
1865 fSingle_QuoteSet
= new UnicodeSet(u
"[\\p{Word_Break = Single_Quote}]", status
);
1866 fDouble_QuoteSet
= new UnicodeSet(u
"[\\p{Word_Break = Double_Quote}]", status
);
1867 fMidNumLetSet
= new UnicodeSet(u
"[\\p{Word_Break = MidNumLet}]", status
);
1868 fMidLetterSet
= new UnicodeSet(u
"[\\p{Word_Break = MidLetter} - [\\:]]", status
);
1869 fMidNumSet
= new UnicodeSet(u
"[\\p{Word_Break = MidNum}]", status
);
1870 fNumericSet
= new UnicodeSet(u
"[\\p{Word_Break = Numeric}]", status
);
1871 fFormatSet
= new UnicodeSet(u
"[\\p{Word_Break = Format}]", status
);
1872 fExtendNumLetSet
= new UnicodeSet(u
"[\\p{Word_Break = ExtendNumLet}]", status
);
1873 fExtendSet
= new UnicodeSet(u
"[\\p{Word_Break = Extend}]", status
);
1874 fWSegSpaceSet
= new UnicodeSet(u
"[\\p{Word_Break = WSegSpace}]", status
);
1876 fZWJSet
= new UnicodeSet(u
"[\\p{Word_Break = ZWJ}]", status
);
1877 fExtendedPictSet
= new UnicodeSet(u
"[:Extended_Pictographic:]", status
);
1879 fDictionarySet
= new UnicodeSet(u
"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status
);
1880 fDictionarySet
->addAll(*fKatakanaSet
);
1881 fDictionarySet
->addAll(UnicodeSet(u
"[\\p{LineBreak = Complex_Context}]", status
));
1883 fALetterSet
->removeAll(*fDictionarySet
);
1885 fOtherSet
= new UnicodeSet();
1886 if(U_FAILURE(status
)) {
1887 IntlTest::gTest
->errln("%s:%d %s", __FILE__
, __LINE__
, u_errorName(status
));
1888 deferredStatus
= status
;
1892 fOtherSet
->complement();
1893 fOtherSet
->removeAll(*fCRSet
);
1894 fOtherSet
->removeAll(*fLFSet
);
1895 fOtherSet
->removeAll(*fNewlineSet
);
1896 fOtherSet
->removeAll(*fKatakanaSet
);
1897 fOtherSet
->removeAll(*fHebrew_LetterSet
);
1898 fOtherSet
->removeAll(*fALetterSet
);
1899 fOtherSet
->removeAll(*fSingle_QuoteSet
);
1900 fOtherSet
->removeAll(*fDouble_QuoteSet
);
1901 fOtherSet
->removeAll(*fMidLetterSet
);
1902 fOtherSet
->removeAll(*fMidNumSet
);
1903 fOtherSet
->removeAll(*fNumericSet
);
1904 fOtherSet
->removeAll(*fExtendNumLetSet
);
1905 fOtherSet
->removeAll(*fWSegSpaceSet
);
1906 fOtherSet
->removeAll(*fFormatSet
);
1907 fOtherSet
->removeAll(*fExtendSet
);
1908 fOtherSet
->removeAll(*fRegionalIndicatorSet
);
1909 fOtherSet
->removeAll(*fZWJSet
);
1910 fOtherSet
->removeAll(*fExtendedPictSet
);
1912 // Inhibit dictionary characters from being tested at all.
1913 fOtherSet
->removeAll(*fDictionarySet
);
1915 fSets
->addElement(fCRSet
, status
);
1916 fSets
->addElement(fLFSet
, status
);
1917 fSets
->addElement(fNewlineSet
, status
);
1918 fSets
->addElement(fRegionalIndicatorSet
, status
);
1919 fSets
->addElement(fHebrew_LetterSet
, status
);
1920 fSets
->addElement(fALetterSet
, status
);
1921 fSets
->addElement(fSingle_QuoteSet
, status
);
1922 fSets
->addElement(fDouble_QuoteSet
, status
);
1923 //fSets->addElement(fKatakanaSet, status); // Omit Katakana from fSets, which omits Katakana characters
1924 // from the test data. They are all in the dictionary set,
1925 // which this (old, to be retired) monkey test cannot handle.
1926 fSets
->addElement(fMidLetterSet
, status
);
1927 fSets
->addElement(fMidNumLetSet
, status
);
1928 fSets
->addElement(fMidNumSet
, status
);
1929 fSets
->addElement(fNumericSet
, status
);
1930 fSets
->addElement(fFormatSet
, status
);
1931 fSets
->addElement(fExtendSet
, status
);
1932 fSets
->addElement(fOtherSet
, status
);
1933 fSets
->addElement(fExtendNumLetSet
, status
);
1934 fSets
->addElement(fWSegSpaceSet
, status
);
1936 fSets
->addElement(fZWJSet
, status
);
1937 fSets
->addElement(fExtendedPictSet
, status
);
1939 if (U_FAILURE(status
)) {
1940 deferredStatus
= status
;
1944 void RBBIWordMonkey::setText(const UnicodeString
&s
) {
1949 int32_t RBBIWordMonkey::next(int32_t prevPos
) {
1950 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
1951 // break position being tested. The candidate break
1952 // location is before p2.
1956 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
1958 if (U_FAILURE(deferredStatus
)) {
1962 // Prev break at end of string. return DONE.
1963 if (prevPos
>= fText
->length()) {
1966 p0
= p1
= p2
= p3
= prevPos
;
1967 c3
= fText
->char32At(prevPos
);
1969 (void)p0
; // Suppress set but not used warning.
1971 // Loop runs once per "significant" character position in the input text.
1973 // Move all of the positions forward in the input string.
1978 // Advance p3 by X(Extend | Format)* Rule 4
1979 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
1981 p3
= fText
->moveIndex32(p3
, 1);
1982 c3
= fText
->char32At(p3
);
1983 if (fCRSet
->contains(c2
) || fLFSet
->contains(c2
) || fNewlineSet
->contains(c2
)) {
1987 while (fFormatSet
->contains(c3
) || fExtendSet
->contains(c3
) || fZWJSet
->contains(c3
));
1991 // Still warming up the loop. (won't work with zero length strings, but we don't care)
1994 if (p2
== fText
->length()) {
1995 // Reached end of string. Always a break position.
2000 // No Extend or Format characters may appear between the CR and LF,
2001 // which requires the additional check for p2 immediately following p1.
2003 if (c1
==0x0D && c2
==0x0A) {
2007 // Rule (3a) Break before and after newlines (including CR and LF)
2009 if (fCRSet
->contains(c1
) || fLFSet
->contains(c1
) || fNewlineSet
->contains(c1
)) {
2012 if (fCRSet
->contains(c2
) || fLFSet
->contains(c2
) || fNewlineSet
->contains(c2
)) {
2016 // Rule (3c) ZWJ x Extended_Pictographic
2017 // Not ignoring extend chars, so peek into input text to
2018 // get the potential ZWJ, the character immediately preceding c2.
2019 // Sloppy UChar32 indexing: p2-1 may reference trail half
2020 // but char32At will get the full code point.
2021 if (fZWJSet
->contains(fText
->char32At(p2
-1)) && fExtendedPictSet
->contains(c2
)) {
2025 // Rule (3d) Keep horizontal whitespace together.
2026 if (fWSegSpaceSet
->contains(fText
->char32At(p2
-1)) && fWSegSpaceSet
->contains(c2
)) {
2030 // Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
2031 if ((fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
)) &&
2032 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
))) {
2036 // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
2038 if ( (fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
)) &&
2039 (fMidLetterSet
->contains(c2
) || fMidNumLetSet
->contains(c2
) || fSingle_QuoteSet
->contains(c2
)) &&
2040 (fALetterSet
->contains(c3
) || fHebrew_LetterSet
->contains(c3
))) {
2044 // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)
2045 if ((fALetterSet
->contains(c0
) || fHebrew_LetterSet
->contains(c0
)) &&
2046 (fMidLetterSet
->contains(c1
) || fMidNumLetSet
->contains(c1
) || fSingle_QuoteSet
->contains(c1
)) &&
2047 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
))) {
2051 // Rule (7a) Hebrew_Letter x Single_Quote
2052 if (fHebrew_LetterSet
->contains(c1
) && fSingle_QuoteSet
->contains(c2
)) {
2056 // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter
2057 if (fHebrew_LetterSet
->contains(c1
) && fDouble_QuoteSet
->contains(c2
) && fHebrew_LetterSet
->contains(c3
)) {
2061 // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter
2062 if (fHebrew_LetterSet
->contains(c0
) && fDouble_QuoteSet
->contains(c1
) && fHebrew_LetterSet
->contains(c2
)) {
2066 // Rule (8) Numeric x Numeric
2067 if (fNumericSet
->contains(c1
) &&
2068 fNumericSet
->contains(c2
)) {
2072 // Rule (9) (ALetter | Hebrew_Letter) x Numeric
2073 if ((fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
)) &&
2074 fNumericSet
->contains(c2
)) {
2078 // Rule (10) Numeric x (ALetter | Hebrew_Letter)
2079 if (fNumericSet
->contains(c1
) &&
2080 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
))) {
2084 // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric
2085 if (fNumericSet
->contains(c0
) &&
2086 (fMidNumSet
->contains(c1
) || fMidNumLetSet
->contains(c1
) || fSingle_QuoteSet
->contains(c1
)) &&
2087 fNumericSet
->contains(c2
)) {
2091 // Rule (12) Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
2092 if (fNumericSet
->contains(c1
) &&
2093 (fMidNumSet
->contains(c2
) || fMidNumLetSet
->contains(c2
) || fSingle_QuoteSet
->contains(c2
)) &&
2094 fNumericSet
->contains(c3
)) {
2098 // Rule (13) Katakana x Katakana
2099 // Note: matches UAX 29 rules, but doesn't come into play for ICU because
2100 // all Katakana are handled by the dictionary breaker.
2101 if (fKatakanaSet
->contains(c1
) &&
2102 fKatakanaSet
->contains(c2
)) {
2106 // Rule 13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
2107 if ((fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
) ||fNumericSet
->contains(c1
) ||
2108 fKatakanaSet
->contains(c1
) || fExtendNumLetSet
->contains(c1
)) &&
2109 fExtendNumLetSet
->contains(c2
)) {
2113 // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
2114 if (fExtendNumLetSet
->contains(c1
) &&
2115 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
) ||
2116 fNumericSet
->contains(c2
) || fKatakanaSet
->contains(c2
))) {
2120 // Rule 15 - 17 Group pairs of Regional Indicators.
2121 if (fRegionalIndicatorSet
->contains(c0
) && fRegionalIndicatorSet
->contains(c1
)) {
2124 if (fRegionalIndicatorSet
->contains(c1
) && fRegionalIndicatorSet
->contains(c2
)) {
2128 // Rule 999. Break found here.
2137 UVector
*RBBIWordMonkey::charClasses() {
2142 RBBIWordMonkey::~RBBIWordMonkey() {
2147 delete fKatakanaSet
;
2148 delete fHebrew_LetterSet
;
2150 delete fSingle_QuoteSet
;
2151 delete fDouble_QuoteSet
;
2152 delete fMidNumLetSet
;
2153 delete fMidLetterSet
;
2158 delete fExtendNumLetSet
;
2159 delete fWSegSpaceSet
;
2160 delete fRegionalIndicatorSet
;
2161 delete fDictionarySet
;
2164 delete fExtendedPictSet
;
2170 //------------------------------------------------------------------------------------------
2172 // class RBBISentMonkey Sentence Break specific implementation
2173 // of RBBIMonkeyKind.
2175 //------------------------------------------------------------------------------------------
2176 class RBBISentMonkey
: public RBBIMonkeyKind
{
2179 virtual ~RBBISentMonkey();
2180 virtual UVector
*charClasses();
2181 virtual void setText(const UnicodeString
&s
);
2182 virtual int32_t next(int32_t i
);
2184 int moveBack(int posFrom
);
2185 int moveForward(int posFrom
);
2186 UChar32
cAt(int pos
);
2190 UnicodeSet
*fSepSet
;
2191 UnicodeSet
*fFormatSet
;
2193 UnicodeSet
*fLowerSet
;
2194 UnicodeSet
*fUpperSet
;
2195 UnicodeSet
*fOLetterSet
;
2196 UnicodeSet
*fNumericSet
;
2197 UnicodeSet
*fATermSet
;
2198 UnicodeSet
*fSContinueSet
;
2199 UnicodeSet
*fSTermSet
;
2200 UnicodeSet
*fCloseSet
;
2201 UnicodeSet
*fOtherSet
;
2202 UnicodeSet
*fExtendSet
;
2204 const UnicodeString
*fText
;
2208 RBBISentMonkey::RBBISentMonkey()
2210 UErrorCode status
= U_ZERO_ERROR
;
2212 fSets
= new UVector(status
);
2214 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2215 // set and made into character classes of their own. For the monkey impl,
2216 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2217 fSepSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status
);
2218 fFormatSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status
);
2219 fSpSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status
);
2220 fLowerSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status
);
2221 fUpperSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status
);
2222 fOLetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status
);
2223 fNumericSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status
);
2224 fATermSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status
);
2225 fSContinueSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status
);
2226 fSTermSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status
);
2227 fCloseSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status
);
2228 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status
);
2229 fOtherSet
= new UnicodeSet();
2231 if(U_FAILURE(status
)) {
2232 deferredStatus
= status
;
2236 fOtherSet
->complement();
2237 fOtherSet
->removeAll(*fSepSet
);
2238 fOtherSet
->removeAll(*fFormatSet
);
2239 fOtherSet
->removeAll(*fSpSet
);
2240 fOtherSet
->removeAll(*fLowerSet
);
2241 fOtherSet
->removeAll(*fUpperSet
);
2242 fOtherSet
->removeAll(*fOLetterSet
);
2243 fOtherSet
->removeAll(*fNumericSet
);
2244 fOtherSet
->removeAll(*fATermSet
);
2245 fOtherSet
->removeAll(*fSContinueSet
);
2246 fOtherSet
->removeAll(*fSTermSet
);
2247 fOtherSet
->removeAll(*fCloseSet
);
2248 fOtherSet
->removeAll(*fExtendSet
);
2250 fSets
->addElement(fSepSet
, status
);
2251 fSets
->addElement(fFormatSet
, status
);
2252 fSets
->addElement(fSpSet
, status
);
2253 fSets
->addElement(fLowerSet
, status
);
2254 fSets
->addElement(fUpperSet
, status
);
2255 fSets
->addElement(fOLetterSet
, status
);
2256 fSets
->addElement(fNumericSet
, status
);
2257 fSets
->addElement(fATermSet
, status
);
2258 fSets
->addElement(fSContinueSet
, status
);
2259 fSets
->addElement(fSTermSet
, status
);
2260 fSets
->addElement(fCloseSet
, status
);
2261 fSets
->addElement(fOtherSet
, status
);
2262 fSets
->addElement(fExtendSet
, status
);
2264 if (U_FAILURE(status
)) {
2265 deferredStatus
= status
;
2271 void RBBISentMonkey::setText(const UnicodeString
&s
) {
2275 UVector
*RBBISentMonkey::charClasses() {
2280 // moveBack() Find the "significant" code point preceding the index i.
2281 // Skips over ($Extend | $Format)* .
// Parameter i: the index to start from.
// The visible statements step a working index j back by one code point, read the
// code point there into c, and repeat while j is still above 0 and c belongs to
// fFormatSet or fExtendSet.
// NOTE(review): the declarations of j and c, the head of the loop and the return
// statement are not visible in this excerpt.
2283 int RBBISentMonkey::moveBack(int i
) {
// Move back by exactly one code point (delta of -1 code points, not code units).
2290 j
= fText
->moveIndex32(j
, -1);
2291 c
= fText
->char32At(j
);
// Keep skipping while not yet at index 0 and the character is Format or Extend.
2293 while (j
>0 &&(fFormatSet
->contains(c
) || fExtendSet
->contains(c
)));
// moveForward()  Forward-moving counterpart of moveBack().
//                Parameter i is the index to start from. An index at or past the
//                end of the text returns the text length immediately; otherwise a
//                working index j is advanced one code point at a time while the
//                character c is in fFormatSet or fExtendSet.
// NOTE(review): the initialisation of j, the assignment of c and the final return
// are not visible in this excerpt.
2299 int RBBISentMonkey::moveForward(int i
) {
// Already at (or beyond) the end: nothing to skip.
2300 if (i
>=fText
->length()) {
2301 return fText
->length();
// Advance by exactly one code point.
2306 j
= fText
->moveIndex32(j
, 1);
// Keep advancing over Format and Extend characters.
2309 while (fFormatSet
->contains(c
) || fExtendSet
->contains(c
));
// cAt()  Range-checked code point fetch from the text under test.
//        For an index inside [0, length) the result is fText->char32At(pos).
// NOTE(review): the value produced for an out-of-range pos is not visible in this
// excerpt; next() compares a cAt() result against -1, so that is presumably
// the sentinel - confirm against the full source.
2313 UChar32
RBBISentMonkey::cAt(int pos
) {
// Reject indexes before the start or at/after the end of the text.
2314 if (pos
<0 || pos
>=fText
->length()) {
2317 return fText
->char32At(pos
);
2321 int32_t RBBISentMonkey::next(int32_t prevPos
) {
2322 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2323 // break position being tested. The candidate break
2324 // location is before p2.
2328 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
2331 if (U_FAILURE(deferredStatus
)) {
2335 // Prev break at end of string. return DONE.
2336 if (prevPos
>= fText
->length()) {
2339 p0
= p1
= p2
= p3
= prevPos
;
2340 c3
= fText
->char32At(prevPos
);
2342 (void)p0
; // Suppress set but not used warning.
2344 // Loop runs once per "significant" character position in the input text.
2346 // Move all of the positions forward in the input string.
2351 // Advance p3 by X(Extend | Format)* Rule 4
2352 p3
= moveForward(p3
);
2356 if (c1
==0x0d && c2
==0x0a && p2
==(p1
+1)) {
2360 // Rule (4). Sep <break>
2361 if (fSepSet
->contains(c1
)) {
2362 p2
= p1
+1; // Separators don't combine with Extend or Format.
2366 if (p2
>= fText
->length()) {
2367 // Reached end of string. Always a break position.
2371 if (p2
== prevPos
) {
2372 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2376 // Rule (6). ATerm x Numeric
2377 if (fATermSet
->contains(c1
) && fNumericSet
->contains(c2
)) {
2381 // Rule (7). (Upper | Lower) ATerm x Upper
2382 if ((fUpperSet
->contains(c0
) || fLowerSet
->contains(c0
)) &&
2383 fATermSet
->contains(c1
) && fUpperSet
->contains(c2
)) {
2387 // Rule (8) ATerm Close* Sp* x (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
2388 // Note: STerm | ATerm are added to the negated part of the expression by a
2389 // note to the Unicode 5.0 documents.
2391 while (fSpSet
->contains(cAt(p8
))) {
2394 while (fCloseSet
->contains(cAt(p8
))) {
2397 if (fATermSet
->contains(cAt(p8
))) {
2401 if (c
==-1 || fOLetterSet
->contains(c
) || fUpperSet
->contains(c
) ||
2402 fLowerSet
->contains(c
) || fSepSet
->contains(c
) ||
2403 fATermSet
->contains(c
) || fSTermSet
->contains(c
)) {
2406 p8
= moveForward(p8
);
2408 if (fLowerSet
->contains(cAt(p8
))) {
2413 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2414 if (fSContinueSet
->contains(c2
) || fSTermSet
->contains(c2
) || fATermSet
->contains(c2
)) {
2416 while (fSpSet
->contains(cAt(p8
))) {
2419 while (fCloseSet
->contains(cAt(p8
))) {
2423 if (fSTermSet
->contains(c
) || fATermSet
->contains(c
)) {
2428 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
2430 while (fCloseSet
->contains(cAt(p9
))) {
2434 if ((fSTermSet
->contains(c
) || fATermSet
->contains(c
))) {
2435 if (fCloseSet
->contains(c2
) || fSpSet
->contains(c2
) || fSepSet
->contains(c2
)) {
2440 // Rule (10) (STerm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
2442 while (fSpSet
->contains(cAt(p10
))) {
2443 p10
= moveBack(p10
);
2445 while (fCloseSet
->contains(cAt(p10
))) {
2446 p10
= moveBack(p10
);
2448 if (fSTermSet
->contains(cAt(p10
)) || fATermSet
->contains(cAt(p10
))) {
2449 if (fSpSet
->contains(c2
) || fSepSet
->contains(c2
)) {
2454 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>
2456 if (fSepSet
->contains(cAt(p11
))) {
2457 p11
= moveBack(p11
);
2459 while (fSpSet
->contains(cAt(p11
))) {
2460 p11
= moveBack(p11
);
2462 while (fCloseSet
->contains(cAt(p11
))) {
2463 p11
= moveBack(p11
);
2465 if (fSTermSet
->contains(cAt(p11
)) || fATermSet
->contains(cAt(p11
))) {
2469 // Rule (12) Any x Any
2476 RBBISentMonkey::~RBBISentMonkey() {
2486 delete fSContinueSet
;
2495 //-------------------------------------------------------------------------------------------
2499 //-------------------------------------------------------------------------------------------
2501 class RBBILineMonkey
: public RBBIMonkeyKind
{
2504 virtual ~RBBILineMonkey();
2505 virtual UVector
*charClasses();
2506 virtual void setText(const UnicodeString
&s
);
2507 virtual int32_t next(int32_t i
);
2508 virtual void rule9Adjust(int32_t pos
, UChar32
*posChar
, int32_t *nextPos
, UChar32
*nextChar
);
2555 BreakIterator
*fCharBI
;
2556 const UnicodeString
*fText
;
2557 RegexMatcher
*fNumberMatcher
;
2560 RBBILineMonkey::RBBILineMonkey() :
2566 fNumberMatcher(NULL
)
2569 if (U_FAILURE(deferredStatus
)) {
2573 UErrorCode status
= U_ZERO_ERROR
;
2575 fSets
= new UVector(status
);
2577 fBK
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status
);
2578 fCR
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status
);
2579 fLF
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status
);
2580 fCM
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status
);
2581 fNL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status
);
2582 fWJ
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status
);
2583 fZW
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status
);
2584 fGL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status
);
2585 fCB
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status
);
2586 fSP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status
);
2587 fB2
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status
);
2588 fBA
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status
);
2589 fBB
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status
);
2590 fHY
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status
);
2591 fH2
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status
);
2592 fH3
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status
);
2593 fCL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status
);
2594 fCP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status
);
2595 fEX
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status
);
2596 fIN
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status
);
2597 fJL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status
);
2598 fJV
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status
);
2599 fJT
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status
);
2600 fNS
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status
);
2601 fOP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status
);
2602 fQU
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status
);
2603 fIS
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status
);
2604 fNU
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status
);
2605 fPO
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status
);
2606 fPR
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status
);
2607 fSY
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status
);
2608 fAI
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status
);
2609 fAL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status
);
2610 fCJ
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status
);
2611 fHL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status
);
2612 fID
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status
);
2613 fRI
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status
);
2614 fSG
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status
);
2615 fXX
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status
);
2616 fEB
= new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]"), status
);
2617 fEM
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status
);
2618 fZJ
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status
);
2620 if (U_FAILURE(status
)) {
2621 deferredStatus
= status
;
2625 fAL
->addAll(*fXX
); // Default behavior for XX is identical to AL
2626 fAL
->addAll(*fAI
); // Default behavior for AI is identical to AL
2627 fAL
->addAll(*fSG
); // Default behavior for SG is identical to AL.
2629 fNS
->addAll(*fCJ
); // Default behavior for CJ is identical to NS.
2630 fCM
->addAll(*fZJ
); // ZWJ behaves as a CM.
2632 fSets
->addElement(fBK
, status
);
2633 fSets
->addElement(fCR
, status
);
2634 fSets
->addElement(fLF
, status
);
2635 fSets
->addElement(fCM
, status
);
2636 fSets
->addElement(fNL
, status
);
2637 fSets
->addElement(fWJ
, status
);
2638 fSets
->addElement(fZW
, status
);
2639 fSets
->addElement(fGL
, status
);
2640 fSets
->addElement(fCB
, status
);
2641 fSets
->addElement(fSP
, status
);
2642 fSets
->addElement(fB2
, status
);
2643 fSets
->addElement(fBA
, status
);
2644 fSets
->addElement(fBB
, status
);
2645 fSets
->addElement(fHY
, status
);
2646 fSets
->addElement(fH2
, status
);
2647 fSets
->addElement(fH3
, status
);
2648 fSets
->addElement(fCL
, status
);
2649 fSets
->addElement(fCP
, status
);
2650 fSets
->addElement(fEX
, status
);
2651 fSets
->addElement(fIN
, status
);
2652 fSets
->addElement(fJL
, status
);
2653 fSets
->addElement(fJT
, status
);
2654 fSets
->addElement(fJV
, status
);
2655 fSets
->addElement(fNS
, status
);
2656 fSets
->addElement(fOP
, status
);
2657 fSets
->addElement(fQU
, status
);
2658 fSets
->addElement(fIS
, status
);
2659 fSets
->addElement(fNU
, status
);
2660 fSets
->addElement(fPO
, status
);
2661 fSets
->addElement(fPR
, status
);
2662 fSets
->addElement(fSY
, status
);
2663 fSets
->addElement(fAI
, status
);
2664 fSets
->addElement(fAL
, status
);
2665 fSets
->addElement(fHL
, status
);
2666 fSets
->addElement(fID
, status
);
2667 fSets
->addElement(fWJ
, status
);
2668 fSets
->addElement(fRI
, status
);
2669 fSets
->addElement(fSG
, status
);
2670 fSets
->addElement(fEB
, status
);
2671 fSets
->addElement(fEM
, status
);
2672 fSets
->addElement(fZJ
, status
);
2676 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
2677 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
2678 "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
2679 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
2680 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
2681 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
2683 fNumberMatcher
= new RegexMatcher(
2684 UnicodeString(rules
, -1, US_INV
), 0, status
);
2686 fCharBI
= BreakIterator::createCharacterInstance(Locale::getEnglish(), status
);
2688 if (U_FAILURE(status
)) {
2689 deferredStatus
= status
;
// setText()  Point the line monkey at a new string to analyse.
//            The visible statements forward the string to the character break
//            iterator (fCharBI) and reset the number regex matcher
//            (fNumberMatcher) onto it.
// NOTE(review): at least one statement of this body is not visible in this
// excerpt (presumably the one that stores the string in fText).
2694 void RBBILineMonkey::setText(const UnicodeString
&s
) {
2696 fCharBI
->setText(s
);
// Re-target the LB 25 number pattern at the new input.
2697 fNumberMatcher
->reset(s
);
2702 // Line Break TR rules 9 and 10 implementation.
2703 // This deals with combining marks and other sequences that
2704 // must be treated as if they were something other than what they actually are.
2706 // This is factored out into a separate function because it must be applied twice for
2707 // each potential break, once to the chars before the position being checked, then
2708 // again to the text following the possible break.
// Parameters:
//   pos       Index of the character being adjusted. The comment below indicates
//             that an invalid value is possible during the warm-up iteration of
//             next(); the check itself is not visible in this excerpt.
//   posChar   In/out: the character at pos. It is tested against SP, BK, CR, LF,
//             NL and ZW to decide whether combining marks may attach to it, and
//             against CM for the LB 10 case.
//   nextPos   In/out: index of the character following pos; copied into the local
//             nPos, which is advanced over CM characters.
//   nextChar  Out: receives the character found at the (possibly advanced) nPos.
// NOTE(review): the loop head, the LB 10 replacement of *posChar and the write-back
// to *nextPos are not visible in this excerpt.
2710 void RBBILineMonkey::rule9Adjust(int32_t pos
, UChar32
*posChar
, int32_t *nextPos
, UChar32
*nextChar
) {
2712 // Invalid initial position. Happens during the warmup iteration of the
2713 // main loop in next().
// Work on a local copy of the caller's next-position.
2717 int32_t nPos
= *nextPos
;
2719 // LB 9 Keep combining sequences together.
2720 // advance over any CM class chars. Note that Line Break CM is different
2721 // from the normal Grapheme Extend property.
// Combining marks are only absorbed when the base character is none of
// SP, BK, CR, LF, NL or ZW.
2722 if (!(fSP
->contains(*posChar
) || fBK
->contains(*posChar
) || *posChar
==0x0d ||
2723 *posChar
==0x0a ||fNL
->contains(*posChar
) || fZW
->contains(*posChar
))) {
2725 *nextChar
= fText
->char32At(nPos
);
// A non-CM character is tested for here; the branch body is not visible.
2726 if (!fCM
->contains(*nextChar
)) {
// Step over one more code point of the combining sequence.
2729 nPos
= fText
->moveIndex32(nPos
, 1);
2734 // LB 9 Treat X CM* as if it were x.
2735 // No explicit action required.
2737 // LB 10 Treat any remaining combining mark as AL
2738 if (fCM
->contains(*posChar
)) {
2742 // Push the updated nextPos and nextChar back to our caller.
2743 // This only makes a difference if posChar got bigger by consuming a
2744 // combining sequence.
2746 *nextChar
= fText
->char32At(nPos
);
2751 int32_t RBBILineMonkey::next(int32_t startPos
) {
2752 UErrorCode status
= U_ZERO_ERROR
;
2753 int32_t pos
; // Index of the char following a potential break position
2754 UChar32 thisChar
; // Character at above position "pos"
2756 int32_t prevPos
; // Index of the char preceding a potential break position
2757 UChar32 prevChar
; // Character at above position. Note that prevChar
2758 // and thisChar may not be adjacent because combining
2759 // characters between them will be ignored.
2761 int32_t prevPosX2
; // Second previous character. Wider context for LB21a.
2764 int32_t nextPos
; // Index of the next character following pos.
2765 // Usually skips over combining marks.
2766 int32_t nextCPPos
; // Index of the code point following "pos."
2767 // May point to a combining mark.
2768 int32_t tPos
; // temp value.
2771 if (U_FAILURE(deferredStatus
)) {
2775 if (startPos
>= fText
->length()) {
2780 // Initial values for loop. Loop will run the first time without finding breaks,
2781 // while the invalid values shift out and the "this" and
2782 // "prev" positions are filled in with good values.
2783 pos
= prevPos
= prevPosX2
= -1; // Invalid value, serves as flag for initial loop iteration.
2784 thisChar
= prevChar
= prevCharX2
= 0;
2785 nextPos
= nextCPPos
= startPos
;
2788 // Loop runs once per position in the test text, until a break position
2791 prevPosX2
= prevPos
;
2792 prevCharX2
= prevChar
;
2795 prevChar
= thisChar
;
2798 thisChar
= fText
->char32At(pos
);
2800 nextCPPos
= fText
->moveIndex32(pos
, 1);
2801 nextPos
= nextCPPos
;
2803 // Rule LB2 - Break at end of text.
2804 if (pos
>= fText
->length()) {
2808 // Rule LB 9 - adjust for combining sequences.
2809 // We do this one out-of-order because the adjustment does not change anything
2810 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
2812 rule9Adjust(prevPos
, &prevChar
, &pos
, &thisChar
);
2813 nextCPPos
= nextPos
= fText
->moveIndex32(pos
, 1);
2814 c
= fText
->char32At(nextPos
);
2815 rule9Adjust(pos
, &thisChar
, &nextPos
, &c
);
2817 // If the loop is still warming up - if we haven't shifted the initial
2818 // -1 positions out of prevPos yet - loop back to advance the
2819 // position in the input without any further looking for breaks.
2820 if (prevPos
== -1) {
2824 // LB 4 Always break after hard line breaks,
2825 if (fBK
->contains(prevChar
)) {
2829 // LB 5 Break after CR, LF, NL, but not inside CR LF
2830 if (prevChar
== 0x0d && thisChar
== 0x0a) {
2833 if (prevChar
== 0x0d ||
2839 // LB 6 Don't break before hard line breaks
2840 if (thisChar
== 0x0d || thisChar
== 0x0a || thisChar
== 0x85 ||
2841 fBK
->contains(thisChar
)) {
2846 // LB 7 Don't break before spaces or zero-width space.
2847 if (fSP
->contains(thisChar
)) {
2851 if (fZW
->contains(thisChar
)) {
2855 // LB 8 Break after zero width space
2856 if (fZW
->contains(prevChar
)) {
2861 // Move this test up, before LB8a, because numbers can match a longer sequence that would
2862 // also match 8a. e.g. NU ZWJ IS PO (ZWJ acts like CM)
2863 if (fNumberMatcher
->lookingAt(prevPos
, status
)) {
2864 if (U_FAILURE(status
)) {
2867 // Matched a number. But could have been just a single digit, which would
2868 // not represent a "no break here" between prevChar and thisChar
2869 int32_t numEndIdx
= fNumberMatcher
->end(status
); // idx of first char following num
2870 if (numEndIdx
> pos
) {
2871 // Number match includes at least our two chars being checked
2872 if (numEndIdx
> nextPos
) {
2873 // Number match includes additional chars. Update pos and nextPos
2874 // so that next loop iteration will continue at the end of the number,
2875 // checking for breaks between last char in number & whatever follows.
2876 pos
= nextPos
= numEndIdx
;
2878 pos
= fText
->moveIndex32(pos
, -1);
2879 thisChar
= fText
->char32At(pos
);
2880 } while (fCM
->contains(thisChar
));
2887 // The monkey test's way of ignoring combining characters doesn't work
2888 // for this rule. ZJ is also a CM. Need to get the actual character
2889 // preceding "thisChar", not ignoring combining marks, possibly ZJ.
2891 int32_t prevIdx
= fText
->moveIndex32(pos
, -1);
2892 UChar32 prevC
= fText
->char32At(prevIdx
);
2893 if (fZJ
->contains(prevC
)) {
2898 // LB 9, 10 Already done, at top of loop.
2902 // LB 11 Do not break before or after WORD JOINER and related characters.
2906 if (fWJ
->contains(thisChar
) || fWJ
->contains(prevChar
)) {
2912 if (fGL
->contains(prevChar
)) {
2918 if (!(fSP
->contains(prevChar
) ||
2919 fBA
->contains(prevChar
) ||
2920 fHY
->contains(prevChar
) ) && fGL
->contains(thisChar
)) {
2926 // LB 13 Don't break before closings.
2927 // NU x CL, NU x CP and NU x IS are not matched here so that they will
2928 // fall into LB 17 and the more general number regular expression.
2930 if ((!fNU
->contains(prevChar
) && fCL
->contains(thisChar
)) ||
2931 (!fNU
->contains(prevChar
) && fCP
->contains(thisChar
)) ||
2932 fEX
->contains(thisChar
) ||
2933 (!fNU
->contains(prevChar
) && fIS
->contains(thisChar
)) ||
2934 (!fNU
->contains(prevChar
) && fSY
->contains(thisChar
))) {
2938 // LB 14 Don't break after OP SP*
2939 // Scan backwards, checking for this sequence.
2940 // The OP char could include combining marks, so we actually check for
2942 // Another Twist: The Rule 67 fixes may have changed a SP CM
2943 // sequence into a ID char, so before scanning back through spaces,
2944 // verify that prevChar is indeed a space. The prevChar variable
2945 // may differ from fText[prevPos]
2947 if (fSP
->contains(prevChar
)) {
2948 while (tPos
> 0 && fSP
->contains(fText
->char32At(tPos
))) {
2949 tPos
=fText
->moveIndex32(tPos
, -1);
2952 while (tPos
> 0 && fCM
->contains(fText
->char32At(tPos
))) {
2953 tPos
=fText
->moveIndex32(tPos
, -1);
2955 if (fOP
->contains(fText
->char32At(tPos
))) {
2960 // LB 15 QU SP* x OP
2961 if (fOP
->contains(thisChar
)) {
2962 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
2964 while (tPos
>0 && fSP
->contains(fText
->char32At(tPos
))) {
2965 tPos
= fText
->moveIndex32(tPos
, -1);
2967 while (tPos
>0 && fCM
->contains(fText
->char32At(tPos
))) {
2968 tPos
= fText
->moveIndex32(tPos
, -1);
2970 if (fQU
->contains(fText
->char32At(tPos
))) {
2977 // LB 16 (CL | CP) SP* x NS
2978 // Scan backwards for SP* CM* (CL | CP)
2979 if (fNS
->contains(thisChar
)) {
2981 while (tPos
>0 && fSP
->contains(fText
->char32At(tPos
))) {
2982 tPos
= fText
->moveIndex32(tPos
, -1);
2984 while (tPos
>0 && fCM
->contains(fText
->char32At(tPos
))) {
2985 tPos
= fText
->moveIndex32(tPos
, -1);
2987 if (fCL
->contains(fText
->char32At(tPos
)) || fCP
->contains(fText
->char32At(tPos
))) {
2993 // LB 17 B2 SP* x B2
2994 if (fB2
->contains(thisChar
)) {
2995 // Scan backwards, checking for the B2 CM* SP* sequence.
2997 if (fSP
->contains(prevChar
)) {
2998 while (tPos
> 0 && fSP
->contains(fText
->char32At(tPos
))) {
2999 tPos
=fText
->moveIndex32(tPos
, -1);
3002 while (tPos
> 0 && fCM
->contains(fText
->char32At(tPos
))) {
3003 tPos
=fText
->moveIndex32(tPos
, -1);
3005 if (fB2
->contains(fText
->char32At(tPos
))) {
3011 // LB 18 break after space
3012 if (fSP
->contains(prevChar
)) {
3019 if (fQU
->contains(thisChar
) || fQU
->contains(prevChar
)) {
3023 // LB 20 Break around a CB
3024 if (fCB
->contains(thisChar
) || fCB
->contains(prevChar
)) {
3029 if (fBA
->contains(thisChar
) ||
3030 fHY
->contains(thisChar
) ||
3031 fNS
->contains(thisChar
) ||
3032 fBB
->contains(prevChar
) ) {
3038 if (fHL
->contains(prevCharX2
) &&
3039 (fHY
->contains(prevChar
) || fBA
->contains(prevChar
))) {
3045 if (fSY
->contains(prevChar
) && fHL
->contains(thisChar
)) {
3050 if ((fAL
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3051 (fEX
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3052 (fHL
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3053 ((fID
->contains(prevChar
) || fEB
->contains(prevChar
) || fEM
->contains(prevChar
)) && fIN
->contains(thisChar
)) ||
3054 (fIN
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3055 (fNU
->contains(prevChar
) && fIN
->contains(thisChar
)) ) {
3060 // LB 23 (AL | HL) x NU
3062 if ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
)) && fNU
->contains(thisChar
)) {
3065 if (fNU
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) {
3069 // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3070 // PR x (ID | EB | EM)
3071 // (ID | EB | EM) x PO
3072 if (fPR
->contains(prevChar
) &&
3073 (fID
->contains(thisChar
) || fEB
->contains(thisChar
) || fEM
->contains(thisChar
))) {
3076 if ((fID
->contains(prevChar
) || fEB
->contains(prevChar
) || fEM
->contains(prevChar
)) &&
3077 fPO
->contains(thisChar
)) {
3081 // LB 24 Do not break between prefix and letters or ideographs.
3082 // (PR | PO) x (AL | HL)
3083 // (AL | HL) x (PR | PO)
3084 if ((fPR
->contains(prevChar
) || fPO
->contains(prevChar
)) &&
3085 (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) {
3088 if ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
)) &&
3089 (fPR
->contains(thisChar
) || fPO
->contains(thisChar
))) {
3093 // LB 25 numbers match, moved up, before LB 8a,
3095 // LB 26 Do not break a Korean syllable.
3096 if (fJL
->contains(prevChar
) && (fJL
->contains(thisChar
) ||
3097 fJV
->contains(thisChar
) ||
3098 fH2
->contains(thisChar
) ||
3099 fH3
->contains(thisChar
))) {
3103 if ((fJV
->contains(prevChar
) || fH2
->contains(prevChar
)) &&
3104 (fJV
->contains(thisChar
) || fJT
->contains(thisChar
))) {
3108 if ((fJT
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3109 fJT
->contains(thisChar
)) {
3113 // LB 27 Treat a Korean Syllable Block the same as ID.
3114 if ((fJL
->contains(prevChar
) || fJV
->contains(prevChar
) ||
3115 fJT
->contains(prevChar
) || fH2
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3116 fIN
->contains(thisChar
)) {
3119 if ((fJL
->contains(prevChar
) || fJV
->contains(prevChar
) ||
3120 fJT
->contains(prevChar
) || fH2
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3121 fPO
->contains(thisChar
)) {
3124 if (fPR
->contains(prevChar
) && (fJL
->contains(thisChar
) || fJV
->contains(thisChar
) ||
3125 fJT
->contains(thisChar
) || fH2
->contains(thisChar
) || fH3
->contains(thisChar
))) {
3131 // LB 28 Do not break between alphabetics ("at").
3132 if ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
)) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) {
3136 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
3137 if (fIS
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) {
3141 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3144 if ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
) || fNU
->contains(prevChar
)) && fOP
->contains(thisChar
)) {
3147 if (fCP
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
) || fNU
->contains(thisChar
))) {
3151 // LB30a RI RI <break> RI
3153 if (fRI
->contains(prevCharX2
) && fRI
->contains(prevChar
) && fRI
->contains(thisChar
)) {
3156 if (fRI
->contains(prevChar
) && fRI
->contains(thisChar
)) {
3160 // LB30b Emoji Base x Emoji Modifier
3161 if (fEB
->contains(prevChar
) && fEM
->contains(thisChar
)) {
3165 // LB 31 Break everywhere else
3174 UVector
*RBBILineMonkey::charClasses() {
3179 RBBILineMonkey::~RBBILineMonkey() {
3226 delete fNumberMatcher
;
3230 //-------------------------------------------------------------------------------------------
3235 // seed=nnnnn Random number starting seed.
3236 // Setting the seed allows errors to be reproduced.
3237 // loop=nnn Looping count. Controls running time.
3239 // 0 or greater: run length.
3241 // type = char | word | line | sent | title
3244 // intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3246 //-------------------------------------------------------------------------------------------
3248 static int32_t getIntParam(UnicodeString name
, UnicodeString
¶ms
, int32_t defaultVal
) {
3249 int32_t val
= defaultVal
;
3250 name
.append(" *= *(-?\\d+)");
3251 UErrorCode status
= U_ZERO_ERROR
;
3252 RegexMatcher
m(name
, params
, 0, status
);
3254 // The param exists. Convert the string to an int.
3255 char valString
[100];
3256 int32_t paramLength
= m
.end(1, status
) - m
.start(1, status
);
3257 if (paramLength
>= (int32_t)(sizeof(valString
)-1)) {
3258 paramLength
= (int32_t)(sizeof(valString
)-2);
3260 params
.extract(m
.start(1, status
), paramLength
, valString
, sizeof(valString
));
3261 val
= strtol(valString
, NULL
, 10);
3263 // Delete this parameter from the params string.
3265 params
= m
.replaceFirst("", status
);
3267 U_ASSERT(U_SUCCESS(status
));
3272 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3273 static void testBreakBoundPreceding(RBBITest
*test
, UnicodeString ustr
,
3282 for (i
= bi
->first(); i
!= BreakIterator::DONE
; i
= bi
->next()) {
3284 if (count
< expectedcount
&& expected
[count
] != i
) {
3285 test
->errln("%s:%d break forward test failed: expected %d but got %d",
3286 __FILE__
, __LINE__
, expected
[count
], i
);
3291 if (count
!= expectedcount
) {
3292 printStringBreaks(ustr
, expected
, expectedcount
);
3293 test
->errln("%s:%d break forward test failed: missed %d match",
3294 __FILE__
, __LINE__
, expectedcount
- count
);
3297 // testing boundaries
3298 for (i
= 1; i
< expectedcount
; i
++) {
3299 int j
= expected
[i
- 1];
3300 if (!bi
->isBoundary(j
)) {
3301 printStringBreaks(ustr
, expected
, expectedcount
);
3302 test
->errln("%s:%d isBoundary() failed. Expected boundary at position %d",
3303 __FILE__
, __LINE__
, j
);
3306 for (j
= expected
[i
- 1] + 1; j
< expected
[i
]; j
++) {
3307 if (bi
->isBoundary(j
)) {
3308 printStringBreaks(ustr
, expected
, expectedcount
);
3309 test
->errln("%s:%d isBoundary() failed. Not expecting boundary at position %d",
3310 __FILE__
, __LINE__
, j
);
3316 for (i
= bi
->last(); i
!= BreakIterator::DONE
; i
= bi
->previous()) {
3318 if (forward
[count
] != i
) {
3319 printStringBreaks(ustr
, expected
, expectedcount
);
3320 test
->errln("%s:%d happy break test previous() failed: expected %d but got %d",
3321 __FILE__
, __LINE__
, forward
[count
], i
);
3326 printStringBreaks(ustr
, expected
, expectedcount
);
3327 test
->errln("break test previous() failed: missed a match");
3331 // testing preceding
3332 for (i
= 0; i
< expectedcount
- 1; i
++) {
3333 // int j = expected[i] + 1;
3334 int j
= ustr
.moveIndex32(expected
[i
], 1);
3335 for (; j
<= expected
[i
+ 1]; j
++) {
3336 int32_t expectedPreceding
= expected
[i
];
3337 int32_t actualPreceding
= bi
->preceding(j
);
3338 if (actualPreceding
!= expectedPreceding
) {
3339 printStringBreaks(ustr
, expected
, expectedcount
);
3340 test
->errln("%s:%d preceding(%d): expected %d, got %d",
3341 __FILE__
, __LINE__
, j
, expectedPreceding
, actualPreceding
);
3349 void RBBITest::TestWordBreaks(void)
3351 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3353 Locale
locale("en");
3354 UErrorCode status
= U_ZERO_ERROR
;
3355 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3356 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
3357 // Replaced any C+J characters in a row with a random sequence of characters
3358 // of the same length to make our C+J segmentation not get in the way.
3359 static const char *strlist
[] =
3361 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3362 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3363 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3364 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3365 "\\uac00\\u3588\\u009c\\u0953\\u194b",
3366 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3367 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3368 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3369 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3370 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3371 "\\u2027\\U000e0067\\u0a47\\u00b7",
3372 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3373 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3374 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3375 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3376 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3377 "\\u0027\\u11af\\U000e0057\\u0602",
3378 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3379 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3380 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3381 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3382 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3383 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3384 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3385 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3386 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3387 "\\u18f4\\U000e0049\\u20e7\\u2027",
3388 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3389 "\\ua183\\u102d\\u0bec\\u003a",
3390 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3391 "\\u003a\\u0e57\\u0fad\\u002e",
3392 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3393 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3394 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3395 "\\u003a\\u0664\\u00b7\\u1fba",
3396 "\\u003b\\u0027\\u00b7\\u47a3",
3397 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3398 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3399 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3402 if (U_FAILURE(status
)) {
3403 errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
));
3406 for (loop
= 0; loop
< UPRV_LENGTHOF(strlist
); loop
++) {
3407 // printf("looping %d\n", loop);
3408 UnicodeString ustr
= CharsToUnicodeString(strlist
[loop
]);
3409 // RBBICharMonkey monkey;
3410 RBBIWordMonkey monkey
;
3413 int expectedcount
= 0;
3415 monkey
.setText(ustr
);
3417 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
3418 expected
[expectedcount
++] = i
;
3421 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
// Word-boundary consistency test.
// For each escaped string in strlist, walks the English word BreakIterator
// forward with first()/next() and cross-checks the result with isBoundary():
// every offset returned by the iteration must be reported as a boundary, and
// no offset between prev + 1 and the new boundary may be reported as one.
3427 void RBBITest::TestWordBoundary(void)
3429 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3430 Locale
locale("en");
3431 UErrorCode status
= U_ZERO_ERROR
;
3432 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3433 LocalPointer
<BreakIterator
> bi(BreakIterator::createWordInstance(locale
, status
), status
);
3434 if (U_FAILURE(status
)) {
3435 errcheckln(status
, "%s:%d Creation of break iterator failed %s",
3436 __FILE__
, __LINE__
, u_errorName(status
));
// Test inputs, written with \u / \U escapes that u_unescape() expands below.
3440 static const char *strlist
[] =
3442 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3443 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3444 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3445 "\\u2027\\U000e0067\\u0a47\\u00b7",
3446 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3447 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3448 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3449 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3450 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3451 "\\u0027\\u11af\\U000e0057\\u0602",
3452 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3453 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3454 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3455 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3456 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3457 "\\U000e0065\\u302c\\u09ee\\U000e0068",
3458 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3459 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3460 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3461 "\\u58f4\\U000e0049\\u20e7\\u2027",
3462 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3463 "\\ua183\\u102d\\u0bec\\u003a",
3464 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3465 "\\u003a\\u0e57\\u0fad\\u002e",
3466 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3467 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3468 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3469 "\\u003a\\u0664\\u00b7\\u1fba",
3470 "\\u003b\\u0027\\u00b7\\u47a3",
3473 for (loop
= 0; loop
< UPRV_LENGTHOF(strlist
); loop
++) {
3474 u_unescape(strlist
[loop
], str
, UPRV_LENGTHOF(str
));
3475 UnicodeString
ustr(str
);
// Forward walk; each boundary found is recorded in forward[] for diagnostics.
3481 for (int32_t boundary
= bi
->first(); boundary
!= BreakIterator::DONE
; boundary
= bi
->next()) {
// Guard against overrunning the fixed-size forward[] array.
3483 if (count
>= UPRV_LENGTHOF(forward
)) {
3484 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
3485 __FILE__
, __LINE__
, loop
, count
, boundary
);
3488 forward
[count
] = boundary
;
// next() must return strictly increasing offsets.
3489 if (boundary
<= prev
) {
3490 errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
3491 __FILE__
, __LINE__
, loop
, prev
, boundary
);
// No offset strictly between prev and boundary may be reported as a boundary.
3494 for (int32_t nonBoundary
= prev
+ 1; nonBoundary
< boundary
; nonBoundary
++) {
3495 if (bi
->isBoundary(nonBoundary
)) {
3496 printStringBreaks(ustr
, forward
, count
);
3497 errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
3498 __FILE__
, __LINE__
, loop
, prev
, nonBoundary
, boundary
);
// The offset returned by next() must itself test as a boundary.
3502 if (!bi
->isBoundary(boundary
)) {
3503 printStringBreaks(ustr
, forward
, count
);
3504 errln("%s:%d happy boundary test failed: expected %d a boundary",
3505 __FILE__
, __LINE__
, boundary
);
// Line-break regression test.
// For each escaped string in strlist, the expected break positions are
// produced by stepping RBBILineMonkey::next() from offset 0 until DONE, and
// are then passed, together with the English line BreakIterator, to
// testBreakBoundPreceding() for comparison.
3513 void RBBITest::TestLineBreaks(void)
3515 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3516 Locale
locale("en");
3517 UErrorCode status
= U_ZERO_ERROR
;
3518 BreakIterator
*bi
= BreakIterator::createLineInstance(locale
, status
);
// Capacity passed to u_unescape() for the unescaped test string.
3519 const int32_t STRSIZE
= 50;
3521 static const char *strlist
[] =
3523 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3524 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3525 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3526 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3527 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3528 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3529 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3530 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3531 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3532 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3533 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3534 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3535 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3536 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3537 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3538 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3539 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3540 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3541 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3542 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3543 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3544 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3545 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3546 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3547 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3548 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3549 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3550 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3551 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3552 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3553 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3554 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3555 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3556 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3557 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3558 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3559 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3560 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3561 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3562 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
// Report a failed iterator creation before running any test string.
3565 TEST_ASSERT_SUCCESS(status
);
3566 if (U_FAILURE(status
)) {
3569 for (loop
= 0; loop
< UPRV_LENGTHOF(strlist
); loop
++) {
3570 // printf("looping %d\n", loop);
// t receives the length reported by u_unescape() for this string.
3571 int32_t t
= u_unescape(strlist
[loop
], str
, STRSIZE
);
3578 UnicodeString
ustr(str
);
3579 RBBILineMonkey monkey
;
// The monkey records construction errors in deferredStatus.
3580 if (U_FAILURE(monkey
.deferredStatus
)) {
3584 const int EXPECTEDSIZE
= 50;
3585 int expected
[EXPECTEDSIZE
];
3586 int expectedcount
= 0;
3588 monkey
.setText(ustr
);
// Collect the reference break positions, starting with offset 0.
3590 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
// Guard against overrunning the fixed-size expected[] array.
3591 if (expectedcount
>= EXPECTEDSIZE
) {
3592 TEST_ASSERT(expectedcount
< EXPECTEDSIZE
);
3595 expected
[expectedcount
++] = i
;
3598 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
// Sentence-break regression test.
// For each string in strlist (unescaped with u_unescape()), the expected break
// positions are produced by stepping RBBISentMonkey::next() from offset 0
// until DONE, and are then passed, together with the English sentence
// BreakIterator, to testBreakBoundPreceding() for comparison.
3604 void RBBITest::TestSentBreaks(void)
3606 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3607 Locale
locale("en");
3608 UErrorCode status
= U_ZERO_ERROR
;
3609 BreakIterator
*bi
= BreakIterator::createSentenceInstance(locale
, status
);
3611 static const char *strlist
[] =
3613 "Now\ris\nthe\r\ntime\n\rfor\r\r",
3615 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3616 "\"Sentence ending with a quote.\" Bye.",
3617 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
3618 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3619 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3620 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3621 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3622 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3623 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3624 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3625 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3626 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3627 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3628 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3629 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3630 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3631 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3632 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
// Report a failed iterator creation before running any test string.
3635 if (U_FAILURE(status
)) {
3636 errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
));
3639 for (loop
= 0; loop
< UPRV_LENGTHOF(strlist
); loop
++) {
3640 u_unescape(strlist
[loop
], str
, UPRV_LENGTHOF(str
));
3641 UnicodeString
ustr(str
);
3643 RBBISentMonkey monkey
;
// The monkey records construction errors in deferredStatus.
3644 if (U_FAILURE(monkey
.deferredStatus
)) {
3648 const int EXPECTEDSIZE
= 50;
3649 int expected
[EXPECTEDSIZE
];
3650 int expectedcount
= 0;
3652 monkey
.setText(ustr
);
// Collect the reference break positions, starting with offset 0.
3654 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
// Guard against overrunning the fixed-size expected[] array.
3655 if (expectedcount
>= EXPECTEDSIZE
) {
3656 TEST_ASSERT(expectedcount
< EXPECTEDSIZE
);
3659 expected
[expectedcount
++] = i
;
3662 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
3668 void RBBITest::TestMonkey() {
3669 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3671 UErrorCode status
= U_ZERO_ERROR
;
3672 int32_t loopCount
= 500;
3674 UnicodeString breakType
= "all";
3675 Locale
locale("en");
3676 UBool useUText
= FALSE
;
3678 if (quick
== FALSE
) {
3683 UnicodeString
p(fTestParams
);
3684 loopCount
= getIntParam("loop", p
, loopCount
);
3685 seed
= getIntParam("seed", p
, seed
);
3687 RegexMatcher
m(" *type *= *(char|word|line|sent|title) *", p
, 0, status
);
3689 breakType
= m
.group(1, status
);
3691 p
= m
.replaceFirst("", status
);
3694 RegexMatcher
u(" *utext", p
, 0, status
);
3698 p
= u
.replaceFirst("", status
);
3703 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p
, 0, status
).find()) {
3704 // Each option is stripped out of the option string as it is processed.
3705 // All options have been checked. The option string should have been completely emptied..
3707 p
.extract(buf
, sizeof(buf
), NULL
, status
);
3708 buf
[sizeof(buf
)-1] = 0;
3709 errln("Unrecognized or extra parameter: %s\n", buf
);
3715 if (breakType
== "char" || breakType
== "all") {
3717 BreakIterator
*bi
= BreakIterator::createCharacterInstance(locale
, status
);
3718 if (U_SUCCESS(status
)) {
3719 RunMonkey(bi
, m
, "char", seed
, loopCount
, useUText
);
3720 if (breakType
== "all" && useUText
==FALSE
) {
3721 // Also run a quick test with UText when "all" is specified
3722 RunMonkey(bi
, m
, "char", seed
, loopCount
, TRUE
);
3726 errcheckln(status
, "Creation of character break iterator failed %s", u_errorName(status
));
3731 if (breakType
== "word" || breakType
== "all") {
3732 logln("Word Break Monkey Test");
3734 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
3735 if (U_SUCCESS(status
)) {
3736 RunMonkey(bi
, m
, "word", seed
, loopCount
, useUText
);
3739 errcheckln(status
, "Creation of word break iterator failed %s", u_errorName(status
));
3744 if (breakType
== "line" || breakType
== "all") {
3745 logln("Line Break Monkey Test");
3747 BreakIterator
*bi
= BreakIterator::createLineInstance(locale
, status
);
3748 if (loopCount
>= 10) {
3749 loopCount
= loopCount
/ 5; // Line break runs slower than the others.
3751 if (U_SUCCESS(status
)) {
3752 RunMonkey(bi
, m
, "line", seed
, loopCount
, useUText
);
3755 errcheckln(status
, "Creation of line break iterator failed %s", u_errorName(status
));
3760 if (breakType
== "sent" || breakType
== "all" ) {
3761 logln("Sentence Break Monkey Test");
3763 BreakIterator
*bi
= BreakIterator::createSentenceInstance(locale
, status
);
3764 if (loopCount
>= 10) {
3765 loopCount
= loopCount
/ 10; // Sentence runs slower than the other break types
3767 if (U_SUCCESS(status
)) {
3768 RunMonkey(bi
, m
, "sentence", seed
, loopCount
, useUText
);
3771 errcheckln(status
, "Creation of line break iterator failed %s", u_errorName(status
));
3780 // Run a RBBI monkey test. Common routine, for all break iterator types.
3782 // bi - the break iterator to use
3783 // mk - MonkeyKind, abstraction for obtaining expected results
3784 // name - Name of test (char, word, etc.) for use in error messages
3785 // seed - Seed for starting random number generator (parameter from user)
// numIterations - number of random test strings to run; -1 loops forever,
//                 printing a '.' to stderr every tenth iteration.
// useUText - presumably selects the UText-based setText() path over the
//            UnicodeString one; the selecting condition is not visible here.
//
// Each iteration builds a random string from the MonkeyKind's character
// classes, computes the expected breaks with mk.next(), collects the breaks
// reported by next(), previous(), isBoundary(), following() and preceding(),
// and reports the first offset at which any of them disagrees.
3788 void RBBITest::RunMonkey(BreakIterator
*bi
, RBBIMonkeyKind
&mk
, const char *name
, uint32_t seed
,
3789 int32_t numIterations
, UBool useUText
) {
3791 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3793 const int32_t TESTSTRINGLEN
= 500;
3794 UnicodeString testText
;
3795 int32_t numCharClasses
;
// The *Breaks arrays are indexed by string offset; a non-zero entry marks a
// break at that offset. They are sized 2*TESTSTRINGLEN+1, presumably because
// each generated code point may occupy two UTF-16 code units.
3797 int expected
[TESTSTRINGLEN
*2 + 1];
3798 int expectedCount
= 0;
3799 char expectedBreaks
[TESTSTRINGLEN
*2 + 1];
3800 char forwardBreaks
[TESTSTRINGLEN
*2 + 1];
3801 char reverseBreaks
[TESTSTRINGLEN
*2+1];
3802 char isBoundaryBreaks
[TESTSTRINGLEN
*2+1];
3803 char followingBreaks
[TESTSTRINGLEN
*2+1];
3804 char precedingBreaks
[TESTSTRINGLEN
*2+1];
3810 numCharClasses
= mk
.charClasses()->size();
3811 chClasses
= mk
.charClasses();
3813 // Check for errors that occurred during the construction of the MonkeyKind object.
3814 // Can't report them where they occurred because errln() is a method coming from intlTest,
3815 // and is not visible outside of RBBITest :-(
3816 if (U_FAILURE(mk
.deferredStatus
)) {
3817 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk
.deferredStatus
));
3821 // Verify that the character classes all have at least one member.
3822 for (i
=0; i
<numCharClasses
; i
++) {
3823 UnicodeSet
*s
= (UnicodeSet
*)chClasses
->elementAt(i
);
3824 if (s
== NULL
|| s
->size() == 0) {
3825 errln("Character Class #%d is null or of zero size.", i
);
3830 while (loopCount
< numIterations
|| numIterations
== -1) {
3831 if (numIterations
== -1 && loopCount
% 10 == 0) {
3832 // If test is running in an infinite loop, display a periodic tic so
3833 // we can tell that it is making progress.
3834 fprintf(stderr
, ".");
3836 // Save current random number seed, so that we can recreate the random numbers
3837 // for this loop iteration in event of an error.
3840 // Populate a test string with data.
3841 testText
.truncate(0);
3842 for (i
=0; i
<TESTSTRINGLEN
; i
++) {
// Pick a random character class, then a random member of that class.
3843 int32_t aClassNum
= m_rand() % numCharClasses
;
3844 UnicodeSet
*classSet
= (UnicodeSet
*)chClasses
->elementAt(aClassNum
);
3845 int32_t charIdx
= m_rand() % classSet
->size();
3846 UChar32 c
= classSet
->charAt(charIdx
);
3847 if (c
< 0) { // TODO: deal with sets containing strings.
3848 errln("%s:%d c < 0", __FILE__
, __LINE__
);
3851 // Do not assemble a supplementary character from randomly generated separate surrogates.
3852 // (It could be a dictionary character)
3853 if (U16_IS_TRAIL(c
) && testText
.length() > 0 && U16_IS_LEAD(testText
.charAt(testText
.length()-1))) {
3860 // Calculate the expected results for this test string.
3861 mk
.setText(testText
);
3862 memset(expectedBreaks
, 0, sizeof(expectedBreaks
));
3863 expectedBreaks
[0] = 1;
3864 int32_t breakPos
= 0;
3867 breakPos
= mk
.next(breakPos
);
3868 if (breakPos
== -1) {
3871 if (breakPos
> testText
.length()) {
3872 errln("breakPos > testText.length()");
3874 expectedBreaks
[breakPos
] = 1;
3875 U_ASSERT(expectedCount
<testText
.length());
3876 expected
[expectedCount
++] = breakPos
;
3877 (void)expected
; // Set but not used warning.
3878 // TODO (andy): check it out.
3881 // Find the break positions using forward iteration
3882 memset(forwardBreaks
, 0, sizeof(forwardBreaks
));
3884 UErrorCode status
= U_ZERO_ERROR
;
3885 UText
*testUText
= utext_openReplaceable(NULL
, &testText
, &status
);
3886 // testUText = utext_openUnicodeString(testUText, &testText, &status);
3887 bi
->setText(testUText
, status
);
3888 TEST_ASSERT_SUCCESS(status
);
3889 utext_close(testUText
); // The break iterator does a shallow clone of the UText
3890 // This UText can be closed immediately, so long as the
3891 // testText string continues to exist.
3893 bi
->setText(testText
);
3896 for (i
=bi
->first(); i
!= BreakIterator::DONE
; i
=bi
->next()) {
3897 if (i
< 0 || i
> testText
.length()) {
3898 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name
);
3901 forwardBreaks
[i
] = 1;
3904 // Find the break positions using reverse iteration
3905 memset(reverseBreaks
, 0, sizeof(reverseBreaks
));
3906 for (i
=bi
->last(); i
!= BreakIterator::DONE
; i
=bi
->previous()) {
3907 if (i
< 0 || i
> testText
.length()) {
// NOTE(review): this message names next(), although this loop calls previous().
3908 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name
);
3911 reverseBreaks
[i
] = 1;
3914 // Find the break positions using isBoundary() tests.
3915 memset(isBoundaryBreaks
, 0, sizeof(isBoundaryBreaks
));
3916 U_ASSERT((int32_t)sizeof(isBoundaryBreaks
) > testText
.length());
3917 for (i
=0; i
<=testText
.length(); i
++) {
3918 isBoundaryBreaks
[i
] = bi
->isBoundary(i
);
3922 // Find the break positions using the following() function.
3924 memset(followingBreaks
, 0, sizeof(followingBreaks
));
3925 int32_t lastBreakPos
= 0;
3926 followingBreaks
[0] = 1;
3927 for (i
=0; i
<testText
.length(); i
++) {
3928 breakPos
= bi
->following(i
);
// following(i) must be past i, within the text, and must neither move
// backwards nor skip over a previously found break that lies beyond i.
3929 if (breakPos
<= i
||
3930 breakPos
< lastBreakPos
||
3931 breakPos
> testText
.length() ||
3932 (breakPos
> lastBreakPos
&& lastBreakPos
> i
)) {
3933 errln("%s break monkey test: "
3934 "Out of range value returned by BreakIterator::following().\n"
3935 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
3936 name
, seed
, i
, breakPos
, lastBreakPos
);
3939 followingBreaks
[breakPos
] = 1;
3940 lastBreakPos
= breakPos
;
3943 // Find the break positions using the preceding() function.
3944 memset(precedingBreaks
, 0, sizeof(precedingBreaks
));
3945 lastBreakPos
= testText
.length();
3946 precedingBreaks
[testText
.length()] = 1;
3947 for (i
=testText
.length(); i
>0; i
--) {
3948 breakPos
= bi
->preceding(i
);
// Mirror image of the following() check, scanning from the end of the text.
3949 if (breakPos
>= i
||
3950 breakPos
> lastBreakPos
||
3951 (breakPos
< 0 && testText
.getChar32Start(i
)>0) ||
3952 (breakPos
< lastBreakPos
&& lastBreakPos
< testText
.getChar32Start(i
)) ) {
3953 errln("%s break monkey test: "
3954 "Out of range value returned by BreakIterator::preceding().\n"
3955 "index=%d; prev returned %d; lastBreak=%d" ,
3956 name
, i
, breakPos
, lastBreakPos
);
3957 if (breakPos
>= 0 && breakPos
< (int32_t)sizeof(precedingBreaks
)) {
3958 precedingBreaks
[i
] = 2; // Forces an error.
3961 if (breakPos
>= 0) {
3962 precedingBreaks
[breakPos
] = 1;
3964 lastBreakPos
= breakPos
;
3968 // Compare the expected and actual results.
3969 for (i
=0; i
<=testText
.length(); i
++) {
// errorType names the first API whose result disagrees at offset i.
3970 const char *errorType
= NULL
;
3971 if (forwardBreaks
[i
] != expectedBreaks
[i
]) {
3972 errorType
= "next()";
3973 } else if (reverseBreaks
[i
] != forwardBreaks
[i
]) {
3974 errorType
= "previous()";
3975 } else if (isBoundaryBreaks
[i
] != expectedBreaks
[i
]) {
3976 errorType
= "isBoundary()";
3977 } else if (followingBreaks
[i
] != expectedBreaks
[i
]) {
3978 errorType
= "following()";
3979 } else if (precedingBreaks
[i
] != expectedBreaks
[i
]) {
3980 errorType
= "preceding()";
3984 if (errorType
!= NULL
) {
3985 // Format a range of the test text that includes the failure as
3986 // a data item that can be included in the rbbi test data file.
3988 // Start of the range is the last point where expected and actual results
3989 // both agreed that there was a break position.
3990 int startContext
= i
;
3993 if (startContext
==0) { break; }
3995 if (expectedBreaks
[startContext
] != 0) {
3996 if (count
== 2) break;
4001 // End of range is two expected breaks past the start position.
4002 int endContext
= i
+ 1;
4004 for (ci
=0; ci
<2; ci
++) { // Number of items to include in error text.
4006 if (endContext
>= testText
.length()) {break;}
4007 if (expectedBreaks
[endContext
-1] != 0) {
4008 if (count
== 0) break;
4015 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4016 UnicodeString errorText
= "<data>";
4017 /***if (strcmp(errorType, "next()") == 0) {
4019 endContext = testText.length();
4021 printStringBreaks(testText, expected, expectedCount);
4024 for (ci
=startContext
; ci
<endContext
;) {
4025 UnicodeString
hexChars("0123456789abcdef");
4028 c
= testText
.char32At(ci
);
4030 // This is the location of the error.
4031 errorText
.append("<?>");
4032 } else if (expectedBreaks
[ci
] != 0) {
4033 // This a non-error expected break position.
4034 errorText
.append("\\");
// Emit the code point as \uXXXX (4 hex digits) or \UXXXXXXXX (8 hex digits).
4037 errorText
.append("\\u");
4038 for (bn
=12; bn
>=0; bn
-=4) {
4039 errorText
.append(hexChars
.charAt((c
>>bn
)&0xf));
4042 errorText
.append("\\U");
4043 for (bn
=28; bn
>=0; bn
-=4) {
4044 errorText
.append(hexChars
.charAt((c
>>bn
)&0xf));
4047 ci
= testText
.moveIndex32(ci
, 1);
4049 errorText
.append("\\");
4050 errorText
.append("</data>\n");
// Convert the error text to a NUL-terminated char buffer for errln().
4053 char charErrorTxt
[500];
4054 UErrorCode status
= U_ZERO_ERROR
;
4055 errorText
.extract(charErrorTxt
, sizeof(charErrorTxt
), NULL
, status
);
4056 charErrorTxt
[sizeof(charErrorTxt
)-1] = 0;
4057 const char *badLocale
= bi
->getLocaleID(ULOC_ACTUAL_LOCALE
, status
);
4059 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
4060 name
, badLocale
, (expectedBreaks
[i
]? "break expected but not found" : "break found but not expected"),
4061 errorType
, seed
, i
, charErrorTxt
);
4072 // Bug 5532. UTF-8 based UText fails in dictionary code.
4073 // This test checks the initial patch,
4074 // which is to just keep it from crashing. Correct word boundaries
4075 // await a proper fix to the dictionary code.
4077 void RBBITest::TestBug5532(void) {
4078 // Text includes a mixture of Thai and Latin.
4079 const unsigned char utf8Data
[] = {
4080 0xE0u
, 0xB8u
, 0x82u
, 0xE0u
, 0xB8u
, 0xB2u
, 0xE0u
, 0xB8u
, 0xA2u
, 0xE0u
,
4081 0xB9u
, 0x80u
, 0xE0u
, 0xB8u
, 0x84u
, 0xE0u
, 0xB8u
, 0xA3u
, 0xE0u
, 0xB8u
,
4082 0xB7u
, 0xE0u
, 0xB9u
, 0x88u
, 0xE0u
, 0xB8u
, 0xADu
, 0xE0u
, 0xB8u
, 0x87u
,
4083 0xE0u
, 0xB9u
, 0x80u
, 0xE0u
, 0xB8u
, 0xA5u
, 0xE0u
, 0xB9u
, 0x88u
, 0xE0u
,
4084 0xB8u
, 0x99u
, 0xE0u
, 0xB8u
, 0x8Bu
, 0xE0u
, 0xB8u
, 0xB5u
, 0xE0u
, 0xB8u
,
4085 0x94u
, 0xE0u
, 0xB8u
, 0xB5u
, 0x20u
, 0x73u
, 0x69u
, 0x6Du
, 0x20u
, 0x61u
,
4086 0x75u
, 0x64u
, 0x69u
, 0x6Fu
, 0x2Fu
, 0x20u
, 0x4Du
, 0x4Fu
, 0x4Fu
, 0x4Eu
,
4087 0x20u
, 0x65u
, 0x63u
, 0x6Cu
, 0x69u
, 0x70u
, 0x73u
, 0x65u
, 0x20u
, 0xE0u
,
4088 0xB8u
, 0xA3u
, 0xE0u
, 0xB8u
, 0xB2u
, 0xE0u
, 0xB8u
, 0x84u
, 0xE0u
, 0xB8u
,
4089 0xB2u
, 0x20u
, 0x34u
, 0x37u
, 0x30u
, 0x30u
, 0x20u
, 0xE0u
, 0xB8u
, 0xA2u
,
4090 0xE0u
, 0xB8u
, 0xB9u
, 0xE0u
, 0xB9u
, 0x82u
, 0xE0u
, 0xB8u
, 0xA3u
, 0x00};
4092 UErrorCode status
= U_ZERO_ERROR
;
4093 UText utext
=UTEXT_INITIALIZER
;
4094 utext_openUTF8(&utext
, (const char *)utf8Data
, -1, &status
);
4095 TEST_ASSERT_SUCCESS(status
);
4097 BreakIterator
*bi
= BreakIterator::createWordInstance(Locale("th"), status
);
4098 TEST_ASSERT_SUCCESS(status
);
4099 if (U_SUCCESS(status
)) {
4100 bi
->setText(&utext
, status
);
4101 TEST_ASSERT_SUCCESS(status
);
4103 int32_t breakCount
= 0;
4104 int32_t previousBreak
= -1;
4105 for (bi
->first(); bi
->next() != BreakIterator::DONE
; breakCount
++) {
4106 // For now, just make sure that the break iterator doesn't hang.
4107 TEST_ASSERT(previousBreak
< bi
->current());
4108 previousBreak
= bi
->current();
4110 TEST_ASSERT(breakCount
> 0);
4113 utext_close(&utext
);
// Iterates backwards with previous() over a short unescaped text, once with
// the root-locale word break iterator and once with the en_US_POSIX one,
// reading getRuleStatus() at each step. After each backward pass the test
// asserts iterationCount == 6; the iterationCount >= 10 check inside each
// loop looks like a guard against a non-terminating iteration.
4117 void RBBITest::TestBug9983(void) {
4118 UnicodeString text
= UnicodeString("\\u002A" // * Other
4120 "\\u309C" // Katakana
4124 "\\u0000").unescape();
4126 UErrorCode status
= U_ZERO_ERROR
;
4127 LocalPointer
<RuleBasedBreakIterator
> brkiter(static_cast<RuleBasedBreakIterator
*>(
4128 BreakIterator::createWordInstance(Locale::getRoot(), status
)));
4129 TEST_ASSERT_SUCCESS(status
);
4130 LocalPointer
<RuleBasedBreakIterator
> brkiterPOSIX(static_cast<RuleBasedBreakIterator
*>(
4131 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status
)));
4132 TEST_ASSERT_SUCCESS(status
);
4133 if (U_FAILURE(status
)) {
4136 int32_t offset
, rstatus
, iterationCount
;
// Pass 1: root-locale iterator.
4138 brkiter
->setText(text
);
4141 while ( (offset
= brkiter
->previous()) != UBRK_DONE
) {
4143 rstatus
= brkiter
->getRuleStatus();
4144 (void)rstatus
; // Suppress set but not used warning.
4145 if (iterationCount
>= 10) {
4149 TEST_ASSERT(iterationCount
== 6);
// Pass 2: en_US_POSIX iterator, starting from the end of the text.
4151 brkiterPOSIX
->setText(text
);
4152 brkiterPOSIX
->last();
4154 while ( (offset
= brkiterPOSIX
->previous()) != UBRK_DONE
) {
4156 rstatus
= brkiterPOSIX
->getRuleStatus();
4157 (void)rstatus
; // Suppress set but not used warning.
4158 if (iterationCount
>= 10) {
4162 TEST_ASSERT(iterationCount
== 6);
4165 // Bug 7547 - verify that building a break iterator from empty rules produces an error.
4167 void RBBITest::TestBug7547() {
4168 UnicodeString rules
;
4169 UErrorCode status
= U_ZERO_ERROR
;
4170 UParseError parseError
;
4171 RuleBasedBreakIterator
breakIterator(rules
, parseError
, status
);
4172 if (status
!= U_BRK_RULE_SYNTAX
) {
4173 errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__
, __LINE__
, u_errorName(status
));
4175 if (parseError
.line
!= 1 || parseError
.offset
!= 0) {
4176 errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError
.line
, parseError
.offset
);
4181 void RBBITest::TestBug12797() {
4182 UnicodeString rules
= "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4183 UErrorCode status
= U_ZERO_ERROR
;
4184 UParseError parseError
;
4185 RuleBasedBreakIterator
bi(rules
, parseError
, status
);
4186 if (U_FAILURE(status
)) {
4187 errln("%s:%s status = %s", __FILE__
, __LINE__
, u_errorName(status
));
4190 UnicodeString text
= "abc";
4193 int32_t boundary
= bi
.next();
4194 if (boundary
!= 3) {
4195 errln("%s:%d expected boundary==3, got %d", __FILE__
, __LINE__
, boundary
);
4199 void RBBITest::TestBug12918() {
4200 // This test triggers an assertion failure in dictbe.cpp
4201 const UChar
*crasherString
= u
"\u3325\u4a16";
4202 UErrorCode status
= U_ZERO_ERROR
;
4203 UBreakIterator
* iter
= ubrk_open(UBRK_WORD
, NULL
, crasherString
, -1, &status
);
4204 if (U_FAILURE(status
)) {
4205 dataerrln("%s:%d status = %s", __FILE__
, __LINE__
, u_errorName(status
));
4210 int32_t lastPos
= -1;
4211 while((pos
= ubrk_next(iter
)) != UBRK_DONE
) {
4212 if (pos
<= lastPos
) {
4213 errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__
, __LINE__
, pos
, lastPos
);
4220 void RBBITest::TestBug12932() {
4221 // Node Stack overflow in the RBBI rule parser caused a seg fault.
4222 UnicodeString
ruleStr(
4223 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4224 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4225 "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4226 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4227 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4228 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4230 UErrorCode status
= U_ZERO_ERROR
;
4231 UParseError parseError
;
4232 RuleBasedBreakIterator
rbbi(ruleStr
, parseError
, status
);
4233 if (status
!= U_BRK_RULE_SYNTAX
) {
4234 errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4235 __FILE__
, __LINE__
, u_errorName(status
));
4240 // Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
4241 // remain undivided by ICU char, word and line break.
4242 void RBBITest::TestEmoji() {
4243 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
// Reads emoji-test.txt line by line, turns the leading run of hex numbers on each
// line into a string of code points, and checks that the English character, word
// and line break iterators each report their first boundary at the end of that string.
// NOTE(review): this extraction skips some original lines (the embedded numbering has
// gaps), so declarations such as 'len' and 'hex8' and several closing braces are not
// visible here — confirm against the full file.
4244 UErrorCode status
= U_ZERO_ERROR
;
// Build the path of emoji-test.txt inside the source test-data directory.
4246 CharString testFileName
;
4247 testFileName
.append(IntlTest::getSourceTestData(status
), status
);
4248 testFileName
.appendPathPart("emoji-test.txt", status
);
4249 if (U_FAILURE(status
)) {
// __LINE__ expands to an integer, so it is formatted with %d.
4250 errln("%s:%d %s while opening emoji-test.txt", __FILE__
, __LINE__
, u_errorName(status
));
4253 logln("Opening data file %s\n", testFileName
.data());
// Load the file contents, requesting conversion from UTF-8.
4256 UChar
*testFile
= ReadAndConvertFile(testFileName
.data(), len
, "UTF-8", status
);
4257 if (U_FAILURE(status
) || testFile
== NULL
) {
// __LINE__ expands to an integer, so it is formatted with %d.
4258 errln("%s:%d %s while opening emoji-test.txt", __FILE__
, __LINE__
, u_errorName(status
));
4261 UnicodeString
testFileAsString(testFile
, len
);
// lineMatcher walks the file one line at a time (multi-line mode);
// hexMatcher is re-targeted at each line in the loop below.
4264 RegexMatcher
lineMatcher(u
"^.*?$", testFileAsString
, UREGEX_MULTILINE
, status
);
4265 RegexMatcher
hexMatcher(u
"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE
, status
);
4266 // hexMatcher group(1) is a hex number, or empty string if no hex number present.
4267 int32_t lineNumber
= 0;
// One iterator of each kind under test, all for the English locale.
4269 LocalPointer
<BreakIterator
> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status
), status
);
4270 LocalPointer
<BreakIterator
> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status
), status
);
4271 LocalPointer
<BreakIterator
> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status
), status
);
4272 if (U_FAILURE(status
)) {
4273 dataerrln("%s:%d %s while opening break iterators", __FILE__
, __LINE__
, u_errorName(status
));
4277 while (lineMatcher
.find()) {
4279 UnicodeString line
= lineMatcher
.group(status
);
4280 hexMatcher
.reset(line
);
4281 UnicodeString testString
; // accumulates the emoji sequence.
// Consume hex numbers from the start of the line until an empty match is found.
4282 while (hexMatcher
.find() && hexMatcher
.group(1, status
).length() > 0) {
4283 UnicodeString hex
= hexMatcher
.group(1, status
);
// More than 8 hex digits is reported as an invalid code point.
4284 if (hex
.length() > 8) {
4285 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__
, __LINE__
, lineNumber
, CStr(hex
)());
// Convert the hex digits to a char buffer so strtol() can parse them (base 16).
4289 hex8
.appendInvariantChars(hex
, status
);
4290 UChar32 c
= (UChar32
)strtol(hex8
.data(), NULL
, 16);
4292 testString
.append(c
);
4294 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4295 __FILE__
, __LINE__
, lineNumber
, hex8
.data());
// Only strings of length greater than one are checked; for each iterator the
// first boundary after first() must be the end of the string.
4300 if (testString
.length() > 1) {
4301 charBreaks
->setText(testString
);
4302 charBreaks
->first();
4303 int32_t firstBreak
= charBreaks
->next();
4304 if (testString
.length() != firstBreak
) {
4305 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4306 __FILE__
, __LINE__
, lineNumber
, firstBreak
);
4308 wordBreaks
->setText(testString
);
4309 wordBreaks
->first();
4310 firstBreak
= wordBreaks
->next();
4311 if (testString
.length() != firstBreak
) {
4312 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4313 __FILE__
, __LINE__
, lineNumber
, firstBreak
);
4315 lineBreaks
->setText(testString
);
4316 lineBreaks
->first();
4317 firstBreak
= lineBreaks
->next();
4318 if (testString
.length() != firstBreak
) {
4319 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4320 __FILE__
, __LINE__
, lineNumber
, firstBreak
);
4328 // TestBug12519 - Correct handling of Locales by assignment / copy / clone
4330 // WHERE Macro yields a literal string of the form "source_file_name:line number "
4331 // TODO: propose something equivalent as a test framework addition.
// WHERE is assembled by adjacent string-literal concatenation: __FILE__, ":",
// the expansion of XLINE(__LINE__), and a trailing space. XLINE forwards its
// argument to LINE so that __LINE__ is macro-expanded before LINE receives it.
// NOTE(review): the definition of LINE is not among the visible lines; presumably
// it stringizes its argument — confirm.
4333 #define WHERE __FILE__ ":" XLINE(__LINE__) " "
4334 #define XLINE(s) LINE(s)
// Checks how locales are reported by word break iterators and their clones, and
// how break iterators compare for equality: English vs. French word iterators,
// each iterator vs. its clone, and a French word iterator vs. a German line iterator.
4337 void RBBITest::TestBug12519() {
4338 UErrorCode status
= U_ZERO_ERROR
;
4339 LocalPointer
<RuleBasedBreakIterator
> biEn((RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getEnglish(), status
));
// Note: created for Locale::getFrance(); the valid locale asserted below is Locale::getFrench().
4340 LocalPointer
<RuleBasedBreakIterator
> biFr((RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getFrance(), status
));
4341 if (!assertSuccess(WHERE
, status
)) {
4342 dataerrln("%s %d status = %s", __FILE__
, __LINE__
, u_errorName(status
));
// The valid locale reported by each iterator.
4345 assertTrue(WHERE
, Locale::getEnglish() == biEn
->getLocale(ULOC_VALID_LOCALE
, status
));
4347 assertTrue(WHERE
, Locale::getFrench() == biFr
->getLocale(ULOC_VALID_LOCALE
, status
));
// The English and French word iterators are expected to compare equal.
4348 assertTrue(WHERE
"Locales do not participate in BreakIterator equality.", *biEn
== *biFr
);
// A clone must compare equal to its original and report the same valid locale.
4350 LocalPointer
<RuleBasedBreakIterator
>cloneEn((RuleBasedBreakIterator
*)biEn
->clone());
4351 assertTrue(WHERE
, *biEn
== *cloneEn
);
4352 assertTrue(WHERE
, Locale::getEnglish() == cloneEn
->getLocale(ULOC_VALID_LOCALE
, status
));
4354 LocalPointer
<RuleBasedBreakIterator
>cloneFr((RuleBasedBreakIterator
*)biFr
->clone());
4355 assertTrue(WHERE
, *biFr
== *cloneFr
);
4356 assertTrue(WHERE
, Locale::getFrench() == cloneFr
->getLocale(ULOC_VALID_LOCALE
, status
));
// A German line iterator with text set is expected to differ from the French word iterator.
4358 LocalPointer
<RuleBasedBreakIterator
>biDe((RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale::getGerman(), status
));
4359 UnicodeString
text("Hallo Welt");
4360 biDe
->setText(text
);
4361 assertTrue(WHERE
"before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr
!= *biDe
);
// NOTE(review): the messages refer to an assignment "biDe = biFr", but no assignment
// statement appears in the visible lines (the embedded numbering skips 4362) — confirm
// against the full file.
4363 assertTrue(WHERE
"after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr
== *biDe
);
// Builds a RuleBasedBreakIterator from rule source containing '#' both as a comment
// introducer and as literal text, then compares getRules() with the expected text.
4366 void RBBITest::TestBug12677() {
4367 // Check that stripping of comments from rules for getRules() is not confused by
4368 // the presence of '#' characters in the rules that do not introduce comments.
4369 UnicodeString
rules(u
"!!forward; \n"
4370 "$x = [ab#]; # a set with a # literal. \n"
4371 " # .; # a comment that looks sort of like a rule. \n"
4372 " '#' '?'; # a rule with a quoted # \n"
4375 UErrorCode status
= U_ZERO_ERROR
;
// NOTE(review): 'pe' is not declared in the visible lines (the embedded numbering
// skips 4376); presumably a UParseError — confirm.
4377 RuleBasedBreakIterator
bi(rules
, pe
, status
);
4378 assertSuccess(WHERE
, status
);
4379 UnicodeString rtRules
= bi
.getRules();
// Expected text: the comments are gone, while the '#' inside the set and the quoted '#' remain.
4380 assertEquals(WHERE
, UnicodeString(u
"!!forward; $x = [ab#]; '#' '?'; "), rtRules
);
// Inspects the forward state table of the English line break iterator and reports
// an error for every pair of identical columns (character classes, column 0 excluded)
// and every pair of identical rows (states).
4384 void RBBITest::TestTableRedundancies() {
4385 UErrorCode status
= U_ZERO_ERROR
;
4387 LocalPointer
<RuleBasedBreakIterator
> bi (
4388 (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale::getEnglish(), status
));
4389 assertSuccess(WHERE
, status
);
4390 if (U_FAILURE(status
)) return;
// Reach into the iterator's data wrapper for the forward table and the category count.
4392 RBBIDataWrapper
*dw
= bi
->fData
;
4393 const RBBIStateTable
*fwtbl
= dw
->fForwardTable
;
4394 int32_t numCharClasses
= dw
->fHeader
->fCatCount
;
4395 // printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates);
4397 // Check for duplicate columns (character categories)
// Each column is flattened into a string of its next-state values so that two
// columns can be compared with a single string comparison.
// NOTE(review): the declaration of 's' is not in the visible lines (the embedded
// numbering skips 4401) — confirm against the full file.
4399 std::vector
<UnicodeString
> columns
;
4400 for (int32_t column
= 0; column
< numCharClasses
; column
++) {
// Rows are visited starting at index 1 here.
4402 for (int32_t r
= 1; r
< (int32_t)fwtbl
->fNumStates
; r
++) {
// Row r starts fRowLen * r past the beginning of the table data.
4403 RBBIStateTableRow
*row
= (RBBIStateTableRow
*) (fwtbl
->fTableData
+ (fwtbl
->fRowLen
* r
));
4404 s
.append(row
->fNextState
[column
]);
4406 columns
.push_back(s
);
4408 // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
4409 for (int c1
=1; c1
<numCharClasses
; c1
++) {
4410 for (int c2
= c1
+1; c2
< numCharClasses
; c2
++) {
4411 if (columns
.at(c1
) == columns
.at(c2
)) {
4412 errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__
, __LINE__
, c1
, c2
);
4419 // Check for duplicate states
// Each row is flattened into a string of fAccepting + 1, fLookAhead, fTagIdx and
// the next-state value for every character class.
// NOTE(review): no statement adding to 'rows' is visible (the embedded numbering
// skips 4430-4432) — confirm against the full file.
4420 std::vector
<UnicodeString
> rows
;
4421 for (int32_t r
=0; r
< (int32_t)fwtbl
->fNumStates
; r
++) {
4423 RBBIStateTableRow
*row
= (RBBIStateTableRow
*) (fwtbl
->fTableData
+ (fwtbl
->fRowLen
* r
));
4424 assertTrue(WHERE
, row
->fAccepting
>= -1);
4425 s
.append(row
->fAccepting
+ 1); // values of -1 are expected.
4426 s
.append(row
->fLookAhead
);
4427 s
.append(row
->fTagIdx
);
4428 for (int32_t column
= 0; column
< numCharClasses
; column
++) {
4429 s
.append(row
->fNextState
[column
]);
// Compare every pair of rows, starting from row 0.
4433 for (int r1
=0; r1
< (int32_t)fwtbl
->fNumStates
; r1
++) {
4434 for (int r2
= r1
+1; r2
< (int32_t)fwtbl
->fNumStates
; r2
++) {
4435 if (rows
.at(r1
) == rows
.at(r2
)) {
4436 errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__
, __LINE__
, r1
, r2
);
4443 // Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
4444 // even after next() has returned DONE.
// Steps an English word break iterator through its boundaries and checks the rule
// status reported at each step, including after next() has returned UBRK_DONE.
4446 void RBBITest::TestBug13447() {
4447 UErrorCode status
= U_ZERO_ERROR
;
4448 LocalPointer
<RuleBasedBreakIterator
> bi(
4449 (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getEnglish(), status
));
4450 assertSuccess(WHERE
, status
);
4451 if (U_FAILURE(status
)) return;
// NOTE(review): no call that hands 'data' to the iterator is visible (the embedded
// numbering skips 4453) — confirm against the full file.
4452 UnicodeString
data(u
"1234");
// Before any movement the status is expected to be UBRK_WORD_NONE.
4454 assertEquals(WHERE
, UBRK_WORD_NONE
, bi
->getRuleStatus());
// The first next() is expected to return 4 with a number status.
4455 assertEquals(WHERE
, 4, bi
->next());
4456 assertEquals(WHERE
, UBRK_WORD_NUMBER
, bi
->getRuleStatus());
// After next() returns UBRK_DONE, current() and the rule status must be unchanged.
4457 assertEquals(WHERE
, UBRK_DONE
, bi
->next());
4458 assertEquals(WHERE
, 4, bi
->current());
4459 assertEquals(WHERE
, UBRK_WORD_NUMBER
, bi
->getRuleStatus());
4462 // TestReverse exercises both the synthesized safe reverse rules and the logic
4463 // for filling the break iterator cache when starting from random positions
4466 // It's a monkey test, working on random data, with the expected data obtained
4467 // from forward iteration (no safe rules involved), comparing with results
4468 // when indexing into the interior of the string (safe rules needed).
// Driver: runs the TestReverse(std::unique_ptr<RuleBasedBreakIterator>) overload once
// each for the English character, word, line and sentence break iterators. The status
// is reset before each iterator is created and asserted after the overload returns.
4470 void RBBITest::TestReverse() {
4471 UErrorCode status
= U_ZERO_ERROR
;
// Character break iterator.
4473 TestReverse(std::unique_ptr
<RuleBasedBreakIterator
>((RuleBasedBreakIterator
*)
4474 BreakIterator::createCharacterInstance(Locale::getEnglish(), status
)));
4475 assertSuccess(WHERE
, status
, true);
// Word break iterator.
4476 status
= U_ZERO_ERROR
;
4477 TestReverse(std::unique_ptr
<RuleBasedBreakIterator
>((RuleBasedBreakIterator
*)
4478 BreakIterator::createWordInstance(Locale::getEnglish(), status
)));
4479 assertSuccess(WHERE
, status
, true);
// Line break iterator.
4480 status
= U_ZERO_ERROR
;
4481 TestReverse(std::unique_ptr
<RuleBasedBreakIterator
>((RuleBasedBreakIterator
*)
4482 BreakIterator::createLineInstance(Locale::getEnglish(), status
)));
4483 assertSuccess(WHERE
, status
, true);
// Sentence break iterator.
4484 status
= U_ZERO_ERROR
;
4485 TestReverse(std::unique_ptr
<RuleBasedBreakIterator
>((RuleBasedBreakIterator
*)
4486 BreakIterator::createSentenceInstance(Locale::getEnglish(), status
)));
4487 assertSuccess(WHERE
, status
, true);
// Monkey test for one break iterator, which is taken by value in 'bi'.
// Builds a random string from the iterator's character categories, records
// isBoundary() and getRuleStatus() for every index in ascending order, then re-checks
// every index in descending order, calling setText() again before each query.
4490 void RBBITest::TestReverse(std::unique_ptr
<RuleBasedBreakIterator
>bi
) {
4495 // From the mapping trie in the break iterator's internal data, create a
4496 // vector of UnicodeStrings, one for each character category, containing
4497 // all of the code points that map to that category. Unicode planes 0 and 1 only,
4498 // to avoid an excess of unassigned code points.
4500 RBBIDataWrapper
*data
= bi
->fData
;
4501 int32_t categoryCount
= data
->fHeader
->fCatCount
;
4502 UTrie2
*trie
= data
->fTrie
;
4504 std::vector
<UnicodeString
> strings(categoryCount
, UnicodeString());
4505 for (int cp
=0; cp
<0x1fff0; ++cp
) {
4506 int cat
= utrie2_get32(trie
, cp
);
4507 cat
&= ~0x4000; // And off the dictionary bit from the category.
4508 assertTrue(WHERE
, cat
< categoryCount
&& cat
>= 0);
// Stop rather than index 'strings' with an out-of-range category.
4509 if (cat
< 0 || cat
>= categoryCount
) return;
4510 strings
[cat
].append(cp
);
// Assemble the test string: on each of testStringLength rounds pick a random
// category and, when that category has any code points, append a random one of them.
4514 const int testStringLength
= 10000;
4515 UnicodeString testString
;
4517 for (int i
=0; i
<testStringLength
; ++i
) {
4518 int charClass
= randomGen() % categoryCount
;
4519 if (strings
[charClass
].length() > 0) {
4520 int cp
= strings
[charClass
].char32At(randomGen() % strings
[charClass
].length());
4521 testString
.append(cp
);
// Expected results: (isBoundary, rule status) for every index, gathered in ascending order.
4525 typedef std::pair
<UBool
, int32_t> Result
;
4526 std::vector
<Result
> expectedResults
;
4527 bi
->setText(testString
);
4528 for (int i
=0; i
<testString
.length(); ++i
) {
4529 bool isboundary
= bi
->isBoundary(i
);
4530 int ruleStatus
= bi
->getRuleStatus();
4531 expectedResults
.push_back(std::make_pair(isboundary
, ruleStatus
));
// Re-check from the last index down to 0; each query must match the recorded result.
4534 for (int i
=testString
.length()-1; i
>=0; --i
) {
4535 bi
->setText(testString
); // clears the internal break cache
4536 Result expected
= expectedResults
[i
];
4537 assertEquals(WHERE
, expected
.first
, bi
->isBoundary(i
));
4538 assertEquals(WHERE
, expected
.second
, bi
->getRuleStatus());
4543 // Ticket 13692 - finding word boundaries in very large numbers or words could
4544 // be very time consuming. When the problem was present, this void test
4545 // would run more than fifteen minutes, which is to say, the failure was noticeable.
// Sets a very long string on an English word break iterator and asserts that the
// position five before its end is not a boundary.
4547 void RBBITest::TestBug13692() {
4548 UErrorCode status
= U_ZERO_ERROR
;
4549 LocalPointer
<RuleBasedBreakIterator
> bi ((RuleBasedBreakIterator
*)
4550 BreakIterator::createWordInstance(Locale::getEnglish(), status
), status
);
4551 if (!assertSuccess(WHERE
, status
, true)) {
// The string is built from LENGTH and the character '3'.
4554 constexpr int32_t LENGTH
= 1000000;
4555 UnicodeString
longNumber(LENGTH
, (UChar32
)u
'3', LENGTH
);
// Overwrite the even indexes 0, 2, ..., 18 with spaces.
4556 for (int i
=0; i
<20; i
+=2) {
4557 longNumber
.setCharAt(i
, u
' ');
4559 bi
->setText(longNumber
);
4560 assertFalse(WHERE
, bi
->isBoundary(LENGTH
-5));
4561 assertSuccess(WHERE
, status
);
4565 // TestDebug - A place-holder test for debugging purposes.
4566 // For putting in fragments of other tests that can be invoked
4567 // for tracing without a lot of unwanted extra stuff happening.
// Creates an English character break iterator, fetches its rule source with
// getRules(), and constructs a second RuleBasedBreakIterator from that source,
// asserting that the construction succeeds.
4569 void RBBITest::TestDebug(void) {
4570 UErrorCode status
= U_ZERO_ERROR
;
4571 LocalPointer
<RuleBasedBreakIterator
> bi ((RuleBasedBreakIterator
*)
4572 BreakIterator::createCharacterInstance(Locale::getEnglish(), status
), status
);
4573 if (!assertSuccess(WHERE
, status
, true)) {
4576 const UnicodeString
&rules
= bi
->getRules();
// NOTE(review): 'pe' is not declared in the visible lines (the embedded numbering
// skips 4577); presumably a UParseError — confirm.
4578 LocalPointer
<RuleBasedBreakIterator
> newbi(new RuleBasedBreakIterator(rules
, pe
, status
));
4579 assertSuccess(WHERE
, status
);
// Builds the UnicodeSet [:GCB=Prepend:] and reacts when that set is not empty.
4582 void RBBITest::TestProperties() {
4583 UErrorCode errorCode
= U_ZERO_ERROR
;
4584 UnicodeSet
prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode
);
4585 if (!prependSet
.isEmpty()) {
// NOTE(review): the call that receives the message below is not in the visible
// lines (the embedded numbering skips 4586) — confirm against the full file.
4587 "[:GCB=Prepend:] is not empty any more. "
4588 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4589 "change this test to the opposite condition.");
4593 #endif // #if !UCONFIG_NO_BREAK_ITERATION