1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
5 * Copyright (c) 1999-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8 /************************************************************************
9 * Date Name Description
10 * 12/15/99 Madhu Creation.
11 * 01/12/2000 Madhu Updated for changed API and added new tests
12 ************************************************************************/
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_BREAK_ITERATION
23 #include "unicode/brkiter.h"
24 #include "unicode/localpointer.h"
25 #include "unicode/numfmt.h"
26 #include "unicode/rbbi.h"
27 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
28 #include "unicode/regex.h"
30 #include "unicode/schriter.h"
31 #include "unicode/uchar.h"
32 #include "unicode/utf16.h"
33 #include "unicode/ucnv.h"
34 #include "unicode/uniset.h"
35 #include "unicode/uscript.h"
36 #include "unicode/ustring.h"
37 #include "unicode/utext.h"
45 #include "utypeinfo.h" // for 'typeid' to work
50 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
51 #include "unicode/filteredbrk.h"
52 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
54 #define TEST_ASSERT(x) {if (!(x)) { \
55 errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
57 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
58 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
60 //---------------------------------------------
62 //---------------------------------------------
65 // Note: Before adding new tests to this file, check whether the desired test data can
66 // simply be added to the file testdata/rbbitst.txt. In most cases it can:
67 // it's much less work than writing a new test, diagnostic output in the event of failures
68 // is good, and the test data file is shared with ICU4J, so eventually the test
69 // will run there as well, without additional effort.
// Test dispatcher for this suite.
// Logs the suite banner when exec is set, then lists every test method through
// TESTCASE_AUTO. Several entries sit behind preprocessor guards on
// UCONFIG_NO_FILE_IO and/or UCONFIG_NO_REGULAR_EXPRESSIONS, so they are only
// listed when those features are compiled in.
71 void RBBITest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* params
)
73 if (exec
) logln("TestSuite RuleBasedBreakIterator: ");
77 #if !UCONFIG_NO_FILE_IO
78 TESTCASE_AUTO(TestBug4153072
);
80 #if !UCONFIG_NO_FILE_IO
81 TESTCASE_AUTO(TestUnicodeFiles
);
83 TESTCASE_AUTO(TestGetAvailableLocales
);
84 TESTCASE_AUTO(TestGetDisplayName
);
85 #if !UCONFIG_NO_FILE_IO
86 TESTCASE_AUTO(TestEndBehaviour
);
87 TESTCASE_AUTO(TestWordBreaks
);
88 TESTCASE_AUTO(TestWordBoundary
);
89 TESTCASE_AUTO(TestLineBreaks
);
90 TESTCASE_AUTO(TestSentBreaks
);
91 TESTCASE_AUTO(TestExtended
);
// TestMonkey needs both regular expressions and file I/O.
93 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
94 TESTCASE_AUTO(TestMonkey
);
96 #if !UCONFIG_NO_FILE_IO
97 TESTCASE_AUTO(TestBug3818
);
99 TESTCASE_AUTO(TestDebug
);
100 #if !UCONFIG_NO_FILE_IO
101 TESTCASE_AUTO(TestBug5775
);
103 TESTCASE_AUTO(TestBug9983
);
104 TESTCASE_AUTO(TestDictRules
);
105 TESTCASE_AUTO(TestBug5532
);
106 TESTCASE_AUTO(TestBug7547
);
107 TESTCASE_AUTO(TestBug12797
);
108 TESTCASE_AUTO(TestBug12918
);
109 TESTCASE_AUTO(TestBug12932
);
110 TESTCASE_AUTO(TestEmoji
);
111 TESTCASE_AUTO(TestBug12519
);
112 TESTCASE_AUTO(TestBug12677
);
113 TESTCASE_AUTO(TestTableRedundancies
);
114 TESTCASE_AUTO(TestBug13447
);
115 TESTCASE_AUTO(TestReverse
);
116 TESTCASE_AUTO(TestBug13692
);
121 //--------------------------------------------------------------------------------------
123 // RBBITest constructor and destructor
125 //--------------------------------------------------------------------------------------
127 RBBITest::RBBITest() {
132 RBBITest::~RBBITest() {
// Debugging aid: dumps the text in tstr to stdout, one code point per row.
// Each row shows the code point together with several Unicode property values
// and the character name. A dashed separator row carrying the current index is
// printed when that index has reached the next pending entry of expected[].
//   tstr           text to dump; iteration starts at native index 0.
//   expected       indexes at which a separator row is wanted.
//   expectedCount  number of entries in expected[].
136 static void printStringBreaks(UText
*tstr
, int expected
[], int expectedCount
) {
137 UErrorCode status
= U_ZERO_ERROR
;
139 printf("code alpha extend alphanum type word sent line name\n");
140 int nextExpectedIndex
= 0;
141 utext_setNativeIndex(tstr
, 0);
// j is re-read from the UText after each step, so it advances by whole code points.
142 for (int j
= 0; j
< static_cast<int>(utext_nativeLength(tstr
)); j
=static_cast<int>(utext_getNativeIndex(tstr
))) {
143 if (nextExpectedIndex
< expectedCount
&& j
>= expected
[nextExpectedIndex
] ) {
144 printf("------------------------------------------------ %d\n", j
);
148 UChar32 c
= utext_next32(tstr
);
149 u_charName(c
, U_UNICODE_CHAR_NAME
, name
, 100, &status
);
150 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c
,
152 u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
),
154 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY
,
156 U_SHORT_PROPERTY_NAME
),
157 u_getPropertyValueName(UCHAR_WORD_BREAK
,
158 u_getIntPropertyValue(c
,
160 U_SHORT_PROPERTY_NAME
),
161 u_getPropertyValueName(UCHAR_SENTENCE_BREAK
,
162 u_getIntPropertyValue(c
,
163 UCHAR_SENTENCE_BREAK
),
164 U_SHORT_PROPERTY_NAME
),
165 u_getPropertyValueName(UCHAR_LINE_BREAK
,
166 u_getIntPropertyValue(c
,
168 U_SHORT_PROPERTY_NAME
),
// Convenience overload for UnicodeString input: wraps ustr in a read-only UText
// and forwards to the UText overload with the same expected[] / expectedCount.
// If the UText cannot be opened, the ICU error name is printed to stdout.
174 static void printStringBreaks(const UnicodeString
&ustr
, int expected
[], int expectedCount
) {
175 UErrorCode status
= U_ZERO_ERROR
;
177 tstr
= utext_openConstUnicodeString(NULL
, &ustr
, &status
);
178 if (U_FAILURE(status
)) {
179 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status
));
182 printStringBreaks(tstr
, expected
, expectedCount
);
// Regression test (bug 3818, going by the test name) for following() on a Thai
// word break iterator. The text is the same four-character Thai word repeated
// four times; following(1) and following(0) must both report offset 4, the
// start of the second word.
187 void RBBITest::TestBug3818() {
188 UErrorCode status
= U_ZERO_ERROR
;
190 // Four Thai words...
191 static const UChar thaiWordData
[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
192 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
193 UnicodeString
thaiStr(thaiWordData
);
195 BreakIterator
* bi
= BreakIterator::createWordInstance(Locale("th"), status
);
// Creation failure goes through errcheckln so the status is part of the report.
196 if (U_FAILURE(status
) || bi
== NULL
) {
197 errcheckln(status
, "Fail at file %s, line %d, status = %s", __FILE__
, __LINE__
, u_errorName(status
));
200 bi
->setText(thaiStr
);
// Starting inside the first word: the next boundary must be offset 4.
202 int32_t startOfSecondWord
= bi
->following(1);
203 if (startOfSecondWord
!= 4) {
204 errln("Fail at file %s, line %d expected start of word at 4, got %d",
205 __FILE__
, __LINE__
, startOfSecondWord
);
// Starting at the very beginning of the text: same expectation.
207 startOfSecondWord
= bi
->following(0);
208 if (startOfSecondWord
!= 4) {
209 errln("Fail at file %s, line %d expected start of word at 4, got %d",
210 __FILE__
, __LINE__
, startOfSecondWord
);
216 //---------------------------------------------
220 //---------------------------------------------
// Smoke test for BreakIterator::getAvailableLocales(): fetches the locale array
// and its count, reports a data error for an empty list, and logs the name of
// every entry so that each element of the returned array is actually read.
222 void RBBITest::TestGetAvailableLocales()
224 int32_t locCount
= 0;
225 const Locale
* locList
= BreakIterator::getAvailableLocales(locCount
);
228 dataerrln("getAvailableLocales() returned an empty list!");
229 // Just make sure that it's returning good memory.
231 for (i
= 0; i
< locCount
; ++i
) {
232 logln(locList
[i
].getName());
236 //Testing the BreakIterator::getDisplayName() function
// Exercises both BreakIterator::getDisplayName() overloads.
// The two-argument form is only checked when the default locale equals
// Locale::getUS() (presumably because the expected text is English). The
// three-argument form passes the display locale explicitly, so its result is
// compared unconditionally.
237 void RBBITest::TestGetDisplayName()
239 UnicodeString result
;
241 BreakIterator::getDisplayName(Locale::getUS(), result
);
242 if (Locale::getDefault() == Locale::getUS() && result
!= "English (United States)")
243 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
246 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result
);
247 if (result
!= "French (France)")
248 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
// Checks word-boundary behaviour at the end of the string "boo." using a word
// break iterator for the default locale: a boundary at the start of the text,
// one before the period, and one at the end of the text (offset 4).
255 void RBBITest::TestEndBehaviour()
257 UErrorCode status
= U_ZERO_ERROR
;
258 UnicodeString
testString("boo.");
259 BreakIterator
*wb
= BreakIterator::createWordInstance(Locale::getDefault(), status
);
260 if (U_FAILURE(status
))
262 errcheckln(status
, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status
));
265 wb
->setText(testString
);
267 if (wb
->first() != 0)
268 errln("Didn't get break at beginning of string.");
270 errln("Didn't get break before period in \"boo.\"");
// Accepted if the iterator already sits at 4, or if one more next() gets there.
271 if (wb
->current() != 4 && wb
->next() != 4)
272 errln("Didn't get break at end of string.");
// Regression test (bug 4153072, going by the test name) for isBoundary() on a
// word break iterator whose text was adopted from a StringCharacterIterator
// built with explicit begin/end bounds.
// Every index from -1 up to and including begin is probed; only index 0 is
// expected to be reported as a boundary.
278 void RBBITest::TestBug4153072() {
279 UErrorCode status
= U_ZERO_ERROR
;
280 BreakIterator
*iter
= BreakIterator::createWordInstance(Locale::getDefault(), status
);
281 if (U_FAILURE(status
))
283 errcheckln(status
, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status
));
286 UnicodeString
str("...Hello, World!...");
// end stops three code units short of the end of str.
288 int32_t end
= str
.length() - 3;
// The iterator is handed over with adoptText(); it is not deleted in the visible code.
291 StringCharacterIterator
* textIterator
= new StringCharacterIterator(str
, begin
, end
, begin
);
292 iter
->adoptText(textIterator
);
294 // Note: with the switch to UText, there is no way to restrict the
295 // iteration range to begin at an index other than zero.
296 // String character iterators created with a non-zero bound are
297 // treated by RBBI as being empty.
298 for (index
= -1; index
< begin
+ 1; ++index
) {
299 onBoundary
= iter
->isBoundary(index
);
// Failure cases: index 0 not a boundary, or any other index reported as one.
300 if (index
== 0? !onBoundary
: onBoundary
) {
301 errln((UnicodeString
)"Didn't handle isBoundary correctly with offset = " + index
+
302 " and begin index = " + begin
);
310 // Test for problem reported by Ashok Matoria on 9 July 2007
311 // One.<kSoftHyphen><kSpace>Two.
313 // Sentence break at start (0) and then on calling next() it breaks at
314 // 'T' of "Two". Now, at this point if I do next() and
315 // then previous(), it breaks at <kSoftHyphen> instead of 'T' of "Two".
// Regression test for bug 5775, using an English sentence break iterator.
// The assertions pin the break positions seen while iterating: 6, then 10 going
// forward, and 6 again after previous().
317 void RBBITest::TestBug5775() {
318 UErrorCode status
= U_ZERO_ERROR
;
319 BreakIterator
*bi
= BreakIterator::createSentenceInstance(Locale::getEnglish(), status
);
320 TEST_ASSERT_SUCCESS(status
);
321 if (U_FAILURE(status
)) {
324 // Check for status first for better handling of no data errors.
325 TEST_ASSERT(bi
!= NULL
);
// The literal holds the escape text \u00ad (backslash, 'u', digits), not the character itself.
330 UnicodeString
s("One.\\u00ad Two.", -1, US_INV
);
334 int pos
= bi
->next();
335 TEST_ASSERT(pos
== 6);
337 TEST_ASSERT(pos
== 10);
// Stepping back must return to the same position that next() reported first.
338 pos
= bi
->previous();
339 TEST_ASSERT(pos
== 6);
345 //------------------------------------------------------------------------------
347 // RBBITest::Extended Run RBBI Tests from an external test data file
349 //------------------------------------------------------------------------------
352 BreakIterator
*bi
; // Break iterator is set while parsing test source.
353 // Changed out whenever test data changes break type.
355 UnicodeString dataToBreak
; // Data that is built up while parsing the test.
356 UVector32
*expectedBreaks
; // Expected break positions, matches dataToBreak UnicodeString.
357 UVector32
*srcLine
; // Positions in source file, indexed same as dataToBreak.
360 UText
*textToBreak
; // UText, could be UTF8 or UTF16.
361 UVector32
*textMap
; // Map from UTF-16 dataToBreak offsets to UText offsets.
362 CharString utf8String
; // UTF-8 form of text to break.
364 TestParams(UErrorCode
&status
) : dataToBreak() {
366 expectedBreaks
= new UVector32(status
);
367 srcLine
= new UVector32(status
);
368 srcCol
= new UVector32(status
);
370 textMap
= new UVector32(status
);
375 delete expectedBreaks
;
378 utext_close(textToBreak
);
382 int32_t getSrcLine(int32_t bp
);
383 int32_t getExpectedBreak(int32_t bp
);
384 int32_t getSrcCol(int32_t bp
);
386 void setUTF16(UErrorCode
&status
);
387 void setUTF8(UErrorCode
&status
);
390 // Append a UnicodeString to a CharString with UTF-8 encoding.
391 // Substitute any invalid chars.
392 // Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
//   dest    CharString that receives the UTF-8 bytes (appended to its content).
//   src     UTF-16 text to convert.
//   status  ICU error code; a failure already present on entry is tested first.
// Works in two passes: a preflight call with a NULL output buffer obtains the
// UTF-8 length, then a second call converts into an append buffer obtained
// from dest, and that buffer is appended to dest.
393 static void CharStringAppend(CharString
&dest
, const UnicodeString
&src
, UErrorCode
&status
) {
394 if (U_FAILURE(status
)) {
398 u_strToUTF8WithSub(NULL
, 0, &utf8Length
, // Output Buffer, NULL for preflight.
399 src
.getBuffer(), src
.length(), // UTF-16 data
400 0xfffd, NULL
, // Substitution char, number of subs.
// U_BUFFER_OVERFLOW_ERROR is the normal preflight outcome; any other failure is handled separately.
402 if (U_FAILURE(status
) && status
!= U_BUFFER_OVERFLOW_ERROR
) {
// Clear the preflight status before doing the real conversion.
405 status
= U_ZERO_ERROR
;
407 char *buffer
= dest
.getAppendBuffer(utf8Length
, utf8Length
, capacity
, status
);
408 u_strToUTF8WithSub(buffer
, utf8Length
, NULL
,
409 src
.getBuffer(), src
.length(),
410 0xfffd, NULL
, &status
);
411 dest
.append(buffer
, utf8Length
, status
);
// Points textToBreak at dataToBreak (UTF-16) and rebuilds textMap for it.
// textMap ends up with one entry per UTF-16 code unit plus a final entry for
// the end of the text (checked by the U_ASSERT). A code unit that starts a code
// point maps to its own index; the -1 entries presumably mark code units that
// do not start a code point (e.g. trail surrogates).
//   status  ICU error code passed through to the UText and vector calls.
415 void TestParams::setUTF16(UErrorCode
&status
) {
// Passing the existing textToBreak lets utext_openUnicodeString reuse it.
416 textToBreak
= utext_openUnicodeString(textToBreak
, &dataToBreak
, &status
);
417 textMap
->removeAllElements();
418 for (int32_t i
=0; i
<dataToBreak
.length(); i
++) {
419 if (i
== dataToBreak
.getChar32Start(i
)) {
420 textMap
->addElement(i
, status
);
422 textMap
->addElement(-1, status
);
// Extra entry for the position just past the last code unit.
425 textMap
->addElement(dataToBreak
.length(), status
);
426 U_ASSERT(dataToBreak
.length() + 1 == textMap
->size());
// Switches the test text to UTF-8: converts dataToBreak into utf8String, opens
// textToBreak over those bytes, and rebuilds textMap.
// In the rebuilt map, a UTF-8 offset where a code point starts holds the
// matching UTF-16 index into dataToBreak; the remaining byte offsets of a
// multi-byte code point are filled with -1. The final U_ASSERT checks that the
// map has one more entry than the UTF-8 text has bytes.
//   status  ICU error code; checked on entry and again after opening the UText.
430 void TestParams::setUTF8(UErrorCode
&status
) {
431 if (U_FAILURE(status
)) {
435 CharStringAppend(utf8String
, dataToBreak
, status
);
436 textToBreak
= utext_openUTF8(textToBreak
, utf8String
.data(), utf8String
.length(), &status
);
437 if (U_FAILURE(status
)) {
441 textMap
->removeAllElements();
442 int32_t utf16Index
= 0;
444 textMap
->addElement(utf16Index
, status
);
445 UChar32 c32
= utext_current32(textToBreak
);
// Advance the UTF-16 index by the number of code units this code point needs.
449 utf16Index
+= U16_LENGTH(c32
);
450 utext_next32(textToBreak
);
// Pad the map with -1 up to the UText's new native (byte) index.
451 while (textMap
->size() < utext_getNativeIndex(textToBreak
)) {
452 textMap
->addElement(-1, status
);
455 U_ASSERT(utext_nativeLength(textToBreak
) + 1 == textMap
->size());
// Returns the entry of srcLine (test-file line number) that corresponds to
// position bp in the text being broken.
// bp is first clamped to the last textMap entry, then walked backwards through
// textMap to reach a character boundary; the index found there is used to look
// up srcLine.
//   bp  position in textToBreak.
459 int32_t TestParams::getSrcLine(int32_t bp
) {
460 if (bp
>= textMap
->size()) {
461 bp
= textMap
->size() - 1;
464 for(; bp
>= 0 ; --bp
) {
465 // Move to a character boundary if we are not on one already.
466 i
= textMap
->elementAti(bp
);
471 return srcLine
->elementAti(i
);
// Looks up the expected-break entry for position bp in the text being broken.
// bp is translated through textMap into an index into expectedBreaks; positions
// at or beyond textMap->size() are handled as a special case first.
// Callers in this file treat a result of 0 as "no break expected here".
//   bp  position in textToBreak.
475 int32_t TestParams::getExpectedBreak(int32_t bp
) {
476 if (bp
>= textMap
->size()) {
479 int32_t i
= textMap
->elementAti(bp
);
482 retVal
= expectedBreaks
->elementAti(i
);
// Returns the entry of srcCol (test-file column number) that corresponds to
// position bp in the text being broken. Mirrors getSrcLine(): bp is clamped to
// the last textMap entry, walked backwards to a character boundary, and the
// index found there is used to look up srcCol.
//   bp  position in textToBreak.
488 int32_t TestParams::getSrcCol(int32_t bp
) {
489 if (bp
>= textMap
->size()) {
490 bp
= textMap
->size() - 1;
493 for(; bp
>= 0; --bp
) {
494 // Move bp to a character boundary if we are not on one already.
495 i
= textMap
->elementAti(bp
);
500 return srcCol
->elementAti(i
);
504 void RBBITest::executeTest(TestParams
*t
, UErrorCode
&status
) {
509 TEST_ASSERT_SUCCESS(status
);
510 if (U_FAILURE(status
)) {
518 t
->bi
->setText(t
->textToBreak
, status
);
520 // Run the iterator forward
523 for (bp
= t
->bi
->first(); bp
!= BreakIterator::DONE
; bp
= t
->bi
->next()) {
525 // Fail for lack of forward progress.
526 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
527 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
));
531 // Check that we didn't miss an expected break between the last one
533 for (i
=prevBP
+1; i
<bp
; i
++) {
534 if (t
->getExpectedBreak(i
) != 0) {
535 int expected
[] = {0, i
};
536 printStringBreaks(t
->dataToBreak
, expected
, 2);
537 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
538 i
, t
->getSrcLine(i
), t
->getSrcCol(i
));
542 // Check that the break we did find was expected
543 if (t
->getExpectedBreak(bp
) == 0) {
544 int expected
[] = {0, bp
};
545 printStringBreaks(t
->textToBreak
, expected
, 2);
546 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
547 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
));
549 // The break was expected.
550 // Check that the {nnn} tag value is correct.
551 int32_t expectedTagVal
= t
->getExpectedBreak(bp
);
552 if (expectedTagVal
== -1) {
555 int32_t line
= t
->getSrcLine(bp
);
556 int32_t rs
= t
->bi
->getRuleStatus();
557 if (rs
!= expectedTagVal
) {
558 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
559 " Actual, Expected status = %4d, %4d",
560 bp
, line
, t
->getSrcCol(bp
), rs
, expectedTagVal
);
567 // Verify that there were no missed expected breaks after the last one found
568 for (i
=prevBP
+1; i
<utext_nativeLength(t
->textToBreak
); i
++) {
569 if (t
->getExpectedBreak(i
) != 0) {
570 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
571 i
, t
->getSrcLine(i
), t
->getSrcCol(i
));
576 // Run the iterator backwards, verify that the same breaks are found.
578 prevBP
= static_cast<int32_t>(utext_nativeLength(t
->textToBreak
) + 2); // start with a phony value for the last break pos seen.
580 while (bp
!= BreakIterator::DONE
) {
582 // Fail for lack of progress.
583 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
584 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
));
588 // Check that we didn't miss an expected break between the last one
589 // and this one. (UVector returns zeros for index out of bounds.)
590 for (i
=prevBP
-1; i
>bp
; i
--) {
591 if (t
->getExpectedBreak(i
) != 0) {
592 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
593 i
, t
->getSrcLine(i
), t
->getSrcCol(i
));
597 // Check that the break we did find was expected
598 if (t
->getExpectedBreak(bp
) == 0) {
599 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
600 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
));
602 // The break was expected.
603 // Check that the {nnn} tag value is correct.
604 int32_t expectedTagVal
= t
->getExpectedBreak(bp
);
605 if (expectedTagVal
== -1) {
608 int line
= t
->getSrcLine(bp
);
609 int32_t rs
= t
->bi
->getRuleStatus();
610 if (rs
!= expectedTagVal
) {
611 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
612 " Actual, Expected status = %4d, %4d",
613 bp
, line
, t
->getSrcCol(bp
), rs
, expectedTagVal
);
618 bp
= t
->bi
->previous();
621 // Verify that there were no missed breaks prior to the last one found
622 for (i
=prevBP
-1; i
>=0; i
--) {
623 if (t
->getExpectedBreak(i
) != 0) {
624 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
625 i
, t
->getSrcLine(i
), t
->getSrcCol(i
));
629 // Check isBoundary()
630 for (i
=0; i
< utext_nativeLength(t
->textToBreak
); i
++) {
631 UBool boundaryExpected
= (t
->getExpectedBreak(i
) != 0);
632 UBool boundaryFound
= t
->bi
->isBoundary(i
);
633 if (boundaryExpected
!= boundaryFound
) {
634 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
635 " Expected, Actual= %s, %s",
636 i
, t
->getSrcLine(i
), t
->getSrcCol(i
),
637 boundaryExpected
? "true":"false", boundaryFound
? "true" : "false");
642 for (i
=0; i
< static_cast<int32_t>(utext_nativeLength(t
->textToBreak
)); i
++) {
643 int32_t actualBreak
= t
->bi
->following(i
);
644 int32_t expectedBreak
= BreakIterator::DONE
;
645 for (int32_t j
=i
+1; j
<= static_cast<int32_t>(utext_nativeLength(t
->textToBreak
)); j
++) {
646 if (t
->getExpectedBreak(j
) != 0) {
651 if (expectedBreak
!= actualBreak
) {
652 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
653 " Expected, Actual= %d, %d",
654 i
, t
->getSrcLine(i
), t
->getSrcCol(i
), expectedBreak
, actualBreak
);
659 for (i
=static_cast<int32_t>(utext_nativeLength(t
->textToBreak
)); i
>=0; i
--) {
660 int32_t actualBreak
= t
->bi
->preceding(i
);
661 int32_t expectedBreak
= BreakIterator::DONE
;
663 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
664 // preceding(trailing byte) will return the index of some preceding code point,
665 // not the lead byte of the current code point, even though that has a smaller index.
666 // Therefore, start looking at the expected break data not at i-1, but at
667 // the start of code point index - 1.
668 utext_setNativeIndex(t
->textToBreak
, i
);
669 int32_t j
= static_cast<int32_t>(utext_getNativeIndex(t
->textToBreak
) - 1);
670 for (; j
>= 0; j
--) {
671 if (t
->getExpectedBreak(j
) != 0) {
676 if (expectedBreak
!= actualBreak
) {
677 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
678 " Expected, Actual= %d, %d",
679 i
, t
->getSrcLine(i
), t
->getSrcCol(i
), expectedBreak
, actualBreak
);
685 void RBBITest::TestExtended() {
686 // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
687 // data driven test closely entangles filtered and regular data.
688 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
689 UErrorCode status
= U_ZERO_ERROR
;
692 TestParams
tp(status
);
694 RegexMatcher
localeMatcher(UnicodeString(u
"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status
);
695 if (U_FAILURE(status
)) {
696 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__
, __LINE__
, u_errorName(status
));
700 // Open and read the test data file.
702 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
703 CharString
testFileName(testDataDirectory
, -1, status
);
704 testFileName
.append("rbbitst.txt", -1, status
);
707 UChar
*testFile
= ReadAndConvertFile(testFileName
.data(), len
, "UTF-8", status
);
708 if (U_FAILURE(status
)) {
709 errln("%s:%d Error %s opening file rbbitst.txt", __FILE__
, __LINE__
, u_errorName(status
));
713 bool skipTest
= false; // Skip this test?
716 // Put the test data into a UnicodeString
718 UnicodeString
testString(FALSE
, testFile
, len
);
727 parseState
= PARSE_TAG
;
729 EParseState savedState
= PARSE_TAG
;
732 int32_t colStart
= 0;
736 int32_t tagValue
= 0; // The numeric value of a <nnn> tag.
738 UnicodeString rules
; // Holds rules from a <rules> ... </rules> block
739 int32_t rulesFirstLine
; // Line number of the start of current <rules> block
741 for (charIdx
= 0; charIdx
< len
; ) {
742 status
= U_ZERO_ERROR
;
743 UChar c
= testString
.charAt(charIdx
);
745 if (c
== u
'\r' && charIdx
<len
&& testString
.charAt(charIdx
) == u
'\n') {
746 // treat CRLF as a unit
750 if (c
== u
'\n' || c
== u
'\r') {
754 column
= charIdx
- colStart
+ 1;
756 switch (parseState
) {
758 if (c
== u
'\n' || c
== u
'\r') {
759 parseState
= savedState
;
766 parseState
= PARSE_COMMENT
;
767 savedState
= PARSE_TAG
;
770 if (u_isUWhiteSpace(c
)) {
773 if (testString
.compare(charIdx
-1, 6, u
"<word>") == 0) {
775 tp
.bi
= BreakIterator::createWordInstance(locale
, status
);
780 if (testString
.compare(charIdx
-1, 6, u
"<char>") == 0) {
782 tp
.bi
= BreakIterator::createCharacterInstance(locale
, status
);
787 if (testString
.compare(charIdx
-1, 6, u
"<line>") == 0) {
789 tp
.bi
= BreakIterator::createLineInstance(locale
, status
);
794 if (testString
.compare(charIdx
-1, 6, u
"<sent>") == 0) {
796 tp
.bi
= BreakIterator::createSentenceInstance(locale
, status
);
801 if (testString
.compare(charIdx
-1, 7, u
"<title>") == 0) {
803 tp
.bi
= BreakIterator::createTitleInstance(locale
, status
);
808 if (testString
.compare(charIdx
-1, 7, u
"<rules>") == 0 ||
809 testString
.compare(charIdx
-1, 10, u
"<badrules>") == 0) {
810 charIdx
= testString
.indexOf(u
'>', charIdx
) + 1;
811 parseState
= PARSE_RULES
;
813 rulesFirstLine
= lineNum
;
818 localeMatcher
.reset(testString
);
819 if (localeMatcher
.lookingAt(charIdx
-1, status
)) {
820 UnicodeString localeName
= localeMatcher
.group(1, status
);
821 char localeName8
[100];
822 localeName
.extract(0, localeName
.length(), localeName8
, sizeof(localeName8
), 0);
823 locale
= Locale::createFromName(localeName8
);
824 charIdx
+= localeMatcher
.group(0, status
).length() - 1;
825 TEST_ASSERT_SUCCESS(status
);
828 if (testString
.compare(charIdx
-1, 6, u
"<data>") == 0) {
829 parseState
= PARSE_DATA
;
832 tp
.expectedBreaks
->removeAllElements();
833 tp
.srcCol
->removeAllElements();
834 tp
.srcLine
->removeAllElements();
838 errln("line %d: Tag expected in test file.", lineNum
);
839 parseState
= PARSE_COMMENT
;
840 savedState
= PARSE_DATA
;
841 goto end_test
; // Stop the test.
846 if (testString
.compare(charIdx
-1, 8, u
"</rules>") == 0) {
848 parseState
= PARSE_TAG
;
851 tp
.bi
= new RuleBasedBreakIterator(rules
, pe
, status
);
852 skipTest
= U_FAILURE(status
);
853 if (U_FAILURE(status
)) {
854 errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
855 rulesFirstLine
+ pe
.line
- 1, u_errorName(status
));
857 } else if (testString
.compare(charIdx
-1, 11, u
"</badrules>") == 0) {
859 parseState
= PARSE_TAG
;
860 UErrorCode ec
= U_ZERO_ERROR
;
862 RuleBasedBreakIterator
bi(rules
, pe
, ec
);
864 errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
865 rulesFirstLine
+ pe
.line
- 1);
873 if (c
== u
'\u2022') { // u'•'
874 int32_t breakIdx
= tp
.dataToBreak
.length();
875 tp
.expectedBreaks
->setSize(breakIdx
+1);
876 tp
.expectedBreaks
->setElementAt(-1, breakIdx
);
877 tp
.srcLine
->setSize(breakIdx
+1);
878 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
879 tp
.srcCol
->setSize(breakIdx
+1);
880 tp
.srcCol
->setElementAt(column
, breakIdx
);
884 if (testString
.compare(charIdx
-1, 7, u
"</data>") == 0) {
885 // Add final entry to mappings from break location to source file position.
886 // Need one extra because last break position returned is after the
887 // last char in the data, not at the last char.
888 tp
.srcLine
->addElement(lineNum
, status
);
889 tp
.srcCol
->addElement(column
, status
);
891 parseState
= PARSE_TAG
;
896 status
= U_ZERO_ERROR
;
898 executeTest(&tp
, status
);
899 TEST_ASSERT_SUCCESS(status
);
901 // Run again, this time with UTF-8 text wrapped in a UText.
902 status
= U_ZERO_ERROR
;
904 TEST_ASSERT_SUCCESS(status
);
905 executeTest(&tp
, status
);
910 if (testString
.compare(charIdx
-1, 3, u
"\\N{") == 0) {
911 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
912 // Get the code point from the name and insert it into the test data.
913 // (Damn, no API takes names in Unicode !!!
914 // we've got to take it back to char *)
915 int32_t nameEndIdx
= testString
.indexOf(u
'}', charIdx
);
916 int32_t nameLength
= nameEndIdx
- (charIdx
+2);
917 char charNameBuf
[200];
918 UChar32 theChar
= -1;
919 if (nameEndIdx
!= -1) {
920 UErrorCode status
= U_ZERO_ERROR
;
921 testString
.extract(charIdx
+2, nameLength
, charNameBuf
, sizeof(charNameBuf
));
922 charNameBuf
[sizeof(charNameBuf
)-1] = 0;
923 theChar
= u_charFromName(U_UNICODE_CHAR_NAME
, charNameBuf
, &status
);
924 if (U_FAILURE(status
)) {
929 errln("Error in named character in test file at line %d, col %d",
932 // Named code point was recognized. Insert it
933 // into the test data.
934 tp
.dataToBreak
.append(theChar
);
935 while (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
936 tp
.srcLine
->addElement(lineNum
, status
);
937 tp
.srcCol
->addElement(column
, status
);
940 if (nameEndIdx
> charIdx
) {
941 charIdx
= nameEndIdx
+1;
949 if (testString
.compare(charIdx
-1, 2, u
"<>") == 0) {
951 int32_t breakIdx
= tp
.dataToBreak
.length();
952 tp
.expectedBreaks
->setSize(breakIdx
+1);
953 tp
.expectedBreaks
->setElementAt(-1, breakIdx
);
954 tp
.srcLine
->setSize(breakIdx
+1);
955 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
956 tp
.srcCol
->setSize(breakIdx
+1);
957 tp
.srcCol
->setElementAt(column
, breakIdx
);
963 parseState
= PARSE_NUM
;
967 if (c
== u
'#' && column
==3) { // TODO: why is column off so far?
968 parseState
= PARSE_COMMENT
;
969 savedState
= PARSE_DATA
;
974 // Check for \ at end of line, a line continuation.
975 // Advance over (discard) the newline
976 UChar32 cp
= testString
.char32At(charIdx
);
977 if (cp
== u
'\r' && charIdx
<len
&& testString
.charAt(charIdx
+1) == u
'\n') {
979 // Need an extra increment of the input ptr to move over both of them
982 if (cp
== u
'\n' || cp
== u
'\r') {
989 // Let unescape handle the back slash.
990 cp
= testString
.unescapeAt(charIdx
);
992 // Escape sequence was recognized. Insert the char
993 // into the test data.
994 tp
.dataToBreak
.append(cp
);
995 while (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
996 tp
.srcLine
->addElement(lineNum
, status
);
997 tp
.srcCol
->addElement(column
, status
);
1003 // Not a recognized backslash escape sequence.
1004 // Take the next char as a literal.
1005 // TODO: Should this be an error?
1006 c
= testString
.charAt(charIdx
);
1007 charIdx
= testString
.moveIndex32(charIdx
, 1);
1010 // Normal, non-escaped data char.
1011 tp
.dataToBreak
.append(c
);
1013 // Save the mapping from offset in the data to line/column numbers in
1014 // the original input file. Will be used for better error messages only.
1015 // If there's an expected break before this char, the slot in the mapping
1016 // vector will already be set for this char; don't overwrite it.
1017 if (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1018 tp
.srcLine
->addElement(lineNum
, status
);
1019 tp
.srcCol
->addElement(column
, status
);
1025 // We are parsing an expected numeric tag value, like <1234>,
1026 // within a chunk of data.
1027 if (u_isUWhiteSpace(c
)) {
1032 // Finished the number. Add the info to the expected break data,
1033 // and switch parse state back to doing plain data.
1034 parseState
= PARSE_DATA
;
1035 if (tagValue
== 0) {
1038 int32_t breakIdx
= tp
.dataToBreak
.length();
1039 tp
.expectedBreaks
->setSize(breakIdx
+1);
1040 tp
.expectedBreaks
->setElementAt(tagValue
, breakIdx
);
1041 tp
.srcLine
->setSize(breakIdx
+1);
1042 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1043 tp
.srcCol
->setSize(breakIdx
+1);
1044 tp
.srcCol
->setElementAt(column
, breakIdx
);
1049 tagValue
= tagValue
*10 + u_charDigitValue(c
);
1053 errln("Syntax Error in test file at line %d, col %d",
1055 parseState
= PARSE_COMMENT
;
1056 goto end_test
; // Stop the test
1061 if (U_FAILURE(status
)) {
1062 dataerrln("ICU Error %s while parsing test file at line %d.",
1063 u_errorName(status
), lineNum
);
1064 status
= U_ZERO_ERROR
;
1065 goto end_test
; // Stop the test
1070 // Reached end of test file. Raise an error if parseState indicates that we are
1071 // within a block that should have been terminated.
1073 if (parseState
== PARSE_RULES
) {
1074 errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
1075 lineNum
, rulesFirstLine
);
1077 if (parseState
== PARSE_DATA
) {
1078 errln("rbbitst.txt:%d <data> block not closed.", lineNum
);
1088 //-------------------------------------------------------------------------------
1090 // TestDictRules: create a break iterator from source rules that include a
1091 // dictionary range. Regression for bug #7130. Source rules
1092 // do not declare a break iterator type (word, line, sentence, etc.),
1093 // but the dictionary code, without a type, would loop.
1095 //-------------------------------------------------------------------------------
// Builds a RuleBasedBreakIterator directly from rule source that defines a
// $dictionary set, then steps through the text "aa" with next().
// The loop is capped at 10 iterations so the test ends even if DONE is never
// reported; the loop counter is asserted to be 1 when it is checked. Failure to
// build the iterator is reported as a data error.
1096 void RBBITest::TestDictRules() {
1097 const char *rules
= "$dictionary = [a-z]; \n"
1099 "$dictionary $dictionary; \n"
1101 "$dictionary $dictionary; \n";
1102 const char *text
= "aa";
1103 UErrorCode status
= U_ZERO_ERROR
;
1104 UParseError parseError
;
1106 RuleBasedBreakIterator
bi(rules
, parseError
, status
);
1107 if (U_SUCCESS(status
)) {
1108 UnicodeString utext
= text
;
// Bounded loop: guards against next() never returning DONE.
1112 for (loops
= 0; loops
<10; loops
++) {
1113 position
= bi
.next();
1114 if (position
== RuleBasedBreakIterator::DONE
) {
1118 TEST_ASSERT(loops
== 1);
1120 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status
));
1126 //-------------------------------------------------------------------------------
1128 // ReadAndConvertFile Read a text data file, convert it to UChars, and
1129 // return the data in one big UChar * buffer, which the caller must delete.
1132 // fileName: the path of the file to read; it is passed to fopen() exactly as given.
1134 // ulen an out parameter, receives the actual length (in UChars) of the file data.
1135 // encoding The file encoding. If the file contains a BOM, that will override the encoding
1136 // specified here. The BOM, if it exists, will be stripped from the returned data.
1137 // Pass NULL for the system default encoding.
1140 // The file data, converted to UChar.
1141 // The caller must delete this when done with
1142 // delete [] theBuffer;
1144 // TODO: This is a clone of RegexTest::ReadAndConvertFile.
1145 // Move this function to some common place.
1147 //--------------------------------------------------------------------------------
// Steps visible in the implementation: open the file in binary mode, read it
// whole into a char buffer, detect and skip a Unicode signature (BOM), open a
// converter for the resulting encoding, preflight ucnv_toUChars() to learn the
// output length, allocate the UChar buffer, and report ICU conversion errors.
// Failures along the way jump to the cleanUpAndReturn label.
1148 UChar
*RBBITest::ReadAndConvertFile(const char *fileName
, int &ulen
, const char *encoding
, UErrorCode
&status
) {
1149 UChar
*retPtr
= NULL
;
1150 char *fileBuf
= NULL
;
1151 UConverter
* conv
= NULL
;
1155 if (U_FAILURE(status
)) {
// The file name is used as given; no directory is prepended here.
1162 f
= fopen(fileName
, "rb");
1164 dataerrln("Error opening test data file %s\n", fileName
);
1165 status
= U_FILE_ACCESS_ERROR
;
// Determine the size by seeking to the end, then rewind and read everything with one fread().
1174 fseek( f
, 0, SEEK_END
);
1175 fileSize
= ftell(f
);
// NOTE(review): the buffer is allocated from the ftell() result before that
// result is validated by the fileSize <= 0 check below.
1176 fileBuf
= new char[fileSize
];
1177 fseek(f
, 0, SEEK_SET
);
1178 amt_read
= static_cast<int>(fread(fileBuf
, 1, fileSize
, f
));
// A short read or a non-positive size is reported as a read failure.
1179 if (amt_read
!= fileSize
|| fileSize
<= 0) {
1180 errln("Error reading test data file.");
1181 goto cleanUpAndReturn
;
1185 // Look for a Unicode Signature (BOM) on the data just read
1187 int32_t signatureLength
;
1188 const char * fileBufC
;
1189 const char* bomEncoding
;
1192 bomEncoding
= ucnv_detectUnicodeSignature(
1193 fileBuf
, fileSize
, &signatureLength
, &status
);
// A detected BOM is skipped, and its encoding replaces the caller-supplied one.
1194 if(bomEncoding
!=NULL
){
1195 fileBufC
+= signatureLength
;
1196 fileSize
-= signatureLength
;
1197 encoding
= bomEncoding
;
1201 // Open a converter to take the rule file to UTF-16
1203 conv
= ucnv_open(encoding
, &status
);
1204 if (U_FAILURE(status
)) {
1205 goto cleanUpAndReturn
;
1209 // Convert the rules to UChar.
1210 // Preflight first to determine required buffer size.
1212 ulen
= ucnv_toUChars(conv
,
1218 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
1219 // Buffer Overflow is expected from the preflight operation.
1220 status
= U_ZERO_ERROR
;
// One UChar more than ulen is allocated (presumably room for a terminator).
1222 retPtr
= new UChar
[ulen
+1];
1235 if (U_FAILURE(status
)) {
1236 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
1246 //--------------------------------------------------------------------------------------------
1248 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
1250 //-------------------------------------------------------------------------------------------
1251 void RBBITest::TestUnicodeFiles() {
1252 RuleBasedBreakIterator
*bi
;
1253 UErrorCode status
= U_ZERO_ERROR
;
1255 bi
= (RuleBasedBreakIterator
*)BreakIterator::createCharacterInstance(Locale::getEnglish(), status
);
1256 TEST_ASSERT_SUCCESS(status
);
1257 if (U_SUCCESS(status
)) {
1258 runUnicodeTestData("GraphemeBreakTest.txt", bi
);
1262 bi
= (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getEnglish(), status
);
1263 TEST_ASSERT_SUCCESS(status
);
1264 if (U_SUCCESS(status
)) {
1265 runUnicodeTestData("WordBreakTest.txt", bi
);
1269 bi
= (RuleBasedBreakIterator
*)BreakIterator::createSentenceInstance(Locale::getEnglish(), status
);
1270 TEST_ASSERT_SUCCESS(status
);
1271 if (U_SUCCESS(status
)) {
1272 runUnicodeTestData("SentenceBreakTest.txt", bi
);
1276 bi
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale::getEnglish(), status
);
1277 TEST_ASSERT_SUCCESS(status
);
1278 if (U_SUCCESS(status
)) {
1279 runUnicodeTestData("LineBreakTest.txt", bi
);
1285 // Check for test cases from the Unicode test data files that are known to fail
1286 // and should be skipped as known issues because ICU does not fully implement
1287 // the Unicode specifications, or because ICU includes tailorings that differ from
1288 // the Unicode standard.
1290 // Test cases are identified by the test data sequence, which tends to be more stable
1291 // across Unicode versions than the test file line numbers.
1293 // The test case with ticket "10666" is a dummy, included as an example.
1295 UBool
RBBITest::testCaseIsKnownIssue(const UnicodeString
&testCase
, const char *fileName
) {
1296 static struct TestCase
{
1297 const char *fTicketNum
;
1298 const char *fFileName
;
1299 const UChar
*fString
;
1300 } badTestCases
[] = {
1301 {"10666", "GraphemeBreakTest.txt", u
"\u0020\u0020\u0033"}, // Fake example, for illustration.
1302 // Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
1303 // This probably ultimately wants to be resolved by updating UAX-14, but in the meantime
1304 // ICU is out of sync with Unicode.
1305 {"8151", "LineBreakTest.txt", u
"-#"},
1306 {"8151", "LineBreakTest.txt", u
"\u002d\u0308\u0023"},
1307 {"8151", "LineBreakTest.txt", u
"\u002d\u00a7"},
1308 {"8151", "LineBreakTest.txt", u
"\u002d\u0308\u00a7"},
1309 {"8151", "LineBreakTest.txt", u
"\u002d\U00050005"},
1310 {"8151", "LineBreakTest.txt", u
"\u002d\u0308\U00050005"},
1311 {"8151", "LineBreakTest.txt", u
"\u002d\u0e01"},
1312 {"8151", "LineBreakTest.txt", u
"\u002d\u0308\u0e01"},
1314 // Issue ICU-12017 Improve line break around numbers
1315 {"12017", "LineBreakTest.txt", u
"\u002C\u0030"}, // ",0"
1316 {"12017", "LineBreakTest.txt", u
"\u002C\u0308\u0030"},
1317 {"12017", "LineBreakTest.txt", u
"find .com"},
1318 {"12017", "LineBreakTest.txt", u
"equals .35 cents"},
1319 {"12017", "LineBreakTest.txt", u
"a.2 "},
1320 {"12017", "LineBreakTest.txt", u
"a.2 \u0915"},
1321 {"12017", "LineBreakTest.txt", u
"a.2 \u672C"},
1322 {"12017", "LineBreakTest.txt", u
"a.2\u3000\u672C"},
1323 {"12017", "LineBreakTest.txt", u
"a.2\u3000\u307E"},
1324 {"12017", "LineBreakTest.txt", u
"a.2\u3000\u0033"},
1325 {"12017", "LineBreakTest.txt", u
"A.1 \uBABB"},
1326 {"12017", "LineBreakTest.txt", u
"\uBD24\uC5B4\u002E\u0020\u0041\u002E\u0032\u0020\uBCFC"},
1327 {"12017", "LineBreakTest.txt", u
"\uBD10\uC694\u002E\u0020\u0041\u002E\u0033\u0020\uBABB"},
1328 {"12017", "LineBreakTest.txt", u
"\uC694\u002E\u0020\u0041\u002E\u0034\u0020\uBABB"},
1329 {"12017", "LineBreakTest.txt", u
"a.2\u3000\u300C"},
1332 for (int n
=0; n
<UPRV_LENGTHOF(badTestCases
); n
++) {
1333 const TestCase
&badCase
= badTestCases
[n
];
1334 if (!strcmp(fileName
, badCase
.fFileName
) &&
1335 testCase
== UnicodeString(badCase
.fString
)) {
1336 return logKnownIssue(badCase
.fTicketNum
);
1343 //--------------------------------------------------------------------------------------------
1345 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
1347 //-------------------------------------------------------------------------------------------
1348 void RBBITest::runUnicodeTestData(const char *fileName
, RuleBasedBreakIterator
*bi
) {
1349 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1350 UErrorCode status
= U_ZERO_ERROR
;
1353 // Open and read the test data file, put it into a UnicodeString.
1355 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
1356 char testFileName
[1000];
1357 if (testDataDirectory
== NULL
|| strlen(testDataDirectory
) >= sizeof(testFileName
)) {
1358 dataerrln("Can't open test data. Path too long.");
1361 strcpy(testFileName
, testDataDirectory
);
1362 strcat(testFileName
, fileName
);
1364 logln("Opening data file %s\n", fileName
);
1367 UChar
*testFile
= ReadAndConvertFile(testFileName
, len
, "UTF-8", status
);
1368 if (status
!= U_FILE_ACCESS_ERROR
) {
1369 TEST_ASSERT_SUCCESS(status
);
1370 TEST_ASSERT(testFile
!= NULL
);
1372 if (U_FAILURE(status
) || testFile
== NULL
) {
1373 return; /* something went wrong, error already output */
1375 UnicodeString
testFileAsString(TRUE
, testFile
, len
);
1378 // Parse the test data file using a regular expression.
1379 // Each kind of token is recognized in its own capture group; what type of item was scanned
1380 // is identified by which group had a match.
1382 // Capture Group # 1 2 3 4 5
1383 // Parses this item: divide x hex digits comment \n unrecognized \n
1385 UnicodeString
tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV
);
1386 RegexMatcher
tokenMatcher(tokenExpr
, testFileAsString
, UREGEX_MULTILINE
| UREGEX_DOTALL
, status
);
1387 UnicodeString testString
;
1388 UVector32
breakPositions(status
);
1390 TEST_ASSERT_SUCCESS(status
);
1391 if (U_FAILURE(status
)) {
1396 // Scan through each test case, building up the string to be broken in testString,
1397 // and the positions that should be boundaries in the breakPositions vector.
1400 while (tokenMatcher
.find()) {
1401 if(tokenMatcher
.hitEnd()) {
1402 /* Shouldn't Happen(TM). This means we didn't find the symbols we were looking for.
1403 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1404 and caused an infinite loop here on EBCDIC systems!
1406 fprintf(stderr
,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName
, ++spin
);
1409 if (tokenMatcher
.start(1, status
) >= 0) {
1410 // Scanned a divide sign, indicating a break position in the test data.
1411 if (testString
.length()>0) {
1412 breakPositions
.addElement(testString
.length(), status
);
1415 else if (tokenMatcher
.start(2, status
) >= 0) {
1416 // Scanned an 'x', meaning no break at this position in the test data
1417 // Nothing to be done here.
1419 else if (tokenMatcher
.start(3, status
) >= 0) {
1420 // Scanned Hex digits. Convert them to binary, append to the character data string.
1421 const UnicodeString
&hexNumber
= tokenMatcher
.group(3, status
);
1422 int length
= hexNumber
.length();
1425 hexNumber
.extract (0, length
, buf
, sizeof(buf
), US_INV
);
1426 UChar32 c
= (UChar32
)strtol(buf
, NULL
, 16);
1428 testString
.append(c
);
1430 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1431 fileName
, lineNumber
);
1434 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1435 fileName
, lineNumber
);
1438 else if (tokenMatcher
.start(4, status
) >= 0) {
1439 // Scanned to end of a line, possibly skipping over a comment in the process.
1440 // If the line from the file contained test data, run the test now.
1441 if (testString
.length() > 0 && !testCaseIsKnownIssue(testString
, fileName
)) {
1442 checkUnicodeTestCase(fileName
, lineNumber
, testString
, &breakPositions
, bi
);
1445 // Clear out this test case.
1446 // The string and breakPositions vector will be refilled as the next
1447 // test case is parsed.
1448 testString
.remove();
1449 breakPositions
.removeAllElements();
1452 // Scanner catchall. Something unrecognized appeared on the line.
1454 UnicodeString uToken
= tokenMatcher
.group(0, status
);
1455 uToken
.extract(0, uToken
.length(), token
, (uint32_t)sizeof(token
));
1456 token
[sizeof(token
)-1] = 0;
1457 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName
, lineNumber
, token
);
1459 // Clean up, in preparation for continuing with the next line.
1460 testString
.remove();
1461 breakPositions
.removeAllElements();
1464 TEST_ASSERT_SUCCESS(status
);
1465 if (U_FAILURE(status
)) {
1471 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1474 //--------------------------------------------------------------------------------------------
1476 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1477 // test data files. Do only a simple, forward-only check -
1478 // this test is mostly to check that ICU and the Unicode
1479 // data agree with each other.
1481 //--------------------------------------------------------------------------------------------
1482 void RBBITest::checkUnicodeTestCase(const char *testFileName
, int lineNumber
,
1483 const UnicodeString
&testString
, // Text data to be broken
1484 UVector32
*breakPositions
, // Positions where breaks should be found.
1485 RuleBasedBreakIterator
*bi
) {
1486 int32_t pos
; // Break Position in the test string
1487 int32_t expectedI
= 0; // Index of expected break position in the vector of expected results.
1488 int32_t expectedPos
; // Expected break position (index into test string)
1490 bi
->setText(testString
);
1494 while (pos
!= BreakIterator::DONE
) {
1495 if (expectedI
>= breakPositions
->size()) {
1496 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1497 testFileName
, lineNumber
, pos
);
1500 expectedPos
= breakPositions
->elementAti(expectedI
);
1501 if (pos
< expectedPos
) {
1502 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1503 testFileName
, lineNumber
, pos
);
1506 if (pos
> expectedPos
) {
1507 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1508 testFileName
, lineNumber
, expectedPos
);
1515 if (pos
==BreakIterator::DONE
&& expectedI
<breakPositions
->size()) {
1516 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1517 testFileName
, lineNumber
, breakPositions
->elementAti(expectedI
));
1523 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1524 //---------------------------------------------------------------------------------------
1526 // class RBBIMonkeyKind
1528 // Monkey Test for Break Iteration
1529 // Abstract interface class. Concrete derived classes independently
1530 // implement the break rules for different iterator types.
1532 // The Monkey Test itself doesn't know which type of break iterator it is
1533 // testing, but works purely in terms of the interface defined here.
1535 //---------------------------------------------------------------------------------------
1536 class RBBIMonkeyKind
{
1538 // Return a UVector of UnicodeSets, representing the character classes used
1539 // for this type of iterator.
1540 virtual UVector
*charClasses() = 0;
1542 // Set the test text on which subsequent calls to next() will operate
1543 virtual void setText(const UnicodeString
&s
) = 0;
1545 // Find the next break position, starting from the prev break position, or from zero.
1546 // Return -1 after reaching end of string.
1547 virtual int32_t next(int32_t i
) = 0;
1549 virtual ~RBBIMonkeyKind();
1550 UErrorCode deferredStatus
;
1559 RBBIMonkeyKind::RBBIMonkeyKind() {
1560 deferredStatus
= U_ZERO_ERROR
;
1563 RBBIMonkeyKind::~RBBIMonkeyKind() {
1567 //----------------------------------------------------------------------------------------
1569 // Random Numbers. Similar to standard lib rand() and srand()
1570 // Not using library to
1571 // 1. Get same results on all platforms.
1572 // 2. Get access to current seed, to more easily reproduce failures.
1574 //---------------------------------------------------------------------------------------
1575 static uint32_t m_seed
= 1;
1577 static uint32_t m_rand()
1579 m_seed
= m_seed
* 1103515245 + 12345;
1580 return (uint32_t)(m_seed
/65536) % 32768;
1584 //------------------------------------------------------------------------------------------
1586 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
1587 // of RBBIMonkeyKind.
1589 //------------------------------------------------------------------------------------------
1590 class RBBICharMonkey
: public RBBIMonkeyKind
{
1593 virtual ~RBBICharMonkey();
1594 virtual UVector
*charClasses();
1595 virtual void setText(const UnicodeString
&s
);
1596 virtual int32_t next(int32_t i
);
1600 UnicodeSet
*fCRLFSet
;
1601 UnicodeSet
*fControlSet
;
1602 UnicodeSet
*fExtendSet
;
1603 UnicodeSet
*fZWJSet
;
1604 UnicodeSet
*fRegionalIndicatorSet
;
1605 UnicodeSet
*fPrependSet
;
1606 UnicodeSet
*fSpacingSet
;
1611 UnicodeSet
*fLVTSet
;
1612 UnicodeSet
*fHangulSet
;
1613 UnicodeSet
*fExtendedPictSet
;
1614 UnicodeSet
*fAnySet
;
1616 const UnicodeString
*fText
;
1620 RBBICharMonkey::RBBICharMonkey() {
1621 UErrorCode status
= U_ZERO_ERROR
;
1625 fCRLFSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status
);
1626 fControlSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status
);
1627 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status
);
1628 fZWJSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status
);
1629 fRegionalIndicatorSet
=
1630 new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status
);
1631 fPrependSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status
);
1632 fSpacingSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status
);
1633 fLSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status
);
1634 fVSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status
);
1635 fTSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status
);
1636 fLVSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status
);
1637 fLVTSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status
);
1638 fHangulSet
= new UnicodeSet();
1639 fHangulSet
->addAll(*fLSet
);
1640 fHangulSet
->addAll(*fVSet
);
1641 fHangulSet
->addAll(*fTSet
);
1642 fHangulSet
->addAll(*fLVSet
);
1643 fHangulSet
->addAll(*fLVTSet
);
1645 fExtendedPictSet
= new UnicodeSet(u
"[:Extended_Pictographic:]", status
);
1646 fAnySet
= new UnicodeSet(0, 0x10ffff);
1648 fSets
= new UVector(status
);
1649 fSets
->addElement(fCRLFSet
, status
);
1650 fSets
->addElement(fControlSet
, status
);
1651 fSets
->addElement(fExtendSet
, status
);
1652 fSets
->addElement(fRegionalIndicatorSet
, status
);
1653 if (!fPrependSet
->isEmpty()) {
1654 fSets
->addElement(fPrependSet
, status
);
1656 fSets
->addElement(fSpacingSet
, status
);
1657 fSets
->addElement(fHangulSet
, status
);
1658 fSets
->addElement(fAnySet
, status
);
1659 fSets
->addElement(fZWJSet
, status
);
1660 fSets
->addElement(fExtendedPictSet
, status
);
1661 if (U_FAILURE(status
)) {
1662 deferredStatus
= status
;
1667 void RBBICharMonkey::setText(const UnicodeString
&s
) {
1673 int32_t RBBICharMonkey::next(int32_t prevPos
) {
1674 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
1675 // break position being tested. The candidate break
1676 // location is before p2.
1680 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
1681 UChar32 cBase
; // for (X Extend*) patterns, the X character.
1683 if (U_FAILURE(deferredStatus
)) {
1687 // Previous break at end of string. return DONE.
1688 if (prevPos
>= fText
->length()) {
1691 p0
= p1
= p2
= p3
= prevPos
;
1692 c3
= fText
->char32At(prevPos
);
1693 c0
= c1
= c2
= cBase
= 0;
1694 (void)p0
; // suppress set but not used warning.
1697 // Loop runs once per "significant" character position in the input text.
1699 // Move all of the positions forward in the input string.
1704 // Advance p3 by one codepoint
1705 p3
= fText
->moveIndex32(p3
, 1);
1706 c3
= fText
->char32At(p3
);
1709 // Still warming up the loop. (won't work with zero length strings, but we don't care)
1712 if (p2
== fText
->length()) {
1713 // Reached end of string. Always a break position.
1718 // No Extend or Format characters may appear between the CR and LF,
1719 // which requires the additional check for p2 immediately following p1.
1721 if (c1
==0x0D && c2
==0x0A && p1
==(p2
-1)) {
1725 // Rule (GB4). ( Control | CR | LF ) <break>
1726 if (fControlSet
->contains(c1
) ||
1732 // Rule (GB5) <break> ( Control | CR | LF )
1734 if (fControlSet
->contains(c2
) ||
1741 // Rule (GB6) L x ( L | V | LV | LVT )
1742 if (fLSet
->contains(c1
) &&
1743 (fLSet
->contains(c2
) ||
1744 fVSet
->contains(c2
) ||
1745 fLVSet
->contains(c2
) ||
1746 fLVTSet
->contains(c2
))) {
1750 // Rule (GB7) ( LV | V ) x ( V | T )
1751 if ((fLVSet
->contains(c1
) || fVSet
->contains(c1
)) &&
1752 (fVSet
->contains(c2
) || fTSet
->contains(c2
))) {
1756 // Rule (GB8) ( LVT | T) x T
1757 if ((fLVTSet
->contains(c1
) || fTSet
->contains(c1
)) &&
1758 fTSet
->contains(c2
)) {
1762 // Rule (GB9) x (Extend | ZWJ)
1763 if (fExtendSet
->contains(c2
) || fZWJSet
->contains(c2
)) {
1764 if (!fExtendSet
->contains(c1
)) {
1770 // Rule (GB9a) x SpacingMark
1771 if (fSpacingSet
->contains(c2
)) {
1775 // Rule (GB9b) Prepend x
1776 if (fPrependSet
->contains(c1
)) {
1780 // Rule (GB11) Extended_Pictographic Extend * ZWJ x Extended_Pictographic
1781 if (fExtendedPictSet
->contains(cBase
) && fZWJSet
->contains(c1
) && fExtendedPictSet
->contains(c2
)) {
1785 // Rule (GB12-13) Regional_Indicator x Regional_Indicator
1786 // Note: The first if condition is a little tricky. We only need to force
1787 // a break if there are three or more contiguous RIs. If there are
1788 // only two, a break following will occur via other rules, and will include
1789 // any trailing extend characters, which is needed behavior.
1790 if (fRegionalIndicatorSet
->contains(c0
) && fRegionalIndicatorSet
->contains(c1
)
1791 && fRegionalIndicatorSet
->contains(c2
)) {
1794 if (fRegionalIndicatorSet
->contains(c1
) && fRegionalIndicatorSet
->contains(c2
)) {
1798 // Rule (GB999) Any <break> Any
1808 UVector
*RBBICharMonkey::charClasses() {
1813 RBBICharMonkey::~RBBICharMonkey() {
1818 delete fRegionalIndicatorSet
;
1829 delete fExtendedPictSet
;
1832 //------------------------------------------------------------------------------------------
1834 // class RBBIWordMonkey Word Break specific implementation
1835 // of RBBIMonkeyKind.
1837 //------------------------------------------------------------------------------------------
1838 class RBBIWordMonkey
: public RBBIMonkeyKind
{
1841 virtual ~RBBIWordMonkey();
1842 virtual UVector
*charClasses();
1843 virtual void setText(const UnicodeString
&s
);
1844 virtual int32_t next(int32_t i
);
1850 UnicodeSet
*fNewlineSet
;
1851 UnicodeSet
*fRegionalIndicatorSet
;
1852 UnicodeSet
*fKatakanaSet
;
1853 UnicodeSet
*fHebrew_LetterSet
;
1854 UnicodeSet
*fALetterSet
;
1855 UnicodeSet
*fSingle_QuoteSet
;
1856 UnicodeSet
*fDouble_QuoteSet
;
1857 UnicodeSet
*fMidNumLetSet
;
1858 UnicodeSet
*fMidLetterSet
;
1859 UnicodeSet
*fMidNumSet
;
1860 UnicodeSet
*fNumericSet
;
1861 UnicodeSet
*fFormatSet
;
1862 UnicodeSet
*fOtherSet
;
1863 UnicodeSet
*fExtendSet
;
1864 UnicodeSet
*fExtendNumLetSet
;
1865 UnicodeSet
*fWSegSpaceSet
;
1866 UnicodeSet
*fDictionarySet
;
1867 UnicodeSet
*fZWJSet
;
1868 UnicodeSet
*fExtendedPictSet
;
1870 const UnicodeString
*fText
;
1874 RBBIWordMonkey::RBBIWordMonkey()
1876 UErrorCode status
= U_ZERO_ERROR
;
1878 fSets
= new UVector(status
);
1880 fCRSet
= new UnicodeSet(u
"[\\p{Word_Break = CR}]", status
);
1881 fLFSet
= new UnicodeSet(u
"[\\p{Word_Break = LF}]", status
);
1882 fNewlineSet
= new UnicodeSet(u
"[\\p{Word_Break = Newline}]", status
);
1883 fKatakanaSet
= new UnicodeSet(u
"[\\p{Word_Break = Katakana}]", status
);
1884 fRegionalIndicatorSet
= new UnicodeSet(u
"[\\p{Word_Break = Regional_Indicator}]", status
);
1885 fHebrew_LetterSet
= new UnicodeSet(u
"[\\p{Word_Break = Hebrew_Letter}]", status
);
1886 fALetterSet
= new UnicodeSet(u
"[\\p{Word_Break = ALetter}]", status
);
1887 fSingle_QuoteSet
= new UnicodeSet(u
"[\\p{Word_Break = Single_Quote}]", status
);
1888 fDouble_QuoteSet
= new UnicodeSet(u
"[\\p{Word_Break = Double_Quote}]", status
);
1889 fMidNumLetSet
= new UnicodeSet(u
"[\\p{Word_Break = MidNumLet}]", status
);
1890 fMidLetterSet
= new UnicodeSet(u
"[\\p{Word_Break = MidLetter} - [\\:]]", status
);
1891 fMidNumSet
= new UnicodeSet(u
"[\\p{Word_Break = MidNum}]", status
);
1892 fNumericSet
= new UnicodeSet(u
"[[\\p{Word_Break = Numeric}][\\uff10-\\uff19]]", status
);
1893 fFormatSet
= new UnicodeSet(u
"[\\p{Word_Break = Format}]", status
);
1894 fExtendNumLetSet
= new UnicodeSet(u
"[\\p{Word_Break = ExtendNumLet}]", status
);
1895 fExtendSet
= new UnicodeSet(u
"[\\p{Word_Break = Extend}]", status
);
1896 fWSegSpaceSet
= new UnicodeSet(u
"[\\p{Word_Break = WSegSpace}]", status
);
1898 fZWJSet
= new UnicodeSet(u
"[\\p{Word_Break = ZWJ}]", status
);
1899 fExtendedPictSet
= new UnicodeSet(u
"[:Extended_Pictographic:]", status
);
1901 fDictionarySet
= new UnicodeSet(u
"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status
);
1902 fDictionarySet
->addAll(*fKatakanaSet
);
1903 fDictionarySet
->addAll(UnicodeSet(u
"[\\p{LineBreak = Complex_Context}]", status
));
1905 fALetterSet
->removeAll(*fDictionarySet
);
1907 fOtherSet
= new UnicodeSet();
1908 if(U_FAILURE(status
)) {
1909 IntlTest::gTest
->errln("%s:%d %s", __FILE__
, __LINE__
, u_errorName(status
));
1910 deferredStatus
= status
;
1914 fOtherSet
->complement();
1915 fOtherSet
->removeAll(*fCRSet
);
1916 fOtherSet
->removeAll(*fLFSet
);
1917 fOtherSet
->removeAll(*fNewlineSet
);
1918 fOtherSet
->removeAll(*fKatakanaSet
);
1919 fOtherSet
->removeAll(*fHebrew_LetterSet
);
1920 fOtherSet
->removeAll(*fALetterSet
);
1921 fOtherSet
->removeAll(*fSingle_QuoteSet
);
1922 fOtherSet
->removeAll(*fDouble_QuoteSet
);
1923 fOtherSet
->removeAll(*fMidLetterSet
);
1924 fOtherSet
->removeAll(*fMidNumSet
);
1925 fOtherSet
->removeAll(*fNumericSet
);
1926 fOtherSet
->removeAll(*fExtendNumLetSet
);
1927 fOtherSet
->removeAll(*fWSegSpaceSet
);
1928 fOtherSet
->removeAll(*fFormatSet
);
1929 fOtherSet
->removeAll(*fExtendSet
);
1930 fOtherSet
->removeAll(*fRegionalIndicatorSet
);
1931 fOtherSet
->removeAll(*fZWJSet
);
1932 fOtherSet
->removeAll(*fExtendedPictSet
);
1934 // Inhibit dictionary characters from being tested at all.
1935 fOtherSet
->removeAll(*fDictionarySet
);
1937 fSets
->addElement(fCRSet
, status
);
1938 fSets
->addElement(fLFSet
, status
);
1939 fSets
->addElement(fNewlineSet
, status
);
1940 fSets
->addElement(fRegionalIndicatorSet
, status
);
1941 fSets
->addElement(fHebrew_LetterSet
, status
);
1942 fSets
->addElement(fALetterSet
, status
);
1943 fSets
->addElement(fSingle_QuoteSet
, status
);
1944 fSets
->addElement(fDouble_QuoteSet
, status
);
1945 //fSets->addElement(fKatakanaSet, status); // Omit Katakana from fSets, which omits Katakana characters
1946 // from the test data. They are all in the dictionary set,
1947 // which this (old, to be retired) monkey test cannot handle.
1948 fSets
->addElement(fMidLetterSet
, status
);
1949 fSets
->addElement(fMidNumLetSet
, status
);
1950 fSets
->addElement(fMidNumSet
, status
);
1951 fSets
->addElement(fNumericSet
, status
);
1952 fSets
->addElement(fFormatSet
, status
);
1953 fSets
->addElement(fExtendSet
, status
);
1954 fSets
->addElement(fOtherSet
, status
);
1955 fSets
->addElement(fExtendNumLetSet
, status
);
1956 fSets
->addElement(fWSegSpaceSet
, status
);
1958 fSets
->addElement(fZWJSet
, status
);
1959 fSets
->addElement(fExtendedPictSet
, status
);
1961 if (U_FAILURE(status
)) {
1962 deferredStatus
= status
;
1966 void RBBIWordMonkey::setText(const UnicodeString
&s
) {
1971 int32_t RBBIWordMonkey::next(int32_t prevPos
) {
1972 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
1973 // break position being tested. The candidate break
1974 // location is before p2.
1978 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
1980 if (U_FAILURE(deferredStatus
)) {
1984 // Prev break at end of string. return DONE.
1985 if (prevPos
>= fText
->length()) {
1988 p0
= p1
= p2
= p3
= prevPos
;
1989 c3
= fText
->char32At(prevPos
);
1991 (void)p0
; // Suppress set but not used warning.
1993 // Loop runs once per "significant" character position in the input text.
1995 // Move all of the positions forward in the input string.
2000 // Advance p3 by X(Extend | Format)* Rule 4
2001 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2003 p3
= fText
->moveIndex32(p3
, 1);
2004 c3
= fText
->char32At(p3
);
2005 if (fCRSet
->contains(c2
) || fLFSet
->contains(c2
) || fNewlineSet
->contains(c2
)) {
2009 while (fFormatSet
->contains(c3
) || fExtendSet
->contains(c3
) || fZWJSet
->contains(c3
));
2013 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2016 if (p2
== fText
->length()) {
2017 // Reached end of string. Always a break position.
2022 // No Extend or Format characters may appear between the CR and LF,
2023 // which requires the additional check for p2 immediately following p1.
2025 if (c1
==0x0D && c2
==0x0A) {
2029 // Rule (3a) Break before and after newlines (including CR and LF)
2031 if (fCRSet
->contains(c1
) || fLFSet
->contains(c1
) || fNewlineSet
->contains(c1
)) {
2034 if (fCRSet
->contains(c2
) || fLFSet
->contains(c2
) || fNewlineSet
->contains(c2
)) {
2038 // Rule (3c) ZWJ x Extended_Pictographic
2039 // Not ignoring extend chars, so peek into input text to
2040 // get the potential ZWJ, the character immediately preceding c2.
2041 // Sloppy UChar32 indexing: p2-1 may reference trail half
2042 // but char32At will get the full code point.
2043 if (fZWJSet
->contains(fText
->char32At(p2
-1)) && fExtendedPictSet
->contains(c2
)) {
2047 // Rule (3d) Keep horizontal whitespace together.
2048 if (fWSegSpaceSet
->contains(fText
->char32At(p2
-1)) && fWSegSpaceSet
->contains(c2
)) {
2052 // Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
2053 if ((fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
)) &&
2054 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
))) {
2058 // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
2060 if ( (fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
)) &&
2061 (fMidLetterSet
->contains(c2
) || fMidNumLetSet
->contains(c2
) || fSingle_QuoteSet
->contains(c2
)) &&
2062 (fALetterSet
->contains(c3
) || fHebrew_LetterSet
->contains(c3
))) {
2066 // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)
2067 if ((fALetterSet
->contains(c0
) || fHebrew_LetterSet
->contains(c0
)) &&
2068 (fMidLetterSet
->contains(c1
) || fMidNumLetSet
->contains(c1
) || fSingle_QuoteSet
->contains(c1
)) &&
2069 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
))) {
2073 // Rule (7a) Hebrew_Letter x Single_Quote
2074 if (fHebrew_LetterSet
->contains(c1
) && fSingle_QuoteSet
->contains(c2
)) {
2078 // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter
2079 if (fHebrew_LetterSet
->contains(c1
) && fDouble_QuoteSet
->contains(c2
) && fHebrew_LetterSet
->contains(c3
)) {
2083 // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter
2084 if (fHebrew_LetterSet
->contains(c0
) && fDouble_QuoteSet
->contains(c1
) && fHebrew_LetterSet
->contains(c2
)) {
2088 // Rule (8) Numeric x Numeric
2089 if (fNumericSet
->contains(c1
) &&
2090 fNumericSet
->contains(c2
)) {
2094 // Rule (9) (ALetter | Hebrew_Letter) x Numeric
2095 if ((fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
)) &&
2096 fNumericSet
->contains(c2
)) {
2100 // Rule (10) Numeric x (ALetter | Hebrew_Letter)
2101 if (fNumericSet
->contains(c1
) &&
2102 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
))) {
2106 // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric
2107 if (fNumericSet
->contains(c0
) &&
2108 (fMidNumSet
->contains(c1
) || fMidNumLetSet
->contains(c1
) || fSingle_QuoteSet
->contains(c1
)) &&
2109 fNumericSet
->contains(c2
)) {
2113 // Rule (12) Numeric x (MidNum | MidNumLet | Single_Quote) Numeric
2114 if (fNumericSet
->contains(c1
) &&
2115 (fMidNumSet
->contains(c2
) || fMidNumLetSet
->contains(c2
) || fSingle_QuoteSet
->contains(c2
)) &&
2116 fNumericSet
->contains(c3
)) {
2120 // Rule (13) Katakana x Katakana
2121 // Note: matches UAX 29 rules, but doesn't come into play for ICU because
2122 // all Katakana are handled by the dictionary breaker.
2123 if (fKatakanaSet
->contains(c1
) &&
2124 fKatakanaSet
->contains(c2
)) {
2128 // Rule 13a (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet
2129 if ((fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
) ||fNumericSet
->contains(c1
) ||
2130 fKatakanaSet
->contains(c1
) || fExtendNumLetSet
->contains(c1
)) &&
2131 fExtendNumLetSet
->contains(c2
)) {
2135 // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
2136 if (fExtendNumLetSet
->contains(c1
) &&
2137 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
) ||
2138 fNumericSet
->contains(c2
) || fKatakanaSet
->contains(c2
))) {
2142 // Rule 15 - 17 Group pairs of Regional Indicators.
2143 if (fRegionalIndicatorSet
->contains(c0
) && fRegionalIndicatorSet
->contains(c1
)) {
2146 if (fRegionalIndicatorSet
->contains(c1
) && fRegionalIndicatorSet
->contains(c2
)) {
2150 // Rule 999. Break found here.
2159 UVector
*RBBIWordMonkey::charClasses() {
2164 RBBIWordMonkey::~RBBIWordMonkey() {
2169 delete fKatakanaSet
;
2170 delete fHebrew_LetterSet
;
2172 delete fSingle_QuoteSet
;
2173 delete fDouble_QuoteSet
;
2174 delete fMidNumLetSet
;
2175 delete fMidLetterSet
;
2180 delete fExtendNumLetSet
;
2181 delete fWSegSpaceSet
;
2182 delete fRegionalIndicatorSet
;
2183 delete fDictionarySet
;
2186 delete fExtendedPictSet
;
2192 //------------------------------------------------------------------------------------------
2194 // class RBBISentMonkey Sentence Break specific implementation
2195 // of RBBIMonkeyKind.
2197 //------------------------------------------------------------------------------------------
2198 class RBBISentMonkey
: public RBBIMonkeyKind
{
2201 virtual ~RBBISentMonkey();
2202 virtual UVector
*charClasses();
2203 virtual void setText(const UnicodeString
&s
);
2204 virtual int32_t next(int32_t i
);
2206 int moveBack(int posFrom
);
2207 int moveForward(int posFrom
);
2208 UChar32
cAt(int pos
);
2212 UnicodeSet
*fSepSet
;
2213 UnicodeSet
*fFormatSet
;
2215 UnicodeSet
*fLowerSet
;
2216 UnicodeSet
*fUpperSet
;
2217 UnicodeSet
*fOLetterSet
;
2218 UnicodeSet
*fNumericSet
;
2219 UnicodeSet
*fATermSet
;
2220 UnicodeSet
*fSContinueSet
;
2221 UnicodeSet
*fSTermSet
;
2222 UnicodeSet
*fCloseSet
;
2223 UnicodeSet
*fOtherSet
;
2224 UnicodeSet
*fExtendSet
;
2226 const UnicodeString
*fText
;
2230 RBBISentMonkey::RBBISentMonkey()
2232 UErrorCode status
= U_ZERO_ERROR
;
2234 fSets
= new UVector(status
);
2236 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2237 // set and made into character classes of their own. For the monkey impl,
2238 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2239 fSepSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status
);
2240 fFormatSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status
);
2241 fSpSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status
);
2242 fLowerSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status
);
2243 fUpperSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status
);
2244 fOLetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status
);
2245 fNumericSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status
);
2246 fATermSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status
);
2247 fSContinueSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status
);
2248 fSTermSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status
);
2249 fCloseSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status
);
2250 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status
);
2251 fOtherSet
= new UnicodeSet();
2253 if(U_FAILURE(status
)) {
2254 deferredStatus
= status
;
2258 fOtherSet
->complement();
2259 fOtherSet
->removeAll(*fSepSet
);
2260 fOtherSet
->removeAll(*fFormatSet
);
2261 fOtherSet
->removeAll(*fSpSet
);
2262 fOtherSet
->removeAll(*fLowerSet
);
2263 fOtherSet
->removeAll(*fUpperSet
);
2264 fOtherSet
->removeAll(*fOLetterSet
);
2265 fOtherSet
->removeAll(*fNumericSet
);
2266 fOtherSet
->removeAll(*fATermSet
);
2267 fOtherSet
->removeAll(*fSContinueSet
);
2268 fOtherSet
->removeAll(*fSTermSet
);
2269 fOtherSet
->removeAll(*fCloseSet
);
2270 fOtherSet
->removeAll(*fExtendSet
);
2272 fSets
->addElement(fSepSet
, status
);
2273 fSets
->addElement(fFormatSet
, status
);
2274 fSets
->addElement(fSpSet
, status
);
2275 fSets
->addElement(fLowerSet
, status
);
2276 fSets
->addElement(fUpperSet
, status
);
2277 fSets
->addElement(fOLetterSet
, status
);
2278 fSets
->addElement(fNumericSet
, status
);
2279 fSets
->addElement(fATermSet
, status
);
2280 fSets
->addElement(fSContinueSet
, status
);
2281 fSets
->addElement(fSTermSet
, status
);
2282 fSets
->addElement(fCloseSet
, status
);
2283 fSets
->addElement(fOtherSet
, status
);
2284 fSets
->addElement(fExtendSet
, status
);
2286 if (U_FAILURE(status
)) {
2287 deferredStatus
= status
;
2293 void RBBISentMonkey::setText(const UnicodeString
&s
) {
2297 UVector
*RBBISentMonkey::charClasses() {
2302 // moveBack() Find the "significant" code point preceding the index i.
2303 // Skips over ($Extend | $Format)* .
2305 int RBBISentMonkey::moveBack(int i
) {
2312 j
= fText
->moveIndex32(j
, -1);
2313 c
= fText
->char32At(j
);
2315 while (j
>0 &&(fFormatSet
->contains(c
) || fExtendSet
->contains(c
)));
2321 int RBBISentMonkey::moveForward(int i
) {
2322 if (i
>=fText
->length()) {
2323 return fText
->length();
2328 j
= fText
->moveIndex32(j
, 1);
2331 while (fFormatSet
->contains(c
) || fExtendSet
->contains(c
));
2335 UChar32
RBBISentMonkey::cAt(int pos
) {
2336 if (pos
<0 || pos
>=fText
->length()) {
2339 return fText
->char32At(pos
);
2343 int32_t RBBISentMonkey::next(int32_t prevPos
) {
2344 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2345 // break position being tested. The candidate break
2346 // location is before p2.
2350 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
2353 if (U_FAILURE(deferredStatus
)) {
2357 // Prev break at end of string. return DONE.
2358 if (prevPos
>= fText
->length()) {
2361 p0
= p1
= p2
= p3
= prevPos
;
2362 c3
= fText
->char32At(prevPos
);
2364 (void)p0
; // Suppress set but not used warning.
2366 // Loop runs once per "significant" character position in the input text.
2368 // Move all of the positions forward in the input string.
2373 // Advance p3 by X(Extend | Format)* Rule 4
2374 p3
= moveForward(p3
);
2378 if (c1
==0x0d && c2
==0x0a && p2
==(p1
+1)) {
2382 // Rule (4). Sep <break>
2383 if (fSepSet
->contains(c1
)) {
2384 p2
= p1
+1; // Separators don't combine with Extend or Format.
2388 if (p2
>= fText
->length()) {
2389 // Reached end of string. Always a break position.
2393 if (p2
== prevPos
) {
2394 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2398 // Rule (6). ATerm x Numeric
2399 if (fATermSet
->contains(c1
) && fNumericSet
->contains(c2
)) {
2403 // Rule (7). (Upper | Lower) ATerm x Upper
2404 if ((fUpperSet
->contains(c0
) || fLowerSet
->contains(c0
)) &&
2405 fATermSet
->contains(c1
) && fUpperSet
->contains(c2
)) {
2409 // Rule (8) ATerm Close* Sp* x (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
2410 // Note: STerm | ATerm are added to the negated part of the expression by a
2411 // note to the Unicode 5.0 documents.
2413 while (fSpSet
->contains(cAt(p8
))) {
2416 while (fCloseSet
->contains(cAt(p8
))) {
2419 if (fATermSet
->contains(cAt(p8
))) {
2423 if (c
==-1 || fOLetterSet
->contains(c
) || fUpperSet
->contains(c
) ||
2424 fLowerSet
->contains(c
) || fSepSet
->contains(c
) ||
2425 fATermSet
->contains(c
) || fSTermSet
->contains(c
)) {
2428 p8
= moveForward(p8
);
2430 if (fLowerSet
->contains(cAt(p8
))) {
2435 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2436 if (fSContinueSet
->contains(c2
) || fSTermSet
->contains(c2
) || fATermSet
->contains(c2
)) {
2438 while (fSpSet
->contains(cAt(p8
))) {
2441 while (fCloseSet
->contains(cAt(p8
))) {
2445 if (fSTermSet
->contains(c
) || fATermSet
->contains(c
)) {
2450 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
2452 while (fCloseSet
->contains(cAt(p9
))) {
2456 if ((fSTermSet
->contains(c
) || fATermSet
->contains(c
))) {
2457 if (fCloseSet
->contains(c2
) || fSpSet
->contains(c2
) || fSepSet
->contains(c2
)) {
2462 // Rule (10) (STerm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
2464 while (fSpSet
->contains(cAt(p10
))) {
2465 p10
= moveBack(p10
);
2467 while (fCloseSet
->contains(cAt(p10
))) {
2468 p10
= moveBack(p10
);
2470 if (fSTermSet
->contains(cAt(p10
)) || fATermSet
->contains(cAt(p10
))) {
2471 if (fSpSet
->contains(c2
) || fSepSet
->contains(c2
)) {
2476 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>
2478 if (fSepSet
->contains(cAt(p11
))) {
2479 p11
= moveBack(p11
);
2481 while (fSpSet
->contains(cAt(p11
))) {
2482 p11
= moveBack(p11
);
2484 while (fCloseSet
->contains(cAt(p11
))) {
2485 p11
= moveBack(p11
);
2487 if (fSTermSet
->contains(cAt(p11
)) || fATermSet
->contains(cAt(p11
))) {
2491 // Rule (12) Any x Any
2498 RBBISentMonkey::~RBBISentMonkey() {
2508 delete fSContinueSet
;
2517 //-------------------------------------------------------------------------------------------
2521 //-------------------------------------------------------------------------------------------
2523 class RBBILineMonkey
: public RBBIMonkeyKind
{
2526 virtual ~RBBILineMonkey();
2527 virtual UVector
*charClasses();
2528 virtual void setText(const UnicodeString
&s
);
2529 virtual int32_t next(int32_t i
);
2530 virtual void rule9Adjust(int32_t pos
, UChar32
*posChar
, int32_t *nextPos
, UChar32
*nextChar
);
2578 BreakIterator
*fCharBI
;
2579 const UnicodeString
*fText
;
2580 RegexMatcher
*fNumberMatcher
;
2583 RBBILineMonkey::RBBILineMonkey() :
2589 fNumberMatcher(NULL
)
2592 if (U_FAILURE(deferredStatus
)) {
2596 UErrorCode status
= U_ZERO_ERROR
;
2598 fSets
= new UVector(status
);
2600 fBK
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status
);
2601 fCR
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status
);
2602 fLF
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status
);
2603 fCM
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status
);
2604 fNL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status
);
2605 fWJ
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status
);
2606 fZW
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status
);
2607 fGL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status
);
2608 fCB
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status
);
2609 fSP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status
);
2610 fB2
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status
);
2611 fBA
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status
);
2612 fBB
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status
);
2613 fHH
= new UnicodeSet();
2614 fHY
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status
);
2615 fH2
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status
);
2616 fH3
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status
);
2617 fCL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status
);
2618 fCP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status
);
2619 fEX
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status
);
2620 fIN
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status
);
2621 fJL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status
);
2622 fJV
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status
);
2623 fJT
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status
);
2624 fNS
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status
);
2625 fOP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status
);
2626 fQU
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status
);
2627 fIS
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status
);
2628 fNU
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status
);
2629 fPO
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status
);
2630 fPR
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status
);
2631 fSY
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status
);
2632 fAI
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status
);
2633 fAL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status
);
2634 fCJ
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status
);
2635 fHL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status
);
2636 fID
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status
);
2637 fRI
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status
);
2638 fSG
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status
);
2639 fXX
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status
);
2640 fEB
= new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]"), status
);
2641 fEM
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status
);
2642 fZWJ
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status
);
2644 if (U_FAILURE(status
)) {
2645 deferredStatus
= status
;
2649 fAL
->addAll(*fXX
); // Default behavior for XX is identical to AL
2650 fAL
->addAll(*fAI
); // Default behavior for AI is identical to AL
2651 fAL
->addAll(*fSG
); // Default behavior for SG is identical to AL.
2653 fNS
->addAll(*fCJ
); // Default behavior for CJ is identical to NS.
2654 fCM
->addAll(*fZWJ
); // ZWJ behaves as a CM.
2656 fHH
->add(u
'\u2010'); // Hyphen, '‐'
2658 fSets
->addElement(fBK
, status
);
2659 fSets
->addElement(fCR
, status
);
2660 fSets
->addElement(fLF
, status
);
2661 fSets
->addElement(fCM
, status
);
2662 fSets
->addElement(fNL
, status
);
2663 fSets
->addElement(fWJ
, status
);
2664 fSets
->addElement(fZW
, status
);
2665 fSets
->addElement(fGL
, status
);
2666 fSets
->addElement(fCB
, status
);
2667 fSets
->addElement(fSP
, status
);
2668 fSets
->addElement(fB2
, status
);
2669 fSets
->addElement(fBA
, status
);
2670 fSets
->addElement(fBB
, status
);
2671 fSets
->addElement(fHY
, status
);
2672 fSets
->addElement(fH2
, status
);
2673 fSets
->addElement(fH3
, status
);
2674 fSets
->addElement(fCL
, status
);
2675 fSets
->addElement(fCP
, status
);
2676 fSets
->addElement(fEX
, status
);
2677 fSets
->addElement(fIN
, status
);
2678 fSets
->addElement(fJL
, status
);
2679 fSets
->addElement(fJT
, status
);
2680 fSets
->addElement(fJV
, status
);
2681 fSets
->addElement(fNS
, status
);
2682 fSets
->addElement(fOP
, status
);
2683 fSets
->addElement(fQU
, status
);
2684 fSets
->addElement(fIS
, status
);
2685 fSets
->addElement(fNU
, status
);
2686 fSets
->addElement(fPO
, status
);
2687 fSets
->addElement(fPR
, status
);
2688 fSets
->addElement(fSY
, status
);
2689 fSets
->addElement(fAI
, status
);
2690 fSets
->addElement(fAL
, status
);
2691 fSets
->addElement(fHL
, status
);
2692 fSets
->addElement(fID
, status
);
2693 fSets
->addElement(fWJ
, status
);
2694 fSets
->addElement(fRI
, status
);
2695 fSets
->addElement(fSG
, status
);
2696 fSets
->addElement(fEB
, status
);
2697 fSets
->addElement(fEM
, status
);
2698 fSets
->addElement(fZWJ
, status
);
2702 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
2703 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
2704 "((\\p{Line_Break=IS})(\\p{Line_Break=CM}|\\u200d)*)?"
2705 "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
2706 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
2707 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
2708 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
2710 fNumberMatcher
= new RegexMatcher(
2711 UnicodeString(rules
, -1, US_INV
), 0, status
);
2713 fCharBI
= BreakIterator::createCharacterInstance(Locale::getEnglish(), status
);
2715 if (U_FAILURE(status
)) {
2716 deferredStatus
= status
;
2721 void RBBILineMonkey::setText(const UnicodeString
&s
) {
2723 fCharBI
->setText(s
);
2724 fNumberMatcher
->reset(s
);
2729 // Line Break TR rules 9 and 10 implementation.
2730 // This deals with combining marks and other sequences that
2731 // must be treated as if they were something other than what they actually are.
2733 // This is factored out into a separate function because it must be applied twice for
2734 // each potential break, once to the chars before the position being checked, then
2735 // again to the text following the possible break.
2737 void RBBILineMonkey::rule9Adjust(int32_t pos
, UChar32
*posChar
, int32_t *nextPos
, UChar32
*nextChar
) {
2739 // Invalid initial position. Happens during the warmup iteration of the
2740 // main loop in next().
2744 int32_t nPos
= *nextPos
;
2746 // LB 9 Keep combining sequences together.
2747 // advance over any CM class chars. Note that Line Break CM is different
2748 // from the normal Grapheme Extend property.
2749 if (!(fSP
->contains(*posChar
) || fBK
->contains(*posChar
) || *posChar
==0x0d ||
2750 *posChar
==0x0a ||fNL
->contains(*posChar
) || fZW
->contains(*posChar
))) {
2752 *nextChar
= fText
->char32At(nPos
);
2753 if (!fCM
->contains(*nextChar
)) {
2756 nPos
= fText
->moveIndex32(nPos
, 1);
2761 // LB 9 Treat X CM* as if it were x.
2762 // No explicit action required.
2764 // LB 10 Treat any remaining combining mark as AL
2765 if (fCM
->contains(*posChar
)) {
2769 // Push the updated nextPos and nextChar back to our caller.
2770 // This only makes a difference if posChar got bigger by consuming a
2771 // combining sequence.
2773 *nextChar
= fText
->char32At(nPos
);
2778 int32_t RBBILineMonkey::next(int32_t startPos
) {
2779 UErrorCode status
= U_ZERO_ERROR
;
2780 int32_t pos
; // Index of the char following a potential break position
2781 UChar32 thisChar
; // Character at above position "pos"
2783 int32_t prevPos
; // Index of the char preceding a potential break position
2784 UChar32 prevChar
; // Character at above position. Note that prevChar
2785 // and thisChar may not be adjacent because combining
2786 // characters between them will be ignored.
2788 int32_t prevPosX2
; // Second previous character. Wider context for LB21a.
2791 int32_t nextPos
; // Index of the next character following pos.
2792 // Usually skips over combining marks.
2793 int32_t nextCPPos
; // Index of the code point following "pos."
2794 // May point to a combining mark.
2795 int32_t tPos
; // temp value.
2798 if (U_FAILURE(deferredStatus
)) {
2802 if (startPos
>= fText
->length()) {
2807 // Initial values for loop. Loop will run the first time without finding breaks,
2808 // while the invalid values shift out and the "this" and
2809 // "prev" positions are filled in with good values.
2810 pos
= prevPos
= prevPosX2
= -1; // Invalid value, serves as flag for initial loop iteration.
2811 thisChar
= prevChar
= prevCharX2
= 0;
2812 nextPos
= nextCPPos
= startPos
;
2815 // Loop runs once per position in the test text, until a break position
2818 prevPosX2
= prevPos
;
2819 prevCharX2
= prevChar
;
2822 prevChar
= thisChar
;
2825 thisChar
= fText
->char32At(pos
);
2827 nextCPPos
= fText
->moveIndex32(pos
, 1);
2828 nextPos
= nextCPPos
;
2830 // Rule LB2 - Break at end of text.
2831 if (pos
>= fText
->length()) {
2835 // Rule LB 9 - adjust for combining sequences.
2836 // We do this one out-of-order because the adjustment does not change anything
2837 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
2839 rule9Adjust(prevPos
, &prevChar
, &pos
, &thisChar
);
2840 nextCPPos
= nextPos
= fText
->moveIndex32(pos
, 1);
2841 c
= fText
->char32At(nextPos
);
2842 rule9Adjust(pos
, &thisChar
, &nextPos
, &c
);
2844 // If the loop is still warming up - if we haven't shifted the initial
2845 // -1 positions out of prevPos yet - loop back to advance the
2846 // position in the input without any further looking for breaks.
2847 if (prevPos
== -1) {
2851 // LB 4 Always break after hard line breaks,
2852 if (fBK
->contains(prevChar
)) {
2856 // LB 5 Break after CR, LF, NL, but not inside CR LF
2857 if (prevChar
== 0x0d && thisChar
== 0x0a) {
2860 if (prevChar
== 0x0d ||
2866 // LB 6 Don't break before hard line breaks
2867 if (thisChar
== 0x0d || thisChar
== 0x0a || thisChar
== 0x85 ||
2868 fBK
->contains(thisChar
)) {
2873 // LB 7 Don't break before spaces or zero-width space.
2874 if (fSP
->contains(thisChar
)) {
2878 if (fZW
->contains(thisChar
)) {
2882 // LB 8 Break after zero width space
2884 // Scan backwards from prevChar for SP* ZW
2886 while (tPos
>0 && fSP
->contains(fText
->char32At(tPos
))) {
2887 tPos
= fText
->moveIndex32(tPos
, -1);
2889 if (fZW
->contains(fText
->char32At(tPos
))) {
2894 // Move this test up, before LB8a, because numbers can match a longer sequence that would
2895 // also match 8a. e.g. NU ZWJ IS PO (ZWJ acts like CM)
2896 if (fNumberMatcher
->lookingAt(prevPos
, status
)) {
2897 if (U_FAILURE(status
)) {
2900 // Matched a number. But could have been just a single digit, which would
2901 // not represent a "no break here" between prevChar and thisChar
2902 int32_t numEndIdx
= fNumberMatcher
->end(status
); // idx of first char following num
2903 if (numEndIdx
> pos
) {
2904 // Number match includes at least our two chars being checked
2905 if (numEndIdx
> nextPos
) {
2906 // Number match includes additional chars. Update pos and nextPos
2907 // so that next loop iteration will continue at the end of the number,
2908 // checking for breaks between last char in number & whatever follows.
2909 pos
= nextPos
= numEndIdx
;
2911 pos
= fText
->moveIndex32(pos
, -1);
2912 thisChar
= fText
->char32At(pos
);
2913 } while (fCM
->contains(thisChar
));
2920 // The monkey test's way of ignoring combining characters doesn't work
2921 // for this rule. ZJ is also a CM. Need to get the actual character
2922 // preceding "thisChar", not ignoring combining marks, possibly ZJ.
2924 int32_t prevIdx
= fText
->moveIndex32(pos
, -1);
2925 UChar32 prevC
= fText
->char32At(prevIdx
);
2926 if (fZWJ
->contains(prevC
)) {
2931 // LB 9, 10 Already done, at top of loop.
2935 // LB 11 Do not break before or after WORD JOINER and related characters.
2939 if (fWJ
->contains(thisChar
) || fWJ
->contains(prevChar
)) {
2945 if (fGL
->contains(prevChar
)) {
2951 if (!(fSP
->contains(prevChar
) ||
2952 fBA
->contains(prevChar
) ||
2953 fHY
->contains(prevChar
) ) && fGL
->contains(thisChar
)) {
2957 // LB 13 Don't break before closings.
2959 if (fCL
->contains(thisChar
) ||
2960 fCP
->contains(thisChar
) ||
2961 fEX
->contains(thisChar
) ||
2962 fSY
->contains(thisChar
)) {
2966 // LB 14 Don't break after OP SP*
2967 // Scan backwards, checking for this sequence.
2968 // The OP char could include combining marks, so we actually check for
2970 // Another Twist: The Rule 9 fixes may have changed a SP CM
2971 // sequence into a ID char, so before scanning back through spaces,
2972 // verify that prevChar is indeed a space. The prevChar variable
2973 // may differ from fText[prevPos]
2975 if (fSP
->contains(prevChar
)) {
2976 while (tPos
> 0 && fSP
->contains(fText
->char32At(tPos
))) {
2977 tPos
=fText
->moveIndex32(tPos
, -1);
2980 while (tPos
> 0 && fCM
->contains(fText
->char32At(tPos
))) {
2981 tPos
=fText
->moveIndex32(tPos
, -1);
2983 if (fOP
->contains(fText
->char32At(tPos
))) {
2988 // LB 14a Break before an IS that begins a number and follows a space
2989 if (nextPos
< fText
->length()) {
2990 // note: UnicodeString::char32At(length) returns ffff, not distinguishable
2991 // from a legit ffff character. So test length separately.
2992 UChar32 nextChar
= fText
->char32At(nextPos
);
2993 if (fSP
->contains(prevChar
) && fIS
->contains(thisChar
) && fNU
->contains(nextChar
)) {
2998 // LB14b Do not break before numeric separators, even after spaces.
2999 if (fIS
->contains(thisChar
)) {
3003 // LB 15 QU SP* x OP
3004 if (fOP
->contains(thisChar
)) {
3005 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3007 while (tPos
>0 && fSP
->contains(fText
->char32At(tPos
))) {
3008 tPos
= fText
->moveIndex32(tPos
, -1);
3010 while (tPos
>0 && fCM
->contains(fText
->char32At(tPos
))) {
3011 tPos
= fText
->moveIndex32(tPos
, -1);
3013 if (fQU
->contains(fText
->char32At(tPos
))) {
3020 // LB 16 (CL | CP) SP* x NS
3021 // Scan backwards for SP* CM* (CL | CP)
3022 if (fNS
->contains(thisChar
)) {
3024 while (tPos
>0 && fSP
->contains(fText
->char32At(tPos
))) {
3025 tPos
= fText
->moveIndex32(tPos
, -1);
3027 while (tPos
>0 && fCM
->contains(fText
->char32At(tPos
))) {
3028 tPos
= fText
->moveIndex32(tPos
, -1);
3030 if (fCL
->contains(fText
->char32At(tPos
)) || fCP
->contains(fText
->char32At(tPos
))) {
3036 // LB 17 B2 SP* x B2
3037 if (fB2
->contains(thisChar
)) {
3038 // Scan backwards, checking for the B2 CM* SP* sequence.
3040 if (fSP
->contains(prevChar
)) {
3041 while (tPos
> 0 && fSP
->contains(fText
->char32At(tPos
))) {
3042 tPos
=fText
->moveIndex32(tPos
, -1);
3045 while (tPos
> 0 && fCM
->contains(fText
->char32At(tPos
))) {
3046 tPos
=fText
->moveIndex32(tPos
, -1);
3048 if (fB2
->contains(fText
->char32At(tPos
))) {
3054 // LB 18 break after space
3055 if (fSP
->contains(prevChar
)) {
3062 if (fQU
->contains(thisChar
) || fQU
->contains(prevChar
)) {
3066 // LB 20 Break around a CB
3067 if (fCB
->contains(thisChar
) || fCB
->contains(prevChar
)) {
3071 // LB 20.09 Don't break between Hyphens and letters if a break precedes the hyphen.
3072 // Formerly this was a Finnish tailoring.
3073 // Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
3074 // ^($HY | $HH) $AL;
3075 if (fAL
->contains(thisChar
) && (fHY
->contains(prevChar
) || fHH
->contains(prevChar
)) &&
3081 if (fBA
->contains(thisChar
) ||
3082 fHY
->contains(thisChar
) ||
3083 fNS
->contains(thisChar
) ||
3084 fBB
->contains(prevChar
) ) {
3090 if (fHL
->contains(prevCharX2
) &&
3091 (fHY
->contains(prevChar
) || fBA
->contains(prevChar
))) {
3097 if (fSY
->contains(prevChar
) && fHL
->contains(thisChar
)) {
3102 if ((fAL
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3103 (fEX
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3104 (fHL
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3105 ((fID
->contains(prevChar
) || fEB
->contains(prevChar
) || fEM
->contains(prevChar
)) && fIN
->contains(thisChar
)) ||
3106 (fIN
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3107 (fNU
->contains(prevChar
) && fIN
->contains(thisChar
)) ) {
3112 // LB 23 (AL | HL) x NU
3114 if ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
)) && fNU
->contains(thisChar
)) {
3117 if (fNU
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) {
3121 // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3122 // PR x (ID | EB | EM)
3123 // (ID | EB | EM) x PO
3124 if (fPR
->contains(prevChar
) &&
3125 (fID
->contains(thisChar
) || fEB
->contains(thisChar
) || fEM
->contains(thisChar
))) {
3128 if ((fID
->contains(prevChar
) || fEB
->contains(prevChar
) || fEM
->contains(prevChar
)) &&
3129 fPO
->contains(thisChar
)) {
3133 // LB 24 Do not break between prefix and letters or ideographs.
3134 // (PR | PO) x (AL | HL)
3135 // (AL | HL) x (PR | PO)
3136 if ((fPR
->contains(prevChar
) || fPO
->contains(prevChar
)) &&
3137 (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) {
3140 if ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
)) &&
3141 (fPR
->contains(thisChar
) || fPO
->contains(thisChar
))) {
3145 // LB 25 numbers match, moved up, before LB 8a,
3147 // LB 26 Do not break a Korean syllable.
3148 if (fJL
->contains(prevChar
) && (fJL
->contains(thisChar
) ||
3149 fJV
->contains(thisChar
) ||
3150 fH2
->contains(thisChar
) ||
3151 fH3
->contains(thisChar
))) {
3155 if ((fJV
->contains(prevChar
) || fH2
->contains(prevChar
)) &&
3156 (fJV
->contains(thisChar
) || fJT
->contains(thisChar
))) {
3160 if ((fJT
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3161 fJT
->contains(thisChar
)) {
3165 // LB 27 Treat a Korean Syllable Block the same as ID.
3166 if ((fJL
->contains(prevChar
) || fJV
->contains(prevChar
) ||
3167 fJT
->contains(prevChar
) || fH2
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3168 fIN
->contains(thisChar
)) {
3171 if ((fJL
->contains(prevChar
) || fJV
->contains(prevChar
) ||
3172 fJT
->contains(prevChar
) || fH2
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3173 fPO
->contains(thisChar
)) {
3176 if (fPR
->contains(prevChar
) && (fJL
->contains(thisChar
) || fJV
->contains(thisChar
) ||
3177 fJT
->contains(thisChar
) || fH2
->contains(thisChar
) || fH3
->contains(thisChar
))) {
3183 // LB 28 Do not break between alphabetics ("at").
3184 if ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
)) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) {
3188 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
3189 if (fIS
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) {
3193 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3196 if ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
) || fNU
->contains(prevChar
)) && fOP
->contains(thisChar
)) {
3199 if (fCP
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
) || fNU
->contains(thisChar
))) {
3205 if (fRI
->contains(prevCharX2
) && fRI
->contains(prevChar
) && fRI
->contains(thisChar
)) {
3208 if (fRI
->contains(prevChar
) && fRI
->contains(thisChar
)) {
3209 // Two Regional Indicators have been paired.
3210 // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
3211 // following RI. This is a hack.
3216 // LB30b Emoji Base x Emoji Modifier
3217 if (fEB
->contains(prevChar
) && fEM
->contains(thisChar
)) {
3221 // LB 31 Break everywhere else
3230 UVector
*RBBILineMonkey::charClasses() {
3235 RBBILineMonkey::~RBBILineMonkey() {
3283 delete fNumberMatcher
;
3287 //-------------------------------------------------------------------------------------------
3292 // seed=nnnnn Random number starting seed.
3293 // Setting the seed allows errors to be reproduced.
3294 // loop=nnn Looping count. Controls running time.
3296 // 0 or greater: run length.
3298 // type = char | word | line | sent | title
3301 // intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3303 //-------------------------------------------------------------------------------------------
3305 static int32_t getIntParam(UnicodeString name
, UnicodeString
&params
, int32_t defaultVal
) {
3306 int32_t val
= defaultVal
;
3307 name
.append(" *= *(-?\\d+)");
3308 UErrorCode status
= U_ZERO_ERROR
;
3309 RegexMatcher
m(name
, params
, 0, status
);
3311 // The param exists. Convert the string to an int.
3312 char valString
[100];
3313 int32_t paramLength
= m
.end(1, status
) - m
.start(1, status
);
3314 if (paramLength
>= (int32_t)(sizeof(valString
)-1)) {
3315 paramLength
= (int32_t)(sizeof(valString
)-2);
3317 params
.extract(m
.start(1, status
), paramLength
, valString
, sizeof(valString
));
3318 val
= strtol(valString
, NULL
, 10);
3320 // Delete this parameter from the params string.
3322 params
= m
.replaceFirst("", status
);
3324 U_ASSERT(U_SUCCESS(status
));
3329 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3330 static void testBreakBoundPreceding(RBBITest
*test
, UnicodeString ustr
,
3339 for (i
= bi
->first(); i
!= BreakIterator::DONE
; i
= bi
->next()) {
3341 if (count
< expectedcount
&& expected
[count
] != i
) {
3342 test
->errln("%s:%d break forward test failed: expected %d but got %d",
3343 __FILE__
, __LINE__
, expected
[count
], i
);
3348 if (count
!= expectedcount
) {
3349 printStringBreaks(ustr
, expected
, expectedcount
);
3350 test
->errln("%s:%d break forward test failed: missed %d match",
3351 __FILE__
, __LINE__
, expectedcount
- count
);
3354 // testing boundaries
3355 for (i
= 1; i
< expectedcount
; i
++) {
3356 int j
= expected
[i
- 1];
3357 if (!bi
->isBoundary(j
)) {
3358 printStringBreaks(ustr
, expected
, expectedcount
);
3359 test
->errln("%s:%d isBoundary() failed. Expected boundary at position %d",
3360 __FILE__
, __LINE__
, j
);
3363 for (j
= expected
[i
- 1] + 1; j
< expected
[i
]; j
++) {
3364 if (bi
->isBoundary(j
)) {
3365 printStringBreaks(ustr
, expected
, expectedcount
);
3366 test
->errln("%s:%d isBoundary() failed. Not expecting boundary at position %d",
3367 __FILE__
, __LINE__
, j
);
3373 for (i
= bi
->last(); i
!= BreakIterator::DONE
; i
= bi
->previous()) {
3375 if (forward
[count
] != i
) {
3376 printStringBreaks(ustr
, expected
, expectedcount
);
3377 test
->errln("%s:%d happy break test previous() failed: expected %d but got %d",
3378 __FILE__
, __LINE__
, forward
[count
], i
);
3383 printStringBreaks(ustr
, expected
, expectedcount
);
3384 test
->errln("break test previous() failed: missed a match");
3388 // testing preceding
3389 for (i
= 0; i
< expectedcount
- 1; i
++) {
3390 // int j = expected[i] + 1;
3391 int j
= ustr
.moveIndex32(expected
[i
], 1);
3392 for (; j
<= expected
[i
+ 1]; j
++) {
3393 int32_t expectedPreceding
= expected
[i
];
3394 int32_t actualPreceding
= bi
->preceding(j
);
3395 if (actualPreceding
!= expectedPreceding
) {
3396 printStringBreaks(ustr
, expected
, expectedcount
);
3397 test
->errln("%s:%d preceding(%d): expected %d, got %d",
3398 __FILE__
, __LINE__
, j
, expectedPreceding
, actualPreceding
);
3406 void RBBITest::TestWordBreaks(void)
3408 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3410 Locale
locale("en");
3411 UErrorCode status
= U_ZERO_ERROR
;
3412 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3413 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
3414 // Replaced any C+J characters in a row with a random sequence of characters
3415 // of the same length to make our C+J segmentation not get in the way.
3416 static const char *strlist
[] =
3418 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3419 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3420 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3421 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3422 "\\uac00\\u3588\\u009c\\u0953\\u194b",
3423 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3424 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3425 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3426 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3427 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3428 "\\u2027\\U000e0067\\u0a47\\u00b7",
3429 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3430 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3431 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3432 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3433 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3434 "\\u0027\\u11af\\U000e0057\\u0602",
3435 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3436 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3437 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3438 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3439 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3440 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3441 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3442 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3443 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3444 "\\u18f4\\U000e0049\\u20e7\\u2027",
3445 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3446 "\\ua183\\u102d\\u0bec\\u003a",
3447 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3448 "\\u003a\\u0e57\\u0fad\\u002e",
3449 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3450 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3451 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3452 "\\u003a\\u0664\\u00b7\\u1fba",
3453 "\\u003b\\u0027\\u00b7\\u47a3",
3454 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3455 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3456 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3459 if (U_FAILURE(status
)) {
3460 errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
));
3463 for (loop
= 0; loop
< UPRV_LENGTHOF(strlist
); loop
++) {
3464 // printf("looping %d\n", loop);
3465 UnicodeString ustr
= CharsToUnicodeString(strlist
[loop
]);
3466 // RBBICharMonkey monkey;
3467 RBBIWordMonkey monkey
;
3470 int expectedcount
= 0;
3472 monkey
.setText(ustr
);
3474 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
3475 expected
[expectedcount
++] = i
;
3478 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
3484 void RBBITest::TestWordBoundary(void)
3486 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3487 Locale
locale("en");
3488 UErrorCode status
= U_ZERO_ERROR
;
3489 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3490 LocalPointer
<BreakIterator
> bi(BreakIterator::createWordInstance(locale
, status
), status
);
3491 if (U_FAILURE(status
)) {
3492 errcheckln(status
, "%s:%d Creation of break iterator failed %s",
3493 __FILE__
, __LINE__
, u_errorName(status
));
3497 static const char *strlist
[] =
3499 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3500 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3501 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3502 "\\u2027\\U000e0067\\u0a47\\u00b7",
3503 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3504 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3505 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3506 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3507 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3508 "\\u0027\\u11af\\U000e0057\\u0602",
3509 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3510 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3511 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3512 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3513 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3514 "\\U000e0065\\u302c\\u09ee\\U000e0068",
3515 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3516 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3517 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3518 "\\u58f4\\U000e0049\\u20e7\\u2027",
3519 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3520 "\\ua183\\u102d\\u0bec\\u003a",
3521 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3522 "\\u003a\\u0e57\\u0fad\\u002e",
3523 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3524 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3525 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3526 "\\u003a\\u0664\\u00b7\\u1fba",
3527 "\\u003b\\u0027\\u00b7\\u47a3",
3530 for (loop
= 0; loop
< UPRV_LENGTHOF(strlist
); loop
++) {
3531 u_unescape(strlist
[loop
], str
, UPRV_LENGTHOF(str
));
3532 UnicodeString
ustr(str
);
3538 for (int32_t boundary
= bi
->first(); boundary
!= BreakIterator::DONE
; boundary
= bi
->next()) {
3540 if (count
>= UPRV_LENGTHOF(forward
)) {
3541 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
3542 __FILE__
, __LINE__
, loop
, count
, boundary
);
3545 forward
[count
] = boundary
;
3546 if (boundary
<= prev
) {
3547 errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
3548 __FILE__
, __LINE__
, loop
, prev
, boundary
);
3551 for (int32_t nonBoundary
= prev
+ 1; nonBoundary
< boundary
; nonBoundary
++) {
3552 if (bi
->isBoundary(nonBoundary
)) {
3553 printStringBreaks(ustr
, forward
, count
);
3554 errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
3555 __FILE__
, __LINE__
, loop
, prev
, nonBoundary
, boundary
);
3559 if (!bi
->isBoundary(boundary
)) {
3560 printStringBreaks(ustr
, forward
, count
);
3561 errln("%s:%d happy boundary test failed: expected %d a boundary",
3562 __FILE__
, __LINE__
, boundary
);
3570 void RBBITest::TestLineBreaks(void)
3572 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3573 Locale
locale("en");
3574 UErrorCode status
= U_ZERO_ERROR
;
3575 BreakIterator
*bi
= BreakIterator::createLineInstance(locale
, status
);
3576 const int32_t STRSIZE
= 50;
3578 static const char *strlist
[] =
3580 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3581 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3582 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3583 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3584 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3585 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3586 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3587 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3588 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3589 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3590 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3591 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3592 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3593 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3594 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3595 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3596 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3597 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3598 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3599 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3600 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3601 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3602 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3603 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3604 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3605 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3606 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3607 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3608 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3609 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3610 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3611 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3612 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3613 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3614 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3615 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3616 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3617 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3618 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3619 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3622 TEST_ASSERT_SUCCESS(status
);
3623 if (U_FAILURE(status
)) {
3626 for (loop
= 0; loop
< UPRV_LENGTHOF(strlist
); loop
++) {
3627 // printf("looping %d\n", loop);
3628 int32_t t
= u_unescape(strlist
[loop
], str
, STRSIZE
);
3635 UnicodeString
ustr(str
);
3636 RBBILineMonkey monkey
;
3637 if (U_FAILURE(monkey
.deferredStatus
)) {
3641 const int EXPECTEDSIZE
= 50;
3642 int expected
[EXPECTEDSIZE
];
3643 int expectedcount
= 0;
3645 monkey
.setText(ustr
);
3647 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
3648 if (expectedcount
>= EXPECTEDSIZE
) {
3649 TEST_ASSERT(expectedcount
< EXPECTEDSIZE
);
3652 expected
[expectedcount
++] = i
;
3655 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
3661 void RBBITest::TestSentBreaks(void)
3663 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3664 Locale
locale("en");
3665 UErrorCode status
= U_ZERO_ERROR
;
3666 BreakIterator
*bi
= BreakIterator::createSentenceInstance(locale
, status
);
3668 static const char *strlist
[] =
3670 "Now\ris\nthe\r\ntime\n\rfor\r\r",
3672 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3673 "\"Sentence ending with a quote.\" Bye.",
3674 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
3675 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3676 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3677 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3678 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3679 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3680 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3681 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3682 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3683 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3684 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3685 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3686 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3687 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3688 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3689 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3692 if (U_FAILURE(status
)) {
3693 errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
));
3696 for (loop
= 0; loop
< UPRV_LENGTHOF(strlist
); loop
++) {
3697 u_unescape(strlist
[loop
], str
, UPRV_LENGTHOF(str
));
3698 UnicodeString
ustr(str
);
3700 RBBISentMonkey monkey
;
3701 if (U_FAILURE(monkey
.deferredStatus
)) {
3705 const int EXPECTEDSIZE
= 50;
3706 int expected
[EXPECTEDSIZE
];
3707 int expectedcount
= 0;
3709 monkey
.setText(ustr
);
3711 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
3712 if (expectedcount
>= EXPECTEDSIZE
) {
3713 TEST_ASSERT(expectedcount
< EXPECTEDSIZE
);
3716 expected
[expectedcount
++] = i
;
3719 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
3725 void RBBITest::TestMonkey() {
3726 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3728 UErrorCode status
= U_ZERO_ERROR
;
3729 int32_t loopCount
= 500;
3731 UnicodeString breakType
= "all";
3732 Locale
locale("en");
3733 UBool useUText
= FALSE
;
3735 if (quick
== FALSE
) {
3740 UnicodeString
p(fTestParams
);
3741 loopCount
= getIntParam("loop", p
, loopCount
);
3742 seed
= getIntParam("seed", p
, seed
);
3744 RegexMatcher
m(" *type *= *(char|word|line|sent|title) *", p
, 0, status
);
3746 breakType
= m
.group(1, status
);
3748 p
= m
.replaceFirst("", status
);
3751 RegexMatcher
u(" *utext", p
, 0, status
);
3755 p
= u
.replaceFirst("", status
);
3760 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p
, 0, status
).find()) {
3761 // Each option is stripped out of the option string as it is processed.
3762 // All options have been checked. The option string should have been completely emptied..
3764 p
.extract(buf
, sizeof(buf
), NULL
, status
);
3765 buf
[sizeof(buf
)-1] = 0;
3766 errln("Unrecognized or extra parameter: %s\n", buf
);
3772 if (breakType
== "char" || breakType
== "all") {
3774 BreakIterator
*bi
= BreakIterator::createCharacterInstance(locale
, status
);
3775 if (U_SUCCESS(status
)) {
3776 RunMonkey(bi
, m
, "char", seed
, loopCount
, useUText
);
3777 if (breakType
== "all" && useUText
==FALSE
) {
3778 // Also run a quick test with UText when "all" is specified
3779 RunMonkey(bi
, m
, "char", seed
, loopCount
, TRUE
);
3783 errcheckln(status
, "Creation of character break iterator failed %s", u_errorName(status
));
3788 if (breakType
== "word" || breakType
== "all") {
3789 logln("Word Break Monkey Test");
3791 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
3792 if (U_SUCCESS(status
)) {
3793 RunMonkey(bi
, m
, "word", seed
, loopCount
, useUText
);
3796 errcheckln(status
, "Creation of word break iterator failed %s", u_errorName(status
));
3801 if (breakType
== "line" || breakType
== "all") {
3802 logln("Line Break Monkey Test");
3804 BreakIterator
*bi
= BreakIterator::createLineInstance(locale
, status
);
3805 if (loopCount
>= 10) {
3806 loopCount
= loopCount
/ 5; // Line break runs slower than the others.
3808 if (U_SUCCESS(status
)) {
3809 RunMonkey(bi
, m
, "line", seed
, loopCount
, useUText
);
3812 errcheckln(status
, "Creation of line break iterator failed %s", u_errorName(status
));
3817 if (breakType
== "sent" || breakType
== "all" ) {
3818 logln("Sentence Break Monkey Test");
3820 BreakIterator
*bi
= BreakIterator::createSentenceInstance(locale
, status
);
3821 if (loopCount
>= 10) {
3822 loopCount
= loopCount
/ 10; // Sentence runs slower than the other break types
3824 if (U_SUCCESS(status
)) {
3825 RunMonkey(bi
, m
, "sentence", seed
, loopCount
, useUText
);
3828 errcheckln(status
, "Creation of line break iterator failed %s", u_errorName(status
));
3837 // Run a RBBI monkey test. Common routine, for all break iterator types.
3839 // bi - the break iterator to use
3840 // mk - MonkeyKind, abstraction for obtaining expected results
3841 // name - Name of test (char, word, etc.) for use in error messages
3842 // seed - Seed for starting random number generator (parameter from user)
3845 void RBBITest::RunMonkey(BreakIterator
*bi
, RBBIMonkeyKind
&mk
, const char *name
, uint32_t seed
,
3846 int32_t numIterations
, UBool useUText
) {
3848 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3850 const int32_t TESTSTRINGLEN
= 500;
3851 UnicodeString testText
;
3852 int32_t numCharClasses
;
3854 int expected
[TESTSTRINGLEN
*2 + 1];
3855 int expectedCount
= 0;
3856 char expectedBreaks
[TESTSTRINGLEN
*2 + 1];
3857 char forwardBreaks
[TESTSTRINGLEN
*2 + 1];
3858 char reverseBreaks
[TESTSTRINGLEN
*2+1];
3859 char isBoundaryBreaks
[TESTSTRINGLEN
*2+1];
3860 char followingBreaks
[TESTSTRINGLEN
*2+1];
3861 char precedingBreaks
[TESTSTRINGLEN
*2+1];
3867 numCharClasses
= mk
.charClasses()->size();
3868 chClasses
= mk
.charClasses();
3870 // Check for errors that occured during the construction of the MonkeyKind object.
3871 // Can't report them where they occured because errln() is a method coming from intlTest,
3872 // and is not visible outside of RBBITest :-(
3873 if (U_FAILURE(mk
.deferredStatus
)) {
3874 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk
.deferredStatus
));
3878 // Verify that the character classes all have at least one member.
3879 for (i
=0; i
<numCharClasses
; i
++) {
3880 UnicodeSet
*s
= (UnicodeSet
*)chClasses
->elementAt(i
);
3881 if (s
== NULL
|| s
->size() == 0) {
3882 errln("Character Class #%d is null or of zero size.", i
);
3887 while (loopCount
< numIterations
|| numIterations
== -1) {
3888 if (numIterations
== -1 && loopCount
% 10 == 0) {
3889 // If test is running in an infinite loop, display a periodic tic so
3890 // we can tell that it is making progress.
3891 fprintf(stderr
, ".");
3893 // Save current random number seed, so that we can recreate the random numbers
3894 // for this loop iteration in event of an error.
3897 // Populate a test string with data.
3898 testText
.truncate(0);
3899 for (i
=0; i
<TESTSTRINGLEN
; i
++) {
3900 int32_t aClassNum
= m_rand() % numCharClasses
;
3901 UnicodeSet
*classSet
= (UnicodeSet
*)chClasses
->elementAt(aClassNum
);
3902 int32_t charIdx
= m_rand() % classSet
->size();
3903 UChar32 c
= classSet
->charAt(charIdx
);
3904 if (c
< 0) { // TODO: deal with sets containing strings.
3905 errln("%s:%d c < 0", __FILE__
, __LINE__
);
3908 // Do not assemble a supplementary character from randomly generated separate surrogates.
3909 // (It could be a dictionary character)
3910 if (U16_IS_TRAIL(c
) && testText
.length() > 0 && U16_IS_LEAD(testText
.charAt(testText
.length()-1))) {
3917 // Calculate the expected results for this test string.
3918 mk
.setText(testText
);
3919 memset(expectedBreaks
, 0, sizeof(expectedBreaks
));
3920 expectedBreaks
[0] = 1;
3921 int32_t breakPos
= 0;
3924 breakPos
= mk
.next(breakPos
);
3925 if (breakPos
== -1) {
3928 if (breakPos
> testText
.length()) {
3929 errln("breakPos > testText.length()");
3931 expectedBreaks
[breakPos
] = 1;
3932 U_ASSERT(expectedCount
<testText
.length());
3933 expected
[expectedCount
++] = breakPos
;
3934 (void)expected
; // Set but not used warning.
3935 // TODO (andy): check it out.
3938 // Find the break positions using forward iteration
3939 memset(forwardBreaks
, 0, sizeof(forwardBreaks
));
3941 UErrorCode status
= U_ZERO_ERROR
;
3942 UText
*testUText
= utext_openReplaceable(NULL
, &testText
, &status
);
3943 // testUText = utext_openUnicodeString(testUText, &testText, &status);
3944 bi
->setText(testUText
, status
);
3945 TEST_ASSERT_SUCCESS(status
);
3946 utext_close(testUText
); // The break iterator does a shallow clone of the UText
3947 // This UText can be closed immediately, so long as the
3948 // testText string continues to exist.
3950 bi
->setText(testText
);
3953 for (i
=bi
->first(); i
!= BreakIterator::DONE
; i
=bi
->next()) {
3954 if (i
< 0 || i
> testText
.length()) {
3955 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name
);
3958 forwardBreaks
[i
] = 1;
3961 // Find the break positions using reverse iteration
3962 memset(reverseBreaks
, 0, sizeof(reverseBreaks
));
3963 for (i
=bi
->last(); i
!= BreakIterator::DONE
; i
=bi
->previous()) {
3964 if (i
< 0 || i
> testText
.length()) {
3965 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name
);
3968 reverseBreaks
[i
] = 1;
3971 // Find the break positions using isBoundary() tests.
3972 memset(isBoundaryBreaks
, 0, sizeof(isBoundaryBreaks
));
3973 U_ASSERT((int32_t)sizeof(isBoundaryBreaks
) > testText
.length());
3974 for (i
=0; i
<=testText
.length(); i
++) {
3975 isBoundaryBreaks
[i
] = bi
->isBoundary(i
);
3979 // Find the break positions using the following() function.
3981 memset(followingBreaks
, 0, sizeof(followingBreaks
));
3982 int32_t lastBreakPos
= 0;
3983 followingBreaks
[0] = 1;
3984 for (i
=0; i
<testText
.length(); i
++) {
3985 breakPos
= bi
->following(i
);
3986 if (breakPos
<= i
||
3987 breakPos
< lastBreakPos
||
3988 breakPos
> testText
.length() ||
3989 (breakPos
> lastBreakPos
&& lastBreakPos
> i
)) {
3990 errln("%s break monkey test: "
3991 "Out of range value returned by BreakIterator::following().\n"
3992 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
3993 name
, seed
, i
, breakPos
, lastBreakPos
);
3996 followingBreaks
[breakPos
] = 1;
3997 lastBreakPos
= breakPos
;
4000 // Find the break positions using the preceding() function.
4001 memset(precedingBreaks
, 0, sizeof(precedingBreaks
));
4002 lastBreakPos
= testText
.length();
4003 precedingBreaks
[testText
.length()] = 1;
4004 for (i
=testText
.length(); i
>0; i
--) {
4005 breakPos
= bi
->preceding(i
);
4006 if (breakPos
>= i
||
4007 breakPos
> lastBreakPos
||
4008 (breakPos
< 0 && testText
.getChar32Start(i
)>0) ||
4009 (breakPos
< lastBreakPos
&& lastBreakPos
< testText
.getChar32Start(i
)) ) {
4010 errln("%s break monkey test: "
4011 "Out of range value returned by BreakIterator::preceding().\n"
4012 "index=%d; prev returned %d; lastBreak=%d" ,
4013 name
, i
, breakPos
, lastBreakPos
);
4014 if (breakPos
>= 0 && breakPos
< (int32_t)sizeof(precedingBreaks
)) {
4015 precedingBreaks
[i
] = 2; // Forces an error.
4018 if (breakPos
>= 0) {
4019 precedingBreaks
[breakPos
] = 1;
4021 lastBreakPos
= breakPos
;
4025 // Compare the expected and actual results.
4026 for (i
=0; i
<=testText
.length(); i
++) {
4027 const char *errorType
= NULL
;
4028 if (forwardBreaks
[i
] != expectedBreaks
[i
]) {
4029 errorType
= "next()";
4030 } else if (reverseBreaks
[i
] != forwardBreaks
[i
]) {
4031 errorType
= "previous()";
4032 } else if (isBoundaryBreaks
[i
] != expectedBreaks
[i
]) {
4033 errorType
= "isBoundary()";
4034 } else if (followingBreaks
[i
] != expectedBreaks
[i
]) {
4035 errorType
= "following()";
4036 } else if (precedingBreaks
[i
] != expectedBreaks
[i
]) {
4037 errorType
= "preceding()";
4041 if (errorType
!= NULL
) {
4042 // Format a range of the test text that includes the failure as
4043 // a data item that can be included in the rbbi test data file.
4045 // Start of the range is the last point where expected and actual results
4046 // both agreed that there was a break position.
4047 int startContext
= i
;
4050 if (startContext
==0) { break; }
4052 if (expectedBreaks
[startContext
] != 0) {
4053 if (count
== 2) break;
4058 // End of range is two expected breaks past the start position.
4059 int endContext
= i
+ 1;
4061 for (ci
=0; ci
<2; ci
++) { // Number of items to include in error text.
4063 if (endContext
>= testText
.length()) {break;}
4064 if (expectedBreaks
[endContext
-1] != 0) {
4065 if (count
== 0) break;
4072 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4073 UnicodeString errorText
= "<data>";
4074 /***if (strcmp(errorType, "next()") == 0) {
4076 endContext = testText.length();
4078 printStringBreaks(testText, expected, expectedCount);
4081 for (ci
=startContext
; ci
<endContext
;) {
4082 UnicodeString
hexChars("0123456789abcdef");
4085 c
= testText
.char32At(ci
);
4087 // This is the location of the error.
4088 errorText
.append("<?>");
4089 } else if (expectedBreaks
[ci
] != 0) {
4090 // This a non-error expected break position.
4091 errorText
.append("\\");
4094 errorText
.append("\\u");
4095 for (bn
=12; bn
>=0; bn
-=4) {
4096 errorText
.append(hexChars
.charAt((c
>>bn
)&0xf));
4099 errorText
.append("\\U");
4100 for (bn
=28; bn
>=0; bn
-=4) {
4101 errorText
.append(hexChars
.charAt((c
>>bn
)&0xf));
4104 ci
= testText
.moveIndex32(ci
, 1);
4106 errorText
.append("\\");
4107 errorText
.append("</data>\n");
4110 char charErrorTxt
[500];
4111 UErrorCode status
= U_ZERO_ERROR
;
4112 errorText
.extract(charErrorTxt
, sizeof(charErrorTxt
), NULL
, status
);
4113 charErrorTxt
[sizeof(charErrorTxt
)-1] = 0;
4114 const char *badLocale
= bi
->getLocaleID(ULOC_ACTUAL_LOCALE
, status
);
4116 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
4117 name
, badLocale
, (expectedBreaks
[i
]? "break expected but not found" : "break found but not expected"),
4118 errorType
, seed
, i
, charErrorTxt
);
4129 // Bug 5532. UTF-8 based UText fails in dictionary code.
4130 // This test checks the initial patch,
4131 // which is to just keep it from crashing. Correct word boundaries
4132 // await a proper fix to the dictionary code.
4134 void RBBITest::TestBug5532(void) {
4135 // Text includes a mixture of Thai and Latin.
4136 const unsigned char utf8Data
[] = {
4137 0xE0u
, 0xB8u
, 0x82u
, 0xE0u
, 0xB8u
, 0xB2u
, 0xE0u
, 0xB8u
, 0xA2u
, 0xE0u
,
4138 0xB9u
, 0x80u
, 0xE0u
, 0xB8u
, 0x84u
, 0xE0u
, 0xB8u
, 0xA3u
, 0xE0u
, 0xB8u
,
4139 0xB7u
, 0xE0u
, 0xB9u
, 0x88u
, 0xE0u
, 0xB8u
, 0xADu
, 0xE0u
, 0xB8u
, 0x87u
,
4140 0xE0u
, 0xB9u
, 0x80u
, 0xE0u
, 0xB8u
, 0xA5u
, 0xE0u
, 0xB9u
, 0x88u
, 0xE0u
,
4141 0xB8u
, 0x99u
, 0xE0u
, 0xB8u
, 0x8Bu
, 0xE0u
, 0xB8u
, 0xB5u
, 0xE0u
, 0xB8u
,
4142 0x94u
, 0xE0u
, 0xB8u
, 0xB5u
, 0x20u
, 0x73u
, 0x69u
, 0x6Du
, 0x20u
, 0x61u
,
4143 0x75u
, 0x64u
, 0x69u
, 0x6Fu
, 0x2Fu
, 0x20u
, 0x4Du
, 0x4Fu
, 0x4Fu
, 0x4Eu
,
4144 0x20u
, 0x65u
, 0x63u
, 0x6Cu
, 0x69u
, 0x70u
, 0x73u
, 0x65u
, 0x20u
, 0xE0u
,
4145 0xB8u
, 0xA3u
, 0xE0u
, 0xB8u
, 0xB2u
, 0xE0u
, 0xB8u
, 0x84u
, 0xE0u
, 0xB8u
,
4146 0xB2u
, 0x20u
, 0x34u
, 0x37u
, 0x30u
, 0x30u
, 0x20u
, 0xE0u
, 0xB8u
, 0xA2u
,
4147 0xE0u
, 0xB8u
, 0xB9u
, 0xE0u
, 0xB9u
, 0x82u
, 0xE0u
, 0xB8u
, 0xA3u
, 0x00};
4149 UErrorCode status
= U_ZERO_ERROR
;
4150 UText utext
=UTEXT_INITIALIZER
;
4151 utext_openUTF8(&utext
, (const char *)utf8Data
, -1, &status
);
4152 TEST_ASSERT_SUCCESS(status
);
4154 BreakIterator
*bi
= BreakIterator::createWordInstance(Locale("th"), status
);
4155 TEST_ASSERT_SUCCESS(status
);
4156 if (U_SUCCESS(status
)) {
4157 bi
->setText(&utext
, status
);
4158 TEST_ASSERT_SUCCESS(status
);
4160 int32_t breakCount
= 0;
4161 int32_t previousBreak
= -1;
4162 for (bi
->first(); bi
->next() != BreakIterator::DONE
; breakCount
++) {
4163 // For now, just make sure that the break iterator doesn't hang.
4164 TEST_ASSERT(previousBreak
< bi
->current());
4165 previousBreak
= bi
->current();
4167 TEST_ASSERT(breakCount
> 0);
4170 utext_close(&utext
);
4174 void RBBITest::TestBug9983(void) {
4175 UnicodeString text
= UnicodeString("\\u002A" // * Other
4177 "\\u309C" // Katakana
4181 "\\u0000").unescape();
4183 UErrorCode status
= U_ZERO_ERROR
;
4184 LocalPointer
<RuleBasedBreakIterator
> brkiter(static_cast<RuleBasedBreakIterator
*>(
4185 BreakIterator::createWordInstance(Locale::getRoot(), status
)));
4186 TEST_ASSERT_SUCCESS(status
);
4187 LocalPointer
<RuleBasedBreakIterator
> brkiterPOSIX(static_cast<RuleBasedBreakIterator
*>(
4188 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status
)));
4189 TEST_ASSERT_SUCCESS(status
);
4190 if (U_FAILURE(status
)) {
4193 int32_t offset
, rstatus
, iterationCount
;
4195 brkiter
->setText(text
);
4198 while ( (offset
= brkiter
->previous()) != UBRK_DONE
) {
4200 rstatus
= brkiter
->getRuleStatus();
4201 (void)rstatus
; // Suppress set but not used warning.
4202 if (iterationCount
>= 10) {
4206 TEST_ASSERT(iterationCount
== 6);
4208 brkiterPOSIX
->setText(text
);
4209 brkiterPOSIX
->last();
4211 while ( (offset
= brkiterPOSIX
->previous()) != UBRK_DONE
) {
4213 rstatus
= brkiterPOSIX
->getRuleStatus();
4214 (void)rstatus
; // Suppress set but not used warning.
4215 if (iterationCount
>= 10) {
4219 TEST_ASSERT(iterationCount
== 6);
4222 // Bug 7547 - verify that building a break itereator from empty rules produces an error.
4224 void RBBITest::TestBug7547() {
4225 UnicodeString rules
;
4226 UErrorCode status
= U_ZERO_ERROR
;
4227 UParseError parseError
;
4228 RuleBasedBreakIterator
breakIterator(rules
, parseError
, status
);
4229 if (status
!= U_BRK_RULE_SYNTAX
) {
4230 errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__
, __LINE__
, u_errorName(status
));
4232 if (parseError
.line
!= 1 || parseError
.offset
!= 0) {
4233 errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError
.line
, parseError
.offset
);
4238 void RBBITest::TestBug12797() {
4239 UnicodeString rules
= "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4240 UErrorCode status
= U_ZERO_ERROR
;
4241 UParseError parseError
;
4242 RuleBasedBreakIterator
bi(rules
, parseError
, status
);
4243 if (U_FAILURE(status
)) {
4244 errln("%s:%s status = %s", __FILE__
, __LINE__
, u_errorName(status
));
4247 UnicodeString text
= "abc";
4250 int32_t boundary
= bi
.next();
4251 if (boundary
!= 3) {
4252 errln("%s:%d expected boundary==3, got %d", __FILE__
, __LINE__
, boundary
);
4256 void RBBITest::TestBug12918() {
4257 // This test triggers an assertion failure in dictbe.cpp
4258 const UChar
*crasherString
= u
"\u3325\u4a16";
4259 UErrorCode status
= U_ZERO_ERROR
;
4260 UBreakIterator
* iter
= ubrk_open(UBRK_WORD
, NULL
, crasherString
, -1, &status
);
4261 if (U_FAILURE(status
)) {
4262 dataerrln("%s:%d status = %s", __FILE__
, __LINE__
, u_errorName(status
));
4267 int32_t lastPos
= -1;
4268 while((pos
= ubrk_next(iter
)) != UBRK_DONE
) {
4269 if (pos
<= lastPos
) {
4270 errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__
, __LINE__
, pos
, lastPos
);
4277 void RBBITest::TestBug12932() {
4278 // Node Stack overflow in the RBBI rule parser caused a seg fault.
4279 UnicodeString
ruleStr(
4280 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4281 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4282 "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4283 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4284 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4285 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4287 UErrorCode status
= U_ZERO_ERROR
;
4288 UParseError parseError
;
4289 RuleBasedBreakIterator
rbbi(ruleStr
, parseError
, status
);
4290 if (status
!= U_BRK_RULE_SYNTAX
) {
4291 errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4292 __FILE__
, __LINE__
, u_errorName(status
));
4297 // Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
4298 // remain undevided by ICU char, word and line break.
4299 void RBBITest::TestEmoji() {
4300 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4301 UErrorCode status
= U_ZERO_ERROR
;
4303 CharString testFileName
;
4304 testFileName
.append(IntlTest::getSourceTestData(status
), status
);
4305 testFileName
.appendPathPart("emoji-test.txt", status
);
4306 if (U_FAILURE(status
)) {
4307 errln("%s:%s %s while opening emoji-test.txt", __FILE__
, __LINE__
, u_errorName(status
));
4310 logln("Opening data file %s\n", testFileName
.data());
4313 UChar
*testFile
= ReadAndConvertFile(testFileName
.data(), len
, "UTF-8", status
);
4314 if (U_FAILURE(status
) || testFile
== NULL
) {
4315 errln("%s:%s %s while opening emoji-test.txt", __FILE__
, __LINE__
, u_errorName(status
));
4318 UnicodeString
testFileAsString(testFile
, len
);
4321 RegexMatcher
lineMatcher(u
"^.*?$", testFileAsString
, UREGEX_MULTILINE
, status
);
4322 RegexMatcher
hexMatcher(u
"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE
, status
);
4323 // hexMatcher group(1) is a hex number, or empty string if no hex number present.
4324 int32_t lineNumber
= 0;
4326 LocalPointer
<BreakIterator
> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status
), status
);
4327 LocalPointer
<BreakIterator
> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status
), status
);
4328 LocalPointer
<BreakIterator
> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status
), status
);
4329 if (U_FAILURE(status
)) {
4330 dataerrln("%s:%d %s while opening break iterators", __FILE__
, __LINE__
, u_errorName(status
));
4334 while (lineMatcher
.find()) {
4336 UnicodeString line
= lineMatcher
.group(status
);
4337 hexMatcher
.reset(line
);
4338 UnicodeString testString
; // accumulates the emoji sequence.
4339 while (hexMatcher
.find() && hexMatcher
.group(1, status
).length() > 0) {
4340 UnicodeString hex
= hexMatcher
.group(1, status
);
4341 if (hex
.length() > 8) {
4342 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__
, __LINE__
, lineNumber
, CStr(hex
)());
4346 hex8
.appendInvariantChars(hex
, status
);
4347 UChar32 c
= (UChar32
)strtol(hex8
.data(), NULL
, 16);
4349 testString
.append(c
);
4351 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4352 __FILE__
, __LINE__
, lineNumber
, hex8
.data());
4357 if (testString
.length() > 1) {
4358 charBreaks
->setText(testString
);
4359 charBreaks
->first();
4360 int32_t firstBreak
= charBreaks
->next();
4361 if (testString
.length() != firstBreak
) {
4362 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4363 __FILE__
, __LINE__
, lineNumber
, firstBreak
);
4365 wordBreaks
->setText(testString
);
4366 wordBreaks
->first();
4367 firstBreak
= wordBreaks
->next();
4368 if (testString
.length() != firstBreak
) {
4369 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4370 __FILE__
, __LINE__
, lineNumber
, firstBreak
);
4372 lineBreaks
->setText(testString
);
4373 lineBreaks
->first();
4374 firstBreak
= lineBreaks
->next();
4375 if (testString
.length() != firstBreak
) {
4376 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4377 __FILE__
, __LINE__
, lineNumber
, firstBreak
);
4385 // TestBug12519 - Correct handling of Locales by assignment / copy / clone
4387 void RBBITest::TestBug12519() {
4388 UErrorCode status
= U_ZERO_ERROR
;
4389 LocalPointer
<RuleBasedBreakIterator
> biEn((RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getEnglish(), status
));
4390 LocalPointer
<RuleBasedBreakIterator
> biFr((RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getFrance(), status
));
4391 if (!assertSuccess(WHERE
, status
)) {
4392 dataerrln("%s %d status = %s", __FILE__
, __LINE__
, u_errorName(status
));
4395 assertTrue(WHERE
, Locale::getEnglish() == biEn
->getLocale(ULOC_VALID_LOCALE
, status
));
4397 assertTrue(WHERE
, Locale::getFrench() == biFr
->getLocale(ULOC_VALID_LOCALE
, status
));
4398 assertTrue(WHERE
"Locales do not participate in BreakIterator equality.", *biEn
== *biFr
);
4400 LocalPointer
<RuleBasedBreakIterator
>cloneEn((RuleBasedBreakIterator
*)biEn
->clone());
4401 assertTrue(WHERE
, *biEn
== *cloneEn
);
4402 assertTrue(WHERE
, Locale::getEnglish() == cloneEn
->getLocale(ULOC_VALID_LOCALE
, status
));
4404 LocalPointer
<RuleBasedBreakIterator
>cloneFr((RuleBasedBreakIterator
*)biFr
->clone());
4405 assertTrue(WHERE
, *biFr
== *cloneFr
);
4406 assertTrue(WHERE
, Locale::getFrench() == cloneFr
->getLocale(ULOC_VALID_LOCALE
, status
));
4408 LocalPointer
<RuleBasedBreakIterator
>biDe((RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale::getGerman(), status
));
4409 UnicodeString
text("Hallo Welt");
4410 biDe
->setText(text
);
4411 assertTrue(WHERE
"before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr
!= *biDe
);
4413 assertTrue(WHERE
"after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr
== *biDe
);
4416 void RBBITest::TestBug12677() {
4417 // Check that stripping of comments from rules for getRules() is not confused by
4418 // the presence of '#' characters in the rules that do not introduce comments.
4419 UnicodeString
rules(u
"!!forward; \n"
4420 "$x = [ab#]; # a set with a # literal. \n"
4421 " # .; # a comment that looks sort of like a rule. \n"
4422 " '#' '?'; # a rule with a quoted # \n"
4425 UErrorCode status
= U_ZERO_ERROR
;
4427 RuleBasedBreakIterator
bi(rules
, pe
, status
);
4428 assertSuccess(WHERE
, status
);
4429 UnicodeString rtRules
= bi
.getRules();
4430 assertEquals(WHERE
, UnicodeString(u
"!!forward; $x = [ab#]; '#' '?'; "), rtRules
);
4434 void RBBITest::TestTableRedundancies() {
4435 UErrorCode status
= U_ZERO_ERROR
;
4437 LocalPointer
<RuleBasedBreakIterator
> bi (
4438 (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale::getEnglish(), status
));
4439 assertSuccess(WHERE
, status
);
4440 if (U_FAILURE(status
)) return;
4442 RBBIDataWrapper
*dw
= bi
->fData
;
4443 const RBBIStateTable
*fwtbl
= dw
->fForwardTable
;
4444 int32_t numCharClasses
= dw
->fHeader
->fCatCount
;
4445 // printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates);
4447 // Check for duplicate columns (character categories)
4449 std::vector
<UnicodeString
> columns
;
4450 for (int32_t column
= 0; column
< numCharClasses
; column
++) {
4452 for (int32_t r
= 1; r
< (int32_t)fwtbl
->fNumStates
; r
++) {
4453 RBBIStateTableRow
*row
= (RBBIStateTableRow
*) (fwtbl
->fTableData
+ (fwtbl
->fRowLen
* r
));
4454 s
.append(row
->fNextState
[column
]);
4456 columns
.push_back(s
);
4458 // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
4459 for (int c1
=1; c1
<numCharClasses
; c1
++) {
4460 for (int c2
= c1
+1; c2
< numCharClasses
; c2
++) {
4461 if (columns
.at(c1
) == columns
.at(c2
)) {
4462 errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__
, __LINE__
, c1
, c2
);
4469 // Check for duplicate states
4470 std::vector
<UnicodeString
> rows
;
4471 for (int32_t r
=0; r
< (int32_t)fwtbl
->fNumStates
; r
++) {
4473 RBBIStateTableRow
*row
= (RBBIStateTableRow
*) (fwtbl
->fTableData
+ (fwtbl
->fRowLen
* r
));
4474 assertTrue(WHERE
, row
->fAccepting
>= -1);
4475 s
.append(row
->fAccepting
+ 1); // values of -1 are expected.
4476 s
.append(row
->fLookAhead
);
4477 s
.append(row
->fTagIdx
);
4478 for (int32_t column
= 0; column
< numCharClasses
; column
++) {
4479 s
.append(row
->fNextState
[column
]);
4483 for (int r1
=0; r1
< (int32_t)fwtbl
->fNumStates
; r1
++) {
4484 for (int r2
= r1
+1; r2
< (int32_t)fwtbl
->fNumStates
; r2
++) {
4485 if (rows
.at(r1
) == rows
.at(r2
)) {
4486 errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__
, __LINE__
, r1
, r2
);
4493 // Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
4494 // even after next() has returned DONE.
4496 void RBBITest::TestBug13447() {
4497 UErrorCode status
= U_ZERO_ERROR
;
4498 LocalPointer
<RuleBasedBreakIterator
> bi(
4499 (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getEnglish(), status
));
4500 assertSuccess(WHERE
, status
);
4501 if (U_FAILURE(status
)) return;
4502 UnicodeString
data(u
"1234");
4504 assertEquals(WHERE
, UBRK_WORD_NONE
, bi
->getRuleStatus());
4505 assertEquals(WHERE
, 4, bi
->next());
4506 assertEquals(WHERE
, UBRK_WORD_NUMBER
, bi
->getRuleStatus());
4507 assertEquals(WHERE
, UBRK_DONE
, bi
->next());
4508 assertEquals(WHERE
, 4, bi
->current());
4509 assertEquals(WHERE
, UBRK_WORD_NUMBER
, bi
->getRuleStatus());
4512 // TestReverse exercises both the synthesized safe reverse rules and the logic
4513 // for filling the break iterator cache when starting from random positions
4516 // It's a monkey test, working on random data, with the expected data obtained
4517 // from forward iteration (no safe rules involved), comparing with results
4518 // when indexing into the interior of the string (safe rules needed).
4520 void RBBITest::TestReverse() {
4521 UErrorCode status
= U_ZERO_ERROR
;
4523 TestReverse(std::unique_ptr
<RuleBasedBreakIterator
>((RuleBasedBreakIterator
*)
4524 BreakIterator::createCharacterInstance(Locale::getEnglish(), status
)));
4525 assertSuccess(WHERE
, status
, true);
4526 status
= U_ZERO_ERROR
;
4527 TestReverse(std::unique_ptr
<RuleBasedBreakIterator
>((RuleBasedBreakIterator
*)
4528 BreakIterator::createWordInstance(Locale::getEnglish(), status
)));
4529 assertSuccess(WHERE
, status
, true);
4530 status
= U_ZERO_ERROR
;
4531 TestReverse(std::unique_ptr
<RuleBasedBreakIterator
>((RuleBasedBreakIterator
*)
4532 BreakIterator::createLineInstance(Locale::getEnglish(), status
)));
4533 assertSuccess(WHERE
, status
, true);
4534 status
= U_ZERO_ERROR
;
4535 TestReverse(std::unique_ptr
<RuleBasedBreakIterator
>((RuleBasedBreakIterator
*)
4536 BreakIterator::createSentenceInstance(Locale::getEnglish(), status
)));
4537 assertSuccess(WHERE
, status
, true);
4540 void RBBITest::TestReverse(std::unique_ptr
<RuleBasedBreakIterator
>bi
) {
4545 // From the mapping trie in the break iterator's internal data, create a
4546 // vector of UnicodeStrings, one for each character category, containing
4547 // all of the code points that map to that category. Unicode planes 0 and 1 only,
4548 // to avoid an execess of unassigned code points.
4550 RBBIDataWrapper
*data
= bi
->fData
;
4551 int32_t categoryCount
= data
->fHeader
->fCatCount
;
4552 UTrie2
*trie
= data
->fTrie
;
4554 std::vector
<UnicodeString
> strings(categoryCount
, UnicodeString());
4555 for (int cp
=0; cp
<0x1fff0; ++cp
) {
4556 int cat
= utrie2_get32(trie
, cp
);
4557 cat
&= ~0x4000; // And off the dictionary bit from the category.
4558 assertTrue(WHERE
, cat
< categoryCount
&& cat
>= 0);
4559 if (cat
< 0 || cat
>= categoryCount
) return;
4560 strings
[cat
].append(cp
);
4564 const int testStringLength
= 10000;
4565 UnicodeString testString
;
4567 for (int i
=0; i
<testStringLength
; ++i
) {
4568 int charClass
= randomGen() % categoryCount
;
4569 if (strings
[charClass
].length() > 0) {
4570 int cp
= strings
[charClass
].char32At(randomGen() % strings
[charClass
].length());
4571 testString
.append(cp
);
4575 typedef std::pair
<UBool
, int32_t> Result
;
4576 std::vector
<Result
> expectedResults
;
4577 bi
->setText(testString
);
4578 for (int i
=0; i
<testString
.length(); ++i
) {
4579 bool isboundary
= bi
->isBoundary(i
);
4580 int ruleStatus
= bi
->getRuleStatus();
4581 expectedResults
.push_back(std::make_pair(isboundary
, ruleStatus
));
4584 for (int i
=testString
.length()-1; i
>=0; --i
) {
4585 bi
->setText(testString
); // clears the internal break cache
4586 Result expected
= expectedResults
[i
];
4587 assertEquals(WHERE
, expected
.first
, bi
->isBoundary(i
));
4588 assertEquals(WHERE
, expected
.second
, bi
->getRuleStatus());
4593 // Ticket 13692 - finding word boundaries in very large numbers or words could
4594 // be very time consuming. When the problem was present, this void test
4595 // would run more than fifteen minutes, which is to say, the failure was noticeale.
4597 void RBBITest::TestBug13692() {
4598 UErrorCode status
= U_ZERO_ERROR
;
4599 LocalPointer
<RuleBasedBreakIterator
> bi ((RuleBasedBreakIterator
*)
4600 BreakIterator::createWordInstance(Locale::getEnglish(), status
), status
);
4601 if (!assertSuccess(WHERE
, status
, true)) {
4604 constexpr int32_t LENGTH
= 1000000;
4605 UnicodeString
longNumber(LENGTH
, (UChar32
)u
'3', LENGTH
);
4606 for (int i
=0; i
<20; i
+=2) {
4607 longNumber
.setCharAt(i
, u
' ');
4609 bi
->setText(longNumber
);
4610 assertFalse(WHERE
, bi
->isBoundary(LENGTH
-5));
4611 assertSuccess(WHERE
, status
);
4615 // TestDebug - A place-holder test for debugging purposes.
4616 // For putting in fragments of other tests that can be invoked
4617 // for tracing without a lot of unwanted extra stuff happening.
4619 void RBBITest::TestDebug(void) {
4620 UErrorCode status
= U_ZERO_ERROR
;
4621 LocalPointer
<RuleBasedBreakIterator
> bi ((RuleBasedBreakIterator
*)
4622 BreakIterator::createCharacterInstance(Locale::getEnglish(), status
), status
);
4623 if (!assertSuccess(WHERE
, status
, true)) {
4626 const UnicodeString
&rules
= bi
->getRules();
4628 LocalPointer
<RuleBasedBreakIterator
> newbi(new RuleBasedBreakIterator(rules
, pe
, status
));
4629 assertSuccess(WHERE
, status
);
4632 void RBBITest::TestProperties() {
4633 UErrorCode errorCode
= U_ZERO_ERROR
;
4634 UnicodeSet
prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode
);
4635 if (!prependSet
.isEmpty()) {
4637 "[:GCB=Prepend:] is not empty any more. "
4638 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4639 "change this test to the opposite condition.");
4643 #endif // #if !UCONFIG_NO_BREAK_ITERATION