1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
5 * Copyright (c) 1999-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8 /************************************************************************
9 * Date Name Description
10 * 12/15/99 Madhu Creation.
11 * 01/12/2000 Madhu Updated for changed API and added new tests
12 ************************************************************************/
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_BREAK_ITERATION
23 #include "unicode/brkiter.h"
24 #include "unicode/localpointer.h"
25 #include "unicode/numfmt.h"
26 #include "unicode/rbbi.h"
27 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
28 #include "unicode/regex.h"
30 #include "unicode/schriter.h"
31 #include "unicode/uchar.h"
32 #include "unicode/utf16.h"
33 #include "unicode/ucnv.h"
34 #include "unicode/uniset.h"
35 #include "unicode/uscript.h"
36 #include "unicode/ustring.h"
37 #include "unicode/utext.h"
45 #include "utypeinfo.h" // for 'typeid' to work
49 // Needed for Apple perf tests <rdar://problem/51193810>
51 #include <mach/mach_time.h>
54 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
55 #include "unicode/filteredbrk.h"
56 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
58 #define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \
60 errln("Failure in file %s, line %d", __FILE__, __LINE__); \
62 } UPRV_BLOCK_MACRO_END
64 #define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { \
65 if (U_FAILURE(errcode)) { \
66 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
68 } UPRV_BLOCK_MACRO_END
70 #define MONKEY_ERROR(msg, fRuleFileName, index, seed) { \
71 IntlTest::gTest->errln("\n%s:%d %s at index %d. Parameters to reproduce: @\"type=%s seed=%u loop=1\"", \
72 __FILE__, __LINE__, msg, index, fRuleFileName, seed); \
75 //---------------------------------------------
77 //---------------------------------------------
80 // Note: Before adding new tests to this file, check whether the desired test data can
81 // simply be added to the file testdata/rbbitst.txt. In most cases it can,
82 // it's much less work than writing a new test, diagnostic output in the event of failures
83 // is good, and the test data file is shared with ICU4J, so eventually the test
84 // will run there as well, without additional effort.
// Test dispatcher for this suite. When exec is set it logs the suite banner, then lists
// every test case through TESTCASE_AUTO. Several of the entries sit behind
// UCONFIG_NO_FILE_IO / UCONFIG_NO_REGULAR_EXPRESSIONS guards, so the set of tests
// depends on the build configuration.
86 void RBBITest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* params
)
88 if (exec
) logln("TestSuite RuleBasedBreakIterator: ");
92 #if !UCONFIG_NO_FILE_IO
93 TESTCASE_AUTO(TestBug4153072
);
95 #if !UCONFIG_NO_FILE_IO
96 TESTCASE_AUTO(TestUnicodeFiles
);
98 TESTCASE_AUTO(TestGetAvailableLocales
);
99 TESTCASE_AUTO(TestGetDisplayName
);
100 #if !UCONFIG_NO_FILE_IO
101 TESTCASE_AUTO(TestEndBehaviour
);
102 TESTCASE_AUTO(TestWordBreaks
);
103 TESTCASE_AUTO(TestWordBoundary
);
104 TESTCASE_AUTO(TestLineBreaks
);
105 TESTCASE_AUTO(TestSentBreaks
);
106 TESTCASE_AUTO(TestExtended
);
// TestMonkey needs both regular expressions and file I/O.
108 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
109 TESTCASE_AUTO(TestMonkey
);
111 #if !UCONFIG_NO_FILE_IO
112 TESTCASE_AUTO(TestBug3818
);
114 TESTCASE_AUTO(TestDebug
);
115 #if !UCONFIG_NO_FILE_IO
116 TESTCASE_AUTO(TestBug5775
);
118 TESTCASE_AUTO(TestBug9983
);
119 TESTCASE_AUTO(TestDictRules
);
120 TESTCASE_AUTO(TestBug5532
);
121 TESTCASE_AUTO(TestBug7547
);
122 TESTCASE_AUTO(TestBug12797
);
123 TESTCASE_AUTO(TestBug12918
);
124 TESTCASE_AUTO(TestBug12932
);
125 TESTCASE_AUTO(TestEmoji
);
126 TESTCASE_AUTO(TestBug12519
);
127 TESTCASE_AUTO(TestBug12677
);
128 TESTCASE_AUTO(TestTableRedundancies
);
129 TESTCASE_AUTO(TestBug13447
);
130 TESTCASE_AUTO(TestReverse
);
131 TESTCASE_AUTO(TestBug13692
);
136 //--------------------------------------------------------------------------------------
138 // RBBITest constructor and destructor
140 //--------------------------------------------------------------------------------------
142 RBBITest::RBBITest() {
147 RBBITest::~RBBITest() {
151 static void printStringBreaks(UText
*tstr
, int expected
[], int expectedCount
) {
152 UErrorCode status
= U_ZERO_ERROR
;
154 printf("code alpha extend alphanum type word sent line name\n");
155 int nextExpectedIndex
= 0;
156 utext_setNativeIndex(tstr
, 0);
157 for (int j
= 0; j
< static_cast<int>(utext_nativeLength(tstr
)); j
=static_cast<int>(utext_getNativeIndex(tstr
))) {
158 if (nextExpectedIndex
< expectedCount
&& j
>= expected
[nextExpectedIndex
] ) {
159 printf("------------------------------------------------ %d\n", j
);
163 UChar32 c
= utext_next32(tstr
);
164 u_charName(c
, U_UNICODE_CHAR_NAME
, name
, 100, &status
);
165 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c
,
167 u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
),
169 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY
,
171 U_SHORT_PROPERTY_NAME
),
172 u_getPropertyValueName(UCHAR_WORD_BREAK
,
173 u_getIntPropertyValue(c
,
175 U_SHORT_PROPERTY_NAME
),
176 u_getPropertyValueName(UCHAR_SENTENCE_BREAK
,
177 u_getIntPropertyValue(c
,
178 UCHAR_SENTENCE_BREAK
),
179 U_SHORT_PROPERTY_NAME
),
180 u_getPropertyValueName(UCHAR_LINE_BREAK
,
181 u_getIntPropertyValue(c
,
183 U_SHORT_PROPERTY_NAME
),
// Convenience overload: wraps ustr in a UText opened with utext_openConstUnicodeString()
// and forwards to the UText-based printStringBreaks() with the same expected[] array
// and expectedCount.
189 static void printStringBreaks(const UnicodeString
&ustr
, int expected
[], int expectedCount
) {
190 UErrorCode status
= U_ZERO_ERROR
;
192 tstr
= utext_openConstUnicodeString(NULL
, &ustr
, &status
);
// If the UText could not be opened, report the ICU error name on stdout.
193 if (U_FAILURE(status
)) {
194 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status
));
197 printStringBreaks(tstr
, expected
, expectedCount
);
// Regression test for bug 3818. Using a Thai ("th") word break iterator over text made
// of one four-code-unit sequence repeated four times, both following(1) and following(0)
// are expected to return 4.
202 void RBBITest::TestBug3818() {
203 UErrorCode status
= U_ZERO_ERROR
;
205 // Four Thai words...
// The same four code units repeated four times, followed by a terminating 0.
206 static const UChar thaiWordData
[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
207 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
208 UnicodeString
thaiStr(thaiWordData
);
210 BreakIterator
* bi
= BreakIterator::createWordInstance(Locale("th"), status
);
// Report through errcheckln if creation failed or returned no iterator.
211 if (U_FAILURE(status
) || bi
== NULL
) {
212 errcheckln(status
, "Fail at file %s, line %d, status = %s", __FILE__
, __LINE__
, u_errorName(status
));
215 bi
->setText(thaiStr
);
// Starting from inside the first word, the next boundary must be at offset 4.
217 int32_t startOfSecondWord
= bi
->following(1);
218 if (startOfSecondWord
!= 4) {
219 errln("Fail at file %s, line %d expected start of word at 4, got %d",
220 __FILE__
, __LINE__
, startOfSecondWord
);
// Starting from offset 0, the next boundary must likewise be at offset 4.
222 startOfSecondWord
= bi
->following(0);
223 if (startOfSecondWord
!= 4) {
224 errln("Fail at file %s, line %d expected start of word at 4, got %d",
225 __FILE__
, __LINE__
, startOfSecondWord
);
231 //---------------------------------------------
235 //---------------------------------------------
// Calls BreakIterator::getAvailableLocales() and logs the name of each returned locale.
// An empty list is reported with dataerrln.
237 void RBBITest::TestGetAvailableLocales()
239 int32_t locCount
= 0;
// locCount is passed to getAvailableLocales() and then bounds the loop below.
240 const Locale
* locList
= BreakIterator::getAvailableLocales(locCount
);
243 dataerrln("getAvailableLocales() returned an empty list!");
244 // Just make sure that it's returning good memory.
246 for (i
= 0; i
< locCount
; ++i
) {
247 logln(locList
[i
].getName());
251 //Testing the BreakIterator::getDisplayName() function
// Checks BreakIterator::getDisplayName() in two forms: the two-argument call for the US
// locale, and the three-argument call for the France locale with US passed as the second
// locale argument.
252 void RBBITest::TestGetDisplayName()
254 UnicodeString result
;
256 BreakIterator::getDisplayName(Locale::getUS(), result
);
// The expected English text is only enforced when the default locale is US.
257 if (Locale::getDefault() == Locale::getUS() && result
!= "English (United States)")
258 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
// US is passed explicitly here, so this check does not consult the default locale.
261 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result
);
262 if (result
!= "French (France)")
263 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
// Exercises a default-locale word break iterator on the text "boo." and reports an error
// for a missing break at the beginning, before the period, or at the end of the string.
270 void RBBITest::TestEndBehaviour()
272 UErrorCode status
= U_ZERO_ERROR
;
273 UnicodeString
testString("boo.");
274 BreakIterator
*wb
= BreakIterator::createWordInstance(Locale::getDefault(), status
);
275 if (U_FAILURE(status
))
277 errcheckln(status
, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status
));
280 wb
->setText(testString
);
// first() must report the start of the text, offset 0.
282 if (wb
->first() != 0)
283 errln("Didn't get break at beginning of string.");
285 errln("Didn't get break before period in \"boo.\"");
// NOTE(review): because of the &&, next() is only called when current() is not already 4,
// and the failure is reported only when both values differ from 4.
286 if (wb
->current() != 4 && wb
->next() != 4)
287 errln("Didn't get break at end of string.");
// Regression test for bug 4153072. Gives a default-locale word break iterator a
// StringCharacterIterator restricted to a sub-range of "...Hello, World!..." and checks
// isBoundary() for offsets from -1 up to and including begin.
293 void RBBITest::TestBug4153072() {
294 UErrorCode status
= U_ZERO_ERROR
;
295 BreakIterator
*iter
= BreakIterator::createWordInstance(Locale::getDefault(), status
);
296 if (U_FAILURE(status
))
298 errcheckln(status
, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status
));
301 UnicodeString
str("...Hello, World!...");
// end stops three code units short of the end of str.
303 int32_t end
= str
.length() - 3;
// The iterator covers [begin, end) and starts positioned at begin; it is handed to the
// break iterator through adoptText().
306 StringCharacterIterator
* textIterator
= new StringCharacterIterator(str
, begin
, end
, begin
);
307 iter
->adoptText(textIterator
);
309 // Note: with the switch to UText, there is no way to restrict the
310 // iteration range to begin at an index other than zero.
311 // String character iterators created with a non-zero bound are
312 // treated by RBBI as being empty.
313 for (index
= -1; index
< begin
+ 1; ++index
) {
314 onBoundary
= iter
->isBoundary(index
);
// Only offset 0 may be reported as a boundary: it is an error if index 0 is not a
// boundary, or if any other tested index is.
315 if (index
== 0? !onBoundary
: onBoundary
) {
316 errln((UnicodeString
)"Didn't handle isBoundary correctly with offset = " + index
+
317 " and begin index = " + begin
);
325 // Test for problem reported by Ashok Matoria on 9 July 2007
326 // One.<kSoftHyphen><kSpace>Two.
328 // Sentence break at start (0) and then on calling next() it breaks at
329 // 'T' of "Two". Now, at this point if I do next() and
330 // then previous(), it breaks at <kSoftHyphen> instead of 'T' of "Two".
// Regression test for bug 5775. With an English sentence break iterator over the text
// built from "One.\\u00ad Two.", the visible assertions expect break positions 6, then 10,
// and 6 again after previous().
332 void RBBITest::TestBug5775() {
333 UErrorCode status
= U_ZERO_ERROR
;
334 BreakIterator
*bi
= BreakIterator::createSentenceInstance(Locale::getEnglish(), status
);
335 TEST_ASSERT_SUCCESS(status
);
336 if (U_FAILURE(status
)) {
339 // Check for status first for better handling of no data errors.
340 TEST_ASSERT(bi
!= NULL
);
// The literal holds a backslash-u escape sequence and is constructed with US_INV.
// NOTE(review): the step that turns the escape into U+00AD is not visible in this excerpt.
345 UnicodeString
s("One.\\u00ad Two.", -1, US_INV
);
349 int pos
= bi
->next();
350 TEST_ASSERT(pos
== 6);
352 TEST_ASSERT(pos
== 10);
// Stepping back must return to 6, the same boundary that next() reported earlier.
353 pos
= bi
->previous();
354 TEST_ASSERT(pos
== 6);
360 //------------------------------------------------------------------------------
362 // RBBITest::Extended Run RBBI Tests from an external test data file
364 //------------------------------------------------------------------------------
367 BreakIterator
*bi
; // Break iterator is set while parsing test source.
368 // Changed out whenever test data changes break type.
370 UnicodeString dataToBreak
; // Data that is built up while parsing the test.
371 UVector32
*expectedBreaks
; // Expected break positions, matches dataToBreak UnicodeString.
372 UVector32
*srcLine
; // Positions in source file, indexed same as dataToBreak.
375 UText
*textToBreak
; // UText, could be UTF8 or UTF16.
376 UVector32
*textMap
; // Map from UTF-16 dataToBreak offsets to UText offsets.
377 CharString utf8String
; // UTF-8 form of text to break.
379 TestParams(UErrorCode
&status
) : dataToBreak() {
381 expectedBreaks
= new UVector32(status
);
382 srcLine
= new UVector32(status
);
383 srcCol
= new UVector32(status
);
385 textMap
= new UVector32(status
);
390 delete expectedBreaks
;
393 utext_close(textToBreak
);
397 int32_t getSrcLine(int32_t bp
);
398 int32_t getExpectedBreak(int32_t bp
);
399 int32_t getSrcCol(int32_t bp
);
401 void setUTF16(UErrorCode
&status
);
402 void setUTF8(UErrorCode
&status
);
405 // Append a UnicodeString to a CharString with UTF-8 encoding.
406 // Substitute any invalid chars.
407 // Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
// Converts src to UTF-8 in two passes and appends the result to dest:
//   dest    CharString that receives the UTF-8 bytes.
//   src     UTF-16 text to convert; 0xfffd is passed as the substitution character.
//   status  ICU error code, checked on entry and updated by the calls below.
408 static void CharStringAppend(CharString
&dest
, const UnicodeString
&src
, UErrorCode
&status
) {
409 if (U_FAILURE(status
)) {
// Pass 1 (preflight): with a NULL destination of capacity 0 the call only reports the
// required length through utf8Length.
413 u_strToUTF8WithSub(NULL
, 0, &utf8Length
, // Output Buffer, NULL for preflight.
414 src
.getBuffer(), src
.length(), // UTF-16 data
415 0xfffd, NULL
, // Substitution char, number of subs.
// U_BUFFER_OVERFLOW_ERROR is excluded from the failure check after the preflight;
// status is then reset before the real conversion.
417 if (U_FAILURE(status
) && status
!= U_BUFFER_OVERFLOW_ERROR
) {
420 status
= U_ZERO_ERROR
;
// Pass 2: obtain an append buffer of utf8Length from dest, convert into it, then commit
// utf8Length bytes with append().
422 char *buffer
= dest
.getAppendBuffer(utf8Length
, utf8Length
, capacity
, status
);
423 u_strToUTF8WithSub(buffer
, utf8Length
, NULL
,
424 src
.getBuffer(), src
.length(),
425 0xfffd, NULL
, &status
);
426 dest
.append(buffer
, utf8Length
, status
);
// Points textToBreak at the UTF-16 dataToBreak string and rebuilds textMap with one entry
// per UTF-16 index plus a final entry for the end of the text.
430 void TestParams::setUTF16(UErrorCode
&status
) {
// The existing textToBreak is passed back in as the first argument of the open call.
431 textToBreak
= utext_openUnicodeString(textToBreak
, &dataToBreak
, &status
);
432 textMap
->removeAllElements();
433 for (int32_t i
=0; i
<dataToBreak
.length(); i
++) {
// An index that starts a code point maps to itself. The -1 entry below is presumably the
// other branch, for indexes that do not start a code point.
434 if (i
== dataToBreak
.getChar32Start(i
)) {
435 textMap
->addElement(i
, status
);
437 textMap
->addElement(-1, status
);
// Extra entry for the position just past the last code unit.
440 textMap
->addElement(dataToBreak
.length(), status
);
441 U_ASSERT(dataToBreak
.length() + 1 == textMap
->size());
// Converts dataToBreak to UTF-8 in utf8String, opens textToBreak over those bytes, and
// rebuilds textMap so that UTF-8 offsets map to UTF-16 indexes in dataToBreak.
445 void TestParams::setUTF8(UErrorCode
&status
) {
446 if (U_FAILURE(status
)) {
450 CharStringAppend(utf8String
, dataToBreak
, status
);
451 textToBreak
= utext_openUTF8(textToBreak
, utf8String
.data(), utf8String
.length(), &status
);
452 if (U_FAILURE(status
)) {
456 textMap
->removeAllElements();
// utf16Index tracks the UTF-16 position corresponding to the current UTF-8 position.
457 int32_t utf16Index
= 0;
459 textMap
->addElement(utf16Index
, status
);
460 UChar32 c32
= utext_current32(textToBreak
);
// Advance the UTF-16 index by the code point's UTF-16 length, then step the UText past it.
464 utf16Index
+= U16_LENGTH(c32
);
465 utext_next32(textToBreak
);
// Pad with -1 until textMap has as many entries as the UText's current native index.
466 while (textMap
->size() < utext_getNativeIndex(textToBreak
)) {
467 textMap
->addElement(-1, status
);
// textMap must hold one entry per native index plus one for the end of the text.
470 U_ASSERT(utext_nativeLength(textToBreak
) + 1 == textMap
->size());
// Returns the srcLine entry for break position bp. bp is looked up in textMap and the
// resulting index i selects the entry in srcLine.
474 int32_t TestParams::getSrcLine(int32_t bp
) {
// Clamp positions at or past the end of textMap to its last entry.
475 if (bp
>= textMap
->size()) {
476 bp
= textMap
->size() - 1;
479 for(; bp
>= 0 ; --bp
) {
480 // Move to a character boundary if we are not on one already.
481 i
= textMap
->elementAti(bp
);
486 return srcLine
->elementAti(i
);
// Looks up the expected-break entry for position bp: bp is translated through textMap
// to an index i, which then selects the entry in expectedBreaks.
490 int32_t TestParams::getExpectedBreak(int32_t bp
) {
// Positions at or past the end of textMap get separate handling (body not visible here).
491 if (bp
>= textMap
->size()) {
494 int32_t i
= textMap
->elementAti(bp
);
497 retVal
= expectedBreaks
->elementAti(i
);
// Returns the srcCol entry for break position bp. bp is looked up in textMap and the
// resulting index i selects the entry in srcCol.
503 int32_t TestParams::getSrcCol(int32_t bp
) {
// Clamp positions at or past the end of textMap to its last entry.
504 if (bp
>= textMap
->size()) {
505 bp
= textMap
->size() - 1;
508 for(; bp
>= 0; --bp
) {
509 // Move bp to a character boundary if we are not on one already.
510 i
= textMap
->elementAti(bp
);
515 return srcCol
->elementAti(i
);
519 void RBBITest::executeTest(TestParams
*t
, UErrorCode
&status
) {
524 TEST_ASSERT_SUCCESS(status
);
525 if (U_FAILURE(status
)) {
533 t
->bi
->setText(t
->textToBreak
, status
);
535 // Run the iterator forward
538 for (bp
= t
->bi
->first(); bp
!= BreakIterator::DONE
; bp
= t
->bi
->next()) {
540 // Fail for lack of forward progress.
541 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
542 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
));
546 // Check that we didn't miss an expected break between the last one
548 for (i
=prevBP
+1; i
<bp
; i
++) {
549 if (t
->getExpectedBreak(i
) != 0) {
550 int expected
[] = {0, i
};
551 printStringBreaks(t
->dataToBreak
, expected
, 2);
552 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
553 i
, t
->getSrcLine(i
), t
->getSrcCol(i
));
557 // Check that the break we did find was expected
558 if (t
->getExpectedBreak(bp
) == 0) {
559 int expected
[] = {0, bp
};
560 printStringBreaks(t
->textToBreak
, expected
, 2);
561 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
562 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
));
564 // The break was expected.
565 // Check that the {nnn} tag value is correct.
566 int32_t expectedTagVal
= t
->getExpectedBreak(bp
);
567 if (expectedTagVal
== -1) {
570 int32_t line
= t
->getSrcLine(bp
);
571 int32_t rs
= t
->bi
->getRuleStatus();
572 if (rs
!= expectedTagVal
) {
573 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
574 " Actual, Expected status = %4d, %4d",
575 bp
, line
, t
->getSrcCol(bp
), rs
, expectedTagVal
);
582 // Verify that there were no missed expected breaks after the last one found
583 for (i
=prevBP
+1; i
<utext_nativeLength(t
->textToBreak
); i
++) {
584 if (t
->getExpectedBreak(i
) != 0) {
585 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
586 i
, t
->getSrcLine(i
), t
->getSrcCol(i
));
591 // Run the iterator backwards, verify that the same breaks are found.
593 prevBP
= static_cast<int32_t>(utext_nativeLength(t
->textToBreak
) + 2); // start with a phony value for the last break pos seen.
595 while (bp
!= BreakIterator::DONE
) {
597 // Fail for lack of progress.
598 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
599 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
));
603 // Check that we didn't miss an expected break between the last one
604 // and this one. (UVector returns zeros for index out of bounds.)
605 for (i
=prevBP
-1; i
>bp
; i
--) {
606 if (t
->getExpectedBreak(i
) != 0) {
607 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
608 i
, t
->getSrcLine(i
), t
->getSrcCol(i
));
612 // Check that the break we did find was expected
613 if (t
->getExpectedBreak(bp
) == 0) {
614 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
615 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
));
617 // The break was expected.
618 // Check that the {nnn} tag value is correct.
619 int32_t expectedTagVal
= t
->getExpectedBreak(bp
);
620 if (expectedTagVal
== -1) {
623 int line
= t
->getSrcLine(bp
);
624 int32_t rs
= t
->bi
->getRuleStatus();
625 if (rs
!= expectedTagVal
) {
626 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
627 " Actual, Expected status = %4d, %4d",
628 bp
, line
, t
->getSrcCol(bp
), rs
, expectedTagVal
);
633 bp
= t
->bi
->previous();
636 // Verify that there were no missed breaks prior to the last one found
637 for (i
=prevBP
-1; i
>=0; i
--) {
638 if (t
->getExpectedBreak(i
) != 0) {
639 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
640 i
, t
->getSrcLine(i
), t
->getSrcCol(i
));
644 // Check isBoundary()
645 for (i
=0; i
< utext_nativeLength(t
->textToBreak
); i
++) {
646 UBool boundaryExpected
= (t
->getExpectedBreak(i
) != 0);
647 UBool boundaryFound
= t
->bi
->isBoundary(i
);
648 if (boundaryExpected
!= boundaryFound
) {
649 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
650 " Expected, Actual= %s, %s",
651 i
, t
->getSrcLine(i
), t
->getSrcCol(i
),
652 boundaryExpected
? "true":"false", boundaryFound
? "true" : "false");
657 for (i
=0; i
< static_cast<int32_t>(utext_nativeLength(t
->textToBreak
)); i
++) {
658 int32_t actualBreak
= t
->bi
->following(i
);
659 int32_t expectedBreak
= BreakIterator::DONE
;
660 for (int32_t j
=i
+1; j
<= static_cast<int32_t>(utext_nativeLength(t
->textToBreak
)); j
++) {
661 if (t
->getExpectedBreak(j
) != 0) {
666 if (expectedBreak
!= actualBreak
) {
667 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
668 " Expected, Actual= %d, %d",
669 i
, t
->getSrcLine(i
), t
->getSrcCol(i
), expectedBreak
, actualBreak
);
674 for (i
=static_cast<int32_t>(utext_nativeLength(t
->textToBreak
)); i
>=0; i
--) {
675 int32_t actualBreak
= t
->bi
->preceding(i
);
676 int32_t expectedBreak
= BreakIterator::DONE
;
678 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
679 // preceding(trailing byte) will return the index of some preceding code point,
680 // not the lead byte of the current code point, even though that has a smaller index.
681 // Therefore, start looking at the expected break data not at i-1, but at
682 // the start of code point index - 1.
683 utext_setNativeIndex(t
->textToBreak
, i
);
684 int32_t j
= static_cast<int32_t>(utext_getNativeIndex(t
->textToBreak
) - 1);
685 for (; j
>= 0; j
--) {
686 if (t
->getExpectedBreak(j
) != 0) {
691 if (expectedBreak
!= actualBreak
) {
692 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
693 " Expected, Actual= %d, %d",
694 i
, t
->getSrcLine(i
), t
->getSrcCol(i
), expectedBreak
, actualBreak
);
700 void RBBITest::TestExtended() {
701 // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
702 // data driven test closely entangles filtered and regular data.
703 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
704 UErrorCode status
= U_ZERO_ERROR
;
707 TestParams
tp(status
);
709 RegexMatcher
localeMatcher(UnicodeString(u
"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status
);
710 if (U_FAILURE(status
)) {
711 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__
, __LINE__
, u_errorName(status
));
715 // Open and read the test data file.
717 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
718 CharString
testFileName(testDataDirectory
, -1, status
);
719 testFileName
.append("rbbitst.txt", -1, status
);
722 UChar
*testFile
= ReadAndConvertFile(testFileName
.data(), len
, "UTF-8", status
);
723 if (U_FAILURE(status
)) {
724 errln("%s:%d Error %s opening file rbbitst.txt", __FILE__
, __LINE__
, u_errorName(status
));
728 bool skipTest
= false; // Skip this test?
731 // Put the test data into a UnicodeString
733 UnicodeString
testString(FALSE
, testFile
, len
);
742 parseState
= PARSE_TAG
;
744 EParseState savedState
= PARSE_TAG
;
747 int32_t colStart
= 0;
751 int32_t tagValue
= 0; // The numeric value of a <nnn> tag.
753 UnicodeString rules
; // Holds rules from a <rules> ... </rules> block
754 int32_t rulesFirstLine
= 0; // Line number of the start of current <rules> block
756 // <rdar://problem/51193810>
757 mach_timebase_info_data_t info
;
758 uint64_t start
, durationOpen
= 0.0, durationUse
= 0.0;
759 mach_timebase_info(&info
);
760 UBool isLine
= FALSE
;
762 for (charIdx
= 0; charIdx
< len
; ) {
763 status
= U_ZERO_ERROR
;
764 UChar c
= testString
.charAt(charIdx
);
766 if (c
== u
'\r' && charIdx
<len
&& testString
.charAt(charIdx
) == u
'\n') {
767 // treat CRLF as a unit
771 if (c
== u
'\n' || c
== u
'\r') {
775 column
= charIdx
- colStart
+ 1;
777 switch (parseState
) {
779 if (c
== u
'\n' || c
== u
'\r') {
780 parseState
= savedState
;
787 parseState
= PARSE_COMMENT
;
788 savedState
= PARSE_TAG
;
791 if (u_isUWhiteSpace(c
)) {
794 if (testString
.compare(charIdx
-1, 6, u
"<word>") == 0) {
796 tp
.bi
= BreakIterator::createWordInstance(locale
, status
);
802 if (testString
.compare(charIdx
-1, 6, u
"<char>") == 0) {
804 tp
.bi
= BreakIterator::createCharacterInstance(locale
, status
);
810 if (testString
.compare(charIdx
-1, 6, u
"<line>") == 0) {
812 start
= mach_absolute_time(); // <rdar://problem/51193810>
813 tp
.bi
= BreakIterator::createLineInstance(locale
, status
);
814 durationOpen
+= (((mach_absolute_time() - start
) * info
.numer
)/info
.denom
);
820 if (testString
.compare(charIdx
-1, 6, u
"<sent>") == 0) {
822 tp
.bi
= BreakIterator::createSentenceInstance(locale
, status
);
827 if (testString
.compare(charIdx
-1, 7, u
"<title>") == 0) {
829 tp
.bi
= BreakIterator::createTitleInstance(locale
, status
);
835 if (testString
.compare(charIdx
-1, 7, u
"<rules>") == 0 ||
836 testString
.compare(charIdx
-1, 10, u
"<badrules>") == 0) {
837 charIdx
= testString
.indexOf(u
'>', charIdx
) + 1;
838 parseState
= PARSE_RULES
;
840 rulesFirstLine
= lineNum
;
846 localeMatcher
.reset(testString
);
847 if (localeMatcher
.lookingAt(charIdx
-1, status
)) {
848 UnicodeString localeName
= localeMatcher
.group(1, status
);
849 char localeName8
[100];
850 localeName
.extract(0, localeName
.length(), localeName8
, sizeof(localeName8
), 0);
851 locale
= Locale::createFromName(localeName8
);
852 charIdx
+= localeMatcher
.group(0, status
).length() - 1;
853 TEST_ASSERT_SUCCESS(status
);
856 if (testString
.compare(charIdx
-1, 6, u
"<data>") == 0) {
857 parseState
= PARSE_DATA
;
860 tp
.expectedBreaks
->removeAllElements();
861 tp
.srcCol
->removeAllElements();
862 tp
.srcLine
->removeAllElements();
866 errln("line %d: Tag expected in test file.", lineNum
);
867 parseState
= PARSE_COMMENT
;
868 savedState
= PARSE_DATA
;
869 goto end_test
; // Stop the test.
874 if (testString
.compare(charIdx
-1, 8, u
"</rules>") == 0) {
876 parseState
= PARSE_TAG
;
879 tp
.bi
= new RuleBasedBreakIterator(rules
, pe
, status
);
880 skipTest
= U_FAILURE(status
);
881 if (U_FAILURE(status
)) {
882 errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
883 rulesFirstLine
+ pe
.line
- 1, u_errorName(status
));
885 } else if (testString
.compare(charIdx
-1, 11, u
"</badrules>") == 0) {
887 parseState
= PARSE_TAG
;
888 UErrorCode ec
= U_ZERO_ERROR
;
890 RuleBasedBreakIterator
bi(rules
, pe
, ec
);
892 errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
893 rulesFirstLine
+ pe
.line
- 1);
901 if (c
== u
'\u2022') { // u'•'
902 int32_t breakIdx
= tp
.dataToBreak
.length();
903 tp
.expectedBreaks
->setSize(breakIdx
+1);
904 tp
.expectedBreaks
->setElementAt(-1, breakIdx
);
905 tp
.srcLine
->setSize(breakIdx
+1);
906 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
907 tp
.srcCol
->setSize(breakIdx
+1);
908 tp
.srcCol
->setElementAt(column
, breakIdx
);
912 if (testString
.compare(charIdx
-1, 7, u
"</data>") == 0) {
913 // Add final entry to mappings from break location to source file position.
914 // Need one extra because last break position returned is after the
915 // last char in the data, not at the last char.
916 tp
.srcLine
->addElement(lineNum
, status
);
917 tp
.srcCol
->addElement(column
, status
);
919 parseState
= PARSE_TAG
;
924 status
= U_ZERO_ERROR
;
926 start
= mach_absolute_time(); // <rdar://problem/51193810>
927 executeTest(&tp
, status
);
929 durationUse
+= (((mach_absolute_time() - start
) * info
.numer
)/info
.denom
);
931 TEST_ASSERT_SUCCESS(status
);
933 // Run again, this time with UTF-8 text wrapped in a UText.
934 status
= U_ZERO_ERROR
;
936 TEST_ASSERT_SUCCESS(status
);
937 executeTest(&tp
, status
);
942 if (testString
.compare(charIdx
-1, 3, u
"\\N{") == 0) {
943 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
944 // Get the code point from the name and insert it into the test data.
945 // (Damn, no API takes names in Unicode !!!
946 // we've got to take it back to char *)
947 int32_t nameEndIdx
= testString
.indexOf(u
'}', charIdx
);
948 int32_t nameLength
= nameEndIdx
- (charIdx
+2);
949 char charNameBuf
[200];
950 UChar32 theChar
= -1;
951 if (nameEndIdx
!= -1) {
952 UErrorCode status
= U_ZERO_ERROR
;
953 testString
.extract(charIdx
+2, nameLength
, charNameBuf
, sizeof(charNameBuf
));
954 charNameBuf
[sizeof(charNameBuf
)-1] = 0;
955 theChar
= u_charFromName(U_UNICODE_CHAR_NAME
, charNameBuf
, &status
);
956 if (U_FAILURE(status
)) {
961 errln("Error in named character in test file at line %d, col %d",
964 // Named code point was recognized. Insert it
965 // into the test data.
966 tp
.dataToBreak
.append(theChar
);
967 while (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
968 tp
.srcLine
->addElement(lineNum
, status
);
969 tp
.srcCol
->addElement(column
, status
);
972 if (nameEndIdx
> charIdx
) {
973 charIdx
= nameEndIdx
+1;
981 if (testString
.compare(charIdx
-1, 2, u
"<>") == 0) {
983 int32_t breakIdx
= tp
.dataToBreak
.length();
984 tp
.expectedBreaks
->setSize(breakIdx
+1);
985 tp
.expectedBreaks
->setElementAt(-1, breakIdx
);
986 tp
.srcLine
->setSize(breakIdx
+1);
987 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
988 tp
.srcCol
->setSize(breakIdx
+1);
989 tp
.srcCol
->setElementAt(column
, breakIdx
);
995 parseState
= PARSE_NUM
;
999 if (c
== u
'#' && column
==3) { // TODO: why is column off so far?
1000 parseState
= PARSE_COMMENT
;
1001 savedState
= PARSE_DATA
;
1006 // Check for \ at end of line, a line continuation.
1007 // Advance over (discard) the newline
1008 UChar32 cp
= testString
.char32At(charIdx
);
1009 if (cp
== u
'\r' && charIdx
<len
&& testString
.charAt(charIdx
+1) == u
'\n') {
1011 // Need an extra increment of the input ptr to move over both of them
1014 if (cp
== u
'\n' || cp
== u
'\r') {
1021 // Let unescape handle the back slash.
1022 cp
= testString
.unescapeAt(charIdx
);
1024 // Escape sequence was recognized. Insert the char
1025 // into the test data.
1026 tp
.dataToBreak
.append(cp
);
1027 while (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1028 tp
.srcLine
->addElement(lineNum
, status
);
1029 tp
.srcCol
->addElement(column
, status
);
1035 // Not a recognized backslash escape sequence.
1036 // Take the next char as a literal.
1037 // TODO: Should this be an error?
1038 c
= testString
.charAt(charIdx
);
1039 charIdx
= testString
.moveIndex32(charIdx
, 1);
1042 // Normal, non-escaped data char.
1043 tp
.dataToBreak
.append(c
);
1045 // Save the mapping from offset in the data to line/column numbers in
1046 // the original input file. Will be used for better error messages only.
1047 // If there's an expected break before this char, the slot in the mapping
1048 // vector will already be set for this char; don't overwrite it.
1049 if (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1050 tp
.srcLine
->addElement(lineNum
, status
);
1051 tp
.srcCol
->addElement(column
, status
);
1057 // We are parsing an expected numeric tag value, like <1234>,
1058 // within a chunk of data.
1059 if (u_isUWhiteSpace(c
)) {
1064 // Finished the number. Add the info to the expected break data,
1065 // and switch parse state back to doing plain data.
1066 parseState
= PARSE_DATA
;
1067 if (tagValue
== 0) {
1070 int32_t breakIdx
= tp
.dataToBreak
.length();
1071 tp
.expectedBreaks
->setSize(breakIdx
+1);
1072 tp
.expectedBreaks
->setElementAt(tagValue
, breakIdx
);
1073 tp
.srcLine
->setSize(breakIdx
+1);
1074 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1075 tp
.srcCol
->setSize(breakIdx
+1);
1076 tp
.srcCol
->setElementAt(column
, breakIdx
);
1081 tagValue
= tagValue
*10 + u_charDigitValue(c
);
1085 errln("Syntax Error in test file at line %d, col %d",
1087 parseState
= PARSE_COMMENT
;
1088 goto end_test
; // Stop the test
1093 if (U_FAILURE(status
)) {
1094 dataerrln("ICU Error %s while parsing test file at line %d.",
1095 u_errorName(status
), lineNum
);
1096 status
= U_ZERO_ERROR
;
1097 goto end_test
; // Stop the test
1102 // Reached end of test file. Raise an error if parseState indicates that we are
1103 // within a block that should have been terminated.
1105 if (parseState
== PARSE_RULES
) {
1106 errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
1107 lineNum
, rulesFirstLine
);
1109 if (parseState
== PARSE_DATA
) {
1110 errln("rbbitst.txt:%d <data> block not closed.", lineNum
);
1114 infoln("TestExtended total time in createLineInstance (nsec):\t%llu\n", durationOpen
);
1115 infoln("TestExtended total time in linebreak test execute (nsec):\t%llu\n", durationUse
);
1124 //-------------------------------------------------------------------------------
1126 // TestDictRules create a break iterator from source rules that includes a
1127 // dictionary range. Regression for bug #7130. Source rules
1128 // do not declare a break iterator type (word, line, sentence, etc.),
1129 // but the dictionary code, without a type, would loop.
1131 //-------------------------------------------------------------------------------
// Builds a RuleBasedBreakIterator from source rules that define a $dictionary set and
// steps through it with next(). The loop is capped at 10 iterations and the test asserts
// that loops == 1. If the iterator cannot be created, the error is reported with dataerrln.
1132 void RBBITest::TestDictRules() {
1133 const char *rules
= "$dictionary = [a-z]; \n"
1135 "$dictionary $dictionary; \n"
1137 "$dictionary $dictionary; \n";
1138 const char *text
= "aa";
1139 UErrorCode status
= U_ZERO_ERROR
;
1140 UParseError parseError
;
1142 RuleBasedBreakIterator
bi(rules
, parseError
, status
);
1143 if (U_SUCCESS(status
)) {
1144 UnicodeString utext
= text
;
// The bound of 10 keeps the test finite even if next() never returns DONE.
1148 for (loops
= 0; loops
<10; loops
++) {
1149 position
= bi
.next();
1150 if (position
== RuleBasedBreakIterator::DONE
) {
1154 TEST_ASSERT(loops
== 1);
1156 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status
));
1162 //-------------------------------------------------------------------------------
1164 // ReadAndConvertFile Read a text data file, convert it to UChars, and
1165 // return the data in one big UChar * buffer, which the caller must delete.
1168 // fileName: the name of the file, with no directory part. The test data directory
1170 // ulen an out parameter, receives the actual length (in UChars) of the file data.
1171 // encoding The file encoding. If the file contains a BOM, that will override the encoding
1172 // specified here. The BOM, if it exists, will be stripped from the returned data.
1173 // Pass NULL for the system default encoding.
1176 // The file data, converted to UChar.
1177 // The caller must delete this when done with
1178 // delete [] theBuffer;
1180 // TODO: This is a clone of RegexTest::ReadAndConvertFile.
1181 // Move this function to some common place.
1183 //--------------------------------------------------------------------------------
// Opens fileName in binary mode, reads it completely, honours a Unicode
// signature (BOM) if ucnv_detectUnicodeSignature() reports one, and converts
// the bytes to UChar through an ICU converter. The result buffer is allocated
// with new UChar[]; ulen receives the length produced by the preflight call.
1184 UChar
*RBBITest::ReadAndConvertFile(const char *fileName
, int &ulen
, const char *encoding
, UErrorCode
&status
) {
1185 UChar
*retPtr
= NULL
;
1186 char *fileBuf
= NULL
;
1187 UConverter
* conv
= NULL
;
// Standard ICU convention: do nothing useful if the caller's status already failed.
1191 if (U_FAILURE(status
)) {
1198 f
= fopen(fileName
, "rb");
// Open failure is reported as a data error and flagged as U_FILE_ACCESS_ERROR.
1200 dataerrln("Error opening test data file %s\n", fileName
);
1201 status
= U_FILE_ACCESS_ERROR
;
// Size the file by seeking to its end, then read it in one fread() call.
1210 fseek( f
, 0, SEEK_END
);
1211 fileSize
= ftell(f
);
// NOTE(review): the buffer is allocated from ftell()'s result before the
// "fileSize <= 0" check below, so a failed ftell() (-1) reaches new[]
// unvalidated - consider validating fileSize first.
1212 fileBuf
= new char[fileSize
];
1213 fseek(f
, 0, SEEK_SET
);
1214 amt_read
= static_cast<int>(fread(fileBuf
, 1, fileSize
, f
));
// A short read or a non-positive size is an error; control goes to the shared cleanup label.
1215 if (amt_read
!= fileSize
|| fileSize
<= 0) {
1216 errln("Error reading test data file.");
1217 goto cleanUpAndReturn
;
1221 // Look for a Unicode Signature (BOM) on the data just read
1223 int32_t signatureLength
;
1224 const char * fileBufC
;
1225 const char* bomEncoding
;
1228 bomEncoding
= ucnv_detectUnicodeSignature(
1229 fileBuf
, fileSize
, &signatureLength
, &status
);
// A detected signature is skipped in the data and overrides the caller's encoding.
// NOTE(review): presumably fileBufC was pointed at fileBuf before this adjustment - confirm.
1230 if(bomEncoding
!=NULL
){
1231 fileBufC
+= signatureLength
;
1232 fileSize
-= signatureLength
;
1233 encoding
= bomEncoding
;
1237 // Open a converter to take the rule file to UTF-16
1239 conv
= ucnv_open(encoding
, &status
);
1240 if (U_FAILURE(status
)) {
1241 goto cleanUpAndReturn
;
1245 // Convert the rules to UChar.
1246 // Preflight first to determine required buffer size.
1248 ulen
= ucnv_toUChars(conv
,
1254 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
1255 // Buffer Overflow is expected from the preflight operation.
1256 status
= U_ZERO_ERROR
;
// One UChar more than the preflighted length is allocated for the real conversion.
1258 retPtr
= new UChar
[ulen
+1];
// Any failure still pending at this point is reported with its ICU error name.
1271 if (U_FAILURE(status
)) {
1272 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
1282 //--------------------------------------------------------------------------------------------
1284 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
1286 //-------------------------------------------------------------------------------------------
// Runs each Unicode Consortium boundary test file against the matching
// English-locale break iterator: character, word, sentence, then line.
1287 void RBBITest::TestUnicodeFiles() {
1288 RuleBasedBreakIterator
*bi
;
1289 UErrorCode status
= U_ZERO_ERROR
;
// NOTE(review): each factory call returns a BreakIterator* that is converted
// with an unchecked C-style cast - assumes the factory really yields a
// RuleBasedBreakIterator.
// Grapheme cluster boundaries.
1291 bi
= (RuleBasedBreakIterator
*)BreakIterator::createCharacterInstance(Locale::getEnglish(), status
);
1292 TEST_ASSERT_SUCCESS(status
);
1293 if (U_SUCCESS(status
)) {
1294 runUnicodeTestData("GraphemeBreakTest.txt", bi
);
// Word boundaries.
1298 bi
= (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getEnglish(), status
);
1299 TEST_ASSERT_SUCCESS(status
);
1300 if (U_SUCCESS(status
)) {
1301 runUnicodeTestData("WordBreakTest.txt", bi
);
// Sentence boundaries.
1305 bi
= (RuleBasedBreakIterator
*)BreakIterator::createSentenceInstance(Locale::getEnglish(), status
);
1306 TEST_ASSERT_SUCCESS(status
);
1307 if (U_SUCCESS(status
)) {
1308 runUnicodeTestData("SentenceBreakTest.txt", bi
);
// Line break opportunities.
1312 bi
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale::getEnglish(), status
);
1313 TEST_ASSERT_SUCCESS(status
);
1314 if (U_SUCCESS(status
)) {
1315 runUnicodeTestData("LineBreakTest.txt", bi
);
1321 // Check for test cases from the Unicode test data files that are known to fail
1322 // and should be skipped as known issues because ICU does not fully implement
1323 // the Unicode specifications, or because ICU includes tailorings that differ from
1324 // the Unicode standard.
1326 // Test cases are identified by the test data sequence, which tends to be more stable
1327 // across Unicode versions than the test file line numbers.
1329 // The test case with ticket "10666" is a dummy, included as an example.
// Looks up (fileName, testCase) in a static table of known failures.
// On the first entry whose file name and test string both match, the result
// of logKnownIssue(ticket) for that entry is returned.
1331 UBool
RBBITest::testCaseIsKnownIssue(const UnicodeString
&testCase
, const char *fileName
) {
// One table row: ticket number, data file name, and the test string (UTF-16).
1332 static struct TestCase
{
1333 const char *fTicketNum
;
1334 const char *fFileName
;
1335 const UChar
*fString
;
1336 } badTestCases
[] = {
1337 {"10666", "GraphemeBreakTest.txt", u
"\u0020\u0020\u0033"}, // Fake example, for illustration.
1338 // Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
1339 // This probably ultimately wants to be resolved by updating UAX-14, but in the mean time
1340 // ICU is out of sync with Unicode.
1341 {"8151", "LineBreakTest.txt", u
"-#"},
1342 {"8151", "LineBreakTest.txt", u
"\u002d\u0308\u0023"},
1343 {"8151", "LineBreakTest.txt", u
"\u002d\u00a7"},
1344 {"8151", "LineBreakTest.txt", u
"\u002d\u0308\u00a7"},
1345 {"8151", "LineBreakTest.txt", u
"\u002d\U00050005"},
1346 {"8151", "LineBreakTest.txt", u
"\u002d\u0308\U00050005"},
1347 {"8151", "LineBreakTest.txt", u
"\u002d\u0e01"},
1348 {"8151", "LineBreakTest.txt", u
"\u002d\u0308\u0e01"},
1350 // Issue ICU-12017 Improve line break around numbers
1351 {"12017", "LineBreakTest.txt", u
"\u002C\u0030"}, // ",0"
1352 {"12017", "LineBreakTest.txt", u
"\u002C\u0308\u0030"},
1353 {"12017", "LineBreakTest.txt", u
"find .com"},
1354 {"12017", "LineBreakTest.txt", u
"equals .35 cents"},
1355 {"12017", "LineBreakTest.txt", u
"a.2 "},
1356 {"12017", "LineBreakTest.txt", u
"a.2 \u0915"},
1357 {"12017", "LineBreakTest.txt", u
"a.2 \u672C"},
1358 {"12017", "LineBreakTest.txt", u
"a.2\u3000\u672C"},
1359 {"12017", "LineBreakTest.txt", u
"a.2\u3000\u307E"},
1360 {"12017", "LineBreakTest.txt", u
"a.2\u3000\u0033"},
1361 {"12017", "LineBreakTest.txt", u
"A.1 \uBABB"},
1362 {"12017", "LineBreakTest.txt", u
"\uBD24\uC5B4\u002E\u0020\u0041\u002E\u0032\u0020\uBCFC"},
1363 {"12017", "LineBreakTest.txt", u
"\uBD10\uC694\u002E\u0020\u0041\u002E\u0033\u0020\uBABB"},
1364 {"12017", "LineBreakTest.txt", u
"\uC694\u002E\u0020\u0041\u002E\u0034\u0020\uBABB"},
1365 {"12017", "LineBreakTest.txt", u
"a.2\u3000\u300C"},
// Linear scan of the table; a row matches when the file name compares equal
// with strcmp() and the test string equals the row's string.
1368 for (int n
=0; n
<UPRV_LENGTHOF(badTestCases
); n
++) {
1369 const TestCase
&badCase
= badTestCases
[n
];
1370 if (!strcmp(fileName
, badCase
.fFileName
) &&
1371 testCase
== UnicodeString(badCase
.fString
)) {
// The ticket number is handed to logKnownIssue(), whose result is returned.
1372 return logKnownIssue(badCase
.fTicketNum
);
1379 //--------------------------------------------------------------------------------------------
1381 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
1383 //-------------------------------------------------------------------------------------------
// Reads one Unicode boundary test data file, tokenizes it with a regular
// expression, and hands every parsed test case (string plus expected break
// positions) to checkUnicodeTestCase() unless it is a known issue.
// Compiled only when regular expressions are available.
1384 void RBBITest::runUnicodeTestData(const char *fileName
, RuleBasedBreakIterator
*bi
) {
1385 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1386 UErrorCode status
= U_ZERO_ERROR
;
1389 // Open and read the test data file, put it into a UnicodeString.
1391 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
1392 char testFileName
[1000];
// NOTE(review): only the directory length is compared with the buffer size;
// the fileName appended by strcat() below is not part of that check, so a
// directory close to 1000 bytes could still overflow testFileName.
1393 if (testDataDirectory
== NULL
|| strlen(testDataDirectory
) >= sizeof(testFileName
)) {
1394 dataerrln("Can't open test data. Path too long.");
1397 strcpy(testFileName
, testDataDirectory
);
1398 strcat(testFileName
, fileName
);
1400 logln("Opening data file %s\n", fileName
);
1403 UChar
*testFile
= ReadAndConvertFile(testFileName
, len
, "UTF-8", status
);
// U_FILE_ACCESS_ERROR is exempt from the assertions below.
1404 if (status
!= U_FILE_ACCESS_ERROR
) {
1405 TEST_ASSERT_SUCCESS(status
);
1406 TEST_ASSERT(testFile
!= NULL
);
1408 if (U_FAILURE(status
) || testFile
== NULL
) {
1409 return; /* something went wrong, error already output */
// Wrap the converted buffer in a UnicodeString for the matcher.
1411 UnicodeString
testFileAsString(TRUE
, testFile
, len
);
1414 // Parse the test data file using a regular expression.
1415 // Each kind of token is recognized in its own capture group; what type of item was scanned
1416 // is identified by which group had a match.
1418 // Capture Group # 1 2 3 4 5
1419 // Parses this item: divide x hex digits comment \n unrecognized \n
1421 UnicodeString
tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV
);
1422 RegexMatcher
tokenMatcher(tokenExpr
, testFileAsString
, UREGEX_MULTILINE
| UREGEX_DOTALL
, status
);
1423 UnicodeString testString
;
1424 UVector32
breakPositions(status
);
1426 TEST_ASSERT_SUCCESS(status
);
1427 if (U_FAILURE(status
)) {
1432 // Scan through each test case, building up the string to be broken in testString,
1433 // and the positions that should be boundaries in the breakPositions vector.
1436 while (tokenMatcher
.find()) {
1437 if(tokenMatcher
.hitEnd()) {
1438 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for.
1439 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1440 and caused an infinite loop here on EBCDIC systems!
1442 fprintf(stderr
,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName
, ++spin
);
1445 if (tokenMatcher
.start(1, status
) >= 0) {
1446 // Scanned a divide sign, indicating a break position in the test data.
1447 if (testString
.length()>0) {
1448 breakPositions
.addElement(testString
.length(), status
);
1451 else if (tokenMatcher
.start(2, status
) >= 0) {
1452 // Scanned an 'x', meaning no break at this position in the test data
1453 // Nothing to be done here.
1455 else if (tokenMatcher
.start(3, status
) >= 0) {
1456 // Scanned Hex digits. Convert them to binary, append to the character data string.
1457 const UnicodeString
&hexNumber
= tokenMatcher
.group(3, status
);
1458 int length
= hexNumber
.length();
1461 hexNumber
.extract (0, length
, buf
, sizeof(buf
), US_INV
);
1462 UChar32 c
= (UChar32
)strtol(buf
, NULL
, 16);
1464 testString
.append(c
);
1466 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1467 fileName
, lineNumber
);
1470 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1471 fileName
, lineNumber
);
1474 else if (tokenMatcher
.start(4, status
) >= 0) {
1475 // Scanned to end of a line, possibly skipping over a comment in the process.
1476 // If the line from the file contained test data, run the test now.
1477 if (testString
.length() > 0 && !testCaseIsKnownIssue(testString
, fileName
)) {
1478 checkUnicodeTestCase(fileName
, lineNumber
, testString
, &breakPositions
, bi
);
1481 // Clear out this test case.
1482 // The string and breakPositions vector will be refilled as the next
1483 // test case is parsed.
1484 testString
.remove();
1485 breakPositions
.removeAllElements();
1488 // Scanner catchall. Something unrecognized appeared on the line.
1490 UnicodeString uToken
= tokenMatcher
.group(0, status
);
1491 uToken
.extract(0, uToken
.length(), token
, (uint32_t)sizeof(token
));
1492 token
[sizeof(token
)-1] = 0;
1493 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName
, lineNumber
, token
);
1495 // Clean up, in preparation for continuing with the next line.
1496 testString
.remove();
1497 breakPositions
.removeAllElements();
1500 TEST_ASSERT_SUCCESS(status
);
1501 if (U_FAILURE(status
)) {
1507 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1510 //--------------------------------------------------------------------------------------------
1512 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1513 // test data files. Do only a simple, forward-only check -
1514 // this test is mostly to check that ICU and the Unicode
1515 // data agree with each other.
1517 //--------------------------------------------------------------------------------------------
// Runs the break iterator over testString and compares the breaks it reports
// with the expected positions in breakPositions, reporting unexpected and
// missing breaks with the data file name and line number.
1518 void RBBITest::checkUnicodeTestCase(const char *testFileName
, int lineNumber
,
1519 const UnicodeString
&testString
, // Text data to be broken
1520 UVector32
*breakPositions
, // Positions where breaks should be found.
1521 RuleBasedBreakIterator
*bi
) {
1522 int32_t pos
; // Break Position in the test string
1523 int32_t expectedI
= 0; // Index of expected break position in the vector of expected results.
1524 int32_t expectedPos
; // Expected break position (index into test string)
1526 bi
->setText(testString
);
// Compare each break found with the next expected position until the iterator reports DONE.
1530 while (pos
!= BreakIterator::DONE
) {
// More breaks found than the data file lists.
1531 if (expectedI
>= breakPositions
->size()) {
1532 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1533 testFileName
, lineNumber
, pos
);
1536 expectedPos
= breakPositions
->elementAti(expectedI
);
// A break before the next expected position is an extra break.
1537 if (pos
< expectedPos
) {
1538 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1539 testFileName
, lineNumber
, pos
);
// A break past the next expected position means the expected one was missed.
1542 if (pos
> expectedPos
) {
1543 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1544 testFileName
, lineNumber
, expectedPos
);
// Iterator finished while expected positions remain: report the first one not found.
1551 if (pos
==BreakIterator::DONE
&& expectedI
<breakPositions
->size()) {
1552 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1553 testFileName
, lineNumber
, breakPositions
->elementAti(expectedI
));
1559 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1560 //---------------------------------------------------------------------------------------
1562 // class RBBIMonkeyKind
1564 // Monkey Test for Break Iteration
1565 // Abstract interface class. Concrete derived classes independently
1566 // implement the break rules for different iterator types.
1568 // The Monkey Test itself doesn't know which type of break iterator it is
1569 // testing, but works purely in terms of the interface defined here.
1571 //---------------------------------------------------------------------------------------
// Abstract interface for one kind of reference break implementation used by
// the monkey test. Besides the three pure virtual operations it keeps the
// class names and a per-position record of the rule applied.
1572 class RBBIMonkeyKind
{
1574 // Return a UVector of UnicodeSets, representing the character classes used
1575 // for this type of iterator.
1576 virtual UVector
*charClasses() = 0;
1578 // Set the test text on which subsequent calls to next() will operate
1579 virtual void setText(const UnicodeString
&s
) = 0;
1581 // Find the next break position, starting from the prev break position, or from zero.
1582 // Return -1 after reaching end of string.
1583 virtual int32_t next(int32_t i
) = 0;
1585 // Name of each character class, parallel with charClasses. Used for debugging output
1587 virtual std::vector
<std::string
>& characterClassNames();
// Record the rule text that decided the given position.
1589 void setAppliedRule(int32_t position
, const char* value
);
// Fetch the rule text recorded for the given position.
1591 std::string
getAppliedRule(int32_t position
);
1593 virtual ~RBBIMonkeyKind();
// Error status captured during construction; next() implementations check it.
1594 UErrorCode deferredStatus
;
// Name of the first character class that contains c.
1596 std::string
classNameFromCodepoint(const UChar32 c
);
// Length of the longest class name.
1597 unsigned int maxClassNameSize();
// Class names, index-parallel with the sets returned by charClasses().
1601 std::vector
<std::string
> classNames
;
// Rule text per text position, filled through setAppliedRule().
1602 std::vector
<std::string
> appliedRules
;
1604 // Clear `appliedRules` and fill it with empty strings in the size of test text.
1605 void prepareAppliedRules(int32_t size
);
// Starts with a clean deferred status; derived constructors overwrite it on failure.
1611 RBBIMonkeyKind::RBBIMonkeyKind() {
1612 deferredStatus
= U_ZERO_ERROR
;
// Out-of-line definition of the virtual destructor declared in the class.
1615 RBBIMonkeyKind::~RBBIMonkeyKind() {
// Accessor for the class-name list; hands out a mutable reference.
// NOTE(review): presumably this is the classNames member - confirm.
1618 std::vector
<std::string
>& RBBIMonkeyKind::characterClassNames() {
// Resets the applied-rule record for a new text of the given length.
// The vector gets size + 1 empty entries, one more than the text length.
1622 void RBBIMonkeyKind::prepareAppliedRules(int32_t size
) {
1623 // Remove all the information in the `appliedRules`.
1624 appliedRules
.clear();
1625 appliedRules
.resize(size
+ 1);
// Stores the rule description for one text position.
// The index is used unchecked; it must lie within the range set up by
// prepareAppliedRules().
1628 void RBBIMonkeyKind::setAppliedRule(int32_t position
, const char* value
) {
1629 appliedRules
[position
] = value
;
// Returns a copy of the rule description recorded for one text position.
// The index is used unchecked.
1632 std::string
RBBIMonkeyKind::getAppliedRule(int32_t position
){
1633 return appliedRules
[position
];
// Returns the name of the first character class (in charClasses() order)
// whose set contains c. Falls back to "bad class name" after asserting when
// no class matches.
1636 std::string
RBBIMonkeyKind::classNameFromCodepoint(const UChar32 c
) {
1637 // Simply iterate through charClasses to find character's class
1638 for (int aClassNum
= 0; aClassNum
< charClasses()->size(); aClassNum
++) {
1639 UnicodeSet
*classSet
= (UnicodeSet
*)charClasses()->elementAt(aClassNum
);
1640 if (classSet
->contains(c
)) {
// classNames is indexed with the same position as the set vector.
1641 return classNames
[aClassNum
];
1644 U_ASSERT(FALSE
); // This should not happen.
1645 return "bad class name";
// Computes the length of the longest class name.
// The loop bound comes from charClasses()->size() while the indexing is into
// classNames, so it relies on both holding the same number of entries.
1648 unsigned int RBBIMonkeyKind::maxClassNameSize() {
1649 unsigned int maxSize
= 0;
1650 for (int aClassNum
= 0; aClassNum
< charClasses()->size(); aClassNum
++) {
1651 if (classNames
[aClassNum
].size() > maxSize
) {
1652 maxSize
= classNames
[aClassNum
].size();
1658 //----------------------------------------------------------------------------------------
1660 // Random Numbers. Similar to standard lib rand() and srand()
1661 // Not using library to
1662 // 1. Get same results on all platforms.
1663 // 2. Get access to current seed, to more easily reproduce failures.
1665 //---------------------------------------------------------------------------------------
// Current state of the generator; file-local and initialised to 1.
1666 static uint32_t m_seed
= 1;
// Linear congruential step (multiplier 1103515245, increment 12345) in
// 32-bit unsigned arithmetic; returns a value in 0..32767 taken from the
// upper part of the new state.
1668 static uint32_t m_rand()
1670 m_seed
= m_seed
* 1103515245 + 12345;
1671 return (uint32_t)(m_seed
/65536) % 32768;
1675 //------------------------------------------------------------------------------------------
1677 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
1678 // of RBBIMonkeyKind.
1680 //------------------------------------------------------------------------------------------
// Reference implementation of grapheme cluster breaking for the monkey test.
// Overrides the three RBBIMonkeyKind operations and owns one UnicodeSet per
// character class used by its rules.
1681 class RBBICharMonkey
: public RBBIMonkeyKind
{
1684 virtual ~RBBICharMonkey();
1685 virtual UVector
*charClasses();
1686 virtual void setText(const UnicodeString
&s
);
1687 virtual int32_t next(int32_t i
);
// Character class sets, built in the constructor.
1691 UnicodeSet
*fCRLFSet
;
1692 UnicodeSet
*fControlSet
;
1693 UnicodeSet
*fExtendSet
;
1694 UnicodeSet
*fZWJSet
;
1695 UnicodeSet
*fRegionalIndicatorSet
;
1696 UnicodeSet
*fPrependSet
;
1697 UnicodeSet
*fSpacingSet
;
1702 UnicodeSet
*fLVTSet
;
1703 UnicodeSet
*fHangulSet
;
1704 UnicodeSet
*fExtendedPictSet
;
1705 UnicodeSet
*fViramaSet
;
1706 UnicodeSet
*fLinkingConsonantSet
;
1707 UnicodeSet
*fExtCccZwjSet
;
1708 UnicodeSet
*fAnySet
;
// Text currently under test; held by pointer.
1710 const UnicodeString
*fText
;
// Builds every character class set from its UnicodeSet pattern, registers the
// sets together with a display name, and stores any construction error in
// deferredStatus.
1714 RBBICharMonkey::RBBICharMonkey() {
1715 UErrorCode status
= U_ZERO_ERROR
;
// Sets keyed on the Grapheme_Cluster_Break property (plus CR/LF).
1719 fCRLFSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status
);
1720 fControlSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status
);
1721 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status
);
1722 fZWJSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status
);
1723 fRegionalIndicatorSet
=
1724 new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status
);
1725 fPrependSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status
);
1726 fSpacingSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status
);
1727 fLSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status
);
1728 fVSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status
);
1729 fTSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status
);
1730 fLVSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status
);
1731 fLVTSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status
);
// Hangul is the union of the five Hangul syllable-type sets above.
1732 fHangulSet
= new UnicodeSet();
1733 fHangulSet
->addAll(*fLSet
);
1734 fHangulSet
->addAll(*fVSet
);
1735 fHangulSet
->addAll(*fTSet
);
1736 fHangulSet
->addAll(*fLVSet
);
1737 fHangulSet
->addAll(*fLVTSet
);
// Sets used by the emoji and Indic conjunct rules.
1739 fExtendedPictSet
= new UnicodeSet(u
"[:Extended_Pictographic:]", status
);
1740 fViramaSet
= new UnicodeSet(u
"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1741 "\\p{Indic_Syllabic_Category=Virama}]", status
);
1742 fLinkingConsonantSet
= new UnicodeSet(u
"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1743 "\\p{Indic_Syllabic_Category=Consonant}]", status
);
1744 fExtCccZwjSet
= new UnicodeSet(u
"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status
);
// Every code point, U+0000..U+10FFFF.
1745 fAnySet
= new UnicodeSet(0, 0x10ffff);
1747 // Create sets of characters, and add the names of the above character sets.
1748 // In each new ICU release, add new names corresponding to the sets above.
1749 fSets
= new UVector(status
);
1751 // Important: Keep class names the same as the class contents.
// Each set is added together with its name so that both lists stay index-parallel.
// NOTE(review): the labels "Extended" (for fExtendSet) and "ExtCcccZwj" (for
// fExtCccZwjSet) differ from the set names - confirm whether that is intended.
1752 fSets
->addElement(fCRLFSet
, status
); classNames
.push_back("CRLF");
1753 fSets
->addElement(fControlSet
, status
); classNames
.push_back("Control");
1754 fSets
->addElement(fExtendSet
, status
); classNames
.push_back("Extended");
1755 fSets
->addElement(fRegionalIndicatorSet
, status
); classNames
.push_back("RegionalIndicator");
// Prepend is registered only when the set is non-empty.
1756 if (!fPrependSet
->isEmpty()) {
1757 fSets
->addElement(fPrependSet
, status
); classNames
.push_back("Prepend");
1759 fSets
->addElement(fSpacingSet
, status
); classNames
.push_back("Spacing");
1760 fSets
->addElement(fHangulSet
, status
); classNames
.push_back("Hangul");
1761 fSets
->addElement(fZWJSet
, status
); classNames
.push_back("ZWJ");
1762 fSets
->addElement(fExtendedPictSet
, status
); classNames
.push_back("ExtendedPict");
1763 fSets
->addElement(fViramaSet
, status
); classNames
.push_back("Virama");
1764 fSets
->addElement(fLinkingConsonantSet
, status
); classNames
.push_back("LinkingConsonant");
1765 fSets
->addElement(fExtCccZwjSet
, status
); classNames
.push_back("ExtCcccZwj");
1766 fSets
->addElement(fAnySet
, status
); classNames
.push_back("Any");
// Constructors cannot return a status, so a failure is parked in deferredStatus.
1768 if (U_FAILURE(status
)) {
1769 deferredStatus
= status
;
// Installs new test text; the applied-rule record is re-created to match the
// length of s.
1774 void RBBICharMonkey::setText(const UnicodeString
&s
) {
1776 prepareAppliedRules(s
.length());
// Reference grapheme cluster break search: starting from prevPos, walks the
// text one code point at a time and tests the grapheme break rules in order
// at each candidate position p2, recording the deciding rule through
// setAppliedRule().
1781 int32_t RBBICharMonkey::next(int32_t prevPos
) {
1782 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
1783 // break position being tested. The candidate break
1784 // location is before p2.
1788 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
1789 UChar32 cBase
; // for (X Extend*) patterns, the X character.
// A construction failure recorded earlier is checked before any work is done.
1791 if (U_FAILURE(deferredStatus
)) {
1795 // Previous break at end of string. return DONE.
1796 if (prevPos
>= fText
->length()) {
// All four positions start at prevPos; only c3 holds a real code point yet.
1800 p0
= p1
= p2
= p3
= prevPos
;
1801 c3
= fText
->char32At(prevPos
);
1802 c0
= c1
= c2
= cBase
= 0;
1803 (void)p0
; // suppress set but not used warning.
1806 // Loop runs once per "significant" character position in the input text.
1808 // Move all of the positions forward in the input string.
1813 // Advance p3 by one codepoint
1814 p3
= fText
->moveIndex32(p3
, 1);
1815 c3
= fText
->char32At(p3
);
1818 // Still warming up the loop. (won't work with zero length strings, but we don't care)
1822 if (p2
== fText
->length()) {
1823 setAppliedRule(p2
, "End of String");
1827 // No Extend or Format characters may appear between the CR and LF,
1828 // which requires the additional check for p2 immediately following p1.
1830 if (c1
==0x0D && c2
==0x0A && p1
==(p2
-1)) {
1831 setAppliedRule(p2
, "GB3 CR x LF");
1835 if (fControlSet
->contains(c1
) ||
1838 setAppliedRule(p2
, "GB4 ( Control | CR | LF ) <break>");
1842 if (fControlSet
->contains(c2
) ||
1845 setAppliedRule(p2
, "GB5 <break> ( Control | CR | LF )");
// Hangul syllable sequences (GB6 - GB8).
1849 if (fLSet
->contains(c1
) &&
1850 (fLSet
->contains(c2
) ||
1851 fVSet
->contains(c2
) ||
1852 fLVSet
->contains(c2
) ||
1853 fLVTSet
->contains(c2
))) {
1854 setAppliedRule(p2
, "GB6 L x ( L | V | LV | LVT )");
1858 if ((fLVSet
->contains(c1
) || fVSet
->contains(c1
)) &&
1859 (fVSet
->contains(c2
) || fTSet
->contains(c2
))) {
1860 setAppliedRule(p2
, "GB7 ( LV | V ) x ( V | T )");
1864 if ((fLVTSet
->contains(c1
) || fTSet
->contains(c1
)) &&
1865 fTSet
->contains(c2
)) {
1866 setAppliedRule(p2
, "GB8 ( LVT | T) x T");
1870 if (fExtendSet
->contains(c2
) || fZWJSet
->contains(c2
)) {
1871 if (!fExtendSet
->contains(c1
)) {
1874 setAppliedRule(p2
, "GB9 x (Extend | ZWJ)");
1878 if (fSpacingSet
->contains(c2
)) {
1879 setAppliedRule(p2
, "GB9a x SpacingMark");
1883 if (fPrependSet
->contains(c1
)) {
1884 setAppliedRule(p2
, "GB9b Prepend x");
1888 // Note: Viramas are also included in the ExtCccZwj class.
// Scan backwards over ExtCccZwj characters, noting whether a Virama was seen,
// then require a LinkingConsonant at the position where the scan stopped.
1889 if (fLinkingConsonantSet
->contains(c2
)) {
1891 bool sawVirama
= false;
1892 while (pi
> 0 && fExtCccZwjSet
->contains(fText
->char32At(pi
))) {
1893 if (fViramaSet
->contains(fText
->char32At(pi
))) {
1896 pi
= fText
->moveIndex32(pi
, -1);
1898 if (sawVirama
&& fLinkingConsonantSet
->contains(fText
->char32At(pi
))) {
1899 setAppliedRule(p2
, "GB9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant");
1904 if (fExtendedPictSet
->contains(cBase
) && fZWJSet
->contains(c1
) && fExtendedPictSet
->contains(c2
)) {
1905 setAppliedRule(p2
, "GB11 Extended_Pictographic Extend * ZWJ x Extended_Pictographic");
1909 // Note: The first if condition is a little tricky. We only need to force
1910 // a break if there are three or more contiguous RIs. If there are
1911 // only two, a break following will occur via other rules, and will include
1912 // any trailing extend characters, which is needed behavior.
// NOTE(review): the three-RI case and the two-RI case below record the same
// rule label, so the applied-rule output cannot tell them apart.
1913 if (fRegionalIndicatorSet
->contains(c0
) && fRegionalIndicatorSet
->contains(c1
)
1914 && fRegionalIndicatorSet
->contains(c2
)) {
1915 setAppliedRule(p2
, "GB12-13 Regional_Indicator x Regional_Indicator");
1918 if (fRegionalIndicatorSet
->contains(c1
) && fRegionalIndicatorSet
->contains(c2
)) {
1919 setAppliedRule(p2
, "GB12-13 Regional_Indicator x Regional_Indicator");
// No earlier rule applied: default break.
1923 setAppliedRule(p2
, "GB999 Any <break> Any");
// RBBIMonkeyKind override giving access to the character class sets.
// NOTE(review): presumably returns the fSets vector filled by the constructor - confirm.
1933 UVector
*RBBICharMonkey::charClasses() {
// Releases the UnicodeSet objects that the constructor allocated with new.
1937 RBBICharMonkey::~RBBICharMonkey() {
1942 delete fRegionalIndicatorSet
;
1953 delete fExtendedPictSet
;
1955 delete fLinkingConsonantSet
;
1956 delete fExtCccZwjSet
;
1959 //------------------------------------------------------------------------------------------
1961 // class RBBIWordMonkey Word Break specific implementation
1962 // of RBBIMonkeyKind.
1964 //------------------------------------------------------------------------------------------
// Reference implementation of word breaking for the monkey test.
// Overrides the three RBBIMonkeyKind operations and owns one UnicodeSet per
// Word_Break class used by its rules.
1965 class RBBIWordMonkey
: public RBBIMonkeyKind
{
1968 virtual ~RBBIWordMonkey();
1969 virtual UVector
*charClasses();
1970 virtual void setText(const UnicodeString
&s
);
1971 virtual int32_t next(int32_t i
);
// Character class sets, built in the constructor.
1977 UnicodeSet
*fNewlineSet
;
1978 UnicodeSet
*fRegionalIndicatorSet
;
1979 UnicodeSet
*fKatakanaSet
;
1980 UnicodeSet
*fHebrew_LetterSet
;
1981 UnicodeSet
*fALetterSet
;
1982 UnicodeSet
*fSingle_QuoteSet
;
1983 UnicodeSet
*fDouble_QuoteSet
;
1984 UnicodeSet
*fMidNumLetSet
;
1985 UnicodeSet
*fMidLetterSet
;
1986 UnicodeSet
*fMidNumSet
;
1987 UnicodeSet
*fNumericSet
;
1988 UnicodeSet
*fFormatSet
;
1989 UnicodeSet
*fOtherSet
;
1990 UnicodeSet
*fExtendSet
;
1991 UnicodeSet
*fExtendNumLetSet
;
1992 UnicodeSet
*fWSegSpaceSet
;
1993 UnicodeSet
*fDictionarySet
;
1994 UnicodeSet
*fZWJSet
;
1995 UnicodeSet
*fExtendedPictSet
;
// Text currently under test; held by pointer.
1997 const UnicodeString
*fText
;
// Builds the Word_Break character class sets, derives the dictionary and
// "Other" sets from them, registers the sets together with display names, and
// stores any construction error in deferredStatus.
2001 RBBIWordMonkey::RBBIWordMonkey()
2003 UErrorCode status
= U_ZERO_ERROR
;
2005 fSets
= new UVector(status
);
// One set per Word_Break property value.
2007 fCRSet
= new UnicodeSet(u
"[\\p{Word_Break = CR}]", status
);
2008 fLFSet
= new UnicodeSet(u
"[\\p{Word_Break = LF}]", status
);
2009 fNewlineSet
= new UnicodeSet(u
"[\\p{Word_Break = Newline}]", status
);
2010 fKatakanaSet
= new UnicodeSet(u
"[\\p{Word_Break = Katakana}]", status
);
2011 fRegionalIndicatorSet
= new UnicodeSet(u
"[\\p{Word_Break = Regional_Indicator}]", status
);
2012 fHebrew_LetterSet
= new UnicodeSet(u
"[\\p{Word_Break = Hebrew_Letter}]", status
);
2013 fALetterSet
= new UnicodeSet(u
"[\\p{Word_Break = ALetter}]", status
);
2014 fSingle_QuoteSet
= new UnicodeSet(u
"[\\p{Word_Break = Single_Quote}]", status
);
2015 fDouble_QuoteSet
= new UnicodeSet(u
"[\\p{Word_Break = Double_Quote}]", status
);
2016 fMidNumLetSet
= new UnicodeSet(u
"[\\p{Word_Break = MidNumLet}]", status
);
// MidLetter deliberately excludes the colon.
2017 fMidLetterSet
= new UnicodeSet(u
"[\\p{Word_Break = MidLetter} - [\\:]]", status
);
2018 fMidNumSet
= new UnicodeSet(u
"[\\p{Word_Break = MidNum}]", status
);
2019 fNumericSet
= new UnicodeSet(u
"[\\p{Word_Break = Numeric}]", status
);
2020 fFormatSet
= new UnicodeSet(u
"[\\p{Word_Break = Format}]", status
);
2021 fExtendNumLetSet
= new UnicodeSet(u
"[\\p{Word_Break = ExtendNumLet}]", status
);
2022 // There are some sc=Hani characters with WB=Extend.
2023 // The break rules need to pick one or the other because
2024 // Extend overlapping with something else is messy.
2025 // For Unicode 13, we chose to keep U+16FF0 & U+16FF1
2026 // in $Han (for $dictionary) and out of $Extend.
2027 fExtendSet
= new UnicodeSet(u
"[\\p{Word_Break = Extend}-[:Hani:]]", status
);
2028 fWSegSpaceSet
= new UnicodeSet(u
"[\\p{Word_Break = WSegSpace}]", status
);
2030 fZWJSet
= new UnicodeSet(u
"[\\p{Word_Break = ZWJ}]", status
);
2031 fExtendedPictSet
= new UnicodeSet(u
"[:Extended_Pictographic:]", status
);
// Dictionary set: Hangul syllables, Han, Hiragana, Katakana and Complex_Context characters.
2033 fDictionarySet
= new UnicodeSet(u
"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status
);
2034 fDictionarySet
->addAll(*fKatakanaSet
);
2035 fDictionarySet
->addAll(UnicodeSet(u
"[\\p{LineBreak = Complex_Context}]", status
));
// Dictionary characters are taken out of ALetter.
2037 fALetterSet
->removeAll(*fDictionarySet
);
2039 fOtherSet
= new UnicodeSet();
// A failure while building the sets is reported and parked in deferredStatus.
2040 if(U_FAILURE(status
)) {
2041 IntlTest::gTest
->errln("%s:%d %s", __FILE__
, __LINE__
, u_errorName(status
));
2042 deferredStatus
= status
;
// "Other" starts as the complement of the empty set, then loses every class below.
// NOTE(review): fMidNumLetSet is not among the sets removed here, so "Other"
// still contains the MidNumLet characters - confirm whether that is intended.
2046 fOtherSet
->complement();
2047 fOtherSet
->removeAll(*fCRSet
);
2048 fOtherSet
->removeAll(*fLFSet
);
2049 fOtherSet
->removeAll(*fNewlineSet
);
2050 fOtherSet
->removeAll(*fKatakanaSet
);
2051 fOtherSet
->removeAll(*fHebrew_LetterSet
);
2052 fOtherSet
->removeAll(*fALetterSet
);
2053 fOtherSet
->removeAll(*fSingle_QuoteSet
);
2054 fOtherSet
->removeAll(*fDouble_QuoteSet
);
2055 fOtherSet
->removeAll(*fMidLetterSet
);
2056 fOtherSet
->removeAll(*fMidNumSet
);
2057 fOtherSet
->removeAll(*fNumericSet
);
2058 fOtherSet
->removeAll(*fExtendNumLetSet
);
2059 fOtherSet
->removeAll(*fWSegSpaceSet
);
2060 fOtherSet
->removeAll(*fFormatSet
);
2061 fOtherSet
->removeAll(*fExtendSet
);
2062 fOtherSet
->removeAll(*fRegionalIndicatorSet
);
2063 fOtherSet
->removeAll(*fZWJSet
);
2064 fOtherSet
->removeAll(*fExtendedPictSet
);
2066 // Inhibit dictionary characters from being tested at all.
2067 fOtherSet
->removeAll(*fDictionarySet
);
2069 // Add classes and their names
2070 fSets
->addElement(fCRSet
, status
); classNames
.push_back("CR");
2071 fSets
->addElement(fLFSet
, status
); classNames
.push_back("LF");
2072 fSets
->addElement(fNewlineSet
, status
); classNames
.push_back("Newline");
2073 fSets
->addElement(fRegionalIndicatorSet
, status
); classNames
.push_back("RegionalIndicator");
2074 fSets
->addElement(fHebrew_LetterSet
, status
); classNames
.push_back("Hebrew");
2075 fSets
->addElement(fALetterSet
, status
); classNames
.push_back("ALetter");
2076 fSets
->addElement(fSingle_QuoteSet
, status
); classNames
.push_back("Single Quote");
2077 fSets
->addElement(fDouble_QuoteSet
, status
); classNames
.push_back("Double Quote");
2078 // Omit Katakana from fSets, which omits Katakana characters
2079 // from the test data. They are all in the dictionary set,
2080 // which this (old, to be retired) monkey test cannot handle.
2081 //fSets->addElement(fKatakanaSet, status);
2083 fSets
->addElement(fMidLetterSet
, status
); classNames
.push_back("MidLetter");
2084 fSets
->addElement(fMidNumLetSet
, status
); classNames
.push_back("MidNumLet");
2085 fSets
->addElement(fMidNumSet
, status
); classNames
.push_back("MidNum");
2086 fSets
->addElement(fNumericSet
, status
); classNames
.push_back("Numeric");
2087 fSets
->addElement(fFormatSet
, status
); classNames
.push_back("Format");
2088 fSets
->addElement(fExtendSet
, status
); classNames
.push_back("Extend");
2089 fSets
->addElement(fOtherSet
, status
); classNames
.push_back("Other");
2090 fSets
->addElement(fExtendNumLetSet
, status
); classNames
.push_back("ExtendNumLet");
2091 fSets
->addElement(fWSegSpaceSet
, status
); classNames
.push_back("WSegSpace");
2093 fSets
->addElement(fZWJSet
, status
); classNames
.push_back("ZWJ");
2094 fSets
->addElement(fExtendedPictSet
, status
); classNames
.push_back("ExtendedPict");
// Errors from registering the sets are also parked in deferredStatus.
2096 if (U_FAILURE(status
)) {
2097 deferredStatus
= status
;
// Installs new test text; the applied-rule record is re-created to match the
// length of s.
2101 void RBBIWordMonkey::setText(const UnicodeString
&s
) {
2103 prepareAppliedRules(s
.length());
2107 int32_t RBBIWordMonkey::next(int32_t prevPos
) {
2108 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2109 // break position being tested. The candidate break
2110 // location is before p2.
2114 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
2116 if (U_FAILURE(deferredStatus
)) {
2120 // Prev break at end of string. return DONE.
2121 if (prevPos
>= fText
->length()) {
2124 p0
= p1
= p2
= p3
= prevPos
;
2125 c3
= fText
->char32At(prevPos
);
2127 (void)p0
; // Suppress set but not used warning.
2129 // Loop runs once per "significant" character position in the input text.
2131 // Move all of the positions forward in the input string.
2136 // Advance p3 by X(Extend | Format)* Rule 4
2137 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2139 p3
= fText
->moveIndex32(p3
, 1);
2140 c3
= fText
->char32At(p3
);
2141 if (fCRSet
->contains(c2
) || fLFSet
->contains(c2
) || fNewlineSet
->contains(c2
)) {
2145 while (fFormatSet
->contains(c3
) || fExtendSet
->contains(c3
) || fZWJSet
->contains(c3
));
2149 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2153 if (p2
== fText
->length()) {
2154 // Reached end of string. Always a break position.
2158 // No Extend or Format characters may appear between the CR and LF,
2159 // which requires the additional check for p2 immediately following p1.
2161 if (c1
==0x0D && c2
==0x0A) {
2162 setAppliedRule(p2
, "WB3 CR x LF");
2166 if (fCRSet
->contains(c1
) || fLFSet
->contains(c1
) || fNewlineSet
->contains(c1
)) {
2167 setAppliedRule(p2
, "WB3a Break before and after newlines (including CR and LF)");
2170 if (fCRSet
->contains(c2
) || fLFSet
->contains(c2
) || fNewlineSet
->contains(c2
)) {
2171 setAppliedRule(p2
, "WB3a Break before and after newlines (including CR and LF)");
2175 // Not ignoring extend chars, so peek into input text to
2176 // get the potential ZWJ, the character immediately preceding c2.
2177 // Sloppy UChar32 indexing: p2-1 may reference trail half
2178 // but char32At will get the full code point.
2179 if (fZWJSet
->contains(fText
->char32At(p2
- 1)) && fExtendedPictSet
->contains(c2
)){
2180 setAppliedRule(p2
, "WB3c ZWJ x Extended_Pictographic");
2184 if (fWSegSpaceSet
->contains(fText
->char32At(p2
-1)) && fWSegSpaceSet
->contains(c2
)) {
2185 setAppliedRule(p2
, "WB3d Keep horizontal whitespace together.");
2189 if ((fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
)) &&
2190 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
))) {
2191 setAppliedRule(p2
, "WB4 (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)");
2195 if ( (fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
)) &&
2196 (fMidLetterSet
->contains(c2
) || fMidNumLetSet
->contains(c2
) || fSingle_QuoteSet
->contains(c2
)) &&
2197 (fALetterSet
->contains(c3
) || fHebrew_LetterSet
->contains(c3
))) {
2199 "WB6 (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter _Letter)");
2203 if ((fALetterSet
->contains(c0
) || fHebrew_LetterSet
->contains(c0
)) &&
2204 (fMidLetterSet
->contains(c1
) || fMidNumLetSet
->contains(c1
) || fSingle_QuoteSet
->contains(c1
)) &&
2205 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
))) {
2207 "WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)");
2211 if (fHebrew_LetterSet
->contains(c1
) && fSingle_QuoteSet
->contains(c2
)) {
2212 setAppliedRule(p2
, "WB7a Hebrew_Letter x Single_Quote");
2216 if (fHebrew_LetterSet
->contains(c1
) && fDouble_QuoteSet
->contains(c2
) && fHebrew_LetterSet
->contains(c3
)) {
2217 setAppliedRule(p2
, "WB7b Hebrew_Letter x Double_Quote Hebrew_Letter");
2221 if (fHebrew_LetterSet
->contains(c0
) && fDouble_QuoteSet
->contains(c1
) && fHebrew_LetterSet
->contains(c2
)) {
2222 setAppliedRule(p2
, "WB7c Hebrew_Letter Double_Quote x Hebrew_Letter");
2226 if (fNumericSet
->contains(c1
) &&
2227 fNumericSet
->contains(c2
)) {
2228 setAppliedRule(p2
, "WB8 Numeric x Numeric");
2232 if ((fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
)) &&
2233 fNumericSet
->contains(c2
)) {
2234 setAppliedRule(p2
, "WB9 (ALetter | Hebrew_Letter) x Numeric");
2238 if (fNumericSet
->contains(c1
) &&
2239 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
))) {
2240 setAppliedRule(p2
, "WB10 Numeric x (ALetter | Hebrew_Letter)");
2244 if (fNumericSet
->contains(c0
) &&
2245 (fMidNumSet
->contains(c1
) || fMidNumLetSet
->contains(c1
) || fSingle_QuoteSet
->contains(c1
)) &&
2246 fNumericSet
->contains(c2
)) {
2247 setAppliedRule(p2
, "WB11 Numeric (MidNum | MidNumLet | Single_Quote) x Numeric");
2251 if (fNumericSet
->contains(c1
) &&
2252 (fMidNumSet
->contains(c2
) || fMidNumLetSet
->contains(c2
) || fSingle_QuoteSet
->contains(c2
)) &&
2253 fNumericSet
->contains(c3
)) {
2254 setAppliedRule(p2
, "WB12 Numeric x (MidNum | MidNumLet | SingleQuote) Numeric");
2258 // Note: matches UAX 29 rules, but doesn't come into play for ICU because
2259 // all Katakana are handled by the dictionary breaker.
2260 if (fKatakanaSet
->contains(c1
) &&
2261 fKatakanaSet
->contains(c2
)) {
2262 setAppliedRule(p2
, "WB13 Katakana x Katakana");
2266 if ((fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
) ||fNumericSet
->contains(c1
) ||
2267 fKatakanaSet
->contains(c1
) || fExtendNumLetSet
->contains(c1
)) &&
2268 fExtendNumLetSet
->contains(c2
)) {
2270 "WB13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet");
2274 if (fExtendNumLetSet
->contains(c1
) &&
2275 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
) ||
2276 fNumericSet
->contains(c2
) || fKatakanaSet
->contains(c2
))) {
2277 setAppliedRule(p2
, "WB13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)");
2281 if (fRegionalIndicatorSet
->contains(c0
) && fRegionalIndicatorSet
->contains(c1
)) {
2282 setAppliedRule(p2
, "WB15 - WB17 Group pairs of Regional Indicators.");
2285 if (fRegionalIndicatorSet
->contains(c1
) && fRegionalIndicatorSet
->contains(c2
)) {
2286 setAppliedRule(p2
, "WB15 - WB17 Group pairs of Regional Indicators.");
2290 setAppliedRule(p2
, "WB999");
// charClasses()  Returns a UVector pointer (per the signature); the body is not
//                visible in this excerpt.
// NOTE(review): presumably returns fSets, the vector filled by the constructor - confirm.
2299 UVector
*RBBIWordMonkey::charClasses() {
// Destructor. Deletes the character-class set objects held by this monkey.
// Only some of the delete statements are visible in this excerpt.
2303 RBBIWordMonkey::~RBBIWordMonkey() {
2308 delete fKatakanaSet
;
2309 delete fHebrew_LetterSet
;
2311 delete fSingle_QuoteSet
;
2312 delete fDouble_QuoteSet
;
2313 delete fMidNumLetSet
;
2314 delete fMidLetterSet
;
2319 delete fExtendNumLetSet
;
2320 delete fWSegSpaceSet
;
2321 delete fRegionalIndicatorSet
;
// fDictionarySet is removed from another set (removeAll) near the end of the constructor.
2322 delete fDictionarySet
;
2325 delete fExtendedPictSet
;
2331 //------------------------------------------------------------------------------------------
2333 // class RBBISentMonkey Sentence Break specific implementation
2334 // of RBBIMonkeyKind.
2336 //------------------------------------------------------------------------------------------
// Sentence-break flavor of the monkey test; derives from RBBIMonkeyKind.
2337 class RBBISentMonkey
: public RBBIMonkeyKind
{
2340 virtual ~RBBISentMonkey();
// Virtual member functions; their definitions follow the class.
2341 virtual UVector
*charClasses();
2342 virtual void setText(const UnicodeString
&s
);
2343 virtual int32_t next(int32_t i
);
// Index helpers used by next(). moveBack()/moveForward() step one code point and
// then continue over Format and Extend characters; cAt() range-checks its index
// before reading from fText.
2345 int moveBack(int posFrom
);
2346 int moveForward(int posFrom
);
2347 UChar32
cAt(int pos
);
// Character-class sets. The constructor builds each from the matching
// \p{Sentence_Break = ...} pattern, except that fSepSet also includes U+000A and
// U+000D and fOtherSet is whatever remains after removing all the other sets.
2351 UnicodeSet
*fSepSet
;
2352 UnicodeSet
*fFormatSet
;
2354 UnicodeSet
*fLowerSet
;
2355 UnicodeSet
*fUpperSet
;
2356 UnicodeSet
*fOLetterSet
;
2357 UnicodeSet
*fNumericSet
;
2358 UnicodeSet
*fATermSet
;
2359 UnicodeSet
*fSContinueSet
;
2360 UnicodeSet
*fSTermSet
;
2361 UnicodeSet
*fCloseSet
;
2362 UnicodeSet
*fOtherSet
;
2363 UnicodeSet
*fExtendSet
;
// The text being examined; read by moveBack(), moveForward(), cAt() and next().
2365 const UnicodeString
*fText
;
// Constructor. Builds one UnicodeSet for each Sentence_Break class, derives
// fOtherSet as everything not contained in any of them, then appends each set to
// fSets and, in the same order, a label for it to classNames.
// A failing status is copied into deferredStatus, which next() checks on entry.
2368 RBBISentMonkey::RBBISentMonkey()
2370 UErrorCode status
= U_ZERO_ERROR
;
2372 fSets
= new UVector(status
);
2374 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2375 // set and made into character classes of their own. For the monkey impl,
2376 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2377 fSepSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status
);
2378 fFormatSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status
);
2379 fSpSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status
);
2380 fLowerSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status
);
2381 fUpperSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status
);
2382 fOLetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status
);
2383 fNumericSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status
);
2384 fATermSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status
);
2385 fSContinueSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status
);
2386 fSTermSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status
);
2387 fCloseSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status
);
2388 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status
);
// fOtherSet starts out empty; it is filled in below by complement() + removeAll().
2389 fOtherSet
= new UnicodeSet();
// Remember a pattern-construction failure for later reporting.
2391 if(U_FAILURE(status
)) {
2392 deferredStatus
= status
;
// Other = all code points, minus every class built above.
2396 fOtherSet
->complement();
2397 fOtherSet
->removeAll(*fSepSet
);
2398 fOtherSet
->removeAll(*fFormatSet
);
2399 fOtherSet
->removeAll(*fSpSet
);
2400 fOtherSet
->removeAll(*fLowerSet
);
2401 fOtherSet
->removeAll(*fUpperSet
);
2402 fOtherSet
->removeAll(*fOLetterSet
);
2403 fOtherSet
->removeAll(*fNumericSet
);
2404 fOtherSet
->removeAll(*fATermSet
);
2405 fOtherSet
->removeAll(*fSContinueSet
);
2406 fOtherSet
->removeAll(*fSTermSet
);
2407 fOtherSet
->removeAll(*fCloseSet
);
2408 fOtherSet
->removeAll(*fExtendSet
);
// Register each set together with its display name; the two lists are appended in step.
2410 fSets
->addElement(fSepSet
, status
); classNames
.push_back("Sep");
2411 fSets
->addElement(fFormatSet
, status
); classNames
.push_back("Format");
2412 fSets
->addElement(fSpSet
, status
); classNames
.push_back("Sp");
2413 fSets
->addElement(fLowerSet
, status
); classNames
.push_back("Lower");
2414 fSets
->addElement(fUpperSet
, status
); classNames
.push_back("Upper");
2415 fSets
->addElement(fOLetterSet
, status
); classNames
.push_back("OLetter");
2416 fSets
->addElement(fNumericSet
, status
); classNames
.push_back("Numeric");
2417 fSets
->addElement(fATermSet
, status
); classNames
.push_back("ATerm");
2418 fSets
->addElement(fSContinueSet
, status
); classNames
.push_back("SContinue");
2419 fSets
->addElement(fSTermSet
, status
); classNames
.push_back("STerm");
2420 fSets
->addElement(fCloseSet
, status
); classNames
.push_back("Close");
2421 fSets
->addElement(fOtherSet
, status
); classNames
.push_back("Other");
2422 fSets
->addElement(fExtendSet
, status
); classNames
.push_back("Extend");
// A failure from any addElement() call above is also deferred.
2424 if (U_FAILURE(status
)) {
2425 deferredStatus
= status
;
// setText()  Called with the string to be tested. The visible statement passes the
//            length of s to prepareAppliedRules().
// NOTE(review): original line 2432 is not visible in this excerpt; presumably it
//               stores the address of s in fText - confirm.
2431 void RBBISentMonkey::setText(const UnicodeString
&s
) {
2433 prepareAppliedRules(s
.length());
// charClasses()  Returns a UVector pointer (per the signature); the body is not
//                visible in this excerpt.
// NOTE(review): presumably returns fSets, the vector filled by the constructor - confirm.
2436 UVector
*RBBISentMonkey::charClasses() {
2440 // moveBack() Find the "significant" code point preceding the index i.
2441 // Skips over ($Extend | $Format)* .
//   The scan also ends once index 0 is reached, whatever character is there
//   (see the j>0 test in the loop condition).
//   NOTE(review): the declarations of j and c, the loop opener and the return
//   statement are not visible in this excerpt.
2443 int RBBISentMonkey::moveBack(int i
) {
// Step back one code point and read the character now at j.
2450 j
= fText
->moveIndex32(j
, -1);
2451 c
= fText
->char32At(j
);
// Loop condition: continue while not at the start and c is Format or Extend.
2453 while (j
>0 &&(fFormatSet
->contains(c
) || fExtendSet
->contains(c
)));
// moveForward()  Forward counterpart of moveBack(): advances from index i by one
//                code point, with a loop that continues while the character is
//                Format or Extend.
//   NOTE(review): the declarations of j and c, the statement that loads c, the loop
//   opener and the final return are not visible in this excerpt.
2459 int RBBISentMonkey::moveForward(int i
) {
// An index at or beyond the end of the text maps to the text length.
2460 if (i
>=fText
->length()) {
2461 return fText
->length();
// Advance one code point.
2466 j
= fText
->moveIndex32(j
, 1);
// Loop condition: continue while c is Format or Extend.
2469 while (fFormatSet
->contains(c
) || fExtendSet
->contains(c
));
// cAt()  Range-checked character fetch: returns the code point of fText at pos
//        when pos lies inside the text.
//   NOTE(review): the value returned for an out-of-range pos is not visible in this
//   excerpt; next() compares a fetched character against -1, so presumably -1 - confirm.
2473 UChar32
RBBISentMonkey::cAt(int pos
) {
2474 if (pos
<0 || pos
>=fText
->length()) {
2477 return fText
->char32At(pos
);
2481 int32_t RBBISentMonkey::next(int32_t prevPos
) {
2482 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2483 // break position being tested. The candidate break
2484 // location is before p2.
2488 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
2491 if (U_FAILURE(deferredStatus
)) {
2495 // Prev break at end of string. return DONE.
2496 if (prevPos
>= fText
->length()) {
2499 p0
= p1
= p2
= p3
= prevPos
;
2500 c3
= fText
->char32At(prevPos
);
2502 (void)p0
; // Suppress set but not used warning.
2504 // Loop runs once per "significant" character position in the input text.
2506 // Move all of the positions forward in the input string.
2511 // Advance p3 by X(Extend | Format)* Rule 4
2512 p3
= moveForward(p3
);
2515 if (c1
==0x0d && c2
==0x0a && p2
==(p1
+1)) {
2516 setAppliedRule(p2
, "SB3 CR x LF");
2520 if (fSepSet
->contains(c1
)) {
2521 p2
= p1
+1; // Separators don't combine with Extend or Format.
2523 setAppliedRule(p2
, "SB4 Sep <break>");
2527 if (p2
>= fText
->length()) {
2528 // Reached end of string. Always a break position.
2529 setAppliedRule(p2
, "SB4 Sep <break>");
2533 if (p2
== prevPos
) {
2534 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2535 setAppliedRule(p2
, "SB4 Sep <break>");
2539 if (fATermSet
->contains(c1
) && fNumericSet
->contains(c2
)) {
2540 setAppliedRule(p2
, "SB6 ATerm x Numeric");
2544 if ((fUpperSet
->contains(c0
) || fLowerSet
->contains(c0
)) &&
2545 fATermSet
->contains(c1
) && fUpperSet
->contains(c2
)) {
2546 setAppliedRule(p2
, "SB7 (Upper | Lower) ATerm x Uppper");
2550 // Note: STerm | ATerm are added to the negated part of the expression by a
2551 // note to the Unicode 5.0 documents.
2553 while (fSpSet
->contains(cAt(p8
))) {
2556 while (fCloseSet
->contains(cAt(p8
))) {
2559 if (fATermSet
->contains(cAt(p8
))) {
2563 if (c
==-1 || fOLetterSet
->contains(c
) || fUpperSet
->contains(c
) ||
2564 fLowerSet
->contains(c
) || fSepSet
->contains(c
) ||
2565 fATermSet
->contains(c
) || fSTermSet
->contains(c
)) {
2568 "SB8 ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2571 p8
= moveForward(p8
);
2573 if (fLowerSet
->contains(cAt(p8
))) {
2576 "SB8 ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2581 if (fSContinueSet
->contains(c2
) || fSTermSet
->contains(c2
) || fATermSet
->contains(c2
)) {
2583 while (fSpSet
->contains(cAt(p8
))) {
2586 while (fCloseSet
->contains(cAt(p8
))) {
2590 if (fSTermSet
->contains(c
) || fATermSet
->contains(c
)) {
2591 setAppliedRule(p2
, "SB8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm)");
2597 while (fCloseSet
->contains(cAt(p9
))) {
2601 if ((fSTermSet
->contains(c
) || fATermSet
->contains(c
))) {
2602 if (fCloseSet
->contains(c2
) || fSpSet
->contains(c2
) || fSepSet
->contains(c2
)) {
2604 setAppliedRule(p2
, "SB9 (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)");
2610 while (fSpSet
->contains(cAt(p10
))) {
2611 p10
= moveBack(p10
);
2613 while (fCloseSet
->contains(cAt(p10
))) {
2614 p10
= moveBack(p10
);
2616 if (fSTermSet
->contains(cAt(p10
)) || fATermSet
->contains(cAt(p10
))) {
2617 if (fSpSet
->contains(c2
) || fSepSet
->contains(c2
)) {
2618 setAppliedRule(p2
, "SB10 (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)");
2624 if (fSepSet
->contains(cAt(p11
))) {
2625 p11
= moveBack(p11
);
2627 while (fSpSet
->contains(cAt(p11
))) {
2628 p11
= moveBack(p11
);
2630 while (fCloseSet
->contains(cAt(p11
))) {
2631 p11
= moveBack(p11
);
2633 if (fSTermSet
->contains(cAt(p11
)) || fATermSet
->contains(cAt(p11
))) {
2634 setAppliedRule(p2
, "SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>");
2638 setAppliedRule(p2
, "SB12 Any x Any");
// Destructor. Only the deletion of fSContinueSet is visible in this excerpt;
// NOTE(review): presumably the other sets created by the constructor are deleted
// on the lines not shown - confirm.
2646 RBBISentMonkey::~RBBISentMonkey() {
2656 delete fSContinueSet
;
2665 //-------------------------------------------------------------------------------------------
2669 //-------------------------------------------------------------------------------------------
// Line-break flavor of the monkey test; derives from RBBIMonkeyKind.
// The character-class set members (fBK, fCR, ...) used by the constructor are
// declared on lines that are not visible in this excerpt.
2671 class RBBILineMonkey
: public RBBIMonkeyKind
{
2674 virtual ~RBBILineMonkey();
// Virtual member functions; their definitions follow the class.
2675 virtual UVector
*charClasses();
2676 virtual void setText(const UnicodeString
&s
);
2677 virtual int32_t next(int32_t i
);
// Applies the LB 9 / LB 10 combining-sequence adjustments; called twice per
// candidate position from next().
2678 virtual void rule9Adjust(int32_t pos
, UChar32
*posChar
, int32_t *nextPos
, UChar32
*nextChar
);
// Character break iterator, created in the constructor and given the text in setText().
2728 BreakIterator
*fCharBI
;
// The text being examined.
2729 const UnicodeString
*fText
;
// Regular-expression matcher for number sequences; next() consults it for "LB 25 Numbers".
2730 RegexMatcher
*fNumberMatcher
;
// Constructor. Builds a UnicodeSet for each Line_Break class (several with the
// "en adjustments" noted inline), folds XX, AI and SG into AL, CJ into NS and ZWJ
// into CM, appends the sets to fSets with matching labels in classNames, and
// creates the number-sequence matcher and the character break iterator.
// A failing status is copied into deferredStatus.
2733 RBBILineMonkey::RBBILineMonkey() :
2739 fNumberMatcher(NULL
)
2742 if (U_FAILURE(deferredStatus
)) {
2746 UErrorCode status
= U_ZERO_ERROR
;
2748 fSets
= new UVector(status
);
2750 fBK
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status
);
2751 fCR
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status
);
2752 fLF
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status
);
2753 fCM
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status
);
2754 fNL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status
);
2755 fWJ
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status
);
2756 fZW
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status
);
2757 fGL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status
);
2758 fCB
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status
);
2759 fSP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status
);
2760 fB2
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status
);
2761 fBA
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status
);
2762 fBB
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status
);
// fHH starts out empty; U+2010 is added to it further down.
2763 fHH
= new UnicodeSet();
2764 fHY
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status
);
2765 fH2
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status
);
2766 fH3
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status
);
2767 fCL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=CL}] [\\u201D]]"), status
); // en adjustments for rdar://problem/51193810
2768 fCP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status
);
2769 fEX
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status
);
2770 fIN
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status
);
2771 fJL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status
);
2772 fJV
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status
);
2773 fJT
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status
);
2774 fNS
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status
);
2775 fOP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=OP}] [\\u201C\\u2018]]"), status
); // en adjustments
2776 fQU
= new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=QU}]-[\\u201C\\u2018\\u201D]]"), status
); // en adjustments
2777 fIS
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status
);
2778 fNU
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status
);
2779 fPO
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status
);
2780 fPR
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status
);
2781 fSY
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status
);
2782 fAI
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status
);
2783 fAL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status
);
2784 fCJ
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status
);
2785 fHL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status
);
2786 fID
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status
);
2787 fRI
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status
);
// SG is given as the explicit surrogate code point range rather than a property pattern.
2788 fSG
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status
);
2789 fXX
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status
);
2790 fEB
= new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]"), status
);
2791 fEM
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status
);
2792 fZWJ
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status
);
// fOP30 / fCP30: the OP (with the en adjustments) and CP sets minus characters
// whose East Asian Width is F, W or H.
2793 fOP30
= new UnicodeSet(u
"[[\\p{Line_break=OP} [\\u201C\\u2018]]-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status
); // en adjustments
2794 fCP30
= new UnicodeSet(u
"[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status
);
// Remember a pattern-construction failure for later reporting.
2796 if (U_FAILURE(status
)) {
2797 deferredStatus
= status
;
2801 fAL
->addAll(*fXX
); // Default behavior for XX is identical to AL
2802 fAL
->addAll(*fAI
); // Default behavior for AI is identical to AL
2803 fAL
->addAll(*fSG
); // Default behavior for SG is identical to AL.
2805 fNS
->addAll(*fCJ
); // Default behavior for CJ is identical to NS.
2806 fCM
->addAll(*fZWJ
); // ZWJ behaves as a CM.
2808 fHH
->add(u
'\u2010'); // Hyphen, '‐'
// Register each set together with its display name; the two lists are appended in step.
2811 fSets
->addElement(fBK
, status
); classNames
.push_back("fBK");
2812 fSets
->addElement(fCR
, status
); classNames
.push_back("fCR");
2813 fSets
->addElement(fLF
, status
); classNames
.push_back("fLF");
2814 fSets
->addElement(fCM
, status
); classNames
.push_back("fCM");
2815 fSets
->addElement(fNL
, status
); classNames
.push_back("fNL");
2816 fSets
->addElement(fWJ
, status
); classNames
.push_back("fWJ");
2817 fSets
->addElement(fZW
, status
); classNames
.push_back("fZW");
2818 fSets
->addElement(fGL
, status
); classNames
.push_back("fGL");
2819 fSets
->addElement(fCB
, status
); classNames
.push_back("fCB");
2820 fSets
->addElement(fSP
, status
); classNames
.push_back("fSP");
2821 fSets
->addElement(fB2
, status
); classNames
.push_back("fB2");
2822 fSets
->addElement(fBA
, status
); classNames
.push_back("fBA");
2823 fSets
->addElement(fBB
, status
); classNames
.push_back("fBB");
2824 fSets
->addElement(fHY
, status
); classNames
.push_back("fHY");
2825 fSets
->addElement(fH2
, status
); classNames
.push_back("fH2");
2826 fSets
->addElement(fH3
, status
); classNames
.push_back("fH3");
2827 fSets
->addElement(fCL
, status
); classNames
.push_back("fCL");
2828 fSets
->addElement(fCP
, status
); classNames
.push_back("fCP");
2829 fSets
->addElement(fEX
, status
); classNames
.push_back("fEX");
2830 fSets
->addElement(fIN
, status
); classNames
.push_back("fIN");
2831 fSets
->addElement(fJL
, status
); classNames
.push_back("fJL");
2832 fSets
->addElement(fJT
, status
); classNames
.push_back("fJT");
2833 fSets
->addElement(fJV
, status
); classNames
.push_back("fJV");
2834 fSets
->addElement(fNS
, status
); classNames
.push_back("fNS");
2835 fSets
->addElement(fOP
, status
); classNames
.push_back("fOP");
2836 fSets
->addElement(fQU
, status
); classNames
.push_back("fQU");
2837 fSets
->addElement(fIS
, status
); classNames
.push_back("fIS");
2838 fSets
->addElement(fNU
, status
); classNames
.push_back("fNU");
2839 fSets
->addElement(fPO
, status
); classNames
.push_back("fPO");
2840 fSets
->addElement(fPR
, status
); classNames
.push_back("fPR");
2841 fSets
->addElement(fSY
, status
); classNames
.push_back("fSY");
2842 fSets
->addElement(fAI
, status
); classNames
.push_back("fAI");
2843 fSets
->addElement(fAL
, status
); classNames
.push_back("fAL");
2844 fSets
->addElement(fHL
, status
); classNames
.push_back("fHL");
2845 fSets
->addElement(fID
, status
); classNames
.push_back("fID");
// NOTE(review): fWJ was already appended above (original line 2816), so it appears
// in fSets twice. Looks like a duplicate entry - confirm whether that is intended.
2846 fSets
->addElement(fWJ
, status
); classNames
.push_back("fWJ");
2847 fSets
->addElement(fRI
, status
); classNames
.push_back("fRI");
2848 fSets
->addElement(fSG
, status
); classNames
.push_back("fSG");
2849 fSets
->addElement(fEB
, status
); classNames
.push_back("fEB");
2850 fSets
->addElement(fEM
, status
); classNames
.push_back("fEM");
2851 fSets
->addElement(fZWJ
, status
); classNames
.push_back("fZWJ");
2852 // TODO: fOP30 & fCP30 overlap with plain fOP. Probably OK, but fOP/CP chars will be over-represented.
2853 fSets
->addElement(fOP30
, status
); classNames
.push_back("fOP30");
2854 fSets
->addElement(fCP30
, status
); classNames
.push_back("fCP30");
// Regular-expression source for number sequences; compiled into fNumberMatcher below.
// Each element may be followed by any number of CM characters or U+200D.
2857 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
2858 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
2859 "((\\p{Line_Break=IS})(\\p{Line_Break=CM}|\\u200d)*)?"
2860 "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
2861 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
2862 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
2863 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
2865 fNumberMatcher
= new RegexMatcher(
2866 UnicodeString(rules
, -1, US_INV
), 0, status
);
// Character (grapheme) break iterator for the English locale.
2868 fCharBI
= BreakIterator::createCharacterInstance(Locale::getEnglish(), status
);
// A failure from the registrations, the regex compile or the iterator creation is deferred too.
2870 if (U_FAILURE(status
)) {
2871 deferredStatus
= status
;
// setText()  Called with the string to be tested. Hands s to the character break
//            iterator, passes its length to prepareAppliedRules(), and resets the
//            number matcher onto s.
// NOTE(review): original line 2878 is not visible in this excerpt; presumably it
//               stores the address of s in fText - confirm.
2877 void RBBILineMonkey::setText(const UnicodeString
&s
) {
2879 fCharBI
->setText(s
);
2880 prepareAppliedRules(s
.length());
2881 fNumberMatcher
->reset(s
);
2886 // Line Break TR rules 9 and 10 implementation.
2887 // This deals with combining marks and other sequences
2888 // that must be treated as if they were something other than what they actually are.
2890 // This is factored out into a separate function because it must be applied twice for
2891 // each potential break, once to the chars before the position being checked, then
2892 // again to the text following the possible break.
// rule9Adjust()
//   pos       index of the character being adjusted (may be invalid during the
//             warm-up iteration of next(), see below).
//   posChar   in/out: the character at pos. It is tested against SP, BK, CR, LF, NL
//             and ZW to decide whether following CM characters attach to it, and
//             against CM for LB 10.
//   nextPos   in/out: index of the character after pos; read into nPos, which is
//             advanced over CM characters.
//   nextChar  out: receives the character at the final nPos.
//   NOTE(review): several statements (the early return for an invalid pos, the loop
//   opener and break, the LB 10 replacement of *posChar, and the write-back of
//   *nextPos) are not visible in this excerpt.
2894 void RBBILineMonkey::rule9Adjust(int32_t pos
, UChar32
*posChar
, int32_t *nextPos
, UChar32
*nextChar
) {
2896 // Invalid initial position. Happens during the warmup iteration of the
2897 // main loop in next().
2901 int32_t nPos
= *nextPos
;
2903 // LB 9 Keep combining sequences together.
2904 // advance over any CM class chars. Note that Line Break CM is different
2905 // from the normal Grapheme Extend property.
// Only a base character that is none of SP, BK, CR, LF, NL, ZW absorbs following CMs.
2906 if (!(fSP
->contains(*posChar
) || fBK
->contains(*posChar
) || *posChar
==0x0d ||
2907 *posChar
==0x0a ||fNL
->contains(*posChar
) || fZW
->contains(*posChar
))) {
// Look at the character at nPos; a non-CM character is tested for here, otherwise
// nPos moves forward one code point.
2909 *nextChar
= fText
->char32At(nPos
);
2910 if (!fCM
->contains(*nextChar
)) {
2913 nPos
= fText
->moveIndex32(nPos
, 1);
2918 // LB 9 Treat X CM* as if it were x.
2919 // No explicit action required.
2921 // LB 10 Treat any remaining combining mark as AL
2922 if (fCM
->contains(*posChar
)) {
2926 // Push the updated nextPos and nextChar back to our caller.
2927 // This only makes a difference if posChar got bigger by consuming a
2928 // combining sequence.
2930 *nextChar
= fText
->char32At(nPos
);
2935 int32_t RBBILineMonkey::next(int32_t startPos
) {
2936 UErrorCode status
= U_ZERO_ERROR
;
2937 int32_t pos
; // Index of the char following a potential break position
2938 UChar32 thisChar
; // Character at above position "pos"
2940 int32_t prevPos
; // Index of the char preceding a potential break position
2941 UChar32 prevChar
; // Character at above position. Note that prevChar
2942 // and thisChar may not be adjacent because combining
2943 // characters between them will be ignored.
2945 int32_t prevPosX2
; // Second previous character. Wider context for LB21a.
2948 int32_t nextPos
; // Index of the next character following pos.
2949 // Usually skips over combining marks.
2950 int32_t nextCPPos
; // Index of the code point following "pos."
2951 // May point to a combining mark.
2952 int32_t tPos
; // temp value.
2955 if (U_FAILURE(deferredStatus
)) {
2959 if (startPos
>= fText
->length()) {
2964 // Initial values for loop. Loop will run the first time without finding breaks,
2965 // while the invalid values shift out and the "this" and
2966 // "prev" positions are filled in with good values.
2967 pos
= prevPos
= prevPosX2
= -1; // Invalid value, serves as flag for initial loop iteration.
2968 thisChar
= prevChar
= prevCharX2
= 0;
2969 nextPos
= nextCPPos
= startPos
;
2972 // Loop runs once per position in the test text, until a break position
2975 prevPosX2
= prevPos
;
2976 prevCharX2
= prevChar
;
2979 prevChar
= thisChar
;
2982 thisChar
= fText
->char32At(pos
);
2984 nextCPPos
= fText
->moveIndex32(pos
, 1);
2985 nextPos
= nextCPPos
;
2988 if (pos
>= fText
->length()) {
2989 setAppliedRule(pos
, "LB2 - Break at end of text.");
2994 // We do this one out-of-order because the adjustment does not change anything
2995 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
2997 rule9Adjust(prevPos
, &prevChar
, &pos
, &thisChar
);
2998 nextCPPos
= nextPos
= fText
->moveIndex32(pos
, 1);
2999 c
= fText
->char32At(nextPos
);
3000 rule9Adjust(pos
, &thisChar
, &nextPos
, &c
);
3002 // If the loop is still warming up - if we haven't shifted the initial
3003 // -1 positions out of prevPos yet - loop back to advance the
3004 // position in the input without any further looking for breaks.
3005 if (prevPos
== -1) {
3006 setAppliedRule(pos
, "LB 9 - adjust for combining sequences.");
3011 if (fBK
->contains(prevChar
)) {
3012 setAppliedRule(pos
, "LB 4 Always break after hard line breaks");
3017 if (prevChar
== 0x0d && thisChar
== 0x0a) {
3018 setAppliedRule(pos
, "LB 5 Break after CR, LF, NL, but not inside CR LF");
3021 if (prevChar
== 0x0d ||
3024 setAppliedRule(pos
, "LB 5 Break after CR, LF, NL, but not inside CR LF");
3029 if (thisChar
== 0x0d || thisChar
== 0x0a || thisChar
== 0x85 ||
3030 fBK
->contains(thisChar
)) {
3031 setAppliedRule(pos
, "LB 6 Don't break before hard line breaks");
3036 if (fSP
->contains(thisChar
)) {
3037 setAppliedRule(pos
, "LB 7 Don't break before spaces or zero-width space.");
3041 // !!! ??? Is this the right text for the applied rule?
3042 if (fZW
->contains(thisChar
)) {
3043 setAppliedRule(pos
, "LB 7 Don't break before spaces or zero-width space.");
3049 // Scan backwards from prevChar for SP* ZW
3051 while (tPos
>0 && fSP
->contains(fText
->char32At(tPos
))) {
3052 tPos
= fText
->moveIndex32(tPos
, -1);
3054 if (fZW
->contains(fText
->char32At(tPos
))) {
3055 setAppliedRule(pos
, "LB 8 Break after zero width space");
3060 // Move this test up, before LB8a, because numbers can match a longer sequence that would
3061 // also match 8a. e.g. NU ZWJ IS PO (ZWJ acts like CM)
3062 if (fNumberMatcher
->lookingAt(prevPos
, status
)) {
3063 if (U_FAILURE(status
)) {
3064 setAppliedRule(pos
, "LB 25 Numbers");
3067 // Matched a number. But could have been just a single digit, which would
3068 // not represent a "no break here" between prevChar and thisChar
3069 int32_t numEndIdx
= fNumberMatcher
->end(status
); // idx of first char following num
3070 if (numEndIdx
> pos
) {
3071 // Number match includes at least our two chars being checked
3072 if (numEndIdx
> nextPos
) {
3073 // Number match includes additional chars. Update pos and nextPos
3074 // so that next loop iteration will continue at the end of the number,
3075 // checking for breaks between last char in number & whatever follows.
3076 pos
= nextPos
= numEndIdx
;
3078 pos
= fText
->moveIndex32(pos
, -1);
3079 thisChar
= fText
->char32At(pos
);
3080 } while (fCM
->contains(thisChar
));
3082 setAppliedRule(pos
, "LB 25 Numbers");
3088 // The monkey test's way of ignoring combining characters doesn't work
3089 // for this rule. ZJ is also a CM. Need to get the actual character
3090 // preceding "thisChar", not ignoring combining marks, possibly ZJ.
3092 int32_t prevIdx
= fText
->moveIndex32(pos
, -1);
3093 UChar32 prevC
= fText
->char32At(prevIdx
);
3094 if (fZWJ
->contains(prevC
)) {
3095 setAppliedRule(pos
, "LB 8a ZWJ x");
3101 // appliedRule: "LB 9, 10"; // Already done, at top of loop.";
3108 if (fWJ
->contains(thisChar
) || fWJ
->contains(prevChar
)) {
3109 setAppliedRule(pos
, "LB 11 Do not break before or after WORD JOINER and related characters.");
3114 if (fGL
->contains(prevChar
)) {
3115 setAppliedRule(pos
, "LB 12 GL x");
3120 if (!(fSP
->contains(prevChar
) ||
3121 fBA
->contains(prevChar
) ||
3122 fHY
->contains(prevChar
) ) && fGL
->contains(thisChar
)) {
3123 setAppliedRule(pos
, "LB 12a [^SP BA HY] x GL");
3128 if (fCL
->contains(thisChar
) ||
3129 fCP
->contains(thisChar
) ||
3130 fEX
->contains(thisChar
) ||
3131 fSY
->contains(thisChar
)) {
3132 setAppliedRule(pos
, "LB 13 Don't break before closings.");
3137 // Scan backwards, checking for this sequence.
3138 // The OP char could include combining marks, so we actually check for
3140 // Another Twist: The Rule 9 fixes may have changed a SP CM
3141 // sequence into a ID char, so before scanning back through spaces,
3142 // verify that prevChar is indeed a space. The prevChar variable
3143 // may differ from fText[prevPos]
3145 if (fSP
->contains(prevChar
)) {
3146 while (tPos
> 0 && fSP
->contains(fText
->char32At(tPos
))) {
3147 tPos
=fText
->moveIndex32(tPos
, -1);
3150 while (tPos
> 0 && fCM
->contains(fText
->char32At(tPos
))) {
3151 tPos
=fText
->moveIndex32(tPos
, -1);
3153 if (fOP
->contains(fText
->char32At(tPos
))) {
3154 setAppliedRule(pos
, "LB 14 Don't break after OP SP*");
3159 if (nextPos
< fText
->length()) {
3160 // note: UnicodeString::char32At(length) returns ffff, not distinguishable
3161 // from a legit ffff character. So test length separately.
3162 UChar32 nextChar
= fText
->char32At(nextPos
);
3163 if (fSP
->contains(prevChar
) && fIS
->contains(thisChar
) && fNU
->contains(nextChar
)) {
3164 setAppliedRule(pos
, "LB 14a Break before an IS that begins a number and follows a space");
3170 if (fIS
->contains(thisChar
)) {
3171 setAppliedRule(pos
, "LB 14b Do not break before numeric separators, even after spaces.");
3176 if (fOP
->contains(thisChar
)) {
3177 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3179 while (tPos
>0 && fSP
->contains(fText
->char32At(tPos
))) {
3180 tPos
= fText
->moveIndex32(tPos
, -1);
3182 while (tPos
>0 && fCM
->contains(fText
->char32At(tPos
))) {
3183 tPos
= fText
->moveIndex32(tPos
, -1);
3185 if (fQU
->contains(fText
->char32At(tPos
))) {
3186 setAppliedRule(pos
, "LB 15 QU SP* x OP");
3192 // Scan backwards for SP* CM* (CL | CP)
3193 if (fNS
->contains(thisChar
)) {
3195 while (tPos
>0 && fSP
->contains(fText
->char32At(tPos
))) {
3196 tPos
= fText
->moveIndex32(tPos
, -1);
3198 while (tPos
>0 && fCM
->contains(fText
->char32At(tPos
))) {
3199 tPos
= fText
->moveIndex32(tPos
, -1);
3201 if (fCL
->contains(fText
->char32At(tPos
)) || fCP
->contains(fText
->char32At(tPos
))) {
3202 setAppliedRule(pos
, "LB 16 (CL | CP) SP* x NS");
3208 if (fB2
->contains(thisChar
)) {
3209 // Scan backwards, checking for the B2 CM* SP* sequence.
3211 if (fSP
->contains(prevChar
)) {
3212 while (tPos
> 0 && fSP
->contains(fText
->char32At(tPos
))) {
3213 tPos
=fText
->moveIndex32(tPos
, -1);
3216 while (tPos
> 0 && fCM
->contains(fText
->char32At(tPos
))) {
3217 tPos
=fText
->moveIndex32(tPos
, -1);
3219 if (fB2
->contains(fText
->char32At(tPos
))) {
3220 setAppliedRule(pos
, "LB 17 B2 SP* x B2");
3226 if (fSP
->contains(prevChar
)) {
3227 setAppliedRule(pos
, "LB 18 break after space");
3233 if (fQU
->contains(thisChar
) || fQU
->contains(prevChar
)) {
3234 setAppliedRule(pos
, "LB 19");
3238 if (fCB
->contains(thisChar
) || fCB
->contains(prevChar
)) {
3239 setAppliedRule(pos
, "LB 20 Break around a CB");
3243 // Don't break between Hyphens and letters if a break precedes the hyphen.
3244 // Formerly this was a Finnish tailoring.
3245 // Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
3246 // ^($HY | $HH) $AL;
3247 if (fAL
->contains(thisChar
) && (fHY
->contains(prevChar
) || fHH
->contains(prevChar
)) &&
3249 setAppliedRule(pos
, "LB 20.09");
3253 if (fBA
->contains(thisChar
) ||
3254 fHY
->contains(thisChar
) ||
3255 fNS
->contains(thisChar
) ||
3256 fBB
->contains(prevChar
) ) {
3257 setAppliedRule(pos
, "LB 21");
3261 if (fHL
->contains(prevCharX2
) &&
3262 (fHY
->contains(prevChar
) || fBA
->contains(prevChar
))) {
3263 setAppliedRule(pos
, "LB 21a HL (HY | BA) x");
3267 if (fSY
->contains(prevChar
) && fHL
->contains(thisChar
)) {
3268 setAppliedRule(pos
, "LB 21b SY x HL");
3272 if (fIN
->contains(thisChar
)) {
3273 setAppliedRule(pos
, "LB 22");
3280 if ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
)) && fNU
->contains(thisChar
)) {
3281 setAppliedRule(pos
, "LB 23");
3284 if (fNU
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) {
3285 setAppliedRule(pos
, "LB 23");
3289 // Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3290 // PR x (ID | EB | EM)
3291 // (ID | EB | EM) x PO
3292 if (fPR
->contains(prevChar
) &&
3293 (fID
->contains(thisChar
) || fEB
->contains(thisChar
) || fEM
->contains(thisChar
))) {
3294 setAppliedRule(pos
, "LB 23a");
3297 if ((fID
->contains(prevChar
) || fEB
->contains(prevChar
) || fEM
->contains(prevChar
)) &&
3298 fPO
->contains(thisChar
)) {
3299 setAppliedRule(pos
, "LB 23a");
3303 // Do not break between prefix and letters or ideographs.
3304 // (PR | PO) x (AL | HL)
3305 // (AL | HL) x (PR | PO)
3306 if ((fPR
->contains(prevChar
) || fPO
->contains(prevChar
)) &&
3307 (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) {
3308 setAppliedRule(pos
, "LB 24 no break between prefix and letters or ideographs");
3311 if ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
)) &&
3312 (fPR
->contains(thisChar
) || fPO
->contains(thisChar
))) {
3313 setAppliedRule(pos
, "LB 24 no break between prefix and letters or ideographs");
3317 // appliedRule: "LB 25 numbers match"; // moved up, before LB 8a,
3319 if (fJL
->contains(prevChar
) && (fJL
->contains(thisChar
) ||
3320 fJV
->contains(thisChar
) ||
3321 fH2
->contains(thisChar
) ||
3322 fH3
->contains(thisChar
))) {
3323 setAppliedRule(pos
, "LB 26 Do not break a Korean syllable.");
3327 if ((fJV
->contains(prevChar
) || fH2
->contains(prevChar
)) &&
3328 (fJV
->contains(thisChar
) || fJT
->contains(thisChar
))) {
3329 setAppliedRule(pos
, "LB 26 Do not break a Korean syllable.");
3333 if ((fJT
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3334 fJT
->contains(thisChar
)) {
3335 setAppliedRule(pos
, "LB 26 Do not break a Korean syllable.");
3339 if ((fJL
->contains(prevChar
) || fJV
->contains(prevChar
) ||
3340 fJT
->contains(prevChar
) || fH2
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3341 fIN
->contains(thisChar
)) {
3342 setAppliedRule(pos
, "LB 27 Treat a Korean Syllable Block the same as ID.");
3345 if ((fJL
->contains(prevChar
) || fJV
->contains(prevChar
) ||
3346 fJT
->contains(prevChar
) || fH2
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3347 fPO
->contains(thisChar
)) {
3348 setAppliedRule(pos
, "LB 27 Treat a Korean Syllable Block the same as ID.");
3351 if (fPR
->contains(prevChar
) && (fJL
->contains(thisChar
) || fJV
->contains(thisChar
) ||
3352 fJT
->contains(thisChar
) || fH2
->contains(thisChar
) || fH3
->contains(thisChar
))) {
3353 setAppliedRule(pos
, "LB 27 Treat a Korean Syllable Block the same as ID.");
3359 if ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
)) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) {
3360 setAppliedRule(pos
, "LB 28 Do not break between alphabetics (\"at\").");
3364 if (fIS
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) {
3365 setAppliedRule(pos
, "LB 29 Do not break between numeric punctuation and alphabetics (\"e.g.\").");
3371 if ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
) || fNU
->contains(prevChar
)) && fOP30
->contains(thisChar
)) {
3372 setAppliedRule(pos
, "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3375 if (fCP30
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
) || fNU
->contains(thisChar
))) {
3376 setAppliedRule(pos
, "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3381 if (fRI
->contains(prevCharX2
) && fRI
->contains(prevChar
) && fRI
->contains(thisChar
)) {
3382 setAppliedRule(pos
, "LB30a RI RI ÷ RI");
3385 if (fRI
->contains(prevChar
) && fRI
->contains(thisChar
)) {
3386 // Two Regional Indicators have been paired.
3387 // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
3388 // following RI. This is a hack.
3390 setAppliedRule(pos
, "LB30a RI RI ÷ RI");
3394 if (fEB
->contains(prevChar
) && fEM
->contains(thisChar
)) {
3395 setAppliedRule(pos
, "LB30b Emoji Base x Emoji Modifier");
3399 setAppliedRule(pos
, "LB 31 Break everywhere else");
// Accessor that returns a UVector pointer. The body is not visible in this
// listing. NOTE(review): presumably the vector holds the UnicodeSet character
// classes (RunMonkey casts elements of charClasses() to UnicodeSet*) - confirm.
3407 UVector
*RBBILineMonkey::charClasses() {
// Destructor. The only statement visible in this listing releases
// fNumberMatcher, the matcher consulted when the "LB 25 Numbers" rule is
// applied; any other cleanup lines are not shown here.
3412 RBBILineMonkey::~RBBILineMonkey() {
3462 delete fNumberMatcher
;
3466 //-------------------------------------------------------------------------------------------
3471 // seed=nnnnn Random number starting seed.
3472 // Setting the seed allows errors to be reproduced.
3473 // loop=nnn Looping count. Controls running time.
3475 // 0 or greater: run length.
3477 // type = char | word | line | sent | title
3480 // intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3482 //-------------------------------------------------------------------------------------------
3484 static int32_t getIntParam(UnicodeString name
, UnicodeString
¶ms
, int32_t defaultVal
) {
3485 int32_t val
= defaultVal
;
3486 name
.append(" *= *(-?\\d+)");
3487 UErrorCode status
= U_ZERO_ERROR
;
3488 RegexMatcher
m(name
, params
, 0, status
);
3490 // The param exists. Convert the string to an int.
3491 char valString
[100];
3492 int32_t paramLength
= m
.end(1, status
) - m
.start(1, status
);
3493 if (paramLength
>= (int32_t)(sizeof(valString
)-1)) {
3494 paramLength
= (int32_t)(sizeof(valString
)-2);
3496 params
.extract(m
.start(1, status
), paramLength
, valString
, sizeof(valString
));
3497 val
= strtol(valString
, NULL
, 10);
3499 // Delete this parameter from the params string.
3501 params
= m
.replaceFirst("", status
);
3503 U_ASSERT(U_SUCCESS(status
));
3508 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
// Helper used by the fixed-string break tests. It drives the break iterator
// `bi` over `ustr` and compares the results with the `expected` boundary
// positions (`expectedcount` entries), reporting mismatches via test->errln().
// Four checks are visible: forward iteration with next(), isBoundary() at and
// between expected positions, backward iteration with previous(), and
// preceding().
// NOTE(review): the rest of the parameter list and several statements (local
// declarations, counters, early exits) are absent from this listing.
3509 static void testBreakBoundPreceding(RBBITest
*test
, UnicodeString ustr
,
// Forward pass: every position returned by first()/next() is compared with
// the expected position at the same index.
3518 for (i
= bi
->first(); i
!= BreakIterator::DONE
; i
= bi
->next()) {
3520 if (count
< expectedcount
&& expected
[count
] != i
) {
3521 test
->errln("%s:%d break forward test failed: expected %d but got %d",
3522 __FILE__
, __LINE__
, expected
[count
], i
);
// The number of boundaries found must equal the number expected.
3527 if (count
!= expectedcount
) {
3528 printStringBreaks(ustr
, expected
, expectedcount
);
3529 test
->errln("%s:%d break forward test failed: missed %d match",
3530 __FILE__
, __LINE__
, expectedcount
- count
);
3533 // testing boundaries
3534 for (i
= 1; i
< expectedcount
; i
++) {
3535 int j
= expected
[i
- 1];
// Each expected position must be reported as a boundary ...
3536 if (!bi
->isBoundary(j
)) {
3537 printStringBreaks(ustr
, expected
, expectedcount
);
3538 test
->errln("%s:%d isBoundary() failed. Expected boundary at position %d",
3539 __FILE__
, __LINE__
, j
);
// ... and no position strictly between two expected boundaries may be one.
3542 for (j
= expected
[i
- 1] + 1; j
< expected
[i
]; j
++) {
3543 if (bi
->isBoundary(j
)) {
3544 printStringBreaks(ustr
, expected
, expectedcount
);
3545 test
->errln("%s:%d isBoundary() failed. Not expecting boundary at position %d",
3546 __FILE__
, __LINE__
, j
);
// Backward pass: positions from last()/previous() are compared with the
// positions recorded in forward[].
3552 for (i
= bi
->last(); i
!= BreakIterator::DONE
; i
= bi
->previous()) {
3554 if (forward
[count
] != i
) {
3555 printStringBreaks(ustr
, expected
, expectedcount
);
3556 test
->errln("%s:%d happy break test previous() failed: expected %d but got %d",
3557 __FILE__
, __LINE__
, forward
[count
], i
);
3562 printStringBreaks(ustr
, expected
, expectedcount
);
3563 test
->errln("break test previous() failed: missed a match");
3567 // testing preceding
3568 for (i
= 0; i
< expectedcount
- 1; i
++) {
3569 // int j = expected[i] + 1;
// Start one code point (not one code unit) past the expected boundary.
3570 int j
= ustr
.moveIndex32(expected
[i
], 1);
// For every offset up to and including the next expected boundary,
// preceding() must return the current expected boundary.
3571 for (; j
<= expected
[i
+ 1]; j
++) {
3572 int32_t expectedPreceding
= expected
[i
];
3573 int32_t actualPreceding
= bi
->preceding(j
);
3574 if (actualPreceding
!= expectedPreceding
) {
3575 printStringBreaks(ustr
, expected
, expectedcount
);
3576 test
->errln("%s:%d preceding(%d): expected %d, got %d",
3577 __FILE__
, __LINE__
, j
, expectedPreceding
, actualPreceding
);
// Regression test for word breaking. Each escaped string in strlist is
// converted with CharsToUnicodeString(), the expected boundaries are computed
// by stepping an RBBIWordMonkey through the text, and the "en" word break
// iterator is then checked against them with testBreakBoundPreceding().
// The body is compiled only when regular expressions are enabled.
// NOTE(review): some lines of this function (for example the declarations of
// `loop`, `i` and `expected`) are absent from this listing.
3585 void RBBITest::TestWordBreaks(void)
3587 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3589 Locale
locale("en");
3590 UErrorCode status
= U_ZERO_ERROR
;
3591 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3592 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
3593 // Replaced any C+J characters in a row with a random sequence of characters
3594 // of the same length to make our C+J segmentation not get in the way.
3595 static const char *strlist
[] =
3597 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3598 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3599 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3600 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3601 "\\uac00\\u3588\\u009c\\u0953\\u194b",
3602 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3603 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3604 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3605 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3606 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3607 "\\u2027\\U000e0067\\u0a47\\u00b7",
3608 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3609 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3610 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3611 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3612 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3613 "\\u0027\\u11af\\U000e0057\\u0602",
// NOTE(review): the next entry contains "\\U000e007" with only seven hex
// digits; the entry two lines below is the same text with "\\U000e007d".
// Confirm whether the malformed escape is intentional test data.
3614 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3615 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3616 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3617 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3618 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3619 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3620 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3621 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3622 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3623 "\\u18f4\\U000e0049\\u20e7\\u2027",
3624 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3625 "\\ua183\\u102d\\u0bec\\u003a",
3626 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3627 "\\u003a\\u0e57\\u0fad\\u002e",
3628 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3629 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3630 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3631 "\\u003a\\u0664\\u00b7\\u1fba",
3632 "\\u003b\\u0027\\u00b7\\u47a3",
3633 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3634 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3635 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
// Iterator creation is only checked here, after the data table.
3638 if (U_FAILURE(status
)) {
3639 errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
));
3642 for (loop
= 0; loop
< UPRV_LENGTHOF(strlist
); loop
++) {
3643 // printf("looping %d\n", loop);
3644 UnicodeString ustr
= CharsToUnicodeString(strlist
[loop
]);
3645 // RBBICharMonkey monkey;
3646 RBBIWordMonkey monkey
;
3649 int expectedcount
= 0;
3651 monkey
.setText(ustr
);
// Collect the reference boundaries: start at 0 and follow monkey.next()
// until it reports BreakIterator::DONE.
3653 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
3654 expected
[expectedcount
++] = i
;
3657 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
// Consistency test for the "en" word break iterator. For each escaped string
// in strlist, the boundaries returned by first()/next() are recorded in
// forward[] and cross-checked with isBoundary(): every returned position must
// be a boundary, no position between two consecutive results may be one, and
// next() must always advance.
// NOTE(review): several lines (declarations of `str`, `loop`, `forward`,
// `count`, `prev`, the setText call and early exits) are absent from this
// listing.
3663 void RBBITest::TestWordBoundary(void)
3665 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3666 Locale
locale("en");
3667 UErrorCode status
= U_ZERO_ERROR
;
3668 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3669 LocalPointer
<BreakIterator
> bi(BreakIterator::createWordInstance(locale
, status
), status
);
3670 if (U_FAILURE(status
)) {
3671 errcheckln(status
, "%s:%d Creation of break iterator failed %s",
3672 __FILE__
, __LINE__
, u_errorName(status
));
3676 static const char *strlist
[] =
3678 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3679 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3680 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3681 "\\u2027\\U000e0067\\u0a47\\u00b7",
3682 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3683 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3684 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3685 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3686 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3687 "\\u0027\\u11af\\U000e0057\\u0602",
// NOTE(review): the next entry contains "\\U000e007" with only seven hex
// digits; the entry two lines below is the same text with "\\U000e007d".
// Confirm whether the malformed escape is intentional test data.
3688 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3689 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3690 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3691 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3692 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3693 "\\U000e0065\\u302c\\u09ee\\U000e0068",
3694 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3695 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3696 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3697 "\\u58f4\\U000e0049\\u20e7\\u2027",
3698 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3699 "\\ua183\\u102d\\u0bec\\u003a",
3700 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3701 "\\u003a\\u0e57\\u0fad\\u002e",
3702 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3703 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3704 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3705 "\\u003a\\u0664\\u00b7\\u1fba",
3706 "\\u003b\\u0027\\u00b7\\u47a3",
3709 for (loop
= 0; loop
< UPRV_LENGTHOF(strlist
); loop
++) {
// Expand the \u / \U escapes into the UTF-16 buffer `str`.
3710 u_unescape(strlist
[loop
], str
, UPRV_LENGTHOF(str
));
3711 UnicodeString
ustr(str
);
3717 for (int32_t boundary
= bi
->first(); boundary
!= BreakIterator::DONE
; boundary
= bi
->next()) {
// Guard the fixed-size forward[] array before storing into it.
3719 if (count
>= UPRV_LENGTHOF(forward
)) {
3720 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
3721 __FILE__
, __LINE__
, loop
, count
, boundary
);
3724 forward
[count
] = boundary
;
// Each boundary must be strictly greater than the previous one.
3725 if (boundary
<= prev
) {
3726 errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
3727 __FILE__
, __LINE__
, loop
, prev
, boundary
);
// No offset strictly between prev and boundary may be a boundary.
3730 for (int32_t nonBoundary
= prev
+ 1; nonBoundary
< boundary
; nonBoundary
++) {
3731 if (bi
->isBoundary(nonBoundary
)) {
3732 printStringBreaks(ustr
, forward
, count
);
3733 errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
3734 __FILE__
, __LINE__
, loop
, prev
, nonBoundary
, boundary
);
// The position returned by next() must itself test as a boundary.
3738 if (!bi
->isBoundary(boundary
)) {
3739 printStringBreaks(ustr
, forward
, count
);
3740 errln("%s:%d happy boundary test failed: expected %d a boundary",
3741 __FILE__
, __LINE__
, boundary
);
// Regression test for line breaking. Each escaped string in strlist is
// unescaped into a buffer of STRSIZE code units, the expected boundaries are
// computed by stepping an RBBILineMonkey through the text, and the "en" line
// break iterator is checked against them with testBreakBoundPreceding().
// The body is compiled only when regular expressions are enabled.
// NOTE(review): some lines (declarations of `str`, `loop`, `i`, the handling
// of `t`, and early exits) are absent from this listing.
3749 void RBBITest::TestLineBreaks(void)
3751 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3752 Locale
locale("en");
3753 UErrorCode status
= U_ZERO_ERROR
;
3754 BreakIterator
*bi
= BreakIterator::createLineInstance(locale
, status
);
3755 const int32_t STRSIZE
= 50;
3757 static const char *strlist
[] =
3759 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3760 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3761 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3762 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3763 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3764 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3765 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3766 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3767 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3768 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3769 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3770 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3771 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3772 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3773 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3774 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3775 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3776 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3777 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3778 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3779 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3780 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3781 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3782 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3783 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3784 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3785 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3786 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3787 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3788 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3789 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3790 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3791 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3792 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3793 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3794 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3795 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3796 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3797 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3798 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
// Report (via the TEST_ASSERT_SUCCESS macro) a failure to create the iterator.
3801 TEST_ASSERT_SUCCESS(status
);
3802 if (U_FAILURE(status
)) {
3805 for (loop
= 0; loop
< UPRV_LENGTHOF(strlist
); loop
++) {
3806 // printf("looping %d\n", loop);
// u_unescape's return value is kept in `t`; how it is used is not visible
// in this listing.
3807 int32_t t
= u_unescape(strlist
[loop
], str
, STRSIZE
);
3814 UnicodeString
ustr(str
);
3815 RBBILineMonkey monkey
;
// The monkey records construction errors in deferredStatus.
3816 if (U_FAILURE(monkey
.deferredStatus
)) {
3820 const int EXPECTEDSIZE
= 50;
3821 int expected
[EXPECTEDSIZE
];
3822 int expectedcount
= 0;
3824 monkey
.setText(ustr
);
// Collect the reference boundaries from the monkey, refusing to write past
// the end of expected[].
3827 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
3828 if (expectedcount
>= EXPECTEDSIZE
) {
3829 TEST_ASSERT(expectedcount
< EXPECTEDSIZE
);
3832 expected
[expectedcount
++] = i
;
3835 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
// Regression test for sentence breaking. Each string in strlist (plain text
// for the first entries, \u / \U escapes for the later ones) is passed
// through u_unescape(), the expected boundaries are computed by stepping an
// RBBISentMonkey through the text, and the "en" sentence break iterator is
// checked against them with testBreakBoundPreceding().
// The body is compiled only when regular expressions are enabled.
// NOTE(review): some lines (declarations of `str`, `loop`, `i`, and early
// exits) are absent from this listing.
3841 void RBBITest::TestSentBreaks(void)
3843 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3844 Locale
locale("en");
3845 UErrorCode status
= U_ZERO_ERROR
;
3846 BreakIterator
*bi
= BreakIterator::createSentenceInstance(locale
, status
);
3848 static const char *strlist
[] =
3850 "Now\ris\nthe\r\ntime\n\rfor\r\r",
3852 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3853 "\"Sentence ending with a quote.\" Bye.",
3854 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
3855 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3856 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3857 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3858 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3859 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3860 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3861 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3862 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3863 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3864 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3865 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3866 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3867 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3868 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3869 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3872 if (U_FAILURE(status
)) {
3873 errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
));
3876 for (loop
= 0; loop
< UPRV_LENGTHOF(strlist
); loop
++) {
3877 u_unescape(strlist
[loop
], str
, UPRV_LENGTHOF(str
));
3878 UnicodeString
ustr(str
);
3880 RBBISentMonkey monkey
;
// The monkey records construction errors in deferredStatus.
3881 if (U_FAILURE(monkey
.deferredStatus
)) {
3885 const int EXPECTEDSIZE
= 50;
3886 int expected
[EXPECTEDSIZE
];
3887 int expectedcount
= 0;
3889 monkey
.setText(ustr
);
// Collect the reference boundaries from the monkey, refusing to write past
// the end of expected[].
3892 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
3893 if (expectedcount
>= EXPECTEDSIZE
) {
3894 TEST_ASSERT(expectedcount
< EXPECTEDSIZE
);
3897 expected
[expectedcount
++] = i
;
3900 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
// Entry point for the randomized ("monkey") break iterator tests.
//
// Options are read from fTestParams: "loop=<n>" and "seed=<n>" via
// getIntParam(), "type=<char|word|line|sent|title>" and "utext" via regular
// expressions. Each recognized option is removed from the working copy `p`;
// any non-whitespace text left over is reported as an unrecognized parameter.
// Depending on breakType (default "all"), a character, word, line and/or
// sentence break iterator for locale "en" is created and passed to
// RunMonkey(). The body is compiled only when regular expressions are enabled.
//
// NOTE(review): several lines are absent from this listing (the declaration
// of `seed`, the bodies of some conditionals, the per-type monkey objects and
// any iterator cleanup). The regex accepts type "title" but no "title" branch
// is visible here.
3906 void RBBITest::TestMonkey() {
3907 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3909 UErrorCode status
= U_ZERO_ERROR
;
3910 int32_t loopCount
= 500;
3912 UnicodeString breakType
= "all";
3913 Locale
locale("en");
3914 UBool useUText
= FALSE
;
3916 if (quick
== FALSE
) {
3921 UnicodeString
p(fTestParams
);
// getIntParam strips the option it finds from p and otherwise returns the
// current value unchanged.
3922 loopCount
= getIntParam("loop", p
, loopCount
);
3923 seed
= getIntParam("seed", p
, seed
);
3925 RegexMatcher
m(" *type *= *(char|word|line|sent|title) *", p
, 0, status
);
3927 breakType
= m
.group(1, status
);
3929 p
= m
.replaceFirst("", status
);
3932 RegexMatcher
u(" *utext", p
, 0, status
);
3936 p
= u
.replaceFirst("", status
);
3941 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p
, 0, status
).find()) {
3942 // Each option is stripped out of the option string as it is processed.
3943 // All options have been checked. The option string should have been completely emptied..
3945 p
.extract(buf
, sizeof(buf
), NULL
, status
);
// Force termination in case the leftover text filled the buffer.
3946 buf
[sizeof(buf
)-1] = 0;
3947 errln("Unrecognized or extra parameter: %s\n", buf
);
// NOTE(review): in the branches below `m` is passed where RunMonkey expects an
// RBBIMonkeyKind&; presumably a per-type monkey object is declared on a line
// missing from this listing - confirm.
3953 if (breakType
== "char" || breakType
== "all") {
3955 BreakIterator
*bi
= BreakIterator::createCharacterInstance(locale
, status
);
3956 if (U_SUCCESS(status
)) {
3957 RunMonkey(bi
, m
, "char", seed
, loopCount
, useUText
);
3958 if (breakType
== "all" && useUText
==FALSE
) {
3959 // Also run a quick test with UText when "all" is specified
3960 RunMonkey(bi
, m
, "char", seed
, loopCount
, TRUE
);
3964 errcheckln(status
, "Creation of character break iterator failed %s", u_errorName(status
));
3969 if (breakType
== "word" || breakType
== "all") {
3970 logln("Word Break Monkey Test");
3972 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
3973 if (U_SUCCESS(status
)) {
3974 RunMonkey(bi
, m
, "word", seed
, loopCount
, useUText
);
3977 errcheckln(status
, "Creation of word break iterator failed %s", u_errorName(status
));
3982 if (breakType
== "line" || breakType
== "all") {
3983 logln("Line Break Monkey Test");
3985 BreakIterator
*bi
= BreakIterator::createLineInstance(locale
, status
);
// loopCount itself is modified here, so the reduced value carries over to any
// later branch in the same call (as far as the visible lines show).
3986 if (loopCount
>= 10) {
3987 loopCount
= loopCount
/ 5; // Line break runs slower than the others.
3989 if (U_SUCCESS(status
)) {
3990 RunMonkey(bi
, m
, "line", seed
, loopCount
, useUText
);
3993 errcheckln(status
, "Creation of line break iterator failed %s", u_errorName(status
));
3998 if (breakType
== "sent" || breakType
== "all" ) {
3999 logln("Sentence Break Monkey Test");
4001 BreakIterator
*bi
= BreakIterator::createSentenceInstance(locale
, status
);
// When breakType is "all" this division is applied to the value already
// reduced in the line-break branch above.
4002 if (loopCount
>= 10) {
4003 loopCount
= loopCount
/ 10; // Sentence runs slower than the other break types
4005 if (U_SUCCESS(status
)) {
4006 RunMonkey(bi
, m
, "sent", seed
, loopCount
, useUText
);
// NOTE(review): this message says "line break iterator" although this branch
// creates a sentence break iterator - looks like a copy/paste slip.
4009 errcheckln(status
, "Creation of line break iterator failed %s", u_errorName(status
));
4018 // Run a RBBI monkey test. Common routine, for all break iterator types.
4020 // bi - the break iterator to use
4021 // mk - MonkeyKind, abstraction for obtaining expected results
4022 // name - Name of test (char, word, etc.) for use in error messages
4023 // seed - Seed for starting random number generator (parameter from user)
4026 void RBBITest::RunMonkey(BreakIterator
*bi
, RBBIMonkeyKind
&mk
, const char *name
, uint32_t seed
,
4027 int32_t numIterations
, UBool useUText
) {
4029 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4031 const int32_t TESTSTRINGLEN
= 500;
4032 UnicodeString testText
;
4033 int32_t numCharClasses
;
4035 int expectedCount
= 0;
4036 char expectedBreaks
[TESTSTRINGLEN
*2 + 1];
4037 char forwardBreaks
[TESTSTRINGLEN
*2 + 1];
4038 char reverseBreaks
[TESTSTRINGLEN
*2+1];
4039 char isBoundaryBreaks
[TESTSTRINGLEN
*2+1];
4040 char followingBreaks
[TESTSTRINGLEN
*2+1];
4041 char precedingBreaks
[TESTSTRINGLEN
*2+1];
4048 numCharClasses
= mk
.charClasses()->size();
4049 chClasses
= mk
.charClasses();
4051 // Check for errors that occurred during the construction of the MonkeyKind object.
4052 // Can't report them where they occurred because errln() is a method coming from intlTest,
4053 // and is not visible outside of RBBITest :-(
4054 if (U_FAILURE(mk
.deferredStatus
)) {
4055 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk
.deferredStatus
));
4059 // Verify that the character classes all have at least one member.
4060 for (i
=0; i
<numCharClasses
; i
++) {
4061 UnicodeSet
*s
= (UnicodeSet
*)chClasses
->elementAt(i
);
4062 if (s
== NULL
|| s
->size() == 0) {
4063 errln("Character Class #%d is null or of zero size.", i
);
4068 // For minimizing width of class name output.
4069 int classNameSize
= mk
.maxClassNameSize();
4071 while (loopCount
< numIterations
|| numIterations
== -1) {
4072 if (numIterations
== -1 && loopCount
% 10 == 0) {
4073 // If test is running in an infinite loop, display a periodic tic so
4074 // we can tell that it is making progress.
4075 fprintf(stderr
, ".");
4077 // Save current random number seed, so that we can recreate the random numbers
4078 // for this loop iteration in event of an error.
4081 // Populate a test string with data.
4082 testText
.truncate(0);
4083 for (i
=0; i
<TESTSTRINGLEN
; i
++) {
4084 int32_t aClassNum
= m_rand() % numCharClasses
;
4085 UnicodeSet
*classSet
= (UnicodeSet
*)chClasses
->elementAt(aClassNum
);
4086 int32_t charIdx
= m_rand() % classSet
->size();
4087 UChar32 c
= classSet
->charAt(charIdx
);
4088 if (c
< 0) { // TODO: deal with sets containing strings.
4089 errln("%s:%d c < 0", __FILE__
, __LINE__
);
4092 // Do not assemble a supplementary character from randomly generated separate surrogates.
4093 // (It could be a dictionary character)
4094 if (U16_IS_TRAIL(c
) && testText
.length() > 0 && U16_IS_LEAD(testText
.charAt(testText
.length()-1))) {
4101 // Calculate the expected results for this test string and reset applied rules.
4102 mk
.setText(testText
);
4104 memset(expectedBreaks
, 0, sizeof(expectedBreaks
));
4105 expectedBreaks
[0] = 1;
4106 int32_t breakPos
= 0;
4109 breakPos
= mk
.next(breakPos
);
4110 if (breakPos
== -1) {
4113 if (breakPos
> testText
.length()) {
4114 errln("breakPos > testText.length()");
4116 expectedBreaks
[breakPos
] = 1;
4117 U_ASSERT(expectedCount
<testText
.length());
4120 // Find the break positions using forward iteration
4121 memset(forwardBreaks
, 0, sizeof(forwardBreaks
));
4123 UErrorCode status
= U_ZERO_ERROR
;
4124 UText
*testUText
= utext_openReplaceable(NULL
, &testText
, &status
);
4125 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4126 bi
->setText(testUText
, status
);
4127 TEST_ASSERT_SUCCESS(status
);
4128 utext_close(testUText
); // The break iterator does a shallow clone of the UText
4129 // This UText can be closed immediately, so long as the
4130 // testText string continues to exist.
4132 bi
->setText(testText
);
4135 for (i
=bi
->first(); i
!= BreakIterator::DONE
; i
=bi
->next()) {
4136 if (i
< 0 || i
> testText
.length()) {
4137 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name
);
4140 forwardBreaks
[i
] = 1;
4143 // Find the break positions using reverse iteration
4144 memset(reverseBreaks
, 0, sizeof(reverseBreaks
));
4145 for (i
=bi
->last(); i
!= BreakIterator::DONE
; i
=bi
->previous()) {
4146 if (i
< 0 || i
> testText
.length()) {
4147 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name
);
4150 reverseBreaks
[i
] = 1;
4153 // Find the break positions using isBoundary() tests.
4154 memset(isBoundaryBreaks
, 0, sizeof(isBoundaryBreaks
));
4155 U_ASSERT((int32_t)sizeof(isBoundaryBreaks
) > testText
.length());
4156 for (i
=0; i
<=testText
.length(); i
++) {
4157 isBoundaryBreaks
[i
] = bi
->isBoundary(i
);
4161 // Find the break positions using the following() function.
4163 memset(followingBreaks
, 0, sizeof(followingBreaks
));
4164 int32_t lastBreakPos
= 0;
4165 followingBreaks
[0] = 1;
4166 for (i
=0; i
<testText
.length(); i
++) {
4167 breakPos
= bi
->following(i
);
4168 if (breakPos
<= i
||
4169 breakPos
< lastBreakPos
||
4170 breakPos
> testText
.length() ||
4171 (breakPos
> lastBreakPos
&& lastBreakPos
> i
)) {
4172 errln("%s break monkey test: "
4173 "Out of range value returned by BreakIterator::following().\n"
4174 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4175 name
, seed
, i
, breakPos
, lastBreakPos
);
4178 followingBreaks
[breakPos
] = 1;
4179 lastBreakPos
= breakPos
;
4182 // Find the break positions using the preceding() function.
4183 memset(precedingBreaks
, 0, sizeof(precedingBreaks
));
4184 lastBreakPos
= testText
.length();
4185 precedingBreaks
[testText
.length()] = 1;
4186 for (i
=testText
.length(); i
>0; i
--) {
4187 breakPos
= bi
->preceding(i
);
4188 if (breakPos
>= i
||
4189 breakPos
> lastBreakPos
||
4190 (breakPos
< 0 && testText
.getChar32Start(i
)>0) ||
4191 (breakPos
< lastBreakPos
&& lastBreakPos
< testText
.getChar32Start(i
)) ) {
4192 errln("%s break monkey test: "
4193 "Out of range value returned by BreakIterator::preceding().\n"
4194 "index=%d; prev returned %d; lastBreak=%d" ,
4195 name
, i
, breakPos
, lastBreakPos
);
4196 if (breakPos
>= 0 && breakPos
< (int32_t)sizeof(precedingBreaks
)) {
4197 precedingBreaks
[i
] = 2; // Forces an error.
4200 if (breakPos
>= 0) {
4201 precedingBreaks
[breakPos
] = 1;
4203 lastBreakPos
= breakPos
;
4207 // Compare the expected and actual results.
4208 for (i
=0; i
<=testText
.length(); i
++) {
4209 const char *errorType
= NULL
;
4210 const char* currentBreakData
= NULL
;
4211 if (forwardBreaks
[i
] != expectedBreaks
[i
]) {
4212 errorType
= "next()";
4213 currentBreakData
= forwardBreaks
;
4214 } else if (reverseBreaks
[i
] != forwardBreaks
[i
]) {
4215 errorType
= "previous()";
4216 currentBreakData
= reverseBreaks
;
4217 } else if (isBoundaryBreaks
[i
] != expectedBreaks
[i
]) {
4218 errorType
= "isBoundary()";
4219 currentBreakData
= isBoundaryBreaks
;
4220 } else if (followingBreaks
[i
] != expectedBreaks
[i
]) {
4221 errorType
= "following()";
4222 currentBreakData
= followingBreaks
;
4223 } else if (precedingBreaks
[i
] != expectedBreaks
[i
]) {
4224 errorType
= "preceding()";
4225 currentBreakData
= precedingBreaks
;
4228 if (errorType
!= NULL
) {
4229 // Format a range of the test text that includes the failure as
4230 // a data item that can be included in the rbbi test data file.
4232 // Start of the range is the last point where expected and actual results
4233 // both agreed that there was a break position.
4235 int startContext
= i
;
4238 if (startContext
==0) { break; }
4240 if (expectedBreaks
[startContext
] != 0) {
4241 if (count
== 2) break;
4246 // End of range is two expected breaks past the start position.
4247 int endContext
= i
+ 1;
4249 for (ci
=0; ci
<2; ci
++) { // Number of items to include in error text.
4251 if (endContext
>= testText
.length()) {break;}
4252 if (expectedBreaks
[endContext
-1] != 0) {
4253 if (count
== 0) break;
4260 // Formatting of each line includes:
4262 // reference break: '|' -> a break, '.' -> no break
4263 // actual break: '|' -> a break, '.' -> no break
4264 // (name of character clase)
4265 // Unicode name of character
4266 // '-->' indicates location of the difference.
4269 (expectedBreaks
[i
] ? "Break expected but not found" :
4270 "Break found but not expected"),
4273 for (ci
=startContext
; (ci
= testText
.moveIndex32(ci
, 1));) {
4275 c
= testText
.char32At(ci
);
4277 std::string currentLineFlag
= " ";
4279 currentLineFlag
= "-->"; // Error position
4282 // BMP or SMP character in hex
4283 char hexCodePoint
[12];
4284 std::string format
= " \\u%04x";
4288 sprintf(hexCodePoint
, format
.c_str(), c
);
4290 // Get the class name and character name for the character.
4292 UErrorCode status
= U_ZERO_ERROR
;
4293 u_charName(c
, U_EXTENDED_CHAR_NAME
, cName
, sizeof(cName
), &status
);
4296 snprintf(buffer
, 200,
4297 "%4s %3i : %1s %1s %10s %-*s %-40s %-40s",
4298 currentLineFlag
.c_str(),
4300 expectedBreaks
[ci
] == 0 ? "." : "|", // Reference break
4301 currentBreakData
[ci
] == 0 ? "." : "|", // Actual break
4304 mk
.classNameFromCodepoint(c
).c_str(),
4305 mk
.getAppliedRule(ci
).c_str(), cName
);
4314 if (ci
>= endContext
) { break; }
4326 // Bug 5532. UTF-8 based UText fails in dictionary code.
4327 // This test checks the initial patch,
4328 // which is to just keep it from crashing. Correct word boundaries
4329 // await a proper fix to the dictionary code.
4331 void RBBITest::TestBug5532(void) {
4332 // Text includes a mixture of Thai and Latin.
4333 const unsigned char utf8Data
[] = {
4334 0xE0u
, 0xB8u
, 0x82u
, 0xE0u
, 0xB8u
, 0xB2u
, 0xE0u
, 0xB8u
, 0xA2u
, 0xE0u
,
4335 0xB9u
, 0x80u
, 0xE0u
, 0xB8u
, 0x84u
, 0xE0u
, 0xB8u
, 0xA3u
, 0xE0u
, 0xB8u
,
4336 0xB7u
, 0xE0u
, 0xB9u
, 0x88u
, 0xE0u
, 0xB8u
, 0xADu
, 0xE0u
, 0xB8u
, 0x87u
,
4337 0xE0u
, 0xB9u
, 0x80u
, 0xE0u
, 0xB8u
, 0xA5u
, 0xE0u
, 0xB9u
, 0x88u
, 0xE0u
,
4338 0xB8u
, 0x99u
, 0xE0u
, 0xB8u
, 0x8Bu
, 0xE0u
, 0xB8u
, 0xB5u
, 0xE0u
, 0xB8u
,
4339 0x94u
, 0xE0u
, 0xB8u
, 0xB5u
, 0x20u
, 0x73u
, 0x69u
, 0x6Du
, 0x20u
, 0x61u
,
4340 0x75u
, 0x64u
, 0x69u
, 0x6Fu
, 0x2Fu
, 0x20u
, 0x4Du
, 0x4Fu
, 0x4Fu
, 0x4Eu
,
4341 0x20u
, 0x65u
, 0x63u
, 0x6Cu
, 0x69u
, 0x70u
, 0x73u
, 0x65u
, 0x20u
, 0xE0u
,
4342 0xB8u
, 0xA3u
, 0xE0u
, 0xB8u
, 0xB2u
, 0xE0u
, 0xB8u
, 0x84u
, 0xE0u
, 0xB8u
,
4343 0xB2u
, 0x20u
, 0x34u
, 0x37u
, 0x30u
, 0x30u
, 0x20u
, 0xE0u
, 0xB8u
, 0xA2u
,
4344 0xE0u
, 0xB8u
, 0xB9u
, 0xE0u
, 0xB9u
, 0x82u
, 0xE0u
, 0xB8u
, 0xA3u
, 0x00};
4346 UErrorCode status
= U_ZERO_ERROR
;
4347 UText utext
=UTEXT_INITIALIZER
;
4348 utext_openUTF8(&utext
, (const char *)utf8Data
, -1, &status
);
4349 TEST_ASSERT_SUCCESS(status
);
4351 BreakIterator
*bi
= BreakIterator::createWordInstance(Locale("th"), status
);
4352 TEST_ASSERT_SUCCESS(status
);
4353 if (U_SUCCESS(status
)) {
4354 bi
->setText(&utext
, status
);
4355 TEST_ASSERT_SUCCESS(status
);
4357 int32_t breakCount
= 0;
4358 int32_t previousBreak
= -1;
4359 for (bi
->first(); bi
->next() != BreakIterator::DONE
; breakCount
++) {
4360 // For now, just make sure that the break iterator doesn't hang.
4361 TEST_ASSERT(previousBreak
< bi
->current());
4362 previousBreak
= bi
->current();
4364 TEST_ASSERT(breakCount
> 0);
4367 utext_close(&utext
);
4371 void RBBITest::TestBug9983(void) {
4372 UnicodeString text
= UnicodeString("\\u002A" // * Other
4374 "\\u309C" // Katakana
4378 "\\u0000").unescape();
4380 UErrorCode status
= U_ZERO_ERROR
;
4381 LocalPointer
<RuleBasedBreakIterator
> brkiter(static_cast<RuleBasedBreakIterator
*>(
4382 BreakIterator::createWordInstance(Locale::getRoot(), status
)));
4383 TEST_ASSERT_SUCCESS(status
);
4384 LocalPointer
<RuleBasedBreakIterator
> brkiterPOSIX(static_cast<RuleBasedBreakIterator
*>(
4385 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status
)));
4386 TEST_ASSERT_SUCCESS(status
);
4387 if (U_FAILURE(status
)) {
4390 int32_t offset
, rstatus
, iterationCount
;
4392 brkiter
->setText(text
);
4395 while ( (offset
= brkiter
->previous()) != UBRK_DONE
) {
4397 rstatus
= brkiter
->getRuleStatus();
4398 (void)rstatus
; // Suppress set but not used warning.
4399 if (iterationCount
>= 10) {
4403 TEST_ASSERT(iterationCount
== 6);
4405 brkiterPOSIX
->setText(text
);
4406 brkiterPOSIX
->last();
4408 while ( (offset
= brkiterPOSIX
->previous()) != UBRK_DONE
) {
4410 rstatus
= brkiterPOSIX
->getRuleStatus();
4411 (void)rstatus
; // Suppress set but not used warning.
4412 if (iterationCount
>= 10) {
4416 TEST_ASSERT(iterationCount
== 6);
// Bug 7547 - verify that building a break iterator from empty rules produces an error.
4421 void RBBITest::TestBug7547() {
4422 UnicodeString rules
;
4423 UErrorCode status
= U_ZERO_ERROR
;
4424 UParseError parseError
;
4425 RuleBasedBreakIterator
breakIterator(rules
, parseError
, status
);
4426 if (status
!= U_BRK_RULE_SYNTAX
) {
4427 errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__
, __LINE__
, u_errorName(status
));
4429 if (parseError
.line
!= 1 || parseError
.offset
!= 0) {
4430 errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError
.line
, parseError
.offset
);
4435 void RBBITest::TestBug12797() {
4436 UnicodeString rules
= "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4437 UErrorCode status
= U_ZERO_ERROR
;
4438 UParseError parseError
;
4439 RuleBasedBreakIterator
bi(rules
, parseError
, status
);
4440 if (U_FAILURE(status
)) {
4441 errln("%s:%s status = %s", __FILE__
, __LINE__
, u_errorName(status
));
4444 UnicodeString text
= "abc";
4447 int32_t boundary
= bi
.next();
4448 if (boundary
!= 3) {
4449 errln("%s:%d expected boundary==3, got %d", __FILE__
, __LINE__
, boundary
);
4453 void RBBITest::TestBug12918() {
4454 // This test triggers an assertion failure in dictbe.cpp
4455 const UChar
*crasherString
= u
"\u3325\u4a16";
4456 UErrorCode status
= U_ZERO_ERROR
;
4457 UBreakIterator
* iter
= ubrk_open(UBRK_WORD
, NULL
, crasherString
, -1, &status
);
4458 if (U_FAILURE(status
)) {
4459 dataerrln("%s:%d status = %s", __FILE__
, __LINE__
, u_errorName(status
));
4464 int32_t lastPos
= -1;
4465 while((pos
= ubrk_next(iter
)) != UBRK_DONE
) {
4466 if (pos
<= lastPos
) {
4467 errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__
, __LINE__
, pos
, lastPos
);
4474 void RBBITest::TestBug12932() {
4475 // Node Stack overflow in the RBBI rule parser caused a seg fault.
4476 UnicodeString
ruleStr(
4477 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4478 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4479 "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4480 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4481 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4482 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4484 UErrorCode status
= U_ZERO_ERROR
;
4485 UParseError parseError
;
4486 RuleBasedBreakIterator
rbbi(ruleStr
, parseError
, status
);
4487 if (status
!= U_BRK_RULE_SYNTAX
) {
4488 errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4489 __FILE__
, __LINE__
, u_errorName(status
));
// Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
// remain undivided by ICU char, word and line break.
4496 void RBBITest::TestEmoji() {
4497 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4498 UErrorCode status
= U_ZERO_ERROR
;
4500 CharString testFileName
;
4501 testFileName
.append(IntlTest::getSourceTestData(status
), status
);
4502 testFileName
.appendPathPart("emoji-test.txt", status
);
4503 if (U_FAILURE(status
)) {
4504 errln("%s:%s %s while opening emoji-test.txt", __FILE__
, __LINE__
, u_errorName(status
));
4507 logln("Opening data file %s\n", testFileName
.data());
4510 UChar
*testFile
= ReadAndConvertFile(testFileName
.data(), len
, "UTF-8", status
);
4511 if (U_FAILURE(status
) || testFile
== NULL
) {
4512 errln("%s:%s %s while opening emoji-test.txt", __FILE__
, __LINE__
, u_errorName(status
));
4515 UnicodeString
testFileAsString(testFile
, len
);
4518 RegexMatcher
lineMatcher(u
"^.*?$", testFileAsString
, UREGEX_MULTILINE
, status
);
4519 RegexMatcher
hexMatcher(u
"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE
, status
);
4520 // hexMatcher group(1) is a hex number, or empty string if no hex number present.
4521 int32_t lineNumber
= 0;
4523 LocalPointer
<BreakIterator
> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status
), status
);
4524 LocalPointer
<BreakIterator
> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status
), status
);
4525 LocalPointer
<BreakIterator
> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status
), status
);
4526 if (U_FAILURE(status
)) {
4527 dataerrln("%s:%d %s while opening break iterators", __FILE__
, __LINE__
, u_errorName(status
));
4531 while (lineMatcher
.find()) {
4533 UnicodeString line
= lineMatcher
.group(status
);
4534 hexMatcher
.reset(line
);
4535 UnicodeString testString
; // accumulates the emoji sequence.
4536 while (hexMatcher
.find() && hexMatcher
.group(1, status
).length() > 0) {
4537 UnicodeString hex
= hexMatcher
.group(1, status
);
4538 if (hex
.length() > 8) {
4539 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__
, __LINE__
, lineNumber
, CStr(hex
)());
4543 hex8
.appendInvariantChars(hex
, status
);
4544 UChar32 c
= (UChar32
)strtol(hex8
.data(), NULL
, 16);
4546 testString
.append(c
);
4548 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4549 __FILE__
, __LINE__
, lineNumber
, hex8
.data());
4554 if (testString
.length() > 1) {
4555 charBreaks
->setText(testString
);
4556 charBreaks
->first();
4557 int32_t firstBreak
= charBreaks
->next();
4558 if (testString
.length() != firstBreak
) {
4559 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4560 __FILE__
, __LINE__
, lineNumber
, firstBreak
);
4562 wordBreaks
->setText(testString
);
4563 wordBreaks
->first();
4564 firstBreak
= wordBreaks
->next();
4565 if (testString
.length() != firstBreak
) {
4566 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4567 __FILE__
, __LINE__
, lineNumber
, firstBreak
);
4569 lineBreaks
->setText(testString
);
4570 lineBreaks
->first();
4571 firstBreak
= lineBreaks
->next();
4572 if (testString
.length() != firstBreak
) {
4573 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4574 __FILE__
, __LINE__
, lineNumber
, firstBreak
);
4582 // TestBug12519 - Correct handling of Locales by assignment / copy / clone
4584 void RBBITest::TestBug12519() {
4585 UErrorCode status
= U_ZERO_ERROR
;
4586 LocalPointer
<RuleBasedBreakIterator
> biEn((RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getEnglish(), status
));
4587 LocalPointer
<RuleBasedBreakIterator
> biFr((RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getFrance(), status
));
4588 if (!assertSuccess(WHERE
, status
)) {
4589 dataerrln("%s %d status = %s", __FILE__
, __LINE__
, u_errorName(status
));
4592 assertTrue(WHERE
, Locale::getEnglish() == biEn
->getLocale(ULOC_VALID_LOCALE
, status
));
4594 assertTrue(WHERE
, Locale::getFrench() == biFr
->getLocale(ULOC_VALID_LOCALE
, status
));
4595 assertTrue(WHERE
"Locales do not participate in BreakIterator equality.", *biEn
== *biFr
);
4597 LocalPointer
<RuleBasedBreakIterator
>cloneEn(biEn
->clone());
4598 assertTrue(WHERE
, *biEn
== *cloneEn
);
4599 assertTrue(WHERE
, Locale::getEnglish() == cloneEn
->getLocale(ULOC_VALID_LOCALE
, status
));
4601 LocalPointer
<RuleBasedBreakIterator
>cloneFr(biFr
->clone());
4602 assertTrue(WHERE
, *biFr
== *cloneFr
);
4603 assertTrue(WHERE
, Locale::getFrench() == cloneFr
->getLocale(ULOC_VALID_LOCALE
, status
));
4605 LocalPointer
<RuleBasedBreakIterator
>biDe((RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale::getGerman(), status
));
4606 UnicodeString
text("Hallo Welt");
4607 biDe
->setText(text
);
4608 assertTrue(WHERE
"before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr
!= *biDe
);
4610 assertTrue(WHERE
"after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr
== *biDe
);
4613 void RBBITest::TestBug12677() {
4614 // Check that stripping of comments from rules for getRules() is not confused by
4615 // the presence of '#' characters in the rules that do not introduce comments.
4616 UnicodeString
rules(u
"!!forward; \n"
4617 "$x = [ab#]; # a set with a # literal. \n"
4618 " # .; # a comment that looks sort of like a rule. \n"
4619 " '#' '?'; # a rule with a quoted # \n"
4622 UErrorCode status
= U_ZERO_ERROR
;
4624 RuleBasedBreakIterator
bi(rules
, pe
, status
);
4625 assertSuccess(WHERE
, status
);
4626 UnicodeString rtRules
= bi
.getRules();
4627 assertEquals(WHERE
, UnicodeString(u
"!!forward; $x = [ab#]; '#' '?'; "), rtRules
);
4631 void RBBITest::TestTableRedundancies() {
4632 UErrorCode status
= U_ZERO_ERROR
;
4634 LocalPointer
<RuleBasedBreakIterator
> bi (
4635 (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale::getEnglish(), status
));
4636 assertSuccess(WHERE
, status
);
4637 if (U_FAILURE(status
)) return;
4639 RBBIDataWrapper
*dw
= bi
->fData
;
4640 const RBBIStateTable
*fwtbl
= dw
->fForwardTable
;
4641 int32_t numCharClasses
= dw
->fHeader
->fCatCount
;
4642 // printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates);
4644 // Check for duplicate columns (character categories)
4646 std::vector
<UnicodeString
> columns
;
4647 for (int32_t column
= 0; column
< numCharClasses
; column
++) {
4649 for (int32_t r
= 1; r
< (int32_t)fwtbl
->fNumStates
; r
++) {
4650 RBBIStateTableRow
*row
= (RBBIStateTableRow
*) (fwtbl
->fTableData
+ (fwtbl
->fRowLen
* r
));
4651 s
.append(row
->fNextState
[column
]);
4653 columns
.push_back(s
);
4655 // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
4656 for (int c1
=1; c1
<numCharClasses
; c1
++) {
4657 for (int c2
= c1
+1; c2
< numCharClasses
; c2
++) {
4658 if (columns
.at(c1
) == columns
.at(c2
)) {
4659 errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__
, __LINE__
, c1
, c2
);
4666 // Check for duplicate states
4667 std::vector
<UnicodeString
> rows
;
4668 for (int32_t r
=0; r
< (int32_t)fwtbl
->fNumStates
; r
++) {
4670 RBBIStateTableRow
*row
= (RBBIStateTableRow
*) (fwtbl
->fTableData
+ (fwtbl
->fRowLen
* r
));
4671 assertTrue(WHERE
, row
->fAccepting
>= -1);
4672 s
.append(row
->fAccepting
+ 1); // values of -1 are expected.
4673 s
.append(row
->fLookAhead
);
4674 s
.append(row
->fTagIdx
);
4675 for (int32_t column
= 0; column
< numCharClasses
; column
++) {
4676 s
.append(row
->fNextState
[column
]);
4680 for (int r1
=0; r1
< (int32_t)fwtbl
->fNumStates
; r1
++) {
4681 for (int r2
= r1
+1; r2
< (int32_t)fwtbl
->fNumStates
; r2
++) {
4682 if (rows
.at(r1
) == rows
.at(r2
)) {
4683 errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__
, __LINE__
, r1
, r2
);
4690 // Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
4691 // even after next() has returned DONE.
4693 void RBBITest::TestBug13447() {
4694 UErrorCode status
= U_ZERO_ERROR
;
4695 LocalPointer
<RuleBasedBreakIterator
> bi(
4696 (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getEnglish(), status
));
4697 assertSuccess(WHERE
, status
);
4698 if (U_FAILURE(status
)) return;
4699 UnicodeString
data(u
"1234");
4701 assertEquals(WHERE
, UBRK_WORD_NONE
, bi
->getRuleStatus());
4702 assertEquals(WHERE
, 4, bi
->next());
4703 assertEquals(WHERE
, UBRK_WORD_NUMBER
, bi
->getRuleStatus());
4704 assertEquals(WHERE
, UBRK_DONE
, bi
->next());
4705 assertEquals(WHERE
, 4, bi
->current());
4706 assertEquals(WHERE
, UBRK_WORD_NUMBER
, bi
->getRuleStatus());
4709 // TestReverse exercises both the synthesized safe reverse rules and the logic
4710 // for filling the break iterator cache when starting from random positions
4713 // It's a monkey test, working on random data, with the expected data obtained
4714 // from forward iteration (no safe rules involved), comparing with results
4715 // when indexing into the interior of the string (safe rules needed).
4717 void RBBITest::TestReverse() {
4718 UErrorCode status
= U_ZERO_ERROR
;
4720 TestReverse(std::unique_ptr
<RuleBasedBreakIterator
>((RuleBasedBreakIterator
*)
4721 BreakIterator::createCharacterInstance(Locale::getEnglish(), status
)));
4722 assertSuccess(WHERE
, status
, true);
4723 status
= U_ZERO_ERROR
;
4724 TestReverse(std::unique_ptr
<RuleBasedBreakIterator
>((RuleBasedBreakIterator
*)
4725 BreakIterator::createWordInstance(Locale::getEnglish(), status
)));
4726 assertSuccess(WHERE
, status
, true);
4727 status
= U_ZERO_ERROR
;
4728 TestReverse(std::unique_ptr
<RuleBasedBreakIterator
>((RuleBasedBreakIterator
*)
4729 BreakIterator::createLineInstance(Locale::getEnglish(), status
)));
4730 assertSuccess(WHERE
, status
, true);
4731 status
= U_ZERO_ERROR
;
4732 TestReverse(std::unique_ptr
<RuleBasedBreakIterator
>((RuleBasedBreakIterator
*)
4733 BreakIterator::createSentenceInstance(Locale::getEnglish(), status
)));
4734 assertSuccess(WHERE
, status
, true);
4737 void RBBITest::TestReverse(std::unique_ptr
<RuleBasedBreakIterator
>bi
) {
4742 // From the mapping trie in the break iterator's internal data, create a
4743 // vector of UnicodeStrings, one for each character category, containing
4744 // all of the code points that map to that category. Unicode planes 0 and 1 only,
4745 // to avoid an execess of unassigned code points.
4747 RBBIDataWrapper
*data
= bi
->fData
;
4748 int32_t categoryCount
= data
->fHeader
->fCatCount
;
4749 UTrie2
*trie
= data
->fTrie
;
4751 std::vector
<UnicodeString
> strings(categoryCount
, UnicodeString());
4752 for (int cp
=0; cp
<0x1fff0; ++cp
) {
4753 int cat
= utrie2_get32(trie
, cp
);
4754 cat
&= ~0x4000; // And off the dictionary bit from the category.
4755 assertTrue(WHERE
, cat
< categoryCount
&& cat
>= 0);
4756 if (cat
< 0 || cat
>= categoryCount
) return;
4757 strings
[cat
].append(cp
);
4761 const int testStringLength
= 10000;
4762 UnicodeString testString
;
4764 for (int i
=0; i
<testStringLength
; ++i
) {
4765 int charClass
= randomGen() % categoryCount
;
4766 if (strings
[charClass
].length() > 0) {
4767 int cp
= strings
[charClass
].char32At(randomGen() % strings
[charClass
].length());
4768 testString
.append(cp
);
4772 typedef std::pair
<UBool
, int32_t> Result
;
4773 std::vector
<Result
> expectedResults
;
4774 bi
->setText(testString
);
4775 for (int i
=0; i
<testString
.length(); ++i
) {
4776 bool isboundary
= bi
->isBoundary(i
);
4777 int ruleStatus
= bi
->getRuleStatus();
4778 expectedResults
.push_back(std::make_pair(isboundary
, ruleStatus
));
4781 for (int i
=testString
.length()-1; i
>=0; --i
) {
4782 bi
->setText(testString
); // clears the internal break cache
4783 Result expected
= expectedResults
[i
];
4784 assertEquals(WHERE
, expected
.first
, bi
->isBoundary(i
));
4785 assertEquals(WHERE
, expected
.second
, bi
->getRuleStatus());
// Ticket 13692 - finding word boundaries in very large numbers or words could
// be very time consuming. When the problem was present, this test
// would run for more than fifteen minutes, which is to say, the failure was noticeable.
4794 void RBBITest::TestBug13692() {
4795 UErrorCode status
= U_ZERO_ERROR
;
4796 LocalPointer
<RuleBasedBreakIterator
> bi ((RuleBasedBreakIterator
*)
4797 BreakIterator::createWordInstance(Locale::getEnglish(), status
), status
);
4798 if (!assertSuccess(WHERE
, status
, true)) {
4801 constexpr int32_t LENGTH
= 1000000;
4802 UnicodeString
longNumber(LENGTH
, (UChar32
)u
'3', LENGTH
);
4803 for (int i
=0; i
<20; i
+=2) {
4804 longNumber
.setCharAt(i
, u
' ');
4806 bi
->setText(longNumber
);
4807 assertFalse(WHERE
, bi
->isBoundary(LENGTH
-5));
4808 assertSuccess(WHERE
, status
);
4812 // TestDebug - A place-holder test for debugging purposes.
4813 // For putting in fragments of other tests that can be invoked
4814 // for tracing without a lot of unwanted extra stuff happening.
4816 void RBBITest::TestDebug(void) {
4817 UErrorCode status
= U_ZERO_ERROR
;
4818 LocalPointer
<RuleBasedBreakIterator
> bi ((RuleBasedBreakIterator
*)
4819 BreakIterator::createCharacterInstance(Locale::getEnglish(), status
), status
);
4820 if (!assertSuccess(WHERE
, status
, true)) {
4823 const UnicodeString
&rules
= bi
->getRules();
4825 LocalPointer
<RuleBasedBreakIterator
> newbi(new RuleBasedBreakIterator(rules
, pe
, status
));
4826 assertSuccess(WHERE
, status
);
4829 void RBBITest::TestProperties() {
4830 UErrorCode errorCode
= U_ZERO_ERROR
;
4831 UnicodeSet
prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode
);
4832 if (!prependSet
.isEmpty()) {
4834 "[:GCB=Prepend:] is not empty any more. "
4835 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4836 "change this test to the opposite condition.");
4840 #endif // #if !UCONFIG_NO_BREAK_ITERATION