1 /********************************************************************
3 * Copyright (c) 1999-2015, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6 /************************************************************************
7 * Date Name Description
8 * 12/15/99 Madhu Creation.
9 * 01/12/2000 Madhu Updated for changed API and added new tests
10 ************************************************************************/
12 #include "utypeinfo.h" // for 'typeid' to work
14 #include "unicode/utypes.h"
16 #if !UCONFIG_NO_BREAK_ITERATION
18 #include "unicode/utypes.h"
19 #include "unicode/brkiter.h"
20 #include "unicode/rbbi.h"
21 #include "unicode/uchar.h"
22 #include "unicode/utf16.h"
23 #include "unicode/ucnv.h"
24 #include "unicode/schriter.h"
25 #include "unicode/uniset.h"
26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
27 #include "unicode/regex.h"
29 #include "unicode/ustring.h"
30 #include "unicode/utext.h"
39 #include "unicode/numfmt.h"
40 #include "unicode/uscript.h"
43 #define TEST_ASSERT(x) {if (!(x)) { \
44 errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
46 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
47 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
50 //---------------------------------------------
52 //---------------------------------------------
55 // Note: Before adding new tests to this file, check whether the desired test data can
56 // simply be added to the file testdata/rbbitest.txt. In most cases it can,
57 // it's much less work than writing a new test, diagnostic output in the event of failures
58 // is good, and the test data file is shared with ICU4J, so eventually the test
59 // will run there as well, without additional effort.
// runIndexedTest. Test dispatcher for this suite.
//   index  - selects which test case to report or run.
//   exec   - when true, the selected test function is called (and the suite banner is logged).
//   name   - output; set to the selected test's name, to "skip" for an index whose test is
//            not available in this build configuration, or to "" in the default case.
//   params - forwarded to TestMonkey() only.
// Which tests are available depends on UCONFIG_NO_FILE_IO and UCONFIG_NO_REGULAR_EXPRESSIONS.
61 void RBBITest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* params
)
63 if (exec
) logln("TestSuite RuleBasedBreakIterator: ");
66 #if !UCONFIG_NO_FILE_IO
67 case 0: name
= "TestBug4153072";
68 if(exec
) TestBug4153072(); break;
70 case 0: name
= "skip";
74 case 1: name
= "skip";
76 case 2: name
= "TestStatusReturn";
77 if(exec
) TestStatusReturn(); break;
79 #if !UCONFIG_NO_FILE_IO
80 case 3: name
= "TestUnicodeFiles";
81 if(exec
) TestUnicodeFiles(); break;
82 case 4: name
= "TestEmptyString";
83 if(exec
) TestEmptyString(); break;
85 case 3: case 4: name
= "skip";
89 case 5: name
= "TestGetAvailableLocales";
90 if(exec
) TestGetAvailableLocales(); break;
92 case 6: name
= "TestGetDisplayName";
93 if(exec
) TestGetDisplayName(); break;
95 #if !UCONFIG_NO_FILE_IO
96 case 7: name
= "TestEndBehaviour";
97 if(exec
) TestEndBehaviour(); break;
98 case 8: case 9: case 10: name
= "skip";
100 case 11: name
= "TestWordBreaks";
101 if(exec
) TestWordBreaks(); break;
102 case 12: name
= "TestWordBoundary";
103 if(exec
) TestWordBoundary(); break;
104 case 13: name
= "TestLineBreaks";
105 if(exec
) TestLineBreaks(); break;
106 case 14: name
= "TestSentBreaks";
107 if(exec
) TestSentBreaks(); break;
108 case 15: name
= "TestExtended";
109 if(exec
) TestExtended(); break;
111 case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name
= "skip";
115 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
117 name
= "TestMonkey"; if(exec
) TestMonkey(params
); break;
120 name
= "skip"; break;
123 #if !UCONFIG_NO_FILE_IO
124 case 17: name
= "TestBug3818";
125 if(exec
) TestBug3818(); break;
127 case 17: name
= "skip";
131 case 18: name
= "skip";
133 case 19: name
= "TestDebug";
134 if(exec
) TestDebug(); break;
135 case 20: name
= "skip";
138 #if !UCONFIG_NO_FILE_IO
139 case 21: name
= "TestBug5775";
140 if (exec
) TestBug5775(); break;
142 case 21: name
= "skip";
146 case 22: name
= "TestBug9983";
147 if (exec
) TestBug9983(); break;
148 case 23: name
= "TestDictRules";
149 if (exec
) TestDictRules(); break;
150 case 24: name
= "TestBug5532";
151 if (exec
) TestBug5532(); break;
152 default: name
= ""; break; //needed to end loop
157 //---------------------------------------------------------------------------
159 // class BITestData Holds a set of Break iterator test data and results
161 // - the string data to be broken
162 // - a vector of the expected break positions.
163 // - a vector of source line numbers for the data,
164 // (to help see where errors occurred.)
165 // - The expected break tag values.
166 // - Vectors of actual break positions and tag values.
167 // - Functions for comparing actual with expected and
170 //----------------------------------------------------------------------------
173 UnicodeString fDataToBreak
;
174 UVector fExpectedBreakPositions
;
175 UVector fExpectedTags
;
177 UVector fActualBreakPositions
; // Test Results.
180 BITestData(UErrorCode
&status
);
181 void addDataChunk(const char *data
, int32_t tag
, int32_t lineNum
, UErrorCode status
);
182 void checkResults(const char *heading
, RBBITest
*test
);
183 void err(const char *heading
, RBBITest
*test
, int32_t expectedIdx
, int32_t actualIdx
);
// Constructor. The caller's error code is handed to the constructor of each of the
// member vectors named in the initializer list below.
190 BITestData::BITestData(UErrorCode
&status
)
191 : fExpectedBreakPositions(status
), fExpectedTags(status
), fLineNum(status
), fActualBreakPositions(status
),
197 // addDataChunk. Add a section (non-breaking) piece of data to the test data.
198 // The macro form collects the line number, which is helpful
199 // when tracking down failures.
201 // A null data item is inserted at the start of each test's data
202 // to put the starting zero into the data list. The position saved for
203 // each non-null item is its ending position.
205 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status);
// Appends one chunk of text to fDataToBreak and records, in parallel vectors, the
// resulting end offset (the expected break position), the expected tag and the
// source line number of the chunk.
//   data    - chunk text; converted with CharsToUnicodeString() before appending.
//   tag     - expected tag value for the break at the end of this chunk.
//   lineNum - source line of the test data (supplied by ADD_DATACHUNK via __LINE__).
//   status  - does nothing if this is already a failure on entry.
// NOTE(review): status is received by value, so a change made to it inside this
// function is not visible to the caller.
206 void BITestData::addDataChunk(const char *data
, int32_t tag
, int32_t lineNum
, UErrorCode status
) {
207 if (U_FAILURE(status
)) {return;}
209 fDataToBreak
.append(CharsToUnicodeString(data
));
211 fExpectedBreakPositions
.addElement(fDataToBreak
.length(), status
);
212 fExpectedTags
.addElement(tag
, status
);
213 fLineNum
.addElement(lineNum
, status
);
218 // checkResults. Compare the actual and expected break positions, report any differences.
//   heading - text placed at the front of every reported message.
//   test    - the RBBITest object whose errln() is used for reporting.
// Walks fExpectedBreakPositions and fActualBreakPositions with two indices.
// Running off the end of one vector before the other, or a position mismatch,
// is reported through err(); where positions agree the tags in fActualTags and
// fExpectedTags are compared as well.
220 void BITestData::checkResults(const char *heading
, RBBITest
*test
) {
221 int32_t expectedIndex
= 0;
222 int32_t actualIndex
= 0;
225 // If we've run through both the expected and actual results vectors, we're done.
226 // break out of the loop.
227 if (expectedIndex
>= fExpectedBreakPositions
.size() &&
228 actualIndex
>= fActualBreakPositions
.size()) {
233 if (expectedIndex
>= fExpectedBreakPositions
.size()) {
234 err(heading
, test
, expectedIndex
-1, actualIndex
);
239 if (actualIndex
>= fActualBreakPositions
.size()) {
240 err(heading
, test
, expectedIndex
, actualIndex
-1);
245 if (fActualBreakPositions
.elementAti(actualIndex
) != fExpectedBreakPositions
.elementAti(expectedIndex
)) {
246 err(heading
, test
, expectedIndex
, actualIndex
);
247 // Try to resync the positions of the indices, to avoid a rash of spurious errors.
248 if (fActualBreakPositions
.elementAti(actualIndex
) < fExpectedBreakPositions
.elementAti(expectedIndex
)) {
256 if (fActualTags
.elementAti(actualIndex
) != fExpectedTags
.elementAti(expectedIndex
)) {
257 test
->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",
258 heading
, fLineNum
.elementAt(expectedIndex
),
259 fExpectedTags
.elementAti(expectedIndex
), fActualTags
.elementAti(actualIndex
));
268 // err - An error was found. Report it, along with information about where the
269 // incorrectly broken test data appeared in the source file.
//   heading     - text placed at the front of the message.
//   test        - the RBBITest object whose errln() is used for reporting.
//   expectedIdx - index into fExpectedBreakPositions / fLineNum.
//   actualIdx   - index into fActualBreakPositions.
// An actual position smaller than the expected one is reported as an unexpected
// break; the other message reports a break that was not found at the end of an item.
271 void BITestData::err(const char *heading
, RBBITest
*test
, int32_t expectedIdx
, int32_t actualIdx
)
273 int32_t expected
= fExpectedBreakPositions
.elementAti(expectedIdx
);
274 int32_t actual
= fActualBreakPositions
.elementAti(actualIdx
);
276 int32_t line
= fLineNum
.elementAti(expectedIdx
);
277 if (expectedIdx
> 0) {
278 // The line numbers are off by one because a premature break occurs somewhere
279 // within the previous item, rather than at the start of the current (expected) item.
280 // We want to report the offset of the unexpected break from the start of
281 // this previous item.
282 o
= actual
- fExpectedBreakPositions
.elementAti(expectedIdx
-1);
284 if (actual
< expected
) {
285 test
->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading
, o
, line
, actual
, expected
);
287 test
->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading
, line
, actual
, expected
);
// clearResults. Empties the vectors of actual break positions and actual tags;
// the expected-result vectors are not touched here.
292 void BITestData::clearResults() {
293 fActualBreakPositions
.removeAllElements();
294 fActualTags
.removeAllElements();
298 //--------------------------------------------------------------------------------------
300 // RBBITest constructor and destructor
302 //--------------------------------------------------------------------------------------
304 RBBITest::RBBITest() {
308 RBBITest::~RBBITest() {
311 //-----------------------------------------------------------------------------------
313 // Test for status {tag} return value from break rules.
314 // TODO: a more thorough test.
316 //-----------------------------------------------------------------------------------
// Builds a RuleBasedBreakIterator from the inline rule string, iterates over
// testString1 with first()/next(), and checks every break position against
// bounds1[] and every getRuleStatus() value against brkStatus[].
// Both tables end with a -1 entry.
317 void RBBITest::TestStatusReturn() {
318 UnicodeString
rulesString1("$Letters = [:L:];\n"
319 "$Numbers = [:N:];\n"
322 "Help\\ {4}/me\\!;\n"
323 "[^$Letters $Numbers];\n"
324 "!.*;\n", -1, US_INV
);
325 UnicodeString testString1
= "abc123..abc Help me Help me!";
326 // 01234567890123456789012345678
327 int32_t bounds1
[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
328 int32_t brkStatus
[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
330 UErrorCode status
=U_ZERO_ERROR
;
331 UParseError parseError
;
333 BreakIterator
*bi
= new RuleBasedBreakIterator(rulesString1
, parseError
, status
);
334 if(U_FAILURE(status
)) {
335 dataerrln("FAIL : in construction - %s", u_errorName(status
));
339 bi
->setText(testString1
);
340 for (pos
=bi
->first(); pos
!= BreakIterator::DONE
; pos
=bi
->next()) {
341 if (pos
!= bounds1
[i
]) {
342 errln("FAIL: expected break at %d, got %d\n", bounds1
[i
], pos
);
346 int tag
= bi
->getRuleStatus();
347 if (tag
!= brkStatus
[i
]) {
348 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos
, brkStatus
[i
], tag
);
// Debugging aid: prints to stdout a table of per-character properties for the text in tstr.
//   tstr          - text to dump; iteration starts from native index 0.
//   expected      - break positions; a dashed separator line showing the current index
//                   is printed when the index reaches the next entry.
//   expectedCount - number of entries in expected[].
// The text is walked one code point at a time with utext_next32().
358 static void printStringBreaks(UText
*tstr
, int expected
[], int expectedCount
) {
359 UErrorCode status
= U_ZERO_ERROR
;
361 printf("code alpha extend alphanum type word sent line name\n");
362 int nextExpectedIndex
= 0;
363 utext_setNativeIndex(tstr
, 0);
364 for (int j
= 0; j
< utext_nativeLength(tstr
); j
=utext_getNativeIndex(tstr
)) {
365 if (nextExpectedIndex
< expectedCount
&& j
>= expected
[nextExpectedIndex
] ) {
366 printf("------------------------------------------------ %d\n", j
);
370 UChar32 c
= utext_next32(tstr
);
371 u_charName(c
, U_UNICODE_CHAR_NAME
, name
, 100, &status
);
372 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c
,
374 u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
),
376 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY
,
378 U_SHORT_PROPERTY_NAME
),
379 u_getPropertyValueName(UCHAR_WORD_BREAK
,
380 u_getIntPropertyValue(c
,
382 U_SHORT_PROPERTY_NAME
),
383 u_getPropertyValueName(UCHAR_SENTENCE_BREAK
,
384 u_getIntPropertyValue(c
,
385 UCHAR_SENTENCE_BREAK
),
386 U_SHORT_PROPERTY_NAME
),
387 u_getPropertyValueName(UCHAR_LINE_BREAK
,
388 u_getIntPropertyValue(c
,
390 U_SHORT_PROPERTY_NAME
),
// UnicodeString convenience overload: opens a read-only UText over ustr with
// utext_openConstUnicodeString(), prints a message if that fails, and passes the
// UText on to the UText overload of printStringBreaks().
396 static void printStringBreaks(const UnicodeString
&ustr
, int expected
[], int expectedCount
) {
397 UErrorCode status
= U_ZERO_ERROR
;
399 tstr
= utext_openConstUnicodeString(NULL
, &ustr
, &status
);
400 if (U_FAILURE(status
)) {
401 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status
));
404 printStringBreaks(tstr
, expected
, expectedCount
);
// Creates a word break iterator for the "th" locale over a string made of the same
// four-code-unit Thai word repeated four times, and checks that following(1) and
// following(0) both return 4.
409 void RBBITest::TestBug3818() {
410 UErrorCode status
= U_ZERO_ERROR
;
412 // Four Thai words...
413 static const UChar thaiWordData
[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
414 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
415 UnicodeString
thaiStr(thaiWordData
);
417 BreakIterator
* bi
= BreakIterator::createWordInstance(Locale("th"), status
);
418 if (U_FAILURE(status
) || bi
== NULL
) {
419 errcheckln(status
, "Fail at file %s, line %d, status = %s", __FILE__
, __LINE__
, u_errorName(status
));
422 bi
->setText(thaiStr
);
424 int32_t startOfSecondWord
= bi
->following(1);
425 if (startOfSecondWord
!= 4) {
426 errln("Fail at file %s, line %d expected start of word at 4, got %d",
427 __FILE__
, __LINE__
, startOfSecondWord
);
429 startOfSecondWord
= bi
->following(0);
430 if (startOfSecondWord
!= 4) {
431 errln("Fail at file %s, line %d expected start of word at 4, got %d",
432 __FILE__
, __LINE__
, startOfSecondWord
);
437 //----------------------------------------------------------------------------
439 // generalIteratorTest Given a break iterator and a set of test data,
440 // Run the tests and report the results.
442 //----------------------------------------------------------------------------
// Sets td.fDataToBreak as the iterator's text and then runs each of the individual
// iteration checks (first/next, last/previous, following, preceding, isBoundary,
// multiple selection) against the same iterator and test data.
443 void RBBITest::generalIteratorTest(RuleBasedBreakIterator
& bi
, BITestData
&td
)
446 bi
.setText(td
.fDataToBreak
);
448 testFirstAndNext(bi
, td
);
450 testLastAndPrevious(bi
, td
);
452 testFollowing(bi
, td
);
453 testPreceding(bi
, td
);
454 testIsBoundary(bi
, td
);
455 doMultipleSelectionTest(bi
, td
);
460 // testFirstAndNext. Run the iterator forwards in the obvious first(), next()
// Iterates from first() with next() until DONE, appending each position to
// td.fActualBreakPositions and each getRuleStatus() value to td.fActualTags,
// then has td compare the recorded results with the expected ones.
463 void RBBITest::testFirstAndNext(RuleBasedBreakIterator
& bi
, BITestData
&td
)
465 UErrorCode status
= U_ZERO_ERROR
;
470 logln("Test first and next");
471 bi
.setText(td
.fDataToBreak
);
474 for (p
=bi
.first(); p
!=RuleBasedBreakIterator::DONE
; p
=bi
.next()) {
475 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
476 tag
= bi
.getRuleStatus();
477 td
.fActualTags
.addElement(tag
, status
);
479 // If the iterator is not making forward progress, stop.
480 // No need to raise an error here, it'll be detected in the normal check of results.
485 td
.checkResults("testFirstAndNext", this);
490 // TestLastAndPrevious. Run the iterator backwards, starting with last().
// Iterates from last() with previous() until DONE. Each position and rule status is
// inserted at index 0 of the result vectors, so the recorded results end up in
// ascending order before td compares them with the expected ones.
492 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator
& bi
, BITestData
&td
)
494 UErrorCode status
= U_ZERO_ERROR
;
496 int32_t lastP
= 0x7ffffffe;
499 logln("Test last and previous");
500 bi
.setText(td
.fDataToBreak
);
503 for (p
=bi
.last(); p
!=RuleBasedBreakIterator::DONE
; p
=bi
.previous()) {
504 // Save break position. Insert it at start of vector of results, shoving
505 // already-saved results further towards the end.
506 td
.fActualBreakPositions
.insertElementAt(p
, 0, status
);
507 // bi.previous(); // TODO: Why does this fix things up????
509 tag
= bi
.getRuleStatus();
510 td
.fActualTags
.insertElementAt(tag
, 0, status
);
512 // If the iterator is not making progress, stop.
513 // No need to raise an error here, it'll be detected in the normal check of results.
518 td
.checkResults("testLastAndPrevious", this);
// Records the starting position and its tag, then steps an index i from 0 up to
// length()+1, recording each new break position and rule status that is reached.
// Reports an error if the loop did not stop with i equal to the text length, and
// finally has td compare the recorded results with the expected ones.
// NOTE(review): the statement that assigns p inside the loop is not visible in this
// excerpt; presumably it is bi.following(i) -- confirm.
522 void RBBITest::testFollowing(RuleBasedBreakIterator
& bi
, BITestData
&td
)
524 UErrorCode status
= U_ZERO_ERROR
;
527 int32_t lastP
= -2; // A value that will never be returned as a break position.
528 // cannot be -1; that is returned for DONE.
531 logln("testFollowing():");
532 bi
.setText(td
.fDataToBreak
);
535 // Save the starting point, since we won't get that out of following.
537 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
538 tag
= bi
.getRuleStatus();
539 td
.fActualTags
.addElement(tag
, status
);
541 for (i
= 0; i
<= td
.fDataToBreak
.length()+1; i
++) {
544 if (p
== RuleBasedBreakIterator::DONE
) {
547 // We've reached a new break position. Save it.
548 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
549 tag
= bi
.getRuleStatus();
550 td
.fActualTags
.addElement(tag
, status
);
554 // The loop normally exits by means of the break in the middle.
555 // Make sure that the index was at the correct position for the break iterator to have
557 if (i
!= td
.fDataToBreak
.length()) {
558 errln("testFollowing(): iterator returned DONE prematurely.");
561 // Full check of all results.
562 td
.checkResults("testFollowing", this);
// Mirror image of testFollowing(): records a starting position and tag, then steps an
// index i from the text length down to -1, inserting each new break position and rule
// status at the front of the result vectors, and finally has td compare the recorded
// results with the expected ones.
// NOTE(review): the statement that assigns p inside the loop is not visible in this
// excerpt; presumably it is bi.preceding(i) -- confirm.
567 void RBBITest::testPreceding(RuleBasedBreakIterator
& bi
, BITestData
&td
) {
568 UErrorCode status
= U_ZERO_ERROR
;
571 int32_t lastP
= 0x7ffffffe;
574 logln("testPreceding():");
575 bi
.setText(td
.fDataToBreak
);
579 td
.fActualBreakPositions
.addElement(p
, status
);
580 tag
= bi
.getRuleStatus();
581 td
.fActualTags
.addElement(tag
, status
);
583 for (i
= td
.fDataToBreak
.length(); i
>=-1; i
--) {
586 if (p
== RuleBasedBreakIterator::DONE
) {
589 // We've reached a new break position. Save it.
590 td
.fActualBreakPositions
.insertElementAt(p
, 0, status
);
592 tag
= bi
.getRuleStatus();
593 td
.fActualTags
.insertElementAt(tag
, 0, status
);
596 // The loop normally exits by means of the break in the middle.
597 // Make sure that the index was at the correct position for the break iterator to have
600 errln("testPreceding(): iterator returned DONE prematurely.");
603 // Full check of all results.
604 td
.checkResults("testPreceding", this);
// Calls isBoundary(i) for every index i from 0 through the text length (inclusive).
// Each index reported as a boundary is recorded together with the rule status, and
// td then compares the recorded results with the expected ones.
609 void RBBITest::testIsBoundary(RuleBasedBreakIterator
& bi
, BITestData
&td
) {
610 UErrorCode status
= U_ZERO_ERROR
;
614 logln("testIsBoundary():");
615 bi
.setText(td
.fDataToBreak
);
618 for (i
= 0; i
<= td
.fDataToBreak
.length(); i
++) {
619 if (bi
.isBoundary(i
)) {
620 td
.fActualBreakPositions
.addElement(i
, status
); // Save result.
621 tag
= bi
.getRuleStatus();
622 td
.fActualTags
.addElement(tag
, status
);
625 td
.checkResults("testIsBoundary: ", this);
// Consistency check between single-step and multi-step iteration.
// A clone of the iterator is first required to compare equal to the original.
// Going forward, the position reached by repeated iterator.next() calls is compared
// with testIterator->first() followed by next(count); operator== is also required to
// report the two iterators as unequal once they have diverged. The same comparison is
// then repeated backwards, using last(), previous() and next(count) with a negative count.
630 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator
& iterator
, BITestData
&td
)
632 iterator
.setText(td
.fDataToBreak
);
634 RuleBasedBreakIterator
* testIterator
=(RuleBasedBreakIterator
*)iterator
.clone();
635 int32_t offset
= iterator
.first();
639 logln("doMultipleSelectionTest text of length: %d", td
.fDataToBreak
.length());
641 if (*testIterator
!= iterator
)
642 errln("clone() or operator!= failed: two clones compared unequal");
645 testOffset
= testIterator
->first();
646 testOffset
= testIterator
->next(count
);
647 if (offset
!= testOffset
)
648 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count
+ ", next(n) returned " + testOffset
+ " and next() had " + offset
);
650 if (offset
!= RuleBasedBreakIterator::DONE
) {
652 offset
= iterator
.next();
654 if (offset
!= RuleBasedBreakIterator::DONE
&& *testIterator
== iterator
) {
655 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count
, offset
);
656 if (count
> 10000 || offset
== -1) {
657 errln("operator== failed too many times. Stopping test.");
659 errln("Does (RuleBasedBreakIterator::DONE == -1)?");
665 } while (offset
!= RuleBasedBreakIterator::DONE
);
667 // now do it backwards...
668 offset
= iterator
.last();
672 testOffset
= testIterator
->last();
673 testOffset
= testIterator
->next(count
); // next() with a negative arg is same as previous
674 if (offset
!= testOffset
)
675 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count
+ ", next(n) returned " + testOffset
+ " and next() had " + offset
);
677 if (offset
!= RuleBasedBreakIterator::DONE
) {
679 offset
= iterator
.previous();
681 } while (offset
!= RuleBasedBreakIterator::DONE
);
687 //---------------------------------------------
691 //---------------------------------------------
// Runs generalIteratorTest() with a line break iterator for the default locale over
// test data consisting of a single empty chunk.
692 void RBBITest::TestEmptyString()
694 UnicodeString text
= "";
695 UErrorCode status
= U_ZERO_ERROR
;
697 BITestData
x(status
);
698 ADD_DATACHUNK(x
, "", 0, status
); // Break at start of data
699 RuleBasedBreakIterator
* bi
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale::getDefault(), status
);
700 if (U_FAILURE(status
))
702 errcheckln(status
, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status
));
705 generalIteratorTest(*bi
, x
);
// Fetches the locale list from BreakIterator::getAvailableLocales(), reports a data
// error for an empty list, and logs the name of every returned locale.
709 void RBBITest::TestGetAvailableLocales()
711 int32_t locCount
= 0;
712 const Locale
* locList
= BreakIterator::getAvailableLocales(locCount
);
715 dataerrln("getAvailableLocales() returned an empty list!");
716 // Just make sure that it's returning good memory.
718 for (i
= 0; i
< locCount
; ++i
) {
719 logln(locList
[i
].getName());
723 //Testing the BreakIterator::getDisplayName() function
// Checks two display names: the US locale in the default display locale (verified
// only when the default locale is US), and the France locale displayed in US English.
724 void RBBITest::TestGetDisplayName()
726 UnicodeString result
;
728 BreakIterator::getDisplayName(Locale::getUS(), result
);
729 if (Locale::getDefault() == Locale::getUS() && result
!= "English (United States)")
730 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
733 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result
);
734 if (result
!= "French (France)")
735 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
// Uses a word break iterator for the default locale on the text "boo." and checks for
// breaks at the beginning of the string, before the period, and at the end (offset 4).
742 void RBBITest::TestEndBehaviour()
744 UErrorCode status
= U_ZERO_ERROR
;
745 UnicodeString
testString("boo.");
746 BreakIterator
*wb
= BreakIterator::createWordInstance(Locale::getDefault(), status
);
747 if (U_FAILURE(status
))
749 errcheckln(status
, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status
));
752 wb
->setText(testString
);
754 if (wb
->first() != 0)
755 errln("Didn't get break at beginning of string.");
757 errln("Didn't get break before period in \"boo.\"");
758 if (wb
->current() != 4 && wb
->next() != 4)
759 errln("Didn't get break at end of string.");
// Hands a word break iterator a StringCharacterIterator restricted to the range
// [begin, end) of "...Hello, World!..." and checks isBoundary() for every index
// from -1 through begin. An error is reported unless isBoundary() is true at index 0
// and false at every other tested index.
765 void RBBITest::TestBug4153072() {
766 UErrorCode status
= U_ZERO_ERROR
;
767 BreakIterator
*iter
= BreakIterator::createWordInstance(Locale::getDefault(), status
);
768 if (U_FAILURE(status
))
770 errcheckln(status
, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status
));
773 UnicodeString
str("...Hello, World!...");
775 int32_t end
= str
.length() - 3;
778 StringCharacterIterator
* textIterator
= new StringCharacterIterator(str
, begin
, end
, begin
);
779 iter
->adoptText(textIterator
);
781 // Note: with the switch to UText, there is no way to restrict the
782 // iteration range to begin at an index other than zero.
783 // String character iterators created with a non-zero bound are
784 // treated by RBBI as being empty.
785 for (index
= -1; index
< begin
+ 1; ++index
) {
786 onBoundary
= iter
->isBoundary(index
);
787 if (index
== 0? !onBoundary
: onBoundary
) {
788 errln((UnicodeString
)"Didn't handle isBoundary correctly with offset = " + index
+
789 " and begin index = " + begin
);
797 // Test for problem reported by Ashok Matoria on 9 July 2007
798 // One.<kSoftHyphen><kSpace>Two.
800 // Sentence break at start (0) and then on calling next() it breaks at
801 // 'T' of "Two". Now, at this point if I do next() and
802 // then previous(), it breaks at <kSoftHyphen> instead of 'T' of "Two".
// The checks below use an English sentence break iterator and expect the positions
// 6, then 10, then 6 again after previous().
804 void RBBITest::TestBug5775() {
805 UErrorCode status
= U_ZERO_ERROR
;
806 BreakIterator
*bi
= BreakIterator::createSentenceInstance(Locale::getEnglish(), status
);
807 TEST_ASSERT_SUCCESS(status
);
808 if (U_FAILURE(status
)) {
811 // Check for status first for better handling of no data errors.
812 TEST_ASSERT(bi
!= NULL
);
817 UnicodeString
s("One.\\u00ad Two.", -1, US_INV
);
821 int pos
= bi
->next();
822 TEST_ASSERT(pos
== 6);
824 TEST_ASSERT(pos
== 10);
825 pos
= bi
->previous();
826 TEST_ASSERT(pos
== 6);
832 //------------------------------------------------------------------------------
834 // RBBITest::Extended Run RBBI Tests from an external test data file
836 //------------------------------------------------------------------------------
839 BreakIterator
*bi
; // Break iterator is set while parsing test source.
840 // Changed out whenever test data changes break type.
842 UnicodeString dataToBreak
; // Data that is built up while parsing the test.
843 UVector32
*expectedBreaks
; // Expected break positions, matches dataToBreak UnicodeString.
844 UVector32
*srcLine
; // Positions in source file, indexed same as dataToBreak.
847 UText
*textToBreak
; // UText, could be UTF8 or UTF16.
848 UVector32
*textMap
; // Map from UTF-16 dataToBreak offsets to UText offsets.
849 CharString utf8String
; // UTF-8 form of text to break.
// Constructor. Allocates the UVector32 objects for expectedBreaks, srcLine, srcCol and
// textMap with new, passing the caller's status to each UVector32 constructor.
851 TestParams(UErrorCode
&status
) : dataToBreak() {
853 expectedBreaks
= new UVector32(status
);
854 srcLine
= new UVector32(status
);
855 srcCol
= new UVector32(status
);
857 textMap
= new UVector32(status
);
862 delete expectedBreaks
;
865 utext_close(textToBreak
);
869 int32_t getSrcLine(int32_t bp
);
870 int32_t getExpectedBreak(int32_t bp
);
871 int32_t getSrcCol(int32_t bp
);
873 void setUTF16(UErrorCode
&status
);
874 void setUTF8(UErrorCode
&status
);
877 // Append a UnicodeString to a CharString with UTF-8 encoding.
878 // Substitute any invalid chars.
879 // Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
// Two-pass conversion: the first u_strToUTF8WithSub() call is a preflight with a NULL
// output buffer that obtains the required UTF-8 length; the second call converts into
// a buffer obtained from dest.getAppendBuffer(), which is then appended to dest.
// U+FFFD is passed as the substitution character in both calls.
//   dest   - CharString that receives the UTF-8 bytes.
//   src    - UTF-16 text to convert.
//   status - checked on entry; U_BUFFER_OVERFLOW_ERROR from the preflight is treated
//            as expected and status is reset to U_ZERO_ERROR before the real conversion.
880 static void CharStringAppend(CharString
&dest
, const UnicodeString
&src
, UErrorCode
&status
) {
881 if (U_FAILURE(status
)) {
885 u_strToUTF8WithSub(NULL
, 0, &utf8Length
, // Output Buffer, NULL for preflight.
886 src
.getBuffer(), src
.length(), // UTF-16 data
887 0xfffd, NULL
, // Substitution char, number of subs.
889 if (U_FAILURE(status
) && status
!= U_BUFFER_OVERFLOW_ERROR
) {
892 status
= U_ZERO_ERROR
;
894 char *buffer
= dest
.getAppendBuffer(utf8Length
, utf8Length
, capacity
, status
);
895 u_strToUTF8WithSub(buffer
, utf8Length
, NULL
,
896 src
.getBuffer(), src
.length(),
897 0xfffd, NULL
, &status
);
898 dest
.append(buffer
, utf8Length
, status
);
// Makes textToBreak a UText over the UTF-16 string dataToBreak and rebuilds textMap
// with one entry per UTF-16 index plus a final entry for the end of the string.
// An index that is the start of a code point maps to itself; -1 is stored for the
// remaining indices (NOTE(review): the branch structure around the -1 entry is not
// visible in this excerpt -- presumably an else).
902 void TestParams::setUTF16(UErrorCode
&status
) {
903 textToBreak
= utext_openUnicodeString(textToBreak
, &dataToBreak
, &status
);
904 textMap
->removeAllElements();
905 for (int32_t i
=0; i
<dataToBreak
.length(); i
++) {
906 if (i
== dataToBreak
.getChar32Start(i
)) {
907 textMap
->addElement(i
, status
);
909 textMap
->addElement(-1, status
);
912 textMap
->addElement(dataToBreak
.length(), status
);
913 U_ASSERT(dataToBreak
.length() + 1 == textMap
->size());
// Converts dataToBreak to UTF-8 (into utf8String, via CharStringAppend), makes
// textToBreak a UText over those bytes, and rebuilds textMap so that it is indexed by
// UTF-8 native index. The entry at the start of each code point holds the running
// UTF-16 index; entries are padded with -1 up to the UText's native index after each
// code point is consumed. The final assertion expects one entry per native index
// plus one for the end of the text.
917 void TestParams::setUTF8(UErrorCode
&status
) {
918 if (U_FAILURE(status
)) {
922 CharStringAppend(utf8String
, dataToBreak
, status
);
923 textToBreak
= utext_openUTF8(textToBreak
, utf8String
.data(), utf8String
.length(), &status
);
924 if (U_FAILURE(status
)) {
928 textMap
->removeAllElements();
929 int32_t utf16Index
= 0;
931 textMap
->addElement(utf16Index
, status
);
932 UChar32 c32
= utext_current32(textToBreak
);
936 utf16Index
+= U16_LENGTH(c32
);
937 utext_next32(textToBreak
);
938 while (textMap
->size() < utext_getNativeIndex(textToBreak
)) {
939 textMap
->addElement(-1, status
);
942 U_ASSERT(utext_nativeLength(textToBreak
) + 1 == textMap
->size());
// Returns the srcLine entry for the text position bp.
// bp is clamped to the last index of textMap, then stepped downward while looking up
// textMap (per the comment below, to reach a character boundary); the value found in
// textMap is used as the index into srcLine.
// NOTE(review): the parameter is spelled int here but int32_t in the declaration.
946 int32_t TestParams::getSrcLine(int bp
) {
947 if (bp
>= textMap
->size()) {
948 bp
= textMap
->size() - 1;
951 for(; bp
>= 0 ; --bp
) {
952 // Move to a character boundary if we are not on one already.
953 i
= textMap
->elementAti(bp
);
958 return srcLine
->elementAti(i
);
// Looks up the expected-break value for the text position bp: bp is checked against
// the size of textMap, translated through textMap, and the translated index is used
// to read expectedBreaks.
// NOTE(review): the parameter is spelled int here but int32_t in the declaration.
962 int32_t TestParams::getExpectedBreak(int bp
) {
963 if (bp
>= textMap
->size()) {
966 int32_t i
= textMap
->elementAti(bp
);
969 retVal
= expectedBreaks
->elementAti(i
);
// Returns the srcCol entry for the text position bp.
// bp is clamped to the last index of textMap, then stepped downward while looking up
// textMap (per the comment below, to reach a character boundary); the value found in
// textMap is used as the index into srcCol.
// NOTE(review): the parameter is spelled int here but int32_t in the declaration.
975 int32_t TestParams::getSrcCol(int bp
) {
976 if (bp
>= textMap
->size()) {
977 bp
= textMap
->size() - 1;
980 for(; bp
>= 0; --bp
) {
981 // Move bp to a character boundary if we are not on one already.
982 i
= textMap
->elementAti(bp
);
987 return srcCol
->elementAti(i
);
// Runs the break iterator t->bi over t->textToBreak and compares its behaviour with
// the expected data held in t. Five passes are made:
//   1. forward iteration with first()/next(),
//   2. reverse iteration with last()/previous(),
//   3. isBoundary() at every index,
//   4. following() from every index,
//   5. preceding() from every index.
// t->getExpectedBreak(pos) == 0 is treated as "no break expected at pos"; a value of
// -1 is singled out before the tag comparison (that branch body is not visible in this
// excerpt). Mismatches are reported with errln(), including the test file line and
// column from t->getSrcLine() / t->getSrcCol().
//   t      - test parameters: iterator, text and expected results.
//   status - must not be a failure on entry; also receives the result of setText().
991 void RBBITest::executeTest(TestParams
*t
, UErrorCode
&status
) {
996 TEST_ASSERT_SUCCESS(status
);
997 if (U_FAILURE(status
)) {
1001 if (t
->bi
== NULL
) {
1005 t
->bi
->setText(t
->textToBreak
, status
);
1007 // Run the iterator forward
1010 for (bp
= t
->bi
->first(); bp
!= BreakIterator::DONE
; bp
= t
->bi
->next()) {
1012 // Fail for lack of forward progress.
1013 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
1014 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
));
1018 // Check that we didn't miss an expected break between the last one
1020 for (i
=prevBP
+1; i
<bp
; i
++) {
1021 if (t
->getExpectedBreak(i
) != 0) {
1022 int expected
[] = {0, i
};
1023 printStringBreaks(t
->dataToBreak
, expected
, 2);
1024 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1025 i
, t
->getSrcLine(i
), t
->getSrcCol(i
));
1029 // Check that the break we did find was expected
1030 if (t
->getExpectedBreak(bp
) == 0) {
1031 int expected
[] = {0, bp
};
1032 printStringBreaks(t
->textToBreak
, expected
, 2);
1033 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1034 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
));
1036 // The break was expected.
1037 // Check that the {nnn} tag value is correct.
1038 int32_t expectedTagVal
= t
->getExpectedBreak(bp
);
1039 if (expectedTagVal
== -1) {
1042 int32_t line
= t
->getSrcLine(bp
);
1043 int32_t rs
= ((RuleBasedBreakIterator
*)t
->bi
)->getRuleStatus();
1044 if (rs
!= expectedTagVal
) {
1045 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
1046 " Actual, Expected status = %4d, %4d",
1047 bp
, line
, t
->getSrcCol(bp
), rs
, expectedTagVal
);
1054 // Verify that there were no missed expected breaks after the last one found
1055 for (i
=prevBP
+1; i
<utext_nativeLength(t
->textToBreak
); i
++) {
1056 if (t
->getExpectedBreak(i
) != 0) {
1057 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1058 i
, t
->getSrcLine(i
), t
->getSrcCol(i
));
1063 // Run the iterator backwards, verify that the same breaks are found.
1065 prevBP
= utext_nativeLength(t
->textToBreak
)+2; // start with a phony value for the last break pos seen.
1066 for (bp
= t
->bi
->last(); bp
!= BreakIterator::DONE
; bp
= t
->bi
->previous()) {
1068 // Fail for lack of progress.
1069 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
1070 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
));
1074 // Check that we didn't miss an expected break between the last one
1075 // and this one. (UVector returns zeros for index out of bounds.)
1076 for (i
=prevBP
-1; i
>bp
; i
--) {
1077 if (t
->getExpectedBreak(i
) != 0) {
1078 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1079 i
, t
->getSrcLine(i
), t
->getSrcCol(i
));
1083 // Check that the break we did find was expected
1084 if (t
->getExpectedBreak(bp
) == 0) {
1085 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1086 bp
, t
->getSrcLine(bp
), t
->getSrcCol(bp
));
1088 // The break was expected.
1089 // Check that the {nnn} tag value is correct.
1090 int32_t expectedTagVal
= t
->getExpectedBreak(bp
);
1091 if (expectedTagVal
== -1) {
1094 int line
= t
->getSrcLine(bp
);
1095 int32_t rs
= t
->bi
->getRuleStatus();
1096 if (rs
!= expectedTagVal
) {
1097 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
1098 " Actual, Expected status = %4d, %4d",
1099 bp
, line
, t
->getSrcCol(bp
), rs
, expectedTagVal
);
1106 // Verify that there were no missed breaks prior to the last one found
1107 for (i
=prevBP
-1; i
>=0; i
--) {
1108 if (t
->getExpectedBreak(i
) != 0) {
// NOTE(review): the message below says Forward (and misspells Iteration) although this
// check belongs to the reverse-iteration pass; left unchanged because it is runtime text.
1109 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1110 i
, t
->getSrcLine(i
), t
->getSrcCol(i
));
1114 // Check isBoundary()
1115 for (i
=0; i
< utext_nativeLength(t
->textToBreak
); i
++) {
1116 UBool boundaryExpected
= (t
->getExpectedBreak(i
) != 0);
1117 UBool boundaryFound
= t
->bi
->isBoundary(i
);
1118 if (boundaryExpected
!= boundaryFound
) {
1119 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
1120 " Expected, Actual= %s, %s",
1121 i
, t
->getSrcLine(i
), t
->getSrcCol(i
),
1122 boundaryExpected
? "true":"false", boundaryFound
? "true" : "false");
1126 // Check following()
1127 for (i
=0; i
< utext_nativeLength(t
->textToBreak
); i
++) {
1128 int32_t actualBreak
= t
->bi
->following(i
);
1129 int32_t expectedBreak
= BreakIterator::DONE
;
1130 for (int32_t j
=i
+1; j
<= utext_nativeLength(t
->textToBreak
); j
++) {
1131 if (t
->getExpectedBreak(j
) != 0) {
1136 if (expectedBreak
!= actualBreak
) {
1137 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
1138 " Expected, Actual= %d, %d",
1139 i
, t
->getSrcLine(i
), t
->getSrcCol(i
), expectedBreak
, actualBreak
);
1143 // Check preceding()
1144 for (i
=utext_nativeLength(t
->textToBreak
); i
>=0; i
--) {
1145 int32_t actualBreak
= t
->bi
->preceding(i
);
1146 int32_t expectedBreak
= BreakIterator::DONE
;
1148 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
1149 // preceding(trailing byte) will return the index of some preceding code point,
1150 // not the lead byte of the current code point, even though that has a smaller index.
1151 // Therefore, start looking at the expected break data not at i-1, but at
1152 // the start of code point index - 1.
1153 utext_setNativeIndex(t
->textToBreak
, i
);
1154 int32_t j
= utext_getNativeIndex(t
->textToBreak
) - 1;
1155 for (; j
>= 0; j
--) {
1156 if (t
->getExpectedBreak(j
) != 0) {
1161 if (expectedBreak
!= actualBreak
) {
1162 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
1163 " Expected, Actual= %d, %d",
1164 i
, t
->getSrcLine(i
), t
->getSrcCol(i
), expectedBreak
, actualBreak
);
// TestExtended: data-driven break iterator test.
// Reads "rbbitst.txt" from the source test data directory as UTF-8 and scans it
// one character at a time with a small state machine (PARSE_TAG, PARSE_COMMENT,
// PARSE_DATA, PARSE_NUM).  Tags such as <word>, <char>, <line>, <sent>, <title>
// and <locale xx> select the break iterator; text between <data> and </data> is
// accumulated into tp.dataToBreak together with the expected break positions and
// the source line/column of every data character.  At </data> the accumulated
// case is passed to executeTest().
1170 void RBBITest::TestExtended() {
1171 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1172 UErrorCode status
= U_ZERO_ERROR
;
1175 UnicodeString rules
;
1176 TestParams
tp(status
);
// Matcher for the <locale name> tag; capture group 1 is the locale name.
1178 RegexMatcher
localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@=-]*) *>"), 0, status
);
1179 if (U_FAILURE(status
)) {
1180 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__
, __LINE__
, u_errorName(status
));
1185 // Open and read the test data file.
1187 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
1188 char testFileName
[1000];
1189 if (testDataDirectory
== NULL
|| strlen(testDataDirectory
) >= sizeof(testFileName
)) {
1190 errln("Can't open test data. Path too long.");
// NOTE(review): the length check above covers only the directory part; the
// file name appended by strcat() below is not included in it.
1193 strcpy(testFileName
, testDataDirectory
);
1194 strcat(testFileName
, "rbbitst.txt");
1197 UChar
*testFile
= ReadAndConvertFile(testFileName
, len
, "UTF-8", status
);
1198 if (U_FAILURE(status
)) {
1199 return; /* something went wrong, error already output */
1206 // Put the test data into a UnicodeString
1208 UnicodeString
testString(FALSE
, testFile
, len
);
1216 parseState
= PARSE_TAG
;
// savedState is the state that is restored when a comment ends (see the
// PARSE_COMMENT handling below).
1218 EParseState savedState
= PARSE_TAG
;
1220 static const UChar CH_LF
= 0x0a;
1221 static const UChar CH_CR
= 0x0d;
1222 static const UChar CH_HASH
= 0x23;
1223 /*static const UChar CH_PERIOD = 0x2e;*/
1224 static const UChar CH_LT
= 0x3c;
1225 static const UChar CH_GT
= 0x3e;
1226 static const UChar CH_BACKSLASH
= 0x5c;
1227 static const UChar CH_BULLET
= 0x2022;
1229 int32_t lineNum
= 1;
1230 int32_t colStart
= 0;
1232 int32_t charIdx
= 0;
1234 int32_t tagValue
= 0; // The numeric value of a <nnn> tag.
// Main scan loop.  The for statement has no increment clause; charIdx is
// advanced inside the loop body.
1236 for (charIdx
= 0; charIdx
< len
; ) {
1237 status
= U_ZERO_ERROR
;
1238 UChar c
= testString
.charAt(charIdx
);
1240 if (c
== CH_CR
&& charIdx
<len
&& testString
.charAt(charIdx
) == CH_LF
) {
1241 // treat CRLF as a unit
1245 if (c
== CH_LF
|| c
== CH_CR
) {
// Column of the current character relative to the start of its line.
1249 column
= charIdx
- colStart
+ 1;
1251 switch (parseState
) {
1253 if (c
== 0x0a || c
== 0x0d) {
1254 parseState
= savedState
;
1261 parseState
= PARSE_COMMENT
;
1262 savedState
= PARSE_TAG
;
1265 if (u_isUWhiteSpace(c
)) {
// The tag comparisons below start at charIdx-1.
1268 if (testString
.compare(charIdx
-1, 6, "<word>") == 0) {
1270 tp
.bi
= BreakIterator::createWordInstance(locale
, status
);
1274 if (testString
.compare(charIdx
-1, 6, "<char>") == 0) {
1276 tp
.bi
= BreakIterator::createCharacterInstance(locale
, status
);
1280 if (testString
.compare(charIdx
-1, 6, "<line>") == 0) {
1282 tp
.bi
= BreakIterator::createLineInstance(locale
, status
);
1286 if (testString
.compare(charIdx
-1, 6, "<sent>") == 0) {
1289 tp
.bi
= BreakIterator::createSentenceInstance(locale
, status
);
1293 if (testString
.compare(charIdx
-1, 7, "<title>") == 0) {
1295 tp
.bi
= BreakIterator::createTitleInstance(locale
, status
);
1300 // <locale loc_name>
1301 localeMatcher
.reset(testString
);
1302 if (localeMatcher
.lookingAt(charIdx
-1, status
)) {
1303 UnicodeString localeName
= localeMatcher
.group(1, status
);
1304 char localeName8
[100];
1305 localeName
.extract(0, localeName
.length(), localeName8
, sizeof(localeName8
), 0);
1306 locale
= Locale::createFromName(localeName8
);
// Skip over the remainder of the matched <locale ...> tag.
1307 charIdx
+= localeMatcher
.group(0, status
).length() - 1;
1308 TEST_ASSERT_SUCCESS(status
);
1311 if (testString
.compare(charIdx
-1, 6, "<data>") == 0) {
1312 parseState
= PARSE_DATA
;
// Start of a new test case: discard everything accumulated for the previous one.
1314 tp
.dataToBreak
= "";
1315 tp
.expectedBreaks
->removeAllElements();
1316 tp
.srcCol
->removeAllElements();
1317 tp
.srcLine
->removeAllElements();
1321 errln("line %d: Tag expected in test file.", lineNum
);
1322 parseState
= PARSE_COMMENT
;
1323 savedState
= PARSE_DATA
;
1324 goto end_test
; // Stop the test.
// A bullet character in the data marks an expected break with value -1 at the
// current end of the accumulated data.
1329 if (c
== CH_BULLET
) {
1330 int32_t breakIdx
= tp
.dataToBreak
.length();
1331 tp
.expectedBreaks
->setSize(breakIdx
+1);
1332 tp
.expectedBreaks
->setElementAt(-1, breakIdx
);
1333 tp
.srcLine
->setSize(breakIdx
+1);
1334 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1335 tp
.srcCol
->setSize(breakIdx
+1);
1336 tp
.srcCol
->setElementAt(column
, breakIdx
);
1340 if (testString
.compare(charIdx
-1, 7, "</data>") == 0) {
1341 // Add final entry to mappings from break location to source file position.
1342 // Need one extra because last break position returned is after the
1343 // last char in the data, not at the last char.
1344 tp
.srcLine
->addElement(lineNum
, status
);
1345 tp
.srcCol
->addElement(column
, status
);
1347 parseState
= PARSE_TAG
;
// Run the completed test case, first over UTF-16 text.
1351 status
= U_ZERO_ERROR
;
1352 tp
.setUTF16(status
);
1353 executeTest(&tp
, status
);
1354 TEST_ASSERT_SUCCESS(status
);
1356 // Run again, this time with UTF-8 text wrapped in a UText.
1357 status
= U_ZERO_ERROR
;
1359 TEST_ASSERT_SUCCESS(status
);
1360 executeTest(&tp
, status
);
1364 if (testString
.compare(charIdx
-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1365 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1366 // Get the code point from the name and insert it into the test data.
1367 // (Damn, no API takes names in Unicode !!!
1368 // we've got to take it back to char *)
1369 int32_t nameEndIdx
= testString
.indexOf((UChar
)0x7d/*'}'*/, charIdx
);
1370 int32_t nameLength
= nameEndIdx
- (charIdx
+2);
1371 char charNameBuf
[200];
1372 UChar32 theChar
= -1;
1373 if (nameEndIdx
!= -1) {
// This local status shadows the function-level one for the name lookup.
1374 UErrorCode status
= U_ZERO_ERROR
;
1375 testString
.extract(charIdx
+2, nameLength
, charNameBuf
, sizeof(charNameBuf
));
1376 charNameBuf
[sizeof(charNameBuf
)-1] = 0;
1377 theChar
= u_charFromName(U_UNICODE_CHAR_NAME
, charNameBuf
, &status
);
1378 if (U_FAILURE(status
)) {
1382 if (theChar
== -1) {
1383 errln("Error in named character in test file at line %d, col %d",
1386 // Named code point was recognized. Insert it
1387 // into the test data.
1388 tp
.dataToBreak
.append(theChar
);
// Pad the line/column maps until they are as long as the data; a single
// append may add more than one code unit.
1389 while (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1390 tp
.srcLine
->addElement(lineNum
, status
);
1391 tp
.srcCol
->addElement(column
, status
);
1394 if (nameEndIdx
> charIdx
) {
1395 charIdx
= nameEndIdx
+1;
// "<>" marks an expected break with value -1, recorded the same way as a bullet.
1404 if (testString
.compare(charIdx
-1, 2, "<>") == 0) {
1406 int32_t breakIdx
= tp
.dataToBreak
.length();
1407 tp
.expectedBreaks
->setSize(breakIdx
+1);
1408 tp
.expectedBreaks
->setElementAt(-1, breakIdx
);
1409 tp
.srcLine
->setSize(breakIdx
+1);
1410 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1411 tp
.srcCol
->setSize(breakIdx
+1);
1412 tp
.srcCol
->setElementAt(column
, breakIdx
);
1418 parseState
= PARSE_NUM
;
1422 if (c
== CH_HASH
&& column
==3) { // TODO: why is column off so far?
1423 parseState
= PARSE_COMMENT
;
1424 savedState
= PARSE_DATA
;
1428 if (c
== CH_BACKSLASH
) {
1429 // Check for \ at end of line, a line continuation.
1430 // Advance over (discard) the newline
1431 UChar32 cp
= testString
.char32At(charIdx
);
1432 if (cp
== CH_CR
&& charIdx
<len
&& testString
.charAt(charIdx
+1) == CH_LF
) {
1434 // Need an extra increment of the input ptr to move over both of them
1437 if (cp
== CH_LF
|| cp
== CH_CR
) {
1444 // Let unescape handle the back slash.
1445 cp
= testString
.unescapeAt(charIdx
);
1447 // Escape sequence was recognized. Insert the char
1448 // into the test data.
1449 tp
.dataToBreak
.append(cp
);
1450 while (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1451 tp
.srcLine
->addElement(lineNum
, status
);
1452 tp
.srcCol
->addElement(column
, status
);
1458 // Not a recognized backslash escape sequence.
1459 // Take the next char as a literal.
1460 // TODO: Should this be an error?
1461 c
= testString
.charAt(charIdx
);
1462 charIdx
= testString
.moveIndex32(charIdx
, 1);
1465 // Normal, non-escaped data char.
1466 tp
.dataToBreak
.append(c
);
1468 // Save the mapping from offset in the data to line/column numbers in
1469 // the original input file. Will be used for better error messages only.
1470 // If there's an expected break before this char, the slot in the mapping
1471 // vector will already be set for this char; don't overwrite it.
1472 if (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1473 tp
.srcLine
->addElement(lineNum
, status
);
1474 tp
.srcCol
->addElement(column
, status
);
1480 // We are parsing an expected numeric tag value, like <1234>,
1481 // within a chunk of data.
1482 if (u_isUWhiteSpace(c
)) {
1487 // Finished the number. Add the info to the expected break data,
1488 // and switch parse state back to doing plain data.
1489 parseState
= PARSE_DATA
;
1490 if (tagValue
== 0) {
1493 int32_t breakIdx
= tp
.dataToBreak
.length();
1494 tp
.expectedBreaks
->setSize(breakIdx
+1);
1495 tp
.expectedBreaks
->setElementAt(tagValue
, breakIdx
);
1496 tp
.srcLine
->setSize(breakIdx
+1);
1497 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1498 tp
.srcCol
->setSize(breakIdx
+1);
1499 tp
.srcCol
->setElementAt(column
, breakIdx
);
// Accumulate the next decimal digit of the tag value.
1504 tagValue
= tagValue
*10 + u_charDigitValue(c
);
1508 errln("Syntax Error in test file at line %d, col %d",
1510 parseState
= PARSE_COMMENT
;
1511 goto end_test
; // Stop the test
// Any ICU error raised while handling this character aborts the whole test.
1516 if (U_FAILURE(status
)) {
1517 dataerrln("ICU Error %s while parsing test file at line %d.",
1518 u_errorName(status
), lineNum
);
1519 status
= U_ZERO_ERROR
;
1520 goto end_test
; // Stop the test
1531 //-------------------------------------------------------------------------------
1533 // TestDictRules create a break iterator from source rules that includes a
1534 // dictionary range. Regression for bug #7130. Source rules
1535 // do not declare a break iterator type (word, line, sentence, etc.),
1536 // but the dictionary code, without a type, would loop.
1538 //-------------------------------------------------------------------------------
// Builds a RuleBasedBreakIterator directly from rule source that defines a
// $dictionary set, then steps through the two-character text "aa" with next().
1539 void RBBITest::TestDictRules() {
1540 const char *rules
= "$dictionary = [a-z]; \n"
1542 "$dictionary $dictionary; \n"
1544 "$dictionary $dictionary; \n";
1545 const char *text
= "aa";
1546 UErrorCode status
= U_ZERO_ERROR
;
1547 UParseError parseError
;
1549 RuleBasedBreakIterator
bi(rules
, parseError
, status
);
1550 if (U_SUCCESS(status
)) {
1551 UnicodeString utext
= text
;
// The loop is capped at 10 passes so that an iterator which never reports
// DONE cannot hang the test.
1555 for (loops
= 0; loops
<10; loops
++) {
1556 position
= bi
.next();
1557 if (position
== RuleBasedBreakIterator::DONE
) {
// Exactly one next() call is expected to succeed before DONE is reported.
1561 TEST_ASSERT(loops
== 1);
1563 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status
));
1569 //-------------------------------------------------------------------------------
1571 // ReadAndConvertFile Read a text data file, convert it to UChars, and
1572 // return the data in one big UChar * buffer, which the caller must delete.
1575 // fileName: the name of the file, with no directory part. The test data directory
1577 // ulen an out parameter, receives the actual length (in UChars) of the file data.
1578 // encoding The file encoding. If the file contains a BOM, that will override the encoding
1579 // specified here. The BOM, if it exists, will be stripped from the returned data.
1580 // Pass NULL for the system default encoding.
1583 // The file data, converted to UChar.
1584 // The caller must delete this when done with
1585 // delete [] theBuffer;
1587 // TODO: This is a clone of RegexTest::ReadAndConvertFile.
1588 // Move this function to some common place.
1590 //--------------------------------------------------------------------------------
// Reads the whole of fileName in binary mode, detects and skips a Unicode
// signature (BOM) if one is present, and converts the remaining bytes to UChars
// with a converter opened for `encoding` (or for the BOM's encoding when a BOM
// is found).  The converted length is stored in ulen and failures are reported
// through status.
1591 UChar
*RBBITest::ReadAndConvertFile(const char *fileName
, int &ulen
, const char *encoding
, UErrorCode
&status
) {
1592 UChar
*retPtr
= NULL
;
1593 char *fileBuf
= NULL
;
1594 UConverter
* conv
= NULL
;
1598 if (U_FAILURE(status
)) {
1605 f
= fopen(fileName
, "rb");
1607 dataerrln("Error opening test data file %s\n", fileName
);
1608 status
= U_FILE_ACCESS_ERROR
;
// Determine the file size by seeking to the end, then read it all in one call.
// NOTE(review): the ftell() result is used as the allocation size before the
// `fileSize <= 0` check further down.
1617 fseek( f
, 0, SEEK_END
);
1618 fileSize
= ftell(f
);
1619 fileBuf
= new char[fileSize
];
1620 fseek(f
, 0, SEEK_SET
);
1621 amt_read
= fread(fileBuf
, 1, fileSize
, f
);
1622 if (amt_read
!= fileSize
|| fileSize
<= 0) {
1623 errln("Error reading test data file.");
1624 goto cleanUpAndReturn
;
1628 // Look for a Unicode Signature (BOM) on the data just read
1630 int32_t signatureLength
;
1631 const char * fileBufC
;
1632 const char* bomEncoding
;
1635 bomEncoding
= ucnv_detectUnicodeSignature(
1636 fileBuf
, fileSize
, &signatureLength
, &status
);
// A BOM was found: skip past it and let its encoding override the caller's.
1637 if(bomEncoding
!=NULL
){
1638 fileBufC
+= signatureLength
;
1639 fileSize
-= signatureLength
;
1640 encoding
= bomEncoding
;
1644 // Open a converter to take the rule file to UTF-16
1646 conv
= ucnv_open(encoding
, &status
);
1647 if (U_FAILURE(status
)) {
1648 goto cleanUpAndReturn
;
1652 // Convert the rules to UChar.
1653 // Preflight first to determine required buffer size.
1655 ulen
= ucnv_toUChars(conv
,
1661 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
1662 // Buffer Overflow is expected from the preflight operation.
1663 status
= U_ZERO_ERROR
;
// Allocate one UChar more than the preflighted length.
1665 retPtr
= new UChar
[ulen
+1];
1678 if (U_FAILURE(status
)) {
1679 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
1689 //--------------------------------------------------------------------------------------------
1691 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
1693 //-------------------------------------------------------------------------------------------
// Runs the four Unicode Consortium boundary test files (GraphemeBreakTest.txt,
// WordBreakTest.txt, SentenceBreakTest.txt, LineBreakTest.txt), each against the
// matching English-locale break iterator.  A data file is run only when its
// iterator was created successfully.
1694 void RBBITest::TestUnicodeFiles() {
1695 RuleBasedBreakIterator
*bi
;
1696 UErrorCode status
= U_ZERO_ERROR
;
// Character (grapheme cluster) boundaries.
1698 bi
= (RuleBasedBreakIterator
*)BreakIterator::createCharacterInstance(Locale::getEnglish(), status
);
1699 TEST_ASSERT_SUCCESS(status
);
1700 if (U_SUCCESS(status
)) {
1701 runUnicodeTestData("GraphemeBreakTest.txt", bi
);
// Word boundaries.
1705 bi
= (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getEnglish(), status
);
1706 TEST_ASSERT_SUCCESS(status
);
1707 if (U_SUCCESS(status
)) {
1708 runUnicodeTestData("WordBreakTest.txt", bi
);
// Sentence boundaries.
1712 bi
= (RuleBasedBreakIterator
*)BreakIterator::createSentenceInstance(Locale::getEnglish(), status
);
1713 TEST_ASSERT_SUCCESS(status
);
1714 if (U_SUCCESS(status
)) {
1715 runUnicodeTestData("SentenceBreakTest.txt", bi
);
// Line break opportunities.
1719 bi
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale::getEnglish(), status
);
1720 TEST_ASSERT_SUCCESS(status
);
1721 if (U_SUCCESS(status
)) {
1722 runUnicodeTestData("LineBreakTest.txt", bi
);
1728 // Check for test cases from the Unicode test data files that are known to fail
1729 // and should be skipped because ICU is not yet able to fully implement the spec.
1730 // See ticket #7270.
// Compares testCase against a fixed table of NUL-terminated UChar strings and,
// on a match, returns the result of logKnownIssue("7270").  The table is
// consulted only when fileName is "LineBreakTest.txt".
1732 UBool
RBBITest::testCaseIsKnownIssue(const UnicodeString
&testCase
, const char *fileName
) {
// Each entry is three code units followed by a terminating zero.
1733 static const UChar badTestCases
[][4] = { // Line Numbers from Unicode 7.0.0 file.
1734 {(UChar
)0x200B, (UChar
)0x0020, (UChar
)0x007D, (UChar
)0x0000}, // Line 5198
1735 {(UChar
)0x200B, (UChar
)0x0020, (UChar
)0x0029, (UChar
)0x0000}, // Line 5202
1736 {(UChar
)0x200B, (UChar
)0x0020, (UChar
)0x0021, (UChar
)0x0000}, // Line 5214
1737 {(UChar
)0x200B, (UChar
)0x0020, (UChar
)0x002c, (UChar
)0x0000}, // Line 5246
1738 {(UChar
)0x200B, (UChar
)0x0020, (UChar
)0x002f, (UChar
)0x0000}, // Line 5298
1739 {(UChar
)0x200B, (UChar
)0x0020, (UChar
)0x2060, (UChar
)0x0000} // Line 5302
// Files other than LineBreakTest.txt take this branch.
1741 if (strcmp(fileName
, "LineBreakTest.txt") != 0) {
1745 for (int i
=0; i
<UPRV_LENGTHOF(badTestCases
); i
++) {
1746 if (testCase
== UnicodeString(badTestCases
[i
])) {
1747 return logKnownIssue("7270");
1754 //--------------------------------------------------------------------------------------------
1756 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
1758 //-------------------------------------------------------------------------------------------
// Reads one Unicode Consortium boundary test file (fileName, resolved against
// the source test data directory) as UTF-8 and tokenizes it with a regular
// expression.  Hex code points are appended to testString, divide signs record
// expected break positions in breakPositions, and at each end of line the
// accumulated case is passed to checkUnicodeTestCase() unless
// testCaseIsKnownIssue() reports it as a known issue.
1759 void RBBITest::runUnicodeTestData(const char *fileName
, RuleBasedBreakIterator
*bi
) {
1760 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1761 UErrorCode status
= U_ZERO_ERROR
;
1764 // Open and read the test data file, put it into a UnicodeString.
1766 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
1767 char testFileName
[1000];
1768 if (testDataDirectory
== NULL
|| strlen(testDataDirectory
) >= sizeof(testFileName
)) {
1769 dataerrln("Can't open test data. Path too long.");
// NOTE(review): the length check above covers only the directory part; the
// file name appended by strcat() below is not included in it.
1772 strcpy(testFileName
, testDataDirectory
);
1773 strcat(testFileName
, fileName
);
1775 logln("Opening data file %s\n", fileName
);
1778 UChar
*testFile
= ReadAndConvertFile(testFileName
, len
, "UTF-8", status
);
// U_FILE_ACCESS_ERROR is excluded from the assertions below, so it is not
// reported a second time here.
1779 if (status
!= U_FILE_ACCESS_ERROR
) {
1780 TEST_ASSERT_SUCCESS(status
);
1781 TEST_ASSERT(testFile
!= NULL
);
1783 if (U_FAILURE(status
) || testFile
== NULL
) {
1784 return; /* something went wrong, error already output */
1786 UnicodeString
testFileAsString(TRUE
, testFile
, len
);
1789 // Parse the test data file using a regular expression.
1790 // Each kind of token is recognized in its own capture group; what type of item was scanned
1791 // is identified by which group had a match.
1793 // Capture Group # 1 2 3 4 5
1794 // Parses this item: divide x hex digits comment \n unrecognized \n
1796 UnicodeString
tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV
);
1797 RegexMatcher
tokenMatcher(tokenExpr
, testFileAsString
, UREGEX_MULTILINE
| UREGEX_DOTALL
, status
);
1798 UnicodeString testString
;
1799 UVector32
breakPositions(status
);
1801 TEST_ASSERT_SUCCESS(status
);
1802 if (U_FAILURE(status
)) {
1807 // Scan through each test case, building up the string to be broken in testString,
1808 // and the positions that should be boundaries in the breakPositions vector.
1811 while (tokenMatcher
.find()) {
1812 if(tokenMatcher
.hitEnd()) {
1813 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for.
1814 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1815 and caused an infinite loop here on EBCDIC systems!
1817 fprintf(stderr
,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName
, ++spin
);
1820 if (tokenMatcher
.start(1, status
) >= 0) {
1821 // Scanned a divide sign, indicating a break position in the test data.
1822 if (testString
.length()>0) {
1823 breakPositions
.addElement(testString
.length(), status
);
1826 else if (tokenMatcher
.start(2, status
) >= 0) {
1827 // Scanned an 'x', meaning no break at this position in the test data
1828 // Nothing to be done here.
1830 else if (tokenMatcher
.start(3, status
) >= 0) {
1831 // Scanned Hex digits. Convert them to binary, append to the character data string.
1832 const UnicodeString
&hexNumber
= tokenMatcher
.group(3, status
);
1833 int length
= hexNumber
.length();
1836 hexNumber
.extract (0, length
, buf
, sizeof(buf
), US_INV
);
1837 UChar32 c
= (UChar32
)strtol(buf
, NULL
, 16);
1839 testString
.append(c
);
1841 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1842 fileName
, lineNumber
);
1845 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1846 fileName
, lineNumber
);
1849 else if (tokenMatcher
.start(4, status
) >= 0) {
1850 // Scanned to end of a line, possibly skipping over a comment in the process.
1851 // If the line from the file contained test data, run the test now.
1852 if (testString
.length() > 0 && !testCaseIsKnownIssue(testString
, fileName
)) {
1853 checkUnicodeTestCase(fileName
, lineNumber
, testString
, &breakPositions
, bi
);
1856 // Clear out this test case.
1857 // The string and breakPositions vector will be refilled as the next
1858 // test case is parsed.
1859 testString
.remove();
1860 breakPositions
.removeAllElements();
1863 // Scanner catchall. Something unrecognized appeared on the line.
1865 UnicodeString uToken
= tokenMatcher
.group(0, status
);
1866 uToken
.extract(0, uToken
.length(), token
, (uint32_t)sizeof(token
));
1867 token
[sizeof(token
)-1] = 0;
1868 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName
, lineNumber
, token
);
1870 // Clean up, in preparation for continuing with the next line.
1871 testString
.remove();
1872 breakPositions
.removeAllElements();
1875 TEST_ASSERT_SUCCESS(status
);
1876 if (U_FAILURE(status
)) {
1882 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1885 //--------------------------------------------------------------------------------------------
1887 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1888 // test data files. Do only a simple, forward-only check -
1889 // this test is mostly to check that ICU and the Unicode
1890 // data agree with each other.
1892 //--------------------------------------------------------------------------------------------
// Sets testString on bi and compares the break positions it reports against
// breakPositions, logging an error for each unexpected or missing break.
// testFileName and lineNumber are used only in the error messages.
1893 void RBBITest::checkUnicodeTestCase(const char *testFileName
, int lineNumber
,
1894 const UnicodeString
&testString
, // Text data to be broken
1895 UVector32
*breakPositions
, // Positions where breaks should be found.
1896 RuleBasedBreakIterator
*bi
) {
1897 int32_t pos
; // Break Position in the test string
1898 int32_t expectedI
= 0; // Index of expected break position in the vector of expected results.
1899 int32_t expectedPos
; // Expected break position (index into test string)
1901 bi
->setText(testString
);
1905 while (pos
!= BreakIterator::DONE
) {
// The iterator produced a break after all expected positions were consumed.
1906 if (expectedI
>= breakPositions
->size()) {
1907 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1908 testFileName
, lineNumber
, pos
);
1911 expectedPos
= breakPositions
->elementAti(expectedI
);
// A break reported before the next expected position is an extra break.
1912 if (pos
< expectedPos
) {
1913 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1914 testFileName
, lineNumber
, pos
);
// A break reported beyond the next expected position means that position was skipped.
1917 if (pos
> expectedPos
) {
1918 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1919 testFileName
, lineNumber
, expectedPos
);
// The iterator finished while expected positions were still outstanding.
1926 if (pos
==BreakIterator::DONE
&& expectedI
<breakPositions
->size()) {
1927 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1928 testFileName
, lineNumber
, breakPositions
->elementAti(expectedI
));
1934 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1935 //---------------------------------------------------------------------------------------
1937 // class RBBIMonkeyKind
1939 // Monkey Test for Break Iteration
1940 // Abstract interface class. Concrete derived classes independently
1941 // implement the break rules for different iterator types.
1943 // The Monkey Test itself doesn't know which type of break iterator it is
1944 // testing, but works purely in terms of the interface defined here.
1946 //---------------------------------------------------------------------------------------
// Abstract interface declaring the operations a monkey-test break
// implementation provides: charClasses(), setText() and next().
1947 class RBBIMonkeyKind
{
1949 // Return a UVector of UnicodeSets, representing the character classes used
1950 // for this type of iterator.
1951 virtual UVector
*charClasses() = 0;
1953 // Set the test text on which subsequent calls to next() will operate
1954 virtual void setText(const UnicodeString
&s
) = 0;
1956 // Find the next break position, starting from the prev break position, or from zero.
1957 // Return -1 after reaching end of string.
1958 virtual int32_t next(int32_t i
) = 0;
1960 virtual ~RBBIMonkeyKind();
// Error status recorded for later inspection; the base constructor sets it to
// U_ZERO_ERROR.
1961 UErrorCode deferredStatus
;
// Base constructor: starts with no deferred error recorded.
1970 RBBIMonkeyKind::RBBIMonkeyKind() {
1971 deferredStatus
= U_ZERO_ERROR
;
// Out-of-line definition of the virtual destructor declared in the class.
1974 RBBIMonkeyKind::~RBBIMonkeyKind() {
1978 //----------------------------------------------------------------------------------------
1980 // Random Numbers. Similar to standard lib rand() and srand()
1981 // Not using library to
1982 // 1. Get same results on all platforms.
1983 // 2. Get access to current seed, to more easily reproduce failures.
1985 //---------------------------------------------------------------------------------------
// Current generator state; file-local.
1986 static uint32_t m_seed
= 1;
// Advances m_seed with a linear congruential step (multiplier 1103515245,
// increment 12345) and returns a value in the range 0..32767 derived from it.
1988 static uint32_t m_rand()
1990 m_seed
= m_seed
* 1103515245 + 12345;
1991 return (uint32_t)(m_seed
/65536) % 32768;
1995 //------------------------------------------------------------------------------------------
1997 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
1998 // of RBBIMonkeyKind.
2000 //------------------------------------------------------------------------------------------
// RBBIMonkeyKind subclass for character (grapheme cluster) boundaries.  Holds
// one UnicodeSet per character class used by its next() implementation.
2001 class RBBICharMonkey
: public RBBIMonkeyKind
{
2004 virtual ~RBBICharMonkey();
2005 virtual UVector
*charClasses();
2006 virtual void setText(const UnicodeString
&s
);
2007 virtual int32_t next(int32_t i
);
// Character class sets; the constructor allocates these with new.
2011 UnicodeSet
*fCRLFSet
;
2012 UnicodeSet
*fControlSet
;
2013 UnicodeSet
*fExtendSet
;
2014 UnicodeSet
*fRegionalIndicatorSet
;
2015 UnicodeSet
*fPrependSet
;
2016 UnicodeSet
*fSpacingSet
;
2021 UnicodeSet
*fLVTSet
;
2022 UnicodeSet
*fHangulSet
;
2023 UnicodeSet
*fAnySet
;
// Text that next() reads from.
2025 const UnicodeString
*fText
;
// Builds one UnicodeSet per Grapheme_Cluster_Break property value, unions the
// L, V, T, LV and LVT sets into fHangulSet, and registers the resulting
// character classes in fSets.  A failing status is copied to deferredStatus.
2029 RBBICharMonkey::RBBICharMonkey() {
2030 UErrorCode status
= U_ZERO_ERROR
;
2034 fCRLFSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status
);
2035 fControlSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status
);
2036 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status
);
2037 fRegionalIndicatorSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status
);
2038 fPrependSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status
);
2039 fSpacingSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status
);
2040 fLSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status
);
2041 fVSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status
);
2042 fTSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status
);
2043 fLVSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status
);
2044 fLVTSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status
);
// fHangulSet is the union of the five Hangul-related sets above.
2045 fHangulSet
= new UnicodeSet();
2046 fHangulSet
->addAll(*fLSet
);
2047 fHangulSet
->addAll(*fVSet
);
2048 fHangulSet
->addAll(*fTSet
);
2049 fHangulSet
->addAll(*fLVSet
);
2050 fHangulSet
->addAll(*fLVTSet
);
// Every code point, U+0000 through U+10FFFF.
2051 fAnySet
= new UnicodeSet(0, 0x10ffff);
2053 fSets
= new UVector(status
);
2054 fSets
->addElement(fCRLFSet
, status
);
2055 fSets
->addElement(fControlSet
, status
);
2056 fSets
->addElement(fExtendSet
, status
);
2057 fSets
->addElement(fRegionalIndicatorSet
, status
);
// The Prepend set is registered only when it is non-empty.
2058 if (!fPrependSet
->isEmpty()) {
2059 fSets
->addElement(fPrependSet
, status
);
2061 fSets
->addElement(fSpacingSet
, status
);
2062 fSets
->addElement(fHangulSet
, status
);
2063 fSets
->addElement(fAnySet
, status
);
// Constructors cannot return an error, so a failure is stored for later.
2064 if (U_FAILURE(status
)) {
2065 deferredStatus
= status
;
// Implements RBBIMonkeyKind::setText for the character monkey.
// NOTE(review): the body is not visible in this view; presumably it stores &s
// in fText - confirm.
2070 void RBBICharMonkey::setText(const UnicodeString
&s
) {
// Reference implementation of the grapheme cluster boundary rules for the
// monkey test.  Starting at prevPos, it walks fText one code point at a time
// and tests the candidate position before p2 against rules GB4 through GB10
// using the character class sets.
2076 int32_t RBBICharMonkey::next(int32_t prevPos
) {
2077 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2078 // break position being tested. The candidate break
2079 // location is before p2.
2083 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
2085 if (U_FAILURE(deferredStatus
)) {
2089 // Previous break at end of string. return DONE.
2090 if (prevPos
>= fText
->length()) {
// Start with all four positions at prevPos.
2093 p0
= p1
= p2
= p3
= prevPos
;
2094 c3
= fText
->char32At(prevPos
);
2096 (void)p0
; // suppress set but not used warning.
2099 // Loop runs once per "significant" character position in the input text.
2101 // Move all of the positions forward in the input string.
2106 // Advance p3 by one codepoint
2107 p3
= fText
->moveIndex32(p3
, 1);
2108 c3
= fText
->char32At(p3
);
2111 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2114 if (p2
== fText
->length()) {
2115 // Reached end of string. Always a break position.
2120 // No Extend or Format characters may appear between the CR and LF,
2121 // which requires the additional check for p2 immediately following p1.
2123 if (c1
==0x0D && c2
==0x0A && p1
==(p2
-1)) {
2127 // Rule (GB4). ( Control | CR | LF ) <break>
2128 if (fControlSet
->contains(c1
) ||
2134 // Rule (GB5) <break> ( Control | CR | LF )
2136 if (fControlSet
->contains(c2
) ||
2143 // Rule (GB6) L x ( L | V | LV | LVT )
2144 if (fLSet
->contains(c1
) &&
2145 (fLSet
->contains(c2
) ||
2146 fVSet
->contains(c2
) ||
2147 fLVSet
->contains(c2
) ||
2148 fLVTSet
->contains(c2
))) {
2152 // Rule (GB7) ( LV | V ) x ( V | T )
2153 if ((fLVSet
->contains(c1
) || fVSet
->contains(c1
)) &&
2154 (fVSet
->contains(c2
) || fTSet
->contains(c2
))) {
2158 // Rule (GB8) ( LVT | T) x T
2159 if ((fLVTSet
->contains(c1
) || fTSet
->contains(c1
)) &&
2160 fTSet
->contains(c2
)) {
2164 // Just adding the extra Apple rule here does not work; behavior depends on arbitrary context
2166 // Rule (GB8a) Regional_Indicator x Regional_Indicator
2167 if (fRegionalIndicatorSet
->contains(c1
) && fRegionalIndicatorSet
->contains(c2
)) {
2171 // Rule (GB9) x Extend
2172 if (fExtendSet
->contains(c2
)) {
2176 // Rule (GB9a) x SpacingMark
2177 if (fSpacingSet
->contains(c2
)) {
2181 // Rule (GB9b) Prepend x
2182 if (fPrependSet
->contains(c1
)) {
2186 // Rule (GB10) Any <break> Any
// Implements RBBIMonkeyKind::charClasses for the character monkey.
// NOTE(review): the body is not visible in this view; presumably it returns
// fSets - confirm.
2196 UVector
*RBBICharMonkey::charClasses() {
// Destructor: deletes fRegionalIndicatorSet.
// NOTE(review): only one delete statement is visible in this view - confirm
// the other sets allocated by the constructor are released as well.
2201 RBBICharMonkey::~RBBICharMonkey() {
2206 delete fRegionalIndicatorSet
;
2218 //------------------------------------------------------------------------------------------
2220 // class RBBIWordMonkey Word Break specific implementation
2221 // of RBBIMonkeyKind.
2223 //------------------------------------------------------------------------------------------
// RBBIMonkeyKind subclass for word boundaries.  Holds one UnicodeSet pointer
// per character class used by the word-break monkey test.
2224 class RBBIWordMonkey
: public RBBIMonkeyKind
{
2227 virtual ~RBBIWordMonkey();
2228 virtual UVector
*charClasses();
2229 virtual void setText(const UnicodeString
&s
);
2230 virtual int32_t next(int32_t i
);
// Character class sets.
2236 UnicodeSet
*fNewlineSet
;
2237 UnicodeSet
*fRegionalIndicatorSet
;
2238 UnicodeSet
*fKatakanaSet
;
2239 UnicodeSet
*fHebrew_LetterSet
;
2240 UnicodeSet
*fALetterSet
;
2241 // TODO(jungshik): Do we still need this change?
2242 // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt
2243 UnicodeSet
*fSingle_QuoteSet
;
2244 UnicodeSet
*fDouble_QuoteSet
;
2245 UnicodeSet
*fMidNumLetSet
;
2246 UnicodeSet
*fMidLetterSet
;
2247 UnicodeSet
*fMidNumSet
;
2248 UnicodeSet
*fNumericSet
;
2249 UnicodeSet
*fFormatSet
;
2250 UnicodeSet
*fOtherSet
;
2251 UnicodeSet
*fExtendSet
;
2252 UnicodeSet
*fExtendNumLetSet
;
2253 UnicodeSet
*fDictionaryCjkSet
;
// Text under test.
2255 const UnicodeString
*fText
;
// Constructor: builds the character-class sets for the word-break monkey test.
//
// Steps visible here:
//   1. Allocate fSets and one UnicodeSet per Word_Break property value.
//   2. Build fOtherSet as the complement of the empty set minus the class sets
//      removed below, the dictionary (CJK) characters, and Complex_Context.
//   3. Register the class sets in fSets.
// A failing UErrorCode is recorded in deferredStatus rather than reported here.
2259 RBBIWordMonkey::RBBIWordMonkey()
2261 UErrorCode status
= U_ZERO_ERROR
;
2263 fSets
= new UVector(status
);
2265 fCRSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status
);
2266 fLFSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status
);
2267 fNewlineSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status
);
2268 fDictionaryCjkSet
= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status
);
2269 // Exclude Hangul syllables from ALetterSet during testing.
2270 // Leave CJK dictionary characters out from the monkey tests!
// NOTE(review): fALetterSet is assigned here and again further down. If this
// first assignment is live code (the lines around it are not visible in this
// excerpt, so it may be preprocessed out), the first set would be leaked --
// confirm against the full file.
2272 fALetterSet
= new UnicodeSet("[\\p{Word_Break = ALetter}"
2273 "[\\p{Line_Break = Complex_Context}"
2274 "-\\p{Grapheme_Cluster_Break = Extend}"
2275 "-\\p{Grapheme_Cluster_Break = Control}"
2279 fRegionalIndicatorSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status
);
2280 fKatakanaSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status
);
2281 fHebrew_LetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status
);
2282 fALetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status
);
// Dictionary-handled characters are excluded from the ALetter class.
2283 fALetterSet
->removeAll(*fDictionaryCjkSet
);
2284 fSingle_QuoteSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"), status
);
2285 fDouble_QuoteSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"), status
);
2286 fMidNumLetSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status
);
2287 fMidLetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status
);
2288 fMidNumSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status
);
2289 // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
2290 // we should figure out why
2291 fNumericSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status
);
2292 fFormatSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status
);
2293 fExtendNumLetSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status
);
2294 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status
);
2296 fOtherSet
= new UnicodeSet();
// Record any set-construction failure for later reporting.
2297 if(U_FAILURE(status
)) {
2298 deferredStatus
= status
;
// fOtherSet starts as the complement of the empty set and then has the class
// sets below removed from it.
// NOTE(review): fMidNumLetSet is not among the sets removed here, although it
// is added to fSets below -- confirm whether that is intentional.
2302 fOtherSet
->complement();
2303 fOtherSet
->removeAll(*fCRSet
);
2304 fOtherSet
->removeAll(*fLFSet
);
2305 fOtherSet
->removeAll(*fNewlineSet
);
2306 fOtherSet
->removeAll(*fKatakanaSet
);
2307 fOtherSet
->removeAll(*fHebrew_LetterSet
);
2308 fOtherSet
->removeAll(*fALetterSet
);
2309 fOtherSet
->removeAll(*fSingle_QuoteSet
);
2310 fOtherSet
->removeAll(*fDouble_QuoteSet
);
2311 fOtherSet
->removeAll(*fMidLetterSet
);
2312 fOtherSet
->removeAll(*fMidNumSet
);
2313 fOtherSet
->removeAll(*fNumericSet
);
2314 fOtherSet
->removeAll(*fExtendNumLetSet
);
2315 fOtherSet
->removeAll(*fFormatSet
);
2316 fOtherSet
->removeAll(*fExtendSet
);
2317 fOtherSet
->removeAll(*fRegionalIndicatorSet
);
2318 // Inhibit dictionary characters from being tested at all.
2319 fOtherSet
->removeAll(*fDictionaryCjkSet
);
2320 fOtherSet
->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status
));
// Register the character classes in fSets. Katakana is deliberately left out
// (see the TODO below).
2322 fSets
->addElement(fCRSet
, status
);
2323 fSets
->addElement(fLFSet
, status
);
2324 fSets
->addElement(fNewlineSet
, status
);
2325 fSets
->addElement(fRegionalIndicatorSet
, status
);
2326 fSets
->addElement(fHebrew_LetterSet
, status
);
2327 fSets
->addElement(fALetterSet
, status
);
2328 fSets
->addElement(fSingle_QuoteSet
, status
);
2329 fSets
->addElement(fDouble_QuoteSet
, status
);
2330 //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana
2331 fSets
->addElement(fMidLetterSet
, status
);
2332 fSets
->addElement(fMidNumLetSet
, status
);
2333 fSets
->addElement(fMidNumSet
, status
);
2334 fSets
->addElement(fNumericSet
, status
);
2335 fSets
->addElement(fFormatSet
, status
);
2336 fSets
->addElement(fExtendSet
, status
);
2337 fSets
->addElement(fOtherSet
, status
);
2338 fSets
->addElement(fExtendNumLetSet
, status
);
// Record any registration failure for later reporting.
2340 if (U_FAILURE(status
)) {
2341 deferredStatus
= status
;
// Sets the text to be tested.
// NOTE(review): the body is not visible in this excerpt -- presumably it stores
// the address of s in fText; confirm against the full file.
2345 void RBBIWordMonkey::setText(const UnicodeString
&s
) {
// Reference implementation of word-boundary detection.
//
// Starting from prevPos, walks fText one "significant" code point at a time
// while keeping a four-position window (p0..p3 with code points c0..c3), and
// tests the word-break rules in the order given by the rule comments below.
// The candidate break position is before p2.
// NOTE(review): the return statements are not visible in this excerpt; per the
// comments, DONE is returned when prevPos is already at the end of the string
// -- confirm against the full file.
2350 int32_t RBBIWordMonkey::next(int32_t prevPos
) {
2351 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2352 // break position being tested. The candidate break
2353 // location is before p2.
2357 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
2359 if (U_FAILURE(deferredStatus
)) {
2363 // Prev break at end of string. return DONE.
2364 if (prevPos
>= fText
->length()) {
2367 p0
= p1
= p2
= p3
= prevPos
;
2368 c3
= fText
->char32At(prevPos
);
2370 (void)p0
; // Suppress set but not used warning.
2372 // Loop runs once per "significant" character position in the input text.
2374 // Move all of the positions forward in the input string.
2379 // Advance p3 by X(Extend | Format)* Rule 4
2380 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2382 p3
= fText
->moveIndex32(p3
, 1);
2383 c3
= fText
->char32At(p3
);
2384 if (fCRSet
->contains(c2
) || fLFSet
->contains(c2
) || fNewlineSet
->contains(c2
)) {
2388 while (fFormatSet
->contains(c3
) || fExtendSet
->contains(c3
));
2392 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2395 if (p2
== fText
->length()) {
2396 // Reached end of string. Always a break position.
2401 // No Extend or Format characters may appear between the CR and LF,
2402 // which requires the additional check for p2 immediately following p1.
// NOTE(review): no explicit p2 == p1+1 test appears in this condition;
// presumably the Rule 4 loop above, which does not skip Extend/Format after a
// newline, makes it unnecessary -- confirm.
2404 if (c1
==0x0D && c2
==0x0A) {
2408 // Rule (3a) Break before and after newlines (including CR and LF)
2410 if (fCRSet
->contains(c1
) || fLFSet
->contains(c1
) || fNewlineSet
->contains(c1
)) {
2413 if (fCRSet
->contains(c2
) || fLFSet
->contains(c2
) || fNewlineSet
->contains(c2
)) {
2417 // Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
2418 if ((fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
)) &&
2419 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
))) {
2423 // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
2425 if ( (fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
)) &&
2426 (fMidLetterSet
->contains(c2
) || fMidNumLetSet
->contains(c2
) || fSingle_QuoteSet
->contains(c2
)) &&
2427 (fALetterSet
->contains(c3
) || fHebrew_LetterSet
->contains(c3
))) {
2431 // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)
2432 if ((fALetterSet
->contains(c0
) || fHebrew_LetterSet
->contains(c0
)) &&
2433 (fMidLetterSet
->contains(c1
) || fMidNumLetSet
->contains(c1
) || fSingle_QuoteSet
->contains(c1
)) &&
2434 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
))) {
2438 // Rule (7a) Hebrew_Letter x Single_Quote
2439 if (fHebrew_LetterSet
->contains(c1
) && fSingle_QuoteSet
->contains(c2
)) {
2443 // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter
2444 if (fHebrew_LetterSet
->contains(c1
) && fDouble_QuoteSet
->contains(c2
) && fHebrew_LetterSet
->contains(c3
)) {
2448 // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter
2449 if (fHebrew_LetterSet
->contains(c0
) && fDouble_QuoteSet
->contains(c1
) && fHebrew_LetterSet
->contains(c2
)) {
2453 // Rule (8) Numeric x Numeric
2454 if (fNumericSet
->contains(c1
) &&
2455 fNumericSet
->contains(c2
)) {
2459 // Rule (9) (ALetter | Hebrew_Letter) x Numeric
2460 if ((fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
)) &&
2461 fNumericSet
->contains(c2
)) {
2465 // Rule (10) Numeric x (ALetter | Hebrew_Letter)
2466 if (fNumericSet
->contains(c1
) &&
2467 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
))) {
2471 // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric
2472 if (fNumericSet
->contains(c0
) &&
2473 (fMidNumSet
->contains(c1
) || fMidNumLetSet
->contains(c1
) || fSingle_QuoteSet
->contains(c1
)) &&
2474 fNumericSet
->contains(c2
)) {
2478 // Rule (12) Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
2479 if (fNumericSet
->contains(c1
) &&
2480 (fMidNumSet
->contains(c2
) || fMidNumLetSet
->contains(c2
) || fSingle_QuoteSet
->contains(c2
)) &&
2481 fNumericSet
->contains(c3
)) {
2485 // Rule (13) Katakana x Katakana
2486 if (fKatakanaSet
->contains(c1
) &&
2487 fKatakanaSet
->contains(c2
)) {
2491 // Rule 13a (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet
2492 if ((fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
) ||fNumericSet
->contains(c1
) ||
2493 fKatakanaSet
->contains(c1
) || fExtendNumLetSet
->contains(c1
)) &&
2494 fExtendNumLetSet
->contains(c2
)) {
2498 // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
2499 if (fExtendNumLetSet
->contains(c1
) &&
2500 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
) ||
2501 fNumericSet
->contains(c2
) || fKatakanaSet
->contains(c2
))) {
// Regional_Indicator x Regional_Indicator (the original rule comment for this
// test is not visible in this excerpt).
2506 if (fRegionalIndicatorSet
->contains(c1
) && fRegionalIndicatorSet
->contains(c2
)) {
2510 // Rule 14. Break found here.
// Accessor for the character classes used by the word-break monkey test.
// Returns a UVector pointer.
// NOTE(review): the function body is not visible in this excerpt -- presumably
// it returns fSets; confirm against the full file.
2519 UVector
*RBBIWordMonkey::charClasses() {
// Destructor: deletes the UnicodeSet objects allocated by the constructor.
// Only some of the delete statements are visible in this excerpt.
2524 RBBIWordMonkey::~RBBIWordMonkey() {
2529 delete fKatakanaSet
;
2530 delete fHebrew_LetterSet
;
2532 delete fSingle_QuoteSet
;
2533 delete fDouble_QuoteSet
;
2534 delete fMidNumLetSet
;
2535 delete fMidLetterSet
;
2540 delete fExtendNumLetSet
;
2541 delete fRegionalIndicatorSet
;
2542 delete fDictionaryCjkSet
;
2549 //------------------------------------------------------------------------------------------
2551 // class RBBISentMonkey Sentence Break specific implementation
2552 // of RBBIMonkeyKind.
2554 //------------------------------------------------------------------------------------------
// Sentence-break flavor of RBBIMonkeyKind.
//
// Declares the virtual interface charClasses() / setText() / next(), three
// navigation helpers (moveBack, moveForward, cAt), and one UnicodeSet pointer
// per character class used by the sentence-break rules. fText points at the
// string currently being tested.
2555 class RBBISentMonkey
: public RBBIMonkeyKind
{
2558 virtual ~RBBISentMonkey();
2559 virtual UVector
*charClasses();
2560 virtual void setText(const UnicodeString
&s
);
2561 virtual int32_t next(int32_t i
);
// Helpers for stepping over Extend/Format characters and for bounds-checked
// character access; see their definitions.
2563 int moveBack(int posFrom
);
2564 int moveForward(int posFrom
);
2565 UChar32
cAt(int pos
);
// Character-class sets, one per class used by the rules.
2569 UnicodeSet
*fSepSet
;
2570 UnicodeSet
*fFormatSet
;
2572 UnicodeSet
*fLowerSet
;
2573 UnicodeSet
*fUpperSet
;
2574 UnicodeSet
*fOLetterSet
;
2575 UnicodeSet
*fNumericSet
;
2576 UnicodeSet
*fATermSet
;
2577 UnicodeSet
*fSContinueSet
;
2578 UnicodeSet
*fSTermSet
;
2579 UnicodeSet
*fCloseSet
;
2580 UnicodeSet
*fOtherSet
;
2581 UnicodeSet
*fExtendSet
;
// Text under test.
2583 const UnicodeString
*fText
;
// Constructor: builds the character-class sets for the sentence-break monkey
// test.
//
// Allocates one UnicodeSet per Sentence_Break property value, builds fOtherSet
// as the complement of the empty set minus all of those sets, and registers
// every set in fSets. A failing UErrorCode is recorded in deferredStatus.
2587 RBBISentMonkey::RBBISentMonkey()
2589 UErrorCode status
= U_ZERO_ERROR
;
2591 fSets
= new UVector(status
);
2593 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2594 // set and made into character classes of their own. For the monkey impl,
2595 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2596 fSepSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status
);
2597 fFormatSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status
);
2598 fSpSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status
);
2599 fLowerSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status
);
2600 fUpperSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status
);
2601 fOLetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status
);
2602 fNumericSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status
);
2603 fATermSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status
);
2604 fSContinueSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status
);
2605 fSTermSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status
);
2606 fCloseSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status
);
2607 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status
);
2608 fOtherSet
= new UnicodeSet();
// Record any set-construction failure for later reporting.
2610 if(U_FAILURE(status
)) {
2611 deferredStatus
= status
;
// fOtherSet starts as the complement of the empty set and then has every
// class set removed from it.
2615 fOtherSet
->complement();
2616 fOtherSet
->removeAll(*fSepSet
);
2617 fOtherSet
->removeAll(*fFormatSet
);
2618 fOtherSet
->removeAll(*fSpSet
);
2619 fOtherSet
->removeAll(*fLowerSet
);
2620 fOtherSet
->removeAll(*fUpperSet
);
2621 fOtherSet
->removeAll(*fOLetterSet
);
2622 fOtherSet
->removeAll(*fNumericSet
);
2623 fOtherSet
->removeAll(*fATermSet
);
2624 fOtherSet
->removeAll(*fSContinueSet
);
2625 fOtherSet
->removeAll(*fSTermSet
);
2626 fOtherSet
->removeAll(*fCloseSet
);
2627 fOtherSet
->removeAll(*fExtendSet
);
// Register the character classes in fSets.
2629 fSets
->addElement(fSepSet
, status
);
2630 fSets
->addElement(fFormatSet
, status
);
2631 fSets
->addElement(fSpSet
, status
);
2632 fSets
->addElement(fLowerSet
, status
);
2633 fSets
->addElement(fUpperSet
, status
);
2634 fSets
->addElement(fOLetterSet
, status
);
2635 fSets
->addElement(fNumericSet
, status
);
2636 fSets
->addElement(fATermSet
, status
);
2637 fSets
->addElement(fSContinueSet
, status
);
2638 fSets
->addElement(fSTermSet
, status
);
2639 fSets
->addElement(fCloseSet
, status
);
2640 fSets
->addElement(fOtherSet
, status
);
2641 fSets
->addElement(fExtendSet
, status
);
// Record any registration failure for later reporting.
2643 if (U_FAILURE(status
)) {
2644 deferredStatus
= status
;
// Sets the text to be tested.
// NOTE(review): the body is not visible in this excerpt -- presumably it stores
// the address of s in fText; confirm against the full file.
2650 void RBBISentMonkey::setText(const UnicodeString
&s
) {
// Accessor for the character classes used by the sentence-break monkey test.
// Returns a UVector pointer.
// NOTE(review): the function body is not visible in this excerpt -- presumably
// it returns fSets; confirm against the full file.
2654 UVector
*RBBISentMonkey::charClasses() {
2659 // moveBack() Find the "significant" code point preceding the index i.
2660 // Skips over ($Extend | $Format)* .
//
// Steps backwards one code point at a time (moveIndex32(j, -1)) and keeps
// going while the character landed on is in fFormatSet or fExtendSet and the
// index is still greater than zero.
// NOTE(review): the initialization of j, any guard on i, and the return
// statement are not visible in this excerpt -- confirm against the full file.
2662 int RBBISentMonkey::moveBack(int i
) {
2669 j
= fText
->moveIndex32(j
, -1);
2670 c
= fText
->char32At(j
);
2672 while (j
>0 &&(fFormatSet
->contains(c
) || fExtendSet
->contains(c
)));
// moveForward()  Counterpart of moveBack(): advances from index i one code
//                point at a time (moveIndex32(j, 1)) and keeps going while the
//                character c is in fFormatSet or fExtendSet.
//                If i is already at or past the end of the text, returns the
//                text length.
// NOTE(review): the initialization of j, the assignment of c, and the final
// return are not visible in this excerpt -- confirm against the full file.
2678 int RBBISentMonkey::moveForward(int i
) {
2679 if (i
>=fText
->length()) {
2680 return fText
->length();
2685 j
= fText
->moveIndex32(j
, 1);
2688 while (fFormatSet
->contains(c
) || fExtendSet
->contains(c
));
// cAt()  Bounds-checked access to the code point at index pos in fText.
//        In-range positions return fText->char32At(pos).
// NOTE(review): the value returned for an out-of-range pos is not visible in
// this excerpt; next() compares a cAt() result against -1, so presumably -1
// -- confirm against the full file.
2692 UChar32
RBBISentMonkey::cAt(int pos
) {
2693 if (pos
<0 || pos
>=fText
->length()) {
2696 return fText
->char32At(pos
);
// Reference implementation of sentence-boundary detection.
//
// Starting from prevPos, walks fText one "significant" code point at a time
// while keeping a four-position window (p0..p3 with code points c0..c3), and
// tests the sentence-break rules in the order given by the rule comments
// below. The candidate break position is before p2. Rules 8 through 11 scan
// further with the temporaries p8, p9, p10 and p11 using cAt(), moveBack() and
// moveForward().
// NOTE(review): the return statements are not visible in this excerpt; per the
// comments, DONE is returned when prevPos is already at the end of the string
// -- confirm against the full file.
2700 int32_t RBBISentMonkey::next(int32_t prevPos
) {
2701 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2702 // break position being tested. The candidate break
2703 // location is before p2.
2707 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
2710 if (U_FAILURE(deferredStatus
)) {
2714 // Prev break at end of string. return DONE.
2715 if (prevPos
>= fText
->length()) {
2718 p0
= p1
= p2
= p3
= prevPos
;
2719 c3
= fText
->char32At(prevPos
);
2721 (void)p0
; // Suppress set but not used warning.
2723 // Loop runs once per "significant" character position in the input text.
2725 // Move all of the positions forward in the input string.
2730 // Advance p3 by X(Extend | Format)* Rule 4
2731 p3
= moveForward(p3
);
// CR LF pair: only when the LF immediately follows the CR (p2 == p1+1).
2735 if (c1
==0x0d && c2
==0x0a && p2
==(p1
+1)) {
2739 // Rule (4). Sep <break>
2740 if (fSepSet
->contains(c1
)) {
2741 p2
= p1
+1; // Separators don't combine with Extend or Format.
2745 if (p2
>= fText
->length()) {
2746 // Reached end of string. Always a break position.
2750 if (p2
== prevPos
) {
2751 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2755 // Rule (6). ATerm x Numeric
2756 if (fATermSet
->contains(c1
) && fNumericSet
->contains(c2
)) {
2760 // Rule (7). Upper ATerm x Upper
2761 if (fUpperSet
->contains(c0
) && fATermSet
->contains(c1
) && fUpperSet
->contains(c2
)) {
2765 // Rule (8) ATerm Close* Sp* x (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
2766 // Note: STerm | ATerm are added to the negated part of the expression by a
2767 // note to the Unicode 5.0 documents.
2769 while (fSpSet
->contains(cAt(p8
))) {
2772 while (fCloseSet
->contains(cAt(p8
))) {
2775 if (fATermSet
->contains(cAt(p8
))) {
2779 if (c
==-1 || fOLetterSet
->contains(c
) || fUpperSet
->contains(c
) ||
2780 fLowerSet
->contains(c
) || fSepSet
->contains(c
) ||
2781 fATermSet
->contains(c
) || fSTermSet
->contains(c
)) {
2784 p8
= moveForward(p8
);
2786 if (fLowerSet
->contains(cAt(p8
))) {
2791 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2792 if (fSContinueSet
->contains(c2
) || fSTermSet
->contains(c2
) || fATermSet
->contains(c2
)) {
2794 while (fSpSet
->contains(cAt(p8
))) {
2797 while (fCloseSet
->contains(cAt(p8
))) {
2801 if (fSTermSet
->contains(c
) || fATermSet
->contains(c
)) {
2806 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
2808 while (fCloseSet
->contains(cAt(p9
))) {
2812 if ((fSTermSet
->contains(c
) || fATermSet
->contains(c
))) {
2813 if (fCloseSet
->contains(c2
) || fSpSet
->contains(c2
) || fSepSet
->contains(c2
)) {
2818 // Rule (10) (STerm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
2820 while (fSpSet
->contains(cAt(p10
))) {
2821 p10
= moveBack(p10
);
2823 while (fCloseSet
->contains(cAt(p10
))) {
2824 p10
= moveBack(p10
);
2826 if (fSTermSet
->contains(cAt(p10
)) || fATermSet
->contains(cAt(p10
))) {
2827 if (fSpSet
->contains(c2
) || fSepSet
->contains(c2
)) {
2832 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>
2834 if (fSepSet
->contains(cAt(p11
))) {
2835 p11
= moveBack(p11
);
2837 while (fSpSet
->contains(cAt(p11
))) {
2838 p11
= moveBack(p11
);
2840 while (fCloseSet
->contains(cAt(p11
))) {
2841 p11
= moveBack(p11
);
2843 if (fSTermSet
->contains(cAt(p11
)) || fATermSet
->contains(cAt(p11
))) {
2847 // Rule (12) Any x Any
// Destructor. Deletes fSContinueSet; any other cleanup it performs is not
// visible in this excerpt.
2854 RBBISentMonkey::~RBBISentMonkey() {
2864 delete fSContinueSet
;
2873 //-------------------------------------------------------------------------------------------
2877 //-------------------------------------------------------------------------------------------
// Line-break flavor of RBBIMonkeyKind.
//
// Declares the virtual interface charClasses() / setText() / next() plus
// rule9Adjust(), the combining-sequence helper that next() calls.
// The per-class UnicodeSet members (fBK, fCR, ...) that the constructor fills
// in are declared on lines not visible in this excerpt.
2879 class RBBILineMonkey
: public RBBIMonkeyKind
{
2882 virtual ~RBBILineMonkey();
2883 virtual UVector
*charClasses();
2884 virtual void setText(const UnicodeString
&s
);
2885 virtual int32_t next(int32_t i
);
2886 virtual void rule9Adjust(int32_t pos
, UChar32
*posChar
, int32_t *nextPos
, UChar32
*nextChar
);
// Character break iterator; created in the constructor, given the text in setText().
2931 BreakIterator
*fCharBI
;
// Text under test.
2932 const UnicodeString
*fText
;
// Regular-expression matcher for number sequences; used by next().
2933 RegexMatcher
*fNumberMatcher
;
// Constructor: builds the character-class sets for the line-break monkey test.
//
// Steps visible here:
//   1. Allocate fSets and one UnicodeSet per Line_Break property value (fSG is
//      built directly from the surrogate range U+D800..U+DFFF).
//   2. On failure, record the status in deferredStatus and null out
//      fNumberMatcher.
//   3. Fold classes with default behavior into their targets: XX, AI, SA and SG
//      into AL, and CJ into NS.
//   4. Register the sets in fSets.
//   5. Build the number-sequence regular expression matcher and the character
//      break iterator, recording any failure in deferredStatus.
2937 RBBILineMonkey::RBBILineMonkey()
2939 UErrorCode status
= U_ZERO_ERROR
;
2941 fSets
= new UVector(status
);
2943 fBK
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status
);
2944 fCR
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status
);
2945 fLF
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status
);
2946 fCM
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status
);
2947 fNL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status
);
2948 fWJ
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status
);
2949 fZW
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status
);
2950 fGL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status
);
2951 fCB
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status
);
2952 fSP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status
);
2953 fB2
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status
);
2954 fBA
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status
);
2955 fBB
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status
);
2956 fHY
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status
);
2957 fH2
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status
);
2958 fH3
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status
);
2959 fCL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status
);
2960 fCP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status
);
2961 fEX
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status
);
2962 fIN
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status
);
2963 fJL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status
);
2964 fJV
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status
);
2965 fJT
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status
);
2966 fNS
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status
);
2967 fOP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status
);
2968 fQU
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status
);
2969 fIS
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status
);
2970 fNU
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status
);
2971 fPO
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status
);
2972 fPR
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status
);
2973 fSY
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status
);
2974 fAI
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status
);
2975 fAL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status
);
2976 fCJ
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status
);
2977 fHL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status
);
2978 fID
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status
);
2979 fRI
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status
);
2980 fSA
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status
);
2981 fSG
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status
);
2982 fXX
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status
);
// On failure, remember the status and leave the number matcher null.
2984 if (U_FAILURE(status
)) {
2985 deferredStatus
= status
;
2987 fNumberMatcher
= NULL
;
2991 fAL
->addAll(*fXX
); // Default behavior for XX is identical to AL
2992 fAL
->addAll(*fAI
); // Default behavior for AI is identical to AL
2993 fAL
->addAll(*fSA
); // Default behavior for SA is XX, which defaults to AL
2994 fAL
->addAll(*fSG
); // Default behavior for SG is identical to AL.
2996 fNS
->addAll(*fCJ
); // Default behavior for CJ is identical to NS.
// Register the character classes in fSets.
2998 fSets
->addElement(fBK
, status
);
2999 fSets
->addElement(fCR
, status
);
3000 fSets
->addElement(fLF
, status
);
3001 fSets
->addElement(fCM
, status
);
3002 fSets
->addElement(fNL
, status
);
3003 fSets
->addElement(fWJ
, status
);
3004 fSets
->addElement(fZW
, status
);
3005 fSets
->addElement(fGL
, status
);
3006 fSets
->addElement(fCB
, status
);
3007 fSets
->addElement(fSP
, status
);
3008 fSets
->addElement(fB2
, status
);
3009 fSets
->addElement(fBA
, status
);
3010 fSets
->addElement(fBB
, status
);
3011 fSets
->addElement(fHY
, status
);
3012 fSets
->addElement(fH2
, status
);
3013 fSets
->addElement(fH3
, status
);
3014 fSets
->addElement(fCL
, status
);
3015 fSets
->addElement(fCP
, status
);
3016 fSets
->addElement(fEX
, status
);
3017 fSets
->addElement(fIN
, status
);
3018 fSets
->addElement(fJL
, status
);
3019 fSets
->addElement(fJT
, status
);
3020 fSets
->addElement(fJV
, status
);
3021 fSets
->addElement(fNS
, status
);
3022 fSets
->addElement(fOP
, status
);
3023 fSets
->addElement(fQU
, status
);
3024 fSets
->addElement(fIS
, status
);
3025 fSets
->addElement(fNU
, status
);
3026 fSets
->addElement(fPO
, status
);
3027 fSets
->addElement(fPR
, status
);
3028 fSets
->addElement(fSY
, status
);
3029 fSets
->addElement(fAI
, status
);
3030 fSets
->addElement(fAL
, status
);
3031 fSets
->addElement(fHL
, status
);
3032 fSets
->addElement(fID
, status
);
// NOTE(review): fWJ was already added to fSets above, so this adds it a second
// time -- confirm whether the duplicate entry is intended.
3033 fSets
->addElement(fWJ
, status
);
3034 fSets
->addElement(fRI
, status
);
3035 fSets
->addElement(fSA
, status
);
3036 fSets
->addElement(fSG
, status
);
// Pattern for a number sequence: optional PR/PO prefix, optional OP/HY, one
// NU, any run of NU/IS/SY, optional CL/CP, optional PR/PO suffix -- each
// element optionally followed by CM*.
3039 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
3040 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
3041 "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
3042 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
3043 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
3044 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
3046 fNumberMatcher
= new RegexMatcher(
3047 UnicodeString(rules
, -1, US_INV
), 0, status
);
3049 fCharBI
= BreakIterator::createCharacterInstance(Locale::getEnglish(), status
);
// Record any failure from the matcher or break iterator creation.
3051 if (U_FAILURE(status
)) {
3052 deferredStatus
= status
;
// Sets the text to be tested: hands s to the character break iterator and
// resets the number matcher onto s.
// NOTE(review): fNumberMatcher is dereferenced unconditionally here, but the
// constructor sets it to NULL on its failure path -- confirm callers never
// reach this after a failed construction. The assignment of fText is on a line
// not visible in this excerpt.
3057 void RBBILineMonkey::setText(const UnicodeString
&s
) {
3059 fCharBI
->setText(s
);
3060 fNumberMatcher
->reset(s
);
3065 // Line Break TR rules 9 and 10 implementation.
3066 // This deals with combining marks and other sequences that
3067 // must be treated as if they were something other than what they actually are.
3069 // This is factored out into a separate function because it must be applied twice for
3070 // each potential break, once to the chars before the position being checked, then
3071 // again to the text following the possible break.
//
// Parameters (all but pos are in/out):
//   pos       index of the character being adjusted; per the comment below, an
//             invalid value occurs during the warm-up iteration of next().
//   posChar   the character at pos; rewritten to 0x41 ('A') when it is itself
//             a CM character (LB 10).
//   nextPos   index of the following character; read into nPos on entry. The
//             write-back is on a line not visible in this excerpt.
//   nextChar  character at the (possibly advanced) next position.
3073 void RBBILineMonkey::rule9Adjust(int32_t pos
, UChar32
*posChar
, int32_t *nextPos
, UChar32
*nextChar
) {
3075 // Invalid initial position. Happens during the warmup iteration of the
3076 // main loop in next().
3080 int32_t nPos
= *nextPos
;
3082 // LB 9 Keep combining sequences together.
3083 // advance over any CM class chars. Note that Line Break CM is different
3084 // from the normal Grapheme Extend property.
3085 if (!(fSP
->contains(*posChar
) || fBK
->contains(*posChar
) || *posChar
==0x0d ||
3086 *posChar
==0x0a ||fNL
->contains(*posChar
) || fZW
->contains(*posChar
))) {
3088 *nextChar
= fText
->char32At(nPos
);
3089 if (!fCM
->contains(*nextChar
)) {
3092 nPos
= fText
->moveIndex32(nPos
, 1);
3097 // LB 9 Treat X CM* as if it were x.
3098 // No explicit action required.
3100 // LB 10 Treat any remaining combining mark as AL
3101 if (fCM
->contains(*posChar
)) {
3102 *posChar
= 0x41; // thisChar = 'A';
3105 // Push the updated nextPos and nextChar back to our caller.
3106 // This only makes a difference if posChar got bigger by consuming a
3107 // combining sequence.
3109 *nextChar
= fText
->char32At(nPos
);
3114 int32_t RBBILineMonkey::next(int32_t startPos
) {
3115 UErrorCode status
= U_ZERO_ERROR
;
3116 int32_t pos
; // Index of the char following a potential break position
3117 UChar32 thisChar
; // Character at above position "pos"
3119 int32_t prevPos
; // Index of the char preceding a potential break position
3120 UChar32 prevChar
; // Character at above position. Note that prevChar
3121 // and thisChar may not be adjacent because combining
3122 // characters between them will be ignored.
3124 int32_t prevPosX2
; // Second previous character. Wider context for LB21a.
3127 int32_t nextPos
; // Index of the next character following pos.
3128 // Usually skips over combining marks.
3129 int32_t nextCPPos
; // Index of the code point following "pos."
3130 // May point to a combining mark.
3131 int32_t tPos
; // temp value.
3134 if (U_FAILURE(deferredStatus
)) {
3138 if (startPos
>= fText
->length()) {
3143 // Initial values for loop. Loop will run the first time without finding breaks,
3144 // while the invalid values shift out and the "this" and
3145 // "prev" positions are filled in with good values.
3146 pos
= prevPos
= prevPosX2
= -1; // Invalid value, serves as flag for initial loop iteration.
3147 thisChar
= prevChar
= prevCharX2
= 0;
3148 nextPos
= nextCPPos
= startPos
;
3151 // Loop runs once per position in the test text, until a break position
3154 prevPosX2
= prevPos
;
3155 prevCharX2
= prevChar
;
3158 prevChar
= thisChar
;
3161 thisChar
= fText
->char32At(pos
);
3163 nextCPPos
= fText
->moveIndex32(pos
, 1);
3164 nextPos
= nextCPPos
;
3166 // Rule LB2 - Break at end of text.
3167 if (pos
>= fText
->length()) {
3171 // Rule LB 9 - adjust for combining sequences.
3172 // We do this one out-of-order because the adjustment does not change anything
3173 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3175 rule9Adjust(prevPos
, &prevChar
, &pos
, &thisChar
);
3176 nextCPPos
= nextPos
= fText
->moveIndex32(pos
, 1);
3177 c
= fText
->char32At(nextPos
);
3178 rule9Adjust(pos
, &thisChar
, &nextPos
, &c
);
3180 // If the loop is still warming up - if we haven't shifted the initial
3181 // -1 positions out of prevPos yet - loop back to advance the
3182 // position in the input without any further looking for breaks.
3183 if (prevPos
== -1) {
3187 // LB 4 Always break after hard line breaks,
3188 if (fBK
->contains(prevChar
)) {
3192 // LB 5 Break after CR, LF, NL, but not inside CR LF
3193 if (prevChar
== 0x0d && thisChar
== 0x0a) {
3196 if (prevChar
== 0x0d ||
3202 // LB 6 Don't break before hard line breaks
3203 if (thisChar
== 0x0d || thisChar
== 0x0a || thisChar
== 0x85 ||
3204 fBK
->contains(thisChar
)) {
3209 // LB 7 Don't break before spaces or zero-width space.
3210 if (fSP
->contains(thisChar
)) {
3214 if (fZW
->contains(thisChar
)) {
3218 // LB 8 Break after zero width space
3219 if (fZW
->contains(prevChar
)) {
3223 // LB 9, 10 Already done, at top of loop.
3227 // LB 11 Do not break before or after WORD JOINER and related characters.
3231 if (fWJ
->contains(thisChar
) || fWJ
->contains(prevChar
)) {
3237 if (fGL
->contains(prevChar
)) {
3243 if (!(fSP
->contains(prevChar
) ||
3244 fBA
->contains(prevChar
) ||
3245 fHY
->contains(prevChar
) ) && fGL
->contains(thisChar
)) {
3251 // LB 13 Don't break before closings.
3252 // NU x CL, NU x CP and NU x IS are not matched here so that they will
3253 // fall into LB 17 and the more general number regular expression.
3255 if ((!fNU
->contains(prevChar
) && fCL
->contains(thisChar
)) ||
3256 (!fNU
->contains(prevChar
) && fCP
->contains(thisChar
)) ||
3257 fEX
->contains(thisChar
) ||
3258 (!fNU
->contains(prevChar
) && fIS
->contains(thisChar
)) ||
3259 (!fNU
->contains(prevChar
) && fSY
->contains(thisChar
))) {
3263 // LB 14 Don't break after OP SP*
3264 // Scan backwards, checking for this sequence.
3265 // The OP char could include combining marks, so we actually check for
3267 // Another Twist: The Rule 67 fixes may have changed a SP CM
3268 // sequence into a ID char, so before scanning back through spaces,
3269 // verify that prevChar is indeed a space. The prevChar variable
3270 // may differ from fText[prevPos]
3272 if (fSP
->contains(prevChar
)) {
3273 while (tPos
> 0 && fSP
->contains(fText
->char32At(tPos
))) {
3274 tPos
=fText
->moveIndex32(tPos
, -1);
3277 while (tPos
> 0 && fCM
->contains(fText
->char32At(tPos
))) {
3278 tPos
=fText
->moveIndex32(tPos
, -1);
3280 if (fOP
->contains(fText
->char32At(tPos
))) {
3285 // LB 15 QU SP* x OP
3286 if (fOP
->contains(thisChar
)) {
3287 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3289 while (tPos
>0 && fSP
->contains(fText
->char32At(tPos
))) {
3290 tPos
= fText
->moveIndex32(tPos
, -1);
3292 while (tPos
>0 && fCM
->contains(fText
->char32At(tPos
))) {
3293 tPos
= fText
->moveIndex32(tPos
, -1);
3295 if (fQU
->contains(fText
->char32At(tPos
))) {
3302 // LB 16 (CL | CP) SP* x NS
3303 // Scan backwards for SP* CM* (CL | CP)
3304 if (fNS
->contains(thisChar
)) {
3306 while (tPos
>0 && fSP
->contains(fText
->char32At(tPos
))) {
3307 tPos
= fText
->moveIndex32(tPos
, -1);
3309 while (tPos
>0 && fCM
->contains(fText
->char32At(tPos
))) {
3310 tPos
= fText
->moveIndex32(tPos
, -1);
3312 if (fCL
->contains(fText
->char32At(tPos
)) || fCP
->contains(fText
->char32At(tPos
))) {
3318 // LB 17 B2 SP* x B2
3319 if (fB2
->contains(thisChar
)) {
3320 // Scan backwards, checking for the B2 CM* SP* sequence.
3322 if (fSP
->contains(prevChar
)) {
3323 while (tPos
> 0 && fSP
->contains(fText
->char32At(tPos
))) {
3324 tPos
=fText
->moveIndex32(tPos
, -1);
3327 while (tPos
> 0 && fCM
->contains(fText
->char32At(tPos
))) {
3328 tPos
=fText
->moveIndex32(tPos
, -1);
3330 if (fB2
->contains(fText
->char32At(tPos
))) {
3336 // LB 18 break after space
3337 if (fSP
->contains(prevChar
)) {
3344 if (fQU
->contains(thisChar
) || fQU
->contains(prevChar
)) {
3348 // LB 20 Break around a CB
3349 if (fCB
->contains(thisChar
) || fCB
->contains(prevChar
)) {
3354 if (fBA
->contains(thisChar
) ||
3355 fHY
->contains(thisChar
) ||
3356 fNS
->contains(thisChar
) ||
3357 fBB
->contains(prevChar
) ) {
3363 if (fHL
->contains(prevCharX2
) &&
3364 (fHY
->contains(prevChar
) || fBA
->contains(prevChar
))) {
3370 if (fSY
->contains(prevChar
) && fHL
->contains(thisChar
)) {
3375 if ((fAL
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3376 (fHL
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3377 (fID
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3378 (fIN
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3379 (fNU
->contains(prevChar
) && fIN
->contains(thisChar
)) ) {
3388 if ((fID
->contains(prevChar
) && fPO
->contains(thisChar
)) ||
3389 (fAL
->contains(prevChar
) && fNU
->contains(thisChar
)) ||
3390 (fHL
->contains(prevChar
) && fNU
->contains(thisChar
)) ||
3391 (fNU
->contains(prevChar
) && fAL
->contains(thisChar
)) ||
3392 (fNU
->contains(prevChar
) && fHL
->contains(thisChar
)) ) {
3396 // LB 24 Do not break between prefix and letters or ideographs.
3400 if ((fPR
->contains(prevChar
) && fID
->contains(thisChar
)) ||
3401 (fPR
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) ||
3402 (fPO
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
)))) {
3409 if (fNumberMatcher
->lookingAt(prevPos
, status
)) {
3410 if (U_FAILURE(status
)) {
3413 // Matched a number. But could have been just a single digit, which would
3414 // not represent a "no break here" between prevChar and thisChar
3415 int32_t numEndIdx
= fNumberMatcher
->end(status
); // idx of first char following num
3416 if (numEndIdx
> pos
) {
3417 // Number match includes at least our two chars being checked
3418 if (numEndIdx
> nextPos
) {
3419 // Number match includes additional chars. Update pos and nextPos
3420 // so that next loop iteration will continue at the end of the number,
3421 // checking for breaks between last char in number & whatever follows.
3422 pos
= nextPos
= numEndIdx
;
3424 pos
= fText
->moveIndex32(pos
, -1);
3425 thisChar
= fText
->char32At(pos
);
3426 } while (fCM
->contains(thisChar
));
3433 // LB 26 Do not break a Korean syllable.
3434 if (fJL
->contains(prevChar
) && (fJL
->contains(thisChar
) ||
3435 fJV
->contains(thisChar
) ||
3436 fH2
->contains(thisChar
) ||
3437 fH3
->contains(thisChar
))) {
3441 if ((fJV
->contains(prevChar
) || fH2
->contains(prevChar
)) &&
3442 (fJV
->contains(thisChar
) || fJT
->contains(thisChar
))) {
3446 if ((fJT
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3447 fJT
->contains(thisChar
)) {
3451 // LB 27 Treat a Korean Syllable Block the same as ID.
3452 if ((fJL
->contains(prevChar
) || fJV
->contains(prevChar
) ||
3453 fJT
->contains(prevChar
) || fH2
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3454 fIN
->contains(thisChar
)) {
3457 if ((fJL
->contains(prevChar
) || fJV
->contains(prevChar
) ||
3458 fJT
->contains(prevChar
) || fH2
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3459 fPO
->contains(thisChar
)) {
3462 if (fPR
->contains(prevChar
) && (fJL
->contains(thisChar
) || fJV
->contains(thisChar
) ||
3463 fJT
->contains(thisChar
) || fH2
->contains(thisChar
) || fH3
->contains(thisChar
))) {
3469 // LB 28 Do not break between alphabetics ("at").
3470 if ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
)) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) {
3474 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
3475 if (fIS
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) {
3479 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3482 if ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
) || fNU
->contains(prevChar
)) && fOP
->contains(thisChar
)) {
3485 if (fCP
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
) || fNU
->contains(thisChar
))) {
3489 // LB30a Do not break between regional indicators.
3491 if (fRI
->contains(prevChar
) && fRI
->contains(thisChar
)) {
3495 // LB 31 Break everywhere else
3504 UVector
*RBBILineMonkey::charClasses() {
3509 RBBILineMonkey::~RBBILineMonkey() {
3554 delete fNumberMatcher
;
3558 //-------------------------------------------------------------------------------------------
3563 // seed=nnnnn Random number starting seed.
3564 // Setting the seed allows errors to be reproduced.
3565 // loop=nnn Looping count. Controls running time.
3567 // 0 or greater: run length.
3569 // type = char | word | line | sent | title
3571 //-------------------------------------------------------------------------------------------
3573 static int32_t getIntParam(UnicodeString name
, UnicodeString
&params
, int32_t defaultVal
) {
3574 int32_t val
= defaultVal
;
3575 name
.append(" *= *(-?\\d+)");
3576 UErrorCode status
= U_ZERO_ERROR
;
3577 RegexMatcher
m(name
, params
, 0, status
);
3579 // The param exists. Convert the string to an int.
3580 char valString
[100];
3581 int32_t paramLength
= m
.end(1, status
) - m
.start(1, status
);
3582 if (paramLength
>= (int32_t)(sizeof(valString
)-1)) {
3583 paramLength
= (int32_t)(sizeof(valString
)-2);
3585 params
.extract(m
.start(1, status
), paramLength
, valString
, sizeof(valString
));
3586 val
= strtol(valString
, NULL
, 10);
3588 // Delete this parameter from the params string.
3590 params
= m
.replaceFirst("", status
);
3592 U_ASSERT(U_SUCCESS(status
));
3597 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3598 static void testBreakBoundPreceding(RBBITest
*test
, UnicodeString ustr
,
3607 for (i
= bi
->first(); i
!= BreakIterator::DONE
; i
= bi
->next()) {
3609 if (count
< expectedcount
&& expected
[count
] != i
) {
3610 test
->errln("break forward test failed: expected %d but got %d",
3611 expected
[count
], i
);
3616 if (count
!= expectedcount
) {
3617 printStringBreaks(ustr
, expected
, expectedcount
);
3618 test
->errln("break forward test failed: missed %d match",
3619 expectedcount
- count
);
3622 // testing boundaries
3623 for (i
= 1; i
< expectedcount
; i
++) {
3624 int j
= expected
[i
- 1];
3625 if (!bi
->isBoundary(j
)) {
3626 printStringBreaks(ustr
, expected
, expectedcount
);
3627 test
->errln("isBoundary() failed. Expected boundary at position %d", j
);
3630 for (j
= expected
[i
- 1] + 1; j
< expected
[i
]; j
++) {
3631 if (bi
->isBoundary(j
)) {
3632 printStringBreaks(ustr
, expected
, expectedcount
);
3633 test
->errln("isBoundary() failed. Not expecting boundary at position %d", j
);
3639 for (i
= bi
->last(); i
!= BreakIterator::DONE
; i
= bi
->previous()) {
3641 if (forward
[count
] != i
) {
3642 printStringBreaks(ustr
, expected
, expectedcount
);
3643 test
->errln("happy break test previous() failed: expected %d but got %d",
3649 printStringBreaks(ustr
, expected
, expectedcount
);
3650 test
->errln("break test previous() failed: missed a match");
3654 // testing preceding
3655 for (i
= 0; i
< expectedcount
- 1; i
++) {
3656 // int j = expected[i] + 1;
3657 int j
= ustr
.moveIndex32(expected
[i
], 1);
3658 for (; j
<= expected
[i
+ 1]; j
++) {
3659 if (bi
->preceding(j
) != expected
[i
]) {
3660 printStringBreaks(ustr
, expected
, expectedcount
);
3661 test
->errln("preceding(): Not expecting boundary at position %d", j
);
3669 void RBBITest::TestWordBreaks(void)
3671 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3673 Locale
locale("en");
3674 UErrorCode status
= U_ZERO_ERROR
;
3675 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3676 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
3677 // Replaced any C+J characters in a row with a random sequence of characters
3678 // of the same length to make our C+J segmentation not get in the way.
3679 static const char *strlist
[] =
3681 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3682 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3683 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3684 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3685 "\\uac00\\u3588\\u009c\\u0953\\u194b",
3686 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3687 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3688 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3689 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3690 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3691 "\\u2027\\U000e0067\\u0a47\\u00b7",
3692 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3693 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3694 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3695 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3696 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3697 "\\u0027\\u11af\\U000e0057\\u0602",
3698 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3699 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3700 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3701 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3702 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3703 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3704 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3705 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3706 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3707 "\\u18f4\\U000e0049\\u20e7\\u2027",
3708 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3709 "\\ua183\\u102d\\u0bec\\u003a",
3710 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3711 "\\u003a\\u0e57\\u0fad\\u002e",
3712 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3713 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3714 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3715 "\\u003a\\u0664\\u00b7\\u1fba",
3716 "\\u003b\\u0027\\u00b7\\u47a3",
3717 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3718 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3719 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3722 if (U_FAILURE(status
)) {
3723 errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
));
3726 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
3727 // printf("looping %d\n", loop);
3728 UnicodeString ustr
= CharsToUnicodeString(strlist
[loop
]);
3729 // RBBICharMonkey monkey;
3730 RBBIWordMonkey monkey
;
3733 int expectedcount
= 0;
3735 monkey
.setText(ustr
);
3737 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
3738 expected
[expectedcount
++] = i
;
3741 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
3747 void RBBITest::TestWordBoundary(void)
3749 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3750 Locale
locale("en");
3751 UErrorCode status
= U_ZERO_ERROR
;
3752 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3753 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
3755 static const char *strlist
[] =
3757 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3758 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3759 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3760 "\\u2027\\U000e0067\\u0a47\\u00b7",
3761 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3762 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3763 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3764 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3765 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3766 "\\u0027\\u11af\\U000e0057\\u0602",
3767 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3768 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3769 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3770 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3771 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3772 "\\U000e0065\\u302c\\u09ee\\U000e0068",
3773 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3774 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3775 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3776 "\\u58f4\\U000e0049\\u20e7\\u2027",
3777 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3778 "\\ua183\\u102d\\u0bec\\u003a",
3779 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3780 "\\u003a\\u0e57\\u0fad\\u002e",
3781 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3782 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3783 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3784 "\\u003a\\u0664\\u00b7\\u1fba",
3785 "\\u003b\\u0027\\u00b7\\u47a3",
3788 if (U_FAILURE(status
)) {
3789 errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
));
3792 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
3793 // printf("looping %d\n", loop);
3794 u_unescape(strlist
[loop
], str
, 20);
3795 UnicodeString
ustr(str
);
3802 for (i
= bi
->first(); i
!= BreakIterator::DONE
; i
= bi
->next()) {
3803 forward
[count
++] = i
;
3806 for (j
= prev
+ 1; j
< i
; j
++) {
3807 if (bi
->isBoundary(j
)) {
3808 printStringBreaks(ustr
, forward
, count
);
3809 errln("happy boundary test failed: expected %d not a boundary",
3815 if (!bi
->isBoundary(i
)) {
3816 printStringBreaks(ustr
, forward
, count
);
3817 errln("happy boundary test failed: expected %d a boundary",
3827 void RBBITest::TestLineBreaks(void)
3829 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3830 Locale
locale("en");
3831 UErrorCode status
= U_ZERO_ERROR
;
3832 BreakIterator
*bi
= BreakIterator::createLineInstance(locale
, status
);
3833 const int32_t STRSIZE
= 50;
3835 static const char *strlist
[] =
3837 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3838 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3839 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3840 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3841 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3842 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3843 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3844 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3845 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3846 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3847 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
3848 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3849 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3850 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3851 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3852 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3853 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3854 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3855 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3856 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3857 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3858 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3859 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3860 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3861 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3862 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3863 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
3864 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3865 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3866 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3867 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3868 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3869 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
3870 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3871 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3872 "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
3873 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3874 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3875 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3876 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3877 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3878 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3879 "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
3880 "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
3881 "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
3882 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3883 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3886 TEST_ASSERT_SUCCESS(status
);
3887 if (U_FAILURE(status
)) {
3890 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
3891 // printf("looping %d\n", loop);
3892 int32_t t
= u_unescape(strlist
[loop
], str
, STRSIZE
);
3899 UnicodeString
ustr(str
);
3900 RBBILineMonkey monkey
;
3901 if (U_FAILURE(monkey
.deferredStatus
)) {
3905 const int EXPECTEDSIZE
= 50;
3906 int expected
[EXPECTEDSIZE
];
3907 int expectedcount
= 0;
3909 monkey
.setText(ustr
);
3911 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
3912 if (expectedcount
>= EXPECTEDSIZE
) {
3913 TEST_ASSERT(expectedcount
< EXPECTEDSIZE
);
3916 expected
[expectedcount
++] = i
;
3919 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
3925 void RBBITest::TestSentBreaks(void)
3927 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3928 Locale
locale("en");
3929 UErrorCode status
= U_ZERO_ERROR
;
3930 BreakIterator
*bi
= BreakIterator::createSentenceInstance(locale
, status
);
3932 static const char *strlist
[] =
3934 "Now\ris\nthe\r\ntime\n\rfor\r\r",
3936 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3937 "\"Sentence ending with a quote.\" Bye.",
3938 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
3939 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3940 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3941 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3942 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3943 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3944 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3945 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3946 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3947 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3948 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3949 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3950 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3951 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3952 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3953 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3956 if (U_FAILURE(status
)) {
3957 errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
));
3960 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
3961 u_unescape(strlist
[loop
], str
, (int32_t)(sizeof(str
) / sizeof(str
[0])));
3962 UnicodeString
ustr(str
);
3964 RBBISentMonkey monkey
;
3965 if (U_FAILURE(monkey
.deferredStatus
)) {
3969 const int EXPECTEDSIZE
= 50;
3970 int expected
[EXPECTEDSIZE
];
3971 int expectedcount
= 0;
3973 monkey
.setText(ustr
);
3975 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
3976 if (expectedcount
>= EXPECTEDSIZE
) {
3977 TEST_ASSERT(expectedcount
< EXPECTEDSIZE
);
3980 expected
[expectedcount
++] = i
;
3983 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
3989 void RBBITest::TestMonkey(char *params
) {
3990 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3992 UErrorCode status
= U_ZERO_ERROR
;
3993 int32_t loopCount
= 500;
3995 UnicodeString breakType
= "all";
3996 Locale
locale("en");
3997 UBool useUText
= FALSE
;
3999 if (quick
== FALSE
) {
4004 UnicodeString
p(params
);
4005 loopCount
= getIntParam("loop", p
, loopCount
);
4006 seed
= getIntParam("seed", p
, seed
);
4008 RegexMatcher
m(" *type *= *(char|word|line|sent|title) *", p
, 0, status
);
4010 breakType
= m
.group(1, status
);
4012 p
= m
.replaceFirst("", status
);
4015 RegexMatcher
u(" *utext", p
, 0, status
);
4019 p
= u
.replaceFirst("", status
);
4024 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p
, 0, status
).find()) {
4025 // Each option is stripped out of the option string as it is processed.
4026 // All options have been checked. The option string should have been completely emptied.
4028 p
.extract(buf
, sizeof(buf
), NULL
, status
);
4029 buf
[sizeof(buf
)-1] = 0;
4030 errln("Unrecognized or extra parameter: %s\n", buf
);
4036 if (breakType
== "char" || breakType
== "all") {
4038 BreakIterator
*bi
= BreakIterator::createCharacterInstance(locale
, status
);
4039 if (U_SUCCESS(status
)) {
4040 RunMonkey(bi
, m
, "char", seed
, loopCount
, useUText
);
4041 if (breakType
== "all" && useUText
==FALSE
) {
4042 // Also run a quick test with UText when "all" is specified
4043 RunMonkey(bi
, m
, "char", seed
, loopCount
, TRUE
);
4047 errcheckln(status
, "Creation of character break iterator failed %s", u_errorName(status
));
4052 if (breakType
== "word" || breakType
== "all") {
4053 logln("Word Break Monkey Test");
4055 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
4056 if (U_SUCCESS(status
)) {
4057 RunMonkey(bi
, m
, "word", seed
, loopCount
, useUText
);
4060 errcheckln(status
, "Creation of word break iterator failed %s", u_errorName(status
));
4065 if (breakType
== "line" || breakType
== "all") {
4066 logln("Line Break Monkey Test");
4068 BreakIterator
*bi
= BreakIterator::createLineInstance(locale
, status
);
4069 if (loopCount
>= 10) {
4070 loopCount
= loopCount
/ 5; // Line break runs slower than the others.
4072 if (U_SUCCESS(status
)) {
4073 RunMonkey(bi
, m
, "line", seed
, loopCount
, useUText
);
4076 errcheckln(status
, "Creation of line break iterator failed %s", u_errorName(status
));
4081 if (breakType
== "sent" || breakType
== "all" ) {
4082 logln("Sentence Break Monkey Test");
4084 BreakIterator
*bi
= BreakIterator::createSentenceInstance(locale
, status
);
4085 if (loopCount
>= 10) {
4086 loopCount
= loopCount
/ 10; // Sentence runs slower than the other break types
4088 if (U_SUCCESS(status
)) {
4089 RunMonkey(bi
, m
, "sentence", seed
, loopCount
, useUText
);
4092 errcheckln(status
, "Creation of line break iterator failed %s", u_errorName(status
));
4101 // Run a RBBI monkey test. Common routine, for all break iterator types.
4103 // bi - the break iterator to use
4104 // mk - MonkeyKind, abstraction for obtaining expected results
4105 // name - Name of test (char, word, etc.) for use in error messages
4106 // seed - Seed for starting random number generator (parameter from user)
4109 void RBBITest::RunMonkey(BreakIterator
*bi
, RBBIMonkeyKind
&mk
, const char *name
, uint32_t seed
,
4110 int32_t numIterations
, UBool useUText
) {
4112 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4114 const int32_t TESTSTRINGLEN
= 500;
4115 UnicodeString testText
;
4116 int32_t numCharClasses
;
4118 int expected
[TESTSTRINGLEN
*2 + 1];
4119 int expectedCount
= 0;
4120 char expectedBreaks
[TESTSTRINGLEN
*2 + 1];
4121 char forwardBreaks
[TESTSTRINGLEN
*2 + 1];
4122 char reverseBreaks
[TESTSTRINGLEN
*2+1];
4123 char isBoundaryBreaks
[TESTSTRINGLEN
*2+1];
4124 char followingBreaks
[TESTSTRINGLEN
*2+1];
4125 char precedingBreaks
[TESTSTRINGLEN
*2+1];
4131 numCharClasses
= mk
.charClasses()->size();
4132 chClasses
= mk
.charClasses();
4134 // Check for errors that occurred during the construction of the MonkeyKind object.
4135 // Can't report them where they occurred because errln() is a method coming from intlTest,
4136 // and is not visible outside of RBBITest :-(
4137 if (U_FAILURE(mk
.deferredStatus
)) {
4138 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk
.deferredStatus
));
4142 // Verify that the character classes all have at least one member.
4143 for (i
=0; i
<numCharClasses
; i
++) {
4144 UnicodeSet
*s
= (UnicodeSet
*)chClasses
->elementAt(i
);
4145 if (s
== NULL
|| s
->size() == 0) {
4146 errln("Character Class #%d is null or of zero size.", i
);
4151 while (loopCount
< numIterations
|| numIterations
== -1) {
4152 if (numIterations
== -1 && loopCount
% 10 == 0) {
4153 // If test is running in an infinite loop, display a periodic tick so
4154 // we can tell that it is making progress.
4155 fprintf(stderr
, ".");
4157 // Save current random number seed, so that we can recreate the random numbers
4158 // for this loop iteration in event of an error.
4161 // Populate a test string with data.
4162 testText
.truncate(0);
4163 for (i
=0; i
<TESTSTRINGLEN
; i
++) {
4164 int32_t aClassNum
= m_rand() % numCharClasses
;
4165 UnicodeSet
*classSet
= (UnicodeSet
*)chClasses
->elementAt(aClassNum
);
4166 int32_t charIdx
= m_rand() % classSet
->size();
4167 UChar32 c
= classSet
->charAt(charIdx
);
4168 if (c
< 0) { // TODO: deal with sets containing strings.
4175 // Calculate the expected results for this test string.
4176 mk
.setText(testText
);
4177 memset(expectedBreaks
, 0, sizeof(expectedBreaks
));
4178 expectedBreaks
[0] = 1;
4179 int32_t breakPos
= 0;
4182 breakPos
= mk
.next(breakPos
);
4183 if (breakPos
== -1) {
4186 if (breakPos
> testText
.length()) {
4187 errln("breakPos > testText.length()");
4189 expectedBreaks
[breakPos
] = 1;
4190 U_ASSERT(expectedCount
<testText
.length());
4191 expected
[expectedCount
++] = breakPos
;
4192 (void)expected
; // Set but not used warning.
4193 // TODO (andy): check it out.
4196 // Find the break positions using forward iteration
4197 memset(forwardBreaks
, 0, sizeof(forwardBreaks
));
4199 UErrorCode status
= U_ZERO_ERROR
;
4200 UText
*testUText
= utext_openReplaceable(NULL
, &testText
, &status
);
4201 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4202 bi
->setText(testUText
, status
);
4203 TEST_ASSERT_SUCCESS(status
);
4204 utext_close(testUText
); // The break iterator does a shallow clone of the UText
4205 // This UText can be closed immediately, so long as the
4206 // testText string continues to exist.
4208 bi
->setText(testText
);
4211 for (i
=bi
->first(); i
!= BreakIterator::DONE
; i
=bi
->next()) {
4212 if (i
< 0 || i
> testText
.length()) {
4213 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name
);
4216 forwardBreaks
[i
] = 1;
4219 // Find the break positions using reverse iteration
4220 memset(reverseBreaks
, 0, sizeof(reverseBreaks
));
4221 for (i
=bi
->last(); i
!= BreakIterator::DONE
; i
=bi
->previous()) {
4222 if (i
< 0 || i
> testText
.length()) {
4223 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name
);
4226 reverseBreaks
[i
] = 1;
4229 // Find the break positions using isBoundary() tests.
4230 memset(isBoundaryBreaks
, 0, sizeof(isBoundaryBreaks
));
4231 U_ASSERT((int32_t)sizeof(isBoundaryBreaks
) > testText
.length());
4232 for (i
=0; i
<=testText
.length(); i
++) {
4233 isBoundaryBreaks
[i
] = bi
->isBoundary(i
);
4237 // Find the break positions using the following() function.
4239 memset(followingBreaks
, 0, sizeof(followingBreaks
));
4240 int32_t lastBreakPos
= 0;
4241 followingBreaks
[0] = 1;
4242 for (i
=0; i
<testText
.length(); i
++) {
4243 breakPos
= bi
->following(i
);
4244 if (breakPos
<= i
||
4245 breakPos
< lastBreakPos
||
4246 breakPos
> testText
.length() ||
4247 (breakPos
> lastBreakPos
&& lastBreakPos
> i
)) {
4248 UChar32 brkChar
= testText
.char32At(lastBreakPos
);
4249 if ((strcmp(name
, "char") != 0 && strcmp(name
, "word") != 0) || brkChar
< 0x1F1E6 || brkChar
> 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4250 errln("%s break monkey test: "
4251 "Out of range value returned by BreakIterator::following().\n"
4252 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4253 name
, seed
, i
, breakPos
, lastBreakPos
);
4257 followingBreaks
[breakPos
] = 1;
4258 lastBreakPos
= breakPos
;
4261 // Find the break positions using the preceding() function.
4262 memset(precedingBreaks
, 0, sizeof(precedingBreaks
));
4263 lastBreakPos
= testText
.length();
4264 precedingBreaks
[testText
.length()] = 1;
4265 for (i
=testText
.length(); i
>0; i
--) {
4266 breakPos
= bi
->preceding(i
);
4267 if (breakPos
>= i
||
4268 breakPos
> lastBreakPos
||
4269 (breakPos
< 0 && testText
.getChar32Start(i
)>0) ||
4270 (breakPos
< lastBreakPos
&& lastBreakPos
< testText
.getChar32Start(i
)) ) {
4271 UChar32 brkChar
= testText
.char32At(breakPos
);
4272 if ((strcmp(name
, "char") != 0 && strcmp(name
, "word") != 0) || brkChar
< 0x1F1E6 || brkChar
> 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4273 errln("%s break monkey test: "
4274 "Out of range value returned by BreakIterator::preceding().\n"
4275 "index=%d; prev returned %d; lastBreak=%d" ,
4276 name
, i
, breakPos
, lastBreakPos
);
4277 if (breakPos
>= 0 && breakPos
< (int32_t)sizeof(precedingBreaks
)) {
4278 precedingBreaks
[i
] = 2; // Forces an error.
4282 if (breakPos
>= 0) {
4283 precedingBreaks
[breakPos
] = 1;
4285 lastBreakPos
= breakPos
;
4289 // Compare the expected and actual results.
4290 for (i
=0; i
<=testText
.length(); i
++) {
4291 const char *errorType
= NULL
;
4292 if (forwardBreaks
[i
] != expectedBreaks
[i
]) {
4293 errorType
= "next()";
4294 } else if (reverseBreaks
[i
] != forwardBreaks
[i
]) {
4295 errorType
= "previous()";
4296 } else if (isBoundaryBreaks
[i
] != expectedBreaks
[i
]) {
4297 errorType
= "isBoundary()";
4298 } else if (followingBreaks
[i
] != expectedBreaks
[i
]) {
4299 errorType
= "following()";
4300 } else if (precedingBreaks
[i
] != expectedBreaks
[i
]) {
4301 errorType
= "preceding()";
4305 if (errorType
!= NULL
) {
4306 // Format a range of the test text that includes the failure as
4307 // a data item that can be included in the rbbi test data file.
4309 // Start of the range is the last point where expected and actual results
4310 // both agreed that there was a break position.
4311 int startContext
= i
;
4314 if (startContext
==0) { break; }
4316 if (expectedBreaks
[startContext
] != 0) {
4317 if (count
== 2) break;
4322 // End of range is two expected breaks past the start position.
4323 int endContext
= i
+ 1;
4325 for (ci
=0; ci
<2; ci
++) { // Number of items to include in error text.
4327 if (endContext
>= testText
.length()) {break;}
4328 if (expectedBreaks
[endContext
-1] != 0) {
4329 if (count
== 0) break;
4336 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4337 UnicodeString errorText
= "<data>";
4338 /***if (strcmp(errorType, "next()") == 0) {
4340 endContext = testText.length();
4342 printStringBreaks(testText, expected, expectedCount);
4345 for (ci
=startContext
; ci
<endContext
;) {
4346 UnicodeString
hexChars("0123456789abcdef");
4349 c
= testText
.char32At(ci
);
4351 // This is the location of the error.
4352 errorText
.append("<?>");
4353 } else if (expectedBreaks
[ci
] != 0) {
4354 // This is a non-error expected break position.
4355 errorText
.append("\\");
4358 errorText
.append("\\u");
4359 for (bn
=12; bn
>=0; bn
-=4) {
4360 errorText
.append(hexChars
.charAt((c
>>bn
)&0xf));
4363 errorText
.append("\\U");
4364 for (bn
=28; bn
>=0; bn
-=4) {
4365 errorText
.append(hexChars
.charAt((c
>>bn
)&0xf));
4368 ci
= testText
.moveIndex32(ci
, 1);
4370 errorText
.append("\\");
4371 errorText
.append("</data>\n");
4374 char charErrorTxt
[500];
4375 UErrorCode status
= U_ZERO_ERROR
;
4376 errorText
.extract(charErrorTxt
, sizeof(charErrorTxt
), NULL
, status
);
4377 charErrorTxt
[sizeof(charErrorTxt
)-1] = 0;
4378 const char *badLocale
= bi
->getLocaleID(ULOC_ACTUAL_LOCALE
, status
);
4380 UChar32 brkChar
= testText
.char32At(i
);
4381 if ((strcmp(name
, "char") != 0 && strcmp(name
, "word") != 0) || brkChar
< 0x1F1E6 || brkChar
> 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4382 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
4383 name
, badLocale
, (expectedBreaks
[i
]? "break expected but not found" : "break found but not expected"),
4384 errorType
, seed
, i
, charErrorTxt
);
4396 // Bug 5532. UTF-8 based UText fails in dictionary code.
4397 // This test checks the initial patch,
4398 // which is to just keep it from crashing. Correct word boundaries
4399 // await a proper fix to the dictionary code.
//
// What the test does: opens a UText over the UTF-8 byte array below, attaches
// it to a word break iterator created for Locale("th"), and walks the text
// forward with first()/next(). It asserts only that every boundary is strictly
// greater than the previous one and that at least one boundary was reported;
// it does not check where the boundaries fall.
4401 void RBBITest::TestBug5532(void) {
4402 // Text includes a mixture of Thai and Latin.
// The bytes are raw UTF-8; the final 0x00 terminates the buffer.
4403 const unsigned char utf8Data
[] = {
4404 0xE0u
, 0xB8u
, 0x82u
, 0xE0u
, 0xB8u
, 0xB2u
, 0xE0u
, 0xB8u
, 0xA2u
, 0xE0u
,
4405 0xB9u
, 0x80u
, 0xE0u
, 0xB8u
, 0x84u
, 0xE0u
, 0xB8u
, 0xA3u
, 0xE0u
, 0xB8u
,
4406 0xB7u
, 0xE0u
, 0xB9u
, 0x88u
, 0xE0u
, 0xB8u
, 0xADu
, 0xE0u
, 0xB8u
, 0x87u
,
4407 0xE0u
, 0xB9u
, 0x80u
, 0xE0u
, 0xB8u
, 0xA5u
, 0xE0u
, 0xB9u
, 0x88u
, 0xE0u
,
4408 0xB8u
, 0x99u
, 0xE0u
, 0xB8u
, 0x8Bu
, 0xE0u
, 0xB8u
, 0xB5u
, 0xE0u
, 0xB8u
,
4409 0x94u
, 0xE0u
, 0xB8u
, 0xB5u
, 0x20u
, 0x73u
, 0x69u
, 0x6Du
, 0x20u
, 0x61u
,
4410 0x75u
, 0x64u
, 0x69u
, 0x6Fu
, 0x2Fu
, 0x20u
, 0x4Du
, 0x4Fu
, 0x4Fu
, 0x4Eu
,
4411 0x20u
, 0x65u
, 0x63u
, 0x6Cu
, 0x69u
, 0x70u
, 0x73u
, 0x65u
, 0x20u
, 0xE0u
,
4412 0xB8u
, 0xA3u
, 0xE0u
, 0xB8u
, 0xB2u
, 0xE0u
, 0xB8u
, 0x84u
, 0xE0u
, 0xB8u
,
4413 0xB2u
, 0x20u
, 0x34u
, 0x37u
, 0x30u
, 0x30u
, 0x20u
, 0xE0u
, 0xB8u
, 0xA2u
,
4414 0xE0u
, 0xB8u
, 0xB9u
, 0xE0u
, 0xB9u
, 0x82u
, 0xE0u
, 0xB8u
, 0xA3u
, 0x00};
4416 UErrorCode status
= U_ZERO_ERROR
;
// The UText is a local object initialized with UTEXT_INITIALIZER and opened
// in place, so it is closed with utext_close() at the end of the function.
4417 UText utext
=UTEXT_INITIALIZER
;
// A length of -1 tells utext_openUTF8 that the buffer is NUL-terminated,
// which is why utf8Data ends with 0x00.
4418 utext_openUTF8(&utext
, (const char *)utf8Data
, -1, &status
);
4419 TEST_ASSERT_SUCCESS(status
);
4421 BreakIterator
*bi
= BreakIterator::createWordInstance(Locale("th"), status
);
4422 TEST_ASSERT_SUCCESS(status
);
// status is shared by the UText open and the iterator creation, so the
// iterator is only exercised when both succeeded.
4423 if (U_SUCCESS(status
)) {
4424 bi
->setText(&utext
, status
);
4425 TEST_ASSERT_SUCCESS(status
);
4427 int32_t breakCount
= 0;
// Starts below any valid offset so the first boundary passes the check.
4428 int32_t previousBreak
= -1;
// breakCount is incremented once for every boundary next() returns.
4429 for (bi
->first(); bi
->next() != BreakIterator::DONE
; breakCount
++) {
4430 // For now, just make sure that the break iterator doesn't hang.
// Each boundary must be strictly greater than the one before it.
4431 TEST_ASSERT(previousBreak
< bi
->current());
4432 previousBreak
= bi
->current();
// At least one boundary must have been found in the text.
4434 TEST_ASSERT(breakCount
> 0);
// NOTE(review): no release of bi is visible here -- confirm the iterator
// returned by createWordInstance() is deleted before the function returns.
4437 utext_close(&utext
);
// TestBug9983 - walks the same text backward with previous() using two word
// break iterators, one created for the root locale and one for en_US_POSIX.
// At every boundary it queries getRuleStatus() (the value itself is unused),
// and after each walk it asserts that iterationCount equals 6.
4441 void RBBITest::TestBug9983(void) {
// The escapes are written with doubled backslashes so they reach the
// UnicodeString as literal "\uXXXX" text; unescape() then converts them.
4442 UnicodeString text
= UnicodeString("\\u002A" // * Other
4444 "\\u309C" // Katakana
4448 "\\u0000").unescape();
4450 UErrorCode status
= U_ZERO_ERROR
;
// createWordInstance() is cast down to RuleBasedBreakIterator here.
// NOTE(review): this assumes the factory returns a RuleBasedBreakIterator
// for these locales -- confirm.
4451 LocalPointer
<RuleBasedBreakIterator
> brkiter(static_cast<RuleBasedBreakIterator
*>(
4452 BreakIterator::createWordInstance(Locale::getRoot(), status
)));
4453 TEST_ASSERT_SUCCESS(status
);
4454 LocalPointer
<RuleBasedBreakIterator
> brkiterPOSIX(static_cast<RuleBasedBreakIterator
*>(
4455 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status
)));
4456 TEST_ASSERT_SUCCESS(status
);
// Both factory calls share status, so this guard fires if either failed.
4457 if (U_FAILURE(status
)) {
// NOTE(review): all three are declared without initializers; confirm that
// iterationCount is reset to 0 before each loop and incremented inside it.
4460 int32_t offset
, rstatus
, iterationCount
;
// NOTE(review): unlike the brkiterPOSIX pass below, no last() call is visible
// before this backward walk -- confirm the iterator is positioned at the end.
4462 brkiter
->setText(text
);
// Step backward until previous() reports UBRK_DONE.
4465 while ( (offset
= brkiter
->previous()) != UBRK_DONE
) {
4467 rstatus
= brkiter
->getRuleStatus();
4468 (void)rstatus
; // Suppress set but not used warning.
// Presumably a safety cap so a misbehaving iterator cannot loop forever.
4469 if (iterationCount
>= 10) {
4473 TEST_ASSERT(iterationCount
== 6);
// Same backward walk, this time with the en_US_POSIX iterator.
4475 brkiterPOSIX
->setText(text
);
4476 brkiterPOSIX
->last();
4478 while ( (offset
= brkiterPOSIX
->previous()) != UBRK_DONE
) {
4480 rstatus
= brkiterPOSIX
->getRuleStatus();
4481 (void)rstatus
; // Suppress set but not used warning.
// Presumably the same safety cap as in the first loop.
4482 if (iterationCount
>= 10) {
4486 TEST_ASSERT(iterationCount
== 6);
4491 // TestDebug - A place-holder test for debugging purposes.
4492 // For putting in fragments of other tests that can be invoked
4493 // for tracing without a lot of unwanted extra stuff happening.
//
// As currently written it creates a sentence break iterator for the default
// locale, prints whether offset 8 is a boundary, and then prints pos and
// ruleStatus while stepping backward with previous() until
// BreakIterator::DONE. Output goes to stdout via printf; nothing is asserted.
4495 void RBBITest::TestDebug(void) {
4497 UErrorCode status
= U_ZERO_ERROR
;
// The two commented-out initializers below are alternative iterator kinds
// (line, Thai word) kept for quick swapping while debugging.
// NOTE(review): no check of status is visible before bi is dereferenced --
// confirm that iterator creation cannot fail on this path.
4501 RuleBasedBreakIterator
* bi
=
4502 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4503 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4504 (RuleBasedBreakIterator
*)BreakIterator::createSentenceInstance(Locale::getDefault(), status
);
// Test text written as escape sequences; the doubled backslashes keep them
// as literal text in the C++ string.
4505 UnicodeString
s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4506 // UnicodeString s("Aaa. Bcd");
// Ask whether offset 8 is a boundary and print the answer.
4509 UBool r
= bi
->isBoundary(8);
4510 printf("%s", r
?"true":"false");
// The getRuleStatus() call is commented out, so nothing in the loop updates
// ruleStatus; each line prints the current position and that unchanged value.
4514 // ruleStatus = bi->getRuleStatus();
4515 printf("%d\t%d\n", pos
, ruleStatus
);
4516 pos
= bi
->previous();
// Keep stepping backward until the iterator reports DONE.
4517 } while (pos
!= BreakIterator::DONE
);
// TestProperties - checks that the UnicodeSet built from the pattern
// "[:GCB=Prepend:]" is empty. If it is not, the message below tells the
// maintainer to update source/data/brkitr/char.txt and invert this test.
4521 void RBBITest::TestProperties() {
4522 UErrorCode errorCode
= U_ZERO_ERROR
;
// NOTE(review): errorCode is not checked in the visible code -- confirm that
// a failed set construction cannot be mistaken for an empty set.
4523 UnicodeSet
prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode
);
4524 if (!prependSet
.isEmpty()) {
// Adjacent string literals forming one message; presumably the argument of an
// error-reporting call.
4526 "[:GCB=Prepend:] is not empty any more. "
4527 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4528 "change this test to the opposite condition.");
4532 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */