1 /********************************************************************
3 * Copyright (c) 1999-2012, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6 /************************************************************************
7 * Date Name Description
8 * 12/15/99 Madhu Creation.
9 * 01/12/2000 Madhu Updated for changed API and added new tests
10 ************************************************************************/
12 #include <typeinfo> // for 'typeid' to work
14 #include "unicode/utypes.h"
16 #if !UCONFIG_NO_BREAK_ITERATION
18 #include "unicode/utypes.h"
19 #include "unicode/brkiter.h"
20 #include "unicode/rbbi.h"
21 #include "unicode/uchar.h"
22 #include "unicode/utf16.h"
23 #include "unicode/ucnv.h"
24 #include "unicode/schriter.h"
25 #include "unicode/uniset.h"
26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
27 #include "unicode/regex.h"
29 #include "unicode/ustring.h"
30 #include "unicode/utext.h"
41 #define TEST_ASSERT(x) {if (!(x)) { \
42 errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
44 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
45 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
48 //---------------------------------------------
50 //---------------------------------------------
53 // Note: Before adding new tests to this file, check whether the desired test data can
54 // simply be added to the file testdata/rbbitest.txt. In most cases it can,
55 // it's much less work than writing a new test, diagnostic output in the event of failures
56 // is good, and the test data file is shared with ICU4J, so eventually the test
57 // will run there as well, without additional effort.
// runIndexedTest. Dispatch table for this test suite: each `case` assigns the
// name of the test stored at that index and, when exec is true, runs it.
//   index  - number of the requested test slot.
//   exec   - when true the selected test is executed; the name is assigned either way.
//   name   - output; receives the test name, "skip" for a slot that has no test
//            in this build configuration, or "" from the default case.
//   params - forwarded to TestMonkey only.
// Slots guarded by UCONFIG_NO_FILE_IO / UCONFIG_NO_REGULAR_EXPRESSIONS have a
// second `case` with the same index that names the slot "skip".
59 void RBBITest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* params
)
61 if (exec
) logln("TestSuite RuleBasedBreakIterator: ");
64 #if !UCONFIG_NO_FILE_IO
65 case 0: name
= "TestBug4153072";
66 if(exec
) TestBug4153072(); break;
68 case 0: name
= "skip";
72 case 1: name
= "skip";
74 case 2: name
= "TestStatusReturn";
75 if(exec
) TestStatusReturn(); break;
77 #if !UCONFIG_NO_FILE_IO
78 case 3: name
= "TestUnicodeFiles";
79 if(exec
) TestUnicodeFiles(); break;
80 case 4: name
= "TestEmptyString";
81 if(exec
) TestEmptyString(); break;
83 case 3: case 4: name
= "skip";
87 case 5: name
= "TestGetAvailableLocales";
88 if(exec
) TestGetAvailableLocales(); break;
90 case 6: name
= "TestGetDisplayName";
91 if(exec
) TestGetDisplayName(); break;
93 #if !UCONFIG_NO_FILE_IO
94 case 7: name
= "TestEndBehaviour";
95 if(exec
) TestEndBehaviour(); break;
96 case 8: case 9: case 10: name
= "skip";
98 case 11: name
= "TestWordBreaks";
99 if(exec
) TestWordBreaks(); break;
100 case 12: name
= "TestWordBoundary";
101 if(exec
) TestWordBoundary(); break;
102 case 13: name
= "TestLineBreaks";
103 if(exec
) TestLineBreaks(); break;
104 case 14: name
= "TestSentBreaks";
105 if(exec
) TestSentBreaks(); break;
106 case 15: name
= "TestExtended";
107 if(exec
) TestExtended(); break;
109 case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name
= "skip";
// TestMonkey is the only test that receives the params string.
113 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
114 case 16: name
= "TestMonkey";
115 if(exec
) TestMonkey(params
); break;
118 name
= "skip"; break;
121 #if !UCONFIG_NO_FILE_IO
122 case 17: name
= "TestBug3818";
123 if(exec
) TestBug3818(); break;
125 case 17: name
= "skip";
129 case 18: name
= "skip";
131 case 19: name
= "TestDebug";
132 if(exec
) TestDebug(); break;
133 case 20: name
= "TestTrieDict";
134 if(exec
) TestTrieDict(); break;
136 #if !UCONFIG_NO_FILE_IO
137 case 21: name
= "TestBug5775";
138 if (exec
) TestBug5775(); break;
140 case 21: name
= "skip";
144 case 22: name
= "skip";
146 case 23: name
= "TestDictRules";
147 if (exec
) TestDictRules(); break;
148 case 24: name
= "TestBug5532";
149 if (exec
) TestBug5532(); break;
150 default: name
= ""; break; //needed to end loop
155 //---------------------------------------------------------------------------
157 // class BITestData Holds a set of Break iterator test data and results
159 // - the string data to be broken
160 // - a vector of the expected break positions.
161 // - a vector of source line numbers for the data,
162 // (to help see where errors occurred.)
163 // - The expected break tag values.
164 // - Vectors of actual break positions and tag values.
165 // - Functions for comparing actual with expected and
168 //----------------------------------------------------------------------------
171 UnicodeString fDataToBreak
;
172 UVector fExpectedBreakPositions
;
173 UVector fExpectedTags
;
175 UVector fActualBreakPositions
; // Test Results.
178 BITestData(UErrorCode
&status
);
179 void addDataChunk(const char *data
, int32_t tag
, int32_t lineNum
, UErrorCode status
);
180 void checkResults(const char *heading
, RBBITest
*test
);
181 void err(const char *heading
, RBBITest
*test
, int32_t expectedIdx
, int32_t actualIdx
);
188 BITestData::BITestData(UErrorCode
&status
)
189 : fExpectedBreakPositions(status
), fExpectedTags(status
), fLineNum(status
), fActualBreakPositions(status
),
195 // addDataChunk. Add a section (non-breaking) piece of data to the test data.
196 // The macro form collects the line number, which is helpful
197 // when tracking down failures.
199 // A null data item is inserted at the start of each test's data
200 // to put the starting zero into the data list. The position saved for
201 // each non-null item is its ending position.
203 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status);
// Appends one chunk of text to fDataToBreak and records, in three parallel
// vectors, the expected break position, tag and source line for that chunk.
//   data    - chunk text; converted with CharsToUnicodeString before appending.
//   tag     - expected tag value stored for the break at the end of this chunk.
//   lineNum - source line of the test data, kept for failure reports.
//   status  - nothing is added if this is already a failure code.
// NOTE(review): status is taken BY VALUE, so a failure set by one of the
// addElement() calls below only changes the local copy and is never seen by
// the caller.
204 void BITestData::addDataChunk(const char *data
, int32_t tag
, int32_t lineNum
, UErrorCode status
) {
205 if (U_FAILURE(status
)) {return;}
207 fDataToBreak
.append(CharsToUnicodeString(data
));
// The recorded position is the total text length after the append, i.e. the
// end of the chunk that was just added.
209 fExpectedBreakPositions
.addElement(fDataToBreak
.length(), status
);
210 fExpectedTags
.addElement(tag
, status
);
211 fLineNum
.addElement(lineNum
, status
);
216 // checkResults. Compare the actual and expected break positions, report any differences.
// Walks the expected and actual result vectors side by side and reports every
// mismatch through `test`.
//   heading - text placed at the start of each failure message.
//   test    - RBBITest whose errln() receives the failure reports.
// Break position mismatches are reported via err(); tag mismatches are
// reported directly with errln().
218 void BITestData::checkResults(const char *heading
, RBBITest
*test
) {
219 int32_t expectedIndex
= 0;
220 int32_t actualIndex
= 0;
223 // If we've run through both the expected and actual results vectors, we're done.
224 // break out of the loop.
225 if (expectedIndex
>= fExpectedBreakPositions
.size() &&
226 actualIndex
>= fActualBreakPositions
.size()) {
// Expected results are used up while actual results remain: report against
// the last expected entry.
231 if (expectedIndex
>= fExpectedBreakPositions
.size()) {
232 err(heading
, test
, expectedIndex
-1, actualIndex
);
// Actual results are used up while expected results remain: report against
// the last actual entry.
237 if (actualIndex
>= fActualBreakPositions
.size()) {
238 err(heading
, test
, expectedIndex
, actualIndex
-1);
243 if (fActualBreakPositions
.elementAti(actualIndex
) != fExpectedBreakPositions
.elementAti(expectedIndex
)) {
244 err(heading
, test
, expectedIndex
, actualIndex
);
245 // Try to resync the positions of the indices, to avoid a rash of spurious errors.
246 if (fActualBreakPositions
.elementAti(actualIndex
) < fExpectedBreakPositions
.elementAti(expectedIndex
)) {
// Positions agree; the tag recorded with the break must agree as well.
254 if (fActualTags
.elementAti(actualIndex
) != fExpectedTags
.elementAti(expectedIndex
)) {
// The line number is stored as an integer element and printed with %d, so it
// is read with elementAti(), the same accessor err() uses on fLineNum.
255 test
->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",
256 heading
, fLineNum
.elementAti(expectedIndex
),
257 fExpectedTags
.elementAti(expectedIndex
), fActualTags
.elementAti(actualIndex
));
266 // err - An error was found. Report it, along with information about where the
267 // incorrectly broken test data appeared in the source file.
//   heading     - text placed at the start of the failure message.
//   test        - RBBITest whose errln() receives the message.
//   expectedIdx - index into fExpectedBreakPositions / fLineNum.
//   actualIdx   - index into fActualBreakPositions.
// Two messages are possible: an unexpected (too early) break when the actual
// position is smaller than the expected one, otherwise a missing break.
269 void BITestData::err(const char *heading
, RBBITest
*test
, int32_t expectedIdx
, int32_t actualIdx
)
271 int32_t expected
= fExpectedBreakPositions
.elementAti(expectedIdx
);
272 int32_t actual
= fActualBreakPositions
.elementAti(actualIdx
);
// Source line of the test item that the expected break belongs to.
274 int32_t line
= fLineNum
.elementAti(expectedIdx
);
275 if (expectedIdx
> 0) {
276 // The line numbers are off by one because a premature break occurs somewhere
277 // within the previous item, rather than at the start of the current (expected) item.
278 // We want to report the offset of the unexpected break from the start of
279 // this previous item.
280 o
= actual
- fExpectedBreakPositions
.elementAti(expectedIdx
-1);
282 if (actual
< expected
) {
283 test
->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading
, o
, line
, actual
, expected
);
285 test
->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading
, line
, actual
, expected
);
// Empties the two "actual result" vectors (break positions and tags). The
// expected data is left untouched by the statements shown here.
290 void BITestData::clearResults() {
291 fActualBreakPositions
.removeAllElements();
292 fActualTags
.removeAllElements();
296 //--------------------------------------------------------------------------------------
298 // RBBITest constructor and destructor
300 //--------------------------------------------------------------------------------------
302 RBBITest::RBBITest() {
306 RBBITest::~RBBITest() {
309 //-----------------------------------------------------------------------------------
311 // Test for status {tag} return value from break rules.
312 // TODO: a more thorough test.
314 //-----------------------------------------------------------------------------------
// Builds a RuleBasedBreakIterator from an inline rule string, iterates
// forward over a fixed test string, and compares every break position with
// bounds1[] and every getRuleStatus() value with brkStatus[]. Both tables are
// indexed by the same counter i and end with -1.
315 void RBBITest::TestStatusReturn() {
316 UnicodeString
rulesString1("$Letters = [:L:];\n"
317 "$Numbers = [:N:];\n"
320 "Help\\ {4}/me\\!;\n"
321 "[^$Letters $Numbers];\n"
322 "!.*;\n", -1, US_INV
);
323 UnicodeString testString1
= "abc123..abc Help me Help me!";
324 // 01234567890123456789012345678
// Expected break offsets into testString1, and the expected rule status tag
// at each of those breaks.
325 int32_t bounds1
[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
326 int32_t brkStatus
[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
328 UErrorCode status
=U_ZERO_ERROR
;
329 UParseError parseError
;
331 RuleBasedBreakIterator
*bi
= new RuleBasedBreakIterator(rulesString1
, parseError
, status
);
// A construction failure is reported with dataerrln().
332 if(U_FAILURE(status
)) {
333 dataerrln("FAIL : in construction - %s", u_errorName(status
));
337 bi
->setText(testString1
);
338 for (pos
=bi
->first(); pos
!= BreakIterator::DONE
; pos
=bi
->next()) {
339 if (pos
!= bounds1
[i
]) {
340 errln("FAIL: expected break at %d, got %d\n", bounds1
[i
], pos
);
344 int tag
= bi
->getRuleStatus();
345 if (tag
!= brkStatus
[i
]) {
346 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos
, brkStatus
[i
], tag
);
356 static void printStringBreaks(UnicodeString ustr
, int expected
[],
359 UErrorCode status
= U_ZERO_ERROR
;
361 printf("code alpha extend alphanum type word sent line name\n");
363 for (j
= 0; j
< ustr
.length(); j
++) {
364 if (expectedcount
> 0) {
366 for (k
= 0; k
< expectedcount
; k
++) {
367 if (j
== expected
[k
]) {
368 printf("------------------------------------------------ %d\n",
373 UChar32 c
= ustr
.char32At(j
);
377 u_charName(c
, U_UNICODE_CHAR_NAME
, name
, 100, &status
);
378 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c
,
380 u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
),
382 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY
,
384 U_SHORT_PROPERTY_NAME
),
385 u_getPropertyValueName(UCHAR_WORD_BREAK
,
386 u_getIntPropertyValue(c
,
388 U_SHORT_PROPERTY_NAME
),
389 u_getPropertyValueName(UCHAR_SENTENCE_BREAK
,
390 u_getIntPropertyValue(c
,
391 UCHAR_SENTENCE_BREAK
),
392 U_SHORT_PROPERTY_NAME
),
393 u_getPropertyValueName(UCHAR_LINE_BREAK
,
394 u_getIntPropertyValue(c
,
396 U_SHORT_PROPERTY_NAME
),
// Regression test: with a Thai ("th") word break iterator over a string made
// of the same four-code-unit Thai word repeated four times, following(1) and
// following(0) must both return 4, the start of the second word.
402 void RBBITest::TestBug3818() {
403 UErrorCode status
= U_ZERO_ERROR
;
405 // Four Thai words...
406 static const UChar thaiWordData
[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
407 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
408 UnicodeString
thaiStr(thaiWordData
);
410 RuleBasedBreakIterator
* bi
=
411 (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale("th"), status
);
// Either a failure status or a NULL iterator is reported through errcheckln().
412 if (U_FAILURE(status
) || bi
== NULL
) {
413 errcheckln(status
, "Fail at file %s, line %d, status = %s", __FILE__
, __LINE__
, u_errorName(status
));
416 bi
->setText(thaiStr
);
// Starting inside the first word.
418 int32_t startOfSecondWord
= bi
->following(1);
419 if (startOfSecondWord
!= 4) {
420 errln("Fail at file %s, line %d expected start of word at 4, got %d",
421 __FILE__
, __LINE__
, startOfSecondWord
);
// Starting at the very beginning of the text.
423 startOfSecondWord
= bi
->following(0);
424 if (startOfSecondWord
!= 4) {
425 errln("Fail at file %s, line %d expected start of word at 4, got %d",
426 __FILE__
, __LINE__
, startOfSecondWord
);
// Exercises the trie dictionary classes with the word list in the test data
// file "riwords.txt":
//   - fills a MutableTrieDictionary with the non-comment lines of the file,
//   - builds a CompactTrieDictionary from it,
//   - clones the compact dictionary back to a mutable one,
//   - builds a second CompactTrieDictionary from the first one's data(),
// and checks that the word enumeration of each reports the same count as the
// number of words read from the file. The original and the cloned mutable
// dictionaries are also compared word by word.
432 void RBBITest::TestTrieDict() {
433 UErrorCode status
= U_ZERO_ERROR
;
436 // Open and read the test data file.
438 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
439 char testFileName
[1000];
// The length check includes the file name plus slack, so the strcpy/strcat
// below cannot overrun testFileName.
440 if (testDataDirectory
== NULL
|| strlen(testDataDirectory
) + strlen("riwords.txt") + 10 >= sizeof(testFileName
)) {
441 errln("Can't open test data. Path too long.");
444 strcpy(testFileName
, testDataDirectory
);
445 strcat(testFileName
, "riwords.txt");
447 // Items needing deleting at the end
448 MutableTrieDictionary
*mutableDict
= NULL
;
449 CompactTrieDictionary
*compactDict
= NULL
;
450 UnicodeSet
*breaks
= NULL
;
451 UChar
*testFile
= NULL
;
452 StringEnumeration
*enumer1
= NULL
;
453 StringEnumeration
*enumer2
= NULL
;
454 MutableTrieDictionary
*mutable2
= NULL
;
455 StringEnumeration
*cloneEnum
= NULL
;
456 CompactTrieDictionary
*compact2
= NULL
;
459 const UnicodeString
*originalWord
= NULL
;
460 const UnicodeString
*cloneWord
= NULL
;
469 testFile
= ReadAndConvertFile(testFileName
, len
, NULL
, status
);
470 if (U_FAILURE(status
)) {
471 goto cleanup
; /* something went wrong, error already output */
474 mutableDict
= new MutableTrieDictionary(0x0E1C, status
);
475 if (U_FAILURE(status
)) {
476 errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status
));
// Set of characters treated as line terminators while scanning the file.
480 breaks
= new UnicodeSet
;
481 breaks
->add(0x000A); // Line Feed
482 breaks
->add(0x000D); // Carriage Return
483 breaks
->add(0x2028); // Line Separator
484 breaks
->add(0x2029); // Paragraph Separator
486 // Now add each non-comment line of the file as a word.
494 if (uc
== 0x0023) { // #comment line, skip
495 while (uc
&& !breaks
->contains(uc
)) {
499 else while (uc
&& !breaks
->contains(uc
)) {
504 mutableDict
->addWord(word
, wordLen
, status
);
505 if (U_FAILURE(status
)) {
506 errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status
));
512 // Find beginning of next line
513 while (uc
&& breaks
->contains(uc
)) {
// Sanity check on the amount of data read from the file.
520 if (wordCount
< 50) {
521 errln("Word count (%d) unreasonably small\n", wordCount
);
// Word count of the mutable dictionary must match the file.
525 enumer1
= mutableDict
->openWords(status
);
526 if (U_FAILURE(status
)) {
527 errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status
));
532 if (wordCount
!= (testCount
= enumer1
->count(status
))) {
533 errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
534 testCount
, wordCount
, u_errorName(status
));
// Word count of a compact dictionary built from the mutable one must match too.
539 compactDict
= new CompactTrieDictionary(*mutableDict
, status
);
540 if (U_FAILURE(status
)) {
541 errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status
));
545 enumer2
= compactDict
->openWords(status
);
546 if (U_FAILURE(status
)) {
547 errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status
));
551 if (wordCount
!= (testCount
= enumer2
->count(status
))) {
552 errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
553 testCount
, wordCount
, u_errorName(status
));
// The two enumerations are expected to be objects of different dynamic types.
557 if (typeid(*enumer1
) == typeid(*enumer2
)) {
558 errln("CompactTrieEnumeration and MutableTrieEnumeration typeids are the same");
// Clone the compact dictionary back into a mutable one and count again.
566 mutable2
= compactDict
->cloneMutable(status
);
567 if (U_FAILURE(status
)) {
568 errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status
));
572 cloneEnum
= mutable2
->openWords(status
);
573 if (U_FAILURE(status
)) {
574 errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status
));
578 if (wordCount
!= (testCount
= cloneEnum
->count(status
))) {
579 errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
580 testCount
, wordCount
, u_errorName(status
));
584 // Compact original dictionary to clone. Note that we can only compare the same kind of
585 // dictionary as the order of the enumerators is not guaranteed to be the same between
587 enumer1
= mutableDict
->openWords(status
);
588 if (U_FAILURE(status
)) {
589 errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status
));
// Step both enumerations in lock step until either runs out or fails.
593 originalWord
= enumer1
->snext(status
);
594 cloneWord
= cloneEnum
->snext(status
);
595 while (U_SUCCESS(status
) && originalWord
!= NULL
&& cloneWord
!= NULL
) {
596 if (*originalWord
!= *cloneWord
) {
597 errln("Original and cloned MutableTrieDictionary word mismatch\n");
600 originalWord
= enumer1
->snext(status
);
601 cloneWord
= cloneEnum
->snext(status
);
604 if (U_FAILURE(status
)) {
605 errln("Enumeration failed: %s\n", u_errorName(status
));
// Pointer comparison: both are NULL only if the two enumerations ended together.
609 if (originalWord
!= cloneWord
) {
610 errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
614 // Test the data copying constructor for CompactTrieDict, and the data access APIs.
615 compact2
= new CompactTrieDictionary(compactDict
->data(), status
);
616 if (U_FAILURE(status
)) {
617 errln("CompactTrieDictionary(const void *,...) failed\n");
621 if (compact2
->dataSize() == 0) {
622 errln("CompactTrieDictionary->dataSize() == 0\n");
626 // Now count the words via the second dictionary
628 enumer1
= compact2
->openWords(status
);
629 if (U_FAILURE(status
)) {
630 errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status
));
634 if (wordCount
!= (testCount
= enumer1
->count(status
))) {
635 errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
636 testCount
, wordCount
, u_errorName(status
));
652 //----------------------------------------------------------------------------
654 // generalIteratorTest Given a break iterator and a set of test data,
655 // Run the tests and report the results.
657 //----------------------------------------------------------------------------
//   bi - iterator under test; its text is set to td.fDataToBreak.
//   td - test data holding the text and the expected breaks; it is handed to
//        each of the individual test helpers below, in the order shown.
658 void RBBITest::generalIteratorTest(RuleBasedBreakIterator
& bi
, BITestData
&td
)
661 bi
.setText(td
.fDataToBreak
);
663 testFirstAndNext(bi
, td
);
665 testLastAndPrevious(bi
, td
);
667 testFollowing(bi
, td
);
668 testPreceding(bi
, td
);
669 testIsBoundary(bi
, td
);
670 doMultipleSelectionTest(bi
, td
);
675 // testFirstAndNext. Run the iterator forwards in the obvious first(), next()
//   bi - iterator under test; its text is set to td.fDataToBreak.
//   td - receives every break position and rule status tag found while
//        iterating forward, then compares them with the expected data via
//        checkResults().
678 void RBBITest::testFirstAndNext(RuleBasedBreakIterator
& bi
, BITestData
&td
)
680 UErrorCode status
= U_ZERO_ERROR
;
685 logln("Test first and next");
686 bi
.setText(td
.fDataToBreak
);
689 for (p
=bi
.first(); p
!=RuleBasedBreakIterator::DONE
; p
=bi
.next()) {
690 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
// Tag of the rule that produced this break, stored in parallel with the position.
691 tag
= bi
.getRuleStatus();
692 td
.fActualTags
.addElement(tag
, status
);
694 // If the iterator is not making forward progress, stop.
695 // No need to raise an error here, it'll be detected in the normal check of results.
700 td
.checkResults("testFirstAndNext", this);
705 // TestLastAndPrevious. Run the iterator backwards, starting with last().
//   bi - iterator under test; its text is set to td.fDataToBreak.
//   td - receives the breaks found while iterating from last() with
//        previous(); each result is inserted at index 0 so the vectors end up
//        in ascending order before checkResults() compares them.
707 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator
& bi
, BITestData
&td
)
709 UErrorCode status
= U_ZERO_ERROR
;
// Initial "previous position" value, larger than any position the loop stores.
711 int32_t lastP
= 0x7ffffffe;
714 logln("Test last and previous");
715 bi
.setText(td
.fDataToBreak
);
718 for (p
=bi
.last(); p
!=RuleBasedBreakIterator::DONE
; p
=bi
.previous()) {
719 // Save break position. Insert it at start of vector of results, shoving
720 // already-saved results further towards the end.
721 td
.fActualBreakPositions
.insertElementAt(p
, 0, status
);
722 // bi.previous(); // TODO: Why does this fix things up????
724 tag
= bi
.getRuleStatus();
725 td
.fActualTags
.insertElementAt(tag
, 0, status
);
727 // If the iterator is not making progress, stop.
728 // No need to raise an error here, it'll be detected in the normal check of results.
733 td
.checkResults("testLastAndPrevious", this);
// Collects break positions while an index i runs from 0 to two past the text
// length, then checks them against the expected data.
//   bi - iterator under test; its text is set to td.fDataToBreak.
//   td - receives the positions and tags found; compared via checkResults().
737 void RBBITest::testFollowing(RuleBasedBreakIterator
& bi
, BITestData
&td
)
739 UErrorCode status
= U_ZERO_ERROR
;
742 int32_t lastP
= -2; // A value that will never be returned as a break position.
743 // cannot be -1; that is returned for DONE.
746 logln("testFollowing():");
747 bi
.setText(td
.fDataToBreak
);
750 // Save the starting point, since we won't get that out of following.
752 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
753 tag
= bi
.getRuleStatus();
754 td
.fActualTags
.addElement(tag
, status
);
// The index deliberately runs past the end of the text (length()+1).
756 for (i
= 0; i
<= td
.fDataToBreak
.length()+1; i
++) {
759 if (p
== RuleBasedBreakIterator::DONE
) {
762 // We've reached a new break position. Save it.
763 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
764 tag
= bi
.getRuleStatus();
765 td
.fActualTags
.addElement(tag
, status
);
769 // The loop normally exits by means of the break in the middle.
770 // Make sure that the index was at the correct position for the break iterator to have
772 if (i
!= td
.fDataToBreak
.length()) {
773 errln("testFollowing(): iterator returned DONE prematurely.");
776 // Full check of all results.
777 td
.checkResults("testFollowing", this);
// Collects break positions while an index i runs from the text length down
// to -1, then checks them against the expected data.
//   bi - iterator under test; its text is set to td.fDataToBreak.
//   td - receives the positions and tags found; new results are inserted at
//        index 0 so the vectors end up in ascending order for checkResults().
782 void RBBITest::testPreceding(RuleBasedBreakIterator
& bi
, BITestData
&td
) {
783 UErrorCode status
= U_ZERO_ERROR
;
// Initial "previous position" value, larger than any position the loop stores.
786 int32_t lastP
= 0x7ffffffe;
789 logln("testPreceding():");
790 bi
.setText(td
.fDataToBreak
);
// The first position is appended before the loop; later ones are inserted at
// index 0, in front of it.
794 td
.fActualBreakPositions
.addElement(p
, status
);
795 tag
= bi
.getRuleStatus();
796 td
.fActualTags
.addElement(tag
, status
);
// The index deliberately runs below zero (down to -1).
798 for (i
= td
.fDataToBreak
.length(); i
>=-1; i
--) {
801 if (p
== RuleBasedBreakIterator::DONE
) {
804 // We've reached a new break position. Save it.
805 td
.fActualBreakPositions
.insertElementAt(p
, 0, status
);
807 tag
= bi
.getRuleStatus();
808 td
.fActualTags
.insertElementAt(tag
, 0, status
);
811 // The loop normally exits by means of the break in the middle.
812 // Make sure that the index was at the correct position for the break iterator to have
815 errln("testPreceding(): iterator returned DONE prematurely.");
818 // Full check of all results.
819 td
.checkResults("testPreceding", this);
// Asks isBoundary() about every index from 0 through the text length
// (inclusive) and records each index reported as a boundary, together with
// the rule status tag, for comparison with the expected data.
//   bi - iterator under test; its text is set to td.fDataToBreak.
//   td - receives the positions and tags; compared via checkResults().
824 void RBBITest::testIsBoundary(RuleBasedBreakIterator
& bi
, BITestData
&td
) {
825 UErrorCode status
= U_ZERO_ERROR
;
829 logln("testIsBoundary():");
830 bi
.setText(td
.fDataToBreak
);
833 for (i
= 0; i
<= td
.fDataToBreak
.length(); i
++) {
834 if (bi
.isBoundary(i
)) {
835 td
.fActualBreakPositions
.addElement(i
, status
); // Save result.
836 tag
= bi
.getRuleStatus();
837 td
.fActualTags
.addElement(tag
, status
);
840 td
.checkResults("testIsBoundary: ", this);
// Cross-checks single-step iteration against next(n) on a clone of the
// iterator, and exercises operator== / operator!= between the two.
//   iterator - iterator under test; stepped with next() and then previous().
//   td       - supplies the text (td.fDataToBreak).
// For each step `count`, the clone is reset (first() or last()) and moved with
// next(count); its position must equal the one reached by single steps.
845 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator
& iterator
, BITestData
&td
)
847 iterator
.setText(td
.fDataToBreak
);
849 RuleBasedBreakIterator
* testIterator
=(RuleBasedBreakIterator
*)iterator
.clone();
850 int32_t offset
= iterator
.first();
854 logln("doMultipleSelectionTest text of length: %d", td
.fDataToBreak
.length());
// A fresh clone must compare equal to the iterator it was cloned from.
856 if (*testIterator
!= iterator
)
857 errln("clone() or operator!= failed: two clones compared unequal");
860 testOffset
= testIterator
->first();
861 testOffset
= testIterator
->next(count
);
862 if (offset
!= testOffset
)
863 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count
+ ", next(n) returned " + testOffset
+ " and next() had " + offset
);
865 if (offset
!= RuleBasedBreakIterator::DONE
) {
867 offset
= iterator
.next();
// After `iterator` advances one more step, the clone is expected to compare
// unequal to it.
869 if (offset
!= RuleBasedBreakIterator::DONE
&& *testIterator
== iterator
) {
870 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count
, offset
);
871 if (count
> 10000 || offset
== -1) {
872 errln("operator== failed too many times. Stopping test.");
874 errln("Does (RuleBasedBreakIterator::DONE == -1)?");
880 } while (offset
!= RuleBasedBreakIterator::DONE
);
882 // now do it backwards...
883 offset
= iterator
.last();
887 testOffset
= testIterator
->last();
888 testOffset
= testIterator
->next(count
); // next() with a negative arg is same as previous
889 if (offset
!= testOffset
)
890 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count
+ ", next(n) returned " + testOffset
+ " and next() had " + offset
);
892 if (offset
!= RuleBasedBreakIterator::DONE
) {
894 offset
= iterator
.previous();
896 } while (offset
!= RuleBasedBreakIterator::DONE
);
902 //---------------------------------------------
906 //---------------------------------------------
// Runs the general iterator checks with a line break iterator for the default
// locale over empty text. The test data holds a single empty chunk, so the
// only expected break is at position 0.
907 void RBBITest::TestEmptyString()
909 UnicodeString text
= "";
910 UErrorCode status
= U_ZERO_ERROR
;
912 BITestData
x(status
);
913 ADD_DATACHUNK(x
, "", 0, status
); // Break at start of data
914 RuleBasedBreakIterator
* bi
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale::getDefault(), status
);
// A creation failure is reported through errcheckln().
915 if (U_FAILURE(status
))
917 errcheckln(status
, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status
));
920 generalIteratorTest(*bi
, x
);
// Calls BreakIterator::getAvailableLocales(), reports an empty list with
// dataerrln(), and otherwise logs the name of every returned locale so that
// each entry of the returned array is actually read.
924 void RBBITest::TestGetAvailableLocales()
// Filled in by getAvailableLocales() with the number of entries in locList.
926 int32_t locCount
= 0;
927 const Locale
* locList
= BreakIterator::getAvailableLocales(locCount
);
930 dataerrln("getAvailableLocales() returned an empty list!");
931 // Just make sure that it's returning good memory.
933 for (i
= 0; i
< locCount
; ++i
) {
934 logln(locList
[i
].getName());
938 //Testing the BreakIterator::getDisplayName() function
// Checks both overloads of BreakIterator::getDisplayName(): the one that uses
// the default display locale and the one with an explicit display locale.
939 void RBBITest::TestGetDisplayName()
941 UnicodeString result
;
943 BreakIterator::getDisplayName(Locale::getUS(), result
);
// The expected English text is only checked when the default locale is US.
944 if (Locale::getDefault() == Locale::getUS() && result
!= "English (United States)")
945 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
// With an explicit US display locale the result is checked unconditionally.
948 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result
);
949 if (result
!= "French (France)")
950 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
// Checks the breaks a default-locale word break iterator reports for the text
// "boo.": one at the start, one before the period, and one at the end (4).
957 void RBBITest::TestEndBehaviour()
959 UErrorCode status
= U_ZERO_ERROR
;
960 UnicodeString
testString("boo.");
961 BreakIterator
*wb
= BreakIterator::createWordInstance(Locale::getDefault(), status
);
// A creation failure is reported through errcheckln().
962 if (U_FAILURE(status
))
964 errcheckln(status
, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status
));
967 wb
->setText(testString
);
969 if (wb
->first() != 0)
970 errln("Didn't get break at beginning of string.");
972 errln("Didn't get break before period in \"boo.\"");
// Because of the &&, next() is only called when current() is not already 4.
973 if (wb
->current() != 4 && wb
->next() != 4)
974 errln("Didn't get break at end of string.");
// Regression test for isBoundary() on text supplied through a
// StringCharacterIterator that covers only part of a string.
980 void RBBITest::TestBug4153072() {
981 UErrorCode status
= U_ZERO_ERROR
;
982 BreakIterator
*iter
= BreakIterator::createWordInstance(Locale::getDefault(), status
);
// A creation failure is reported through errcheckln().
983 if (U_FAILURE(status
))
985 errcheckln(status
, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status
));
988 UnicodeString
str("...Hello, World!...");
// The range handed to the character iterator stops three code units before
// the end of str.
990 int32_t end
= str
.length() - 3;
993 StringCharacterIterator
* textIterator
= new StringCharacterIterator(str
, begin
, end
, begin
);
// adoptText(): the break iterator takes over the character iterator.
994 iter
->adoptText(textIterator
);
996 // Note: with the switch to UText, there is no way to restrict the
997 // iteration range to begin at an index other than zero.
998 // String character iterators created with a non-zero bound are
999 // treated by RBBI as being empty.
1000 for (index
= -1; index
< begin
+ 1; ++index
) {
1001 onBoundary
= iter
->isBoundary(index
);
// Only index 0 is expected to be a boundary; every other probed index must not be.
1002 if (index
== 0? !onBoundary
: onBoundary
) {
1003 errln((UnicodeString
)"Didn't handle isBoundary correctly with offset = " + index
+
1004 " and begin index = " + begin
);
1012 // Test for problem reported by Ashok Matoria on 9 July 2007
1013 // One.<kSoftHyphen><kSpace>Two.
1015 // Sentence break at start (0) and then on calling next() it breaks at
1016 // 'T' of "Two". Now, at this point if I do next() and
1017 // then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
// Uses an English sentence break iterator on "One.<U+00AD> Two." and asserts
// that the break positions are 6, then 10, and that previous() from there
// returns 6 again.
1019 void RBBITest::TestBug5775() {
1020 UErrorCode status
= U_ZERO_ERROR
;
1021 BreakIterator
*bi
= BreakIterator::createSentenceInstance(Locale::getEnglish(), status
);
1022 TEST_ASSERT_SUCCESS(status
);
1023 if (U_FAILURE(status
)) {
1026 // Check for status first for better handling of no data errors.
1027 TEST_ASSERT(bi
!= NULL
);
// The text is written with an escape sequence for the soft hyphen (\u00ad).
1032 UnicodeString
s("One.\\u00ad Two.", -1, US_INV
);
1036 int pos
= bi
->next();
1037 TEST_ASSERT(pos
== 6);
1039 TEST_ASSERT(pos
== 10);
// Stepping back must land on 6 again, not on the soft hyphen.
1040 pos
= bi
->previous();
1041 TEST_ASSERT(pos
== 6);
1047 //------------------------------------------------------------------------------
1049 // RBBITest::Extended Run RBBI Tests from an external test data file
1051 //------------------------------------------------------------------------------
1055 UnicodeString dataToBreak
;
1056 UVector32
*expectedBreaks
;
// executeTest. Runs the break iterator t->bi over t->dataToBreak, first
// forward (first()/next()) and then backward (last()/previous()), and reports
// every disagreement with t->expectedBreaks:
//   - no progress between two successive breaks,
//   - an expected break that the iterator skipped,
//   - a break the iterator found where none was expected,
//   - a break whose getRuleStatus() differs from the expected tag value.
// t->expectedBreaks is indexed by text position; 0 means "no break expected
// here" and any other value is the expected tag (the value -1 is
// special-cased). t->srcLine / t->srcCol give, per text position, the test
// file line and column used in the failure messages.
1061 void RBBITest::executeTest(TestParams
*t
) {
1066 if (t
->bi
== NULL
) {
1070 t
->bi
->setText(t
->dataToBreak
);
1072 // Run the iterator forward
1075 for (bp
= t
->bi
->first(); bp
!= BreakIterator::DONE
; bp
= t
->bi
->next()) {
1077 // Fail for lack of forward progress.
1078 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
1079 bp
, t
->srcLine
->elementAti(bp
), t
->srcCol
->elementAti(bp
));
1083 // Check that we didn't miss an expected break between the last one
1085 for (i
=prevBP
+1; i
<bp
; i
++) {
1086 if (t
->expectedBreaks
->elementAti(i
) != 0) {
1087 int expected
[] = {0, i
};
1088 printStringBreaks(t
->dataToBreak
, expected
, 2);
1089 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1090 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
));
1094 // Check that the break we did find was expected
1095 if (t
->expectedBreaks
->elementAti(bp
) == 0) {
1096 int expected
[] = {0, bp
};
1097 printStringBreaks(t
->dataToBreak
, expected
, 2);
1098 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1099 bp
, t
->srcLine
->elementAti(bp
), t
->srcCol
->elementAti(bp
));
1101 // The break was expected.
1102 // Check that the {nnn} tag value is correct.
1103 int32_t expectedTagVal
= t
->expectedBreaks
->elementAti(bp
);
1104 if (expectedTagVal
== -1) {
1107 int32_t line
= t
->srcLine
->elementAti(bp
);
1108 int32_t rs
= ((RuleBasedBreakIterator
*)t
->bi
)->getRuleStatus();
1109 if (rs
!= expectedTagVal
) {
1110 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
1111 " Actual, Expected status = %4d, %4d",
1112 bp
, line
, t
->srcCol
->elementAti(bp
), rs
, expectedTagVal
);
1120 // Verify that there were no missed expected breaks after the last one found
1121 for (i
=prevBP
+1; i
<t
->expectedBreaks
->size(); i
++) {
1122 if (t
->expectedBreaks
->elementAti(i
) != 0) {
1123 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1124 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
));
1129 // Run the iterator backwards, verify that the same breaks are found.
1131 prevBP
= t
->dataToBreak
.length()+2; // start with a phony value for the last break pos seen.
1132 for (bp
= t
->bi
->last(); bp
!= BreakIterator::DONE
; bp
= t
->bi
->previous()) {
1134 // Fail for lack of progress.
1135 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
1136 bp
, t
->srcLine
->elementAti(bp
), t
->srcCol
->elementAti(bp
));
1140 // Check that we didn't miss an expected break between the last one
1141 // and this one. (UVector returns zeros for index out of bounds.)
1142 for (i
=prevBP
-1; i
>bp
; i
--) {
1143 if (t
->expectedBreaks
->elementAti(i
) != 0) {
1144 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1145 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
));
1149 // Check that the break we did find was expected
1150 if (t
->expectedBreaks
->elementAti(bp
) == 0) {
1151 errln("Reverse Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1152 bp
, t
->srcLine
->elementAti(bp
), t
->srcCol
->elementAti(bp
));
1154 // The break was expected.
1155 // Check that the {nnn} tag value is correct.
1156 int32_t expectedTagVal
= t
->expectedBreaks
->elementAti(bp
);
1157 if (expectedTagVal
== -1) {
1160 int line
= t
->srcLine
->elementAti(bp
);
1161 int32_t rs
= ((RuleBasedBreakIterator
*)t
->bi
)->getRuleStatus();
1162 if (rs
!= expectedTagVal
) {
1163 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
1164 " Actual, Expected status = %4d, %4d",
1165 bp
, line
, t
->srcCol
->elementAti(bp
), rs
, expectedTagVal
);
1172 // Verify that there were no missed breaks prior to the last one found
// This check belongs to the reverse pass, so its message says "Reverse".
1173 for (i
=prevBP
-1; i
>=0; i
--) {
1174 if (t
->expectedBreaks
->elementAti(i
) != 0) {
1175 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1176 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
));
// TestExtended: data-driven break iterator test.
// Reads "rbbitst.txt" from the directory returned by IntlTest::getSourceTestData(),
// converts it to UTF-16 and walks it one character at a time with a small state
// machine (PARSE_TAG, PARSE_COMMENT, PARSE_DATA, PARSE_NUM). While parsing it
// fills tp.dataToBreak, tp.expectedBreaks, tp.srcLine and tp.srcCol.
// NOTE(review): a number of statements of this function are not visible in this
// listing; the comments added below describe only what the visible lines show.
1182 void RBBITest::TestExtended() {
1183 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1184 UErrorCode status
= U_ZERO_ERROR
;
1187 UnicodeString rules
;
// Vectors holding, per data position, the expected break value and the
// line / column of the test file that the position came from.
1190 tp
.expectedBreaks
= new UVector32(status
);
1191 tp
.srcLine
= new UVector32(status
);
1192 tp
.srcCol
= new UVector32(status
);
// Matcher for a <locale loc_name> tag; capture group 1 is the locale name.
1194 RegexMatcher
localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status
);
1195 if (U_FAILURE(status
)) {
1196 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__
, __LINE__
, u_errorName(status
));
1201 // Open and read the test data file.
1203 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
1204 char testFileName
[1000];
// NOTE(review): this check compares only the directory length against the
// buffer size; it does not reserve room for the file name that strcat()
// appends below.
1205 if (testDataDirectory
== NULL
|| strlen(testDataDirectory
) >= sizeof(testFileName
)) {
1206 errln("Can't open test data. Path too long.");
1209 strcpy(testFileName
, testDataDirectory
);
1210 strcat(testFileName
, "rbbitst.txt");
// The file is read as UTF-8; len receives the length in UChars.
1213 UChar
*testFile
= ReadAndConvertFile(testFileName
, len
, "UTF-8", status
);
1214 if (U_FAILURE(status
)) {
1215 return; /* something went wrong, error already output */
1222 // Put the test data into a UnicodeString
1224 UnicodeString
testString(FALSE
, testFile
, len
);
// The parser starts out expecting a tag.
1232 parseState
= PARSE_TAG
;
1234 EParseState savedState
= PARSE_TAG
;
// Characters that have a special meaning to the test file parser.
1236 static const UChar CH_LF
= 0x0a;
1237 static const UChar CH_CR
= 0x0d;
1238 static const UChar CH_HASH
= 0x23;
1239 /*static const UChar CH_PERIOD = 0x2e;*/
1240 static const UChar CH_LT
= 0x3c;
1241 static const UChar CH_GT
= 0x3e;
1242 static const UChar CH_BACKSLASH
= 0x5c;
1243 static const UChar CH_BULLET
= 0x2022;
1245 int32_t lineNum
= 1;
1246 int32_t colStart
= 0;
1248 int32_t charIdx
= 0;
1250 int32_t tagValue
= 0; // The numeric value of a <nnn> tag.
// Main parse loop. status is cleared at the top of every iteration and c is
// the character found at charIdx.
1252 for (charIdx
= 0; charIdx
< len
; ) {
1253 status
= U_ZERO_ERROR
;
1254 UChar c
= testString
.charAt(charIdx
);
1256 if (c
== CH_CR
&& charIdx
<len
&& testString
.charAt(charIdx
) == CH_LF
) {
1257 // treat CRLF as a unit
1261 if (c
== CH_LF
|| c
== CH_CR
) {
// Column of the current position, measured from colStart.
1265 column
= charIdx
- colStart
+ 1;
1267 switch (parseState
) {
// A line end (LF or CR) restores the state that was saved before the comment.
1269 if (c
== 0x0a || c
== 0x0d) {
1270 parseState
= savedState
;
1277 parseState
= PARSE_COMMENT
;
1278 savedState
= PARSE_TAG
;
1281 if (u_isUWhiteSpace(c
)) {
// Iterator-type tags. Each comparison is followed by creation of the matching
// kind of BreakIterator for the current locale, stored in tp.bi.
// The comparisons start at charIdx-1.
1284 if (testString
.compare(charIdx
-1, 6, "<word>") == 0) {
1286 tp
.bi
= BreakIterator::createWordInstance(locale
, status
);
1290 if (testString
.compare(charIdx
-1, 6, "<char>") == 0) {
1292 tp
.bi
= BreakIterator::createCharacterInstance(locale
, status
);
1296 if (testString
.compare(charIdx
-1, 6, "<line>") == 0) {
1298 tp
.bi
= BreakIterator::createLineInstance(locale
, status
);
1302 if (testString
.compare(charIdx
-1, 6, "<sent>") == 0) {
1305 tp
.bi
= BreakIterator::createSentenceInstance(locale
, status
);
1309 if (testString
.compare(charIdx
-1, 7, "<title>") == 0) {
1311 tp
.bi
= BreakIterator::createTitleInstance(locale
, status
);
1316 // <locale loc_name>
1317 localeMatcher
.reset(testString
);
1318 if (localeMatcher
.lookingAt(charIdx
-1, status
)) {
// Convert the captured name to a char buffer, build the Locale from it and
// advance charIdx by the length of the whole matched tag.
1319 UnicodeString localeName
= localeMatcher
.group(1, status
);
1320 char localeName8
[100];
1321 localeName
.extract(0, localeName
.length(), localeName8
, sizeof(localeName8
), 0);
1322 locale
= Locale::createFromName(localeName8
);
1323 charIdx
+= localeMatcher
.group(0, status
).length();
1324 TEST_ASSERT_SUCCESS(status
);
// <data> begins a block of test data: switch to PARSE_DATA and clear the
// text and the per-position vectors left over from the previous block.
1327 if (testString
.compare(charIdx
-1, 6, "<data>") == 0) {
1328 parseState
= PARSE_DATA
;
1330 tp
.dataToBreak
= "";
1331 tp
.expectedBreaks
->removeAllElements();
1332 tp
.srcCol
->removeAllElements();
1333 tp
.srcLine
->removeAllElements();
// No tag was recognized: report it and stop the test.
1337 errln("line %d: Tag expected in test file.", lineNum
);
1338 parseState
= PARSE_COMMENT
;
1339 savedState
= PARSE_DATA
;
1340 goto end_test
; // Stop the test.
// A bullet records an expected break at the current end of the data,
// with the value -1, together with its source line and column.
1345 if (c
== CH_BULLET
) {
1346 int32_t breakIdx
= tp
.dataToBreak
.length();
1347 tp
.expectedBreaks
->setSize(breakIdx
+1);
1348 tp
.expectedBreaks
->setElementAt(-1, breakIdx
);
1349 tp
.srcLine
->setSize(breakIdx
+1);
1350 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1351 tp
.srcCol
->setSize(breakIdx
+1);
1352 tp
.srcCol
->setElementAt(column
, breakIdx
);
1356 if (testString
.compare(charIdx
-1, 7, "</data>") == 0) {
1357 // Add final entry to mappings from break location to source file position.
1358 // Need one extra because last break position returned is after the
1359 // last char in the data, not at the last char.
1360 tp
.srcLine
->addElement(lineNum
, status
);
1361 tp
.srcCol
->addElement(column
, status
);
1363 parseState
= PARSE_TAG
;
1371 if (testString
.compare(charIdx
-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1372 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1373 // Get the code point from the name and insert it into the test data.
1374 // (Damn, no API takes names in Unicode !!!
1375 // we've got to take it back to char *)
1376 int32_t nameEndIdx
= testString
.indexOf((UChar
)0x7d/*'}'*/, charIdx
);
1377 int32_t nameLength
= nameEndIdx
- (charIdx
+2);
1378 char charNameBuf
[200];
1379 UChar32 theChar
= -1;
1380 if (nameEndIdx
!= -1) {
// A separate, local status is used for the name lookup.
1381 UErrorCode status
= U_ZERO_ERROR
;
1382 testString
.extract(charIdx
+2, nameLength
, charNameBuf
, sizeof(charNameBuf
));
// Force termination of the name buffer before the lookup.
1383 charNameBuf
[sizeof(charNameBuf
)-1] = 0;
1384 theChar
= u_charFromName(U_UNICODE_CHAR_NAME
, charNameBuf
, &status
);
1385 if (U_FAILURE(status
)) {
1389 if (theChar
== -1) {
1390 errln("Error in named character in test file at line %d, col %d",
1393 // Named code point was recognized. Insert it
1394 // into the test data.
1395 tp
.dataToBreak
.append(theChar
);
// Add one line/column entry for every code unit just appended.
1396 while (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1397 tp
.srcLine
->addElement(lineNum
, status
);
1398 tp
.srcCol
->addElement(column
, status
);
// Continue parsing just past the closing '}'.
1401 if (nameEndIdx
> charIdx
) {
1402 charIdx
= nameEndIdx
+1;
// "<>" records an expected break with the value -1 at the current end of the data.
1411 if (testString
.compare(charIdx
-1, 2, "<>") == 0) {
1413 int32_t breakIdx
= tp
.dataToBreak
.length();
1414 tp
.expectedBreaks
->setSize(breakIdx
+1);
1415 tp
.expectedBreaks
->setElementAt(-1, breakIdx
);
1416 tp
.srcLine
->setSize(breakIdx
+1);
1417 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1418 tp
.srcCol
->setSize(breakIdx
+1);
1419 tp
.srcCol
->setElementAt(column
, breakIdx
);
1425 parseState
= PARSE_NUM
;
// A '#' seen when column is 3 starts a comment; data parsing resumes afterwards.
1429 if (c
== CH_HASH
&& column
==3) { // TODO: why is column off so far?
1430 parseState
= PARSE_COMMENT
;
1431 savedState
= PARSE_DATA
;
1435 if (c
== CH_BACKSLASH
) {
1436 // Check for \ at end of line, a line continuation.
1437 // Advance over (discard) the newline
1438 UChar32 cp
= testString
.char32At(charIdx
);
1439 if (cp
== CH_CR
&& charIdx
<len
&& testString
.charAt(charIdx
+1) == CH_LF
) {
1441 // Need an extra increment of the input ptr to move over both of them
1444 if (cp
== CH_LF
|| cp
== CH_CR
) {
1451 // Let unescape handle the back slash.
1452 cp
= testString
.unescapeAt(charIdx
);
1454 // Escape sequence was recognized. Insert the char
1455 // into the test data.
1456 tp
.dataToBreak
.append(cp
);
1457 while (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1458 tp
.srcLine
->addElement(lineNum
, status
);
1459 tp
.srcCol
->addElement(column
, status
);
1465 // Not a recognized backslash escape sequence.
1466 // Take the next char as a literal.
1467 // TODO: Should this be an error?
1468 c
= testString
.charAt(charIdx
);
1469 charIdx
= testString
.moveIndex32(charIdx
, 1);
1472 // Normal, non-escaped data char.
1473 tp
.dataToBreak
.append(c
);
1475 // Save the mapping from offset in the data to line/column numbers in
1476 // the original input file. Will be used for better error messages only.
1477 // If there's an expected break before this char, the slot in the mapping
1478 // vector will already be set for this char; don't overwrite it.
1479 if (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1480 tp
.srcLine
->addElement(lineNum
, status
);
1481 tp
.srcCol
->addElement(column
, status
);
1487 // We are parsing an expected numeric tag value, like <1234>,
1488 // within a chunk of data.
1489 if (u_isUWhiteSpace(c
)) {
1494 // Finished the number. Add the info to the expected break data,
1495 // and switch parse state back to doing plain data.
1496 parseState
= PARSE_DATA
;
1497 if (tagValue
== 0) {
1500 int32_t breakIdx
= tp
.dataToBreak
.length();
1501 tp
.expectedBreaks
->setSize(breakIdx
+1);
1502 tp
.expectedBreaks
->setElementAt(tagValue
, breakIdx
);
1503 tp
.srcLine
->setSize(breakIdx
+1);
1504 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1505 tp
.srcCol
->setSize(breakIdx
+1);
1506 tp
.srcCol
->setElementAt(column
, breakIdx
);
// Append the next decimal digit to the tag value being accumulated.
1511 tagValue
= tagValue
*10 + u_charDigitValue(c
);
1515 errln("Syntax Error in test file at line %d, col %d",
1517 parseState
= PARSE_COMMENT
;
1518 goto end_test
; // Stop the test
// Any ICU failure raised while handling this character stops the test.
1523 if (U_FAILURE(status
)) {
1524 dataerrln("ICU Error %s while parsing test file at line %d.",
1525 u_errorName(status
), lineNum
);
1526 status
= U_ZERO_ERROR
;
1527 goto end_test
; // Stop the test
// Release the expected-breaks vector allocated at the top of the function.
1534 delete tp
.expectedBreaks
;
1542 //-------------------------------------------------------------------------------
1544 // TestDictRules create a break iterator from source rules that includes a
1545 // dictionary range. Regression for bug #7130. Source rules
1546 // do not declare a break iterator type (word, line, sentence, etc.),
1547 // but the dictionary code, without a type, would loop.
1549 //-------------------------------------------------------------------------------
1550 void RBBITest::TestDictRules() {
// Rule source built around a $dictionary set defined as [a-z].
1551 const char *rules
= "$dictionary = [a-z]; \n"
1553 "$dictionary $dictionary; \n"
1555 "$dictionary $dictionary; \n";
// Text to iterate over: two characters, both inside the $dictionary set.
1556 const char *text
= "aa";
1557 UErrorCode status
= U_ZERO_ERROR
;
1558 UParseError parseError
;
// Build the iterator directly from the rule source.
1560 RuleBasedBreakIterator
bi(rules
, parseError
, status
);
1561 if (U_SUCCESS(status
)) {
1562 UnicodeString utext
= text
;
// Call next() at most 10 times, so an iterator that never reaches DONE
// cannot hang the test.
1566 for (loops
= 0; loops
<10; loops
++) {
1567 position
= bi
.next();
1568 if (position
== RuleBasedBreakIterator::DONE
) {
// The loop counter is expected to be 1 at this point.
// NOTE(review): presumably the loop is left with a break when DONE is
// seen; that statement is not visible in this listing.
1572 TEST_ASSERT(loops
== 1);
// Reports the status when the iterator could not be created from the rules.
1574 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status
));
1580 //-------------------------------------------------------------------------------
1582 // ReadAndConvertFile Read a text data file, convert it to UChars, and
1583 // return the data in one big UChar * buffer, which the caller must delete.
1586 // fileName: the name of the file, with no directory part. The test data directory
1588 // ulen an out parameter, receives the actual length (in UChars) of the file data.
1589 // encoding The file encoding. If the file contains a BOM, that will override the encoding
1590 // specified here. The BOM, if it exists, will be stripped from the returned data.
1591 // Pass NULL for the system default encoding.
1594 // The file data, converted to UChar.
1595 // The caller must delete this when done with
1596 // delete [] theBuffer;
1598 // TODO: This is a clone of RegexTest::ReadAndConvertFile.
1599 // Move this function to some common place.
1601 //--------------------------------------------------------------------------------
// NOTE(review): fileName is handed to fopen() exactly as given; the callers
// visible in this file build the full path (directory + name) before calling.
1602 UChar
*RBBITest::ReadAndConvertFile(const char *fileName
, int &ulen
, const char *encoding
, UErrorCode
&status
) {
// Everything that may need cleanup starts out as NULL.
1603 UChar
*retPtr
= NULL
;
1604 char *fileBuf
= NULL
;
1605 UConverter
* conv
= NULL
;
// Entry check on the incoming status.
1609 if (U_FAILURE(status
)) {
// Open the file in binary mode.
1616 f
= fopen(fileName
, "rb");
// Failure to open is reported and recorded as U_FILE_ACCESS_ERROR.
1618 dataerrln("Error opening test data file %s\n", fileName
);
1619 status
= U_FILE_ACCESS_ERROR
;
// Find the file size by seeking to the end, then rewind and read the whole
// file into a newly allocated buffer.
// NOTE(review): the buffer is allocated with fileSize before the
// "fileSize <= 0" check below; confirm how a failing ftell() is handled.
1628 fseek( f
, 0, SEEK_END
);
1629 fileSize
= ftell(f
);
1630 fileBuf
= new char[fileSize
];
1631 fseek(f
, 0, SEEK_SET
);
1632 amt_read
= fread(fileBuf
, 1, fileSize
, f
);
// A short read or an empty file is treated as an error.
1633 if (amt_read
!= fileSize
|| fileSize
<= 0) {
1634 errln("Error reading test data file.");
1635 goto cleanUpAndReturn
;
1639 // Look for a Unicode Signature (BOM) on the data just read
1641 int32_t signatureLength
;
1642 const char * fileBufC
;
1643 const char* bomEncoding
;
1646 bomEncoding
= ucnv_detectUnicodeSignature(
1647 fileBuf
, fileSize
, &signatureLength
, &status
);
// A signature was found: step over it, shrink the remaining size and use the
// encoding it indicates instead of the one passed by the caller.
// NOTE(review): the initial assignment of fileBufC is not visible in this listing.
1648 if(bomEncoding
!=NULL
){
1649 fileBufC
+= signatureLength
;
1650 fileSize
-= signatureLength
;
1651 encoding
= bomEncoding
;
1655 // Open a converter to take the rule file to UTF-16
1657 conv
= ucnv_open(encoding
, &status
);
1658 if (U_FAILURE(status
)) {
1659 goto cleanUpAndReturn
;
1663 // Convert the rules to UChar.
1664 // Preflight first to determine required buffer size.
1666 ulen
= ucnv_toUChars(conv
,
1672 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
1673 // Buffer Overflow is expected from the preflight operation.
1674 status
= U_ZERO_ERROR
;
// Allocate the result buffer: ulen UChars plus one extra element.
1676 retPtr
= new UChar
[ulen
+1];
// Report any ICU error left in status.
1689 if (U_FAILURE(status
)) {
1690 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
1700 //--------------------------------------------------------------------------------------------
1702 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
1704 //-------------------------------------------------------------------------------------------
1705 void RBBITest::TestUnicodeFiles() {
// One iterator variable and one status are reused for all four data files.
// Each section creates an English-locale iterator, asserts on the status and,
// on success, runs the matching Unicode test data file through it.
1706 RuleBasedBreakIterator
*bi
;
1707 UErrorCode status
= U_ZERO_ERROR
;
// Character (grapheme cluster) boundaries.
1709 bi
= (RuleBasedBreakIterator
*)BreakIterator::createCharacterInstance(Locale::getEnglish(), status
);
1710 TEST_ASSERT_SUCCESS(status
);
1711 if (U_SUCCESS(status
)) {
1712 runUnicodeTestData("GraphemeBreakTest.txt", bi
);
// Word boundaries.
1716 bi
= (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getEnglish(), status
);
1717 TEST_ASSERT_SUCCESS(status
);
1718 if (U_SUCCESS(status
)) {
1719 runUnicodeTestData("WordBreakTest.txt", bi
);
// Sentence boundaries.
1723 bi
= (RuleBasedBreakIterator
*)BreakIterator::createSentenceInstance(Locale::getEnglish(), status
);
1724 TEST_ASSERT_SUCCESS(status
);
1725 if (U_SUCCESS(status
)) {
1726 runUnicodeTestData("SentenceBreakTest.txt", bi
);
// Line break opportunities.
1730 bi
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale::getEnglish(), status
);
1731 TEST_ASSERT_SUCCESS(status
);
1732 if (U_SUCCESS(status
)) {
1733 runUnicodeTestData("LineBreakTest.txt", bi
);
1739 //--------------------------------------------------------------------------------------------
1741 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
1743 //-------------------------------------------------------------------------------------------
// Parameters:
//   fileName  name of the data file; it is appended to the test data directory.
//   bi        the break iterator that each test case is checked against.
// NOTE(review): several statements of this function are not visible in this
// listing; the comments added below describe only what the visible lines show.
1744 void RBBITest::runUnicodeTestData(const char *fileName
, RuleBasedBreakIterator
*bi
) {
1745 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1746 // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb. Ticket #7270
// isTicket7270Fixed is true once the ICU version is at least 50.0.
1747 UBool isTicket7270Fixed
= isICUVersionAtLeast(50, 0);
// isLineBreak is true only for "LineBreakTest.txt".
1748 UBool isLineBreak
= 0 == strcmp(fileName
, "LineBreakTest.txt");
1749 UErrorCode status
= U_ZERO_ERROR
;
1752 // Open and read the test data file, put it into a UnicodeString.
1754 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
1755 char testFileName
[1000];
// NOTE(review): this check compares only the directory length against the
// buffer size; it does not reserve room for fileName, which strcat()
// appends below.
1756 if (testDataDirectory
== NULL
|| strlen(testDataDirectory
) >= sizeof(testFileName
)) {
1757 dataerrln("Can't open test data. Path too long.");
1760 strcpy(testFileName
, testDataDirectory
);
1761 strcat(testFileName
, fileName
);
1763 logln("Opening data file %s\n", fileName
);
1766 UChar
*testFile
= ReadAndConvertFile(testFileName
, len
, "UTF-8", status
);
// The assertions are skipped for U_FILE_ACCESS_ERROR; every other outcome is asserted on.
1767 if (status
!= U_FILE_ACCESS_ERROR
) {
1768 TEST_ASSERT_SUCCESS(status
);
1769 TEST_ASSERT(testFile
!= NULL
);
1771 if (U_FAILURE(status
) || testFile
== NULL
) {
1772 return; /* something went wrong, error already output */
1774 UnicodeString
testFileAsString(TRUE
, testFile
, len
);
1777 // Parse the test data file using a regular expression.
1778 // Each kind of token is recognized in its own capture group; what type of item was scanned
1779 // is identified by which group had a match.
1781 // Capture Group # 1 2 3 4 5
1782 // Parses this item: divide x hex digits comment \n unrecognized \n
1784 UnicodeString
tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV
);
1785 RegexMatcher
tokenMatcher(tokenExpr
, testFileAsString
, UREGEX_MULTILINE
| UREGEX_DOTALL
, status
);
1786 UnicodeString testString
;
1787 UVector32
breakPositions(status
);
1789 TEST_ASSERT_SUCCESS(status
);
1790 if (U_FAILURE(status
)) {
1795 // Scan through each test case, building up the string to be broken in testString,
1796 // and the positions that should be boundaries in the breakPositions vector.
1799 while (tokenMatcher
.find()) {
1800 if(tokenMatcher
.hitEnd()) {
1801 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for.
1802 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1803 and caused an infinite loop here on EBCDIC systems!
1805 fprintf(stderr
,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName
, ++spin
);
1808 if (tokenMatcher
.start(1, status
) >= 0) {
1809 // Scanned a divide sign, indicating a break position in the test data.
1810 if (testString
.length()>0) {
1811 breakPositions
.addElement(testString
.length(), status
);
1814 else if (tokenMatcher
.start(2, status
) >= 0) {
1815 // Scanned an 'x', meaning no break at this position in the test data
1816 // Nothing to be done here.
1818 else if (tokenMatcher
.start(3, status
) >= 0) {
1819 // Scanned Hex digits. Convert them to binary, append to the character data string.
1820 const UnicodeString
&hexNumber
= tokenMatcher
.group(3, status
);
1821 int length
= hexNumber
.length();
1824 hexNumber
.extract (0, length
, buf
, sizeof(buf
), US_INV
);
// Parse the extracted digits as a base-16 number.
1825 UChar32 c
= (UChar32
)strtol(buf
, NULL
, 16);
1827 testString
.append(c
);
1829 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1830 fileName
, lineNumber
);
1833 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1834 fileName
, lineNumber
);
1837 else if (tokenMatcher
.start(4, status
) >= 0) {
1838 // Scanned to end of a line, possibly skipping over a comment in the process.
1839 // If the line from the file contained test data, run the test now.
1841 if (testString
.length() > 0) {
1842 // TODO(andy): Remove this time bomb code. Note: Line range updated for Unicode 6.1 LineBreakTest.txt.
// Line break cases on lines 5066..5170 are skipped until ticket 7270 is fixed.
1843 if (!isLineBreak
|| isTicket7270Fixed
|| !(5066 <= lineNumber
&& lineNumber
<= 5170)) {
1844 checkUnicodeTestCase(fileName
, lineNumber
, testString
, &breakPositions
, bi
);
1848 // Clear out this test case.
1849 // The string and breakPositions vector will be refilled as the next
1850 // test case is parsed.
1851 testString
.remove();
1852 breakPositions
.removeAllElements();
1855 // Scanner catchall. Something unrecognized appeared on the line.
1857 UnicodeString uToken
= tokenMatcher
.group(0, status
);
1858 uToken
.extract(0, uToken
.length(), token
, (uint32_t)sizeof(token
));
// Force termination of the token buffer before printing it.
1859 token
[sizeof(token
)-1] = 0;
1860 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName
, lineNumber
, token
);
1862 // Clean up, in preparation for continuing with the next line.
1863 testString
.remove();
1864 breakPositions
.removeAllElements();
1867 TEST_ASSERT_SUCCESS(status
);
1868 if (U_FAILURE(status
)) {
1874 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1877 //--------------------------------------------------------------------------------------------
1879 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1880 // test data files. Do only a simple, forward-only check -
1881 // this test is mostly to check that ICU and the Unicode
1882 // data agree with each other.
1884 //--------------------------------------------------------------------------------------------
// Parameters:
//   testFileName, lineNumber  used only in the error messages.
//   testString                the text handed to the iterator.
//   breakPositions            expected break positions, read in index order.
//   bi                        the iterator under test; its text is replaced here.
1885 void RBBITest::checkUnicodeTestCase(const char *testFileName
, int lineNumber
,
1886 const UnicodeString
&testString
, // Text data to be broken
1887 UVector32
*breakPositions
, // Positions where breaks should be found.
1888 RuleBasedBreakIterator
*bi
) {
1889 int32_t pos
; // Break Position in the test string
1890 int32_t expectedI
= 0; // Index of expected break position in the vector of expected results.
1891 int32_t expectedPos
; // Expected break position (index into test string)
1893 bi
->setText(testString
);
// Compare each break reported by the iterator with the next expected position.
// NOTE(review): the statements that obtain and advance pos are not visible in this listing.
1897 while (pos
!= BreakIterator::DONE
) {
// The iterator reported more breaks than the data file lists.
1898 if (expectedI
>= breakPositions
->size()) {
1899 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1900 testFileName
, lineNumber
, pos
);
1903 expectedPos
= breakPositions
->elementAti(expectedI
);
// The iterator broke before the next expected position.
1904 if (pos
< expectedPos
) {
1905 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1906 testFileName
, lineNumber
, pos
);
// The iterator went past an expected position without breaking there.
1909 if (pos
> expectedPos
) {
1910 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1911 testFileName
, lineNumber
, expectedPos
);
// The iterator finished while expected breaks were still outstanding.
1918 if (pos
==BreakIterator::DONE
&& expectedI
<breakPositions
->size()) {
1919 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1920 testFileName
, lineNumber
, breakPositions
->elementAti(expectedI
));
1926 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1927 //---------------------------------------------------------------------------------------
1929 // class RBBIMonkeyKind
1931 // Monkey Test for Break Iteration
1932 // Abstract interface class. Concrete derived classes independently
1933 // implement the break rules for different iterator types.
1935 // The Monkey Test itself doesn't know which type of break iterator it is
1936 // testing, but works purely in terms of the interface defined here.
1938 //---------------------------------------------------------------------------------------
// Interface with three pure virtual functions (charClasses, setText, next),
// a virtual destructor and a deferredStatus error code.
1939 class RBBIMonkeyKind
{
1941 // Return a UVector of UnicodeSets, representing the character classes used
1942 // for this type of iterator.
1943 virtual UVector
*charClasses() = 0;
1945 // Set the test text on which subsequent calls to next() will operate
1946 virtual void setText(const UnicodeString
&s
) = 0;
1948 // Find the next break position, starting from the prev break position, or from zero.
1949 // Return -1 after reaching end of string.
1950 virtual int32_t next(int32_t i
) = 0;
1952 virtual ~RBBIMonkeyKind();
// Error code set by the constructors visible in this file when building
// their sets fails; their next() functions test it first.
1953 UErrorCode deferredStatus
;
// Constructor: starts with no deferred error.
1962 RBBIMonkeyKind::RBBIMonkeyKind() {
1963 deferredStatus
= U_ZERO_ERROR
;
// Destructor: no statements are visible in this listing.
1966 RBBIMonkeyKind::~RBBIMonkeyKind() {
1970 //----------------------------------------------------------------------------------------
1972 // Random Numbers. Similar to standard lib rand() and srand()
1973 // Not using library to
1974 // 1. Get same results on all platforms.
1975 // 2. Get access to current seed, to more easily reproduce failures.
1977 //---------------------------------------------------------------------------------------
// Current state of the generator; the initial value is 1.
1978 static uint32_t m_seed
= 1;
// Advances m_seed and returns a value derived from it.
1980 static uint32_t m_rand()
// Linear congruential step: seed = seed * 1103515245 + 12345 (uint32_t arithmetic).
1982 m_seed
= m_seed
* 1103515245 + 12345;
// The result is (seed / 65536) % 32768, so it lies in the range 0..32767.
1983 return (uint32_t)(m_seed
/65536) % 32768;
1987 //------------------------------------------------------------------------------------------
1989 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
1990 // of RBBIMonkeyKind.
1992 //------------------------------------------------------------------------------------------
// Declares the RBBIMonkeyKind overrides plus the UnicodeSet members that the
// constructor fills in and next() consults.
// NOTE(review): some member declarations are not visible in this listing.
1993 class RBBICharMonkey
: public RBBIMonkeyKind
{
1996 virtual ~RBBICharMonkey();
1997 virtual UVector
*charClasses();
1998 virtual void setText(const UnicodeString
&s
);
1999 virtual int32_t next(int32_t i
);
// One set per character class; the constructor builds most of them from
// Grapheme_Cluster_Break property values.
2003 UnicodeSet
*fCRLFSet
;
2004 UnicodeSet
*fControlSet
;
2005 UnicodeSet
*fExtendSet
;
2006 UnicodeSet
*fPrependSet
;
2007 UnicodeSet
*fSpacingSet
;
2012 UnicodeSet
*fLVTSet
;
// Union of the L, V, T, LV and LVT sets (built in the constructor).
2013 UnicodeSet
*fHangulSet
;
// The full code point range 0..0x10ffff (built in the constructor).
2014 UnicodeSet
*fAnySet
;
// The text that next() examines.
2016 const UnicodeString
*fText
;
// Constructor: builds the character class sets and the fSets vector.
// Any failure is stored in deferredStatus rather than reported here.
2020 RBBICharMonkey::RBBICharMonkey() {
2021 UErrorCode status
= U_ZERO_ERROR
;
// CR and LF, then one set per Grapheme_Cluster_Break property value.
2025 fCRLFSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status
);
2026 fControlSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status
);
2027 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status
);
2028 fPrependSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status
);
2029 fSpacingSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status
);
2030 fLSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status
);
2031 fVSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status
);
2032 fTSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status
);
2033 fLVSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status
);
2034 fLVTSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status
);
// fHangulSet is the union of the L, V, T, LV and LVT sets.
2035 fHangulSet
= new UnicodeSet();
2036 fHangulSet
->addAll(*fLSet
);
2037 fHangulSet
->addAll(*fVSet
);
2038 fHangulSet
->addAll(*fTSet
);
2039 fHangulSet
->addAll(*fLVSet
);
2040 fHangulSet
->addAll(*fLVTSet
);
// fAnySet covers every code point from 0 through 0x10ffff.
2041 fAnySet
= new UnicodeSet(0, 0x10ffff);
// Collect the sets into fSets. The individual L/V/T/LV/LVT sets are not
// added; only their union, fHangulSet, is.
2043 fSets
= new UVector(status
);
2044 fSets
->addElement(fCRLFSet
, status
);
2045 fSets
->addElement(fControlSet
, status
);
2046 fSets
->addElement(fExtendSet
, status
);
// The Prepend set is added only when it is not empty.
2047 if (!fPrependSet
->isEmpty()) {
2048 fSets
->addElement(fPrependSet
, status
);
2050 fSets
->addElement(fSpacingSet
, status
);
2051 fSets
->addElement(fHangulSet
, status
);
2052 fSets
->addElement(fAnySet
, status
);
// Remember any failure so that next() can test it.
2053 if (U_FAILURE(status
)) {
2054 deferredStatus
= status
;
// Override of RBBIMonkeyKind::setText().
// NOTE(review): the body is not visible in this listing; presumably it stores
// the address of s in fText — confirm.
2059 void RBBICharMonkey::setText(const UnicodeString
&s
) {
// Finds the next break after prevPos by testing character pairs against the
// rules named in the comments below (GB4 through GB10).
// NOTE(review): the statements executed when a rule matches (break / continue
// / return) are not visible in this listing.
2065 int32_t RBBICharMonkey::next(int32_t prevPos
) {
2066 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2067 // break position being tested. The candidate break
2068 // location is before p2.
2072 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
// Check for an error recorded by the constructor.
2074 if (U_FAILURE(deferredStatus
)) {
2078 // Previous break at end of string. return DONE.
2079 if (prevPos
>= fText
->length()) {
// All four positions start at the previous break; c3 is the code point there.
2082 p0
= p1
= p2
= p3
= prevPos
;
2083 c3
= fText
->char32At(prevPos
);
2086 // Loop runs once per "significant" character position in the input text.
2088 // Move all of the positions forward in the input string.
2093 // Advance p3 by one codepoint
2094 p3
= fText
->moveIndex32(p3
, 1);
2095 c3
= fText
->char32At(p3
);
2098 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2101 if (p2
== fText
->length()) {
2102 // Reached end of string. Always a break position.
2107 // No Extend or Format characters may appear between the CR and LF,
2108 // which requires the additional check for p2 immediately following p1.
2110 if (c1
==0x0D && c2
==0x0A && p1
==(p2
-1)) {
2114 // Rule (GB4). ( Control | CR | LF ) <break>
2115 if (fControlSet
->contains(c1
) ||
2121 // Rule (GB5) <break> ( Control | CR | LF )
2123 if (fControlSet
->contains(c2
) ||
2130 // Rule (GB6) L x ( L | V | LV | LVT )
2131 if (fLSet
->contains(c1
) &&
2132 (fLSet
->contains(c2
) ||
2133 fVSet
->contains(c2
) ||
2134 fLVSet
->contains(c2
) ||
2135 fLVTSet
->contains(c2
))) {
2139 // Rule (GB7) ( LV | V ) x ( V | T )
2140 if ((fLVSet
->contains(c1
) || fVSet
->contains(c1
)) &&
2141 (fVSet
->contains(c2
) || fTSet
->contains(c2
))) {
2145 // Rule (GB8) ( LVT | T) x T
2146 if ((fLVTSet
->contains(c1
) || fTSet
->contains(c1
)) &&
2147 fTSet
->contains(c2
)) {
2151 // Rule (GB9) x Extend
2152 if (fExtendSet
->contains(c2
)) {
2156 // Rule (GB9a) x SpacingMark
2157 if (fSpacingSet
->contains(c2
)) {
2161 // Rule (GB9b) Prepend x
2162 if (fPrependSet
->contains(c1
)) {
2166 // Rule (GB10) Any <break> Any
// Override of RBBIMonkeyKind::charClasses().
// NOTE(review): the body is not visible in this listing; presumably it returns fSets — confirm.
2176 UVector
*RBBICharMonkey::charClasses() {
// Destructor. NOTE(review): the body is not visible in this listing.
2181 RBBICharMonkey::~RBBICharMonkey() {
2197 //------------------------------------------------------------------------------------------
2199 // class RBBIWordMonkey Word Break specific implementation
2200 // of RBBIMonkeyKind.
2202 //------------------------------------------------------------------------------------------
// Declares the RBBIMonkeyKind overrides plus the UnicodeSet members that the
// constructor builds from Word_Break property values.
// NOTE(review): some member declarations are not visible in this listing.
2203 class RBBIWordMonkey
: public RBBIMonkeyKind
{
2206 virtual ~RBBIWordMonkey();
2207 virtual UVector
*charClasses();
2208 virtual void setText(const UnicodeString
&s
);
2209 virtual int32_t next(int32_t i
);
// One set per Word_Break property value.
2215 UnicodeSet
*fNewlineSet
;
2216 UnicodeSet
*fKatakanaSet
;
2217 UnicodeSet
*fALetterSet
;
2218 UnicodeSet
*fMidNumLetSet
;
2219 UnicodeSet
*fMidLetterSet
;
2220 UnicodeSet
*fMidNumSet
;
2221 UnicodeSet
*fNumericSet
;
2222 UnicodeSet
*fFormatSet
;
// Built in the constructor as the complement of an empty set with the other
// classes removed from it.
2223 UnicodeSet
*fOtherSet
;
2224 UnicodeSet
*fExtendSet
;
2225 UnicodeSet
*fExtendNumLetSet
;
2227 RegexMatcher
*fMatcher
;
// The text that next() examines.
2229 const UnicodeString
*fText
;
// Constructor: builds one UnicodeSet per Word_Break property value, derives
// fOtherSet from them and collects the sets into fSets.
// Any failure is stored in deferredStatus rather than reported here.
2233 RBBIWordMonkey::RBBIWordMonkey()
2235 UErrorCode status
= U_ZERO_ERROR
;
2237 fSets
= new UVector(status
);
2239 fCRSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status
);
2240 fLFSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status
);
2241 fNewlineSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status
);
2242 fALetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status
);
2243 fKatakanaSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status
);
2244 fMidNumLetSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status
);
2245 fMidLetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status
);
2246 fMidNumSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status
);
2247 fNumericSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status
);
2248 fFormatSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status
);
2249 fExtendNumLetSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status
);
2250 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status
);
2252 fOtherSet
= new UnicodeSet();
// Record a failure from building the property sets above.
2253 if(U_FAILURE(status
)) {
2254 deferredStatus
= status
;
// fOtherSet starts as the complement of the empty set, then has the
// classes below removed from it.
// NOTE(review): fMidNumLetSet is not removed here although it is added to
// fSets below — confirm whether that is intentional.
2258 fOtherSet
->complement();
2259 fOtherSet
->removeAll(*fCRSet
);
2260 fOtherSet
->removeAll(*fLFSet
);
2261 fOtherSet
->removeAll(*fNewlineSet
);
2262 fOtherSet
->removeAll(*fKatakanaSet
);
2263 fOtherSet
->removeAll(*fALetterSet
);
2264 fOtherSet
->removeAll(*fMidLetterSet
);
2265 fOtherSet
->removeAll(*fMidNumSet
);
2266 fOtherSet
->removeAll(*fNumericSet
);
2267 fOtherSet
->removeAll(*fExtendNumLetSet
);
2268 fOtherSet
->removeAll(*fFormatSet
);
2269 fOtherSet
->removeAll(*fExtendSet
);
2270 // Inhibit dictionary characters from being tested at all.
2271 fOtherSet
->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status
));
// Collect the sets into fSets.
2273 fSets
->addElement(fCRSet
, status
);
2274 fSets
->addElement(fLFSet
, status
);
2275 fSets
->addElement(fNewlineSet
, status
);
2276 fSets
->addElement(fALetterSet
, status
);
2277 fSets
->addElement(fKatakanaSet
, status
);
2278 fSets
->addElement(fMidLetterSet
, status
);
2279 fSets
->addElement(fMidNumLetSet
, status
);
2280 fSets
->addElement(fMidNumSet
, status
);
2281 fSets
->addElement(fNumericSet
, status
);
2282 fSets
->addElement(fFormatSet
, status
);
2283 fSets
->addElement(fExtendSet
, status
);
2284 fSets
->addElement(fOtherSet
, status
);
2285 fSets
->addElement(fExtendNumLetSet
, status
);
// Remember any failure so that next() can test it.
2287 if (U_FAILURE(status
)) {
2288 deferredStatus
= status
;
2292 void RBBIWordMonkey::setText(const UnicodeString
&s
) {
2297 int32_t RBBIWordMonkey::next(int32_t prevPos
) {
2298 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2299 // break position being tested. The candidate break
2300 // location is before p2.
2304 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
2306 if (U_FAILURE(deferredStatus
)) {
2310 // Prev break at end of string. return DONE.
2311 if (prevPos
>= fText
->length()) {
2314 p0
= p1
= p2
= p3
= prevPos
;
2315 c3
= fText
->char32At(prevPos
);
2318 // Loop runs once per "significant" character position in the input text.
2320 // Move all of the positions forward in the input string.
2325 // Advance p3 by X(Extend | Format)* Rule 4
2326 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2328 p3
= fText
->moveIndex32(p3
, 1);
2329 c3
= fText
->char32At(p3
);
2330 if (fCRSet
->contains(c2
) || fLFSet
->contains(c2
) || fNewlineSet
->contains(c2
)) {
2334 while (fFormatSet
->contains(c3
) || fExtendSet
->contains(c3
));
2338 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2341 if (p2
== fText
->length()) {
2342 // Reached end of string. Always a break position.
2347 // No Extend or Format characters may appear between the CR and LF,
2348 // which requires the additional check for p2 immediately following p1.
2350 if (c1
==0x0D && c2
==0x0A) {
2354 // Rule (3a) Break before and after newlines (including CR and LF)
2356 if (fCRSet
->contains(c1
) || fLFSet
->contains(c1
) || fNewlineSet
->contains(c1
)) {
2359 if (fCRSet
->contains(c2
) || fLFSet
->contains(c2
) || fNewlineSet
->contains(c2
)) {
2363 // Rule (5). ALetter x ALetter
2364 if (fALetterSet
->contains(c1
) &&
2365 fALetterSet
->contains(c2
)) {
2369 // Rule (6) ALetter x (MidLetter | MidNumLet) ALetter
2371 if ( fALetterSet
->contains(c1
) &&
2372 (fMidLetterSet
->contains(c2
) || fMidNumLetSet
->contains(c2
)) &&
2373 fALetterSet
->contains(c3
)) {
2378 // Rule (7) ALetter (MidLetter | MidNumLet) x ALetter
2379 if (fALetterSet
->contains(c0
) &&
2380 (fMidLetterSet
->contains(c1
) || fMidNumLetSet
->contains(c1
)) &&
2381 fALetterSet
->contains(c2
)) {
2385 // Rule (8) Numeric x Numeric
2386 if (fNumericSet
->contains(c1
) &&
2387 fNumericSet
->contains(c2
)) {
2391 // Rule (9) ALetter x Numeric
2392 if (fALetterSet
->contains(c1
) &&
2393 fNumericSet
->contains(c2
)) {
2397 // Rule (10) Numeric x ALetter
2398 if (fNumericSet
->contains(c1
) &&
2399 fALetterSet
->contains(c2
)) {
2403 // Rule (11) Numeric (MidNum | MidNumLet) x Numeric
2404 if (fNumericSet
->contains(c0
) &&
2405 (fMidNumSet
->contains(c1
) || fMidNumLetSet
->contains(c1
)) &&
2406 fNumericSet
->contains(c2
)) {
2410 // Rule (12) Numeric x (MidNum | MidNumLet) Numeric
2411 if (fNumericSet
->contains(c1
) &&
2412 (fMidNumSet
->contains(c2
) || fMidNumLetSet
->contains(c2
)) &&
2413 fNumericSet
->contains(c3
)) {
2417 // Rule (13) Katakana x Katakana
2418 if (fKatakanaSet
->contains(c1
) &&
2419 fKatakanaSet
->contains(c2
)) {
2424 if ((fALetterSet
->contains(c1
) || fNumericSet
->contains(c1
) ||
2425 fKatakanaSet
->contains(c1
) || fExtendNumLetSet
->contains(c1
)) &&
2426 fExtendNumLetSet
->contains(c2
)) {
2431 if (fExtendNumLetSet
->contains(c1
) &&
2432 (fALetterSet
->contains(c2
) || fNumericSet
->contains(c2
) ||
2433 fKatakanaSet
->contains(c2
))) {
2437 // Rule 14. Break found here.
2446 UVector
*RBBIWordMonkey::charClasses() {
2451 RBBIWordMonkey::~RBBIWordMonkey() {
2456 delete fKatakanaSet
;
2458 delete fMidNumLetSet
;
2459 delete fMidLetterSet
;
2464 delete fExtendNumLetSet
;
2471 //------------------------------------------------------------------------------------------
2473 // class RBBISentMonkey Sentence Break specific implementation
2474 // of RBBIMonkeyKind.
2476 //------------------------------------------------------------------------------------------
// Sentence-break flavour of the monkey-test reference implementation. Derives from
// RBBIMonkeyKind and declares virtual charClasses(), setText() and next().
// NOTE(review): this excerpt omits some lines of the class (for example the
// constructor declaration and the fSpSet member that the constructor assigns).
2477 class RBBISentMonkey
: public RBBIMonkeyKind
{
2480 virtual ~RBBISentMonkey();
2481 virtual UVector
*charClasses();
2482 virtual void setText(const UnicodeString
&s
);
2483 virtual int32_t next(int32_t i
);
// Index helpers: step backward / forward from a position, and read the code
// point at a position with a range check.
2485 int moveBack(int posFrom
);
2486 int moveForward(int posFrom
);
2487 UChar32
cAt(int pos
);
// Character-class sets. The constructor fills each from the matching
// \p{Sentence_Break = ...} pattern; fOtherSet is built there as the complement
// of all the others.
2491 UnicodeSet
*fSepSet
;
2492 UnicodeSet
*fFormatSet
;
2494 UnicodeSet
*fLowerSet
;
2495 UnicodeSet
*fUpperSet
;
2496 UnicodeSet
*fOLetterSet
;
2497 UnicodeSet
*fNumericSet
;
2498 UnicodeSet
*fATermSet
;
2499 UnicodeSet
*fSContinueSet
;
2500 UnicodeSet
*fSTermSet
;
2501 UnicodeSet
*fCloseSet
;
2502 UnicodeSet
*fOtherSet
;
2503 UnicodeSet
*fExtendSet
;
// Pointer to the text being examined by next() and the index helpers.
2505 const UnicodeString
*fText
;
2509 RBBISentMonkey::RBBISentMonkey()
2511 UErrorCode status
= U_ZERO_ERROR
;
2513 fSets
= new UVector(status
);
2515 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2516 // set and made into character classes of their own. For the monkey impl,
2517 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2518 fSepSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status
);
2519 fFormatSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status
);
2520 fSpSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status
);
2521 fLowerSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status
);
2522 fUpperSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status
);
2523 fOLetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status
);
2524 fNumericSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status
);
2525 fATermSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status
);
2526 fSContinueSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status
);
2527 fSTermSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status
);
2528 fCloseSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status
);
2529 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status
);
2530 fOtherSet
= new UnicodeSet();
2532 if(U_FAILURE(status
)) {
2533 deferredStatus
= status
;
2537 fOtherSet
->complement();
2538 fOtherSet
->removeAll(*fSepSet
);
2539 fOtherSet
->removeAll(*fFormatSet
);
2540 fOtherSet
->removeAll(*fSpSet
);
2541 fOtherSet
->removeAll(*fLowerSet
);
2542 fOtherSet
->removeAll(*fUpperSet
);
2543 fOtherSet
->removeAll(*fOLetterSet
);
2544 fOtherSet
->removeAll(*fNumericSet
);
2545 fOtherSet
->removeAll(*fATermSet
);
2546 fOtherSet
->removeAll(*fSContinueSet
);
2547 fOtherSet
->removeAll(*fSTermSet
);
2548 fOtherSet
->removeAll(*fCloseSet
);
2549 fOtherSet
->removeAll(*fExtendSet
);
2551 fSets
->addElement(fSepSet
, status
);
2552 fSets
->addElement(fFormatSet
, status
);
2553 fSets
->addElement(fSpSet
, status
);
2554 fSets
->addElement(fLowerSet
, status
);
2555 fSets
->addElement(fUpperSet
, status
);
2556 fSets
->addElement(fOLetterSet
, status
);
2557 fSets
->addElement(fNumericSet
, status
);
2558 fSets
->addElement(fATermSet
, status
);
2559 fSets
->addElement(fSContinueSet
, status
);
2560 fSets
->addElement(fSTermSet
, status
);
2561 fSets
->addElement(fCloseSet
, status
);
2562 fSets
->addElement(fOtherSet
, status
);
2563 fSets
->addElement(fExtendSet
, status
);
2565 if (U_FAILURE(status
)) {
2566 deferredStatus
= status
;
2572 void RBBISentMonkey::setText(const UnicodeString
&s
) {
2576 UVector
*RBBISentMonkey::charClasses() {
2581 // moveBack() Find the "significant" code point preceding the index i.
2582 // Skips over ($Extend | $Format)* .
//     The scan below repeatedly steps j back by one code point and re-reads c,
//     continuing while j is still above 0 and c belongs to fFormatSet or fExtendSet.
//     NOTE(review): the setup of j and the return statement are not part of this
//     excerpt; presumably j starts at i and the final j is returned -- confirm.
2584 int RBBISentMonkey::moveBack(int i
) {
// Step back one code point (moveIndex32 with a delta of -1) and fetch it.
2591 j
= fText
->moveIndex32(j
, -1);
2592 c
= fText
->char32At(j
);
// Stop at index 0 or at the first code point that is neither Format nor Extend.
2594 while (j
>0 &&(fFormatSet
->contains(c
) || fExtendSet
->contains(c
)));
// moveForward()  Advance from index i, stepping one code point at a time while the
//                code point examined is in fFormatSet or fExtendSet.
//     NOTE(review): only part of the body is visible here; the initialization of j,
//     the assignment of c and the final return are not shown -- confirm before
//     relying on details.
2600 int RBBISentMonkey::moveForward(int i
) {
// At or beyond the end of the text: the result is the text length.
2601 if (i
>=fText
->length()) {
2602 return fText
->length();
// Step forward one code point (moveIndex32 with a delta of +1).
2607 j
= fText
->moveIndex32(j
, 1);
// Keep stepping while the current code point is Format or Extend.
2610 while (fFormatSet
->contains(c
) || fExtendSet
->contains(c
));
// cAt()  Range-checked read of the code point at index pos in fText.
//     NOTE(review): the value returned for an out-of-range pos is not visible in this
//     excerpt; next() compares a code point against -1, so presumably -1 -- confirm.
2614 UChar32
RBBISentMonkey::cAt(int pos
) {
// Out-of-range guard: negative index, or index at/after the end of the text.
2615 if (pos
<0 || pos
>=fText
->length()) {
// In range: return the full code point at pos.
2618 return fText
->char32At(pos
);
2622 int32_t RBBISentMonkey::next(int32_t prevPos
) {
2623 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2624 // break position being tested. The candidate break
2625 // location is before p2.
2629 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
2632 if (U_FAILURE(deferredStatus
)) {
2636 // Prev break at end of string. return DONE.
2637 if (prevPos
>= fText
->length()) {
2640 p0
= p1
= p2
= p3
= prevPos
;
2641 c3
= fText
->char32At(prevPos
);
2644 // Loop runs once per "significant" character position in the input text.
2646 // Move all of the positions forward in the input string.
2651 // Advance p3 by X(Extend | Format)* Rule 4
2652 p3
= moveForward(p3
);
2656 if (c1
==0x0d && c2
==0x0a && p2
==(p1
+1)) {
2660 // Rule (4). Sep <break>
2661 if (fSepSet
->contains(c1
)) {
2662 p2
= p1
+1; // Separators don't combine with Extend or Format.
2666 if (p2
>= fText
->length()) {
2667 // Reached end of string. Always a break position.
2671 if (p2
== prevPos
) {
2672 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2676 // Rule (6). ATerm x Numeric
2677 if (fATermSet
->contains(c1
) && fNumericSet
->contains(c2
)) {
2681 // Rule (7). Upper ATerm x Upper
2682 if (fUpperSet
->contains(c0
) && fATermSet
->contains(c1
) && fUpperSet
->contains(c2
)) {
2686 // Rule (8) ATerm Close* Sp* x (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
2687 // Note: STerm | ATerm are added to the negated part of the expression by a
2688 // note to the Unicode 5.0 documents.
2690 while (fSpSet
->contains(cAt(p8
))) {
2693 while (fCloseSet
->contains(cAt(p8
))) {
2696 if (fATermSet
->contains(cAt(p8
))) {
2700 if (c
==-1 || fOLetterSet
->contains(c
) || fUpperSet
->contains(c
) ||
2701 fLowerSet
->contains(c
) || fSepSet
->contains(c
) ||
2702 fATermSet
->contains(c
) || fSTermSet
->contains(c
)) {
2705 p8
= moveForward(p8
);
2707 if (fLowerSet
->contains(cAt(p8
))) {
2712 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2713 if (fSContinueSet
->contains(c2
) || fSTermSet
->contains(c2
) || fATermSet
->contains(c2
)) {
2715 while (fSpSet
->contains(cAt(p8
))) {
2718 while (fCloseSet
->contains(cAt(p8
))) {
2722 if (fSTermSet
->contains(c
) || fATermSet
->contains(c
)) {
2727 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
2729 while (fCloseSet
->contains(cAt(p9
))) {
2733 if ((fSTermSet
->contains(c
) || fATermSet
->contains(c
))) {
2734 if (fCloseSet
->contains(c2
) || fSpSet
->contains(c2
) || fSepSet
->contains(c2
)) {
2739 // Rule (10) (STerm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
2741 while (fSpSet
->contains(cAt(p10
))) {
2742 p10
= moveBack(p10
);
2744 while (fCloseSet
->contains(cAt(p10
))) {
2745 p10
= moveBack(p10
);
2747 if (fSTermSet
->contains(cAt(p10
)) || fATermSet
->contains(cAt(p10
))) {
2748 if (fSpSet
->contains(c2
) || fSepSet
->contains(c2
)) {
2753 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>
2755 if (fSepSet
->contains(cAt(p11
))) {
2756 p11
= moveBack(p11
);
2758 while (fSpSet
->contains(cAt(p11
))) {
2759 p11
= moveBack(p11
);
2761 while (fCloseSet
->contains(cAt(p11
))) {
2762 p11
= moveBack(p11
);
2764 if (fSTermSet
->contains(cAt(p11
)) || fATermSet
->contains(cAt(p11
))) {
2768 // Rule (12) Any x Any
2775 RBBISentMonkey::~RBBISentMonkey() {
2785 delete fSContinueSet
;
2794 //-------------------------------------------------------------------------------------------
2798 //-------------------------------------------------------------------------------------------
2800 class RBBILineMonkey
: public RBBIMonkeyKind
{
2803 virtual ~RBBILineMonkey();
2804 virtual UVector
*charClasses();
2805 virtual void setText(const UnicodeString
&s
);
2806 virtual int32_t next(int32_t i
);
2807 virtual void rule9Adjust(int32_t pos
, UChar32
*posChar
, int32_t *nextPos
, UChar32
*nextChar
);
2851 BreakIterator
*fCharBI
;
2853 const UnicodeString
*fText
;
2854 int32_t *fOrigPositions
;
2856 RegexMatcher
*fNumberMatcher
;
2857 RegexMatcher
*fLB11Matcher
;
2861 RBBILineMonkey::RBBILineMonkey()
2863 UErrorCode status
= U_ZERO_ERROR
;
2865 fSets
= new UVector(status
);
2867 fBK
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status
);
2868 fCR
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status
);
2869 fLF
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status
);
2870 fCM
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status
);
2871 fNL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status
);
2872 fWJ
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status
);
2873 fZW
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status
);
2874 fGL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status
);
2875 fCB
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status
);
2876 fSP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status
);
2877 fB2
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status
);
2878 fBA
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status
);
2879 fBB
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status
);
2880 fHY
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status
);
2881 fH2
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status
);
2882 fH3
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status
);
2883 fCL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status
);
2884 fCP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status
);
2885 fEX
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status
);
2886 fIN
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status
);
2887 fJL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status
);
2888 fJV
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status
);
2889 fJT
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status
);
2890 fNS
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status
);
2891 fOP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status
);
2892 fQU
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status
);
2893 fIS
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status
);
2894 fNU
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status
);
2895 fPO
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status
);
2896 fPR
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status
);
2897 fSY
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status
);
2898 fAI
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status
);
2899 fAL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status
);
2900 fCJ
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status
);
2901 fHL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status
);
2902 fID
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status
);
2903 fSA
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status
);
2904 fSG
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status
);
2905 fXX
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status
);
2907 if (U_FAILURE(status
)) {
2908 deferredStatus
= status
;
2910 fNumberMatcher
= NULL
;
2914 fAL
->addAll(*fXX
); // Default behavior for XX is identical to AL
2915 fAL
->addAll(*fAI
); // Default behavior for AI is identical to AL
2916 fAL
->addAll(*fSA
); // Default behavior for SA is XX, which defaults to AL
2917 fAL
->addAll(*fSG
); // Default behavior for SG is identical to AL.
2919 fNS
->addAll(*fCJ
); // Default behavior for CJ is identical to NS.
2921 fSets
->addElement(fBK
, status
);
2922 fSets
->addElement(fCR
, status
);
2923 fSets
->addElement(fLF
, status
);
2924 fSets
->addElement(fCM
, status
);
2925 fSets
->addElement(fNL
, status
);
2926 fSets
->addElement(fWJ
, status
);
2927 fSets
->addElement(fZW
, status
);
2928 fSets
->addElement(fGL
, status
);
2929 fSets
->addElement(fCB
, status
);
2930 fSets
->addElement(fSP
, status
);
2931 fSets
->addElement(fB2
, status
);
2932 fSets
->addElement(fBA
, status
);
2933 fSets
->addElement(fBB
, status
);
2934 fSets
->addElement(fHY
, status
);
2935 fSets
->addElement(fH2
, status
);
2936 fSets
->addElement(fH3
, status
);
2937 fSets
->addElement(fCL
, status
);
2938 fSets
->addElement(fCP
, status
);
2939 fSets
->addElement(fEX
, status
);
2940 fSets
->addElement(fIN
, status
);
2941 fSets
->addElement(fJL
, status
);
2942 fSets
->addElement(fJT
, status
);
2943 fSets
->addElement(fJV
, status
);
2944 fSets
->addElement(fNS
, status
);
2945 fSets
->addElement(fOP
, status
);
2946 fSets
->addElement(fQU
, status
);
2947 fSets
->addElement(fIS
, status
);
2948 fSets
->addElement(fNU
, status
);
2949 fSets
->addElement(fPO
, status
);
2950 fSets
->addElement(fPR
, status
);
2951 fSets
->addElement(fSY
, status
);
2952 fSets
->addElement(fAI
, status
);
2953 fSets
->addElement(fAL
, status
);
2954 fSets
->addElement(fHL
, status
);
2955 fSets
->addElement(fID
, status
);
2956 fSets
->addElement(fWJ
, status
);
2957 fSets
->addElement(fSA
, status
);
2958 fSets
->addElement(fSG
, status
);
2961 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
2962 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
2963 "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
2964 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
2965 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
2966 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
2968 fNumberMatcher
= new RegexMatcher(
2969 UnicodeString(rules
, -1, US_INV
), 0, status
);
2971 fCharBI
= BreakIterator::createCharacterInstance(Locale::getEnglish(), status
);
2973 if (U_FAILURE(status
)) {
2974 deferredStatus
= status
;
// setText()  Point the helper objects used by the line-break monkey at new text s.
//     NOTE(review): not every line of the body is visible in this excerpt (fText is
//     presumably updated here as well) -- confirm against the full source.
2979 void RBBILineMonkey::setText(const UnicodeString
&s
) {
// Hand the new text to the character break iterator.
2981 fCharBI
->setText(s
);
// Reset the number-matching regular expression onto the new text.
2982 fNumberMatcher
->reset(s
);
2987 // Line Break TR rules 9 and 10 implementation.
2988 // This deals with combining marks and other sequences that
2989 // must be treated as if they were something other than what they actually are.
2991 // This is factored out into a separate function because it must be applied twice for
2992 // each potential break, once to the chars before the position being checked, then
2993 // again to the text following the possible break.
//
//   pos       index associated with *posChar (can be an invalid value during warm-up,
//             see the comment at the top of the body).
//   posChar   in/out: the character at pos; rewritten to 0x41 when it is itself a CM.
//   nextPos   in/out: index of the character after pos; read into the local nPos.
//   nextChar  out: receives the character at the (possibly advanced) nPos.
//   NOTE(review): several lines of the body (loop header, early returns, the write-back
//   to *nextPos) are missing from this excerpt.
2995 void RBBILineMonkey::rule9Adjust(int32_t pos
, UChar32
*posChar
, int32_t *nextPos
, UChar32
*nextChar
) {
2997 // Invalid initial position. Happens during the warmup iteration of the
2998 // main loop in next().
// Work on a local copy of the caller's next position.
3002 int32_t nPos
= *nextPos
;
3004 // LB 9 Keep combining sequences together.
3005 // advance over any CM class chars. Note that Line Break CM is different
3006 // from the normal Grapheme Extend property.
// CM absorption is skipped when *posChar is SP, BK, CR (0x0d), LF (0x0a), NL or ZW.
3007 if (!(fSP
->contains(*posChar
) || fBK
->contains(*posChar
) || *posChar
==0x0d ||
3008 *posChar
==0x0a ||fNL
->contains(*posChar
) || fZW
->contains(*posChar
))) {
// Look at the character at nPos; a non-CM character is tested for separately below.
3010 *nextChar
= fText
->char32At(nPos
);
3011 if (!fCM
->contains(*nextChar
)) {
// Step nPos forward by one code point.
3014 nPos
= fText
->moveIndex32(nPos
, 1);
3019 // LB 9 Treat X CM* as if it were X.
3020 // No explicit action required.
3022 // LB 10 Treat any remaining combining mark as AL
3023 if (fCM
->contains(*posChar
)) {
3024 *posChar
= 0x41; // 0x41 is 'A', substituted for the combining mark
3027 // Push the updated nextPos and nextChar back to our caller.
3028 // This only makes a difference if posChar got bigger by consuming a
3029 // combining sequence.
3031 *nextChar
= fText
->char32At(nPos
);
3036 int32_t RBBILineMonkey::next(int32_t startPos
) {
3037 UErrorCode status
= U_ZERO_ERROR
;
3038 int32_t pos
; // Index of the char following a potential break position
3039 UChar32 thisChar
; // Character at above position "pos"
3041 int32_t prevPos
; // Index of the char preceding a potential break position
3042 UChar32 prevChar
; // Character at above position. Note that prevChar
3043 // and thisChar may not be adjacent because combining
3044 // characters between them will be ignored.
3046 int32_t prevPosX2
; // Second previous character. Wider context for LB21a.
3049 int32_t nextPos
; // Index of the next character following pos.
3050 // Usually skips over combining marks.
3051 int32_t nextCPPos
; // Index of the code point following "pos."
3052 // May point to a combining mark.
3053 int32_t tPos
; // temp value.
3056 if (U_FAILURE(deferredStatus
)) {
3060 if (startPos
>= fText
->length()) {
3065 // Initial values for loop. Loop will run the first time without finding breaks,
3066 // while the invalid values shift out and the "this" and
3067 // "prev" positions are filled in with good values.
3068 pos
= prevPos
= prevPosX2
= -1; // Invalid value, serves as flag for initial loop iteration.
3069 thisChar
= prevChar
= prevCharX2
= 0;
3070 nextPos
= nextCPPos
= startPos
;
3073 // Loop runs once per position in the test text, until a break position
3076 prevPosX2
= prevPos
;
3077 prevCharX2
= prevChar
;
3080 prevChar
= thisChar
;
3083 thisChar
= fText
->char32At(pos
);
3085 nextCPPos
= fText
->moveIndex32(pos
, 1);
3086 nextPos
= nextCPPos
;
3088 // Rule LB2 - Break at end of text.
3089 if (pos
>= fText
->length()) {
3093 // Rule LB 9 - adjust for combining sequences.
3094 // We do this one out-of-order because the adjustment does not change anything
3095 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3097 rule9Adjust(prevPos
, &prevChar
, &pos
, &thisChar
);
3098 nextCPPos
= nextPos
= fText
->moveIndex32(pos
, 1);
3099 c
= fText
->char32At(nextPos
);
3100 rule9Adjust(pos
, &thisChar
, &nextPos
, &c
);
3102 // If the loop is still warming up - if we haven't shifted the initial
3103 // -1 positions out of prevPos yet - loop back to advance the
3104 // position in the input without any further looking for breaks.
3105 if (prevPos
== -1) {
3109 // LB 4 Always break after hard line breaks,
3110 if (fBK
->contains(prevChar
)) {
3114 // LB 5 Break after CR, LF, NL, but not inside CR LF
3115 if (prevChar
== 0x0d && thisChar
== 0x0a) {
3118 if (prevChar
== 0x0d ||
3124 // LB 6 Don't break before hard line breaks
3125 if (thisChar
== 0x0d || thisChar
== 0x0a || thisChar
== 0x85 ||
3126 fBK
->contains(thisChar
)) {
3131 // LB 7 Don't break before spaces or zero-width space.
3132 if (fSP
->contains(thisChar
)) {
3136 if (fZW
->contains(thisChar
)) {
3140 // LB 8 Break after zero width space
3141 if (fZW
->contains(prevChar
)) {
3145 // LB 9, 10 Already done, at top of loop.
3149 // LB 11 Do not break before or after WORD JOINER and related characters.
3153 if (fWJ
->contains(thisChar
) || fWJ
->contains(prevChar
)) {
3159 if (fGL
->contains(prevChar
)) {
3165 if (!(fSP
->contains(prevChar
) ||
3166 fBA
->contains(prevChar
) ||
3167 fHY
->contains(prevChar
) ) && fGL
->contains(thisChar
)) {
3173 // LB 13 Don't break before closings.
3174 // NU x CL, NU x CP and NU x IS are not matched here so that they will
3175 // fall into LB 17 and the more general number regular expression.
3177 if ((!fNU
->contains(prevChar
) && fCL
->contains(thisChar
)) ||
3178 (!fNU
->contains(prevChar
) && fCP
->contains(thisChar
)) ||
3179 fEX
->contains(thisChar
) ||
3180 (!fNU
->contains(prevChar
) && fIS
->contains(thisChar
)) ||
3181 (!fNU
->contains(prevChar
) && fSY
->contains(thisChar
))) {
3185 // LB 14 Don't break after OP SP*
3186 // Scan backwards, checking for this sequence.
3187 // The OP char could include combining marks, so we actually check for
3189 // Another Twist: The Rule 67 fixes may have changed a SP CM
3190 // sequence into a ID char, so before scanning back through spaces,
3191 // verify that prevChar is indeed a space. The prevChar variable
3192 // may differ from fText[prevPos]
3194 if (fSP
->contains(prevChar
)) {
3195 while (tPos
> 0 && fSP
->contains(fText
->char32At(tPos
))) {
3196 tPos
=fText
->moveIndex32(tPos
, -1);
3199 while (tPos
> 0 && fCM
->contains(fText
->char32At(tPos
))) {
3200 tPos
=fText
->moveIndex32(tPos
, -1);
3202 if (fOP
->contains(fText
->char32At(tPos
))) {
3207 // LB 15 QU SP* x OP
3208 if (fOP
->contains(thisChar
)) {
3209 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3211 while (tPos
>0 && fSP
->contains(fText
->char32At(tPos
))) {
3212 tPos
= fText
->moveIndex32(tPos
, -1);
3214 while (tPos
>0 && fCM
->contains(fText
->char32At(tPos
))) {
3215 tPos
= fText
->moveIndex32(tPos
, -1);
3217 if (fQU
->contains(fText
->char32At(tPos
))) {
3224 // LB 16 (CL | CP) SP* x NS
3225 // Scan backwards for SP* CM* (CL | CP)
3226 if (fNS
->contains(thisChar
)) {
3228 while (tPos
>0 && fSP
->contains(fText
->char32At(tPos
))) {
3229 tPos
= fText
->moveIndex32(tPos
, -1);
3231 while (tPos
>0 && fCM
->contains(fText
->char32At(tPos
))) {
3232 tPos
= fText
->moveIndex32(tPos
, -1);
3234 if (fCL
->contains(fText
->char32At(tPos
)) || fCP
->contains(fText
->char32At(tPos
))) {
3240 // LB 17 B2 SP* x B2
3241 if (fB2
->contains(thisChar
)) {
3242 // Scan backwards, checking for the B2 CM* SP* sequence.
3244 if (fSP
->contains(prevChar
)) {
3245 while (tPos
> 0 && fSP
->contains(fText
->char32At(tPos
))) {
3246 tPos
=fText
->moveIndex32(tPos
, -1);
3249 while (tPos
> 0 && fCM
->contains(fText
->char32At(tPos
))) {
3250 tPos
=fText
->moveIndex32(tPos
, -1);
3252 if (fB2
->contains(fText
->char32At(tPos
))) {
3258 // LB 18 break after space
3259 if (fSP
->contains(prevChar
)) {
3266 if (fQU
->contains(thisChar
) || fQU
->contains(prevChar
)) {
3270 // LB 20 Break around a CB
3271 if (fCB
->contains(thisChar
) || fCB
->contains(prevChar
)) {
3276 if (fBA
->contains(thisChar
) ||
3277 fHY
->contains(thisChar
) ||
3278 fNS
->contains(thisChar
) ||
3279 fBB
->contains(prevChar
) ) {
3285 if (fHL
->contains(prevCharX2
) &&
3286 (fHY
->contains(prevChar
) || fBA
->contains(prevChar
))) {
3291 if ((fAL
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3292 (fHL
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3293 (fID
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3294 (fIN
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3295 (fNU
->contains(prevChar
) && fIN
->contains(thisChar
)) ) {
3304 if ((fID
->contains(prevChar
) && fPO
->contains(thisChar
)) ||
3305 (fAL
->contains(prevChar
) && fNU
->contains(thisChar
)) ||
3306 (fHL
->contains(prevChar
) && fNU
->contains(thisChar
)) ||
3307 (fNU
->contains(prevChar
) && fAL
->contains(thisChar
)) ||
3308 (fNU
->contains(prevChar
) && fHL
->contains(thisChar
)) ) {
3312 // LB 24 Do not break between prefix and letters or ideographs.
3316 if ((fPR
->contains(prevChar
) && fID
->contains(thisChar
)) ||
3317 (fPR
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) ||
3318 (fPO
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
)))) {
3325 if (fNumberMatcher
->lookingAt(prevPos
, status
)) {
3326 if (U_FAILURE(status
)) {
3329 // Matched a number. But could have been just a single digit, which would
3330 // not represent a "no break here" between prevChar and thisChar
3331 int32_t numEndIdx
= fNumberMatcher
->end(status
); // idx of first char following num
3332 if (numEndIdx
> pos
) {
3333 // Number match includes at least our two chars being checked
3334 if (numEndIdx
> nextPos
) {
3335 // Number match includes additional chars. Update pos and nextPos
3336 // so that next loop iteration will continue at the end of the number,
3337 // checking for breaks between last char in number & whatever follows.
3338 pos
= nextPos
= numEndIdx
;
3340 pos
= fText
->moveIndex32(pos
, -1);
3341 thisChar
= fText
->char32At(pos
);
3342 } while (fCM
->contains(thisChar
));
3349 // LB 26 Do not break a Korean syllable.
3350 if (fJL
->contains(prevChar
) && (fJL
->contains(thisChar
) ||
3351 fJV
->contains(thisChar
) ||
3352 fH2
->contains(thisChar
) ||
3353 fH3
->contains(thisChar
))) {
3357 if ((fJV
->contains(prevChar
) || fH2
->contains(prevChar
)) &&
3358 (fJV
->contains(thisChar
) || fJT
->contains(thisChar
))) {
3362 if ((fJT
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3363 fJT
->contains(thisChar
)) {
3367 // LB 27 Treat a Korean Syllable Block the same as ID.
3368 if ((fJL
->contains(prevChar
) || fJV
->contains(prevChar
) ||
3369 fJT
->contains(prevChar
) || fH2
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3370 fIN
->contains(thisChar
)) {
3373 if ((fJL
->contains(prevChar
) || fJV
->contains(prevChar
) ||
3374 fJT
->contains(prevChar
) || fH2
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3375 fPO
->contains(thisChar
)) {
3378 if (fPR
->contains(prevChar
) && (fJL
->contains(thisChar
) || fJV
->contains(thisChar
) ||
3379 fJT
->contains(thisChar
) || fH2
->contains(thisChar
) || fH3
->contains(thisChar
))) {
3385 // LB 28 Do not break between alphabetics ("at").
3386 if ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
)) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) {
3390 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
3391 if (fIS
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) {
3395 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3398 if ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
) || fNU
->contains(prevChar
)) && fOP
->contains(thisChar
)) {
3401 if (fCP
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
) || fNU
->contains(thisChar
))) {
3405 // LB 31 Break everywhere else
3414 UVector
*RBBILineMonkey::charClasses() {
3419 RBBILineMonkey::~RBBILineMonkey() {
3463 delete fNumberMatcher
;
3467 //-------------------------------------------------------------------------------------------
3472 // seed=nnnnn Random number starting seed.
3473 // Setting the seed allows errors to be reproduced.
3474 // loop=nnn Looping count. Controls running time.
3476 // 0 or greater: run length.
3478 // type = char | word | line | sent | title
3480 //-------------------------------------------------------------------------------------------
// getIntParam()  Look for "<name> = <integer>" inside params and convert the integer.
//   name        parameter name; taken by value because a regex suffix is appended to it.
//   params      parameter string; when a match is processed, the matched text is
//               removed from it (see replaceFirst below).
//   defaultVal  initial value of the result.
//   NOTE(review): "¶ms" in the signature below is an extraction artifact of the
//   reference parameter "&params" (the body uses the name params). The match test
//   and the return statement are not visible in this excerpt.
3482 static int32_t getIntParam(UnicodeString name
, UnicodeString
¶ms
, int32_t defaultVal
) {
3483 int32_t val
= defaultVal
;
// Turn the name into a pattern: optional spaces around '=', then capture group 1
// holding an optionally negative run of digits.
3484 name
.append(" *= *(-?\\d+)");
3485 UErrorCode status
= U_ZERO_ERROR
;
3486 RegexMatcher
m(name
, params
, 0, status
);
3488 // The param exists. Convert the string to an int.
3489 char valString
[100];
// Length of capture group 1, limited so that it fits in valString.
3490 int32_t paramLength
= m
.end(1, status
) - m
.start(1, status
);
3491 if (paramLength
>= (int32_t)(sizeof(valString
)-1)) {
3492 paramLength
= (int32_t)(sizeof(valString
)-2);
// Copy the digits into the char buffer and parse them as a base-10 integer.
3494 params
.extract(m
.start(1, status
), paramLength
, valString
, sizeof(valString
));
3495 val
= strtol(valString
, NULL
, 10);
3497 // Delete this parameter from the params string.
3499 params
= m
.replaceFirst("", status
);
// All of the regex operations above are expected to have succeeded.
3501 U_ASSERT(U_SUCCESS(status
));
// testBreakBoundPreceding()  Check a break iterator against an expected list of break
// positions, in four passes: forward iteration, isBoundary(), backward iteration and
// preceding(). Failures are reported through test->errln().
//   NOTE(review): the remainder of the parameter list and the local declarations
//   (bi, expected, expectedcount, count, forward, i) are not visible in this excerpt.
3506 static void testBreakBoundPreceding(RBBITest
*test
, UnicodeString ustr
,
// Pass 1: walk forward with first()/next() and compare each break with expected[].
3515 for (i
= bi
->first(); i
!= BreakIterator::DONE
; i
= bi
->next()) {
3517 if (count
< expectedcount
&& expected
[count
] != i
) {
3518 test
->errln("break forward test failed: expected %d but got %d",
3519 expected
[count
], i
);
// The number of breaks found must equal the number expected.
3524 if (count
!= expectedcount
) {
3525 printStringBreaks(ustr
, expected
, expectedcount
);
3526 test
->errln("break forward test failed: missed %d match",
3527 expectedcount
- count
);
3530 // testing boundaries
// Pass 2: each expected position must be a boundary, and no index strictly between
// two consecutive expected positions may be one.
3531 for (i
= 1; i
< expectedcount
; i
++) {
3532 int j
= expected
[i
- 1];
3533 if (!bi
->isBoundary(j
)) {
3534 printStringBreaks(ustr
, expected
, expectedcount
);
3535 test
->errln("isBoundary() failed. Expected boundary at position %d", j
);
3538 for (j
= expected
[i
- 1] + 1; j
< expected
[i
]; j
++) {
3539 if (bi
->isBoundary(j
)) {
3540 printStringBreaks(ustr
, expected
, expectedcount
);
3541 test
->errln("isBoundary() failed. Not expecting boundary at position %d", j
);
// Pass 3: walk backward with last()/previous() and compare against forward[].
3547 for (i
= bi
->last(); i
!= BreakIterator::DONE
; i
= bi
->previous()) {
3549 if (forward
[count
] != i
) {
3550 test
->errln("happy break test previous() failed: expected %d but got %d",
3556 printStringBreaks(ustr
, expected
, expectedcount
);
3557 test
->errln("break test previous() failed: missed a match");
3561 // testing preceding
// Pass 4: for every index j after expected[i] (starting one code point later) up to
// and including expected[i + 1], preceding(j) must return expected[i].
// NOTE(review): the error text below says "Not expecting boundary", although the
// condition being reported is preceding(j) returning something other than expected[i].
3562 for (i
= 0; i
< expectedcount
- 1; i
++) {
3563 // int j = expected[i] + 1;
3564 int j
= ustr
.moveIndex32(expected
[i
], 1);
3565 for (; j
<= expected
[i
+ 1]; j
++) {
3566 if (bi
->preceding(j
) != expected
[i
]) {
3567 printStringBreaks(ustr
, expected
, expectedcount
);
3568 test
->errln("preceding(): Not expecting boundary at position %d", j
);
3575 void RBBITest::TestWordBreaks(void)
3577 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3579 Locale
locale("en");
3580 UErrorCode status
= U_ZERO_ERROR
;
3581 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3582 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
3583 static const char *strlist
[] =
3585 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3586 "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
3587 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3588 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3589 "\\u90ca\\u3588\\u009c\\u0953\\u194b",
3590 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3591 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3592 "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
3593 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3594 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3595 "\\u2027\\U000e0067\\u0a47\\u00b7",
3596 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3597 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3598 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3599 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3600 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3601 "\\u0027\\u11af\\U000e0057\\u0602",
3602 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3603 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3604 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3605 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3606 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3607 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3608 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3609 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3610 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3611 "\\u58f4\\U000e0049\\u20e7\\u2027",
3612 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3613 "\\ua183\\u102d\\u0bec\\u003a",
3614 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3615 "\\u003a\\u0e57\\u0fad\\u002e",
3616 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3617 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3618 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3619 "\\u003a\\u0664\\u00b7\\u1fba",
3620 "\\u003b\\u0027\\u00b7\\u47a3",
3621 "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
3622 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3623 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3626 if (U_FAILURE(status
)) {
3627 errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
));
3630 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
3631 // printf("looping %d\n", loop);
3632 UnicodeString ustr
= CharsToUnicodeString(strlist
[loop
]);
3633 // RBBICharMonkey monkey;
3634 RBBIWordMonkey monkey
;
3637 int expectedcount
= 0;
3639 monkey
.setText(ustr
);
3641 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
3642 expected
[expectedcount
++] = i
;
3645 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
3651 void RBBITest::TestWordBoundary(void)
3653 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3654 Locale
locale("en");
3655 UErrorCode status
= U_ZERO_ERROR
;
3656 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3657 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
3659 static const char *strlist
[] =
3661 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3662 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3663 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3664 "\\u2027\\U000e0067\\u0a47\\u00b7",
3665 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3666 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3667 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3668 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3669 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3670 "\\u0027\\u11af\\U000e0057\\u0602",
3671 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3672 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3673 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3674 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3675 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3676 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3677 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3678 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3679 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3680 "\\u58f4\\U000e0049\\u20e7\\u2027",
3681 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3682 "\\ua183\\u102d\\u0bec\\u003a",
3683 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3684 "\\u003a\\u0e57\\u0fad\\u002e",
3685 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3686 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3687 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3688 "\\u003a\\u0664\\u00b7\\u1fba",
3689 "\\u003b\\u0027\\u00b7\\u47a3",
3692 if (U_FAILURE(status
)) {
3693 errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
));
3696 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
3697 // printf("looping %d\n", loop);
3698 u_unescape(strlist
[loop
], str
, 20);
3699 UnicodeString
ustr(str
);
3706 for (i
= bi
->first(); i
!= BreakIterator::DONE
; i
= bi
->next()) {
3707 forward
[count
++] = i
;
3710 for (j
= prev
+ 1; j
< i
; j
++) {
3711 if (bi
->isBoundary(j
)) {
3712 printStringBreaks(ustr
, forward
, count
);
3713 errln("happy boundary test failed: expected %d not a boundary",
3719 if (!bi
->isBoundary(i
)) {
3720 printStringBreaks(ustr
, forward
, count
);
3721 errln("happy boundary test failed: expected %d a boundary",
3731 void RBBITest::TestLineBreaks(void)
3733 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3734 Locale
locale("en");
3735 UErrorCode status
= U_ZERO_ERROR
;
3736 BreakIterator
*bi
= BreakIterator::createLineInstance(locale
, status
);
3737 const int32_t STRSIZE
= 50;
3739 static const char *strlist
[] =
3741 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3742 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3743 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3744 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3745 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3746 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3747 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3748 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3749 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3750 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3751 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
3752 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3753 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3754 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3755 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3756 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3757 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3758 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3759 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3760 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3761 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3762 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3763 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3764 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3765 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3766 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3767 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
3768 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3769 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3770 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3771 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3772 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3773 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
3774 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3775 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3776 "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
3777 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3778 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3779 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3780 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3781 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3782 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3783 "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
3784 "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
3785 "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
3786 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3787 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3790 TEST_ASSERT_SUCCESS(status
);
3791 if (U_FAILURE(status
)) {
3794 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
3795 // printf("looping %d\n", loop);
3796 int32_t t
= u_unescape(strlist
[loop
], str
, STRSIZE
);
3803 UnicodeString
ustr(str
);
3804 RBBILineMonkey monkey
;
3805 if (U_FAILURE(monkey
.deferredStatus
)) {
3809 const int EXPECTEDSIZE
= 50;
3810 int expected
[EXPECTEDSIZE
];
3811 int expectedcount
= 0;
3813 monkey
.setText(ustr
);
3815 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
3816 if (expectedcount
>= EXPECTEDSIZE
) {
3817 TEST_ASSERT(expectedcount
< EXPECTEDSIZE
);
3820 expected
[expectedcount
++] = i
;
3823 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
3829 void RBBITest::TestSentBreaks(void)
3831 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3832 Locale
locale("en");
3833 UErrorCode status
= U_ZERO_ERROR
;
3834 BreakIterator
*bi
= BreakIterator::createSentenceInstance(locale
, status
);
3836 static const char *strlist
[] =
3838 "Now\ris\nthe\r\ntime\n\rfor\r\r",
3840 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3841 "\"Sentence ending with a quote.\" Bye.",
3842 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
3843 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3844 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3845 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3846 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3847 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3848 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3849 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3850 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3851 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3852 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3853 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3854 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3855 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3856 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3857 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3860 if (U_FAILURE(status
)) {
3861 errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
));
3864 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
3865 u_unescape(strlist
[loop
], str
, (int32_t)(sizeof(str
) / sizeof(str
[0])));
3866 UnicodeString
ustr(str
);
3868 RBBISentMonkey monkey
;
3869 if (U_FAILURE(monkey
.deferredStatus
)) {
3873 const int EXPECTEDSIZE
= 50;
3874 int expected
[EXPECTEDSIZE
];
3875 int expectedcount
= 0;
3877 monkey
.setText(ustr
);
3879 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
3880 if (expectedcount
>= EXPECTEDSIZE
) {
3881 TEST_ASSERT(expectedcount
< EXPECTEDSIZE
);
3884 expected
[expectedcount
++] = i
;
3887 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
3893 void RBBITest::TestMonkey(char *params
) {
3894 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3896 UErrorCode status
= U_ZERO_ERROR
;
3897 int32_t loopCount
= 500;
3899 UnicodeString breakType
= "all";
3900 Locale
locale("en");
3901 UBool useUText
= FALSE
;
3903 if (quick
== FALSE
) {
3908 UnicodeString
p(params
);
3909 loopCount
= getIntParam("loop", p
, loopCount
);
3910 seed
= getIntParam("seed", p
, seed
);
3912 RegexMatcher
m(" *type *= *(char|word|line|sent|title) *", p
, 0, status
);
3914 breakType
= m
.group(1, status
);
3916 p
= m
.replaceFirst("", status
);
3919 RegexMatcher
u(" *utext", p
, 0, status
);
3923 p
= u
.replaceFirst("", status
);
3928 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p
, 0, status
).find()) {
// Each option is stripped out of the option string as it is processed.
// All options have been checked. The option string should have been completely emptied.
3932 p
.extract(buf
, sizeof(buf
), NULL
, status
);
3933 buf
[sizeof(buf
)-1] = 0;
3934 errln("Unrecognized or extra parameter: %s\n", buf
);
3940 if (breakType
== "char" || breakType
== "all") {
3942 BreakIterator
*bi
= BreakIterator::createCharacterInstance(locale
, status
);
3943 if (U_SUCCESS(status
)) {
3944 RunMonkey(bi
, m
, "char", seed
, loopCount
, useUText
);
3945 if (breakType
== "all" && useUText
==FALSE
) {
3946 // Also run a quick test with UText when "all" is specified
3947 RunMonkey(bi
, m
, "char", seed
, loopCount
, TRUE
);
3951 errcheckln(status
, "Creation of character break iterator failed %s", u_errorName(status
));
3956 if (breakType
== "word" || breakType
== "all") {
3957 logln("Word Break Monkey Test");
3959 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
3960 if (U_SUCCESS(status
)) {
3961 RunMonkey(bi
, m
, "word", seed
, loopCount
, useUText
);
3964 errcheckln(status
, "Creation of word break iterator failed %s", u_errorName(status
));
3969 if (breakType
== "line" || breakType
== "all") {
3970 logln("Line Break Monkey Test");
3972 BreakIterator
*bi
= BreakIterator::createLineInstance(locale
, status
);
3973 if (loopCount
>= 10) {
3974 loopCount
= loopCount
/ 5; // Line break runs slower than the others.
3976 if (U_SUCCESS(status
)) {
3977 RunMonkey(bi
, m
, "line", seed
, loopCount
, useUText
);
3980 errcheckln(status
, "Creation of line break iterator failed %s", u_errorName(status
));
3985 if (breakType
== "sent" || breakType
== "all" ) {
3986 logln("Sentence Break Monkey Test");
3988 BreakIterator
*bi
= BreakIterator::createSentenceInstance(locale
, status
);
3989 if (loopCount
>= 10) {
3990 loopCount
= loopCount
/ 10; // Sentence runs slower than the other break types
3992 if (U_SUCCESS(status
)) {
3993 RunMonkey(bi
, m
, "sentence", seed
, loopCount
, useUText
);
3996 errcheckln(status
, "Creation of line break iterator failed %s", u_errorName(status
));
4005 // Run a RBBI monkey test. Common routine, for all break iterator types.
4007 // bi - the break iterator to use
4008 // mk - MonkeyKind, abstraction for obtaining expected results
4009 // name - Name of test (char, word, etc.) for use in error messages
4010 // seed - Seed for starting random number generator (parameter from user)
4013 void RBBITest::RunMonkey(BreakIterator
*bi
, RBBIMonkeyKind
&mk
, const char *name
, uint32_t seed
,
4014 int32_t numIterations
, UBool useUText
) {
4016 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4018 const int32_t TESTSTRINGLEN
= 500;
4019 UnicodeString testText
;
4020 int32_t numCharClasses
;
4022 int expected
[TESTSTRINGLEN
*2 + 1];
4023 int expectedCount
= 0;
4024 char expectedBreaks
[TESTSTRINGLEN
*2 + 1];
4025 char forwardBreaks
[TESTSTRINGLEN
*2 + 1];
4026 char reverseBreaks
[TESTSTRINGLEN
*2+1];
4027 char isBoundaryBreaks
[TESTSTRINGLEN
*2+1];
4028 char followingBreaks
[TESTSTRINGLEN
*2+1];
4029 char precedingBreaks
[TESTSTRINGLEN
*2+1];
4035 numCharClasses
= mk
.charClasses()->size();
4036 chClasses
= mk
.charClasses();
// Check for errors that occurred during the construction of the MonkeyKind object.
// Can't report them where they occurred because errln() is a method coming from intlTest,
// and is not visible outside of RBBITest :-(
4041 if (U_FAILURE(mk
.deferredStatus
)) {
4042 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk
.deferredStatus
));
4046 // Verify that the character classes all have at least one member.
4047 for (i
=0; i
<numCharClasses
; i
++) {
4048 UnicodeSet
*s
= (UnicodeSet
*)chClasses
->elementAt(i
);
4049 if (s
== NULL
|| s
->size() == 0) {
4050 errln("Character Class #%d is null or of zero size.", i
);
4055 while (loopCount
< numIterations
|| numIterations
== -1) {
4056 if (numIterations
== -1 && loopCount
% 10 == 0) {
4057 // If test is running in an infinite loop, display a periodic tic so
4058 // we can tell that it is making progress.
4059 fprintf(stderr
, ".");
4061 // Save current random number seed, so that we can recreate the random numbers
4062 // for this loop iteration in event of an error.
4065 // Populate a test string with data.
4066 testText
.truncate(0);
4067 for (i
=0; i
<TESTSTRINGLEN
; i
++) {
4068 int32_t aClassNum
= m_rand() % numCharClasses
;
4069 UnicodeSet
*classSet
= (UnicodeSet
*)chClasses
->elementAt(aClassNum
);
4070 int32_t charIdx
= m_rand() % classSet
->size();
4071 UChar32 c
= classSet
->charAt(charIdx
);
4072 if (c
< 0) { // TODO: deal with sets containing strings.
4079 // Calculate the expected results for this test string.
4080 mk
.setText(testText
);
4081 memset(expectedBreaks
, 0, sizeof(expectedBreaks
));
4082 expectedBreaks
[0] = 1;
4083 int32_t breakPos
= 0;
4086 breakPos
= mk
.next(breakPos
);
4087 if (breakPos
== -1) {
4090 if (breakPos
> testText
.length()) {
4091 errln("breakPos > testText.length()");
4093 expectedBreaks
[breakPos
] = 1;
4094 U_ASSERT(expectedCount
<testText
.length());
4095 expected
[expectedCount
++] = breakPos
;
4098 // Find the break positions using forward iteration
4099 memset(forwardBreaks
, 0, sizeof(forwardBreaks
));
4101 UErrorCode status
= U_ZERO_ERROR
;
4102 UText
*testUText
= utext_openReplaceable(NULL
, &testText
, &status
);
4103 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4104 bi
->setText(testUText
, status
);
4105 TEST_ASSERT_SUCCESS(status
);
4106 utext_close(testUText
); // The break iterator does a shallow clone of the UText
4107 // This UText can be closed immediately, so long as the
4108 // testText string continues to exist.
4110 bi
->setText(testText
);
4113 for (i
=bi
->first(); i
!= BreakIterator::DONE
; i
=bi
->next()) {
4114 if (i
< 0 || i
> testText
.length()) {
4115 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name
);
4118 forwardBreaks
[i
] = 1;
4121 // Find the break positions using reverse iteration
4122 memset(reverseBreaks
, 0, sizeof(reverseBreaks
));
4123 for (i
=bi
->last(); i
!= BreakIterator::DONE
; i
=bi
->previous()) {
4124 if (i
< 0 || i
> testText
.length()) {
4125 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name
);
4128 reverseBreaks
[i
] = 1;
4131 // Find the break positions using isBoundary() tests.
4132 memset(isBoundaryBreaks
, 0, sizeof(isBoundaryBreaks
));
4133 U_ASSERT((int32_t)sizeof(isBoundaryBreaks
) > testText
.length());
4134 for (i
=0; i
<=testText
.length(); i
++) {
4135 isBoundaryBreaks
[i
] = bi
->isBoundary(i
);
4139 // Find the break positions using the following() function.
4141 memset(followingBreaks
, 0, sizeof(followingBreaks
));
4142 int32_t lastBreakPos
= 0;
4143 followingBreaks
[0] = 1;
4144 for (i
=0; i
<testText
.length(); i
++) {
4145 breakPos
= bi
->following(i
);
4146 if (breakPos
<= i
||
4147 breakPos
< lastBreakPos
||
4148 breakPos
> testText
.length() ||
4149 (breakPos
> lastBreakPos
&& lastBreakPos
> i
)) {
4150 errln("%s break monkey test: "
4151 "Out of range value returned by BreakIterator::following().\n"
4152 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4153 name
, seed
, i
, breakPos
, lastBreakPos
);
4156 followingBreaks
[breakPos
] = 1;
4157 lastBreakPos
= breakPos
;
4160 // Find the break positions using the preceding() function.
4161 memset(precedingBreaks
, 0, sizeof(precedingBreaks
));
4162 lastBreakPos
= testText
.length();
4163 precedingBreaks
[testText
.length()] = 1;
4164 for (i
=testText
.length(); i
>0; i
--) {
4165 breakPos
= bi
->preceding(i
);
4166 if (breakPos
>= i
||
4167 breakPos
> lastBreakPos
||
4168 (breakPos
< 0 && testText
.getChar32Start(i
)>0) ||
4169 (breakPos
< lastBreakPos
&& lastBreakPos
< testText
.getChar32Start(i
)) ) {
4170 errln("%s break monkey test: "
4171 "Out of range value returned by BreakIterator::preceding().\n"
4172 "index=%d; prev returned %d; lastBreak=%d" ,
4173 name
, i
, breakPos
, lastBreakPos
);
4174 if (breakPos
>= 0 && breakPos
< (int32_t)sizeof(precedingBreaks
)) {
4175 precedingBreaks
[i
] = 2; // Forces an error.
4178 if (breakPos
>= 0) {
4179 precedingBreaks
[breakPos
] = 1;
4181 lastBreakPos
= breakPos
;
4185 // Compare the expected and actual results.
4186 for (i
=0; i
<=testText
.length(); i
++) {
4187 const char *errorType
= NULL
;
4188 if (forwardBreaks
[i
] != expectedBreaks
[i
]) {
4189 errorType
= "next()";
4190 } else if (reverseBreaks
[i
] != forwardBreaks
[i
]) {
4191 errorType
= "previous()";
4192 } else if (isBoundaryBreaks
[i
] != expectedBreaks
[i
]) {
4193 errorType
= "isBoundary()";
4194 } else if (followingBreaks
[i
] != expectedBreaks
[i
]) {
4195 errorType
= "following()";
4196 } else if (precedingBreaks
[i
] != expectedBreaks
[i
]) {
4197 errorType
= "preceding()";
4201 if (errorType
!= NULL
) {
4202 // Format a range of the test text that includes the failure as
4203 // a data item that can be included in the rbbi test data file.
4205 // Start of the range is the last point where expected and actual results
4206 // both agreed that there was a break position.
4207 int startContext
= i
;
4210 if (startContext
==0) { break; }
4212 if (expectedBreaks
[startContext
] != 0) {
4213 if (count
== 2) break;
4218 // End of range is two expected breaks past the start position.
4219 int endContext
= i
+ 1;
4221 for (ci
=0; ci
<2; ci
++) { // Number of items to include in error text.
4223 if (endContext
>= testText
.length()) {break;}
4224 if (expectedBreaks
[endContext
-1] != 0) {
4225 if (count
== 0) break;
4232 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4233 UnicodeString errorText
= "<data>";
4234 /***if (strcmp(errorType, "next()") == 0) {
4236 endContext = testText.length();
4238 printStringBreaks(testText, expected, expectedCount);
4241 for (ci
=startContext
; ci
<endContext
;) {
4242 UnicodeString
hexChars("0123456789abcdef");
4245 c
= testText
.char32At(ci
);
4247 // This is the location of the error.
4248 errorText
.append("<?>");
4249 } else if (expectedBreaks
[ci
] != 0) {
// This is a non-error expected break position.
4251 errorText
.append("\\");
4254 errorText
.append("\\u");
4255 for (bn
=12; bn
>=0; bn
-=4) {
4256 errorText
.append(hexChars
.charAt((c
>>bn
)&0xf));
4259 errorText
.append("\\U");
4260 for (bn
=28; bn
>=0; bn
-=4) {
4261 errorText
.append(hexChars
.charAt((c
>>bn
)&0xf));
4264 ci
= testText
.moveIndex32(ci
, 1);
4266 errorText
.append("\\");
4267 errorText
.append("</data>\n");
4270 char charErrorTxt
[500];
4271 UErrorCode status
= U_ZERO_ERROR
;
4272 errorText
.extract(charErrorTxt
, sizeof(charErrorTxt
), NULL
, status
);
4273 charErrorTxt
[sizeof(charErrorTxt
)-1] = 0;
4274 const char *badLocale
= bi
->getLocaleID(ULOC_ACTUAL_LOCALE
, status
);
4276 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
4277 name
, badLocale
, (expectedBreaks
[i
]? "break expected but not found" : "break found but not expected"),
4278 errorType
, seed
, i
, charErrorTxt
);
4289 // Bug 5532. UTF-8 based UText fails in dictionary code.
4290 // This test checks the initial patch,
4291 // which is to just keep it from crashing. Correct word boundaries
4292 // await a proper fix to the dictionary code.
4294 void RBBITest::TestBug5532(void) {
4295 // Text includes a mixture of Thai and Latin.
4296 const unsigned char utf8Data
[] = {
4297 0xE0u
, 0xB8u
, 0x82u
, 0xE0u
, 0xB8u
, 0xB2u
, 0xE0u
, 0xB8u
, 0xA2u
, 0xE0u
,
4298 0xB9u
, 0x80u
, 0xE0u
, 0xB8u
, 0x84u
, 0xE0u
, 0xB8u
, 0xA3u
, 0xE0u
, 0xB8u
,
4299 0xB7u
, 0xE0u
, 0xB9u
, 0x88u
, 0xE0u
, 0xB8u
, 0xADu
, 0xE0u
, 0xB8u
, 0x87u
,
4300 0xE0u
, 0xB9u
, 0x80u
, 0xE0u
, 0xB8u
, 0xA5u
, 0xE0u
, 0xB9u
, 0x88u
, 0xE0u
,
4301 0xB8u
, 0x99u
, 0xE0u
, 0xB8u
, 0x8Bu
, 0xE0u
, 0xB8u
, 0xB5u
, 0xE0u
, 0xB8u
,
4302 0x94u
, 0xE0u
, 0xB8u
, 0xB5u
, 0x20u
, 0x73u
, 0x69u
, 0x6Du
, 0x20u
, 0x61u
,
4303 0x75u
, 0x64u
, 0x69u
, 0x6Fu
, 0x2Fu
, 0x20u
, 0x4Du
, 0x4Fu
, 0x4Fu
, 0x4Eu
,
4304 0x20u
, 0x65u
, 0x63u
, 0x6Cu
, 0x69u
, 0x70u
, 0x73u
, 0x65u
, 0x20u
, 0xE0u
,
4305 0xB8u
, 0xA3u
, 0xE0u
, 0xB8u
, 0xB2u
, 0xE0u
, 0xB8u
, 0x84u
, 0xE0u
, 0xB8u
,
4306 0xB2u
, 0x20u
, 0x34u
, 0x37u
, 0x30u
, 0x30u
, 0x20u
, 0xE0u
, 0xB8u
, 0xA2u
,
4307 0xE0u
, 0xB8u
, 0xB9u
, 0xE0u
, 0xB9u
, 0x82u
, 0xE0u
, 0xB8u
, 0xA3u
, 0x00};
4309 UErrorCode status
= U_ZERO_ERROR
;
4310 UText utext
=UTEXT_INITIALIZER
;
4311 utext_openUTF8(&utext
, (const char *)utf8Data
, -1, &status
);
4312 TEST_ASSERT_SUCCESS(status
);
4314 BreakIterator
*bi
= BreakIterator::createWordInstance(Locale("th"), status
);
4315 TEST_ASSERT_SUCCESS(status
);
4316 if (U_SUCCESS(status
)) {
4317 bi
->setText(&utext
, status
);
4318 TEST_ASSERT_SUCCESS(status
);
4320 int32_t breakCount
= 0;
4321 int32_t previousBreak
= -1;
4322 for (bi
->first(); bi
->next() != BreakIterator::DONE
; breakCount
++) {
4323 // For now, just make sure that the break iterator doesn't hang.
4324 TEST_ASSERT(previousBreak
< bi
->current());
4325 previousBreak
= bi
->current();
4327 TEST_ASSERT(breakCount
> 0);
4330 utext_close(&utext
);
4335 // TestDebug - A place-holder test for debugging purposes.
4336 // For putting in fragments of other tests that can be invoked
4337 // for tracing without a lot of unwanted extra stuff happening.
// Scratch test: builds a sentence break iterator and prints boundary information to stdout.
// NOTE(review): several statements of this function are not visible here (declarations of
// pos and ruleStatus, and how the string s is handed to the iterator) - confirm before relying
// on the comments below.
4339 void RBBITest::TestDebug(void) {
4341 UErrorCode status
= U_ZERO_ERROR
;
// Iterator under investigation. The line and word variants are kept as commented-out
// alternatives; the sentence iterator for the default locale is the one actually created.
4345 RuleBasedBreakIterator
* bi
=
4346 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4347 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4348 (RuleBasedBreakIterator
*)BreakIterator::createSentenceInstance(Locale::getDefault(), status
);
// Sample text, written with backslash escapes.
4349 UnicodeString
s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4350 // UnicodeString s("Aaa. Bcd");
// Probe one fixed offset and print whether the iterator treats it as a boundary.
4353 UBool r
= bi
->isBoundary(8);
4354 printf("%s", r
?"true":"false");
// Print the current position and ruleStatus, then step backwards until previous() returns DONE.
// The getRuleStatus() call is commented out, so ruleStatus is not refreshed inside this loop.
4358 // ruleStatus = bi->getRuleStatus();
4359 printf("%d\t%d\n", pos
, ruleStatus
);
4360 pos
= bi
->previous();
4361 } while (pos
!= BreakIterator::DONE
);
// Guards an assumption about Unicode property data: the set [:GCB=Prepend:] is expected
// to be empty. When it is not, the message below tells the maintainer what to update.
4365 void RBBITest::TestProperties() {
4366 UErrorCode errorCode
= U_ZERO_ERROR
;
// Build the set of code points whose Grapheme_Cluster_Break value is Prepend.
4367 UnicodeSet
prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode
);
// A non-empty set means the assumption no longer holds.
4368 if (!prependSet
.isEmpty()) {
4370 "[:GCB=Prepend:] is not empty any more. "
4371 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4372 "change this test to the opposite condition.");
4376 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */