1 /********************************************************************
3 * Copyright (c) 1999-2013, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6 /************************************************************************
7 * Date Name Description
8 * 12/15/99 Madhu Creation.
9 * 01/12/2000 Madhu Updated for changed API and added new tests
10 ************************************************************************/
12 #include "utypeinfo.h" // for 'typeid' to work
14 #include "unicode/utypes.h"
16 #if !UCONFIG_NO_BREAK_ITERATION
18 #include "unicode/utypes.h"
19 #include "unicode/brkiter.h"
20 #include "unicode/rbbi.h"
21 #include "unicode/uchar.h"
22 #include "unicode/utf16.h"
23 #include "unicode/ucnv.h"
24 #include "unicode/schriter.h"
25 #include "unicode/uniset.h"
26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
27 #include "unicode/regex.h"
29 #include "unicode/ustring.h"
30 #include "unicode/utext.h"
39 #include "unicode/numfmt.h"
40 #include "unicode/uscript.h"
// TEST_ASSERT(x): when the expression x evaluates to false, report a failure through
// errln(), tagged with the current source file and line. errln() is called unqualified,
// so the macro is presumably meant for use inside test-class member functions.
42 #define TEST_ASSERT(x) {if (!(x)) { \
43 errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
// TEST_ASSERT_SUCCESS(errcode): when U_FAILURE(errcode) is true, report it through
// errcheckln() together with the file, line and the u_errorName() text of the code.
45 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
46 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
49 //---------------------------------------------
51 //---------------------------------------------
54 // Note: Before adding new tests to this file, check whether the desired test data can
55 // simply be added to the file testdata/rbbitest.txt. In most cases it can;
56 // it's much less work than writing a new test, diagnostic output in the event of failures
57 // is good, and the test data file is shared with ICU4J, so eventually the test
58 // will run there as well, without additional effort.
// Test dispatcher. Maps the numeric test `index` to a test `name` and, when `exec`
// is true, invokes the matching test method. Indices whose test is compiled out by a
// UCONFIG_* switch are named "skip" instead. `params` is forwarded only to TestMonkey.
// The default case sets `name` to the empty string (marked below as "needed to end loop").
60 void RBBITest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* params
)
62 if (exec
) logln("TestSuite RuleBasedBreakIterator: ");
// Tests that are only built when file I/O is available.
65 #if !UCONFIG_NO_FILE_IO
66 case 0: name
= "TestBug4153072";
67 if(exec
) TestBug4153072(); break;
69 case 0: name
= "skip";
73 case 1: name
= "skip";
75 case 2: name
= "TestStatusReturn";
76 if(exec
) TestStatusReturn(); break;
78 #if !UCONFIG_NO_FILE_IO
79 case 3: name
= "TestUnicodeFiles";
80 if(exec
) TestUnicodeFiles(); break;
81 case 4: name
= "TestEmptyString";
82 if(exec
) TestEmptyString(); break;
84 case 3: case 4: name
= "skip";
88 case 5: name
= "TestGetAvailableLocales";
89 if(exec
) TestGetAvailableLocales(); break;
91 case 6: name
= "TestGetDisplayName";
92 if(exec
) TestGetDisplayName(); break;
94 #if !UCONFIG_NO_FILE_IO
95 case 7: name
= "TestEndBehaviour";
96 if(exec
) TestEndBehaviour(); break;
97 case 8: case 9: case 10: name
= "skip";
99 case 11: name
= "TestWordBreaks";
100 if(exec
) TestWordBreaks(); break;
101 case 12: name
= "TestWordBoundary";
102 if(exec
) TestWordBoundary(); break;
103 case 13: name
= "TestLineBreaks";
104 if(exec
) TestLineBreaks(); break;
105 case 14: name
= "TestSentBreaks";
106 if(exec
) TestSentBreaks(); break;
107 case 15: name
= "TestExtended";
108 if(exec
) TestExtended(); break;
110 case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name
= "skip";
// TestMonkey additionally requires regular-expression support and receives `params`.
114 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
116 name
= "TestMonkey"; if(exec
) TestMonkey(params
); break;
119 name
= "skip"; break;
122 #if !UCONFIG_NO_FILE_IO
123 case 17: name
= "TestBug3818";
124 if(exec
) TestBug3818(); break;
126 case 17: name
= "skip";
130 case 18: name
= "skip";
132 case 19: name
= "TestDebug";
133 if(exec
) TestDebug(); break;
134 case 20: name
= "skip";
137 #if !UCONFIG_NO_FILE_IO
138 case 21: name
= "TestBug5775";
139 if (exec
) TestBug5775(); break;
141 case 21: name
= "skip";
145 case 22: name
= "TestBug9983";
146 if (exec
) TestBug9983(); break;
147 case 23: name
= "TestDictRules";
148 if (exec
) TestDictRules(); break;
149 case 24: name
= "TestBug5532";
150 if (exec
) TestBug5532(); break;
151 default: name
= ""; break; //needed to end loop
156 //---------------------------------------------------------------------------
158 // class BITestData Holds a set of Break iterator test data and results
160 // - the string data to be broken
161 // - a vector of the expected break positions.
162 // - a vector of source line numbers for the data,
163 // (to help see where errors occurred.)
164 // - The expected break tag values.
165 // - Vectors of actual break positions and tag values.
166 // - Functions for comparing actual with expected and
169 //----------------------------------------------------------------------------
// Members of BITestData visible in this view (the class header and some members,
// e.g. fLineNum and fActualTags which are used by the methods, are not shown here).
// The text that the break iterator under test is run over.
172 UnicodeString fDataToBreak
;
// Expected break offsets; addDataChunk() appends one entry per chunk.
173 UVector fExpectedBreakPositions
;
// Expected tag value for each expected break position.
174 UVector fExpectedTags
;
176 UVector fActualBreakPositions
; // Test Results.
// Constructs the vectors; `status` is passed on to each vector's constructor.
179 BITestData(UErrorCode
&status
);
// Appends one chunk of test text plus its expected tag and source line number.
180 void addDataChunk(const char *data
, int32_t tag
, int32_t lineNum
, UErrorCode status
);
// Compares actual against expected results and reports differences through `test`.
181 void checkResults(const char *heading
, RBBITest
*test
);
// Reports a single break-position mismatch through `test`.
182 void err(const char *heading
, RBBITest
*test
, int32_t expectedIdx
, int32_t actualIdx
);
// Constructor: initializes each UVector member with `status`, so a construction
// failure in any of them is reported through the caller's error code.
// (The initializer list continues past the lines visible here.)
189 BITestData::BITestData(UErrorCode
&status
)
190 : fExpectedBreakPositions(status
), fExpectedTags(status
), fLineNum(status
), fActualBreakPositions(status
),
196 // addDataChunk. Add a section (non-breaking) piece of data to the test data.
197 // The macro form collects the line number, which is helpful
198 // when tracking down failures.
200 // A null data item is inserted at the start of each test's data
201 // to put the starting zero into the data list. The position saved for
202 // each non-null item is its ending position.
// ADD_DATACHUNK forwards to td.addDataChunk(), supplying __LINE__ as the lineNum
// argument. Note that the macro expansion already ends with a semicolon.
204 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status);
// Appends `data` (converted with CharsToUnicodeString) to fDataToBreak, then records
// the new total length as the expected break position for this chunk, together with
// `tag` and `lineNum`. Does nothing if `status` already indicates failure.
// NOTE(review): `status` is taken by value, so any failure set by the addElement()
// calls below is not visible to the caller.
205 void BITestData::addDataChunk(const char *data
, int32_t tag
, int32_t lineNum
, UErrorCode status
) {
206 if (U_FAILURE(status
)) {return;}
208 fDataToBreak
.append(CharsToUnicodeString(data
));
// The position saved is the END of the chunk, i.e. the length of all data so far.
210 fExpectedBreakPositions
.addElement(fDataToBreak
.length(), status
);
211 fExpectedTags
.addElement(tag
, status
);
212 fLineNum
.addElement(lineNum
, status
);
217 // checkResults. Compare the actual and expected break positions, report any differences.
// Walks the expected and actual break-position vectors side by side using two indices.
// Position mismatches are reported with err(); tag mismatches are reported directly
// with test->errln(). `heading` prefixes every message. (The enclosing loop and the
// index increments are on lines not visible in this view.)
219 void BITestData::checkResults(const char *heading
, RBBITest
*test
) {
220 int32_t expectedIndex
= 0;
221 int32_t actualIndex
= 0;
224 // If we've run through both the expected and actual results vectors, we're done.
225 // break out of the loop.
226 if (expectedIndex
>= fExpectedBreakPositions
.size() &&
227 actualIndex
>= fActualBreakPositions
.size()) {
// Expected results exhausted but actual results remain: report against the last expected entry.
232 if (expectedIndex
>= fExpectedBreakPositions
.size()) {
233 err(heading
, test
, expectedIndex
-1, actualIndex
);
// Actual results exhausted but expected results remain: report against the last actual entry.
238 if (actualIndex
>= fActualBreakPositions
.size()) {
239 err(heading
, test
, expectedIndex
, actualIndex
-1);
244 if (fActualBreakPositions
.elementAti(actualIndex
) != fExpectedBreakPositions
.elementAti(expectedIndex
)) {
245 err(heading
, test
, expectedIndex
, actualIndex
);
246 // Try to resync the positions of the indices, to avoid a rash of spurious errors.
247 if (fActualBreakPositions
.elementAti(actualIndex
) < fExpectedBreakPositions
.elementAti(expectedIndex
)) {
// Positions agree at this point; now compare the tag values recorded for them.
255 if (fActualTags
.elementAti(actualIndex
) != fExpectedTags
.elementAti(expectedIndex
)) {
256 test
->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",
257 heading
, fLineNum
.elementAt(expectedIndex
),
258 fExpectedTags
.elementAti(expectedIndex
), fActualTags
.elementAti(actualIndex
));
267 // err - An error was found. Report it, along with information about where the
268 // incorrectly broken test data appeared in the source file.
// Reports one break-position mismatch through test->errln(). `expectedIdx` and
// `actualIdx` index fExpectedBreakPositions and fActualBreakPositions respectively;
// the source line of the affected test item is looked up in fLineNum.
270 void BITestData::err(const char *heading
, RBBITest
*test
, int32_t expectedIdx
, int32_t actualIdx
)
272 int32_t expected
= fExpectedBreakPositions
.elementAti(expectedIdx
);
273 int32_t actual
= fActualBreakPositions
.elementAti(actualIdx
);
275 int32_t line
= fLineNum
.elementAti(expectedIdx
);
276 if (expectedIdx
> 0) {
277 // The line numbers are off by one because a premature break occurs somewhere
278 // within the previous item, rather than at the start of the current (expected) item.
279 // We want to report the offset of the unexpected break from the start of
280 // this previous item.
281 o
= actual
- fExpectedBreakPositions
.elementAti(expectedIdx
-1);
// actual < expected: the iterator produced a break that was not expected.
283 if (actual
< expected
) {
284 test
->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading
, o
, line
, actual
, expected
);
// Otherwise (presumably the else branch) an expected break was missed.
286 test
->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading
, line
, actual
, expected
);
// Empties the actual-results vectors (break positions and tags) so the same test
// data can be reused for another iteration pass. Expected data is left untouched.
291 void BITestData::clearResults() {
292 fActualBreakPositions
.removeAllElements();
293 fActualTags
.removeAllElements();
297 //--------------------------------------------------------------------------------------
299 // RBBITest constructor and destructor
301 //--------------------------------------------------------------------------------------
// Default constructor (body not visible in this view).
303 RBBITest::RBBITest() {
// Destructor (body not visible in this view).
307 RBBITest::~RBBITest() {
310 //-----------------------------------------------------------------------------------
312 // Test for status {tag} return value from break rules.
313 // TODO: a more thorough test.
315 //-----------------------------------------------------------------------------------
// Builds a RuleBasedBreakIterator from an inline rule string, iterates over
// testString1 with first()/next(), and checks every break position against bounds1[]
// and every getRuleStatus() value against brkStatus[]. Both tables end with -1.
316 void RBBITest::TestStatusReturn() {
317 UnicodeString
rulesString1("$Letters = [:L:];\n"
318 "$Numbers = [:N:];\n"
321 "Help\\ {4}/me\\!;\n"
322 "[^$Letters $Numbers];\n"
323 "!.*;\n", -1, US_INV
);
324 UnicodeString testString1
= "abc123..abc Help me Help me!";
325 // 01234567890123456789012345678
// Expected break offsets, and the rule status expected at each of them (same index).
326 int32_t bounds1
[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
327 int32_t brkStatus
[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
329 UErrorCode status
=U_ZERO_ERROR
;
330 UParseError parseError
;
332 RuleBasedBreakIterator
*bi
= new RuleBasedBreakIterator(rulesString1
, parseError
, status
);
// Construction failure is reported with dataerrln().
333 if(U_FAILURE(status
)) {
334 dataerrln("FAIL : in construction - %s", u_errorName(status
));
338 bi
->setText(testString1
);
339 for (pos
=bi
->first(); pos
!= BreakIterator::DONE
; pos
=bi
->next()) {
340 if (pos
!= bounds1
[i
]) {
341 errln("FAIL: expected break at %d, got %d\n", bounds1
[i
], pos
);
345 int tag
= bi
->getRuleStatus();
346 if (tag
!= brkStatus
[i
]) {
347 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos
, brkStatus
[i
], tag
);
// File-local diagnostic helper. Prints to stdout (printf) a table with one row per
// index of `ustr`, showing the code point and several of its Unicode properties, and
// prints a separator row where the index equals one of the `expected[]` positions.
// (The remaining parameter(s), including the count used as `expectedcount`, and
// several argument lines of the printf call are not visible in this view.)
357 static void printStringBreaks(UnicodeString ustr
, int expected
[],
360 UErrorCode status
= U_ZERO_ERROR
;
362 printf("code alpha extend alphanum type word sent line name\n");
364 for (j
= 0; j
< ustr
.length(); j
++) {
365 if (expectedcount
> 0) {
// Mark an expected break position with a separator row.
367 for (k
= 0; k
< expectedcount
; k
++) {
368 if (j
== expected
[k
]) {
369 printf("------------------------------------------------ %d\n",
374 UChar32 c
= ustr
.char32At(j
);
// Fetch the character's Unicode name into `name` (buffer capacity passed as 100).
378 u_charName(c
, U_UNICODE_CHAR_NAME
, name
, 100, &status
);
379 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c
,
381 u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
),
383 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY
,
385 U_SHORT_PROPERTY_NAME
),
386 u_getPropertyValueName(UCHAR_WORD_BREAK
,
387 u_getIntPropertyValue(c
,
389 U_SHORT_PROPERTY_NAME
),
390 u_getPropertyValueName(UCHAR_SENTENCE_BREAK
,
391 u_getIntPropertyValue(c
,
392 UCHAR_SENTENCE_BREAK
),
393 U_SHORT_PROPERTY_NAME
),
394 u_getPropertyValueName(UCHAR_LINE_BREAK
,
395 u_getIntPropertyValue(c
,
397 U_SHORT_PROPERTY_NAME
),
// Regression test: with a Thai ("th") word break iterator over four repetitions of
// the same four-character Thai word, following(1) and following(0) must both
// return 4, the start of the second word.
403 void RBBITest::TestBug3818() {
404 UErrorCode status
= U_ZERO_ERROR
;
406 // Four Thai words...
407 static const UChar thaiWordData
[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
408 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
409 UnicodeString
thaiStr(thaiWordData
);
411 RuleBasedBreakIterator
* bi
=
412 (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale("th"), status
);
// Report a failure if creation failed or returned NULL.
413 if (U_FAILURE(status
) || bi
== NULL
) {
414 errcheckln(status
, "Fail at file %s, line %d, status = %s", __FILE__
, __LINE__
, u_errorName(status
));
417 bi
->setText(thaiStr
);
// Starting from inside the first word.
419 int32_t startOfSecondWord
= bi
->following(1);
420 if (startOfSecondWord
!= 4) {
421 errln("Fail at file %s, line %d expected start of word at 4, got %d",
422 __FILE__
, __LINE__
, startOfSecondWord
);
// Starting from the beginning of the text.
424 startOfSecondWord
= bi
->following(0);
425 if (startOfSecondWord
!= 4) {
426 errln("Fail at file %s, line %d expected start of word at 4, got %d",
427 __FILE__
, __LINE__
, startOfSecondWord
);
432 //----------------------------------------------------------------------------
434 // generalIteratorTest Given a break iterator and a set of test data,
435 // Run the tests and report the results.
437 //----------------------------------------------------------------------------
// Runs the full battery of iterator checks on `bi` using the text and expected
// results in `td`: forward, backward, following(), preceding(), isBoundary() and
// the multiple-selection (clone / next(n)) test, in that order.
438 void RBBITest::generalIteratorTest(RuleBasedBreakIterator
& bi
, BITestData
&td
)
441 bi
.setText(td
.fDataToBreak
);
443 testFirstAndNext(bi
, td
);
445 testLastAndPrevious(bi
, td
);
447 testFollowing(bi
, td
);
448 testPreceding(bi
, td
);
449 testIsBoundary(bi
, td
);
450 doMultipleSelectionTest(bi
, td
);
455 // testFirstAndNext. Run the iterator forwards in the obvious first(), next()
// Iterates `bi` forward with first()/next() over td.fDataToBreak, appending each
// break position and its getRuleStatus() value to td's actual-result vectors, then
// compares them with the expected data via td.checkResults().
458 void RBBITest::testFirstAndNext(RuleBasedBreakIterator
& bi
, BITestData
&td
)
460 UErrorCode status
= U_ZERO_ERROR
;
465 logln("Test first and next");
466 bi
.setText(td
.fDataToBreak
);
469 for (p
=bi
.first(); p
!=RuleBasedBreakIterator::DONE
; p
=bi
.next()) {
470 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
471 tag
= bi
.getRuleStatus();
472 td
.fActualTags
.addElement(tag
, status
);
474 // If the iterator is not making forward progress, stop.
475 // No need to raise an error here, it'll be detected in the normal check of results.
480 td
.checkResults("testFirstAndNext", this);
485 // TestLastAndPrevious. Run the iterator backwards, starting with last().
// Iterates `bi` backward with last()/previous(). Each position and rule status is
// inserted at index 0 of td's actual-result vectors, so the collected results end up
// in ascending order, and they are then compared via td.checkResults().
487 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator
& bi
, BITestData
&td
)
489 UErrorCode status
= U_ZERO_ERROR
;
// Initial "previous position" value, larger than any real text offset.
491 int32_t lastP
= 0x7ffffffe;
494 logln("Test last and previous");
495 bi
.setText(td
.fDataToBreak
);
498 for (p
=bi
.last(); p
!=RuleBasedBreakIterator::DONE
; p
=bi
.previous()) {
499 // Save break position. Insert it at start of vector of results, shoving
500 // already-saved results further towards the end.
501 td
.fActualBreakPositions
.insertElementAt(p
, 0, status
);
502 // bi.previous(); // TODO: Why does this fix things up????
504 tag
= bi
.getRuleStatus();
505 td
.fActualTags
.insertElementAt(tag
, 0, status
);
507 // If the iterator is not making progress, stop.
508 // No need to raise an error here, it'll be detected in the normal check of results.
513 td
.checkResults("testLastAndPrevious", this);
// Exercises the following() style of iteration: records a starting position, then
// steps an index `i` through the text (up to length()+1), recording each newly
// reached break position and its rule status. Reports an error if the loop stopped
// at an index other than the text length, then checks all results.
// (The statements that assign `p` are on lines not visible in this view.)
517 void RBBITest::testFollowing(RuleBasedBreakIterator
& bi
, BITestData
&td
)
519 UErrorCode status
= U_ZERO_ERROR
;
522 int32_t lastP
= -2; // A value that will never be returned as a break position.
523 // cannot be -1; that is returned for DONE.
526 logln("testFollowing():");
527 bi
.setText(td
.fDataToBreak
);
530 // Save the starting point, since we won't get that out of following.
532 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
533 tag
= bi
.getRuleStatus();
534 td
.fActualTags
.addElement(tag
, status
);
// The bound deliberately runs one past the end of the text (length()+1).
536 for (i
= 0; i
<= td
.fDataToBreak
.length()+1; i
++) {
539 if (p
== RuleBasedBreakIterator::DONE
) {
542 // We've reached a new break position. Save it.
543 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
544 tag
= bi
.getRuleStatus();
545 td
.fActualTags
.addElement(tag
, status
);
549 // The loop normally exits by means of the break in the middle.
550 // Make sure that the index was at the correct position for the break iterator to have
552 if (i
!= td
.fDataToBreak
.length()) {
553 errln("testFollowing(): iterator returned DONE prematurely.");
556 // Full check of all results.
557 td
.checkResults("testFollowing", this);
// Mirror image of testFollowing(): records a starting position, then steps an index
// `i` from the text length down to -1, inserting each newly reached break position
// and its rule status at the front of td's actual-result vectors, and finally checks
// all results. (The statements that assign `p`, and the condition guarding the
// "prematurely" error, are on lines not visible in this view.)
562 void RBBITest::testPreceding(RuleBasedBreakIterator
& bi
, BITestData
&td
) {
563 UErrorCode status
= U_ZERO_ERROR
;
// Initial "previous position" value, larger than any real text offset.
566 int32_t lastP
= 0x7ffffffe;
569 logln("testPreceding():");
570 bi
.setText(td
.fDataToBreak
);
574 td
.fActualBreakPositions
.addElement(p
, status
);
575 tag
= bi
.getRuleStatus();
576 td
.fActualTags
.addElement(tag
, status
);
// The bound deliberately runs one before the start of the text (down to -1).
578 for (i
= td
.fDataToBreak
.length(); i
>=-1; i
--) {
581 if (p
== RuleBasedBreakIterator::DONE
) {
584 // We've reached a new break position. Save it.
585 td
.fActualBreakPositions
.insertElementAt(p
, 0, status
);
587 tag
= bi
.getRuleStatus();
588 td
.fActualTags
.insertElementAt(tag
, 0, status
);
591 // The loop normally exits by means of the break in the middle.
592 // Make sure that the index was at the correct position for the break iterator to have
595 errln("testPreceding(): iterator returned DONE prematurely.");
598 // Full check of all results.
599 td
.checkResults("testPreceding", this);
// Probes every offset from 0 through the text length (inclusive) with isBoundary().
// Each offset reported as a boundary is recorded together with the rule status read
// immediately afterwards, and the collected results are compared via checkResults().
604 void RBBITest::testIsBoundary(RuleBasedBreakIterator
& bi
, BITestData
&td
) {
605 UErrorCode status
= U_ZERO_ERROR
;
609 logln("testIsBoundary():");
610 bi
.setText(td
.fDataToBreak
);
613 for (i
= 0; i
<= td
.fDataToBreak
.length(); i
++) {
614 if (bi
.isBoundary(i
)) {
615 td
.fActualBreakPositions
.addElement(i
, status
); // Save result.
616 tag
= bi
.getRuleStatus();
617 td
.fActualTags
.addElement(tag
, status
);
620 td
.checkResults("testIsBoundary: ", this);
// Checks consistency between single-stepping and next(n) on a clone of `iterator`:
//  - a fresh clone must compare equal to the original;
//  - forward: the position reached on the clone by first() + next(count) must equal
//    the position reached on the original by repeated next();
//  - once the two iterators sit at different positions, operator== must be false;
//  - backward: the same check using last() + next(count), relying on next() with a
//    negative argument (see the comment on the call below), against previous().
// (The declaration and updates of `count` are on lines not visible in this view.)
625 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator
& iterator
, BITestData
&td
)
627 iterator
.setText(td
.fDataToBreak
);
629 RuleBasedBreakIterator
* testIterator
=(RuleBasedBreakIterator
*)iterator
.clone();
630 int32_t offset
= iterator
.first();
634 logln("doMultipleSelectionTest text of length: %d", td
.fDataToBreak
.length());
636 if (*testIterator
!= iterator
)
637 errln("clone() or operator!= failed: two clones compared unequal");
// Forward pass: restart the clone and jump `count` steps in one call.
640 testOffset
= testIterator
->first();
641 testOffset
= testIterator
->next(count
);
642 if (offset
!= testOffset
)
643 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count
+ ", next(n) returned " + testOffset
+ " and next() had " + offset
);
645 if (offset
!= RuleBasedBreakIterator::DONE
) {
647 offset
= iterator
.next();
649 if (offset
!= RuleBasedBreakIterator::DONE
&& *testIterator
== iterator
) {
650 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count
, offset
);
// Extra diagnostics for a runaway loop or an unexpected DONE value.
651 if (count
> 10000 || offset
== -1) {
652 errln("operator== failed too many times. Stopping test.");
654 errln("Does (RuleBasedBreakIterator::DONE == -1)?");
660 } while (offset
!= RuleBasedBreakIterator::DONE
);
662 // now do it backwards...
663 offset
= iterator
.last();
667 testOffset
= testIterator
->last();
668 testOffset
= testIterator
->next(count
); // next() with a negative arg is same as previous
669 if (offset
!= testOffset
)
670 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count
+ ", next(n) returned " + testOffset
+ " and next() had " + offset
);
672 if (offset
!= RuleBasedBreakIterator::DONE
) {
674 offset
= iterator
.previous();
676 } while (offset
!= RuleBasedBreakIterator::DONE
);
682 //---------------------------------------------
686 //---------------------------------------------
// Runs generalIteratorTest() with a default-locale line break iterator over empty
// test data: a single empty chunk, so the only expected break is at offset 0.
687 void RBBITest::TestEmptyString()
689 UnicodeString text
= "";
690 UErrorCode status
= U_ZERO_ERROR
;
692 BITestData
x(status
);
693 ADD_DATACHUNK(x
, "", 0, status
); // Break at start of data
694 RuleBasedBreakIterator
* bi
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale::getDefault(), status
);
// Creation failure is reported with errcheckln().
695 if (U_FAILURE(status
))
697 errcheckln(status
, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status
));
700 generalIteratorTest(*bi
, x
);
// Calls BreakIterator::getAvailableLocales() and logs the name of every returned
// locale. An empty list is reported with dataerrln() (the condition guarding that
// call is on a line not visible in this view).
704 void RBBITest::TestGetAvailableLocales()
706 int32_t locCount
= 0;
707 const Locale
* locList
= BreakIterator::getAvailableLocales(locCount
);
710 dataerrln("getAvailableLocales() returned an empty list!");
711 // Just make sure that it's returning good memory.
713 for (i
= 0; i
< locCount
; ++i
) {
714 logln(locList
[i
].getName());
718 //Testing the BreakIterator::getDisplayName() function
// Checks two BreakIterator::getDisplayName() overloads:
//  - the one-locale form for en_US, verified only when the default locale is en_US;
//  - the two-locale form for fr_FR displayed in en_US.
// Mismatches are reported with dataerrln().
719 void RBBITest::TestGetDisplayName()
721 UnicodeString result
;
723 BreakIterator::getDisplayName(Locale::getUS(), result
);
724 if (Locale::getDefault() == Locale::getUS() && result
!= "English (United States)")
725 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
728 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result
);
729 if (result
!= "French (France)")
730 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
// Checks a default-locale word break iterator on the text "boo.": a break at the
// start, a break before the period, and a break at the end of the string (offset 4).
737 void RBBITest::TestEndBehaviour()
739 UErrorCode status
= U_ZERO_ERROR
;
740 UnicodeString
testString("boo.");
741 BreakIterator
*wb
= BreakIterator::createWordInstance(Locale::getDefault(), status
);
742 if (U_FAILURE(status
))
744 errcheckln(status
, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status
));
747 wb
->setText(testString
);
749 if (wb
->first() != 0)
750 errln("Didn't get break at beginning of string.");
752 errln("Didn't get break before period in \"boo.\"");
// Because of the short-circuit &&, next() is only called when current() is not
// already 4; the error fires only if neither call yields 4.
753 if (wb
->current() != 4 && wb
->next() != 4)
754 errln("Didn't get break at end of string.");
// Regression test for isBoundary() on text supplied through a
// StringCharacterIterator restricted to a sub-range [begin, end) of a string.
// For every probed index, isBoundary() must be true only for index 0.
// (The declaration of `begin` is on a line not visible in this view.)
760 void RBBITest::TestBug4153072() {
761 UErrorCode status
= U_ZERO_ERROR
;
762 BreakIterator
*iter
= BreakIterator::createWordInstance(Locale::getDefault(), status
);
763 if (U_FAILURE(status
))
765 errcheckln(status
, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status
));
768 UnicodeString
str("...Hello, World!...");
// The range ends three characters before the end of the string.
770 int32_t end
= str
.length() - 3;
// The iterator takes ownership of textIterator via adoptText().
773 StringCharacterIterator
* textIterator
= new StringCharacterIterator(str
, begin
, end
, begin
);
774 iter
->adoptText(textIterator
);
776 // Note: with the switch to UText, there is no way to restrict the
777 // iteration range to begin at an index other than zero.
778 // String character iterators created with a non-zero bound are
779 // treated by RBBI as being empty.
780 for (index
= -1; index
< begin
+ 1; ++index
) {
781 onBoundary
= iter
->isBoundary(index
);
// Fail if index 0 is NOT a boundary, or if any other index IS one.
782 if (index
== 0? !onBoundary
: onBoundary
) {
783 errln((UnicodeString
)"Didn't handle isBoundary correctly with offset = " + index
+
784 " and begin index = " + begin
);
792 // Test for problem reported by Ashok Matoria on 9 July 2007
793 // One.<kSoftHyphen><kSpace>Two.
795 // Sentence break at start (0) and then on calling next() it breaks at
796 // 'T' of "Two". Now, at this point if I do next() and
797 // then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
// Regression test for the sentence-break case described in the comment above this
// function ("One.<soft hyphen> Two."). With an English sentence break iterator it
// asserts the positions 6 and 10 while moving forward, and 6 again after previous().
// (The setText()/first() calls and the second next() are on lines not visible here.)
799 void RBBITest::TestBug5775() {
800 UErrorCode status
= U_ZERO_ERROR
;
801 BreakIterator
*bi
= BreakIterator::createSentenceInstance(Locale::getEnglish(), status
);
802 TEST_ASSERT_SUCCESS(status
);
803 if (U_FAILURE(status
)) {
806 // Check for status first for better handling of no data errors.
807 TEST_ASSERT(bi
!= NULL
);
// \\u00ad in the invariant-character literal denotes the soft hyphen; presumably the
// string is unescaped on a line not visible here.
812 UnicodeString
s("One.\\u00ad Two.", -1, US_INV
);
816 int pos
= bi
->next();
817 TEST_ASSERT(pos
== 6);
819 TEST_ASSERT(pos
== 10);
820 pos
= bi
->previous();
821 TEST_ASSERT(pos
== 6);
827 //------------------------------------------------------------------------------
829 // RBBITest::Extended Run RBBI Tests from an external test data file
831 //------------------------------------------------------------------------------
// Members of TestParams visible in this view (the struct header and the other members
// used by executeTest/TestExtended, such as bi, srcLine and srcCol, are not shown).
// The text the break iterator is run over.
835 UnicodeString dataToBreak
;
// Indexed by text offset: 0 means no break expected; -1 means a break with no tag
// check (see executeTest); any other value is the expected rule status.
836 UVector32
*expectedBreaks
;
// Runs one data-driven test described by `t`. t->bi is exercised over t->dataToBreak
// and checked against t->expectedBreaks (indexed by text offset; 0 = no break) in
// five passes: forward iteration, reverse iteration, isBoundary(), following() and
// preceding(). Errors are reported with the source-file line/column taken from
// t->srcLine / t->srcCol. (Several declarations, conditions and assignments are on
// lines not visible in this view.)
841 void RBBITest::executeTest(TestParams
*t
) {
850 t
->bi
->setText(t
->dataToBreak
);
852 // Run the iterator forward
855 for (bp
= t
->bi
->first(); bp
!= BreakIterator::DONE
; bp
= t
->bi
->next()) {
857 // Fail for lack of forward progress.
858 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
859 bp
, t
->srcLine
->elementAti(bp
), t
->srcCol
->elementAti(bp
));
863 // Check that we didn't miss an expected break between the last one
865 for (i
=prevBP
+1; i
<bp
; i
++) {
866 if (t
->expectedBreaks
->elementAti(i
) != 0) {
867 int expected
[] = {0, i
};
868 printStringBreaks(t
->dataToBreak
, expected
, 2);
869 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
870 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
));
874 // Check that the break we did find was expected
875 if (t
->expectedBreaks
->elementAti(bp
) == 0) {
876 int expected
[] = {0, bp
};
877 printStringBreaks(t
->dataToBreak
, expected
, 2);
878 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
879 bp
, t
->srcLine
->elementAti(bp
), t
->srcCol
->elementAti(bp
));
881 // The break was expected.
882 // Check that the {nnn} tag value is correct.
883 int32_t expectedTagVal
= t
->expectedBreaks
->elementAti(bp
);
// -1 marks a break with no explicit tag; presumably the expected value is adjusted
// on the line(s) not visible here.
884 if (expectedTagVal
== -1) {
887 int32_t line
= t
->srcLine
->elementAti(bp
);
888 int32_t rs
= ((RuleBasedBreakIterator
*)t
->bi
)->getRuleStatus();
889 if (rs
!= expectedTagVal
) {
890 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
891 " Actual, Expected status = %4d, %4d",
892 bp
, line
, t
->srcCol
->elementAti(bp
), rs
, expectedTagVal
);
900 // Verify that there were no missed expected breaks after the last one found
901 for (i
=prevBP
+1; i
<t
->expectedBreaks
->size(); i
++) {
902 if (t
->expectedBreaks
->elementAti(i
) != 0) {
903 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
904 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
));
909 // Run the iterator backwards, verify that the same breaks are found.
911 prevBP
= t
->dataToBreak
.length()+2; // start with a phony value for the last break pos seen.
912 for (bp
= t
->bi
->last(); bp
!= BreakIterator::DONE
; bp
= t
->bi
->previous()) {
914 // Fail for lack of progress.
915 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
916 bp
, t
->srcLine
->elementAti(bp
), t
->srcCol
->elementAti(bp
));
920 // Check that we didn't miss an expected break between the last one
921 // and this one. (UVector returns zeros for index out of bounds.)
922 for (i
=prevBP
-1; i
>bp
; i
--) {
923 if (t
->expectedBreaks
->elementAti(i
) != 0) {
924 errln("Reverse Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
925 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
));
929 // Check that the break we did find was expected
930 if (t
->expectedBreaks
->elementAti(bp
) == 0) {
931 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
932 bp
, t
->srcLine
->elementAti(bp
), t
->srcCol
->elementAti(bp
));
934 // The break was expected.
935 // Check that the {nnn} tag value is correct.
936 int32_t expectedTagVal
= t
->expectedBreaks
->elementAti(bp
);
937 if (expectedTagVal
== -1) {
940 int line
= t
->srcLine
->elementAti(bp
);
941 int32_t rs
= ((RuleBasedBreakIterator
*)t
->bi
)->getRuleStatus();
942 if (rs
!= expectedTagVal
) {
943 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
944 " Actual, Expected status = %4d, %4d",
945 bp
, line
, t
->srcCol
->elementAti(bp
), rs
, expectedTagVal
);
952 // Verify that there were no missed breaks prior to the last one found
// NOTE(review): the message below says "Forward" although this check follows the
// reverse iteration pass; it looks like a copy/paste slip. String left unchanged.
953 for (i
=prevBP
-1; i
>=0; i
--) {
954 if (t
->expectedBreaks
->elementAti(i
) != 0) {
955 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
956 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
));
960 // Check isBoundary()
961 for (i
=0; i
<t
->expectedBreaks
->size(); i
++) {
962 UBool boundaryExpected
= (t
->expectedBreaks
->elementAti(i
) != 0);
963 UBool boundaryFound
= t
->bi
->isBoundary(i
);
964 if (boundaryExpected
!= boundaryFound
) {
965 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
966 " Expected, Actual= %s, %s",
967 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
),
968 boundaryExpected
? "true":"false", boundaryFound
? "true" : "false");
// Check following(): for each offset i, the expected result is found by scanning
// the expected breaks at offsets greater than i; it stays DONE if the scan finds none.
973 for (i
=0; i
<t
->expectedBreaks
->size(); i
++) {
974 int32_t actualBreak
= t
->bi
->following(i
);
975 int32_t expectedBreak
= BreakIterator::DONE
;
976 for (int32_t j
=i
+1; j
< t
->expectedBreaks
->size(); j
++) {
977 if (t
->expectedBreaks
->elementAti(j
) != 0) {
982 if (expectedBreak
!= actualBreak
) {
983 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
984 " Expected, Actual= %d, %d",
985 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
), expectedBreak
, actualBreak
);
// Check preceding(): for each offset i (from size() down to 0), the expected result
// is found by scanning the expected breaks at offsets less than i; DONE if none.
990 for (i
=t
->expectedBreaks
->size(); i
>=0; i
--) {
991 int32_t actualBreak
= t
->bi
->preceding(i
);
992 int32_t expectedBreak
= BreakIterator::DONE
;
994 for (int32_t j
=i
-1; j
>= 0; j
--) {
995 if (t
->expectedBreaks
->elementAti(j
) != 0) {
1000 if (expectedBreak
!= actualBreak
) {
1001 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
1002 " Expected, Actual= %d, %d",
1003 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
), expectedBreak
, actualBreak
);
1009 void RBBITest::TestExtended() {
1010 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1011 UErrorCode status
= U_ZERO_ERROR
;
1014 UnicodeString rules
;
1017 tp
.expectedBreaks
= new UVector32(status
);
1018 tp
.srcLine
= new UVector32(status
);
1019 tp
.srcCol
= new UVector32(status
);
1021 RegexMatcher
localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status
);
1022 if (U_FAILURE(status
)) {
1023 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__
, __LINE__
, u_errorName(status
));
1028 // Open and read the test data file.
1030 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
1031 char testFileName
[1000];
1032 if (testDataDirectory
== NULL
|| strlen(testDataDirectory
) >= sizeof(testFileName
)) {
1033 errln("Can't open test data. Path too long.");
1036 strcpy(testFileName
, testDataDirectory
);
1037 strcat(testFileName
, "rbbitst.txt");
1040 UChar
*testFile
= ReadAndConvertFile(testFileName
, len
, "UTF-8", status
);
1041 if (U_FAILURE(status
)) {
1042 return; /* something went wrong, error already output */
1049 // Put the test data into a UnicodeString
1051 UnicodeString
testString(FALSE
, testFile
, len
);
1059 parseState
= PARSE_TAG
;
1061 EParseState savedState
= PARSE_TAG
;
1063 static const UChar CH_LF
= 0x0a;
1064 static const UChar CH_CR
= 0x0d;
1065 static const UChar CH_HASH
= 0x23;
1066 /*static const UChar CH_PERIOD = 0x2e;*/
1067 static const UChar CH_LT
= 0x3c;
1068 static const UChar CH_GT
= 0x3e;
1069 static const UChar CH_BACKSLASH
= 0x5c;
1070 static const UChar CH_BULLET
= 0x2022;
1072 int32_t lineNum
= 1;
1073 int32_t colStart
= 0;
1075 int32_t charIdx
= 0;
1077 int32_t tagValue
= 0; // The numeric value of a <nnn> tag.
1079 for (charIdx
= 0; charIdx
< len
; ) {
1080 status
= U_ZERO_ERROR
;
1081 UChar c
= testString
.charAt(charIdx
);
1083 if (c
== CH_CR
&& charIdx
<len
&& testString
.charAt(charIdx
) == CH_LF
) {
1084 // treat CRLF as a unit
1088 if (c
== CH_LF
|| c
== CH_CR
) {
1092 column
= charIdx
- colStart
+ 1;
1094 switch (parseState
) {
1096 if (c
== 0x0a || c
== 0x0d) {
1097 parseState
= savedState
;
1104 parseState
= PARSE_COMMENT
;
1105 savedState
= PARSE_TAG
;
1108 if (u_isUWhiteSpace(c
)) {
1111 if (testString
.compare(charIdx
-1, 6, "<word>") == 0) {
1113 tp
.bi
= BreakIterator::createWordInstance(locale
, status
);
1117 if (testString
.compare(charIdx
-1, 6, "<char>") == 0) {
1119 tp
.bi
= BreakIterator::createCharacterInstance(locale
, status
);
1123 if (testString
.compare(charIdx
-1, 6, "<line>") == 0) {
1125 tp
.bi
= BreakIterator::createLineInstance(locale
, status
);
1129 if (testString
.compare(charIdx
-1, 6, "<sent>") == 0) {
1132 tp
.bi
= BreakIterator::createSentenceInstance(locale
, status
);
1136 if (testString
.compare(charIdx
-1, 7, "<title>") == 0) {
1138 tp
.bi
= BreakIterator::createTitleInstance(locale
, status
);
1143 // <locale loc_name>
1144 localeMatcher
.reset(testString
);
1145 if (localeMatcher
.lookingAt(charIdx
-1, status
)) {
1146 UnicodeString localeName
= localeMatcher
.group(1, status
);
1147 char localeName8
[100];
1148 localeName
.extract(0, localeName
.length(), localeName8
, sizeof(localeName8
), 0);
1149 locale
= Locale::createFromName(localeName8
);
1150 charIdx
+= localeMatcher
.group(0, status
).length() - 1;
1151 TEST_ASSERT_SUCCESS(status
);
1154 if (testString
.compare(charIdx
-1, 6, "<data>") == 0) {
1155 parseState
= PARSE_DATA
;
1157 tp
.dataToBreak
= "";
1158 tp
.expectedBreaks
->removeAllElements();
1159 tp
.srcCol
->removeAllElements();
1160 tp
.srcLine
->removeAllElements();
1164 errln("line %d: Tag expected in test file.", lineNum
);
1165 parseState
= PARSE_COMMENT
;
1166 savedState
= PARSE_DATA
;
1167 goto end_test
; // Stop the test.
1172 if (c
== CH_BULLET
) {
1173 int32_t breakIdx
= tp
.dataToBreak
.length();
1174 tp
.expectedBreaks
->setSize(breakIdx
+1);
1175 tp
.expectedBreaks
->setElementAt(-1, breakIdx
);
1176 tp
.srcLine
->setSize(breakIdx
+1);
1177 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1178 tp
.srcCol
->setSize(breakIdx
+1);
1179 tp
.srcCol
->setElementAt(column
, breakIdx
);
1183 if (testString
.compare(charIdx
-1, 7, "</data>") == 0) {
1184 // Add final entry to mappings from break location to source file position.
1185 // Need one extra because last break position returned is after the
1186 // last char in the data, not at the last char.
1187 tp
.srcLine
->addElement(lineNum
, status
);
1188 tp
.srcCol
->addElement(column
, status
);
1190 parseState
= PARSE_TAG
;
1198 if (testString
.compare(charIdx
-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1199 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1200 // Get the code point from the name and insert it into the test data.
1201 // (No API takes character names as Unicode strings,
1202 // so the name has to be converted back to char * first.)
1203 int32_t nameEndIdx
= testString
.indexOf((UChar
)0x7d/*'}'*/, charIdx
);
1204 int32_t nameLength
= nameEndIdx
- (charIdx
+2);
1205 char charNameBuf
[200];
1206 UChar32 theChar
= -1;
1207 if (nameEndIdx
!= -1) {
1208 UErrorCode status
= U_ZERO_ERROR
;
1209 testString
.extract(charIdx
+2, nameLength
, charNameBuf
, sizeof(charNameBuf
));
1210 charNameBuf
[sizeof(charNameBuf
)-1] = 0;
1211 theChar
= u_charFromName(U_UNICODE_CHAR_NAME
, charNameBuf
, &status
);
1212 if (U_FAILURE(status
)) {
1216 if (theChar
== -1) {
1217 errln("Error in named character in test file at line %d, col %d",
1220 // Named code point was recognized. Insert it
1221 // into the test data.
1222 tp
.dataToBreak
.append(theChar
);
1223 while (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1224 tp
.srcLine
->addElement(lineNum
, status
);
1225 tp
.srcCol
->addElement(column
, status
);
1228 if (nameEndIdx
> charIdx
) {
1229 charIdx
= nameEndIdx
+1;
1238 if (testString
.compare(charIdx
-1, 2, "<>") == 0) {
1240 int32_t breakIdx
= tp
.dataToBreak
.length();
1241 tp
.expectedBreaks
->setSize(breakIdx
+1);
1242 tp
.expectedBreaks
->setElementAt(-1, breakIdx
);
1243 tp
.srcLine
->setSize(breakIdx
+1);
1244 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1245 tp
.srcCol
->setSize(breakIdx
+1);
1246 tp
.srcCol
->setElementAt(column
, breakIdx
);
1252 parseState
= PARSE_NUM
;
1256 if (c
== CH_HASH
&& column
==3) { // TODO: why is column off so far?
1257 parseState
= PARSE_COMMENT
;
1258 savedState
= PARSE_DATA
;
1262 if (c
== CH_BACKSLASH
) {
1263 // Check for \ at end of line, a line continuation.
1264 // Advance over (discard) the newline
1265 UChar32 cp
= testString
.char32At(charIdx
);
1266 if (cp
== CH_CR
&& charIdx
<len
&& testString
.charAt(charIdx
+1) == CH_LF
) {
1268 // Need an extra increment of the input ptr to move over both of them
1271 if (cp
== CH_LF
|| cp
== CH_CR
) {
1278 // Let unescape handle the back slash.
1279 cp
= testString
.unescapeAt(charIdx
);
1281 // Escape sequence was recognized. Insert the char
1282 // into the test data.
1283 tp
.dataToBreak
.append(cp
);
1284 while (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1285 tp
.srcLine
->addElement(lineNum
, status
);
1286 tp
.srcCol
->addElement(column
, status
);
1292 // Not a recognized backslash escape sequence.
1293 // Take the next char as a literal.
1294 // TODO: Should this be an error?
1295 c
= testString
.charAt(charIdx
);
1296 charIdx
= testString
.moveIndex32(charIdx
, 1);
1299 // Normal, non-escaped data char.
1300 tp
.dataToBreak
.append(c
);
1302 // Save the mapping from offset in the data to line/column numbers in
1303 // the original input file. Will be used for better error messages only.
1304 // If there's an expected break before this char, the slot in the mapping
1305 // vector will already be set for this char; don't overwrite it.
1306 if (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1307 tp
.srcLine
->addElement(lineNum
, status
);
1308 tp
.srcCol
->addElement(column
, status
);
1314 // We are parsing an expected numeric tag value, like <1234>,
1315 // within a chunk of data.
1316 if (u_isUWhiteSpace(c
)) {
1321 // Finished the number. Add the info to the expected break data,
1322 // and switch parse state back to doing plain data.
1323 parseState
= PARSE_DATA
;
1324 if (tagValue
== 0) {
1327 int32_t breakIdx
= tp
.dataToBreak
.length();
1328 tp
.expectedBreaks
->setSize(breakIdx
+1);
1329 tp
.expectedBreaks
->setElementAt(tagValue
, breakIdx
);
1330 tp
.srcLine
->setSize(breakIdx
+1);
1331 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1332 tp
.srcCol
->setSize(breakIdx
+1);
1333 tp
.srcCol
->setElementAt(column
, breakIdx
);
1338 tagValue
= tagValue
*10 + u_charDigitValue(c
);
1342 errln("Syntax Error in test file at line %d, col %d",
1344 parseState
= PARSE_COMMENT
;
1345 goto end_test
; // Stop the test
1350 if (U_FAILURE(status
)) {
1351 dataerrln("ICU Error %s while parsing test file at line %d.",
1352 u_errorName(status
), lineNum
);
1353 status
= U_ZERO_ERROR
;
1354 goto end_test
; // Stop the test
1361 delete tp
.expectedBreaks
;
1369 //-------------------------------------------------------------------------------
1371 // TestDictRules create a break iterator from source rules that includes a
1372 // dictionary range. Regression for bug #7130. Source rules
1373 // do not declare a break iterator type (word, line, sentence, etc.),
1374 // but the dictionary code, without a type, would loop.
1376 //-------------------------------------------------------------------------------
1377 void RBBITest::TestDictRules() {
// Rule source that defines a $dictionary set and uses it in the rules.
1378 const char *rules
= "$dictionary = [a-z]; \n"
1380 "$dictionary $dictionary; \n"
1382 "$dictionary $dictionary; \n";
// Text the iterator is run over: two characters from the $dictionary set.
1383 const char *text
= "aa";
1384 UErrorCode status
= U_ZERO_ERROR
;
1385 UParseError parseError
;
// Build the iterator directly from the rule source; failures are reported through status.
1387 RuleBasedBreakIterator
bi(rules
, parseError
, status
);
1388 if (U_SUCCESS(status
)) {
1389 UnicodeString utext
= text
;
// Call next() at most 10 times, so an iterator that never reports DONE
// cannot hang the test.
1393 for (loops
= 0; loops
<10; loops
++) {
1394 position
= bi
.next();
1395 if (position
== RuleBasedBreakIterator::DONE
) {
// The test passes only if loops is exactly 1 here.
// NOTE(review): presumably the loop is left as soon as DONE is seen, making this
// "DONE on the second call to next()" - confirm against the full source.
1399 TEST_ASSERT(loops
== 1);
// Reached when the iterator could not be constructed from the rules.
1401 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status
));
1407 //-------------------------------------------------------------------------------
1409 // ReadAndConvertFile Read a text data file, convert it to UChars, and
1410 // return the data in one big UChar * buffer, which the caller must delete.
1413 // fileName: the name of the file, with no directory part. The test data directory
1415 // ulen an out parameter, receives the actual length (in UChars) of the file data.
1416 // encoding The file encoding. If the file contains a BOM, that will override the encoding
1417 // specified here. The BOM, if it exists, will be stripped from the returned data.
1418 // Pass NULL for the system default encoding.
1421 // The file data, converted to UChar.
1422 // The caller must delete this when done with
1423 // delete [] theBuffer;
1425 // TODO: This is a clone of RegexTest::ReadAndConvertFile.
1426 // Move this function to some common place.
1428 //--------------------------------------------------------------------------------
1429 UChar
*RBBITest::ReadAndConvertFile(const char *fileName
, int &ulen
, const char *encoding
, UErrorCode
&status
) {
1430 UChar
*retPtr
= NULL
;
1431 char *fileBuf
= NULL
;
1432 UConverter
* conv
= NULL
;
1436 if (U_FAILURE(status
)) {
1443 f
= fopen(fileName
, "rb");
1445 dataerrln("Error opening test data file %s\n", fileName
);
1446 status
= U_FILE_ACCESS_ERROR
;
1455 fseek( f
, 0, SEEK_END
);
1456 fileSize
= ftell(f
);
1457 fileBuf
= new char[fileSize
];
1458 fseek(f
, 0, SEEK_SET
);
1459 amt_read
= fread(fileBuf
, 1, fileSize
, f
);
1460 if (amt_read
!= fileSize
|| fileSize
<= 0) {
1461 errln("Error reading test data file.");
1462 goto cleanUpAndReturn
;
1466 // Look for a Unicode Signature (BOM) on the data just read
1468 int32_t signatureLength
;
1469 const char * fileBufC
;
1470 const char* bomEncoding
;
1473 bomEncoding
= ucnv_detectUnicodeSignature(
1474 fileBuf
, fileSize
, &signatureLength
, &status
);
1475 if(bomEncoding
!=NULL
){
1476 fileBufC
+= signatureLength
;
1477 fileSize
-= signatureLength
;
1478 encoding
= bomEncoding
;
1482 // Open a converter to take the rule file to UTF-16
1484 conv
= ucnv_open(encoding
, &status
);
1485 if (U_FAILURE(status
)) {
1486 goto cleanUpAndReturn
;
1490 // Convert the rules to UChar.
1491 // Preflight first to determine required buffer size.
1493 ulen
= ucnv_toUChars(conv
,
1499 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
1500 // Buffer Overflow is expected from the preflight operation.
1501 status
= U_ZERO_ERROR
;
1503 retPtr
= new UChar
[ulen
+1];
1516 if (U_FAILURE(status
)) {
1517 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
1527 //--------------------------------------------------------------------------------------------
1529 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
1531 //-------------------------------------------------------------------------------------------
1532 void RBBITest::TestUnicodeFiles() {
// bi is reassigned for each of the four iterator kinds below.
1533 RuleBasedBreakIterator
*bi
;
1534 UErrorCode status
= U_ZERO_ERROR
;
// Character (grapheme cluster) iterator for English, checked against GraphemeBreakTest.txt.
// The data file is run only if the iterator was created successfully.
1536 bi
= (RuleBasedBreakIterator
*)BreakIterator::createCharacterInstance(Locale::getEnglish(), status
);
1537 TEST_ASSERT_SUCCESS(status
);
1538 if (U_SUCCESS(status
)) {
1539 runUnicodeTestData("GraphemeBreakTest.txt", bi
);
// Word iterator for English, checked against WordBreakTest.txt.
1543 bi
= (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getEnglish(), status
);
1544 TEST_ASSERT_SUCCESS(status
);
1545 if (U_SUCCESS(status
)) {
1546 runUnicodeTestData("WordBreakTest.txt", bi
);
// Sentence iterator for English, checked against SentenceBreakTest.txt.
1550 bi
= (RuleBasedBreakIterator
*)BreakIterator::createSentenceInstance(Locale::getEnglish(), status
);
1551 TEST_ASSERT_SUCCESS(status
);
1552 if (U_SUCCESS(status
)) {
1553 runUnicodeTestData("SentenceBreakTest.txt", bi
);
// Line iterator for English, checked against LineBreakTest.txt.
1557 bi
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale::getEnglish(), status
);
1558 TEST_ASSERT_SUCCESS(status
);
1559 if (U_SUCCESS(status
)) {
1560 runUnicodeTestData("LineBreakTest.txt", bi
);
1566 //--------------------------------------------------------------------------------------------
1568 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
1570 //-------------------------------------------------------------------------------------------
1571 void RBBITest::runUnicodeTestData(const char *fileName
, RuleBasedBreakIterator
*bi
) {
1572 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1573 // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb. Ticket #7270
1574 UBool isTicket7270Fixed
= isICUVersionAtLeast(52, 1);
1575 UBool isLineBreak
= 0 == strcmp(fileName
, "LineBreakTest.txt");
1576 UErrorCode status
= U_ZERO_ERROR
;
1579 // Open and read the test data file, put it into a UnicodeString.
1581 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
1582 char testFileName
[1000];
1583 if (testDataDirectory
== NULL
|| strlen(testDataDirectory
) >= sizeof(testFileName
)) {
1584 dataerrln("Can't open test data. Path too long.");
1587 strcpy(testFileName
, testDataDirectory
);
1588 strcat(testFileName
, fileName
);
1590 logln("Opening data file %s\n", fileName
);
1593 UChar
*testFile
= ReadAndConvertFile(testFileName
, len
, "UTF-8", status
);
1594 if (status
!= U_FILE_ACCESS_ERROR
) {
1595 TEST_ASSERT_SUCCESS(status
);
1596 TEST_ASSERT(testFile
!= NULL
);
1598 if (U_FAILURE(status
) || testFile
== NULL
) {
1599 return; /* something went wrong, error already output */
1601 UnicodeString
testFileAsString(TRUE
, testFile
, len
);
1604 // Parse the test data file using a regular expression.
1605 // Each kind of token is recognized in its own capture group; what type of item was scanned
1606 // is identified by which group had a match.
1608 // Capture Group # 1 2 3 4 5
1609 // Parses this item: divide x hex digits comment \n unrecognized \n
1611 UnicodeString
tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV
);
1612 RegexMatcher
tokenMatcher(tokenExpr
, testFileAsString
, UREGEX_MULTILINE
| UREGEX_DOTALL
, status
);
1613 UnicodeString testString
;
1614 UVector32
breakPositions(status
);
1616 TEST_ASSERT_SUCCESS(status
);
1617 if (U_FAILURE(status
)) {
1622 // Scan through each test case, building up the string to be broken in testString,
1623 // and the positions that should be boundaries in the breakPositions vector.
1626 while (tokenMatcher
.find()) {
1627 if(tokenMatcher
.hitEnd()) {
1628 /* Shouldn't Happen(TM). This means we didn't find the symbols we were looking for.
1629 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1630 and caused an infinite loop here on EBCDIC systems!
1632 fprintf(stderr
,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName
, ++spin
);
1635 if (tokenMatcher
.start(1, status
) >= 0) {
1636 // Scanned a divide sign, indicating a break position in the test data.
1637 if (testString
.length()>0) {
1638 breakPositions
.addElement(testString
.length(), status
);
1641 else if (tokenMatcher
.start(2, status
) >= 0) {
1642 // Scanned an 'x', meaning no break at this position in the test data
1643 // Nothing to be done here.
1645 else if (tokenMatcher
.start(3, status
) >= 0) {
1646 // Scanned Hex digits. Convert them to binary, append to the character data string.
1647 const UnicodeString
&hexNumber
= tokenMatcher
.group(3, status
);
1648 int length
= hexNumber
.length();
1651 hexNumber
.extract (0, length
, buf
, sizeof(buf
), US_INV
);
1652 UChar32 c
= (UChar32
)strtol(buf
, NULL
, 16);
1654 testString
.append(c
);
1656 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1657 fileName
, lineNumber
);
1660 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1661 fileName
, lineNumber
);
1664 else if (tokenMatcher
.start(4, status
) >= 0) {
1665 // Scanned to end of a line, possibly skipping over a comment in the process.
1666 // If the line from the file contained test data, run the test now.
1668 if (testString
.length() > 0) {
1669 // TODO(andy): Remove this time bomb code. Note: Failing line numbers may change when updating to new Unicode data.
1672 // is not yet implemented.
1673 if (!(isLineBreak
&& !isTicket7270Fixed
&& (5198 == lineNumber
||
1674 5202 == lineNumber
||
1675 5214 == lineNumber
||
1676 5246 == lineNumber
||
1677 5298 == lineNumber
||
1678 5302 == lineNumber
))) {
1679 checkUnicodeTestCase(fileName
, lineNumber
, testString
, &breakPositions
, bi
);
1683 // Clear out this test case.
1684 // The string and breakPositions vector will be refilled as the next
1685 // test case is parsed.
1686 testString
.remove();
1687 breakPositions
.removeAllElements();
1690 // Scanner catchall. Something unrecognized appeared on the line.
1692 UnicodeString uToken
= tokenMatcher
.group(0, status
);
1693 uToken
.extract(0, uToken
.length(), token
, (uint32_t)sizeof(token
));
1694 token
[sizeof(token
)-1] = 0;
1695 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName
, lineNumber
, token
);
1697 // Clean up, in preparation for continuing with the next line.
1698 testString
.remove();
1699 breakPositions
.removeAllElements();
1702 TEST_ASSERT_SUCCESS(status
);
1703 if (U_FAILURE(status
)) {
1709 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1712 //--------------------------------------------------------------------------------------------
1714 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1715 // test data files. Do only a simple, forward-only check -
1716 // this test is mostly to check that ICU and the Unicode
1717 // data agree with each other.
1719 //--------------------------------------------------------------------------------------------
// Parameters:
//   testFileName, lineNumber  Used only to identify the test case in error messages.
//   testString                Text handed to the break iterator.
//   breakPositions            Expected break positions (indexes into testString).
//   bi                        Break iterator under test; its text is replaced by testString.
1720 void RBBITest::checkUnicodeTestCase(const char *testFileName
, int lineNumber
,
1721 const UnicodeString
&testString
, // Text data to be broken
1722 UVector32
*breakPositions
, // Positions where breaks should be found.
1723 RuleBasedBreakIterator
*bi
) {
1724 int32_t pos
; // Break Position in the test string
1725 int32_t expectedI
= 0; // Index of expected break position in the vector of expected results.
1726 int32_t expectedPos
; // Expected break position (index into test string)
1728 bi
->setText(testString
);
// Compare each position reported by the iterator with the next expected position,
// until the iterator reports DONE.
1732 while (pos
!= BreakIterator::DONE
) {
// The iterator reported a break, but every expected position has already been consumed.
1733 if (expectedI
>= breakPositions
->size()) {
1734 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1735 testFileName
, lineNumber
, pos
);
1738 expectedPos
= breakPositions
->elementAti(expectedI
);
// The reported break lies before the next expected one: an extra break.
1739 if (pos
< expectedPos
) {
1740 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1741 testFileName
, lineNumber
, pos
);
// The reported break lies beyond the next expected one: the expected break was missed.
1744 if (pos
> expectedPos
) {
1745 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1746 testFileName
, lineNumber
, expectedPos
);
// The iterator reported DONE while expected positions remain unmatched.
1753 if (pos
==BreakIterator::DONE
&& expectedI
<breakPositions
->size()) {
1754 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1755 testFileName
, lineNumber
, breakPositions
->elementAti(expectedI
));
1761 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1762 //---------------------------------------------------------------------------------------
1764 // class RBBIMonkeyKind
1766 // Monkey Test for Break Iteration
1767 // Abstract interface class. Concrete derived classes independently
1768 // implement the break rules for different iterator types.
1770 // The Monkey Test itself doesn't know which type of break iterator it is
1771 // testing, but works purely in terms of the interface defined here.
1773 //---------------------------------------------------------------------------------------
1774 class RBBIMonkeyKind
{
1776 // Return a UVector of UnicodeSets, representing the character classes used
1777 // for this type of iterator.
1778 virtual UVector
*charClasses() = 0;
1780 // Set the test text on which subsequent calls to next() will operate
1781 virtual void setText(const UnicodeString
&s
) = 0;
1783 // Find the next break position, starting from the prev break position, or from zero.
1784 // Return -1 after reaching end of string.
1785 virtual int32_t next(int32_t i
) = 0;
// Virtual destructor, defined out of line.
1787 virtual ~RBBIMonkeyKind();
// Error status kept for later inspection. The RBBIMonkeyKind constructor sets it to
// U_ZERO_ERROR; the derived-class constructors in this file copy a failing status into
// it, and their next() implementations test it with U_FAILURE() before doing any work.
1788 UErrorCode deferredStatus
;
1797 RBBIMonkeyKind::RBBIMonkeyKind() {
1798 deferredStatus
= U_ZERO_ERROR
;
1801 RBBIMonkeyKind::~RBBIMonkeyKind() {
1805 //----------------------------------------------------------------------------------------
1807 // Random Numbers. Similar to standard lib rand() and srand()
1808 // Not using library to
1809 // 1. Get same results on all platforms.
1810 // 2. Get access to current seed, to more easily reproduce failures.
1812 //---------------------------------------------------------------------------------------
// Current seed of the file-local pseudo-random generator; the initial value is 1.
1813 static uint32_t m_seed
= 1;
// Advance m_seed by one step (m_seed * 1103515245 + 12345, in uint32_t arithmetic)
// and return a value in the range 0..32767 computed from the updated seed.
1815 static uint32_t m_rand()
1817 m_seed
= m_seed
* 1103515245 + 12345;
// Dividing by 65536 drops the low 16 bits of the seed; the modulo keeps the result below 32768.
1818 return (uint32_t)(m_seed
/65536) % 32768;
1822 //------------------------------------------------------------------------------------------
1824 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
1825 // of RBBIMonkeyKind.
1827 //------------------------------------------------------------------------------------------
1828 class RBBICharMonkey
: public RBBIMonkeyKind
{
1831 virtual ~RBBICharMonkey();
1832 virtual UVector
*charClasses();
1833 virtual void setText(const UnicodeString
&s
);
1834 virtual int32_t next(int32_t i
);
1838 UnicodeSet
*fCRLFSet
;
1839 UnicodeSet
*fControlSet
;
1840 UnicodeSet
*fExtendSet
;
1841 UnicodeSet
*fRegionalIndicatorSet
;
1842 UnicodeSet
*fPrependSet
;
1843 UnicodeSet
*fSpacingSet
;
1848 UnicodeSet
*fLVTSet
;
1849 UnicodeSet
*fHangulSet
;
1850 UnicodeSet
*fAnySet
;
1852 const UnicodeString
*fText
;
1856 RBBICharMonkey::RBBICharMonkey() {
1857 UErrorCode status
= U_ZERO_ERROR
;
1861 fCRLFSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status
);
1862 fControlSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status
);
1863 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status
);
1864 fRegionalIndicatorSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status
);
1865 fPrependSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status
);
1866 fSpacingSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status
);
1867 fLSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status
);
1868 fVSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status
);
1869 fTSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status
);
1870 fLVSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status
);
1871 fLVTSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status
);
1872 fHangulSet
= new UnicodeSet();
1873 fHangulSet
->addAll(*fLSet
);
1874 fHangulSet
->addAll(*fVSet
);
1875 fHangulSet
->addAll(*fTSet
);
1876 fHangulSet
->addAll(*fLVSet
);
1877 fHangulSet
->addAll(*fLVTSet
);
1878 fAnySet
= new UnicodeSet(0, 0x10ffff);
1880 fSets
= new UVector(status
);
1881 fSets
->addElement(fCRLFSet
, status
);
1882 fSets
->addElement(fControlSet
, status
);
1883 fSets
->addElement(fExtendSet
, status
);
1884 fSets
->addElement(fRegionalIndicatorSet
, status
);
1885 if (!fPrependSet
->isEmpty()) {
1886 fSets
->addElement(fPrependSet
, status
);
1888 fSets
->addElement(fSpacingSet
, status
);
1889 fSets
->addElement(fHangulSet
, status
);
1890 fSets
->addElement(fAnySet
, status
);
1891 if (U_FAILURE(status
)) {
1892 deferredStatus
= status
;
1897 void RBBICharMonkey::setText(const UnicodeString
&s
) {
1903 int32_t RBBICharMonkey::next(int32_t prevPos
) {
1904 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
1905 // break position being tested. The candidate break
1906 // location is before p2.
1910 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
1912 if (U_FAILURE(deferredStatus
)) {
1916 // Previous break at end of string. return DONE.
1917 if (prevPos
>= fText
->length()) {
1920 p0
= p1
= p2
= p3
= prevPos
;
1921 c3
= fText
->char32At(prevPos
);
1924 // Loop runs once per "significant" character position in the input text.
1926 // Move all of the positions forward in the input string.
1931 // Advance p3 by one codepoint
1932 p3
= fText
->moveIndex32(p3
, 1);
1933 c3
= fText
->char32At(p3
);
1936 // Still warming up the loop. (won't work with zero length strings, but we don't care)
1939 if (p2
== fText
->length()) {
1940 // Reached end of string. Always a break position.
1945 // No Extend or Format characters may appear between the CR and LF,
1946 // which requires the additional check for p2 immediately following p1.
1948 if (c1
==0x0D && c2
==0x0A && p1
==(p2
-1)) {
1952 // Rule (GB4). ( Control | CR | LF ) <break>
1953 if (fControlSet
->contains(c1
) ||
1959 // Rule (GB5) <break> ( Control | CR | LF )
1961 if (fControlSet
->contains(c2
) ||
1968 // Rule (GB6) L x ( L | V | LV | LVT )
1969 if (fLSet
->contains(c1
) &&
1970 (fLSet
->contains(c2
) ||
1971 fVSet
->contains(c2
) ||
1972 fLVSet
->contains(c2
) ||
1973 fLVTSet
->contains(c2
))) {
1977 // Rule (GB7) ( LV | V ) x ( V | T )
1978 if ((fLVSet
->contains(c1
) || fVSet
->contains(c1
)) &&
1979 (fVSet
->contains(c2
) || fTSet
->contains(c2
))) {
1983 // Rule (GB8) ( LVT | T) x T
1984 if ((fLVTSet
->contains(c1
) || fTSet
->contains(c1
)) &&
1985 fTSet
->contains(c2
)) {
1989 // Just adding the extra Apple rule here does not work; behavior depends on arbitrary context
1991 // Rule (GB8a) Regional_Indicator x Regional_Indicator
1992 if (fRegionalIndicatorSet
->contains(c1
) && fRegionalIndicatorSet
->contains(c2
)) {
1996 // Rule (GB9) x Extend
1997 if (fExtendSet
->contains(c2
)) {
2001 // Rule (GB9a) x SpacingMark
2002 if (fSpacingSet
->contains(c2
)) {
2006 // Rule (GB9b) Prepend x
2007 if (fPrependSet
->contains(c1
)) {
2011 // Rule (GB10) Any <break> Any
2021 UVector
*RBBICharMonkey::charClasses() {
2026 RBBICharMonkey::~RBBICharMonkey() {
2031 delete fRegionalIndicatorSet
;
2043 //------------------------------------------------------------------------------------------
2045 // class RBBIWordMonkey Word Break specific implementation
2046 // of RBBIMonkeyKind.
2048 //------------------------------------------------------------------------------------------
2049 class RBBIWordMonkey
: public RBBIMonkeyKind
{
2052 virtual ~RBBIWordMonkey();
2053 virtual UVector
*charClasses();
2054 virtual void setText(const UnicodeString
&s
);
2055 virtual int32_t next(int32_t i
);
2061 UnicodeSet
*fNewlineSet
;
2062 UnicodeSet
*fKatakanaSet
;
2063 UnicodeSet
*fALetterSet
;
2064 // TODO(jungshik): Do we still need this change?
2065 // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt
2066 UnicodeSet
*fMidNumLetSet
;
2067 UnicodeSet
*fMidLetterSet
;
2068 UnicodeSet
*fMidNumSet
;
2069 UnicodeSet
*fNumericSet
;
2070 UnicodeSet
*fFormatSet
;
2071 UnicodeSet
*fOtherSet
;
2072 UnicodeSet
*fExtendSet
;
2073 UnicodeSet
*fExtendNumLetSet
;
2074 UnicodeSet
*fRegionalIndicatorSet
;
2075 UnicodeSet
*fDictionaryCjkSet
;
2077 RegexMatcher
*fMatcher
;
2079 const UnicodeString
*fText
;
2083 RBBIWordMonkey::RBBIWordMonkey()
2085 UErrorCode status
= U_ZERO_ERROR
;
2087 fSets
= new UVector(status
);
2089 fCRSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status
);
2090 fLFSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status
);
2091 fNewlineSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status
);
2092 fDictionaryCjkSet
= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status
);
2093 // Exclude Hangul syllables from ALetterSet during testing.
2094 // Leave CJK dictionary characters out from the monkey tests!
2096 fALetterSet
= new UnicodeSet("[\\p{Word_Break = ALetter}"
2097 "[\\p{Line_Break = Complex_Context}"
2098 "-\\p{Grapheme_Cluster_Break = Extend}"
2099 "-\\p{Grapheme_Cluster_Break = Control}"
2103 fALetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status
);
2104 fALetterSet
->removeAll(*fDictionaryCjkSet
);
2105 fKatakanaSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status
);
2106 fMidNumLetSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status
);
2107 fMidLetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status
);
2108 fMidNumSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status
);
2109 // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
2110 // we should figure out why
2111 fNumericSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status
);
2112 fFormatSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status
);
2113 fExtendNumLetSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status
);
2114 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status
);
2115 fRegionalIndicatorSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status
);
2117 fOtherSet
= new UnicodeSet();
2118 if(U_FAILURE(status
)) {
2119 deferredStatus
= status
;
2123 fOtherSet
->complement();
2124 fOtherSet
->removeAll(*fCRSet
);
2125 fOtherSet
->removeAll(*fLFSet
);
2126 fOtherSet
->removeAll(*fNewlineSet
);
2127 fOtherSet
->removeAll(*fKatakanaSet
);
2128 fOtherSet
->removeAll(*fALetterSet
);
2129 fOtherSet
->removeAll(*fMidLetterSet
);
2130 fOtherSet
->removeAll(*fMidNumSet
);
2131 fOtherSet
->removeAll(*fNumericSet
);
2132 fOtherSet
->removeAll(*fExtendNumLetSet
);
2133 fOtherSet
->removeAll(*fFormatSet
);
2134 fOtherSet
->removeAll(*fExtendSet
);
2135 fOtherSet
->removeAll(*fRegionalIndicatorSet
);
2136 // Inhibit dictionary characters from being tested at all.
2137 fOtherSet
->removeAll(*fDictionaryCjkSet
);
2138 fOtherSet
->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status
));
2140 fSets
->addElement(fCRSet
, status
);
2141 fSets
->addElement(fLFSet
, status
);
2142 fSets
->addElement(fNewlineSet
, status
);
2143 fSets
->addElement(fALetterSet
, status
);
2144 //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana
2145 fSets
->addElement(fMidLetterSet
, status
);
2146 fSets
->addElement(fMidNumLetSet
, status
);
2147 fSets
->addElement(fMidNumSet
, status
);
2148 fSets
->addElement(fNumericSet
, status
);
2149 fSets
->addElement(fFormatSet
, status
);
2150 fSets
->addElement(fExtendSet
, status
);
2151 fSets
->addElement(fOtherSet
, status
);
2152 fSets
->addElement(fExtendNumLetSet
, status
);
2153 fSets
->addElement(fRegionalIndicatorSet
, status
);
2155 if (U_FAILURE(status
)) {
2156 deferredStatus
= status
;
2160 void RBBIWordMonkey::setText(const UnicodeString
&s
) {
2165 int32_t RBBIWordMonkey::next(int32_t prevPos
) {
2166 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2167 // break position being tested. The candidate break
2168 // location is before p2.
2172 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
2174 if (U_FAILURE(deferredStatus
)) {
2178 // Prev break at end of string. return DONE.
2179 if (prevPos
>= fText
->length()) {
2182 p0
= p1
= p2
= p3
= prevPos
;
2183 c3
= fText
->char32At(prevPos
);
2186 // Loop runs once per "significant" character position in the input text.
2188 // Move all of the positions forward in the input string.
2193 // Advance p3 by X(Extend | Format)* Rule 4
2194 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2196 p3
= fText
->moveIndex32(p3
, 1);
2197 c3
= fText
->char32At(p3
);
2198 if (fCRSet
->contains(c2
) || fLFSet
->contains(c2
) || fNewlineSet
->contains(c2
)) {
2202 while (fFormatSet
->contains(c3
) || fExtendSet
->contains(c3
));
2206 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2209 if (p2
== fText
->length()) {
2210 // Reached end of string. Always a break position.
2215 // No Extend or Format characters may appear between the CR and LF,
2216 // which requires the additional check for p2 immediately following p1.
2218 if (c1
==0x0D && c2
==0x0A) {
2222 // Rule (3a) Break before and after newlines (including CR and LF)
2224 if (fCRSet
->contains(c1
) || fLFSet
->contains(c1
) || fNewlineSet
->contains(c1
)) {
2227 if (fCRSet
->contains(c2
) || fLFSet
->contains(c2
) || fNewlineSet
->contains(c2
)) {
2231 // Rule (5). ALetter x ALetter
2232 if (fALetterSet
->contains(c1
) &&
2233 fALetterSet
->contains(c2
)) {
2237 // Rule (6) ALetter x (MidLetter | MidNumLet) ALetter
2239 if ( fALetterSet
->contains(c1
) &&
2240 (fMidLetterSet
->contains(c2
) || fMidNumLetSet
->contains(c2
)) &&
2241 fALetterSet
->contains(c3
)) {
2246 // Rule (7) ALetter (MidLetter | MidNumLet) x ALetter
2247 if (fALetterSet
->contains(c0
) &&
2248 (fMidLetterSet
->contains(c1
) || fMidNumLetSet
->contains(c1
)) &&
2249 fALetterSet
->contains(c2
)) {
2253 // Rule (8) Numeric x Numeric
2254 if (fNumericSet
->contains(c1
) &&
2255 fNumericSet
->contains(c2
)) {
2259 // Rule (9) ALetter x Numeric
2260 if (fALetterSet
->contains(c1
) &&
2261 fNumericSet
->contains(c2
)) {
2265 // Rule (10) Numeric x ALetter
2266 if (fNumericSet
->contains(c1
) &&
2267 fALetterSet
->contains(c2
)) {
2271 // Rule (11) Numeric (MidNum | MidNumLet) x Numeric
2272 if (fNumericSet
->contains(c0
) &&
2273 (fMidNumSet
->contains(c1
) || fMidNumLetSet
->contains(c1
)) &&
2274 fNumericSet
->contains(c2
)) {
2278 // Rule (12) Numeric x (MidNum | MidNumLet) Numeric
2279 if (fNumericSet
->contains(c1
) &&
2280 (fMidNumSet
->contains(c2
) || fMidNumLetSet
->contains(c2
)) &&
2281 fNumericSet
->contains(c3
)) {
2285 // Rule (13) Katakana x Katakana
2286 if (fKatakanaSet
->contains(c1
) &&
2287 fKatakanaSet
->contains(c2
)) {
2292 if ((fALetterSet
->contains(c1
) || fNumericSet
->contains(c1
) ||
2293 fKatakanaSet
->contains(c1
) || fExtendNumLetSet
->contains(c1
)) &&
2294 fExtendNumLetSet
->contains(c2
)) {
2299 if (fExtendNumLetSet
->contains(c1
) &&
2300 (fALetterSet
->contains(c2
) || fNumericSet
->contains(c2
) ||
2301 fKatakanaSet
->contains(c2
))) {
2306 if (fRegionalIndicatorSet
->contains(c1
) && fRegionalIndicatorSet
->contains(c2
)) {
2310 // Rule 14. Break found here.
2319 UVector
*RBBIWordMonkey::charClasses() {
2324 RBBIWordMonkey::~RBBIWordMonkey() {
2329 delete fKatakanaSet
;
2331 delete fMidNumLetSet
;
2332 delete fMidLetterSet
;
2337 delete fExtendNumLetSet
;
2338 delete fRegionalIndicatorSet
;
2339 delete fDictionaryCjkSet
;
2346 //------------------------------------------------------------------------------------------
2348 // class RBBISentMonkey Sentence Break specific implementation
2349 // of RBBIMonkeyKind.
2351 //------------------------------------------------------------------------------------------
2352 class RBBISentMonkey
: public RBBIMonkeyKind
{
2355 virtual ~RBBISentMonkey();
2356 virtual UVector
*charClasses();
2357 virtual void setText(const UnicodeString
&s
);
2358 virtual int32_t next(int32_t i
);
2360 int moveBack(int posFrom
);
2361 int moveForward(int posFrom
);
2362 UChar32
cAt(int pos
);
2366 UnicodeSet
*fSepSet
;
2367 UnicodeSet
*fFormatSet
;
2369 UnicodeSet
*fLowerSet
;
2370 UnicodeSet
*fUpperSet
;
2371 UnicodeSet
*fOLetterSet
;
2372 UnicodeSet
*fNumericSet
;
2373 UnicodeSet
*fATermSet
;
2374 UnicodeSet
*fSContinueSet
;
2375 UnicodeSet
*fSTermSet
;
2376 UnicodeSet
*fCloseSet
;
2377 UnicodeSet
*fOtherSet
;
2378 UnicodeSet
*fExtendSet
;
2380 const UnicodeString
*fText
;
2384 RBBISentMonkey::RBBISentMonkey()
2386 UErrorCode status
= U_ZERO_ERROR
;
2388 fSets
= new UVector(status
);
2390 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2391 // set and made into character classes of their own. For the monkey impl,
2392 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2393 fSepSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status
);
2394 fFormatSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status
);
2395 fSpSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status
);
2396 fLowerSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status
);
2397 fUpperSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status
);
2398 fOLetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status
);
2399 fNumericSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status
);
2400 fATermSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status
);
2401 fSContinueSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status
);
2402 fSTermSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status
);
2403 fCloseSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status
);
2404 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status
);
2405 fOtherSet
= new UnicodeSet();
2407 if(U_FAILURE(status
)) {
2408 deferredStatus
= status
;
2412 fOtherSet
->complement();
2413 fOtherSet
->removeAll(*fSepSet
);
2414 fOtherSet
->removeAll(*fFormatSet
);
2415 fOtherSet
->removeAll(*fSpSet
);
2416 fOtherSet
->removeAll(*fLowerSet
);
2417 fOtherSet
->removeAll(*fUpperSet
);
2418 fOtherSet
->removeAll(*fOLetterSet
);
2419 fOtherSet
->removeAll(*fNumericSet
);
2420 fOtherSet
->removeAll(*fATermSet
);
2421 fOtherSet
->removeAll(*fSContinueSet
);
2422 fOtherSet
->removeAll(*fSTermSet
);
2423 fOtherSet
->removeAll(*fCloseSet
);
2424 fOtherSet
->removeAll(*fExtendSet
);
2426 fSets
->addElement(fSepSet
, status
);
2427 fSets
->addElement(fFormatSet
, status
);
2428 fSets
->addElement(fSpSet
, status
);
2429 fSets
->addElement(fLowerSet
, status
);
2430 fSets
->addElement(fUpperSet
, status
);
2431 fSets
->addElement(fOLetterSet
, status
);
2432 fSets
->addElement(fNumericSet
, status
);
2433 fSets
->addElement(fATermSet
, status
);
2434 fSets
->addElement(fSContinueSet
, status
);
2435 fSets
->addElement(fSTermSet
, status
);
2436 fSets
->addElement(fCloseSet
, status
);
2437 fSets
->addElement(fOtherSet
, status
);
2438 fSets
->addElement(fExtendSet
, status
);
2440 if (U_FAILURE(status
)) {
2441 deferredStatus
= status
;
2447 void RBBISentMonkey::setText(const UnicodeString
&s
) {
2451 UVector
*RBBISentMonkey::charClasses() {
2456 // moveBack() Find the "significant" code point preceding the index i.
2457 // Skips over ($Extend | $Format)* .
2459 int RBBISentMonkey::moveBack(int i
) {
2466 j
= fText
->moveIndex32(j
, -1);
2467 c
= fText
->char32At(j
);
2469 while (j
>0 &&(fFormatSet
->contains(c
) || fExtendSet
->contains(c
)));
2475 int RBBISentMonkey::moveForward(int i
) {
2476 if (i
>=fText
->length()) {
2477 return fText
->length();
2482 j
= fText
->moveIndex32(j
, 1);
2485 while (fFormatSet
->contains(c
) || fExtendSet
->contains(c
));
2489 UChar32
RBBISentMonkey::cAt(int pos
) {
2490 if (pos
<0 || pos
>=fText
->length()) {
2493 return fText
->char32At(pos
);
2497 int32_t RBBISentMonkey::next(int32_t prevPos
) {
2498 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2499 // break position being tested. The candidate break
2500 // location is before p2.
2504 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
2507 if (U_FAILURE(deferredStatus
)) {
2511 // Prev break at end of string. return DONE.
2512 if (prevPos
>= fText
->length()) {
2515 p0
= p1
= p2
= p3
= prevPos
;
2516 c3
= fText
->char32At(prevPos
);
2519 // Loop runs once per "significant" character position in the input text.
2521 // Move all of the positions forward in the input string.
2526 // Advance p3 by X(Extend | Format)* Rule 4
2527 p3
= moveForward(p3
);
2531 if (c1
==0x0d && c2
==0x0a && p2
==(p1
+1)) {
2535 // Rule (4). Sep <break>
2536 if (fSepSet
->contains(c1
)) {
2537 p2
= p1
+1; // Separators don't combine with Extend or Format.
2541 if (p2
>= fText
->length()) {
2542 // Reached end of string. Always a break position.
2546 if (p2
== prevPos
) {
2547 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2551 // Rule (6). ATerm x Numeric
2552 if (fATermSet
->contains(c1
) && fNumericSet
->contains(c2
)) {
2556 // Rule (7). Upper ATerm x Upper
2557 if (fUpperSet
->contains(c0
) && fATermSet
->contains(c1
) && fUpperSet
->contains(c2
)) {
2561 // Rule (8) ATerm Close* Sp* x (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
2562 // Note: STerm | ATerm are added to the negated part of the expression by a
2563 // note to the Unicode 5.0 documents.
2565 while (fSpSet
->contains(cAt(p8
))) {
2568 while (fCloseSet
->contains(cAt(p8
))) {
2571 if (fATermSet
->contains(cAt(p8
))) {
2575 if (c
==-1 || fOLetterSet
->contains(c
) || fUpperSet
->contains(c
) ||
2576 fLowerSet
->contains(c
) || fSepSet
->contains(c
) ||
2577 fATermSet
->contains(c
) || fSTermSet
->contains(c
)) {
2580 p8
= moveForward(p8
);
2582 if (fLowerSet
->contains(cAt(p8
))) {
2587 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2588 if (fSContinueSet
->contains(c2
) || fSTermSet
->contains(c2
) || fATermSet
->contains(c2
)) {
2590 while (fSpSet
->contains(cAt(p8
))) {
2593 while (fCloseSet
->contains(cAt(p8
))) {
2597 if (fSTermSet
->contains(c
) || fATermSet
->contains(c
)) {
2602 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
2604 while (fCloseSet
->contains(cAt(p9
))) {
2608 if ((fSTermSet
->contains(c
) || fATermSet
->contains(c
))) {
2609 if (fCloseSet
->contains(c2
) || fSpSet
->contains(c2
) || fSepSet
->contains(c2
)) {
2614 // Rule (10) (STerm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
2616 while (fSpSet
->contains(cAt(p10
))) {
2617 p10
= moveBack(p10
);
2619 while (fCloseSet
->contains(cAt(p10
))) {
2620 p10
= moveBack(p10
);
2622 if (fSTermSet
->contains(cAt(p10
)) || fATermSet
->contains(cAt(p10
))) {
2623 if (fSpSet
->contains(c2
) || fSepSet
->contains(c2
)) {
2628 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>
2630 if (fSepSet
->contains(cAt(p11
))) {
2631 p11
= moveBack(p11
);
2633 while (fSpSet
->contains(cAt(p11
))) {
2634 p11
= moveBack(p11
);
2636 while (fCloseSet
->contains(cAt(p11
))) {
2637 p11
= moveBack(p11
);
2639 if (fSTermSet
->contains(cAt(p11
)) || fATermSet
->contains(cAt(p11
))) {
2643 // Rule (12) Any x Any
2650 RBBISentMonkey::~RBBISentMonkey() {
2660 delete fSContinueSet
;
2669 //-------------------------------------------------------------------------------------------
2673 //-------------------------------------------------------------------------------------------
2675 class RBBILineMonkey
: public RBBIMonkeyKind
{
2678 virtual ~RBBILineMonkey();
2679 virtual UVector
*charClasses();
2680 virtual void setText(const UnicodeString
&s
);
2681 virtual int32_t next(int32_t i
);
2682 virtual void rule9Adjust(int32_t pos
, UChar32
*posChar
, int32_t *nextPos
, UChar32
*nextChar
);
2727 BreakIterator
*fCharBI
;
2729 const UnicodeString
*fText
;
2730 int32_t *fOrigPositions
;
2732 RegexMatcher
*fNumberMatcher
;
2733 RegexMatcher
*fLB11Matcher
;
2737 RBBILineMonkey::RBBILineMonkey()
2739 UErrorCode status
= U_ZERO_ERROR
;
2741 fSets
= new UVector(status
);
2743 fBK
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status
);
2744 fCR
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status
);
2745 fLF
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status
);
2746 fCM
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status
);
2747 fNL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status
);
2748 fWJ
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status
);
2749 fZW
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status
);
2750 fGL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status
);
2751 fCB
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status
);
2752 fSP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status
);
2753 fB2
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status
);
2754 fBA
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status
);
2755 fBB
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status
);
2756 fHY
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status
);
2757 fH2
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status
);
2758 fH3
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status
);
2759 fCL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status
);
2760 fCP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status
);
2761 fEX
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status
);
2762 fIN
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status
);
2763 fJL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status
);
2764 fJV
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status
);
2765 fJT
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status
);
2766 fNS
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status
);
2767 fOP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status
);
2768 fQU
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status
);
2769 fIS
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status
);
2770 fNU
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status
);
2771 fPO
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status
);
2772 fPR
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status
);
2773 fSY
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status
);
2774 fAI
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status
);
2775 fAL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status
);
2776 fCJ
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status
);
2777 fHL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status
);
2778 fID
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status
);
2779 fRI
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status
);
2780 fSA
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status
);
2781 fSG
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status
);
2782 fXX
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status
);
2784 if (U_FAILURE(status
)) {
2785 deferredStatus
= status
;
2787 fNumberMatcher
= NULL
;
2791 fAL
->addAll(*fXX
); // Default behavior for XX is identical to AL
2792 fAL
->addAll(*fAI
); // Default behavior for AI is identical to AL
2793 fAL
->addAll(*fSA
); // Default behavior for SA is XX, which defaults to AL
2794 fAL
->addAll(*fSG
); // Default behavior for SG is identical to AL.
2796 fNS
->addAll(*fCJ
); // Default behavior for CJ is identical to NS.
2798 fSets
->addElement(fBK
, status
);
2799 fSets
->addElement(fCR
, status
);
2800 fSets
->addElement(fLF
, status
);
2801 fSets
->addElement(fCM
, status
);
2802 fSets
->addElement(fNL
, status
);
2803 fSets
->addElement(fWJ
, status
);
2804 fSets
->addElement(fZW
, status
);
2805 fSets
->addElement(fGL
, status
);
2806 fSets
->addElement(fCB
, status
);
2807 fSets
->addElement(fSP
, status
);
2808 fSets
->addElement(fB2
, status
);
2809 fSets
->addElement(fBA
, status
);
2810 fSets
->addElement(fBB
, status
);
2811 fSets
->addElement(fHY
, status
);
2812 fSets
->addElement(fH2
, status
);
2813 fSets
->addElement(fH3
, status
);
2814 fSets
->addElement(fCL
, status
);
2815 fSets
->addElement(fCP
, status
);
2816 fSets
->addElement(fEX
, status
);
2817 fSets
->addElement(fIN
, status
);
2818 fSets
->addElement(fJL
, status
);
2819 fSets
->addElement(fJT
, status
);
2820 fSets
->addElement(fJV
, status
);
2821 fSets
->addElement(fNS
, status
);
2822 fSets
->addElement(fOP
, status
);
2823 fSets
->addElement(fQU
, status
);
2824 fSets
->addElement(fIS
, status
);
2825 fSets
->addElement(fNU
, status
);
2826 fSets
->addElement(fPO
, status
);
2827 fSets
->addElement(fPR
, status
);
2828 fSets
->addElement(fSY
, status
);
2829 fSets
->addElement(fAI
, status
);
2830 fSets
->addElement(fAL
, status
);
2831 fSets
->addElement(fHL
, status
);
2832 fSets
->addElement(fID
, status
);
2833 fSets
->addElement(fWJ
, status
);
2834 fSets
->addElement(fRI
, status
);
2835 fSets
->addElement(fSA
, status
);
2836 fSets
->addElement(fSG
, status
);
2839 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
2840 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
2841 "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
2842 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
2843 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
2844 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
2846 fNumberMatcher
= new RegexMatcher(
2847 UnicodeString(rules
, -1, US_INV
), 0, status
);
2849 fCharBI
= BreakIterator::createCharacterInstance(Locale::getEnglish(), status
);
2851 if (U_FAILURE(status
)) {
2852 deferredStatus
= status
;
2857 void RBBILineMonkey::setText(const UnicodeString
&s
) {
2859 fCharBI
->setText(s
);
2860 fNumberMatcher
->reset(s
);
2865 // Line Break TR rules 9 and 10 implementation.
2866 // This deals with combining marks and other sequences that
2867 // must be treated as if they were something other than what they actually are.
2869 // This is factored out into a separate function because it must be applied twice for
2870 // each potential break, once to the chars before the position being checked, then
2871 // again to the text following the possible break.
2873 void RBBILineMonkey::rule9Adjust(int32_t pos
, UChar32
*posChar
, int32_t *nextPos
, UChar32
*nextChar
) {
2875 // Invalid initial position. Happens during the warmup iteration of the
2876 // main loop in next().
2880 int32_t nPos
= *nextPos
;
2882 // LB 9 Keep combining sequences together.
2883 // advance over any CM class chars. Note that Line Break CM is different
2884 // from the normal Grapheme Extend property.
2885 if (!(fSP
->contains(*posChar
) || fBK
->contains(*posChar
) || *posChar
==0x0d ||
2886 *posChar
==0x0a ||fNL
->contains(*posChar
) || fZW
->contains(*posChar
))) {
2888 *nextChar
= fText
->char32At(nPos
);
2889 if (!fCM
->contains(*nextChar
)) {
2892 nPos
= fText
->moveIndex32(nPos
, 1);
2897 // LB 9 Treat X CM* as if it were X.
2898 // No explicit action required.
2900 // LB 10 Treat any remaining combining mark as AL
2901 if (fCM
->contains(*posChar
)) {
2902 *posChar
= 0x41; // thisChar = 'A';
2905 // Push the updated nextPos and nextChar back to our caller.
2906 // This only makes a difference if posChar got bigger by consuming a
2907 // combining sequence.
2909 *nextChar
= fText
->char32At(nPos
);
2914 int32_t RBBILineMonkey::next(int32_t startPos
) {
2915 UErrorCode status
= U_ZERO_ERROR
;
2916 int32_t pos
; // Index of the char following a potential break position
2917 UChar32 thisChar
; // Character at above position "pos"
2919 int32_t prevPos
; // Index of the char preceding a potential break position
2920 UChar32 prevChar
; // Character at above position. Note that prevChar
2921 // and thisChar may not be adjacent because combining
2922 // characters between them will be ignored.
2924 int32_t prevPosX2
; // Second previous character. Wider context for LB21a.
2927 int32_t nextPos
; // Index of the next character following pos.
2928 // Usually skips over combining marks.
2929 int32_t nextCPPos
; // Index of the code point following "pos."
2930 // May point to a combining mark.
2931 int32_t tPos
; // temp value.
2934 if (U_FAILURE(deferredStatus
)) {
2938 if (startPos
>= fText
->length()) {
2943 // Initial values for loop. Loop will run the first time without finding breaks,
2944 // while the invalid values shift out and the "this" and
2945 // "prev" positions are filled in with good values.
2946 pos
= prevPos
= prevPosX2
= -1; // Invalid value, serves as flag for initial loop iteration.
2947 thisChar
= prevChar
= prevCharX2
= 0;
2948 nextPos
= nextCPPos
= startPos
;
2951 // Loop runs once per position in the test text, until a break position is found.
2954 prevPosX2
= prevPos
;
2955 prevCharX2
= prevChar
;
2958 prevChar
= thisChar
;
2961 thisChar
= fText
->char32At(pos
);
2963 nextCPPos
= fText
->moveIndex32(pos
, 1);
2964 nextPos
= nextCPPos
;
2966 // Rule LB2 - Break at end of text.
2967 if (pos
>= fText
->length()) {
2971 // Rule LB 9 - adjust for combining sequences.
2972 // We do this one out-of-order because the adjustment does not change anything
2973 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to be applied.
2975 rule9Adjust(prevPos
, &prevChar
, &pos
, &thisChar
);
2976 nextCPPos
= nextPos
= fText
->moveIndex32(pos
, 1);
2977 c
= fText
->char32At(nextPos
);
2978 rule9Adjust(pos
, &thisChar
, &nextPos
, &c
);
2980 // If the loop is still warming up - if we haven't shifted the initial
2981 // -1 positions out of prevPos yet - loop back to advance the
2982 // position in the input without any further looking for breaks.
2983 if (prevPos
== -1) {
2987 // LB 4 Always break after hard line breaks,
2988 if (fBK
->contains(prevChar
)) {
2992 // LB 5 Break after CR, LF, NL, but not inside CR LF
2993 if (prevChar
== 0x0d && thisChar
== 0x0a) {
2996 if (prevChar
== 0x0d ||
3002 // LB 6 Don't break before hard line breaks
3003 if (thisChar
== 0x0d || thisChar
== 0x0a || thisChar
== 0x85 ||
3004 fBK
->contains(thisChar
)) {
3009 // LB 7 Don't break before spaces or zero-width space.
3010 if (fSP
->contains(thisChar
)) {
3014 if (fZW
->contains(thisChar
)) {
3018 // LB 8 Break after zero width space
3019 if (fZW
->contains(prevChar
)) {
3023 // LB 9, 10 Already done, at top of loop.
3027 // LB 11 Do not break before or after WORD JOINER and related characters.
3031 if (fWJ
->contains(thisChar
) || fWJ
->contains(prevChar
)) {
3037 if (fGL
->contains(prevChar
)) {
3043 if (!(fSP
->contains(prevChar
) ||
3044 fBA
->contains(prevChar
) ||
3045 fHY
->contains(prevChar
) ) && fGL
->contains(thisChar
)) {
3051 // LB 13 Don't break before closings.
3052 // NU x CL, NU x CP and NU x IS are not matched here so that they will
3053 // fall into LB 17 and the more general number regular expression.
3055 if ((!fNU
->contains(prevChar
) && fCL
->contains(thisChar
)) ||
3056 (!fNU
->contains(prevChar
) && fCP
->contains(thisChar
)) ||
3057 fEX
->contains(thisChar
) ||
3058 (!fNU
->contains(prevChar
) && fIS
->contains(thisChar
)) ||
3059 (!fNU
->contains(prevChar
) && fSY
->contains(thisChar
))) {
3063 // LB 14 Don't break after OP SP*
3064 // Scan backwards, checking for this sequence.
3065 // The OP char could include combining marks, so we actually check for OP CM* SP*.
3067 // Another Twist: The Rule 67 fixes may have changed a SP CM
3068 // sequence into an ID char, so before scanning back through spaces,
3069 // verify that prevChar is indeed a space. The prevChar variable
3070 // may differ from fText[prevPos]
3072 if (fSP
->contains(prevChar
)) {
3073 while (tPos
> 0 && fSP
->contains(fText
->char32At(tPos
))) {
3074 tPos
=fText
->moveIndex32(tPos
, -1);
3077 while (tPos
> 0 && fCM
->contains(fText
->char32At(tPos
))) {
3078 tPos
=fText
->moveIndex32(tPos
, -1);
3080 if (fOP
->contains(fText
->char32At(tPos
))) {
3085 // LB 15 QU SP* x OP
3086 if (fOP
->contains(thisChar
)) {
3087 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3089 while (tPos
>0 && fSP
->contains(fText
->char32At(tPos
))) {
3090 tPos
= fText
->moveIndex32(tPos
, -1);
3092 while (tPos
>0 && fCM
->contains(fText
->char32At(tPos
))) {
3093 tPos
= fText
->moveIndex32(tPos
, -1);
3095 if (fQU
->contains(fText
->char32At(tPos
))) {
3102 // LB 16 (CL | CP) SP* x NS
3103 // Scan backwards for SP* CM* (CL | CP)
3104 if (fNS
->contains(thisChar
)) {
3106 while (tPos
>0 && fSP
->contains(fText
->char32At(tPos
))) {
3107 tPos
= fText
->moveIndex32(tPos
, -1);
3109 while (tPos
>0 && fCM
->contains(fText
->char32At(tPos
))) {
3110 tPos
= fText
->moveIndex32(tPos
, -1);
3112 if (fCL
->contains(fText
->char32At(tPos
)) || fCP
->contains(fText
->char32At(tPos
))) {
3118 // LB 17 B2 SP* x B2
3119 if (fB2
->contains(thisChar
)) {
3120 // Scan backwards, checking for the B2 CM* SP* sequence.
3122 if (fSP
->contains(prevChar
)) {
3123 while (tPos
> 0 && fSP
->contains(fText
->char32At(tPos
))) {
3124 tPos
=fText
->moveIndex32(tPos
, -1);
3127 while (tPos
> 0 && fCM
->contains(fText
->char32At(tPos
))) {
3128 tPos
=fText
->moveIndex32(tPos
, -1);
3130 if (fB2
->contains(fText
->char32At(tPos
))) {
3136 // LB 18 break after space
3137 if (fSP
->contains(prevChar
)) {
3144 if (fQU
->contains(thisChar
) || fQU
->contains(prevChar
)) {
3148 // LB 20 Break around a CB
3149 if (fCB
->contains(thisChar
) || fCB
->contains(prevChar
)) {
3154 if (fBA
->contains(thisChar
) ||
3155 fHY
->contains(thisChar
) ||
3156 fNS
->contains(thisChar
) ||
3157 fBB
->contains(prevChar
) ) {
3163 if (fHL
->contains(prevCharX2
) &&
3164 (fHY
->contains(prevChar
) || fBA
->contains(prevChar
))) {
3168 // LB 21b - Added for Apple 13927604
3169 if (fSY
->contains(prevChar
) && fHL
->contains(thisChar
)) {
3174 if ((fAL
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3175 (fHL
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3176 (fID
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3177 (fIN
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3178 (fNU
->contains(prevChar
) && fIN
->contains(thisChar
)) ) {
3187 if ((fID
->contains(prevChar
) && fPO
->contains(thisChar
)) ||
3188 (fAL
->contains(prevChar
) && fNU
->contains(thisChar
)) ||
3189 (fHL
->contains(prevChar
) && fNU
->contains(thisChar
)) ||
3190 (fNU
->contains(prevChar
) && fAL
->contains(thisChar
)) ||
3191 (fNU
->contains(prevChar
) && fHL
->contains(thisChar
)) ) {
3195 // LB 24 Do not break between prefix and letters or ideographs.
3199 if ((fPR
->contains(prevChar
) && fID
->contains(thisChar
)) ||
3200 (fPR
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) ||
3201 (fPO
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
)))) {
3208 if (fNumberMatcher
->lookingAt(prevPos
, status
)) {
3209 if (U_FAILURE(status
)) {
3212 // Matched a number. But could have been just a single digit, which would
3213 // not represent a "no break here" between prevChar and thisChar
3214 int32_t numEndIdx
= fNumberMatcher
->end(status
); // idx of first char following num
3215 if (numEndIdx
> pos
) {
3216 // Number match includes at least our two chars being checked
3217 if (numEndIdx
> nextPos
) {
3218 // Number match includes additional chars. Update pos and nextPos
3219 // so that next loop iteration will continue at the end of the number,
3220 // checking for breaks between last char in number & whatever follows.
3221 pos
= nextPos
= numEndIdx
;
3223 pos
= fText
->moveIndex32(pos
, -1);
3224 thisChar
= fText
->char32At(pos
);
3225 } while (fCM
->contains(thisChar
));
3232 // LB 26 Do not break a Korean syllable.
3233 if (fJL
->contains(prevChar
) && (fJL
->contains(thisChar
) ||
3234 fJV
->contains(thisChar
) ||
3235 fH2
->contains(thisChar
) ||
3236 fH3
->contains(thisChar
))) {
3240 if ((fJV
->contains(prevChar
) || fH2
->contains(prevChar
)) &&
3241 (fJV
->contains(thisChar
) || fJT
->contains(thisChar
))) {
3245 if ((fJT
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3246 fJT
->contains(thisChar
)) {
3250 // LB 27 Treat a Korean Syllable Block the same as ID.
3251 if ((fJL
->contains(prevChar
) || fJV
->contains(prevChar
) ||
3252 fJT
->contains(prevChar
) || fH2
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3253 fIN
->contains(thisChar
)) {
3256 if ((fJL
->contains(prevChar
) || fJV
->contains(prevChar
) ||
3257 fJT
->contains(prevChar
) || fH2
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3258 fPO
->contains(thisChar
)) {
3261 if (fPR
->contains(prevChar
) && (fJL
->contains(thisChar
) || fJV
->contains(thisChar
) ||
3262 fJT
->contains(thisChar
) || fH2
->contains(thisChar
) || fH3
->contains(thisChar
))) {
3268 // LB 28 Do not break between alphabetics ("at").
3269 if ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
)) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) {
3273 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
3274 if (fIS
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) {
3278 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3281 if ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
) || fNU
->contains(prevChar
)) && fOP
->contains(thisChar
)) {
3284 if (fCP
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
) || fNU
->contains(thisChar
))) {
3288 // LB30a Do not break between regional indicators.
3290 if (fRI
->contains(prevChar
) && fRI
->contains(thisChar
)) {
3294 // LB 31 Break everywhere else
3303 UVector
*RBBILineMonkey::charClasses() {
3308 RBBILineMonkey::~RBBILineMonkey() {
3353 delete fNumberMatcher
;
3357 //-------------------------------------------------------------------------------------------
3362 // seed=nnnnn Random number starting seed.
3363 // Setting the seed allows errors to be reproduced.
3364 // loop=nnn Looping count. Controls running time.
3366 // 0 or greater: run length.
3368 // type = char | word | line | sent | title
3370 //-------------------------------------------------------------------------------------------
3372 static int32_t getIntParam(UnicodeString name
, UnicodeString
&params
, int32_t defaultVal
) {
3373 int32_t val
= defaultVal
;
3374 name
.append(" *= *(-?\\d+)");
3375 UErrorCode status
= U_ZERO_ERROR
;
3376 RegexMatcher
m(name
, params
, 0, status
);
3378 // The param exists. Convert the string to an int.
3379 char valString
[100];
3380 int32_t paramLength
= m
.end(1, status
) - m
.start(1, status
);
3381 if (paramLength
>= (int32_t)(sizeof(valString
)-1)) {
3382 paramLength
= (int32_t)(sizeof(valString
)-2);
3384 params
.extract(m
.start(1, status
), paramLength
, valString
, sizeof(valString
));
3385 val
= strtol(valString
, NULL
, 10);
3387 // Delete this parameter from the params string.
3389 params
= m
.replaceFirst("", status
);
3391 U_ASSERT(U_SUCCESS(status
));
3396 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3397 static void testBreakBoundPreceding(RBBITest
*test
, UnicodeString ustr
,
3406 for (i
= bi
->first(); i
!= BreakIterator::DONE
; i
= bi
->next()) {
3408 if (count
< expectedcount
&& expected
[count
] != i
) {
3409 test
->errln("break forward test failed: expected %d but got %d",
3410 expected
[count
], i
);
3415 if (count
!= expectedcount
) {
3416 printStringBreaks(ustr
, expected
, expectedcount
);
3417 test
->errln("break forward test failed: missed %d match",
3418 expectedcount
- count
);
3421 // testing boundaries
3422 for (i
= 1; i
< expectedcount
; i
++) {
3423 int j
= expected
[i
- 1];
3424 if (!bi
->isBoundary(j
)) {
3425 printStringBreaks(ustr
, expected
, expectedcount
);
3426 test
->errln("isBoundary() failed. Expected boundary at position %d", j
);
3429 for (j
= expected
[i
- 1] + 1; j
< expected
[i
]; j
++) {
3430 if (bi
->isBoundary(j
)) {
3431 printStringBreaks(ustr
, expected
, expectedcount
);
3432 test
->errln("isBoundary() failed. Not expecting boundary at position %d", j
);
3438 for (i
= bi
->last(); i
!= BreakIterator::DONE
; i
= bi
->previous()) {
3440 if (forward
[count
] != i
) {
3441 printStringBreaks(ustr
, expected
, expectedcount
);
3442 test
->errln("happy break test previous() failed: expected %d but got %d",
3448 printStringBreaks(ustr
, expected
, expectedcount
);
3449 test
->errln("break test previous() failed: missed a match");
3453 // testing preceding
3454 for (i
= 0; i
< expectedcount
- 1; i
++) {
3455 // int j = expected[i] + 1;
3456 int j
= ustr
.moveIndex32(expected
[i
], 1);
3457 for (; j
<= expected
[i
+ 1]; j
++) {
3458 if (bi
->preceding(j
) != expected
[i
]) {
3459 printStringBreaks(ustr
, expected
, expectedcount
);
3460 test
->errln("preceding(): Not expecting boundary at position %d", j
);
// TestWordBreaks - checks the English word break iterator against the
// RBBIWordMonkey reference implementation on a fixed list of escaped strings.
// For every entry the expected boundaries are collected from monkey.next()
// and handed, together with the iterator, to testBreakBoundPreceding().
// The body is compiled only when regular expressions are configured in.
// NOTE(review): the declarations of loop, i and expected are not visible in
// this listing; presumably they sit on lines that were dropped - confirm
// against the upstream file.
3468 void RBBITest::TestWordBreaks(void)
3470 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3472 Locale
locale("en");
3473 UErrorCode status
= U_ZERO_ERROR
;
3474 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
// Iterator under test: word boundaries for the en locale.
3475 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
3476 // Replaced any C+J characters in a row with a random sequence of characters
3477 // of the same length to make our C+J segmentation not get in the way.
// Test inputs, stored with doubled backslashes so the escapes survive the
// compiler and are expanded later by CharsToUnicodeString().
// NOTE(review): the entry that starts with U+1D7F2 and is followed by a
// capital-U escape with only seven hex digits looks malformed; the entry two
// lines below it appears to be the well-formed variant. Presumably kept on
// purpose - confirm.
3478 static const char *strlist
[] =
3480 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3481 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3482 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3483 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3484 "\\uac00\\u3588\\u009c\\u0953\\u194b",
3485 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3486 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3487 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3488 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3489 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3490 "\\u2027\\U000e0067\\u0a47\\u00b7",
3491 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3492 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3493 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3494 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3495 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3496 "\\u0027\\u11af\\U000e0057\\u0602",
3497 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3498 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3499 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3500 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3501 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3502 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3503 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3504 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3505 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3506 "\\u18f4\\U000e0049\\u20e7\\u2027",
3507 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3508 "\\ua183\\u102d\\u0bec\\u003a",
3509 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3510 "\\u003a\\u0e57\\u0fad\\u002e",
3511 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3512 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3513 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3514 "\\u003a\\u0664\\u00b7\\u1fba",
3515 "\\u003b\\u0027\\u00b7\\u47a3",
3516 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3517 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3518 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
// Report a failed iterator creation through errcheckln.
3521 if (U_FAILURE(status
)) {
3522 errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
));
// One pass per test string; the element count is derived from the array size.
3525 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
3526 // printf("looping %d\n", loop);
3527 UnicodeString ustr
= CharsToUnicodeString(strlist
[loop
]);
3528 // RBBICharMonkey monkey;
3529 RBBIWordMonkey monkey
;
3532 int expectedcount
= 0;
3534 monkey
.setText(ustr
);
// Collect the reference boundaries: start at 0 and follow monkey.next()
// until it reports BreakIterator::DONE.
3536 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
3537 expected
[expectedcount
++] = i
;
// Compare the real iterator with the reference boundaries.
3540 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
// TestWordBoundary - consistency check between next() and isBoundary() for
// the English word break iterator. For each test string the boundaries
// returned by first()/next() are recorded in forward[]; every offset strictly
// between two consecutive boundaries must be rejected by isBoundary(), and
// every reported boundary must be accepted by it.
// NOTE(review): the declarations of loop, str, forward, count, prev, i and j,
// the call that attaches ustr to the iterator, and the trailing errln
// arguments are not visible in this listing - confirm against upstream.
3546 void RBBITest::TestWordBoundary(void)
3548 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3549 Locale
locale("en");
3550 UErrorCode status
= U_ZERO_ERROR
;
3551 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
// Iterator under test: word boundaries for the en locale.
3552 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
// Test inputs with doubled backslashes; they are expanded by u_unescape()
// in the loop below.
// NOTE(review): one entry carries a capital-U escape with only seven hex
// digits; the entry two lines after it looks like the well-formed variant.
// Presumably intentional - confirm.
3554 static const char *strlist
[] =
3556 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3557 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3558 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3559 "\\u2027\\U000e0067\\u0a47\\u00b7",
3560 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3561 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3562 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3563 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3564 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3565 "\\u0027\\u11af\\U000e0057\\u0602",
3566 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3567 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3568 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3569 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3570 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3571 "\\U000e0065\\u302c\\u09ee\\U000e0068",
3572 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3573 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3574 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3575 "\\u58f4\\U000e0049\\u20e7\\u2027",
3576 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3577 "\\ua183\\u102d\\u0bec\\u003a",
3578 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3579 "\\u003a\\u0e57\\u0fad\\u002e",
3580 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3581 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3582 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3583 "\\u003a\\u0664\\u00b7\\u1fba",
3584 "\\u003b\\u0027\\u00b7\\u47a3",
// Report a failed iterator creation through errcheckln.
3587 if (U_FAILURE(status
)) {
3588 errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
));
// One pass per test string; the element count is derived from the array size.
3591 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
3592 // printf("looping %d\n", loop);
// 20 is the destination capacity handed to u_unescape; presumably str is
// declared with at least that many UChars - TODO confirm, the declaration
// is not visible here.
3593 u_unescape(strlist
[loop
], str
, 20);
3594 UnicodeString
ustr(str
);
// Walk forward over all boundaries, remembering each one in forward[].
3601 for (i
= bi
->first(); i
!= BreakIterator::DONE
; i
= bi
->next()) {
3602 forward
[count
++] = i
;
// Offsets strictly between the previous boundary and this one must not be
// reported as boundaries.
3605 for (j
= prev
+ 1; j
< i
; j
++) {
3606 if (bi
->isBoundary(j
)) {
3607 printStringBreaks(ustr
, forward
, count
);
3608 errln("happy boundary test failed: expected %d not a boundary",
// The boundary just returned by the iterator must itself pass isBoundary().
3614 if (!bi
->isBoundary(i
)) {
3615 printStringBreaks(ustr
, forward
, count
);
3616 errln("happy boundary test failed: expected %d a boundary",
// TestLineBreaks - checks the English line break iterator against the
// RBBILineMonkey reference implementation on a fixed list of escaped strings.
// Each string is expanded with u_unescape() into a buffer of STRSIZE units,
// the reference boundaries are gathered from monkey.next() (at most
// EXPECTEDSIZE of them) and the result is verified by
// testBreakBoundPreceding(). Compiled only when regular expressions are
// configured in.
// NOTE(review): the declarations of loop, str and i, and the statements
// inside several of the if-bodies, are not visible in this listing - confirm
// against upstream.
3626 void RBBITest::TestLineBreaks(void)
3628 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3629 Locale
locale("en");
3630 UErrorCode status
= U_ZERO_ERROR
;
// Iterator under test: line boundaries for the en locale.
3631 BreakIterator
*bi
= BreakIterator::createLineInstance(locale
, status
);
// Capacity passed to u_unescape for each expanded test string.
3632 const int32_t STRSIZE
= 50;
// Test inputs with doubled backslashes. Some entries are built from several
// adjacent literals, and in a few of them an escape is split across the
// literal boundary (a trailing backslash pair followed by a literal that
// starts with the escape letter).
3634 static const char *strlist
[] =
3636 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3637 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3638 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3639 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3640 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3641 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3642 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3643 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3644 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3645 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3646 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
3647 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3648 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3649 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3650 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3651 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3652 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3653 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3654 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3655 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3656 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3657 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3658 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3659 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3660 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3661 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3662 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
3663 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3664 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3665 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3666 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3667 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3668 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
3669 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3670 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3671 "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
3672 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3673 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3674 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3675 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3676 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3677 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3678 "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
3679 "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
3680 "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
3681 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3682 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
// Iterator creation is checked twice: once to log through the assert macro
// and once to guard the rest of the test.
3685 TEST_ASSERT_SUCCESS(status
);
3686 if (U_FAILURE(status
)) {
// One pass per test string; the element count is derived from the array size.
3689 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
3690 // printf("looping %d\n", loop);
// t receives the return value of u_unescape; how it is checked is not
// visible in this listing.
3691 int32_t t
= u_unescape(strlist
[loop
], str
, STRSIZE
);
3698 UnicodeString
ustr(str
);
3699 RBBILineMonkey monkey
;
// The monkey object records construction problems in deferredStatus.
3700 if (U_FAILURE(monkey
.deferredStatus
)) {
// Upper bound on the number of reference boundaries kept per string.
3704 const int EXPECTEDSIZE
= 50;
3705 int expected
[EXPECTEDSIZE
];
3706 int expectedcount
= 0;
3708 monkey
.setText(ustr
);
// Collect the reference boundaries: start at 0 and follow monkey.next()
// until it reports BreakIterator::DONE.
3710 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
// Guard against writing past expected[]; the assert logs the overflow.
3711 if (expectedcount
>= EXPECTEDSIZE
) {
3712 TEST_ASSERT(expectedcount
< EXPECTEDSIZE
);
3715 expected
[expectedcount
++] = i
;
// Compare the real iterator with the reference boundaries.
3718 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
// TestSentBreaks - checks the English sentence break iterator against the
// RBBISentMonkey reference implementation. The inputs mix plain ASCII text
// (with real control characters) and strings of doubled-backslash escapes;
// all of them go through u_unescape() before use. Reference boundaries come
// from monkey.next() (at most EXPECTEDSIZE of them) and are verified by
// testBreakBoundPreceding(). Compiled only when regular expressions are
// configured in.
// NOTE(review): the declarations of loop, str and i, and the statements
// inside several of the if-bodies, are not visible in this listing - confirm
// against upstream.
3724 void RBBITest::TestSentBreaks(void)
3726 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3727 Locale
locale("en");
3728 UErrorCode status
= U_ZERO_ERROR
;
// Iterator under test: sentence boundaries for the en locale.
3729 BreakIterator
*bi
= BreakIterator::createSentenceInstance(locale
, status
);
// Test inputs. The last two entries are each assembled from several adjacent
// string literals.
3731 static const char *strlist
[] =
3733 "Now\ris\nthe\r\ntime\n\rfor\r\r",
3735 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3736 "\"Sentence ending with a quote.\" Bye.",
3737 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
3738 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3739 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3740 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3741 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3742 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3743 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3744 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3745 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3746 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3747 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3748 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3749 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3750 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3751 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3752 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
// Report a failed iterator creation through errcheckln.
3755 if (U_FAILURE(status
)) {
3756 errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
));
// One pass per test string; the element count is derived from the array size.
3759 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
// The destination capacity is computed from the str array itself, so it
// follows whatever size str is declared with.
3760 u_unescape(strlist
[loop
], str
, (int32_t)(sizeof(str
) / sizeof(str
[0])));
3761 UnicodeString
ustr(str
);
3763 RBBISentMonkey monkey
;
// The monkey object records construction problems in deferredStatus.
3764 if (U_FAILURE(monkey
.deferredStatus
)) {
// Upper bound on the number of reference boundaries kept per string.
3768 const int EXPECTEDSIZE
= 50;
3769 int expected
[EXPECTEDSIZE
];
3770 int expectedcount
= 0;
3772 monkey
.setText(ustr
);
// Collect the reference boundaries: start at 0 and follow monkey.next()
// until it reports BreakIterator::DONE.
3774 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
// Guard against writing past expected[]; the assert logs the overflow.
3775 if (expectedcount
>= EXPECTEDSIZE
) {
3776 TEST_ASSERT(expectedcount
< EXPECTEDSIZE
);
3779 expected
[expectedcount
++] = i
;
// Compare the real iterator with the reference boundaries.
3782 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
// TestMonkey - entry point for the randomized break iterator tests.
//    params - optional option string. The options read here are loop and
//             seed (via getIntParam), type=char|word|line|sent|title and
//             utext (both via RegexMatcher). Each recognized option is
//             removed from the string; anything left over that is not
//             white space is reported as an unrecognized parameter.
// Defaults visible here: 500 iterations, break type all, locale en, no UText.
// For every selected break type a break iterator is created and handed to
// RunMonkey() together with a monkey object m; a creation failure is reported
// through errcheckln with a message that names the break type.
// Compiled only when regular expressions are configured in.
// NOTE(review): the declarations of seed, buf and the per-type monkey objects,
// the body of the quick == FALSE branch and several control statements are not
// visible in this listing - confirm against upstream.
3788 void RBBITest::TestMonkey(char *params
) {
3789 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3791 UErrorCode status
= U_ZERO_ERROR
;
3792 int32_t loopCount
= 500;
3794 UnicodeString breakType
= "all";
3795 Locale
locale("en");
3796 UBool useUText
= FALSE
;
3798 if (quick
== FALSE
) {
// Option parsing: numeric options first, then the break type, then utext.
3803 UnicodeString
p(params
);
3804 loopCount
= getIntParam("loop", p
, loopCount
);
3805 seed
= getIntParam("seed", p
, seed
);
3807 RegexMatcher
m(" *type *= *(char|word|line|sent|title) *", p
, 0, status
);
3809 breakType
= m
.group(1, status
);
// Strip the type option from the remaining option text.
3811 p
= m
.replaceFirst("", status
);
3814 RegexMatcher
u(" *utext", p
, 0, status
);
// Strip the utext option from the remaining option text.
3818 p
= u
.replaceFirst("", status
);
3823 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p
, 0, status
).find()) {
3824 // Each option is stripped out of the option string as it is processed.
3825 // All options have been checked. The option string should have been completely emptied..
3827 p
.extract(buf
, sizeof(buf
), NULL
, status
);
// Force termination in case extract filled the whole buffer.
3828 buf
[sizeof(buf
)-1] = 0;
3829 errln("Unrecognized or extra parameter: %s\n", buf
);
// Character breaks.
3835 if (breakType
== "char" || breakType
== "all") {
3837 BreakIterator
*bi
= BreakIterator::createCharacterInstance(locale
, status
);
3838 if (U_SUCCESS(status
)) {
3839 RunMonkey(bi
, m
, "char", seed
, loopCount
, useUText
);
3840 if (breakType
== "all" && useUText
==FALSE
) {
3841 // Also run a quick test with UText when "all" is specified
3842 RunMonkey(bi
, m
, "char", seed
, loopCount
, TRUE
);
3846 errcheckln(status
, "Creation of character break iterator failed %s", u_errorName(status
));
// Word breaks.
3851 if (breakType
== "word" || breakType
== "all") {
3852 logln("Word Break Monkey Test");
3854 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
3855 if (U_SUCCESS(status
)) {
3856 RunMonkey(bi
, m
, "word", seed
, loopCount
, useUText
);
3859 errcheckln(status
, "Creation of word break iterator failed %s", u_errorName(status
));
// Line breaks. The iteration count is cut to a fifth when it is at least 10.
3864 if (breakType
== "line" || breakType
== "all") {
3865 logln("Line Break Monkey Test");
3867 BreakIterator
*bi
= BreakIterator::createLineInstance(locale
, status
);
3868 if (loopCount
>= 10) {
3869 loopCount
= loopCount
/ 5; // Line break runs slower than the others.
3871 if (U_SUCCESS(status
)) {
3872 RunMonkey(bi
, m
, "line", seed
, loopCount
, useUText
);
3875 errcheckln(status
, "Creation of line break iterator failed %s", u_errorName(status
));
// Sentence breaks. The iteration count is cut to a tenth when it is at least
// 10; this acts on the same loopCount variable that the line section may
// already have reduced.
3880 if (breakType
== "sent" || breakType
== "all" ) {
3881 logln("Sentence Break Monkey Test");
3883 BreakIterator
*bi
= BreakIterator::createSentenceInstance(locale
, status
);
3884 if (loopCount
>= 10) {
3885 loopCount
= loopCount
/ 10; // Sentence runs slower than the other break types
3887 if (U_SUCCESS(status
)) {
3888 RunMonkey(bi
, m
, "sentence", seed
, loopCount
, useUText
);
// The failure text names the sentence iterator; it previously repeated the
// line-break wording, which pointed at the wrong iterator in the log.
3891 errcheckln(status
, "Creation of sentence break iterator failed %s", u_errorName(status
));
3900 // Run a RBBI monkey test. Common routine, for all break iterator types.
3902 // bi - the break iterator to use
3903 // mk - MonkeyKind, abstraction for obtaining expected results
3904 // name - Name of test (char, word, etc.) for use in error messages
3905 // seed - Seed for starting random number generator (parameter from user)
//    numIterations - number of random test strings to run; the main loop
//                    treats -1 as run without end
//    useUText - presumably selects between the UText based setText call and
//               the UnicodeString based one below; the branch statement is
//               not visible in this listing - confirm
//
// Per iteration: build a random string of TESTSTRINGLEN characters drawn from
// the character classes of mk, compute the expected breaks with mk.next(),
// then collect the breaks reported by next(), previous(), isBoundary(),
// following() and preceding() into per-position flag arrays and compare each
// of them with the expected flags. On a mismatch an error text with a
// <data>...</data> excerpt around the failing position is produced.
// NOTE(review): the declarations of loopCount, i, chClasses, count, ci, c and
// bn, and a number of control statements, are not visible in this listing -
// confirm against upstream.
3908 void RBBITest::RunMonkey(BreakIterator
*bi
, RBBIMonkeyKind
&mk
, const char *name
, uint32_t seed
,
3909 int32_t numIterations
, UBool useUText
) {
3911 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3913 const int32_t TESTSTRINGLEN
= 500;
3914 UnicodeString testText
;
3915 int32_t numCharClasses
;
// All flag arrays are sized for twice the character count plus one, which
// leaves room for strings made entirely of surrogate pairs plus the end
// position.
3917 int expected
[TESTSTRINGLEN
*2 + 1];
3918 int expectedCount
= 0;
3919 char expectedBreaks
[TESTSTRINGLEN
*2 + 1];
3920 char forwardBreaks
[TESTSTRINGLEN
*2 + 1];
3921 char reverseBreaks
[TESTSTRINGLEN
*2+1];
3922 char isBoundaryBreaks
[TESTSTRINGLEN
*2+1];
3923 char followingBreaks
[TESTSTRINGLEN
*2+1];
3924 char precedingBreaks
[TESTSTRINGLEN
*2+1];
3930 numCharClasses
= mk
.charClasses()->size();
3931 chClasses
= mk
.charClasses();
3933 // Check for errors that occurred during the construction of the MonkeyKind object.
3934 // Can't report them where they occurred because errln() is a method coming from intlTest,
3935 // and is not visible outside of RBBITest :-(
3936 if (U_FAILURE(mk
.deferredStatus
)) {
3937 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk
.deferredStatus
));
3941 // Verify that the character classes all have at least one member.
3942 for (i
=0; i
<numCharClasses
; i
++) {
3943 UnicodeSet
*s
= (UnicodeSet
*)chClasses
->elementAt(i
);
3944 if (s
== NULL
|| s
->size() == 0) {
3945 errln("Character Class #%d is null or of zero size.", i
);
// Main loop: one random test string per pass.
3950 while (loopCount
< numIterations
|| numIterations
== -1) {
3951 if (numIterations
== -1 && loopCount
% 10 == 0) {
3952 // If test is running in an infinite loop, display a periodic tic so
3953 // we can tell that it is making progress.
3954 fprintf(stderr
, ".");
3956 // Save current random number seed, so that we can recreate the random numbers
3957 // for this loop iteration in event of an error.
3960 // Populate a test string with data.
3961 testText
.truncate(0);
3962 for (i
=0; i
<TESTSTRINGLEN
; i
++) {
// Pick a random class, then a random member of that class.
3963 int32_t aClassNum
= m_rand() % numCharClasses
;
3964 UnicodeSet
*classSet
= (UnicodeSet
*)chClasses
->elementAt(aClassNum
);
3965 int32_t charIdx
= m_rand() % classSet
->size();
3966 UChar32 c
= classSet
->charAt(charIdx
);
3967 if (c
< 0) { // TODO: deal with sets containing strings.
3974 // Calculate the expected results for this test string.
3975 mk
.setText(testText
);
3976 memset(expectedBreaks
, 0, sizeof(expectedBreaks
));
// Position 0 is always an expected break.
3977 expectedBreaks
[0] = 1;
3978 int32_t breakPos
= 0;
3981 breakPos
= mk
.next(breakPos
);
3982 if (breakPos
== -1) {
3985 if (breakPos
> testText
.length()) {
3986 errln("breakPos > testText.length()");
3988 expectedBreaks
[breakPos
] = 1;
3989 U_ASSERT(expectedCount
<testText
.length());
3990 expected
[expectedCount
++] = breakPos
;
3993 // Find the break positions using forward iteration
3994 memset(forwardBreaks
, 0, sizeof(forwardBreaks
));
3996 UErrorCode status
= U_ZERO_ERROR
;
3997 UText
*testUText
= utext_openReplaceable(NULL
, &testText
, &status
);
3998 // testUText = utext_openUnicodeString(testUText, &testText, &status);
3999 bi
->setText(testUText
, status
);
4000 TEST_ASSERT_SUCCESS(status
);
4001 utext_close(testUText
); // The break iterator does a shallow clone of the UText
4002 // This UText can be closed immediately, so long as the
4003 // testText string continues to exist.
4005 bi
->setText(testText
);
4008 for (i
=bi
->first(); i
!= BreakIterator::DONE
; i
=bi
->next()) {
4009 if (i
< 0 || i
> testText
.length()) {
4010 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name
);
4013 forwardBreaks
[i
] = 1;
4016 // Find the break positions using reverse iteration
4017 memset(reverseBreaks
, 0, sizeof(reverseBreaks
));
4018 for (i
=bi
->last(); i
!= BreakIterator::DONE
; i
=bi
->previous()) {
4019 if (i
< 0 || i
> testText
.length()) {
// The value checked here came from last() or previous(), so the diagnostic
// names previous(); it previously repeated the next() wording of the
// forward loop.
4020 errln("%s break monkey test: Out of range value returned by breakIterator::previous()", name
);
4023 reverseBreaks
[i
] = 1;
4026 // Find the break positions using isBoundary() tests.
4027 memset(isBoundaryBreaks
, 0, sizeof(isBoundaryBreaks
));
4028 U_ASSERT((int32_t)sizeof(isBoundaryBreaks
) > testText
.length());
4029 for (i
=0; i
<=testText
.length(); i
++) {
4030 isBoundaryBreaks
[i
] = bi
->isBoundary(i
);
4034 // Find the break positions using the following() function.
4036 memset(followingBreaks
, 0, sizeof(followingBreaks
));
4037 int32_t lastBreakPos
= 0;
4038 followingBreaks
[0] = 1;
4039 for (i
=0; i
<testText
.length(); i
++) {
4040 breakPos
= bi
->following(i
);
// Sanity checks on the result: it must lie after i, must not move backwards,
// must stay inside the text, and must not skip past a break that is still
// ahead of i.
4041 if (breakPos
<= i
||
4042 breakPos
< lastBreakPos
||
4043 breakPos
> testText
.length() ||
4044 (breakPos
> lastBreakPos
&& lastBreakPos
> i
)) {
4045 UChar32 brkChar
= testText
.char32At(lastBreakPos
);
4046 if ((strcmp(name
, "char") != 0 && strcmp(name
, "word") != 0) || brkChar
< 0x1F1E6 || brkChar
> 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4047 errln("%s break monkey test: "
4048 "Out of range value returned by BreakIterator::following().\n"
4049 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4050 name
, seed
, i
, breakPos
, lastBreakPos
);
4054 followingBreaks
[breakPos
] = 1;
4055 lastBreakPos
= breakPos
;
4058 // Find the break positions using the preceding() function.
4059 memset(precedingBreaks
, 0, sizeof(precedingBreaks
));
4060 lastBreakPos
= testText
.length();
4061 precedingBreaks
[testText
.length()] = 1;
4062 for (i
=testText
.length(); i
>0; i
--) {
4063 breakPos
= bi
->preceding(i
);
// Mirror image of the following() checks, expressed relative to the start of
// the code point that contains i.
4064 if (breakPos
>= i
||
4065 breakPos
> lastBreakPos
||
4066 (breakPos
< 0 && testText
.getChar32Start(i
)>0) ||
4067 (breakPos
< lastBreakPos
&& lastBreakPos
< testText
.getChar32Start(i
)) ) {
4068 UChar32 brkChar
= testText
.char32At(breakPos
);
4069 if ((strcmp(name
, "char") != 0 && strcmp(name
, "word") != 0) || brkChar
< 0x1F1E6 || brkChar
> 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4070 errln("%s break monkey test: "
4071 "Out of range value returned by BreakIterator::preceding().\n"
4072 "index=%d; prev returned %d; lastBreak=%d" ,
4073 name
, i
, breakPos
, lastBreakPos
);
4074 if (breakPos
>= 0 && breakPos
< (int32_t)sizeof(precedingBreaks
)) {
4075 precedingBreaks
[i
] = 2; // Forces an error.
4079 if (breakPos
>= 0) {
4080 precedingBreaks
[breakPos
] = 1;
4082 lastBreakPos
= breakPos
;
4086 // Compare the expected and actual results.
4087 for (i
=0; i
<=testText
.length(); i
++) {
// The first mismatching method, in this fixed order, names the error.
4088 const char *errorType
= NULL
;
4089 if (forwardBreaks
[i
] != expectedBreaks
[i
]) {
4090 errorType
= "next()";
4091 } else if (reverseBreaks
[i
] != forwardBreaks
[i
]) {
4092 errorType
= "previous()";
4093 } else if (isBoundaryBreaks
[i
] != expectedBreaks
[i
]) {
4094 errorType
= "isBoundary()";
4095 } else if (followingBreaks
[i
] != expectedBreaks
[i
]) {
4096 errorType
= "following()";
4097 } else if (precedingBreaks
[i
] != expectedBreaks
[i
]) {
4098 errorType
= "preceding()";
4102 if (errorType
!= NULL
) {
4103 // Format a range of the test text that includes the failure as
4104 // a data item that can be included in the rbbi test data file.
4106 // Start of the range is the last point where expected and actual results
4107 // both agreed that there was a break position.
4108 int startContext
= i
;
4111 if (startContext
==0) { break; }
4113 if (expectedBreaks
[startContext
] != 0) {
4114 if (count
== 2) break;
4119 // End of range is two expected breaks past the start position.
4120 int endContext
= i
+ 1;
4122 for (ci
=0; ci
<2; ci
++) { // Number of items to include in error text.
4124 if (endContext
>= testText
.length()) {break;}
4125 if (expectedBreaks
[endContext
-1] != 0) {
4126 if (count
== 0) break;
4133 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4134 UnicodeString errorText
= "<data>";
4135 /***if (strcmp(errorType, "next()") == 0) {
4137 endContext = testText.length();
4139 printStringBreaks(testText, expected, expectedCount);
4142 for (ci
=startContext
; ci
<endContext
;) {
4143 UnicodeString
hexChars("0123456789abcdef");
4146 c
= testText
.char32At(ci
);
4148 // This is the location of the error.
4149 errorText
.append("<?>");
4150 } else if (expectedBreaks
[ci
] != 0) {
4151 // This is a non-error expected break position.
4152 errorText
.append("\\");
// Four hex digits for the short escape form.
4155 errorText
.append("\\u");
4156 for (bn
=12; bn
>=0; bn
-=4) {
4157 errorText
.append(hexChars
.charAt((c
>>bn
)&0xf));
// Eight hex digits for the long escape form.
4160 errorText
.append("\\U");
4161 for (bn
=28; bn
>=0; bn
-=4) {
4162 errorText
.append(hexChars
.charAt((c
>>bn
)&0xf));
// Advance by one code point, not one code unit.
4165 ci
= testText
.moveIndex32(ci
, 1);
4167 errorText
.append("\\");
4168 errorText
.append("</data>\n");
4171 char charErrorTxt
[500];
4172 UErrorCode status
= U_ZERO_ERROR
;
4173 errorText
.extract(charErrorTxt
, sizeof(charErrorTxt
), NULL
, status
);
// Force termination in case extract filled the whole buffer.
4174 charErrorTxt
[sizeof(charErrorTxt
)-1] = 0;
4175 const char *badLocale
= bi
->getLocaleID(ULOC_ACTUAL_LOCALE
, status
);
4177 UChar32 brkChar
= testText
.char32At(i
);
4178 if ((strcmp(name
, "char") != 0 && strcmp(name
, "word") != 0) || brkChar
< 0x1F1E6 || brkChar
> 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4179 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
4180 name
, badLocale
, (expectedBreaks
[i
]? "break expected but not found" : "break found but not expected"),
4181 errorType
, seed
, i
, charErrorTxt
);
4193 // Bug 5532. UTF-8 based UText fails in dictionary code.
4194 // This test checks the initial patch,
4195 // which is to just keep it from crashing. Correct word boundaries
4196 // await a proper fix to the dictionary code.
//
// The test opens a UText over the UTF-8 bytes below, attaches it to a Thai
// word break iterator and walks forward with next(). It only asserts that
// every boundary lies after the previous one and that at least one boundary
// was found; it does not check where the boundaries are.
4198 void RBBITest::TestBug5532(void) {
4199 // Text includes a mixture of Thai and Latin.
// The array ends with a zero byte, which is what lets utext_openUTF8 be
// called with a length of -1 further down.
4200 const unsigned char utf8Data
[] = {
4201 0xE0u
, 0xB8u
, 0x82u
, 0xE0u
, 0xB8u
, 0xB2u
, 0xE0u
, 0xB8u
, 0xA2u
, 0xE0u
,
4202 0xB9u
, 0x80u
, 0xE0u
, 0xB8u
, 0x84u
, 0xE0u
, 0xB8u
, 0xA3u
, 0xE0u
, 0xB8u
,
4203 0xB7u
, 0xE0u
, 0xB9u
, 0x88u
, 0xE0u
, 0xB8u
, 0xADu
, 0xE0u
, 0xB8u
, 0x87u
,
4204 0xE0u
, 0xB9u
, 0x80u
, 0xE0u
, 0xB8u
, 0xA5u
, 0xE0u
, 0xB9u
, 0x88u
, 0xE0u
,
4205 0xB8u
, 0x99u
, 0xE0u
, 0xB8u
, 0x8Bu
, 0xE0u
, 0xB8u
, 0xB5u
, 0xE0u
, 0xB8u
,
4206 0x94u
, 0xE0u
, 0xB8u
, 0xB5u
, 0x20u
, 0x73u
, 0x69u
, 0x6Du
, 0x20u
, 0x61u
,
4207 0x75u
, 0x64u
, 0x69u
, 0x6Fu
, 0x2Fu
, 0x20u
, 0x4Du
, 0x4Fu
, 0x4Fu
, 0x4Eu
,
4208 0x20u
, 0x65u
, 0x63u
, 0x6Cu
, 0x69u
, 0x70u
, 0x73u
, 0x65u
, 0x20u
, 0xE0u
,
4209 0xB8u
, 0xA3u
, 0xE0u
, 0xB8u
, 0xB2u
, 0xE0u
, 0xB8u
, 0x84u
, 0xE0u
, 0xB8u
,
4210 0xB2u
, 0x20u
, 0x34u
, 0x37u
, 0x30u
, 0x30u
, 0x20u
, 0xE0u
, 0xB8u
, 0xA2u
,
4211 0xE0u
, 0xB8u
, 0xB9u
, 0xE0u
, 0xB9u
, 0x82u
, 0xE0u
, 0xB8u
, 0xA3u
, 0x00};
4213 UErrorCode status
= U_ZERO_ERROR
;
// Stack based UText, initialized in place and released by the utext_close
// call at the end.
4214 UText utext
=UTEXT_INITIALIZER
;
4215 utext_openUTF8(&utext
, (const char *)utf8Data
, -1, &status
);
4216 TEST_ASSERT_SUCCESS(status
);
4218 BreakIterator
*bi
= BreakIterator::createWordInstance(Locale("th"), status
);
4219 TEST_ASSERT_SUCCESS(status
);
4220 if (U_SUCCESS(status
)) {
4221 bi
->setText(&utext
, status
);
4222 TEST_ASSERT_SUCCESS(status
);
4224 int32_t breakCount
= 0;
4225 int32_t previousBreak
= -1;
// first() positions the iterator; each next() that is not DONE counts as one
// break.
4226 for (bi
->first(); bi
->next() != BreakIterator::DONE
; breakCount
++) {
4227 // For now, just make sure that the break iterator doesn't hang.
4228 TEST_ASSERT(previousBreak
< bi
->current());
4229 previousBreak
= bi
->current();
4231 TEST_ASSERT(breakCount
> 0);
4234 utext_close(&utext
);
// TestBug9983 - iterates backwards with previous() over a short unescaped
// string using the root locale word break iterator, reads the rule status at
// each step, and asserts that the walk takes exactly 6 iterations. The guard
// on iterationCount >= 10 bounds the loop if previous() never reports DONE.
// NOTE(review): only some of the literal pieces that make up text, the
// statement that positions the iterator before the loop, and the increment of
// iterationCount are visible in this listing - confirm against upstream.
4238 void RBBITest::TestBug9983(void) {
4239 UnicodeString text
= UnicodeString("\\u002A" // * Other
4241 "\\u309C" // Katakana
4245 "\\u0000").unescape();
4247 UErrorCode status
= U_ZERO_ERROR
;
// The created BreakIterator is cast to RuleBasedBreakIterator and owned by a
// LocalPointer.
4248 LocalPointer
<RuleBasedBreakIterator
> brkiter(static_cast<RuleBasedBreakIterator
*>(
4249 BreakIterator::createWordInstance(Locale::getRoot(), status
)));
4250 TEST_ASSERT_SUCCESS(status
);
4251 if (U_FAILURE(status
)) {
4254 brkiter
->setText(text
);
4255 int32_t offset
, rstatus
;
4257 int32_t iterationCount
= 0;
4258 while ( (offset
= brkiter
->previous()) != UBRK_DONE
) {
4260 rstatus
= brkiter
->getRuleStatus();
4261 // printf(" %d(%d)", offset, rstatus);
// Safety valve against an endless loop.
4262 if (iterationCount
>= 10) {
4266 TEST_ASSERT(iterationCount
== 6);
4271 // TestDebug - A place-holder test for debugging purposes.
4272 // For putting in fragments of other tests that can be invoked
4273 // for tracing without a lot of unwanted extra stuff happening.
//
// As written, the visible code creates a sentence break iterator for the
// default locale, prints the result of isBoundary(8) and then prints
// position and rule status pairs while stepping backwards with previous()
// until DONE.
// NOTE(review): the declarations of pos and ruleStatus, the call that attaches
// the string s to the iterator, and the opening of the do-while loop are not
// visible in this listing - confirm against upstream.
4275 void RBBITest::TestDebug(void) {
4277 UErrorCode status
= U_ZERO_ERROR
;
// Alternative iterator kinds are kept as commented-out lines so they can be
// swapped in by hand.
4281 RuleBasedBreakIterator
* bi
=
4282 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4283 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4284 (RuleBasedBreakIterator
*)BreakIterator::createSentenceInstance(Locale::getDefault(), status
);
4285 UnicodeString
s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4286 // UnicodeString s("Aaa. Bcd");
4289 UBool r
= bi
->isBoundary(8);
4290 printf("%s", r
?"true":"false");
4294 // ruleStatus = bi->getRuleStatus();
4295 printf("%d\t%d\n", pos
, ruleStatus
);
4296 pos
= bi
->previous();
4297 } while (pos
!= BreakIterator::DONE
);
// TestProperties - builds the UnicodeSet for the pattern [:GCB=Prepend:] and
// takes the reporting branch when that set is not empty. The message asks the
// maintainer to re-enable the matching rules in char.txt and to invert this
// test.
// NOTE(review): the call that emits the message (presumably errln) and the
// closing braces are not visible in this listing - confirm against upstream.
4301 void RBBITest::TestProperties() {
4302 UErrorCode errorCode
= U_ZERO_ERROR
;
4303 UnicodeSet
prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode
);
4304 if (!prependSet
.isEmpty()) {
4306 "[:GCB=Prepend:] is not empty any more. "
4307 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4308 "change this test to the opposite condition.");
4312 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */