1 /********************************************************************
3 * Copyright (c) 1999-2013, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6 /************************************************************************
7 * Date Name Description
8 * 12/15/99 Madhu Creation.
9 * 01/12/2000 Madhu Updated for changed API and added new tests
10 ************************************************************************/
12 #include "utypeinfo.h" // for 'typeid' to work
14 #include "unicode/utypes.h"
16 #if !UCONFIG_NO_BREAK_ITERATION
18 #include "unicode/utypes.h"
19 #include "unicode/brkiter.h"
20 #include "unicode/rbbi.h"
21 #include "unicode/uchar.h"
22 #include "unicode/utf16.h"
23 #include "unicode/ucnv.h"
24 #include "unicode/schriter.h"
25 #include "unicode/uniset.h"
26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
27 #include "unicode/regex.h"
29 #include "unicode/ustring.h"
30 #include "unicode/utext.h"
39 #include "unicode/numfmt.h"
40 #include "unicode/uscript.h"
42 #define TEST_ASSERT(x) {if (!(x)) { \
43 errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
45 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
46 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
49 //---------------------------------------------
51 //---------------------------------------------
54 // Note: Before adding new tests to this file, check whether the desired test data can
55 // simply be added to the file testdata/rbbitest.txt. In most cases it can,
56 // it's much less work than writing a new test, diagnostic output in the event of failures
57 // is good, and the test data file is shared with ICU4J, so eventually the test
58 // will run there as well, without additional effort.
60 void RBBITest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* params
)
62 if (exec
) logln("TestSuite RuleBasedBreakIterator: ");
65 #if !UCONFIG_NO_FILE_IO
66 case 0: name
= "TestBug4153072";
67 if(exec
) TestBug4153072(); break;
69 case 0: name
= "skip";
73 case 1: name
= "skip";
75 case 2: name
= "TestStatusReturn";
76 if(exec
) TestStatusReturn(); break;
78 #if !UCONFIG_NO_FILE_IO
79 case 3: name
= "TestUnicodeFiles";
80 if(exec
) TestUnicodeFiles(); break;
81 case 4: name
= "TestEmptyString";
82 if(exec
) TestEmptyString(); break;
84 case 3: case 4: name
= "skip";
88 case 5: name
= "TestGetAvailableLocales";
89 if(exec
) TestGetAvailableLocales(); break;
91 case 6: name
= "TestGetDisplayName";
92 if(exec
) TestGetDisplayName(); break;
94 #if !UCONFIG_NO_FILE_IO
95 case 7: name
= "TestEndBehaviour";
96 if(exec
) TestEndBehaviour(); break;
97 case 8: case 9: case 10: name
= "skip";
99 case 11: name
= "TestWordBreaks";
100 if(exec
) TestWordBreaks(); break;
101 case 12: name
= "TestWordBoundary";
102 if(exec
) TestWordBoundary(); break;
103 case 13: name
= "TestLineBreaks";
104 if(exec
) TestLineBreaks(); break;
105 case 14: name
= "TestSentBreaks";
106 if(exec
) TestSentBreaks(); break;
107 case 15: name
= "TestExtended";
108 if(exec
) TestExtended(); break;
110 case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name
= "skip";
114 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
116 name
= "TestMonkey"; if(exec
) TestMonkey(params
); break;
119 name
= "skip"; break;
122 #if !UCONFIG_NO_FILE_IO
123 case 17: name
= "TestBug3818";
124 if(exec
) TestBug3818(); break;
126 case 17: name
= "skip";
130 case 18: name
= "skip";
132 case 19: name
= "TestDebug";
133 if(exec
) TestDebug(); break;
134 case 20: name
= "skip";
137 #if !UCONFIG_NO_FILE_IO
138 case 21: name
= "TestBug5775";
139 if (exec
) TestBug5775(); break;
141 case 21: name
= "skip";
145 case 22: name
= "TestBug9983";
146 if (exec
) TestBug9983(); break;
147 case 23: name
= "TestDictRules";
148 if (exec
) TestDictRules(); break;
149 case 24: name
= "TestBug5532";
150 if (exec
) TestBug5532(); break;
151 default: name
= ""; break; //needed to end loop
156 //---------------------------------------------------------------------------
158 // class BITestData Holds a set of Break iterator test data and results
160 // - the string data to be broken
161 // - a vector of the expected break positions.
162 // - a vector of source line numbers for the data,
163 // (to help see where errors occurred.)
164 // - The expected break tag values.
165 // - Vectors of actual break positions and tag values.
166 // - Functions for comparing actual with expected and
169 //----------------------------------------------------------------------------
172 UnicodeString fDataToBreak
;
173 UVector fExpectedBreakPositions
;
174 UVector fExpectedTags
;
176 UVector fActualBreakPositions
; // Test Results.
179 BITestData(UErrorCode
&status
);
180 void addDataChunk(const char *data
, int32_t tag
, int32_t lineNum
, UErrorCode status
);
181 void checkResults(const char *heading
, RBBITest
*test
);
182 void err(const char *heading
, RBBITest
*test
, int32_t expectedIdx
, int32_t actualIdx
);
189 BITestData::BITestData(UErrorCode
&status
)
190 : fExpectedBreakPositions(status
), fExpectedTags(status
), fLineNum(status
), fActualBreakPositions(status
),
196 // addDataChunk. Add a section (non-breaking) piece of data to the test data.
197 // The macro form collects the line number, which is helpful
198 // when tracking down failures.
200 // A null data item is inserted at the start of each test's data
201 // to put the starting zero into the data list. The position saved for
202 // each non-null item is its ending position.
204 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status);
// Appends one chunk of test text and records its expected results.
//   data    - chunk text; converted with CharsToUnicodeString() and appended to fDataToBreak.
//   tag     - expected tag value for the break at the end of this chunk.
//   lineNum - source line the chunk came from (the ADD_DATACHUNK macro passes __LINE__).
//   status  - incoming error state; the function returns immediately if it already
//             indicates failure.
// The expected break position stored for the chunk is the length of fDataToBreak
// after the append, i.e. the chunk's ending position.
// NOTE(review): status is received by value, so a failure set by the addElement()
// calls below is not visible to the caller.
205 void BITestData::addDataChunk(const char *data
, int32_t tag
, int32_t lineNum
, UErrorCode status
) {
206 if (U_FAILURE(status
)) {return;}
208 fDataToBreak
.append(CharsToUnicodeString(data
));
210 fExpectedBreakPositions
.addElement(fDataToBreak
.length(), status
);
211 fExpectedTags
.addElement(tag
, status
);
212 fLineNum
.addElement(lineNum
, status
);
217 // checkResults. Compare the actual and expected break positions, report any differences.
// Walks the expected and actual result vectors side by side using two indices.
//   heading - text prefixed to every failure message.
//   test    - test object used for reporting (err() and test->errln()).
// A position mismatch, or one vector running out before the other, is reported
// through err(). When positions agree, the tags are compared and a mismatch is
// reported with test->errln().
// NOTE(review): the tag-mismatch message fetches the line number with
// fLineNum.elementAt() and prints it with %d, while every other access in this
// block uses elementAti() - confirm the intended accessor.
219 void BITestData::checkResults(const char *heading
, RBBITest
*test
) {
220 int32_t expectedIndex
= 0;
221 int32_t actualIndex
= 0;
224 // If we've run through both the expected and actual results vectors, we're done.
225 // break out of the loop.
226 if (expectedIndex
>= fExpectedBreakPositions
.size() &&
227 actualIndex
>= fActualBreakPositions
.size()) {
232 if (expectedIndex
>= fExpectedBreakPositions
.size()) {
233 err(heading
, test
, expectedIndex
-1, actualIndex
);
238 if (actualIndex
>= fActualBreakPositions
.size()) {
239 err(heading
, test
, expectedIndex
, actualIndex
-1);
244 if (fActualBreakPositions
.elementAti(actualIndex
) != fExpectedBreakPositions
.elementAti(expectedIndex
)) {
245 err(heading
, test
, expectedIndex
, actualIndex
);
246 // Try to resync the positions of the indices, to avoid a rash of spurious errors.
247 if (fActualBreakPositions
.elementAti(actualIndex
) < fExpectedBreakPositions
.elementAti(expectedIndex
)) {
255 if (fActualTags
.elementAti(actualIndex
) != fExpectedTags
.elementAti(expectedIndex
)) {
256 test
->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",
257 heading
, fLineNum
.elementAt(expectedIndex
),
258 fExpectedTags
.elementAti(expectedIndex
), fActualTags
.elementAti(actualIndex
));
267 // err - An error was found. Report it, along with information about where the
268 // incorrectly broken test data appeared in the source file.
// Reports a single break-position mismatch through test->errln().
//   heading     - text prefixed to the failure message.
//   test        - test object whose errln() receives the message.
//   expectedIdx - index into fExpectedBreakPositions / fLineNum.
//   actualIdx   - index into fActualBreakPositions.
// If the actual position is smaller than the expected one the message is
// "unexpected break at offset ..."; the other visible message is
// "Failed to find break at end of item ...".
270 void BITestData::err(const char *heading
, RBBITest
*test
, int32_t expectedIdx
, int32_t actualIdx
)
272 int32_t expected
= fExpectedBreakPositions
.elementAti(expectedIdx
);
273 int32_t actual
= fActualBreakPositions
.elementAti(actualIdx
);
275 int32_t line
= fLineNum
.elementAti(expectedIdx
);
276 if (expectedIdx
> 0) {
277 // The line numbers are off by one because a premature break occurs somewhere
278 // within the previous item, rather than at the start of the current (expected) item.
279 // We want to report the offset of the unexpected break from the start of
280 // this previous item.
281 o
= actual
- fExpectedBreakPositions
.elementAti(expectedIdx
-1);
283 if (actual
< expected
) {
284 test
->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading
, o
, line
, actual
, expected
);
286 test
->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading
, line
, actual
, expected
);
// Empties the vectors that hold actual results (break positions and tags).
291 void BITestData::clearResults() {
292 fActualBreakPositions
.removeAllElements();
293 fActualTags
.removeAllElements();
297 //--------------------------------------------------------------------------------------
299 // RBBITest constructor and destructor
301 //--------------------------------------------------------------------------------------
303 RBBITest::RBBITest() {
307 RBBITest::~RBBITest() {
310 //-----------------------------------------------------------------------------------
312 // Test for status {tag} return value from break rules.
313 // TODO: a more thorough test.
315 //-----------------------------------------------------------------------------------
// Builds a RuleBasedBreakIterator from an inline rule string, iterates over
// testString1 with first()/next(), and compares each break position against
// bounds1[] and each getRuleStatus() value against brkStatus[].
// Both tables end with -1. A construction failure is reported with dataerrln().
316 void RBBITest::TestStatusReturn() {
317 UnicodeString
rulesString1("$Letters = [:L:];\n"
318 "$Numbers = [:N:];\n"
321 "Help\\ {4}/me\\!;\n"
322 "[^$Letters $Numbers];\n"
323 "!.*;\n", -1, US_INV
);
324 UnicodeString testString1
= "abc123..abc Help me Help me!";
325 // 01234567890123456789012345678
326 int32_t bounds1
[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
327 int32_t brkStatus
[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
329 UErrorCode status
=U_ZERO_ERROR
;
330 UParseError parseError
;
332 BreakIterator
*bi
= new RuleBasedBreakIterator(rulesString1
, parseError
, status
);
333 if(U_FAILURE(status
)) {
334 dataerrln("FAIL : in construction - %s", u_errorName(status
));
338 bi
->setText(testString1
);
339 for (pos
=bi
->first(); pos
!= BreakIterator::DONE
; pos
=bi
->next()) {
340 if (pos
!= bounds1
[i
]) {
341 errln("FAIL: expected break at %d, got %d\n", bounds1
[i
], pos
);
345 int tag
= bi
->getRuleStatus();
346 if (tag
!= brkStatus
[i
]) {
347 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos
, brkStatus
[i
], tag
);
357 static void printStringBreaks(UnicodeString ustr
, int expected
[],
360 UErrorCode status
= U_ZERO_ERROR
;
362 printf("code alpha extend alphanum type word sent line name\n");
364 for (j
= 0; j
< ustr
.length(); j
++) {
365 if (expectedcount
> 0) {
367 for (k
= 0; k
< expectedcount
; k
++) {
368 if (j
== expected
[k
]) {
369 printf("------------------------------------------------ %d\n",
374 UChar32 c
= ustr
.char32At(j
);
378 u_charName(c
, U_UNICODE_CHAR_NAME
, name
, 100, &status
);
379 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c
,
381 u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
),
383 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY
,
385 U_SHORT_PROPERTY_NAME
),
386 u_getPropertyValueName(UCHAR_WORD_BREAK
,
387 u_getIntPropertyValue(c
,
389 U_SHORT_PROPERTY_NAME
),
390 u_getPropertyValueName(UCHAR_SENTENCE_BREAK
,
391 u_getIntPropertyValue(c
,
392 UCHAR_SENTENCE_BREAK
),
393 U_SHORT_PROPERTY_NAME
),
394 u_getPropertyValueName(UCHAR_LINE_BREAK
,
395 u_getIntPropertyValue(c
,
397 U_SHORT_PROPERTY_NAME
),
// Regression test: with a Thai word break iterator over four repetitions of the
// same four-code-unit Thai word, following(1) and following(0) must both
// return 4. Failure to create the iterator is reported with errcheckln().
403 void RBBITest::TestBug3818() {
404 UErrorCode status
= U_ZERO_ERROR
;
406 // Four Thai words...
407 static const UChar thaiWordData
[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
408 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
409 UnicodeString
thaiStr(thaiWordData
);
411 BreakIterator
* bi
= BreakIterator::createWordInstance(Locale("th"), status
);
412 if (U_FAILURE(status
) || bi
== NULL
) {
413 errcheckln(status
, "Fail at file %s, line %d, status = %s", __FILE__
, __LINE__
, u_errorName(status
));
416 bi
->setText(thaiStr
);
418 int32_t startOfSecondWord
= bi
->following(1);
419 if (startOfSecondWord
!= 4) {
420 errln("Fail at file %s, line %d expected start of word at 4, got %d",
421 __FILE__
, __LINE__
, startOfSecondWord
);
423 startOfSecondWord
= bi
->following(0);
424 if (startOfSecondWord
!= 4) {
425 errln("Fail at file %s, line %d expected start of word at 4, got %d",
426 __FILE__
, __LINE__
, startOfSecondWord
);
431 //----------------------------------------------------------------------------
433 // generalIteratorTest Given a break iterator and a set of test data,
434 // Run the tests and report the results.
436 //----------------------------------------------------------------------------
// Driver: sets the iterator's text to td.fDataToBreak, then runs each
// individual traversal test (first/next, last/previous, following, preceding,
// isBoundary, multiple selection) against the same iterator and test data.
//   bi - iterator under test.
//   td - text plus expected break positions and tags.
437 void RBBITest::generalIteratorTest(RuleBasedBreakIterator
& bi
, BITestData
&td
)
440 bi
.setText(td
.fDataToBreak
);
442 testFirstAndNext(bi
, td
);
444 testLastAndPrevious(bi
, td
);
446 testFollowing(bi
, td
);
447 testPreceding(bi
, td
);
448 testIsBoundary(bi
, td
);
449 doMultipleSelectionTest(bi
, td
);
454 // testFirstAndNext. Run the iterator forwards in the obvious first(), next()
// Iterates forward with first()/next() until DONE, appending each break
// position and its getRuleStatus() tag to td's actual-result vectors, then
// has td.checkResults() compare them against the expected data.
//   bi - iterator under test; its text is reset to td.fDataToBreak here.
//   td - test data that receives the actual results.
457 void RBBITest::testFirstAndNext(RuleBasedBreakIterator
& bi
, BITestData
&td
)
459 UErrorCode status
= U_ZERO_ERROR
;
464 logln("Test first and next");
465 bi
.setText(td
.fDataToBreak
);
468 for (p
=bi
.first(); p
!=RuleBasedBreakIterator::DONE
; p
=bi
.next()) {
469 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
470 tag
= bi
.getRuleStatus();
471 td
.fActualTags
.addElement(tag
, status
);
473 // If the iterator is not making forward progress, stop.
474 // No need to raise an error here, it'll be detected in the normal check of results.
479 td
.checkResults("testFirstAndNext", this);
484 // TestLastAndPrevious. Run the iterator backwards, starting with last().
// Iterates backward with last()/previous() until DONE. Each break position and
// its getRuleStatus() tag is inserted at index 0 of td's actual-result
// vectors, so the stored results end up in ascending order, and
// td.checkResults() then compares them against the expected data.
//   bi - iterator under test; its text is reset to td.fDataToBreak here.
//   td - test data that receives the actual results.
486 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator
& bi
, BITestData
&td
)
488 UErrorCode status
= U_ZERO_ERROR
;
490 int32_t lastP
= 0x7ffffffe;
493 logln("Test last and previous");
494 bi
.setText(td
.fDataToBreak
);
497 for (p
=bi
.last(); p
!=RuleBasedBreakIterator::DONE
; p
=bi
.previous()) {
498 // Save break position. Insert it at start of vector of results, shoving
499 // already-saved results further towards the end.
500 td
.fActualBreakPositions
.insertElementAt(p
, 0, status
);
501 // bi.previous(); // TODO: Why does this fix things up????
503 tag
= bi
.getRuleStatus();
504 td
.fActualTags
.insertElementAt(tag
, 0, status
);
506 // If the iterator is not making progress, stop.
507 // No need to raise an error here, it'll be detected in the normal check of results.
512 td
.checkResults("testLastAndPrevious", this);
// Collects break positions by probing successive text offsets i in
// [0, length+1], presumably via bi.following(i) (the call itself is not
// visible in this excerpt). The starting position is recorded first; every
// newly reached position and its getRuleStatus() tag is appended to td's
// actual-result vectors. After the loop, an error is raised if i is not equal
// to the text length, and td.checkResults() performs the full comparison.
//   bi - iterator under test; its text is reset to td.fDataToBreak here.
//   td - test data that receives the actual results.
516 void RBBITest::testFollowing(RuleBasedBreakIterator
& bi
, BITestData
&td
)
518 UErrorCode status
= U_ZERO_ERROR
;
521 int32_t lastP
= -2; // A value that will never be returned as a break position.
522 // cannot be -1; that is returned for DONE.
525 logln("testFollowing():");
526 bi
.setText(td
.fDataToBreak
);
529 // Save the starting point, since we won't get that out of following.
531 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
532 tag
= bi
.getRuleStatus();
533 td
.fActualTags
.addElement(tag
, status
);
535 for (i
= 0; i
<= td
.fDataToBreak
.length()+1; i
++) {
538 if (p
== RuleBasedBreakIterator::DONE
) {
541 // We've reached a new break position. Save it.
542 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
543 tag
= bi
.getRuleStatus();
544 td
.fActualTags
.addElement(tag
, status
);
548 // The loop normally exits by means of the break in the middle.
549 // Make sure that the index was at the correct position for the break iterator to have
551 if (i
!= td
.fDataToBreak
.length()) {
552 errln("testFollowing(): iterator returned DONE prematurely.");
555 // Full check of all results.
556 td
.checkResults("testFollowing", this);
// Mirror image of testFollowing(): probes text offsets i from the text length
// down to -1, presumably via bi.preceding(i) (the call itself is not visible
// in this excerpt). A first position is appended before the loop; every newly
// reached position and its getRuleStatus() tag is inserted at index 0 of td's
// actual-result vectors. td.checkResults() performs the full comparison.
//   bi - iterator under test; its text is reset to td.fDataToBreak here.
//   td - test data that receives the actual results.
561 void RBBITest::testPreceding(RuleBasedBreakIterator
& bi
, BITestData
&td
) {
562 UErrorCode status
= U_ZERO_ERROR
;
565 int32_t lastP
= 0x7ffffffe;
568 logln("testPreceding():");
569 bi
.setText(td
.fDataToBreak
);
573 td
.fActualBreakPositions
.addElement(p
, status
);
574 tag
= bi
.getRuleStatus();
575 td
.fActualTags
.addElement(tag
, status
);
577 for (i
= td
.fDataToBreak
.length(); i
>=-1; i
--) {
580 if (p
== RuleBasedBreakIterator::DONE
) {
583 // We've reached a new break position. Save it.
584 td
.fActualBreakPositions
.insertElementAt(p
, 0, status
);
586 tag
= bi
.getRuleStatus();
587 td
.fActualTags
.insertElementAt(tag
, 0, status
);
590 // The loop normally exits by means of the break in the middle.
591 // Make sure that the index was at the correct position for the break iterator to have
594 errln("testPreceding(): iterator returned DONE prematurely.");
597 // Full check of all results.
598 td
.checkResults("testPreceding", this);
// Calls bi.isBoundary(i) for every offset i in [0, length] of the test text.
// Each offset reported as a boundary is appended, together with its
// getRuleStatus() tag, to td's actual-result vectors; td.checkResults() then
// compares them against the expected data.
//   bi - iterator under test; its text is reset to td.fDataToBreak here.
//   td - test data that receives the actual results.
603 void RBBITest::testIsBoundary(RuleBasedBreakIterator
& bi
, BITestData
&td
) {
604 UErrorCode status
= U_ZERO_ERROR
;
608 logln("testIsBoundary():");
609 bi
.setText(td
.fDataToBreak
);
612 for (i
= 0; i
<= td
.fDataToBreak
.length(); i
++) {
613 if (bi
.isBoundary(i
)) {
614 td
.fActualBreakPositions
.addElement(i
, status
); // Save result.
615 tag
= bi
.getRuleStatus();
616 td
.fActualTags
.addElement(tag
, status
);
619 td
.checkResults("testIsBoundary: ", this);
624 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator
& iterator
, BITestData
&td
)
626 iterator
.setText(td
.fDataToBreak
);
628 RuleBasedBreakIterator
* testIterator
=(RuleBasedBreakIterator
*)iterator
.clone();
629 int32_t offset
= iterator
.first();
633 logln("doMultipleSelectionTest text of length: %d", td
.fDataToBreak
.length());
635 if (*testIterator
!= iterator
)
636 errln("clone() or operator!= failed: two clones compared unequal");
639 testOffset
= testIterator
->first();
640 testOffset
= testIterator
->next(count
);
641 if (offset
!= testOffset
)
642 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count
+ ", next(n) returned " + testOffset
+ " and next() had " + offset
);
644 if (offset
!= RuleBasedBreakIterator::DONE
) {
646 offset
= iterator
.next();
648 if (offset
!= RuleBasedBreakIterator::DONE
&& *testIterator
== iterator
) {
649 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count
, offset
);
650 if (count
> 10000 || offset
== -1) {
651 errln("operator== failed too many times. Stopping test.");
653 errln("Does (RuleBasedBreakIterator::DONE == -1)?");
659 } while (offset
!= RuleBasedBreakIterator::DONE
);
661 // now do it backwards...
662 offset
= iterator
.last();
666 testOffset
= testIterator
->last();
667 testOffset
= testIterator
->next(count
); // next() with a negative arg is same as previous
668 if (offset
!= testOffset
)
669 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count
+ ", next(n) returned " + testOffset
+ " and next() had " + offset
);
671 if (offset
!= RuleBasedBreakIterator::DONE
) {
673 offset
= iterator
.previous();
675 } while (offset
!= RuleBasedBreakIterator::DONE
);
681 //---------------------------------------------
685 //---------------------------------------------
// Runs generalIteratorTest() on empty text: the test data holds a single empty
// chunk (one expected break at the start), and the iterator is the line break
// iterator for the default locale. A creation failure is reported with
// errcheckln().
686 void RBBITest::TestEmptyString()
688 UnicodeString text
= "";
689 UErrorCode status
= U_ZERO_ERROR
;
691 BITestData
x(status
);
692 ADD_DATACHUNK(x
, "", 0, status
); // Break at start of data
693 RuleBasedBreakIterator
* bi
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale::getDefault(), status
);
694 if (U_FAILURE(status
))
696 errcheckln(status
, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status
));
699 generalIteratorTest(*bi
, x
);
// Fetches the locale list from BreakIterator::getAvailableLocales() and logs
// the name of every entry, which also touches each element of the returned
// array. An empty list is reported with dataerrln().
703 void RBBITest::TestGetAvailableLocales()
705 int32_t locCount
= 0;
706 const Locale
* locList
= BreakIterator::getAvailableLocales(locCount
);
709 dataerrln("getAvailableLocales() returned an empty list!");
710 // Just make sure that it's returning good memory.
712 for (i
= 0; i
< locCount
; ++i
) {
713 logln(locList
[i
].getName());
717 //Testing the BreakIterator::getDisplayName() function
// Checks both BreakIterator::getDisplayName() overloads:
//  - US locale displayed in the default locale must be "English (United States)"
//    (only enforced when the default locale is US);
//  - France locale displayed in the US locale must be "French (France)".
// Mismatches are reported with dataerrln().
718 void RBBITest::TestGetDisplayName()
720 UnicodeString result
;
722 BreakIterator::getDisplayName(Locale::getUS(), result
);
723 if (Locale::getDefault() == Locale::getUS() && result
!= "English (United States)")
724 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
727 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result
);
728 if (result
!= "French (France)")
729 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
// Checks word-break behaviour at the end of the string "boo." using the word
// break iterator for the default locale: a break at offset 0, a break before
// the period, and a break at offset 4 (end of text).
// NOTE(review): the last check joins current() != 4 and next() != 4 with &&,
// so it only fails when both calls return something other than 4 - confirm
// that this leniency is intended.
736 void RBBITest::TestEndBehaviour()
738 UErrorCode status
= U_ZERO_ERROR
;
739 UnicodeString
testString("boo.");
740 BreakIterator
*wb
= BreakIterator::createWordInstance(Locale::getDefault(), status
);
741 if (U_FAILURE(status
))
743 errcheckln(status
, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status
));
746 wb
->setText(testString
);
748 if (wb
->first() != 0)
749 errln("Didn't get break at beginning of string.");
751 errln("Didn't get break before period in \"boo.\"");
752 if (wb
->current() != 4 && wb
->next() != 4)
753 errln("Didn't get break at end of string.");
// Regression test for isBoundary() on text supplied through a
// StringCharacterIterator restricted to a sub-range of "...Hello, World!...".
// The iterator takes ownership of the character iterator via adoptText().
// For every index from -1 through begin, isBoundary() is expected to be true
// only at index 0; any other outcome is reported with errln().
759 void RBBITest::TestBug4153072() {
760 UErrorCode status
= U_ZERO_ERROR
;
761 BreakIterator
*iter
= BreakIterator::createWordInstance(Locale::getDefault(), status
);
762 if (U_FAILURE(status
))
764 errcheckln(status
, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status
));
767 UnicodeString
str("...Hello, World!...");
769 int32_t end
= str
.length() - 3;
772 StringCharacterIterator
* textIterator
= new StringCharacterIterator(str
, begin
, end
, begin
);
773 iter
->adoptText(textIterator
);
775 // Note: with the switch to UText, there is no way to restrict the
776 // iteration range to begin at an index other than zero.
777 // String character iterators created with a non-zero bound are
778 // treated by RBBI as being empty.
779 for (index
= -1; index
< begin
+ 1; ++index
) {
780 onBoundary
= iter
->isBoundary(index
);
781 if (index
== 0? !onBoundary
: onBoundary
) {
782 errln((UnicodeString
)"Didn't handle isBoundary correctly with offset = " + index
+
783 " and begin index = " + begin
);
791 // Test for problem reported by Ashok Matoria on 9 July 2007
792 // One.<kSoftHyphen><kSpace>Two.
794 // Sentence break at start (0) and then on calling next() it breaks at
795 // 'T' of "Two". Now, at this point if I do next() and
796 // then previous(), it breaks at <kSoftHyphen> instead of 'T' of "Two".
// Regression test using the English sentence break iterator on text built
// from the invariant-character literal "One.\\u00ad Two.".
// The visible assertions expect next() to yield 6, a later position of 10,
// and previous() to return to 6. Iterator creation is checked with
// TEST_ASSERT_SUCCESS before the NULL check.
798 void RBBITest::TestBug5775() {
799 UErrorCode status
= U_ZERO_ERROR
;
800 BreakIterator
*bi
= BreakIterator::createSentenceInstance(Locale::getEnglish(), status
);
801 TEST_ASSERT_SUCCESS(status
);
802 if (U_FAILURE(status
)) {
805 // Check for status first for better handling of no data errors.
806 TEST_ASSERT(bi
!= NULL
);
811 UnicodeString
s("One.\\u00ad Two.", -1, US_INV
);
815 int pos
= bi
->next();
816 TEST_ASSERT(pos
== 6);
818 TEST_ASSERT(pos
== 10);
819 pos
= bi
->previous();
820 TEST_ASSERT(pos
== 6);
826 //------------------------------------------------------------------------------
828 // RBBITest::Extended Run RBBI Tests from an external test data file
830 //------------------------------------------------------------------------------
834 UnicodeString dataToBreak
;
835 UVector32
*expectedBreaks
;
840 void RBBITest::executeTest(TestParams
*t
) {
849 t
->bi
->setText(t
->dataToBreak
);
851 // Run the iterator forward
854 for (bp
= t
->bi
->first(); bp
!= BreakIterator::DONE
; bp
= t
->bi
->next()) {
856 // Fail for lack of forward progress.
857 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
858 bp
, t
->srcLine
->elementAti(bp
), t
->srcCol
->elementAti(bp
));
862 // Check that we didn't miss an expected break between the last one
864 for (i
=prevBP
+1; i
<bp
; i
++) {
865 if (t
->expectedBreaks
->elementAti(i
) != 0) {
866 int expected
[] = {0, i
};
867 printStringBreaks(t
->dataToBreak
, expected
, 2);
868 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
869 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
));
873 // Check that the break we did find was expected
874 if (t
->expectedBreaks
->elementAti(bp
) == 0) {
875 int expected
[] = {0, bp
};
876 printStringBreaks(t
->dataToBreak
, expected
, 2);
877 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
878 bp
, t
->srcLine
->elementAti(bp
), t
->srcCol
->elementAti(bp
));
880 // The break was expected.
881 // Check that the {nnn} tag value is correct.
882 int32_t expectedTagVal
= t
->expectedBreaks
->elementAti(bp
);
883 if (expectedTagVal
== -1) {
886 int32_t line
= t
->srcLine
->elementAti(bp
);
887 int32_t rs
= ((RuleBasedBreakIterator
*)t
->bi
)->getRuleStatus();
888 if (rs
!= expectedTagVal
) {
889 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
890 " Actual, Expected status = %4d, %4d",
891 bp
, line
, t
->srcCol
->elementAti(bp
), rs
, expectedTagVal
);
899 // Verify that there were no missed expected breaks after the last one found
900 for (i
=prevBP
+1; i
<t
->expectedBreaks
->size(); i
++) {
901 if (t
->expectedBreaks
->elementAti(i
) != 0) {
902 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
903 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
));
908 // Run the iterator backwards, verify that the same breaks are found.
910 prevBP
= t
->dataToBreak
.length()+2; // start with a phony value for the last break pos seen.
911 for (bp
= t
->bi
->last(); bp
!= BreakIterator::DONE
; bp
= t
->bi
->previous()) {
913 // Fail for lack of progress.
914 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
915 bp
, t
->srcLine
->elementAti(bp
), t
->srcCol
->elementAti(bp
));
919 // Check that we didn't miss an expected break between the last one
920 // and this one. (UVector returns zeros for index out of bounds.)
921 for (i
=prevBP
-1; i
>bp
; i
--) {
922 if (t
->expectedBreaks
->elementAti(i
) != 0) {
923 errln("Reverse Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
924 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
));
928 // Check that the break we did find was expected
929 if (t
->expectedBreaks
->elementAti(bp
) == 0) {
930 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
931 bp
, t
->srcLine
->elementAti(bp
), t
->srcCol
->elementAti(bp
));
933 // The break was expected.
934 // Check that the {nnn} tag value is correct.
935 int32_t expectedTagVal
= t
->expectedBreaks
->elementAti(bp
);
936 if (expectedTagVal
== -1) {
939 int line
= t
->srcLine
->elementAti(bp
);
940 int32_t rs
= ((RuleBasedBreakIterator
*)t
->bi
)->getRuleStatus();
941 if (rs
!= expectedTagVal
) {
942 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
943 " Actual, Expected status = %4d, %4d",
944 bp
, line
, t
->srcCol
->elementAti(bp
), rs
, expectedTagVal
);
951 // Verify that there were no missed breaks prior to the last one found
952 for (i
=prevBP
-1; i
>=0; i
--) {
953 if (t
->expectedBreaks
->elementAti(i
) != 0) {
954 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
955 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
));
959 // Check isBoundary()
960 for (i
=0; i
<t
->expectedBreaks
->size(); i
++) {
961 UBool boundaryExpected
= (t
->expectedBreaks
->elementAti(i
) != 0);
962 UBool boundaryFound
= t
->bi
->isBoundary(i
);
963 if (boundaryExpected
!= boundaryFound
) {
964 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
965 " Expected, Actual= %s, %s",
966 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
),
967 boundaryExpected
? "true":"false", boundaryFound
? "true" : "false");
972 for (i
=0; i
<t
->expectedBreaks
->size(); i
++) {
973 int32_t actualBreak
= t
->bi
->following(i
);
974 int32_t expectedBreak
= BreakIterator::DONE
;
975 for (int32_t j
=i
+1; j
< t
->expectedBreaks
->size(); j
++) {
976 if (t
->expectedBreaks
->elementAti(j
) != 0) {
981 if (expectedBreak
!= actualBreak
) {
982 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
983 " Expected, Actual= %d, %d",
984 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
), expectedBreak
, actualBreak
);
989 for (i
=t
->expectedBreaks
->size(); i
>=0; i
--) {
990 int32_t actualBreak
= t
->bi
->preceding(i
);
991 int32_t expectedBreak
= BreakIterator::DONE
;
993 for (int32_t j
=i
-1; j
>= 0; j
--) {
994 if (t
->expectedBreaks
->elementAti(j
) != 0) {
999 if (expectedBreak
!= actualBreak
) {
1000 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
1001 " Expected, Actual= %d, %d",
1002 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
), expectedBreak
, actualBreak
);
1008 void RBBITest::TestExtended() {
1009 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1010 UErrorCode status
= U_ZERO_ERROR
;
1013 UnicodeString rules
;
1016 tp
.expectedBreaks
= new UVector32(status
);
1017 tp
.srcLine
= new UVector32(status
);
1018 tp
.srcCol
= new UVector32(status
);
1020 RegexMatcher
localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@=-]*) *>"), 0, status
);
1021 if (U_FAILURE(status
)) {
1022 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__
, __LINE__
, u_errorName(status
));
1027 // Open and read the test data file.
1029 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
1030 char testFileName
[1000];
1031 if (testDataDirectory
== NULL
|| strlen(testDataDirectory
) >= sizeof(testFileName
)) {
1032 errln("Can't open test data. Path too long.");
1035 strcpy(testFileName
, testDataDirectory
);
1036 strcat(testFileName
, "rbbitst.txt");
1039 UChar
*testFile
= ReadAndConvertFile(testFileName
, len
, "UTF-8", status
);
1040 if (U_FAILURE(status
)) {
1041 return; /* something went wrong, error already output */
1048 // Put the test data into a UnicodeString
1050 UnicodeString
testString(FALSE
, testFile
, len
);
1058 parseState
= PARSE_TAG
;
1060 EParseState savedState
= PARSE_TAG
;
1062 static const UChar CH_LF
= 0x0a;
1063 static const UChar CH_CR
= 0x0d;
1064 static const UChar CH_HASH
= 0x23;
1065 /*static const UChar CH_PERIOD = 0x2e;*/
1066 static const UChar CH_LT
= 0x3c;
1067 static const UChar CH_GT
= 0x3e;
1068 static const UChar CH_BACKSLASH
= 0x5c;
1069 static const UChar CH_BULLET
= 0x2022;
1071 int32_t lineNum
= 1;
1072 int32_t colStart
= 0;
1074 int32_t charIdx
= 0;
1076 int32_t tagValue
= 0; // The numeric value of a <nnn> tag.
1078 for (charIdx
= 0; charIdx
< len
; ) {
1079 status
= U_ZERO_ERROR
;
1080 UChar c
= testString
.charAt(charIdx
);
1082 if (c
== CH_CR
&& charIdx
<len
&& testString
.charAt(charIdx
) == CH_LF
) {
1083 // treat CRLF as a unit
1087 if (c
== CH_LF
|| c
== CH_CR
) {
1091 column
= charIdx
- colStart
+ 1;
1093 switch (parseState
) {
1095 if (c
== 0x0a || c
== 0x0d) {
1096 parseState
= savedState
;
1103 parseState
= PARSE_COMMENT
;
1104 savedState
= PARSE_TAG
;
1107 if (u_isUWhiteSpace(c
)) {
1110 if (testString
.compare(charIdx
-1, 6, "<word>") == 0) {
1112 tp
.bi
= BreakIterator::createWordInstance(locale
, status
);
1116 if (testString
.compare(charIdx
-1, 6, "<char>") == 0) {
1118 tp
.bi
= BreakIterator::createCharacterInstance(locale
, status
);
1122 if (testString
.compare(charIdx
-1, 6, "<line>") == 0) {
1124 tp
.bi
= BreakIterator::createLineInstance(locale
, status
);
1128 if (testString
.compare(charIdx
-1, 6, "<sent>") == 0) {
1131 tp
.bi
= BreakIterator::createSentenceInstance(locale
, status
);
1135 if (testString
.compare(charIdx
-1, 7, "<title>") == 0) {
1137 tp
.bi
= BreakIterator::createTitleInstance(locale
, status
);
1142 // <locale loc_name>
1143 localeMatcher
.reset(testString
);
1144 if (localeMatcher
.lookingAt(charIdx
-1, status
)) {
1145 UnicodeString localeName
= localeMatcher
.group(1, status
);
1146 char localeName8
[100];
1147 localeName
.extract(0, localeName
.length(), localeName8
, sizeof(localeName8
), 0);
1148 locale
= Locale::createFromName(localeName8
);
1149 charIdx
+= localeMatcher
.group(0, status
).length() - 1;
1150 TEST_ASSERT_SUCCESS(status
);
1153 if (testString
.compare(charIdx
-1, 6, "<data>") == 0) {
1154 parseState
= PARSE_DATA
;
1156 tp
.dataToBreak
= "";
1157 tp
.expectedBreaks
->removeAllElements();
1158 tp
.srcCol
->removeAllElements();
1159 tp
.srcLine
->removeAllElements();
1163 errln("line %d: Tag expected in test file.", lineNum
);
1164 parseState
= PARSE_COMMENT
;
1165 savedState
= PARSE_DATA
;
1166 goto end_test
; // Stop the test.
1171 if (c
== CH_BULLET
) {
1172 int32_t breakIdx
= tp
.dataToBreak
.length();
1173 tp
.expectedBreaks
->setSize(breakIdx
+1);
1174 tp
.expectedBreaks
->setElementAt(-1, breakIdx
);
1175 tp
.srcLine
->setSize(breakIdx
+1);
1176 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1177 tp
.srcCol
->setSize(breakIdx
+1);
1178 tp
.srcCol
->setElementAt(column
, breakIdx
);
1182 if (testString
.compare(charIdx
-1, 7, "</data>") == 0) {
1183 // Add final entry to mappings from break location to source file position.
1184 // Need one extra because last break position returned is after the
1185 // last char in the data, not at the last char.
1186 tp
.srcLine
->addElement(lineNum
, status
);
1187 tp
.srcCol
->addElement(column
, status
);
1189 parseState
= PARSE_TAG
;
1197 if (testString
.compare(charIdx
-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1198 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1199 // Get the code point from the name and insert it into the test data.
1200 // (Damn, no API takes names in Unicode !!!
1201 // we've got to take it back to char *)
1202 int32_t nameEndIdx
= testString
.indexOf((UChar
)0x7d/*'}'*/, charIdx
);
1203 int32_t nameLength
= nameEndIdx
- (charIdx
+2);
1204 char charNameBuf
[200];
1205 UChar32 theChar
= -1;
1206 if (nameEndIdx
!= -1) {
1207 UErrorCode status
= U_ZERO_ERROR
;
1208 testString
.extract(charIdx
+2, nameLength
, charNameBuf
, sizeof(charNameBuf
));
1209 charNameBuf
[sizeof(charNameBuf
)-1] = 0;
1210 theChar
= u_charFromName(U_UNICODE_CHAR_NAME
, charNameBuf
, &status
);
1211 if (U_FAILURE(status
)) {
1215 if (theChar
== -1) {
1216 errln("Error in named character in test file at line %d, col %d",
1219 // Named code point was recognized. Insert it
1220 // into the test data.
1221 tp
.dataToBreak
.append(theChar
);
1222 while (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1223 tp
.srcLine
->addElement(lineNum
, status
);
1224 tp
.srcCol
->addElement(column
, status
);
1227 if (nameEndIdx
> charIdx
) {
1228 charIdx
= nameEndIdx
+1;
1237 if (testString
.compare(charIdx
-1, 2, "<>") == 0) {
1239 int32_t breakIdx
= tp
.dataToBreak
.length();
1240 tp
.expectedBreaks
->setSize(breakIdx
+1);
1241 tp
.expectedBreaks
->setElementAt(-1, breakIdx
);
1242 tp
.srcLine
->setSize(breakIdx
+1);
1243 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1244 tp
.srcCol
->setSize(breakIdx
+1);
1245 tp
.srcCol
->setElementAt(column
, breakIdx
);
1251 parseState
= PARSE_NUM
;
1255 if (c
== CH_HASH
&& column
==3) { // TODO: why is column off so far?
1256 parseState
= PARSE_COMMENT
;
1257 savedState
= PARSE_DATA
;
1261 if (c
== CH_BACKSLASH
) {
1262 // Check for \ at end of line, a line continuation.
1263 // Advance over (discard) the newline
1264 UChar32 cp
= testString
.char32At(charIdx
);
1265 if (cp
== CH_CR
&& charIdx
<len
&& testString
.charAt(charIdx
+1) == CH_LF
) {
1267 // Need an extra increment of the input ptr to move over both of them
1270 if (cp
== CH_LF
|| cp
== CH_CR
) {
1277 // Let unescape handle the back slash.
1278 cp
= testString
.unescapeAt(charIdx
);
1280 // Escape sequence was recognized. Insert the char
1281 // into the test data.
1282 tp
.dataToBreak
.append(cp
);
1283 while (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1284 tp
.srcLine
->addElement(lineNum
, status
);
1285 tp
.srcCol
->addElement(column
, status
);
1291 // Not a recognized backslash escape sequence.
1292 // Take the next char as a literal.
1293 // TODO: Should this be an error?
1294 c
= testString
.charAt(charIdx
);
1295 charIdx
= testString
.moveIndex32(charIdx
, 1);
1298 // Normal, non-escaped data char.
1299 tp
.dataToBreak
.append(c
);
1301 // Save the mapping from offset in the data to line/column numbers in
1302 // the original input file. Will be used for better error messages only.
1303 // If there's an expected break before this char, the slot in the mapping
1304 // vector will already be set for this char; don't overwrite it.
1305 if (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1306 tp
.srcLine
->addElement(lineNum
, status
);
1307 tp
.srcCol
->addElement(column
, status
);
1313 // We are parsing an expected numeric tag value, like <1234>,
1314 // within a chunk of data.
1315 if (u_isUWhiteSpace(c
)) {
1320 // Finished the number. Add the info to the expected break data,
1321 // and switch parse state back to doing plain data.
1322 parseState
= PARSE_DATA
;
1323 if (tagValue
== 0) {
1326 int32_t breakIdx
= tp
.dataToBreak
.length();
1327 tp
.expectedBreaks
->setSize(breakIdx
+1);
1328 tp
.expectedBreaks
->setElementAt(tagValue
, breakIdx
);
1329 tp
.srcLine
->setSize(breakIdx
+1);
1330 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1331 tp
.srcCol
->setSize(breakIdx
+1);
1332 tp
.srcCol
->setElementAt(column
, breakIdx
);
1337 tagValue
= tagValue
*10 + u_charDigitValue(c
);
1341 errln("Syntax Error in test file at line %d, col %d",
1343 parseState
= PARSE_COMMENT
;
1344 goto end_test
; // Stop the test
1349 if (U_FAILURE(status
)) {
1350 dataerrln("ICU Error %s while parsing test file at line %d.",
1351 u_errorName(status
), lineNum
);
1352 status
= U_ZERO_ERROR
;
1353 goto end_test
; // Stop the test
1360 delete tp
.expectedBreaks
;
1368 //-------------------------------------------------------------------------------
1370 // TestDictRules create a break iterator from source rules that includes a
1371 // dictionary range. Regression for bug #7130. Source rules
1372 // do not declare a break iterator type (word, line, sentence, etc.),
1373 // but the dictionary code, without a type, would loop.
1375 //-------------------------------------------------------------------------------
// Builds a RuleBasedBreakIterator directly from rule source that uses a
// $dictionary set, then steps through a short text with next() and asserts
// on how many calls were made before the loop ended.
1376 void RBBITest::TestDictRules() {
1377 const char *rules
= "$dictionary = [a-z]; \n"
1379 "$dictionary $dictionary; \n"
1381 "$dictionary $dictionary; \n";
1382 const char *text
= "aa";
1383 UErrorCode status
= U_ZERO_ERROR
;
1384 UParseError parseError
;
1386 RuleBasedBreakIterator
bi(rules
, parseError
, status
);
1387 if (U_SUCCESS(status
)) {
1388 UnicodeString utext
= text
;
// The loop is capped at 10 iterations, so an iterator that never reports
// DONE cannot hang the test.
1392 for (loops
= 0; loops
<10; loops
++) {
1393 position
= bi
.next();
1394 if (position
== RuleBasedBreakIterator::DONE
) {
1398 TEST_ASSERT(loops
== 1);
// The iterator could not be built from the rules; reported via dataerrln().
1400 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status
));
1406 //-------------------------------------------------------------------------------
1408 // ReadAndConvertFile Read a text data file, convert it to UChars, and
1409 // return the data in one big UChar * buffer, which the caller must delete.
1412 // fileName: the name of the file, with no directory part. The test data directory
1414 // ulen an out parameter, receives the actual length (in UChars) of the file data.
1415 // encoding The file encoding. If the file contains a BOM, that will override the encoding
1416 // specified here. The BOM, if it exists, will be stripped from the returned data.
1417 // Pass NULL for the system default encoding.
1420 // The file data, converted to UChar.
1421 // The caller must delete this when done with
1422 // delete [] theBuffer;
1424 // TODO: This is a clone of RegexTest::ReadAndConvertFile.
1425 // Move this function to some common place.
1427 //--------------------------------------------------------------------------------
// Reads the named file in binary mode, looks for a Unicode signature (BOM)
// at the start of the data, opens a converter for the resulting encoding and
// converts the bytes to UChars. ulen receives the converted length.
// Failures are reported through status and errln()/dataerrln().
1428 UChar
*RBBITest::ReadAndConvertFile(const char *fileName
, int &ulen
, const char *encoding
, UErrorCode
&status
) {
1429 UChar
*retPtr
= NULL
;
1430 char *fileBuf
= NULL
;
1431 UConverter
* conv
= NULL
;
1435 if (U_FAILURE(status
)) {
// "rb": read the raw bytes; decoding is left to the converter below.
1442 f
= fopen(fileName
, "rb");
1444 dataerrln("Error opening test data file %s\n", fileName
);
1445 status
= U_FILE_ACCESS_ERROR
;
// Find the file size by seeking to the end, then rewind and read the
// whole file with a single fread().
// NOTE(review): fileBuf is allocated from fileSize before the
// "fileSize <= 0" check below runs; ftell() returns -1 on failure, so the
// size should probably be validated before the allocation - confirm.
1454 fseek( f
, 0, SEEK_END
);
1455 fileSize
= ftell(f
);
1456 fileBuf
= new char[fileSize
];
1457 fseek(f
, 0, SEEK_SET
);
1458 amt_read
= fread(fileBuf
, 1, fileSize
, f
);
1459 if (amt_read
!= fileSize
|| fileSize
<= 0) {
1460 errln("Error reading test data file.");
1461 goto cleanUpAndReturn
;
1465 // Look for a Unicode Signature (BOM) on the data just read
1467 int32_t signatureLength
;
1468 const char * fileBufC
;
1469 const char* bomEncoding
;
1472 bomEncoding
= ucnv_detectUnicodeSignature(
1473 fileBuf
, fileSize
, &signatureLength
, &status
);
// A signature was recognized: step past it, shrink the remaining byte
// count, and let the encoding it names replace the caller's choice.
1474 if(bomEncoding
!=NULL
){
1475 fileBufC
+= signatureLength
;
1476 fileSize
-= signatureLength
;
1477 encoding
= bomEncoding
;
1481 // Open a converter to take the rule file to UTF-16
1483 conv
= ucnv_open(encoding
, &status
);
1484 if (U_FAILURE(status
)) {
1485 goto cleanUpAndReturn
;
1489 // Convert the rules to UChar.
1490 // Preflight first to determine required buffer size.
1492 ulen
= ucnv_toUChars(conv
,
1498 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
1499 // Buffer Overflow is expected from the preflight operation.
1500 status
= U_ZERO_ERROR
;
// One UChar more than the preflighted length is allocated.
1502 retPtr
= new UChar
[ulen
+1];
1515 if (U_FAILURE(status
)) {
1516 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
1526 //--------------------------------------------------------------------------------------------
1528 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
1530 //-------------------------------------------------------------------------------------------
// For each of the four break iterator kinds (character, word, sentence,
// line) creates an English-locale iterator and, if creation succeeded,
// runs the matching Unicode test data file through runUnicodeTestData().
1531 void RBBITest::TestUnicodeFiles() {
1532 RuleBasedBreakIterator
*bi
;
1533 UErrorCode status
= U_ZERO_ERROR
;
// Character (grapheme cluster) boundaries.
1535 bi
= (RuleBasedBreakIterator
*)BreakIterator::createCharacterInstance(Locale::getEnglish(), status
);
1536 TEST_ASSERT_SUCCESS(status
);
1537 if (U_SUCCESS(status
)) {
1538 runUnicodeTestData("GraphemeBreakTest.txt", bi
);
// Word boundaries.
1542 bi
= (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getEnglish(), status
);
1543 TEST_ASSERT_SUCCESS(status
);
1544 if (U_SUCCESS(status
)) {
1545 runUnicodeTestData("WordBreakTest.txt", bi
);
// Sentence boundaries.
1549 bi
= (RuleBasedBreakIterator
*)BreakIterator::createSentenceInstance(Locale::getEnglish(), status
);
1550 TEST_ASSERT_SUCCESS(status
);
1551 if (U_SUCCESS(status
)) {
1552 runUnicodeTestData("SentenceBreakTest.txt", bi
);
// Line boundaries.
1556 bi
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale::getEnglish(), status
);
1557 TEST_ASSERT_SUCCESS(status
);
1558 if (U_SUCCESS(status
)) {
1559 runUnicodeTestData("LineBreakTest.txt", bi
);
1565 //--------------------------------------------------------------------------------------------
1567 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
1569 //-------------------------------------------------------------------------------------------
// Builds the path of the named data file under the source test data
// directory, loads it with ReadAndConvertFile(), tokenizes it with a regular
// expression and calls checkUnicodeTestCase() for each line that contained
// test data. The body is compiled only when regular expressions are enabled.
1570 void RBBITest::runUnicodeTestData(const char *fileName
, RuleBasedBreakIterator
*bi
) {
1571 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1572 // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb. Ticket #7270
1573 UBool isTicket7270Fixed
= !logKnownIssue("7270");
1574 UBool isLineBreak
= 0 == strcmp(fileName
, "LineBreakTest.txt");
1575 UErrorCode status
= U_ZERO_ERROR
;
1578 // Open and read the test data file, put it into a UnicodeString.
1580 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
1581 char testFileName
[1000];
// NOTE(review): the length check below looks only at the directory part,
// but strcat() then appends fileName as well; the combined length is not
// checked against sizeof(testFileName) - confirm this is acceptable.
1582 if (testDataDirectory
== NULL
|| strlen(testDataDirectory
) >= sizeof(testFileName
)) {
1583 dataerrln("Can't open test data. Path too long.");
1586 strcpy(testFileName
, testDataDirectory
);
1587 strcat(testFileName
, fileName
);
1589 logln("Opening data file %s\n", fileName
);
1592 UChar
*testFile
= ReadAndConvertFile(testFileName
, len
, "UTF-8", status
);
// A file access error is not re-asserted here; ReadAndConvertFile() reports
// it with dataerrln() when fopen() fails.
1593 if (status
!= U_FILE_ACCESS_ERROR
) {
1594 TEST_ASSERT_SUCCESS(status
);
1595 TEST_ASSERT(testFile
!= NULL
);
1597 if (U_FAILURE(status
) || testFile
== NULL
) {
1598 return; /* something went wrong, error already output */
1600 UnicodeString
testFileAsString(TRUE
, testFile
, len
);
1603 // Parse the test data file using a regular expression.
1604 // Each kind of token is recognized in its own capture group; what type of item was scanned
1605 // is identified by which group had a match.
1607 // Capture Group # 1 2 3 4 5
1608 // Parses this item: divide x hex digits comment \n unrecognized \n
1610 UnicodeString
tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV
);
1611 RegexMatcher
tokenMatcher(tokenExpr
, testFileAsString
, UREGEX_MULTILINE
| UREGEX_DOTALL
, status
);
1612 UnicodeString testString
;
1613 UVector32
breakPositions(status
);
1615 TEST_ASSERT_SUCCESS(status
);
1616 if (U_FAILURE(status
)) {
1621 // Scan through each test case, building up the string to be broken in testString,
1622 // and the positions that should be boundaries in the breakPositions vector.
1625 while (tokenMatcher
.find()) {
1626 if(tokenMatcher
.hitEnd()) {
1627 /* Shouldn't Happen(TM). This means we didn't find the symbols we were looking for.
1628 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1629 and caused an infinite loop here on EBCDIC systems!
1631 fprintf(stderr
,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName
, ++spin
);
1634 if (tokenMatcher
.start(1, status
) >= 0) {
1635 // Scanned a divide sign, indicating a break position in the test data.
// A divide sign seen while testString is still empty is not recorded.
1636 if (testString
.length()>0) {
1637 breakPositions
.addElement(testString
.length(), status
);
1640 else if (tokenMatcher
.start(2, status
) >= 0) {
1641 // Scanned an 'x', meaning no break at this position in the test data
1642 // Nothing to be done here.
1644 else if (tokenMatcher
.start(3, status
) >= 0) {
1645 // Scanned Hex digits. Convert them to binary, append to the character data string.
1646 const UnicodeString
&hexNumber
= tokenMatcher
.group(3, status
);
1647 int length
= hexNumber
.length();
1650 hexNumber
.extract (0, length
, buf
, sizeof(buf
), US_INV
);
1651 UChar32 c
= (UChar32
)strtol(buf
, NULL
, 16);
1653 testString
.append(c
);
1655 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1656 fileName
, lineNumber
);
1659 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1660 fileName
, lineNumber
);
1663 else if (tokenMatcher
.start(4, status
) >= 0) {
1664 // Scanned to end of a line, possibly skipping over a comment in the process.
1665 // If the line from the file contained test data, run the test now.
1667 if (testString
.length() > 0) {
1668 // TODO(andy): Remove this time bomb code. Note: Failing line numbers may change when updating to new Unicode data.
1671 // is not yet implemented.
// The listed line numbers of LineBreakTest.txt are skipped while
// isTicket7270Fixed is false.
1672 if (!(isLineBreak
&& !isTicket7270Fixed
&& (5198 == lineNumber
||
1673 5202 == lineNumber
||
1674 5214 == lineNumber
||
1675 5246 == lineNumber
||
1676 5298 == lineNumber
||
1677 5302 == lineNumber
))) {
1678 checkUnicodeTestCase(fileName
, lineNumber
, testString
, &breakPositions
, bi
);
1682 // Clear out this test case.
1683 // The string and breakPositions vector will be refilled as the next
1684 // test case is parsed.
1685 testString
.remove();
1686 breakPositions
.removeAllElements();
1689 // Scanner catchall. Something unrecognized appeared on the line.
1691 UnicodeString uToken
= tokenMatcher
.group(0, status
);
1692 uToken
.extract(0, uToken
.length(), token
, (uint32_t)sizeof(token
));
1693 token
[sizeof(token
)-1] = 0;
1694 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName
, lineNumber
, token
);
1696 // Clean up, in preparation for continuing with the next line.
1697 testString
.remove();
1698 breakPositions
.removeAllElements();
1701 TEST_ASSERT_SUCCESS(status
);
1702 if (U_FAILURE(status
)) {
1708 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1711 //--------------------------------------------------------------------------------------------
1713 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1714 // test data files. Do only a simple, forward-only check -
1715 // this test is mostly to check that ICU and the Unicode
1716 // data agree with each other.
1718 //--------------------------------------------------------------------------------------------
// Sets testString on the iterator and compares each break position against
// the expected positions in breakPositions. Unexpected breaks and expected
// breaks that were not found are reported with errln(), naming the test
// file and line number.
1719 void RBBITest::checkUnicodeTestCase(const char *testFileName
, int lineNumber
,
1720 const UnicodeString
&testString
, // Text data to be broken
1721 UVector32
*breakPositions
, // Positions where breaks should be found.
1722 RuleBasedBreakIterator
*bi
) {
1723 int32_t pos
; // Break Position in the test string
1724 int32_t expectedI
= 0; // Index of expected break position in the vector of expected results.
1725 int32_t expectedPos
; // Expected break position (index into test string)
1727 bi
->setText(testString
);
1731 while (pos
!= BreakIterator::DONE
) {
// More breaks were found than the test data lists.
1732 if (expectedI
>= breakPositions
->size()) {
1733 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1734 testFileName
, lineNumber
, pos
);
1737 expectedPos
= breakPositions
->elementAti(expectedI
);
// A break was found before the next expected position.
1738 if (pos
< expectedPos
) {
1739 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1740 testFileName
, lineNumber
, pos
);
// The break found lies beyond the expected position, so the expected one
// was missed.
1743 if (pos
> expectedPos
) {
1744 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1745 testFileName
, lineNumber
, expectedPos
);
// Iteration finished while expected positions remain unmatched.
1752 if (pos
==BreakIterator::DONE
&& expectedI
<breakPositions
->size()) {
1753 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1754 testFileName
, lineNumber
, breakPositions
->elementAti(expectedI
));
1760 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1761 //---------------------------------------------------------------------------------------
1763 // class RBBIMonkeyKind
1765 // Monkey Test for Break Iteration
1766 // Abstract interface class. Concrete derived classes independently
1767 // implement the break rules for different iterator types.
1769 // The Monkey Test itself doesn't know which type of break iterator it is
1770 // testing, but works purely in terms of the interface defined here.
1772 //---------------------------------------------------------------------------------------
// Interface with three pure virtual operations (charClasses, setText, next)
// plus a status field that the constructors in this file fill in.
1773 class RBBIMonkeyKind
{
1775 // Return a UVector of UnicodeSets, representing the character classes used
1776 // for this type of iterator.
1777 virtual UVector
*charClasses() = 0;
1779 // Set the test text on which subsequent calls to next() will operate
1780 virtual void setText(const UnicodeString
&s
) = 0;
1782 // Find the next break position, starting from the prev break position, or from zero.
1783 // Return -1 after reaching end of string.
1784 virtual int32_t next(int32_t i
) = 0;
1786 virtual ~RBBIMonkeyKind();
// Initialized to U_ZERO_ERROR by the RBBIMonkeyKind constructor; derived
// class constructors store a failing status here and their next()
// implementations test it with U_FAILURE().
1787 UErrorCode deferredStatus
;
// Constructor: starts with no deferred error recorded.
1796 RBBIMonkeyKind::RBBIMonkeyKind() {
1797 deferredStatus
= U_ZERO_ERROR
;
// Out-of-line definition of the virtual destructor.
1800 RBBIMonkeyKind::~RBBIMonkeyKind() {
1804 //----------------------------------------------------------------------------------------
1806 // Random Numbers. Similar to standard lib rand() and srand()
1807 // Not using library to
1808 // 1. Get same results on all platforms.
1809 // 2. Get access to current seed, to more easily reproduce failures.
1811 //---------------------------------------------------------------------------------------
// Current generator state; file-local, initial value 1.
1812 static uint32_t m_seed
= 1;
// Advances m_seed with a multiply-and-add step (unsigned 32-bit arithmetic)
// and returns a value in the range 0..32767 derived from the new seed.
1814 static uint32_t m_rand()
1816 m_seed
= m_seed
* 1103515245 + 12345;
1817 return (uint32_t)(m_seed
/65536) % 32768;
1821 //------------------------------------------------------------------------------------------
1823 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
1824 // of RBBIMonkeyKind.
1826 //------------------------------------------------------------------------------------------
// RBBIMonkeyKind implementation for character (grapheme cluster) breaks.
// Holds one UnicodeSet per character class used by the rules in next().
1827 class RBBICharMonkey
: public RBBIMonkeyKind
{
1830 virtual ~RBBICharMonkey();
1831 virtual UVector
*charClasses();
1832 virtual void setText(const UnicodeString
&s
);
1833 virtual int32_t next(int32_t i
);
// Character class sets; each is allocated with new in the constructor.
1837 UnicodeSet
*fCRLFSet
;
1838 UnicodeSet
*fControlSet
;
1839 UnicodeSet
*fExtendSet
;
1840 UnicodeSet
*fRegionalIndicatorSet
;
1841 UnicodeSet
*fPrependSet
;
1842 UnicodeSet
*fSpacingSet
;
1847 UnicodeSet
*fLVTSet
;
// Union of the L, V, T, LV and LVT sets (built in the constructor).
1848 UnicodeSet
*fHangulSet
;
// Every code point from 0 to 0x10ffff.
1849 UnicodeSet
*fAnySet
;
// Text being examined; dereferenced by next().
1851 const UnicodeString
*fText
;
// Builds one UnicodeSet per Grapheme_Cluster_Break property value, derives
// the combined Hangul set and the "any" set, and collects the sets into
// fSets. A failing status is saved in deferredStatus rather than reported
// here.
1855 RBBICharMonkey::RBBICharMonkey() {
1856 UErrorCode status
= U_ZERO_ERROR
;
1860 fCRLFSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status
);
1861 fControlSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status
);
1862 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status
);
1863 fRegionalIndicatorSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status
);
1864 fPrependSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status
);
1865 fSpacingSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status
);
1866 fLSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status
);
1867 fVSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status
);
1868 fTSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status
);
1869 fLVSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status
);
1870 fLVTSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status
);
// The Hangul set is the union of the five Hangul-related sets above.
1871 fHangulSet
= new UnicodeSet();
1872 fHangulSet
->addAll(*fLSet
);
1873 fHangulSet
->addAll(*fVSet
);
1874 fHangulSet
->addAll(*fTSet
);
1875 fHangulSet
->addAll(*fLVSet
);
1876 fHangulSet
->addAll(*fLVTSet
);
1877 fAnySet
= new UnicodeSet(0, 0x10ffff);
// Collect the sets into fSets. The individual L/V/T/LV/LVT sets are not
// added on their own, only the combined Hangul set.
1879 fSets
= new UVector(status
);
1880 fSets
->addElement(fCRLFSet
, status
);
1881 fSets
->addElement(fControlSet
, status
);
1882 fSets
->addElement(fExtendSet
, status
);
1883 fSets
->addElement(fRegionalIndicatorSet
, status
);
// The Prepend set is added only when it is non-empty.
1884 if (!fPrependSet
->isEmpty()) {
1885 fSets
->addElement(fPrependSet
, status
);
1887 fSets
->addElement(fSpacingSet
, status
);
1888 fSets
->addElement(fHangulSet
, status
);
1889 fSets
->addElement(fAnySet
, status
);
// Keep any failure so it can be checked later (next() tests deferredStatus).
1890 if (U_FAILURE(status
)) {
1891 deferredStatus
= status
;
// Set the test text on which subsequent calls to next() will operate.
1896 void RBBICharMonkey::setText(const UnicodeString
&s
) {
// Looks for the next grapheme cluster boundary after prevPos by examining
// the code points around each candidate position and applying the GB rules
// below in order.
1902 int32_t RBBICharMonkey::next(int32_t prevPos
) {
1903 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
1904 // break position being tested. The candidate break
1905 // location is before p2.
1909 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
// A failure recorded by the constructor is checked before any work is done.
1911 if (U_FAILURE(deferredStatus
)) {
1915 // Previous break at end of string. return DONE.
1916 if (prevPos
>= fText
->length()) {
1919 p0
= p1
= p2
= p3
= prevPos
;
1920 c3
= fText
->char32At(prevPos
);
1922 (void)p0
; // suppress set but not used warning.
1925 // Loop runs once per "significant" character position in the input text.
1927 // Move all of the positions forward in the input string.
1932 // Advance p3 by one codepoint
1933 p3
= fText
->moveIndex32(p3
, 1);
1934 c3
= fText
->char32At(p3
);
1937 // Still warming up the loop. (won't work with zero length strings, but we don't care)
1940 if (p2
== fText
->length()) {
1941 // Reached end of string. Always a break position.
1946 // No Extend or Format characters may appear between the CR and LF,
1947 // which requires the additional check for p2 immediately following p1.
1949 if (c1
==0x0D && c2
==0x0A && p1
==(p2
-1)) {
1953 // Rule (GB4). ( Control | CR | LF ) <break>
1954 if (fControlSet
->contains(c1
) ||
1960 // Rule (GB5) <break> ( Control | CR | LF )
1962 if (fControlSet
->contains(c2
) ||
1969 // Rule (GB6) L x ( L | V | LV | LVT )
1970 if (fLSet
->contains(c1
) &&
1971 (fLSet
->contains(c2
) ||
1972 fVSet
->contains(c2
) ||
1973 fLVSet
->contains(c2
) ||
1974 fLVTSet
->contains(c2
))) {
1978 // Rule (GB7) ( LV | V ) x ( V | T )
1979 if ((fLVSet
->contains(c1
) || fVSet
->contains(c1
)) &&
1980 (fVSet
->contains(c2
) || fTSet
->contains(c2
))) {
1984 // Rule (GB8) ( LVT | T) x T
1985 if ((fLVTSet
->contains(c1
) || fTSet
->contains(c1
)) &&
1986 fTSet
->contains(c2
)) {
1990 // Just adding the extra Apple rule here does not work; behavior depends on arbitrary context
1992 // Rule (GB8a) Regional_Indicator x Regional_Indicator
1993 if (fRegionalIndicatorSet
->contains(c1
) && fRegionalIndicatorSet
->contains(c2
)) {
1997 // Rule (GB9) x Extend
1998 if (fExtendSet
->contains(c2
)) {
2002 // Rule (GB9a) x SpacingMark
2003 if (fSpacingSet
->contains(c2
)) {
2007 // Rule (GB9b) Prepend x
2008 if (fPrependSet
->contains(c1
)) {
2012 // Rule (GB10) Any <break> Any
// Return a UVector of UnicodeSets, representing the character classes used
// for this type of iterator.
2022 UVector
*RBBICharMonkey::charClasses() {
// Destructor: frees objects that the constructor allocated with new.
2027 RBBICharMonkey::~RBBICharMonkey() {
2032 delete fRegionalIndicatorSet
;
2044 //------------------------------------------------------------------------------------------
2046 // class RBBIWordMonkey Word Break specific implementation
2047 // of RBBIMonkeyKind.
2049 //------------------------------------------------------------------------------------------
// RBBIMonkeyKind implementation for word breaks. Holds one UnicodeSet per
// character class used by the rules in next().
2050 class RBBIWordMonkey
: public RBBIMonkeyKind
{
2053 virtual ~RBBIWordMonkey();
2054 virtual UVector
*charClasses();
2055 virtual void setText(const UnicodeString
&s
);
2056 virtual int32_t next(int32_t i
);
// Character class sets; each is allocated with new in the constructor.
2062 UnicodeSet
*fNewlineSet
;
2063 UnicodeSet
*fRegionalIndicatorSet
;
2064 UnicodeSet
*fKatakanaSet
;
2065 UnicodeSet
*fHebrew_LetterSet
;
2066 UnicodeSet
*fALetterSet
;
2067 // TODO(jungshik): Do we still need this change?
2068 // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt
2069 UnicodeSet
*fSingle_QuoteSet
;
2070 UnicodeSet
*fDouble_QuoteSet
;
2071 UnicodeSet
*fMidNumLetSet
;
2072 UnicodeSet
*fMidLetterSet
;
2073 UnicodeSet
*fMidNumSet
;
2074 UnicodeSet
*fNumericSet
;
2075 UnicodeSet
*fFormatSet
;
// Starts as an empty set and is complemented in the constructor, which then
// removes the other sets from it.
2076 UnicodeSet
*fOtherSet
;
2077 UnicodeSet
*fExtendSet
;
2078 UnicodeSet
*fExtendNumLetSet
;
// Hangul syllables U+AC00..U+D7A3 plus Han, Hiragana and Katakana; removed
// from the ALetter and Other sets in the constructor.
2079 UnicodeSet
*fDictionaryCjkSet
;
// Text being examined; dereferenced by next().
2081 const UnicodeString
*fText
;
// Builds one UnicodeSet per Word_Break property value, derives fOtherSet as
// everything left after the named sets are removed, and collects the sets
// into fSets. A failing status is saved in deferredStatus rather than
// reported here.
2085 RBBIWordMonkey::RBBIWordMonkey()
2087 UErrorCode status
= U_ZERO_ERROR
;
2089 fSets
= new UVector(status
);
2091 fCRSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status
);
2092 fLFSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status
);
2093 fNewlineSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status
);
2094 fDictionaryCjkSet
= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status
);
2095 // Exclude Hangul syllables from ALetterSet during testing.
2096 // Leave CJK dictionary characters out from the monkey tests!
2098 fALetterSet
= new UnicodeSet("[\\p{Word_Break = ALetter}"
2099 "[\\p{Line_Break = Complex_Context}"
2100 "-\\p{Grapheme_Cluster_Break = Extend}"
2101 "-\\p{Grapheme_Cluster_Break = Control}"
2105 fRegionalIndicatorSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status
);
2106 fKatakanaSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status
);
2107 fHebrew_LetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status
);
2108 fALetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status
);
// Dictionary (CJK / Hangul) characters are taken out of the ALetter set.
2109 fALetterSet
->removeAll(*fDictionaryCjkSet
);
2110 fSingle_QuoteSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"), status
);
2111 fDouble_QuoteSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"), status
);
2112 fMidNumLetSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status
);
2113 fMidLetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status
);
2114 fMidNumSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status
);
2115 // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
2116 // we should figure out why
2117 fNumericSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status
);
2118 fFormatSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status
);
2119 fExtendNumLetSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status
);
2120 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status
);
2122 fOtherSet
= new UnicodeSet();
2123 if(U_FAILURE(status
)) {
2124 deferredStatus
= status
;
// fOtherSet: complement the empty set, then remove each named class.
// NOTE(review): fMidNumLetSet is not removed from fOtherSet in the list
// below although it is added to fSets further down - confirm whether that
// is intended.
2128 fOtherSet
->complement();
2129 fOtherSet
->removeAll(*fCRSet
);
2130 fOtherSet
->removeAll(*fLFSet
);
2131 fOtherSet
->removeAll(*fNewlineSet
);
2132 fOtherSet
->removeAll(*fKatakanaSet
);
2133 fOtherSet
->removeAll(*fHebrew_LetterSet
);
2134 fOtherSet
->removeAll(*fALetterSet
);
2135 fOtherSet
->removeAll(*fSingle_QuoteSet
);
2136 fOtherSet
->removeAll(*fDouble_QuoteSet
);
2137 fOtherSet
->removeAll(*fMidLetterSet
);
2138 fOtherSet
->removeAll(*fMidNumSet
);
2139 fOtherSet
->removeAll(*fNumericSet
);
2140 fOtherSet
->removeAll(*fExtendNumLetSet
);
2141 fOtherSet
->removeAll(*fFormatSet
);
2142 fOtherSet
->removeAll(*fExtendSet
);
2143 fOtherSet
->removeAll(*fRegionalIndicatorSet
);
2144 // Inhibit dictionary characters from being tested at all.
2145 fOtherSet
->removeAll(*fDictionaryCjkSet
);
2146 fOtherSet
->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status
));
// Collect the sets into fSets. The Katakana set is deliberately left out
// (see the commented-out line below).
2148 fSets
->addElement(fCRSet
, status
);
2149 fSets
->addElement(fLFSet
, status
);
2150 fSets
->addElement(fNewlineSet
, status
);
2151 fSets
->addElement(fRegionalIndicatorSet
, status
);
2152 fSets
->addElement(fHebrew_LetterSet
, status
);
2153 fSets
->addElement(fALetterSet
, status
);
2154 fSets
->addElement(fSingle_QuoteSet
, status
);
2155 fSets
->addElement(fDouble_QuoteSet
, status
);
2156 //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana
2157 fSets
->addElement(fMidLetterSet
, status
);
2158 fSets
->addElement(fMidNumLetSet
, status
);
2159 fSets
->addElement(fMidNumSet
, status
);
2160 fSets
->addElement(fNumericSet
, status
);
2161 fSets
->addElement(fFormatSet
, status
);
2162 fSets
->addElement(fExtendSet
, status
);
2163 fSets
->addElement(fOtherSet
, status
);
2164 fSets
->addElement(fExtendNumLetSet
, status
);
// Keep any failure so it can be checked later (next() tests deferredStatus).
2166 if (U_FAILURE(status
)) {
2167 deferredStatus
= status
;
// Set the test text on which subsequent calls to next() will operate.
2171 void RBBIWordMonkey::setText(const UnicodeString
&s
) {
2176 int32_t RBBIWordMonkey::next(int32_t prevPos
) {
2177 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2178 // break position being tested. The candidate break
2179 // location is before p2.
2183 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
2185 if (U_FAILURE(deferredStatus
)) {
2189 // Prev break at end of string. return DONE.
2190 if (prevPos
>= fText
->length()) {
2193 p0
= p1
= p2
= p3
= prevPos
;
2194 c3
= fText
->char32At(prevPos
);
2196 (void)p0
; // Suppress set but not used warning.
2198 // Loop runs once per "significant" character position in the input text.
2200 // Move all of the positions forward in the input string.
2205 // Advance p3 by X(Extend | Format)* Rule 4
2206 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2208 p3
= fText
->moveIndex32(p3
, 1);
2209 c3
= fText
->char32At(p3
);
2210 if (fCRSet
->contains(c2
) || fLFSet
->contains(c2
) || fNewlineSet
->contains(c2
)) {
2214 while (fFormatSet
->contains(c3
) || fExtendSet
->contains(c3
));
2218 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2221 if (p2
== fText
->length()) {
2222 // Reached end of string. Always a break position.
2227 // No Extend or Format characters may appear between the CR and LF,
2228 // which requires the additional check for p2 immediately following p1.
2230 if (c1
==0x0D && c2
==0x0A) {
2234 // Rule (3a) Break before and after newlines (including CR and LF)
2236 if (fCRSet
->contains(c1
) || fLFSet
->contains(c1
) || fNewlineSet
->contains(c1
)) {
2239 if (fCRSet
->contains(c2
) || fLFSet
->contains(c2
) || fNewlineSet
->contains(c2
)) {
2243 // Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
2244 if ((fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
)) &&
2245 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
))) {
2249 // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
2251 if ( (fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
)) &&
2252 (fMidLetterSet
->contains(c2
) || fMidNumLetSet
->contains(c2
) || fSingle_QuoteSet
->contains(c2
)) &&
2253 (fALetterSet
->contains(c3
) || fHebrew_LetterSet
->contains(c3
))) {
2257 // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)
2258 if ((fALetterSet
->contains(c0
) || fHebrew_LetterSet
->contains(c0
)) &&
2259 (fMidLetterSet
->contains(c1
) || fMidNumLetSet
->contains(c1
) || fSingle_QuoteSet
->contains(c1
)) &&
2260 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
))) {
2264 // Rule (7a) Hebrew_Letter x Single_Quote
2265 if (fHebrew_LetterSet
->contains(c1
) && fSingle_QuoteSet
->contains(c2
)) {
2269 // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter
2270 if (fHebrew_LetterSet
->contains(c1
) && fDouble_QuoteSet
->contains(c2
) && fHebrew_LetterSet
->contains(c3
)) {
2274 // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter
2275 if (fHebrew_LetterSet
->contains(c0
) && fDouble_QuoteSet
->contains(c1
) && fHebrew_LetterSet
->contains(c2
)) {
2279 // Rule (8) Numeric x Numeric
2280 if (fNumericSet
->contains(c1
) &&
2281 fNumericSet
->contains(c2
)) {
2285 // Rule (9) (ALetter | Hebrew_Letter) x Numeric
2286 if ((fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
)) &&
2287 fNumericSet
->contains(c2
)) {
2291 // Rule (10) Numeric x (ALetter | Hebrew_Letter)
2292 if (fNumericSet
->contains(c1
) &&
2293 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
))) {
2297 // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric
2298 if (fNumericSet
->contains(c0
) &&
2299 (fMidNumSet
->contains(c1
) || fMidNumLetSet
->contains(c1
) || fSingle_QuoteSet
->contains(c1
)) &&
2300 fNumericSet
->contains(c2
)) {
2304 // Rule (12) Numeric x (MidNum | MidNumLet | Single_Quote) Numeric
2305 if (fNumericSet
->contains(c1
) &&
2306 (fMidNumSet
->contains(c2
) || fMidNumLetSet
->contains(c2
) || fSingle_QuoteSet
->contains(c2
)) &&
2307 fNumericSet
->contains(c3
)) {
2311 // Rule (13) Katakana x Katakana
2312 if (fKatakanaSet
->contains(c1
) &&
2313 fKatakanaSet
->contains(c2
)) {
2317 // Rule 13a (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet
2318 if ((fALetterSet
->contains(c1
) || fHebrew_LetterSet
->contains(c1
) ||fNumericSet
->contains(c1
) ||
2319 fKatakanaSet
->contains(c1
) || fExtendNumLetSet
->contains(c1
)) &&
2320 fExtendNumLetSet
->contains(c2
)) {
2324 // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
2325 if (fExtendNumLetSet
->contains(c1
) &&
2326 (fALetterSet
->contains(c2
) || fHebrew_LetterSet
->contains(c2
) ||
2327 fNumericSet
->contains(c2
) || fKatakanaSet
->contains(c2
))) {
2332 if (fRegionalIndicatorSet
->contains(c1
) && fRegionalIndicatorSet
->contains(c2
)) {
2336 // Rule 14. Break found here.
2345 UVector
*RBBIWordMonkey::charClasses() {
// Destructor: releases the UnicodeSet members with plain `delete`.
// NOTE(review): the embedded source numbering skips lines (2350 -> 2355, 2356 -> 2358,
// 2361 -> 2366), so the deletes shown here are only part of the full list in the original file.
2350 RBBIWordMonkey::~RBBIWordMonkey() {
2355 delete fKatakanaSet
;
2356 delete fHebrew_LetterSet
;
2358 delete fSingle_QuoteSet
;
2359 delete fDouble_QuoteSet
;
2360 delete fMidNumLetSet
;
2361 delete fMidLetterSet
;
2366 delete fExtendNumLetSet
;
2367 delete fRegionalIndicatorSet
;
2368 delete fDictionaryCjkSet
;
2375 //------------------------------------------------------------------------------------------
2377 // class RBBISentMonkey Sentence Break specific implementation
2378 // of RBBIMonkeyKind.
2380 //------------------------------------------------------------------------------------------
// Sentence-break monkey-test reference implementation, derived from RBBIMonkeyKind.
// NOTE(review): access specifiers, the constructor declaration and at least one member
// (fSpSet, which the constructor assigns) are on lines missing from this extract.
2381 class RBBISentMonkey
: public RBBIMonkeyKind
{
2384 virtual ~RBBISentMonkey();
// Virtual interface. In the definition, next() receives the previous break position.
2385 virtual UVector
*charClasses();
2386 virtual void setText(const UnicodeString
&s
);
2387 virtual int32_t next(int32_t i
);
// Index helpers used by next(): moveBack()/moveForward() loop over Format and Extend
// characters; cAt() reads a code point from fText after a range check on pos.
2389 int moveBack(int posFrom
);
2390 int moveForward(int posFrom
);
2391 UChar32
cAt(int pos
);
// Character-class sets. The constructor builds each one from a
// "[\p{Sentence_Break = <value>}]" pattern; fSepSet additionally includes U+000A and U+000D.
2395 UnicodeSet
*fSepSet
;
2396 UnicodeSet
*fFormatSet
;
2398 UnicodeSet
*fLowerSet
;
2399 UnicodeSet
*fUpperSet
;
2400 UnicodeSet
*fOLetterSet
;
2401 UnicodeSet
*fNumericSet
;
2402 UnicodeSet
*fATermSet
;
2403 UnicodeSet
*fSContinueSet
;
2404 UnicodeSet
*fSTermSet
;
2405 UnicodeSet
*fCloseSet
;
// fOtherSet starts empty in the constructor, is complemented, and then has every other
// class set removed from it, leaving the characters that belong to none of them.
2406 UnicodeSet
*fOtherSet
;
2407 UnicodeSet
*fExtendSet
;
// Text consulted by the helpers and by next().
2409 const UnicodeString
*fText
;
2413 RBBISentMonkey::RBBISentMonkey()
2415 UErrorCode status
= U_ZERO_ERROR
;
2417 fSets
= new UVector(status
);
2419 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2420 // set and made into character classes of their own. For the monkey impl,
2421 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2422 fSepSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status
);
2423 fFormatSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status
);
2424 fSpSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status
);
2425 fLowerSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status
);
2426 fUpperSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status
);
2427 fOLetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status
);
2428 fNumericSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status
);
2429 fATermSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status
);
2430 fSContinueSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status
);
2431 fSTermSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status
);
2432 fCloseSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status
);
2433 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status
);
2434 fOtherSet
= new UnicodeSet();
2436 if(U_FAILURE(status
)) {
2437 deferredStatus
= status
;
2441 fOtherSet
->complement();
2442 fOtherSet
->removeAll(*fSepSet
);
2443 fOtherSet
->removeAll(*fFormatSet
);
2444 fOtherSet
->removeAll(*fSpSet
);
2445 fOtherSet
->removeAll(*fLowerSet
);
2446 fOtherSet
->removeAll(*fUpperSet
);
2447 fOtherSet
->removeAll(*fOLetterSet
);
2448 fOtherSet
->removeAll(*fNumericSet
);
2449 fOtherSet
->removeAll(*fATermSet
);
2450 fOtherSet
->removeAll(*fSContinueSet
);
2451 fOtherSet
->removeAll(*fSTermSet
);
2452 fOtherSet
->removeAll(*fCloseSet
);
2453 fOtherSet
->removeAll(*fExtendSet
);
2455 fSets
->addElement(fSepSet
, status
);
2456 fSets
->addElement(fFormatSet
, status
);
2457 fSets
->addElement(fSpSet
, status
);
2458 fSets
->addElement(fLowerSet
, status
);
2459 fSets
->addElement(fUpperSet
, status
);
2460 fSets
->addElement(fOLetterSet
, status
);
2461 fSets
->addElement(fNumericSet
, status
);
2462 fSets
->addElement(fATermSet
, status
);
2463 fSets
->addElement(fSContinueSet
, status
);
2464 fSets
->addElement(fSTermSet
, status
);
2465 fSets
->addElement(fCloseSet
, status
);
2466 fSets
->addElement(fOtherSet
, status
);
2467 fSets
->addElement(fExtendSet
, status
);
2469 if (U_FAILURE(status
)) {
2470 deferredStatus
= status
;
2476 void RBBISentMonkey::setText(const UnicodeString
&s
) {
2480 UVector
*RBBISentMonkey::charClasses() {
2485 // moveBack() Find the "significant" code point preceding the index i.
2486 // Skips over ($Extend | $Format)* .
// NOTE(review): the embedded source numbering jumps (2488 -> 2495 -> 2498), so the local
// declarations of j and c, the loop opener and the return statement are not visible here.
2488 int RBBISentMonkey::moveBack(int i
) {
// Step j back by one code point and read the code point found there.
2495 j
= fText
->moveIndex32(j
, -1);
2496 c
= fText
->char32At(j
);
// Loop condition: repeat while j is still past index 0 and c is a Format or Extend character.
// The trailing ';' suggests this closes a do/while whose `do {` line is omitted -- TODO confirm.
2498 while (j
>0 &&(fFormatSet
->contains(c
) || fExtendSet
->contains(c
)));
// moveForward(): advances from index i in fText, looping over Format and Extend characters.
// NOTE(review): the embedded source numbering jumps (2506 -> 2511 -> 2514), so the local
// declarations, the loop opener, the read of c and the final return are not visible here.
2504 int RBBISentMonkey::moveForward(int i
) {
// An index at or past the end of the text is answered with the text length.
2505 if (i
>=fText
->length()) {
2506 return fText
->length();
// Step j forward by one code point.
2511 j
= fText
->moveIndex32(j
, 1);
// Loop condition: repeat while c is a Format or Extend character
// (the statement that refreshes c is presumably on an omitted line -- TODO confirm).
2514 while (fFormatSet
->contains(c
) || fExtendSet
->contains(c
));
// cAt(): range-checked read of the code point at index pos in fText.
// NOTE(review): the body of the out-of-range branch is on a line missing from this extract.
// next() tests a fetched character against -1 (`c ==-1` in the Rule 8 code), which suggests
// -1 is the out-of-range result -- confirm against the full source.
2518 UChar32
RBBISentMonkey::cAt(int pos
) {
// Out-of-range test: pos below zero, or at/after the end of the text.
2519 if (pos
<0 || pos
>=fText
->length()) {
// In-range case: return the code point at pos.
2522 return fText
->char32At(pos
);
2526 int32_t RBBISentMonkey::next(int32_t prevPos
) {
2527 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2528 // break position being tested. The candidate break
2529 // location is before p2.
2533 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
2536 if (U_FAILURE(deferredStatus
)) {
2540 // Prev break at end of string. return DONE.
2541 if (prevPos
>= fText
->length()) {
2544 p0
= p1
= p2
= p3
= prevPos
;
2545 c3
= fText
->char32At(prevPos
);
2547 (void)p0
; // Suppress set but not used warning.
2549 // Loop runs once per "significant" character position in the input text.
2551 // Move all of the positions forward in the input string.
2556 // Advance p3 by X(Extend | Format)* Rule 4
2557 p3
= moveForward(p3
);
2561 if (c1
==0x0d && c2
==0x0a && p2
==(p1
+1)) {
2565 // Rule (4). Sep <break>
2566 if (fSepSet
->contains(c1
)) {
2567 p2
= p1
+1; // Separators don't combine with Extend or Format.
2571 if (p2
>= fText
->length()) {
2572 // Reached end of string. Always a break position.
2576 if (p2
== prevPos
) {
2577 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2581 // Rule (6). ATerm x Numeric
2582 if (fATermSet
->contains(c1
) && fNumericSet
->contains(c2
)) {
2586 // Rule (7). Upper ATerm x Upper
2587 if (fUpperSet
->contains(c0
) && fATermSet
->contains(c1
) && fUpperSet
->contains(c2
)) {
2591 // Rule (8) ATerm Close* Sp* x (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
2592 // Note: STerm | ATerm are added to the negated part of the expression by a
2593 // note to the Unicode 5.0 documents.
2595 while (fSpSet
->contains(cAt(p8
))) {
2598 while (fCloseSet
->contains(cAt(p8
))) {
2601 if (fATermSet
->contains(cAt(p8
))) {
2605 if (c
==-1 || fOLetterSet
->contains(c
) || fUpperSet
->contains(c
) ||
2606 fLowerSet
->contains(c
) || fSepSet
->contains(c
) ||
2607 fATermSet
->contains(c
) || fSTermSet
->contains(c
)) {
2610 p8
= moveForward(p8
);
2612 if (fLowerSet
->contains(cAt(p8
))) {
2617 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2618 if (fSContinueSet
->contains(c2
) || fSTermSet
->contains(c2
) || fATermSet
->contains(c2
)) {
2620 while (fSpSet
->contains(cAt(p8
))) {
2623 while (fCloseSet
->contains(cAt(p8
))) {
2627 if (fSTermSet
->contains(c
) || fATermSet
->contains(c
)) {
2632 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
2634 while (fCloseSet
->contains(cAt(p9
))) {
2638 if ((fSTermSet
->contains(c
) || fATermSet
->contains(c
))) {
2639 if (fCloseSet
->contains(c2
) || fSpSet
->contains(c2
) || fSepSet
->contains(c2
)) {
2644 // Rule (10) (STerm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
2646 while (fSpSet
->contains(cAt(p10
))) {
2647 p10
= moveBack(p10
);
2649 while (fCloseSet
->contains(cAt(p10
))) {
2650 p10
= moveBack(p10
);
2652 if (fSTermSet
->contains(cAt(p10
)) || fATermSet
->contains(cAt(p10
))) {
2653 if (fSpSet
->contains(c2
) || fSepSet
->contains(c2
)) {
2658 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>
2660 if (fSepSet
->contains(cAt(p11
))) {
2661 p11
= moveBack(p11
);
2663 while (fSpSet
->contains(cAt(p11
))) {
2664 p11
= moveBack(p11
);
2666 while (fCloseSet
->contains(cAt(p11
))) {
2667 p11
= moveBack(p11
);
2669 if (fSTermSet
->contains(cAt(p11
)) || fATermSet
->contains(cAt(p11
))) {
2673 // Rule (12) Any x Any
2680 RBBISentMonkey::~RBBISentMonkey() {
2690 delete fSContinueSet
;
2699 //-------------------------------------------------------------------------------------------
2703 //-------------------------------------------------------------------------------------------
// Line-break monkey-test reference implementation, derived from RBBIMonkeyKind.
// NOTE(review): the embedded source numbering jumps from 2712 to 2757; the access specifiers,
// the constructor declaration and the per-class UnicodeSet members that the constructor
// assigns (fBK, fCR, fLF, ...) are on lines missing from this extract.
2705 class RBBILineMonkey
: public RBBIMonkeyKind
{
2708 virtual ~RBBILineMonkey();
2709 virtual UVector
*charClasses();
2710 virtual void setText(const UnicodeString
&s
);
2711 virtual int32_t next(int32_t i
);
// Applies the LB 9 / LB 10 combining-mark handling; next() calls it twice per iteration,
// once for the (prevPos, pos) pair and once for the (pos, nextPos) pair.
2712 virtual void rule9Adjust(int32_t pos
, UChar32
*posChar
, int32_t *nextPos
, UChar32
*nextChar
);
// Character break iterator created in the constructor via
// BreakIterator::createCharacterInstance(Locale::getEnglish(), status).
2757 BreakIterator
*fCharBI
;
// Text consulted by rule9Adjust() and next().
2758 const UnicodeString
*fText
;
// Regex matcher for number sequences, used by next() through lookingAt(prevPos, status).
// The constructor sets it to NULL on its early failure path.
2759 RegexMatcher
*fNumberMatcher
;
2763 RBBILineMonkey::RBBILineMonkey()
2765 UErrorCode status
= U_ZERO_ERROR
;
2767 fSets
= new UVector(status
);
2769 fBK
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status
);
2770 fCR
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status
);
2771 fLF
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status
);
2772 fCM
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status
);
2773 fNL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status
);
2774 fWJ
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status
);
2775 fZW
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status
);
2776 fGL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status
);
2777 fCB
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status
);
2778 fSP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status
);
2779 fB2
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status
);
2780 fBA
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status
);
2781 fBB
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status
);
2782 fHY
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status
);
2783 fH2
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status
);
2784 fH3
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status
);
2785 fCL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status
);
2786 fCP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status
);
2787 fEX
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status
);
2788 fIN
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status
);
2789 fJL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status
);
2790 fJV
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status
);
2791 fJT
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status
);
2792 fNS
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status
);
2793 fOP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status
);
2794 fQU
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status
);
2795 fIS
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status
);
2796 fNU
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status
);
2797 fPO
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status
);
2798 fPR
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status
);
2799 fSY
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status
);
2800 fAI
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status
);
2801 fAL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status
);
2802 fCJ
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status
);
2803 fHL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status
);
2804 fID
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status
);
2805 fRI
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status
);
2806 fSA
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status
);
2807 fSG
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status
);
2808 fXX
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status
);
2810 if (U_FAILURE(status
)) {
2811 deferredStatus
= status
;
2813 fNumberMatcher
= NULL
;
2817 fAL
->addAll(*fXX
); // Default behavior for XX is identical to AL
2818 fAL
->addAll(*fAI
); // Default behavior for AI is identical to AL
2819 fAL
->addAll(*fSA
); // Default behavior for SA is XX, which defaults to AL
2820 fAL
->addAll(*fSG
); // Default behavior for SG is identical to AL.
2822 fNS
->addAll(*fCJ
); // Default behavior for CJ is identical to NS.
2824 fSets
->addElement(fBK
, status
);
2825 fSets
->addElement(fCR
, status
);
2826 fSets
->addElement(fLF
, status
);
2827 fSets
->addElement(fCM
, status
);
2828 fSets
->addElement(fNL
, status
);
2829 fSets
->addElement(fWJ
, status
);
2830 fSets
->addElement(fZW
, status
);
2831 fSets
->addElement(fGL
, status
);
2832 fSets
->addElement(fCB
, status
);
2833 fSets
->addElement(fSP
, status
);
2834 fSets
->addElement(fB2
, status
);
2835 fSets
->addElement(fBA
, status
);
2836 fSets
->addElement(fBB
, status
);
2837 fSets
->addElement(fHY
, status
);
2838 fSets
->addElement(fH2
, status
);
2839 fSets
->addElement(fH3
, status
);
2840 fSets
->addElement(fCL
, status
);
2841 fSets
->addElement(fCP
, status
);
2842 fSets
->addElement(fEX
, status
);
2843 fSets
->addElement(fIN
, status
);
2844 fSets
->addElement(fJL
, status
);
2845 fSets
->addElement(fJT
, status
);
2846 fSets
->addElement(fJV
, status
);
2847 fSets
->addElement(fNS
, status
);
2848 fSets
->addElement(fOP
, status
);
2849 fSets
->addElement(fQU
, status
);
2850 fSets
->addElement(fIS
, status
);
2851 fSets
->addElement(fNU
, status
);
2852 fSets
->addElement(fPO
, status
);
2853 fSets
->addElement(fPR
, status
);
2854 fSets
->addElement(fSY
, status
);
2855 fSets
->addElement(fAI
, status
);
2856 fSets
->addElement(fAL
, status
);
2857 fSets
->addElement(fHL
, status
);
2858 fSets
->addElement(fID
, status
);
2859 fSets
->addElement(fWJ
, status
);
2860 fSets
->addElement(fRI
, status
);
2861 fSets
->addElement(fSA
, status
);
2862 fSets
->addElement(fSG
, status
);
2865 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
2866 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
2867 "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
2868 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
2869 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
2870 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
2872 fNumberMatcher
= new RegexMatcher(
2873 UnicodeString(rules
, -1, US_INV
), 0, status
);
2875 fCharBI
= BreakIterator::createCharacterInstance(Locale::getEnglish(), status
);
2877 if (U_FAILURE(status
)) {
2878 deferredStatus
= status
;
// setText(): hands the new text s to the character break iterator and resets the number
// matcher onto it.
// NOTE(review): the line between 2883 and 2885 is missing from this extract.
// NOTE(review): fNumberMatcher is dereferenced without a null check, yet the constructor
// assigns it NULL when set construction fails -- confirm callers check deferredStatus first.
2883 void RBBILineMonkey::setText(const UnicodeString
&s
) {
2885 fCharBI
->setText(s
);
2886 fNumberMatcher
->reset(s
);
2891 // Line Break TR rules 9 and 10 implementation.
2892 // This deals with combining marks and other sequences that
2893 // must be treated as if they were something other than what they actually are.
2895 // This is factored out into a separate function because it must be applied twice for
2896 // each potential break, once to the chars before the position being checked, then
2897 // again to the text following the possible break.
//
// Parameters (all but pos are in/out pointers):
//   pos      - index of the character being adjusted.
//   posChar  - the character at pos; overwritten with 0x41 ('A') if it is a CM.
//   nextPos  - index of the following character; read into the local nPos here.
//   nextChar - the character at the following position; rewritten from fText below.
// NOTE(review): several original lines (the early return for the invalid position, the loop
// around the CM scan, and the write-back of *nextPos) are missing from this extract.
2899 void RBBILineMonkey::rule9Adjust(int32_t pos
, UChar32
*posChar
, int32_t *nextPos
, UChar32
*nextChar
) {
2901 // Invalid initial position. Happens during the warmup iteration of the
2902 // main loop in next().
2906 int32_t nPos
= *nextPos
;
2908 // LB 9 Keep combining sequences together.
2909 // advance over any CM class chars. Note that Line Break CM is different
2910 // from the normal Grapheme Extend property.
// The scan is skipped when *posChar is SP, BK, CR (0x0d), LF (0x0a), NL or ZW.
2911 if (!(fSP
->contains(*posChar
) || fBK
->contains(*posChar
) || *posChar
==0x0d ||
2912 *posChar
==0x0a ||fNL
->contains(*posChar
) || fZW
->contains(*posChar
))) {
// Look at the character at nPos; a non-CM character is tested for here, otherwise nPos
// moves forward by one code point.
2914 *nextChar
= fText
->char32At(nPos
);
2915 if (!fCM
->contains(*nextChar
)) {
2918 nPos
= fText
->moveIndex32(nPos
, 1);
2923 // LB 9 Treat X CM* as if it were x.
2924 // No explicit action required.
2926 // LB 10 Treat any remaining combining mark as AL
2927 if (fCM
->contains(*posChar
)) {
2928 *posChar
= 0x41; // thisChar = 'A';
2931 // Push the updated nextPos and nextChar back to our caller.
2932 // This only makes a difference if posChar got bigger by consuming a
2933 // combining sequence.
2935 *nextChar
= fText
->char32At(nPos
);
2940 int32_t RBBILineMonkey::next(int32_t startPos
) {
2941 UErrorCode status
= U_ZERO_ERROR
;
2942 int32_t pos
; // Index of the char following a potential break position
2943 UChar32 thisChar
; // Character at above position "pos"
2945 int32_t prevPos
; // Index of the char preceding a potential break position
2946 UChar32 prevChar
; // Character at above position. Note that prevChar
2947 // and thisChar may not be adjacent because combining
2948 // characters between them will be ignored.
2950 int32_t prevPosX2
; // Second previous character. Wider context for LB21a.
2953 int32_t nextPos
; // Index of the next character following pos.
2954 // Usually skips over combining marks.
2955 int32_t nextCPPos
; // Index of the code point following "pos."
2956 // May point to a combining mark.
2957 int32_t tPos
; // temp value.
2960 if (U_FAILURE(deferredStatus
)) {
2964 if (startPos
>= fText
->length()) {
2969 // Initial values for loop. Loop will run the first time without finding breaks,
2970 // while the invalid values shift out and the "this" and
2971 // "prev" positions are filled in with good values.
2972 pos
= prevPos
= prevPosX2
= -1; // Invalid value, serves as flag for initial loop iteration.
2973 thisChar
= prevChar
= prevCharX2
= 0;
2974 nextPos
= nextCPPos
= startPos
;
2977 // Loop runs once per position in the test text, until a break position
2980 prevPosX2
= prevPos
;
2981 prevCharX2
= prevChar
;
2984 prevChar
= thisChar
;
2987 thisChar
= fText
->char32At(pos
);
2989 nextCPPos
= fText
->moveIndex32(pos
, 1);
2990 nextPos
= nextCPPos
;
2992 // Rule LB2 - Break at end of text.
2993 if (pos
>= fText
->length()) {
2997 // Rule LB 9 - adjust for combining sequences.
2998 // We do this one out-of-order because the adjustment does not change anything
2999 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3001 rule9Adjust(prevPos
, &prevChar
, &pos
, &thisChar
);
3002 nextCPPos
= nextPos
= fText
->moveIndex32(pos
, 1);
3003 c
= fText
->char32At(nextPos
);
3004 rule9Adjust(pos
, &thisChar
, &nextPos
, &c
);
3006 // If the loop is still warming up - if we haven't shifted the initial
3007 // -1 positions out of prevPos yet - loop back to advance the
3008 // position in the input without any further looking for breaks.
3009 if (prevPos
== -1) {
3013 // LB 4 Always break after hard line breaks,
3014 if (fBK
->contains(prevChar
)) {
3018 // LB 5 Break after CR, LF, NL, but not inside CR LF
3019 if (prevChar
== 0x0d && thisChar
== 0x0a) {
3022 if (prevChar
== 0x0d ||
3028 // LB 6 Don't break before hard line breaks
3029 if (thisChar
== 0x0d || thisChar
== 0x0a || thisChar
== 0x85 ||
3030 fBK
->contains(thisChar
)) {
3035 // LB 7 Don't break before spaces or zero-width space.
3036 if (fSP
->contains(thisChar
)) {
3040 if (fZW
->contains(thisChar
)) {
3044 // LB 8 Break after zero width space
3045 if (fZW
->contains(prevChar
)) {
3049 // LB 9, 10 Already done, at top of loop.
3053 // LB 11 Do not break before or after WORD JOINER and related characters.
3057 if (fWJ
->contains(thisChar
) || fWJ
->contains(prevChar
)) {
3063 if (fGL
->contains(prevChar
)) {
3069 if (!(fSP
->contains(prevChar
) ||
3070 fBA
->contains(prevChar
) ||
3071 fHY
->contains(prevChar
) ) && fGL
->contains(thisChar
)) {
3077 // LB 13 Don't break before closings.
3078 // NU x CL, NU x CP and NU x IS are not matched here so that they will
3079 // fall into LB 17 and the more general number regular expression.
3081 if ((!fNU
->contains(prevChar
) && fCL
->contains(thisChar
)) ||
3082 (!fNU
->contains(prevChar
) && fCP
->contains(thisChar
)) ||
3083 fEX
->contains(thisChar
) ||
3084 (!fNU
->contains(prevChar
) && fIS
->contains(thisChar
)) ||
3085 (!fNU
->contains(prevChar
) && fSY
->contains(thisChar
))) {
3089 // LB 14 Don't break after OP SP*
3090 // Scan backwards, checking for this sequence.
3091 // The OP char could include combining marks, so we actually check for
3093 // Another Twist: The Rule 67 fixes may have changed a SP CM
3094 // sequence into a ID char, so before scanning back through spaces,
3095 // verify that prevChar is indeed a space. The prevChar variable
3096 // may differ from fText[prevPos]
3098 if (fSP
->contains(prevChar
)) {
3099 while (tPos
> 0 && fSP
->contains(fText
->char32At(tPos
))) {
3100 tPos
=fText
->moveIndex32(tPos
, -1);
3103 while (tPos
> 0 && fCM
->contains(fText
->char32At(tPos
))) {
3104 tPos
=fText
->moveIndex32(tPos
, -1);
3106 if (fOP
->contains(fText
->char32At(tPos
))) {
3111 // LB 15 QU SP* x OP
3112 if (fOP
->contains(thisChar
)) {
3113 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3115 while (tPos
>0 && fSP
->contains(fText
->char32At(tPos
))) {
3116 tPos
= fText
->moveIndex32(tPos
, -1);
3118 while (tPos
>0 && fCM
->contains(fText
->char32At(tPos
))) {
3119 tPos
= fText
->moveIndex32(tPos
, -1);
3121 if (fQU
->contains(fText
->char32At(tPos
))) {
3128 // LB 16 (CL | CP) SP* x NS
3129 // Scan backwards for SP* CM* (CL | CP)
3130 if (fNS
->contains(thisChar
)) {
3132 while (tPos
>0 && fSP
->contains(fText
->char32At(tPos
))) {
3133 tPos
= fText
->moveIndex32(tPos
, -1);
3135 while (tPos
>0 && fCM
->contains(fText
->char32At(tPos
))) {
3136 tPos
= fText
->moveIndex32(tPos
, -1);
3138 if (fCL
->contains(fText
->char32At(tPos
)) || fCP
->contains(fText
->char32At(tPos
))) {
3144 // LB 17 B2 SP* x B2
3145 if (fB2
->contains(thisChar
)) {
3146 // Scan backwards, checking for the B2 CM* SP* sequence.
3148 if (fSP
->contains(prevChar
)) {
3149 while (tPos
> 0 && fSP
->contains(fText
->char32At(tPos
))) {
3150 tPos
=fText
->moveIndex32(tPos
, -1);
3153 while (tPos
> 0 && fCM
->contains(fText
->char32At(tPos
))) {
3154 tPos
=fText
->moveIndex32(tPos
, -1);
3156 if (fB2
->contains(fText
->char32At(tPos
))) {
3162 // LB 18 break after space
3163 if (fSP
->contains(prevChar
)) {
3170 if (fQU
->contains(thisChar
) || fQU
->contains(prevChar
)) {
3174 // LB 20 Break around a CB
3175 if (fCB
->contains(thisChar
) || fCB
->contains(prevChar
)) {
3180 if (fBA
->contains(thisChar
) ||
3181 fHY
->contains(thisChar
) ||
3182 fNS
->contains(thisChar
) ||
3183 fBB
->contains(prevChar
) ) {
3189 if (fHL
->contains(prevCharX2
) &&
3190 (fHY
->contains(prevChar
) || fBA
->contains(prevChar
))) {
3196 if (fSY
->contains(prevChar
) && fHL
->contains(thisChar
)) {
3201 if ((fAL
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3202 (fHL
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3203 (fID
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3204 (fIN
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3205 (fNU
->contains(prevChar
) && fIN
->contains(thisChar
)) ) {
3214 if ((fID
->contains(prevChar
) && fPO
->contains(thisChar
)) ||
3215 (fAL
->contains(prevChar
) && fNU
->contains(thisChar
)) ||
3216 (fHL
->contains(prevChar
) && fNU
->contains(thisChar
)) ||
3217 (fNU
->contains(prevChar
) && fAL
->contains(thisChar
)) ||
3218 (fNU
->contains(prevChar
) && fHL
->contains(thisChar
)) ) {
3222 // LB 24 Do not break between prefix and letters or ideographs.
3226 if ((fPR
->contains(prevChar
) && fID
->contains(thisChar
)) ||
3227 (fPR
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) ||
3228 (fPO
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
)))) {
3235 if (fNumberMatcher
->lookingAt(prevPos
, status
)) {
3236 if (U_FAILURE(status
)) {
3239 // Matched a number. But could have been just a single digit, which would
3240 // not represent a "no break here" between prevChar and thisChar
3241 int32_t numEndIdx
= fNumberMatcher
->end(status
); // idx of first char following num
3242 if (numEndIdx
> pos
) {
3243 // Number match includes at least our two chars being checked
3244 if (numEndIdx
> nextPos
) {
3245 // Number match includes additional chars. Update pos and nextPos
3246 // so that next loop iteration will continue at the end of the number,
3247 // checking for breaks between last char in number & whatever follows.
3248 pos
= nextPos
= numEndIdx
;
3250 pos
= fText
->moveIndex32(pos
, -1);
3251 thisChar
= fText
->char32At(pos
);
3252 } while (fCM
->contains(thisChar
));
3259 // LB 26 Do not break a Korean syllable.
3260 if (fJL
->contains(prevChar
) && (fJL
->contains(thisChar
) ||
3261 fJV
->contains(thisChar
) ||
3262 fH2
->contains(thisChar
) ||
3263 fH3
->contains(thisChar
))) {
3267 if ((fJV
->contains(prevChar
) || fH2
->contains(prevChar
)) &&
3268 (fJV
->contains(thisChar
) || fJT
->contains(thisChar
))) {
3272 if ((fJT
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3273 fJT
->contains(thisChar
)) {
3277 // LB 27 Treat a Korean Syllable Block the same as ID.
3278 if ((fJL
->contains(prevChar
) || fJV
->contains(prevChar
) ||
3279 fJT
->contains(prevChar
) || fH2
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3280 fIN
->contains(thisChar
)) {
3283 if ((fJL
->contains(prevChar
) || fJV
->contains(prevChar
) ||
3284 fJT
->contains(prevChar
) || fH2
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3285 fPO
->contains(thisChar
)) {
3288 if (fPR
->contains(prevChar
) && (fJL
->contains(thisChar
) || fJV
->contains(thisChar
) ||
3289 fJT
->contains(thisChar
) || fH2
->contains(thisChar
) || fH3
->contains(thisChar
))) {
3295 // LB 28 Do not break between alphabetics ("at").
3296 if ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
)) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) {
3300 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
3301 if (fIS
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
))) {
3305 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3308 if ((fAL
->contains(prevChar
) || fHL
->contains(prevChar
) || fNU
->contains(prevChar
)) && fOP
->contains(thisChar
)) {
3311 if (fCP
->contains(prevChar
) && (fAL
->contains(thisChar
) || fHL
->contains(thisChar
) || fNU
->contains(thisChar
))) {
3315 // LB30a Do not break between regional indicators.
3317 if (fRI
->contains(prevChar
) && fRI
->contains(thisChar
)) {
3321 // LB 31 Break everywhere else
3330 UVector
*RBBILineMonkey::charClasses() {
3335 RBBILineMonkey::~RBBILineMonkey() {
3380 delete fNumberMatcher
;
3384 //-------------------------------------------------------------------------------------------
3389 // seed=nnnnn Random number starting seed.
3390 // Setting the seed allows errors to be reproduced.
3391 // loop=nnn Looping count. Controls running time.
3393 // 0 or greater: run length.
3395 // type = char | word | line | sent | title
3397 //-------------------------------------------------------------------------------------------
// getIntParam(): looks for "<name> = <integer>" in params and returns the integer, starting
// from defaultVal; the matched parameter is also removed from params.
//   name       - parameter name; taken by value because the regex suffix is appended to it.
//   params     - parameter string; modified in place via the replaceFirst() result.
//   defaultVal - initial value of the result.
// NOTE(review): the token "¶ms" in the signature looks like a mis-encoded "&params"
// (HTML entity damage in this extract) -- restore it from the original source.
// NOTE(review): the embedded source numbering skips 3404, 3410, 3413, 3415, 3417 and the
// lines after 3418, so the match test guarding the conversion and the return are not visible.
3399 static int32_t getIntParam(UnicodeString name
, UnicodeString
¶ms
, int32_t defaultVal
) {
3400 int32_t val
= defaultVal
;
// Build the pattern: the name, optional spaces around '=', then a captured signed integer.
3401 name
.append(" *= *(-?\\d+)");
3402 UErrorCode status
= U_ZERO_ERROR
;
3403 RegexMatcher
m(name
, params
, 0, status
);
3405 // The param exists. Convert the string to an int.
3406 char valString
[100];
// Length of capture group 1 (the digits), clamped so it fits in valString.
3407 int32_t paramLength
= m
.end(1, status
) - m
.start(1, status
);
3408 if (paramLength
>= (int32_t)(sizeof(valString
)-1)) {
3409 paramLength
= (int32_t)(sizeof(valString
)-2);
// Copy the digits into the char buffer and parse them as a base-10 number.
3411 params
.extract(m
.start(1, status
), paramLength
, valString
, sizeof(valString
));
3412 val
= strtol(valString
, NULL
, 10);
3414 // Delete this parameter from the params string.
3416 params
= m
.replaceFirst("", status
);
// Regex errors are only checked here, through an assertion.
3418 U_ASSERT(U_SUCCESS(status
));
3423 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3424 static void testBreakBoundPreceding(RBBITest
*test
, UnicodeString ustr
,
3433 for (i
= bi
->first(); i
!= BreakIterator::DONE
; i
= bi
->next()) {
3435 if (count
< expectedcount
&& expected
[count
] != i
) {
3436 test
->errln("break forward test failed: expected %d but got %d",
3437 expected
[count
], i
);
3442 if (count
!= expectedcount
) {
3443 printStringBreaks(ustr
, expected
, expectedcount
);
3444 test
->errln("break forward test failed: missed %d match",
3445 expectedcount
- count
);
3448 // testing boundaries
3449 for (i
= 1; i
< expectedcount
; i
++) {
3450 int j
= expected
[i
- 1];
3451 if (!bi
->isBoundary(j
)) {
3452 printStringBreaks(ustr
, expected
, expectedcount
);
3453 test
->errln("isBoundary() failed. Expected boundary at position %d", j
);
3456 for (j
= expected
[i
- 1] + 1; j
< expected
[i
]; j
++) {
3457 if (bi
->isBoundary(j
)) {
3458 printStringBreaks(ustr
, expected
, expectedcount
);
3459 test
->errln("isBoundary() failed. Not expecting boundary at position %d", j
);
3465 for (i
= bi
->last(); i
!= BreakIterator::DONE
; i
= bi
->previous()) {
3467 if (forward
[count
] != i
) {
3468 printStringBreaks(ustr
, expected
, expectedcount
);
3469 test
->errln("happy break test previous() failed: expected %d but got %d",
3475 printStringBreaks(ustr
, expected
, expectedcount
);
3476 test
->errln("break test previous() failed: missed a match");
3480 // testing preceding
3481 for (i
= 0; i
< expectedcount
- 1; i
++) {
3482 // int j = expected[i] + 1;
3483 int j
= ustr
.moveIndex32(expected
[i
], 1);
3484 for (; j
<= expected
[i
+ 1]; j
++) {
3485 if (bi
->preceding(j
) != expected
[i
]) {
3486 printStringBreaks(ustr
, expected
, expectedcount
);
3487 test
->errln("preceding(): Not expecting boundary at position %d", j
);
3495 void RBBITest::TestWordBreaks(void)
3497 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3499 Locale
locale("en");
3500 UErrorCode status
= U_ZERO_ERROR
;
3501 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3502 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
3503 // Replaced any C+J characters in a row with a random sequence of characters
3504 // of the same length to make our C+J segmentation not get in the way.
3505 static const char *strlist
[] =
3507 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3508 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3509 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3510 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3511 "\\uac00\\u3588\\u009c\\u0953\\u194b",
3512 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3513 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3514 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3515 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3516 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3517 "\\u2027\\U000e0067\\u0a47\\u00b7",
3518 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3519 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3520 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3521 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3522 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3523 "\\u0027\\u11af\\U000e0057\\u0602",
3524 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3525 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3526 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3527 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3528 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3529 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3530 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3531 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3532 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3533 "\\u18f4\\U000e0049\\u20e7\\u2027",
3534 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3535 "\\ua183\\u102d\\u0bec\\u003a",
3536 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3537 "\\u003a\\u0e57\\u0fad\\u002e",
3538 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3539 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3540 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3541 "\\u003a\\u0664\\u00b7\\u1fba",
3542 "\\u003b\\u0027\\u00b7\\u47a3",
3543 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3544 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3545 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3548 if (U_FAILURE(status
)) {
3549 errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
));
3552 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
3553 // printf("looping %d\n", loop);
3554 UnicodeString ustr
= CharsToUnicodeString(strlist
[loop
]);
3555 // RBBICharMonkey monkey;
3556 RBBIWordMonkey monkey
;
3559 int expectedcount
= 0;
3561 monkey
.setText(ustr
);
3563 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
3564 expected
[expectedcount
++] = i
;
3567 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
3573 void RBBITest::TestWordBoundary(void)
3575 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3576 Locale
locale("en");
3577 UErrorCode status
= U_ZERO_ERROR
;
3578 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3579 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
3581 static const char *strlist
[] =
3583 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3584 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3585 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3586 "\\u2027\\U000e0067\\u0a47\\u00b7",
3587 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3588 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3589 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3590 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3591 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3592 "\\u0027\\u11af\\U000e0057\\u0602",
3593 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3594 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3595 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3596 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3597 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3598 "\\U000e0065\\u302c\\u09ee\\U000e0068",
3599 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3600 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3601 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3602 "\\u58f4\\U000e0049\\u20e7\\u2027",
3603 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3604 "\\ua183\\u102d\\u0bec\\u003a",
3605 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3606 "\\u003a\\u0e57\\u0fad\\u002e",
3607 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3608 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3609 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3610 "\\u003a\\u0664\\u00b7\\u1fba",
3611 "\\u003b\\u0027\\u00b7\\u47a3",
3614 if (U_FAILURE(status
)) {
3615 errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
));
3618 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
3619 // printf("looping %d\n", loop);
3620 u_unescape(strlist
[loop
], str
, 20);
3621 UnicodeString
ustr(str
);
3628 for (i
= bi
->first(); i
!= BreakIterator::DONE
; i
= bi
->next()) {
3629 forward
[count
++] = i
;
3632 for (j
= prev
+ 1; j
< i
; j
++) {
3633 if (bi
->isBoundary(j
)) {
3634 printStringBreaks(ustr
, forward
, count
);
3635 errln("happy boundary test failed: expected %d not a boundary",
3641 if (!bi
->isBoundary(i
)) {
3642 printStringBreaks(ustr
, forward
, count
);
3643 errln("happy boundary test failed: expected %d a boundary",
3653 void RBBITest::TestLineBreaks(void)
3655 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3656 Locale
locale("en");
3657 UErrorCode status
= U_ZERO_ERROR
;
3658 BreakIterator
*bi
= BreakIterator::createLineInstance(locale
, status
);
3659 const int32_t STRSIZE
= 50;
3661 static const char *strlist
[] =
3663 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3664 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3665 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3666 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3667 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3668 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3669 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3670 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3671 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3672 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3673 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
3674 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3675 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3676 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3677 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3678 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3679 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3680 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3681 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3682 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3683 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3684 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3685 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3686 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3687 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3688 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3689 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
3690 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3691 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3692 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3693 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3694 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3695 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
3696 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3697 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3698 "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
3699 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3700 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3701 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3702 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3703 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3704 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3705 "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
3706 "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
3707 "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
3708 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3709 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3712 TEST_ASSERT_SUCCESS(status
);
3713 if (U_FAILURE(status
)) {
3716 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
3717 // printf("looping %d\n", loop);
3718 int32_t t
= u_unescape(strlist
[loop
], str
, STRSIZE
);
3725 UnicodeString
ustr(str
);
3726 RBBILineMonkey monkey
;
3727 if (U_FAILURE(monkey
.deferredStatus
)) {
3731 const int EXPECTEDSIZE
= 50;
3732 int expected
[EXPECTEDSIZE
];
3733 int expectedcount
= 0;
3735 monkey
.setText(ustr
);
3737 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
3738 if (expectedcount
>= EXPECTEDSIZE
) {
3739 TEST_ASSERT(expectedcount
< EXPECTEDSIZE
);
3742 expected
[expectedcount
++] = i
;
3745 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
3751 void RBBITest::TestSentBreaks(void)
3753 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3754 Locale
locale("en");
3755 UErrorCode status
= U_ZERO_ERROR
;
3756 BreakIterator
*bi
= BreakIterator::createSentenceInstance(locale
, status
);
3758 static const char *strlist
[] =
3760 "Now\ris\nthe\r\ntime\n\rfor\r\r",
3762 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3763 "\"Sentence ending with a quote.\" Bye.",
3764 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
3765 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3766 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3767 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3768 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3769 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3770 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3771 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3772 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3773 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3774 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3775 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3776 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3777 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3778 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3779 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3782 if (U_FAILURE(status
)) {
3783 errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
));
3786 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
3787 u_unescape(strlist
[loop
], str
, (int32_t)(sizeof(str
) / sizeof(str
[0])));
3788 UnicodeString
ustr(str
);
3790 RBBISentMonkey monkey
;
3791 if (U_FAILURE(monkey
.deferredStatus
)) {
3795 const int EXPECTEDSIZE
= 50;
3796 int expected
[EXPECTEDSIZE
];
3797 int expectedcount
= 0;
3799 monkey
.setText(ustr
);
3801 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
3802 if (expectedcount
>= EXPECTEDSIZE
) {
3803 TEST_ASSERT(expectedcount
< EXPECTEDSIZE
);
3806 expected
[expectedcount
++] = i
;
3809 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
3815 void RBBITest::TestMonkey(char *params
) {
3816 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3818 UErrorCode status
= U_ZERO_ERROR
;
3819 int32_t loopCount
= 500;
3821 UnicodeString breakType
= "all";
3822 Locale
locale("en");
3823 UBool useUText
= FALSE
;
3825 if (quick
== FALSE
) {
3830 UnicodeString
p(params
);
3831 loopCount
= getIntParam("loop", p
, loopCount
);
3832 seed
= getIntParam("seed", p
, seed
);
3834 RegexMatcher
m(" *type *= *(char|word|line|sent|title) *", p
, 0, status
);
3836 breakType
= m
.group(1, status
);
3838 p
= m
.replaceFirst("", status
);
3841 RegexMatcher
u(" *utext", p
, 0, status
);
3845 p
= u
.replaceFirst("", status
);
3850 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p
, 0, status
).find()) {
3851 // Each option is stripped out of the option string as it is processed.
3852 // All options have been checked. The option string should have been completely emptied..
3854 p
.extract(buf
, sizeof(buf
), NULL
, status
);
3855 buf
[sizeof(buf
)-1] = 0;
3856 errln("Unrecognized or extra parameter: %s\n", buf
);
3862 if (breakType
== "char" || breakType
== "all") {
3864 BreakIterator
*bi
= BreakIterator::createCharacterInstance(locale
, status
);
3865 if (U_SUCCESS(status
)) {
3866 RunMonkey(bi
, m
, "char", seed
, loopCount
, useUText
);
3867 if (breakType
== "all" && useUText
==FALSE
) {
3868 // Also run a quick test with UText when "all" is specified
3869 RunMonkey(bi
, m
, "char", seed
, loopCount
, TRUE
);
3873 errcheckln(status
, "Creation of character break iterator failed %s", u_errorName(status
));
3878 if (breakType
== "word" || breakType
== "all") {
3879 logln("Word Break Monkey Test");
3881 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
3882 if (U_SUCCESS(status
)) {
3883 RunMonkey(bi
, m
, "word", seed
, loopCount
, useUText
);
3886 errcheckln(status
, "Creation of word break iterator failed %s", u_errorName(status
));
3891 if (breakType
== "line" || breakType
== "all") {
3892 logln("Line Break Monkey Test");
3894 BreakIterator
*bi
= BreakIterator::createLineInstance(locale
, status
);
3895 if (loopCount
>= 10) {
3896 loopCount
= loopCount
/ 5; // Line break runs slower than the others.
3898 if (U_SUCCESS(status
)) {
3899 RunMonkey(bi
, m
, "line", seed
, loopCount
, useUText
);
3902 errcheckln(status
, "Creation of line break iterator failed %s", u_errorName(status
));
3907 if (breakType
== "sent" || breakType
== "all" ) {
3908 logln("Sentence Break Monkey Test");
3910 BreakIterator
*bi
= BreakIterator::createSentenceInstance(locale
, status
);
3911 if (loopCount
>= 10) {
3912 loopCount
= loopCount
/ 10; // Sentence runs slower than the other break types
3914 if (U_SUCCESS(status
)) {
3915 RunMonkey(bi
, m
, "sentence", seed
, loopCount
, useUText
);
3918 errcheckln(status
, "Creation of line break iterator failed %s", u_errorName(status
));
3927 // Run a RBBI monkey test. Common routine, for all break iterator types.
3929 // bi - the break iterator to use
3930 // mk - MonkeyKind, abstraction for obtaining expected results
3931 // name - Name of test (char, word, etc.) for use in error messages
3932 // seed - Seed for starting random number generator (parameter from user)
3935 void RBBITest::RunMonkey(BreakIterator
*bi
, RBBIMonkeyKind
&mk
, const char *name
, uint32_t seed
,
3936 int32_t numIterations
, UBool useUText
) {
3938 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3940 const int32_t TESTSTRINGLEN
= 500;
3941 UnicodeString testText
;
3942 int32_t numCharClasses
;
3944 int expected
[TESTSTRINGLEN
*2 + 1];
3945 int expectedCount
= 0;
3946 char expectedBreaks
[TESTSTRINGLEN
*2 + 1];
3947 char forwardBreaks
[TESTSTRINGLEN
*2 + 1];
3948 char reverseBreaks
[TESTSTRINGLEN
*2+1];
3949 char isBoundaryBreaks
[TESTSTRINGLEN
*2+1];
3950 char followingBreaks
[TESTSTRINGLEN
*2+1];
3951 char precedingBreaks
[TESTSTRINGLEN
*2+1];
3957 numCharClasses
= mk
.charClasses()->size();
3958 chClasses
= mk
.charClasses();
3960 // Check for errors that occurred during the construction of the MonkeyKind object.
3961 // Can't report them where they occurred because errln() is a method coming from intlTest,
3962 // and is not visible outside of RBBITest :-(
3963 if (U_FAILURE(mk
.deferredStatus
)) {
3964 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk
.deferredStatus
));
3968 // Verify that the character classes all have at least one member.
3969 for (i
=0; i
<numCharClasses
; i
++) {
3970 UnicodeSet
*s
= (UnicodeSet
*)chClasses
->elementAt(i
);
3971 if (s
== NULL
|| s
->size() == 0) {
3972 errln("Character Class #%d is null or of zero size.", i
);
3977 while (loopCount
< numIterations
|| numIterations
== -1) {
3978 if (numIterations
== -1 && loopCount
% 10 == 0) {
3979 // If test is running in an infinite loop, display a periodic tic so
3980 // we can tell that it is making progress.
3981 fprintf(stderr
, ".");
3983 // Save current random number seed, so that we can recreate the random numbers
3984 // for this loop iteration in event of an error.
3987 // Populate a test string with data.
3988 testText
.truncate(0);
3989 for (i
=0; i
<TESTSTRINGLEN
; i
++) {
3990 int32_t aClassNum
= m_rand() % numCharClasses
;
3991 UnicodeSet
*classSet
= (UnicodeSet
*)chClasses
->elementAt(aClassNum
);
3992 int32_t charIdx
= m_rand() % classSet
->size();
3993 UChar32 c
= classSet
->charAt(charIdx
);
3994 if (c
< 0) { // TODO: deal with sets containing strings.
4001 // Calculate the expected results for this test string.
4002 mk
.setText(testText
);
4003 memset(expectedBreaks
, 0, sizeof(expectedBreaks
));
4004 expectedBreaks
[0] = 1;
4005 int32_t breakPos
= 0;
4008 breakPos
= mk
.next(breakPos
);
4009 if (breakPos
== -1) {
4012 if (breakPos
> testText
.length()) {
4013 errln("breakPos > testText.length()");
4015 expectedBreaks
[breakPos
] = 1;
4016 U_ASSERT(expectedCount
<testText
.length());
4017 expected
[expectedCount
++] = breakPos
;
4018 (void)expected
; // Set but not used warning.
4019 // TODO (andy): check it out.
4022 // Find the break positions using forward iteration
4023 memset(forwardBreaks
, 0, sizeof(forwardBreaks
));
4025 UErrorCode status
= U_ZERO_ERROR
;
4026 UText
*testUText
= utext_openReplaceable(NULL
, &testText
, &status
);
4027 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4028 bi
->setText(testUText
, status
);
4029 TEST_ASSERT_SUCCESS(status
);
4030 utext_close(testUText
); // The break iterator does a shallow clone of the UText
4031 // This UText can be closed immediately, so long as the
4032 // testText string continues to exist.
4034 bi
->setText(testText
);
4037 for (i
=bi
->first(); i
!= BreakIterator::DONE
; i
=bi
->next()) {
4038 if (i
< 0 || i
> testText
.length()) {
4039 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name
);
4042 forwardBreaks
[i
] = 1;
4045 // Find the break positions using reverse iteration
4046 memset(reverseBreaks
, 0, sizeof(reverseBreaks
));
4047 for (i
=bi
->last(); i
!= BreakIterator::DONE
; i
=bi
->previous()) {
4048 if (i
< 0 || i
> testText
.length()) {
4049 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name
);
4052 reverseBreaks
[i
] = 1;
4055 // Find the break positions using isBoundary() tests.
4056 memset(isBoundaryBreaks
, 0, sizeof(isBoundaryBreaks
));
4057 U_ASSERT((int32_t)sizeof(isBoundaryBreaks
) > testText
.length());
4058 for (i
=0; i
<=testText
.length(); i
++) {
4059 isBoundaryBreaks
[i
] = bi
->isBoundary(i
);
4063 // Find the break positions using the following() function.
4065 memset(followingBreaks
, 0, sizeof(followingBreaks
));
4066 int32_t lastBreakPos
= 0;
4067 followingBreaks
[0] = 1;
4068 for (i
=0; i
<testText
.length(); i
++) {
4069 breakPos
= bi
->following(i
);
4070 if (breakPos
<= i
||
4071 breakPos
< lastBreakPos
||
4072 breakPos
> testText
.length() ||
4073 (breakPos
> lastBreakPos
&& lastBreakPos
> i
)) {
4074 UChar32 brkChar
= testText
.char32At(lastBreakPos
);
4075 if ((strcmp(name
, "char") != 0 && strcmp(name
, "word") != 0) || brkChar
< 0x1F1E6 || brkChar
> 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4076 errln("%s break monkey test: "
4077 "Out of range value returned by BreakIterator::following().\n"
4078 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4079 name
, seed
, i
, breakPos
, lastBreakPos
);
4083 followingBreaks
[breakPos
] = 1;
4084 lastBreakPos
= breakPos
;
4087 // Find the break positions using the preceding() function.
4088 memset(precedingBreaks
, 0, sizeof(precedingBreaks
));
4089 lastBreakPos
= testText
.length();
4090 precedingBreaks
[testText
.length()] = 1;
4091 for (i
=testText
.length(); i
>0; i
--) {
4092 breakPos
= bi
->preceding(i
);
4093 if (breakPos
>= i
||
4094 breakPos
> lastBreakPos
||
4095 (breakPos
< 0 && testText
.getChar32Start(i
)>0) ||
4096 (breakPos
< lastBreakPos
&& lastBreakPos
< testText
.getChar32Start(i
)) ) {
4097 UChar32 brkChar
= testText
.char32At(breakPos
);
4098 if ((strcmp(name
, "char") != 0 && strcmp(name
, "word") != 0) || brkChar
< 0x1F1E6 || brkChar
> 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4099 errln("%s break monkey test: "
4100 "Out of range value returned by BreakIterator::preceding().\n"
4101 "index=%d; prev returned %d; lastBreak=%d" ,
4102 name
, i
, breakPos
, lastBreakPos
);
4103 if (breakPos
>= 0 && breakPos
< (int32_t)sizeof(precedingBreaks
)) {
4104 precedingBreaks
[i
] = 2; // Forces an error.
4108 if (breakPos
>= 0) {
4109 precedingBreaks
[breakPos
] = 1;
4111 lastBreakPos
= breakPos
;
4115 // Compare the expected and actual results.
4116 for (i
=0; i
<=testText
.length(); i
++) {
4117 const char *errorType
= NULL
;
4118 if (forwardBreaks
[i
] != expectedBreaks
[i
]) {
4119 errorType
= "next()";
4120 } else if (reverseBreaks
[i
] != forwardBreaks
[i
]) {
4121 errorType
= "previous()";
4122 } else if (isBoundaryBreaks
[i
] != expectedBreaks
[i
]) {
4123 errorType
= "isBoundary()";
4124 } else if (followingBreaks
[i
] != expectedBreaks
[i
]) {
4125 errorType
= "following()";
4126 } else if (precedingBreaks
[i
] != expectedBreaks
[i
]) {
4127 errorType
= "preceding()";
4131 if (errorType
!= NULL
) {
4132 // Format a range of the test text that includes the failure as
4133 // a data item that can be included in the rbbi test data file.
4135 // Start of the range is the last point where expected and actual results
4136 // both agreed that there was a break position.
4137 int startContext
= i
;
4140 if (startContext
==0) { break; }
4142 if (expectedBreaks
[startContext
] != 0) {
4143 if (count
== 2) break;
4148 // End of range is two expected breaks past the start position.
4149 int endContext
= i
+ 1;
4151 for (ci
=0; ci
<2; ci
++) { // Number of items to include in error text.
4153 if (endContext
>= testText
.length()) {break;}
4154 if (expectedBreaks
[endContext
-1] != 0) {
4155 if (count
== 0) break;
4162 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4163 UnicodeString errorText
= "<data>";
4164 /***if (strcmp(errorType, "next()") == 0) {
4166 endContext = testText.length();
4168 printStringBreaks(testText, expected, expectedCount);
4171 for (ci
=startContext
; ci
<endContext
;) {
4172 UnicodeString
hexChars("0123456789abcdef");
4175 c
= testText
.char32At(ci
);
4177 // This is the location of the error.
4178 errorText
.append("<?>");
4179 } else if (expectedBreaks
[ci
] != 0) {
4180 // This is a non-error expected break position.
4181 errorText
.append("\\");
4184 errorText
.append("\\u");
4185 for (bn
=12; bn
>=0; bn
-=4) {
4186 errorText
.append(hexChars
.charAt((c
>>bn
)&0xf));
4189 errorText
.append("\\U");
4190 for (bn
=28; bn
>=0; bn
-=4) {
4191 errorText
.append(hexChars
.charAt((c
>>bn
)&0xf));
4194 ci
= testText
.moveIndex32(ci
, 1);
4196 errorText
.append("\\");
4197 errorText
.append("</data>\n");
4200 char charErrorTxt
[500];
4201 UErrorCode status
= U_ZERO_ERROR
;
4202 errorText
.extract(charErrorTxt
, sizeof(charErrorTxt
), NULL
, status
);
4203 charErrorTxt
[sizeof(charErrorTxt
)-1] = 0;
4204 const char *badLocale
= bi
->getLocaleID(ULOC_ACTUAL_LOCALE
, status
);
4206 UChar32 brkChar
= testText
.char32At(i
);
4207 if ((strcmp(name
, "char") != 0 && strcmp(name
, "word") != 0) || brkChar
< 0x1F1E6 || brkChar
> 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4208 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
4209 name
, badLocale
, (expectedBreaks
[i
]? "break expected but not found" : "break found but not expected"),
4210 errorType
, seed
, i
, charErrorTxt
);
4222 // Bug 5532. UTF-8 based UText fails in dictionary code.
4223 // This test checks the initial patch,
4224 // which is to just keep it from crashing. Correct word boundaries
4225 // await a proper fix to the dictionary code.
4227 void RBBITest::TestBug5532(void) {
4228 // Text includes a mixture of Thai and Latin.
4229 const unsigned char utf8Data
[] = {
4230 0xE0u
, 0xB8u
, 0x82u
, 0xE0u
, 0xB8u
, 0xB2u
, 0xE0u
, 0xB8u
, 0xA2u
, 0xE0u
,
4231 0xB9u
, 0x80u
, 0xE0u
, 0xB8u
, 0x84u
, 0xE0u
, 0xB8u
, 0xA3u
, 0xE0u
, 0xB8u
,
4232 0xB7u
, 0xE0u
, 0xB9u
, 0x88u
, 0xE0u
, 0xB8u
, 0xADu
, 0xE0u
, 0xB8u
, 0x87u
,
4233 0xE0u
, 0xB9u
, 0x80u
, 0xE0u
, 0xB8u
, 0xA5u
, 0xE0u
, 0xB9u
, 0x88u
, 0xE0u
,
4234 0xB8u
, 0x99u
, 0xE0u
, 0xB8u
, 0x8Bu
, 0xE0u
, 0xB8u
, 0xB5u
, 0xE0u
, 0xB8u
,
4235 0x94u
, 0xE0u
, 0xB8u
, 0xB5u
, 0x20u
, 0x73u
, 0x69u
, 0x6Du
, 0x20u
, 0x61u
,
4236 0x75u
, 0x64u
, 0x69u
, 0x6Fu
, 0x2Fu
, 0x20u
, 0x4Du
, 0x4Fu
, 0x4Fu
, 0x4Eu
,
4237 0x20u
, 0x65u
, 0x63u
, 0x6Cu
, 0x69u
, 0x70u
, 0x73u
, 0x65u
, 0x20u
, 0xE0u
,
4238 0xB8u
, 0xA3u
, 0xE0u
, 0xB8u
, 0xB2u
, 0xE0u
, 0xB8u
, 0x84u
, 0xE0u
, 0xB8u
,
4239 0xB2u
, 0x20u
, 0x34u
, 0x37u
, 0x30u
, 0x30u
, 0x20u
, 0xE0u
, 0xB8u
, 0xA2u
,
4240 0xE0u
, 0xB8u
, 0xB9u
, 0xE0u
, 0xB9u
, 0x82u
, 0xE0u
, 0xB8u
, 0xA3u
, 0x00};
4242 UErrorCode status
= U_ZERO_ERROR
;
4243 UText utext
=UTEXT_INITIALIZER
;
4244 utext_openUTF8(&utext
, (const char *)utf8Data
, -1, &status
);
4245 TEST_ASSERT_SUCCESS(status
);
4247 BreakIterator
*bi
= BreakIterator::createWordInstance(Locale("th"), status
);
4248 TEST_ASSERT_SUCCESS(status
);
4249 if (U_SUCCESS(status
)) {
4250 bi
->setText(&utext
, status
);
4251 TEST_ASSERT_SUCCESS(status
);
4253 int32_t breakCount
= 0;
4254 int32_t previousBreak
= -1;
4255 for (bi
->first(); bi
->next() != BreakIterator::DONE
; breakCount
++) {
4256 // For now, just make sure that the break iterator doesn't hang.
4257 TEST_ASSERT(previousBreak
< bi
->current());
4258 previousBreak
= bi
->current();
4260 TEST_ASSERT(breakCount
> 0);
4263 utext_close(&utext
);
4267 void RBBITest::TestBug9983(void) {
4268 UnicodeString text
= UnicodeString("\\u002A" // * Other
4270 "\\u309C" // Katakana
4274 "\\u0000").unescape();
4276 UErrorCode status
= U_ZERO_ERROR
;
4277 LocalPointer
<RuleBasedBreakIterator
> brkiter(static_cast<RuleBasedBreakIterator
*>(
4278 BreakIterator::createWordInstance(Locale::getRoot(), status
)));
4279 TEST_ASSERT_SUCCESS(status
);
4280 LocalPointer
<RuleBasedBreakIterator
> brkiterPOSIX(static_cast<RuleBasedBreakIterator
*>(
4281 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status
)));
4282 TEST_ASSERT_SUCCESS(status
);
4283 if (U_FAILURE(status
)) {
4286 int32_t offset
, rstatus
, iterationCount
;
4288 brkiter
->setText(text
);
4291 while ( (offset
= brkiter
->previous()) != UBRK_DONE
) {
4293 rstatus
= brkiter
->getRuleStatus();
4294 (void)rstatus
; // Suppress set but not used warning.
4295 if (iterationCount
>= 10) {
4299 TEST_ASSERT(iterationCount
== 6);
4301 brkiterPOSIX
->setText(text
);
4302 brkiterPOSIX
->last();
4304 while ( (offset
= brkiterPOSIX
->previous()) != UBRK_DONE
) {
4306 rstatus
= brkiterPOSIX
->getRuleStatus();
4307 (void)rstatus
; // Suppress set but not used warning.
4308 if (iterationCount
>= 10) {
4312 TEST_ASSERT(iterationCount
== 6);
4317 // TestDebug - A place-holder test for debugging purposes.
4318 // For putting in fragments of other tests that can be invoked
4319 // for tracing without a lot of unwanted extra stuff happening.
4321 void RBBITest::TestDebug(void) {
4323 UErrorCode status
= U_ZERO_ERROR
;
4327 RuleBasedBreakIterator
* bi
=
4328 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4329 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4330 (RuleBasedBreakIterator
*)BreakIterator::createSentenceInstance(Locale::getDefault(), status
);
4331 UnicodeString
s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4332 // UnicodeString s("Aaa. Bcd");
4335 UBool r
= bi
->isBoundary(8);
4336 printf("%s", r
?"true":"false");
4340 // ruleStatus = bi->getRuleStatus();
4341 printf("%d\t%d\n", pos
, ruleStatus
);
4342 pos
= bi
->previous();
4343 } while (pos
!= BreakIterator::DONE
);
4347 void RBBITest::TestProperties() {
4348 UErrorCode errorCode
= U_ZERO_ERROR
;
4349 UnicodeSet
prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode
);
4350 if (!prependSet
.isEmpty()) {
4352 "[:GCB=Prepend:] is not empty any more. "
4353 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4354 "change this test to the opposite condition.");
4358 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */