1 /********************************************************************
3 * Copyright (c) 1999-2011, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6 /************************************************************************
7 * Date Name Description
8 * 12/15/99 Madhu Creation.
9 * 01/12/2000 Madhu Updated for changed API and added new tests
10 ************************************************************************/
12 #include <typeinfo> // for 'typeid' to work
14 #include "unicode/utypes.h"
16 #if !UCONFIG_NO_BREAK_ITERATION
18 #include "unicode/utypes.h"
19 #include "unicode/brkiter.h"
20 #include "unicode/rbbi.h"
21 #include "unicode/uchar.h"
22 #include "unicode/utf16.h"
23 #include "unicode/ucnv.h"
24 #include "unicode/schriter.h"
25 #include "unicode/uniset.h"
26 #include "unicode/regex.h" // TODO: make conditional on regexp being built.
27 #include "unicode/ustring.h"
28 #include "unicode/utext.h"
// TEST_ASSERT(x): if the condition x is false, report a failure through errln()
// together with the current file and line. The macro only reports; it contains
// no return statement, so execution continues after a failed assertion.
39 #define TEST_ASSERT(x) {if (!(x)) { \
40 errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
// TEST_ASSERT_SUCCESS(errcode): if errcode satisfies U_FAILURE(), report it through
// errcheckln() with the file, the line and the u_errorName() text of the code.
// Like TEST_ASSERT, it reports and then falls through.
42 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
43 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
46 //---------------------------------------------
48 //---------------------------------------------
50 void RBBITest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* params
)
52 if (exec
) logln("TestSuite RuleBasedBreakIterator: ");
55 #if !UCONFIG_NO_FILE_IO
56 case 0: name
= "TestBug4153072";
57 if(exec
) TestBug4153072(); break;
59 case 0: name
= "skip";
63 case 1: name
= "TestJapaneseLineBreak";
64 if(exec
) TestJapaneseLineBreak(); break;
65 case 2: name
= "TestStatusReturn";
66 if(exec
) TestStatusReturn(); break;
68 #if !UCONFIG_NO_FILE_IO
69 case 3: name
= "TestUnicodeFiles";
70 if(exec
) TestUnicodeFiles(); break;
71 case 4: name
= "TestEmptyString";
72 if(exec
) TestEmptyString(); break;
74 case 3: case 4: name
= "skip";
78 case 5: name
= "TestGetAvailableLocales";
79 if(exec
) TestGetAvailableLocales(); break;
81 case 6: name
= "TestGetDisplayName";
82 if(exec
) TestGetDisplayName(); break;
84 #if !UCONFIG_NO_FILE_IO
85 case 7: name
= "TestEndBehaviour";
86 if(exec
) TestEndBehaviour(); break;
87 case 8: name
= "TestMixedThaiLineBreak";
88 if(exec
) TestMixedThaiLineBreak(); break;
89 case 9: name
= "TestThaiLineBreak";
90 if(exec
) TestThaiLineBreak(); break;
91 case 10: name
= "TestMaiyamok";
92 if(exec
) TestMaiyamok(); break;
93 case 11: name
= "TestWordBreaks";
94 if(exec
) TestWordBreaks(); break;
95 case 12: name
= "TestWordBoundary";
96 if(exec
) TestWordBoundary(); break;
97 case 13: name
= "TestLineBreaks";
98 if(exec
) TestLineBreaks(); break;
99 case 14: name
= "TestSentBreaks";
100 if(exec
) TestSentBreaks(); break;
101 case 15: name
= "TestExtended";
102 if(exec
) TestExtended(); break;
104 case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name
= "skip";
110 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
119 #if !UCONFIG_NO_FILE_IO
120 case 17: name
= "TestBug3818";
121 if(exec
) TestBug3818(); break;
122 case 18: name
= "TestJapaneseWordBreak";
123 if(exec
) TestJapaneseWordBreak(); break;
125 case 17: case 18: name
= "skip";
129 case 19: name
= "TestDebug";
130 if(exec
) TestDebug(); break;
131 case 20: name
= "TestTrieDict";
132 if(exec
) TestTrieDict(); break;
134 #if !UCONFIG_NO_FILE_IO
135 case 21: name
= "TestBug5775";
136 if (exec
) TestBug5775(); break;
137 case 22: name
= "TestThaiBreaks";
138 if (exec
) TestThaiBreaks(); break;
139 case 23: name
= "TestTailoredBreaks";
140 if (exec
) TestTailoredBreaks(); break;
142 case 21: case 22: case 23: name
= "skip";
145 case 24: name
= "TestDictRules";
146 if (exec
) TestDictRules(); break;
147 case 25: name
= "TestBug5532";
148 if (exec
) TestBug5532(); break;
149 default: name
= ""; break; //needed to end loop
154 //---------------------------------------------------------------------------
156 // class BITestData Holds a set of Break iterator test data and results
158 // - the string data to be broken
159 // - a vector of the expected break positions.
160 // - a vector of source line numbers for the data,
161 // (to help see where errors occurred.)
162 // - The expected break tag values.
163 // - Vectors of actual break positions and tag values.
164 // - Functions for comparing actual with expected and
167 //----------------------------------------------------------------------------
170 UnicodeString fDataToBreak
;
171 UVector fExpectedBreakPositions
;
172 UVector fExpectedTags
;
174 UVector fActualBreakPositions
; // Test Results.
177 BITestData(UErrorCode
&status
);
178 void addDataChunk(const char *data
, int32_t tag
, int32_t lineNum
, UErrorCode status
);
179 void checkResults(const char *heading
, RBBITest
*test
);
180 void err(const char *heading
, RBBITest
*test
, int32_t expectedIdx
, int32_t actualIdx
);
187 BITestData::BITestData(UErrorCode
&status
)
188 : fExpectedBreakPositions(status
), fExpectedTags(status
), fLineNum(status
), fActualBreakPositions(status
),
194 // addDataChunk. Add a section (non-breaking) piece of data to the test data.
195 // The macro form collects the line number, which is helpful
196 // when tracking down failures.
198 // A null data item is inserted at the start of each test's data
199 // to put the starting zero into the data list. The position saved for
200 // each non-null item is its ending position.
//
// Parameters:
//   data    - text of the chunk; it is converted with CharsToUnicodeString()
//             before being appended to fDataToBreak.
//   tag     - expected tag value recorded for the break at the end of this chunk.
//   lineNum - source line of the call (supplied as __LINE__ by ADD_DATACHUNK).
//   status  - incoming error code; nothing is recorded when it is already a failure.
//
// NOTE(review): status is taken by value, so an error set by the addElement()
// calls below is not visible to the caller.
//
// The ADD_DATACHUNK macro already ends in ';', so writing "ADD_DATACHUNK(...);"
// produces an extra empty statement.
202 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status);
203 void BITestData::addDataChunk(const char *data
, int32_t tag
, int32_t lineNum
, UErrorCode status
) {
204 if (U_FAILURE(status
)) {return;}
206 fDataToBreak
.append(CharsToUnicodeString(data
));
// The expected break position is the running length of all data appended so far,
// i.e. the end offset of this chunk.
208 fExpectedBreakPositions
.addElement(fDataToBreak
.length(), status
);
209 fExpectedTags
.addElement(tag
, status
);
210 fLineNum
.addElement(lineNum
, status
);
215 // checkResults. Compare the actual and expected break positions, report any differences.
//
// Parameters:
//   heading - text placed at the front of every message reported from here.
//   test    - the RBBITest object whose errln() receives the messages.
//
// expectedIndex walks fExpectedBreakPositions / fExpectedTags / fLineNum and
// actualIndex walks fActualBreakPositions / fActualTags. Position mismatches
// are reported through err(); tag mismatches are reported directly.
217 void BITestData::checkResults(const char *heading
, RBBITest
*test
) {
218 int32_t expectedIndex
= 0;
219 int32_t actualIndex
= 0;
222 // If we've run through both the expected and actual results vectors, we're done.
223 // break out of the loop.
224 if (expectedIndex
>= fExpectedBreakPositions
.size() &&
225 actualIndex
>= fActualBreakPositions
.size()) {
// Expected positions are exhausted but actual ones remain: report against
// the last expected entry.
230 if (expectedIndex
>= fExpectedBreakPositions
.size()) {
231 err(heading
, test
, expectedIndex
-1, actualIndex
);
// Actual positions are exhausted but expected ones remain: report against
// the last actual entry.
236 if (actualIndex
>= fActualBreakPositions
.size()) {
237 err(heading
, test
, expectedIndex
, actualIndex
-1);
242 if (fActualBreakPositions
.elementAti(actualIndex
) != fExpectedBreakPositions
.elementAti(expectedIndex
)) {
243 err(heading
, test
, expectedIndex
, actualIndex
);
244 // Try to resync the positions of the indices, to avoid a rash of spurious errors.
245 if (fActualBreakPositions
.elementAti(actualIndex
) < fExpectedBreakPositions
.elementAti(expectedIndex
)) {
// Positions agree; the tags recorded for that position must agree as well.
253 if (fActualTags
.elementAti(actualIndex
) != fExpectedTags
.elementAti(expectedIndex
)) {
// The line number is read with elementAti() so that an int32_t, not a
// pointer, is handed to the %d conversion.
254 test
->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",
255 heading
, fLineNum
.elementAti(expectedIndex
),
256 fExpectedTags
.elementAti(expectedIndex
), fActualTags
.elementAti(actualIndex
));
265 // err - An error was found. Report it, along with information about where the
266 // incorrectly broken test data appeared in the source file.
//
// Parameters:
//   heading     - text placed at the front of the reported message.
//   test        - the RBBITest object whose errln() receives the message.
//   expectedIdx - index into fExpectedBreakPositions / fLineNum.
//   actualIdx   - index into fActualBreakPositions.
//
// NOTE(review): checkResults() passes expectedIndex-1 or actualIndex-1 when one
// of the vectors has been exhausted, so an index of -1 can reach elementAti()
// here if that vector is empty — confirm UVector's out-of-range behavior.
268 void BITestData::err(const char *heading
, RBBITest
*test
, int32_t expectedIdx
, int32_t actualIdx
)
270 int32_t expected
= fExpectedBreakPositions
.elementAti(expectedIdx
);
271 int32_t actual
= fActualBreakPositions
.elementAti(actualIdx
);
// Source line of the ADD_DATACHUNK call that produced the expected item.
273 int32_t line
= fLineNum
.elementAti(expectedIdx
);
274 if (expectedIdx
> 0) {
275 // The line numbers are off by one because a premature break occurs somewhere
276 // within the previous item, rather than at the start of the current (expected) item.
277 // We want to report the offset of the unexpected break from the start of
278 // this previous item.
279 o
= actual
- fExpectedBreakPositions
.elementAti(expectedIdx
-1);
// An actual break before the expected one is reported as an unexpected break;
// presumably the second message below belongs to the opposite case (the
// branch structure is not fully visible in this excerpt).
281 if (actual
< expected
) {
282 test
->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading
, o
, line
, actual
, expected
);
284 test
->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading
, line
, actual
, expected
);
// clearResults - Discard the recorded actual break positions and actual tags.
// The expected positions, expected tags and line numbers are not touched here.
289 void BITestData::clearResults() {
290 fActualBreakPositions
.removeAllElements();
291 fActualTags
.removeAllElements();
295 //-----------------------------------------------------------------------------------
297 // Canned Test Characters
299 //-----------------------------------------------------------------------------------
301 static const UChar cannedTestArray
[] = {
302 0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,
303 0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,
304 0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,
305 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,
306 0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,
307 0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,
308 0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,
309 0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000
312 static UnicodeString
* cannedTestChars
= 0;
314 #define halfNA "\\u0928\\u094d\\u200d"
315 #define halfSA "\\u0938\\u094d\\u200d"
316 #define halfCHA "\\u091a\\u094d\\u200d"
317 #define halfKA "\\u0915\\u094d\\u200d"
318 #define deadTA "\\u0924\\u094d"
320 //--------------------------------------------------------------------------------------
322 // RBBITest constructor and destructor
324 //--------------------------------------------------------------------------------------
// Constructor. Builds the file-level cannedTestChars string: a single U+0000
// followed by the contents of cannedTestArray.
// NOTE(review): presumably the U+0000 is added explicitly because constructing
// temp from the array stops at the array's terminating 0x0000 — confirm.
// NOTE(review): cannedTestChars is a file-level static pointer and is assigned
// unconditionally here, so constructing a second RBBITest would overwrite the
// pointer without freeing the earlier string — confirm only one instance exists.
326 RBBITest::RBBITest() {
327 UnicodeString
temp(cannedTestArray
);
328 cannedTestChars
= new UnicodeString();
329 *cannedTestChars
+= (UChar
)0x0000;
330 *cannedTestChars
+= temp
;
// Destructor. Frees the string that the constructor stored in cannedTestChars.
// NOTE(review): the static pointer is not reset after the delete; with more than
// one RBBITest object it would be deleted again — confirm single-instance use.
334 RBBITest::~RBBITest() {
335 delete cannedTestChars
;
// Tag values for word-break test data.
// NOTE(review): these look like the lower bounds of the UBRK_WORD_NUMBER,
// UBRK_WORD_LETTER, UBRK_WORD_KANA and UBRK_WORD_IDEO rule-status ranges —
// confirm against ubrk.h. TestJapaneseWordBreak passes the literals 300 and 400
// rather than these names.
339 static const int T_NUMBER
= 100;
340 static const int T_LETTER
= 200;
341 static const int T_H_OR_K
= 300;
342 static const int T_IDEO
= 400;
349 //--------------------------------------------------------------------
350 //Testing the BreakIterator for devanagari script
351 //--------------------------------------------------------------------
// Devanagari "dead" (virama-terminated) consonant forms and related test
// characters. Each value is text containing backslash-u escapes, not the
// characters themselves.
// NOTE(review): presumably unescaped at the point of use (e.g. through
// CharsToUnicodeString) — confirm at the use sites.
353 #define deadRA "\\u0930\\u094d" /*dead form RA = Devanagari RA + virama*/
354 #define deadPHA "\\u092b\\u094d" /*dead form PHA = Devanagari PHA + virama*/
// dead form TTHA = Devanagari TTHA (U+0920) + virama
355 #define deadTTHA "\\u0920\\u094d"
// dead form PA = Devanagari PA (U+092A) + virama
356 #define deadPA "\\u092a\\u094d"
// dead form SA = Devanagari SA (U+0938) + virama
357 #define deadSA "\\u0938\\u094d"
358 #define visarga "\\u0903" /*Devanagari visarga; looks like an English colon*/
365 //-----------------------------------------------------------------------------------
367 // Test for status {tag} return value from break rules.
368 // TODO: a more thorough test.
370 //-----------------------------------------------------------------------------------
// TestStatusReturn - Build a RuleBasedBreakIterator from a small rule set,
// iterate over a fixed string, and check both the break positions and the
// rule status value reported at each break.
371 void RBBITest::TestStatusReturn() {
// The "Help\\ {4}/me\\!" rule carries the explicit status tag 4; the tags 1 and
// 2 expected below presumably come from rules not visible in this excerpt.
372 UnicodeString
rulesString1("$Letters = [:L:];\n"
373 "$Numbers = [:N:];\n"
376 "Help\\ {4}/me\\!;\n"
377 "[^$Letters $Numbers];\n"
378 "!.*;\n", -1, US_INV
);
379 UnicodeString testString1
= "abc123..abc Help me Help me!";
380 // 01234567890123456789012345678
// bounds1 and brkStatus are parallel arrays: entry i is the expected position
// and the expected rule status of the i-th break. Both end with -1.
381 int32_t bounds1
[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
382 int32_t brkStatus
[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
384 UErrorCode status
=U_ZERO_ERROR
;
385 UParseError parseError
;
387 RuleBasedBreakIterator
*bi
= new RuleBasedBreakIterator(rulesString1
, parseError
, status
);
// Construction failures are reported through dataerrln().
388 if(U_FAILURE(status
)) {
389 dataerrln("FAIL : in construction - %s", u_errorName(status
));
393 bi
->setText(testString1
);
// Walk every boundary from first() until DONE, comparing against the tables.
394 for (pos
=bi
->first(); pos
!= BreakIterator::DONE
; pos
=bi
->next()) {
395 if (pos
!= bounds1
[i
]) {
396 errln("FAIL: expected break at %d, got %d\n", bounds1
[i
], pos
);
400 int tag
= bi
->getRuleStatus();
401 if (tag
!= brkStatus
[i
]) {
402 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos
, brkStatus
[i
], tag
);
412 static void printStringBreaks(UnicodeString ustr
, int expected
[],
415 UErrorCode status
= U_ZERO_ERROR
;
417 printf("code alpha extend alphanum type word sent line name\n");
419 for (j
= 0; j
< ustr
.length(); j
++) {
420 if (expectedcount
> 0) {
422 for (k
= 0; k
< expectedcount
; k
++) {
423 if (j
== expected
[k
]) {
424 printf("------------------------------------------------ %d\n",
429 UChar32 c
= ustr
.char32At(j
);
433 u_charName(c
, U_UNICODE_CHAR_NAME
, name
, 100, &status
);
434 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c
,
436 u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
),
438 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY
,
440 U_SHORT_PROPERTY_NAME
),
441 u_getPropertyValueName(UCHAR_WORD_BREAK
,
442 u_getIntPropertyValue(c
,
444 U_SHORT_PROPERTY_NAME
),
445 u_getPropertyValueName(UCHAR_SENTENCE_BREAK
,
446 u_getIntPropertyValue(c
,
447 UCHAR_SENTENCE_BREAK
),
448 U_SHORT_PROPERTY_NAME
),
449 u_getPropertyValueName(UCHAR_LINE_BREAK
,
450 u_getIntPropertyValue(c
,
452 U_SHORT_PROPERTY_NAME
),
457 void RBBITest::TestThaiLineBreak() {
458 UErrorCode status
= U_ZERO_ERROR
;
459 BITestData
thaiLineSelection(status
);
461 // \u0e2f-- the Thai paiyannoi character-- isn't a letter. It's a symbol that
462 // represents elided letters at the end of a long word. It should be bound to
463 // the end of the word and not treated as an independent punctuation mark.
466 ADD_DATACHUNK(thaiLineSelection
, NULL
, 0, status
); // Break at start of data
467 ADD_DATACHUNK(thaiLineSelection
, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status
);
468 ADD_DATACHUNK(thaiLineSelection
, "\\u0e08\\u0e30", 0, status
);
469 ADD_DATACHUNK(thaiLineSelection
, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status
);
470 ADD_DATACHUNK(thaiLineSelection
, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status
);
471 // ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status);
472 // ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
473 ADD_DATACHUNK(thaiLineSelection
, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status
);
474 // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us
475 ADD_DATACHUNK(thaiLineSelection
, "\\u0e2d\\u0e2d\\u0e01", 0, status
);
476 ADD_DATACHUNK(thaiLineSelection
, "\\u0e21\\u0e32", 0, status
);
477 ADD_DATACHUNK(thaiLineSelection
, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status
);
478 ADD_DATACHUNK(thaiLineSelection
, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status
);
479 ADD_DATACHUNK(thaiLineSelection
, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status
);
480 ADD_DATACHUNK(thaiLineSelection
, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status
);
482 // the one time where the paiyannoi occurs somewhere other than at the end
483 // of a word is in the Thai abbreviation for "etc.", which both begins and
484 // ends with a paiyannoi
485 ADD_DATACHUNK(thaiLineSelection
, "\\u0e2f\\u0e25\\u0e2f", 0, status
);
486 ADD_DATACHUNK(thaiLineSelection
, "\\u0e17\\u0e35\\u0e48", 0, status
);
487 ADD_DATACHUNK(thaiLineSelection
, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status
);
489 RuleBasedBreakIterator
* e
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(
490 Locale("th"), status
);
491 if (U_FAILURE(status
))
493 errcheckln(status
, "Failed to create the BreakIterator for Thai locale in TestThaiLineBreak. - %s", u_errorName(status
));
497 generalIteratorTest(*e
, thaiLineSelection
);
503 void RBBITest::TestMixedThaiLineBreak()
505 UErrorCode status
= U_ZERO_ERROR
;
506 BITestData
thaiLineSelection(status
);
508 ADD_DATACHUNK(thaiLineSelection
, NULL
, 0, status
); // Break at start of data
511 // @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters
514 ADD_DATACHUNK(thaiLineSelection
, "\\u0E1B\\u0E35", 0, status
);
515 ADD_DATACHUNK(thaiLineSelection
, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status
);
516 ADD_DATACHUNK(thaiLineSelection
, "2545 ", 0, status
);
517 ADD_DATACHUNK(thaiLineSelection
, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status
);
518 ADD_DATACHUNK(thaiLineSelection
, "\\u0E1B\\u0E35", 0, status
);
519 ADD_DATACHUNK(thaiLineSelection
, "\\u0E09\\u0E25\\u0E2D\\u0E07", 0, status
);
520 ADD_DATACHUNK(thaiLineSelection
, "\\u0E04\\u0E23\\u0E1A", 0, status
);
521 ADD_DATACHUNK(thaiLineSelection
, "\\u0E23\\u0E2D\\u0E1A ", 0, status
);
522 ADD_DATACHUNK(thaiLineSelection
, "\"\\u0E52\\u0E52\\u0E50 ", 0, status
);
523 ADD_DATACHUNK(thaiLineSelection
, "\\u0E1b\\u0E35\" ", 0, status
);
524 ADD_DATACHUNK(thaiLineSelection
, "\\u0E02\\u0E2d\\u0E07", 0, status
);
525 ADD_DATACHUNK(thaiLineSelection
, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status
);
526 ADD_DATACHUNK(thaiLineSelection
, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status
);
527 ADD_DATACHUNK(thaiLineSelection
, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status
);
528 ADD_DATACHUNK(thaiLineSelection
, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status
);
529 ADD_DATACHUNK(thaiLineSelection
, "Bangkok)", 0, status
);
531 // @suwit - end of changes
534 RuleBasedBreakIterator
* e
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale("th"), status
);
535 if (U_FAILURE(status
))
537 errcheckln(status
, "Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak. - %s", u_errorName(status
));
542 generalIteratorTest(*e
, thaiLineSelection
);
547 void RBBITest::TestMaiyamok()
549 UErrorCode status
= U_ZERO_ERROR
;
550 BITestData
thaiLineSelection(status
);
551 ADD_DATACHUNK(thaiLineSelection
, NULL
, 0, status
); // Break at start of data
552 // the Thai maiyamok character is a shorthand symbol that means "repeat the previous
553 // word". Instead of appearing as a word unto itself, however, it's kept together
554 // with the word before it
555 ADD_DATACHUNK(thaiLineSelection
, "\\u0e44\\u0e1b\\u0e46", 0, status
);
556 ADD_DATACHUNK(thaiLineSelection
, "\\u0e21\\u0e32\\u0e46", 0, status
);
557 ADD_DATACHUNK(thaiLineSelection
, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status
);
558 ADD_DATACHUNK(thaiLineSelection
, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status
);
559 ADD_DATACHUNK(thaiLineSelection
, "\\u0e40\\u0e17\\u0e1e", 0, status
);
560 ADD_DATACHUNK(thaiLineSelection
, "\\u0e41\\u0e25\\u0e30", 0, status
);
561 ADD_DATACHUNK(thaiLineSelection
, "\\u0e40\\u0e03\\u0e35", 0, status
);
562 ADD_DATACHUNK(thaiLineSelection
, "\\u0e22\\u0e07", 0, status
);
563 ADD_DATACHUNK(thaiLineSelection
, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status
);
565 RuleBasedBreakIterator
* e
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(
566 Locale("th"), status
);
568 if (U_FAILURE(status
))
570 errcheckln(status
, "Failed to create the BreakIterator for Thai locale in TestMaiyamok. - %s", u_errorName(status
));
573 generalIteratorTest(*e
, thaiLineSelection
);
// TestBug3818 - With a Thai word break iterator over four repetitions of the
// same four-code-unit word, following(1) and following(0) must both return 4.
579 void RBBITest::TestBug3818() {
580 UErrorCode status
= U_ZERO_ERROR
;
582 // Four Thai words...
// (the same four code units repeated four times, followed by a terminating 0)
583 static const UChar thaiWordData
[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
584 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
585 UnicodeString
thaiStr(thaiWordData
);
// The result of createWordInstance() is converted with an unchecked C-style
// cast to RuleBasedBreakIterator*.
587 RuleBasedBreakIterator
* bi
=
588 (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale("th"), status
);
// Both a failing status and a null iterator are reported through errcheckln().
589 if (U_FAILURE(status
) || bi
== NULL
) {
590 errcheckln(status
, "Fail at file %s, line %d, status = %s", __FILE__
, __LINE__
, u_errorName(status
));
593 bi
->setText(thaiStr
);
// Starting from offset 1 (inside the first word) the next boundary must be 4.
595 int32_t startOfSecondWord
= bi
->following(1);
596 if (startOfSecondWord
!= 4) {
597 errln("Fail at file %s, line %d expected start of word at 4, got %d",
598 __FILE__
, __LINE__
, startOfSecondWord
);
// Starting from offset 0 (the start of the text) the next boundary must also be 4.
600 startOfSecondWord
= bi
->following(0);
601 if (startOfSecondWord
!= 4) {
602 errln("Fail at file %s, line %d expected start of word at 4, got %d",
603 __FILE__
, __LINE__
, startOfSecondWord
);
// TestJapaneseWordBreak - Build expected word-break data for a short Japanese
// sentence and run generalIteratorTest() on it with a word break iterator
// created for the "ja" locale.
609 void RBBITest::TestJapaneseWordBreak() {
610 UErrorCode status
= U_ZERO_ERROR
;
611 BITestData
japaneseWordSelection(status
);
// The trailing number on each chunk is the cumulative length in code units,
// i.e. the expected break position after that chunk. The third argument is the
// expected tag for that break.
613 ADD_DATACHUNK(japaneseWordSelection
, NULL
, 0, status
); // Break at start of data
614 ADD_DATACHUNK(japaneseWordSelection
, "\\u4ECA\\u65E5", 400, status
); //2
615 ADD_DATACHUNK(japaneseWordSelection
, "\\u306F\\u3044\\u3044", 300, status
); //5
616 ADD_DATACHUNK(japaneseWordSelection
, "\\u5929\\u6C17", 400, status
); //7
617 ADD_DATACHUNK(japaneseWordSelection
, "\\u3067\\u3059\\u306D", 300, status
); //10
618 ADD_DATACHUNK(japaneseWordSelection
, "\\u3002", 0, status
); //11
619 ADD_DATACHUNK(japaneseWordSelection
, "\\u000D\\u000A", 0, status
); //13
// The result of createWordInstance() is converted with an unchecked C-style
// cast to RuleBasedBreakIterator*.
621 RuleBasedBreakIterator
* e
= (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(
622 Locale("ja"), status
);
// A creation failure is reported through errcheckln(); this message does not
// include the u_errorName() text.
623 if (U_FAILURE(status
))
625 errcheckln(status
, "Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");
629 generalIteratorTest(*e
, japaneseWordSelection
);
633 void RBBITest::TestTrieDict() {
634 UErrorCode status
= U_ZERO_ERROR
;
637 // Open and read the test data file.
639 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
640 char testFileName
[1000];
641 if (testDataDirectory
== NULL
|| strlen(testDataDirectory
) + strlen("riwords.txt") + 10 >= sizeof(testFileName
)) {
642 errln("Can't open test data. Path too long.");
645 strcpy(testFileName
, testDataDirectory
);
646 strcat(testFileName
, "riwords.txt");
648 // Items needing deleting at the end
649 MutableTrieDictionary
*mutableDict
= NULL
;
650 CompactTrieDictionary
*compactDict
= NULL
;
651 UnicodeSet
*breaks
= NULL
;
652 UChar
*testFile
= NULL
;
653 StringEnumeration
*enumer1
= NULL
;
654 StringEnumeration
*enumer2
= NULL
;
655 MutableTrieDictionary
*mutable2
= NULL
;
656 StringEnumeration
*cloneEnum
= NULL
;
657 CompactTrieDictionary
*compact2
= NULL
;
660 const UnicodeString
*originalWord
= NULL
;
661 const UnicodeString
*cloneWord
= NULL
;
670 testFile
= ReadAndConvertFile(testFileName
, len
, NULL
, status
);
671 if (U_FAILURE(status
)) {
672 goto cleanup
; /* something went wrong, error already output */
675 mutableDict
= new MutableTrieDictionary(0x0E1C, status
);
676 if (U_FAILURE(status
)) {
677 errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status
));
681 breaks
= new UnicodeSet
;
682 breaks
->add(0x000A); // Line Feed
683 breaks
->add(0x000D); // Carriage Return
684 breaks
->add(0x2028); // Line Separator
685 breaks
->add(0x2029); // Paragraph Separator
687 // Now add each non-comment line of the file as a word.
695 if (uc
== 0x0023) { // #comment line, skip
696 while (uc
&& !breaks
->contains(uc
)) {
700 else while (uc
&& !breaks
->contains(uc
)) {
705 mutableDict
->addWord(word
, wordLen
, status
);
706 if (U_FAILURE(status
)) {
707 errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status
));
713 // Find beginning of next line
714 while (uc
&& breaks
->contains(uc
)) {
721 if (wordCount
< 50) {
722 errln("Word count (%d) unreasonably small\n", wordCount
);
726 enumer1
= mutableDict
->openWords(status
);
727 if (U_FAILURE(status
)) {
728 errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status
));
733 if (wordCount
!= (testCount
= enumer1
->count(status
))) {
734 errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
735 testCount
, wordCount
, u_errorName(status
));
740 compactDict
= new CompactTrieDictionary(*mutableDict
, status
);
741 if (U_FAILURE(status
)) {
742 errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status
));
746 enumer2
= compactDict
->openWords(status
);
747 if (U_FAILURE(status
)) {
748 errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status
));
752 if (wordCount
!= (testCount
= enumer2
->count(status
))) {
753 errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
754 testCount
, wordCount
, u_errorName(status
));
758 if (typeid(*enumer1
) == typeid(*enumer2
)) {
759 errln("CompactTrieEnumeration and MutableTrieEnumeration typeids are the same");
767 mutable2
= compactDict
->cloneMutable(status
);
768 if (U_FAILURE(status
)) {
769 errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status
));
773 cloneEnum
= mutable2
->openWords(status
);
774 if (U_FAILURE(status
)) {
775 errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status
));
779 if (wordCount
!= (testCount
= cloneEnum
->count(status
))) {
780 errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
781 testCount
, wordCount
, u_errorName(status
));
785 // Compare original dictionary to clone. Note that we can only compare the same kind of
786 // dictionary as the order of the enumerators is not guaranteed to be the same between
788 enumer1
= mutableDict
->openWords(status
);
789 if (U_FAILURE(status
)) {
790 errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status
));
794 originalWord
= enumer1
->snext(status
);
795 cloneWord
= cloneEnum
->snext(status
);
796 while (U_SUCCESS(status
) && originalWord
!= NULL
&& cloneWord
!= NULL
) {
797 if (*originalWord
!= *cloneWord
) {
798 errln("Original and cloned MutableTrieDictionary word mismatch\n");
801 originalWord
= enumer1
->snext(status
);
802 cloneWord
= cloneEnum
->snext(status
);
805 if (U_FAILURE(status
)) {
806 errln("Enumeration failed: %s\n", u_errorName(status
));
810 if (originalWord
!= cloneWord
) {
811 errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
815 // Test the data copying constructor for CompactTrieDict, and the data access APIs.
816 compact2
= new CompactTrieDictionary(compactDict
->data(), status
);
817 if (U_FAILURE(status
)) {
818 errln("CompactTrieDictionary(const void *,...) failed\n");
822 if (compact2
->dataSize() == 0) {
823 errln("CompactTrieDictionary->dataSize() == 0\n");
827 // Now count the words via the second dictionary
829 enumer1
= compact2
->openWords(status
);
830 if (U_FAILURE(status
)) {
831 errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status
));
835 if (wordCount
!= (testCount
= enumer1
->count(status
))) {
836 errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
837 testCount
, wordCount
, u_errorName(status
));
853 //----------------------------------------------------------------------------
855 // generalIteratorTest Given a break iterator and a set of test data,
856 // Run the tests and report the results.
858 //----------------------------------------------------------------------------
// generalIteratorTest - Set the iterator's text to td.fDataToBreak and run each
// of the traversal tests below against the same test data, in this order.
//
// Parameters:
//   bi - the break iterator under test.
//   td - test data holding the text plus the expected and actual results.
859 void RBBITest::generalIteratorTest(RuleBasedBreakIterator
& bi
, BITestData
&td
)
862 bi
.setText(td
.fDataToBreak
);
// Forward traversal with first()/next().
864 testFirstAndNext(bi
, td
);
// Backward traversal with last()/previous().
866 testLastAndPrevious(bi
, td
);
// Random-access queries: following(), preceding() and isBoundary().
868 testFollowing(bi
, td
);
869 testPreceding(bi
, td
);
870 testIsBoundary(bi
, td
);
// Comparison of a cloned iterator against the original, using next(n).
871 doMultipleSelectionTest(bi
, td
);
876 // testFirstAndNext. Run the iterator forwards in the obvious first(), next()
879 void RBBITest::testFirstAndNext(RuleBasedBreakIterator
& bi
, BITestData
&td
)
881 UErrorCode status
= U_ZERO_ERROR
;
886 logln("Test first and next");
887 bi
.setText(td
.fDataToBreak
);
890 for (p
=bi
.first(); p
!=RuleBasedBreakIterator::DONE
; p
=bi
.next()) {
891 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
892 tag
= bi
.getRuleStatus();
893 td
.fActualTags
.addElement(tag
, status
);
895 // If the iterator is not making forward progress, stop.
896 // No need to raise an error here, it'll be detected in the normal check of results.
901 td
.checkResults("testFirstAndNext", this);
906 // TestLastAndPrevious. Run the iterator backwards, starting with last().
908 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator
& bi
, BITestData
&td
)
910 UErrorCode status
= U_ZERO_ERROR
;
912 int32_t lastP
= 0x7ffffffe;
915 logln("Test last and previous");
916 bi
.setText(td
.fDataToBreak
);
919 for (p
=bi
.last(); p
!=RuleBasedBreakIterator::DONE
; p
=bi
.previous()) {
920 // Save break position. Insert it at start of vector of results, shoving
921 // already-saved results further towards the end.
922 td
.fActualBreakPositions
.insertElementAt(p
, 0, status
);
923 // bi.previous(); // TODO: Why does this fix things up????
925 tag
= bi
.getRuleStatus();
926 td
.fActualTags
.insertElementAt(tag
, 0, status
);
928 // If the iterator is not making progress, stop.
929 // No need to raise an error here, it'll be detected in the normal check of results.
934 td
.checkResults("testLastAndPrevious", this);
938 void RBBITest::testFollowing(RuleBasedBreakIterator
& bi
, BITestData
&td
)
940 UErrorCode status
= U_ZERO_ERROR
;
943 int32_t lastP
= -2; // A value that will never be returned as a break position.
944 // cannot be -1; that is returned for DONE.
947 logln("testFollowing():");
948 bi
.setText(td
.fDataToBreak
);
951 // Save the starting point, since we won't get that out of following.
953 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
954 tag
= bi
.getRuleStatus();
955 td
.fActualTags
.addElement(tag
, status
);
957 for (i
= 0; i
<= td
.fDataToBreak
.length()+1; i
++) {
960 if (p
== RuleBasedBreakIterator::DONE
) {
963 // We've reached a new break position. Save it.
964 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
965 tag
= bi
.getRuleStatus();
966 td
.fActualTags
.addElement(tag
, status
);
970 // The loop normally exits by means of the break in the middle.
971 // Make sure that the index was at the correct position for the break iterator to have
973 if (i
!= td
.fDataToBreak
.length()) {
974 errln("testFollowing(): iterator returned DONE prematurely.");
977 // Full check of all results.
978 td
.checkResults("testFollowing", this);
983 void RBBITest::testPreceding(RuleBasedBreakIterator
& bi
, BITestData
&td
) {
984 UErrorCode status
= U_ZERO_ERROR
;
987 int32_t lastP
= 0x7ffffffe;
990 logln("testPreceding():");
991 bi
.setText(td
.fDataToBreak
);
995 td
.fActualBreakPositions
.addElement(p
, status
);
996 tag
= bi
.getRuleStatus();
997 td
.fActualTags
.addElement(tag
, status
);
999 for (i
= td
.fDataToBreak
.length(); i
>=-1; i
--) {
1000 p
= bi
.preceding(i
);
1002 if (p
== RuleBasedBreakIterator::DONE
) {
1005 // We've reached a new break position. Save it.
1006 td
.fActualBreakPositions
.insertElementAt(p
, 0, status
);
1008 tag
= bi
.getRuleStatus();
1009 td
.fActualTags
.insertElementAt(tag
, 0, status
);
1012 // The loop normally exits by means of the break in the middle.
1013 // Make sure that the index was at the correct position for the break iterator to have
1016 errln("testPreceding(): iterator returned DONE prematurely.");
1019 // Full check of all results.
1020 td
.checkResults("testPreceding", this);
// testIsBoundary - Query isBoundary() for every offset from 0 through the text
// length, record each offset reported as a boundary together with its rule
// status, then compare the recorded results with the expected ones.
//
// Parameters:
//   bi - the break iterator under test.
//   td - test data; its actual-result vectors are appended to here.
1025 void RBBITest::testIsBoundary(RuleBasedBreakIterator
& bi
, BITestData
&td
) {
1026 UErrorCode status
= U_ZERO_ERROR
;
1030 logln("testIsBoundary():");
1031 bi
.setText(td
.fDataToBreak
);
// The loop bound is inclusive, so the offset equal to the text length is
// tested too.
1034 for (i
= 0; i
<= td
.fDataToBreak
.length(); i
++) {
1035 if (bi
.isBoundary(i
)) {
1036 td
.fActualBreakPositions
.addElement(i
, status
); // Save result.
// The rule status is read right after the successful isBoundary() call.
1037 tag
= bi
.getRuleStatus();
1038 td
.fActualTags
.addElement(tag
, status
);
1041 td
.checkResults("testIsBoundary: ", this);
// doMultipleSelectionTest: checks that next(n) on a clone agrees with stepping
// the original iterator one break at a time, first forwards from first() and
// then backwards from last(). Along the way it also exercises clone(),
// operator== and operator!= on the two iterators.
1046 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator
& iterator
, BITestData
&td
)
1048 iterator
.setText(td
.fDataToBreak
);
// testIterator is an independent copy; it is deleted at the end of the function.
1050 RuleBasedBreakIterator
* testIterator
=(RuleBasedBreakIterator
*)iterator
.clone();
1051 int32_t offset
= iterator
.first();
1055 logln("doMultipleSelectionTest text of length: %d", td
.fDataToBreak
.length());
// A fresh clone must compare equal to its source.
1057 if (*testIterator
!= iterator
)
1058 errln("clone() or operator!= failed: two clones compared unequal");
// Forward pass: reposition the clone at the start and jump "count" breaks in
// one call; the result must match the original iterator's current offset.
1061 testOffset
= testIterator
->first();
1062 testOffset
= testIterator
->next(count
);
1063 if (offset
!= testOffset
)
1064 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count
+ ", next(n) returned " + testOffset
+ " and next() had " + offset
);
1066 if (offset
!= RuleBasedBreakIterator::DONE
) {
1068 offset
= iterator
.next();
// After the original advances, the two iterators are at different positions
// and must no longer compare equal.
1070 if (offset
!= RuleBasedBreakIterator::DONE
&& *testIterator
== iterator
) {
1071 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count
, offset
);
// Guard against flooding the log if operator== keeps failing.
1072 if (count
> 10000 || offset
== -1) {
1073 errln("operator== failed too many times. Stopping test.");
1075 errln("Does (RuleBasedBreakIterator::DONE == -1)?");
1081 } while (offset
!= RuleBasedBreakIterator::DONE
);
1083 // now do it backwards...
1084 offset
= iterator
.last();
1088 testOffset
= testIterator
->last();
1089 testOffset
= testIterator
->next(count
); // next() with a negative arg is same as previous
1090 if (offset
!= testOffset
)
1091 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count
+ ", next(n) returned " + testOffset
+ " and next() had " + offset
);
1093 if (offset
!= RuleBasedBreakIterator::DONE
) {
1095 offset
= iterator
.previous();
1097 } while (offset
!= RuleBasedBreakIterator::DONE
);
// Release the clone created above.
1099 delete testIterator
;
1103 //---------------------------------------------
1107 //---------------------------------------------
// TestEmptyString: runs the general iterator test with a line break iterator
// for the default locale over empty text. The only expected break is the one
// at offset 0 registered through ADD_DATACHUNK.
1108 void RBBITest::TestEmptyString()
1110 UnicodeString text
= "";
1111 UErrorCode status
= U_ZERO_ERROR
;
1113 BITestData
x(status
);
1114 ADD_DATACHUNK(x
, "", 0, status
); // Break at start of data
1115 RuleBasedBreakIterator
* bi
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale::getDefault(), status
);
// Creation can fail when break-iterator data is unavailable; report via
// errcheckln rather than dereferencing bi.
1116 if (U_FAILURE(status
))
1118 errcheckln(status
, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status
));
1121 generalIteratorTest(*bi
, x
);
// TestGetAvailableLocales: fetches the locale list from
// BreakIterator::getAvailableLocales(), reports an empty list as a data error,
// and logs the name of every returned locale.
1125 void RBBITest::TestGetAvailableLocales()
1127 int32_t locCount
= 0;
// locCount is an out-parameter filled in with the number of entries in locList.
1128 const Locale
* locList
= BreakIterator::getAvailableLocales(locCount
);
1131 dataerrln("getAvailableLocales() returned an empty list!");
1132 // Just make sure that it's returning good memory.
1134 for (i
= 0; i
< locCount
; ++i
) {
1135 logln(locList
[i
].getName());
1139 //Testing the BreakIterator::getDisplayName() function
// TestGetDisplayName: checks both overloads of BreakIterator::getDisplayName().
// The single-locale overload is only verified when the default locale is en_US,
// because the comparison string is the English display name.
1140 void RBBITest::TestGetDisplayName()
1142 UnicodeString result
;
// Display name of en_US, rendered in the default locale.
1144 BreakIterator::getDisplayName(Locale::getUS(), result
);
1145 if (Locale::getDefault() == Locale::getUS() && result
!= "English (United States)")
1146 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
// Display name of fr_FR, explicitly rendered in en_US.
1149 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result
);
1150 if (result
!= "French (France)")
1151 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
1155 * Test End Behaviour
// TestEndBehaviour: with a word break iterator over "boo.", expects breaks at
// 0 (start), 3 (before the period) and 4 (end of the string).
1158 void RBBITest::TestEndBehaviour()
1160 UErrorCode status
= U_ZERO_ERROR
;
1161 UnicodeString
testString("boo.");
1162 BreakIterator
*wb
= BreakIterator::createWordInstance(Locale::getDefault(), status
);
1163 if (U_FAILURE(status
))
1165 errcheckln(status
, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status
));
1168 wb
->setText(testString
);
1170 if (wb
->first() != 0)
1171 errln("Didn't get break at beginning of string.");
1172 if (wb
->next() != 3)
1173 errln("Didn't get break before period in \"boo.\"");
// NOTE(review): because of the "&&", next() is only called when current() is
// not already 4; the error fires only if both checks miss the end of the text.
1174 if (wb
->current() != 4 && wb
->next() != 4)
1175 errln("Didn't get break at end of string.");
// TestBug4153072: adopts a StringCharacterIterator restricted to a sub-range of
// "...Hello, World!..." into a word break iterator and checks isBoundary() for
// offsets from -1 up to and including "begin". Only offset 0 is expected to be
// reported as a boundary (see the UText note below).
1181 void RBBITest::TestBug4153072() {
1182 UErrorCode status
= U_ZERO_ERROR
;
1183 BreakIterator
*iter
= BreakIterator::createWordInstance(Locale::getDefault(), status
);
1184 if (U_FAILURE(status
))
1186 errcheckln(status
, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status
));
1189 UnicodeString
str("...Hello, World!...");
// The restricted range stops 3 characters before the end of str.
1191 int32_t end
= str
.length() - 3;
// adoptText() takes ownership of textIterator; it must not be deleted here.
1194 StringCharacterIterator
* textIterator
= new StringCharacterIterator(str
, begin
, end
, begin
);
1195 iter
->adoptText(textIterator
);
1197 // Note: with the switch to UText, there is no way to restrict the
1198 // iteration range to begin at an index other than zero.
1199 // String character iterators created with a non-zero bound are
1200 // treated by RBBI as being empty.
1201 for (index
= -1; index
< begin
+ 1; ++index
) {
1202 onBoundary
= iter
->isBoundary(index
);
// Fail if offset 0 is NOT a boundary, or if any other offset IS one.
1203 if (index
== 0? !onBoundary
: onBoundary
) {
1204 errln((UnicodeString
)"Didn't handle isBoundary correctly with offset = " + index
+
1205 " and begin index = " + begin
);
1213 // Test for problem reported by Ashok Matoria on 9 July 2007
1214 // One.<kSoftHyphen><kSpace>Two.
1216 // Sentence break at start (0) and then on calling next() it breaks at
1217 // 'T' of "Two". Now, at this point if I do next() and
1218 // then previous(), it breaks at <kSoftHyphen> instead of 'T' of "Two".
// TestBug5775: regression test for sentence breaking around a soft hyphen.
// Using an English sentence break iterator on "One.<U+00AD> Two.", the breaks
// asserted are 6, then 10, and 6 again after previous().
1220 void RBBITest::TestBug5775() {
1221 UErrorCode status
= U_ZERO_ERROR
;
1222 BreakIterator
*bi
= BreakIterator::createSentenceInstance(Locale::getEnglish(), status
);
1223 TEST_ASSERT_SUCCESS(status
);
1224 if (U_FAILURE(status
)) {
1227 // Check for status first for better handling of no data errors.
1228 TEST_ASSERT(bi
!= NULL
);
// The literal contains a backslash escape; US_INV selects the invariant-character
// constructor. NOTE(review): the escape is presumably expanded (unescape) on a
// later line before the text is set - confirm.
1233 UnicodeString
s("One.\\u00ad Two.", -1, US_INV
);
1237 int pos
= bi
->next();
1238 TEST_ASSERT(pos
== 6);
1240 TEST_ASSERT(pos
== 10);
// Stepping back must land on the same break that next() reported earlier.
1241 pos
= bi
->previous();
1242 TEST_ASSERT(pos
== 6);
1249 * Test Japanese Line Break
// TestJapaneseLineBreak: for a three-character test string whose middle
// character is replaced in turn by each "preceding" (opening) and "following"
// (closing) punctuation character, checks where a Japanese line break iterator
// allows breaks around that character.
// NOTE(review): the comment below says the test is currently dumped; the
// mechanism that disables it is not visible here - confirm.
1252 void RBBITest::TestJapaneseLineBreak()
1255 // Test needs updating some more... Dump it for now.
1258 // Change for Unicode TR 14: Punctuation characters with categories Pi and Pf do not count
1259 // as opening and closing punctuation for line breaking.
1260 // Also, \u30fc and \u30fe are not counted as hyphens. Remove these chars
1261 // from these tests. 6-13-2002
1263 UErrorCode status
= U_ZERO_ERROR
;
// Template text: the character at index 1 ('x') is overwritten in the loops below.
1264 UnicodeString testString
= CharsToUnicodeString("\\u4e00x\\u4e8c");
// Characters expected to stay attached to the character that follows them.
1265 UnicodeString precedingChars
= CharsToUnicodeString(
1266 //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
1267 "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");
// Characters expected to stay attached to the character that precedes them.
1268 UnicodeString followingChars
= CharsToUnicodeString(
1269 // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"
1270 ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"
1271 // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"
1272 ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"
1273 "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");
1274 BreakIterator
*iter
= BreakIterator::createLineInstance(Locale::getJapan(), status
);
1277 if (U_FAILURE(status
))
1279 errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");
// Pass 1: opening punctuation in the middle position.
1283 for (i
= 0; i
< precedingChars
.length(); i
++) {
1284 testString
.setCharAt(1, precedingChars
[i
]);
1285 iter
->setText(testString
);
1286 int32_t j
= iter
->first();
1288 errln("ja line break failure: failed to start at 0");
1291 errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars
[i
])
1292 + "' (" + ((int)(precedingChars
[i
])) + ")");
1295 errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars
[i
])
1296 + "' (" + ((int)(precedingChars
[i
])) + ")");
// Pass 2: closing punctuation in the middle position.
1299 for (i
= 0; i
< followingChars
.length(); i
++) {
1300 testString
.setCharAt(1, followingChars
[i
]);
1301 iter
->setText(testString
);
1302 int j
= iter
->first();
1304 errln("ja line break failure: failed to start at 0");
1307 errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars
[i
])
1308 + "' (" + ((int)(followingChars
[i
])) + ")");
1311 errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars
[i
])
1312 + "' (" + ((int)(followingChars
[i
])) + ")");
1319 //------------------------------------------------------------------------------
1321 // RBBITest::Extended Run RBBI Tests from an external test data file
1323 //------------------------------------------------------------------------------
// Text of the current <data> section, with escapes and break markers removed.
1327 UnicodeString dataToBreak
;
// Indexed by offset into dataToBreak: 0 means no break expected, -1 an expected
// break with no explicit tag, any other value the expected rule-status tag.
1328 UVector32
*expectedBreaks
;
// executeTest: runs t->bi over t->dataToBreak forwards (first()/next()) and
// backwards (last()/previous()), comparing every break position - and, for
// expected breaks, the rule status - against t->expectedBreaks.
// t->srcLine / t->srcCol map each data offset back to a line/column in the test
// data file and are used only to make the error messages locatable.
1333 void RBBITest::executeTest(TestParams
*t
) {
// Nothing to run if no break iterator was created for this data section.
1338 if (t
->bi
== NULL
) {
1342 t
->bi
->setText(t
->dataToBreak
);
1344 // Run the iterator forward
1347 for (bp
= t
->bi
->first(); bp
!= BreakIterator::DONE
; bp
= t
->bi
->next()) {
1349 // Fail for lack of forward progress.
1350 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
1351 bp
, t
->srcLine
->elementAti(bp
), t
->srcCol
->elementAti(bp
));
1355 // Check that we didn't miss an expected break between the last one
1357 for (i
=prevBP
+1; i
<bp
; i
++) {
1358 if (t
->expectedBreaks
->elementAti(i
) != 0) {
1359 int expected
[] = {0, i
};
1360 printStringBreaks(t
->dataToBreak
, expected
, 2);
1361 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1362 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
));
1366 // Check that the break we did find was expected
1367 if (t
->expectedBreaks
->elementAti(bp
) == 0) {
1368 int expected
[] = {0, bp
};
1369 printStringBreaks(t
->dataToBreak
, expected
, 2);
1370 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1371 bp
, t
->srcLine
->elementAti(bp
), t
->srcCol
->elementAti(bp
));
1373 // The break was expected.
1374 // Check that the {nnn} tag value is correct.
1375 int32_t expectedTagVal
= t
->expectedBreaks
->elementAti(bp
);
// -1 marks an expected break that carries no explicit tag value.
1376 if (expectedTagVal
== -1) {
1379 int32_t line
= t
->srcLine
->elementAti(bp
);
1380 int32_t rs
= ((RuleBasedBreakIterator
*)t
->bi
)->getRuleStatus();
1381 if (rs
!= expectedTagVal
) {
1382 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
1383 " Actual, Expected status = %4d, %4d",
1384 bp
, line
, t
->srcCol
->elementAti(bp
), rs
, expectedTagVal
);
1392 // Verify that there were no missed expected breaks after the last one found
1393 for (i
=prevBP
+1; i
<t
->expectedBreaks
->size(); i
++) {
1394 if (t
->expectedBreaks
->elementAti(i
) != 0) {
1395 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1396 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
));
1401 // Run the iterator backwards, verify that the same breaks are found.
1403 prevBP
= t
->dataToBreak
.length()+2; // start with a phony value for the last break pos seen.
1404 for (bp
= t
->bi
->last(); bp
!= BreakIterator::DONE
; bp
= t
->bi
->previous()) {
1406 // Fail for lack of progress.
1407 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
1408 bp
, t
->srcLine
->elementAti(bp
), t
->srcCol
->elementAti(bp
));
1412 // Check that we didn't miss an expected break between the last one
1413 // and this one. (UVector returns zeros for index out of bounds.)
1414 for (i
=prevBP
-1; i
>bp
; i
--) {
1415 if (t
->expectedBreaks
->elementAti(i
) != 0) {
1416 errln("Reverse Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1417 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
));
1421 // Check that the break we did find was expected
1422 if (t
->expectedBreaks
->elementAti(bp
) == 0) {
1423 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1424 bp
, t
->srcLine
->elementAti(bp
), t
->srcCol
->elementAti(bp
));
1426 // The break was expected.
1427 // Check that the {nnn} tag value is correct.
1428 int32_t expectedTagVal
= t
->expectedBreaks
->elementAti(bp
);
// -1 marks an expected break that carries no explicit tag value.
1429 if (expectedTagVal
== -1) {
1432 int line
= t
->srcLine
->elementAti(bp
);
1433 int32_t rs
= ((RuleBasedBreakIterator
*)t
->bi
)->getRuleStatus();
1434 if (rs
!= expectedTagVal
) {
1435 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
1436 " Actual, Expected status = %4d, %4d",
1437 bp
, line
, t
->srcCol
->elementAti(bp
), rs
, expectedTagVal
);
1444 // Verify that there were no missed breaks prior to the last one found
// NOTE(review): this loop belongs to the reverse pass, but the message below
// says "Forward" (and misspells "Iteration") - consider correcting the text.
1445 for (i
=prevBP
-1; i
>=0; i
--) {
1446 if (t
->expectedBreaks
->elementAti(i
) != 0) {
1447 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1448 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
));
// TestExtended: data-driven break iterator test. Reads "rbbitst.txt" from the
// source test data directory (as UTF-8), then walks it character by character
// with a small state machine (PARSE_COMMENT / PARSE_TAG / PARSE_DATA / PARSE_NUM):
//   - tags <word> <char> <line> <sent> <title> select the break iterator type,
//   - <locale xx> selects the locale used when creating iterators,
//   - <data> ... </data> delimits a chunk of test text, inside which a bullet
//     (U+2022), "<>" or "<nnn>" marks an expected break, "\N{name}" and
//     backslash escapes insert characters, and '#' starts a comment.
// The text and expected breaks are accumulated in a TestParams object (tp).
// The whole body is compiled only when regular expressions are available.
1454 void RBBITest::TestExtended() {
1455 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1456 UErrorCode status
= U_ZERO_ERROR
;
1459 UnicodeString rules
;
// Per-offset bookkeeping vectors; see the TestParams members.
1462 tp
.expectedBreaks
= new UVector32(status
);
1463 tp
.srcLine
= new UVector32(status
);
1464 tp
.srcCol
= new UVector32(status
);
// Matches "<locale NAME>"; capture group 1 is the locale name.
1466 RegexMatcher
localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status
);
1467 if (U_FAILURE(status
)) {
1468 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__
, __LINE__
, u_errorName(status
));
1473 // Open and read the test data file.
1475 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
1476 char testFileName
[1000];
// NOTE(review): the length check covers only the directory part; the
// "rbbitst.txt" suffix appended below is not included in the bound.
1477 if (testDataDirectory
== NULL
|| strlen(testDataDirectory
) >= sizeof(testFileName
)) {
1478 errln("Can't open test data. Path too long.");
1481 strcpy(testFileName
, testDataDirectory
);
1482 strcat(testFileName
, "rbbitst.txt");
// len receives the number of UChars read.
1485 UChar
*testFile
= ReadAndConvertFile(testFileName
, len
, "UTF-8", status
);
1486 if (U_FAILURE(status
)) {
1487 return; /* something went wrong, error already output */
1494 // Put the test data into a UnicodeString
// FALSE as the first argument selects the read-only aliasing constructor, so
// testString refers to testFile's storage rather than copying it.
1496 UnicodeString
testString(FALSE
, testFile
, len
);
1504 parseState
= PARSE_TAG
;
// State to return to when a comment ends.
1506 EParseState savedState
= PARSE_TAG
;
1508 static const UChar CH_LF
= 0x0a;
1509 static const UChar CH_CR
= 0x0d;
1510 static const UChar CH_HASH
= 0x23;
1511 /*static const UChar CH_PERIOD = 0x2e;*/
1512 static const UChar CH_LT
= 0x3c;
1513 static const UChar CH_GT
= 0x3e;
1514 static const UChar CH_BACKSLASH
= 0x5c;
1515 static const UChar CH_BULLET
= 0x2022;
// Current position in the source file, for error messages.
1517 int32_t lineNum
= 1;
1518 int32_t colStart
= 0;
1520 int32_t charIdx
= 0;
1522 int32_t tagValue
= 0; // The numeric value of a <nnn> tag.
// Main scan. The for statement has no increment clause; charIdx is advanced
// inside the body by varying amounts.
1524 for (charIdx
= 0; charIdx
< len
; ) {
1525 status
= U_ZERO_ERROR
;
1526 UChar c
= testString
.charAt(charIdx
);
1528 if (c
== CH_CR
&& charIdx
<len
&& testString
.charAt(charIdx
) == CH_LF
) {
1529 // treat CRLF as a unit
1533 if (c
== CH_LF
|| c
== CH_CR
) {
1537 column
= charIdx
- colStart
+ 1;
1539 switch (parseState
) {
// Comment state: a line end returns to whichever state the comment interrupted.
1541 if (c
== 0x0a || c
== 0x0d) {
1542 parseState
= savedState
;
// Tag state: a comment here resumes tag parsing afterwards.
1549 parseState
= PARSE_COMMENT
;
1550 savedState
= PARSE_TAG
;
1553 if (u_isUWhiteSpace(c
)) {
// The comparisons below start at charIdx-1, i.e. at the character just read
// into c. NOTE(review): this relies on charIdx having been advanced past c.
1556 if (testString
.compare(charIdx
-1, 6, "<word>") == 0) {
1558 tp
.bi
= BreakIterator::createWordInstance(locale
, status
);
1562 if (testString
.compare(charIdx
-1, 6, "<char>") == 0) {
1564 tp
.bi
= BreakIterator::createCharacterInstance(locale
, status
);
1568 if (testString
.compare(charIdx
-1, 6, "<line>") == 0) {
1570 tp
.bi
= BreakIterator::createLineInstance(locale
, status
);
1574 if (testString
.compare(charIdx
-1, 6, "<sent>") == 0) {
1577 tp
.bi
= BreakIterator::createSentenceInstance(locale
, status
);
1581 if (testString
.compare(charIdx
-1, 7, "<title>") == 0) {
1583 tp
.bi
= BreakIterator::createTitleInstance(locale
, status
);
1588 // <locale loc_name>
1589 localeMatcher
.reset(testString
);
1590 if (localeMatcher
.lookingAt(charIdx
-1, status
)) {
1591 UnicodeString localeName
= localeMatcher
.group(1, status
);
// Locale::createFromName() takes a char*, so extract the name to a byte buffer.
1592 char localeName8
[100];
1593 localeName
.extract(0, localeName
.length(), localeName8
, sizeof(localeName8
), 0);
1594 locale
= Locale::createFromName(localeName8
);
// Skip the whole matched "<locale ...>" tag.
1595 charIdx
+= localeMatcher
.group(0, status
).length();
1596 TEST_ASSERT_SUCCESS(status
);
// <data> starts a new chunk: reset the accumulated text and per-offset vectors.
1599 if (testString
.compare(charIdx
-1, 6, "<data>") == 0) {
1600 parseState
= PARSE_DATA
;
1602 tp
.dataToBreak
= "";
1603 tp
.expectedBreaks
->removeAllElements();
1604 tp
.srcCol
->removeAllElements();
1605 tp
.srcLine
->removeAllElements();
// Anything else in tag state is a syntax error in the data file.
1609 errln("line %d: Tag expected in test file.", lineNum
);
1610 parseState
= PARSE_COMMENT
;
1611 savedState
= PARSE_DATA
;
1612 goto end_test
; // Stop the test.
// Data state: a bullet marks an expected break (with no tag value, stored as
// -1) at the current end of the accumulated data.
1617 if (c
== CH_BULLET
) {
1618 int32_t breakIdx
= tp
.dataToBreak
.length();
1619 tp
.expectedBreaks
->setSize(breakIdx
+1);
1620 tp
.expectedBreaks
->setElementAt(-1, breakIdx
);
1621 tp
.srcLine
->setSize(breakIdx
+1);
1622 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1623 tp
.srcCol
->setSize(breakIdx
+1);
1624 tp
.srcCol
->setElementAt(column
, breakIdx
);
// End of the data chunk.
// NOTE(review): the collected chunk is presumably executed at this point - confirm.
1628 if (testString
.compare(charIdx
-1, 7, "</data>") == 0) {
1629 // Add final entry to mappings from break location to source file position.
1630 // Need one extra because last break position returned is after the
1631 // last char in the data, not at the last char.
1632 tp
.srcLine
->addElement(lineNum
, status
);
1633 tp
.srcCol
->addElement(column
, status
);
1635 parseState
= PARSE_TAG
;
1643 if (testString
.compare(charIdx
-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1644 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1645 // Get the code point from the name and insert it into the test data.
1646 // (Damn, no API takes names in Unicode !!!
1647 // we've got to take it back to char *)
1648 int32_t nameEndIdx
= testString
.indexOf((UChar
)0x7d/*'}'*/, charIdx
);
1649 int32_t nameLength
= nameEndIdx
- (charIdx
+2);
1650 char charNameBuf
[200];
// -1 means "no valid character found".
1651 UChar32 theChar
= -1;
1652 if (nameEndIdx
!= -1) {
// Local status shadows the outer one so a bad name does not abort the parse.
1653 UErrorCode status
= U_ZERO_ERROR
;
1654 testString
.extract(charIdx
+2, nameLength
, charNameBuf
, sizeof(charNameBuf
));
// Force termination in case the name filled the buffer.
1655 charNameBuf
[sizeof(charNameBuf
)-1] = 0;
1656 theChar
= u_charFromName(U_UNICODE_CHAR_NAME
, charNameBuf
, &status
);
1657 if (U_FAILURE(status
)) {
1661 if (theChar
== -1) {
1662 errln("Error in named character in test file at line %d, col %d",
1665 // Named code point was recognized. Insert it
1666 // into the test data.
1667 tp
.dataToBreak
.append(theChar
);
// A supplementary code point occupies two UChars, so more than one mapping
// entry may be needed.
1668 while (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1669 tp
.srcLine
->addElement(lineNum
, status
);
1670 tp
.srcCol
->addElement(column
, status
);
// Continue scanning just past the closing '}'.
1673 if (nameEndIdx
> charIdx
) {
1674 charIdx
= nameEndIdx
+1;
// "<>" marks an expected break with no tag value (stored as -1).
1683 if (testString
.compare(charIdx
-1, 2, "<>") == 0) {
1685 int32_t breakIdx
= tp
.dataToBreak
.length();
1686 tp
.expectedBreaks
->setSize(breakIdx
+1);
1687 tp
.expectedBreaks
->setElementAt(-1, breakIdx
);
1688 tp
.srcLine
->setSize(breakIdx
+1);
1689 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1690 tp
.srcCol
->setSize(breakIdx
+1);
1691 tp
.srcCol
->setElementAt(column
, breakIdx
);
// Switch to collecting the digits of a <nnn> tag.
1697 parseState
= PARSE_NUM
;
1701 if (c
== CH_HASH
&& column
==3) { // TODO: why is column off so far?
1702 parseState
= PARSE_COMMENT
;
1703 savedState
= PARSE_DATA
;
1707 if (c
== CH_BACKSLASH
) {
1708 // Check for \ at end of line, a line continuation.
1709 // Advance over (discard) the newline
1710 UChar32 cp
= testString
.char32At(charIdx
);
1711 if (cp
== CH_CR
&& charIdx
<len
&& testString
.charAt(charIdx
+1) == CH_LF
) {
1713 // Need an extra increment of the input ptr to move over both of them
1716 if (cp
== CH_LF
|| cp
== CH_CR
) {
1723 // Let unescape handle the back slash.
// unescapeAt() takes charIdx by reference and advances it past the escape.
1724 cp
= testString
.unescapeAt(charIdx
);
1726 // Escape sequence was recognized. Insert the char
1727 // into the test data.
1728 tp
.dataToBreak
.append(cp
);
1729 while (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1730 tp
.srcLine
->addElement(lineNum
, status
);
1731 tp
.srcCol
->addElement(column
, status
);
1737 // Not a recognized backslash escape sequence.
1738 // Take the next char as a literal.
1739 // TODO: Should this be an error?
1740 c
= testString
.charAt(charIdx
);
1741 charIdx
= testString
.moveIndex32(charIdx
, 1);
1744 // Normal, non-escaped data char.
1745 tp
.dataToBreak
.append(c
);
1747 // Save the mapping from offset in the data to line/column numbers in
1748 // the original input file. Will be used for better error messages only.
1749 // If there's an expected break before this char, the slot in the mapping
1750 // vector will already be set for this char; don't overwrite it.
1751 if (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1752 tp
.srcLine
->addElement(lineNum
, status
);
1753 tp
.srcCol
->addElement(column
, status
);
1759 // We are parsing an expected numeric tag value, like <1234>,
1760 // within a chunk of data.
1761 if (u_isUWhiteSpace(c
)) {
1766 // Finished the number. Add the info to the expected break data,
1767 // and switch parse state back to doing plain data.
1768 parseState
= PARSE_DATA
;
1769 if (tagValue
== 0) {
1772 int32_t breakIdx
= tp
.dataToBreak
.length();
1773 tp
.expectedBreaks
->setSize(breakIdx
+1);
1774 tp
.expectedBreaks
->setElementAt(tagValue
, breakIdx
);
1775 tp
.srcLine
->setSize(breakIdx
+1);
1776 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1777 tp
.srcCol
->setSize(breakIdx
+1);
1778 tp
.srcCol
->setElementAt(column
, breakIdx
);
// Accumulate one more decimal digit of the tag value.
1783 tagValue
= tagValue
*10 + u_charDigitValue(c
);
1787 errln("Syntax Error in test file at line %d, col %d",
1789 parseState
= PARSE_COMMENT
;
1790 goto end_test
; // Stop the test
// Any ICU error raised while handling this character aborts the test.
1795 if (U_FAILURE(status
)) {
1796 errln("ICU Error %s while parsing test file at line %d.",
1797 u_errorName(status
), lineNum
);
1798 status
= U_ZERO_ERROR
;
1799 goto end_test
; // Stop the test
// Cleanup of the heap-allocated bookkeeping.
1806 delete tp
.expectedBreaks
;
1813 void RBBITest::TestThaiBreaks() {
1814 UErrorCode status
=U_ZERO_ERROR
;
1816 Locale locale
= Locale("th");
1819 0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B,
1820 0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19,
1821 0x0E16, 0x0E49, 0x0E33, 0x0000
1823 int32_t expectedWordResult
[] = {
1824 2, 3, 6, 10, 11, 15, 17, 20, 22
1826 int32_t expectedLineResult
[] = {
1827 3, 6, 11, 15, 17, 20, 22
1830 int32_t size
= u_strlen(c
);
1831 UnicodeString text
=UnicodeString(c
);
1833 b
= BreakIterator::createWordInstance(locale
, status
);
1834 if (U_FAILURE(status
)) {
1835 errcheckln(status
, "Unable to create thai word break iterator. - %s", u_errorName(status
));
1840 while ((p
=b
->next())!=BreakIterator::DONE
&& p
< size
) {
1841 if (p
!= expectedWordResult
[index
++]) {
1842 errln("Incorrect break given by thai word break iterator. Expected: %d Got: %d", expectedWordResult
[index
-1], p
);
1847 b
= BreakIterator::createLineInstance(locale
, status
);
1848 if (U_FAILURE(status
)) {
1849 printf("Unable to create thai line break iterator.\n");
1854 while ((p
=b
->next())!=BreakIterator::DONE
&& p
< size
) {
1855 if (p
!= expectedLineResult
[index
++]) {
1856 errln("Incorrect break given by thai line break iterator. Expected: %d Got: %d", expectedLineResult
[index
-1], p
);
// Test data for TestTailoredBreaks. Each sample has escaped text plus two offset
// tables: "TOffsets" are the breaks expected from the tailored (locale-specific)
// iterator, "ROffsets" those expected from the root-locale iterator.
1863 // UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX"
1864 // Words don't include colon or period (cldrbug #1969).
1865 static const char posxWordText
[] = "Can't have breaks in xx:yy or struct.field for CS-types.";
1866 static const int32_t posxWordTOffsets
[] = { 5, 6, 10, 11, 17, 18, 20, 21, 23, 24, 26, 27, 29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56 };
1867 static const int32_t posxWordROffsets
[] = { 5, 6, 10, 11, 17, 18, 20, 21, 26, 27, 29, 30, 42, 43, 46, 47, 49, 50, 55, 56 };
1869 // UBreakIteratorType UBRK_WORD, Locale "ja"
1870 // Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009).
1871 static const char jaWordText
[] = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"
1872 "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";
1873 static const int32_t jaWordTOffsets
[] = { 2, 3, 7, 8, 14, 17, 18, 20, 21, 24, 27, 28 };
1874 static const int32_t jaWordROffsets
[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28 };
1876 // UBreakIteratorType UBRK_SENTENCE, Locale "el"
1877 // Add break after Greek question mark (cldrbug #2069).
1878 static const char elSentText
[] = "\\u0391\\u03B2, \\u03B3\\u03B4; \\u0395 \\u03B6\\u03B7\\u037E \\u0398 \\u03B9\\u03BA. "
1879 "\\u039B\\u03BC \\u03BD\\u03BE! \\u039F\\u03C0, \\u03A1\\u03C2? \\u03A3";
1880 static const int32_t elSentTOffsets
[] = { 8, 14, 20, 27, 35, 36 };
1881 static const int32_t elSentROffsets
[] = { 20, 27, 35, 36 };
1883 // UBreakIteratorType UBRK_CHARACTER, Locale "th"
1884 // Clusters should not include spacing Thai/Lao vowels (prefix or postfix), except for [SARA] AM (cldrbug #2161).
1885 static const char thCharText
[] = "\\u0E01\\u0E23\\u0E30\\u0E17\\u0E48\\u0E2D\\u0E21\\u0E23\\u0E08\\u0E19\\u0E32 "
1886 "(\\u0E2A\\u0E38\\u0E0A\\u0E32\\u0E15\\u0E34-\\u0E08\\u0E38\\u0E11\\u0E32\\u0E21\\u0E32\\u0E28) "
1887 "\\u0E40\\u0E14\\u0E47\\u0E01\\u0E21\\u0E35\\u0E1B\\u0E31\\u0E0D\\u0E2B\\u0E32 ";
1888 static const int32_t thCharTOffsets
[] = { 1, 2, 3, 5, 6, 7, 8, 9, 10, 11,
1889 12, 13, 15, 16, 17, 19, 20, 22, 23, 24, 25, 26, 27, 28,
1890 29, 30, 32, 33, 35, 37, 38, 39, 40, 41 };
1891 static const int32_t thCharROffsets
[] = { 1, 3, 5, 6, 7, 8, 9, 11,
1892 12, 13, 15, 17, 19, 20, 22, 24, 26, 27, 28,
1893 29, 32, 33, 35, 37, 38, 40, 41 };
// One test case: iterator type, locale name, escaped sample text, and the
// expected offsets (pointer + count) for the tailored and root iterators.
1896 UBreakIteratorType type
;
1897 const char * locale
;
1898 const char * escapedText
;
1899 const int32_t * tailoredOffsets
;
1900 int32_t tailoredOffsetsCount
;
1901 const int32_t * rootOffsets
;
1902 int32_t rootOffsetsCount
;
1903 } TailoredBreakItem
;
// Expands to two initializers: the array itself and its element count.
1905 #define ARRAY_PTR_LEN(array) (array),(sizeof(array)/sizeof(array[0]))
// Table of test cases; the entry whose escapedText is NULL terminates the list.
1907 static const TailoredBreakItem tbItems
[] = {
1908 { UBRK_WORD
, "en_US_POSIX", posxWordText
, ARRAY_PTR_LEN(posxWordTOffsets
), ARRAY_PTR_LEN(posxWordROffsets
) },
1909 { UBRK_WORD
, "ja", jaWordText
, ARRAY_PTR_LEN(jaWordTOffsets
), ARRAY_PTR_LEN(jaWordROffsets
) },
1910 { UBRK_SENTENCE
, "el", elSentText
, ARRAY_PTR_LEN(elSentTOffsets
), ARRAY_PTR_LEN(elSentROffsets
) },
1911 { UBRK_CHARACTER
, "th", thCharText
, ARRAY_PTR_LEN(thCharTOffsets
), ARRAY_PTR_LEN(thCharROffsets
) },
1912 { UBRK_CHARACTER
, NULL
, NULL
, NULL
,0, NULL
,0 } // terminator
//
//  formatOffsets   Format a list of break offsets as text, each one preceded by a
//                  single space (e.g. " 1 22 333"), for use in error messages.
//
//     buffer   destination; always NUL terminated on return when buflen > 0.
//     buflen   capacity of buffer in chars, including the terminating NUL.
//     count    number of entries in offsets. Zero produces an empty string.
//     offsets  the values to format.
//
//  Output never exceeds buflen. An offset that does not fit completely is
//  dropped, along with all offsets after it, so no partial number is shown.
//
static void formatOffsets(char* buffer, int32_t buflen, int32_t count, const int32_t* offsets) {
    if (buffer == NULL || buflen <= 0) {
        return;
    }
    buffer[0] = 0;     // callers print the buffer with %s even when count is zero
    while (count-- > 0) {
        int writeCount = snprintf(buffer, (size_t)buflen, " %d", (int)*offsets++);
        if (writeCount < 0 || writeCount >= buflen) {
            // Formatting error or truncation: discard the partial entry and stop.
            *buffer = 0;
            break;
        }
        buffer += writeCount;
        buflen -= writeCount;
    }
}
// Capacity of the foundOffsets array in TBTest; forward iteration stops
// collecting once this many offsets have been recorded.
1924 enum { kMaxOffsetCount
= 128 };
// TBTest: helper for TestTailoredBreaks. Sets brkitr's text to the unescaped
// form of escapedText, collects the offsets returned by repeated next() calls
// and compares them with expectOffsets. It then walks backwards with previous()
// and checks each offset against the ones found going forward.
//   type, locale   used only in error messages.
1926 void RBBITest::TBTest(BreakIterator
* brkitr
, int type
, const char *locale
, const char* escapedText
, const int32_t *expectOffsets
, int32_t expectOffsetsCount
) {
1927 brkitr
->setText( CharsToUnicodeString(escapedText
) );
1928 int32_t foundOffsets
[kMaxOffsetCount
];
1929 int32_t offset
, foundOffsetsCount
= 0;
1930 // do forwards iteration test
// The capacity check comes first so next() is not called once the array is full.
1931 while ( foundOffsetsCount
< kMaxOffsetCount
&& (offset
= brkitr
->next()) != BreakIterator::DONE
) {
1932 foundOffsets
[foundOffsetsCount
++] = offset
;
// memcmp is evaluated only when the counts match, so it never reads past
// either array.
1934 if ( foundOffsetsCount
!= expectOffsetsCount
|| memcmp(expectOffsets
, foundOffsets
, foundOffsetsCount
*sizeof(foundOffsets
[0])) != 0 ) {
1935 // log error for forwards test
1936 char formatExpect
[512], formatFound
[512];
1937 formatOffsets(formatExpect
, sizeof(formatExpect
), expectOffsetsCount
, expectOffsets
);
1938 formatOffsets(formatFound
, sizeof(formatFound
), foundOffsetsCount
, foundOffsets
);
1939 errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found %d offsets fwd:%s\n",
1940 type
, locale
, escapedText
, expectOffsetsCount
, formatExpect
, foundOffsetsCount
, formatFound
);
1942 // do backwards iteration test
1943 --foundOffsetsCount
; // back off one from the end offset
1944 while ( foundOffsetsCount
> 0 ) {
1945 offset
= brkitr
->previous();
1946 if ( offset
!= foundOffsets
[--foundOffsetsCount
] ) {
1947 // log error for backwards test
1948 char formatExpect
[512];
1949 formatOffsets(formatExpect
, sizeof(formatExpect
), expectOffsetsCount
, expectOffsets
);
1950 errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found rev offset %d where expect %d\n",
1951 type
, locale
, escapedText
, expectOffsetsCount
, formatExpect
, offset
, foundOffsets
[foundOffsetsCount
]);
// TestTailoredBreaks: for every entry in tbItems, creates a break iterator of
// the entry's type for both the entry's locale and the root locale, and runs
// TBTest on each with the corresponding expected offsets.
1958 void RBBITest::TestTailoredBreaks() {
1959 const TailoredBreakItem
* tbItemPtr
;
1960 Locale rootLocale
= Locale("root");
// The table ends at the entry whose escapedText is NULL.
1961 for (tbItemPtr
= tbItems
; tbItemPtr
->escapedText
!= NULL
; ++tbItemPtr
) {
1962 Locale testLocale
= Locale(tbItemPtr
->locale
);
1963 BreakIterator
* tailoredBrkiter
= NULL
;
1964 BreakIterator
* rootBrkiter
= NULL
;
1965 UErrorCode status
= U_ZERO_ERROR
;
// Create the pair of iterators matching the requested type.
1966 switch (tbItemPtr
->type
) {
1967 case UBRK_CHARACTER
:
1968 tailoredBrkiter
= BreakIterator::createCharacterInstance(testLocale
, status
);
1969 rootBrkiter
= BreakIterator::createCharacterInstance(rootLocale
, status
);
1972 tailoredBrkiter
= BreakIterator::createWordInstance(testLocale
, status
);
1973 rootBrkiter
= BreakIterator::createWordInstance(rootLocale
, status
);
1976 tailoredBrkiter
= BreakIterator::createLineInstance(testLocale
, status
);
1977 rootBrkiter
= BreakIterator::createLineInstance(rootLocale
, status
);
1980 tailoredBrkiter
= BreakIterator::createSentenceInstance(testLocale
, status
);
1981 rootBrkiter
= BreakIterator::createSentenceInstance(rootLocale
, status
);
// Types not handled above are reported as unsupported.
1984 status
= U_UNSUPPORTED_ERROR
;
1987 if (U_FAILURE(status
)) {
1988 errcheckln(status
, "BreakIterator create failed for type %d, locales root or %s - Error: %s", (int)(tbItemPtr
->type
), tbItemPtr
->locale
, u_errorName(status
));
// Same text, two iterators, two sets of expected offsets.
1991 TBTest(tailoredBrkiter
, (int)(tbItemPtr
->type
), tbItemPtr
->locale
, tbItemPtr
->escapedText
, tbItemPtr
->tailoredOffsets
, tbItemPtr
->tailoredOffsetsCount
);
1992 TBTest(rootBrkiter
, (int)(tbItemPtr
->type
), "root", tbItemPtr
->escapedText
, tbItemPtr
->rootOffsets
, tbItemPtr
->rootOffsetsCount
);
1995 delete tailoredBrkiter
;
2000 //-------------------------------------------------------------------------------
2002 // TestDictRules create a break iterator from source rules that includes a
2003 // dictionary range. Regression for bug #7130. Source rules
2004 // do not declare a break iterator type (word, line, sentence, etc.),
2005 // but the dictionary code, without a type, would loop.
2007 //-------------------------------------------------------------------------------
// TestDictRules: builds a RuleBasedBreakIterator directly from source rules in
// which $dictionary is the set [a-z], then iterates with next() inside a loop
// capped at 10 passes, so a looping iterator cannot hang the test. The test
// asserts that DONE is reached when loops == 1.
2008 void RBBITest::TestDictRules() {
2009 const char *rules
= "$dictionary = [a-z]; \n"
2011 "$dictionary $dictionary; \n"
2013 "$dictionary $dictionary; \n";
2014 const char *text
= "aa";
2015 UErrorCode status
= U_ZERO_ERROR
;
2016 UParseError parseError
;
2018 RuleBasedBreakIterator
bi(rules
, parseError
, status
);
2019 if (U_SUCCESS(status
)) {
2020 UnicodeString utext
= text
;
// Upper bound on iterations: protects against the infinite loop this test
// is a regression check for.
2024 for (loops
= 0; loops
<10; loops
++) {
2025 position
= bi
.next();
2026 if (position
== RuleBasedBreakIterator::DONE
) {
2030 TEST_ASSERT(loops
== 1);
// Rule compilation failed; reported as a data error.
2032 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status
));
2038 //-------------------------------------------------------------------------------
2040 // ReadAndConvertFile Read a text data file, convert it to UChars, and
2041 // return the data in one big UChar * buffer, which the caller must delete.
2044 // fileName: the name of the file, with no directory part. The test data directory
2046 // ulen an out parameter, receives the actual length (in UChars) of the file data.
2047 // encoding The file encoding. If the file contains a BOM, that will override the encoding
2048 // specified here. The BOM, if it exists, will be stripped from the returned data.
2049 // Pass NULL for the system default encoding.
2052 // The file data, converted to UChar.
2053 // The caller must delete this when done with
2054 // delete [] theBuffer;
2056 // TODO: This is a clone of RegexTest::ReadAndConvertFile.
2057 // Move this function to some common place.
2059 //--------------------------------------------------------------------------------
// Reads the named file in binary mode, honors a leading BOM if one is found,
// and converts the contents to UChars with an ICU converter. ulen receives the
// converted length; status carries any failure back to the caller.
2060 UChar
*RBBITest::ReadAndConvertFile(const char *fileName
, int &ulen
, const char *encoding
, UErrorCode
&status
) {
2061 UChar
*retPtr
= NULL
;
2062 char *fileBuf
= NULL
;
2063 UConverter
* conv
= NULL
;
// Do nothing if the caller already has a failure recorded in status.
2067 if (U_FAILURE(status
)) {
2074 f
= fopen(fileName
, "rb");
2076 dataerrln("Error opening test data file %s\n", fileName
);
2077 status
= U_FILE_ACCESS_ERROR
;
// Find the file size by seeking to the end, then rewind and read the whole file.
// NOTE(review): fileSize is used to size the allocation before the
// fileSize <= 0 check further down - confirm a failed ftell cannot reach new[].
2086 fseek( f
, 0, SEEK_END
);
2087 fileSize
= ftell(f
);
2088 fileBuf
= new char[fileSize
];
2089 fseek(f
, 0, SEEK_SET
);
2090 amt_read
= fread(fileBuf
, 1, fileSize
, f
);
// A short read or an empty file is treated as an error.
2091 if (amt_read
!= fileSize
|| fileSize
<= 0) {
2092 errln("Error reading test data file.");
2093 goto cleanUpAndReturn
;
2097 // Look for a Unicode Signature (BOM) on the data just read
2099 int32_t signatureLength
;
2100 const char * fileBufC
;
2101 const char* bomEncoding
;
2104 bomEncoding
= ucnv_detectUnicodeSignature(
2105 fileBuf
, fileSize
, &signatureLength
, &status
);
// A signature was found: skip past it, shrink the byte count to match, and
// let the detected encoding replace the one supplied by the caller.
2106 if(bomEncoding
!=NULL
){
2107 fileBufC
+= signatureLength
;
2108 fileSize
-= signatureLength
;
2109 encoding
= bomEncoding
;
2113 // Open a converter to take the rule file to UTF-16
2115 conv
= ucnv_open(encoding
, &status
);
2116 if (U_FAILURE(status
)) {
2117 goto cleanUpAndReturn
;
2121 // Convert the rules to UChar.
2122 // Preflight first to determine required buffer size.
2124 ulen
= ucnv_toUChars(conv
,
2130 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
2131 // Buffer Overflow is expected from the preflight operation.
2132 status
= U_ZERO_ERROR
;
// Allocate one UChar more than the preflighted length
// (presumably room for a terminator - confirm).
2134 retPtr
= new UChar
[ulen
+1];
2147 if (U_FAILURE(status
)) {
2148 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
2158 //--------------------------------------------------------------------------------------------
2160 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
2162 //-------------------------------------------------------------------------------------------
// Creates the character, word, sentence and line break iterators for the
// English locale in turn and runs each one against its matching Unicode
// boundary test data file.
2163 void RBBITest::TestUnicodeFiles() {
2164 RuleBasedBreakIterator
*bi
;
2165 UErrorCode status
= U_ZERO_ERROR
;
// Grapheme cluster (character) boundaries.
// NOTE(review): the factory result is cast to RuleBasedBreakIterator* with a
// C-style cast here and in the three cases below.
2167 bi
= (RuleBasedBreakIterator
*)BreakIterator::createCharacterInstance(Locale::getEnglish(), status
);
2168 TEST_ASSERT_SUCCESS(status
);
2169 if (U_SUCCESS(status
)) {
2170 runUnicodeTestData("GraphemeBreakTest.txt", bi
);
// Word boundaries.
2174 bi
= (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getEnglish(), status
);
2175 TEST_ASSERT_SUCCESS(status
);
2176 if (U_SUCCESS(status
)) {
2177 runUnicodeTestData("WordBreakTest.txt", bi
);
// Sentence boundaries.
2181 bi
= (RuleBasedBreakIterator
*)BreakIterator::createSentenceInstance(Locale::getEnglish(), status
);
2182 TEST_ASSERT_SUCCESS(status
);
2183 if (U_SUCCESS(status
)) {
2184 runUnicodeTestData("SentenceBreakTest.txt", bi
);
// Line break opportunities.
2188 bi
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale::getEnglish(), status
);
2189 TEST_ASSERT_SUCCESS(status
);
2190 if (U_SUCCESS(status
)) {
2191 runUnicodeTestData("LineBreakTest.txt", bi
);
2197 //--------------------------------------------------------------------------------------------
2199 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
2201 //-------------------------------------------------------------------------------------------
// Reads one Unicode boundary test data file, tokenizes it with a regular
// expression, and hands every parsed test case to checkUnicodeTestCase().
//   fileName  data file name, appended to the source test data directory
//   bi        break iterator under test
2202 void RBBITest::runUnicodeTestData(const char *fileName
, RuleBasedBreakIterator
*bi
) {
2203 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
2204 // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb.
2205 UVersionInfo icu4700
= { 4, 7, 0, 0 };
2206 UBool isICUVersionPast46
= isICUVersionAtLeast(icu4700
);
2207 UBool isLineBreak
= 0 == strcmp(fileName
, "LineBreakTest.txt");
2208 UErrorCode status
= U_ZERO_ERROR
;
2211 // Open and read the test data file, put it into a UnicodeString.
2213 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
2214 char testFileName
[1000];
// NOTE(review): this guard only compares the directory length with the buffer
// size; the length of fileName appended by strcat below is not included.
2215 if (testDataDirectory
== NULL
|| strlen(testDataDirectory
) >= sizeof(testFileName
)) {
2216 dataerrln("Can't open test data. Path too long.");
2219 strcpy(testFileName
, testDataDirectory
);
2220 strcat(testFileName
, fileName
);
2222 logln("Opening data file %s\n", fileName
);
2225 UChar
*testFile
= ReadAndConvertFile(testFileName
, len
, "UTF-8", status
);
// U_FILE_ACCESS_ERROR is excluded from the assertions below; every other
// outcome must be a success with a non-NULL buffer.
2226 if (status
!= U_FILE_ACCESS_ERROR
) {
2227 TEST_ASSERT_SUCCESS(status
);
2228 TEST_ASSERT(testFile
!= NULL
);
2230 if (U_FAILURE(status
) || testFile
== NULL
) {
2231 return; /* something went wrong, error already output */
// Wrap the converted buffer and its length in a UnicodeString for the matcher.
2233 UnicodeString
testFileAsString(TRUE
, testFile
, len
);
2236 // Parse the test data file using a regular expression.
2237 // Each kind of token is recognized in its own capture group; what type of item was scanned
2238 // is identified by which group had a match.
2240 // Capture Group # 1 2 3 4 5
2241 // Parses this item: divide x hex digits comment \n unrecognized \n
2243 UnicodeString
tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV
);
2244 RegexMatcher
tokenMatcher(tokenExpr
, testFileAsString
, UREGEX_MULTILINE
| UREGEX_DOTALL
, status
);
2245 UnicodeString testString
;
2246 UVector32
breakPositions(status
);
2248 TEST_ASSERT_SUCCESS(status
);
2249 if (U_FAILURE(status
)) {
2254 // Scan through each test case, building up the string to be broken in testString,
2255 // and the positions that should be boundaries in the breakPositions vector.
2258 while (tokenMatcher
.find()) {
2259 if(tokenMatcher
.hitEnd()) {
2260 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for.
2261 This occurred when the text file was corrupt (wasn't marked as UTF-8)
2262 and caused an infinite loop here on EBCDIC systems!
2264 fprintf(stderr
,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName
, ++spin
);
2267 if (tokenMatcher
.start(1, status
) >= 0) {
2268 // Scanned a divide sign, indicating a break position in the test data.
2269 if (testString
.length()>0) {
2270 breakPositions
.addElement(testString
.length(), status
);
2273 else if (tokenMatcher
.start(2, status
) >= 0) {
2274 // Scanned an 'x', meaning no break at this position in the test data
2275 // Nothing to be done here.
2277 else if (tokenMatcher
.start(3, status
) >= 0) {
2278 // Scanned Hex digits. Convert them to binary, append to the character data string.
2279 const UnicodeString
&hexNumber
= tokenMatcher
.group(3, status
);
2280 int length
= hexNumber
.length();
2283 hexNumber
.extract (0, length
, buf
, sizeof(buf
), US_INV
);
2284 UChar32 c
= (UChar32
)strtol(buf
, NULL
, 16);
2286 testString
.append(c
);
2288 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
2289 fileName
, lineNumber
);
2292 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
2293 fileName
, lineNumber
);
2296 else if (tokenMatcher
.start(4, status
) >= 0) {
2297 // Scanned to end of a line, possibly skipping over a comment in the process.
2298 // If the line from the file contained test data, run the test now.
2300 if (testString
.length() > 0) {
2301 // TODO(andy): Remove this time bomb code.
// Line-break cases on data lines 4658..4758 are skipped unless the ICU
// version is at least 4.7; all other cases always run.
2302 if (!isLineBreak
|| isICUVersionPast46
|| !(4658 <= lineNumber
&& lineNumber
<= 4758)) {
2303 checkUnicodeTestCase(fileName
, lineNumber
, testString
, &breakPositions
, bi
);
2307 // Clear out this test case.
2308 // The string and breakPositions vector will be refilled as the next
2309 // test case is parsed.
2310 testString
.remove();
2311 breakPositions
.removeAllElements();
2314 // Scanner catchall. Something unrecognized appeared on the line.
2316 UnicodeString uToken
= tokenMatcher
.group(0, status
);
2317 uToken
.extract(0, uToken
.length(), token
, (uint32_t)sizeof(token
));
// Force NUL termination of the extracted token before it is printed.
2318 token
[sizeof(token
)-1] = 0;
2319 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName
, lineNumber
, token
);
2321 // Clean up, in preparation for continuing with the next line.
2322 testString
.remove();
2323 breakPositions
.removeAllElements();
2326 TEST_ASSERT_SUCCESS(status
);
2327 if (U_FAILURE(status
)) {
2333 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
2336 //--------------------------------------------------------------------------------------------
2338 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
2339 // test data files. Do only a simple, forward-only check -
2340 // this test is mostly to check that ICU and the Unicode
2341 // data agree with each other.
2343 //--------------------------------------------------------------------------------------------
// Compares the breaks produced by bi on testString with the expected
// positions in breakPositions, reporting every mismatch through errln
// together with the test file name and line number.
2344 void RBBITest::checkUnicodeTestCase(const char *testFileName
, int lineNumber
,
2345 const UnicodeString
&testString
, // Text data to be broken
2346 UVector32
*breakPositions
, // Positions where breaks should be found.
2347 RuleBasedBreakIterator
*bi
) {
2348 int32_t pos
; // Break Position in the test string
2349 int32_t expectedI
= 0; // Index of expected break position in the vector of expected results.
2350 int32_t expectedPos
; // Expected break position (index into test string)
2352 bi
->setText(testString
);
// Walk the iterator's breaks until it reports DONE.
2356 while (pos
!= BreakIterator::DONE
) {
// All expected positions are used up, yet the iterator produced another break.
2357 if (expectedI
>= breakPositions
->size()) {
2358 errln("Test file \"%s\", line %d, unexpected break found at position %d",
2359 testFileName
, lineNumber
, pos
);
2362 expectedPos
= breakPositions
->elementAti(expectedI
);
// The iterator broke before the next expected position.
2363 if (pos
< expectedPos
) {
2364 errln("Test file \"%s\", line %d, unexpected break found at position %d",
2365 testFileName
, lineNumber
, pos
);
// The iterator went past an expected position without breaking there.
2368 if (pos
> expectedPos
) {
2369 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
2370 testFileName
, lineNumber
, expectedPos
);
// The iterator finished while expected positions are still outstanding.
2377 if (pos
==BreakIterator::DONE
&& expectedI
<breakPositions
->size()) {
2378 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
2379 testFileName
, lineNumber
, breakPositions
->elementAti(expectedI
));
2385 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
2386 //---------------------------------------------------------------------------------------
2388 // class RBBIMonkeyKind
2390 // Monkey Test for Break Iteration
2391 // Abstract interface class. Concrete derived classes independently
2392 // implement the break rules for different iterator types.
2394 // The Monkey Test itself doesn't know which type of break iterator it is
2395 // testing, but works purely in terms of the interface defined here.
2397 //---------------------------------------------------------------------------------------
// Abstract base for the per-break-type reference implementations used by the
// monkey test. All three operations are pure virtual.
2398 class RBBIMonkeyKind
{
2400 // Return a UVector of UnicodeSets, representing the character classes used
2401 // for this type of iterator.
2402 virtual UVector
*charClasses() = 0;
2404 // Set the test text on which subsequent calls to next() will operate
2405 virtual void setText(const UnicodeString
&s
) = 0;
2407 // Find the next break position, starting from the prev break position, or from zero.
2408 // Return -1 after reaching end of string.
2409 virtual int32_t next(int32_t i
) = 0;
2411 virtual ~RBBIMonkeyKind();
// Error status recorded by the constructors; the derived next() implementations
// test it with U_FAILURE before doing any work.
2412 UErrorCode deferredStatus
;
// Base constructor: starts with no deferred error recorded.
2421 RBBIMonkeyKind::RBBIMonkeyKind() {
2422 deferredStatus
= U_ZERO_ERROR
;
// Out-of-line definition of the destructor that the class declares virtual.
2425 RBBIMonkeyKind::~RBBIMonkeyKind() {
2429 //----------------------------------------------------------------------------------------
2431 // Random Numbers. Similar to standard lib rand() and srand()
2432 // Not using library to
2433 // 1. Get same results on all platforms.
2434 // 2. Get access to current seed, to more easily reproduce failures.
2436 //---------------------------------------------------------------------------------------
// Current generator state; file-static, starts at 1.
2437 static uint32_t m_seed
= 1;
// Advances m_seed with a linear congruential step (multiplier 1103515245,
// increment 12345) and returns (m_seed / 65536) % 32768, a value in 0..32767.
2439 static uint32_t m_rand()
2441 m_seed
= m_seed
* 1103515245 + 12345;
2442 return (uint32_t)(m_seed
/65536) % 32768;
2446 //------------------------------------------------------------------------------------------
2448 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
2449 // of RBBIMonkeyKind.
2451 //------------------------------------------------------------------------------------------
// Reference implementation of grapheme cluster boundaries for the monkey test.
2452 class RBBICharMonkey
: public RBBIMonkeyKind
{
2455 virtual ~RBBICharMonkey();
2456 virtual UVector
*charClasses();
2457 virtual void setText(const UnicodeString
&s
);
2458 virtual int32_t next(int32_t i
);
// Character classes consulted by next(); each is allocated in the constructor.
2462 UnicodeSet
*fCRLFSet
;
2463 UnicodeSet
*fControlSet
;
2464 UnicodeSet
*fExtendSet
;
2465 UnicodeSet
*fPrependSet
;
2466 UnicodeSet
*fSpacingSet
;
2471 UnicodeSet
*fLVTSet
;
// Union of the L, V, T, LV and LVT sets (built in the constructor).
2472 UnicodeSet
*fHangulSet
;
// Full code point range U+0000..U+10FFFF.
2473 UnicodeSet
*fAnySet
;
// Text examined by next().
2475 const UnicodeString
*fText
;
// Builds the Grapheme_Cluster_Break character classes from property patterns
// and registers them in fSets. Any failure is stored in deferredStatus.
2479 RBBICharMonkey::RBBICharMonkey() {
2480 UErrorCode status
= U_ZERO_ERROR
;
2484 fCRLFSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status
);
2485 fControlSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status
);
2486 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status
);
2487 fPrependSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status
);
2488 fSpacingSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status
);
2489 fLSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status
);
2490 fVSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status
);
2491 fTSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status
);
2492 fLVSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status
);
2493 fLVTSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status
);
// fHangulSet starts empty and becomes the union of the five Hangul syllable classes.
2494 fHangulSet
= new UnicodeSet();
2495 fHangulSet
->addAll(*fLSet
);
2496 fHangulSet
->addAll(*fVSet
);
2497 fHangulSet
->addAll(*fTSet
);
2498 fHangulSet
->addAll(*fLVSet
);
2499 fHangulSet
->addAll(*fLVTSet
);
2500 fAnySet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0000-\\U0010ffff]"), status
);
// Register the classes in fSets. The individual L/V/T/LV/LVT sets are not
// added on their own; only their union, fHangulSet, is.
2502 fSets
= new UVector(status
);
2503 fSets
->addElement(fCRLFSet
, status
);
2504 fSets
->addElement(fControlSet
, status
);
2505 fSets
->addElement(fExtendSet
, status
);
2506 fSets
->addElement(fPrependSet
, status
);
2507 fSets
->addElement(fSpacingSet
, status
);
2508 fSets
->addElement(fHangulSet
, status
);
2509 fSets
->addElement(fAnySet
, status
);
// Keep any failure so next() can refuse to run.
2510 if (U_FAILURE(status
)) {
2511 deferredStatus
= status
;
// Grapheme-cluster implementation of the RBBIMonkeyKind::setText interface.
2516 void RBBICharMonkey::setText(const UnicodeString
&s
) {
// Reference implementation of the grapheme cluster break rules: starting from
// prevPos, tests candidate positions against the GB rules listed below.
2522 int32_t RBBICharMonkey::next(int32_t prevPos
) {
2523 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2524 // break position being tested. The candidate break
2525 // location is before p2.
2529 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
// Refuse to run if construction of the character classes failed.
2531 if (U_FAILURE(deferredStatus
)) {
2535 // Previous break at end of string. return DONE.
2536 if (prevPos
>= fText
->length()) {
2539 p0
= p1
= p2
= p3
= prevPos
;
2540 c3
= fText
->char32At(prevPos
);
2543 // Loop runs once per "significant" character position in the input text.
2545 // Move all of the positions forward in the input string.
2550 // Advance p3 by one codepoint
2551 p3
= fText
->moveIndex32(p3
, 1);
2552 c3
= fText
->char32At(p3
);
2555 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2558 if (p2
== fText
->length()) {
2559 // Reached end of string. Always a break position.
2564 // No Extend or Format characters may appear between the CR and LF,
2565 // which requires the additional check for p2 immediately following p1.
2567 if (c1
==0x0D && c2
==0x0A && p1
==(p2
-1)) {
2571 // Rule (GB4). ( Control | CR | LF ) <break>
2572 if (fControlSet
->contains(c1
) ||
2578 // Rule (GB5) <break> ( Control | CR | LF )
2580 if (fControlSet
->contains(c2
) ||
2587 // Rule (GB6) L x ( L | V | LV | LVT )
2588 if (fLSet
->contains(c1
) &&
2589 (fLSet
->contains(c2
) ||
2590 fVSet
->contains(c2
) ||
2591 fLVSet
->contains(c2
) ||
2592 fLVTSet
->contains(c2
))) {
2596 // Rule (GB7) ( LV | V ) x ( V | T )
2597 if ((fLVSet
->contains(c1
) || fVSet
->contains(c1
)) &&
2598 (fVSet
->contains(c2
) || fTSet
->contains(c2
))) {
2602 // Rule (GB8) ( LVT | T) x T
2603 if ((fLVTSet
->contains(c1
) || fTSet
->contains(c1
)) &&
2604 fTSet
->contains(c2
)) {
2608 // Rule (GB9) x Extend
2609 if (fExtendSet
->contains(c2
)) {
2613 // Rule (GB9a) x SpacingMark
2614 if (fSpacingSet
->contains(c2
)) {
2618 // Rule (GB9b) Prepend x
2619 if (fPrependSet
->contains(c1
)) {
2623 // Rule (GB10) Any <break> Any
// Grapheme-cluster implementation of the RBBIMonkeyKind::charClasses interface.
2633 UVector
*RBBICharMonkey::charClasses() {
// Destructor for the grapheme-cluster monkey (declared virtual in the class).
2638 RBBICharMonkey::~RBBICharMonkey() {
2654 //------------------------------------------------------------------------------------------
2656 // class RBBIWordMonkey Word Break specific implementation
2657 // of RBBIMonkeyKind.
2659 //------------------------------------------------------------------------------------------
// Reference implementation of word boundaries for the monkey test.
2660 class RBBIWordMonkey
: public RBBIMonkeyKind
{
2663 virtual ~RBBIWordMonkey();
2664 virtual UVector
*charClasses();
2665 virtual void setText(const UnicodeString
&s
);
2666 virtual int32_t next(int32_t i
);
// Word_Break character classes consulted by next(); allocated in the constructor.
2672 UnicodeSet
*fNewlineSet
;
2673 UnicodeSet
*fKatakanaSet
;
2674 UnicodeSet
*fALetterSet
;
2675 UnicodeSet
*fMidNumLetSet
;
2676 UnicodeSet
*fMidLetterSet
;
2677 UnicodeSet
*fMidNumSet
;
2678 UnicodeSet
*fNumericSet
;
2679 UnicodeSet
*fFormatSet
;
// Complement of the other classes, computed in the constructor.
2680 UnicodeSet
*fOtherSet
;
2681 UnicodeSet
*fExtendSet
;
2682 UnicodeSet
*fExtendNumLetSet
;
2684 RegexMatcher
*fMatcher
;
// Text examined by next().
2686 const UnicodeString
*fText
;
// Builds the Word_Break character classes from property patterns, derives
// fOtherSet as what is left over, and registers the classes in fSets.
// Any failure is stored in deferredStatus.
2690 RBBIWordMonkey::RBBIWordMonkey()
2692 UErrorCode status
= U_ZERO_ERROR
;
2694 fSets
= new UVector(status
);
2696 fCRSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status
);
2697 fLFSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status
);
2698 fNewlineSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status
);
2699 fALetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status
);
2700 fKatakanaSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status
);
2701 fMidNumLetSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status
);
2702 fMidLetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status
);
2703 fMidNumSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status
);
2704 fNumericSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status
);
2705 fFormatSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status
);
2706 fExtendNumLetSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status
);
2707 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status
);
2709 fOtherSet
= new UnicodeSet();
// Record a failure from building the property sets before deriving fOtherSet.
2710 if(U_FAILURE(status
)) {
2711 deferredStatus
= status
;
// fOtherSet = everything, minus each named class below.
// NOTE(review): fMidNumLetSet is not among the sets removed here, so its
// members stay in fOtherSet - confirm whether that is intentional.
2715 fOtherSet
->complement();
2716 fOtherSet
->removeAll(*fCRSet
);
2717 fOtherSet
->removeAll(*fLFSet
);
2718 fOtherSet
->removeAll(*fNewlineSet
);
2719 fOtherSet
->removeAll(*fKatakanaSet
);
2720 fOtherSet
->removeAll(*fALetterSet
);
2721 fOtherSet
->removeAll(*fMidLetterSet
);
2722 fOtherSet
->removeAll(*fMidNumSet
);
2723 fOtherSet
->removeAll(*fNumericSet
);
2724 fOtherSet
->removeAll(*fExtendNumLetSet
);
2725 fOtherSet
->removeAll(*fFormatSet
);
2726 fOtherSet
->removeAll(*fExtendSet
);
2727 // Inhibit dictionary characters from being tested at all.
2728 fOtherSet
->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status
));
// Register every class, including fMidNumLetSet and fOtherSet, in fSets.
2730 fSets
->addElement(fCRSet
, status
);
2731 fSets
->addElement(fLFSet
, status
);
2732 fSets
->addElement(fNewlineSet
, status
);
2733 fSets
->addElement(fALetterSet
, status
);
2734 fSets
->addElement(fKatakanaSet
, status
);
2735 fSets
->addElement(fMidLetterSet
, status
);
2736 fSets
->addElement(fMidNumLetSet
, status
);
2737 fSets
->addElement(fMidNumSet
, status
);
2738 fSets
->addElement(fNumericSet
, status
);
2739 fSets
->addElement(fFormatSet
, status
);
2740 fSets
->addElement(fExtendSet
, status
);
2741 fSets
->addElement(fOtherSet
, status
);
2742 fSets
->addElement(fExtendNumLetSet
, status
);
// Keep any failure so next() can refuse to run.
2744 if (U_FAILURE(status
)) {
2745 deferredStatus
= status
;
// Word-break implementation of the RBBIMonkeyKind::setText interface.
2749 void RBBIWordMonkey::setText(const UnicodeString
&s
) {
// Reference implementation of the word break rules: starting from prevPos,
// tests candidate positions against the numbered rules listed below.
2754 int32_t RBBIWordMonkey::next(int32_t prevPos
) {
2755 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2756 // break position being tested. The candidate break
2757 // location is before p2.
2761 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
// Refuse to run if construction of the character classes failed.
2763 if (U_FAILURE(deferredStatus
)) {
2767 // Prev break at end of string. return DONE.
2768 if (prevPos
>= fText
->length()) {
2771 p0
= p1
= p2
= p3
= prevPos
;
2772 c3
= fText
->char32At(prevPos
);
2775 // Loop runs once per "significant" character position in the input text.
2777 // Move all of the positions forward in the input string.
2782 // Advance p3 by X(Extend | Format)* Rule 4
2783 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2785 p3
= fText
->moveIndex32(p3
, 1);
2786 c3
= fText
->char32At(p3
);
2787 if (fCRSet
->contains(c2
) || fLFSet
->contains(c2
) || fNewlineSet
->contains(c2
)) {
2791 while (fFormatSet
->contains(c3
) || fExtendSet
->contains(c3
));
2795 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2798 if (p2
== fText
->length()) {
2799 // Reached end of string. Always a break position.
2804 // No Extend or Format characters may appear between the CR and LF,
2805 // which requires the additional check for p2 immediately following p1.
// NOTE(review): unlike the grapheme-cluster version of this test, the condition
// below has no explicit p1 == p2-1 check - confirm the p3 advance logic above
// makes it unnecessary.
2807 if (c1
==0x0D && c2
==0x0A) {
2811 // Rule (3a) Break before and after newlines (including CR and LF)
2813 if (fCRSet
->contains(c1
) || fLFSet
->contains(c1
) || fNewlineSet
->contains(c1
)) {
2816 if (fCRSet
->contains(c2
) || fLFSet
->contains(c2
) || fNewlineSet
->contains(c2
)) {
2820 // Rule (5). ALetter x ALetter
2821 if (fALetterSet
->contains(c1
) &&
2822 fALetterSet
->contains(c2
)) {
2826 // Rule (6) ALetter x (MidLetter | MidNumLet) ALetter
2828 if ( fALetterSet
->contains(c1
) &&
2829 (fMidLetterSet
->contains(c2
) || fMidNumLetSet
->contains(c2
)) &&
2830 fALetterSet
->contains(c3
)) {
2835 // Rule (7) ALetter (MidLetter | MidNumLet) x ALetter
2836 if (fALetterSet
->contains(c0
) &&
2837 (fMidLetterSet
->contains(c1
) || fMidNumLetSet
->contains(c1
)) &&
2838 fALetterSet
->contains(c2
)) {
2842 // Rule (8) Numeric x Numeric
2843 if (fNumericSet
->contains(c1
) &&
2844 fNumericSet
->contains(c2
)) {
2848 // Rule (9) ALetter x Numeric
2849 if (fALetterSet
->contains(c1
) &&
2850 fNumericSet
->contains(c2
)) {
2854 // Rule (10) Numeric x ALetter
2855 if (fNumericSet
->contains(c1
) &&
2856 fALetterSet
->contains(c2
)) {
2860 // Rule (11) Numeric (MidNum | MidNumLet) x Numeric
2861 if (fNumericSet
->contains(c0
) &&
2862 (fMidNumSet
->contains(c1
) || fMidNumLetSet
->contains(c1
)) &&
2863 fNumericSet
->contains(c2
)) {
2867 // Rule (12) Numeric x (MidNum | MidNumLet) Numeric
2868 if (fNumericSet
->contains(c1
) &&
2869 (fMidNumSet
->contains(c2
) || fMidNumLetSet
->contains(c2
)) &&
2870 fNumericSet
->contains(c3
)) {
2874 // Rule (13) Katakana x Katakana
2875 if (fKatakanaSet
->contains(c1
) &&
2876 fKatakanaSet
->contains(c2
)) {
// Pattern tested: (ALetter | Numeric | Katakana | ExtendNumLet) followed by ExtendNumLet.
2881 if ((fALetterSet
->contains(c1
) || fNumericSet
->contains(c1
) ||
2882 fKatakanaSet
->contains(c1
) || fExtendNumLetSet
->contains(c1
)) &&
2883 fExtendNumLetSet
->contains(c2
)) {
// Pattern tested: ExtendNumLet followed by (ALetter | Numeric | Katakana).
2888 if (fExtendNumLetSet
->contains(c1
) &&
2889 (fALetterSet
->contains(c2
) || fNumericSet
->contains(c2
) ||
2890 fKatakanaSet
->contains(c2
))) {
2894 // Rule 14. Break found here.
// Word-break implementation of the RBBIMonkeyKind::charClasses interface.
2903 UVector
*RBBIWordMonkey::charClasses() {
// Frees character class sets that the constructor allocated with new.
2908 RBBIWordMonkey::~RBBIWordMonkey() {
2913 delete fKatakanaSet
;
2915 delete fMidNumLetSet
;
2916 delete fMidLetterSet
;
2921 delete fExtendNumLetSet
;
2928 //------------------------------------------------------------------------------------------
2930 // class RBBISentMonkey Sentence Break specific implementation
2931 // of RBBIMonkeyKind.
2933 //------------------------------------------------------------------------------------------
// Reference implementation of sentence boundaries for the monkey test.
2934 class RBBISentMonkey
: public RBBIMonkeyKind
{
2937 virtual ~RBBISentMonkey();
2938 virtual UVector
*charClasses();
2939 virtual void setText(const UnicodeString
&s
);
2940 virtual int32_t next(int32_t i
);
// Helpers used by next() to step across the text and read code points.
2942 int moveBack(int posFrom
);
2943 int moveForward(int posFrom
);
2944 UChar32
cAt(int pos
);
// Sentence_Break character classes consulted by next(); allocated in the constructor.
2948 UnicodeSet
*fSepSet
;
2949 UnicodeSet
*fFormatSet
;
2951 UnicodeSet
*fLowerSet
;
2952 UnicodeSet
*fUpperSet
;
2953 UnicodeSet
*fOLetterSet
;
2954 UnicodeSet
*fNumericSet
;
2955 UnicodeSet
*fATermSet
;
2956 UnicodeSet
*fSContinueSet
;
2957 UnicodeSet
*fSTermSet
;
2958 UnicodeSet
*fCloseSet
;
// Complement of the other classes, computed in the constructor.
2959 UnicodeSet
*fOtherSet
;
2960 UnicodeSet
*fExtendSet
;
// Text examined by next() and the helper functions.
2962 const UnicodeString
*fText
;
// Builds the Sentence_Break character classes from property patterns, derives
// fOtherSet as what is left over, and registers the classes in fSets.
// Any failure is stored in deferredStatus.
2966 RBBISentMonkey::RBBISentMonkey()
2968 UErrorCode status
= U_ZERO_ERROR
;
2970 fSets
= new UVector(status
);
2972 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2973 // set and made into character classes of their own. For the monkey impl,
2974 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2975 fSepSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status
);
2976 fFormatSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status
);
2977 fSpSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status
);
2978 fLowerSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status
);
2979 fUpperSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status
);
2980 fOLetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status
);
2981 fNumericSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status
);
2982 fATermSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status
);
2983 fSContinueSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status
);
2984 fSTermSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status
);
2985 fCloseSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status
);
2986 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status
);
2987 fOtherSet
= new UnicodeSet();
// Record a failure from building the property sets before deriving fOtherSet.
2989 if(U_FAILURE(status
)) {
2990 deferredStatus
= status
;
// fOtherSet = everything, minus each of the twelve named classes.
2994 fOtherSet
->complement();
2995 fOtherSet
->removeAll(*fSepSet
);
2996 fOtherSet
->removeAll(*fFormatSet
);
2997 fOtherSet
->removeAll(*fSpSet
);
2998 fOtherSet
->removeAll(*fLowerSet
);
2999 fOtherSet
->removeAll(*fUpperSet
);
3000 fOtherSet
->removeAll(*fOLetterSet
);
3001 fOtherSet
->removeAll(*fNumericSet
);
3002 fOtherSet
->removeAll(*fATermSet
);
3003 fOtherSet
->removeAll(*fSContinueSet
);
3004 fOtherSet
->removeAll(*fSTermSet
);
3005 fOtherSet
->removeAll(*fCloseSet
);
3006 fOtherSet
->removeAll(*fExtendSet
);
// Register every class, including fOtherSet, in fSets.
3008 fSets
->addElement(fSepSet
, status
);
3009 fSets
->addElement(fFormatSet
, status
);
3010 fSets
->addElement(fSpSet
, status
);
3011 fSets
->addElement(fLowerSet
, status
);
3012 fSets
->addElement(fUpperSet
, status
);
3013 fSets
->addElement(fOLetterSet
, status
);
3014 fSets
->addElement(fNumericSet
, status
);
3015 fSets
->addElement(fATermSet
, status
);
3016 fSets
->addElement(fSContinueSet
, status
);
3017 fSets
->addElement(fSTermSet
, status
);
3018 fSets
->addElement(fCloseSet
, status
);
3019 fSets
->addElement(fOtherSet
, status
);
3020 fSets
->addElement(fExtendSet
, status
);
// Keep any failure so next() can refuse to run.
3022 if (U_FAILURE(status
)) {
3023 deferredStatus
= status
;
// Sentence-break implementation of the RBBIMonkeyKind::setText interface.
3029 void RBBISentMonkey::setText(const UnicodeString
&s
) {
// Sentence-break implementation of the RBBIMonkeyKind::charClasses interface.
3033 UVector
*RBBISentMonkey::charClasses() {
3038 // moveBack() Find the "significant" code point preceding the index i.
3039 // Skips over ($Extend | $Format)* .
// Steps backwards through fText one code point at a time, continuing while the
// index is still above zero and the code point reached is Format or Extend.
3041 int RBBISentMonkey::moveBack(int i
) {
3048 j
= fText
->moveIndex32(j
, -1);
3049 c
= fText
->char32At(j
);
3051 while (j
>0 &&(fFormatSet
->contains(c
) || fExtendSet
->contains(c
)));
// Steps forwards through fText one code point at a time, continuing while the
// code point reached is Format or Extend.
3057 int RBBISentMonkey::moveForward(int i
) {
// At or beyond the end of the text: return the text length.
3058 if (i
>=fText
->length()) {
3059 return fText
->length();
3064 j
= fText
->moveIndex32(j
, 1);
3067 while (fFormatSet
->contains(c
) || fExtendSet
->contains(c
));
// Bounds-checked read of the code point at pos in fText; positions outside
// 0..length-1 are handled separately from the char32At() call.
3071 UChar32
RBBISentMonkey::cAt(int pos
) {
3072 if (pos
<0 || pos
>=fText
->length()) {
3075 return fText
->char32At(pos
);
// Reference implementation of the sentence break rules: starting from prevPos,
// tests candidate positions against the numbered rules listed below.
3079 int32_t RBBISentMonkey::next(int32_t prevPos
) {
3080 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
3081 // break position being tested. The candidate break
3082 // location is before p2.
3086 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
// Refuse to run if construction of the character classes failed.
3089 if (U_FAILURE(deferredStatus
)) {
3093 // Prev break at end of string. return DONE.
3094 if (prevPos
>= fText
->length()) {
3097 p0
= p1
= p2
= p3
= prevPos
;
3098 c3
= fText
->char32At(prevPos
);
3101 // Loop runs once per "significant" character position in the input text.
3103 // Move all of the positions forward in the input string.
3108 // Advance p3 by X(Extend | Format)* Rule 4
3109 p3
= moveForward(p3
);
// CR immediately followed by LF (p2 directly after p1).
3113 if (c1
==0x0d && c2
==0x0a && p2
==(p1
+1)) {
3117 // Rule (4). Sep <break>
3118 if (fSepSet
->contains(c1
)) {
3119 p2
= p1
+1; // Separators don't combine with Extend or Format.
3123 if (p2
>= fText
->length()) {
3124 // Reached end of string. Always a break position.
3128 if (p2
== prevPos
) {
3129 // Still warming up the loop. (won't work with zero length strings, but we don't care)
3133 // Rule (6). ATerm x Numeric
3134 if (fATermSet
->contains(c1
) && fNumericSet
->contains(c2
)) {
3138 // Rule (7). Upper ATerm x Upper
3139 if (fUpperSet
->contains(c0
) && fATermSet
->contains(c1
) && fUpperSet
->contains(c2
)) {
3143 // Rule (8) ATerm Close* Sp* x (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
3144 // Note: STerm | ATerm are added to the negated part of the expression by a
3145 // note to the Unicode 5.0 documents.
3147 while (fSpSet
->contains(cAt(p8
))) {
3150 while (fCloseSet
->contains(cAt(p8
))) {
3153 if (fATermSet
->contains(cAt(p8
))) {
3157 if (c
==-1 || fOLetterSet
->contains(c
) || fUpperSet
->contains(c
) ||
3158 fLowerSet
->contains(c
) || fSepSet
->contains(c
) ||
3159 fATermSet
->contains(c
) || fSTermSet
->contains(c
)) {
3162 p8
= moveForward(p8
);
3164 if (fLowerSet
->contains(cAt(p8
))) {
3169 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
3170 if (fSContinueSet
->contains(c2
) || fSTermSet
->contains(c2
) || fATermSet
->contains(c2
)) {
3172 while (fSpSet
->contains(cAt(p8
))) {
3175 while (fCloseSet
->contains(cAt(p8
))) {
3179 if (fSTermSet
->contains(c
) || fATermSet
->contains(c
)) {
3184 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
3186 while (fCloseSet
->contains(cAt(p9
))) {
3190 if ((fSTermSet
->contains(c
) || fATermSet
->contains(c
))) {
3191 if (fCloseSet
->contains(c2
) || fSpSet
->contains(c2
) || fSepSet
->contains(c2
)) {
3196 // Rule (10) (STerm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
3198 while (fSpSet
->contains(cAt(p10
))) {
3199 p10
= moveBack(p10
);
3201 while (fCloseSet
->contains(cAt(p10
))) {
3202 p10
= moveBack(p10
);
3204 if (fSTermSet
->contains(cAt(p10
)) || fATermSet
->contains(cAt(p10
))) {
3205 if (fSpSet
->contains(c2
) || fSepSet
->contains(c2
)) {
3210 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>
3212 if (fSepSet
->contains(cAt(p11
))) {
3213 p11
= moveBack(p11
);
3215 while (fSpSet
->contains(cAt(p11
))) {
3216 p11
= moveBack(p11
);
3218 while (fCloseSet
->contains(cAt(p11
))) {
3219 p11
= moveBack(p11
);
3221 if (fSTermSet
->contains(cAt(p11
)) || fATermSet
->contains(cAt(p11
))) {
3225 // Rule (12) Any x Any
3232 RBBISentMonkey::~RBBISentMonkey() {
3242 delete fSContinueSet
;
3251 //-------------------------------------------------------------------------------------------
3255 //-------------------------------------------------------------------------------------------
3257 class RBBILineMonkey
: public RBBIMonkeyKind
{
3260 virtual ~RBBILineMonkey();
3261 virtual UVector
*charClasses();
3262 virtual void setText(const UnicodeString
&s
);
3263 virtual int32_t next(int32_t i
);
3264 virtual void rule9Adjust(int32_t pos
, UChar32
*posChar
, int32_t *nextPos
, UChar32
*nextChar
);
3306 BreakIterator
*fCharBI
;
3308 const UnicodeString
*fText
;
3309 int32_t *fOrigPositions
;
3311 RegexMatcher
*fNumberMatcher
;
3312 RegexMatcher
*fLB11Matcher
;
3316 RBBILineMonkey::RBBILineMonkey()
3318 UErrorCode status
= U_ZERO_ERROR
;
3320 fSets
= new UVector(status
);
3322 fBK
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status
);
3323 fCR
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status
);
3324 fLF
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status
);
3325 fCM
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status
);
3326 fNL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status
);
3327 fWJ
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status
);
3328 fZW
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status
);
3329 fGL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status
);
3330 fCB
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status
);
3331 fSP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status
);
3332 fB2
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status
);
3333 fBA
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status
);
3334 fBB
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status
);
3335 fHY
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status
);
3336 fH2
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status
);
3337 fH3
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status
);
3338 fCL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status
);
3339 fCP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status
);
3340 fEX
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status
);
3341 fIN
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status
);
3342 fJL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status
);
3343 fJV
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status
);
3344 fJT
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status
);
3345 fNS
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status
);
3346 fOP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status
);
3347 fQU
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status
);
3348 fIS
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status
);
3349 fNU
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status
);
3350 fPO
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status
);
3351 fPR
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status
);
3352 fSY
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status
);
3353 fAI
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status
);
3354 fAL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status
);
3355 fID
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status
);
3356 fSA
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status
);
3357 fSG
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status
);
3358 fXX
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status
);
3360 if (U_FAILURE(status
)) {
3361 deferredStatus
= status
;
3363 fNumberMatcher
= NULL
;
3367 fAL
->addAll(*fXX
); // Default behavior for XX is identical to AL
3368 fAL
->addAll(*fAI
); // Default behavior for AI is identical to AL
3369 fAL
->addAll(*fSA
); // Default behavior for SA is XX, which defaults to AL
3370 fAL
->addAll(*fSG
); // Default behavior for SG is identical to AL.
3372 fSets
->addElement(fBK
, status
);
3373 fSets
->addElement(fCR
, status
);
3374 fSets
->addElement(fLF
, status
);
3375 fSets
->addElement(fCM
, status
);
3376 fSets
->addElement(fNL
, status
);
3377 fSets
->addElement(fWJ
, status
);
3378 fSets
->addElement(fZW
, status
);
3379 fSets
->addElement(fGL
, status
);
3380 fSets
->addElement(fCB
, status
);
3381 fSets
->addElement(fSP
, status
);
3382 fSets
->addElement(fB2
, status
);
3383 fSets
->addElement(fBA
, status
);
3384 fSets
->addElement(fBB
, status
);
3385 fSets
->addElement(fHY
, status
);
3386 fSets
->addElement(fH2
, status
);
3387 fSets
->addElement(fH3
, status
);
3388 fSets
->addElement(fCL
, status
);
3389 fSets
->addElement(fCP
, status
);
3390 fSets
->addElement(fEX
, status
);
3391 fSets
->addElement(fIN
, status
);
3392 fSets
->addElement(fJL
, status
);
3393 fSets
->addElement(fJT
, status
);
3394 fSets
->addElement(fJV
, status
);
3395 fSets
->addElement(fNS
, status
);
3396 fSets
->addElement(fOP
, status
);
3397 fSets
->addElement(fQU
, status
);
3398 fSets
->addElement(fIS
, status
);
3399 fSets
->addElement(fNU
, status
);
3400 fSets
->addElement(fPO
, status
);
3401 fSets
->addElement(fPR
, status
);
3402 fSets
->addElement(fSY
, status
);
3403 fSets
->addElement(fAI
, status
);
3404 fSets
->addElement(fAL
, status
);
3405 fSets
->addElement(fID
, status
);
3406 fSets
->addElement(fWJ
, status
);
3407 fSets
->addElement(fSA
, status
);
3408 fSets
->addElement(fSG
, status
);
3411 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
3412 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
3413 "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
3414 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
3415 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
3416 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
3418 fNumberMatcher
= new RegexMatcher(
3419 UnicodeString(rules
, -1, US_INV
), 0, status
);
3421 fCharBI
= BreakIterator::createCharacterInstance(Locale::getEnglish(), status
);
3423 if (U_FAILURE(status
)) {
3424 deferredStatus
= status
;
3429 void RBBILineMonkey::setText(const UnicodeString
&s
) {
3431 fCharBI
->setText(s
);
3432 fNumberMatcher
->reset(s
);
3437 // Line Break TR rules 9 and 10 implementation.
3438 // This deals with combining marks and other sequences
3439 // that must be treated as if they were something other than what they actually are.
3441 // This is factored out into a separate function because it must be applied twice for
3442 // each potential break, once to the chars before the position being checked, then
3443 // again to the text following the possible break.
3445 void RBBILineMonkey::rule9Adjust(int32_t pos
, UChar32
*posChar
, int32_t *nextPos
, UChar32
*nextChar
) {
3447 // Invalid initial position. Happens during the warmup iteration of the
3448 // main loop in next().
3452 int32_t nPos
= *nextPos
;
3454 // LB 9 Keep combining sequences together.
3455 // advance over any CM class chars. Note that Line Break CM is different
3456 // from the normal Grapheme Extend property.
3457 if (!(fSP
->contains(*posChar
) || fBK
->contains(*posChar
) || *posChar
==0x0d ||
3458 *posChar
==0x0a ||fNL
->contains(*posChar
) || fZW
->contains(*posChar
))) {
3460 *nextChar
= fText
->char32At(nPos
);
3461 if (!fCM
->contains(*nextChar
)) {
3464 nPos
= fText
->moveIndex32(nPos
, 1);
3469 // LB 9 Treat X CM* as if it were X.
3470 // No explicit action required.
3472 // LB 10 Treat any remaining combining mark as AL
3473 if (fCM
->contains(*posChar
)) {
3474 *posChar
= 0x41; // thisChar = 'A';
3477 // Push the updated nextPos and nextChar back to our caller.
3478 // This only makes a difference if posChar got bigger by consuming a
3479 // combining sequence.
3481 *nextChar
= fText
->char32At(nPos
);
3486 int32_t RBBILineMonkey::next(int32_t startPos
) {
3487 UErrorCode status
= U_ZERO_ERROR
;
3488 int32_t pos
; // Index of the char following a potential break position
3489 UChar32 thisChar
; // Character at above position "pos"
3491 int32_t prevPos
; // Index of the char preceding a potential break position
3492 UChar32 prevChar
; // Character at above position. Note that prevChar
3493 // and thisChar may not be adjacent because combining
3494 // characters between them will be ignored.
3496 int32_t nextPos
; // Index of the next character following pos.
3497 // Usually skips over combining marks.
3498 int32_t nextCPPos
; // Index of the code point following "pos."
3499 // May point to a combining mark.
3500 int32_t tPos
; // temp value.
3503 if (U_FAILURE(deferredStatus
)) {
3507 if (startPos
>= fText
->length()) {
3512 // Initial values for loop. Loop will run the first time without finding breaks,
3513 // while the invalid values shift out and the "this" and
3514 // "prev" positions are filled in with good values.
3515 pos
= prevPos
= -1; // Invalid value, serves as flag for initial loop iteration.
3516 thisChar
= prevChar
= 0;
3517 nextPos
= nextCPPos
= startPos
;
3520 // Loop runs once per position in the test text, until a break position
3524 prevChar
= thisChar
;
3527 thisChar
= fText
->char32At(pos
);
3529 nextCPPos
= fText
->moveIndex32(pos
, 1);
3530 nextPos
= nextCPPos
;
3532 // Rule LB2 - Break at end of text.
3533 if (pos
>= fText
->length()) {
3537 // Rule LB 9 - adjust for combining sequences.
3538 // We do this one out-of-order because the adjustment does not change anything
3539 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3541 rule9Adjust(prevPos
, &prevChar
, &pos
, &thisChar
);
3542 nextCPPos
= nextPos
= fText
->moveIndex32(pos
, 1);
3543 c
= fText
->char32At(nextPos
);
3544 rule9Adjust(pos
, &thisChar
, &nextPos
, &c
);
3546 // If the loop is still warming up - if we haven't shifted the initial
3547 // -1 positions out of prevPos yet - loop back to advance the
3548 // position in the input without any further looking for breaks.
3549 if (prevPos
== -1) {
3553 // LB 4 Always break after hard line breaks,
3554 if (fBK
->contains(prevChar
)) {
3558 // LB 5 Break after CR, LF, NL, but not inside CR LF
3559 if (prevChar
== 0x0d && thisChar
== 0x0a) {
3562 if (prevChar
== 0x0d ||
3568 // LB 6 Don't break before hard line breaks
3569 if (thisChar
== 0x0d || thisChar
== 0x0a || thisChar
== 0x85 ||
3570 fBK
->contains(thisChar
)) {
3575 // LB 7 Don't break before spaces or zero-width space.
3576 if (fSP
->contains(thisChar
)) {
3580 if (fZW
->contains(thisChar
)) {
3584 // LB 8 Break after zero width space
3585 if (fZW
->contains(prevChar
)) {
3589 // LB 9, 10 Already done, at top of loop.
3593 // LB 11 Do not break before or after WORD JOINER and related characters.
3597 if (fWJ
->contains(thisChar
) || fWJ
->contains(prevChar
)) {
3603 if (fGL
->contains(prevChar
)) {
3609 if (!(fSP
->contains(prevChar
) ||
3610 fBA
->contains(prevChar
) ||
3611 fHY
->contains(prevChar
) ) && fGL
->contains(thisChar
)) {
3617 // LB 13 Don't break before closings.
3618 // NU x CL, NU x CP and NU x IS are not matched here so that they will
3619 // fall into LB 17 and the more general number regular expression.
3621 if ((!fNU
->contains(prevChar
) && fCL
->contains(thisChar
)) ||
3622 (!fNU
->contains(prevChar
) && fCP
->contains(thisChar
)) ||
3623 fEX
->contains(thisChar
) ||
3624 (!fNU
->contains(prevChar
) && fIS
->contains(thisChar
)) ||
3625 (!fNU
->contains(prevChar
) && fSY
->contains(thisChar
))) {
3629 // LB 14 Don't break after OP SP*
3630 // Scan backwards, checking for this sequence.
3631 // The OP char could include combining marks, so we actually check for
3633 // Another Twist: The Rule 67 fixes may have changed a SP CM
3634 // sequence into a ID char, so before scanning back through spaces,
3635 // verify that prevChar is indeed a space. The prevChar variable
3636 // may differ from fText[prevPos]
3638 if (fSP
->contains(prevChar
)) {
3639 while (tPos
> 0 && fSP
->contains(fText
->char32At(tPos
))) {
3640 tPos
=fText
->moveIndex32(tPos
, -1);
3643 while (tPos
> 0 && fCM
->contains(fText
->char32At(tPos
))) {
3644 tPos
=fText
->moveIndex32(tPos
, -1);
3646 if (fOP
->contains(fText
->char32At(tPos
))) {
3651 // LB 15 QU SP* x OP
3652 if (fOP
->contains(thisChar
)) {
3653 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3655 while (tPos
>0 && fSP
->contains(fText
->char32At(tPos
))) {
3656 tPos
= fText
->moveIndex32(tPos
, -1);
3658 while (tPos
>0 && fCM
->contains(fText
->char32At(tPos
))) {
3659 tPos
= fText
->moveIndex32(tPos
, -1);
3661 if (fQU
->contains(fText
->char32At(tPos
))) {
3668 // LB 16 (CL | CP) SP* x NS
3669 // Scan backwards for SP* CM* (CL | CP)
3670 if (fNS
->contains(thisChar
)) {
3672 while (tPos
>0 && fSP
->contains(fText
->char32At(tPos
))) {
3673 tPos
= fText
->moveIndex32(tPos
, -1);
3675 while (tPos
>0 && fCM
->contains(fText
->char32At(tPos
))) {
3676 tPos
= fText
->moveIndex32(tPos
, -1);
3678 if (fCL
->contains(fText
->char32At(tPos
)) || fCP
->contains(fText
->char32At(tPos
))) {
3684 // LB 17 B2 SP* x B2
3685 if (fB2
->contains(thisChar
)) {
3686 // Scan backwards, checking for the B2 CM* SP* sequence.
3688 if (fSP
->contains(prevChar
)) {
3689 while (tPos
> 0 && fSP
->contains(fText
->char32At(tPos
))) {
3690 tPos
=fText
->moveIndex32(tPos
, -1);
3693 while (tPos
> 0 && fCM
->contains(fText
->char32At(tPos
))) {
3694 tPos
=fText
->moveIndex32(tPos
, -1);
3696 if (fB2
->contains(fText
->char32At(tPos
))) {
3702 // LB 18 break after space
3703 if (fSP
->contains(prevChar
)) {
3710 if (fQU
->contains(thisChar
) || fQU
->contains(prevChar
)) {
3714 // LB 20 Break around a CB
3715 if (fCB
->contains(thisChar
) || fCB
->contains(prevChar
)) {
3720 if (fBA
->contains(thisChar
) ||
3721 fHY
->contains(thisChar
) ||
3722 fNS
->contains(thisChar
) ||
3723 fBB
->contains(prevChar
) ) {
3728 if ((fAL
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3729 (fID
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3730 (fIN
->contains(prevChar
) && fIN
->contains(thisChar
)) ||
3731 (fNU
->contains(prevChar
) && fIN
->contains(thisChar
)) ) {
3739 if ((fID
->contains(prevChar
) && fPO
->contains(thisChar
)) ||
3740 (fAL
->contains(prevChar
) && fNU
->contains(thisChar
)) ||
3741 (fNU
->contains(prevChar
) && fAL
->contains(thisChar
)) ) {
3745 // LB 24 Do not break between prefix and letters or ideographs.
3749 if ((fPR
->contains(prevChar
) && fID
->contains(thisChar
)) ||
3750 (fPR
->contains(prevChar
) && fAL
->contains(thisChar
)) ||
3751 (fPO
->contains(prevChar
) && fAL
->contains(thisChar
)) ) {
3758 if (fNumberMatcher
->lookingAt(prevPos
, status
)) {
3759 if (U_FAILURE(status
)) {
3762 // Matched a number. But could have been just a single digit, which would
3763 // not represent a "no break here" between prevChar and thisChar
3764 int32_t numEndIdx
= fNumberMatcher
->end(status
); // idx of first char following num
3765 if (numEndIdx
> pos
) {
3766 // Number match includes at least our two chars being checked
3767 if (numEndIdx
> nextPos
) {
3768 // Number match includes additional chars. Update pos and nextPos
3769 // so that next loop iteration will continue at the end of the number,
3770 // checking for breaks between last char in number & whatever follows.
3771 pos
= nextPos
= numEndIdx
;
3773 pos
= fText
->moveIndex32(pos
, -1);
3774 thisChar
= fText
->char32At(pos
);
3775 } while (fCM
->contains(thisChar
));
3782 // LB 26 Do not break a Korean syllable.
3783 if (fJL
->contains(prevChar
) && (fJL
->contains(thisChar
) ||
3784 fJV
->contains(thisChar
) ||
3785 fH2
->contains(thisChar
) ||
3786 fH3
->contains(thisChar
))) {
3790 if ((fJV
->contains(prevChar
) || fH2
->contains(prevChar
)) &&
3791 (fJV
->contains(thisChar
) || fJT
->contains(thisChar
))) {
3795 if ((fJT
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3796 fJT
->contains(thisChar
)) {
3800 // LB 27 Treat a Korean Syllable Block the same as ID.
3801 if ((fJL
->contains(prevChar
) || fJV
->contains(prevChar
) ||
3802 fJT
->contains(prevChar
) || fH2
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3803 fIN
->contains(thisChar
)) {
3806 if ((fJL
->contains(prevChar
) || fJV
->contains(prevChar
) ||
3807 fJT
->contains(prevChar
) || fH2
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3808 fPO
->contains(thisChar
)) {
3811 if (fPR
->contains(prevChar
) && (fJL
->contains(thisChar
) || fJV
->contains(thisChar
) ||
3812 fJT
->contains(thisChar
) || fH2
->contains(thisChar
) || fH3
->contains(thisChar
))) {
3818 // LB 28 Do not break between alphabetics ("at").
3819 if (fAL
->contains(prevChar
) && fAL
->contains(thisChar
)) {
3823 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
3824 if (fIS
->contains(prevChar
) && fAL
->contains(thisChar
)) {
3828 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3831 if ((fAL
->contains(prevChar
) || fNU
->contains(prevChar
)) && fOP
->contains(thisChar
)) {
3834 if (fCP
->contains(prevChar
) && (fAL
->contains(thisChar
) || fNU
->contains(thisChar
))) {
3838 // LB 31 Break everywhere else
3847 UVector
*RBBILineMonkey::charClasses() {
3852 RBBILineMonkey::~RBBILineMonkey() {
3894 delete fNumberMatcher
;
3898 //-------------------------------------------------------------------------------------------
3903 // seed=nnnnn Random number starting seed.
3904 // Setting the seed allows errors to be reproduced.
3905 // loop=nnn Looping count. Controls running time.
3907 // 0 or greater: run length.
3909 // type = char | word | line | sent | title
3911 //-------------------------------------------------------------------------------------------
3913 static int32_t getIntParam(UnicodeString name
, UnicodeString
&params
, int32_t defaultVal
) {
3914 int32_t val
= defaultVal
;
3915 name
.append(" *= *(-?\\d+)");
3916 UErrorCode status
= U_ZERO_ERROR
;
3917 RegexMatcher
m(name
, params
, 0, status
);
3919 // The param exists. Convert the string to an int.
3920 char valString
[100];
3921 int32_t paramLength
= m
.end(1, status
) - m
.start(1, status
);
3922 if (paramLength
>= (int32_t)(sizeof(valString
)-1)) {
3923 paramLength
= (int32_t)(sizeof(valString
)-2);
3925 params
.extract(m
.start(1, status
), paramLength
, valString
, sizeof(valString
));
3926 val
= strtol(valString
, NULL
, 10);
3928 // Delete this parameter from the params string.
3930 params
= m
.replaceFirst("", status
);
3932 U_ASSERT(U_SUCCESS(status
));
3937 static void testBreakBoundPreceding(RBBITest
*test
, UnicodeString ustr
,
3946 for (i
= bi
->first(); i
!= BreakIterator::DONE
; i
= bi
->next()) {
3948 if (count
< expectedcount
&& expected
[count
] != i
) {
3949 test
->errln("break forward test failed: expected %d but got %d",
3950 expected
[count
], i
);
3955 if (count
!= expectedcount
) {
3956 printStringBreaks(ustr
, expected
, expectedcount
);
3957 test
->errln("break forward test failed: missed %d match",
3958 expectedcount
- count
);
3961 // testing boundaries
3962 for (i
= 1; i
< expectedcount
; i
++) {
3963 int j
= expected
[i
- 1];
3964 if (!bi
->isBoundary(j
)) {
3965 printStringBreaks(ustr
, expected
, expectedcount
);
3966 test
->errln("isBoundary() failed. Expected boundary at position %d", j
);
3969 for (j
= expected
[i
- 1] + 1; j
< expected
[i
]; j
++) {
3970 if (bi
->isBoundary(j
)) {
3971 printStringBreaks(ustr
, expected
, expectedcount
);
3972 test
->errln("isBoundary() failed. Not expecting boundary at position %d", j
);
3978 for (i
= bi
->last(); i
!= BreakIterator::DONE
; i
= bi
->previous()) {
3980 if (forward
[count
] != i
) {
3981 test
->errln("happy break test previous() failed: expected %d but got %d",
3987 printStringBreaks(ustr
, expected
, expectedcount
);
3988 test
->errln("break test previous() failed: missed a match");
3992 // testing preceding
3993 for (i
= 0; i
< expectedcount
- 1; i
++) {
3994 // int j = expected[i] + 1;
3995 int j
= ustr
.moveIndex32(expected
[i
], 1);
3996 for (; j
<= expected
[i
+ 1]; j
++) {
3997 if (bi
->preceding(j
) != expected
[i
]) {
3998 printStringBreaks(ustr
, expected
, expectedcount
);
3999 test
->errln("preceding(): Not expecting boundary at position %d", j
);
4006 void RBBITest::TestWordBreaks(void)
4008 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4010 Locale
locale("en");
4011 UErrorCode status
= U_ZERO_ERROR
;
4012 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
4013 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
4014 static const char *strlist
[] =
4016 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
4017 "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
4018 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
4019 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
4020 "\\u90ca\\u3588\\u009c\\u0953\\u194b",
4021 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
4022 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
4023 "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
4024 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
4025 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
4026 "\\u2027\\U000e0067\\u0a47\\u00b7",
4027 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
4028 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
4029 "\\u0589\\U000e006e\\u0a42\\U000104a5",
4030 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
4031 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
4032 "\\u0027\\u11af\\U000e0057\\u0602",
4033 "\\U0001d7f2\\U000e007\\u0004\\u0589",
4034 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
4035 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
4036 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
4037 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
4038 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
4039 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
4040 "\\u0233\\U000e0020\\u0a69\\u0d6a",
4041 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
4042 "\\u58f4\\U000e0049\\u20e7\\u2027",
4043 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
4044 "\\ua183\\u102d\\u0bec\\u003a",
4045 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
4046 "\\u003a\\u0e57\\u0fad\\u002e",
4047 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
4048 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
4049 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
4050 "\\u003a\\u0664\\u00b7\\u1fba",
4051 "\\u003b\\u0027\\u00b7\\u47a3",
4052 "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
4053 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
4054 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
4057 if (U_FAILURE(status
)) {
4058 errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
));
4061 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
4062 // printf("looping %d\n", loop);
4063 UnicodeString ustr
= CharsToUnicodeString(strlist
[loop
]);
4064 // RBBICharMonkey monkey;
4065 RBBIWordMonkey monkey
;
4068 int expectedcount
= 0;
4070 monkey
.setText(ustr
);
4072 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
4073 expected
[expectedcount
++] = i
;
4076 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
4082 void RBBITest::TestWordBoundary(void)
4084 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
4085 Locale
locale("en");
4086 UErrorCode status
= U_ZERO_ERROR
;
4087 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
4088 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
4090 static const char *strlist
[] =
4092 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
4093 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
4094 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
4095 "\\u2027\\U000e0067\\u0a47\\u00b7",
4096 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
4097 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
4098 "\\u0589\\U000e006e\\u0a42\\U000104a5",
4099 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
4100 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
4101 "\\u0027\\u11af\\U000e0057\\u0602",
4102 "\\U0001d7f2\\U000e007\\u0004\\u0589",
4103 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
4104 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
4105 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
4106 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
4107 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
4108 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
4109 "\\u0233\\U000e0020\\u0a69\\u0d6a",
4110 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
4111 "\\u58f4\\U000e0049\\u20e7\\u2027",
4112 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
4113 "\\ua183\\u102d\\u0bec\\u003a",
4114 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
4115 "\\u003a\\u0e57\\u0fad\\u002e",
4116 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
4117 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
4118 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
4119 "\\u003a\\u0664\\u00b7\\u1fba",
4120 "\\u003b\\u0027\\u00b7\\u47a3",
4123 if (U_FAILURE(status
)) {
4124 errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
));
4127 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
4128 // printf("looping %d\n", loop);
4129 u_unescape(strlist
[loop
], str
, 20);
4130 UnicodeString
ustr(str
);
4137 for (i
= bi
->first(); i
!= BreakIterator::DONE
; i
= bi
->next()) {
4138 forward
[count
++] = i
;
4141 for (j
= prev
+ 1; j
< i
; j
++) {
4142 if (bi
->isBoundary(j
)) {
4143 printStringBreaks(ustr
, forward
, count
);
4144 errln("happy boundary test failed: expected %d not a boundary",
4150 if (!bi
->isBoundary(i
)) {
4151 printStringBreaks(ustr
, forward
, count
);
4152 errln("happy boundary test failed: expected %d a boundary",
4162 void RBBITest::TestLineBreaks(void)
4164 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4165 Locale
locale("en");
4166 UErrorCode status
= U_ZERO_ERROR
;
4167 BreakIterator
*bi
= BreakIterator::createLineInstance(locale
, status
);
4168 const int32_t STRSIZE
= 50;
4170 static const char *strlist
[] =
4172 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
4173 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
4174 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
4175 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
4176 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
4177 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
4178 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
4179 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
4180 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
4181 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
4182 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
4183 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
4184 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
4185 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
4186 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
4187 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
4188 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
4189 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
4190 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
4191 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
4192 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
4193 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
4194 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
4195 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
4196 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
4197 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
4198 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
4199 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
4200 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
4201 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
4202 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
4203 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
4204 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
4205 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
4206 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
4207 "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
4208 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
4209 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
4210 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
4211 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
4212 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
4213 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
4214 "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
4215 "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
4216 "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
4217 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
4218 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
4221 TEST_ASSERT_SUCCESS(status
);
4222 if (U_FAILURE(status
)) {
4225 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
4226 // printf("looping %d\n", loop);
4227 int32_t t
= u_unescape(strlist
[loop
], str
, STRSIZE
);
4234 UnicodeString
ustr(str
);
4235 RBBILineMonkey monkey
;
4236 if (U_FAILURE(monkey
.deferredStatus
)) {
4240 const int EXPECTEDSIZE
= 50;
4241 int expected
[EXPECTEDSIZE
];
4242 int expectedcount
= 0;
4244 monkey
.setText(ustr
);
4246 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
4247 if (expectedcount
>= EXPECTEDSIZE
) {
4248 TEST_ASSERT(expectedcount
< EXPECTEDSIZE
);
4251 expected
[expectedcount
++] = i
;
4254 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
// TestSentBreaks - compares the "en" sentence break iterator with the
// RBBISentMonkey reference implementation on a fixed list of strings.
// For every entry of strlist the text is unescaped into a UnicodeString,
// the expected boundaries are gathered by stepping RBBISentMonkey::next()
// from offset 0 until BreakIterator::DONE, and that list is handed to
// testBreakBoundPreceding() together with the iterator under test.
4260 void RBBITest::TestSentBreaks(void)
// The code below is only built when regular expressions are available.
4262 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4263 Locale
locale("en");
4264 UErrorCode status
= U_ZERO_ERROR
;
4265 BreakIterator
*bi
= BreakIterator::createSentenceInstance(locale
, status
);
// Test strings. Entries written with doubled-backslash u/U escapes are
// expanded into the corresponding code units by u_unescape() further down.
4267 static const char *strlist
[] =
4269 "Now\ris\nthe\r\ntime\n\rfor\r\r",
4271 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
4272 "\"Sentence ending with a quote.\" Bye.",
4273 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
4274 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
4275 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
4276 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
4277 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
4278 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
4279 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
4280 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
4281 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
4282 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
4283 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
4284 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
4285 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
4286 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
4287 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
4288 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
// Report a failure to create the sentence break iterator.
4291 if (U_FAILURE(status
)) {
4292 errcheckln(status
, "Creation of break iterator failed %s", u_errorName(status
));
// Visit every entry of strlist (element count derived from the array size).
4295 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
// Expand the escapes into str; the capacity passed is str's element count.
4296 u_unescape(strlist
[loop
], str
, (int32_t)(sizeof(str
) / sizeof(str
[0])));
4297 UnicodeString
ustr(str
);
// A fresh reference implementation is built for each string; its
// construction status is exposed through deferredStatus.
4299 RBBISentMonkey monkey
;
4300 if (U_FAILURE(monkey
.deferredStatus
)) {
// Capacity of the expected-boundary list; running past it is reported
// through the TEST_ASSERT inside the collection loop.
4304 const int EXPECTEDSIZE
= 50;
4305 int expected
[EXPECTEDSIZE
];
4306 int expectedcount
= 0;
4308 monkey
.setText(ustr
);
// Collect the reference boundaries: offset 0 first, then whatever
// monkey.next() returns until it yields BreakIterator::DONE.
4310 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
4311 if (expectedcount
>= EXPECTEDSIZE
) {
4312 TEST_ASSERT(expectedcount
< EXPECTEDSIZE
);
4315 expected
[expectedcount
++] = i
;
// Check the iterator under test against the collected boundaries.
4318 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
// TestMonkey - entry point for the randomized ("monkey") break iterator tests.
//
// params is an option string. The options consumed here are:
//    loop=<n>   iteration count, read with getIntParam (loopCount is initialised to 500)
//    seed=<n>   random number seed, read with getIntParam
//    type=char|word|line|sent|title
//               which iterator to exercise (breakType is initialised to "all")
//    utext      matched by the " *utext" pattern and removed from the string
//               NOTE(review): presumably switches useUText on - the assignment is not visible here.
// Any non-space text remaining after the options are stripped is reported as an
// unrecognized parameter.
//
// NOTE(review): "title" is accepted by the option pattern, but only the char, word,
// line and sent sections are visible in this function.
4324 void RBBITest::TestMonkey(char *params
) {
// The code below is only built when regular expressions are available.
4325 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4327 UErrorCode status
= U_ZERO_ERROR
;
4328 int32_t loopCount
= 500;
4330 UnicodeString breakType
= "all";
4331 Locale
locale("en");
4332 UBool useUText
= FALSE
;
4334 if (quick
== FALSE
) {
// Work on a UnicodeString copy of the option string so that handled options can be removed.
4339 UnicodeString
p(params
);
4340 loopCount
= getIntParam("loop", p
, loopCount
);
4341 seed
= getIntParam("seed", p
, seed
);
// type=... option: capture group 1 holds the requested break type.
4343 RegexMatcher
m(" *type *= *(char|word|line|sent|title) *", p
, 0, status
);
4345 breakType
= m
.group(1, status
);
4347 p
= m
.replaceFirst("", status
);
// utext option.
4350 RegexMatcher
u(" *utext", p
, 0, status
);
4354 p
= u
.replaceFirst("", status
);
// Anything other than white space left in p is an option nobody consumed.
4359 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p
, 0, status
).find()) {
4360 // Each option is stripped out of the option string as it is processed.
4361 // All options have been checked. The option string should have been completely emptied.
4363 p
.extract(buf
, sizeof(buf
), NULL
, status
);
// Force termination of buf before it is printed.
4364 buf
[sizeof(buf
)-1] = 0;
4365 errln("Unrecognized or extra parameter: %s\n", buf
);
// ---- Character break section ----
4371 if (breakType
== "char" || breakType
== "all") {
4373 BreakIterator
*bi
= BreakIterator::createCharacterInstance(locale
, status
);
4374 if (U_SUCCESS(status
)) {
4375 RunMonkey(bi
, m
, "char", seed
, loopCount
, useUText
);
4376 if (breakType
== "all" && useUText
==FALSE
) {
4377 // Also run a quick test with UText when "all" is specified
4378 RunMonkey(bi
, m
, "char", seed
, loopCount
, TRUE
);
4382 errcheckln(status
, "Creation of character break iterator failed %s", u_errorName(status
));
// ---- Word break section ----
4387 if (breakType
== "word" || breakType
== "all") {
4388 logln("Word Break Monkey Test");
4390 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
4391 if (U_SUCCESS(status
)) {
4392 RunMonkey(bi
, m
, "word", seed
, loopCount
, useUText
);
4395 errcheckln(status
, "Creation of word break iterator failed %s", u_errorName(status
));
// ---- Line break section ----
4400 if (breakType
== "line" || breakType
== "all") {
4401 logln("Line Break Monkey Test");
4403 BreakIterator
*bi
= BreakIterator::createLineInstance(locale
, status
);
// loopCount is reduced in place, so the reduction is still in effect when the
// sentence section below runs in the same call (breakType "all").
4404 if (loopCount
>= 10) {
4405 loopCount
= loopCount
/ 5; // Line break runs slower than the others.
4407 if (U_SUCCESS(status
)) {
4408 RunMonkey(bi
, m
, "line", seed
, loopCount
, useUText
);
4411 errcheckln(status
, "Creation of line break iterator failed %s", u_errorName(status
));
// ---- Sentence break section ----
4416 if (breakType
== "sent" || breakType
== "all" ) {
4417 logln("Sentence Break Monkey Test");
4419 BreakIterator
*bi
= BreakIterator::createSentenceInstance(locale
, status
);
4420 if (loopCount
>= 10) {
4421 loopCount
= loopCount
/ 10; // Sentence runs slower than the other break types
4423 if (U_SUCCESS(status
)) {
4424 RunMonkey(bi
, m
, "sentence", seed
, loopCount
, useUText
);
// The iterator created in this section is the sentence iterator, so the
// failure message names it (it used to repeat the line-break wording).
4427 errcheckln(status
, "Creation of sentence break iterator failed %s", u_errorName(status
));
4436 // Run a RBBI monkey test. Common routine, for all break iterator types.
4438 // bi - the break iterator to use
4439 // mk - MonkeyKind, abstraction for obtaining expected results
4440 // name - Name of test (char, word, etc.) for use in error messages
4441 // seed - Seed for starting random number generator (parameter from user)
// numIterations - number of random test strings to generate and check;
//                 -1 keeps the test looping indefinitely
// useUText - NOTE(review): presumably selects the UText-based setText() call
//            below; the test on this flag is not visible in this section.
//
// For each iteration a random string is built from the character classes of mk,
// the expected boundaries are obtained from mk.next(), and the same boundaries
// are then requested from bi through next(), previous(), isBoundary(),
// following() and preceding(). Any disagreement is reported with errln().
4444 void RBBITest::RunMonkey(BreakIterator
*bi
, RBBIMonkeyKind
&mk
, const char *name
, uint32_t seed
,
4445 int32_t numIterations
, UBool useUText
) {
// The code below is only built when regular expressions are available.
4447 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
// Number of characters drawn for each random test string.
4449 const int32_t TESTSTRINGLEN
= 500;
4450 UnicodeString testText
;
4451 int32_t numCharClasses
;
// Per-offset boundary flags (1 = boundary), one array for each way of querying
// the iterator. NOTE(review): the 2*TESTSTRINGLEN+1 size presumably allows two
// UTF-16 code units per generated character - confirm.
4453 int expected
[TESTSTRINGLEN
*2 + 1];
4454 int expectedCount
= 0;
4455 char expectedBreaks
[TESTSTRINGLEN
*2 + 1];
4456 char forwardBreaks
[TESTSTRINGLEN
*2 + 1];
4457 char reverseBreaks
[TESTSTRINGLEN
*2+1];
4458 char isBoundaryBreaks
[TESTSTRINGLEN
*2+1];
4459 char followingBreaks
[TESTSTRINGLEN
*2+1];
4460 char precedingBreaks
[TESTSTRINGLEN
*2+1];
4466 numCharClasses
= mk
.charClasses()->size();
4467 chClasses
= mk
.charClasses();
4469 // Check for errors that occurred during the construction of the MonkeyKind object.
4470 // Can't report them where they occurred because errln() is a method coming from intlTest,
4471 // and is not visible outside of RBBITest :-(
4472 if (U_FAILURE(mk
.deferredStatus
)) {
4473 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk
.deferredStatus
));
4477 // Verify that the character classes all have at least one member.
4478 for (i
=0; i
<numCharClasses
; i
++) {
4479 UnicodeSet
*s
= (UnicodeSet
*)chClasses
->elementAt(i
);
4480 if (s
== NULL
|| s
->size() == 0) {
4481 errln("Character Class #%d is null or of zero size.", i
);
// Main test loop: one random string per pass; numIterations == -1 never terminates it.
4486 while (loopCount
< numIterations
|| numIterations
== -1) {
4487 if (numIterations
== -1 && loopCount
% 10 == 0) {
4488 // If test is running in an infinite loop, display a periodic tic so
4489 // we can tell that it is making progress.
4490 fprintf(stderr
, ".");
4492 // Save current random number seed, so that we can recreate the random numbers
4493 // for this loop iteration in event of an error.
4496 // Populate a test string with data.
4497 testText
.truncate(0);
4498 for (i
=0; i
<TESTSTRINGLEN
; i
++) {
// Pick a random character class, then a random member of that class.
4499 int32_t aClassNum
= m_rand() % numCharClasses
;
4500 UnicodeSet
*classSet
= (UnicodeSet
*)chClasses
->elementAt(aClassNum
);
4501 int32_t charIdx
= m_rand() % classSet
->size();
4502 UChar32 c
= classSet
->charAt(charIdx
);
4503 if (c
< 0) { // TODO: deal with sets containing strings.
4510 // Calculate the expected results for this test string.
4511 mk
.setText(testText
);
4512 memset(expectedBreaks
, 0, sizeof(expectedBreaks
));
// Offset 0 is always recorded as a boundary.
4513 expectedBreaks
[0] = 1;
4514 int32_t breakPos
= 0;
4517 breakPos
= mk
.next(breakPos
);
4518 if (breakPos
== -1) {
4521 if (breakPos
> testText
.length()) {
4522 errln("breakPos > testText.length()");
4524 expectedBreaks
[breakPos
] = 1;
4525 U_ASSERT(expectedCount
<testText
.length());
4526 expected
[expectedCount
++] = breakPos
;
4529 // Find the break positions using forward iteration
4530 memset(forwardBreaks
, 0, sizeof(forwardBreaks
));
// Two ways of handing the text to the iterator appear below: through a UText
// wrapper and directly from the UnicodeString.
4532 UErrorCode status
= U_ZERO_ERROR
;
4533 UText
*testUText
= utext_openReplaceable(NULL
, &testText
, &status
);
4534 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4535 bi
->setText(testUText
, status
);
4536 TEST_ASSERT_SUCCESS(status
);
4537 utext_close(testUText
); // The break iterator does a shallow clone of the UText
4538 // This UText can be closed immediately, so long as the
4539 // testText string continues to exist.
4541 bi
->setText(testText
);
4544 for (i
=bi
->first(); i
!= BreakIterator::DONE
; i
=bi
->next()) {
4545 if (i
< 0 || i
> testText
.length()) {
4546 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name
);
4549 forwardBreaks
[i
] = 1;
4552 // Find the break positions using reverse iteration
4553 memset(reverseBreaks
, 0, sizeof(reverseBreaks
));
4554 for (i
=bi
->last(); i
!= BreakIterator::DONE
; i
=bi
->previous()) {
4555 if (i
< 0 || i
> testText
.length()) {
// This loop is driven by previous(), so the report names that function.
4556 errln("%s break monkey test: Out of range value returned by breakIterator::previous()", name
);
4559 reverseBreaks
[i
] = 1;
4562 // Find the break positions using isBoundary() tests.
4563 memset(isBoundaryBreaks
, 0, sizeof(isBoundaryBreaks
));
4564 U_ASSERT((int32_t)sizeof(isBoundaryBreaks
) > testText
.length());
4565 for (i
=0; i
<=testText
.length(); i
++) {
4566 isBoundaryBreaks
[i
] = bi
->isBoundary(i
);
4570 // Find the break positions using the following() function.
4572 memset(followingBreaks
, 0, sizeof(followingBreaks
));
4573 int32_t lastBreakPos
= 0;
4574 followingBreaks
[0] = 1;
4575 for (i
=0; i
<testText
.length(); i
++) {
4576 breakPos
= bi
->following(i
);
// following(i) must return a position strictly after i, not before the
// previously returned one, inside the text, and must not skip a boundary
// that was already seen beyond i.
4577 if (breakPos
<= i
||
4578 breakPos
< lastBreakPos
||
4579 breakPos
> testText
.length() ||
4580 (breakPos
> lastBreakPos
&& lastBreakPos
> i
)) {
4581 errln("%s break monkey test: "
4582 "Out of range value returned by BreakIterator::following().\n"
4583 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4584 name
, seed
, i
, breakPos
, lastBreakPos
);
4587 followingBreaks
[breakPos
] = 1;
4588 lastBreakPos
= breakPos
;
4591 // Find the break positions using the preceding() function.
4592 memset(precedingBreaks
, 0, sizeof(precedingBreaks
));
4593 lastBreakPos
= testText
.length();
4594 precedingBreaks
[testText
.length()] = 1;
4595 for (i
=testText
.length(); i
>0; i
--) {
4596 breakPos
= bi
->preceding(i
);
// Mirror-image sanity checks for preceding(i), walking from the end of the text.
4597 if (breakPos
>= i
||
4598 breakPos
> lastBreakPos
||
4599 (breakPos
< 0 && testText
.getChar32Start(i
)>0) ||
4600 (breakPos
< lastBreakPos
&& lastBreakPos
< testText
.getChar32Start(i
)) ) {
4601 errln("%s break monkey test: "
4602 "Out of range value returned by BreakIterator::preceding().\n"
4603 "index=%d; prev returned %d; lastBreak=%d" ,
4604 name
, i
, breakPos
, lastBreakPos
);
// NOTE(review): the guard tests breakPos while the store below is indexed by i.
4605 if (breakPos
>= 0 && breakPos
< (int32_t)sizeof(precedingBreaks
)) {
4606 precedingBreaks
[i
] = 2; // Forces an error.
4609 if (breakPos
>= 0) {
4610 precedingBreaks
[breakPos
] = 1;
4612 lastBreakPos
= breakPos
;
4616 // Compare the expected and actual results.
4617 for (i
=0; i
<=testText
.length(); i
++) {
// The first mismatch found in this fixed order names the failing operation.
4618 const char *errorType
= NULL
;
4619 if (forwardBreaks
[i
] != expectedBreaks
[i
]) {
4620 errorType
= "next()";
4621 } else if (reverseBreaks
[i
] != forwardBreaks
[i
]) {
4622 errorType
= "previous()";
4623 } else if (isBoundaryBreaks
[i
] != expectedBreaks
[i
]) {
4624 errorType
= "isBoundary()";
4625 } else if (followingBreaks
[i
] != expectedBreaks
[i
]) {
4626 errorType
= "following()";
4627 } else if (precedingBreaks
[i
] != expectedBreaks
[i
]) {
4628 errorType
= "preceding()";
4632 if (errorType
!= NULL
) {
4633 // Format a range of the test text that includes the failure as
4634 // a data item that can be included in the rbbi test data file.
4636 // Start of the range is the last point where expected and actual results
4637 // both agreed that there was a break position.
4638 int startContext
= i
;
4641 if (startContext
==0) { break; }
4643 if (expectedBreaks
[startContext
] != 0) {
4644 if (count
== 2) break;
4649 // End of range is two expected breaks past the start position.
4650 int endContext
= i
+ 1;
4652 for (ci
=0; ci
<2; ci
++) { // Number of items to include in error text.
4654 if (endContext
>= testText
.length()) {break;}
4655 if (expectedBreaks
[endContext
-1] != 0) {
4656 if (count
== 0) break;
4663 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4664 UnicodeString errorText
= "<data>";
// NOTE(review): the block comment opened on the next line has no visible
// terminator in this section - confirm that it is closed.
4665 /***if (strcmp(errorType, "next()") == 0) {
4667 endContext = testText.length();
4669 printStringBreaks(testText, expected, expectedCount);
4672 for (ci
=startContext
; ci
<endContext
;) {
4673 UnicodeString
hexChars("0123456789abcdef");
4676 c
= testText
.char32At(ci
);
4678 // This is the location of the error.
4679 errorText
.append("<?>");
4680 } else if (expectedBreaks
[ci
] != 0) {
4681 // This is a non-error expected break position.
4682 errorText
.append("\\");
4685 errorText
.append("\\u");
4686 for (bn
=12; bn
>=0; bn
-=4) {
4687 errorText
.append(hexChars
.charAt((c
>>bn
)&0xf));
4690 errorText
.append("\\U");
4691 for (bn
=28; bn
>=0; bn
-=4) {
4692 errorText
.append(hexChars
.charAt((c
>>bn
)&0xf));
4695 ci
= testText
.moveIndex32(ci
, 1);
4697 errorText
.append("\\");
4698 errorText
.append("</data>\n");
4701 char charErrorTxt
[500];
4702 UErrorCode status
= U_ZERO_ERROR
;
4703 errorText
.extract(charErrorTxt
, sizeof(charErrorTxt
), NULL
, status
);
4704 charErrorTxt
[sizeof(charErrorTxt
)-1] = 0;
4705 errln("%s break monkey test error. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
4706 name
, (expectedBreaks
[i
]? "break expected but not found" : "break found but not expected"),
4707 errorType
, seed
, i
, charErrorTxt
);
4718 // Bug 5532. UTF-8 based UText fails in dictionary code.
4719 // This test checks the initial patch,
4720 // which is to just keep it from crashing. Correct word boundaries
4721 // await a proper fix to the dictionary code.
//
// The test opens a UText over a UTF-8 byte array, hands it to a word break
// iterator created for Locale("th"), and walks the text with next(), asserting
// that every boundary lies beyond the previous one and that at least one
// boundary is reported.
4723 void RBBITest::TestBug5532(void) {
4724 // Text includes a mixture of Thai and Latin.
4725 const unsigned char utf8Data
[] = {
4726 0xE0u
, 0xB8u
, 0x82u
, 0xE0u
, 0xB8u
, 0xB2u
, 0xE0u
, 0xB8u
, 0xA2u
, 0xE0u
,
4727 0xB9u
, 0x80u
, 0xE0u
, 0xB8u
, 0x84u
, 0xE0u
, 0xB8u
, 0xA3u
, 0xE0u
, 0xB8u
,
4728 0xB7u
, 0xE0u
, 0xB9u
, 0x88u
, 0xE0u
, 0xB8u
, 0xADu
, 0xE0u
, 0xB8u
, 0x87u
,
4729 0xE0u
, 0xB9u
, 0x80u
, 0xE0u
, 0xB8u
, 0xA5u
, 0xE0u
, 0xB9u
, 0x88u
, 0xE0u
,
4730 0xB8u
, 0x99u
, 0xE0u
, 0xB8u
, 0x8Bu
, 0xE0u
, 0xB8u
, 0xB5u
, 0xE0u
, 0xB8u
,
4731 0x94u
, 0xE0u
, 0xB8u
, 0xB5u
, 0x20u
, 0x73u
, 0x69u
, 0x6Du
, 0x20u
, 0x61u
,
4732 0x75u
, 0x64u
, 0x69u
, 0x6Fu
, 0x2Fu
, 0x20u
, 0x4Du
, 0x4Fu
, 0x4Fu
, 0x4Eu
,
4733 0x20u
, 0x65u
, 0x63u
, 0x6Cu
, 0x69u
, 0x70u
, 0x73u
, 0x65u
, 0x20u
, 0xE0u
,
4734 0xB8u
, 0xA3u
, 0xE0u
, 0xB8u
, 0xB2u
, 0xE0u
, 0xB8u
, 0x84u
, 0xE0u
, 0xB8u
,
4735 0xB2u
, 0x20u
, 0x34u
, 0x37u
, 0x30u
, 0x30u
, 0x20u
, 0xE0u
, 0xB8u
, 0xA2u
,
4736 0xE0u
, 0xB8u
, 0xB9u
, 0xE0u
, 0xB9u
, 0x82u
, 0xE0u
, 0xB8u
, 0xA3u
, 0x00};
4738 UErrorCode status
= U_ZERO_ERROR
;
// Wrap the bytes in a locally declared UText. The length argument is -1;
// NOTE(review): presumably "NUL-terminated" (the array ends with 0x00) - confirm
// against the UText API documentation.
4739 UText utext
=UTEXT_INITIALIZER
;
4740 utext_openUTF8(&utext
, (const char *)utf8Data
, -1, &status
);
4741 TEST_ASSERT_SUCCESS(status
);
4743 BreakIterator
*bi
= BreakIterator::createWordInstance(Locale("th"), status
);
4744 TEST_ASSERT_SUCCESS(status
);
// Only exercise the iterator when everything so far succeeded.
4745 if (U_SUCCESS(status
)) {
4746 bi
->setText(&utext
, status
);
4747 TEST_ASSERT_SUCCESS(status
);
4749 int32_t breakCount
= 0;
// Starts below any valid offset so that the first boundary passes the check.
4750 int32_t previousBreak
= -1;
4751 for (bi
->first(); bi
->next() != BreakIterator::DONE
; breakCount
++) {
4752 // For now, just make sure that the break iterator doesn't hang.
4753 TEST_ASSERT(previousBreak
< bi
->current());
4754 previousBreak
= bi
->current();
// At least one boundary must have been counted.
4756 TEST_ASSERT(breakCount
> 0);
// Release the UText opened above.
4759 utext_close(&utext
);
4764 // TestDebug - A place-holder test for debugging purposes.
4765 // For putting in fragments of other tests that can be invoked
4766 // for tracing without a lot of unwanted extra stuff happening.
//
// As written it creates a sentence break iterator for the default locale,
// prints whether offset 8 is a boundary, and contains a loop that prints
// pos and ruleStatus while stepping backwards with previous().
4768 void RBBITest::TestDebug(void) {
4770 UErrorCode status
= U_ZERO_ERROR
;
// The iterator type is chosen by leaving exactly one of the three factory
// calls below uncommented; the sentence instance is the active one.
4774 RuleBasedBreakIterator
* bi
=
4775 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4776 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4777 (RuleBasedBreakIterator
*)BreakIterator::createSentenceInstance(Locale::getDefault(), status
);
// Sample text, written with doubled-backslash escape sequences.
4778 UnicodeString
s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4779 // UnicodeString s("Aaa. Bcd");
// Print the isBoundary() answer for offset 8.
4782 UBool r
= bi
->isBoundary(8);
4783 printf("%s", r
?"true":"false");
// Backward walk: one "pos<TAB>ruleStatus" line per step until previous()
// returns BreakIterator::DONE. The getRuleStatus() call is commented out, so
// ruleStatus is not refreshed inside the visible loop body.
4787 // ruleStatus = bi->getRuleStatus();
4788 printf("%d\t%d\n", pos
, ruleStatus
);
4789 pos
= bi
->previous();
4790 } while (pos
!= BreakIterator::DONE
);
4794 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */