1 /********************************************************************
3 * Copyright (c) 1999-2008, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6 /************************************************************************
7 * Date Name Description
8 * 12/15/99 Madhu Creation.
9 * 01/12/2000 Madhu Updated for changed API and added new tests
10 ************************************************************************/
12 #include "unicode/utypes.h"
14 #if !UCONFIG_NO_BREAK_ITERATION
16 #include "unicode/utypes.h"
17 #include "unicode/brkiter.h"
18 #include "unicode/rbbi.h"
19 #include "unicode/uchar.h"
20 #include "unicode/utf16.h"
21 #include "unicode/ucnv.h"
22 #include "unicode/schriter.h"
23 #include "unicode/uniset.h"
24 #include "unicode/regex.h" // TODO: make conditional on regexp being built.
25 #include "unicode/ustring.h"
26 #include "unicode/utext.h"
// TEST_ASSERT(x): when the expression x is false, reports a failure through errln()
//   together with the source file and line of the macro use.
// NOTE(review): the expansion is a bare { ... } block rather than do { ... } while (0),
//   so "if (c) TEST_ASSERT(x); else ..." would not compile as intended.
37 #define TEST_ASSERT(x) {if (!(x)) { \
38 errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
// TEST_ASSERT_SUCCESS(errcode): when errcode is a failing UErrorCode, reports a failure
//   through errln() with the file, line and u_errorName(errcode).
40 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
41 errln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
44 //---------------------------------------------
46 //---------------------------------------------
// runIndexedTest: test dispatcher for this suite.
//   index   Selects the test; each case below assigns the test's name.
//   exec    When true the selected test is run; when false only the name is set.
//   name    Output: receives the test name, or "" for an index with no test.
//   params  Not referenced in the lines visible here.
// Case 16 (TestMonkey) is guarded by UCONFIG_NO_REGULAR_EXPRESSIONS; the visible
// alternative only logs that the test is skipped.
48 void RBBITest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* params
)
50 if (exec
) logln("TestSuite RuleBasedBreakIterator: ");
53 case 0: name
= "TestBug4153072";
54 if(exec
) TestBug4153072(); break;
55 case 1: name
= "TestJapaneseLineBreak";
56 if(exec
) TestJapaneseLineBreak(); break;
57 case 2: name
= "TestStatusReturn";
58 if(exec
) TestStatusReturn(); break;
59 case 3: name
= "TestUnicodeFiles";
60 if(exec
) TestUnicodeFiles(); break;
61 case 4: name
= "TestEmptyString";
62 if(exec
) TestEmptyString(); break;
64 case 5: name
= "TestGetAvailableLocales";
65 if(exec
) TestGetAvailableLocales(); break;
67 case 6: name
= "TestGetDisplayName";
68 if(exec
) TestGetDisplayName(); break;
70 case 7: name
= "TestEndBehaviour";
71 if(exec
) TestEndBehaviour(); break;
72 case 8: name
= "TestMixedThaiLineBreak";
73 if(exec
) TestMixedThaiLineBreak(); break;
74 case 9: name
= "TestThaiLineBreak";
75 if(exec
) TestThaiLineBreak(); break;
76 case 10: name
= "TestMaiyamok";
77 if(exec
) TestMaiyamok(); break;
78 case 11: name
= "TestWordBreaks";
79 if(exec
) TestWordBreaks(); break;
80 case 12: name
= "TestWordBoundary";
81 if(exec
) TestWordBoundary(); break;
82 case 13: name
= "TestLineBreaks";
83 if(exec
) TestLineBreaks(); break;
84 case 14: name
= "TestSentBreaks";
85 if(exec
) TestSentBreaks(); break;
86 case 15: name
= "TestExtended";
87 if(exec
) TestExtended(); break;
88 case 16: name
= "TestMonkey";
90 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
93 logln("skipping TestMonkey (UCONFIG_NO_REGULAR_EXPRESSIONS)");
97 case 17: name
= "TestBug3818";
98 if(exec
) TestBug3818(); break;
99 case 18: name
= "TestJapaneseWordBreak";
100 if(exec
) TestJapaneseWordBreak(); break;
101 case 19: name
= "TestDebug";
102 if(exec
) TestDebug(); break;
103 case 20: name
= "TestTrieDict";
104 if(exec
) TestTrieDict(); break;
105 case 21: name
= "TestBug5775";
106 if (exec
) TestBug5775(); break;
107 case 22: name
= "TestThaiBreaks";
108 if (exec
) TestThaiBreaks(); break;
110 default: name
= ""; break; //needed to end loop
115 //---------------------------------------------------------------------------
117 // class BITestData Holds a set of Break iterator test data and results
119 // - the string data to be broken
120 // - a vector of the expected break positions.
121 // - a vector of source line numbers for the data,
122 // (to help see where errors occurred.)
123 // - The expected break tag values.
124 // - Vectors of actual break positions and tag values.
125 // - Functions for comparing actual with expected and
128 //----------------------------------------------------------------------------
// Concatenation of every data chunk added so far; this is the text handed to the
// break iterator under test.
131 UnicodeString fDataToBreak
;
// Length of fDataToBreak after each chunk was appended, i.e. the expected boundaries.
132 UVector fExpectedBreakPositions
;
// Expected tag for each expected boundary (parallel to fExpectedBreakPositions).
133 UVector fExpectedTags
;
// Boundaries actually reported by the iterator; filled in by the individual tests.
135 UVector fActualBreakPositions
; // Test Results.
// Constructs the vectors, forwarding status to each of them.
138 BITestData(UErrorCode
&status
);
// Appends one chunk of text and records its end position, tag and source line.
// NOTE(review): status is passed by value here, unlike the constructor.
139 void addDataChunk(const char *data
, int32_t tag
, int32_t lineNum
, UErrorCode status
);
// Compares actual against expected results and reports differences through test.
140 void checkResults(const char *heading
, RBBITest
*test
);
// Reports a single position mismatch for the given expected/actual indices.
141 void err(const char *heading
, RBBITest
*test
, int32_t expectedIdx
, int32_t actualIdx
);
// Constructor: passes status to the constructor of each UVector member.
// The initializer list continues beyond the lines visible here (it ends with a comma).
148 BITestData::BITestData(UErrorCode
&status
)
149 : fExpectedBreakPositions(status
), fExpectedTags(status
), fLineNum(status
), fActualBreakPositions(status
),
155 // addDataChunk. Add a section (non-breaking) piece of data to the test data.
156 // The macro form collects the line number, which is helpful
157 // when tracking down failures.
159 // A null data item is inserted at the start of each test's data
160 // to put the starting zero into the data list. The position saved for
161 // each non-null item is its ending position.
163 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status);
// addDataChunk: append one chunk to the test data and record its expected results.
//   data     Chunk text; it is converted with CharsToUnicodeString() before being appended.
//   tag      Expected tag for the boundary at the end of this chunk.
//   lineNum  Source line of the ADD_DATACHUNK use, kept for error reports.
//   status   Nothing is done when this already indicates failure.
// NOTE(review): status is received by value, so a failure set by the addElement()
//   calls below is not visible to the caller.
// NOTE(review): the macro expansion already ends in ';', so "ADD_DATACHUNK(...);"
//   produces an additional empty statement.
164 void BITestData::addDataChunk(const char *data
, int32_t tag
, int32_t lineNum
, UErrorCode status
) {
165 if (U_FAILURE(status
)) {return;}
167 fDataToBreak
.append(CharsToUnicodeString(data
));
// The expected boundary is the total length so far, i.e. the end of this chunk.
169 fExpectedBreakPositions
.addElement(fDataToBreak
.length(), status
);
170 fExpectedTags
.addElement(tag
, status
);
171 fLineNum
.addElement(lineNum
, status
);
176 // checkResults. Compare the actual and expected break positions, report any differences.
178 void BITestData::checkResults(const char *heading
, RBBITest
*test
) {
179 int32_t expectedIndex
= 0;
180 int32_t actualIndex
= 0;
183 // If we've run through both the expected and actual results vectors, we're done.
184 // break out of the loop.
185 if (expectedIndex
>= fExpectedBreakPositions
.size() &&
186 actualIndex
>= fActualBreakPositions
.size()) {
191 if (expectedIndex
>= fExpectedBreakPositions
.size()) {
192 err(heading
, test
, expectedIndex
-1, actualIndex
);
197 if (actualIndex
>= fActualBreakPositions
.size()) {
198 err(heading
, test
, expectedIndex
, actualIndex
-1);
203 if (fActualBreakPositions
.elementAti(actualIndex
) != fExpectedBreakPositions
.elementAti(expectedIndex
)) {
204 err(heading
, test
, expectedIndex
, actualIndex
);
205 // Try to resync the positions of the indices, to avoid a rash of spurious erros.
206 if (fActualBreakPositions
.elementAti(actualIndex
) < fExpectedBreakPositions
.elementAti(expectedIndex
)) {
214 if (fActualTags
.elementAti(actualIndex
) != fExpectedTags
.elementAti(expectedIndex
)) {
215 test
->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",
216 heading
, fLineNum
.elementAt(expectedIndex
),
217 fExpectedTags
.elementAti(expectedIndex
), fActualTags
.elementAti(actualIndex
));
226 // err - An error was found. Report it, along with information about where the
227 // incorrectly broken test data appeared in the source file.
// err: report one break-position mismatch through test->errln().
//   heading      Text prefixed to the message.
//   test         Test object that receives the report.
//   expectedIdx  Index into fExpectedBreakPositions / fLineNum.
//   actualIdx    Index into fActualBreakPositions.
// When the actual position is smaller than the expected one the message describes an
// unexpected break; the other message describes a break that was not found.
229 void BITestData::err(const char *heading
, RBBITest
*test
, int32_t expectedIdx
, int32_t actualIdx
)
231 int32_t expected
= fExpectedBreakPositions
.elementAti(expectedIdx
);
232 int32_t actual
= fActualBreakPositions
.elementAti(actualIdx
);
234 int32_t line
= fLineNum
.elementAti(expectedIdx
);
235 if (expectedIdx
> 0) {
236 // The line numbers are off by one because a premature break occurs somewhere
237 // within the previous item, rather than at the start of the current (expected) item.
238 // We want to report the offset of the unexpected break from the start of
239 // this previous item.
// o is the offset of the actual break from the preceding expected boundary.
240 o
= actual
- fExpectedBreakPositions
.elementAti(expectedIdx
-1);
242 if (actual
< expected
) {
243 test
->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading
, o
, line
, actual
, expected
);
245 test
->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading
, line
, actual
, expected
);
// clearResults: empty the actual-result vectors (positions and tags).
// The expected data is left untouched.
250 void BITestData::clearResults() {
251 fActualBreakPositions
.removeAllElements();
252 fActualTags
.removeAllElements();
256 //-----------------------------------------------------------------------------------
258 // Canned Test Characters
260 //-----------------------------------------------------------------------------------
// Zero-terminated list of UTF-16 code units.  The RBBITest constructor copies it into
// *cannedTestChars after a leading U+0000.
262 static const UChar cannedTestArray
[] = {
263 0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,
264 0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,
265 0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,
266 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,
267 0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,
268 0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,
269 0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,
270 0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000
// File-level string allocated by the RBBITest constructor and deleted by its destructor.
273 static UnicodeString
* cannedTestChars
= 0;
// Escaped Devanagari sequences, written as literal "\uXXXX" text (the backslash is
// doubled, so the C++ compiler does not interpret the escape itself).
// The half* macros are consonant + U+094D + U+200D; deadTA is consonant + U+094D.
275 #define halfNA "\\u0928\\u094d\\u200d"
276 #define halfSA "\\u0938\\u094d\\u200d"
277 #define halfCHA "\\u091a\\u094d\\u200d"
278 #define halfKA "\\u0915\\u094d\\u200d"
279 #define deadTA "\\u0924\\u094d"
281 //--------------------------------------------------------------------------------------
283 // RBBITest constructor and destructor
285 //--------------------------------------------------------------------------------------
287 RBBITest::RBBITest() {
288 UnicodeString
temp(cannedTestArray
);
289 cannedTestChars
= new UnicodeString();
290 *cannedTestChars
+= (UChar
)0x0000;
291 *cannedTestChars
+= temp
;
295 RBBITest::~RBBITest() {
296 delete cannedTestChars
;
// Tag values for break-status comparisons.
// NOTE(review): 300 and 400 are the literal tags TestJapaneseWordBreak passes to
//   ADD_DATACHUNK; presumably these constants name the same values - confirm.
300 static const int T_NUMBER
= 100;
301 static const int T_LETTER
= 200;
302 static const int T_H_OR_K
= 300;
303 static const int T_IDEO
= 400;
310 //--------------------------------------------------------------------
311 //Testing the BreakIterator for devanagari script
312 //--------------------------------------------------------------------
// Escaped Devanagari sequences, written as literal "\uXXXX" text.  Each dead* macro is
// a consonant followed by U+094D, the virama named in the first two comments.
314 #define deadRA "\\u0930\\u094d" /*deadform RA = devanagari RA + virama*/
315 #define deadPHA "\\u092b\\u094d" /*deadform PHA = devanagari PHA + virama*/
316 #define deadTTHA "\\u0920\\u094d"
317 #define deadPA "\\u092a\\u094d"
318 #define deadSA "\\u0938\\u094d"
319 #define visarga "\\u0903" /*devanagari visarga looks like a english colon*/
326 //-----------------------------------------------------------------------------------
328 // Test for status {tag} return value from break rules.
329 // TODO: a more thorough test.
331 //-----------------------------------------------------------------------------------
// TestStatusReturn: builds a RuleBasedBreakIterator from rules carrying {tag} values,
// iterates over testString1 with first()/next(), and checks every boundary against
// bounds1[] and every getRuleStatus() value against brkStatus[].
// Both tables end with a -1 entry.
332 void RBBITest::TestStatusReturn() {
333 UnicodeString
rulesString1("$Letters = [:L:];\n"
334 "$Numbers = [:N:];\n"
337 "Help\\ {4}/me\\!;\n"
338 "[^$Letters $Numbers];\n"
339 "!.*;\n", -1, US_INV
);
340 UnicodeString testString1
= "abc123..abc Help me Help me!";
341 // 01234567890123456789012345678
342 int32_t bounds1
[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
343 int32_t brkStatus
[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
345 UErrorCode status
=U_ZERO_ERROR
;
346 UParseError parseError
;
348 RuleBasedBreakIterator
*bi
= new RuleBasedBreakIterator(rulesString1
, parseError
, status
);
349 if(U_FAILURE(status
)) {
350 errln("FAIL : in construction");
354 bi
->setText(testString1
);
// i indexes both tables in step with the boundaries returned by the iterator.
355 for (pos
=bi
->first(); pos
!= BreakIterator::DONE
; pos
=bi
->next()) {
356 if (pos
!= bounds1
[i
]) {
357 errln("FAIL: expected break at %d, got %d\n", bounds1
[i
], pos
);
361 int tag
= bi
->getRuleStatus();
362 if (tag
!= brkStatus
[i
]) {
363 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos
, brkStatus
[i
], tag
);
// printStringBreaks: debugging aid that prints, with printf, a table of character
// properties for the code points of ustr.  A row of dashes is printed where the index
// j equals one of the entries of expected[].
// NOTE(review): ustr is passed by value, so the string is copied on every call.
373 static void printStringBreaks(UnicodeString ustr
, int expected
[],
376 UErrorCode status
= U_ZERO_ERROR
;
378 printf("code alpha extend alphanum type word sent line name\n");
380 for (j
= 0; j
< ustr
.length(); j
++) {
381 if (expectedcount
> 0) {
383 for (k
= 0; k
< expectedcount
; k
++) {
384 if (j
== expected
[k
]) {
385 printf("------------------------------------------------ %d\n",
390 UChar32 c
= ustr
.char32At(j
);
394 u_charName(c
, U_UNICODE_CHAR_NAME
, name
, 100, &status
);
395 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c
,
397 u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
),
399 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY
,
401 U_SHORT_PROPERTY_NAME
),
402 u_getPropertyValueName(UCHAR_WORD_BREAK
,
403 u_getIntPropertyValue(c
,
405 U_SHORT_PROPERTY_NAME
),
406 u_getPropertyValueName(UCHAR_SENTENCE_BREAK
,
407 u_getIntPropertyValue(c
,
408 UCHAR_SENTENCE_BREAK
),
409 U_SHORT_PROPERTY_NAME
),
410 u_getPropertyValueName(UCHAR_LINE_BREAK
,
411 u_getIntPropertyValue(c
,
413 U_SHORT_PROPERTY_NAME
),
// TestThaiLineBreak: builds Thai test text chunk by chunk (each chunk ends at an
// expected line-break boundary, all with tag 0), creates a line break iterator for
// Locale("th"), and runs generalIteratorTest() over the data.
418 void RBBITest::TestThaiLineBreak() {
419 UErrorCode status
= U_ZERO_ERROR
;
420 BITestData
thaiLineSelection(status
);
422 // \u0e2f-- the Thai paiyannoi character-- isn't a letter. It's a symbol that
423 // represents elided letters at the end of a long word. It should be bound to
424 // the end of the word and not treated as an independent punctuation mark.
427 ADD_DATACHUNK(thaiLineSelection
, NULL
, 0, status
); // Break at start of data
428 ADD_DATACHUNK(thaiLineSelection
, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status
);
429 ADD_DATACHUNK(thaiLineSelection
, "\\u0e08\\u0e30", 0, status
);
430 ADD_DATACHUNK(thaiLineSelection
, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status
);
431 ADD_DATACHUNK(thaiLineSelection
, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status
);
432 // ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status);
433 // ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
434 ADD_DATACHUNK(thaiLineSelection
, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status
);
435 // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us
436 ADD_DATACHUNK(thaiLineSelection
, "\\u0e2d\\u0e2d\\u0e01", 0, status
);
437 ADD_DATACHUNK(thaiLineSelection
, "\\u0e21\\u0e32", 0, status
);
438 ADD_DATACHUNK(thaiLineSelection
, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status
);
439 ADD_DATACHUNK(thaiLineSelection
, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status
);
440 ADD_DATACHUNK(thaiLineSelection
, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status
);
441 ADD_DATACHUNK(thaiLineSelection
, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status
);
443 // the one time where the paiyannoi occurs somewhere other than at the end
444 // of a word is in the Thai abbreviation for "etc.", which both begins and
445 // ends with a paiyannoi
446 ADD_DATACHUNK(thaiLineSelection
, "\\u0e2f\\u0e25\\u0e2f", 0, status
);
447 ADD_DATACHUNK(thaiLineSelection
, "\\u0e17\\u0e35\\u0e48", 0, status
);
448 ADD_DATACHUNK(thaiLineSelection
, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status
);
// The factory returns a BreakIterator; it is cast to RuleBasedBreakIterator here.
450 RuleBasedBreakIterator
* e
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(
451 Locale("th"), status
);
452 if (U_FAILURE(status
))
454 errln("Failed to create the BreakIterator for Thai locale in TestThaiLineBreak.\n");
458 generalIteratorTest(*e
, thaiLineSelection
);
// TestMixedThaiLineBreak: line-break test over Thai text mixed with Arabic numerals,
// Thai numerals, punctuation and English characters.  Each chunk ends at an expected
// boundary (all with tag 0); the data is checked with a Locale("th") line break
// iterator through generalIteratorTest().
464 void RBBITest::TestMixedThaiLineBreak()
466 UErrorCode status
= U_ZERO_ERROR
;
467 BITestData
thaiLineSelection(status
);
469 ADD_DATACHUNK(thaiLineSelection
, NULL
, 0, status
); // Break at start of data
472 // @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters
475 ADD_DATACHUNK(thaiLineSelection
, "\\u0E1B\\u0E35", 0, status
);
476 ADD_DATACHUNK(thaiLineSelection
, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status
);
477 ADD_DATACHUNK(thaiLineSelection
, "2545 ", 0, status
);
478 ADD_DATACHUNK(thaiLineSelection
, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status
);
479 ADD_DATACHUNK(thaiLineSelection
, "\\u0E1B\\u0E35", 0, status
);
480 ADD_DATACHUNK(thaiLineSelection
, "\\u0E09\\u0E25\\u0E2D\\u0E07", 0, status
);
481 ADD_DATACHUNK(thaiLineSelection
, "\\u0E04\\u0E23\\u0E1A", 0, status
);
482 ADD_DATACHUNK(thaiLineSelection
, "\\u0E23\\u0E2D\\u0E1A ", 0, status
);
483 ADD_DATACHUNK(thaiLineSelection
, "\"\\u0E52\\u0E52\\u0E50 ", 0, status
);
484 ADD_DATACHUNK(thaiLineSelection
, "\\u0E1b\\u0E35\" ", 0, status
);
485 ADD_DATACHUNK(thaiLineSelection
, "\\u0E02\\u0E2d\\u0E07", 0, status
);
486 ADD_DATACHUNK(thaiLineSelection
, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status
);
487 ADD_DATACHUNK(thaiLineSelection
, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status
);
488 ADD_DATACHUNK(thaiLineSelection
, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status
);
489 ADD_DATACHUNK(thaiLineSelection
, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status
);
490 ADD_DATACHUNK(thaiLineSelection
, "Bangkok)", 0, status
);
492 // @suwit - end of changes
495 RuleBasedBreakIterator
* e
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale("th"), status
);
496 if (U_FAILURE(status
))
498 errln("Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak.\n");
503 generalIteratorTest(*e
, thaiLineSelection
);
// TestMaiyamok: line-break test checking that the Thai maiyamok character (U+0E46 in
// the data below) stays attached to the word before it.  Uses a Locale("th") line
// break iterator and generalIteratorTest().
508 void RBBITest::TestMaiyamok()
510 UErrorCode status
= U_ZERO_ERROR
;
511 BITestData
thaiLineSelection(status
);
512 ADD_DATACHUNK(thaiLineSelection
, NULL
, 0, status
); // Break at start of data
513 // the Thai maiyamok character is a shorthand symbol that means "repeat the previous
514 // word". Instead of appearing as a word unto itself, however, it's kept together
515 // with the word before it
516 ADD_DATACHUNK(thaiLineSelection
, "\\u0e44\\u0e1b\\u0e46", 0, status
);
517 ADD_DATACHUNK(thaiLineSelection
, "\\u0e21\\u0e32\\u0e46", 0, status
);
518 ADD_DATACHUNK(thaiLineSelection
, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status
);
519 ADD_DATACHUNK(thaiLineSelection
, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status
);
520 ADD_DATACHUNK(thaiLineSelection
, "\\u0e40\\u0e17\\u0e1e", 0, status
);
521 ADD_DATACHUNK(thaiLineSelection
, "\\u0e41\\u0e25\\u0e30", 0, status
);
522 ADD_DATACHUNK(thaiLineSelection
, "\\u0e40\\u0e03\\u0e35", 0, status
);
523 ADD_DATACHUNK(thaiLineSelection
, "\\u0e22\\u0e07", 0, status
);
524 ADD_DATACHUNK(thaiLineSelection
, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status
);
526 RuleBasedBreakIterator
* e
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(
527 Locale("th"), status
);
529 if (U_FAILURE(status
))
531 errln("Failed to create the BreakIterator for Thai locale in TestMaiyamok.\n");
534 generalIteratorTest(*e
, thaiLineSelection
);
// TestBug3818: with a Thai word break iterator over four repetitions of the same
// four-character word, following(1) and following(0) must both return 4, the start
// of the second word.
540 void RBBITest::TestBug3818() {
541 UErrorCode status
= U_ZERO_ERROR
;
543 // Four Thai words...
544 static const UChar thaiWordData
[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
545 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
546 UnicodeString
thaiStr(thaiWordData
);
548 RuleBasedBreakIterator
* bi
=
549 (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale("th"), status
);
550 if (U_FAILURE(status
) || bi
== NULL
) {
551 errln("Fail at file %s, line %d, status = %s", __FILE__
, __LINE__
, u_errorName(status
));
554 bi
->setText(thaiStr
);
// Starting from inside the first word...
556 int32_t startOfSecondWord
= bi
->following(1);
557 if (startOfSecondWord
!= 4) {
558 errln("Fail at file %s, line %d expected start of word at 4, got %d",
559 __FILE__
, __LINE__
, startOfSecondWord
);
// ...and from its first character.
561 startOfSecondWord
= bi
->following(0);
562 if (startOfSecondWord
!= 4) {
563 errln("Fail at file %s, line %d expected start of word at 4, got %d",
564 __FILE__
, __LINE__
, startOfSecondWord
);
// TestJapaneseWordBreak: word-break test on Japanese text using a Locale("ja") word
// break iterator.  Chunks carry an expected tag of 400, 300 or 0; the trailing //n
// comments give the running text length after each chunk.
570 void RBBITest::TestJapaneseWordBreak() {
571 UErrorCode status
= U_ZERO_ERROR
;
572 BITestData
japaneseWordSelection(status
);
574 ADD_DATACHUNK(japaneseWordSelection
, NULL
, 0, status
); // Break at start of data
575 ADD_DATACHUNK(japaneseWordSelection
, "\\u4ECA\\u65E5", 400, status
); //2
576 ADD_DATACHUNK(japaneseWordSelection
, "\\u306F\\u3044\\u3044", 300, status
); //5
577 ADD_DATACHUNK(japaneseWordSelection
, "\\u5929\\u6C17", 400, status
); //7
578 ADD_DATACHUNK(japaneseWordSelection
, "\\u3067\\u3059\\u306D", 300, status
); //10
579 ADD_DATACHUNK(japaneseWordSelection
, "\\u3002", 0, status
); //11
580 ADD_DATACHUNK(japaneseWordSelection
, "\\u000D\\u000A", 0, status
); //12
582 RuleBasedBreakIterator
* e
= (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(
583 Locale("ja"), status
);
584 if (U_FAILURE(status
))
586 errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");
590 generalIteratorTest(*e
, japaneseWordSelection
);
// TestTrieDict: exercises the trie dictionary classes.
//   1. Reads riwords.txt from the source test-data directory.
//   2. Adds every non-comment line to a MutableTrieDictionary.
//   3. Checks that the word count reported by enumerations of the mutable dictionary,
//      a CompactTrieDictionary built from it, a mutable clone of the compact
//      dictionary, and a compact dictionary built from raw data all match the number
//      of words read from the file.
//   4. Compares the words of the original and cloned mutable dictionaries one by one.
// All owned pointers are declared and set to NULL up front; failures jump to a
// cleanup label ("goto cleanup") that lies outside the lines visible here.
594 void RBBITest::TestTrieDict() {
595 UErrorCode status
= U_ZERO_ERROR
;
598 // Open and read the test data file.
600 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
601 char testFileName
[1000];
// The length check below guards the strcpy/strcat into the fixed-size buffer.
602 if (testDataDirectory
== NULL
|| strlen(testDataDirectory
) + strlen("riwords.txt") + 10 >= sizeof(testFileName
)) {
603 errln("Can't open test data. Path too long.");
606 strcpy(testFileName
, testDataDirectory
);
607 strcat(testFileName
, "riwords.txt");
609 // Items needing deleting at the end
610 MutableTrieDictionary
*mutableDict
= NULL
;
611 CompactTrieDictionary
*compactDict
= NULL
;
612 UnicodeSet
*breaks
= NULL
;
613 UChar
*testFile
= NULL
;
614 StringEnumeration
*enumer1
= NULL
;
615 StringEnumeration
*enumer2
= NULL
;
616 MutableTrieDictionary
*mutable2
= NULL
;
617 StringEnumeration
*cloneEnum
= NULL
;
618 CompactTrieDictionary
*compact2
= NULL
;
621 const UnicodeString
*originalWord
= NULL
;
622 const UnicodeString
*cloneWord
= NULL
;
631 testFile
= ReadAndConvertFile(testFileName
, len
, NULL
, status
);
632 if (U_FAILURE(status
)) {
633 goto cleanup
; /* something went wrong, error already output */
636 mutableDict
= new MutableTrieDictionary(0x0E1C, status
);
637 if (U_FAILURE(status
)) {
638 errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status
));
// Set of line-terminating characters used to split the file into words.
642 breaks
= new UnicodeSet
;
643 breaks
->add(0x000A); // Line Feed
644 breaks
->add(0x000D); // Carriage Return
645 breaks
->add(0x2028); // Line Separator
646 breaks
->add(0x2029); // Paragraph Separator
648 // Now add each non-comment line of the file as a word.
656 if (uc
== 0x0023) { // #comment line, skip
657 while (uc
&& !breaks
->contains(uc
)) {
661 else while (uc
&& !breaks
->contains(uc
)) {
666 mutableDict
->addWord(word
, wordLen
, status
);
667 if (U_FAILURE(status
)) {
668 errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status
));
674 // Find beginning of next line
675 while (uc
&& breaks
->contains(uc
)) {
// Sanity check on the data file itself.
682 if (wordCount
< 50) {
683 errln("Word count (%d) unreasonably small\n", wordCount
);
687 enumer1
= mutableDict
->openWords(status
);
688 if (U_FAILURE(status
)) {
689 errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status
));
694 if (wordCount
!= (testCount
= enumer1
->count(status
))) {
695 errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
696 testCount
, wordCount
, u_errorName(status
));
701 compactDict
= new CompactTrieDictionary(*mutableDict
, status
);
702 if (U_FAILURE(status
)) {
703 errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status
));
707 enumer2
= compactDict
->openWords(status
);
708 if (U_FAILURE(status
)) {
709 errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status
));
713 if (wordCount
!= (testCount
= enumer2
->count(status
))) {
714 errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
715 testCount
, wordCount
, u_errorName(status
));
// The two enumerator types are expected to report different class IDs.
719 if (enumer1
->getDynamicClassID() == enumer2
->getDynamicClassID()) {
720 errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are the same");
728 mutable2
= compactDict
->cloneMutable(status
);
729 if (U_FAILURE(status
)) {
730 errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status
));
734 cloneEnum
= mutable2
->openWords(status
);
735 if (U_FAILURE(status
)) {
736 errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status
));
740 if (wordCount
!= (testCount
= cloneEnum
->count(status
))) {
741 errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
742 testCount
, wordCount
, u_errorName(status
));
746 // Compare original dictionary to clone. Note that we can only compare the same kind of
747 // dictionary as the order of the enumerators is not guaranteed to be the same between
749 enumer1
= mutableDict
->openWords(status
);
750 if (U_FAILURE(status
)) {
751 errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status
));
755 originalWord
= enumer1
->snext(status
);
756 cloneWord
= cloneEnum
->snext(status
);
// Walk both enumerations in lock step until either one runs out or an error occurs.
757 while (U_SUCCESS(status
) && originalWord
!= NULL
&& cloneWord
!= NULL
) {
758 if (*originalWord
!= *cloneWord
) {
759 errln("Original and cloned MutableTrieDictionary word mismatch\n");
762 originalWord
= enumer1
->snext(status
);
763 cloneWord
= cloneEnum
->snext(status
);
766 if (U_FAILURE(status
)) {
767 errln("Enumeration failed: %s\n", u_errorName(status
));
// The pointers are equal here only when both enumerations have returned NULL.
771 if (originalWord
!= cloneWord
) {
772 errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
776 // Test the data copying constructor for CompactTrieDict, and the data access APIs.
777 compact2
= new CompactTrieDictionary(compactDict
->data(), status
);
778 if (U_FAILURE(status
)) {
779 errln("CompactTrieDictionary(const void *,...) failed\n");
783 if (compact2
->dataSize() == 0) {
784 errln("CompactTrieDictionary->dataSize() == 0\n");
788 // Now count the words via the second dictionary
790 enumer1
= compact2
->openWords(status
);
791 if (U_FAILURE(status
)) {
792 errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status
));
796 if (wordCount
!= (testCount
= enumer1
->count(status
))) {
797 errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
798 testCount
, wordCount
, u_errorName(status
));
814 //----------------------------------------------------------------------------
816 // generalIteratorTest Given a break iterator and a set of test data,
817 // Run the tests and report the results.
819 //----------------------------------------------------------------------------
// generalIteratorTest: sets the iterator's text to td.fDataToBreak and runs each of
// the traversal checks below against the same test data.
//   bi  Break iterator under test.
//   td  Test data holding the text and the expected break positions and tags.
820 void RBBITest::generalIteratorTest(RuleBasedBreakIterator
& bi
, BITestData
&td
)
823 bi
.setText(td
.fDataToBreak
);
825 testFirstAndNext(bi
, td
);
827 testLastAndPrevious(bi
, td
);
829 testFollowing(bi
, td
);
830 testPreceding(bi
, td
);
831 testIsBoundary(bi
, td
);
832 doMultipleSelectionTest(bi
, td
);
837 // testFirstAndNext. Run the iterator forwards in the obvious first(), next()
// testFirstAndNext: iterates forwards with first()/next() until DONE, appending each
// position and its getRuleStatus() value to td's actual-result vectors, then calls
// td.checkResults() to compare them with the expected data.
840 void RBBITest::testFirstAndNext(RuleBasedBreakIterator
& bi
, BITestData
&td
)
842 UErrorCode status
= U_ZERO_ERROR
;
847 logln("Test first and next");
848 bi
.setText(td
.fDataToBreak
);
851 for (p
=bi
.first(); p
!=RuleBasedBreakIterator::DONE
; p
=bi
.next()) {
852 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
853 tag
= bi
.getRuleStatus();
854 td
.fActualTags
.addElement(tag
, status
);
856 // If the iterator is not making forward progress, stop.
857 // No need to raise an error here, it'll be detected in the normal check of results.
862 td
.checkResults("testFirstAndNext", this);
867 // TestLastAndPrevious. Run the iterator backwards, starting with last().
// testLastAndPrevious: iterates backwards with last()/previous() until DONE.  Each
// position and tag is inserted at index 0 of td's actual-result vectors, so the
// stored results end up in the order in which they occur in the text, ready for
// td.checkResults().
869 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator
& bi
, BITestData
&td
)
871 UErrorCode status
= U_ZERO_ERROR
;
// Initial "previous position"; its use is in lines not visible here.
873 int32_t lastP
= 0x7ffffffe;
876 logln("Test last and previous");
877 bi
.setText(td
.fDataToBreak
);
880 for (p
=bi
.last(); p
!=RuleBasedBreakIterator::DONE
; p
=bi
.previous()) {
881 // Save break position. Insert it at start of vector of results, shoving
882 // already-saved results further towards the end.
883 td
.fActualBreakPositions
.insertElementAt(p
, 0, status
);
884 // bi.previous(); // TODO: Why does this fix things up????
886 tag
= bi
.getRuleStatus();
887 td
.fActualTags
.insertElementAt(tag
, 0, status
);
889 // If the iterator is not making progress, stop.
890 // No need to raise an error here, it'll be detected in the normal check of results.
895 td
.checkResults("testLastAndPrevious", this);
// testFollowing: records the starting position and its tag, then loops over indices
// from 0 to length+1, saving each newly reached break position and tag.  After the
// loop, an index other than the text length is reported as DONE having been returned
// prematurely; finally td.checkResults() compares everything with the expected data.
899 void RBBITest::testFollowing(RuleBasedBreakIterator
& bi
, BITestData
&td
)
901 UErrorCode status
= U_ZERO_ERROR
;
904 int32_t lastP
= -2; // A value that will never be returned as a break position.
905 // cannot be -1; that is returned for DONE.
908 logln("testFollowing():");
909 bi
.setText(td
.fDataToBreak
);
912 // Save the starting point, since we won't get that out of following.
914 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
915 tag
= bi
.getRuleStatus();
916 td
.fActualTags
.addElement(tag
, status
);
918 for (i
= 0; i
<= td
.fDataToBreak
.length()+1; i
++) {
921 if (p
== RuleBasedBreakIterator::DONE
) {
924 // We've reached a new break position. Save it.
925 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
926 tag
= bi
.getRuleStatus();
927 td
.fActualTags
.addElement(tag
, status
);
931 // The loop normally exits by means of the break in the middle.
932 // Make sure that the index was at the correct position for the break iterator to have
934 if (i
!= td
.fDataToBreak
.length()) {
935 errln("testFollowing(): iterator returned DONE prematurely.");
938 // Full check of all results.
939 td
.checkResults("testFollowing", this);
// testPreceding: backwards counterpart of testFollowing.  A first position and its
// tag are recorded, then the loop runs over indices from the text length down to -1,
// inserting each newly reached break position and tag at the front of td's
// actual-result vectors.  td.checkResults() then compares them with the expected data.
944 void RBBITest::testPreceding(RuleBasedBreakIterator
& bi
, BITestData
&td
) {
945 UErrorCode status
= U_ZERO_ERROR
;
948 int32_t lastP
= 0x7ffffffe;
951 logln("testPreceding():");
952 bi
.setText(td
.fDataToBreak
);
956 td
.fActualBreakPositions
.addElement(p
, status
);
957 tag
= bi
.getRuleStatus();
958 td
.fActualTags
.addElement(tag
, status
);
960 for (i
= td
.fDataToBreak
.length(); i
>=-1; i
--) {
963 if (p
== RuleBasedBreakIterator::DONE
) {
966 // We've reached a new break position. Save it.
967 td
.fActualBreakPositions
.insertElementAt(p
, 0, status
);
969 tag
= bi
.getRuleStatus();
970 td
.fActualTags
.insertElementAt(tag
, 0, status
);
973 // The loop normally exits by means of the break in the middle.
974 // Make sure that the index was at the correct position for the break iterator to have
977 errln("testPreceding(): iterator returned DONE prematurely.");
980 // Full check of all results.
981 td
.checkResults("testPreceding", this);
// testIsBoundary: probes every index from 0 through the text length with
// isBoundary(); each index reported as a boundary is recorded together with the
// iterator's getRuleStatus() value, and td.checkResults() compares the outcome with
// the expected data.
986 void RBBITest::testIsBoundary(RuleBasedBreakIterator
& bi
, BITestData
&td
) {
987 UErrorCode status
= U_ZERO_ERROR
;
991 logln("testIsBoundary():");
992 bi
.setText(td
.fDataToBreak
);
995 for (i
= 0; i
<= td
.fDataToBreak
.length(); i
++) {
996 if (bi
.isBoundary(i
)) {
997 td
.fActualBreakPositions
.addElement(i
, status
); // Save result.
998 tag
= bi
.getRuleStatus();
999 td
.fActualTags
.addElement(tag
, status
);
1002 td
.checkResults("testIsBoundary: ", this);
// doMultipleSelectionTest: consistency check between next(n) and repeated single steps.
// A clone of the iterator is repositioned with first() + next(count) and its result
// is compared with the offset the original reached by stepping with next(); the same
// is then done backwards using last(), next(count) and previous().  The clone is also
// used to check operator== and operator!=.  The clone is deleted at the end.
1007 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator
& iterator
, BITestData
&td
)
1009 iterator
.setText(td
.fDataToBreak
);
1011 RuleBasedBreakIterator
* testIterator
=(RuleBasedBreakIterator
*)iterator
.clone();
1012 int32_t offset
= iterator
.first();
1016 logln("doMultipleSelectionTest text of length: %d", td
.fDataToBreak
.length());
// A fresh clone must compare equal to its source.
1018 if (*testIterator
!= iterator
)
1019 errln("clone() or operator!= failed: two clones compared unequal");
1022 testOffset
= testIterator
->first();
1023 testOffset
= testIterator
->next(count
);
1024 if (offset
!= testOffset
)
1025 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count
+ ", next(n) returned " + testOffset
+ " and next() had " + offset
);
1027 if (offset
!= RuleBasedBreakIterator::DONE
) {
1029 offset
= iterator
.next();
1031 if (offset
!= RuleBasedBreakIterator::DONE
&& *testIterator
== iterator
) {
1032 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count
, offset
);
1033 if (count
> 10000 || offset
== -1) {
1034 errln("operator== failed too many times. Stopping test.");
1036 errln("Does (RuleBasedBreakIterator::DONE == -1)?");
1042 } while (offset
!= RuleBasedBreakIterator::DONE
);
1044 // now do it backwards...
1045 offset
= iterator
.last();
1049 testOffset
= testIterator
->last();
1050 testOffset
= testIterator
->next(count
); // next() with a negative arg is same as previous
1051 if (offset
!= testOffset
)
1052 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count
+ ", next(n) returned " + testOffset
+ " and next() had " + offset
);
1054 if (offset
!= RuleBasedBreakIterator::DONE
) {
1056 offset
= iterator
.previous();
1058 } while (offset
!= RuleBasedBreakIterator::DONE
);
1060 delete testIterator
;
1064 //---------------------------------------------
1068 //---------------------------------------------
// TestEmptyString: run generalIteratorTest() with a default-locale line break
// iterator against test data made of a single empty chunk.
1069 void RBBITest::TestEmptyString()
// NOTE(review): `text` is not referenced by any of the statements visible below - confirm it is still needed.
1071 UnicodeString text
= "";
1072 UErrorCode status
= U_ZERO_ERROR
;
1074 BITestData
x(status
);
1075 ADD_DATACHUNK(x
, "", 0, status
); // Break at start of data
// The BreakIterator returned by the factory is used through the RuleBasedBreakIterator interface.
1076 RuleBasedBreakIterator
* bi
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale::getDefault(), status
);
1077 if (U_FAILURE(status
))
1079 errln("Failed to create the BreakIterator for default locale in TestEmptyString.\n");
1082 generalIteratorTest(*bi
, x
);
// TestGetAvailableLocales: fetch the locale list from
// BreakIterator::getAvailableLocales() and log the name of every entry.
1086 void RBBITest::TestGetAvailableLocales()
1088 int32_t locCount
= 0;
// locCount is passed by reference and receives the number of entries in locList.
1089 const Locale
* locList
= BreakIterator::getAvailableLocales(locCount
);
1092 errln("getAvailableLocales() returned an empty list!");
1093 // Just make sure that it's returning good memory.
1095 for (i
= 0; i
< locCount
; ++i
) {
1096 logln(locList
[i
].getName());
1100 //Testing the BreakIterator::getDisplayName() function
// Two overloads are exercised: (locale, result), which displays in the default
// locale, and (locale, displayLocale, result).
1101 void RBBITest::TestGetDisplayName()
1103 UnicodeString result
;
1105 BreakIterator::getDisplayName(Locale::getUS(), result
);
// The expected English text is only checked when the default locale is en_US.
1106 if (Locale::getDefault() == Locale::getUS() && result
!= "English (United States)")
1107 errln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
// With an explicit US display locale the expected text does not depend on the default locale.
1110 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result
);
1111 if (result
!= "French (France)")
1112 errln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
1116 * Test End Behaviour
// TestEndBehaviour: with a default-locale word break iterator over "boo.",
// expect breaks at 0 (start), 3 (before the period) and 4 (end of string).
1119 void RBBITest::TestEndBehaviour()
1121 UErrorCode status
= U_ZERO_ERROR
;
1122 UnicodeString
testString("boo.");
1123 BreakIterator
*wb
= BreakIterator::createWordInstance(Locale::getDefault(), status
);
1124 if (U_FAILURE(status
))
1126 errln("Failed to create the BreakIterator for default locale in TestEndBehaviour.\n");
1129 wb
->setText(testString
);
1131 if (wb
->first() != 0)
1132 errln("Didn't get break at beginning of string.");
1133 if (wb
->next() != 3)
1134 errln("Didn't get break before period in \"boo.\"");
// Short-circuit: next() is only called when current() is not already at 4,
// and the error fires only if that additional step does not land on 4 either.
1135 if (wb
->current() != 4 && wb
->next() != 4)
1136 errln("Didn't get break at end of string.");
// TestBug4153072: check isBoundary() on a word break iterator whose text is
// adopted from a StringCharacterIterator constructed with an explicit
// begin/end range inside "...Hello, World!...".
1142 void RBBITest::TestBug4153072() {
1143 UErrorCode status
= U_ZERO_ERROR
;
1144 BreakIterator
*iter
= BreakIterator::createWordInstance(Locale::getDefault(), status
);
1145 if (U_FAILURE(status
))
1147 errln("Failed to create the BreakIterator for default locale in TestBug4153072\n");
1150 UnicodeString
str("...Hello, World!...");
// `end` excludes the three trailing dots of the sample string.
1152 int32_t end
= str
.length() - 3;
// The character iterator is heap-allocated and handed over via adoptText().
1155 StringCharacterIterator
* textIterator
= new StringCharacterIterator(str
, begin
, end
, begin
);
1156 iter
->adoptText(textIterator
);
1158 // Note: with the switch to UText, there is no way to restrict the
1159 // iteration range to begin at an index other than zero.
1160 // String character iterators created with a non-zero bound are
1161 // treated by RBBI as being empty.
1162 for (index
= -1; index
< begin
+ 1; ++index
) {
1163 onBoundary
= iter
->isBoundary(index
);
// Only offset 0 is expected to be a boundary; any other outcome is reported.
1164 if (index
== 0? !onBoundary
: onBoundary
) {
1165 errln((UnicodeString
)"Didn't handle isBoundary correctly with offset = " + index
+
1166 " and begin index = " + begin
);
1174 // Test for problem reported by Ashok Matoria on 9 July 2007
1175 // One.<kSoftHyphen><kSpace>Two.
1177 // Sentence break at start (0) and then on calling next() it breaks at
1178 // 'T' of "Two". Now, at this point if I do next() and
1179 // then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
// TestBug5775: with an English sentence break iterator, verify that stepping
// forward and then calling previous() returns to offset 6 of the test string.
1181 void RBBITest::TestBug5775() {
1182 UErrorCode status
= U_ZERO_ERROR
;
1183 BreakIterator
*bi
= BreakIterator::createSentenceInstance(Locale::getEnglish(), status
);
1184 TEST_ASSERT_SUCCESS(status
);
1185 TEST_ASSERT(bi
!= NULL
);
// The two macros above only report; this guard keeps the rest from running without an iterator.
1187 if (U_FAILURE(status
) || bi
== NULL
) {
1188 // TEST_ASSERT already printed error message.
// The literal contains an escaped \u00ad (soft hyphen per the description of this bug);
// NOTE(review): presumably it is unescaped before being given to the iterator - confirm.
1192 UnicodeString
s("One.\\u00ad Two.", -1, US_INV
);
1196 int pos
= bi
->next();
1197 TEST_ASSERT(pos
== 6);
1199 TEST_ASSERT(pos
== 10);
// Stepping back from the later position must land on 6 again.
1200 pos
= bi
->previous();
1201 TEST_ASSERT(pos
== 6);
1208 * Test Japanese Line Break
// TestJapaneseLineBreak: place each character of precedingChars, then of
// followingChars, at index 1 of a three-character test string and check the
// breaks reported by a Japanese line break iterator.
// NOTE(review): the comment just below suggests the body may currently be disabled - confirm.
1211 void RBBITest::TestJapaneseLineBreak()
1214 // Test needs updating some more... Dump it for now.
1217 // Change for Unicode TR 14: Punctuation characters with categories Pi and Pf do not count
1218 // as opening and closing punctuation for line breaking.
1219 // Also, \u30fc and \u30fe are not counted as hyphens. Remove these chars
1220 // from these tests. 6-13-2002
1222 UErrorCode status
= U_ZERO_ERROR
;
// The middle character (index 1) is a placeholder that both loops below overwrite.
1223 UnicodeString testString
= CharsToUnicodeString("\\u4e00x\\u4e8c");
// Per the error messages below, a break is expected before each of these and none after.
1224 UnicodeString precedingChars
= CharsToUnicodeString(
1225 //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
1226 "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");
// Per the error messages below, no break is expected before each of these and one after.
1227 UnicodeString followingChars
= CharsToUnicodeString(
1228 // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"
1229 ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"
1230 // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"
1231 ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"
1232 "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");
1233 BreakIterator
*iter
= BreakIterator::createLineInstance(Locale::getJapan(), status
);
1236 if (U_FAILURE(status
))
1238 errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");
1242 for (i
= 0; i
< precedingChars
.length(); i
++) {
1243 testString
.setCharAt(1, precedingChars
[i
]);
1244 iter
->setText(testString
);
1245 int32_t j
= iter
->first();
1247 errln("ja line break failure: failed to start at 0");
1250 errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars
[i
])
1251 + "' (" + ((int)(precedingChars
[i
])) + ")");
1254 errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars
[i
])
1255 + "' (" + ((int)(precedingChars
[i
])) + ")");
1258 for (i
= 0; i
< followingChars
.length(); i
++) {
1259 testString
.setCharAt(1, followingChars
[i
]);
1260 iter
->setText(testString
);
1261 int j
= iter
->first();
1263 errln("ja line break failure: failed to start at 0");
1266 errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars
[i
])
1267 + "' (" + ((int)(followingChars
[i
])) + ")");
1270 errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars
[i
])
1271 + "' (" + ((int)(followingChars
[i
])) + ")");
1278 //------------------------------------------------------------------------------
1280 // RBBITest::Extended Run RBBI Tests from an external test data file
1282 //------------------------------------------------------------------------------
// Text handed to the break iterator by executeTest() (t->dataToBreak).
1286 UnicodeString dataToBreak
;
// Indexed by offset into dataToBreak: executeTest() treats 0 as "no break expected",
// -1 as "break expected with rule status 0", and any other value as the expected rule status.
1287 UVector32
*expectedBreaks
;
// executeTest: run t->bi over t->dataToBreak forwards and then backwards and
// compare every reported break (and its rule status) with t->expectedBreaks.
// t->srcLine / t->srcCol map an offset in the data back to the line and column
// of the test file and are used only in the error messages.
1292 void RBBITest::executeTest(TestParams
*t
) {
// Guard for test parameters that carry no break iterator.
1297 if (t
->bi
== NULL
) {
1301 t
->bi
->setText(t
->dataToBreak
);
1303 // Run the iterator forward
1306 for (bp
= t
->bi
->first(); bp
!= BreakIterator::DONE
; bp
= t
->bi
->next()) {
1308 // Fail for lack of forward progress.
1309 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
1310 bp
, t
->srcLine
->elementAti(bp
), t
->srcCol
->elementAti(bp
));
1314 // Check that we didn't miss an expected break between the last one
1316 for (i
=prevBP
+1; i
<bp
; i
++) {
1317 if (t
->expectedBreaks
->elementAti(i
) != 0) {
// printStringBreaks() receives the pair {0, offending position} for its diagnostic dump.
1318 int expected
[] = {0, i
};
1319 printStringBreaks(t
->dataToBreak
, expected
, 2);
1320 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1321 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
));
1325 // Check that the break we did find was expected
1326 if (t
->expectedBreaks
->elementAti(bp
) == 0) {
1327 int expected
[] = {0, bp
};
1328 printStringBreaks(t
->dataToBreak
, expected
, 2);
1329 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1330 bp
, t
->srcLine
->elementAti(bp
), t
->srcCol
->elementAti(bp
));
1332 // The break was expected.
1333 // Check that the {nnn} tag value is correct.
1334 int32_t expectedTagVal
= t
->expectedBreaks
->elementAti(bp
);
// -1 marks an expected break that was written without an explicit tag value.
1335 if (expectedTagVal
== -1) {
1338 int32_t line
= t
->srcLine
->elementAti(bp
);
1339 int32_t rs
= ((RuleBasedBreakIterator
*)t
->bi
)->getRuleStatus();
1340 if (rs
!= expectedTagVal
) {
1341 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
1342 " Actual, Expected status = %4d, %4d",
1343 bp
, line
, t
->srcCol
->elementAti(bp
), rs
, expectedTagVal
);
1351 // Verify that there were no missed expected breaks after the last one found
1352 for (i
=prevBP
+1; i
<t
->expectedBreaks
->size(); i
++) {
1353 if (t
->expectedBreaks
->elementAti(i
) != 0) {
1354 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1355 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
));
1360 // Run the iterator backwards, verify that the same breaks are found.
1362 prevBP
= t
->dataToBreak
.length()+2; // start with a phony value for the last break pos seen.
1363 for (bp
= t
->bi
->last(); bp
!= BreakIterator::DONE
; bp
= t
->bi
->previous()) {
1365 // Fail for lack of progress.
1366 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
1367 bp
, t
->srcLine
->elementAti(bp
), t
->srcCol
->elementAti(bp
));
1371 // Check that we didn't miss an expected break between the last one
1372 // and this one. (UVector returns zeros for index out of bounds.)
1373 for (i
=prevBP
-1; i
>bp
; i
--) {
1374 if (t
->expectedBreaks
->elementAti(i
) != 0) {
1375 errln("Reverse Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1376 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
));
1380 // Check that the break we did find was expected
1381 if (t
->expectedBreaks
->elementAti(bp
) == 0) {
1382 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1383 bp
, t
->srcLine
->elementAti(bp
), t
->srcCol
->elementAti(bp
));
1385 // The break was expected.
1386 // Check that the {nnn} tag value is correct.
1387 int32_t expectedTagVal
= t
->expectedBreaks
->elementAti(bp
);
1388 if (expectedTagVal
== -1) {
1391 int line
= t
->srcLine
->elementAti(bp
);
1392 int32_t rs
= ((RuleBasedBreakIterator
*)t
->bi
)->getRuleStatus();
1393 if (rs
!= expectedTagVal
) {
1394 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
1395 " Actual, Expected status = %4d, %4d",
1396 bp
, line
, t
->srcCol
->elementAti(bp
), rs
, expectedTagVal
);
1403 // Verify that there were no missed breaks prior to the last one found
1404 for (i
=prevBP
-1; i
>=0; i
--) {
1405 if (t
->expectedBreaks
->elementAti(i
) != 0) {
// NOTE(review): this loop is part of the reverse pass, yet the message below says
// "Forward" - looks like a copy/paste slip in the message text; confirm and correct.
1406 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1407 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
));
// TestExtended: read the test data file "rbbitst.txt" from the source test data
// directory and interpret it one character at a time with a small state
// machine (parseState). Tags select the kind and locale of break iterator;
// the text of a <data> section is accumulated into tp.dataToBreak together
// with the expected break positions and the file line/column of every
// character. The whole test is compiled only when regular expressions are available.
1413 void RBBITest::TestExtended() {
1414 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1415 UErrorCode status
= U_ZERO_ERROR
;
1418 UnicodeString rules
;
// Three parallel vectors indexed by offset into tp.dataToBreak.
1421 tp
.expectedBreaks
= new UVector32(status
);
1422 tp
.srcLine
= new UVector32(status
);
1423 tp
.srcCol
= new UVector32(status
);
// Matches a <locale name> tag; capture group 1 is the locale name.
1425 RegexMatcher
localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status
);
1426 TEST_ASSERT_SUCCESS(status
);
1430 // Open and read the test data file.
1432 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
1433 char testFileName
[1000];
// NOTE(review): the length check below covers the directory only; the strcat() that
// follows appends "rbbitst.txt", so a directory path close to the buffer size could
// still overflow testFileName - confirm and tighten the check.
1434 if (testDataDirectory
== NULL
|| strlen(testDataDirectory
) >= sizeof(testFileName
)) {
1435 errln("Can't open test data. Path too long.");
1438 strcpy(testFileName
, testDataDirectory
);
1439 strcat(testFileName
, "rbbitst.txt");
1442 UChar
*testFile
= ReadAndConvertFile(testFileName
, len
, "UTF-8", status
);
1443 if (U_FAILURE(status
)) {
1444 return; /* something went wrong, error already output */
1451 // Put the test data into a UnicodeString
1453 UnicodeString
testString(FALSE
, testFile
, len
);
1461 parseState
= PARSE_TAG
;
// savedState is the state restored when a comment ends at a line break.
1463 EParseState savedState
= PARSE_TAG
;
1465 static const UChar CH_LF
= 0x0a;
1466 static const UChar CH_CR
= 0x0d;
1467 static const UChar CH_HASH
= 0x23;
1468 /*static const UChar CH_PERIOD = 0x2e;*/
1469 static const UChar CH_LT
= 0x3c;
1470 static const UChar CH_GT
= 0x3e;
1471 static const UChar CH_BACKSLASH
= 0x5c;
1472 static const UChar CH_BULLET
= 0x2022;
1474 int32_t lineNum
= 1;
1475 int32_t colStart
= 0;
1477 int32_t charIdx
= 0;
1479 int32_t tagValue
= 0; // The numeric value of a <nnn> tag.
// Main scan. The for statement has no increment clause; charIdx is advanced inside the body.
1481 for (charIdx
= 0; charIdx
< len
; ) {
1482 status
= U_ZERO_ERROR
;
1483 UChar c
= testString
.charAt(charIdx
);
1485 if (c
== CH_CR
&& charIdx
<len
&& testString
.charAt(charIdx
) == CH_LF
) {
1486 // treat CRLF as a unit
1490 if (c
== CH_LF
|| c
== CH_CR
) {
// Column of the current character relative to the start of its line (used in messages).
1494 column
= charIdx
- colStart
+ 1;
1496 switch (parseState
) {
// A comment runs to the end of the line; the state saved when it began is then restored.
1498 if (c
== 0x0a || c
== 0x0d) {
1499 parseState
= savedState
;
1506 parseState
= PARSE_COMMENT
;
1507 savedState
= PARSE_TAG
;
1510 if (u_isUWhiteSpace(c
)) {
// Tag recognition. Every comparison starts at charIdx-1, i.e. at the character
// that was just read into c.
1513 if (testString
.compare(charIdx
-1, 6, "<word>") == 0) {
1515 tp
.bi
= BreakIterator::createWordInstance(locale
, status
);
1519 if (testString
.compare(charIdx
-1, 6, "<char>") == 0) {
1521 tp
.bi
= BreakIterator::createCharacterInstance(locale
, status
);
1525 if (testString
.compare(charIdx
-1, 6, "<line>") == 0) {
1527 tp
.bi
= BreakIterator::createLineInstance(locale
, status
);
1531 if (testString
.compare(charIdx
-1, 6, "<sent>") == 0) {
1534 tp
.bi
= BreakIterator::createSentenceInstance(locale
, status
);
1538 if (testString
.compare(charIdx
-1, 7, "<title>") == 0) {
1540 tp
.bi
= BreakIterator::createTitleInstance(locale
, status
);
1545 // <locale loc_name>
1546 localeMatcher
.reset(testString
);
1547 if (localeMatcher
.lookingAt(charIdx
-1, status
)) {
1548 UnicodeString localeName
= localeMatcher
.group(1, status
);
// Locale::createFromName() takes a char * name, so the matched name is extracted first.
1549 char localeName8
[100];
1550 localeName
.extract(0, localeName
.length(), localeName8
, sizeof(localeName8
), 0);
1551 locale
= Locale::createFromName(localeName8
);
1552 charIdx
+= localeMatcher
.group(0, status
).length();
1553 TEST_ASSERT_SUCCESS(status
);
// <data> starts a new test case: switch state and clear everything accumulated so far.
1556 if (testString
.compare(charIdx
-1, 6, "<data>") == 0) {
1557 parseState
= PARSE_DATA
;
1559 tp
.dataToBreak
= "";
1560 tp
.expectedBreaks
->removeAllElements();
1561 tp
.srcCol
->removeAllElements();
1562 tp
.srcLine
->removeAllElements();
1566 errln("line %d: Tag expected in test file.", lineNum
);
1567 parseState
= PARSE_COMMENT
;
1568 savedState
= PARSE_DATA
;
1569 goto end_test
; // Stop the test.
// A bullet marks an expected break at the current end of the data, stored as -1
// (no explicit tag value), along with the file line/column of that position.
1574 if (c
== CH_BULLET
) {
1575 int32_t breakIdx
= tp
.dataToBreak
.length();
1576 tp
.expectedBreaks
->setSize(breakIdx
+1);
1577 tp
.expectedBreaks
->setElementAt(-1, breakIdx
);
1578 tp
.srcLine
->setSize(breakIdx
+1);
1579 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1580 tp
.srcCol
->setSize(breakIdx
+1);
1581 tp
.srcCol
->setElementAt(column
, breakIdx
);
1585 if (testString
.compare(charIdx
-1, 7, "</data>") == 0) {
1586 // Add final entry to mappings from break location to source file position.
1587 // Need one extra because last break position returned is after the
1588 // last char in the data, not at the last char.
1589 tp
.srcLine
->addElement(lineNum
, status
);
1590 tp
.srcCol
->addElement(column
, status
);
1592 parseState
= PARSE_TAG
;
1600 if (testString
.compare(charIdx
-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1601 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1602 // Get the code point from the name and insert it into the test data.
1603 // (Damn, no API takes names in Unicode !!!
1604 // we've got to take it back to char *)
1605 int32_t nameEndIdx
= testString
.indexOf((UChar
)0x7d/*'}'*/, charIdx
);
1606 int32_t nameLength
= nameEndIdx
- (charIdx
+2);
1607 char charNameBuf
[200];
1608 UChar32 theChar
= -1;
1609 if (nameEndIdx
!= -1) {
// This local status intentionally shadows the function-level one for the name lookup.
1610 UErrorCode status
= U_ZERO_ERROR
;
1611 testString
.extract(charIdx
+2, nameLength
, charNameBuf
, sizeof(charNameBuf
));
// Force termination of the extracted name.
1612 charNameBuf
[sizeof(charNameBuf
)-1] = 0;
1613 theChar
= u_charFromName(U_UNICODE_CHAR_NAME
, charNameBuf
, &status
);
1614 if (U_FAILURE(status
)) {
1618 if (theChar
== -1) {
1619 errln("Error in named character in test file at line %d, col %d",
1622 // Named code point was recognized. Insert it
1623 // into the test data.
1624 tp
.dataToBreak
.append(theChar
);
// A loop rather than a single add: one appended code point can grow the string
// by more than one UChar, and every UChar needs a line/column entry.
1625 while (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1626 tp
.srcLine
->addElement(lineNum
, status
);
1627 tp
.srcCol
->addElement(column
, status
);
// Continue scanning just past the closing '}'.
1630 if (nameEndIdx
> charIdx
) {
1631 charIdx
= nameEndIdx
+1;
// "<>" marks an expected break without a tag value (stored as -1, like a bullet).
1640 if (testString
.compare(charIdx
-1, 2, "<>") == 0) {
1642 int32_t breakIdx
= tp
.dataToBreak
.length();
1643 tp
.expectedBreaks
->setSize(breakIdx
+1);
1644 tp
.expectedBreaks
->setElementAt(-1, breakIdx
);
1645 tp
.srcLine
->setSize(breakIdx
+1);
1646 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1647 tp
.srcCol
->setSize(breakIdx
+1);
1648 tp
.srcCol
->setElementAt(column
, breakIdx
);
1654 parseState
= PARSE_NUM
;
1658 if (c
== CH_HASH
&& column
==3) { // TODO: why is column off so far?
1659 parseState
= PARSE_COMMENT
;
1660 savedState
= PARSE_DATA
;
1664 if (c
== CH_BACKSLASH
) {
1665 // Check for \ at end of line, a line continuation.
1666 // Advance over (discard) the newline
1667 UChar32 cp
= testString
.char32At(charIdx
);
1668 if (cp
== CH_CR
&& charIdx
<len
&& testString
.charAt(charIdx
+1) == CH_LF
) {
1670 // Need an extra increment of the input ptr to move over both of them
1673 if (cp
== CH_LF
|| cp
== CH_CR
) {
1680 // Let unescape handle the back slash.
1681 cp
= testString
.unescapeAt(charIdx
);
1683 // Escape sequence was recognized. Insert the char
1684 // into the test data.
1685 tp
.dataToBreak
.append(cp
);
1686 while (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1687 tp
.srcLine
->addElement(lineNum
, status
);
1688 tp
.srcCol
->addElement(column
, status
);
1694 // Not a recognized backslash escape sequence.
1695 // Take the next char as a literal.
1696 // TODO: Should this be an error?
1697 c
= testString
.charAt(charIdx
);
1698 charIdx
= testString
.moveIndex32(charIdx
, 1);
1701 // Normal, non-escaped data char.
1702 tp
.dataToBreak
.append(c
);
1704 // Save the mapping from offset in the data to line/column numbers in
1705 // the original input file. Will be used for better error messages only.
1706 // If there's an expected break before this char, the slot in the mapping
1707 // vector will already be set for this char; don't overwrite it.
1708 if (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1709 tp
.srcLine
->addElement(lineNum
, status
);
1710 tp
.srcCol
->addElement(column
, status
);
1716 // We are parsing an expected numeric tag value, like <1234>,
1717 // within a chunk of data.
1718 if (u_isUWhiteSpace(c
)) {
1723 // Finished the number. Add the info to the expected break data,
1724 // and switch parse state back to doing plain data.
1725 parseState
= PARSE_DATA
;
1726 if (tagValue
== 0) {
1729 int32_t breakIdx
= tp
.dataToBreak
.length();
1730 tp
.expectedBreaks
->setSize(breakIdx
+1);
1731 tp
.expectedBreaks
->setElementAt(tagValue
, breakIdx
);
1732 tp
.srcLine
->setSize(breakIdx
+1);
1733 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1734 tp
.srcCol
->setSize(breakIdx
+1);
1735 tp
.srcCol
->setElementAt(column
, breakIdx
);
// Accumulate the decimal value of the tag one digit at a time.
1740 tagValue
= tagValue
*10 + u_charDigitValue(c
);
1744 errln("Syntax Error in test file at line %d, col %d",
1746 parseState
= PARSE_COMMENT
;
1747 goto end_test
; // Stop the test
// Any ICU error raised while handling the current character aborts the test.
1752 if (U_FAILURE(status
)) {
1753 errln("ICU Error %s while parsing test file at line %d.",
1754 u_errorName(status
), lineNum
);
1755 status
= U_ZERO_ERROR
;
1756 goto end_test
; // Stop the test
1763 delete tp
.expectedBreaks
;
1770 void RBBITest::TestThaiBreaks() {
1771 UErrorCode status
=U_ZERO_ERROR
;
1773 Locale locale
= Locale("th");
1776 0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B,
1777 0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19,
1778 0x0E16, 0x0E49, 0x0E33
1780 int32_t expectedWordResult
[] = {
1781 2, 3, 6, 10, 11, 15, 17, 20, 22
1783 int32_t expectedLineResult
[] = {
1784 3, 6, 11, 15, 17, 20, 22
1786 int32_t size
= sizeof(c
)/sizeof(UChar
);
1787 UnicodeString text
=UnicodeString(c
);
1789 b
= BreakIterator::createWordInstance(locale
, status
);
1790 if (U_FAILURE(status
)) {
1791 errln("Unable to create thai word break iterator.\n");
1796 while ((p
=b
->next())!=BreakIterator::DONE
&& p
< size
) {
1797 if (p
!= expectedWordResult
[index
++]) {
1798 errln("Incorrect break given by thai word break iterator. Expected: %d Got: %d", expectedWordResult
[index
-1], p
);
1803 b
= BreakIterator::createLineInstance(locale
, status
);
1804 if (U_FAILURE(status
)) {
1805 printf("Unable to create thai line break iterator.\n");
1810 while ((p
=b
->next())!=BreakIterator::DONE
&& p
< size
) {
1811 if (p
!= expectedLineResult
[index
++]) {
1812 errln("Incorrect break given by thai line break iterator. Expected: %d Got: %d", expectedLineResult
[index
-1], p
);
1820 //-------------------------------------------------------------------------------
1822 // ReadAndConvertFile Read a text data file, convert it to UChars, and
1823 // return the data in one big UChar * buffer, which the caller must delete.
1826 // fileName: the name of the file, with no directory part. The test data directory
1828 // ulen an out parameter, receives the actual length (in UChars) of the file data.
1829 // encoding The file encoding. If the file contains a BOM, that will override the encoding
1830 // specified here. The BOM, if it exists, will be stripped from the returned data.
1831 // Pass NULL for the system default encoding.
1834 // The file data, converted to UChar.
1835 // The caller must delete this when done with
1836 // delete [] theBuffer;
1838 // TODO: This is a clone of RegexTest::ReadAndConvertFile.
1839 // Move this function to some common place.
1841 //--------------------------------------------------------------------------------
// Reads the whole file into a char buffer, strips a Unicode signature (BOM) if
// one is detected (the BOM's encoding then replaces `encoding`), and converts
// the bytes to UChars with ucnv_toUChars() using a preflight call to size the
// result buffer. `ulen` receives the converted length; `status` receives any
// ICU error (U_FILE_ACCESS_ERROR when the file cannot be opened).
1842 UChar
*RBBITest::ReadAndConvertFile(const char *fileName
, int &ulen
, const char *encoding
, UErrorCode
&status
) {
1843 UChar
*retPtr
= NULL
;
1844 char *fileBuf
= NULL
;
1845 UConverter
* conv
= NULL
;
// Nothing is attempted when the caller passes in a failure status.
1849 if (U_FAILURE(status
)) {
1856 f
= fopen(fileName
, "rb");
1858 dataerrln("[DATA] Error opening test data file %s\n", fileName
);
1859 status
= U_FILE_ACCESS_ERROR
;
// Determine the file size by seeking to the end, then read the file in one call.
1868 fseek( f
, 0, SEEK_END
);
1869 fileSize
= ftell(f
);
// NOTE(review): the buffer is allocated before fileSize is validated by the check
// below; a negative ftell() result would reach new[] - confirm and reorder.
1870 fileBuf
= new char[fileSize
];
1871 fseek(f
, 0, SEEK_SET
);
1872 amt_read
= fread(fileBuf
, 1, fileSize
, f
);
// NOTE(review): `status` is not set on this path in the visible code, so the caller
// may see a NULL buffer together with a success status - confirm.
1873 if (amt_read
!= fileSize
|| fileSize
<= 0) {
1874 errln("Error reading test data file.");
1875 goto cleanUpAndReturn
;
1879 // Look for a Unicode Signature (BOM) on the data just read
1881 int32_t signatureLength
;
1882 const char * fileBufC
;
1883 const char* bomEncoding
;
1886 bomEncoding
= ucnv_detectUnicodeSignature(
1887 fileBuf
, fileSize
, &signatureLength
, &status
);
// Skip the signature bytes and let the detected encoding override the caller's choice.
1888 if(bomEncoding
!=NULL
){
1889 fileBufC
+= signatureLength
;
1890 fileSize
-= signatureLength
;
1891 encoding
= bomEncoding
;
1895 // Open a converter to take the rule file to UTF-16
1897 conv
= ucnv_open(encoding
, &status
);
1898 if (U_FAILURE(status
)) {
1899 goto cleanUpAndReturn
;
1903 // Convert the rules to UChar.
1904 // Preflight first to determine required buffer size.
1906 ulen
= ucnv_toUChars(conv
,
1912 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
1913 // Buffer Overflow is expected from the preflight operation.
1914 status
= U_ZERO_ERROR
;
// One extra UChar beyond the preflighted length.
1916 retPtr
= new UChar
[ulen
+1];
1929 if (U_FAILURE(status
)) {
1930 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
1940 //--------------------------------------------------------------------------------------------
1942 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
1944 //-------------------------------------------------------------------------------------------
// For each of the four iterator kinds (character, word, sentence, line) create
// a default-locale break iterator and, if creation succeeded, run the matching
// Unicode Consortium data file through runUnicodeTestData().
1945 void RBBITest::TestUnicodeFiles() {
1946 RuleBasedBreakIterator
*bi
;
// The same status variable is reused for all four factory calls.
1947 UErrorCode status
= U_ZERO_ERROR
;
1949 bi
= (RuleBasedBreakIterator
*)BreakIterator::createCharacterInstance(Locale::getDefault(), status
);
1950 TEST_ASSERT_SUCCESS(status
);
1951 if (U_SUCCESS(status
)) {
1952 runUnicodeTestData("GraphemeBreakTest.txt", bi
);
1956 bi
= (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale::getDefault(), status
);
1957 TEST_ASSERT_SUCCESS(status
);
1958 if (U_SUCCESS(status
)) {
1959 runUnicodeTestData("WordBreakTest.txt", bi
);
1963 bi
= (RuleBasedBreakIterator
*)BreakIterator::createSentenceInstance(Locale::getDefault(), status
);
1964 TEST_ASSERT_SUCCESS(status
);
1965 if (U_SUCCESS(status
)) {
1966 runUnicodeTestData("SentenceBreakTest.txt", bi
);
1970 bi
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale::getDefault(), status
);
1971 TEST_ASSERT_SUCCESS(status
);
1972 if (U_SUCCESS(status
)) {
1973 runUnicodeTestData("LineBreakTest.txt", bi
);
1979 //--------------------------------------------------------------------------------------------
1981 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
1983 //-------------------------------------------------------------------------------------------
// fileName:  name of the data file, appended to the source test data directory.
// bi:        break iterator to test; passed on to checkUnicodeTestCase().
//
// The file is tokenized with a regular expression. Each line builds up one test
// string plus the list of offsets where a break is expected, and the test case
// is run when the end of the line is reached. The body is compiled only when
// regular expressions are available.
1984 void RBBITest::runUnicodeTestData(const char *fileName
, RuleBasedBreakIterator
*bi
) {
1985 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1986 UErrorCode status
= U_ZERO_ERROR
;
1989 // Open and read the test data file, put it into a UnicodeString.
1991 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
1992 char testFileName
[1000];
// NOTE(review): the length check below covers the directory only; the strcat() that
// follows appends fileName, so the combined path could still overflow testFileName -
// confirm and tighten the check.
1993 if (testDataDirectory
== NULL
|| strlen(testDataDirectory
) >= sizeof(testFileName
)) {
1994 dataerrln("[DATA] Can't open test data. Path too long.");
1997 strcpy(testFileName
, testDataDirectory
);
1998 strcat(testFileName
, fileName
);
2000 logln("Opening data file %s\n", fileName
);
2003 UChar
*testFile
= ReadAndConvertFile(testFileName
, len
, "UTF-8", status
);
// A file-access error is excluded from the assertions here; ReadAndConvertFile()
// already reports it through dataerrln().
2004 if (status
!= U_FILE_ACCESS_ERROR
) {
2005 TEST_ASSERT_SUCCESS(status
);
2006 TEST_ASSERT(testFile
!= NULL
);
2008 if (U_FAILURE(status
) || testFile
== NULL
) {
2009 return; /* something went wrong, error already output */
2011 UnicodeString
testFileAsString(TRUE
, testFile
, len
);
2014 // Parse the test data file using a regular expression.
2015 // Each kind of token is recognized in its own capture group; what type of item was scanned
2016 // is identified by which group had a match.
2018 // Capture Group # 1 2 3 4 5
2019 // Parses this item: divide x hex digits comment \n unrecognized \n
2021 UnicodeString
tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV
);
2022 RegexMatcher
tokenMatcher(tokenExpr
, testFileAsString
, UREGEX_MULTILINE
| UREGEX_DOTALL
, status
);
2023 UnicodeString testString
;
2024 UVector32
breakPositions(status
);
2026 TEST_ASSERT_SUCCESS(status
);
2027 if (U_FAILURE(status
)) {
2032 // Scan through each test case, building up the string to be broken in testString,
2033 // and the positions that should be boundaries in the breakPositions vector.
2035 while (tokenMatcher
.find()) {
2036 if (tokenMatcher
.start(1, status
) >= 0) {
2037 // Scanned a divide sign, indicating a break position in the test data.
// A divide sign seen before any character data (offset 0) is not recorded.
2038 if (testString
.length()>0) {
2039 breakPositions
.addElement(testString
.length(), status
);
2042 else if (tokenMatcher
.start(2, status
) >= 0) {
2043 // Scanned an 'x', meaning no break at this position in the test data
2044 // Nothing to be done here.
2046 else if (tokenMatcher
.start(3, status
) >= 0) {
2047 // Scanned Hex digits. Convert them to binary, append to the character data string.
2048 const UnicodeString
&hexNumber
= tokenMatcher
.group(3, status
);
2049 int length
= hexNumber
.length();
// The digits are copied to a char buffer because strtol() works on char strings.
2052 hexNumber
.extract (0, length
, buf
, sizeof(buf
), US_INV
);
2053 UChar32 c
= (UChar32
)strtol(buf
, NULL
, 16);
2055 testString
.append(c
);
2057 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
2058 fileName
, lineNumber
);
2061 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
2062 fileName
, lineNumber
);
2065 else if (tokenMatcher
.start(4, status
) >= 0) {
2066 // Scanned to end of a line, possibly skipping over a comment in the process.
2067 // If the line from the file contained test data, run the test now.
2069 if (testString
.length() > 0) {
2070 checkUnicodeTestCase(fileName
, lineNumber
, testString
, &breakPositions
, bi
);
2073 // Clear out this test case.
2074 // The string and breakPositions vector will be refilled as the next
2075 // test case is parsed.
2076 testString
.remove();
2077 breakPositions
.removeAllElements();
2080 // Scanner catchall. Something unrecognized appeared on the line.
2082 UnicodeString uToken
= tokenMatcher
.group(0, status
);
2083 uToken
.extract(0, uToken
.length(), token
, (uint32_t)sizeof(token
));
// Force termination of the extracted token before it is printed with %s.
2084 token
[sizeof(token
)-1] = 0;
2085 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName
, lineNumber
, token
);
2087 // Clean up, in preparation for continuing with the next line.
2088 testString
.remove();
2089 breakPositions
.removeAllElements();
2092 TEST_ASSERT_SUCCESS(status
);
2093 if (U_FAILURE(status
)) {
2099 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
2102 //--------------------------------------------------------------------------------------------
2104 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
2105 // test data files. Do only a simple, forward-only check -
2106 // this test is mostly to check that ICU and the Unicode
2107 // data agree with each other.
2109 //--------------------------------------------------------------------------------------------
// testFileName, lineNumber:  used only to identify the test case in error messages.
// The iterator's break positions are compared one by one, in increasing order,
// with the entries of breakPositions; the first mismatch of either kind
// (unexpected break, or expected break not found) is reported.
2110 void RBBITest::checkUnicodeTestCase(const char *testFileName
, int lineNumber
,
2111 const UnicodeString
&testString
, // Text data to be broken
2112 UVector32
*breakPositions
, // Positions where breaks should be found.
2113 RuleBasedBreakIterator
*bi
) {
2114 int32_t pos
; // Break Position in the test string
2115 int32_t expectedI
= 0; // Index of expected break position in the vector of expected results.
2116 int32_t expectedPos
; // Expected break position (index into test string)
2118 bi
->setText(testString
);
2122 while (pos
!= BreakIterator::DONE
) {
// The iterator produced a break after all expected positions were consumed.
2123 if (expectedI
>= breakPositions
->size()) {
2124 errln("Test file \"%s\", line %d, unexpected break found at position %d",
2125 testFileName
, lineNumber
, pos
);
2128 expectedPos
= breakPositions
->elementAti(expectedI
);
// A break earlier than the next expected position is an extra break ...
2129 if (pos
< expectedPos
) {
2130 errln("Test file \"%s\", line %d, unexpected break found at position %d",
2131 testFileName
, lineNumber
, pos
);
// ... and one beyond it means the expected position was skipped.
2134 if (pos
> expectedPos
) {
2135 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
2136 testFileName
, lineNumber
, expectedPos
);
// The iterator finished while expected positions were still outstanding.
2143 if (pos
==BreakIterator::DONE
&& expectedI
<breakPositions
->size()) {
2144 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
2145 testFileName
, lineNumber
, breakPositions
->elementAti(expectedI
));
2151 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
2152 //---------------------------------------------------------------------------------------
2154 // class RBBIMonkeyKind
2156 // Monkey Test for Break Iteration
2157 // Abstract interface class. Concrete derived classes independently
2158 // implement the break rules for different iterator types.
2160 // The Monkey Test itself doesn't know which type of break iterator it is
2161 // testing, but works purely in terms of the interface defined here.
2163 //---------------------------------------------------------------------------------------
2164 class RBBIMonkeyKind
{
2166 // Return a UVector of UnicodeSets, representing the character classes used
2167 // for this type of iterator.
2168 virtual UVector
*charClasses() = 0;
2170 // Set the test text on which subsequent calls to next() will operate
2171 virtual void setText(const UnicodeString
&s
) = 0;
2173 // Find the next break position, starting from the prev break position, or from zero.
2174 // Return -1 after reaching end of string.
2175 virtual int32_t next(int32_t i
) = 0;
2177 virtual ~RBBIMonkeyKind();
// Error code recorded during construction. The base class constructor sets it
// to U_ZERO_ERROR; the derived class constructors copy a failing status into
// it, and the derived next() implementations test it with U_FAILURE().
2178 UErrorCode deferredStatus
;
2187 RBBIMonkeyKind::RBBIMonkeyKind() {
2188 deferredStatus
= U_ZERO_ERROR
;
2191 RBBIMonkeyKind::~RBBIMonkeyKind() {
2195 //----------------------------------------------------------------------------------------
2197 // Random Numbers. Similar to standard lib rand() and srand()
2198 // Not using library to
2199 // 1. Get same results on all platforms.
2200 // 2. Get access to current seed, to more easily reproduce failures.
2202 //---------------------------------------------------------------------------------------
// Current state of the generator; every call to m_rand() overwrites it.
2203 static uint32_t m_seed
= 1;
// Advance m_seed by one linear congruential step (multiplier 1103515245,
// increment 12345, in uint32_t arithmetic) and return
// (m_seed / 65536) % 32768, i.e. a value in the range 0..32767.
2205 static uint32_t m_rand()
2207 m_seed
= m_seed
* 1103515245 + 12345;
2208 return (uint32_t)(m_seed
/65536) % 32768;
2212 //------------------------------------------------------------------------------------------
2214 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
2215 // of RBBIMonkeyKind.
2217 //------------------------------------------------------------------------------------------
2218 class RBBICharMonkey
: public RBBIMonkeyKind
{
2221 virtual ~RBBICharMonkey();
2222 virtual UVector
*charClasses();
2223 virtual void setText(const UnicodeString
&s
);
2224 virtual int32_t next(int32_t i
);
2228 UnicodeSet
*fCRLFSet
;
2229 UnicodeSet
*fControlSet
;
2230 UnicodeSet
*fExtendSet
;
2231 UnicodeSet
*fPrependSet
;
2232 UnicodeSet
*fSpacingSet
;
2237 UnicodeSet
*fLVTSet
;
2238 UnicodeSet
*fHangulSet
;
2239 UnicodeSet
*fAnySet
;
2241 const UnicodeString
*fText
;
2245 RBBICharMonkey::RBBICharMonkey() {
2246 UErrorCode status
= U_ZERO_ERROR
;
2250 fCRLFSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status
);
2251 fControlSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status
);
2252 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status
);
2253 fPrependSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status
);
2254 fSpacingSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status
);
2255 fLSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status
);
2256 fVSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status
);
2257 fTSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status
);
2258 fLVSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status
);
2259 fLVTSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status
);
2260 fHangulSet
= new UnicodeSet();
2261 fHangulSet
->addAll(*fLSet
);
2262 fHangulSet
->addAll(*fVSet
);
2263 fHangulSet
->addAll(*fTSet
);
2264 fHangulSet
->addAll(*fLVSet
);
2265 fHangulSet
->addAll(*fLVTSet
);
2266 fAnySet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0000-\\U0010ffff]"), status
);
2268 fSets
= new UVector(status
);
2269 fSets
->addElement(fCRLFSet
, status
);
2270 fSets
->addElement(fControlSet
, status
);
2271 fSets
->addElement(fExtendSet
, status
);
2272 fSets
->addElement(fPrependSet
, status
);
2273 fSets
->addElement(fSpacingSet
, status
);
2274 fSets
->addElement(fHangulSet
, status
);
2275 fSets
->addElement(fAnySet
, status
);
2276 if (U_FAILURE(status
)) {
2277 deferredStatus
= status
;
2282 void RBBICharMonkey::setText(const UnicodeString
&s
) {
2288 int32_t RBBICharMonkey::next(int32_t prevPos
) {
2289 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2290 // break position being tested. The candidate break
2291 // location is before p2.
2295 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
2297 if (U_FAILURE(deferredStatus
)) {
2301 // Previous break at end of string. return DONE.
2302 if (prevPos
>= fText
->length()) {
2305 p0
= p1
= p2
= p3
= prevPos
;
2306 c3
= fText
->char32At(prevPos
);
2309 // Loop runs once per "significant" character position in the input text.
2311 // Move all of the positions forward in the input string.
2316 // Advance p3 by one codepoint
2317 p3
= fText
->moveIndex32(p3
, 1);
2318 c3
= fText
->char32At(p3
);
2321 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2324 if (p2
== fText
->length()) {
2325 // Reached end of string. Always a break position.
2330 // No Extend or Format characters may appear between the CR and LF,
2331 // which requires the additional check for p2 immediately following p1.
2333 if (c1
==0x0D && c2
==0x0A && p1
==(p2
-1)) {
2337 // Rule (GB4). ( Control | CR | LF ) <break>
2338 if (fControlSet
->contains(c1
) ||
2344 // Rule (GB5) <break> ( Control | CR | LF )
2346 if (fControlSet
->contains(c2
) ||
2353 // Rule (GB6) L x ( L | V | LV | LVT )
2354 if (fLSet
->contains(c1
) &&
2355 (fLSet
->contains(c2
) ||
2356 fVSet
->contains(c2
) ||
2357 fLVSet
->contains(c2
) ||
2358 fLVTSet
->contains(c2
))) {
2362 // Rule (GB7) ( LV | V ) x ( V | T )
2363 if ((fLVSet
->contains(c1
) || fVSet
->contains(c1
)) &&
2364 (fVSet
->contains(c2
) || fTSet
->contains(c2
))) {
2368 // Rule (GB8) ( LVT | T) x T
2369 if ((fLVTSet
->contains(c1
) || fTSet
->contains(c1
)) &&
2370 fTSet
->contains(c2
)) {
2374 // Rule (GB9) x Extend
2375 if (fExtendSet
->contains(c2
)) {
2379 // Rule (GB9a) x SpacingMark
2380 if (fSpacingSet
->contains(c2
)) {
2384 // Rule (GB9b) Prepend x
2385 if (fPrependSet
->contains(c1
)) {
2389 // Rule (GB10) Any <break> Any
2399 UVector
*RBBICharMonkey::charClasses() {
2404 RBBICharMonkey::~RBBICharMonkey() {
2420 //------------------------------------------------------------------------------------------
2422 // class RBBIWordMonkey Word Break specific implementation
2423 // of RBBIMonkeyKind.
2425 //------------------------------------------------------------------------------------------
2426 class RBBIWordMonkey
: public RBBIMonkeyKind
{
2429 virtual ~RBBIWordMonkey();
2430 virtual UVector
*charClasses();
2431 virtual void setText(const UnicodeString
&s
);
2432 virtual int32_t next(int32_t i
);
2438 UnicodeSet
*fNewlineSet
;
2439 UnicodeSet
*fKatakanaSet
;
2440 UnicodeSet
*fALetterSet
;
2441 UnicodeSet
*fMidNumLetSet
;
2442 UnicodeSet
*fMidLetterSet
;
2443 UnicodeSet
*fMidNumSet
;
2444 UnicodeSet
*fNumericSet
;
2445 UnicodeSet
*fFormatSet
;
2446 UnicodeSet
*fOtherSet
;
2447 UnicodeSet
*fExtendSet
;
2448 UnicodeSet
*fExtendNumLetSet
;
2450 RegexMatcher
*fMatcher
;
2452 const UnicodeString
*fText
;
// Constructor. Builds one UnicodeSet per Word_Break property value, derives
// fOtherSet from the complement of those sets, and registers the sets in
// fSets. A failing status is copied into deferredStatus.
2456 RBBIWordMonkey::RBBIWordMonkey()
2458 UErrorCode status
= U_ZERO_ERROR
;
2460 fSets
= new UVector(status
);
// One set per Word_Break property value, built from a property pattern.
2462 fCRSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status
);
2463 fLFSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status
);
2464 fNewlineSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status
);
2465 fALetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status
);
2466 fKatakanaSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status
);
2467 fMidNumLetSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status
);
2468 fMidLetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status
);
2469 fMidNumSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status
);
2470 fNumericSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status
);
2471 fFormatSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status
);
2472 fExtendNumLetSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status
);
2473 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status
);
2475 fOtherSet
= new UnicodeSet();
2476 if(U_FAILURE(status
)) {
2477 deferredStatus
= status
;
// fOtherSet is created empty above; complementing it and then removing the
// named classes leaves the code points that belong to none of them.
// NOTE(review): fMidNumLetSet is not removed from fOtherSet in the list
// below, although it is added to fSets as a class of its own - this looks
// like an omission; confirm before relying on the sets being disjoint.
2481 fOtherSet
->complement();
2482 fOtherSet
->removeAll(*fCRSet
);
2483 fOtherSet
->removeAll(*fLFSet
);
2484 fOtherSet
->removeAll(*fNewlineSet
);
2485 fOtherSet
->removeAll(*fKatakanaSet
);
2486 fOtherSet
->removeAll(*fALetterSet
);
2487 fOtherSet
->removeAll(*fMidLetterSet
);
2488 fOtherSet
->removeAll(*fMidNumSet
);
2489 fOtherSet
->removeAll(*fNumericSet
);
2490 fOtherSet
->removeAll(*fExtendNumLetSet
);
2491 fOtherSet
->removeAll(*fFormatSet
);
2492 fOtherSet
->removeAll(*fExtendSet
);
2493 // Inhibit dictionary characters from being tested at all.
2494 fOtherSet
->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status
));
// Register every character class in fSets.
2496 fSets
->addElement(fCRSet
, status
);
2497 fSets
->addElement(fLFSet
, status
);
2498 fSets
->addElement(fNewlineSet
, status
);
2499 fSets
->addElement(fALetterSet
, status
);
2500 fSets
->addElement(fKatakanaSet
, status
);
2501 fSets
->addElement(fMidLetterSet
, status
);
2502 fSets
->addElement(fMidNumLetSet
, status
);
2503 fSets
->addElement(fMidNumSet
, status
);
2504 fSets
->addElement(fNumericSet
, status
);
2505 fSets
->addElement(fFormatSet
, status
);
2506 fSets
->addElement(fExtendSet
, status
);
2507 fSets
->addElement(fOtherSet
, status
);
2508 fSets
->addElement(fExtendNumLetSet
, status
);
// Record any failure from the set registration above in deferredStatus.
2510 if (U_FAILURE(status
)) {
2511 deferredStatus
= status
;
2515 void RBBIWordMonkey::setText(const UnicodeString
&s
) {
2520 int32_t RBBIWordMonkey::next(int32_t prevPos
) {
2521 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2522 // break position being tested. The candidate break
2523 // location is before p2.
2527 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
2529 if (U_FAILURE(deferredStatus
)) {
2533 // Prev break at end of string. return DONE.
2534 if (prevPos
>= fText
->length()) {
2537 p0
= p1
= p2
= p3
= prevPos
;
2538 c3
= fText
->char32At(prevPos
);
2541 // Loop runs once per "significant" character position in the input text.
2543 // Move all of the positions forward in the input string.
2548 // Advance p3 by X(Extend | Format)* Rule 4
2549 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2551 p3
= fText
->moveIndex32(p3
, 1);
2552 c3
= fText
->char32At(p3
);
2553 if (fCRSet
->contains(c2
) || fLFSet
->contains(c2
) || fNewlineSet
->contains(c2
)) {
2557 while (fFormatSet
->contains(c3
) || fExtendSet
->contains(c3
));
2561 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2564 if (p2
== fText
->length()) {
2565 // Reached end of string. Always a break position.
2570 // No Extend or Format characters may appear between the CR and LF,
2571 // which requires the additional check for p2 immediately following p1.
2573 if (c1
==0x0D && c2
==0x0A) {
2577 // Rule (3a) Break before and after newlines (including CR and LF)
2579 if (fCRSet
->contains(c1
) || fLFSet
->contains(c1
) || fNewlineSet
->contains(c1
)) {
2582 if (fCRSet
->contains(c2
) || fLFSet
->contains(c2
) || fNewlineSet
->contains(c2
)) {
2586 // Rule (5). ALetter x ALetter
2587 if (fALetterSet
->contains(c1
) &&
2588 fALetterSet
->contains(c2
)) {
2592 // Rule (6) ALetter x (MidLetter | MidNumLet) ALetter
2594 if ( fALetterSet
->contains(c1
) &&
2595 (fMidLetterSet
->contains(c2
) || fMidNumLetSet
->contains(c2
)) &&
2596 fALetterSet
->contains(c3
)) {
2601 // Rule (7) ALetter (MidLetter | MidNumLet) x ALetter
2602 if (fALetterSet
->contains(c0
) &&
2603 (fMidLetterSet
->contains(c1
) || fMidNumLetSet
->contains(c1
)) &&
2604 fALetterSet
->contains(c2
)) {
2608 // Rule (8) Numeric x Numeric
2609 if (fNumericSet
->contains(c1
) &&
2610 fNumericSet
->contains(c2
)) {
2614 // Rule (9) ALetter x Numeric
2615 if (fALetterSet
->contains(c1
) &&
2616 fNumericSet
->contains(c2
)) {
2620 // Rule (10) Numeric x ALetter
2621 if (fNumericSet
->contains(c1
) &&
2622 fALetterSet
->contains(c2
)) {
2626 // Rule (11) Numeric (MidNum | MidNumLet) x Numeric
2627 if (fNumericSet
->contains(c0
) &&
2628 (fMidNumSet
->contains(c1
) || fMidNumLetSet
->contains(c1
)) &&
2629 fNumericSet
->contains(c2
)) {
2633 // Rule (12) Numeric x (MidNum | MidNumLet) Numeric
2634 if (fNumericSet
->contains(c1
) &&
2635 (fMidNumSet
->contains(c2
) || fMidNumLetSet
->contains(c2
)) &&
2636 fNumericSet
->contains(c3
)) {
2640 // Rule (13) Katakana x Katakana
2641 if (fKatakanaSet
->contains(c1
) &&
2642 fKatakanaSet
->contains(c2
)) {
2647 if ((fALetterSet
->contains(c1
) || fNumericSet
->contains(c1
) ||
2648 fKatakanaSet
->contains(c1
) || fExtendNumLetSet
->contains(c1
)) &&
2649 fExtendNumLetSet
->contains(c2
)) {
2654 if (fExtendNumLetSet
->contains(c1
) &&
2655 (fALetterSet
->contains(c2
) || fNumericSet
->contains(c2
) ||
2656 fKatakanaSet
->contains(c2
))) {
2660 // Rule 14. Break found here.
2669 UVector
*RBBIWordMonkey::charClasses() {
2674 RBBIWordMonkey::~RBBIWordMonkey() {
2679 delete fKatakanaSet
;
2681 delete fMidNumLetSet
;
2682 delete fMidLetterSet
;
2687 delete fExtendNumLetSet
;
2694 //------------------------------------------------------------------------------------------
2696 // class RBBISentMonkey Sentence Break specific implementation
2697 // of RBBIMonkeyKind.
2699 //------------------------------------------------------------------------------------------
2700 class RBBISentMonkey
: public RBBIMonkeyKind
{
2703 virtual ~RBBISentMonkey();
2704 virtual UVector
*charClasses();
2705 virtual void setText(const UnicodeString
&s
);
2706 virtual int32_t next(int32_t i
);
2708 int moveBack(int posFrom
);
2709 int moveForward(int posFrom
);
2710 UChar32
cAt(int pos
);
2714 UnicodeSet
*fSepSet
;
2715 UnicodeSet
*fFormatSet
;
2717 UnicodeSet
*fLowerSet
;
2718 UnicodeSet
*fUpperSet
;
2719 UnicodeSet
*fOLetterSet
;
2720 UnicodeSet
*fNumericSet
;
2721 UnicodeSet
*fATermSet
;
2722 UnicodeSet
*fSContinueSet
;
2723 UnicodeSet
*fSTermSet
;
2724 UnicodeSet
*fCloseSet
;
2725 UnicodeSet
*fOtherSet
;
2726 UnicodeSet
*fExtendSet
;
2728 const UnicodeString
*fText
;
2732 RBBISentMonkey::RBBISentMonkey()
2734 UErrorCode status
= U_ZERO_ERROR
;
2736 fSets
= new UVector(status
);
2738 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2739 // set and made into character classes of their own. For the monkey impl,
2740 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2741 fSepSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status
);
2742 fFormatSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status
);
2743 fSpSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status
);
2744 fLowerSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status
);
2745 fUpperSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status
);
2746 fOLetterSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status
);
2747 fNumericSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status
);
2748 fATermSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status
);
2749 fSContinueSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status
);
2750 fSTermSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status
);
2751 fCloseSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status
);
2752 fExtendSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status
);
2753 fOtherSet
= new UnicodeSet();
2755 if(U_FAILURE(status
)) {
2756 deferredStatus
= status
;
2760 fOtherSet
->complement();
2761 fOtherSet
->removeAll(*fSepSet
);
2762 fOtherSet
->removeAll(*fFormatSet
);
2763 fOtherSet
->removeAll(*fSpSet
);
2764 fOtherSet
->removeAll(*fLowerSet
);
2765 fOtherSet
->removeAll(*fUpperSet
);
2766 fOtherSet
->removeAll(*fOLetterSet
);
2767 fOtherSet
->removeAll(*fNumericSet
);
2768 fOtherSet
->removeAll(*fATermSet
);
2769 fOtherSet
->removeAll(*fSContinueSet
);
2770 fOtherSet
->removeAll(*fSTermSet
);
2771 fOtherSet
->removeAll(*fCloseSet
);
2772 fOtherSet
->removeAll(*fExtendSet
);
2774 fSets
->addElement(fSepSet
, status
);
2775 fSets
->addElement(fFormatSet
, status
);
2776 fSets
->addElement(fSpSet
, status
);
2777 fSets
->addElement(fLowerSet
, status
);
2778 fSets
->addElement(fUpperSet
, status
);
2779 fSets
->addElement(fOLetterSet
, status
);
2780 fSets
->addElement(fNumericSet
, status
);
2781 fSets
->addElement(fATermSet
, status
);
2782 fSets
->addElement(fSContinueSet
, status
);
2783 fSets
->addElement(fSTermSet
, status
);
2784 fSets
->addElement(fCloseSet
, status
);
2785 fSets
->addElement(fOtherSet
, status
);
2786 fSets
->addElement(fExtendSet
, status
);
2788 if (U_FAILURE(status
)) {
2789 deferredStatus
= status
;
2795 void RBBISentMonkey::setText(const UnicodeString
&s
) {
2799 UVector
*RBBISentMonkey::charClasses() {
2804 // moveBack() Find the "significant" code point preceding the index i.
2805 // Skips over ($Extend | $Format)* .
2807 int RBBISentMonkey::moveBack(int i
) {
2814 j
= fText
->moveIndex32(j
, -1);
2815 c
= fText
->char32At(j
);
2817 while (j
>0 &&(fFormatSet
->contains(c
) || fExtendSet
->contains(c
)));
2823 int RBBISentMonkey::moveForward(int i
) {
2824 if (i
>=fText
->length()) {
2825 return fText
->length();
2830 j
= fText
->moveIndex32(j
, 1);
2833 while (fFormatSet
->contains(c
) || fExtendSet
->contains(c
));
// cAt()  Fetch the code point at index pos of the test text (fText).
//        An index below zero, or at/after fText->length(), is handled by
//        the guard instead of being passed to char32At().
//        NOTE(review): the out-of-range result is presumably -1 (next()
//        compares a fetched character against -1) - confirm.
2837 UChar32
RBBISentMonkey::cAt(int pos
) {
2838 if (pos
<0 || pos
>=fText
->length()) {
2841 return fText
->char32At(pos
);
2845 int32_t RBBISentMonkey::next(int32_t prevPos
) {
2846 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2847 // break position being tested. The candidate break
2848 // location is before p2.
2852 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
2855 if (U_FAILURE(deferredStatus
)) {
2859 // Prev break at end of string. return DONE.
2860 if (prevPos
>= fText
->length()) {
2863 p0
= p1
= p2
= p3
= prevPos
;
2864 c3
= fText
->char32At(prevPos
);
2867 // Loop runs once per "significant" character position in the input text.
2869 // Move all of the positions forward in the input string.
2874 // Advance p3 by X(Extend | Format)* Rule 4
2875 p3
= moveForward(p3
);
2879 if (c1
==0x0d && c2
==0x0a && p2
==(p1
+1)) {
2883 // Rule (4). Sep <break>
2884 if (fSepSet
->contains(c1
)) {
2885 p2
= p1
+1; // Separators don't combine with Extend or Format.
2889 if (p2
>= fText
->length()) {
2890 // Reached end of string. Always a break position.
2894 if (p2
== prevPos
) {
2895 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2899 // Rule (6). ATerm x Numeric
2900 if (fATermSet
->contains(c1
) && fNumericSet
->contains(c2
)) {
2904 // Rule (7). Upper ATerm x Upper
2905 if (fUpperSet
->contains(c0
) && fATermSet
->contains(c1
) && fUpperSet
->contains(c2
)) {
2909 // Rule (8) ATerm Close* Sp* x (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
2910 // Note: STerm | ATerm are added to the negated part of the expression by a
2911 // note to the Unicode 5.0 documents.
2913 while (fSpSet
->contains(cAt(p8
))) {
2916 while (fCloseSet
->contains(cAt(p8
))) {
2919 if (fATermSet
->contains(cAt(p8
))) {
2923 if (c
==-1 || fOLetterSet
->contains(c
) || fUpperSet
->contains(c
) ||
2924 fLowerSet
->contains(c
) || fSepSet
->contains(c
) ||
2925 fATermSet
->contains(c
) || fSTermSet
->contains(c
)) {
2928 p8
= moveForward(p8
);
2930 if (fLowerSet
->contains(cAt(p8
))) {
2935 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2936 if (fSContinueSet
->contains(c2
) || fSTermSet
->contains(c2
) || fATermSet
->contains(c2
)) {
2938 while (fSpSet
->contains(cAt(p8
))) {
2941 while (fCloseSet
->contains(cAt(p8
))) {
2945 if (fSTermSet
->contains(c
) || fATermSet
->contains(c
)) {
2950 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
2952 while (fCloseSet
->contains(cAt(p9
))) {
2956 if ((fSTermSet
->contains(c
) || fATermSet
->contains(c
))) {
2957 if (fCloseSet
->contains(c2
) || fSpSet
->contains(c2
) || fSepSet
->contains(c2
)) {
2962 // Rule (10) (STerm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
2964 while (fSpSet
->contains(cAt(p10
))) {
2965 p10
= moveBack(p10
);
2967 while (fCloseSet
->contains(cAt(p10
))) {
2968 p10
= moveBack(p10
);
2970 if (fSTermSet
->contains(cAt(p10
)) || fATermSet
->contains(cAt(p10
))) {
2971 if (fSpSet
->contains(c2
) || fSepSet
->contains(c2
)) {
2976 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>
2978 if (fSepSet
->contains(cAt(p11
))) {
2979 p11
= moveBack(p11
);
2981 while (fSpSet
->contains(cAt(p11
))) {
2982 p11
= moveBack(p11
);
2984 while (fCloseSet
->contains(cAt(p11
))) {
2985 p11
= moveBack(p11
);
2987 if (fSTermSet
->contains(cAt(p11
)) || fATermSet
->contains(cAt(p11
))) {
2991 // Rule (12) Any x Any
2998 RBBISentMonkey::~RBBISentMonkey() {
3008 delete fSContinueSet
;
3017 //-------------------------------------------------------------------------------------------
3021 //-------------------------------------------------------------------------------------------
3023 class RBBILineMonkey
: public RBBIMonkeyKind
{
3026 virtual ~RBBILineMonkey();
3027 virtual UVector
*charClasses();
3028 virtual void setText(const UnicodeString
&s
);
3029 virtual int32_t next(int32_t i
);
3030 virtual void rule9Adjust(int32_t pos
, UChar32
*posChar
, int32_t *nextPos
, UChar32
*nextChar
);
3071 BreakIterator
*fCharBI
;
3073 const UnicodeString
*fText
;
3074 int32_t *fOrigPositions
;
3076 RegexMatcher
*fNumberMatcher
;
3077 RegexMatcher
*fLB11Matcher
;
3081 RBBILineMonkey::RBBILineMonkey()
3083 UErrorCode status
= U_ZERO_ERROR
;
3085 fSets
= new UVector(status
);
3087 fBK
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status
);
3088 fCR
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status
);
3089 fLF
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status
);
3090 fCM
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status
);
3091 fNL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status
);
3092 fWJ
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status
);
3093 fZW
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status
);
3094 fGL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status
);
3095 fCB
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status
);
3096 fSP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status
);
3097 fB2
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status
);
3098 fBA
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status
);
3099 fBB
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status
);
3100 fHY
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status
);
3101 fH2
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status
);
3102 fH3
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status
);
3103 fCL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status
);
3104 fEX
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status
);
3105 fIN
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status
);
3106 fJL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status
);
3107 fJV
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status
);
3108 fJT
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status
);
3109 fNS
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status
);
3110 fOP
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status
);
3111 fQU
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status
);
3112 fIS
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status
);
3113 fNU
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status
);
3114 fPO
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status
);
3115 fPR
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status
);
3116 fSY
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status
);
3117 fAI
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status
);
3118 fAL
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status
);
3119 fID
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status
);
3120 fSA
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status
);
3121 fSG
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status
);
3122 fXX
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status
);
3124 if (U_FAILURE(status
)) {
3125 deferredStatus
= status
;
3127 fNumberMatcher
= NULL
;
3131 fAL
->addAll(*fXX
); // Default behavior for XX is identical to AL
3132 fAL
->addAll(*fAI
); // Default behavior for AI is identical to AL
3133 fAL
->addAll(*fSA
); // Default behavior for SA is XX, which defaults to AL
3134 fAL
->addAll(*fSG
); // Default behavior for SG is identical to AL.
3136 fSets
->addElement(fBK
, status
);
3137 fSets
->addElement(fCR
, status
);
3138 fSets
->addElement(fLF
, status
);
3139 fSets
->addElement(fCM
, status
);
3140 fSets
->addElement(fNL
, status
);
3141 fSets
->addElement(fWJ
, status
);
3142 fSets
->addElement(fZW
, status
);
3143 fSets
->addElement(fGL
, status
);
3144 fSets
->addElement(fCB
, status
);
3145 fSets
->addElement(fSP
, status
);
3146 fSets
->addElement(fB2
, status
);
3147 fSets
->addElement(fBA
, status
);
3148 fSets
->addElement(fBB
, status
);
3149 fSets
->addElement(fHY
, status
);
3150 fSets
->addElement(fH2
, status
);
3151 fSets
->addElement(fH3
, status
);
3152 fSets
->addElement(fCL
, status
);
3153 fSets
->addElement(fEX
, status
);
3154 fSets
->addElement(fIN
, status
);
3155 fSets
->addElement(fJL
, status
);
3156 fSets
->addElement(fJT
, status
);
3157 fSets
->addElement(fJV
, status
);
3158 fSets
->addElement(fNS
, status
);
3159 fSets
->addElement(fOP
, status
);
3160 fSets
->addElement(fQU
, status
);
3161 fSets
->addElement(fIS
, status
);
3162 fSets
->addElement(fNU
, status
);
3163 fSets
->addElement(fPO
, status
);
3164 fSets
->addElement(fPR
, status
);
3165 fSets
->addElement(fSY
, status
);
3166 fSets
->addElement(fAI
, status
);
3167 fSets
->addElement(fAL
, status
);
3168 fSets
->addElement(fID
, status
);
3169 fSets
->addElement(fWJ
, status
);
3170 fSets
->addElement(fSA
, status
);
3171 fSets
->addElement(fSG
, status
);
3174 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
3175 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
3176 "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
3177 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
3178 "(\\p{Line_Break=CL}\\p{Line_Break=CM}*)?"
3179 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
3181 fNumberMatcher
= new RegexMatcher(
3182 UnicodeString(rules
, -1, US_INV
), 0, status
);
3184 fCharBI
= BreakIterator::createCharacterInstance(Locale::getEnglish(), status
);
3186 if (U_FAILURE(status
)) {
3187 deferredStatus
= status
;
3192 void RBBILineMonkey::setText(const UnicodeString
&s
) {
3194 fCharBI
->setText(s
);
3195 fNumberMatcher
->reset(s
);
3200 // Line Break TR rules 9 and 10 implementation.
3201 // This deals with combining marks and other sequences that
3202 // that must be treated as if they were something other than what they actually are.
3204 // This is factored out into a separate function because it must be applied twice for
3205 // each potential break, once to the chars before the position being checked, then
3206 // again to the text following the possible break.
3208 void RBBILineMonkey::rule9Adjust(int32_t pos
, UChar32
*posChar
, int32_t *nextPos
, UChar32
*nextChar
) {
3210 // Invalid initial position. Happens during the warmup iteration of the
3211 // main loop in next().
3215 int32_t nPos
= *nextPos
;
3217 // LB 9 Keep combining sequences together.
3218 // advance over any CM class chars. Note that Line Break CM is different
3219 // from the normal Grapheme Extend property.
3220 if (!(fSP
->contains(*posChar
) || fBK
->contains(*posChar
) || *posChar
==0x0d ||
3221 *posChar
==0x0a ||fNL
->contains(*posChar
) || fZW
->contains(*posChar
))) {
3223 *nextChar
= fText
->char32At(nPos
);
3224 if (!fCM
->contains(*nextChar
)) {
3227 nPos
= fText
->moveIndex32(nPos
, 1);
3232 // LB 9 Treat X CM* as if it were x.
3233 // No explicit action required.
3235 // LB 10 Treat any remaining combining mark as AL
3236 if (fCM
->contains(*posChar
)) {
3237 *posChar
= 0x41; // thisChar = 'A';
3240 // Push the updated nextPos and nextChar back to our caller.
3241 // This only makes a difference if posChar got bigger by consuming a
3242 // combining sequence.
3244 *nextChar
= fText
->char32At(nPos
);
// Reference ("monkey") implementation of the line-break rules. Starting at
// startPos it examines successive (prevChar, thisChar) pairs of fText and
// applies the LB rules named in the comments below, in the order written.
//
// NOTE(review): the loop header, the break/continue/return statements and the
// closing braces of most branches are not visible in this copy of the
// function. The added comments therefore describe only what the visible
// conditions test; the outcome of each rule (break vs. no break) is taken
// from the rule comments, not from code.
3249 int32_t RBBILineMonkey::next(int32_t startPos
) {
3250 UErrorCode status
= U_ZERO_ERROR
;
3251 int32_t pos
; // Index of the char following a potential break position
3252 UChar32 thisChar
; // Character at above position "pos"
3254 int32_t prevPos
; // Index of the char preceding a potential break position
3255 UChar32 prevChar
; // Character at above position. Note that prevChar
3256 // and thisChar may not be adjacent because combining
3257 // characters between them will be ignored.
3259 int32_t nextPos
; // Index of the next character following pos.
3260 // Usually skips over combining marks.
3261 int32_t nextCPPos
; // Index of the code point following "pos."
3262 // May point to a combining mark.
3263 int32_t tPos
; // temp value.
// Guard on deferredStatus, the error code other code in this file checks
// after constructing a monkey object.
3266 if (U_FAILURE(deferredStatus
)) {
// Guard for a start position at or beyond the end of the text.
3270 if (startPos
>= fText
->length()) {
3275 // Initial values for loop. Loop will run the first time without finding breaks,
3276 // while the invalid values shift out and the "this" and
3277 // "prev" positions are filled in with good values.
3278 pos
= prevPos
= -1; // Invalid value, serves as flag for initial loop iteration.
3279 thisChar
= prevChar
= 0;
3280 nextPos
= nextCPPos
= startPos
;
3283 // Loop runs once per position in the test text, until a break position
3287 prevChar
= thisChar
;
3290 thisChar
= fText
->char32At(pos
);
3292 nextCPPos
= fText
->moveIndex32(pos
, 1);
3293 nextPos
= nextCPPos
;
3295 // Rule LB2 - Break at end of text.
3296 if (pos
>= fText
->length()) {
3300 // Rule LB 9 - adjust for combining sequences.
3301 // We do this one out-of-order because the adjustment does not change anything
3302 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3304 rule9Adjust(prevPos
, &prevChar
, &pos
, &thisChar
);
// Recompute the following position from the (possibly adjusted) pos, then
// run the same adjustment on the (thisChar, following char) pair; that call
// may update nextPos and c through the pointers passed to it.
3305 nextCPPos
= nextPos
= fText
->moveIndex32(pos
, 1);
3306 c
= fText
->char32At(nextPos
);
3307 rule9Adjust(pos
, &thisChar
, &nextPos
, &c
);
3309 // If the loop is still warming up - if we haven't shifted the initial
3310 // -1 positions out of prevPos yet - loop back to advance the
3311 // position in the input without any further looking for breaks.
3312 if (prevPos
== -1) {
3316 // LB 4 Always break after hard line breaks,
3317 if (fBK
->contains(prevChar
)) {
3321 // LB 5 Break after CR, LF, NL, but not inside CR LF
3322 if (prevChar
== 0x0d && thisChar
== 0x0a) {
3325 if (prevChar
== 0x0d ||
3331 // LB 6 Don't break before hard line breaks
3332 if (thisChar
== 0x0d || thisChar
== 0x0a || thisChar
== 0x85 ||
3333 fBK
->contains(thisChar
)) {
3338 // LB 7 Don't break before spaces or zero-width space.
3339 if (fSP
->contains(thisChar
)) {
3343 if (fZW
->contains(thisChar
)) {
3347 // LB 8 Break after zero width space
3348 if (fZW
->contains(prevChar
)) {
3352 // LB 9, 10 Already done, at top of loop.
3356 // LB 11 Do not break before or after WORD JOINER and related characters.
3360 if (fWJ
->contains(thisChar
) || fWJ
->contains(prevChar
)) {
// Tests involving the GL set: prevChar in GL, or thisChar in GL when
// prevChar is not in SP, BA or HY.
3366 if (fGL
->contains(prevChar
)) {
3372 if (!(fSP
->contains(prevChar
) ||
3373 fBA
->contains(prevChar
) ||
3374 fHY
->contains(prevChar
) ) && fGL
->contains(thisChar
)) {
3380 // LB 13 Don't break before closings.
3381 // NU x CL and NU x IS are not matched here so that they will
3382 // fall into LB 17 and the more general number regular expression.
// The condition below mixes && and || without parentheses; && binds tighter,
// so each "!NU(prev) && X(this)" pair is one alternative and EX(this) is
// an alternative on its own.
3384 if (!fNU
->contains(prevChar
) && fCL
->contains(thisChar
) ||
3385 fEX
->contains(thisChar
) ||
3386 !fNU
->contains(prevChar
) && fIS
->contains(thisChar
) ||
3387 !fNU
->contains(prevChar
) && fSY
->contains(thisChar
)) {
3391 // LB 14 Don't break after OP SP*
3392 // Scan backwards, checking for this sequence.
3393 // The OP char could include combining marks, so we actually check for
3395 // Another Twist: The Rule 67 fixes may have changed a SP CM
3396 // sequence into a ID char, so before scanning back through spaces,
3397 // verify that prevChar is indeed a space. The prevChar variable
3398 // may differ from fText[prevPos]
// NOTE(review): the statement that initialises tPos before each backward
// scan is not visible here.
3400 if (fSP
->contains(prevChar
)) {
3401 while (tPos
> 0 && fSP
->contains(fText
->char32At(tPos
))) {
3402 tPos
=fText
->moveIndex32(tPos
, -1);
3405 while (tPos
> 0 && fCM
->contains(fText
->char32At(tPos
))) {
3406 tPos
=fText
->moveIndex32(tPos
, -1);
3408 if (fOP
->contains(fText
->char32At(tPos
))) {
3413 // LB 15 QU SP* x OP
3414 if (fOP
->contains(thisChar
)) {
3415 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3417 while (tPos
>0 && fSP
->contains(fText
->char32At(tPos
))) {
3418 tPos
= fText
->moveIndex32(tPos
, -1);
3420 while (tPos
>0 && fCM
->contains(fText
->char32At(tPos
))) {
3421 tPos
= fText
->moveIndex32(tPos
, -1);
3423 if (fQU
->contains(fText
->char32At(tPos
))) {
3430 // LB 16 CL SP* x NS
3431 // Scan backwards for SP* CM* CL
3432 if (fNS
->contains(thisChar
)) {
3434 while (tPos
>0 && fSP
->contains(fText
->char32At(tPos
))) {
3435 tPos
= fText
->moveIndex32(tPos
, -1);
3437 while (tPos
>0 && fCM
->contains(fText
->char32At(tPos
))) {
3438 tPos
= fText
->moveIndex32(tPos
, -1);
3440 if (fCL
->contains(fText
->char32At(tPos
))) {
3446 // LB 17 B2 SP* x B2
3447 if (fB2
->contains(thisChar
)) {
3448 // Scan backwards, checking for the B2 CM* SP* sequence.
3450 if (fSP
->contains(prevChar
)) {
3451 while (tPos
> 0 && fSP
->contains(fText
->char32At(tPos
))) {
3452 tPos
=fText
->moveIndex32(tPos
, -1);
3455 while (tPos
> 0 && fCM
->contains(fText
->char32At(tPos
))) {
3456 tPos
=fText
->moveIndex32(tPos
, -1);
3458 if (fB2
->contains(fText
->char32At(tPos
))) {
3464 // LB 18 break after space
3465 if (fSP
->contains(prevChar
)) {
// Either side of the candidate position is in the QU set.
3472 if (fQU
->contains(thisChar
) || fQU
->contains(prevChar
)) {
3476 // LB 20 Break around a CB
3477 if (fCB
->contains(thisChar
) || fCB
->contains(prevChar
)) {
// thisChar in BA, HY or NS, or prevChar in BB.
3482 if (fBA
->contains(thisChar
) ||
3483 fHY
->contains(thisChar
) ||
3484 fNS
->contains(thisChar
) ||
3485 fBB
->contains(prevChar
) ) {
// prevChar in AL, ID, IN or NU, followed by a thisChar in IN.
3490 if (fAL
->contains(prevChar
) && fIN
->contains(thisChar
) ||
3491 fID
->contains(prevChar
) && fIN
->contains(thisChar
) ||
3492 fIN
->contains(prevChar
) && fIN
->contains(thisChar
) ||
3493 fNU
->contains(prevChar
) && fIN
->contains(thisChar
) ) {
// Pairs ID/PO, AL/NU and NU/AL (prevChar/thisChar).
3501 if (fID
->contains(prevChar
) && fPO
->contains(thisChar
) ||
3502 fAL
->contains(prevChar
) && fNU
->contains(thisChar
) ||
3503 fNU
->contains(prevChar
) && fAL
->contains(thisChar
) ) {
3507 // LB 24 Do not break between prefix and letters or ideographs.
3511 if (fPR
->contains(prevChar
) && fID
->contains(thisChar
) ||
3512 fPR
->contains(prevChar
) && fAL
->contains(thisChar
) ||
3513 fPO
->contains(prevChar
) && fAL
->contains(thisChar
) ) {
// Number handling: try to match fNumberMatcher anchored at prevPos.
3520 if (fNumberMatcher
->lookingAt(prevPos
, status
)) {
3521 if (U_FAILURE(status
)) {
3524 // Matched a number. But could have been just a single digit, which would
3525 // not represent a "no break here" between prevChar and thisChar
3526 int32_t numEndIdx
= fNumberMatcher
->end(status
); // idx of first char following num
3527 if (numEndIdx
> pos
) {
3528 // Number match includes at least our two chars being checked
3529 if (numEndIdx
> nextPos
) {
3530 // Number match includes additional chars. Update pos and nextPos
3531 // so that next loop iteration will continue at the end of the number,
3532 // checking for breaks between last char in number & whatever follows.
3533 pos
= nextPos
= numEndIdx
;
// Step pos backwards until thisChar is no longer a member of fCM
// (the opening "do {" of this loop is not visible here).
3535 pos
= fText
->moveIndex32(pos
, -1);
3536 thisChar
= fText
->char32At(pos
);
3537 } while (fCM
->contains(thisChar
));
3544 // LB 26 Do not break a Korean syllable.
3545 if (fJL
->contains(prevChar
) && (fJL
->contains(thisChar
) ||
3546 fJV
->contains(thisChar
) ||
3547 fH2
->contains(thisChar
) ||
3548 fH3
->contains(thisChar
))) {
3552 if ((fJV
->contains(prevChar
) || fH2
->contains(prevChar
)) &&
3553 (fJV
->contains(thisChar
) || fJT
->contains(thisChar
))) {
3557 if ((fJT
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3558 fJT
->contains(thisChar
)) {
3562 // LB 27 Treat a Korean Syllable Block the same as ID.
3563 if ((fJL
->contains(prevChar
) || fJV
->contains(prevChar
) ||
3564 fJT
->contains(prevChar
) || fH2
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3565 fIN
->contains(thisChar
)) {
3568 if ((fJL
->contains(prevChar
) || fJV
->contains(prevChar
) ||
3569 fJT
->contains(prevChar
) || fH2
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3570 fPO
->contains(thisChar
)) {
3573 if (fPR
->contains(prevChar
) && (fJL
->contains(thisChar
) || fJV
->contains(thisChar
) ||
3574 fJT
->contains(thisChar
) || fH2
->contains(thisChar
) || fH3
->contains(thisChar
))) {
3580 // LB 28 Do not break between alphabetics ("at").
3581 if (fAL
->contains(prevChar
) && fAL
->contains(thisChar
)) {
3585 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
3586 if (fIS
->contains(prevChar
) && fAL
->contains(thisChar
)) {
3590 // LB 31 Break everywhere else
// Accessor returning a UVector pointer. RunMonkey() in this file reads the
// elements of a monkey's charClasses() vector as UnicodeSet* and requires
// each set to be non-empty; presumably this is the line-break flavour of that
// accessor (body not visible here, confirm).
3599 UVector
*RBBILineMonkey::charClasses() {
// Destructor: deletes fNumberMatcher, the matcher next() uses for number
// sequences. NOTE(review): any other cleanup statements between the opening
// brace and this delete are not visible here.
3604 RBBILineMonkey::~RBBILineMonkey() {
3645 delete fNumberMatcher
;
3649 //-------------------------------------------------------------------------------------------
3654 // seed=nnnnn Random number starting seed.
3655 // Setting the seed allows errors to be reproduced.
3656 // loop=nnn Looping count. Controls running time.
3658 // 0 or greater: run length.
3660 // type = char | word | line | sent | title
3662 //-------------------------------------------------------------------------------------------
// Extracts an integer option of the form "<name> = <integer>" from an option
// string.
//
//   name       - option name; taken by value because the regular expression
//                suffix " *= *(-?\d+)" is appended to this local copy.
//   params     - option string. Passed BY REFERENCE: when the option is
//                found, it is removed from this string (the caller,
//                TestMonkey, relies on every recognised option being
//                stripped so that leftovers can be reported).
//   defaultVal - initial value of the result; it is overwritten only by the
//                conversion of a matched option.
//
// The matched digits are copied into a 100-byte buffer; longer matches are
// clamped to sizeof(valString)-2 characters before conversion with strtol().
//
// NOTE(review): the "if (m.find())" guard, the closing braces and the return
// statement are not visible in this copy of the function.
3664 static int32_t getIntParam(UnicodeString name
, UnicodeString
&params
, int32_t defaultVal
) {
3665 int32_t val
= defaultVal
;
3666 name
.append(" *= *(-?\\d+)");
3667 UErrorCode status
= U_ZERO_ERROR
;
3668 RegexMatcher
m(name
, params
, 0, status
);
3670 // The param exists. Convert the string to an int.
3671 char valString
[100];
// Length of capture group 1 (the optional sign and the digits).
3672 int32_t paramLength
= m
.end(1, status
) - m
.start(1, status
);
// Clamp so the extracted text always fits in valString.
3673 if (paramLength
>= (int32_t)(sizeof(valString
)-1)) {
3674 paramLength
= (int32_t)(sizeof(valString
)-2);
3676 params
.extract(m
.start(1, status
), paramLength
, valString
, sizeof(valString
));
3677 val
= strtol(valString
, NULL
, 10);
3679 // Delete this parameter from the params string.
3681 params
= m
.replaceFirst("", status
);
3683 U_ASSERT(U_SUCCESS(status
));
// Checks a BreakIterator (bi) against a list of expected break positions for
// the text ustr. Failures are reported through test->errln(), most after
// dumping the string with printStringBreaks(). Visible phases:
//   1. forward iteration with first()/next() compared to expected[];
//   2. isBoundary(): true at each expected[i-1], false strictly between
//      consecutive expected positions;
//   3. backward iteration with last()/previous() compared to forward[];
//   4. preceding(j) == expected[i] for every j from the code point after
//      expected[i] through expected[i+1].
//
// NOTE(review): the remaining parameters (bi, expected, expectedcount), the
// declarations of count/i/forward and the early returns are not visible in
// this copy of the function.
3688 static void testBreakBoundPreceding(RBBITest
*test
, UnicodeString ustr
,
3697 for (i
= bi
->first(); i
!= BreakIterator::DONE
; i
= bi
->next()) {
3699 if (count
< expectedcount
&& expected
[count
] != i
) {
3700 test
->errln("break forward test failed: expected %d but got %d",
3701 expected
[count
], i
);
// A mismatch between the number of breaks found and expectedcount is an error.
3706 if (count
!= expectedcount
) {
3707 printStringBreaks(ustr
, expected
, expectedcount
);
3708 test
->errln("break forward test failed: missed %d match",
3709 expectedcount
- count
);
3712 // testing boundaries
3713 for (i
= 1; i
< expectedcount
; i
++) {
3714 int j
= expected
[i
- 1];
3715 if (!bi
->isBoundary(j
)) {
3716 printStringBreaks(ustr
, expected
, expectedcount
);
3717 test
->errln("isBoundary() failed. Expected boundary at position %d", j
);
// No position strictly between two consecutive expected breaks may be a boundary.
3720 for (j
= expected
[i
- 1] + 1; j
< expected
[i
]; j
++) {
3721 if (bi
->isBoundary(j
)) {
3722 printStringBreaks(ustr
, expected
, expectedcount
);
3723 test
->errln("isBoundary() failed. Not expecting boundary at position %d", j
);
// Walk backwards and compare with the positions recorded in forward[].
3729 for (i
= bi
->last(); i
!= BreakIterator::DONE
; i
= bi
->previous()) {
3731 if (forward
[count
] != i
) {
3732 test
->errln("happy break test previous() failed: expected %d but got %d",
3738 printStringBreaks(ustr
, expected
, expectedcount
);
3739 test
->errln("break test previous() failed: missed a match");
3743 // testing preceding
3744 for (i
= 0; i
< expectedcount
- 1; i
++) {
3745 // int j = expected[i] + 1;
// Start at the next code point rather than the next code unit.
3746 int j
= ustr
.moveIndex32(expected
[i
], 1);
3747 for (; j
<= expected
[i
+ 1]; j
++) {
3748 if (bi
->preceding(j
) != expected
[i
]) {
3749 printStringBreaks(ustr
, expected
, expectedcount
);
3750 test
->errln("preceding(): Not expecting boundary at position %d", j
);
// Word-break test over a fixed list of escaped sample strings. For each
// string the expected break positions are produced by RBBIWordMonkey::next()
// and the "en" word BreakIterator is checked against them with
// testBreakBoundPreceding().
//
// NOTE(review): the declaration of expected[] is not visible here and no
// bounds check is visible before "expected[expectedcount++] = i" (compare
// TestLineBreaks, which guards against EXPECTEDSIZE); confirm the array is
// large enough for every sample.
3757 void RBBITest::TestWordBreaks(void)
3759 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3761 Locale
locale("en");
3762 UErrorCode status
= U_ZERO_ERROR
;
3763 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3764 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
// Sample strings, written with doubled backslashes so the escapes survive
// into the char* literal and are expanded later by CharsToUnicodeString().
3765 static const char *strlist
[] =
3767 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3768 "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
3769 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3770 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3771 "\\u90ca\\u3588\\u009c\\u0953\\u194b",
3772 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3773 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3774 "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
3775 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3776 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3777 "\\u2027\\U000e0067\\u0a47\\u00b7",
3778 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3779 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3780 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3781 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3782 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3783 "\\u0027\\u11af\\U000e0057\\u0602",
// NOTE(review): the second escape in the next entry (U000e007) has only
// seven hex digits; the entry two lines below spells it U000e007d.
// Confirm whether the malformed escape is intentional test data.
3784 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3785 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3786 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3787 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3788 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3789 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3790 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3791 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3792 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3793 "\\u58f4\\U000e0049\\u20e7\\u2027",
3794 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3795 "\\ua183\\u102d\\u0bec\\u003a",
3796 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3797 "\\u003a\\u0e57\\u0fad\\u002e",
3798 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3799 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3800 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3801 "\\u003a\\u0664\\u00b7\\u1fba",
3802 "\\u003b\\u0027\\u00b7\\u47a3",
3803 "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
3804 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3805 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3808 if (U_FAILURE(status
)) {
3809 errln("Creation of break iterator failed %s", u_errorName(status
));
// One pass per sample string; the array length is derived with sizeof.
3812 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
3813 // printf("looping %d\n", loop);
3814 UnicodeString ustr
= CharsToUnicodeString(strlist
[loop
]);
3815 // RBBICharMonkey monkey;
3816 RBBIWordMonkey monkey
;
3819 int expectedcount
= 0;
3821 monkey
.setText(ustr
);
// Collect the reference break positions, starting with position 0.
3823 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
3824 expected
[expectedcount
++] = i
;
3827 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
// Self-consistency test of the "en" word BreakIterator over a fixed list of
// escaped sample strings: every position returned by first()/next() must
// satisfy isBoundary(), and no position strictly between two consecutive
// returned positions may satisfy it.
//
// NOTE(review): the declarations of str, forward, count and prev, and the
// early returns after errln(), are not visible in this copy.
3833 void RBBITest::TestWordBoundary(void)
3835 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3836 Locale
locale("en");
3837 UErrorCode status
= U_ZERO_ERROR
;
3838 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3839 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
3841 static const char *strlist
[] =
3843 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3844 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3845 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3846 "\\u2027\\U000e0067\\u0a47\\u00b7",
3847 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3848 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3849 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3850 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3851 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3852 "\\u0027\\u11af\\U000e0057\\u0602",
// NOTE(review): the second escape in the next entry (U000e007) has only
// seven hex digits; the entry two lines below spells it U000e007d.
// Confirm whether the malformed escape is intentional test data.
3853 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3854 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3855 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3856 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3857 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3858 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3859 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3860 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3861 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3862 "\\u58f4\\U000e0049\\u20e7\\u2027",
3863 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3864 "\\ua183\\u102d\\u0bec\\u003a",
3865 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3866 "\\u003a\\u0e57\\u0fad\\u002e",
3867 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3868 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3869 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3870 "\\u003a\\u0664\\u00b7\\u1fba",
3871 "\\u003b\\u0027\\u00b7\\u47a3",
3874 if (U_FAILURE(status
)) {
3875 errln("Creation of break iterator failed %s", u_errorName(status
));
3878 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
3879 // printf("looping %d\n", loop);
// NOTE(review): the destination capacity is the literal 20, not derived
// from the size of str (TestSentBreaks uses sizeof for this); confirm it
// matches the declaration of str.
3880 u_unescape(strlist
[loop
], str
, 20);
3881 UnicodeString
ustr(str
);
// Record every boundary returned by forward iteration.
3888 for (i
= bi
->first(); i
!= BreakIterator::DONE
; i
= bi
->next()) {
3889 forward
[count
++] = i
;
// Positions strictly between the previous boundary and this one must not
// be reported as boundaries.
3892 for (j
= prev
+ 1; j
< i
; j
++) {
3893 if (bi
->isBoundary(j
)) {
3894 printStringBreaks(ustr
, forward
, count
);
3895 errln("happy boundary test failed: expected %d not a boundary",
// The position just returned by the iterator must itself be a boundary.
3901 if (!bi
->isBoundary(i
)) {
3902 printStringBreaks(ustr
, forward
, count
);
3903 errln("happy boundary test failed: expected %d a boundary",
// Line-break test over a fixed list of escaped sample strings. Each string is
// unescaped into a buffer of capacity STRSIZE, the expected break positions
// are produced by RBBILineMonkey::next(), and the "en" line BreakIterator is
// checked against them with testBreakBoundPreceding().
//
// NOTE(review): the declaration of str, the handling of the u_unescape()
// result t, and the bodies of the failure branches are not visible here.
3913 void RBBITest::TestLineBreaks(void)
3915 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3916 Locale
locale("en");
3917 UErrorCode status
= U_ZERO_ERROR
;
3918 BreakIterator
*bi
= BreakIterator::createLineInstance(locale
, status
);
3919 const int32_t STRSIZE
= 50;
// Sample strings. Some entries are split across adjacent string literals;
// where a literal ends in a doubled backslash, the escape continues in the
// next literal.
3921 static const char *strlist
[] =
3923 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3924 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3925 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3926 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3927 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3928 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3929 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3930 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3931 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3932 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3933 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
3934 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3935 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3936 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3937 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3938 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3939 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3940 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3941 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3942 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3943 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3944 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3945 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3946 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3947 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3948 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3949 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
3950 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3951 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3952 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3953 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3954 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3955 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
3956 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3957 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3958 "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
3959 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3960 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3961 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3962 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3963 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3964 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3965 "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
3966 "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
3967 "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
3968 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3969 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
// Report (via the macro's errln) if creating the line iterator failed.
3972 TEST_ASSERT_SUCCESS(status
);
3973 if (U_FAILURE(status
)) {
3976 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
3977 // printf("looping %d\n", loop);
3978 int32_t t
= u_unescape(strlist
[loop
], str
, STRSIZE
);
3985 UnicodeString
ustr(str
);
3986 RBBILineMonkey monkey
;
// The monkey object records construction errors in deferredStatus.
3987 if (U_FAILURE(monkey
.deferredStatus
)) {
3991 const int EXPECTEDSIZE
= 50;
3992 int expected
[EXPECTEDSIZE
];
3993 int expectedcount
= 0;
3995 monkey
.setText(ustr
);
// Collect the reference break positions, starting with position 0.
3997 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
// Guard against writing past the end of expected[]; TEST_ASSERT reports
// the overflow as a test failure.
3998 if (expectedcount
>= EXPECTEDSIZE
) {
3999 TEST_ASSERT(expectedcount
< EXPECTEDSIZE
);
4002 expected
[expectedcount
++] = i
;
4005 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
// Sentence-break test over a fixed list of sample strings. Each string is
// unescaped into str, the expected break positions are produced by
// RBBISentMonkey::next(), and the "en" sentence BreakIterator is checked
// against them with testBreakBoundPreceding().
//
// NOTE(review): the declaration of str and the bodies of the failure branches
// are not visible in this copy.
4011 void RBBITest::TestSentBreaks(void)
4013 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4014 Locale
locale("en");
4015 UErrorCode status
= U_ZERO_ERROR
;
4016 BreakIterator
*bi
= BreakIterator::createSentenceInstance(locale
, status
);
// Sample strings. The first entries are plain text (with real C escapes such
// as \r and \n); the later ones use doubled-backslash escapes that
// u_unescape() expands, and span several adjacent string literals.
4018 static const char *strlist
[] =
4020 "Now\ris\nthe\r\ntime\n\rfor\r\r",
4022 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
4023 "\"Sentence ending with a quote.\" Bye.",
4024 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
4025 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
4026 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
4027 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
4028 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
4029 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
4030 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
4031 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
4032 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
4033 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
4034 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
4035 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
4036 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
4037 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
4038 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
4039 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
4042 if (U_FAILURE(status
)) {
4043 errln("Creation of break iterator failed %s", u_errorName(status
));
4046 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
// Destination capacity is derived from the size of str itself.
4047 u_unescape(strlist
[loop
], str
, (int32_t)(sizeof(str
) / sizeof(str
[0])));
4048 UnicodeString
ustr(str
);
4050 RBBISentMonkey monkey
;
// The monkey object records construction errors in deferredStatus.
4051 if (U_FAILURE(monkey
.deferredStatus
)) {
4055 const int EXPECTEDSIZE
= 50;
4056 int expected
[EXPECTEDSIZE
];
4057 int expectedcount
= 0;
4059 monkey
.setText(ustr
);
// Collect the reference break positions, starting with position 0.
4061 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
// Guard against writing past the end of expected[]; TEST_ASSERT reports
// the overflow as a test failure.
4062 if (expectedcount
>= EXPECTEDSIZE
) {
4063 TEST_ASSERT(expectedcount
< EXPECTEDSIZE
);
4066 expected
[expectedcount
++] = i
;
4069 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
// Entry point for the randomized ("monkey") break-iterator tests.
//
// params is an option string. The visible parsing recognises:
//   loop=nnn   iteration count (default 500), read with getIntParam()
//   seed=nnn   random seed, read with getIntParam()
//   type=char|word|line|sent|title   which break type to run (default "all")
//   utext      matched with the pattern " *utext"
// Each recognised option is removed from the working copy p; any non-space
// text left over is reported as "Unrecognized or extra parameter".
//
// For each selected break type an "en" BreakIterator is created and passed to
// RunMonkey(); a creation failure is reported with errln().
//
// NOTE(review): the declarations of seed, buf and the monkey object m passed
// to RunMonkey(), the find() guards around the regex results, and the closing
// braces / deletes are not visible in this copy of the function.
4075 void RBBITest::TestMonkey(char *params
) {
4076 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4078 UErrorCode status
= U_ZERO_ERROR
;
4079 int32_t loopCount
= 500;
4081 UnicodeString breakType
= "all";
4082 Locale
locale("en");
4083 UBool useUText
= FALSE
;
4085 if (quick
== FALSE
) {
// Work on a copy of the option string; options are stripped from it as
// they are recognised.
4090 UnicodeString
p(params
);
4091 loopCount
= getIntParam("loop", p
, loopCount
);
4092 seed
= getIntParam("seed", p
, seed
);
4094 RegexMatcher
m(" *type *= *(char|word|line|sent|title) *", p
, 0, status
);
4096 breakType
= m
.group(1, status
);
4098 p
= m
.replaceFirst("", status
);
4101 RegexMatcher
u(" *utext", p
, 0, status
);
4105 p
= u
.replaceFirst("", status
);
4110 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p
, 0, status
).find()) {
4111 // Each option is stripped out of the option string as it is processed.
4112 // All options have been checked. The option string should have been completely emptied.
4114 p
.extract(buf
, sizeof(buf
), NULL
, status
);
// Force NUL termination in case the extracted text filled the buffer.
4115 buf
[sizeof(buf
)-1] = 0;
4116 errln("Unrecognized or extra parameter: %s\n", buf
);
4122 if (breakType
== "char" || breakType
== "all") {
4124 BreakIterator
*bi
= BreakIterator::createCharacterInstance(locale
, status
);
4125 if (U_SUCCESS(status
)) {
4126 RunMonkey(bi
, m
, "char", seed
, loopCount
, useUText
);
4127 if (breakType
== "all" && useUText
==FALSE
) {
4128 // Also run a quick test with UText when "all" is specified
4129 RunMonkey(bi
, m
, "char", seed
, loopCount
, TRUE
);
4133 errln("Creation of character break iterator failed %s", u_errorName(status
));
4138 if (breakType
== "word" || breakType
== "all") {
4139 logln("Word Break Monkey Test");
4141 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
4142 if (U_SUCCESS(status
)) {
4143 RunMonkey(bi
, m
, "word", seed
, loopCount
, useUText
);
4146 errln("Creation of word break iterator failed %s", u_errorName(status
));
4151 if (breakType
== "line" || breakType
== "all") {
4152 logln("Line Break Monkey Test");
4154 BreakIterator
*bi
= BreakIterator::createLineInstance(locale
, status
);
4155 if (loopCount
>= 10) {
4156 loopCount
= loopCount
/ 5; // Line break runs slower than the others.
4158 if (U_SUCCESS(status
)) {
4159 RunMonkey(bi
, m
, "line", seed
, loopCount
, useUText
);
4162 errln("Creation of line break iterator failed %s", u_errorName(status
));
4167 if (breakType
== "sent" || breakType
== "all" ) {
4168 logln("Sentence Break Monkey Test");
4170 BreakIterator
*bi
= BreakIterator::createSentenceInstance(locale
, status
);
// NOTE(review): loopCount is modified in place, so when breakType is "all"
// the division by 5 from the line-break section above appears to compound
// with this division by 10; confirm that is intended.
4171 if (loopCount
>= 10) {
4172 loopCount
= loopCount
/ 10; // Sentence runs slower than the other break types
4174 if (U_SUCCESS(status
)) {
4175 RunMonkey(bi
, m
, "sentence", seed
, loopCount
, useUText
);
// This branch reports failure to create the SENTENCE iterator; the message
// previously said "line", copied from the line-break section.
4178 errln("Creation of sentence break iterator failed %s", u_errorName(status
));
4187 // Run a RBBI monkey test. Common routine, for all break iterator types.
4189 // bi - the break iterator to use
4190 // mk - MonkeyKind, abstraction for obtaining expected results
4191 // name - Name of test (char, word, etc.) for use in error messages
4192 // seed - Seed for starting random number generator (parameter from user)
4195 void RBBITest::RunMonkey(BreakIterator
*bi
, RBBIMonkeyKind
&mk
, const char *name
, uint32_t seed
,
4196 int32_t numIterations
, UBool useUText
) {
4198 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4200 const int32_t TESTSTRINGLEN
= 500;
4201 UnicodeString testText
;
4202 int32_t numCharClasses
;
4204 int expected
[TESTSTRINGLEN
*2 + 1];
4205 int expectedCount
= 0;
4206 char expectedBreaks
[TESTSTRINGLEN
*2 + 1];
4207 char forwardBreaks
[TESTSTRINGLEN
*2 + 1];
4208 char reverseBreaks
[TESTSTRINGLEN
*2+1];
4209 char isBoundaryBreaks
[TESTSTRINGLEN
*2+1];
4210 char followingBreaks
[TESTSTRINGLEN
*2+1];
4211 char precedingBreaks
[TESTSTRINGLEN
*2+1];
4217 numCharClasses
= mk
.charClasses()->size();
4218 chClasses
= mk
.charClasses();
4220 // Check for errors that occurred during the construction of the MonkeyKind object.
4221 // Can't report them where they occurred because errln() is a method coming from intlTest,
4222 // and is not visible outside of RBBITest :-(
4223 if (U_FAILURE(mk
.deferredStatus
)) {
4224 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk
.deferredStatus
));
4228 // Verify that the character classes all have at least one member.
4229 for (i
=0; i
<numCharClasses
; i
++) {
4230 UnicodeSet
*s
= (UnicodeSet
*)chClasses
->elementAt(i
);
4231 if (s
== NULL
|| s
->size() == 0) {
4232 errln("Character Class #%d is null or of zero size.", i
);
4237 while (loopCount
< numIterations
|| numIterations
== -1) {
4238 if (numIterations
== -1 && loopCount
% 10 == 0) {
4239 // If test is running in an infinite loop, display a periodic tic so
4240 // we can tell that it is making progress.
4241 fprintf(stderr
, ".");
4243 // Save current random number seed, so that we can recreate the random numbers
4244 // for this loop iteration in event of an error.
4247 // Populate a test string with data.
4248 testText
.truncate(0);
4249 for (i
=0; i
<TESTSTRINGLEN
; i
++) {
4250 int32_t aClassNum
= m_rand() % numCharClasses
;
4251 UnicodeSet
*classSet
= (UnicodeSet
*)chClasses
->elementAt(aClassNum
);
4252 int32_t charIdx
= m_rand() % classSet
->size();
4253 UChar32 c
= classSet
->charAt(charIdx
);
4254 if (c
< 0) { // TODO: deal with sets containing strings.
4261 // Calculate the expected results for this test string.
4262 mk
.setText(testText
);
4263 memset(expectedBreaks
, 0, sizeof(expectedBreaks
));
4264 expectedBreaks
[0] = 1;
4265 int32_t breakPos
= 0;
4268 breakPos
= mk
.next(breakPos
);
4269 if (breakPos
== -1) {
4272 if (breakPos
> testText
.length()) {
4273 errln("breakPos > testText.length()");
4275 expectedBreaks
[breakPos
] = 1;
4276 U_ASSERT(expectedCount
<testText
.length());
4277 expected
[expectedCount
++] = breakPos
;
4280 // Find the break positions using forward iteration
4281 memset(forwardBreaks
, 0, sizeof(forwardBreaks
));
4283 UErrorCode status
= U_ZERO_ERROR
;
4284 UText
*testUText
= utext_openReplaceable(NULL
, &testText
, &status
);
4285 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4286 bi
->setText(testUText
, status
);
4287 TEST_ASSERT_SUCCESS(status
);
4288 utext_close(testUText
); // The break iterator does a shallow clone of the UText
4289 // This UText can be closed immediately, so long as the
4290 // testText string continues to exist.
4292 bi
->setText(testText
);
4295 for (i
=bi
->first(); i
!= BreakIterator::DONE
; i
=bi
->next()) {
4296 if (i
< 0 || i
> testText
.length()) {
4297 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name
);
4300 forwardBreaks
[i
] = 1;
4303 // Find the break positions using reverse iteration
4304 memset(reverseBreaks
, 0, sizeof(reverseBreaks
));
4305 for (i
=bi
->last(); i
!= BreakIterator::DONE
; i
=bi
->previous()) {
4306 if (i
< 0 || i
> testText
.length()) {
4307 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name
);
4310 reverseBreaks
[i
] = 1;
4313 // Find the break positions using isBoundary() tests.
4314 memset(isBoundaryBreaks
, 0, sizeof(isBoundaryBreaks
));
4315 U_ASSERT((int32_t)sizeof(isBoundaryBreaks
) > testText
.length());
4316 for (i
=0; i
<=testText
.length(); i
++) {
4317 isBoundaryBreaks
[i
] = bi
->isBoundary(i
);
4321 // Find the break positions using the following() function.
4323 memset(followingBreaks
, 0, sizeof(followingBreaks
));
4324 int32_t lastBreakPos
= 0;
4325 followingBreaks
[0] = 1;
4326 for (i
=0; i
<testText
.length(); i
++) {
4327 breakPos
= bi
->following(i
);
4328 if (breakPos
<= i
||
4329 breakPos
< lastBreakPos
||
4330 breakPos
> testText
.length() ||
4331 breakPos
> lastBreakPos
&& lastBreakPos
> i
) {
4332 errln("%s break monkey test: "
4333 "Out of range value returned by BreakIterator::following().\n"
4334 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4335 name
, seed
, i
, breakPos
, lastBreakPos
);
4338 followingBreaks
[breakPos
] = 1;
4339 lastBreakPos
= breakPos
;
4342 // Find the break positions using the preceding() function.
4343 memset(precedingBreaks
, 0, sizeof(precedingBreaks
));
4344 lastBreakPos
= testText
.length();
4345 precedingBreaks
[testText
.length()] = 1;
4346 for (i
=testText
.length(); i
>0; i
--) {
4347 breakPos
= bi
->preceding(i
);
4348 if (breakPos
>= i
||
4349 breakPos
> lastBreakPos
||
4350 breakPos
< 0 && testText
.getChar32Start(i
)>0 ||
4351 breakPos
< lastBreakPos
&& lastBreakPos
< testText
.getChar32Start(i
) ) {
4352 errln("%s break monkey test: "
4353 "Out of range value returned by BreakIterator::preceding().\n"
4354 "index=%d; prev returned %d; lastBreak=%d" ,
4355 name
, i
, breakPos
, lastBreakPos
);
4356 if (breakPos
>= 0 && breakPos
< (int32_t)sizeof(precedingBreaks
)) {
4357 precedingBreaks
[i
] = 2; // Forces an error.
4360 if (breakPos
>= 0) {
4361 precedingBreaks
[breakPos
] = 1;
4363 lastBreakPos
= breakPos
;
4367 // Compare the expected and actual results.
4368 for (i
=0; i
<=testText
.length(); i
++) {
4369 const char *errorType
= NULL
;
4370 if (forwardBreaks
[i
] != expectedBreaks
[i
]) {
4371 errorType
= "next()";
4372 } else if (reverseBreaks
[i
] != forwardBreaks
[i
]) {
4373 errorType
= "previous()";
4374 } else if (isBoundaryBreaks
[i
] != expectedBreaks
[i
]) {
4375 errorType
= "isBoundary()";
4376 } else if (followingBreaks
[i
] != expectedBreaks
[i
]) {
4377 errorType
= "following()";
4378 } else if (precedingBreaks
[i
] != expectedBreaks
[i
]) {
4379 errorType
= "preceding()";
4383 if (errorType
!= NULL
) {
4384 // Format a range of the test text that includes the failure as
4385 // a data item that can be included in the rbbi test data file.
4387 // Start of the range is the last point where expected and actual results
4388 // both agreed that there was a break position.
4389 int startContext
= i
;
4392 if (startContext
==0) { break; }
4394 if (expectedBreaks
[startContext
] != 0) {
4395 if (count
== 2) break;
4400 // End of range is two expected breaks past the start position.
4401 int endContext
= i
+ 1;
4403 for (ci
=0; ci
<2; ci
++) { // Number of items to include in error text.
4405 if (endContext
>= testText
.length()) {break;}
4406 if (expectedBreaks
[endContext
-1] != 0) {
4407 if (count
== 0) break;
4414 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4415 UnicodeString errorText
= "<data>";
4416 /***if (strcmp(errorType, "next()") == 0) {
4418 endContext = testText.length();
4420 printStringBreaks(testText, expected, expectedCount);
4423 for (ci
=startContext
; ci
<endContext
;) {
4424 UnicodeString
hexChars("0123456789abcdef");
4427 c
= testText
.char32At(ci
);
4429 // This is the location of the error.
4430 errorText
.append("<?>");
4431 } else if (expectedBreaks
[ci
] != 0) {
4432 // This is a non-error expected break position.
4433 errorText
.append("\\");
4436 errorText
.append("\\u");
4437 for (bn
=12; bn
>=0; bn
-=4) {
4438 errorText
.append(hexChars
.charAt((c
>>bn
)&0xf));
4441 errorText
.append("\\U");
4442 for (bn
=28; bn
>=0; bn
-=4) {
4443 errorText
.append(hexChars
.charAt((c
>>bn
)&0xf));
4446 ci
= testText
.moveIndex32(ci
, 1);
4448 errorText
.append("\\");
4449 errorText
.append("</data>\n");
4452 char charErrorTxt
[500];
4453 UErrorCode status
= U_ZERO_ERROR
;
4454 errorText
.extract(charErrorTxt
, sizeof(charErrorTxt
), NULL
, status
);
4455 charErrorTxt
[sizeof(charErrorTxt
)-1] = 0;
4456 errln("%s break monkey test error. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
4457 name
, (expectedBreaks
[i
]? "break expected but not found" : "break found but not expected"),
4458 errorType
, seed
, i
, charErrorTxt
);
4469 // TestDebug - A place-holder test for debugging purposes.
4470 // For putting in fragments of other tests that can be invoked
4471 // for tracing without a lot of unwanted extra stuff happening.
4473 void RBBITest::TestDebug(void) {
// Scratch test: builds one break iterator, probes a single boundary position,
// and dumps break positions to stdout with printf (it does not report through
// errln the way the other tests do).
// NOTE(review): several lines of this function are not visible in this
// excerpt (e.g. the declarations of `pos` and `ruleStatus`, and the opener of
// the do/while loop below) -- confirm against the full file before editing.
4475 UErrorCode status
= U_ZERO_ERROR
;
// The iterator under test is a sentence instance for the default locale.
// The two commented-out lines are alternative factories (line instance,
// Thai word instance) that can be swapped in by hand.
// NOTE(review): `status` is not checked and `bi` is not deleted anywhere in
// the visible lines -- verify whether that happens on lines not shown here.
4479 RuleBasedBreakIterator
* bi
=
4480 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4481 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4482 (RuleBasedBreakIterator
*)BreakIterator::createSentenceInstance(Locale::getDefault(), status
);
// Test text is written with doubled backslashes, so the literal holds the
// escape sequences as text rather than the code points themselves.
// Presumably it is unescaped and handed to `bi` on lines not visible here --
// TODO confirm.
4483 UnicodeString
s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4484 // UnicodeString s("Aaa. Bcd");
// Probe a single position (index 8) and print "true" or "false".
4487 UBool r
= bi
->isBoundary(8);
4488 printf("%s", r
?"true":"false");
// Walk backwards with previous() until BreakIterator::DONE, printing
// "<pos>\t<ruleStatus>" for each step. The getRuleStatus() call is commented
// out, so `ruleStatus` is not refreshed inside the visible part of the loop.
4492 // ruleStatus = bi->getRuleStatus();
4493 printf("%d\t%d\n", pos
, ruleStatus
);
4494 pos
= bi
->previous();
4495 } while (pos
!= BreakIterator::DONE
);
4499 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */