1 /********************************************************************
3 * Copyright (c) 1999-2006, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6 /************************************************************************
7 * Date Name Description
8 * 12/15/99 Madhu Creation.
9 * 01/12/2000 Madhu Updated for changed API and added new tests
10 ************************************************************************/
12 #include "unicode/utypes.h"
14 #if !UCONFIG_NO_BREAK_ITERATION
16 #include "unicode/utypes.h"
17 #include "unicode/brkiter.h"
18 #include "unicode/rbbi.h"
19 #include "unicode/uchar.h"
20 #include "unicode/utf16.h"
21 #include "unicode/ucnv.h"
22 #include "unicode/schriter.h"
23 #include "unicode/uniset.h"
24 #include "unicode/regex.h" // TODO: make conditional on regexp being built.
25 #include "unicode/ustring.h"
26 #include "unicode/utext.h"
// TEST_ASSERT(x): when the expression x evaluates false, report a failure via
// errln() together with the file name and line number of the macro use.
// The macro only reports; it does not return from the calling function.
// errln is called unqualified, so it must be resolvable where the macro is used.
37 #define TEST_ASSERT(x) {if (!(x)) { \
38 errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
// TEST_ASSERT_SUCCESS(errcode): same reporting, triggered when U_FAILURE(errcode)
// is true; the message additionally carries u_errorName(errcode).
40 #define TEST_ASSERT_SUCCESS(errcode) {if (U_FAILURE(errcode)) { \
41 errln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
44 //---------------------------------------------------------------------------
46 // class BITestData Holds a set of Break iterator test data and results
48 // - the string data to be broken
49 // - a vector of the expected break positions.
50 // - a vector of source line numbers for the data,
51 // (to help see where errors occurred.)
52 // - The expected break tag values.
53 // - Vectors of actual break positions and tag values.
54 // - Functions for comparing actual with expected and
//     reporting the differences (checkResults() and err()).
57 //----------------------------------------------------------------------------
// The text under test; addDataChunk() appends every chunk to this string.
60 UnicodeString fDataToBreak
;
// Expected break offsets: addDataChunk() stores fDataToBreak.length() after each append.
61 UVector fExpectedBreakPositions
;
// Expected tag for each entry of fExpectedBreakPositions (added in the same call).
62 UVector fExpectedTags
;
// Break offsets produced by the iterator under test; filled in by the RBBITest::test* helpers.
64 UVector fActualBreakPositions
; // Test Results.
// Constructor; status is forwarded to the UVector members.
67 BITestData(UErrorCode
&status
);
// Appends one chunk of text plus its expected tag and originating source line.
68 void addDataChunk(const char *data
, int32_t tag
, int32_t lineNum
, UErrorCode status
);
// Compares actual against expected results and reports differences through test.
69 void checkResults(const char *heading
, RBBITest
*test
);
// Reports a single position mismatch found by checkResults().
70 void err(const char *heading
, RBBITest
*test
, int32_t expectedIdx
, int32_t actualIdx
);
// Constructor. Every UVector member is constructed with the caller's status
// object (presumably so that a construction failure is reported through it -
// confirm against UVector's constructor contract).
77 BITestData::BITestData(UErrorCode
&status
)
78 : fExpectedBreakPositions(status
), fExpectedTags(status
), fLineNum(status
), fActualBreakPositions(status
),
84 // addDataChunk. Add a section (non-breaking) piece of data to the test data.
85 // The macro form collects the line number, which is helpful
86 // when tracking down failures.
88 // A null data item is inserted at the start of each test's data
89 // to put the starting zero into the data list. The position saved for
90 // each non-null item is its ending position.
//
// Parameters:
//   data    - chunk text, converted with CharsToUnicodeString() before being appended.
//   tag     - expected tag for the break at the end of this chunk.
//   lineNum - source line of the ADD_DATACHUNK use (the macro passes __LINE__).
//   status  - nothing is added when it already holds a failure.
//
// NOTE(review): status is received by value, so a failure recorded by the
// addElement() calls below is not visible to the caller - confirm whether a
// reference parameter was intended.
// Note that the macro expansion already ends with ';'.
92 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status);
93 void BITestData::addDataChunk(const char *data
, int32_t tag
, int32_t lineNum
, UErrorCode status
) {
94 if (U_FAILURE(status
)) {return;}
96 fDataToBreak
.append(CharsToUnicodeString(data
));
// The expected break position is the length of all text accumulated so far,
// i.e. the end offset of the chunk just appended.
98 fExpectedBreakPositions
.addElement(fDataToBreak
.length(), status
);
99 fExpectedTags
.addElement(tag
, status
);
100 fLineNum
.addElement(lineNum
, status
);
105 // checkResults. Compare the actual and expected break positions, report any differences.
//
// Parameters:
//   heading - text placed at the front of every reported message.
//   test    - the RBBITest that receives tag-mismatch reports through errln();
//             it is also passed on to err() for position mismatches.
107 void BITestData::checkResults(const char *heading
, RBBITest
*test
) {
// Independent cursors into the expected and the actual result vectors.
108 int32_t expectedIndex
= 0;
109 int32_t actualIndex
= 0;
112 // If we've run through both the expected and actual results vectors, we're done.
113 // break out of the loop.
114 if (expectedIndex
>= fExpectedBreakPositions
.size() &&
115 actualIndex
>= fActualBreakPositions
.size()) {
// The expected list is used up: report against its last entry (expectedIndex-1).
120 if (expectedIndex
>= fExpectedBreakPositions
.size()) {
121 err(heading
, test
, expectedIndex
-1, actualIndex
);
// The actual list is used up: report against its last entry (actualIndex-1).
126 if (actualIndex
>= fActualBreakPositions
.size()) {
127 err(heading
, test
, expectedIndex
, actualIndex
-1);
// Both cursors are valid: the break offsets themselves must match.
132 if (fActualBreakPositions
.elementAti(actualIndex
) != fExpectedBreakPositions
.elementAti(expectedIndex
)) {
133 err(heading
, test
, expectedIndex
, actualIndex
);
134 // Try to resync the positions of the indices, to avoid a rash of spurious errors.
135 if (fActualBreakPositions
.elementAti(actualIndex
) < fExpectedBreakPositions
.elementAti(expectedIndex
)) {
// Compare the tag recorded for this break with the expected tag.
143 if (fActualTags
.elementAti(actualIndex
) != fExpectedTags
.elementAti(expectedIndex
)) {
// NOTE(review): fLineNum is read with elementAt() here, while every other
// access in this class uses elementAti(); the value is formatted with %d.
// Confirm that elementAti() was intended.
144 test
->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",
145 heading
, fLineNum
.elementAt(expectedIndex
),
146 fExpectedTags
.elementAti(expectedIndex
), fActualTags
.elementAti(actualIndex
));
155 // err - An error was found. Report it, along with information about where the
156 // incorrectly broken test data appeared in the source file.
//
// Parameters:
//   heading     - text placed at the front of the message.
//   test        - the RBBITest whose errln() receives the message.
//   expectedIdx - index into fExpectedBreakPositions / fLineNum.
//   actualIdx   - index into fActualBreakPositions.
158 void BITestData::err(const char *heading
, RBBITest
*test
, int32_t expectedIdx
, int32_t actualIdx
)
160 int32_t expected
= fExpectedBreakPositions
.elementAti(expectedIdx
);
161 int32_t actual
= fActualBreakPositions
.elementAti(actualIdx
);
// Source line of the ADD_DATACHUNK that produced the expected entry.
163 int32_t line
= fLineNum
.elementAti(expectedIdx
);
164 if (expectedIdx
> 0) {
165 // The line numbers are off by one because a premature break occurs somewhere
166 // within the previous item, rather than at the start of the current (expected) item.
167 // We want to report the offset of the unexpected break from the start of
168 // this previous item.
// o becomes the actual break's offset relative to the preceding expected break.
169 o
= actual
- fExpectedBreakPositions
.elementAti(expectedIdx
-1);
// A break that falls before the expected position is reported as unexpected.
171 if (actual
< expected
) {
172 test
->errln("%s unexpected break at offset %d in test item from line %d", heading
, o
, line
);
// Message used when the expected break was not produced.
174 test
->errln("%s Failed to find break at end of item from line %d", heading
, line
);
// clearResults. Empties the two "actual" result vectors (positions and tags).
// The expected data and the text are not touched by the statements shown here.
179 void BITestData::clearResults() {
180 fActualBreakPositions
.removeAllElements();
181 fActualTags
.removeAllElements();
185 //-----------------------------------------------------------------------------------
187 // Canned Test Characters
189 //-----------------------------------------------------------------------------------
// Zero-terminated list of UTF-16 code units; the RBBITest constructor copies
// it into *cannedTestChars after a leading U+0000.
191 static const UChar cannedTestArray
[] = {
192 0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,
193 0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,
194 0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,
195 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,
196 0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,
197 0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,
198 0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,
199 0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000
// File-level pointer; allocated in RBBITest::RBBITest() and deleted in RBBITest::~RBBITest().
202 static UnicodeString
* cannedTestChars
= 0;
// Escaped Devanagari sequences. The doubled backslashes keep the \uXXXX escapes
// literal in the C string (presumably for later unescaping, as addDataChunk()
// does with CharsToUnicodeString()). Each half* macro is a letter followed by
// U+094D and U+200D; deadTA has no trailing U+200D.
204 #define halfNA "\\u0928\\u094d\\u200d"
205 #define halfSA "\\u0938\\u094d\\u200d"
206 #define halfCHA "\\u091a\\u094d\\u200d"
207 #define halfKA "\\u0915\\u094d\\u200d"
208 #define deadTA "\\u0924\\u094d"
210 //--------------------------------------------------------------------------------------
212 // RBBITest constructor and destructor
214 //--------------------------------------------------------------------------------------
// Constructor: builds the file-level cannedTestChars string as a single U+0000
// followed by the contents of cannedTestArray.
// NOTE(review): cannedTestChars is a file-static pointer, yet it is allocated
// and deleted per RBBITest instance; this looks safe only if a single RBBITest
// exists at a time - confirm.
216 RBBITest::RBBITest() {
217 UnicodeString
temp(cannedTestArray
);
218 cannedTestChars
= new UnicodeString();
// Appended explicitly: a leading U+0000 cannot come from the zero-terminated array.
219 *cannedTestChars
+= (UChar
)0x0000;
220 *cannedTestChars
+= temp
;
// Destructor: releases the string allocated by the constructor.
224 RBBITest::~RBBITest() {
225 delete cannedTestChars
;
// Tag values used as expected rule-status results. The literal values 300 and
// 400 are what TestJapaneseWordBreak() passes as chunk tags.
// NOTE(review): presumably these mirror the word-break rule status ranges of
// the break iterator API (number / letter / kana / ideograph) - confirm.
229 static const int T_NUMBER
= 100;
230 static const int T_LETTER
= 200;
231 static const int T_H_OR_K
= 300;
232 static const int T_IDEO
= 400;
239 //--------------------------------------------------------------------
240 //Testing the BreakIterator for devanagari script
241 //--------------------------------------------------------------------
// Escaped Devanagari sequences used as test text. Every dead* macro is a
// letter followed by U+094D (the virama, per the comments on deadRA/deadPHA).
243 #define deadRA "\\u0930\\u094d" /*deadform RA = devanagari RA + virama*/
244 #define deadPHA "\\u092b\\u094d" /*deadform PHA = devanagari PHA + virama*/
245 #define deadTTHA "\\u0920\\u094d"
246 #define deadPA "\\u092a\\u094d"
247 #define deadSA "\\u0938\\u094d"
248 #define visarga "\\u0903" /*devanagari visarga looks like a english colon*/
255 //-----------------------------------------------------------------------------------
257 // Test for status {tag} return value from break rules.
258 // TODO: a more thorough test.
260 //-----------------------------------------------------------------------------------
// Builds a RuleBasedBreakIterator from a small rule set, iterates over a fixed
// string with first()/next(), and checks each break offset against bounds1[]
// and each getRuleStatus() value against brkStatus[].
261 void RBBITest::TestStatusReturn() {
262 UnicodeString rulesString1
= "$Letters = [:L:];\n"
263 "$Numbers = [:N:];\n"
266 "Help\\ {4}/me\\!;\n"
267 "[^$Letters $Numbers];\n"
269 UnicodeString testString1
= "abc123..abc Help me Help me!";
270 // 01234567890123456789012345678
// Expected break offsets and the expected status at each of them; both arrays
// are parallel and end with -1.
271 int32_t bounds1
[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
272 int32_t brkStatus
[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
274 UErrorCode status
=U_ZERO_ERROR
;
275 UParseError parseError
;
277 RuleBasedBreakIterator
*bi
= new RuleBasedBreakIterator(rulesString1
, parseError
, status
);
278 if(U_FAILURE(status
)) {
279 errln("FAIL : in construction");
283 bi
->setText(testString1
);
// Walk every boundary from the start of the text until DONE.
284 for (pos
=bi
->first(); pos
!= BreakIterator::DONE
; pos
=bi
->next()) {
285 if (pos
!= bounds1
[i
]) {
286 errln("FAIL: expected break at %d, got %d\n", bounds1
[i
], pos
);
// The status of the rule that produced this boundary.
290 int tag
= bi
->getRuleStatus();
291 if (tag
!= brkStatus
[i
]) {
292 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos
, brkStatus
[i
], tag
);
// printStringBreaks. File-local debugging aid: prints to stdout (printf) one
// row per index of ustr showing the code point and several of its Unicode
// properties, with a dashed separator row at every index listed in expected[].
302 static void printStringBreaks(UnicodeString ustr
, int expected
[],
305 UErrorCode status
= U_ZERO_ERROR
;
// Column header for the rows printed below.
307 printf("code alpha extend alphanum type word sent line name\n");
309 for (j
= 0; j
< ustr
.length(); j
++) {
310 if (expectedcount
> 0) {
// Linear scan of expected[] for the current index.
312 for (k
= 0; k
< expectedcount
; k
++) {
313 if (j
== expected
[k
]) {
314 printf("------------------------------------------------ %d\n",
319 UChar32 c
= ustr
.char32At(j
);
// Fetch the character name into name (capacity passed as 100).
323 u_charName(c
, U_UNICODE_CHAR_NAME
, name
, 100, &status
);
324 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c
,
326 u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
),
// The property values below are printed by their short property-value names.
328 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY
,
330 U_SHORT_PROPERTY_NAME
),
331 u_getPropertyValueName(UCHAR_WORD_BREAK
,
332 u_getIntPropertyValue(c
,
334 U_SHORT_PROPERTY_NAME
),
335 u_getPropertyValueName(UCHAR_SENTENCE_BREAK
,
336 u_getIntPropertyValue(c
,
337 UCHAR_SENTENCE_BREAK
),
338 U_SHORT_PROPERTY_NAME
),
339 u_getPropertyValueName(UCHAR_LINE_BREAK
,
340 u_getIntPropertyValue(c
,
342 U_SHORT_PROPERTY_NAME
),
// TestThaiLineBreak. Describes the expected line-break segmentation of a Thai
// sentence chunk by chunk (all tags 0), creates a line break iterator for
// Locale("th") and runs generalIteratorTest() on the data.
347 void RBBITest::TestThaiLineBreak() {
348 UErrorCode status
= U_ZERO_ERROR
;
349 BITestData
thaiLineSelection(status
);
351 // \u0e2f-- the Thai paiyannoi character-- isn't a letter. It's a symbol that
352 // represents elided letters at the end of a long word. It should be bound to
353 // the end of the word and not treated as an independent punctuation mark.
356 ADD_DATACHUNK(thaiLineSelection
, NULL
, 0, status
); // Break at start of data
357 ADD_DATACHUNK(thaiLineSelection
, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status
);
358 ADD_DATACHUNK(thaiLineSelection
, "\\u0e08\\u0e30", 0, status
);
359 ADD_DATACHUNK(thaiLineSelection
, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status
);
360 ADD_DATACHUNK(thaiLineSelection
, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status
);
361 // ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status);
362 // ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
363 ADD_DATACHUNK(thaiLineSelection
, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status
);
364 // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us
365 ADD_DATACHUNK(thaiLineSelection
, "\\u0e2d\\u0e2d\\u0e01", 0, status
);
366 ADD_DATACHUNK(thaiLineSelection
, "\\u0e21\\u0e32", 0, status
);
367 ADD_DATACHUNK(thaiLineSelection
, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status
);
368 ADD_DATACHUNK(thaiLineSelection
, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status
);
369 ADD_DATACHUNK(thaiLineSelection
, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status
);
370 ADD_DATACHUNK(thaiLineSelection
, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status
);
372 // the one time where the paiyannoi occurs somewhere other than at the end
373 // of a word is in the Thai abbreviation for "etc.", which both begins and
374 // ends with a paiyannoi
375 ADD_DATACHUNK(thaiLineSelection
, "\\u0e2f\\u0e25\\u0e2f", 0, status
);
376 ADD_DATACHUNK(thaiLineSelection
, "\\u0e17\\u0e35\\u0e48", 0, status
);
377 ADD_DATACHUNK(thaiLineSelection
, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status
);
// The factory result is cast to RuleBasedBreakIterator* because
// generalIteratorTest() takes a RuleBasedBreakIterator&.
379 RuleBasedBreakIterator
* e
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(
380 Locale("th"), status
);
381 if (U_FAILURE(status
))
383 errln("Failed to create the BreakIterator for Thai locale in TestThaiLineBreak.\n");
387 generalIteratorTest(*e
, thaiLineSelection
);
// TestMixedThaiLineBreak. Same scheme as TestThaiLineBreak(), with data that
// mixes Thai text with Arabic numerals, Thai numerals, punctuation and English
// text. Several chunks end in a space, so the expected break follows the space.
393 void RBBITest::TestMixedThaiLineBreak()
395 UErrorCode status
= U_ZERO_ERROR
;
396 BITestData
thaiLineSelection(status
);
398 ADD_DATACHUNK(thaiLineSelection
, NULL
, 0, status
); // Break at start of data
401 // @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters
404 ADD_DATACHUNK(thaiLineSelection
, "\\u0E1B\\u0E35", 0, status
);
405 ADD_DATACHUNK(thaiLineSelection
, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status
);
406 ADD_DATACHUNK(thaiLineSelection
, "2545 ", 0, status
);
407 ADD_DATACHUNK(thaiLineSelection
, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status
);
408 ADD_DATACHUNK(thaiLineSelection
, "\\u0E1B\\u0E35", 0, status
);
409 ADD_DATACHUNK(thaiLineSelection
, "\\u0E09\\u0E25\\u0E2D\\u0E07", 0, status
);
410 ADD_DATACHUNK(thaiLineSelection
, "\\u0E04\\u0E23\\u0E1A", 0, status
);
411 ADD_DATACHUNK(thaiLineSelection
, "\\u0E23\\u0E2D\\u0E1A ", 0, status
);
412 ADD_DATACHUNK(thaiLineSelection
, "\"\\u0E52\\u0E52\\u0E50 ", 0, status
);
413 ADD_DATACHUNK(thaiLineSelection
, "\\u0E1b\\u0E35\" ", 0, status
);
414 ADD_DATACHUNK(thaiLineSelection
, "\\u0E02\\u0E2d\\u0E07", 0, status
);
415 ADD_DATACHUNK(thaiLineSelection
, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status
);
416 ADD_DATACHUNK(thaiLineSelection
, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status
);
417 ADD_DATACHUNK(thaiLineSelection
, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status
);
418 ADD_DATACHUNK(thaiLineSelection
, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status
);
419 ADD_DATACHUNK(thaiLineSelection
, "Bangkok)", 0, status
);
421 // @suwit - end of changes
// Line break iterator for Thai, cast for use with generalIteratorTest().
424 RuleBasedBreakIterator
* e
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale("th"), status
);
425 if (U_FAILURE(status
))
427 errln("Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak.\n");
432 generalIteratorTest(*e
, thaiLineSelection
);
// TestMaiyamok. Checks Thai line breaking around the maiyamok character
// (\u0e46): the first two chunks end with it, so no break is expected
// between it and the word before it.
437 void RBBITest::TestMaiyamok()
439 UErrorCode status
= U_ZERO_ERROR
;
440 BITestData
thaiLineSelection(status
);
441 ADD_DATACHUNK(thaiLineSelection
, NULL
, 0, status
); // Break at start of data
442 // the Thai maiyamok character is a shorthand symbol that means "repeat the previous
443 // word". Instead of appearing as a word unto itself, however, it's kept together
444 // with the word before it
445 ADD_DATACHUNK(thaiLineSelection
, "\\u0e44\\u0e1b\\u0e46", 0, status
);
446 ADD_DATACHUNK(thaiLineSelection
, "\\u0e21\\u0e32\\u0e46", 0, status
);
447 ADD_DATACHUNK(thaiLineSelection
, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status
);
448 ADD_DATACHUNK(thaiLineSelection
, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status
);
449 ADD_DATACHUNK(thaiLineSelection
, "\\u0e40\\u0e17\\u0e1e", 0, status
);
450 ADD_DATACHUNK(thaiLineSelection
, "\\u0e41\\u0e25\\u0e30", 0, status
);
451 ADD_DATACHUNK(thaiLineSelection
, "\\u0e40\\u0e03\\u0e35", 0, status
);
452 ADD_DATACHUNK(thaiLineSelection
, "\\u0e22\\u0e07", 0, status
);
453 ADD_DATACHUNK(thaiLineSelection
, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status
);
// Line break iterator for Thai, cast for use with generalIteratorTest().
455 RuleBasedBreakIterator
* e
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(
456 Locale("th"), status
);
458 if (U_FAILURE(status
))
460 errln("Failed to create the BreakIterator for Thai locale in TestMaiyamok.\n");
463 generalIteratorTest(*e
, thaiLineSelection
);
// TestBug3818. With a Thai word break iterator over the same four-code-unit
// word repeated four times, following(1) and following(0) are both expected
// to return 4, the start of the second word.
469 void RBBITest::TestBug3818() {
470 UErrorCode status
= U_ZERO_ERROR
;
472 // Four Thai words...
473 static const UChar thaiWordData
[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
474 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
475 UnicodeString
thaiStr(thaiWordData
);
477 RuleBasedBreakIterator
* bi
=
478 (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale("th"), status
);
// Both a failure status and a null iterator are reported as a failure.
479 if (U_FAILURE(status
) || bi
== NULL
) {
480 errln("Fail at file %s, line %d, status = %s", __FILE__
, __LINE__
, u_errorName(status
));
483 bi
->setText(thaiStr
);
// First probe: start from offset 1, inside the first word.
485 int32_t startOfSecondWord
= bi
->following(1);
486 if (startOfSecondWord
!= 4) {
487 errln("Fail at file %s, line %d expected start of word at 4, got %d",
488 __FILE__
, __LINE__
, startOfSecondWord
);
// Second probe: start from offset 0, the beginning of the text.
490 startOfSecondWord
= bi
->following(0);
491 if (startOfSecondWord
!= 4) {
492 errln("Fail at file %s, line %d expected start of word at 4, got %d",
493 __FILE__
, __LINE__
, startOfSecondWord
);
// TestJapaneseWordBreak. Expected word segmentation of a short Japanese
// sentence, run through generalIteratorTest() with a word break iterator for
// Locale("ja"). The chunk tags 400 and 300 equal the values of T_IDEO and
// T_H_OR_K defined earlier in this file. The trailing number on each line is
// the cumulative end offset of the chunk in code units.
499 void RBBITest::TestJapaneseWordBreak() {
500 UErrorCode status
= U_ZERO_ERROR
;
501 BITestData
japaneseWordSelection(status
);
503 ADD_DATACHUNK(japaneseWordSelection
, NULL
, 0, status
); // Break at start of data
504 ADD_DATACHUNK(japaneseWordSelection
, "\\u4ECA\\u65E5", 400, status
); //2
505 ADD_DATACHUNK(japaneseWordSelection
, "\\u306F\\u3044\\u3044", 300, status
); //5
506 ADD_DATACHUNK(japaneseWordSelection
, "\\u5929\\u6C17", 400, status
); //7
507 ADD_DATACHUNK(japaneseWordSelection
, "\\u3067\\u3059\\u306D", 300, status
); //10
508 ADD_DATACHUNK(japaneseWordSelection
, "\\u3002", 0, status
); //11
509 ADD_DATACHUNK(japaneseWordSelection
, "\\u000D\\u000A", 0, status
); //13 (CR LF adds two code units)
// Word break iterator for Japanese, cast for use with generalIteratorTest().
511 RuleBasedBreakIterator
* e
= (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(
512 Locale("ja"), status
);
513 if (U_FAILURE(status
))
515 errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");
519 generalIteratorTest(*e
, japaneseWordSelection
);
// TestTrieDict. Exercises MutableTrieDictionary and CompactTrieDictionary with
// the words from the test data file "riwords.txt":
//   1. add every non-comment line of the file to a MutableTrieDictionary;
//   2. build a CompactTrieDictionary from it;
//   3. clone the compact dictionary back to a mutable one (cloneMutable);
//   4. build a second CompactTrieDictionary from the first one's data();
// After each step the word count reported by the dictionary's enumerator is
// compared with the word count taken from the file, and the original and the
// cloned mutable dictionaries are compared word by word.
523 void RBBITest::TestTrieDict() {
524 UErrorCode status
= U_ZERO_ERROR
;
527 // Open and read the test data file.
529 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
530 char testFileName
[1000];
// Make sure directory + file name fit into testFileName before the
// strcpy/strcat below. The same message is used for a NULL directory.
531 if (testDataDirectory
== NULL
|| strlen(testDataDirectory
) + strlen("riwords.txt") + 10 >= sizeof(testFileName
)) {
532 errln("Can't open test data. Path too long.");
535 strcpy(testFileName
, testDataDirectory
);
536 strcat(testFileName
, "riwords.txt");
538 // Items needing deleting at the end
// All start out as NULL.
539 MutableTrieDictionary
*mutableDict
= NULL
;
540 CompactTrieDictionary
*compactDict
= NULL
;
541 UnicodeSet
*breaks
= NULL
;
542 UChar
*testFile
= NULL
;
543 StringEnumeration
*enumer
= NULL
;
544 MutableTrieDictionary
*mutable2
= NULL
;
545 StringEnumeration
*cloneEnum
= NULL
;
546 CompactTrieDictionary
*compact2
= NULL
;
// Cursors for the word-by-word comparison of original and clone.
549 const UnicodeString
*originalWord
= NULL
;
550 const UnicodeString
*cloneWord
= NULL
;
559 testFile
= ReadAndConvertFile(testFileName
, len
, status
);
560 if (U_FAILURE(status
)) {
561 goto cleanup
; /* something went wrong, error already output */
564 mutableDict
= new MutableTrieDictionary(0x0E1C, status
);
565 if (U_FAILURE(status
)) {
566 errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status
));
// Set of the characters that end a line in the word file.
570 breaks
= new UnicodeSet
;
571 breaks
->add(0x000A); // Line Feed
572 breaks
->add(0x000D); // Carriage Return
573 breaks
->add(0x2028); // Line Separator
574 breaks
->add(0x2029); // Paragraph Separator
576 // Now add each non-comment line of the file as a word.
// 0x0023 is '#'.
584 if (uc
== 0x0023) { // #comment line, skip
585 while (uc
&& !breaks
->contains(uc
)) {
// Not a comment: scan to the end of the line.
589 else while (uc
&& !breaks
->contains(uc
)) {
594 mutableDict
->addWord(word
, wordLen
, status
);
595 if (U_FAILURE(status
)) {
596 errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status
));
602 // Find beginning of next line
603 while (uc
&& breaks
->contains(uc
)) {
// Sanity check on the number of words read from the file.
610 if (wordCount
< 50) {
611 errln("Word count (%d) unreasonably small\n", wordCount
);
// Step 1 check: word count of the mutable dictionary.
615 enumer
= mutableDict
->openWords(status
);
616 if (U_FAILURE(status
)) {
617 errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status
));
622 if (wordCount
!= (testCount
= enumer
->count(status
))) {
623 errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
624 testCount
, wordCount
, u_errorName(status
));
// Step 2: compact the mutable dictionary and count its words.
632 compactDict
= new CompactTrieDictionary(*mutableDict
, status
);
633 if (U_FAILURE(status
)) {
634 errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status
));
638 enumer
= compactDict
->openWords(status
);
639 if (U_FAILURE(status
)) {
640 errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status
));
644 if (wordCount
!= (testCount
= enumer
->count(status
))) {
645 errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
646 testCount
, wordCount
, u_errorName(status
));
// Step 3: clone the compact dictionary back into a mutable one and count its words.
654 mutable2
= compactDict
->cloneMutable(status
);
655 if (U_FAILURE(status
)) {
656 errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status
));
660 cloneEnum
= mutable2
->openWords(status
);
661 if (U_FAILURE(status
)) {
662 errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status
));
666 if (wordCount
!= (testCount
= cloneEnum
->count(status
))) {
667 errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
668 testCount
, wordCount
, u_errorName(status
));
672 // Compare original dictionary to clone. Note that we can only compare the same kind of
673 // dictionary as the order of the enumerators is not guaranteed to be the same between
//     different kinds of dictionary.
675 enumer
= mutableDict
->openWords(status
);
676 if (U_FAILURE(status
)) {
677 errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status
));
// Step both enumerators in lock step until one ends or an error occurs.
681 originalWord
= enumer
->snext(status
);
682 cloneWord
= cloneEnum
->snext(status
);
683 while (U_SUCCESS(status
) && originalWord
!= NULL
&& cloneWord
!= NULL
) {
684 if (*originalWord
!= *cloneWord
) {
685 errln("Original and cloned MutableTrieDictionary word mismatch\n");
688 originalWord
= enumer
->snext(status
);
689 cloneWord
= cloneEnum
->snext(status
);
692 if (U_FAILURE(status
)) {
693 errln("Enumeration failed: %s\n", u_errorName(status
));
// Pointer comparison: per the message below, a mismatch here means the two
// enumerations ended at different points.
697 if (originalWord
!= cloneWord
) {
698 errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
702 // Test the data copying constructor for CompactTrieDict, and the data access APIs.
703 compact2
= new CompactTrieDictionary(compactDict
->data(), status
);
704 if (U_FAILURE(status
)) {
705 errln("CompactTrieDictionary(const void *,...) failed\n");
709 if (compact2
->dataSize() == 0) {
710 errln("CompactTrieDictionary->dataSize() == 0\n");
714 // Now count the words via the second dictionary
716 enumer
= compact2
->openWords(status
);
717 if (U_FAILURE(status
)) {
718 errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status
));
722 if (wordCount
!= (testCount
= enumer
->count(status
))) {
723 errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
724 testCount
, wordCount
, u_errorName(status
));
739 //---------------------------------------------
// runIndexedTest. Maps a test index to a test: name is always set to the
// test's name, and the test itself is run only when exec is true. An index
// with no matching case sets name to "" (see the comment on the default case).
741 //---------------------------------------------
743 void RBBITest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* params
)
745 if (exec
) logln("TestSuite RuleBasedBreakIterator: ");
748 case 0: name
= "TestBug4153072";
749 if(exec
) TestBug4153072(); break;
750 case 1: name
= "TestJapaneseLineBreak";
751 if(exec
) TestJapaneseLineBreak(); break;
752 case 2: name
= "TestStatusReturn";
753 if(exec
) TestStatusReturn(); break;
755 case 3: name
= "TestLineBreakData";
756 if(exec
) TestLineBreakData(); break;
757 case 4: name
= "TestEmptyString";
758 if(exec
) TestEmptyString(); break;
760 case 5: name
= "TestGetAvailableLocales";
761 if(exec
) TestGetAvailableLocales(); break;
763 case 6: name
= "TestGetDisplayName";
764 if(exec
) TestGetDisplayName(); break;
766 case 7: name
= "TestEndBehaviour";
767 if(exec
) TestEndBehaviour(); break;
768 case 8: name
= "TestMixedThaiLineBreak";
769 if(exec
) TestMixedThaiLineBreak(); break;
770 case 9: name
= "TestThaiLineBreak";
771 if(exec
) TestThaiLineBreak(); break;
772 case 10: name
= "TestMaiyamok";
773 if(exec
) TestMaiyamok(); break;
774 case 11: name
= "TestWordBreaks";
775 if(exec
) TestWordBreaks(); break;
776 case 12: name
= "TestWordBoundary";
777 if(exec
) TestWordBoundary(); break;
778 case 13: name
= "TestLineBreaks";
779 if(exec
) TestLineBreaks(); break;
780 case 14: name
= "TestSentBreaks";
781 if(exec
) TestSentBreaks(); break;
782 case 15: name
= "TestExtended";
783 if(exec
) TestExtended(); break;
// TestMonkey is conditional on regular expression support; the log line
// below is the message for builds configured without it.
784 case 16: name
= "TestMonkey";
786 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
789 logln("skipping TestMonkey (UCONFIG_NO_REGULAR_EXPRESSIONS)");
793 case 17: name
= "TestBug3818";
794 if(exec
) TestBug3818(); break;
795 case 18: name
= "TestJapaneseWordBreak";
796 if(exec
) TestJapaneseWordBreak(); break;
797 case 19: name
= "TestDebug";
798 if(exec
) TestDebug(); break;
799 case 20: name
= "TestTrieDict";
800 if(exec
) TestTrieDict(); break;
802 default: name
= ""; break; //needed to end loop
807 //----------------------------------------------------------------------------
809 // generalIteratorTest Given a break iterator and a set of test data,
810 // Run the tests and report the results.
//
// Parameters:
//   bi - the iterator under test; its text is set from td.fDataToBreak.
//   td - the test data; each helper below records its results in td and
//        checks them with td.checkResults().
812 //----------------------------------------------------------------------------
813 void RBBITest::generalIteratorTest(RuleBasedBreakIterator
& bi
, BITestData
&td
)
816 bi
.setText(td
.fDataToBreak
);
// Forward and backward sequential iteration.
818 testFirstAndNext(bi
, td
);
820 testLastAndPrevious(bi
, td
);
// Random-access entry points: following(), preceding(), isBoundary().
822 testFollowing(bi
, td
);
823 testPreceding(bi
, td
);
824 testIsBoundary(bi
, td
);
// Consistency of next(n) and of a cloned iterator.
825 doMultipleSelectionTest(bi
, td
);
830 // testFirstAndNext. Run the iterator forwards in the obvious first(), next()
// Every position returned, and the rule status at that position, is appended
// to td's actual-result vectors, which are then compared against the expected
// values with td.checkResults().
833 void RBBITest::testFirstAndNext(RuleBasedBreakIterator
& bi
, BITestData
&td
)
835 UErrorCode status
= U_ZERO_ERROR
;
840 logln("Test first and next");
841 bi
.setText(td
.fDataToBreak
);
844 for (p
=bi
.first(); p
!=RuleBasedBreakIterator::DONE
; p
=bi
.next()) {
845 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
846 tag
= bi
.getRuleStatus();
847 td
.fActualTags
.addElement(tag
, status
);
849 // If the iterator is not making forward progress, stop.
850 // No need to raise an error here, it'll be detected in the normal check of results.
855 td
.checkResults("testFirstAndNext", this);
860 // TestLastAndPrevious. Run the iterator backwards, starting with last().
// Results are inserted at index 0 of td's actual-result vectors, so they end
// up in ascending order, the order checkResults() compares against.
862 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator
& bi
, BITestData
&td
)
864 UErrorCode status
= U_ZERO_ERROR
;
// Starts above any position the loop below is expected to produce.
866 int32_t lastP
= 0x7ffffffe;
// NOTE(review): this log text is the same as in testFirstAndNext(); it looks
// copy-pasted and probably should mention last/previous - confirm.
869 logln("Test first and next");
870 bi
.setText(td
.fDataToBreak
);
873 for (p
=bi
.last(); p
!=RuleBasedBreakIterator::DONE
; p
=bi
.previous()) {
874 // Save break position. Insert it at start of vector of results, shoving
875 // already-saved results further towards the end.
876 td
.fActualBreakPositions
.insertElementAt(p
, 0, status
);
877 // bi.previous(); // TODO: Why does this fix things up????
879 tag
= bi
.getRuleStatus();
880 td
.fActualTags
.insertElementAt(tag
, 0, status
);
882 // If the iterator is not making progress, stop.
883 // No need to raise an error here, it'll be detected in the normal check of results.
888 td
.checkResults("testLastAndPrevious", this);
// testFollowing. Loops over index values from 0 up to the text length + 1,
// records each new break position (with its rule status) in td, and finally
// compares the collected results with td.checkResults().
892 void RBBITest::testFollowing(RuleBasedBreakIterator
& bi
, BITestData
&td
)
894 UErrorCode status
= U_ZERO_ERROR
;
897 int32_t lastP
= -2; // A value that will never be returned as a break position.
898 // cannot be -1; that is returned for DONE.
901 logln("testFollowing():");
902 bi
.setText(td
.fDataToBreak
);
905 // Save the starting point, since we won't get that out of following.
907 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
908 tag
= bi
.getRuleStatus();
909 td
.fActualTags
.addElement(tag
, status
);
// The upper bound deliberately runs one past the text length.
911 for (i
= 0; i
<= td
.fDataToBreak
.length()+1; i
++) {
914 if (p
== RuleBasedBreakIterator::DONE
) {
917 // We've reached a new break position. Save it.
918 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
919 tag
= bi
.getRuleStatus();
920 td
.fActualTags
.addElement(tag
, status
);
924 // The loop normally exits by means of the break in the middle.
925 // Make sure that the index was at the correct position for the break iterator to have
// The loop index at exit is expected to equal the text length.
927 if (i
!= td
.fDataToBreak
.length()) {
928 errln("testFollowing(): iterator returned DONE prematurely.");
931 // Full check of all results.
932 td
.checkResults("testFollowing", this);
// testPreceding. Mirror image of testFollowing(): loops over index values
// from the text length down to -1, inserts each new break position (and its
// rule status) at the front of td's result vectors so they end up in
// ascending order, then compares them with td.checkResults().
937 void RBBITest::testPreceding(RuleBasedBreakIterator
& bi
, BITestData
&td
) {
938 UErrorCode status
= U_ZERO_ERROR
;
// Starts above any position the loop below is expected to produce.
941 int32_t lastP
= 0x7ffffffe;
944 logln("testPreceding():");
945 bi
.setText(td
.fDataToBreak
);
// Record the starting position before the loop.
949 td
.fActualBreakPositions
.addElement(p
, status
);
950 tag
= bi
.getRuleStatus();
951 td
.fActualTags
.addElement(tag
, status
);
// The lower bound deliberately runs one below index 0.
953 for (i
= td
.fDataToBreak
.length(); i
>=-1; i
--) {
956 if (p
== RuleBasedBreakIterator::DONE
) {
959 // We've reached a new break position. Save it.
960 td
.fActualBreakPositions
.insertElementAt(p
, 0, status
);
962 tag
= bi
.getRuleStatus();
963 td
.fActualTags
.insertElementAt(tag
, 0, status
);
966 // The loop normally exits by means of the break in the middle.
967 // Make sure that the index was at the correct position for the break iterator to have
970 errln("testPreceding(): iterator returned DONE prematurely.");
973 // Full check of all results.
974 td
.checkResults("testPreceding", this);
// testIsBoundary. Asks bi.isBoundary(i) for every index from 0 through the
// text length (inclusive); each index reported as a boundary is recorded in td
// with the rule status read right after the call. The collected results are
// then compared with td.checkResults().
979 void RBBITest::testIsBoundary(RuleBasedBreakIterator
& bi
, BITestData
&td
) {
980 UErrorCode status
= U_ZERO_ERROR
;
984 logln("testIsBoundary():");
985 bi
.setText(td
.fDataToBreak
);
988 for (i
= 0; i
<= td
.fDataToBreak
.length(); i
++) {
989 if (bi
.isBoundary(i
)) {
990 td
.fActualBreakPositions
.addElement(i
, status
); // Save result.
991 tag
= bi
.getRuleStatus();
992 td
.fActualTags
.addElement(tag
, status
);
995 td
.checkResults("testIsBoundary: ", this);
// doMultipleSelectionTest. Cross-checks next(n) against repeated single steps,
// using a clone of the iterator:
//   - forward: the clone is rewound with first() and advanced with next(count);
//     the result must equal the offset reached by stepping the original
//     iterator with next();
//   - backward: the same, starting from last() and stepping with previous().
// It also checks operator!= on a fresh clone and operator== once the two
// iterators are positioned differently.
1000 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator
& iterator
, BITestData
&td
)
1002 iterator
.setText(td
.fDataToBreak
);
// The clone is deleted at the end of this function.
1004 RuleBasedBreakIterator
* testIterator
=(RuleBasedBreakIterator
*)iterator
.clone();
1005 int32_t offset
= iterator
.first();
1009 logln("doMultipleSelectionTest text of length: %d", td
.fDataToBreak
.length());
// A fresh clone is expected to compare equal to its source.
1011 if (*testIterator
!= iterator
)
1012 errln("clone() or operator!= failed: two clones compared unequal");
// Forward pass: jump the clone by count steps from the start.
1015 testOffset
= testIterator
->first();
1016 testOffset
= testIterator
->next(count
);
1017 if (offset
!= testOffset
)
1018 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count
+ ", next(n) returned " + testOffset
+ " and next() had " + offset
);
1020 if (offset
!= RuleBasedBreakIterator::DONE
) {
1022 offset
= iterator
.next();
// After the original has advanced one more step, the two iterators are
// expected to compare unequal.
1024 if (offset
!= RuleBasedBreakIterator::DONE
&& *testIterator
== iterator
) {
1025 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count
, offset
);
// Guard condition for the "Stopping test." message below.
1026 if (count
> 10000 || offset
== -1) {
1027 errln("operator== failed too many times. Stopping test.");
1029 errln("Does (RuleBasedBreakIterator::DONE == -1)?");
1035 } while (offset
!= RuleBasedBreakIterator::DONE
);
1037 // now do it backwards...
1038 offset
= iterator
.last();
1042 testOffset
= testIterator
->last();
1043 testOffset
= testIterator
->next(count
); // next() with a negative arg is same as previous
1044 if (offset
!= testOffset
)
1045 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count
+ ", next(n) returned " + testOffset
+ " and next() had " + offset
);
1047 if (offset
!= RuleBasedBreakIterator::DONE
) {
1049 offset
= iterator
.previous();
1051 } while (offset
!= RuleBasedBreakIterator::DONE
);
1053 delete testIterator
;
1057 //---------------------------------------------
1061 //---------------------------------------------
// Runs the general iterator checks against a line-break iterator (default
// locale) using a BITestData set that holds a single empty data chunk.
// If the iterator cannot be created an error is reported.
1062 void RBBITest::TestEmptyString()
// NOTE(review): `text` is not referenced by any of the statements shown below.
1064 UnicodeString text
= "";
1065 UErrorCode status
= U_ZERO_ERROR
;
1067 BITestData
x(status
);
1068 ADD_DATACHUNK(x
, "", 0, status
); // Break at start of data
1069 RuleBasedBreakIterator
* bi
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale::getDefault(), status
);
1070 if (U_FAILURE(status
))
1072 errln("Failed to create the BreakIterator for default locale in TestEmptyString.\n");
1075 generalIteratorTest(*bi
, x
);
// Smoke test for BreakIterator::getAvailableLocales(): fetches the locale
// list, reports an error for an empty list, and logs the name of every entry
// so that each element of the returned array is actually dereferenced.
1079 void RBBITest::TestGetAvailableLocales()
1081 int32_t locCount
= 0;
// locCount is filled in by the call (passed as an output argument).
1082 const Locale
* locList
= BreakIterator::getAvailableLocales(locCount
);
1085 errln("getAvailableLocales() returned an empty list!");
1086 // Just make sure that it's returning good memory.
1088 for (i
= 0; i
< locCount
; ++i
) {
1089 logln(locList
[i
].getName());
1093 // Tests the two overloads of BreakIterator::getDisplayName().
// The single-locale overload is only verified when the default locale is
// en_US, because its output depends on the default display locale; the
// two-locale overload is always verified with en_US as the display locale.
1094 void RBBITest::TestGetDisplayName()
1096 UnicodeString result
;
1098 BreakIterator::getDisplayName(Locale::getUS(), result
);
1099 if (Locale::getDefault() == Locale::getUS() && result
!= "English (United States)")
1100 errln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
// Display name of fr_FR as seen from en_US.
1103 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result
);
1104 if (result
!= "French (France)")
1105 errln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
1109 * Test End Behaviour
// Checks word-break behaviour at the end of the string "boo.":
// first() must return 0, the next break must be at 3 (before the period),
// and the end of the string (4) must then be reachable.
1112 void RBBITest::TestEndBehaviour()
1114 UErrorCode status
= U_ZERO_ERROR
;
1115 UnicodeString
testString("boo.");
1116 BreakIterator
*wb
= BreakIterator::createWordInstance(Locale::getDefault(), status
);
1117 if (U_FAILURE(status
))
1119 errln("Failed to create the BreakIterator for default locale in TestEndBehaviour.\n");
1122 wb
->setText(testString
);
1124 if (wb
->first() != 0)
1125 errln("Didn't get break at beginning of string.");
1126 if (wb
->next() != 3)
1127 errln("Didn't get break before period in \"boo.\"");
// Short-circuit: next() is only called when current() is not already 4.
1128 if (wb
->current() != 4 && wb
->next() != 4)
1129 errln("Didn't get break at end of string.");
// Regression test for bug 4153072: isBoundary() on a word iterator whose text
// was adopted from a StringCharacterIterator built over a sub-range of
// "...Hello, World!..." (start `begin`, end length-3, initial position `begin`).
// For every index from -1 up to and including `begin`, only index 0 is
// expected to be reported as a boundary; anything else is an error.
1135 void RBBITest::TestBug4153072() {
1136 UErrorCode status
= U_ZERO_ERROR
;
1137 BreakIterator
*iter
= BreakIterator::createWordInstance(Locale::getDefault(), status
);
1138 if (U_FAILURE(status
))
1140 errln("Failed to create the BreakIterator for default locale in TestBug4153072\n");
1143 UnicodeString
str("...Hello, World!...");
// End of the restricted range: excludes the three trailing periods.
1145 int32_t end
= str
.length() - 3;
1148 StringCharacterIterator
* textIterator
= new StringCharacterIterator(str
, begin
, end
, begin
);
// adoptText(): the break iterator is handed the character iterator.
1149 iter
->adoptText(textIterator
);
1151 // Note: with the switch to UText, there is no way to restrict the
1152 // iteration range to begin at an index other than zero.
1153 // String character iterators created with a non-zero bound are
1154 // treated by RBBI as being empty.
1155 for (index
= -1; index
< begin
+ 1; ++index
) {
1156 onBoundary
= iter
->isBoundary(index
);
// Error when index 0 is NOT a boundary, or when any other index IS one.
1157 if (index
== 0? !onBoundary
: onBoundary
) {
1158 errln((UnicodeString
)"Didn't handle isBoundary correctly with offset = " + index
+
1159 " and begin index = " + begin
);
1167 * Test Japanese Line Break
// Japanese line-break test. A three-character string (U+4E00, 'x', U+4E8C) is
// used as a template; the middle character is replaced in turn by each
// character of `precedingChars` and then of `followingChars`, and a Japanese
// line-break iterator is run over the result. The error messages describe the
// expectation: a break is allowed before, but not after, a "preceding"
// character, and after, but not before, a "following" character.
1170 void RBBITest::TestJapaneseLineBreak()
1173 // Test needs updating some more... Dump it for now.
1176 // Change for Unicode TR 14: Punctuation characters with categories Pi and Pf do not count
1177 // as opening and closing punctuation for line breaking.
1178 // Also, \u30fc and \u30fe are not counted as hyphens. Remove these chars
1179 // from these tests. 6-13-2002
1181 UErrorCode status
= U_ZERO_ERROR
;
1182 UnicodeString testString
= CharsToUnicodeString("\\u4e00x\\u4e8c");
// Opening punctuation and currency symbols; the commented-out line is the older, longer list.
1183 UnicodeString precedingChars
= CharsToUnicodeString(
1184 //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
1185 "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");
// Characters that attach to the preceding text; commented-out lines are the older lists.
1186 UnicodeString followingChars
= CharsToUnicodeString(
1187 // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"
1188 ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"
1189 // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"
1190 ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"
1191 "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");
1192 BreakIterator
*iter
= BreakIterator::createLineInstance(Locale::getJapan(), status
);
1195 if (U_FAILURE(status
))
1197 errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");
// Pass 1: middle character taken from precedingChars.
1201 for (i
= 0; i
< precedingChars
.length(); i
++) {
1202 testString
.setCharAt(1, precedingChars
[i
]);
1203 iter
->setText(testString
);
1204 int32_t j
= iter
->first();
1206 errln("ja line break failure: failed to start at 0");
1209 errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars
[i
])
1210 + "' (" + ((int)(precedingChars
[i
])) + ")");
1213 errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars
[i
])
1214 + "' (" + ((int)(precedingChars
[i
])) + ")");
// Pass 2: middle character taken from followingChars.
1217 for (i
= 0; i
< followingChars
.length(); i
++) {
1218 testString
.setCharAt(1, followingChars
[i
]);
1219 iter
->setText(testString
);
1220 int j
= iter
->first();
1222 errln("ja line break failure: failed to start at 0");
1225 errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars
[i
])
1226 + "' (" + ((int)(followingChars
[i
])) + ")");
1229 errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars
[i
])
1230 + "' (" + ((int)(followingChars
[i
])) + ")");
1237 //------------------------------------------------------------------------------
1239 // RBBITest::Extended Run RBBI Tests from an external test data file
1241 //------------------------------------------------------------------------------
1245 UnicodeString dataToBreak
;
1246 UVector32
*expectedBreaks
;
// Runs one data-driven test case described by *t.
// t->bi is set to t->dataToBreak and iterated forward (first()/next()) and then
// backward (last()/previous()). For each direction the reported breaks are
// compared against t->expectedBreaks, which is indexed by string position:
// a zero entry means "no break expected here", a non-zero entry means a break
// is expected, and the entry is also used as the expected rule-status tag
// (-1 is treated as a special case). t->srcLine / t->srcCol map string
// positions back to the line and column of the test file for error messages.
1251 void RBBITest::executeTest(TestParams
*t
) {
1256 if (t
->bi
== NULL
) {
1260 t
->bi
->setText(t
->dataToBreak
);
1262 // Run the iterator forward
1265 for (bp
= t
->bi
->first(); bp
!= BreakIterator::DONE
; bp
= t
->bi
->next()) {
1267 // Fail for lack of forward progress.
1268 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
1269 bp
, t
->srcLine
->elementAti(bp
), t
->srcCol
->elementAti(bp
));
1273 // Check that we didn't miss an expected break between the last one
1275 for (i
=prevBP
+1; i
<bp
; i
++) {
1276 if (t
->expectedBreaks
->elementAti(i
) != 0) {
1277 int expected
[] = {0, i
};
1278 printStringBreaks(t
->dataToBreak
, expected
, 2);
1279 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1280 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
));
1284 // Check that the break we did find was expected
1285 if (t
->expectedBreaks
->elementAti(bp
) == 0) {
1286 int expected
[] = {0, bp
};
1287 printStringBreaks(t
->dataToBreak
, expected
, 2);
1288 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1289 bp
, t
->srcLine
->elementAti(bp
), t
->srcCol
->elementAti(bp
));
1291 // The break was expected.
1292 // Check that the {nnn} tag value is correct.
1293 int32_t expectedTagVal
= t
->expectedBreaks
->elementAti(bp
);
1294 if (expectedTagVal
== -1) {
1297 int32_t line
= t
->srcLine
->elementAti(bp
);
1298 int32_t rs
= ((RuleBasedBreakIterator
*)t
->bi
)->getRuleStatus();
1299 if (rs
!= expectedTagVal
) {
1300 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
1301 " Actual, Expected status = %4d, %4d",
1302 bp
, line
, t
->srcCol
->elementAti(bp
), rs
, expectedTagVal
);
1310 // Verify that there were no missed expected breaks after the last one found
1311 for (i
=prevBP
+1; i
<t
->expectedBreaks
->size(); i
++) {
1312 if (t
->expectedBreaks
->elementAti(i
) != 0) {
1313 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1314 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
));
1319 // Run the iterator backwards, verify that the same breaks are found.
1321 prevBP
= t
->dataToBreak
.length()+2; // start with a phony value for the last break pos seen.
1322 for (bp
= t
->bi
->last(); bp
!= BreakIterator::DONE
; bp
= t
->bi
->previous()) {
1324 // Fail for lack of progress.
1325 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
1326 bp
, t
->srcLine
->elementAti(bp
), t
->srcCol
->elementAti(bp
));
1330 // Check that we didn't miss an expected break between the last one
1331 // and this one. (UVector returns zeros for index out of bounds.)
1332 for (i
=prevBP
-1; i
>bp
; i
--) {
1333 if (t
->expectedBreaks
->elementAti(i
) != 0) {
1334 errln("Reverse Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1335 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
));
1339 // Check that the break we did find was expected
1340 if (t
->expectedBreaks
->elementAti(bp
) == 0) {
1341 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1342 bp
, t
->srcLine
->elementAti(bp
), t
->srcCol
->elementAti(bp
));
1344 // The break was expected.
1345 // Check that the {nnn} tag value is correct.
1346 int32_t expectedTagVal
= t
->expectedBreaks
->elementAti(bp
);
1347 if (expectedTagVal
== -1) {
1350 int line
= t
->srcLine
->elementAti(bp
);
1351 int32_t rs
= ((RuleBasedBreakIterator
*)t
->bi
)->getRuleStatus();
1352 if (rs
!= expectedTagVal
) {
1353 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
1354 " Actual, Expected status = %4d, %4d",
1355 bp
, line
, t
->srcCol
->elementAti(bp
), rs
, expectedTagVal
);
1362 // Verify that there were no missed breaks prior to the last one found
1363 for (i
=prevBP
-1; i
>=0; i
--) {
1364 if (t
->expectedBreaks
->elementAti(i
) != 0) {
// NOTE(review): this check follows the reverse pass, yet the message below
// says "Forward" (and misspells "Iteration") - confirm the intended wording.
1365 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1366 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
));
// Data-driven break iterator test.
// Reads "rbbitst.txt" from the source test-data directory, converts it to
// UTF-16 and scans it character by character with a small state machine
// (parseState, with savedState remembering where to resume after a comment).
// Constructs recognized by the visible code:
//   <word> <char> <line> <sent> <title>  - create the corresponding iterator
//                                           for the current locale
//   <locale name>                         - change the current locale
//   <data> ... </data>                    - one chunk of text to break
// Inside a data chunk:
//   a bullet (U+2022) or "<>"             - expected break, stored as -1
//   <nnn>                                 - expected break with tag value nnn
//   \N{CHARACTER NAME}                    - character given by Unicode name
//   other backslash sequences             - handed to UnicodeString::unescapeAt
// While data is accumulated, tp.srcLine / tp.srcCol record the file line and
// column of each position so failures can be reported against the test file.
1372 void RBBITest::TestExtended() {
1373 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1374 UErrorCode status
= U_ZERO_ERROR
;
1377 UnicodeString rules
;
1380 tp
.expectedBreaks
= new UVector32(status
);
1381 tp
.srcLine
= new UVector32(status
);
1382 tp
.srcCol
= new UVector32(status
);
// Matches "<locale NAME>"; capture group 1 is the locale name.
1384 RegexMatcher
localeMatcher("<locale *([\\p{L}\\p{Nd}_]*) *>", 0, status
);
1385 TEST_ASSERT_SUCCESS(status
);
1389 // Open and read the test data file.
1391 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
1392 char testFileName
[1000];
// NOTE(review): only the directory length is compared with the buffer size;
// the length of the "rbbitst.txt" suffix appended below is not included in
// this check - confirm the buffer cannot overflow for long directory paths.
1393 if (testDataDirectory
== NULL
|| strlen(testDataDirectory
) >= sizeof(testFileName
)) {
1394 errln("Can't open test data. Path too long.");
1397 strcpy(testFileName
, testDataDirectory
);
1398 strcat(testFileName
, "rbbitst.txt");
1401 UChar
*testFile
= ReadAndConvertFile(testFileName
, len
, status
);
1402 if (U_FAILURE(status
)) {
1403 return; /* something went wrong, error already output */
1409 // Put the test data into a UnicodeString
1411 UnicodeString
testString(FALSE
, testFile
, len
);
1419 parseState
= PARSE_TAG
;
1421 EParseState savedState
= PARSE_TAG
;
// Characters of interest, spelled as code point values.
1423 static const UChar CH_LF
= 0x0a;
1424 static const UChar CH_CR
= 0x0d;
1425 static const UChar CH_HASH
= 0x23;
1426 /*static const UChar CH_PERIOD = 0x2e;*/
1427 static const UChar CH_LT
= 0x3c;
1428 static const UChar CH_GT
= 0x3e;
1429 static const UChar CH_BACKSLASH
= 0x5c;
1430 static const UChar CH_BULLET
= 0x2022;
1432 int32_t lineNum
= 1;
1433 int32_t colStart
= 0;
1435 int32_t charIdx
= 0;
1437 int32_t tagValue
= 0; // The numeric value of a <nnn> tag.
// Main scan loop; the loop header has no increment expression, so charIdx is
// advanced inside the body.
1439 for (charIdx
= 0; charIdx
< len
; ) {
1440 status
= U_ZERO_ERROR
;
1441 UChar c
= testString
.charAt(charIdx
);
1443 if (c
== CH_CR
&& charIdx
<len
&& testString
.charAt(charIdx
) == CH_LF
) {
1444 // treat CRLF as a unit
1448 if (c
== CH_LF
|| c
== CH_CR
) {
1452 column
= charIdx
- colStart
+ 1;
1454 switch (parseState
) {
// A newline ends a comment; resume in the state saved when the comment began.
1456 if (c
== 0x0a || c
== 0x0d) {
1457 parseState
= savedState
;
1464 parseState
= PARSE_COMMENT
;
1465 savedState
= PARSE_TAG
;
1468 if (u_isUWhiteSpace(c
)) {
// The comparisons below start at charIdx-1, i.e. at the character held in c.
1471 if (testString
.compare(charIdx
-1, 6, "<word>") == 0) {
1473 tp
.bi
= BreakIterator::createWordInstance(locale
, status
);
1477 if (testString
.compare(charIdx
-1, 6, "<char>") == 0) {
1479 tp
.bi
= BreakIterator::createCharacterInstance(locale
, status
);
1483 if (testString
.compare(charIdx
-1, 6, "<line>") == 0) {
1485 tp
.bi
= BreakIterator::createLineInstance(locale
, status
);
1489 if (testString
.compare(charIdx
-1, 6, "<sent>") == 0) {
1492 tp
.bi
= BreakIterator::createSentenceInstance(locale
, status
);
1496 if (testString
.compare(charIdx
-1, 7, "<title>") == 0) {
1498 tp
.bi
= BreakIterator::createTitleInstance(locale
, status
);
1502 // <locale loc_name>
1503 localeMatcher
.reset(testString
);
1504 if (localeMatcher
.lookingAt(charIdx
-1, status
)) {
1505 UnicodeString localeName
= localeMatcher
.group(1, status
);
1506 char localeName8
[100];
1507 localeName
.extract(0, localeName
.length(), localeName8
, sizeof(localeName8
), 0);
1508 locale
= Locale::createFromName(localeName8
);
// Skip over the whole matched tag.
1509 charIdx
+= localeMatcher
.group(0, status
).length();
1510 TEST_ASSERT_SUCCESS(status
);
1513 if (testString
.compare(charIdx
-1, 6, "<data>") == 0) {
1514 parseState
= PARSE_DATA
;
// Start a fresh test case: clear the text and all per-position vectors.
1516 tp
.dataToBreak
= "";
1517 tp
.expectedBreaks
->removeAllElements();
1518 tp
.srcCol
->removeAllElements();
1519 tp
.srcLine
->removeAllElements();
1523 errln("line %d: Tag expected in test file.", lineNum
);
1525 parseState
= PARSE_COMMENT
;
1526 savedState
= PARSE_DATA
;
// Bullet: an expected break at the current end of the data, recorded as -1.
1531 if (c
== CH_BULLET
) {
1532 int32_t breakIdx
= tp
.dataToBreak
.length();
1533 tp
.expectedBreaks
->setSize(breakIdx
+1);
1534 tp
.expectedBreaks
->setElementAt(-1, breakIdx
);
1535 tp
.srcLine
->setSize(breakIdx
+1);
1536 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1537 tp
.srcCol
->setSize(breakIdx
+1);
1538 tp
.srcCol
->setElementAt(column
, breakIdx
);
1542 if (testString
.compare(charIdx
-1, 7, "</data>") == 0) {
1543 // Add final entry to mappings from break location to source file position.
1544 // Need one extra because last break position returned is after the
1545 // last char in the data, not at the last char.
1546 tp
.srcLine
->addElement(lineNum
, status
);
1547 tp
.srcCol
->addElement(column
, status
);
1549 parseState
= PARSE_TAG
;
1557 if (testString
.compare(charIdx
-1, 3, "\\N{") == 0) {
1558 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1559 // Get the code point from the name and insert it into the test data.
1560 // (Damn, no API takes names in Unicode !!!
1561 // we've got to take it back to char *)
1562 int32_t nameEndIdx
= testString
.indexOf((UChar
)0x7d/*'}'*/, charIdx
);
1563 int32_t nameLength
= nameEndIdx
- (charIdx
+2);
1564 char charNameBuf
[200];
1565 UChar32 theChar
= -1;
1566 if (nameEndIdx
!= -1) {
// NOTE(review): this declaration shadows the function-level `status`, so a
// failure from u_charFromName is only visible inside this scope.
1567 UErrorCode status
= U_ZERO_ERROR
;
1568 testString
.extract(charIdx
+2, nameLength
, charNameBuf
, sizeof(charNameBuf
));
// Force termination in case the name filled the whole buffer.
1569 charNameBuf
[sizeof(charNameBuf
)-1] = 0;
1570 theChar
= u_charFromName(U_UNICODE_CHAR_NAME
, charNameBuf
, &status
);
1571 if (U_FAILURE(status
)) {
1575 if (theChar
== -1) {
1576 errln("Error in named character in test file at line %d, col %d",
1579 // Named code point was recognized. Insert it
1580 // into the test data.
1581 tp
.dataToBreak
.append(theChar
);
// A while loop is used because the appended code point may add more than one
// code unit to dataToBreak; every new position gets a line/column entry.
1582 while (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1583 tp
.srcLine
->addElement(lineNum
, status
);
1584 tp
.srcCol
->addElement(column
, status
);
1587 if (nameEndIdx
> charIdx
) {
1588 charIdx
= nameEndIdx
+1;
// "<>": an expected break with no explicit tag value, recorded as -1.
1597 if (testString
.compare(charIdx
-1, 2, "<>") == 0) {
1599 int32_t breakIdx
= tp
.dataToBreak
.length();
1600 tp
.expectedBreaks
->setSize(breakIdx
+1);
1601 tp
.expectedBreaks
->setElementAt(-1, breakIdx
);
1602 tp
.srcLine
->setSize(breakIdx
+1);
1603 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1604 tp
.srcCol
->setSize(breakIdx
+1);
1605 tp
.srcCol
->setElementAt(column
, breakIdx
);
1611 parseState
= PARSE_NUM
;
1615 if (c
== CH_HASH
&& column
==3) { // TODO: why is column off so far?
1616 parseState
= PARSE_COMMENT
;
1617 savedState
= PARSE_DATA
;
1621 if (c
== CH_BACKSLASH
) {
1622 // Check for \ at end of line, a line continuation.
1623 // Advance over (discard) the newline
1624 UChar32 cp
= testString
.char32At(charIdx
);
1625 if (cp
== CH_CR
&& charIdx
<len
&& testString
.charAt(charIdx
+1) == CH_LF
) {
1627 // Need an extra increment of the input ptr to move over both of them
1630 if (cp
== CH_LF
|| cp
== CH_CR
) {
1637 // Let unescape handle the back slash.
1638 cp
= testString
.unescapeAt(charIdx
);
1640 // Escape sequence was recognized. Insert the char
1641 // into the test data.
1642 tp
.dataToBreak
.append(cp
);
1643 while (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1644 tp
.srcLine
->addElement(lineNum
, status
);
1645 tp
.srcCol
->addElement(column
, status
);
1651 // Not a recognized backslash escape sequence.
1652 // Take the next char as a literal.
1653 // TODO: Should this be an error?
1654 c
= testString
.charAt(charIdx
);
1655 charIdx
= testString
.moveIndex32(charIdx
, 1);
1658 // Normal, non-escaped data char.
1659 tp
.dataToBreak
.append(c
);
1661 // Save the mapping from offset in the data to line/column numbers in
1662 // the original input file. Will be used for better error messages only.
1663 // If there's an expected break before this char, the slot in the mapping
1664 // vector will already be set for this char; don't overwrite it.
1665 if (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1666 tp
.srcLine
->addElement(lineNum
, status
);
1667 tp
.srcCol
->addElement(column
, status
);
1673 // We are parsing an expected numeric tag value, like <1234>,
1674 // within a chunk of data.
1675 if (u_isUWhiteSpace(c
)) {
1680 // Finished the number. Add the info to the expected break data,
1681 // and switch parse state back to doing plain data.
1682 parseState
= PARSE_DATA
;
1683 if (tagValue
== 0) {
1686 int32_t breakIdx
= tp
.dataToBreak
.length();
1687 tp
.expectedBreaks
->setSize(breakIdx
+1);
1688 tp
.expectedBreaks
->setElementAt(tagValue
, breakIdx
);
1689 tp
.srcLine
->setSize(breakIdx
+1);
1690 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1691 tp
.srcCol
->setSize(breakIdx
+1);
1692 tp
.srcCol
->setElementAt(column
, breakIdx
);
// Accumulate the next decimal digit of the tag value.
1697 tagValue
= tagValue
*10 + u_charDigitValue(c
);
1701 errln("Syntax Error in test file at line %d, col %d",
1704 parseState
= PARSE_COMMENT
;
// Report any ICU error raised while handling this character, then clear it.
1709 if (U_FAILURE(status
)) {
1710 errln("ICU Error %s while parsing test file at line %d.",
1711 u_errorName(status
), lineNum
);
1713 status
= U_ZERO_ERROR
;
1720 delete tp
.expectedBreaks
;
1728 //-------------------------------------------------------------------------------
1730 // ReadAndConvertFile Read a text data file, convert it to UChars, and
1731 // return the data in one big UChar * buffer, which the caller must delete.
1733 // TODO: This is a clone of RegexTest::ReadAndConvertFile.
1734 // Move this function to some common place.
1736 //--------------------------------------------------------------------------------
// Reads the file `fileName` in binary mode and converts its contents to UTF-16.
//   fileName - path of the file to read.
//   ulen     - output: set from the ucnv_toUChars() preflight call.
//   status   - ICU error code (in/out); set to U_FILE_ACCESS_ERROR when the
//              file cannot be opened.
// Steps visible here: measure the file with fseek/ftell, read it into a char
// buffer, detect a Unicode signature (BOM) and step past it, open a converter
// for the detected encoding, preflight the conversion to learn the required
// length, then allocate ulen+1 UChars for the result.
1737 UChar
*RBBITest::ReadAndConvertFile(const char *fileName
, int &ulen
, UErrorCode
&status
) {
1738 UChar
*retPtr
= NULL
;
1739 char *fileBuf
= NULL
;
1740 UConverter
* conv
= NULL
;
1744 if (U_FAILURE(status
)) {
1751 f
= fopen(fileName
, "rb");
1753 errln("Error opening test data file %s\n", fileName
);
1754 status
= U_FILE_ACCESS_ERROR
;
// Determine the file size by seeking to the end.
1763 fseek( f
, 0, SEEK_END
);
1764 fileSize
= ftell(f
);
// NOTE(review): the buffer is allocated before fileSize is validated; the
// "fileSize <= 0" test only happens after the read below - confirm that a
// failing ftell() cannot reach this allocation with a negative size.
1765 fileBuf
= new char[fileSize
];
1766 fseek(f
, 0, SEEK_SET
);
1767 amt_read
= fread(fileBuf
, 1, fileSize
, f
);
1768 if (amt_read
!= fileSize
|| fileSize
<= 0) {
1769 errln("Error reading test data file.");
1770 goto cleanUpAndReturn
;
1774 // Look for a Unicode Signature (BOM) on the data just read
1776 int32_t signatureLength
;
1777 const char * fileBufC
;
1778 const char* encoding
;
1781 encoding
= ucnv_detectUnicodeSignature(
1782 fileBuf
, fileSize
, &signatureLength
, &status
);
// A signature was found: skip it and shrink the remaining byte count.
1783 if(encoding
!=NULL
){
1784 fileBufC
+= signatureLength
;
1785 fileSize
-= signatureLength
;
1789 // Open a converter to take the rule file to UTF-16
1791 conv
= ucnv_open(encoding
, &status
);
1792 if (U_FAILURE(status
)) {
1793 goto cleanUpAndReturn
;
1797 // Convert the rules to UChar.
1798 // Preflight first to determine required buffer size.
1800 ulen
= ucnv_toUChars(conv
,
1806 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
1807 // Buffer Overflow is expected from the preflight operation.
1808 status
= U_ZERO_ERROR
;
// One extra UChar beyond the preflighted length.
1810 retPtr
= new UChar
[ulen
+1];
1823 if (U_FAILURE(status
)) {
1824 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
1833 //--------------------------------------------------------------------------------------------
1835 // Exhaustive Tests, using Unicode Data Files.
1837 //--------------------------------------------------------------------------------------------
1840 // Token level scanner for the Unicode Line Break Test Data file.
1841 // Return the next token, as follows:
1842 // >= 0: a UChar32 character, scanned from hex in the file.
1843 // -1: a break position, a division sign in the file.
1844 // -2: end of rule. A new line in the file.
1845 // -3: end of file. No more rules.
1849 // strips comments, ('#' to end of line)
1850 // Recognizes CR, CR/LF and LF as new lines.
1851 // Skips over spaces and Xs (don't break here) in the data.
// Initial scanner state: no peeked character pending, line number 0, and no
// input file attached yet.
1858 ScanState() :fPeeked(FALSE
), fLineNum(0), fFile(NULL
) {};
1861 // Literal characters that are of interest. In hex to keep EBCDIC based machines happy.
1862 // The data itself is latin-1 on all platforms.
1863 static const int32_t chSpace
= 0x20;
1864 static const int32_t chTab
= 0x09;
1865 static const int32_t chCR
= 0x0D;
1866 static const int32_t chLF
= 0x0A;
1867 static const int32_t chHash
= 0x23;  // '#', starts a comment in the data file
1868 static const int32_t chMult
= 0xD7;  // multiplication sign: a no-break position in the data
1869 static const int32_t chDivide
= 0xF7;  // division sign: an expected break position in the data
// Returns the next token from the line-break test data file attached to *s.
// Characters are classified in the order shown below: end of file, skippable
// characters (space, tab, multiplication sign), the division sign marking an
// expected break, new lines, '#' comments, and finally hexadecimal code point
// values. Anything else is treated as an error.
1871 static int32_t nextLBDToken(ScanState
*s
) {
1874 // Read characters from the input file until we get something interesting
1875 // to return. The file is in latin-1 encoding.
1877 // Get the next character to look at,
1885 // EOF. Return immediately.
1890 // Spaces. Treat the multiply sign as a space - it indicates a no-break position
1891 // in the data, and the test program doesn't want to see them.
1892 // Continue the next char loop, looking for something significant.
1893 if (c
== chSpace
|| c
== chTab
|| c
== chMult
) {
1897 // Divide sign. Indicates an expected break position.
1898 if (c
== chDivide
) {
1902 // New Line Handling. Keep track of line number in the file, which in turn
1903 // requires keeping track of CR/LF as a single new line.
// Read one character ahead; if it is not an LF, remember it as peeked so it
// is not lost.
1906 s
->fPeekChar
= getc(s
->fFile
);
1907 if (s
->fPeekChar
!= chLF
) {s
->fPeeked
= TRUE
;};
1915 // Comments. Consume everything up to the next new line.
1919 } while (!(c
== EOF
|| c
== chCR
|| c
== chLF
));
// After a comment, continue scanning by calling the scanner again.
1922 return nextLBDToken(s
);
1925 // Scan a hex character (UChar32) value.
1926 if (u_digit(c
, 16) >= 0) {
1927 int32_t v
= u_digit(c
, 16);
// Stop at the first character that is not a hex digit.
1930 if (u_digit(c
, 16) < 0) {break;};
1932 v
+= u_digit(c
, 16);
1939 // Error. Character was something unexpected.
// Runs the line-break iterator (default locale) against the test cases in
// "LBTest.txt". The file is tokenized with nextLBDToken(); each line yields a
// test string plus the list of expected break positions, and the iterator's
// breaks are compared with that list position by position.
// If the data file cannot be opened the test is skipped with a log message
// rather than reported as a failure.
1946 void RBBITest::TestLineBreakData() {
1948 UErrorCode status
= U_ZERO_ERROR
;
1949 UnicodeString testString
;
1950 UVector
expectedBreaks(status
);
1954 BreakIterator
*bi
= BreakIterator::createLineInstance(Locale::getDefault(), status
);
1955 if (U_FAILURE(status
)) {
1956 errln("Failure creating break iterator");
// Relative file name: resolved against the current working directory.
1960 const char * lbdfName
= "LBTest.txt";
1962 // Open the test data file.
1963 // TODO: a proper way to handle this data.
1964 ss
.fFile
= fopen(lbdfName
, "rb");
1965 if (ss
.fFile
== NULL
) {
1966 logln("Unable to open Line Break Test Data file. Skipping test.");
1971 // Loop once per line from the test data file.
1973 // Zero out test data from previous line.
1974 testString
.truncate(0);
1975 expectedBreaks
.removeAllElements();
1977 // Read one test's (line's) worth of data from the file.
1978 // Loop once per token on the input file line.
1980 tok
= nextLBDToken(&ss
);
1982 // If we scanned a character number in the file.
1983 // save it in the test data array.
1985 testString
.append((UChar32
)tok
);
1989 // If we scanned a break position in the data, record it.
// The break position is the current length of the accumulated test string.
1991 expectedBreaks
.addElement(testString
.length(), status
);
1995 // If we scanned a new line, or EOF
1996 // drop out of scan loop and run the test case.
1997 if (tok
== -2 || tok
== -3) {break;};
1999 // None of above. Error.
2000 errln("Failure: Unrecognized data format, test file line %d", ss
.fLineNum
);
2004 // If this line from the test data file actually contained test data,
2006 if (testString
.length() > 0) {
2007 int32_t pos
; // Break Position in the test string
2008 int32_t expectedI
= 0; // Index of expected break position in vector of same.
2009 int32_t expectedPos
; // Expected break position (index into test string)
2011 bi
->setText(testString
);
2015 for (; pos
!= BreakIterator::DONE
; ) {
2016 expectedPos
= expectedBreaks
.elementAti(expectedI
);
// Iterator break lies before the next expected one: an unexpected break.
2017 if (pos
< expectedPos
) {
2018 errln("Failure: Test file line %d, unexpected break found at position %d",
// Iterator break lies beyond the next expected one: an expected break was missed.
2022 if (pos
> expectedPos
) {
2023 errln("Failure: Test file line %d, failed to find break at position %d",
2024 ss
.fLineNum
, expectedPos
);
2032 // If we've hit EOF on the input file, we're done.
2044 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
2046 //---------------------------------------------------------------------------------------
2048 // class RBBIMonkeyKind
2050 // Monkey Test for Break Iteration
2051 // Abstract interface class. Concrete derived classes independently
2052 // implement the break rules for different iterator types.
2054 // The Monkey Test itself doesn't know which type of break iterator it is
2055 // testing, but works purely in terms of the interface defined here.
2057 //---------------------------------------------------------------------------------------
// Abstract interface through which the monkey test talks to a reference
// implementation of one kind of break iteration.
2058 class RBBIMonkeyKind
{
2060 // Return a UVector of UnicodeSets, representing the character classes used
2061 // for this type of iterator.
2062 virtual UVector
*charClasses() = 0;
2064 // Set the test text on which subsequent calls to next() will operate
2065 virtual void setText(const UnicodeString
&s
) = 0;
2067 // Find the next break position, starting from the prev break position, or from zero.
2068 // Return -1 after reaching end of string.
2069 virtual int32_t next(int32_t i
) = 0;
2071 virtual ~RBBIMonkeyKind();
// Error status recorded by the constructors of the concrete subclasses.
2072 UErrorCode deferredStatus
;
// Base constructor: starts with no deferred error.
2081 RBBIMonkeyKind::RBBIMonkeyKind() {
2082 deferredStatus
= U_ZERO_ERROR
;
// Out-of-line definition of the destructor declared virtual in the class.
2085 RBBIMonkeyKind::~RBBIMonkeyKind() {
2089 //----------------------------------------------------------------------------------------
2091 // Random Numbers. Similar to standard lib rand() and srand()
2092 // Not using library to
2093 // 1. Get same results on all platforms.
2094 // 2. Get access to current seed, to more easily reproduce failures.
2096 //---------------------------------------------------------------------------------------
// Current state of the generator; file-level so the test can read or set it.
2097 static uint32_t m_seed
= 1;
// Advances the generator one step and returns a value in the range 0..32767.
2099 static uint32_t m_rand()
// Linear congruential update; uint32_t arithmetic wraps modulo 2^32.
2101 m_seed
= m_seed
* 1103515245 + 12345;
// Drop the low 16 bits of the state and keep the next 15.
2102 return (uint32_t)(m_seed
/65536) % 32768;
2106 //------------------------------------------------------------------------------------------
2108 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
2109 // of RBBIMonkeyKind.
2111 //------------------------------------------------------------------------------------------
// Reference implementation of character (grapheme cluster) breaking used by
// the monkey test. Break positions are found with a regular expression
// matcher (fMatcher) rather than with the break iterator under test.
2112 class RBBICharMonkey
: public RBBIMonkeyKind
{
2115 virtual ~RBBICharMonkey();
2116 virtual UVector
*charClasses();
2117 virtual void setText(const UnicodeString
&s
);
2118 virtual int32_t next(int32_t i
);
// Character classes; each is created in the constructor and added to fSets.
2122 UnicodeSet
*fCRLFSet
;
2123 UnicodeSet
*fControlSet
;
2124 UnicodeSet
*fExtendSet
;
2125 UnicodeSet
*fHangulSet
;
2126 UnicodeSet
*fAnySet
;
// Matcher for the "\X" pattern, created in the constructor.
2128 RegexMatcher
*fMatcher
;
2129 const UnicodeString
*fText
;
// Builds the grapheme-cluster matcher and the character-class sets, and
// collects the sets in fSets. Any failure is recorded in deferredStatus
// instead of being reported here.
2133 RBBICharMonkey::RBBICharMonkey() {
2134 UErrorCode status
= U_ZERO_ERROR
;
2137 fMatcher
= new RegexMatcher("\\X", 0, status
); // Pattern to match a grapheme cluster
2139 fCRLFSet
= new UnicodeSet("[\\r\\n]", status
);
// Line/paragraph separators, controls and format characters, minus CR, LF
// and anything with the Grapheme_Extend property.
2140 fControlSet
= new UnicodeSet("[[\\p{Zl}\\p{Zp}\\p{Cc}\\p{Cf}]-[\\n]-[\\r]-\\p{Grapheme_Extend}]", status
);
2141 fExtendSet
= new UnicodeSet("[\\p{Grapheme_Extend}]", status
);
// NOTE(review): Hangul_Syllable_Type=L appears twice in the pattern below and
// =V does not appear at all; the second L looks like it was meant to be V -
// confirm before changing, since this alters the generated test data.
2142 fHangulSet
= new UnicodeSet(
2143 "[\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=T}"
2144 "\\p{Hangul_Syllable_Type=LV}\\p{Hangul_Syllable_Type=LVT}]", status
);
// Every code point, U+0000 through U+10FFFF.
2145 fAnySet
= new UnicodeSet("[\\u0000-\\U0010ffff]", status
);
2147 fSets
= new UVector(status
);
2148 fSets
->addElement(fCRLFSet
, status
);
2149 fSets
->addElement(fControlSet
, status
);
2150 fSets
->addElement(fExtendSet
, status
);
2151 fSets
->addElement(fHangulSet
, status
);
2152 fSets
->addElement(fAnySet
, status
);
2153 if (U_FAILURE(status
)) {
2154 deferredStatus
= status
;
// Implements RBBIMonkeyKind::setText(): sets the text on which subsequent
// calls to next() operate.
2159 void RBBICharMonkey::setText(const UnicodeString
&s
) {
// Finds the break following position i by searching for a match of the
// grapheme-cluster pattern starting at i; the end of that match is the break.
// retVal starts at -1 and is only overwritten when a match is found.
2165 int32_t RBBICharMonkey::next(int32_t i
) {
2166 UErrorCode status
= U_ZERO_ERROR
;
2167 int32_t retVal
= -1;
2169 if (fMatcher
->find(i
, status
)) {
2170 retVal
= fMatcher
->end(status
);
2172 if (U_FAILURE(status
)){
// Implements RBBIMonkeyKind::charClasses(): the UVector of UnicodeSets that
// represents the character classes used for this type of iterator.
2179 UVector
*RBBICharMonkey::charClasses() {
// Destructor for RBBICharMonkey (declared virtual in the class).
2184 RBBICharMonkey::~RBBICharMonkey() {
2195 //------------------------------------------------------------------------------------------
2197 // class RBBIWordMonkey Word Break specific implementation
2198 // of RBBIMonkeyKind.
2200 //------------------------------------------------------------------------------------------
// Reference implementation of word breaking used by the monkey test.
2201 class RBBIWordMonkey
: public RBBIMonkeyKind
{
2204 virtual ~RBBIWordMonkey();
2205 virtual UVector
*charClasses();
2206 virtual void setText(const UnicodeString
&s
);
2207 virtual int32_t next(int32_t i
);
// Character classes used by the word-break rules; each is created in the
// constructor and added to fSets.
2211 UnicodeSet
*fKatakanaSet
;
2212 UnicodeSet
*fALetterSet
;
2213 UnicodeSet
*fMidLetterSet
;
2214 UnicodeSet
*fMidNumSet
;
2215 UnicodeSet
*fNumericSet
;
2216 UnicodeSet
*fFormatSet
;
// Everything not covered by any of the other sets (computed in the constructor).
2217 UnicodeSet
*fOtherSet
;
2218 UnicodeSet
*fExtendSet
;
2219 UnicodeSet
*fExtendNumLetSet
;
2221 RegexMatcher
*fMatcher
;
2223 const UnicodeString
*fText
;
// Builds the character-class sets for word breaking from Unicode property
// patterns, derives fOtherSet as "everything not in any other set", and
// collects all sets in fSets. Failures are recorded in deferredStatus.
2227 RBBIWordMonkey::RBBIWordMonkey()
2229 UErrorCode status
= U_ZERO_ERROR
;
2232 fSets
= new UVector(status
);
// ALetter is widened with Line_Break=Complex_Context characters, excluding
// those that are Grapheme_Cluster_Break Extend or Control.
2234 fALetterSet
= new UnicodeSet("[\\p{Word_Break = ALetter}"
2235 "[\\p{Line_Break = Complex_Context}"
2236 "-\\p{Grapheme_Cluster_Break = Extend}"
2237 "-\\p{Grapheme_Cluster_Break = Control}]]", status
);
2238 //fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}]", status);
// U+FF9E and U+FF9F are removed from Katakana here and added to fExtendSet below.
2239 fKatakanaSet
= new UnicodeSet("[\\p{Word_Break = Katakana}-[\\uff9e\\uff9f]]", status
);
2240 fMidLetterSet
= new UnicodeSet("[\\p{Word_Break = MidLetter}]", status
);
2241 fMidNumSet
= new UnicodeSet("[\\p{Word_Break = MidNum}]", status
);
2242 fNumericSet
= new UnicodeSet("[\\p{Word_Break = Numeric}]", status
);
2243 fFormatSet
= new UnicodeSet("[\\p{Word_Break = Format}]", status
);
2244 fExtendNumLetSet
= new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]", status
);
2245 //fExtendSet = new UnicodeSet("[\\p{Word_Break = Extend}]", status);
2246 fExtendSet
= new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}\\uff9e\\uff9f]", status
);
2248 fOtherSet
= new UnicodeSet();
2249 if(U_FAILURE(status
)) {
2250 deferredStatus
= status
;
// Start from the complement of the empty set (i.e. all code points) and
// remove every explicitly classified character.
2254 fOtherSet
->complement();
2255 fOtherSet
->removeAll(*fKatakanaSet
);
2256 fOtherSet
->removeAll(*fALetterSet
);
2257 fOtherSet
->removeAll(*fMidLetterSet
);
2258 fOtherSet
->removeAll(*fMidNumSet
);
2259 fOtherSet
->removeAll(*fNumericSet
);
2260 fOtherSet
->removeAll(*fExtendNumLetSet
);
2261 fOtherSet
->removeAll(*fFormatSet
);
2262 fOtherSet
->removeAll(*fExtendSet
);
2264 fSets
->addElement(fALetterSet
, status
);
2265 fSets
->addElement(fKatakanaSet
, status
);
2266 fSets
->addElement(fMidLetterSet
, status
);
2267 fSets
->addElement(fMidNumSet
, status
);
2268 fSets
->addElement(fNumericSet
, status
);
2269 fSets
->addElement(fFormatSet
, status
);
2270 fSets
->addElement(fExtendSet
, status
);
2271 fSets
->addElement(fOtherSet
, status
);
2272 fSets
->addElement(fExtendNumLetSet
, status
);
2275 if (U_FAILURE(status
)) {
2276 deferredStatus
= status
;
2280 void RBBIWordMonkey::setText(const UnicodeString
&s
) {
2285 int32_t RBBIWordMonkey::next(int32_t prevPos
) {
2286 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2287 // break position being tested. The candidate break
2288 // location is before p2.
2292 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
2294 // Prev break at end of string. return DONE.
2295 if (prevPos
>= fText
->length()) {
2298 p0
= p1
= p2
= p3
= prevPos
;
2299 c3
= fText
->char32At(prevPos
);
2302 // Loop runs once per "significant" character position in the input text.
2304 // Move all of the positions forward in the input string.
2309 // Advance p3 by X(Extend | Format)* Rule 4
2311 p3
= fText
->moveIndex32(p3
, 1);
2312 c3
= fText
->char32At(p3
);
2314 while (fFormatSet
->contains(c3
) || fExtendSet
->contains(c3
));
2318 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2321 if (p2
== fText
->length()) {
2322 // Reached end of string. Always a break position.
2327 // No Extend or Format characters may appear between the CR and LF,
2328 // which requires the additional check for p2 immediately following p1.
2330 if (c1
==0x0D && c2
==0x0A && p1
==(p2
-1)) {
2334 // Rule (5). ALetter x ALetter
2335 if (fALetterSet
->contains(c1
) &&
2336 fALetterSet
->contains(c2
)) {
2340 // Rule (6) ALetter x (MidLetter | MidNumLet) ALetter
2342 // Also incorporates rule 7 by skipping pos ahead to position of the
2343 // terminating ALetter.
2344 if ( fALetterSet
->contains(c1
) &&
2345 fMidLetterSet
->contains(c2
) &&
2346 fALetterSet
->contains(c3
)) {
2351 // Rule (7) ALetter (MidLetter | MidNumLet) x ALetter
2352 if (fALetterSet
->contains(c0
) &&
2353 (fMidLetterSet
->contains(c1
) ) &&
2354 fALetterSet
->contains(c2
)) {
2358 // Rule (8) Numeric x Numeric
2359 if (fNumericSet
->contains(c1
) &&
2360 fNumericSet
->contains(c2
)) {
2364 // Rule (9) ALetter x Numeric
2365 if (fALetterSet
->contains(c1
) &&
2366 fNumericSet
->contains(c2
)) {
2370 // Rule (10) Numeric x ALetter
2371 if (fNumericSet
->contains(c1
) &&
2372 fALetterSet
->contains(c2
)) {
2376 // Rule (11) Numeric (MidNum | MidNumLet) x Numeric
2377 if ( fNumericSet
->contains(c0
) &&
2378 fMidNumSet
->contains(c1
) &&
2379 fNumericSet
->contains(c2
)) {
2383 // Rule (12) Numeric x (MidNum | MidNumLet) Numeric
2384 if (fNumericSet
->contains(c1
) &&
2385 fMidNumSet
->contains(c2
) &&
2386 fNumericSet
->contains(c3
)) {
2390 // Rule (13) Katakana x Katakana
2391 if (fKatakanaSet
->contains(c1
) &&
2392 fKatakanaSet
->contains(c2
)) {
2397 if ((fALetterSet
->contains(c1
) || fNumericSet
->contains(c1
) ||
2398 fKatakanaSet
->contains(c1
) || fExtendNumLetSet
->contains(c1
)) &&
2399 fExtendNumLetSet
->contains(c2
)) {
2404 if (fExtendNumLetSet
->contains(c1
) &&
2405 (fALetterSet
->contains(c2
) || fNumericSet
->contains(c2
) ||
2406 fKatakanaSet
->contains(c2
))) {
2410 // Rule 14. Break found here.
2419 UVector
*RBBIWordMonkey::charClasses() {
2424 RBBIWordMonkey::~RBBIWordMonkey() {
2426 delete fKatakanaSet
;
2428 delete fMidLetterSet
;
2433 delete fExtendNumLetSet
;
2440 //------------------------------------------------------------------------------------------
2442 // class RBBISentMonkey Sentence Break specific implementation
2443 // of RBBIMonkeyKind.
2445 //------------------------------------------------------------------------------------------
2446 class RBBISentMonkey
: public RBBIMonkeyKind
{
2449 virtual ~RBBISentMonkey();
2450 virtual UVector
*charClasses();
2451 virtual void setText(const UnicodeString
&s
);
2452 virtual int32_t next(int32_t i
);
2454 int moveBack(int posFrom
);
2455 int moveForward(int posFrom
);
2456 UChar32
cAt(int pos
);
2460 UnicodeSet
*fSepSet
;
2461 UnicodeSet
*fFormatSet
;
2463 UnicodeSet
*fLowerSet
;
2464 UnicodeSet
*fUpperSet
;
2465 UnicodeSet
*fOLetterSet
;
2466 UnicodeSet
*fNumericSet
;
2467 UnicodeSet
*fATermSet
;
2468 UnicodeSet
*fSTermSet
;
2469 UnicodeSet
*fCloseSet
;
2470 UnicodeSet
*fOtherSet
;
2471 UnicodeSet
*fExtendSet
;
2473 const UnicodeString
*fText
;
2477 RBBISentMonkey::RBBISentMonkey()
2479 UErrorCode status
= U_ZERO_ERROR
;
2481 fSets
= new UVector(status
);
2483 fSepSet
= new UnicodeSet("[\\p{Sentence_Break = Sep}]", status
);
2484 fFormatSet
= new UnicodeSet("[\\p{Sentence_Break = Format}]", status
);
2485 fSpSet
= new UnicodeSet("[\\p{Sentence_Break = Sp}]", status
);
2486 fLowerSet
= new UnicodeSet("[\\p{Sentence_Break = Lower}]", status
);
2487 fUpperSet
= new UnicodeSet("[\\p{Sentence_Break = Upper}]", status
);
2488 fOLetterSet
= new UnicodeSet("[\\p{Sentence_Break = OLetter}-[\\uff9e\\uff9f]]", status
);
2489 fNumericSet
= new UnicodeSet("[\\p{Sentence_Break = Numeric}]", status
);
2490 fATermSet
= new UnicodeSet("[\\p{Sentence_Break = ATerm}]", status
);
2491 fSTermSet
= new UnicodeSet("[\\p{Sentence_Break = STerm}]", status
);
2492 fCloseSet
= new UnicodeSet("[\\p{Sentence_Break = Close}]", status
);
2493 fExtendSet
= new UnicodeSet("[\\p{Grapheme_Extend}\\uff9e\\uff9f]", status
);
2494 fOtherSet
= new UnicodeSet();
2496 if(U_FAILURE(status
)) {
2497 deferredStatus
= status
;
2501 fOtherSet
->complement();
2502 fOtherSet
->removeAll(*fSepSet
);
2503 fOtherSet
->removeAll(*fFormatSet
);
2504 fOtherSet
->removeAll(*fSpSet
);
2505 fOtherSet
->removeAll(*fLowerSet
);
2506 fOtherSet
->removeAll(*fUpperSet
);
2507 fOtherSet
->removeAll(*fOLetterSet
);
2508 fOtherSet
->removeAll(*fNumericSet
);
2509 fOtherSet
->removeAll(*fATermSet
);
2510 fOtherSet
->removeAll(*fSTermSet
);
2511 fOtherSet
->removeAll(*fCloseSet
);
2512 fOtherSet
->removeAll(*fExtendSet
);
2514 fSets
->addElement(fSepSet
, status
);
2515 fSets
->addElement(fFormatSet
, status
);
2517 fSets
->addElement(fSpSet
, status
);
2518 fSets
->addElement(fLowerSet
, status
);
2519 fSets
->addElement(fUpperSet
, status
);
2520 fSets
->addElement(fOLetterSet
, status
);
2521 fSets
->addElement(fNumericSet
, status
);
2522 fSets
->addElement(fATermSet
, status
);
2523 fSets
->addElement(fSTermSet
, status
);
2524 fSets
->addElement(fCloseSet
, status
);
2525 fSets
->addElement(fOtherSet
, status
);
2526 fSets
->addElement(fExtendSet
, status
);
2528 if (U_FAILURE(status
)) {
2529 deferredStatus
= status
;
2535 void RBBISentMonkey::setText(const UnicodeString
&s
) {
2539 UVector
*RBBISentMonkey::charClasses() {
2544 // moveBack() Find the "significant" code point preceding the index i.
2545 // Skips over ($Extend | $Format)* .
2547 int RBBISentMonkey::moveBack(int i
) {
2554 j
= fText
->moveIndex32(j
, -1);
2555 c
= fText
->char32At(j
);
2557 while (j
>0 &&(fFormatSet
->contains(c
) || fExtendSet
->contains(c
)));
2563 int RBBISentMonkey::moveForward(int i
) {
2564 if (i
>=fText
->length()) {
2565 return fText
->length();
2570 j
= fText
->moveIndex32(j
, 1);
2573 while (fFormatSet
->contains(c
) || fExtendSet
->contains(c
));
2577 UChar32
RBBISentMonkey::cAt(int pos
) {
2578 if (pos
<0 || pos
>=fText
->length()) {
2581 return fText
->char32At(pos
);
2585 int32_t RBBISentMonkey::next(int32_t prevPos
) {
2586 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2587 // break position being tested. The candidate break
2588 // location is before p2.
2592 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
2595 // Prev break at end of string. return DONE.
2596 if (prevPos
>= fText
->length()) {
2599 p0
= p1
= p2
= p3
= prevPos
;
2600 c3
= fText
->char32At(prevPos
);
2603 // Loop runs once per "significant" character position in the input text.
2605 // Move all of the positions forward in the input string.
2610 // Advance p3 by X(Extend | Format)* Rule 4
2611 p3
= moveForward(p3
);
2615 if (c1
==0x0d && c2
==0x0a && p2
==(p1
+1)) {
2619 // Rule (4). Sep <break>
2620 if (fSepSet
->contains(c1
)) {
2621 p2
= p1
+1; // Separators don't combine with Extend or Format.
2625 if (p2
>= fText
->length()) {
2626 // Reached end of string. Always a break position.
2630 if (p2
== prevPos
) {
2631 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2635 // Rule (6). ATerm x Numeric
2636 if (fATermSet
->contains(c1
) && fNumericSet
->contains(c2
)) {
2640 // Rule (7). Upper ATerm x Upper
2641 if (fUpperSet
->contains(c0
) && fATermSet
->contains(c1
) && fUpperSet
->contains(c2
)) {
2645 // Rule (8) ATerm Close* Sp* x (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
2646 // Note: STerm | ATerm are added to the negated part of the expression by a
2647 // note to the Unicode 5.0 documents.
2649 while (fSpSet
->contains(cAt(p8
))) {
2652 while (fCloseSet
->contains(cAt(p8
))) {
2655 if (fATermSet
->contains(cAt(p8
))) {
2659 if (c
==-1 || fOLetterSet
->contains(c
) || fUpperSet
->contains(c
) ||
2660 fLowerSet
->contains(c
) || fSepSet
->contains(c
) ||
2661 fATermSet
->contains(c
) || fSTermSet
->contains(c
)) {
2664 p8
= moveForward(p8
);
2666 if (fLowerSet
->contains(cAt(p8
))) {
2671 // Rule 8a (STerm | ATerm) Close* Sp* x (STerm | ATerm);
2672 if (fSTermSet
->contains(c2
) || fATermSet
->contains(c2
)) {
2674 while (fSpSet
->contains(cAt(p8
))) {
2677 while (fCloseSet
->contains(cAt(p8
))) {
2681 if (fSTermSet
->contains(c
) || fATermSet
->contains(c
)) {
2686 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep)
2688 while (fCloseSet
->contains(cAt(p9
))) {
2692 if ((fSTermSet
->contains(c
) || fATermSet
->contains(c
))) {
2693 if (fCloseSet
->contains(c2
) || fSpSet
->contains(c2
) || fSepSet
->contains(c2
)) {
2698 // Rule (10) (STerm | ATerm) Close* Sp* x (Sp | Sep)
2700 while (fSpSet
->contains(cAt(p10
))) {
2701 p10
= moveBack(p10
);
2703 while (fCloseSet
->contains(cAt(p10
))) {
2704 p10
= moveBack(p10
);
2706 if (fSTermSet
->contains(cAt(p10
)) || fATermSet
->contains(cAt(p10
))) {
2707 if (fSpSet
->contains(c2
) || fSepSet
->contains(c2
)) {
2712 // Rule (11) (STerm | ATerm) Close* Sp* <break>
2714 while (fSpSet
->contains(cAt(p11
))) {
2715 p11
= moveBack(p11
);
2717 while (fCloseSet
->contains(cAt(p11
))) {
2718 p11
= moveBack(p11
);
2720 if (fSTermSet
->contains(cAt(p11
)) || fATermSet
->contains(cAt(p11
))) {
2724 // Rule (12) Any x Any
2731 RBBISentMonkey::~RBBISentMonkey() {
2749 //-------------------------------------------------------------------------------------------
2753 //-------------------------------------------------------------------------------------------
2755 class RBBILineMonkey
: public RBBIMonkeyKind
{
2758 virtual ~RBBILineMonkey();
2759 virtual UVector
*charClasses();
2760 virtual void setText(const UnicodeString
&s
);
2761 virtual int32_t next(int32_t i
);
2762 virtual void rule9Adjust(int32_t pos
, UChar32
*posChar
, int32_t *nextPos
, UChar32
*nextChar
);
2803 BreakIterator
*fCharBI
;
2805 const UnicodeString
*fText
;
2806 int32_t *fOrigPositions
;
2808 RegexMatcher
*fNumberMatcher
;
2809 RegexMatcher
*fLB11Matcher
;
2813 RBBILineMonkey::RBBILineMonkey()
2815 UErrorCode status
= U_ZERO_ERROR
;
2817 fSets
= new UVector(status
);
2819 fBK
= new UnicodeSet("[\\p{Line_Break=BK}]", status
);
2820 fCR
= new UnicodeSet("[\\p{Line_break=CR}]", status
);
2821 fLF
= new UnicodeSet("[\\p{Line_break=LF}]", status
);
2822 fCM
= new UnicodeSet("[\\p{Line_break=CM}]", status
);
2823 fNL
= new UnicodeSet("[\\p{Line_break=NL}]", status
);
2824 fWJ
= new UnicodeSet("[\\p{Line_break=WJ}]", status
);
2825 fZW
= new UnicodeSet("[\\p{Line_break=ZW}]", status
);
2826 fGL
= new UnicodeSet("[\\p{Line_break=GL}]", status
);
2827 fCB
= new UnicodeSet("[\\p{Line_break=CB}]", status
);
2828 fSP
= new UnicodeSet("[\\p{Line_break=SP}]", status
);
2829 fB2
= new UnicodeSet("[\\p{Line_break=B2}]", status
);
2830 fBA
= new UnicodeSet("[\\p{Line_break=BA}]", status
);
2831 fBB
= new UnicodeSet("[\\p{Line_break=BB}]", status
);
2832 fHY
= new UnicodeSet("[\\p{Line_break=HY}]", status
);
2833 fH2
= new UnicodeSet("[\\p{Line_break=H2}]", status
);
2834 fH3
= new UnicodeSet("[\\p{Line_break=H3}]", status
);
2835 fCL
= new UnicodeSet("[\\p{Line_break=CL}]", status
);
2836 fEX
= new UnicodeSet("[\\p{Line_break=EX}]", status
);
2837 fIN
= new UnicodeSet("[\\p{Line_break=IN}]", status
);
2838 fJL
= new UnicodeSet("[\\p{Line_break=JL}]", status
);
2839 fJV
= new UnicodeSet("[\\p{Line_break=JV}]", status
);
2840 fJT
= new UnicodeSet("[\\p{Line_break=JT}]", status
);
2841 fNS
= new UnicodeSet("[\\p{Line_break=NS}]", status
);
2842 fOP
= new UnicodeSet("[\\p{Line_break=OP}]", status
);
2843 fQU
= new UnicodeSet("[\\p{Line_break=QU}]", status
);
2844 fIS
= new UnicodeSet("[\\p{Line_break=IS}]", status
);
2845 fNU
= new UnicodeSet("[\\p{Line_break=NU}]", status
);
2846 fPO
= new UnicodeSet("[\\p{Line_break=PO}]", status
);
2847 fPR
= new UnicodeSet("[\\p{Line_break=PR}]", status
);
2848 fSY
= new UnicodeSet("[\\p{Line_break=SY}]", status
);
2849 fAI
= new UnicodeSet("[\\p{Line_break=AI}]", status
);
2850 fAL
= new UnicodeSet("[\\p{Line_break=AL}]", status
);
2851 fID
= new UnicodeSet("[\\p{Line_break=ID}]", status
);
2852 fSA
= new UnicodeSet("[\\p{Line_break=SA}]", status
);
2853 fSG
= new UnicodeSet("[\\ud800-\\udfff]", status
);
2854 fXX
= new UnicodeSet("[\\p{Line_break=XX}]", status
);
2856 if (U_FAILURE(status
)) {
2857 deferredStatus
= status
;
2859 fNumberMatcher
= NULL
;
2863 fAL
->addAll(*fXX
); // Default behavior for XX is identical to AL
2864 fAL
->addAll(*fAI
); // Default behavior for AI is identical to AL
2865 fAL
->addAll(*fSA
); // Default behavior for SA is XX, which defaults to AL
2866 fAL
->addAll(*fSG
); // Default behavior for SG is identical to AL.
2868 fSets
->addElement(fBK
, status
);
2869 fSets
->addElement(fCR
, status
);
2870 fSets
->addElement(fLF
, status
);
2871 fSets
->addElement(fCM
, status
);
2872 fSets
->addElement(fNL
, status
);
2873 fSets
->addElement(fWJ
, status
);
2874 fSets
->addElement(fZW
, status
);
2875 fSets
->addElement(fGL
, status
);
2876 fSets
->addElement(fCB
, status
);
2877 fSets
->addElement(fSP
, status
);
2878 fSets
->addElement(fB2
, status
);
2879 fSets
->addElement(fBA
, status
);
2880 fSets
->addElement(fBB
, status
);
2881 fSets
->addElement(fHY
, status
);
2882 fSets
->addElement(fH2
, status
);
2883 fSets
->addElement(fH3
, status
);
2884 fSets
->addElement(fCL
, status
);
2885 fSets
->addElement(fEX
, status
);
2886 fSets
->addElement(fIN
, status
);
2887 fSets
->addElement(fJL
, status
);
2888 fSets
->addElement(fJT
, status
);
2889 fSets
->addElement(fJV
, status
);
2890 fSets
->addElement(fNS
, status
);
2891 fSets
->addElement(fOP
, status
);
2892 fSets
->addElement(fQU
, status
);
2893 fSets
->addElement(fIS
, status
);
2894 fSets
->addElement(fNU
, status
);
2895 fSets
->addElement(fPO
, status
);
2896 fSets
->addElement(fPR
, status
);
2897 fSets
->addElement(fSY
, status
);
2898 fSets
->addElement(fAI
, status
);
2899 fSets
->addElement(fAL
, status
);
2900 fSets
->addElement(fID
, status
);
2901 fSets
->addElement(fWJ
, status
);
2902 fSets
->addElement(fSA
, status
);
2903 fSets
->addElement(fSG
, status
);
2905 fNumberMatcher
= new RegexMatcher(
2906 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
2907 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
2908 "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
2909 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
2910 "(\\p{Line_Break=CL}\\p{Line_Break=CM}*)?"
2911 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?",
2914 fCharBI
= BreakIterator::createCharacterInstance(Locale::getEnglish(), status
);
2916 if (U_FAILURE(status
)) {
2917 deferredStatus
= status
;
2922 void RBBILineMonkey::setText(const UnicodeString
&s
) {
2924 fCharBI
->setText(s
);
2925 fNumberMatcher
->reset(s
);
2930 // Line Break TR rules 9 and 10 implementation.
2931 // This deals with combining marks and other sequences that
2932 // must be treated as if they were something other than what they actually are.
2934 // This is factored out into a separate function because it must be applied twice for
2935 // each potential break, once to the chars before the position being checked, then
2936 // again to the text following the possible break.
2938 void RBBILineMonkey::rule9Adjust(int32_t pos
, UChar32
*posChar
, int32_t *nextPos
, UChar32
*nextChar
) {
2940 // Invalid initial position. Happens during the warmup iteration of the
2941 // main loop in next().
2945 int32_t nPos
= *nextPos
;
2947 // LB 9 Keep combining sequences together.
2948 // advance over any CM class chars. Note that Line Break CM is different
2949 // from the normal Grapheme Extend property.
2950 if (!(fSP
->contains(*posChar
) || fBK
->contains(*posChar
) || *posChar
==0x0d ||
2951 *posChar
==0x0a ||fNL
->contains(*posChar
) || fZW
->contains(*posChar
))) {
2953 *nextChar
= fText
->char32At(nPos
);
2954 if (!fCM
->contains(*nextChar
)) {
2957 nPos
= fText
->moveIndex32(nPos
, 1);
2962 // LB 9 Treat X CM* as if it were X.
2963 // No explicit action required.
2965 // LB 10 Treat any remaining combining mark as AL
2966 if (fCM
->contains(*posChar
)) {
2967 *posChar
= 0x41; // thisChar = 'A';
2970 // Push the updated nextPos and nextChar back to our caller.
2971 // This only makes a difference if posChar got bigger by consuming a
2972 // combining sequence.
2974 *nextChar
= fText
->char32At(nPos
);
2979 int32_t RBBILineMonkey::next(int32_t startPos
) {
2980 UErrorCode status
= U_ZERO_ERROR
;
2981 int32_t pos
; // Index of the char following a potential break position
2982 UChar32 thisChar
; // Character at above position "pos"
2984 int32_t prevPos
; // Index of the char preceding a potential break position
2985 UChar32 prevChar
; // Character at above position. Note that prevChar
2986 // and thisChar may not be adjacent because combining
2987 // characters between them will be ignored.
2989 int32_t nextPos
; // Index of the next character following pos.
2990 // Usually skips over combining marks.
2991 int32_t nextCPPos
; // Index of the code point following "pos."
2992 // May point to a combining mark.
2993 int32_t tPos
; // temp value.
2996 if (startPos
>= fText
->length()) {
3001 // Initial values for loop. Loop will run the first time without finding breaks,
3002 // while the invalid values shift out and the "this" and
3003 // "prev" positions are filled in with good values.
3004 pos
= prevPos
= -1; // Invalid value, serves as flag for initial loop iteration.
3005 thisChar
= prevChar
= 0;
3006 nextPos
= nextCPPos
= startPos
;
3009 // Loop runs once per position in the test text, until a break position
3013 prevChar
= thisChar
;
3016 thisChar
= fText
->char32At(pos
);
3018 nextCPPos
= fText
->moveIndex32(pos
, 1);
3019 nextPos
= nextCPPos
;
3021 // Rule LB2 - Break at end of text.
3022 if (pos
>= fText
->length()) {
3026 // Rule LB 9 - adjust for combining sequences.
3027 // We do this one out-of-order because the adjustment does not change anything
3028 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3030 rule9Adjust(prevPos
, &prevChar
, &pos
, &thisChar
);
3031 nextCPPos
= nextPos
= fText
->moveIndex32(pos
, 1);
3032 c
= fText
->char32At(nextPos
);
3033 rule9Adjust(pos
, &thisChar
, &nextPos
, &c
);
3035 // If the loop is still warming up - if we haven't shifted the initial
3036 // -1 positions out of prevPos yet - loop back to advance the
3037 // position in the input without any further looking for breaks.
3038 if (prevPos
== -1) {
3042 // LB 4 Always break after hard line breaks,
3043 if (fBK
->contains(prevChar
)) {
3047 // LB 5 Break after CR, LF, NL, but not inside CR LF
3048 if (prevChar
== 0x0d && thisChar
== 0x0a) {
3051 if (prevChar
== 0x0d ||
3057 // LB 6 Don't break before hard line breaks
3058 if (thisChar
== 0x0d || thisChar
== 0x0a || thisChar
== 0x85 ||
3059 fBK
->contains(thisChar
)) {
3064 // LB 7 Don't break before spaces or zero-width space.
3065 if (fSP
->contains(thisChar
)) {
3069 if (fZW
->contains(thisChar
)) {
3073 // LB 8 Break after zero width space
3074 if (fZW
->contains(prevChar
)) {
3078 // LB 9, 10 Already done, at top of loop.
3082 // LB 11 Do not break before or after WORD JOINER and related characters.
3086 if (fWJ
->contains(thisChar
) || fWJ
->contains(prevChar
)) {
3093 if ((!fSP
->contains(prevChar
)) && fGL
->contains(thisChar
) ||
3094 fGL
->contains(prevChar
)) {
3100 // LB 13 Don't break before closings.
3101 // NU x CL and NU x IS are not matched here so that they will
3102 // fall into LB 17 and the more general number regular expression.
3104 if (!fNU
->contains(prevChar
) && fCL
->contains(thisChar
) ||
3105 fEX
->contains(thisChar
) ||
3106 !fNU
->contains(prevChar
) && fIS
->contains(thisChar
) ||
3107 !fNU
->contains(prevChar
) && fSY
->contains(thisChar
)) {
3111 // LB 14 Don't break after OP SP*
3112 // Scan backwards, checking for this sequence.
3113 // The OP char could include combining marks, so we actually check for
3115 // Another Twist: The Rule 67 fixes may have changed a SP CM
3116 // sequence into a ID char, so before scanning back through spaces,
3117 // verify that prevChar is indeed a space. The prevChar variable
3118 // may differ from fText[prevPos]
3120 if (fSP
->contains(prevChar
)) {
3121 while (tPos
> 0 && fSP
->contains(fText
->char32At(tPos
))) {
3122 tPos
=fText
->moveIndex32(tPos
, -1);
3125 while (tPos
> 0 && fCM
->contains(fText
->char32At(tPos
))) {
3126 tPos
=fText
->moveIndex32(tPos
, -1);
3128 if (fOP
->contains(fText
->char32At(tPos
))) {
3133 // LB 15 QU SP* x OP
3134 if (fOP
->contains(thisChar
)) {
3135 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3137 while (tPos
>0 && fSP
->contains(fText
->char32At(tPos
))) {
3138 tPos
= fText
->moveIndex32(tPos
, -1);
3140 while (tPos
>0 && fCM
->contains(fText
->char32At(tPos
))) {
3141 tPos
= fText
->moveIndex32(tPos
, -1);
3143 if (fQU
->contains(fText
->char32At(tPos
))) {
3150 // LB 16 CL SP* x NS
3151 // Scan backwards for SP* CM* CL
3152 if (fNS
->contains(thisChar
)) {
3154 while (tPos
>0 && fSP
->contains(fText
->char32At(tPos
))) {
3155 tPos
= fText
->moveIndex32(tPos
, -1);
3157 while (tPos
>0 && fCM
->contains(fText
->char32At(tPos
))) {
3158 tPos
= fText
->moveIndex32(tPos
, -1);
3160 if (fCL
->contains(fText
->char32At(tPos
))) {
3166 // LB 17 B2 SP* x B2
3167 if (fB2
->contains(thisChar
)) {
3168 // Scan backwards, checking for the B2 CM* SP* sequence.
3170 if (fSP
->contains(prevChar
)) {
3171 while (tPos
> 0 && fSP
->contains(fText
->char32At(tPos
))) {
3172 tPos
=fText
->moveIndex32(tPos
, -1);
3175 while (tPos
> 0 && fCM
->contains(fText
->char32At(tPos
))) {
3176 tPos
=fText
->moveIndex32(tPos
, -1);
3178 if (fB2
->contains(fText
->char32At(tPos
))) {
3184 // LB 18 break after space
3185 if (fSP
->contains(prevChar
)) {
3192 if (fQU
->contains(thisChar
) || fQU
->contains(prevChar
)) {
3196 // LB 20 Break around a CB
3197 if (fCB
->contains(thisChar
) || fCB
->contains(prevChar
)) {
3202 if (fBA
->contains(thisChar
) ||
3203 fHY
->contains(thisChar
) ||
3204 fNS
->contains(thisChar
) ||
3205 fBB
->contains(prevChar
) ) {
3210 if (fAL
->contains(prevChar
) && fIN
->contains(thisChar
) ||
3211 fID
->contains(prevChar
) && fIN
->contains(thisChar
) ||
3212 fIN
->contains(prevChar
) && fIN
->contains(thisChar
) ||
3213 fNU
->contains(prevChar
) && fIN
->contains(thisChar
) ) {
3221 if (fID
->contains(prevChar
) && fPO
->contains(thisChar
) ||
3222 fAL
->contains(prevChar
) && fNU
->contains(thisChar
) ||
3223 fNU
->contains(prevChar
) && fAL
->contains(thisChar
) ) {
3227 // LB 24 Do not break between prefix and letters or ideographs.
3231 if (fPR
->contains(prevChar
) && fID
->contains(thisChar
) ||
3232 fPR
->contains(prevChar
) && fAL
->contains(thisChar
) ||
3233 fPO
->contains(prevChar
) && fAL
->contains(thisChar
) ) {
3240 if (fNumberMatcher
->lookingAt(prevPos
, status
)) {
3241 if (U_FAILURE(status
)) {
3244 // Matched a number. But could have been just a single digit, which would
3245 // not represent a "no break here" between prevChar and thisChar
3246 int32_t numEndIdx
= fNumberMatcher
->end(status
); // idx of first char following num
3247 if (numEndIdx
> pos
) {
3248 // Number match includes at least our two chars being checked
3249 if (numEndIdx
> nextPos
) {
3250 // Number match includes additional chars. Update pos and nextPos
3251 // so that next loop iteration will continue at the end of the number,
3252 // checking for breaks between last char in number & whatever follows.
3253 pos
= nextPos
= numEndIdx
;
3255 pos
= fText
->moveIndex32(pos
, -1);
3256 thisChar
= fText
->char32At(pos
);
3257 } while (fCM
->contains(thisChar
));
3264 // LB 26 Do not break a Korean syllable.
3265 if (fJL
->contains(prevChar
) && (fJL
->contains(thisChar
) ||
3266 fJV
->contains(thisChar
) ||
3267 fH2
->contains(thisChar
) ||
3268 fH3
->contains(thisChar
))) {
3272 if ((fJV
->contains(prevChar
) || fH2
->contains(prevChar
)) &&
3273 (fJV
->contains(thisChar
) || fJT
->contains(thisChar
))) {
3277 if ((fJT
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3278 fJT
->contains(thisChar
)) {
3282 // LB 27 Treat a Korean Syllable Block the same as ID.
3283 if ((fJL
->contains(prevChar
) || fJV
->contains(prevChar
) ||
3284 fJT
->contains(prevChar
) || fH2
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3285 fIN
->contains(thisChar
)) {
3288 if ((fJL
->contains(prevChar
) || fJV
->contains(prevChar
) ||
3289 fJT
->contains(prevChar
) || fH2
->contains(prevChar
) || fH3
->contains(prevChar
)) &&
3290 fPO
->contains(thisChar
)) {
3293 if (fPR
->contains(prevChar
) && (fJL
->contains(thisChar
) || fJV
->contains(thisChar
) ||
3294 fJT
->contains(thisChar
) || fH2
->contains(thisChar
) || fH3
->contains(thisChar
))) {
3300 // LB 28 Do not break between alphabetics (“at”).
3301 if (fAL
->contains(prevChar
) && fAL
->contains(thisChar
)) {
3305 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
3306 if (fIS
->contains(prevChar
) && fAL
->contains(thisChar
)) {
3310 //LB 30 Do not break between letters, numbers or ordinary symbols and opening or closing punctuation
3313 if ((fAL
->contains(prevChar
) || fNU
->contains(prevChar
)) &&
3314 fOP
->contains(thisChar
)) {
3317 if (fCL
->contains(prevChar
) &&
3318 (fAL
->contains(thisChar
) || fNU
->contains(thisChar
))) {
3323 // LB 31 Break everywhere else
3332 UVector
*RBBILineMonkey::charClasses() {
3337 RBBILineMonkey::~RBBILineMonkey() {
3378 delete fNumberMatcher
;
3382 //-------------------------------------------------------------------------------------------
3387 // seed=nnnnn Random number starting seed.
3388 // Setting the seed allows errors to be reproduced.
3389 // loop=nnn Looping count. Controls running time.
3391 // 0 or greater: run length.
3393 // type = char | word | line | sent | title
3395 //-------------------------------------------------------------------------------------------
3397 static int32_t getIntParam(UnicodeString name
, UnicodeString
&params
, int32_t defaultVal
) {
3398 int32_t val
= defaultVal
;
3399 name
.append(" *= *(-?\\d+)");
3400 UErrorCode status
= U_ZERO_ERROR
;
3401 RegexMatcher
m(name
, params
, 0, status
);
3403 // The param exists. Convert the string to an int.
3404 char valString
[100];
3405 int32_t paramLength
= m
.end(1, status
) - m
.start(1, status
);
3406 if (paramLength
>= (int32_t)(sizeof(valString
)-1)) {
3407 paramLength
= (int32_t)(sizeof(valString
)-2);
3409 params
.extract(m
.start(1, status
), paramLength
, valString
, sizeof(valString
));
3410 val
= strtol(valString
, NULL
, 10);
3412 // Delete this parameter from the params string.
3414 params
= m
.replaceFirst("", status
);
3416 U_ASSERT(U_SUCCESS(status
));
3421 static void testBreakBoundPreceding(RBBITest
*test
, UnicodeString ustr
,
3430 for (i
= bi
->first(); i
!= BreakIterator::DONE
; i
= bi
->next()) {
3432 if (count
< expectedcount
&& expected
[count
] != i
) {
3433 test
->errln("break forward test failed: expected %d but got %d",
3434 expected
[count
], i
);
3439 if (count
!= expectedcount
) {
3440 printStringBreaks(ustr
, expected
, expectedcount
);
3441 test
->errln("break forward test failed: missed %d match",
3442 expectedcount
- count
);
3445 // testing boundaries
3446 for (i
= 1; i
< expectedcount
; i
++) {
3447 int j
= expected
[i
- 1];
3448 if (!bi
->isBoundary(j
)) {
3449 printStringBreaks(ustr
, expected
, expectedcount
);
3450 test
->errln("isBoundary() failed. Expected boundary at position %d", j
);
3453 for (j
= expected
[i
- 1] + 1; j
< expected
[i
]; j
++) {
3454 if (bi
->isBoundary(j
)) {
3455 printStringBreaks(ustr
, expected
, expectedcount
);
3456 test
->errln("isBoundary() failed. Not expecting boundary at position %d", j
);
3462 for (i
= bi
->last(); i
!= BreakIterator::DONE
; i
= bi
->previous()) {
3464 if (forward
[count
] != i
) {
3465 test
->errln("happy break test previous() failed: expected %d but got %d",
3471 printStringBreaks(ustr
, expected
, expectedcount
);
3472 test
->errln("break test previous() failed: missed a match");
3476 // testing preceding
3477 for (i
= 0; i
< expectedcount
- 1; i
++) {
3478 // int j = expected[i] + 1;
3479 int j
= ustr
.moveIndex32(expected
[i
], 1);
3480 for (; j
<= expected
[i
+ 1]; j
++) {
3481 if (bi
->preceding(j
) != expected
[i
]) {
3482 printStringBreaks(ustr
, expected
, expectedcount
);
3483 test
->errln("preceding(): Not expecting boundary at position %d", j
);
// TestWordBreaks
//   Regression test for the word break iterator created for the "en" locale.
//   Each entry of strlist is an escaped string; it is unescaped into str,
//   the expected break offsets are collected by stepping
//   RBBIWordMonkey::next() from offset 0 until BreakIterator::DONE, and the
//   offsets are handed to testBreakBoundPreceding() together with the
//   iterator under test.
//   The body is compiled only when !UCONFIG_NO_REGULAR_EXPRESSIONS.
3490 void RBBITest::TestWordBreaks(void)
3492 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3494 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3495 Locale
locale("en");
3496 UErrorCode status
= U_ZERO_ERROR
;
3497 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3498 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
// Test data. The backslashes are doubled, so the C++ literals contain the
// escape sequences as text; they are expanded later by u_unescape().
3500 static const char *strlist
[] =
3502 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3503 "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
3504 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u179c\\u0027\\U000e0061\\u003a",
3505 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3506 "\\u90ca\\u3588\\u009c\\u0953\\u194b",
3507 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3508 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3509 "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
3510 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3511 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3512 "\\u2027\\U000e0067\\u0a47\\u00b7",
3513 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3514 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3515 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3516 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3517 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3518 "\\u0027\\u11af\\U000e0057\\u0602",
// NOTE(review): in the next entry only seven hex digits follow "\\U" before
// the next backslash, whereas the entry two lines below has "\\U000e007d".
// Looks like a truncated escape - confirm whether this is intentional.
3519 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3520 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3521 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3522 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3523 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3524 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3525 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3526 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3527 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3528 "\\u58f4\\U000e0049\\u20e7\\u2027",
3529 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3530 "\\ua183\\u102d\\u0bec\\u003a",
3531 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3532 "\\u003a\\u0e57\\u0fad\\u002e",
3533 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3534 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3535 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3536 "\\u003a\\u0664\\u00b7\\u1fba",
3537 "\\u003b\\u0027\\u00b7\\u47a3",
3538 "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
3539 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3540 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
// Report a failure to create the word break iterator.
3543 if (U_FAILURE(status
)) {
3544 errln("Creation of break iterator failed %s", u_errorName(status
));
// One pass per test string; the element count is derived from sizeof(strlist).
3547 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
3548 // printf("looping %d\n", loop);
// Expand the escapes into str; 25 is passed as the destination capacity.
3549 u_unescape(strlist
[loop
], str
, 25);
3550 UnicodeString
ustr(str
);
3551 // RBBICharMonkey monkey;
3552 RBBIWordMonkey monkey
;
3555 int expectedcount
= 0;
3557 monkey
.setText(ustr
);
// Collect the monkey's break offsets, starting with offset 0, until
// next() returns BreakIterator::DONE.
3559 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
3560 expected
[expectedcount
++] = i
;
// Check the iterator under test against the collected offsets.
3563 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
// TestWordBoundary
//   Checks that isBoundary() of the "en" word break iterator agrees with
//   forward iteration. For every test string the breaks found with
//   first()/next() are recorded in forward[]; each offset j from prev + 1 up
//   to (but not including) the current break i must not be a boundary, and
//   i itself must be one. Failures print the string breaks and call errln().
3569 void RBBITest::TestWordBoundary(void)
3571 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3572 Locale
locale("en");
3573 UErrorCode status
= U_ZERO_ERROR
;
3574 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3575 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
// Test data. The backslashes are doubled, so the C++ literals contain the
// escape sequences as text; they are expanded later by u_unescape().
3577 static const char *strlist
[] =
3579 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3580 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3581 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3582 "\\u2027\\U000e0067\\u0a47\\u00b7",
3583 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3584 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3585 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3586 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3587 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3588 "\\u0027\\u11af\\U000e0057\\u0602",
// NOTE(review): in the next entry only seven hex digits follow "\\U" before
// the next backslash, whereas the entry two lines below has "\\U000e007d".
// Looks like a truncated escape - confirm whether this is intentional.
3589 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3590 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3591 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3592 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3593 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3594 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3595 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3596 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3597 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3598 "\\u58f4\\U000e0049\\u20e7\\u2027",
3599 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3600 "\\ua183\\u102d\\u0bec\\u003a",
3601 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3602 "\\u003a\\u0e57\\u0fad\\u002e",
3603 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3604 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3605 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3606 "\\u003a\\u0664\\u00b7\\u1fba",
3607 "\\u003b\\u0027\\u00b7\\u47a3",
// Report a failure to create the word break iterator.
3610 if (U_FAILURE(status
)) {
3611 errln("Creation of break iterator failed %s", u_errorName(status
));
// One pass per test string; the element count is derived from sizeof(strlist).
3614 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
3615 // printf("looping %d\n", loop);
// Expand the escapes into str; 20 is passed as the destination capacity.
3616 u_unescape(strlist
[loop
], str
, 20);
3617 UnicodeString
ustr(str
);
// Walk the breaks front to back, remembering each one in forward[].
3624 for (i
= bi
->first(); i
!= BreakIterator::DONE
; i
= bi
->next()) {
3625 forward
[count
++] = i
;
// No offset strictly between the previous break and this one may be
// reported as a boundary.
3628 for (j
= prev
+ 1; j
< i
; j
++) {
3629 if (bi
->isBoundary(j
)) {
3630 printStringBreaks(ustr
, forward
, count
);
3631 errln("happy boundary test failed: expected %d not a boundary",
// The break returned by the iteration itself must be a boundary.
3637 if (!bi
->isBoundary(i
)) {
3638 printStringBreaks(ustr
, forward
, count
);
3639 errln("happy boundary test failed: expected %d a boundary",
// TestLineBreaks
//   Regression test for the line break iterator created for the "en" locale.
//   Each entry of strlist is unescaped into str (capacity STRSIZE), the
//   expected break offsets are collected by stepping RBBILineMonkey::next()
//   from offset 0 until BreakIterator::DONE (at most EXPECTEDSIZE of them),
//   and the offsets are handed to testBreakBoundPreceding() together with
//   the iterator under test.
//   The body is compiled only when !UCONFIG_NO_REGULAR_EXPRESSIONS.
3649 void RBBITest::TestLineBreaks(void)
3651 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3652 Locale
locale("en");
3653 UErrorCode status
= U_ZERO_ERROR
;
3654 BreakIterator
*bi
= BreakIterator::createLineInstance(locale
, status
);
// Destination capacity passed to u_unescape() below.
3655 const int32_t STRSIZE
= 50;
// Test data. The backslashes are doubled, so the C++ literals contain the
// escape sequences as text. Some entries are built from adjacent literals
// that the compiler concatenates, in places splitting an escape in two.
3657 static const char *strlist
[] =
3659 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3660 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3661 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3662 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3663 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3664 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3665 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3666 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3667 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3668 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3669 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
3670 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3671 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3672 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3673 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3674 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3675 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3676 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3677 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3678 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3679 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3680 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3681 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3682 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3683 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3684 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3685 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
3686 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3687 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3688 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3689 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3690 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3691 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
3692 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3693 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3694 "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
3695 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3696 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3697 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3698 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3699 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3700 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3701 "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
3702 "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
3703 "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
3704 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3705 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
// Report (via TEST_ASSERT_SUCCESS) a failure to create the line break iterator.
3708 TEST_ASSERT_SUCCESS(status
);
3709 if (U_FAILURE(status
)) {
// One pass per test string; the element count is derived from sizeof(strlist).
3712 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
3713 // printf("looping %d\n", loop);
// Expand the escapes into str; t receives u_unescape()'s return value.
3714 int32_t t
= u_unescape(strlist
[loop
], str
, STRSIZE
);
3721 UnicodeString
ustr(str
);
3722 RBBILineMonkey monkey
;
// The monkey records construction errors in deferredStatus.
3723 if (U_FAILURE(monkey
.deferredStatus
)) {
3727 const int EXPECTEDSIZE
= 50;
3728 int expected
[EXPECTEDSIZE
];
3729 int expectedcount
= 0;
3731 monkey
.setText(ustr
);
// Collect the monkey's break offsets, starting with offset 0, until
// next() returns BreakIterator::DONE.
3733 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
// expected[] is full: the condition asserted below is false whenever this
// branch is entered, so TEST_ASSERT reports the overflow.
3734 if (expectedcount
>= EXPECTEDSIZE
) {
3735 TEST_ASSERT(expectedcount
< EXPECTEDSIZE
);
3738 expected
[expectedcount
++] = i
;
// Check the iterator under test against the collected offsets.
3741 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
// TestSentBreaks
//   Regression test for the sentence break iterator created for the "en"
//   locale. Each entry of strlist is unescaped into str, the expected break
//   offsets are collected by stepping RBBISentMonkey::next() from offset 0
//   until BreakIterator::DONE (at most EXPECTEDSIZE of them), and the offsets
//   are handed to testBreakBoundPreceding() together with the iterator under
//   test.
//   The body is compiled only when !UCONFIG_NO_REGULAR_EXPRESSIONS.
3747 void RBBITest::TestSentBreaks(void)
3749 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3750 Locale
locale("en");
3751 UErrorCode status
= U_ZERO_ERROR
;
3752 BreakIterator
*bi
= BreakIterator::createSentenceInstance(locale
, status
);
// Test data. The first entry uses ordinary C++ escapes (\r, \n); the
// "\\uXXXX" / "\\UXXXXXXXX" sequences in later entries are stored as text
// and expanded by u_unescape(). The last two entries are each built from
// adjacent literals that the compiler concatenates.
3754 static const char *strlist
[] =
3756 "Now\ris\nthe\r\ntime\n\rfor\r\r",
3758 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3759 "\"Sentence ending with a quote.\" Bye.",
3760 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
3761 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3762 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3763 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3764 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3765 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3766 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3767 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3768 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3769 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3770 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3771 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3772 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3773 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3774 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3775 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
// Report a failure to create the sentence break iterator.
3778 if (U_FAILURE(status
)) {
3779 errln("Creation of break iterator failed %s", u_errorName(status
));
// One pass per test string; the element count is derived from sizeof(strlist).
3782 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
// Expand the escapes into str; the capacity is the element count of str.
3783 u_unescape(strlist
[loop
], str
, (int32_t)(sizeof(str
) / sizeof(str
[0])));
3784 UnicodeString
ustr(str
);
3786 RBBISentMonkey monkey
;
// The monkey records construction errors in deferredStatus.
3787 if (U_FAILURE(monkey
.deferredStatus
)) {
3791 const int EXPECTEDSIZE
= 50;
3792 int expected
[EXPECTEDSIZE
];
3793 int expectedcount
= 0;
3795 monkey
.setText(ustr
);
// Collect the monkey's break offsets, starting with offset 0, until
// next() returns BreakIterator::DONE.
3797 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
// expected[] is full: the condition asserted below is false whenever this
// branch is entered, so TEST_ASSERT reports the overflow.
3798 if (expectedcount
>= EXPECTEDSIZE
) {
3799 TEST_ASSERT(expectedcount
< EXPECTEDSIZE
);
3802 expected
[expectedcount
++] = i
;
// Check the iterator under test against the collected offsets.
3805 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
// TestMonkey
//   Entry point for the randomized ("monkey") break iterator tests.
//
//   params - option string. The options handled in this function are
//              "loop" and "seed"  (read with getIntParam),
//              "type = char|word|line|sent|title"  and  "utext"
//            (both matched with a RegexMatcher and then removed from the
//            string with replaceFirst). Any non-whitespace text left over
//            afterwards is reported as an unrecognized parameter.
//
//   For every selected break type ("all", the default, selects each of the
//   char, word, line and sent branches below) a break iterator for the "en"
//   locale is created and handed to RunMonkey(); a creation failure is
//   reported with errln().
//   The body is compiled only when !UCONFIG_NO_REGULAR_EXPRESSIONS.
3811 void RBBITest::TestMonkey(char *params
) {
3812 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3814 UErrorCode status
= U_ZERO_ERROR
;
// Defaults, used when params does not override them.
3815 int32_t loopCount
= 500;
3817 UnicodeString breakType
= "all";
3818 Locale
locale("en");
3819 UBool useUText
= FALSE
;
3821 if (quick
== FALSE
) {
// Parse the option string.
3826 UnicodeString
p(params
);
3827 loopCount
= getIntParam("loop", p
, loopCount
);
3828 seed
= getIntParam("seed", p
, seed
);
3830 RegexMatcher
m(" *type *= *(char|word|line|sent|title) *", p
, 0, status
);
// Capture group 1 is the break type name; the matched option is then
// stripped from p.
3832 breakType
= m
.group(1, status
);
3834 p
= m
.replaceFirst("", status
);
3837 RegexMatcher
u(" *utext", p
, 0, status
);
3841 p
= u
.replaceFirst("", status
);
3846 if (RegexMatcher("\\S", p
, 0, status
).find()) {
3847 // Each option is stripped out of the option string as it is processed.
3848 // All options have been checked. The option string should have been completely emptied..
3850 p
.extract(buf
, sizeof(buf
), NULL
, status
);
// Force termination in case the leftover text filled buf completely.
3851 buf
[sizeof(buf
)-1] = 0;
3852 errln("Unrecognized or extra parameter: %s\n", buf
);
// --- Character breaks ---
3858 if (breakType
== "char" || breakType
== "all") {
3860 BreakIterator
*bi
= BreakIterator::createCharacterInstance(locale
, status
);
3861 if (U_SUCCESS(status
)) {
3862 RunMonkey(bi
, m
, "char", seed
, loopCount
, useUText
);
3863 if (breakType
== "all" && useUText
==FALSE
) {
3864 // Also run a quick test with UText when "all" is specified
3865 RunMonkey(bi
, m
, "char", seed
, loopCount
, TRUE
);
3869 errln("Creation of character break iterator failed %s", u_errorName(status
));
// --- Word breaks ---
3874 if (breakType
== "word" || breakType
== "all") {
3875 logln("Word Break Monkey Test");
3877 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
3878 if (U_SUCCESS(status
)) {
3879 RunMonkey(bi
, m
, "word", seed
, loopCount
, useUText
);
3882 errln("Creation of word break iterator failed %s", u_errorName(status
));
// --- Line breaks ---
3887 if (breakType
== "line" || breakType
== "all") {
3888 logln("Line Break Monkey Test");
3890 BreakIterator
*bi
= BreakIterator::createLineInstance(locale
, status
);
// NOTE(review): loopCount itself is reduced here, so with type "all" the
// sentence branch below starts from the already reduced value - confirm
// that the compounded reduction is intended.
3891 if (loopCount
>= 10) {
3892 loopCount
= loopCount
/ 5; // Line break runs slower than the others.
3894 if (U_SUCCESS(status
)) {
3895 RunMonkey(bi
, m
, "line", seed
, loopCount
, useUText
);
3898 errln("Creation of line break iterator failed %s", u_errorName(status
));
// --- Sentence breaks ---
3903 if (breakType
== "sent" || breakType
== "all" ) {
3904 logln("Sentence Break Monkey Test");
3906 BreakIterator
*bi
= BreakIterator::createSentenceInstance(locale
, status
);
3907 if (loopCount
>= 10) {
3908 loopCount
= loopCount
/ 10; // Sentence runs slower than the other break types
3910 if (U_SUCCESS(status
)) {
3911 RunMonkey(bi
, m
, "sentence", seed
, loopCount
, useUText
);
// This branch creates a sentence iterator, so the message names it as such
// (it previously said "line", copied from the branch above).
3914 errln("Creation of sentence break iterator failed %s", u_errorName(status
));
3923 // Run a RBBI monkey test. Common routine, for all break iterator types.
3925 // bi - the break iterator to use
3926 // mk - MonkeyKind, abstraction for obtaining expected results
3927 // name - Name of test (char, word, etc.) for use in error messages
3928 // seed - Seed for starting random number generator (parameter from user)
// numIterations - number of random test strings to check; -1 keeps the
//                 main loop running without end (see the loop condition).
// useUText      - NOTE(review): presumably selects setting the text through
//                 a UText rather than directly from the UnicodeString; the
//                 test on it is not visible here - confirm.
//
// For each iteration a string of TESTSTRINGLEN random characters is drawn
// from mk's character classes. The break positions expected by mk are then
// compared, offset by offset, with those produced by next(), previous(),
// isBoundary(), following() and preceding(). A mismatch is reported with
// errln(), together with an excerpt of the text formatted as a <data> item.
3931 void RBBITest::RunMonkey(BreakIterator
*bi
, RBBIMonkeyKind
&mk
, const char *name
, uint32_t seed
,
3932 int32_t numIterations
, UBool useUText
) {
3934 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3936 const int32_t TESTSTRINGLEN
= 500;
3937 UnicodeString testText
;
3938 int32_t numCharClasses
;
// The per-offset arrays hold one flag for every offset of the test text,
// sized for up to two code units per generated character plus the end offset.
3940 int expected
[TESTSTRINGLEN
*2 + 1];
3941 int expectedCount
= 0;
3942 char expectedBreaks
[TESTSTRINGLEN
*2 + 1];
3943 char forwardBreaks
[TESTSTRINGLEN
*2 + 1];
3944 char reverseBreaks
[TESTSTRINGLEN
*2+1];
3945 char isBoundaryBreaks
[TESTSTRINGLEN
*2+1];
3946 char followingBreaks
[TESTSTRINGLEN
*2+1];
3947 char precedingBreaks
[TESTSTRINGLEN
*2+1];
3953 numCharClasses
= mk
.charClasses()->size();
3954 chClasses
= mk
.charClasses();
3956 // Check for errors that occurred during the construction of the MonkeyKind object.
3957 // Can't report them where they occurred because errln() is a method coming from intlTest,
3958 // and is not visible outside of RBBITest :-(
3959 if (U_FAILURE(mk
.deferredStatus
)) {
3960 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk
.deferredStatus
));
3964 // Verify that the character classes all have at least one member.
3965 for (i
=0; i
<numCharClasses
; i
++) {
3966 UnicodeSet
*s
= (UnicodeSet
*)chClasses
->elementAt(i
);
3967 if (s
== NULL
|| s
->size() == 0) {
3968 errln("Character Class #%d is null or of zero size.", i
);
3973 while (loopCount
< numIterations
|| numIterations
== -1) {
3974 if (numIterations
== -1 && loopCount
% 10 == 0) {
3975 // If test is running in an infinite loop, display a periodic tic so
3976 // we can tell that it is making progress.
3977 fprintf(stderr
, ".");
3979 // Save current random number seed, so that we can recreate the random numbers
3980 // for this loop iteration in event of an error.
3983 // Populate a test string with data.
3984 testText
.truncate(0);
3985 for (i
=0; i
<TESTSTRINGLEN
; i
++) {
// Pick a random character class, then a random member of that class.
3986 int32_t aClassNum
= m_rand() % numCharClasses
;
3987 UnicodeSet
*classSet
= (UnicodeSet
*)chClasses
->elementAt(aClassNum
);
3988 int32_t charIdx
= m_rand() % classSet
->size();
3989 UChar32 c
= classSet
->charAt(charIdx
);
3990 if (c
< 0) { // TODO: deal with sets containing strings.
3997 // Calculate the expected results for this test string.
3998 mk
.setText(testText
);
3999 memset(expectedBreaks
, 0, sizeof(expectedBreaks
));
// Offset 0 is always an expected break.
4000 expectedBreaks
[0] = 1;
4001 int32_t breakPos
= 0;
4004 breakPos
= mk
.next(breakPos
);
4005 if (breakPos
== -1) {
4008 if (breakPos
> testText
.length()) {
4009 errln("breakPos > testText.length()");
4011 expectedBreaks
[breakPos
] = 1;
4012 U_ASSERT(expectedCount
<testText
.length());
4013 expected
[expectedCount
++] = breakPos
;
4016 // Find the break positions using forward iteration
4017 memset(forwardBreaks
, 0, sizeof(forwardBreaks
));
4019 UErrorCode status
= U_ZERO_ERROR
;
4020 UText
*testUText
= utext_openReplaceable(NULL
, &testText
, &status
);
4021 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4022 bi
->setText(testUText
, status
);
4023 TEST_ASSERT_SUCCESS(status
);
4024 utext_close(testUText
); // The break iterator does a shallow clone of the UText
4025 // This UText can be closed immediately, so long as the
4026 // testText string continues to exist.
4028 bi
->setText(testText
);
4031 for (i
=bi
->first(); i
!= BreakIterator::DONE
; i
=bi
->next()) {
4032 if (i
< 0 || i
> testText
.length()) {
4033 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name
);
4036 forwardBreaks
[i
] = 1;
4039 // Find the break positions using reverse iteration
4040 memset(reverseBreaks
, 0, sizeof(reverseBreaks
));
4041 for (i
=bi
->last(); i
!= BreakIterator::DONE
; i
=bi
->previous()) {
4042 if (i
< 0 || i
> testText
.length()) {
// This loop is driven by previous(), so the message names previous()
// (it previously said next(), copied from the forward loop).
4043 errln("%s break monkey test: Out of range value returned by breakIterator::previous()", name
);
4046 reverseBreaks
[i
] = 1;
4049 // Find the break positions using isBoundary() tests.
4050 memset(isBoundaryBreaks
, 0, sizeof(isBoundaryBreaks
));
4051 U_ASSERT((int32_t)sizeof(isBoundaryBreaks
) > testText
.length());
4052 for (i
=0; i
<=testText
.length(); i
++) {
4053 isBoundaryBreaks
[i
] = bi
->isBoundary(i
);
4057 // Find the break positions using the following() function.
4059 memset(followingBreaks
, 0, sizeof(followingBreaks
));
4060 int32_t lastBreakPos
= 0;
4061 followingBreaks
[0] = 1;
4062 for (i
=0; i
<testText
.length(); i
++) {
4063 breakPos
= bi
->following(i
);
// following(i) must be beyond i, within the text, and must never fall
// before the previously returned break, nor jump past it while that
// break is still ahead of i.
4064 if (breakPos
<= i
||
4065 breakPos
< lastBreakPos
||
4066 breakPos
> testText
.length() ||
4067 breakPos
> lastBreakPos
&& lastBreakPos
> i
) {
4068 errln("%s break monkey test: "
4069 "Out of range value returned by BreakIterator::following().\n"
4070 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4071 name
, seed
, i
, breakPos
, lastBreakPos
);
4074 followingBreaks
[breakPos
] = 1;
4075 lastBreakPos
= breakPos
;
4078 // Find the break positions using the preceding() function.
// Clear precedingBreaks using its own size (the original passed
// sizeof(followingBreaks), which only worked because the sizes match).
4079 memset(precedingBreaks
, 0, sizeof(precedingBreaks
));
4080 lastBreakPos
= testText
.length();
4081 precedingBreaks
[testText
.length()] = 1;
4082 for (i
=testText
.length(); i
>0; i
--) {
4083 breakPos
= bi
->preceding(i
);
4084 if (breakPos
>= i
||
4085 breakPos
> lastBreakPos
||
4086 breakPos
< 0 && testText
.getChar32Start(i
)>0 ||
4087 breakPos
< lastBreakPos
&& lastBreakPos
< testText
.getChar32Start(i
) ) {
4088 errln("%s break monkey test: "
4089 "Out of range value returned by BreakIterator::preceding().\n"
4090 "index=%d; prev returned %d; lastBreak=%d" ,
4091 name
, i
, breakPos
, lastBreakPos
);
4092 precedingBreaks
[i
] = 2; // Forces an error.
// A negative result passes the range check above when getChar32Start(i) is 0;
// guard the store so that it cannot write before the start of the array.
if (breakPos >= 0)
4094 precedingBreaks
[breakPos
] = 1;
4095 lastBreakPos
= breakPos
;
4099 // Compare the expected and actual results.
4100 for (i
=0; i
<=testText
.length(); i
++) {
// errorType names the first operation whose result disagrees at offset i.
4101 const char *errorType
= NULL
;
4102 if (forwardBreaks
[i
] != expectedBreaks
[i
]) {
4103 errorType
= "next()";
4104 } else if (reverseBreaks
[i
] != forwardBreaks
[i
]) {
4105 errorType
= "previous()";
4106 } else if (isBoundaryBreaks
[i
] != expectedBreaks
[i
]) {
4107 errorType
= "isBoundary()";
4108 } else if (followingBreaks
[i
] != expectedBreaks
[i
]) {
4109 errorType
= "following()";
4110 } else if (precedingBreaks
[i
] != expectedBreaks
[i
]) {
4111 errorType
= "preceding()";
4115 if (errorType
!= NULL
) {
4116 // Format a range of the test text that includes the failure as
4117 // a data item that can be included in the rbbi test data file.
4119 // Start of the range is the last point where expected and actual results
4120 // both agreed that there was a break position.
4121 int startContext
= i
;
4124 if (startContext
==0) { break; }
4126 if (expectedBreaks
[startContext
] != 0) {
4127 if (count
== 2) break;
4132 // End of range is two expected breaks past the start position.
4133 int endContext
= i
+ 1;
4135 for (ci
=0; ci
<2; ci
++) { // Number of items to include in error text.
4137 if (endContext
>= testText
.length()) {break;}
4138 if (expectedBreaks
[endContext
-1] != 0) {
4139 if (count
== 0) break;
4146 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4147 UnicodeString errorText
= "<data>";
4148 /***if (strcmp(errorType, "next()") == 0) {
4150 endContext = testText.length();
4152 printStringBreaks(testText, expected, expectedCount);
4155 for (ci
=startContext
; ci
<endContext
;) {
4156 UnicodeString
hexChars("0123456789abcdef");
4159 c
= testText
.char32At(ci
);
4161 // This is the location of the error.
4162 errorText
.append("<?>");
4163 } else if (expectedBreaks
[ci
] != 0) {
4164 // This a non-error expected break position.
4165 errorText
.append("\\");
4168 errorText
.append("\\u");
4169 for (bn
=12; bn
>=0; bn
-=4) {
4170 errorText
.append(hexChars
.charAt((c
>>bn
)&0xf));
4173 errorText
.append("\\U");
4174 for (bn
=28; bn
>=0; bn
-=4) {
4175 errorText
.append(hexChars
.charAt((c
>>bn
)&0xf));
4178 ci
= testText
.moveIndex32(ci
, 1);
4180 errorText
.append("\\");
4181 errorText
.append("</data>\n");
4184 char charErrorTxt
[500];
4185 UErrorCode status
= U_ZERO_ERROR
;
4186 errorText
.extract(charErrorTxt
, sizeof(charErrorTxt
), NULL
, status
);
4187 charErrorTxt
[sizeof(charErrorTxt
)-1] = 0;
4188 errln("%s break monkey test error. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
4189 name
, (expectedBreaks
[i
]? "break expected but not found" : "break found but not expected"),
4190 errorType
, seed
, i
, charErrorTxt
);
4201 // TestDebug - A place-holder test for debugging purposes.
4202 // For putting in fragments of other tests that can be invoked
4203 // for tracing without a lot of unwanted extra stuff happening.
// The fragment currently in place creates a sentence break iterator for the
// default locale, prints the result of isBoundary(8), and then prints pos
// and ruleStatus while stepping backwards with previous() until
// BreakIterator::DONE is returned.
4205 void RBBITest::TestDebug(void) {
4207 UErrorCode status
= U_ZERO_ERROR
;
// Only the last of the three alternative initializers below is active.
4211 RuleBasedBreakIterator
* bi
=
4212 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4213 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4214 (RuleBasedBreakIterator
*)BreakIterator::createSentenceInstance(Locale::getDefault(), status
);
// The backslashes are doubled, so s is constructed from the escape
// sequences as literal text.
4215 UnicodeString
s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4216 // UnicodeString s("Aaa. Bcd");
4219 UBool r
= bi
->isBoundary(8);
4220 printf("%s", r
?"true":"false");
4224 // ruleStatus = bi->getRuleStatus();
4225 printf("%d\t%d\n", pos
, ruleStatus
);
4226 pos
= bi
->previous();
4227 } while (pos
!= BreakIterator::DONE
);
4231 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */