1 /********************************************************************
3 * Copyright (c) 1999-2004, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6 /************************************************************************
7 * Date Name Description
8 * 12/15/99 Madhu Creation.
9 * 01/12/2000 Madhu Updated for changed API and added new tests
10 ************************************************************************/
12 #include "unicode/utypes.h"
14 #if !UCONFIG_NO_BREAK_ITERATION
16 #include "unicode/utypes.h"
17 #include "unicode/brkiter.h"
18 #include "unicode/rbbi.h"
19 #include "unicode/uchar.h"
20 #include "unicode/utf16.h"
21 #include "unicode/ucnv.h"
22 #include "unicode/schriter.h"
23 #include "unicode/uniset.h"
24 #include "unicode/regex.h" // TODO: make conditional on regexp being built.
25 #include "unicode/ustring.h"
38 //---------------------------------------------------------------------------
40 // class BITestData Holds a set of Break iterator test data and results
42 // - the string data to be broken
43 // - a vector of the expected break positions.
44 // - a vector of source line numbers for the data,
45 // (to help see where errors occurred.)
46 // - The expected break tag values.
47 // - Vectors of actual break positions and tag values.
48 // - Functions for comparing actual with expected and
51 //----------------------------------------------------------------------------
54 UnicodeString fDataToBreak
;
55 UVector fExpectedBreakPositions
;
56 UVector fExpectedTags
;
58 UVector fActualBreakPositions
; // Test Results.
61 BITestData(UErrorCode
&status
);
62 void addDataChunk(const char *data
, int32_t tag
, int32_t lineNum
, UErrorCode status
);
63 void checkResults(const char *heading
, RBBITest
*test
);
64 void err(const char *heading
, RBBITest
*test
, int32_t expectedIdx
, int32_t actualIdx
);
71 BITestData::BITestData(UErrorCode
&status
)
72 : fExpectedBreakPositions(status
), fExpectedTags(status
), fLineNum(status
), fActualBreakPositions(status
),
78 // addDataChunk. Add a section (non-breaking) piece of data to the test data.
79 // The macro form collects the line number, which is helpful
80 // when tracking down failures.
82 // A null data item is inserted at the start of each test's data
83 // to put the starting zero into the data list. The position saved for
84 // each non-null item is its ending position.
86 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status);
// Appends one chunk of test text and records the results expected for it.
//   data    - chunk text; converted with CharsToUnicodeString() before being
//             appended to fDataToBreak.
//   tag     - expected tag value for the break at the end of the chunk,
//             stored in fExpectedTags.
//   lineNum - source line of the chunk (ADD_DATACHUNK passes __LINE__),
//             stored in fLineNum.
//   status  - received by value, so any change made to it inside this
//             function is not seen by the caller.
87 void BITestData::addDataChunk(const char *data
, int32_t tag
, int32_t lineNum
, UErrorCode status
) {
// Do nothing when the incoming status already reports a failure.
88 if (U_FAILURE(status
)) {return;}
90 fDataToBreak
.append(CharsToUnicodeString(data
));
// The expected break position is the text length after the append,
// i.e. the ending offset of this chunk.
92 fExpectedBreakPositions
.addElement(fDataToBreak
.length(), status
);
93 fExpectedTags
.addElement(tag
, status
);
94 fLineNum
.addElement(lineNum
, status
);
99 // checkResults. Compare the actual and expected break positions, report any differences.
// Walks fExpectedBreakPositions and fActualBreakPositions side by side and
// reports each disagreement in position or in tag.
//   heading - text placed at the front of every reported message.
//   test    - the RBBITest whose errln() receives the messages.
101 void BITestData::checkResults(const char *heading
, RBBITest
*test
) {
102 int32_t expectedIndex
= 0;
103 int32_t actualIndex
= 0;
106 // If we've run through both the expected and actual results vectors, we're done.
107 // break out of the loop.
108 if (expectedIndex
>= fExpectedBreakPositions
.size() &&
109 actualIndex
>= fActualBreakPositions
.size()) {
// The expected list has run out: report using its last entry (expectedIndex-1)
// together with the current actual entry.
114 if (expectedIndex
>= fExpectedBreakPositions
.size()) {
115 err(heading
, test
, expectedIndex
-1, actualIndex
);
// The actual list has run out: report using its last entry (actualIndex-1)
// together with the current expected entry.
120 if (actualIndex
>= fActualBreakPositions
.size()) {
121 err(heading
, test
, expectedIndex
, actualIndex
-1);
// Both lists still have entries: compare the break positions.
126 if (fActualBreakPositions
.elementAti(actualIndex
) != fExpectedBreakPositions
.elementAti(expectedIndex
)) {
127 err(heading
, test
, expectedIndex
, actualIndex
);
128 // Try to resync the positions of the indices, to avoid a rash of spurious errors.
129 if (fActualBreakPositions
.elementAti(actualIndex
) < fExpectedBreakPositions
.elementAti(expectedIndex
)) {
// Compare the tag recorded for the actual break with the expected tag.
137 if (fActualTags
.elementAti(actualIndex
) != fExpectedTags
.elementAti(expectedIndex
)) {
// NOTE(review): the line number below is read with elementAt(), whereas every
// other read in this file uses elementAti(); confirm the value matches "%d".
138 test
->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",
139 heading
, fLineNum
.elementAt(expectedIndex
),
140 fExpectedTags
.elementAti(expectedIndex
), fActualTags
.elementAti(actualIndex
));
149 // err - An error was found. Report it, along with information about where the
150 // incorrectly broken test data appeared in the source file.
// Reports a single break-position mismatch through test->errln().
//   heading     - text placed at the front of the message.
//   test        - the RBBITest whose errln() receives the message.
//   expectedIdx - index into fExpectedBreakPositions (and fLineNum).
//   actualIdx   - index into fActualBreakPositions.
152 void BITestData::err(const char *heading
, RBBITest
*test
, int32_t expectedIdx
, int32_t actualIdx
)
154 int32_t expected
= fExpectedBreakPositions
.elementAti(expectedIdx
);
155 int32_t actual
= fActualBreakPositions
.elementAti(actualIdx
);
// Source line recorded for the expected item; used in both messages below.
157 int32_t line
= fLineNum
.elementAti(expectedIdx
);
158 if (expectedIdx
> 0) {
159 // The line numbers are off by one because a premature break occurs somewhere
160 // within the previous item, rather than at the start of the current (expected) item.
161 // We want to report the offset of the unexpected break from the start of
162 // this previous item.
// "o" is declared on a line not visible here; it becomes the offset of the
// actual break measured from the previous expected break position.
163 o
= actual
- fExpectedBreakPositions
.elementAti(expectedIdx
-1);
// A break reported before the expected position is an unexpected break.
165 if (actual
< expected
) {
166 test
->errln("%s unexpected break at offset %d in test item from line %d", heading
, o
, line
);
// Presumably the alternative branch (actual is not before expected) — the
// branch keyword is on a line not visible here.
168 test
->errln("%s Failed to find break at end of item from line %d", heading
, line
);
// Empties the actual-results vectors (break positions and tags); the expected
// data is not touched by the statements shown here.
173 void BITestData::clearResults() {
174 fActualBreakPositions
.removeAllElements();
175 fActualTags
.removeAllElements();
179 //-----------------------------------------------------------------------------------
181 // Canned Test Characters
183 //-----------------------------------------------------------------------------------
185 static const UChar cannedTestArray
[] = {
186 0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,
187 0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,
188 0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,
189 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,
190 0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,
191 0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,
192 0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,
193 0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000
196 static UnicodeString
* cannedTestChars
= 0;
198 #define halfNA "\\u0928\\u094d\\u200d"
199 #define halfSA "\\u0938\\u094d\\u200d"
200 #define halfCHA "\\u091a\\u094d\\u200d"
201 #define halfKA "\\u0915\\u094d\\u200d"
202 #define deadTA "\\u0924\\u094d"
204 //--------------------------------------------------------------------------------------
206 // RBBITest constructor and destructor
208 //--------------------------------------------------------------------------------------
// Builds the shared canned-character string: a heap-allocated UnicodeString
// holding U+0000 followed by the contents of cannedTestArray.
// cannedTestChars is a file-level static pointer, so each construction
// assigns a new string to the same pointer.
210 RBBITest::RBBITest() {
211 UnicodeString
temp(cannedTestArray
);
212 cannedTestChars
= new UnicodeString();
// Append U+0000 explicitly before the canned array contents.
213 *cannedTestChars
+= (UChar
)0x0000;
214 *cannedTestChars
+= temp
;
// Frees the canned-character string allocated by the constructor.
// NOTE(review): the pointer is not reset in the lines visible here — confirm
// that only one RBBITest instance is alive at a time.
218 RBBITest::~RBBITest() {
219 delete cannedTestChars
;
223 static const int T_NUMBER
= 100;
224 static const int T_LETTER
= 200;
225 static const int T_H_OR_K
= 300;
226 static const int T_IDEO
= 400;
233 //--------------------------------------------------------------------
234 //Testing the BreakIterator for devanagari script
235 //--------------------------------------------------------------------
237 #define deadRA "\\u0930\\u094d" /*deadform RA = devanagari RA + virama*/
238 #define deadPHA "\\u092b\\u094d" /*deadform PHA = devanagari PHA + virama*/
239 #define deadTTHA "\\u0920\\u094d"
240 #define deadPA "\\u092a\\u094d"
241 #define deadSA "\\u0938\\u094d"
242 #define visarga "\\u0903" /*devanagari visarga looks like a english colon*/
249 //-----------------------------------------------------------------------------------
251 // Test for status {tag} return value from break rules.
252 // TODO: a more thorough test.
254 //-----------------------------------------------------------------------------------
// Builds a RuleBasedBreakIterator from an inline rule string, iterates over a
// fixed test string, and checks each break position and the value returned by
// getRuleStatus() against the parallel tables bounds1 / brkStatus.
255 void RBBITest::TestStatusReturn() {
256 UnicodeString rulesString1
= "$Letters = [:L:];\n"
257 "$Numbers = [:N:];\n"
260 "Help\\ {4}/me\\!;\n"
261 "[^$Letters $Numbers];\n"
263 UnicodeString testString1
= "abc123..abc Help me Help me!";
264 // 01234567890123456789012345678
// Expected break offsets into testString1; the list ends with -1.
265 int32_t bounds1
[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
// Expected getRuleStatus() value for the break at the same index in bounds1.
266 int32_t brkStatus
[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
268 UErrorCode status
=U_ZERO_ERROR
;
269 UParseError parseError
;
271 RuleBasedBreakIterator
*bi
= new RuleBasedBreakIterator(rulesString1
, parseError
, status
);
272 if(U_FAILURE(status
)) {
273 errln("FAIL : in construction");
277 bi
->setText(testString1
);
// "pos" and "i" are declared on lines not visible here; "i" indexes both tables.
278 for (pos
=bi
->first(); pos
!= BreakIterator::DONE
; pos
=bi
->next()) {
279 if (pos
!= bounds1
[i
]) {
280 errln("FAIL: expected break at %d, got %d\n", bounds1
[i
], pos
);
284 int tag
= bi
->getRuleStatus();
285 if (tag
!= brkStatus
[i
]) {
286 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos
, brkStatus
[i
], tag
);
296 static void printStringBreaks(UnicodeString ustr
, int expected
[],
299 UErrorCode status
= U_ZERO_ERROR
;
301 printf("code alpha extend alphanum type line name\n");
303 for (j
= 0; j
< ustr
.length(); j
++) {
304 if (expectedcount
> 0) {
306 for (k
= 0; k
< expectedcount
; k
++) {
307 if (j
== expected
[k
]) {
308 printf("------------------------------------------------ %d\n",
313 UChar32 c
= ustr
.char32At(j
);
317 u_charName(c
, U_UNICODE_CHAR_NAME
, name
, 100, &status
);
318 printf("%7x %5d %6d %8d %4s %4s %s\n", (int)c
,
320 u_hasBinaryProperty(c
, UCHAR_GRAPHEME_EXTEND
),
322 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY
,
324 U_SHORT_PROPERTY_NAME
),
325 u_getPropertyValueName(UCHAR_LINE_BREAK
,
326 u_getIntPropertyValue(c
,
328 U_SHORT_PROPERTY_NAME
),
// Line-break test for Thai text: each ADD_DATACHUNK call adds one piece of text
// expected to be unbroken (expected tag 0), then a line break iterator created
// for Locale("th") is run through generalIteratorTest() against that data.
333 void RBBITest::TestThaiLineBreak() {
334 UErrorCode status
= U_ZERO_ERROR
;
335 BITestData
thaiLineSelection(status
);
337 // \u0e2f-- the Thai paiyannoi character-- isn't a letter. It's a symbol that
338 // represents elided letters at the end of a long word. It should be bound to
339 // the end of the word and not treated as an independent punctuation mark.
342 ADD_DATACHUNK(thaiLineSelection
, NULL
, 0, status
); // Break at start of data
343 ADD_DATACHUNK(thaiLineSelection
, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status
);
344 ADD_DATACHUNK(thaiLineSelection
, "\\u0e08\\u0e30", 0, status
);
345 ADD_DATACHUNK(thaiLineSelection
, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status
);
346 ADD_DATACHUNK(thaiLineSelection
, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status
);
347 // ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status);
348 // ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
349 ADD_DATACHUNK(thaiLineSelection
, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status
);
350 // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us
351 ADD_DATACHUNK(thaiLineSelection
, "\\u0e2d\\u0e2d\\u0e01", 0, status
);
352 ADD_DATACHUNK(thaiLineSelection
, "\\u0e21\\u0e32", 0, status
);
353 ADD_DATACHUNK(thaiLineSelection
, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status
);
354 ADD_DATACHUNK(thaiLineSelection
, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status
);
355 ADD_DATACHUNK(thaiLineSelection
, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status
);
356 ADD_DATACHUNK(thaiLineSelection
, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status
);
358 // the one time where the paiyannoi occurs somewhere other than at the end
359 // of a word is in the Thai abbreviation for "etc.", which both begins and
360 // ends with a paiyannoi
361 ADD_DATACHUNK(thaiLineSelection
, "\\u0e2f\\u0e25\\u0e2f", 0, status
);
362 ADD_DATACHUNK(thaiLineSelection
, "\\u0e17\\u0e35\\u0e48", 0, status
);
363 ADD_DATACHUNK(thaiLineSelection
, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status
);
// createLineInstance() returns a BreakIterator*; it is cast to
// RuleBasedBreakIterator* because generalIteratorTest() takes that type.
365 RuleBasedBreakIterator
* e
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(
366 Locale("th"), status
);
367 if (U_FAILURE(status
))
369 errln("Failed to create the BreakIterator for Thai locale in TestThaiLineBreak.\n");
373 generalIteratorTest(*e
, thaiLineSelection
);
// Line-break test for Thai text mixed with digits, Latin text, punctuation and
// symbols. The chunks added with ADD_DATACHUNK are checked with a line break
// iterator created for Locale("th") via generalIteratorTest().
379 void RBBITest::TestMixedThaiLineBreak()
381 UErrorCode status
= U_ZERO_ERROR
;
382 BITestData
thaiLineSelection(status
);
384 ADD_DATACHUNK(thaiLineSelection
, NULL
, 0, status
); // Break at start of data
// NOTE(review): thaiLineSelection is declared above as a BITestData object, yet
// two statements in the following region use "thaiLineSelection->addElement(...)".
// Check whether this region (through the "\\u0e15\\u0e31\\u0e27" chunk) is meant
// to be disabled; any enclosing comment delimiters are not visible here.
386 // Arabic numerals should always be separated from surrounding Thai text
388 ADD_DATACHUNK(thaiLineSelection, "\\u0e04\\u0e48\\u0e32", 0, status);
389 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e07\\u0e34\\u0e19", 0, status);
390 ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17", 0, status);
391 ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e15\\u0e30", 0, status);
392 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e31\\u0e1a", 0, status);
393 thaiLineSelection->addElement("39");
394 ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17 ", 0, status);
396 // words in non-Thai scripts should always be separated from surrounding Thai text
397 ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e14", 0, status);
398 ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e2d\\u0e1a", 0, status);
399 thaiLineSelection->addElement("Java");
400 ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e19", 0, status);
401 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e04\\u0e23\\u0e37\\u0e48\\u0e2d\\u0e07", 0, status);
402 ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e2d\\u0e1a\\u0e35\\u0e40\\u0e2d\\u0e47\\u0e21 ", 0, status);
404 // Thai numerals should always be separated from the text surrounding them
405 ADD_DATACHUNK(thaiLineSelection, "\\u0e04\\u0e48\\u0e32", 0, status);
406 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e07\\u0e34\\u0e19", 0, status);
407 ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17", 0, status);
408 ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e15\\u0e30", 0, status);
409 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e31\\u0e1a", 0, status);
410 ADD_DATACHUNK(thaiLineSelection, "\\u0e53\\u0e59", 0, status);
411 ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17 ", 0, status);
413 // Thai text should interact correctly with punctuation and symbols
414 ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e2d\\u0e1a\\u0e35\\u0e40\\u0e2d\\u0e47\\u0e21", 0, status);
415 // ADD_DATACHUNK(thaiLineSelection, "(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28", 0, status);
416 // ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e17\\u0e22)", 0, status);
417 ADD_DATACHUNK(thaiLineSelection, "(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28\\u0e44\\u0e17\\u0e22)", 0, status);
418 // I believe the commented-out reading above to be the correct one, but this is what passes with our current dictionary
419 ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e33\\u0e01\\u0e31\\u0e14", 0, status);
420 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e1b\\u0e34\\u0e14", 0, status);
421 ADD_DATACHUNK(thaiLineSelection, "\\u0e15\\u0e31\\u0e27\"", 0, status);
424 // The Unicode Linebreak TR says do not break before or after quotes.
425 // So this test is changed to not break around the quote.
426 // TODO: should Thai break around the quotes, like the original behavior here?
427 // ADD_DATACHUNK(thaiLineSelection, "\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\"", 0, status);
428 // ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e38\\u0e48\\u0e19", 0, status);
429 ADD_DATACHUNK(thaiLineSelection
, "\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\""
430 "\\u0e23\\u0e38\\u0e48\\u0e19", 0, status
);
432 ADD_DATACHUNK(thaiLineSelection
, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status
);
433 ADD_DATACHUNK(thaiLineSelection
, "\\u0e40\\u0e14\\u0e37\\u0e2d\\u0e19\\u0e21\\u0e34.", 0, status
);
434 ADD_DATACHUNK(thaiLineSelection
, "\\u0e22.", 0, status
);
435 ADD_DATACHUNK(thaiLineSelection
, "\\u0e19\\u0e35\\u0e49", 0, status
);
436 ADD_DATACHUNK(thaiLineSelection
, "\\u0e23\\u0e32\\u0e04\\u0e32", 0, status
);
437 ADD_DATACHUNK(thaiLineSelection
, "$200", 0, status
);
438 ADD_DATACHUNK(thaiLineSelection
, "\\u0e40\\u0e17\\u0e48\\u0e32", 0, status
);
439 ADD_DATACHUNK(thaiLineSelection
, "\\u0e19\\u0e31\\u0e49\\u0e19 ", 0, status
);
440 ADD_DATACHUNK(thaiLineSelection
, "(\"\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\").", 0, status
);
// Create the Thai line break iterator and run the general test sequence on it.
442 RuleBasedBreakIterator
* e
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale("th"), status
);
443 if (U_FAILURE(status
))
445 errln("Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak.\n");
450 generalIteratorTest(*e
, thaiLineSelection
);
// Line-break test for the Thai maiyamok character (\u0e46): the first two
// chunks end with it, so it is expected to stay attached to the preceding word.
// Uses a line break iterator created for Locale("th").
455 void RBBITest::TestMaiyamok()
457 UErrorCode status
= U_ZERO_ERROR
;
458 BITestData
thaiLineSelection(status
);
459 ADD_DATACHUNK(thaiLineSelection
, NULL
, 0, status
); // Break at start of data
460 // the Thai maiyamok character is a shorthand symbol that means "repeat the previous
461 // word". Instead of appearing as a word unto itself, however, it's kept together
462 // with the word before it
463 ADD_DATACHUNK(thaiLineSelection
, "\\u0e44\\u0e1b\\u0e46", 0, status
);
464 ADD_DATACHUNK(thaiLineSelection
, "\\u0e21\\u0e32\\u0e46", 0, status
);
465 ADD_DATACHUNK(thaiLineSelection
, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status
);
466 ADD_DATACHUNK(thaiLineSelection
, "\\u0e01\\u0e23\\u0e38\\u0e07\\u0e40\\u0e17\\u0e1e", 0, status
);
467 ADD_DATACHUNK(thaiLineSelection
, "\\u0e41\\u0e25\\u0e30", 0, status
);
468 ADD_DATACHUNK(thaiLineSelection
, "\\u0e40\\u0e03\\u0e35\\u0e22\\u0e07", 0, status
);
469 ADD_DATACHUNK(thaiLineSelection
, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status
);
// Create the Thai line break iterator and run the general test sequence on it.
471 RuleBasedBreakIterator
* e
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(
472 Locale("th"), status
);
474 if (U_FAILURE(status
))
476 errln("Failed to create the BreakIterator for Thai locale in TestMaiyamok.\n");
479 generalIteratorTest(*e
, thaiLineSelection
);
// Word-break test for Thai text using a word break iterator created for
// Locale("th"). The trailing "//N" on each chunk is the cumulative text length,
// i.e. the expected break offset after that chunk. Where the existing comments
// say so, the data encodes what the dictionary currently produces rather than
// the preferred segmentation (kept commented out next to it).
483 void RBBITest::TestThaiWordBreak() {
484 UErrorCode status
= U_ZERO_ERROR
;
485 BITestData
thaiWordSelection(status
);
487 ADD_DATACHUNK(thaiWordSelection
, NULL
, 0, status
); // Break at start of data
488 ADD_DATACHUNK(thaiWordSelection
, "\\u0E1A\\u0E17", 0, status
); //2
489 ADD_DATACHUNK(thaiWordSelection
, "\\u0E17\\u0E35\\u0E48", 0, status
); //5
490 ADD_DATACHUNK(thaiWordSelection
, "\\u0E51", 0, status
); //6
491 ADD_DATACHUNK(thaiWordSelection
, "\\u0E1E\\u0E32\\u0E22\\u0E38", 0, status
); //10
492 ADD_DATACHUNK(thaiWordSelection
, "\\u0E44\\u0E0B\\u0E42\\u0E04\\u0E25\\u0E19", 0, status
); //16
493 ADD_DATACHUNK(thaiWordSelection
, "\\u000D\\u000A", 0, status
); //18
495 // This is the correct result
496 //ADD_DATACHUNK(thaiWordSelection, "\\u0E42\\u0E14\\u0E42\\u0E23\\u0E18\\u0E35", 0, status); //24
497 //ADD_DATACHUNK(thaiWordSelection, "\\u0E2D\\u0E32\\u0E28\\u0E31\\u0E22", 0, status); //29
499 // and this is what the dictionary does...
500 ADD_DATACHUNK(thaiWordSelection
, "\\u0E42\\u0E14", 0, status
); // 20
501 ADD_DATACHUNK(thaiWordSelection
, "\\u0E42\\u0E23\\u0E18\\u0E35\\u0E2D\\u0E32\\u0E28\\u0E31\\u0E22", 0, status
); //29
503 ADD_DATACHUNK(thaiWordSelection
, "\\u0E2D\\u0E22\\u0E39\\u0E48", 0, status
); //33
505 // This is the correct result
506 //ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E48\\u0E32\\u0E21", 0, status); //37
507 //ADD_DATACHUNK(thaiWordSelection, "\\u0E01\\u0E25\\u0E32\\u0E07", 0, status); //41
509 // and this is what the dictionary does
510 ADD_DATACHUNK(thaiWordSelection
, "\\u0E17\\u0E48\\u0E32\\u0E21\\u0E01\\u0E25\\u0E32\\u0E07", 0, status
); //41
512 ADD_DATACHUNK(thaiWordSelection
, "\\u0E17\\u0E38\\u0E48\\u0E07", 0, status
); //45
513 ADD_DATACHUNK(thaiWordSelection
, "\\u0E43\\u0E2B\\u0E0D\\u0E48", 0, status
); //49
514 ADD_DATACHUNK(thaiWordSelection
, "\\u0E43\\u0E19", 0, status
); //51
516 // This is the correct result
517 //ADD_DATACHUNK(thaiWordSelection, "\\u0E41\\u0E04\\u0E19\\u0E0B\\u0E31\\u0E2A", 0, status); //57
518 //ADD_DATACHUNK(thaiWordSelection, "\\u0E01\\u0E31\\u0E1A", 0, status); //60
520 // and this is what the dictionary does
521 ADD_DATACHUNK(thaiWordSelection
, "\\u0E41\\u0E04\\u0E19", 0, status
); // 54
522 ADD_DATACHUNK(thaiWordSelection
, "\\u0E0B\\u0E31\\u0E2A\\u0E01\\u0E31\\u0E1A", 0, status
); //60
524 ADD_DATACHUNK(thaiWordSelection
, "\\u0E25\\u0E38\\u0E07", 0, status
); //63
526 // This is the correct result
527 //ADD_DATACHUNK(thaiWordSelection, "\\u0E40\\u0E2E\\u0E19\\u0E23\\u0E35", 0, status); //68
528 //ADD_DATACHUNK(thaiWordSelection, "\\u0E0A\\u0E32\\u0E27", 0, status); //71
529 //ADD_DATACHUNK(thaiWordSelection, "\\u0E44\\u0E23\\u0E48", 0, status); //74
530 //ADD_DATACHUNK(thaiWordSelection, "\\u0E41\\u0E25\\u0E30", 0, status); //77
532 // and this is what the dictionary does
533 ADD_DATACHUNK(thaiWordSelection
, "\\u0E40\\u0E2E", 0, status
); // 65
534 ADD_DATACHUNK(thaiWordSelection
, "\\u0E19\\u0E23\\u0E35\\u0E0A\\u0E32\\u0E27\\u0E44\\u0E23\\u0E48\\u0E41\\u0E25\\u0E30", 0, status
); //77
// Create the Thai word break iterator and run the general test sequence on it.
536 RuleBasedBreakIterator
* e
= (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(
537 Locale("th"), status
);
538 if (U_FAILURE(status
))
540 errln("Failed to create the BreakIterator for Thai locale in TestThaiWordBreak.\n");
544 generalIteratorTest(*e
, thaiWordSelection
);
// Regression test for bug 3818: with a Thai word break iterator over a string
// made of one four-code-unit word repeated four times, following(1) and
// following(0) are both expected to return 4.
549 void RBBITest::TestBug3818() {
550 UErrorCode status
= U_ZERO_ERROR
;
552 // Four Thai words...
// The same four code units repeated four times, followed by a 0 terminator.
553 static const UChar thaiWordData
[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
554 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
555 UnicodeString
thaiStr(thaiWordData
);
557 RuleBasedBreakIterator
* bi
=
558 (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(Locale("th"), status
);
// Report both a failing status and a null iterator.
559 if (U_FAILURE(status
) || bi
== NULL
) {
560 errln("Fail at file %s, line %d, status = %s", __FILE__
, __LINE__
, u_errorName(status
));
563 bi
->setText(thaiStr
);
// Starting from inside the first word, the next boundary should be offset 4.
565 int32_t startOfSecondWord
= bi
->following(1);
566 if (startOfSecondWord
!= 4) {
567 errln("Fail at file %s, line %d expected start of word at 4, got %d",
568 __FILE__
, __LINE__
, startOfSecondWord
);
// Starting from offset 0, the next boundary should also be offset 4.
570 startOfSecondWord
= bi
->following(0);
571 if (startOfSecondWord
!= 4) {
572 errln("Fail at file %s, line %d expected start of word at 4, got %d",
573 __FILE__
, __LINE__
, startOfSecondWord
);
// Word-break test for Japanese text using a word break iterator created for
// Locale("ja"). Unlike the Thai tests, chunks here carry non-zero expected tags
// (400 and 300).
// NOTE(review): 400 and 300 equal the file-level constants T_IDEO and T_H_OR_K;
// presumably those are the intended meanings — confirm.
579 void RBBITest::TestJapaneseWordBreak() {
580 UErrorCode status
= U_ZERO_ERROR
;
581 BITestData
japaneseWordSelection(status
);
583 ADD_DATACHUNK(japaneseWordSelection
, NULL
, 0, status
); // Break at start of data
584 ADD_DATACHUNK(japaneseWordSelection
, "\\u4ECA\\u65E5", 400, status
); //2
585 ADD_DATACHUNK(japaneseWordSelection
, "\\u306F\\u3044\\u3044", 300, status
); //5
586 ADD_DATACHUNK(japaneseWordSelection
, "\\u5929\\u6C17", 400, status
); //7
587 ADD_DATACHUNK(japaneseWordSelection
, "\\u3067\\u3059\\u306D", 300, status
); //10
588 ADD_DATACHUNK(japaneseWordSelection
, "\\u3002", 0, status
); //11
589 ADD_DATACHUNK(japaneseWordSelection
, "\\u000D\\u000A", 0, status
); //12
// Create the Japanese word break iterator and run the general test sequence on it.
591 RuleBasedBreakIterator
* e
= (RuleBasedBreakIterator
*)BreakIterator::createWordInstance(
592 Locale("ja"), status
);
593 if (U_FAILURE(status
))
595 errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");
599 generalIteratorTest(*e
, japaneseWordSelection
);
603 //---------------------------------------------
605 //---------------------------------------------
// Test dispatcher: maps a test index to a test name and, when exec is true,
// runs that test.
//   index  - which test to select (cases 0..19 are visible here).
//   exec   - when true the selected test is executed; name is set either way.
//   name   - out-parameter receiving the test name; set to "" for an unknown
//            index, which ends the caller's enumeration ("needed to end loop").
//   params - not referenced in the lines visible here.
607 void RBBITest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* params
)
609 if (exec
) logln("TestSuite RuleBasedBreakIterator: ");
612 case 0: name
= "TestBug4153072";
613 if(exec
) TestBug4153072(); break;
614 case 1: name
= "TestJapaneseLineBreak";
615 if(exec
) TestJapaneseLineBreak(); break;
616 case 2: name
= "TestStatusReturn";
617 if(exec
) TestStatusReturn(); break;
619 case 3: name
= "TestLineBreakData";
620 if(exec
) TestLineBreakData(); break;
621 case 4: name
= "TestEmptyString";
622 if(exec
) TestEmptyString(); break;
624 case 5: name
= "TestGetAvailableLocales";
625 if(exec
) TestGetAvailableLocales(); break;
627 case 6: name
= "TestGetDisplayName";
628 if(exec
) TestGetDisplayName(); break;
630 case 7: name
= "TestEndBehaviour";
631 if(exec
) TestEndBehaviour(); break;
632 case 8: name
= "TestMixedThaiLineBreak";
633 if(exec
) TestMixedThaiLineBreak(); break;
634 case 9: name
= "TestThaiWordBreak";
635 if(exec
) TestThaiWordBreak(); break;
636 case 10: name
= "TestThaiLineBreak";
637 if(exec
) TestThaiLineBreak(); break;
638 case 11: name
= "TestMaiyamok";
639 if(exec
) TestMaiyamok(); break;
640 case 12: name
= "TestWordBreaks";
641 if(exec
) TestWordBreaks(); break;
642 case 13: name
= "TestWordBoundary";
643 if(exec
) TestWordBoundary(); break;
644 case 14: name
= "TestLineBreaks";
645 if(exec
) TestLineBreaks(); break;
646 case 15: name
= "TestSentBreaks";
647 if(exec
) TestSentBreaks(); break;
648 case 16: name
= "TestExtended";
649 if(exec
) TestExtended(); break;
// TestMonkey is conditional on regular-expression support; the skip message
// below is presumably the branch taken when UCONFIG_NO_REGULAR_EXPRESSIONS is set.
650 case 17: name
= "TestMonkey";
652 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
655 logln("skipping TestMonkey (UCONFIG_NO_REGULAR_EXPRESSIONS)");
659 case 18: name
= "TestBug3818";
660 if(exec
) TestBug3818(); break;
661 case 19: name
= "TestJapaneseWordBreak";
662 if(exec
) TestJapaneseWordBreak(); break;
664 default: name
= ""; break; //needed to end loop
669 //----------------------------------------------------------------------------
671 // generalIteratorTest Given a break iterator and a set of test data,
672 // Run the tests and report the results.
674 //----------------------------------------------------------------------------
// Runs the standard sequence of iterator checks against one data set.
//   bi - the break iterator under test; its text is set to td.fDataToBreak.
//   td - the test data holding the text, the expected results and the vectors
//        that receive the actual results.
675 void RBBITest::generalIteratorTest(RuleBasedBreakIterator
& bi
, BITestData
&td
)
678 bi
.setText(td
.fDataToBreak
);
// Each helper below exercises a different way of reaching the boundaries.
680 testFirstAndNext(bi
, td
);
682 testLastAndPrevious(bi
, td
);
684 testFollowing(bi
, td
);
685 testPreceding(bi
, td
);
686 testIsBoundary(bi
, td
);
687 doMultipleSelectionTest(bi
, td
);
692 // testFirstAndNext. Run the iterator forwards in the obvious first(), next()
// Collects boundaries by calling first() and then next() until DONE, storing
// each position in td.fActualBreakPositions and each getRuleStatus() value in
// td.fActualTags, then compares them with the expected data via checkResults().
// "p" and "tag" are declared on lines not visible here.
695 void RBBITest::testFirstAndNext(RuleBasedBreakIterator
& bi
, BITestData
&td
)
697 UErrorCode status
= U_ZERO_ERROR
;
702 logln("Test first and next");
703 bi
.setText(td
.fDataToBreak
);
706 for (p
=bi
.first(); p
!=RuleBasedBreakIterator::DONE
; p
=bi
.next()) {
707 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
// Record the rule status reported for the boundary just reached.
708 tag
= bi
.getRuleStatus();
709 td
.fActualTags
.addElement(tag
, status
);
711 // If the iterator is not making forward progress, stop.
712 // No need to raise an error here, it'll be detected in the normal check of results.
717 td
.checkResults("testFirstAndNext", this);
722 // TestLastAndPrevious. Run the iterator backwards, starting with last().
// Collects boundaries by calling last() and then previous() until DONE. Each
// position and its getRuleStatus() value is inserted at index 0 of the result
// vectors, so the collected results end up in ascending order before being
// compared with the expected data via checkResults().
// "p" and "tag" are declared on lines not visible here.
724 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator
& bi
, BITestData
&td
)
726 UErrorCode status
= U_ZERO_ERROR
;
// Starts above any position the iterator can return.
728 int32_t lastP
= 0x7ffffffe;
// The log message names this test; it previously repeated the
// "Test first and next" text used by testFirstAndNext().
731 logln("Test last and previous");
732 bi
.setText(td
.fDataToBreak
);
735 for (p
=bi
.last(); p
!=RuleBasedBreakIterator::DONE
; p
=bi
.previous()) {
736 // Save break position. Insert it at start of vector of results, shoving
737 // already-saved results further towards the end.
738 td
.fActualBreakPositions
.insertElementAt(p
, 0, status
);
739 // bi.previous(); // TODO: Why does this fix things up????
741 tag
= bi
.getRuleStatus();
742 td
.fActualTags
.insertElementAt(tag
, 0, status
);
744 // If the iterator is not making progress, stop.
745 // No need to raise an error here, it'll be detected in the normal check of results.
750 td
.checkResults("testLastAndPrevious", this);
// Collects boundaries while stepping an index i from 0 through
// td.fDataToBreak.length()+1, records each newly reached position and its
// rule status, then compares them with the expected data via checkResults().
// "p", "tag" and "i" are declared on lines not visible here; the call that
// assigns p inside the loop (presumably bi.following(i)) is also not visible.
754 void RBBITest::testFollowing(RuleBasedBreakIterator
& bi
, BITestData
&td
)
756 UErrorCode status
= U_ZERO_ERROR
;
759 int32_t lastP
= -2; // A value that will never be returned as a break position.
760 // cannot be -1; that is returned for DONE.
763 logln("testFollowing():");
764 bi
.setText(td
.fDataToBreak
);
767 // Save the starting point, since we won't get that out of following.
769 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
770 tag
= bi
.getRuleStatus();
771 td
.fActualTags
.addElement(tag
, status
);
773 for (i
= 0; i
<= td
.fDataToBreak
.length()+1; i
++) {
// DONE means there are no further boundaries.
776 if (p
== RuleBasedBreakIterator::DONE
) {
779 // We've reached a new break position. Save it.
780 td
.fActualBreakPositions
.addElement(p
, status
); // Save result.
781 tag
= bi
.getRuleStatus();
782 td
.fActualTags
.addElement(tag
, status
);
786 // The loop normally exits by means of the break in the middle.
787 // Make sure that the index was at the correct position for the break iterator to have
// DONE is expected exactly when i equals the text length.
789 if (i
!= td
.fDataToBreak
.length()) {
790 errln("testFollowing(): iterator returned DONE prematurely.");
793 // Full check of all results.
794 td
.checkResults("testFollowing", this);
// Backwards counterpart of testFollowing(): steps an index i from
// td.fDataToBreak.length() down through -1, inserts each newly reached
// position and its rule status at index 0 of the result vectors, then
// compares them with the expected data via checkResults().
// "p", "tag" and "i" are declared on lines not visible here; the call that
// assigns p inside the loop (presumably bi.preceding(i)) is also not visible.
799 void RBBITest::testPreceding(RuleBasedBreakIterator
& bi
, BITestData
&td
) {
800 UErrorCode status
= U_ZERO_ERROR
;
// Starts above any position the iterator can return.
803 int32_t lastP
= 0x7ffffffe;
806 logln("testPreceding():");
807 bi
.setText(td
.fDataToBreak
);
// Save the starting boundary; how p is obtained is on a line not visible here.
811 td
.fActualBreakPositions
.addElement(p
, status
);
812 tag
= bi
.getRuleStatus();
813 td
.fActualTags
.addElement(tag
, status
);
815 for (i
= td
.fDataToBreak
.length(); i
>=-1; i
--) {
// DONE means there are no earlier boundaries.
818 if (p
== RuleBasedBreakIterator::DONE
) {
821 // We've reached a new break position. Save it.
822 td
.fActualBreakPositions
.insertElementAt(p
, 0, status
);
824 tag
= bi
.getRuleStatus();
825 td
.fActualTags
.insertElementAt(tag
, 0, status
);
828 // The loop normally exits by means of the break in the middle.
829 // Make sure that the index was at the correct position for the break iterator to have
// The condition guarding this report is on a line not visible here.
832 errln("testPreceding(): iterator returned DONE prematurely.");
835 // Full check of all results.
836 td
.checkResults("testPreceding", this);
// Queries isBoundary(i) for every offset i from 0 through the text length
// (inclusive). Each offset reported as a boundary is recorded with its rule
// status, then the results are compared with the expected data via
// checkResults(). "i" and "tag" are declared on lines not visible here.
841 void RBBITest::testIsBoundary(RuleBasedBreakIterator
& bi
, BITestData
&td
) {
842 UErrorCode status
= U_ZERO_ERROR
;
846 logln("testIsBoundary():");
847 bi
.setText(td
.fDataToBreak
);
850 for (i
= 0; i
<= td
.fDataToBreak
.length(); i
++) {
851 if (bi
.isBoundary(i
)) {
852 td
.fActualBreakPositions
.addElement(i
, status
); // Save result.
// getRuleStatus() is read right after isBoundary() returned true for i.
853 tag
= bi
.getRuleStatus();
854 td
.fActualTags
.addElement(tag
, status
);
857 td
.checkResults("testIsBoundary: ", this);
// Checks that next(n) on a clone agrees with stepping the original iterator
// one boundary at a time, first forwards from first() and then backwards from
// last(). It also checks operator== / operator!= between the clone and the
// original. "count" and "testOffset" are declared on lines not visible here.
//   iterator - the break iterator under test; its text is set to td.fDataToBreak.
//   td       - supplies the text; its expected/actual vectors are not used in
//              the lines visible here.
862 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator
& iterator
, BITestData
&td
)
864 iterator
.setText(td
.fDataToBreak
);
866 RuleBasedBreakIterator
* testIterator
=(RuleBasedBreakIterator
*)iterator
.clone();
867 int32_t offset
= iterator
.first();
871 logln("doMultipleSelectionTest text of length: %d", td
.fDataToBreak
.length());
// A fresh clone must compare equal to the iterator it was cloned from.
873 if (*testIterator
!= iterator
)
874 errln("clone() or operator!= failed: two clones compared unequal");
// Forward pass: restart the clone at first() and jump "count" boundaries at once.
877 testOffset
= testIterator
->first();
878 testOffset
= testIterator
->next(count
);
879 if (offset
!= testOffset
)
880 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count
+ ", next(n) returned " + testOffset
+ " and next() had " + offset
);
882 if (offset
!= RuleBasedBreakIterator::DONE
) {
884 offset
= iterator
.next();
// After the original advances, it must no longer compare equal to the clone.
886 if (offset
!= RuleBasedBreakIterator::DONE
&& *testIterator
== iterator
) {
887 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count
, offset
);
888 if (count
> 10000 || offset
== -1) {
889 errln("operator== failed too many times. Stopping test.");
891 errln("Does (RuleBasedBreakIterator::DONE == -1)?");
897 } while (offset
!= RuleBasedBreakIterator::DONE
);
899 // now do it backwards...
900 offset
= iterator
.last();
904 testOffset
= testIterator
->last();
905 testOffset
= testIterator
->next(count
); // next() with a negative arg is same as previous
906 if (offset
!= testOffset
)
907 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count
+ ", next(n) returned " + testOffset
+ " and next() had " + offset
);
909 if (offset
!= RuleBasedBreakIterator::DONE
) {
911 offset
= iterator
.previous();
913 } while (offset
!= RuleBasedBreakIterator::DONE
);
920 //--------------------------------------------------------------------------------------------
922 // Break Iterator Invariants Tests
924 //--------------------------------------------------------------------------------------------
// TestCharacterInvariants: create a character break iterator for the default
// locale and run both invariant checks (doBreakInvariantTest and
// doOtherInvariantTest) over it.  The characters tested are *cannedTestChars
// followed by nine Hangul Jamo code points (U+1100..1102, U+1160..1162,
// U+11A8..11AA).  An error is logged if the iterator cannot be created.
926 void RBBITest::TestCharacterInvariants()
928 UErrorCode status
= U_ZERO_ERROR
;
929 BreakIterator
*e
= BreakIterator::createCharacterInstance(Locale::getDefault(), status
);
930 if (U_FAILURE(status
))
932 errln("Failed to create the BreakIterator for default locale in TestCharacterInvariants.\n");
935 UnicodeString s
= *cannedTestChars
+ CharsToUnicodeString("\\u1100\\u1101\\u1102\\u1160\\u1161\\u1162\\u11a8\\u11a9\\u11aa");
936 doBreakInvariantTest(*e
, s
);
// s is rebuilt with exactly the same contents before the second check.
937 s
= *cannedTestChars
+ CharsToUnicodeString("\\u1100\\u1101\\u1102\\u1160\\u1161\\u1162\\u11a8\\u11a9\\u11aa");
938 doOtherInvariantTest(*e
, s
);
// TestWordInvariants: create a word break iterator for the default locale and
// run both invariant checks (doBreakInvariantTest and doOtherInvariantTest)
// over it.  The characters tested are *cannedTestChars followed by
// apostrophe, comma, period and the code points U+3041..3043, U+309B, U+309C,
// U+30A1..30A3 and U+4E00..4E02.  An error is logged if the iterator cannot
// be created.
943 void RBBITest::TestWordInvariants()
945 UErrorCode status
= U_ZERO_ERROR
;
946 BreakIterator
*e
= BreakIterator::createWordInstance(Locale::getDefault(), status
);
947 if (U_FAILURE(status
))
949 errln("Failed to create the BreakIterator for default locale in TestWordInvariants.\n");
952 UnicodeString s
= *cannedTestChars
+ CharsToUnicodeString("\',.\\u3041\\u3042\\u3043\\u309b\\u309c\\u30a1\\u30a2\\u30a3\\u4e00\\u4e01\\u4e02");
953 doBreakInvariantTest(*e
, s
);
// s is rebuilt with exactly the same contents before the second check.
954 s
= *cannedTestChars
+ CharsToUnicodeString("\',.\\u3041\\u3042\\u3043\\u309b\\u309c\\u30a1\\u30a2\\u30a3\\u4e00\\u4e01\\u4e02");
955 doOtherInvariantTest(*e
, s
);
// TestSentenceInvariants: create a sentence break iterator for the default
// locale and run doOtherInvariantTest over it.  The characters tested are
// *cannedTestChars followed by period, comma and the code points U+3001,
// U+3002, U+3041..3043 and U+FEFF.  An error is logged if the iterator cannot
// be created.
// NOTE(review): unlike the character and word variants, no call to
// doBreakInvariantTest appears in the lines visible here -- confirm against
// the full file whether that is intentional.
960 void RBBITest::TestSentenceInvariants()
962 UErrorCode status
= U_ZERO_ERROR
;
963 BreakIterator
*e
= BreakIterator::createSentenceInstance(Locale::getDefault(), status
);
964 if (U_FAILURE(status
))
966 errln("Failed to create the BreakIterator for default locale in TestSentenceInvariant.\n");
969 UnicodeString s
= *cannedTestChars
+ CharsToUnicodeString(".,\\u3001\\u3002\\u3041\\u3042\\u3043\\ufeff");
970 doOtherInvariantTest(*e
, s
);
977 void RBBITest::doBreakInvariantTest(BreakIterator
& tb
, UnicodeString
& testChars
)
979 UnicodeString
work("aaa");
980 int32_t errCount
= 0, testCharsLen
= testChars
.length(), breaksLen
;
982 // a break should always occur after CR (unless followed by LF), LF, PS, and LS
983 UnicodeString breaks
= CharsToUnicodeString("\r\n\\u2029\\u2028");
986 breaksLen
= breaks
.length();
987 for (i
= 0; i
< breaksLen
; i
++) {
988 UChar c1
= breaks
[i
];
989 work
.setCharAt(1, c1
);
990 for (j
= 0; j
< testCharsLen
; j
++) {
991 UChar c0
= testChars
[j
];
992 work
.setCharAt(0, c0
);
994 for (k
= 0; k
< testCharsLen
; k
++) {
995 UChar c2
= testChars
[k
];
996 work
.setCharAt(2, c2
);
998 // if a cr is followed by lf, ps, ls or etx, don't do the check (that's
999 // not supposed to work)
1000 if (c1
== '\r' && (c2
== '\n' || c2
== 0x2029
1001 || c2
== 0x2028 || c2
== 0x0003))
1004 if (u_charType(c1
) == U_CONTROL_CHAR
&&
1005 (u_charType(c2
) == U_NON_SPACING_MARK
||
1006 u_charType(c2
) == U_ENCLOSING_MARK
||
1007 u_charType(c2
) == U_COMBINING_SPACING_MARK
)
1009 // Combining marks don't combine with controls.
1010 // TODO: enhance test to verify that the break actually occurs,
1011 // not just ignore the case.
1017 UBool seen2
= FALSE
;
1019 for (l
= tb
.first(); l
!= BreakIterator::DONE
; l
= tb
.next()) {
1026 printStringBreaks(work
, NULL
, 0);
1027 errln("No Break between \\U%04x and \\U%04x", c1
, c2
);
1039 void RBBITest::doOtherInvariantTest(BreakIterator
& tb
, UnicodeString
& testChars
)
1041 UnicodeString
work("a\r\na");
1042 int32_t errCount
= 0, testCharsLen
= testChars
.length();
1046 // a break should never occur between CR and LF
1047 for (i
= 0; i
< testCharsLen
; i
++) {
1048 work
.setCharAt(0, testChars
[i
]);
1049 for (j
= 0; j
< testCharsLen
; j
++) {
1050 work
.setCharAt(3, testChars
[j
]);
1053 for (k
= tb
.first(); k
!= BreakIterator::DONE
; k
= tb
.next())
1055 errln("Break between CR and LF in string U\\%04x U\\%04x U\\%04x U\\%04x",
1056 work
[0], work
[1], work
[2], work
[3]);
1064 // a break should never occur before a non-spacing mark, unless the preceding
1065 // character is CR, LF, PS, or LS
1066 // Or the general category == Control.
1069 for (i
= 0; i
< testCharsLen
; i
++) {
1070 UChar c1
= testChars
[i
];
1071 if (c1
== '\n' || c1
== '\r' || c1
== 0x2029 || c1
== 0x2028 || c1
== 0x0003 ||
1072 u_charType(c1
) == U_CONTROL_CHAR
|| u_charType(c1
) == U_FORMAT_CHAR
) {
1075 work
.setCharAt(1, c1
);
1076 for (j
= 0; j
< testCharsLen
; j
++) {
1077 UChar c2
= testChars
[j
];
1078 type
= u_charType(c2
);
1079 if ((type
!= U_NON_SPACING_MARK
) &&
1080 (type
!= U_ENCLOSING_MARK
)) {
1083 work
.setCharAt(2, c2
);
1086 for (k
= tb
.first(); k
!= BreakIterator::DONE
; k
= tb
.next())
1088 //errln("Break between U+" + UCharToUnicodeString(work[1])
1089 // + " and U+" + UCharToUnicodeString(work[2]));
1090 errln("Unexpected Break between %6x and %6x", c1
, c2
);
1102 //---------------------------------------------
1106 //---------------------------------------------
// TestEmptyString: build a BITestData holding a single empty data chunk (so
// the only expected break is at the start of the data), create a line break
// iterator for the default locale and run generalIteratorTest() on the pair.
// An error is logged if the iterator cannot be created.
// NOTE(review): the local `text` is not referenced in any line visible here.
1107 void RBBITest::TestEmptyString()
1109 UnicodeString text
= "";
1110 UErrorCode status
= U_ZERO_ERROR
;
1112 BITestData
x(status
);
1113 ADD_DATACHUNK(x
, "", 0, status
); // Break at start of data
// createLineInstance() returns a BreakIterator*; it is cast down because
// generalIteratorTest() is called with a RuleBasedBreakIterator.
1114 RuleBasedBreakIterator
* bi
= (RuleBasedBreakIterator
*)BreakIterator::createLineInstance(Locale::getDefault(), status
);
1115 if (U_FAILURE(status
))
1117 errln("Failed to create the BreakIterator for default locale in TestEmptyString.\n");
1120 generalIteratorTest(*bi
, x
);
// TestGetAvailableLocales: fetch the locale list from
// BreakIterator::getAvailableLocales(), report an error for an empty list,
// and log the name of every returned locale.
1124 void RBBITest::TestGetAvailableLocales()
1126 int32_t locCount
= 0;
// locCount is filled in by the call with the number of entries in locList.
1127 const Locale
* locList
= BreakIterator::getAvailableLocales(locCount
);
// NOTE(review): the condition guarding this errln() is on a line that is not
// visible here; presumably it tests locCount == 0.
1130 errln("getAvailableLocales() returned an empty list!");
1131 // Just make sure that it's returning good memory.
1133 for (i
= 0; i
< locCount
; ++i
) {
1134 logln(locList
[i
].getName());
1138 //Testing the BreakIterator::getDisplayName() function
// Two overloads are exercised: the two-argument form (name of Locale::getUS()
// in the default display locale) and the three-argument form (name of
// Locale::getFrance() displayed in Locale::getUS()).
1139 void RBBITest::TestGetDisplayName()
1141 UnicodeString result
;
1143 BreakIterator::getDisplayName(Locale::getUS(), result
);
// The expected English text is only enforced when the default locale is US.
1144 if (Locale::getDefault() == Locale::getUS() && result
!= "English (United States)")
1145 errln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
// Here the display locale is passed explicitly, so the check is unconditional.
1148 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result
);
1149 if (result
!= "French (France)")
1150 errln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
1154 * Test End Behaviour
// TestEndBehaviour: run a default-locale word break iterator over "boo." and
// check for breaks at offset 0 (start), offset 3 (before the period) and
// offset 4 (end of string).  An error is logged if the iterator cannot be
// created.
1157 void RBBITest::TestEndBehaviour()
1159 UErrorCode status
= U_ZERO_ERROR
;
1160 UnicodeString
testString("boo.");
1161 BreakIterator
*wb
= BreakIterator::createWordInstance(Locale::getDefault(), status
);
1162 if (U_FAILURE(status
))
1164 errln("Failed to create the BreakIterator for default locale in TestEndBehaviour.\n");
1167 wb
->setText(testString
);
1169 if (wb
->first() != 0)
1170 errln("Didn't get break at beginning of string.");
1171 if (wb
->next() != 3)
1172 errln("Didn't get break before period in \"boo.\"");
// Because of the short-circuit "&&", next() is called only when the iterator
// is not already positioned at offset 4.
1173 if (wb
->current() != 4 && wb
->next() != 4)
1174 errln("Didn't get break at end of string.");
// TestBug4153072: give a word break iterator a StringCharacterIterator that
// covers only the sub-range [begin, end) of "...Hello, World!...", then call
// isBoundary() for each offset from -1 up to and including `begin`.  An error
// is reported if any offset below `begin` is reported as a boundary.
// NOTE(review): the declarations of `begin`, `index` and `dummy` are on lines
// not visible here; `end` is the string length minus 3.
1180 void RBBITest::TestBug4153072() {
1181 UErrorCode status
= U_ZERO_ERROR
;
1182 BreakIterator
*iter
= BreakIterator::createWordInstance(Locale::getDefault(), status
);
1183 if (U_FAILURE(status
))
1185 errln("Failed to create the BreakIterator for default locale in TestBug4153072\n");
1188 UnicodeString
str("...Hello, World!...");
1190 int32_t end
= str
.length() - 3;
// Constructor arguments: text, range start, range end, initial position.
1193 StringCharacterIterator
* textIterator
= new StringCharacterIterator(str
, begin
, end
, begin
);
// The heap-allocated character iterator is handed over to the break iterator.
1194 iter
->adoptText(textIterator
);
// "index < begin + 1" makes `begin` itself the last offset probed.
1196 for (index
= -1; index
< begin
+ 1; ++index
) {
1197 dummy
= iter
->isBoundary(index
);
1198 if (index
< begin
&& dummy
== TRUE
) {
1199 errln((UnicodeString
)"Didn't handle preceeding correctly with offset = " + index
+
1200 " and begin index = " + begin
);
1208 * Test Japanese Line Break
1211 void RBBITest::TestJapaneseLineBreak()
1214 // Test needs updating some more... Dump it for now.
1217 // Change for Unicode TR 14: Punctuation characters with categories Pi and Pf do not count
1218 // as opening and closing punctuation for line breaking.
1219 // Also, \u30fc and \u30fe are not counted as hyphens. Remove these chars
1220 // from these tests. 6-13-2002
1222 UErrorCode status
= U_ZERO_ERROR
;
1223 UnicodeString testString
= CharsToUnicodeString("\\u4e00x\\u4e8c");
1224 UnicodeString precedingChars
= CharsToUnicodeString(
1225 //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
1226 "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");
1227 UnicodeString followingChars
= CharsToUnicodeString(
1228 // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"
1229 ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"
1230 // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"
1231 ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"
1232 "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");
1233 BreakIterator
*iter
= BreakIterator::createLineInstance(Locale::getJapan(), status
);
1236 if (U_FAILURE(status
))
1238 errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");
1242 for (i
= 0; i
< precedingChars
.length(); i
++) {
1243 testString
.setCharAt(1, precedingChars
[i
]);
1244 iter
->setText(testString
);
1245 int32_t j
= iter
->first();
1247 errln("ja line break failure: failed to start at 0");
1250 errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars
[i
])
1251 + "' (" + ((int)(precedingChars
[i
])) + ")");
1254 errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars
[i
])
1255 + "' (" + ((int)(precedingChars
[i
])) + ")");
1258 for (i
= 0; i
< followingChars
.length(); i
++) {
1259 testString
.setCharAt(1, followingChars
[i
]);
1260 iter
->setText(testString
);
1261 int j
= iter
->first();
1263 errln("ja line break failure: failed to start at 0");
1266 errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars
[i
])
1267 + "' (" + ((int)(followingChars
[i
])) + ")");
1270 errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars
[i
])
1271 + "' (" + ((int)(followingChars
[i
])) + ")");
1278 //------------------------------------------------------------------------------
1280 // RBBITest::Extended Run RBBI Tests from an external test data file
1282 //------------------------------------------------------------------------------
1286 UnicodeString dataToBreak
;
1287 UVector32
*expectedBreaks
;
1292 void RBBITest::executeTest(TestParams
*t
) {
1297 t
->bi
->setText(t
->dataToBreak
);
1299 // Run the iterator forward
1302 for (bp
= t
->bi
->first(); bp
!= BreakIterator::DONE
; bp
= t
->bi
->next()) {
1304 // Fail for lack of forward progress.
1305 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
1306 bp
, t
->srcLine
->elementAti(bp
), t
->srcCol
->elementAti(bp
));
1310 // Check that there were we didn't miss an expected break between the last one
1312 for (i
=prevBP
+1; i
<bp
; i
++) {
1313 if (t
->expectedBreaks
->elementAti(i
) != 0) {
1314 int expected
[] = {0, i
};
1315 printStringBreaks(t
->dataToBreak
, expected
, 2);
1316 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1317 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
));
1321 // Check that the break we did find was expected
1322 if (t
->expectedBreaks
->elementAti(bp
) == 0) {
1323 int expected
[] = {0, bp
};
1324 printStringBreaks(t
->dataToBreak
, expected
, 2);
1325 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1326 bp
, t
->srcLine
->elementAti(bp
), t
->srcCol
->elementAti(bp
));
1328 // The break was expected.
1329 // Check that the {nnn} tag value is correct.
1330 int32_t expectedTagVal
= t
->expectedBreaks
->elementAti(bp
);
1331 if (expectedTagVal
== -1) {
1334 int32_t rs
= ((RuleBasedBreakIterator
*)t
->bi
)->getRuleStatus();
1335 if (rs
!= expectedTagVal
) {
1336 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
1337 " Actual, Expected status = %4d, %4d",
1338 bp
, t
->srcLine
->elementAti(bp
), t
->srcCol
->elementAti(bp
), rs
, expectedTagVal
);
1346 // Verify that there were no missed expected breaks after the last one found
1347 for (i
=prevBP
+1; i
<t
->expectedBreaks
->size(); i
++) {
1348 if (t
->expectedBreaks
->elementAti(i
) != 0) {
1349 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1350 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
));
1355 // Run the iterator backwards, verify that the same breaks are found.
1357 prevBP
= t
->dataToBreak
.length()+2; // start with a phony value for the last break pos seen.
1358 for (bp
= t
->bi
->last(); bp
!= BreakIterator::DONE
; bp
= t
->bi
->previous()) {
1360 // Fail for lack of progress.
1361 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
1362 bp
, t
->srcLine
->elementAti(bp
), t
->srcCol
->elementAti(bp
));
1366 // Check that there were we didn't miss an expected break between the last one
1367 // and this one. (UVector returns zeros for index out of bounds.)
1368 for (i
=prevBP
-1; i
>bp
; i
--) {
1369 if (t
->expectedBreaks
->elementAti(i
) != 0) {
1370 errln("Reverse Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1371 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
));
1375 // Check that the break we did find was expected
1376 if (t
->expectedBreaks
->elementAti(bp
) == 0) {
1377 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1378 bp
, t
->srcLine
->elementAti(bp
), t
->srcCol
->elementAti(bp
));
1380 // The break was expected.
1381 // Check that the {nnn} tag value is correct.
1382 int32_t expectedTagVal
= t
->expectedBreaks
->elementAti(bp
);
1383 if (expectedTagVal
== -1) {
1386 int32_t rs
= ((RuleBasedBreakIterator
*)t
->bi
)->getRuleStatus();
1387 if (rs
!= expectedTagVal
) {
1388 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
1389 " Actual, Expected status = %4d, %4d",
1390 bp
, t
->srcLine
->elementAti(bp
), t
->srcCol
->elementAti(bp
), rs
, expectedTagVal
);
1397 // Verify that there were no missed breaks prior to the last one found
1398 for (i
=prevBP
-1; i
>=0; i
--) {
1399 if (t
->expectedBreaks
->elementAti(i
) != 0) {
1400 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1401 i
, t
->srcLine
->elementAti(i
), t
->srcCol
->elementAti(i
));
1407 void RBBITest::TestExtended() {
1408 UErrorCode status
= U_ZERO_ERROR
;
1409 Locale locale
= Locale::getDefault();
1411 UnicodeString rules
;
1414 tp
.expectedBreaks
= new UVector32(status
);
1415 tp
.srcLine
= new UVector32(status
);
1416 tp
.srcCol
= new UVector32(status
);
1420 // Open and read the test data file.
1422 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
1423 char testFileName
[1000];
1424 if (testDataDirectory
== NULL
|| strlen(testDataDirectory
) >= sizeof(testFileName
)) {
1425 errln("Can't open test data. Path too long.");
1428 strcpy(testFileName
, testDataDirectory
);
1429 strcat(testFileName
, "rbbitst.txt");
1432 UChar
*testFile
= ReadAndConvertFile(testFileName
, len
, status
);
1433 if (U_FAILURE(status
)) {
1434 return; /* something went wrong, error already output */
1440 // Put the test data into a UnicodeString
1442 UnicodeString
testString(FALSE
, testFile
, len
);
1450 parseState
= PARSE_TAG
;
1452 EParseState savedState
= PARSE_TAG
;
1454 static const UChar CH_LF
= 0x0a;
1455 static const UChar CH_CR
= 0x0d;
1456 static const UChar CH_HASH
= 0x23;
1457 /*static const UChar CH_PERIOD = 0x2e;*/
1458 static const UChar CH_LT
= 0x3c;
1459 static const UChar CH_GT
= 0x3e;
1460 static const UChar CH_BACKSLASH
= 0x5c;
1461 static const UChar CH_BULLET
= 0x2022;
1463 int32_t lineNum
= 1;
1464 int32_t colStart
= 0;
1466 int32_t charIdx
= 0;
1468 int32_t tagValue
= 0; // The numeric value of a <nnn> tag.
1470 for (charIdx
= 0; charIdx
< len
; ) {
1471 UChar c
= testString
.charAt(charIdx
);
1473 if (c
== CH_CR
&& charIdx
<len
&& testString
.charAt(charIdx
) == CH_LF
) {
1474 // treat CRLF as a unit
1478 if (c
== CH_LF
|| c
== CH_CR
) {
1482 column
= charIdx
- colStart
+ 1;
1484 switch (parseState
) {
1486 if (c
== 0x0a || c
== 0x0d) {
1487 parseState
= savedState
;
1494 parseState
= PARSE_COMMENT
;
1495 savedState
= PARSE_TAG
;
1498 if (u_isUWhiteSpace(c
)) {
1501 if (testString
.compare(charIdx
-1, 6, "<word>") == 0) {
1503 tp
.bi
= BreakIterator::createWordInstance(locale
, status
);
1507 if (testString
.compare(charIdx
-1, 6, "<char>") == 0) {
1509 tp
.bi
= BreakIterator::createCharacterInstance(locale
, status
);
1513 if (testString
.compare(charIdx
-1, 6, "<line>") == 0) {
1515 tp
.bi
= BreakIterator::createLineInstance(locale
, status
);
1519 if (testString
.compare(charIdx
-1, 6, "<sent>") == 0) {
1521 tp
.bi
= BreakIterator::createSentenceInstance(locale
, status
);
1525 if (testString
.compare(charIdx
-1, 7, "<title>") == 0) {
1527 tp
.bi
= BreakIterator::createTitleInstance(locale
, status
);
1531 if (testString
.compare(charIdx
-1, 6, "<data>") == 0) {
1532 parseState
= PARSE_DATA
;
1534 tp
.dataToBreak
= "";
1535 tp
.expectedBreaks
->removeAllElements();
1536 tp
.srcCol
->removeAllElements();
1537 tp
.srcLine
->removeAllElements();
1541 errln("line %d: Tag expected in test file.", lineNum
);
1543 parseState
= PARSE_COMMENT
;
1544 savedState
= PARSE_DATA
;
1549 if (c
== CH_BULLET
) {
1550 int32_t breakIdx
= tp
.dataToBreak
.length();
1551 tp
.expectedBreaks
->setSize(breakIdx
+1);
1552 tp
.expectedBreaks
->setElementAt(-1, breakIdx
);
1553 tp
.srcLine
->setSize(breakIdx
+1);
1554 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1555 tp
.srcCol
->setSize(breakIdx
+1);
1556 tp
.srcCol
->setElementAt(column
, breakIdx
);
1560 if (testString
.compare(charIdx
-1, 7, "</data>") == 0) {
1561 // Add final entry to mappings from break location to source file position.
1562 // Need one extra because last break position returned is after the
1563 // last char in the data, not at the last char.
1564 tp
.srcLine
->addElement(lineNum
, status
);
1565 tp
.srcCol
->addElement(column
, status
);
1567 parseState
= PARSE_TAG
;
1575 if (testString
.compare(charIdx
-1, 3, "\\N{") == 0) {
1576 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1577 // Get the code point from the name and insert it into the test data.
1578 // (Damn, no API takes names in Unicode !!!
1579 // we've got to take it back to char *)
1580 int32_t nameEndIdx
= testString
.indexOf((UChar
)0x7d/*'}'*/, charIdx
);
1581 int32_t nameLength
= nameEndIdx
- (charIdx
+2);
1582 char charNameBuf
[200];
1583 UChar32 theChar
= -1;
1584 if (nameEndIdx
!= -1) {
1585 UErrorCode status
= U_ZERO_ERROR
;
1586 testString
.extract(charIdx
+2, nameLength
, charNameBuf
, sizeof(charNameBuf
));
1587 charNameBuf
[sizeof(charNameBuf
)-1] = 0;
1588 theChar
= u_charFromName(U_UNICODE_CHAR_NAME
, charNameBuf
, &status
);
1589 if (U_FAILURE(status
)) {
1593 if (theChar
== -1) {
1594 errln("Error in named character in test file at line %d, col %d",
1597 // Named code point was recognized. Insert it
1598 // into the test data.
1599 tp
.dataToBreak
.append(theChar
);
1600 while (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1601 tp
.srcLine
->addElement(lineNum
, status
);
1602 tp
.srcCol
->addElement(column
, status
);
1605 if (nameEndIdx
> charIdx
) {
1606 charIdx
= nameEndIdx
+1;
1614 if (testString
.compare(charIdx
-1, 2, "<>") == 0) {
1616 int32_t breakIdx
= tp
.dataToBreak
.length();
1617 tp
.expectedBreaks
->setSize(breakIdx
+1);
1618 tp
.expectedBreaks
->setElementAt(-1, breakIdx
);
1619 tp
.srcLine
->setSize(breakIdx
+1);
1620 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1621 tp
.srcCol
->setSize(breakIdx
+1);
1622 tp
.srcCol
->setElementAt(column
, breakIdx
);
1628 parseState
= PARSE_NUM
;
1632 if (c
== CH_HASH
&& column
==3) { // TODO: why is column off so far?
1633 parseState
= PARSE_COMMENT
;
1634 savedState
= PARSE_DATA
;
1638 if (c
== CH_BACKSLASH
) {
1639 // Check for \ at end of line, a line continuation.
1640 // Advance over (discard) the newline
1641 UChar32 cp
= testString
.char32At(charIdx
);
1642 if (cp
== CH_CR
&& charIdx
<len
&& testString
.charAt(charIdx
+1) == CH_LF
) {
1644 // Need an extra increment of the input ptr to move over both of them
1647 if (cp
== CH_LF
|| cp
== CH_CR
) {
1654 // Let unescape handle the back slash.
1655 cp
= testString
.unescapeAt(charIdx
);
1657 // Escape sequence was recognized. Insert the char
1658 // into the test data.
1659 tp
.dataToBreak
.append(cp
);
1660 while (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1661 tp
.srcLine
->addElement(lineNum
, status
);
1662 tp
.srcCol
->addElement(column
, status
);
1668 // Not a recognized backslash escape sequence.
1669 // Take the next char as a literal.
1670 // TODO: Should this be an error?
1671 c
= testString
.charAt(charIdx
);
1672 charIdx
= testString
.moveIndex32(charIdx
, 1);
1675 // Normal, non-escaped data char.
1676 tp
.dataToBreak
.append(c
);
1678 // Save the mapping from offset in the data to line/column numbers in
1679 // the original input file. Will be used for better error messages only.
1680 // If there's an expected break before this char, the slot in the mapping
1681 // vector will already be set for this char; don't overwrite it.
1682 if (tp
.dataToBreak
.length() > tp
.srcLine
->size()) {
1683 tp
.srcLine
->addElement(lineNum
, status
);
1684 tp
.srcCol
->addElement(column
, status
);
1690 // We are parsing an expected numeric tag value, like <1234>,
1691 // within a chunk of data.
1692 if (u_isUWhiteSpace(c
)) {
1697 // Finished the number. Add the info to the expected break data,
1698 // and switch parse state back to doing plain data.
1699 parseState
= PARSE_DATA
;
1700 if (tagValue
== 0) {
1703 int32_t breakIdx
= tp
.dataToBreak
.length();
1704 tp
.expectedBreaks
->setSize(breakIdx
+1);
1705 tp
.expectedBreaks
->setElementAt(tagValue
, breakIdx
);
1706 tp
.srcLine
->setSize(breakIdx
+1);
1707 tp
.srcLine
->setElementAt(lineNum
, breakIdx
);
1708 tp
.srcCol
->setSize(breakIdx
+1);
1709 tp
.srcCol
->setElementAt(column
, breakIdx
);
1714 tagValue
= tagValue
*10 + u_charDigitValue(c
);
1718 errln("Syntax Error in test file at line %d, col %d",
1721 parseState
= PARSE_COMMENT
;
1726 if (U_FAILURE(status
)) {
1727 errln("ICU Error %s while parsing test file at line %d.",
1728 u_errorName(status
), lineNum
);
1730 status
= U_ZERO_ERROR
;
1737 delete tp
.expectedBreaks
;
1744 //-------------------------------------------------------------------------------
1746 // ReadAndConvertFile Read a text data file, convert it to UChars, and
1747 // return the data in one big UChar * buffer, which the caller must delete.
1749 // TODO: This is a clone of RegexTest::ReadAndConvertFile.
1750 // Move this function to some common place.
1752 //--------------------------------------------------------------------------------
// ReadAndConvertFile: open `fileName` in binary mode, read the whole file into
// a char buffer, look for a Unicode signature (BOM) at its start, open an ICU
// converter for the detected encoding and convert the bytes to UChars.
//   fileName  path handed to fopen().
//   ulen      out: receives the length reported by ucnv_toUChars().
//   status    in/out ICU error code; checked on entry, and set to
//             U_FILE_ACCESS_ERROR when the file cannot be opened.
// The result buffer is allocated with new UChar[ulen+1].  The return statement
// and the cleanUpAndReturn label are not among the lines visible here;
// presumably retPtr is what is returned -- confirm against the full file.
1753 UChar
*RBBITest::ReadAndConvertFile(const char *fileName
, int &ulen
, UErrorCode
&status
) {
1754 UChar
*retPtr
= NULL
;
1755 char *fileBuf
= NULL
;
1756 UConverter
* conv
= NULL
;
1760 if (U_FAILURE(status
)) {
1767 f
= fopen(fileName
, "rb");
1769 errln("Error opening test data file %s\n", fileName
);
1770 status
= U_FILE_ACCESS_ERROR
;
// Determine the file size by seeking to the end, then rewind and read it all.
1779 fseek( f
, 0, SEEK_END
);
1780 fileSize
= ftell(f
);
// NOTE(review): the buffer is sized from ftell()'s result before that result
// is validated (the "fileSize <= 0" test only runs after fread); ftell()
// reports failure as -1.  Confirm fileSize's type and consider checking first.
1781 fileBuf
= new char[fileSize
];
1782 fseek(f
, 0, SEEK_SET
);
1783 amt_read
= fread(fileBuf
, 1, fileSize
, f
);
1784 if (amt_read
!= fileSize
|| fileSize
<= 0) {
1785 errln("Error reading test data file.");
1786 goto cleanUpAndReturn
;
1790 // Look for a Unicode Signature (BOM) on the data just read
1792 int32_t signatureLength
;
1793 const char * fileBufC
;
1794 const char* encoding
;
1797 encoding
= ucnv_detectUnicodeSignature(
1798 fileBuf
, fileSize
, &signatureLength
, &status
);
// When a signature was found, step over its bytes so that they are not
// converted as part of the text.
// NOTE(review): fileBufC is declared above without an initializer; the
// assignment pointing it at fileBuf is presumably on a line not visible here.
1799 if(encoding
!=NULL
){
1800 fileBufC
+= signatureLength
;
1801 fileSize
-= signatureLength
;
1805 // Open a converter to take the rule file to UTF-16
1807 conv
= ucnv_open(encoding
, &status
);
1808 if (U_FAILURE(status
)) {
1809 goto cleanUpAndReturn
;
1813 // Convert the rules to UChar.
1814 // Preflight first to determine required buffer size.
1816 ulen
= ucnv_toUChars(conv
,
1822 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
1823 // Buffer Overflow is expected from the preflight operation.
1824 status
= U_ZERO_ERROR
;
// One extra UChar is allocated beyond the preflighted length.
1826 retPtr
= new UChar
[ulen
+1];
1839 if (U_FAILURE(status
)) {
1840 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status
));
1849 //--------------------------------------------------------------------------------------------
1851 // Exhaustive Tests, using Unicode Data Files.
1853 //--------------------------------------------------------------------------------------------
1856 // Token level scanner for the Unicode Line Break Test Data file.
1857 // Return the next token, as follows:
1858 // >= 0: a UChar32 character, scanned from hex in the file.
1859 // -1: a break position, a division sign in the file.
1860 // -2: end of rule. A new line in the file.
1861 // -3: end of file. No more rules.
1865 // strips comments, ('#' to end of line)
1866 // Recognizes CR, CR/LF and LF as new lines.
1867 // Skips over spaces and Xs (don't break here) in the data.
1874 ScanState() :fPeeked(FALSE
), fLineNum(0), fFile(NULL
) {};
1877 // Literal characters that are of interest. In hex to keep EBCDIC based machines happy.
1878 // The data itself is latin-1 on all platforms.
1879 static const int32_t chSpace
= 0x20;
1880 static const int32_t chTab
= 0x09;
1881 static const int32_t chCR
= 0x0D;
1882 static const int32_t chLF
= 0x0A;
1883 static const int32_t chHash
= 0x23;
1884 static const int32_t chMult
= 0xD7;
1885 static const int32_t chDivide
= 0xF7;
1887 static int32_t nextLBDToken(ScanState
*s
) {
1890 // Read characters from the input file until we get something interesting
1891 // to return. The file is in latin-1 encoding.
1893 // Get the next character to look at,
1901 // EOF. Return immediately.
1906 // Spaces. Treat the multiply sign as a space - it indicates a no-break position
1907 // in the data, and the test program doesn't want to see them.
1908 // Continue the next char loop, looking for something significant.
1909 if (c
== chSpace
|| c
== chTab
|| c
== chMult
) {
1913 // Divide sign. Indicates an expected break position.
1914 if (c
== chDivide
) {
1918 // New Line Handling. Keep track of line number in the file, which in turn
1919 // requires keeping track of CR/LF as a single new line.
1922 s
->fPeekChar
= getc(s
->fFile
);
1923 if (s
->fPeekChar
!= chLF
) {s
->fPeeked
= TRUE
;};
1931 // Comments. Consume everything up to the next new line.
1935 } while (!(c
== EOF
|| c
== chCR
|| c
== chLF
));
1938 return nextLBDToken(s
);
1941 // Scan a hex character (UChar32) value.
1942 if (u_digit(c
, 16) >= 0) {
1943 int32_t v
= u_digit(c
, 16);
1946 if (u_digit(c
, 16) < 0) {break;};
1948 v
+= u_digit(c
, 16);
1955 // Error. Character was something unexpected.
1962 void RBBITest::TestLineBreakData() {
1964 UErrorCode status
= U_ZERO_ERROR
;
1965 UnicodeString testString
;
1966 UVector
expectedBreaks(status
);
1970 BreakIterator
*bi
= BreakIterator::createLineInstance(Locale::getDefault(), status
);
1971 if (U_FAILURE(status
)) {
1972 errln("Failure creating break iterator");
1976 const char * lbdfName
= "LBTest.txt";
1978 // Open the test data file.
1979 // TODO: a proper way to handle this data.
1980 ss
.fFile
= fopen(lbdfName
, "rb");
1981 if (ss
.fFile
== NULL
) {
1982 logln("Unable to open Line Break Test Data file. Skipping test.");
1987 // Loop once per line from the test data file.
1989 // Zero out test data from previous line.
1990 testString
.truncate(0);
1991 expectedBreaks
.removeAllElements();
1993 // Read one test's (line's) worth of data from the file.
1994 // Loop once per token on the input file line.
1996 tok
= nextLBDToken(&ss
);
1998 // If we scanned a character number in the file.
1999 // save it in the test data array.
2001 testString
.append((UChar32
)tok
);
2005 // If we scanned a break position in the data, record it.
2007 expectedBreaks
.addElement(testString
.length(), status
);
2011 // If we scanned a new line, or EOF
2012 // drop out of scan loop and run the test case.
2013 if (tok
== -2 || tok
== -3) {break;};
2015 // None of above. Error.
2016 errln("Failure: Unrecognized data format, test file line %d", ss
.fLineNum
);
2020 // If this line from the test data file actually contained test data,
2022 if (testString
.length() > 0) {
2023 int32_t pos
; // Break Position in the test string
2024 int32_t expectedI
= 0; // Index of expected break position in vector of same.
2025 int32_t expectedPos
; // Expected break position (index into test string)
2027 bi
->setText(testString
);
2028 pos
= bi
->first(); // TODO: break iterators always return a match at pos 0.
2029 pos
= bi
->next(); // Line Break TR says no match at position 0.
2032 for (; pos
!= BreakIterator::DONE
; ) {
2033 expectedPos
= expectedBreaks
.elementAti(expectedI
);
2034 if (pos
< expectedPos
) {
2035 errln("Failure: Test file line %d, unexpected break found at position %d",
2039 if (pos
> expectedPos
) {
2040 errln("Failure: Test file line %d, failed to find break at position %d",
2041 ss
.fLineNum
, expectedPos
);
2049 // If we've hit EOF on the input file, we're done.
2061 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
2063 //---------------------------------------------------------------------------------------
2065 // class RBBIMonkeyKind
2067 // Monkey Test for Break Iteration
2068 // Abstract interface class. Concrete derived classes independently
2069 // implement the break rules for different iterator types.
2071 // The Monkey Test itself doesn't know which type of break iterator it is
2072 // testing, but works purely in terms of the interface defined here.
2074 //---------------------------------------------------------------------------------------
// Abstract base for the per-iterator-type monkey test implementations.  All
// three operations below are pure virtual and must be supplied by a derived
// class.
2075 class RBBIMonkeyKind
{
2077 // Return a UVector of UnicodeSets, representing the character classes used
2078 // for this type of iterator.
2079 virtual UVector
*charClasses() = 0;
2081 // Set the test text on which subsequent calls to next() will operate
2082 virtual void setText(const UnicodeString
&s
) = 0;
2084 // Find the next break position, starting from the prev break position, or from zero.
2085 // Return -1 after reaching end of string.
2086 virtual int32_t next(int32_t i
) = 0;
2088 virtual ~RBBIMonkeyKind();
// Error code set to U_ZERO_ERROR by the base constructor; a derived-class
// constructor stores its failure status here (see RBBICharMonkey).
2089 UErrorCode deferredStatus
;
// Base constructor: start with no deferred error recorded.
2098 RBBIMonkeyKind::RBBIMonkeyKind() {
2099 deferredStatus
= U_ZERO_ERROR
;
2102 RBBIMonkeyKind::~RBBIMonkeyKind() {
2106 //----------------------------------------------------------------------------------------
2108 // Random Numbers. Similar to standard lib rand() and srand()
2109 // Not using library to
2110 // 1. Get same results on all platforms.
2111 // 2. Get access to current seed, to more easily reproduce failures.
2113 //---------------------------------------------------------------------------------------
// Current state of the test-local pseudo-random generator.  It lives at file
// scope so that the seed in effect can be read back when a failure needs to
// be reproduced.
static uint32_t m_seed = 1;

// Advance the generator by one step and return a value in the range 0..32767.
// The arithmetic is done in uint32_t, so the state wraps modulo 2^32 and the
// sequence is the same on every platform.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;

    m_seed = m_seed * kMultiplier + kIncrement;

    // Discard the low 16 bits of the state and keep the next 15 bits;
    // for an unsigned value this equals (m_seed / 65536) % 32768.
    return (m_seed >> 16) & 0x7FFFu;
}
2123 //------------------------------------------------------------------------------------------
2125 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
2126 // of RBBIMonkeyKind.
2128 //------------------------------------------------------------------------------------------
// Character (grapheme cluster) implementation of RBBIMonkeyKind.  Break
// positions are found with a regular expression matcher rather than with a
// break iterator (see next()).
2129 class RBBICharMonkey
: public RBBIMonkeyKind
{
2132 virtual ~RBBICharMonkey();
2133 virtual UVector
*charClasses();
2134 virtual void setText(const UnicodeString
&s
);
2135 virtual int32_t next(int32_t i
);
// Character classes; each is created in the constructor from a UnicodeSet
// pattern and added to the fSets vector there.
// CR and LF.
2139 UnicodeSet
*fCRLFSet
;
// Zl, Zp, Cc and Cf characters, minus CR, LF and Grapheme_Extend.
2140 UnicodeSet
*fControlSet
;
// Characters with the Grapheme_Extend property.
2141 UnicodeSet
*fExtendSet
;
// Characters selected by Hangul_Syllable_Type.
2142 UnicodeSet
*fHangulSet
;
// Every code point, U+0000..U+10FFFF.
2143 UnicodeSet
*fAnySet
;
// Matcher for the pattern "\X", created in the constructor.
2145 RegexMatcher
*fMatcher
;
// NOTE(review): presumably the text most recently passed to setText(); the
// body of setText() is not visible here -- confirm.
2146 const UnicodeString
*fText
;
2150 RBBICharMonkey::RBBICharMonkey() {
2151 UErrorCode status
= U_ZERO_ERROR
;
2154 fMatcher
= new RegexMatcher("\\X", 0, status
); // Pattern to match a grampheme cluster
2156 fCRLFSet
= new UnicodeSet("[\\r\\n]", status
);
2157 fControlSet
= new UnicodeSet("[[\\p{Zl}\\p{Zp}\\p{Cc}\\p{Cf}]-[\\n]-[\\r]-\\p{Grapheme_Extend}]", status
);
2158 fExtendSet
= new UnicodeSet("[\\p{Grapheme_Extend}]", status
);
2159 fHangulSet
= new UnicodeSet(
2160 "[\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=T}"
2161 "\\p{Hangul_Syllable_Type=LV}\\p{Hangul_Syllable_Type=LVT}]", status
);
2162 fAnySet
= new UnicodeSet("[\\u0000-\\U0010ffff]", status
);
2164 fSets
= new UVector(status
);
2165 fSets
->addElement(fCRLFSet
, status
);
2166 fSets
->addElement(fControlSet
, status
);
2167 fSets
->addElement(fExtendSet
, status
);
2168 fSets
->addElement(fHangulSet
, status
);
2169 fSets
->addElement(fAnySet
, status
);
2170 if (U_FAILURE(status
)) {
2171 deferredStatus
= status
;
2176 void RBBICharMonkey::setText(const UnicodeString
&s
) {
2182 int32_t RBBICharMonkey::next(int32_t i
) {
2183 UErrorCode status
= U_ZERO_ERROR
;
2184 int32_t retVal
= -1;
2186 if (fMatcher
->find(i
, status
)) {
2187 retVal
= fMatcher
->end(status
);
2189 if (U_FAILURE(status
)){
2196 UVector
*RBBICharMonkey::charClasses() {
2201 RBBICharMonkey::~RBBICharMonkey() {
2212 //------------------------------------------------------------------------------------------
2214 // class RBBIWordMonkey Word Break specific implementation
2215 // of RBBIMonkeyKind.
2217 //------------------------------------------------------------------------------------------
2218 class RBBIWordMonkey
: public RBBIMonkeyKind
{
2221 virtual ~RBBIWordMonkey();
2222 virtual UVector
*charClasses();
2223 virtual void setText(const UnicodeString
&s
);
2224 virtual int32_t next(int32_t i
);
2228 UnicodeSet
*fKatakanaSet
;
2229 UnicodeSet
*fALetterSet
;
2230 UnicodeSet
*fMidLetterSet
;
2231 UnicodeSet
*fMidNumSet
;
2232 UnicodeSet
*fNumericSet
;
2233 UnicodeSet
*fFormatSet
;
2234 UnicodeSet
*fOtherSet
;
2235 UnicodeSet
*fExtendSet
;
2236 UnicodeSet
*fExtendNumLetSet
;
2238 RegexMatcher
*fMatcher
;
2240 const UnicodeString
*fText
;
2242 RegexMatcher
*fGCFMatcher
;
2243 RegexMatcher
*fGCMatcher
;
2248 RBBIWordMonkey::RBBIWordMonkey() : fGCFMatcher(0),
2251 UErrorCode status
= U_ZERO_ERROR
;
2253 fSets
= new UVector(status
);
2255 fKatakanaSet
= new UnicodeSet("[\\p{script=KATAKANA}"
2256 "\\u3031-\\u3035\\u309b\\u309c\\u30a0"
2257 "\\u30fc\\uff70\\uff9e\\uff9f]", status
);
2259 const UnicodeString
ALetterStr( "[[\\p{Alphabetic}"
2261 "\\u05f3]" // Hebrew punct Geresh
2262 "-[\\p{Ideographic}]"
2263 "-[\\p{Script=Lao}]"
2264 "-[\\p{Script=Hiragana}]"
2265 "-[\\p{Grapheme_Extend}]]");
2266 fALetterSet
= new UnicodeSet(ALetterStr
, status
);
2267 fALetterSet
->removeAll(*fKatakanaSet
);
2269 fMidLetterSet
= new UnicodeSet("[\\u0027\\u00b7\\u05f4\\u2019\\u2027\\u003a]", status
);
2270 fMidNumSet
= new UnicodeSet("[[\\p{Line_Break=Infix_Numeric}]-[\\u003a]]", status
);
2271 fNumericSet
= new UnicodeSet("[\\p{Nd}\\u066b\\u066c]", status
);
2272 fFormatSet
= new UnicodeSet("[\\p{Format}-[\\u200c\\u200d]]", status
);
2273 fExtendSet
= new UnicodeSet("[\\p{Grapheme_Extend}]", status
);
2274 fExtendNumLetSet
= new UnicodeSet("[\\p{Pc}-[\\u30fb\\uff65]]", status
);
2275 fOtherSet
= new UnicodeSet();
2276 if(U_FAILURE(status
)) {
2277 deferredStatus
= status
;
2281 fOtherSet
->complement();
2282 fOtherSet
->removeAll(*fKatakanaSet
);
2283 fOtherSet
->removeAll(*fALetterSet
);
2284 fOtherSet
->removeAll(*fMidLetterSet
);
2285 fOtherSet
->removeAll(*fMidNumSet
);
2286 fOtherSet
->removeAll(*fNumericSet
);
2287 fOtherSet
->removeAll(*fExtendNumLetSet
);
2289 fSets
->addElement(fALetterSet
, status
);
2290 fSets
->addElement(fKatakanaSet
, status
);
2291 fSets
->addElement(fMidLetterSet
, status
);
2292 fSets
->addElement(fMidNumSet
, status
);
2293 fSets
->addElement(fNumericSet
, status
);
2294 fSets
->addElement(fFormatSet
, status
);
2295 fSets
->addElement(fOtherSet
, status
);
2296 fSets
->addElement(fExtendNumLetSet
, status
);
2299 fGCFMatcher
= new RegexMatcher("\\X(?:[\\p{Format}-\\p{Grapheme_Extend}])*", 0, status
);
2300 fGCMatcher
= new RegexMatcher("\\X", 0, status
);
2302 if (U_FAILURE(status
)) {
2303 deferredStatus
= status
;
2307 void RBBIWordMonkey::setText(const UnicodeString
&s
) {
2309 fGCMatcher
->reset(*fText
);
2310 fGCFMatcher
->reset(*fText
);
2314 int32_t RBBIWordMonkey::next(int32_t prevPos
) {
2315 UErrorCode status
= U_ZERO_ERROR
;
2317 int p0
, p1
, p2
, p3
; // Indices of the significant code points around the
2318 // break position being tested. The candidate break
2319 // location is before p2.
2323 UChar32 c0
, c1
, c2
, c3
; // The code points at p0, p1, p2 & p3.
2325 // Prev break at end of string. return DONE.
2326 if (prevPos
>= fText
->length()) {
2329 p0
= p1
= p2
= p3
= prevPos
;
2330 c3
= fText
->char32At(prevPos
);
2334 // Format char after prev break? Special case, see last Note for Word Boundaries TR.
2335 // break immediately after the format char.
2336 if (fFormatSet
->contains(c3
)) {
2337 breakPos
= fText
->moveIndex32(prevPos
, 1);
2342 // Loop runs once per "significant" character position in the input text.
2344 // Move all of the positions forward in the input string.
2348 // Advance p3 by (GC Format*) Rules 3, 4
2349 status
= U_ZERO_ERROR
;
2350 if (fGCFMatcher
->find(p3
, status
) == FALSE
) {
2351 p3
= fText
->length();
2354 p3
= fGCFMatcher
->end(0, status
);
2355 U_ASSERT(U_SUCCESS(status
));
2356 c3
= fText
->char32At(p3
);
2360 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2363 if (p2
== fText
->length()) {
2364 // Reached end of string. Always a break position.
2368 // Rule (5). ALetter x ALetter
2369 if (fALetterSet
->contains(c1
) &&
2370 fALetterSet
->contains(c2
)) {
2374 // Rule (6) ALetter x (MidLetter | MidNumLet) ALetter
2376 // Also incorporates rule 7 by skipping pos ahead to position of the
2377 // terminating ALetter.
2378 if ( fALetterSet
->contains(c1
) &&
2379 fMidLetterSet
->contains(c2
) &&
2380 fALetterSet
->contains(c3
)) {
2385 // Rule (7) ALetter (MidLetter | MidNumLet) x ALetter
2386 if (fALetterSet
->contains(c0
) &&
2387 (fMidLetterSet
->contains(c1
) ) &&
2388 fALetterSet
->contains(c2
)) {
2392 // Rule (8) Numeric x Numeric
2393 if (fNumericSet
->contains(c1
) &&
2394 fNumericSet
->contains(c2
)) {
2398 // Rule (9) ALetter x Numeric
2399 if (fALetterSet
->contains(c1
) &&
2400 fNumericSet
->contains(c2
)) {
2404 // Rule (10) Numeric x ALetter
2405 if (fNumericSet
->contains(c1
) &&
2406 fALetterSet
->contains(c2
)) {
2410 // Rule (11) Numeric (MidNum | MidNumLet) x Numeric
2411 if ( fNumericSet
->contains(c0
) &&
2412 fMidNumSet
->contains(c1
) &&
2413 fNumericSet
->contains(c2
)) {
2417 // Rule (12) Numeric x (MidNum | MidNumLet) Numeric
2418 if (fNumericSet
->contains(c1
) &&
2419 fMidNumSet
->contains(c2
) &&
2420 fNumericSet
->contains(c3
)) {
2424 // Rule (13) Katakana x Katakana
2425 if (fKatakanaSet
->contains(c1
) &&
2426 fKatakanaSet
->contains(c2
)) {
2431 if ((fALetterSet
->contains(c1
) || fNumericSet
->contains(c1
) ||
2432 fKatakanaSet
->contains(c1
) || fExtendNumLetSet
->contains(c1
)) &&
2433 fExtendNumLetSet
->contains(c2
)) {
2438 if (fExtendNumLetSet
->contains(c1
) &&
2439 (fALetterSet
->contains(c2
) || fNumericSet
->contains(c2
) ||
2440 fKatakanaSet
->contains(c2
))) {
2445 // Rule 14. Break found here.
2450 // Rule 4 fixup, back up before any trailing
2451 // format characters at the end of the word.
2453 status
= U_ZERO_ERROR
;
2454 if (fGCMatcher
->find(p1
, status
)) {
2455 breakPos
= fGCMatcher
->end(0, status
);
2456 U_ASSERT(U_SUCCESS(status
));
2462 UVector
*RBBIWordMonkey::charClasses() {
2467 RBBIWordMonkey::~RBBIWordMonkey() {
2469 delete fKatakanaSet
;
2471 delete fMidLetterSet
;
2485 //-------------------------------------------------------------------------------------------
2489 //-------------------------------------------------------------------------------------------
2491 class RBBILineMonkey
: public RBBIMonkeyKind
{
2494 virtual ~RBBILineMonkey();
2495 virtual UVector
*charClasses();
2496 virtual void setText(const UnicodeString
&s
);
2497 virtual int32_t next(int32_t i
);
2498 virtual void rule67Adjust(int32_t pos
, UChar32
*posChar
, int32_t *nextPos
, UChar32
*nextChar
);
2534 BreakIterator
*fCharBI
;
2536 const UnicodeString
*fText
;
2537 int32_t *fOrigPositions
;
2539 RegexMatcher
*fNumberMatcher
;
2540 RegexMatcher
*fLB10Matcher
;
2541 RegexMatcher
*fLB11Matcher
;
2545 RBBILineMonkey::RBBILineMonkey()
2547 UErrorCode status
= U_ZERO_ERROR
;
2549 fSets
= new UVector(status
);
2551 fBK
= new UnicodeSet("[\\p{Line_Break=BK}]", status
);
2552 fCR
= new UnicodeSet("[\\p{Line_break=CR}]", status
);
2553 fLF
= new UnicodeSet("[\\p{Line_break=LF}]", status
);
2554 fCM
= new UnicodeSet("[\\p{Line_break=CM}]", status
);
2555 fNL
= new UnicodeSet("[\\p{Line_break=NL}]", status
);
2556 fWJ
= new UnicodeSet("[\\p{Line_break=WJ}]", status
);
2557 fZW
= new UnicodeSet("[\\p{Line_break=ZW}]", status
);
2558 fGL
= new UnicodeSet("[\\p{Line_break=GL}]", status
);
2559 fCB
= new UnicodeSet("[\\p{Line_break=CB}]", status
);
2560 fSP
= new UnicodeSet("[\\p{Line_break=SP}]", status
);
2561 fB2
= new UnicodeSet("[\\p{Line_break=B2}]", status
);
2562 fBA
= new UnicodeSet("[\\p{Line_break=BA}]", status
);
2563 fBB
= new UnicodeSet("[\\p{Line_break=BB}]", status
);
2564 fHY
= new UnicodeSet("[\\p{Line_break=HY}]", status
);
2565 fCL
= new UnicodeSet("[\\p{Line_break=CL}]", status
);
2566 fEX
= new UnicodeSet("[\\p{Line_break=EX}]", status
);
2567 fIN
= new UnicodeSet("[\\p{Line_break=IN}]", status
);
2568 fNS
= new UnicodeSet("[\\p{Line_break=NS}]", status
);
2569 fOP
= new UnicodeSet("[\\p{Line_break=OP}]", status
);
2570 fQU
= new UnicodeSet("[\\p{Line_break=QU}]", status
);
2571 fIS
= new UnicodeSet("[\\p{Line_break=IS}]", status
);
2572 fNU
= new UnicodeSet("[\\p{Line_break=NU}]", status
);
2573 fPO
= new UnicodeSet("[\\p{Line_break=PO}]", status
);
2574 fPR
= new UnicodeSet("[\\p{Line_break=PR}]", status
);
2575 fSY
= new UnicodeSet("[\\p{Line_break=SY}]", status
);
2576 fAI
= new UnicodeSet("[\\p{Line_break=AI}]", status
);
2577 fAL
= new UnicodeSet("[\\p{Line_break=AL}]", status
);
2578 fID
= new UnicodeSet("[\\p{Line_break=ID}]", status
);
2579 fSA
= new UnicodeSet("[\\p{Line_break=SA}]", status
);
2580 fXX
= new UnicodeSet("[\\p{Line_break=XX}]", status
);
2582 fAL
->addAll(*fXX
); // Default behavior for XX is identical to AL
2583 fAL
->addAll(*fAI
); // Default behavior for AI is identical to AL
2584 fAL
->addAll(*fSA
); // Default behavior for SA is XX, which defaults to AL
2588 fSets
->addElement(fBK
, status
);
2589 fSets
->addElement(fCR
, status
);
2590 fSets
->addElement(fLF
, status
);
2591 fSets
->addElement(fCM
, status
);
2592 fSets
->addElement(fNL
, status
);
2593 fSets
->addElement(fWJ
, status
);
2594 fSets
->addElement(fZW
, status
);
2595 fSets
->addElement(fGL
, status
);
2596 fSets
->addElement(fCB
, status
);
2597 fSets
->addElement(fSP
, status
);
2598 fSets
->addElement(fB2
, status
);
2599 fSets
->addElement(fBA
, status
);
2600 fSets
->addElement(fBB
, status
);
2601 fSets
->addElement(fHY
, status
);
2602 fSets
->addElement(fCL
, status
);
2603 fSets
->addElement(fEX
, status
);
2604 fSets
->addElement(fIN
, status
);
2605 fSets
->addElement(fNS
, status
);
2606 fSets
->addElement(fOP
, status
);
2607 fSets
->addElement(fQU
, status
);
2608 fSets
->addElement(fIS
, status
);
2609 fSets
->addElement(fNU
, status
);
2610 fSets
->addElement(fPO
, status
);
2611 fSets
->addElement(fPR
, status
);
2612 fSets
->addElement(fSY
, status
);
2613 fSets
->addElement(fAI
, status
);
2614 fSets
->addElement(fAL
, status
);
2615 fSets
->addElement(fID
, status
);
2616 fSets
->addElement(fWJ
, status
);
2617 fSets
->addElement(fSA
, status
);
2618 // fSets->addElement(fXX, status);
2622 fNumberMatcher
= new RegexMatcher(
2623 "(\\p{Line_Break=PR}\\p{Line_Break=CM}*)?"
2624 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
2625 "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
2626 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
2627 "(\\p{Line_Break=CL}\\p{Line_Break=CM}*)?"
2628 "(\\p{Line_Break=PO}\\p{Line_Break=CM}*)?",
2631 fLB10Matcher
= new RegexMatcher(
2632 "\\p{Line_Break=QU}\\p{Line_Break=CM}*"
2633 "\\p{Line_Break=SP}*"
2634 "(\\p{Line_Break=OP})\\p{Line_Break=CM}*",
2637 fLB11Matcher
= new RegexMatcher(
2638 "\\p{Line_Break=CL}\\p{Line_Break=CM}*"
2639 "\\p{Line_Break=SP}*"
2640 "(\\p{Line_Break=NS})\\p{Line_Break=CM}*",
2643 fCharBI
= BreakIterator::createCharacterInstance(Locale::getEnglish(), status
);
2645 if (U_FAILURE(status
)) {
2646 deferredStatus
= status
;
2651 void RBBILineMonkey::setText(const UnicodeString
&s
) {
2653 fCharBI
->setText(s
);
2654 fNumberMatcher
->reset(s
);
2659 // Line Break TR rules 6 and 7 implementation.
2660 // This deals with combining marks, Hangul Syllables, and other sequences that
2661 // must be treated as if they were something other than what they actually are.
2663 // This is factored out into a separate function because it must be applied twice for
2664 // each potential break, once to the chars before the position being checked, then
2665 // again to the text following the possible break.
2667 void RBBILineMonkey::rule67Adjust(int32_t pos
, UChar32
*posChar
, int32_t *nextPos
, UChar32
*nextChar
) {
2669 // Invalid initial position. Happens during the warmup iteration of the
2670 // main loop in next().
2674 int32_t nPos
= *nextPos
;
2676 // LB 6 Treat Korean Syllables as a single unit
2677 int32_t hangultype
= u_getIntPropertyValue(*posChar
, UCHAR_HANGUL_SYLLABLE_TYPE
);
2678 if (hangultype
!= U_HST_NOT_APPLICABLE
) {
2679 nPos
= fCharBI
->following(pos
); // Advance by grapheme cluster, which
2680 // contains the logic to locate Hangul syllables.
2681 // Grapheme Cluster Ugliness: some Grapheme_Extend chars, which are absorbed
2682 // into a grapheme cluster, are NOT Line Break CM. (Some are GL, for example.)
2683 // We don't want to consume any of these. The approach is
2684 // 1. Back nPos up, undoing the consumption of any
2685 // Grapheme_Extend chars by the char break iterator.
2686 // 2. Let the LB 7b logic below reconsume any Line Break CM chars.
2688 nPos
= fText
->moveIndex32(nPos
, -1);
2689 UChar32 possiblyExtendChar
= fText
->char32At(nPos
);
2690 if (fID
->contains(possiblyExtendChar
)) {
2691 // We hit into the Hangul Syllable itself, class is ID.
2692 nPos
= fText
->moveIndex32(nPos
, +1);
2698 // LB 7b Keep combining sequences together.
2699 // Advance over any CM class chars. (Line Break CM class is different from
2700 // grapheme cluster CM, so we need to do this even for Hangul syllables.
2701 // Line Break may eat additional stuff as combining, beyond what grapheme cluster did.)
2702 if (!(fBK
->contains(*posChar
) || fZW
->contains(*posChar
) || *posChar
==0x0a
2703 || *posChar
==0x0d || *posChar
==0x85)) {
2705 *nextChar
= fText
->char32At(nPos
);
2706 if (!fCM
->contains(*nextChar
)) {
2709 nPos
= fText
->moveIndex32(nPos
, 1);
2714 // LB 7a In a SP CM* sequence, treat the SP as an ID
2715 if (nPos
!= *nextPos
&& fSP
->contains(*posChar
)) {
2716 *posChar
= 0x4e00; // 0x4e00 is a CJK Ideograph, linebreak type is ID.
2719 // LB 7b Treat X CM* as if it were x.
2720 // No explicit action required.
2722 // LB 7c Treat any remaining combining mark as AL
2723 if (fCM
->contains(*posChar
)) {
2724 *posChar
= 0x41; // thisChar = 'A';
2727 // Push the updated nextPos and nextChar back to our caller.
2728 // This only makes a difference if posChar got bigger, by slurping up a
2729 // combining sequence or Hangul syllable.
2731 *nextChar
= fText
->char32At(nPos
);
2736 int32_t RBBILineMonkey::next(int32_t startPos
) {
2737 UErrorCode status
= U_ZERO_ERROR
;
2738 int32_t pos
; // Index of the char following a potential break position
2739 UChar32 thisChar
; // Character at above position "pos"
2741 int32_t prevPos
; // Index of the char preceding a potential break position
2742 UChar32 prevChar
; // Character at above position. Note that prevChar
2743 // and thisChar may not be adjacent because combining
2744 // characters between them will be ignored.
2746 int32_t nextPos
; // Index of the next character following pos.
2747 // Usually skips over combining marks.
2748 int32_t nextCPPos
; // Index of the code point following "pos."
2749 // May point to a combining mark.
2750 int32_t tPos
; // temp value.
2753 if (startPos
>= fText
->length()) {
2758 // Initial values for loop. Loop will run the first time without finding breaks,
2759 // while the invalid values shift out and the "this" and
2760 // "prev" positions are filled in with good values.
2761 pos
= prevPos
= -1; // Invalid value, serves as flag for initial loop iteration.
2762 thisChar
= prevChar
= 0;
2763 nextPos
= nextCPPos
= startPos
;
2766 // Loop runs once per position in the test text, until a break position
2770 prevChar
= thisChar
;
2773 thisChar
= fText
->char32At(pos
);
2775 nextCPPos
= fText
->moveIndex32(pos
, 1);
2776 nextPos
= nextCPPos
;
2778 // Break at end of text.
2779 if (pos
>= fText
->length()) {
2783 // LB 3a Always break after hard line breaks,
2784 if (fBK
->contains(prevChar
)) {
2788 // LB 3b Break after CR, LF, NL, but not inside CR LF
2789 if (prevChar
== 0x0d && thisChar
== 0x0a) {
2792 if (prevChar
== 0x0d ||
2798 // LB 3c Don't break before hard line breaks
2799 if (thisChar
== 0x0d || thisChar
== 0x0a || thisChar
== 0x85 ||
2800 fBK
->contains(thisChar
)) {
2804 // LB 10 QU SP* x OP
2806 UnicodeString
subStr10(*fText
, prevPos
);
2807 fLB10Matcher
->reset(subStr10
);
2808 status
= U_ZERO_ERROR
;
2809 if (fLB10Matcher
->lookingAt(status
)) { // /QU CM* SP* (OP) CM*/;
2810 // TODO: Check status codes
2811 pos
= prevPos
+ fLB10Matcher
->start(1, status
);
2812 nextPos
= prevPos
+ fLB10Matcher
->end(0, status
);
2813 thisChar
= fText
->char32At(pos
);
2818 // LB 11 CL SP* x NS
2820 UnicodeString
subStr11(*fText
, prevPos
);
2821 fLB11Matcher
->reset(subStr11
);
2822 status
= U_ZERO_ERROR
;
2823 if (fLB11Matcher
->lookingAt(status
)) { // /CL CM* SP* (NS) CM*/;
2824 // TODO: Check status codes
2825 pos
= prevPos
+ fLB11Matcher
->start(1, status
);
2826 nextPos
= prevPos
+ fLB11Matcher
->end(0, status
);
2827 thisChar
= fText
->char32At(pos
);
2832 // LB 4 Don't break before spaces or zero-width space.
2833 if (fSP
->contains(thisChar
)) {
2837 if (fZW
->contains(thisChar
)) {
2841 // LB 5 Break after zero width space
2842 if (fZW
->contains(prevChar
)) {
2847 /*int32_t oldpos = pos;*/
2848 rule67Adjust(prevPos
, &prevChar
, &pos
, &thisChar
);
2850 nextCPPos
= fText
->moveIndex32(pos
, 1);
2851 nextPos
= nextCPPos
;
2852 c
= fText
->char32At(nextPos
);
2853 // Another peculiarity of LB 4 - don't break before space
2854 if (fSP
->contains(thisChar
)) {
2857 rule67Adjust(pos
, &thisChar
, &nextPos
, &c
);
2859 // If the loop is still warming up - if we haven't shifted the initial
2860 // -1 positions out of prevPos yet - loop back to advance the
2861 // position in the input without any further looking for breaks.
2862 if (prevPos
== -1) {
2866 // Re-apply rules 3c, 4 because these could be affected by having
2867 // a new thisChar from doing rule 6 or 7.
2868 if (thisChar
== 0x0d || thisChar
== 0x0a || thisChar
== 0x85 || // 3c
2869 fBK
->contains(thisChar
)) {
2872 if (fSP
->contains(thisChar
)) { // LB 4
2875 if (fZW
->contains(thisChar
)) { // LB 4
2880 // LB 8 Don't break before closings.
2881 // NU x CL and NU x IS are not matched here so that they will
2882 // fall into LB 17 and the more general number regular expression.
2884 if (!fNU
->contains(prevChar
) && fCL
->contains(thisChar
) ||
2885 fEX
->contains(thisChar
) ||
2886 !fNU
->contains(prevChar
) && fIS
->contains(thisChar
) ||
2887 !fNU
->contains(prevChar
) && fSY
->contains(thisChar
)) {
2891 // LB 9 Don't break after OP SP*
2892 // Scan backwards, checking for this sequence.
2893 // The OP char could include combining marks, so we actually check for
2895 // Another Twist: The Rule 67 fixes may have changed a CP CM
2896 // sequence into an ID char, so before scanning back through spaces,
2897 // verify that prevChar is indeed a space. The prevChar variable
2898 // may differ from fText[prevPos]
2900 if (fSP
->contains(prevChar
)) {
2901 while (tPos
> 0 && fSP
->contains(fText
->char32At(tPos
))) {
2902 tPos
=fText
->moveIndex32(tPos
, -1);
2905 while (tPos
> 0 && fCM
->contains(fText
->char32At(tPos
))) {
2906 tPos
=fText
->moveIndex32(tPos
, -1);
2908 if (fOP
->contains(fText
->char32At(tPos
))) {
2914 if (fB2
->contains(thisChar
) && fB2
->contains(prevChar
)) {
2921 if (fGL
->contains(thisChar
) || fGL
->contains(prevChar
)) {
2924 if (fWJ
->contains(thisChar
) || fWJ
->contains(prevChar
)) {
2928 // LB 12 break after space
2929 if (fSP
->contains(prevChar
)) {
2936 if (fQU
->contains(thisChar
) || fQU
->contains(prevChar
)) {
2940 // LB 14a Break around a CB
2941 if (fCB
->contains(thisChar
) || fCB
->contains(prevChar
)) {
2946 if (fBA
->contains(thisChar
) ||
2947 fHY
->contains(thisChar
) ||
2948 fNS
->contains(thisChar
) ||
2949 fBB
->contains(prevChar
) ) {
2954 if (fAL
->contains(prevChar
) && fIN
->contains(thisChar
) ||
2955 fID
->contains(prevChar
) && fIN
->contains(thisChar
) ||
2956 fIN
->contains(prevChar
) && fIN
->contains(thisChar
) ||
2957 fNU
->contains(prevChar
) && fIN
->contains(thisChar
) ) {
2962 // LB 17 ID x PO (Note: Leading CM behaves like ID)
2965 if (fID
->contains(prevChar
) && fPO
->contains(thisChar
) ||
2966 fCM
->contains(prevChar
) && fPO
->contains(thisChar
) ||
2967 fAL
->contains(prevChar
) && fNU
->contains(thisChar
) ||
2968 fNU
->contains(prevChar
) && fAL
->contains(thisChar
) ) {
2973 UnicodeString
subStr18(*fText
, prevPos
);
2974 fNumberMatcher
->reset(subStr18
);
2975 if (fNumberMatcher
->lookingAt(status
)) {
2976 // TODO: Check status codes
2977 // Matched a number. But could have been just a single digit, which would
2978 // not represent a "no break here" between prevChar and thisChar
2979 int32_t numEndIdx
= prevPos
+ fNumberMatcher
->end(status
); // idx of first char following num
2980 if (numEndIdx
> pos
) {
2981 // Number match includes at least our two chars being checked
2982 if (numEndIdx
> nextPos
) {
2983 // Number match includes additional chars. Update pos and nextPos
2984 // so that next loop iteration will continue at the end of the number,
2985 // checking for breaks between last char in number & whatever follows.
2986 nextPos
= numEndIdx
;
2987 pos
= fCharBI
->preceding(numEndIdx
);
2988 thisChar
= fText
->char32At(pos
);
2989 while (fCM
->contains(thisChar
)) {
2990 pos
= fCharBI
->preceding(pos
);
2991 thisChar
= fText
->char32At(pos
);
2998 if (fPR
->contains(prevChar
) && fAL
->contains(thisChar
)) {
3002 if (fPR
->contains(prevChar
) && fID
->contains(thisChar
)) {
3007 if (fHY
->contains(prevChar
) || fBB
->contains(thisChar
)) {
3012 if (fAL
->contains(prevChar
) && fAL
->contains(thisChar
)) {
3017 if (fIS
->contains(prevChar
) && fAL
->contains(thisChar
)) {
3021 // LB 20 Break everywhere else
3030 UVector
*RBBILineMonkey::charClasses() {
3035 RBBILineMonkey::~RBBILineMonkey() {
3070 delete fNumberMatcher
;
3071 delete fLB10Matcher
;
3072 delete fLB11Matcher
;
3076 //-------------------------------------------------------------------------------------------
3081 // seed=nnnnn Random number starting seed.
3082 // Setting the seed allows errors to be reproduced.
3083 // loop=nnn Looping count. Controls running time.
3085 // 0 or greater: run length.
3087 // type = char | word | line | sent | title
3089 //-------------------------------------------------------------------------------------------
3091 static int32_t getIntParam(UnicodeString name
, UnicodeString
&params
, int32_t defaultVal
) {
3092 int32_t val
= defaultVal
;
3093 name
.append(" *= *(-?\\d+)");
3094 UErrorCode status
= U_ZERO_ERROR
;
3095 RegexMatcher
m(name
, params
, 0, status
);
3097 // The param exists. Convert the string to an int.
3098 char valString
[100];
3099 int32_t paramLength
= m
.end(1, status
) - m
.start(1, status
);
3100 if (paramLength
>= (int32_t)(sizeof(valString
)-1)) {
3101 paramLength
= (int32_t)(sizeof(valString
)-2);
3103 params
.extract(m
.start(1, status
), paramLength
, valString
, sizeof(valString
));
3104 val
= strtol(valString
, NULL
, 10);
3106 // Delete this parameter from the params string.
3108 params
= m
.replaceFirst("", status
);
3110 U_ASSERT(U_SUCCESS(status
));
3115 static void testBreakBoundPreceding(RBBITest
*test
, UnicodeString ustr
,
3124 for (i
= bi
->first(); i
!= BreakIterator::DONE
; i
= bi
->next()) {
3126 if (count
< expectedcount
&& expected
[count
] != i
) {
3127 test
->errln("break forward test failed: expected %d but got %d",
3128 expected
[count
], i
);
3133 if (count
!= expectedcount
) {
3134 printStringBreaks(ustr
, expected
, expectedcount
);
3135 test
->errln("break test failed: missed %d match",
3136 expectedcount
- count
);
3139 // testing boundaries
3140 for (i
= 1; i
< expectedcount
; i
++) {
3141 int j
= expected
[i
- 1];
3142 if (!bi
->isBoundary(j
)) {
3143 printStringBreaks(ustr
, expected
, expectedcount
);
3144 test
->errln("Expected boundary at position %d", j
);
3147 for (j
= expected
[i
- 1] + 1; j
< expected
[i
]; j
++) {
3148 if (bi
->isBoundary(j
)) {
3149 printStringBreaks(ustr
, expected
, expectedcount
);
3150 test
->errln("Not expecting boundary at position %d", j
);
3156 for (i
= bi
->last(); i
!= BreakIterator::DONE
; i
= bi
->previous()) {
3158 if (forward
[count
] != i
) {
3159 test
->errln("happy break test reverse failed: expected %d but got %d",
3165 printStringBreaks(ustr
, expected
, expectedcount
);
3166 test
->errln("happy break test failed: missed a match");
3170 // testing preceding
3171 for (i
= 0; i
< expectedcount
- 1; i
++) {
3172 int j
= expected
[i
] + 1;
3173 for (; j
<= expected
[i
+ 1]; j
++) {
3174 if (bi
->preceding(j
) != expected
[i
]) {
3175 printStringBreaks(ustr
, expected
, expectedcount
);
3176 test
->errln("Not expecting backwards boundary at position %d", j
);
3183 void RBBITest::TestWordBreaks(void)
3185 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3187 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3188 Locale
locale("en");
3189 UErrorCode status
= U_ZERO_ERROR
;
3190 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3191 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
3193 static const char *strlist
[] =
3195 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3196 "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
3197 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u179c\\u0027\\U000e0061\\u003a",
3198 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3199 "\\u90ca\\u3588\\u009c\\u0953\\u194b",
3200 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3201 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3202 "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
3203 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3204 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3205 "\\u2027\\U000e0067\\u0a47\\u00b7",
3206 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3207 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3208 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3209 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3210 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3211 "\\u0027\\u11af\\U000e0057\\u0602",
3212 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3213 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3214 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3215 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3216 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3217 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3218 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3219 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3220 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3221 "\\u58f4\\U000e0049\\u20e7\\u2027",
3222 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3223 "\\ua183\\u102d\\u0bec\\u003a",
3224 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3225 "\\u003a\\u0e57\\u0fad\\u002e",
3226 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3227 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3228 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3229 "\\u003a\\u0664\\u00b7\\u1fba",
3230 "\\u003b\\u0027\\u00b7\\u47a3",
3231 "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
3232 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3233 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3236 if (U_FAILURE(status
)) {
3237 errln("Creation of break iterator failed %s", u_errorName(status
));
3240 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
3241 // printf("looping %d\n", loop);
3242 u_unescape(strlist
[loop
], str
, 25);
3243 UnicodeString
ustr(str
);
3244 // RBBICharMonkey monkey;
3245 RBBIWordMonkey monkey
;
3248 int expectedcount
= 0;
3250 monkey
.setText(ustr
);
3252 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
3253 expected
[expectedcount
++] = i
;
3256 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
3262 void RBBITest::TestWordBoundary(void)
3264 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3265 Locale
locale("en");
3266 UErrorCode status
= U_ZERO_ERROR
;
3267 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3268 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
3270 static const char *strlist
[] =
3272 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3273 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3274 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3275 "\\u2027\\U000e0067\\u0a47\\u00b7",
3276 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3277 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3278 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3279 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3280 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3281 "\\u0027\\u11af\\U000e0057\\u0602",
3282 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3283 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3284 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3285 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3286 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3287 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3288 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3289 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3290 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3291 "\\u58f4\\U000e0049\\u20e7\\u2027",
3292 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3293 "\\ua183\\u102d\\u0bec\\u003a",
3294 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3295 "\\u003a\\u0e57\\u0fad\\u002e",
3296 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3297 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3298 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3299 "\\u003a\\u0664\\u00b7\\u1fba",
3300 "\\u003b\\u0027\\u00b7\\u47a3",
3303 if (U_FAILURE(status
)) {
3304 errln("Creation of break iterator failed %s", u_errorName(status
));
3307 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
3308 // printf("looping %d\n", loop);
3309 u_unescape(strlist
[loop
], str
, 20);
3310 UnicodeString
ustr(str
);
3317 for (i
= bi
->first(); i
!= BreakIterator::DONE
; i
= bi
->next()) {
3318 forward
[count
++] = i
;
3321 for (j
= prev
+ 1; j
< i
; j
++) {
3322 if (bi
->isBoundary(j
)) {
3323 printStringBreaks(ustr
, forward
, count
);
3324 errln("happy boundary test failed: expected %d not a boundary",
3330 if (!bi
->isBoundary(i
)) {
3331 printStringBreaks(ustr
, forward
, count
);
3332 errln("happy boundary test failed: expected %d a boundary",
// TestLineBreaks
//
// Runs the "en" line BreakIterator over a fixed table of escaped test strings.
// For every string the expected boundaries are produced by an RBBILineMonkey
// (monkey.next() starting at 0 until BreakIterator::DONE) and are then passed,
// together with the iterator, to testBreakBoundPreceding().
// The body is only built when regular expressions are configured in.
3342 void RBBITest::TestLineBreaks(void)
3344 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3345 Locale
locale("en");
3346 UErrorCode status
= U_ZERO_ERROR
;
3347 BreakIterator
*bi
= BreakIterator::createLineInstance(locale
, status
);
// Test data. The strings are stored with escape sequences and are converted
// with u_unescape() inside the loop below.
3349 static const char *strlist
[] =
3351 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3352 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3353 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3354 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3355 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3356 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
3357 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3358 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3359 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3360 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3361 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3362 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3363 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3364 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3365 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3366 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3367 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3368 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3369 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3370 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3371 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3372 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
3373 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3374 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3375 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3376 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3377 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3378 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
3379 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3380 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3381 "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
3382 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3383 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3384 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3385 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3386 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3387 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
// Report a failure to create the line break iterator.
3390 if (U_FAILURE(status
)) {
3391 errln("Creation of break iterator failed %s", u_errorName(status
));
// One pass per table entry; the entry count is derived from the array size.
3394 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
3395 // printf("looping %d\n", loop);
// NOTE(review): several entries above decode to 20 or more UTF-16 code units
// (entries with 20 escapes, some of them supplementary-plane characters), so a
// destination capacity of 20 leaves no room for a terminating NUL before the
// buffer is handed to the UnicodeString constructor. The declaration of `str`
// is not visible here - confirm its size and whether truncation is intended.
3396 u_unescape(strlist
[loop
], str
, 20);
3397 UnicodeString
ustr(str
);
3398 RBBILineMonkey monkey
;
3401 int expectedcount
= 0;
3403 monkey
.setText(ustr
);
// Collect the monkey's break positions, starting with position 0, as the
// expected boundaries for this string.
3405 for (i
= 0; i
!= BreakIterator::DONE
; i
= monkey
.next(i
)) {
3406 expected
[expectedcount
++] = i
;
// Hand the expected boundaries and the real iterator to the shared checker.
3409 testBreakBoundPreceding(this, ustr
, bi
, expected
, expectedcount
);
// TestSentBreaks
//
// Runs the "en" sentence BreakIterator over a fixed table of sample strings.
// For every string the boundaries returned by first()/next() are recorded in
// `forward` and then passed to testBreakBoundPreceding() together with the
// iterator.
3415 void RBBITest::TestSentBreaks(void)
3417 Locale
locale("en");
3418 UErrorCode status
= U_ZERO_ERROR
;
3419 BreakIterator
*bi
= BreakIterator::createSentenceInstance(locale
, status
);
// Test data; passed through u_unescape() in the loop below, so escaped
// sequences such as the paragraph separator in the last entry are decoded.
3421 static const char *strlist
[] =
3423 "Now\ris\nthe\r\ntime\n\rfor\r\r",
3425 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3426 "\"Sentence ending with a quote.\" Bye.",
3427 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
3428 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3429 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3430 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3431 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3432 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
// Report a failure to create the sentence break iterator.
3436 if (U_FAILURE(status
)) {
3437 errln("Creation of break iterator failed %s", u_errorName(status
));
// One pass per table entry; the entry count is derived from the array size.
3440 for (loop
= 0; loop
< (int)(sizeof(strlist
) / sizeof(char *)); loop
++) {
// NOTE(review): the fifth table entry ("Hi! ... (This is it.) ... it.") is 102
// characters long, which exceeds the capacity of 100 passed here. The
// declaration of `str` is not visible - confirm its size and that a
// terminating NUL is guaranteed before it is used to build `ustr`.
3441 u_unescape(strlist
[loop
], str
, 100);
3442 UnicodeString
ustr(str
);
// Record every boundary reported by forward iteration.
3447 for (i
= bi
->first(); i
!= BreakIterator::DONE
; i
= bi
->next()) {
3448 forward
[count
++] = i
;
// Hand the recorded boundaries and the iterator to the shared checker.
3450 testBreakBoundPreceding(this, ustr
, bi
, forward
, count
);
// TestMonkey
//
// Entry point for the randomized ("monkey") break iterator tests.
//
//   params - option string. The options read below are "loop" and "seed"
//            (integers, via getIntParam) and "type", which must be one of
//            char, word, line, sent or title. Anything left in the string
//            after the recognized options are removed is reported as an error.
//
// The break type defaults to "all" and the loop count to 500. The character,
// word and line sections below each create the matching "en" BreakIterator
// and pass it to RunMonkey().
3455 void RBBITest::TestMonkey(char *params
) {
3456 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3458 UErrorCode status
= U_ZERO_ERROR
;
3459 int32_t loopCount
= 500;
3461 UnicodeString breakType
= "all";
3462 Locale
locale("en");
3464 if (quick
== FALSE
) {
// Parse the option string: numeric options first, then the break type.
3469 UnicodeString
p(params
);
3470 loopCount
= getIntParam("loop", p
, loopCount
);
3471 seed
= getIntParam("seed", p
, seed
);
3473 RegexMatcher
m(" *type *= *(char|word|line|sent|title) *", p
, 0, status
);
3475 breakType
= m
.group(1, status
);
// Remove the matched type option from the option string.
3477 p
= m
.replaceFirst("", status
);
3481 if (RegexMatcher("\\S", p
, 0, status
).find()) {
3482 // Each option is stripped out of the option string as it is processed.
3483 // All options have been checked. The option string should have been completely emptied.
3485 p
.extract(buf
, sizeof(buf
), NULL
, status
);
// Force termination in case the extracted text filled the whole buffer.
3486 buf
[sizeof(buf
)-1] = 0;
3487 errln("Unrecognized or extra parameter: %s\n", buf
);
// Character break monkey test.
3493 if (breakType
== "char" || breakType
== "all") {
3495 BreakIterator
*bi
= BreakIterator::createCharacterInstance(locale
, status
);
3496 if (U_SUCCESS(status
)) {
3497 RunMonkey(bi
, m
, "char", seed
, loopCount
);
3500 errln("Creation of character break iterator failed %s", u_errorName(status
));
// Word break monkey test.
3505 if (breakType
== "word" || breakType
== "all") {
3506 logln("Word Break Monkey Test");
3508 BreakIterator
*bi
= BreakIterator::createWordInstance(locale
, status
);
3509 if (U_SUCCESS(status
)) {
3510 RunMonkey(bi
, m
, "word", seed
, loopCount
);
3513 errln("Creation of word break iterator failed %s", u_errorName(status
));
// Line break monkey test.
3518 if (breakType
== "line" || breakType
== "all") {
3519 logln("Line Break Monkey Test");
3521 BreakIterator
*bi
= BreakIterator::createLineInstance(locale
, status
);
// NOTE(review): special handling when no option string was supplied; the
// action taken is not visible here - confirm against the full source.
3522 if (params
== NULL
) {
3525 if (U_SUCCESS(status
)) {
3526 RunMonkey(bi
, m
, "line", seed
, loopCount
);
3529 errln("Creation of line break iterator failed %s", u_errorName(status
));
3539 // Run a RBBI monkey test. Common routine, for all break iterator types.
3541 // bi - the break iterator to use
3542 // mk - MonkeyKind, abstraction for obtaining expected results
3543 // name - Name of test (char, word, etc.) for use in error messages
3544 // seed - Seed for starting random number generator (parameter from user)
// numIterations - number of random strings to test; -1 keeps looping and
//                 prints a progress tick to stderr every 10 iterations.
//
// Each iteration builds a random string of TESTSTRINGLEN code points drawn from
// the character classes of mk, computes the expected break positions with
// mk.next(), and compares them with the positions the iterator reports through
// next(), previous(), isBoundary(), following() and preceding(). The first
// mismatch at a position is reported together with an excerpt of the text in
// the <data>...</data> form used by the rbbi test data file.
3547 void RBBITest::RunMonkey(BreakIterator
*bi
, RBBIMonkeyKind
&mk
, const char *name
, uint32_t seed
, int32_t numIterations
) {
3549 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3551 const int32_t TESTSTRINGLEN
= 500;
3552 UnicodeString testText
;
3553 int32_t numCharClasses
;
// The flag arrays are indexed by UTF-16 offset into testText; they are sized
// for TESTSTRINGLEN code points that may each need two code units, plus one
// slot for the end-of-text position.
3555 int expected
[TESTSTRINGLEN
*2 + 1];
3556 int expectedCount
= 0;
3557 char expectedBreaks
[TESTSTRINGLEN
*2 + 1];
3558 char forwardBreaks
[TESTSTRINGLEN
*2 + 1];
3559 char reverseBreaks
[TESTSTRINGLEN
*2+1];
3560 char isBoundaryBreaks
[TESTSTRINGLEN
*2+1];
3561 char followingBreaks
[TESTSTRINGLEN
*2+1];
3562 char precedingBreaks
[TESTSTRINGLEN
*2+1];
3568 numCharClasses
= mk
.charClasses()->size();
3569 chClasses
= mk
.charClasses();
3571 // Check for errors that occurred during the construction of the MonkeyKind object.
3572 // Can't report them where they occurred because errln() is a method coming from intlTest,
3573 // and is not visible outside of RBBITest :-(
3574 if (U_FAILURE(mk
.deferredStatus
)) {
3575 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk
.deferredStatus
));
3579 // Verify that the character classes all have at least one member.
3580 for (i
=0; i
<numCharClasses
; i
++) {
3581 UnicodeSet
*s
= (UnicodeSet
*)chClasses
->elementAt(i
);
3582 if (s
== NULL
|| s
->size() == 0) {
3583 errln("Character Class #%d is null or of zero size.", i
);
3588 while (loopCount
< numIterations
|| numIterations
== -1) {
3589 if (numIterations
== -1 && loopCount
% 10 == 0) {
3590 // If test is running in an infinite loop, display a periodic tic so
3591 // we can tell that it is making progress.
3592 fprintf(stderr
, ".");
3594 // Save current random number seed, so that we can recreate the random numbers
3595 // for this loop iteration in event of an error.
3598 // Populate a test string with data.
3599 testText
.truncate(0);
3600 for (i
=0; i
<TESTSTRINGLEN
; i
++) {
// Pick a random character class, then a random member of that class.
3601 int32_t aClassNum
= m_rand() % numCharClasses
;
3602 UnicodeSet
*classSet
= (UnicodeSet
*)chClasses
->elementAt(aClassNum
);
3603 int32_t charIdx
= m_rand() % classSet
->size();
3604 UChar32 c
= classSet
->charAt(charIdx
);
3605 if (c
< 0) { // TODO: deal with sets containing strings.
3611 // Calculate the expected results for this test string.
3612 mk
.setText(testText
);
3613 memset(expectedBreaks
, 0, sizeof(expectedBreaks
));
3614 expectedBreaks
[0] = 1;
3615 int32_t breakPos
= 0;
3618 breakPos
= mk
.next(breakPos
);
3619 if (breakPos
== -1) {
3622 if (breakPos
> testText
.length()) {
3623 errln("breakPos > testText.length()");
// NOTE(review): as far as the visible lines show, the store below still runs
// after the range error above - confirm that an out-of-range breakPos cannot
// index past the end of expectedBreaks.
3625 expectedBreaks
[breakPos
] = 1;
3626 expected
[expectedCount
++] = breakPos
;
3629 // Find the break positions using forward iteration
3630 memset(forwardBreaks
, 0, sizeof(forwardBreaks
));
3631 bi
->setText(testText
);
3632 for (i
=bi
->first(); i
!= BreakIterator::DONE
; i
=bi
->next()) {
3633 if (i
< 0 || i
> testText
.length()) {
3634 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name
);
3637 forwardBreaks
[i
] = 1;
3640 // Find the break positions using reverse iteration
3641 memset(reverseBreaks
, 0, sizeof(reverseBreaks
));
3642 for (i
=bi
->last(); i
!= BreakIterator::DONE
; i
=bi
->previous()) {
3643 if (i
< 0 || i
> testText
.length()) {
// This loop walks backwards, so the offending call is previous().
3644 errln("%s break monkey test: Out of range value returned by breakIterator::previous()", name
);
3647 reverseBreaks
[i
] = 1;
3650 // Find the break positions using isBoundary() tests.
3651 memset(isBoundaryBreaks
, 0, sizeof(isBoundaryBreaks
));
3652 U_ASSERT(sizeof(isBoundaryBreaks
) > testText
.length());
3653 for (i
=0; i
<=testText
.length(); i
++) {
3654 isBoundaryBreaks
[i
] = bi
->isBoundary(i
);
3658 // Find the break positions using the following() function.
3660 memset(followingBreaks
, 0, sizeof(followingBreaks
));
3661 int32_t lastBreakPos
= 0;
3662 followingBreaks
[0] = 1;
3663 for (i
=0; i
<testText
.length(); i
++) {
3664 breakPos
= bi
->following(i
);
// The result must lie strictly after i, must not move backwards, must stay
// inside the text, and must not skip over a break that is still ahead of i.
3665 if (breakPos
<= i
||
3666 breakPos
< lastBreakPos
||
3667 breakPos
> testText
.length() ||
3668 breakPos
> lastBreakPos
&& lastBreakPos
> i
) {
3669 errln("%s break monkey test: "
3670 "Out of range value returned by BreakIterator::following().\n"
3671 "Random seed=%d", name
, seed
);
3674 followingBreaks
[breakPos
] = 1;
3675 lastBreakPos
= breakPos
;
3678 // Find the break positions using the preceding() function.
// Clear the array that is about to be filled, sized by that same array.
3679 memset(precedingBreaks
, 0, sizeof(precedingBreaks
));
3680 lastBreakPos
= testText
.length();
3681 precedingBreaks
[testText
.length()] = 1;
3682 for (i
=testText
.length(); i
>0; i
--) {
3683 breakPos
= bi
->preceding(i
);
// Mirror image of the following() checks, walking from the end of the text.
3684 if (breakPos
>= i
||
3685 breakPos
> lastBreakPos
||
3687 breakPos
< lastBreakPos
&& lastBreakPos
< i
) {
3688 errln("%s break monkey test: "
3689 "Out of range value returned by BreakIterator::preceding().\n"
3690 "index=%d; prev returned %d; lastBreak=%d" ,
3691 name
, i
, breakPos
, lastBreakPos
);
3692 precedingBreaks
[i
] = 2; // Forces an error.
3694 precedingBreaks
[breakPos
] = 1;
3695 lastBreakPos
= breakPos
;
3699 // Compare the expected and actual results.
3700 for (i
=0; i
<=testText
.length(); i
++) {
// errorType names the first access method that disagrees at position i.
3701 const char *errorType
= NULL
;
3702 if (forwardBreaks
[i
] != expectedBreaks
[i
]) {
3703 errorType
= "next()";
3704 } else if (reverseBreaks
[i
] != forwardBreaks
[i
]) {
3705 errorType
= "previous()";
3706 } else if (isBoundaryBreaks
[i
] != expectedBreaks
[i
]) {
3707 errorType
= "isBoundary()";
3708 } else if (followingBreaks
[i
] != expectedBreaks
[i
]) {
3709 errorType
= "following()";
3710 } else if (precedingBreaks
[i
] != expectedBreaks
[i
]) {
3711 errorType
= "preceding()";
3715 if (errorType
!= NULL
) {
3716 // Format a range of the test text that includes the failure as
3717 // a data item that can be included in the rbbi test data file.
3719 // Start of the range is the last point where expected and actual results
3720 // both agreed that there was a break position.
3721 int startContext
= i
;
3724 if (startContext
==0) { break; }
3726 if (expectedBreaks
[startContext
] != 0) {
3727 if (count
== 2) break;
3732 // End of range is two expected breaks past the start position.
3733 int endContext
= i
+ 1;
3735 for (ci
=0; ci
<2; ci
++) { // Number of items to include in error text.
3737 if (endContext
>= testText
.length()) {break;}
3738 if (expectedBreaks
[endContext
-1] != 0) {
3739 if (count
== 0) break;
3746 // Format looks like "<data><>\uabcd\uabcd<>\U0001abcd...</data>"
3747 UnicodeString errorText
= "<data>";
3748 /***if (strcmp(errorType, "next()") == 0) {
3750 endContext = testText.length();
3752 printStringBreaks(testText, expected, expectedCount);
3755 for (ci
=startContext
; ci
<endContext
;) {
3756 UnicodeString
hexChars("0123456789abcdef");
3759 c
= testText
.char32At(ci
);
3761 // This is the location of the error.
3762 errorText
.append("<?>");
3763 } else if (expectedBreaks
[ci
] != 0) {
3764 // This is a non-error expected break position.
3765 errorText
.append("<>");
3768 errorText
.append("\\u");
3769 for (bn
=12; bn
>=0; bn
-=4) {
3770 errorText
.append(hexChars
.charAt((c
>>bn
)&0xf));
3773 errorText
.append("\\U");
3774 for (bn
=28; bn
>=0; bn
-=4) {
3775 errorText
.append(hexChars
.charAt((c
>>bn
)&0xf));
3778 ci
= testText
.moveIndex32(ci
, 1);
3780 errorText
.append("<>");
3781 errorText
.append("</data>\n");
3784 char charErrorTxt
[500];
3785 UErrorCode status
= U_ZERO_ERROR
;
3786 errorText
.extract(charErrorTxt
, sizeof(charErrorTxt
), NULL
, status
);
3787 charErrorTxt
[sizeof(charErrorTxt
)-1] = 0;
3788 errln("%s break monkey test error. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
3789 name
, (expectedBreaks
[i
]? "break expected but not found" : "break found but not expected"),
3790 errorType
, seed
, i
, charErrorTxt
);
3801 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */