]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/rbbitst.cpp
ICU-6.2.8.tar.gz
[apple/icu.git] / icuSources / test / intltest / rbbitst.cpp
1 /********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 1999-2004, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6 /************************************************************************
7 * Date Name Description
8 * 12/15/99 Madhu Creation.
9 * 01/12/2000 Madhu Updated for changed API and added new tests
10 ************************************************************************/
11
12 #include "unicode/utypes.h"
13
14 #if !UCONFIG_NO_BREAK_ITERATION
15
16 #include "unicode/utypes.h"
17 #include "unicode/brkiter.h"
18 #include "unicode/rbbi.h"
19 #include "unicode/uchar.h"
20 #include "unicode/utf16.h"
21 #include "unicode/ucnv.h"
22 #include "unicode/schriter.h"
23 #include "unicode/uniset.h"
24 #include "unicode/regex.h" // TODO: make conditional on regexp being built.
25 #include "unicode/ustring.h"
26
27 #include "intltest.h"
28 #include "rbbitst.h"
29 #include <string.h>
30 #include "uvector.h"
31 #include "uvectr32.h"
32 #include <string.h>
33 #include <stdio.h>
34 #include <stdlib.h>
35
36
37
38 //---------------------------------------------------------------------------
39 //
40 // class BITestData Holds a set of Break iterator test data and results
41 // Includes
42 // - the string data to be broken
43 // - a vector of the expected break positions.
44 // - a vector of source line numbers for the data,
45 // (to help see where errors occurred.)
46 // - The expected break tag values.
47 // - Vectors of actual break positions and tag values.
48 // - Functions for comparing actual with expected and
49 // reporting errors.
50 //
51 //----------------------------------------------------------------------------
52 class BITestData {
53 public:
54 UnicodeString fDataToBreak;
55 UVector fExpectedBreakPositions;
56 UVector fExpectedTags;
57 UVector fLineNum;
58 UVector fActualBreakPositions; // Test Results.
59 UVector fActualTags;
60
61 BITestData(UErrorCode &status);
62 void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
63 void checkResults(const char *heading, RBBITest *test);
64 void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
65 void clearResults();
66 };
67
68 //
69 // Constructor.
70 //
71 BITestData::BITestData(UErrorCode &status)
72 : fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status),
73 fActualTags(status)
74 {
75 };
76
77 //
78 // addDataChunk. Add a section (non-breaking) piece if data to the test data.
79 // The macro form collects the line number, which is helpful
80 // when tracking down failures.
81 //
82 // A null data item is inserted at the start of each test's data
83 // to put the starting zero into the data list. The position saved for
84 // each non-null item is its ending position.
85 //
86 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status);
87 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
88 if (U_FAILURE(status)) {return;}
89 if (data != NULL) {
90 fDataToBreak.append(CharsToUnicodeString(data));
91 }
92 fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
93 fExpectedTags.addElement(tag, status);
94 fLineNum.addElement(lineNum, status);
95 };
96
97
98 //
99 // checkResults. Compare the actual and expected break positions, report any differences.
100 //
101 void BITestData::checkResults(const char *heading, RBBITest *test) {
102 int32_t expectedIndex = 0;
103 int32_t actualIndex = 0;
104
105 for (;;) {
106 // If we've run through both the expected and actual results vectors, we're done.
107 // break out of the loop.
108 if (expectedIndex >= fExpectedBreakPositions.size() &&
109 actualIndex >= fActualBreakPositions.size()) {
110 break;
111 }
112
113
114 if (expectedIndex >= fExpectedBreakPositions.size()) {
115 err(heading, test, expectedIndex-1, actualIndex);
116 actualIndex++;
117 continue;
118 }
119
120 if (actualIndex >= fActualBreakPositions.size()) {
121 err(heading, test, expectedIndex, actualIndex-1);
122 expectedIndex++;
123 continue;
124 }
125
126 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
127 err(heading, test, expectedIndex, actualIndex);
128 // Try to resync the positions of the indices, to avoid a rash of spurious erros.
129 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
130 actualIndex++;
131 } else {
132 expectedIndex++;
133 }
134 continue;
135 }
136
137 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
138 test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",
139 heading, fLineNum.elementAt(expectedIndex),
140 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
141 }
142
143 actualIndex++;
144 expectedIndex++;
145 }
146 }
147
148 //
149 // err - An error was found. Report it, along with information about where the
150 // incorrectly broken test data appeared in the source file.
151 //
152 void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
153 {
154 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx);
155 int32_t actual = fActualBreakPositions.elementAti(actualIdx);
156 int32_t o = 0;
157 int32_t line = fLineNum.elementAti(expectedIdx);
158 if (expectedIdx > 0) {
159 // The line numbers are off by one because a premature break occurs somewhere
160 // within the previous item, rather than at the start of the current (expected) item.
161 // We want to report the offset of the unexpected break from the start of
162 // this previous item.
163 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
164 }
165 if (actual < expected) {
166 test->errln("%s unexpected break at offset %d in test item from line %d", heading, o, line);
167 } else {
168 test->errln("%s Failed to find break at end of item from line %d", heading, line);
169 }
170 }
171
172
173 void BITestData::clearResults() {
174 fActualBreakPositions.removeAllElements();
175 fActualTags.removeAllElements();
176 }
177
178
179 //-----------------------------------------------------------------------------------
180 //
181 // Canned Test Characters
182 //
183 //-----------------------------------------------------------------------------------
184
// Fixed set of test characters.  The final 0x0000 terminates the array; the
// RBBITest constructor copies it into cannedTestChars.
static const UChar cannedTestArray[] = {
    0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,
    0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,
    0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,
    0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,
    0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,
    0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,
    0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,
    0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000
};

// Heap-allocated string built by the RBBITest constructor (a leading U+0000
// followed by the canned characters) and deleted by the RBBITest destructor.
static UnicodeString* cannedTestChars = 0;
197
// Escaped Devanagari sequences for building test strings.  Each is a consonant
// followed by U+094D (virama); the half forms also append U+200D (zero width joiner).
// The backslashes are doubled so the escapes reach the string as text
// (presumably to be unescaped later, e.g. by CharsToUnicodeString -- confirm at the use sites).
#define  halfNA     "\\u0928\\u094d\\u200d"
#define  halfSA     "\\u0938\\u094d\\u200d"
#define  halfCHA    "\\u091a\\u094d\\u200d"
#define  halfKA     "\\u0915\\u094d\\u200d"
#define  deadTA     "\\u0924\\u094d"
203
204 //--------------------------------------------------------------------------------------
205 //
206 // RBBITest constructor and destructor
207 //
208 //--------------------------------------------------------------------------------------
209
210 RBBITest::RBBITest() {
211 UnicodeString temp(cannedTestArray);
212 cannedTestChars = new UnicodeString();
213 *cannedTestChars += (UChar)0x0000;
214 *cannedTestChars += temp;
215 }
216
217
218 RBBITest::~RBBITest() {
219 delete cannedTestChars;
220 }
221
222
// Tag (rule status) values that tests can expect at a boundary.
// NOTE(review): these look like the word-break status ranges from ubrk.h
// (number / letter / kana / ideograph) -- confirm against that header.
static const int T_NUMBER = 100;
static const int T_LETTER = 200;
static const int T_H_OR_K = 300;
static const int T_IDEO   = 400;
227
228
229
230
231
232
233 //--------------------------------------------------------------------
234 //Testing the BreakIterator for devanagari script
235 //--------------------------------------------------------------------
236
// Escaped Devanagari sequences.  A "dead" form is a consonant followed by the
// virama (U+094D).
#define deadRA   "\\u0930\\u094d"         /*deadform RA = devanagari RA + virama*/
#define deadPHA  "\\u092b\\u094d"         /*deadform PHA = devanagari PHA + virama*/
#define deadTTHA "\\u0920\\u094d"
#define deadPA   "\\u092a\\u094d"
#define deadSA   "\\u0938\\u094d"
#define visarga  "\\u0903"                /*devanagari visarga looks like a english colon*/
243
244
245
246
247
248
249 //-----------------------------------------------------------------------------------
250 //
251 // Test for status {tag} return value from break rules.
252 // TODO: a more thorough test.
253 //
254 //-----------------------------------------------------------------------------------
255 void RBBITest::TestStatusReturn() {
256 UnicodeString rulesString1 = "$Letters = [:L:];\n"
257 "$Numbers = [:N:];\n"
258 "$Letters+{1};\n"
259 "$Numbers+{2};\n"
260 "Help\\ {4}/me\\!;\n"
261 "[^$Letters $Numbers];\n"
262 "!.*;\n";
263 UnicodeString testString1 = "abc123..abc Help me Help me!";
264 // 01234567890123456789012345678
265 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
266 int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
267
268 UErrorCode status=U_ZERO_ERROR;
269 UParseError parseError;
270
271 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
272 if(U_FAILURE(status)) {
273 errln("FAIL : in construction");
274 } else {
275 int32_t pos;
276 int32_t i = 0;
277 bi->setText(testString1);
278 for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
279 if (pos != bounds1[i]) {
280 errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos);
281 break;
282 }
283
284 int tag = bi->getRuleStatus();
285 if (tag != brkStatus[i]) {
286 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
287 break;
288 }
289 i++;
290 }
291 }
292 delete bi;
293 }
294
295
296 static void printStringBreaks(UnicodeString ustr, int expected[],
297 int expectedcount)
298 {
299 UErrorCode status = U_ZERO_ERROR;
300 char name[100];
301 printf("code alpha extend alphanum type line name\n");
302 int j;
303 for (j = 0; j < ustr.length(); j ++) {
304 if (expectedcount > 0) {
305 int k;
306 for (k = 0; k < expectedcount; k ++) {
307 if (j == expected[k]) {
308 printf("------------------------------------------------ %d\n",
309 j);
310 }
311 }
312 }
313 UChar32 c = ustr.char32At(j);
314 if (c > 0xffff) {
315 j ++;
316 }
317 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
318 printf("%7x %5d %6d %8d %4s %4s %s\n", (int)c,
319 u_isUAlphabetic(c),
320 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
321 u_isalnum(c),
322 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
323 u_charType(c),
324 U_SHORT_PROPERTY_NAME),
325 u_getPropertyValueName(UCHAR_LINE_BREAK,
326 u_getIntPropertyValue(c,
327 UCHAR_LINE_BREAK),
328 U_SHORT_PROPERTY_NAME),
329 name);
330 }
331 }
332
333 void RBBITest::TestThaiLineBreak() {
334 UErrorCode status = U_ZERO_ERROR;
335 BITestData thaiLineSelection(status);
336
337 // \u0e2f-- the Thai paiyannoi character-- isn't a letter. It's a symbol that
338 // represents elided letters at the end of a long word. It should be bound to
339 // the end of the word and not treated as an independent punctuation mark.
340
341
342 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data
343 ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status);
344 ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status);
345 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status);
346 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status);
347 // ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status);
348 // ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
349 ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status);
350 // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us
351 ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status);
352 ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status);
353 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status);
354 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status);
355 ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status);
356 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status);
357
358 // the one time where the paiyannoi occurs somewhere other than at the end
359 // of a word is in the Thai abbrevation for "etc.", which both begins and
360 // ends with a paiyannoi
361 ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status);
362 ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
363 ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status);
364
365 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
366 Locale("th"), status);
367 if (U_FAILURE(status))
368 {
369 errln("Failed to create the BreakIterator for Thai locale in TestThaiLineBreak.\n");
370 return;
371 }
372
373 generalIteratorTest(*e, thaiLineSelection);
374 delete e;
375 }
376
377
378
379 void RBBITest::TestMixedThaiLineBreak()
380 {
381 UErrorCode status = U_ZERO_ERROR;
382 BITestData thaiLineSelection(status);
383
384 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data
385
386 // Arabic numerals should always be separated from surrounding Thai text
387 /*
388 ADD_DATACHUNK(thaiLineSelection, "\\u0e04\\u0e48\\u0e32", 0, status);
389 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e07\\u0e34\\u0e19", 0, status);
390 ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17", 0, status);
391 ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e15\\u0e30", 0, status);
392 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e31\\u0e1a", 0, status);
393 thaiLineSelection->addElement("39");
394 ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17 ", 0, status);
395
396 // words in non-Thai scripts should always be separated from surrounding Thai text
397 ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e14", 0, status);
398 ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e2d\\u0e1a", 0, status);
399 thaiLineSelection->addElement("Java");
400 ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e19", 0, status);
401 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e04\\u0e23\\u0e37\\u0e48\\u0e2d\\u0e07", 0, status);
402 ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e2d\\u0e1a\\u0e35\\u0e40\\u0e2d\\u0e47\\u0e21 ", 0, status);
403
404 // Thai numerals should always be separated from the text surrounding them
405 ADD_DATACHUNK(thaiLineSelection, "\\u0e04\\u0e48\\u0e32", 0, status);
406 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e07\\u0e34\\u0e19", 0, status);
407 ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17", 0, status);
408 ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e15\\u0e30", 0, status);
409 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e31\\u0e1a", 0, status);
410 ADD_DATACHUNK(thaiLineSelection, "\\u0e53\\u0e59", 0, status);
411 ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17 ", 0, status);
412
413 // Thai text should interact correctly with punctuation and symbols
414 ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e2d\\u0e1a\\u0e35\\u0e40\\u0e2d\\u0e47\\u0e21", 0, status);
415 // ADD_DATACHUNK(thaiLineSelection, "(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28", 0, status);
416 // ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e17\\u0e22)", 0, status);
417 ADD_DATACHUNK(thaiLineSelection, "(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28\\u0e44\\u0e17\\u0e22)", 0, status);
418 // I believe the commented-out reading above to be the correct one, but this is what passes with our current dictionary
419 ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e33\\u0e01\\u0e31\\u0e14", 0, status);
420 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e1b\\u0e34\\u0e14", 0, status);
421 ADD_DATACHUNK(thaiLineSelection, "\\u0e15\\u0e31\\u0e27\"", 0, status);
422 */
423
424 // The Unicode Linebreak TR says do not break before or after quotes.
425 // So this test is changed ot not break around the quote.
426 // TODO: should Thai break around the around the quotes, like the original behavior here?
427 // ADD_DATACHUNK(thaiLineSelection, "\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\"", 0, status);
428 // ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e38\\u0e48\\u0e19", 0, status);
429 ADD_DATACHUNK(thaiLineSelection, "\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\""
430 "\\u0e23\\u0e38\\u0e48\\u0e19", 0, status);
431
432 ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
433 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e14\\u0e37\\u0e2d\\u0e19\\u0e21\\u0e34.", 0, status);
434 ADD_DATACHUNK(thaiLineSelection, "\\u0e22.", 0, status);
435 ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e35\\u0e49", 0, status);
436 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e32\\u0e04\\u0e32", 0, status);
437 ADD_DATACHUNK(thaiLineSelection, "$200", 0, status);
438 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e48\\u0e32", 0, status);
439 ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19 ", 0, status);
440 ADD_DATACHUNK(thaiLineSelection, "(\"\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\").", 0, status);
441
442 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status);
443 if (U_FAILURE(status))
444 {
445 errln("Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak.\n");
446 return;
447 }
448
449
450 generalIteratorTest(*e, thaiLineSelection);
451 delete e;
452 }
453
454
455 void RBBITest::TestMaiyamok()
456 {
457 UErrorCode status = U_ZERO_ERROR;
458 BITestData thaiLineSelection(status);
459 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data
460 // the Thai maiyamok character is a shorthand symbol that means "repeat the previous
461 // word". Instead of appearing as a word unto itself, however, it's kept together
462 // with the word before it
463 ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status);
464 ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status);
465 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status);
466 ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07\\u0e40\\u0e17\\u0e1e", 0, status);
467 ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status);
468 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35\\u0e22\\u0e07", 0, status);
469 ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
470
471 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
472 Locale("th"), status);
473
474 if (U_FAILURE(status))
475 {
476 errln("Failed to create the BreakIterator for Thai locale in TestMaiyamok.\n");
477 return;
478 }
479 generalIteratorTest(*e, thaiLineSelection);
480 delete e;
481 }
482
483 void RBBITest::TestThaiWordBreak() {
484 UErrorCode status = U_ZERO_ERROR;
485 BITestData thaiWordSelection(status);
486
487 ADD_DATACHUNK(thaiWordSelection, NULL, 0, status); // Break at start of data
488 ADD_DATACHUNK(thaiWordSelection, "\\u0E1A\\u0E17", 0, status); //2
489 ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E35\\u0E48", 0, status); //5
490 ADD_DATACHUNK(thaiWordSelection, "\\u0E51", 0, status); //6
491 ADD_DATACHUNK(thaiWordSelection, "\\u0E1E\\u0E32\\u0E22\\u0E38", 0, status); //10
492 ADD_DATACHUNK(thaiWordSelection, "\\u0E44\\u0E0B\\u0E42\\u0E04\\u0E25\\u0E19", 0, status); //16
493 ADD_DATACHUNK(thaiWordSelection, "\\u000D\\u000A", 0, status); //18
494
495 // This is the correct result
496 //ADD_DATACHUNK(thaiWordSelection, "\\u0E42\\u0E14\\u0E42\\u0E23\\u0E18\\u0E35", 0, status); //24
497 //ADD_DATACHUNK(thaiWordSelection, "\\u0E2D\\u0E32\\u0E28\\u0E31\\u0E22", 0, status); //29
498
499 // and this is what the dictionary does...
500 ADD_DATACHUNK(thaiWordSelection, "\\u0E42\\u0E14", 0, status); // 20
501 ADD_DATACHUNK(thaiWordSelection, "\\u0E42\\u0E23\\u0E18\\u0E35\\u0E2D\\u0E32\\u0E28\\u0E31\\u0E22", 0, status); //29
502
503 ADD_DATACHUNK(thaiWordSelection, "\\u0E2D\\u0E22\\u0E39\\u0E48", 0, status); //33
504
505 // This is the correct result
506 //ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E48\\u0E32\\u0E21", 0, status); //37
507 //ADD_DATACHUNK(thaiWordSelection, "\\u0E01\\u0E25\\u0E32\\u0E07", 0, status); //41
508
509 // and this is what the dictionary does
510 ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E48\\u0E32\\u0E21\\u0E01\\u0E25\\u0E32\\u0E07", 0, status); //41
511
512 ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E38\\u0E48\\u0E07", 0, status); //45
513 ADD_DATACHUNK(thaiWordSelection, "\\u0E43\\u0E2B\\u0E0D\\u0E48", 0, status); //49
514 ADD_DATACHUNK(thaiWordSelection, "\\u0E43\\u0E19", 0, status); //51
515
516 // This is the correct result
517 //ADD_DATACHUNK(thaiWordSelection, "\\u0E41\\u0E04\\u0E19\\u0E0B\\u0E31\\u0E2A", 0, status); //57
518 //ADD_DATACHUNK(thaiWordSelection, "\\u0E01\\u0E31\\u0E1A", 0, status); //60
519
520 // and this is what the dictionary does
521 ADD_DATACHUNK(thaiWordSelection, "\\u0E41\\u0E04\\u0E19", 0, status); // 54
522 ADD_DATACHUNK(thaiWordSelection, "\\u0E0B\\u0E31\\u0E2A\\u0E01\\u0E31\\u0E1A", 0, status); //60
523
524 ADD_DATACHUNK(thaiWordSelection, "\\u0E25\\u0E38\\u0E07", 0, status); //63
525
526 // This is the correct result
527 //ADD_DATACHUNK(thaiWordSelection, "\\u0E40\\u0E2E\\u0E19\\u0E23\\u0E35", 0, status); //68
528 //ADD_DATACHUNK(thaiWordSelection, "\\u0E0A\\u0E32\\u0E27", 0, status); //71
529 //ADD_DATACHUNK(thaiWordSelection, "\\u0E44\\u0E23\\u0E48", 0, status); //74
530 //ADD_DATACHUNK(thaiWordSelection, "\\u0E41\\u0E25\\u0E30", 0, status); //77
531
532 // and this is what the dictionary does
533 ADD_DATACHUNK(thaiWordSelection, "\\u0E40\\u0E2E", 0, status); // 65
534 ADD_DATACHUNK(thaiWordSelection, "\\u0E19\\u0E23\\u0E35\\u0E0A\\u0E32\\u0E27\\u0E44\\u0E23\\u0E48\\u0E41\\u0E25\\u0E30", 0, status); //77
535
536 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(
537 Locale("th"), status);
538 if (U_FAILURE(status))
539 {
540 errln("Failed to create the BreakIterator for Thai locale in TestThaiWordBreak.\n");
541 return;
542 }
543
544 generalIteratorTest(*e, thaiWordSelection);
545 delete e;
546 }
547
548
549 void RBBITest::TestBug3818() {
550 UErrorCode status = U_ZERO_ERROR;
551
552 // Four Thai words...
553 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
554 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
555 UnicodeString thaiStr(thaiWordData);
556
557 RuleBasedBreakIterator* bi =
558 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
559 if (U_FAILURE(status) || bi == NULL) {
560 errln("Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
561 return;
562 }
563 bi->setText(thaiStr);
564
565 int32_t startOfSecondWord = bi->following(1);
566 if (startOfSecondWord != 4) {
567 errln("Fail at file %s, line %d expected start of word at 4, got %d",
568 __FILE__, __LINE__, startOfSecondWord);
569 }
570 startOfSecondWord = bi->following(0);
571 if (startOfSecondWord != 4) {
572 errln("Fail at file %s, line %d expected start of word at 4, got %d",
573 __FILE__, __LINE__, startOfSecondWord);
574 }
575 delete bi;
576 }
577
578
579 void RBBITest::TestJapaneseWordBreak() {
580 UErrorCode status = U_ZERO_ERROR;
581 BITestData japaneseWordSelection(status);
582
583 ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status); // Break at start of data
584 ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2
585 ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5
586 ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7
587 ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10
588 ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11
589 ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12
590
591 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(
592 Locale("ja"), status);
593 if (U_FAILURE(status))
594 {
595 errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");
596 return;
597 }
598
599 generalIteratorTest(*e, japaneseWordSelection);
600 delete e;
601 }
602
603 //---------------------------------------------
604 // runIndexedTest
605 //---------------------------------------------
606
607 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
608 {
609 if (exec) logln("TestSuite RuleBasedBreakIterator: ");
610
611 switch (index) {
612 case 0: name = "TestBug4153072";
613 if(exec) TestBug4153072(); break;
614 case 1: name = "TestJapaneseLineBreak";
615 if(exec) TestJapaneseLineBreak(); break;
616 case 2: name = "TestStatusReturn";
617 if(exec) TestStatusReturn(); break;
618
619 case 3: name = "TestLineBreakData";
620 if(exec) TestLineBreakData(); break;
621 case 4: name = "TestEmptyString";
622 if(exec) TestEmptyString(); break;
623
624 case 5: name = "TestGetAvailableLocales";
625 if(exec) TestGetAvailableLocales(); break;
626
627 case 6: name = "TestGetDisplayName";
628 if(exec) TestGetDisplayName(); break;
629
630 case 7: name = "TestEndBehaviour";
631 if(exec) TestEndBehaviour(); break;
632 case 8: name = "TestMixedThaiLineBreak";
633 if(exec) TestMixedThaiLineBreak(); break;
634 case 9: name = "TestThaiWordBreak";
635 if(exec) TestThaiWordBreak(); break;
636 case 10: name = "TestThaiLineBreak";
637 if(exec) TestThaiLineBreak(); break;
638 case 11: name = "TestMaiyamok";
639 if(exec) TestMaiyamok(); break;
640 case 12: name = "TestWordBreaks";
641 if(exec) TestWordBreaks(); break;
642 case 13: name = "TestWordBoundary";
643 if(exec) TestWordBoundary(); break;
644 case 14: name = "TestLineBreaks";
645 if(exec) TestLineBreaks(); break;
646 case 15: name = "TestSentBreaks";
647 if(exec) TestSentBreaks(); break;
648 case 16: name = "TestExtended";
649 if(exec) TestExtended(); break;
650 case 17: name = "TestMonkey";
651 if(exec) {
652 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
653 TestMonkey(params);
654 #else
655 logln("skipping TestMonkey (UCONFIG_NO_REGULAR_EXPRESSIONS)");
656 #endif
657 }
658 break;
659 case 18: name = "TestBug3818";
660 if(exec) TestBug3818(); break;
661 case 19: name = "TestJapaneseWordBreak";
662 if(exec) TestJapaneseWordBreak(); break;
663
664 default: name = ""; break; //needed to end loop
665 }
666 }
667
668
669 //----------------------------------------------------------------------------
670 //
671 // generalIteratorTest Given a break iterator and a set of test data,
672 // Run the tests and report the results.
673 //
674 //----------------------------------------------------------------------------
675 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
676 {
677
678 bi.setText(td.fDataToBreak);
679
680 testFirstAndNext(bi, td);
681
682 testLastAndPrevious(bi, td);
683
684 testFollowing(bi, td);
685 testPreceding(bi, td);
686 testIsBoundary(bi, td);
687 doMultipleSelectionTest(bi, td);
688 }
689
690
691 //
692 // testFirstAndNext. Run the iterator forwards in the obvious first(), next()
693 // kind of loop.
694 //
695 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
696 {
697 UErrorCode status = U_ZERO_ERROR;
698 int32_t p;
699 int32_t lastP = -1;
700 int32_t tag;
701
702 logln("Test first and next");
703 bi.setText(td.fDataToBreak);
704 td.clearResults();
705
706 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
707 td.fActualBreakPositions.addElement(p, status); // Save result.
708 tag = bi.getRuleStatus();
709 td.fActualTags.addElement(tag, status);
710 if (p <= lastP) {
711 // If the iterator is not making forward progress, stop.
712 // No need to raise an error here, it'll be detected in the normal check of results.
713 break;
714 }
715 lastP = p;
716 }
717 td.checkResults("testFirstAndNext", this);
718 }
719
720
721 //
722 // TestLastAndPrevious. Run the iterator backwards, starting with last().
723 //
724 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td)
725 {
726 UErrorCode status = U_ZERO_ERROR;
727 int32_t p;
728 int32_t lastP = 0x7ffffffe;
729 int32_t tag;
730
731 logln("Test first and next");
732 bi.setText(td.fDataToBreak);
733 td.clearResults();
734
735 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
736 // Save break position. Insert it at start of vector of results, shoving
737 // already-saved results further towards the end.
738 td.fActualBreakPositions.insertElementAt(p, 0, status);
739 // bi.previous(); // TODO: Why does this fix things up????
740 // bi.next();
741 tag = bi.getRuleStatus();
742 td.fActualTags.insertElementAt(tag, 0, status);
743 if (p >= lastP) {
744 // If the iterator is not making progress, stop.
745 // No need to raise an error here, it'll be detected in the normal check of results.
746 break;
747 }
748 lastP = p;
749 }
750 td.checkResults("testLastAndPrevious", this);
751 }
752
753
754 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
755 {
756 UErrorCode status = U_ZERO_ERROR;
757 int32_t p;
758 int32_t tag;
759 int32_t lastP = -2; // A value that will never be returned as a break position.
760 // cannot be -1; that is returned for DONE.
761 int i;
762
763 logln("testFollowing():");
764 bi.setText(td.fDataToBreak);
765 td.clearResults();
766
767 // Save the starting point, since we won't get that out of following.
768 p = bi.first();
769 td.fActualBreakPositions.addElement(p, status); // Save result.
770 tag = bi.getRuleStatus();
771 td.fActualTags.addElement(tag, status);
772
773 for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
774 p = bi.following(i);
775 if (p != lastP) {
776 if (p == RuleBasedBreakIterator::DONE) {
777 break;
778 }
779 // We've reached a new break position. Save it.
780 td.fActualBreakPositions.addElement(p, status); // Save result.
781 tag = bi.getRuleStatus();
782 td.fActualTags.addElement(tag, status);
783 lastP = p;
784 }
785 }
786 // The loop normally exits by means of the break in the middle.
787 // Make sure that the index was at the correct position for the break iterator to have
788 // returned DONE.
789 if (i != td.fDataToBreak.length()) {
790 errln("testFollowing(): iterator returned DONE prematurely.");
791 }
792
793 // Full check of all results.
794 td.checkResults("testFollowing", this);
795 }
796
797
798
799 void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) {
800 UErrorCode status = U_ZERO_ERROR;
801 int32_t p;
802 int32_t tag;
803 int32_t lastP = 0x7ffffffe;
804 int i;
805
806 logln("testPreceding():");
807 bi.setText(td.fDataToBreak);
808 td.clearResults();
809
810 p = bi.last();
811 td.fActualBreakPositions.addElement(p, status);
812 tag = bi.getRuleStatus();
813 td.fActualTags.addElement(tag, status);
814
815 for (i = td.fDataToBreak.length(); i>=-1; i--) {
816 p = bi.preceding(i);
817 if (p != lastP) {
818 if (p == RuleBasedBreakIterator::DONE) {
819 break;
820 }
821 // We've reached a new break position. Save it.
822 td.fActualBreakPositions.insertElementAt(p, 0, status);
823 lastP = p;
824 tag = bi.getRuleStatus();
825 td.fActualTags.insertElementAt(tag, 0, status);
826 }
827 }
828 // The loop normally exits by means of the break in the middle.
829 // Make sure that the index was at the correct position for the break iterator to have
830 // returned DONE.
831 if (i != 0) {
832 errln("testPreceding(): iterator returned DONE prematurely.");
833 }
834
835 // Full check of all results.
836 td.checkResults("testPreceding", this);
837 }
838
839
840
841 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) {
842 UErrorCode status = U_ZERO_ERROR;
843 int i;
844 int32_t tag;
845
846 logln("testIsBoundary():");
847 bi.setText(td.fDataToBreak);
848 td.clearResults();
849
850 for (i = 0; i <= td.fDataToBreak.length(); i++) {
851 if (bi.isBoundary(i)) {
852 td.fActualBreakPositions.addElement(i, status); // Save result.
853 tag = bi.getRuleStatus();
854 td.fActualTags.addElement(tag, status);
855 }
856 }
857 td.checkResults("testIsBoundary: ", this);
858 }
859
860
861
862 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
863 {
864 iterator.setText(td.fDataToBreak);
865
866 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
867 int32_t offset = iterator.first();
868 int32_t testOffset;
869 int32_t count = 0;
870
871 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
872
873 if (*testIterator != iterator)
874 errln("clone() or operator!= failed: two clones compared unequal");
875
876 do {
877 testOffset = testIterator->first();
878 testOffset = testIterator->next(count);
879 if (offset != testOffset)
880 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
881
882 if (offset != RuleBasedBreakIterator::DONE) {
883 count++;
884 offset = iterator.next();
885
886 if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
887 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
888 if (count > 10000 || offset == -1) {
889 errln("operator== failed too many times. Stopping test.");
890 if (offset == -1) {
891 errln("Does (RuleBasedBreakIterator::DONE == -1)?");
892 }
893 return;
894 }
895 }
896 }
897 } while (offset != RuleBasedBreakIterator::DONE);
898
899 // now do it backwards...
900 offset = iterator.last();
901 count = 0;
902
903 do {
904 testOffset = testIterator->last();
905 testOffset = testIterator->next(count); // next() with a negative arg is same as previous
906 if (offset != testOffset)
907 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
908
909 if (offset != RuleBasedBreakIterator::DONE) {
910 count--;
911 offset = iterator.previous();
912 }
913 } while (offset != RuleBasedBreakIterator::DONE);
914
915 delete testIterator;
916 }
917
918
919
920 //--------------------------------------------------------------------------------------------
921 //
922 // Break Iterator Invariants Tests
923 //
924 //--------------------------------------------------------------------------------------------
925
926 void RBBITest::TestCharacterInvariants()
927 {
928 UErrorCode status = U_ZERO_ERROR;
929 BreakIterator *e = BreakIterator::createCharacterInstance(Locale::getDefault(), status);
930 if (U_FAILURE(status))
931 {
932 errln("Failed to create the BreakIterator for default locale in TestCharacterInvariants.\n");
933 return;
934 }
935 UnicodeString s = *cannedTestChars + CharsToUnicodeString("\\u1100\\u1101\\u1102\\u1160\\u1161\\u1162\\u11a8\\u11a9\\u11aa");
936 doBreakInvariantTest(*e, s);
937 s = *cannedTestChars + CharsToUnicodeString("\\u1100\\u1101\\u1102\\u1160\\u1161\\u1162\\u11a8\\u11a9\\u11aa");
938 doOtherInvariantTest(*e, s);
939 delete e;
940 }
941
942
943 void RBBITest::TestWordInvariants()
944 {
945 UErrorCode status = U_ZERO_ERROR;
946 BreakIterator *e = BreakIterator::createWordInstance(Locale::getDefault(), status);
947 if (U_FAILURE(status))
948 {
949 errln("Failed to create the BreakIterator for default locale in TestWordInvariants.\n");
950 return;
951 }
952 UnicodeString s = *cannedTestChars + CharsToUnicodeString("\',.\\u3041\\u3042\\u3043\\u309b\\u309c\\u30a1\\u30a2\\u30a3\\u4e00\\u4e01\\u4e02");
953 doBreakInvariantTest(*e, s);
954 s = *cannedTestChars + CharsToUnicodeString("\',.\\u3041\\u3042\\u3043\\u309b\\u309c\\u30a1\\u30a2\\u30a3\\u4e00\\u4e01\\u4e02");
955 doOtherInvariantTest(*e, s);
956 delete e;
957 }
958
959
960 void RBBITest::TestSentenceInvariants()
961 {
962 UErrorCode status = U_ZERO_ERROR;
963 BreakIterator *e = BreakIterator::createSentenceInstance(Locale::getDefault(), status);
964 if (U_FAILURE(status))
965 {
966 errln("Failed to create the BreakIterator for default locale in TestSentenceInvariant.\n");
967 return;
968 }
969 UnicodeString s = *cannedTestChars + CharsToUnicodeString(".,\\u3001\\u3002\\u3041\\u3042\\u3043\\ufeff");
970 doOtherInvariantTest(*e, s);
971 delete e;
972 }
973
974
975
976
977 void RBBITest::doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars)
978 {
979 UnicodeString work("aaa");
980 int32_t errCount = 0, testCharsLen = testChars.length(), breaksLen;
981
982 // a break should always occur after CR (unless followed by LF), LF, PS, and LS
983 UnicodeString breaks = CharsToUnicodeString("\r\n\\u2029\\u2028");
984 int32_t i, j;
985
986 breaksLen = breaks.length();
987 for (i = 0; i < breaksLen; i++) {
988 UChar c1 = breaks[i];
989 work.setCharAt(1, c1);
990 for (j = 0; j < testCharsLen; j++) {
991 UChar c0 = testChars[j];
992 work.setCharAt(0, c0);
993 int k;
994 for (k = 0; k < testCharsLen; k++) {
995 UChar c2 = testChars[k];
996 work.setCharAt(2, c2);
997
998 // if a cr is followed by lf, ps, ls or etx, don't do the check (that's
999 // not supposed to work)
1000 if (c1 == '\r' && (c2 == '\n' || c2 == 0x2029
1001 || c2 == 0x2028 || c2 == 0x0003))
1002 continue;
1003
1004 if (u_charType(c1) == U_CONTROL_CHAR &&
1005 (u_charType(c2) == U_NON_SPACING_MARK ||
1006 u_charType(c2) == U_ENCLOSING_MARK ||
1007 u_charType(c2) == U_COMBINING_SPACING_MARK)
1008 ) {
1009 // Combining marks don't combine with controls.
1010 // TODO: enhance test to verify that the break actually occurs,
1011 // not just ignore the case.
1012 continue;
1013 }
1014
1015
1016 tb.setText(work);
1017 UBool seen2 = FALSE;
1018 int l;
1019 for (l = tb.first(); l != BreakIterator::DONE; l = tb.next()) {
1020 if (l == 2) {
1021 seen2 = TRUE;
1022 break;
1023 }
1024 }
1025 if (!seen2) {
1026 printStringBreaks(work, NULL, 0);
1027 errln("No Break between \\U%04x and \\U%04x", c1, c2);
1028 errCount++;
1029 if (errCount >= 75)
1030 return;
1031 }
1032 }
1033 }
1034 }
1035 }
1036
1037
1038
1039 void RBBITest::doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars)
1040 {
1041 UnicodeString work("a\r\na");
1042 int32_t errCount = 0, testCharsLen = testChars.length();
1043 int32_t i, j;
1044 int8_t type;
1045
1046 // a break should never occur between CR and LF
1047 for (i = 0; i < testCharsLen; i++) {
1048 work.setCharAt(0, testChars[i]);
1049 for (j = 0; j < testCharsLen; j++) {
1050 work.setCharAt(3, testChars[j]);
1051 tb.setText(work);
1052 int32_t k;
1053 for (k = tb.first(); k != BreakIterator::DONE; k = tb.next())
1054 if (k == 2) {
1055 errln("Break between CR and LF in string U\\%04x U\\%04x U\\%04x U\\%04x",
1056 work[0], work[1], work[2], work[3]);
1057 errCount++;
1058 if (errCount >= 75)
1059 return;
1060 }
1061 }
1062 }
1063
1064 // a break should never occur before a non-spacing mark, unless the preceding
1065 // character is CR, LF, PS, or LS
1066 // Or the general category == Control.
1067 work.remove();
1068 work += "aaaa";
1069 for (i = 0; i < testCharsLen; i++) {
1070 UChar c1 = testChars[i];
1071 if (c1 == '\n' || c1 == '\r' || c1 == 0x2029 || c1 == 0x2028 || c1 == 0x0003 ||
1072 u_charType(c1) == U_CONTROL_CHAR || u_charType(c1) == U_FORMAT_CHAR) {
1073 continue;
1074 }
1075 work.setCharAt(1, c1);
1076 for (j = 0; j < testCharsLen; j++) {
1077 UChar c2 = testChars[j];
1078 type = u_charType(c2);
1079 if ((type != U_NON_SPACING_MARK) &&
1080 (type != U_ENCLOSING_MARK)) {
1081 continue;
1082 }
1083 work.setCharAt(2, c2);
1084 tb.setText(work);
1085 int k;
1086 for (k = tb.first(); k != BreakIterator::DONE; k = tb.next())
1087 if (k == 2) {
1088 //errln("Break between U+" + UCharToUnicodeString(work[1])
1089 // + " and U+" + UCharToUnicodeString(work[2]));
1090 errln("Unexpected Break between %6x and %6x", c1, c2);
1091 errCount++;
1092 if (errCount >= 75)
1093 return;
1094 }
1095 }
1096 }
1097 }
1098
1099
1100
1101
1102 //---------------------------------------------
1103 //
1104 // other tests
1105 //
1106 //---------------------------------------------
1107 void RBBITest::TestEmptyString()
1108 {
1109 UnicodeString text = "";
1110 UErrorCode status = U_ZERO_ERROR;
1111
1112 BITestData x(status);
1113 ADD_DATACHUNK(x, "", 0, status); // Break at start of data
1114 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
1115 if (U_FAILURE(status))
1116 {
1117 errln("Failed to create the BreakIterator for default locale in TestEmptyString.\n");
1118 return;
1119 }
1120 generalIteratorTest(*bi, x);
1121 delete bi;
1122 }
1123
1124 void RBBITest::TestGetAvailableLocales()
1125 {
1126 int32_t locCount = 0;
1127 const Locale* locList = BreakIterator::getAvailableLocales(locCount);
1128
1129 if (locCount == 0)
1130 errln("getAvailableLocales() returned an empty list!");
1131 // Just make sure that it's returning good memory.
1132 int32_t i;
1133 for (i = 0; i < locCount; ++i) {
1134 logln(locList[i].getName());
1135 }
1136 }
1137
1138 //Testing the BreakIterator::getDisplayName() function
1139 void RBBITest::TestGetDisplayName()
1140 {
1141 UnicodeString result;
1142
1143 BreakIterator::getDisplayName(Locale::getUS(), result);
1144 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
1145 errln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
1146 + result);
1147
1148 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
1149 if (result != "French (France)")
1150 errln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
1151 + result);
1152 }
1153 /**
1154 * Test End Behaviour
1155 * @bug 4068137
1156 */
1157 void RBBITest::TestEndBehaviour()
1158 {
1159 UErrorCode status = U_ZERO_ERROR;
1160 UnicodeString testString("boo.");
1161 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
1162 if (U_FAILURE(status))
1163 {
1164 errln("Failed to create the BreakIterator for default locale in TestEndBehaviour.\n");
1165 return;
1166 }
1167 wb->setText(testString);
1168
1169 if (wb->first() != 0)
1170 errln("Didn't get break at beginning of string.");
1171 if (wb->next() != 3)
1172 errln("Didn't get break before period in \"boo.\"");
1173 if (wb->current() != 4 && wb->next() != 4)
1174 errln("Didn't get break at end of string.");
1175 delete wb;
1176 }
1177 /*
1178 * @bug 4153072
1179 */
1180 void RBBITest::TestBug4153072() {
1181 UErrorCode status = U_ZERO_ERROR;
1182 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
1183 if (U_FAILURE(status))
1184 {
1185 errln("Failed to create the BreakIterator for default locale in TestBug4153072\n");
1186 return;
1187 }
1188 UnicodeString str("...Hello, World!...");
1189 int32_t begin = 3;
1190 int32_t end = str.length() - 3;
1191 UBool dummy;
1192
1193 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
1194 iter->adoptText(textIterator);
1195 int index;
1196 for (index = -1; index < begin + 1; ++index) {
1197 dummy = iter->isBoundary(index);
1198 if (index < begin && dummy == TRUE) {
1199 errln((UnicodeString)"Didn't handle preceeding correctly with offset = " + index +
1200 " and begin index = " + begin);
1201 }
1202 }
1203 delete iter;
1204 }
1205
1206
1207 /**
1208 * Test Japanese Line Break
1209 * @bug 4095322
1210 */
1211 void RBBITest::TestJapaneseLineBreak()
1212 {
1213 #if 0
1214 // Test needs updating some more... Dump it for now.
1215
1216
1217 // Change for Unicode TR 14: Punctuation characters with categories Pi and Pf do not count
1218 // as opening and closing punctuation for line breaking.
1219 // Also, \u30fc and \u30fe are not counted as hyphens. Remove these chars
1220 // from these tests. 6-13-2002
1221 //
1222 UErrorCode status = U_ZERO_ERROR;
1223 UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c");
1224 UnicodeString precedingChars = CharsToUnicodeString(
1225 //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
1226 "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");
1227 UnicodeString followingChars = CharsToUnicodeString(
1228 // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"
1229 ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"
1230 // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"
1231 ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"
1232 "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");
1233 BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status);
1234
1235 int32_t i;
1236 if (U_FAILURE(status))
1237 {
1238 errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");
1239 return;
1240 }
1241
1242 for (i = 0; i < precedingChars.length(); i++) {
1243 testString.setCharAt(1, precedingChars[i]);
1244 iter->setText(testString);
1245 int32_t j = iter->first();
1246 if (j != 0)
1247 errln("ja line break failure: failed to start at 0");
1248 j = iter->next();
1249 if (j != 1)
1250 errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i])
1251 + "' (" + ((int)(precedingChars[i])) + ")");
1252 j = iter->next();
1253 if (j != 3)
1254 errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i])
1255 + "' (" + ((int)(precedingChars[i])) + ")");
1256 }
1257
1258 for (i = 0; i < followingChars.length(); i++) {
1259 testString.setCharAt(1, followingChars[i]);
1260 iter->setText(testString);
1261 int j = iter->first();
1262 if (j != 0)
1263 errln("ja line break failure: failed to start at 0");
1264 j = iter->next();
1265 if (j != 2)
1266 errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i])
1267 + "' (" + ((int)(followingChars[i])) + ")");
1268 j = iter->next();
1269 if (j != 3)
1270 errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i])
1271 + "' (" + ((int)(followingChars[i])) + ")");
1272 }
1273 delete iter;
1274 #endif
1275 }
1276
1277
1278 //------------------------------------------------------------------------------
1279 //
1280 // RBBITest::Extended Run RBBI Tests from an external test data file
1281 //
1282 //------------------------------------------------------------------------------
1283
1284 struct TestParams {
1285 BreakIterator *bi;
1286 UnicodeString dataToBreak;
1287 UVector32 *expectedBreaks;
1288 UVector32 *srcLine;
1289 UVector32 *srcCol;
1290 };
1291
1292 void RBBITest::executeTest(TestParams *t) {
1293 int32_t bp;
1294 int32_t prevBP;
1295 int32_t i;
1296
1297 t->bi->setText(t->dataToBreak);
1298 //
1299 // Run the iterator forward
1300 //
1301 prevBP = -1;
1302 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
1303 if (prevBP == bp) {
1304 // Fail for lack of forward progress.
1305 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
1306 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1307 break;
1308 }
1309
1310 // Check that there were we didn't miss an expected break between the last one
1311 // and this one.
1312 for (i=prevBP+1; i<bp; i++) {
1313 if (t->expectedBreaks->elementAti(i) != 0) {
1314 int expected[] = {0, i};
1315 printStringBreaks(t->dataToBreak, expected, 2);
1316 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1317 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1318 }
1319 }
1320
1321 // Check that the break we did find was expected
1322 if (t->expectedBreaks->elementAti(bp) == 0) {
1323 int expected[] = {0, bp};
1324 printStringBreaks(t->dataToBreak, expected, 2);
1325 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1326 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1327 } else {
1328 // The break was expected.
1329 // Check that the {nnn} tag value is correct.
1330 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
1331 if (expectedTagVal == -1) {
1332 expectedTagVal = 0;
1333 }
1334 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1335 if (rs != expectedTagVal) {
1336 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
1337 " Actual, Expected status = %4d, %4d",
1338 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp), rs, expectedTagVal);
1339 }
1340 }
1341
1342
1343 prevBP = bp;
1344 }
1345
1346 // Verify that there were no missed expected breaks after the last one found
1347 for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
1348 if (t->expectedBreaks->elementAti(i) != 0) {
1349 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1350 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1351 }
1352 }
1353
1354 //
1355 // Run the iterator backwards, verify that the same breaks are found.
1356 //
1357 prevBP = t->dataToBreak.length()+2; // start with a phony value for the last break pos seen.
1358 for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
1359 if (prevBP == bp) {
1360 // Fail for lack of progress.
1361 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
1362 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1363 break;
1364 }
1365
1366 // Check that there were we didn't miss an expected break between the last one
1367 // and this one. (UVector returns zeros for index out of bounds.)
1368 for (i=prevBP-1; i>bp; i--) {
1369 if (t->expectedBreaks->elementAti(i) != 0) {
1370 errln("Reverse Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1371 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1372 }
1373 }
1374
1375 // Check that the break we did find was expected
1376 if (t->expectedBreaks->elementAti(bp) == 0) {
1377 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1378 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1379 } else {
1380 // The break was expected.
1381 // Check that the {nnn} tag value is correct.
1382 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
1383 if (expectedTagVal == -1) {
1384 expectedTagVal = 0;
1385 }
1386 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1387 if (rs != expectedTagVal) {
1388 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
1389 " Actual, Expected status = %4d, %4d",
1390 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp), rs, expectedTagVal);
1391 }
1392 }
1393
1394 prevBP = bp;
1395 }
1396
1397 // Verify that there were no missed breaks prior to the last one found
1398 for (i=prevBP-1; i>=0; i--) {
1399 if (t->expectedBreaks->elementAti(i) != 0) {
1400 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1401 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1402 }
1403 }
1404 }
1405
1406
1407 void RBBITest::TestExtended() {
1408 UErrorCode status = U_ZERO_ERROR;
1409 Locale locale = Locale::getDefault();
1410
1411 UnicodeString rules;
1412 TestParams tp;
1413 tp.bi = NULL;
1414 tp.expectedBreaks = new UVector32(status);
1415 tp.srcLine = new UVector32(status);
1416 tp.srcCol = new UVector32(status);
1417
1418
1419 //
1420 // Open and read the test data file.
1421 //
1422 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1423 char testFileName[1000];
1424 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1425 errln("Can't open test data. Path too long.");
1426 return;
1427 }
1428 strcpy(testFileName, testDataDirectory);
1429 strcat(testFileName, "rbbitst.txt");
1430
1431 int len;
1432 UChar *testFile = ReadAndConvertFile(testFileName, len, status);
1433 if (U_FAILURE(status)) {
1434 return; /* something went wrong, error already output */
1435 }
1436
1437
1438
1439 //
1440 // Put the test data into a UnicodeString
1441 //
1442 UnicodeString testString(FALSE, testFile, len);
1443
1444 enum EParseState{
1445 PARSE_COMMENT,
1446 PARSE_TAG,
1447 PARSE_DATA,
1448 PARSE_NUM
1449 }
1450 parseState = PARSE_TAG;
1451
1452 EParseState savedState = PARSE_TAG;
1453
1454 static const UChar CH_LF = 0x0a;
1455 static const UChar CH_CR = 0x0d;
1456 static const UChar CH_HASH = 0x23;
1457 /*static const UChar CH_PERIOD = 0x2e;*/
1458 static const UChar CH_LT = 0x3c;
1459 static const UChar CH_GT = 0x3e;
1460 static const UChar CH_BACKSLASH = 0x5c;
1461 static const UChar CH_BULLET = 0x2022;
1462
1463 int32_t lineNum = 1;
1464 int32_t colStart = 0;
1465 int32_t column = 0;
1466 int32_t charIdx = 0;
1467
1468 int32_t tagValue = 0; // The numeric value of a <nnn> tag.
1469
1470 for (charIdx = 0; charIdx < len; ) {
1471 UChar c = testString.charAt(charIdx);
1472 charIdx++;
1473 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1474 // treat CRLF as a unit
1475 c = CH_LF;
1476 charIdx++;
1477 }
1478 if (c == CH_LF || c == CH_CR) {
1479 lineNum++;
1480 colStart = charIdx;
1481 }
1482 column = charIdx - colStart + 1;
1483
1484 switch (parseState) {
1485 case PARSE_COMMENT:
1486 if (c == 0x0a || c == 0x0d) {
1487 parseState = savedState;
1488 }
1489 break;
1490
1491 case PARSE_TAG:
1492 {
1493 if (c == CH_HASH) {
1494 parseState = PARSE_COMMENT;
1495 savedState = PARSE_TAG;
1496 break;
1497 }
1498 if (u_isUWhiteSpace(c)) {
1499 break;
1500 }
1501 if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1502 delete tp.bi;
1503 tp.bi = BreakIterator::createWordInstance(locale, status);
1504 charIdx += 5;
1505 break;
1506 }
1507 if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1508 delete tp.bi;
1509 tp.bi = BreakIterator::createCharacterInstance(locale, status);
1510 charIdx += 5;
1511 break;
1512 }
1513 if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1514 delete tp.bi;
1515 tp.bi = BreakIterator::createLineInstance(locale, status);
1516 charIdx += 5;
1517 break;
1518 }
1519 if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1520 delete tp.bi;
1521 tp.bi = BreakIterator::createSentenceInstance(locale, status);
1522 charIdx += 5;
1523 break;
1524 }
1525 if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1526 delete tp.bi;
1527 tp.bi = BreakIterator::createTitleInstance(locale, status);
1528 charIdx += 6;
1529 break;
1530 }
1531 if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1532 parseState = PARSE_DATA;
1533 charIdx += 5;
1534 tp.dataToBreak = "";
1535 tp.expectedBreaks->removeAllElements();
1536 tp.srcCol ->removeAllElements();
1537 tp.srcLine->removeAllElements();
1538 break;
1539 }
1540
1541 errln("line %d: Tag expected in test file.", lineNum);
1542 goto end_test;
1543 parseState = PARSE_COMMENT;
1544 savedState = PARSE_DATA;
1545 }
1546 break;
1547
1548 case PARSE_DATA:
1549 if (c == CH_BULLET) {
1550 int32_t breakIdx = tp.dataToBreak.length();
1551 tp.expectedBreaks->setSize(breakIdx+1);
1552 tp.expectedBreaks->setElementAt(-1, breakIdx);
1553 tp.srcLine->setSize(breakIdx+1);
1554 tp.srcLine->setElementAt(lineNum, breakIdx);
1555 tp.srcCol ->setSize(breakIdx+1);
1556 tp.srcCol ->setElementAt(column, breakIdx);
1557 break;
1558 }
1559
1560 if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1561 // Add final entry to mappings from break location to source file position.
1562 // Need one extra because last break position returned is after the
1563 // last char in the data, not at the last char.
1564 tp.srcLine->addElement(lineNum, status);
1565 tp.srcCol ->addElement(column, status);
1566
1567 parseState = PARSE_TAG;
1568 charIdx += 7;
1569
1570 // RUN THE TEST!
1571 executeTest(&tp);
1572 break;
1573 }
1574
1575 if (testString.compare(charIdx-1, 3, "\\N{") == 0) {
1576 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1577 // Get the code point from the name and insert it into the test data.
1578 // (Damn, no API takes names in Unicode !!!
1579 // we've got to take it back to char *)
1580 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1581 int32_t nameLength = nameEndIdx - (charIdx+2);
1582 char charNameBuf[200];
1583 UChar32 theChar = -1;
1584 if (nameEndIdx != -1) {
1585 UErrorCode status = U_ZERO_ERROR;
1586 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1587 charNameBuf[sizeof(charNameBuf)-1] = 0;
1588 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1589 if (U_FAILURE(status)) {
1590 theChar = -1;
1591 }
1592 }
1593 if (theChar == -1) {
1594 errln("Error in named character in test file at line %d, col %d",
1595 lineNum, column);
1596 } else {
1597 // Named code point was recognized. Insert it
1598 // into the test data.
1599 tp.dataToBreak.append(theChar);
1600 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1601 tp.srcLine->addElement(lineNum, status);
1602 tp.srcCol ->addElement(column, status);
1603 }
1604 }
1605 if (nameEndIdx > charIdx) {
1606 charIdx = nameEndIdx+1;
1607 }
1608 break;
1609 }
1610
1611
1612
1613
1614 if (testString.compare(charIdx-1, 2, "<>") == 0) {
1615 charIdx++;
1616 int32_t breakIdx = tp.dataToBreak.length();
1617 tp.expectedBreaks->setSize(breakIdx+1);
1618 tp.expectedBreaks->setElementAt(-1, breakIdx);
1619 tp.srcLine->setSize(breakIdx+1);
1620 tp.srcLine->setElementAt(lineNum, breakIdx);
1621 tp.srcCol ->setSize(breakIdx+1);
1622 tp.srcCol ->setElementAt(column, breakIdx);
1623 break;
1624 }
1625
1626 if (c == CH_LT) {
1627 tagValue = 0;
1628 parseState = PARSE_NUM;
1629 break;
1630 }
1631
1632 if (c == CH_HASH && column==3) { // TODO: why is column off so far?
1633 parseState = PARSE_COMMENT;
1634 savedState = PARSE_DATA;
1635 break;
1636 }
1637
1638 if (c == CH_BACKSLASH) {
1639 // Check for \ at end of line, a line continuation.
1640 // Advance over (discard) the newline
1641 UChar32 cp = testString.char32At(charIdx);
1642 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1643 // We have a CR LF
1644 // Need an extra increment of the input ptr to move over both of them
1645 charIdx++;
1646 }
1647 if (cp == CH_LF || cp == CH_CR) {
1648 lineNum++;
1649 colStart = charIdx;
1650 charIdx++;
1651 break;
1652 }
1653
1654 // Let unescape handle the back slash.
1655 cp = testString.unescapeAt(charIdx);
1656 if (cp != -1) {
1657 // Escape sequence was recognized. Insert the char
1658 // into the test data.
1659 tp.dataToBreak.append(cp);
1660 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1661 tp.srcLine->addElement(lineNum, status);
1662 tp.srcCol ->addElement(column, status);
1663 }
1664 break;
1665 }
1666
1667
1668 // Not a recognized backslash escape sequence.
1669 // Take the next char as a literal.
1670 // TODO: Should this be an error?
1671 c = testString.charAt(charIdx);
1672 charIdx = testString.moveIndex32(charIdx, 1);
1673 }
1674
1675 // Normal, non-escaped data char.
1676 tp.dataToBreak.append(c);
1677
1678 // Save the mapping from offset in the data to line/column numbers in
1679 // the original input file. Will be used for better error messages only.
1680 // If there's an expected break before this char, the slot in the mapping
1681 // vector will already be set for this char; don't overwrite it.
1682 if (tp.dataToBreak.length() > tp.srcLine->size()) {
1683 tp.srcLine->addElement(lineNum, status);
1684 tp.srcCol ->addElement(column, status);
1685 }
1686 break;
1687
1688
1689 case PARSE_NUM:
1690 // We are parsing an expected numeric tag value, like <1234>,
1691 // within a chunk of data.
1692 if (u_isUWhiteSpace(c)) {
1693 break;
1694 }
1695
1696 if (c == CH_GT) {
1697 // Finished the number. Add the info to the expected break data,
1698 // and switch parse state back to doing plain data.
1699 parseState = PARSE_DATA;
1700 if (tagValue == 0) {
1701 tagValue = -1;
1702 }
1703 int32_t breakIdx = tp.dataToBreak.length();
1704 tp.expectedBreaks->setSize(breakIdx+1);
1705 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1706 tp.srcLine->setSize(breakIdx+1);
1707 tp.srcLine->setElementAt(lineNum, breakIdx);
1708 tp.srcCol ->setSize(breakIdx+1);
1709 tp.srcCol ->setElementAt(column, breakIdx);
1710 break;
1711 }
1712
1713 if (u_isdigit(c)) {
1714 tagValue = tagValue*10 + u_charDigitValue(c);
1715 break;
1716 }
1717
1718 errln("Syntax Error in test file at line %d, col %d",
1719 lineNum, column);
1720 goto end_test;
1721 parseState = PARSE_COMMENT;
1722 break;
1723 }
1724
1725
1726 if (U_FAILURE(status)) {
1727 errln("ICU Error %s while parsing test file at line %d.",
1728 u_errorName(status), lineNum);
1729 goto end_test;
1730 status = U_ZERO_ERROR;
1731 }
1732
1733 }
1734
1735 end_test:
1736 delete tp.bi;
1737 delete tp.expectedBreaks;
1738 delete tp.srcLine;
1739 delete tp.srcCol;
1740 delete [] testFile;
1741 }
1742
1743
//-------------------------------------------------------------------------------
//
//  ReadAndConvertFile   Read a text data file, convert it to UChars, and
//    return the data in one big UChar * buffer, which the caller must delete.
//
//    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
//           Move this function to some common place.
//
//--------------------------------------------------------------------------------
1753 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, UErrorCode &status) {
1754 UChar *retPtr = NULL;
1755 char *fileBuf = NULL;
1756 UConverter* conv = NULL;
1757 FILE *f = NULL;
1758
1759 ulen = 0;
1760 if (U_FAILURE(status)) {
1761 return retPtr;
1762 }
1763
1764 //
1765 // Open the file.
1766 //
1767 f = fopen(fileName, "rb");
1768 if (f == 0) {
1769 errln("Error opening test data file %s\n", fileName);
1770 status = U_FILE_ACCESS_ERROR;
1771 return NULL;
1772 }
1773 //
1774 // Read it in
1775 //
1776 int fileSize;
1777 int amt_read;
1778
1779 fseek( f, 0, SEEK_END);
1780 fileSize = ftell(f);
1781 fileBuf = new char[fileSize];
1782 fseek(f, 0, SEEK_SET);
1783 amt_read = fread(fileBuf, 1, fileSize, f);
1784 if (amt_read != fileSize || fileSize <= 0) {
1785 errln("Error reading test data file.");
1786 goto cleanUpAndReturn;
1787 }
1788
1789 //
1790 // Look for a Unicode Signature (BOM) on the data just read
1791 //
1792 int32_t signatureLength;
1793 const char * fileBufC;
1794 const char* encoding;
1795
1796 fileBufC = fileBuf;
1797 encoding = ucnv_detectUnicodeSignature(
1798 fileBuf, fileSize, &signatureLength, &status);
1799 if(encoding!=NULL ){
1800 fileBufC += signatureLength;
1801 fileSize -= signatureLength;
1802 }
1803
1804 //
1805 // Open a converter to take the rule file to UTF-16
1806 //
1807 conv = ucnv_open(encoding, &status);
1808 if (U_FAILURE(status)) {
1809 goto cleanUpAndReturn;
1810 }
1811
1812 //
1813 // Convert the rules to UChar.
1814 // Preflight first to determine required buffer size.
1815 //
1816 ulen = ucnv_toUChars(conv,
1817 NULL, // dest,
1818 0, // destCapacity,
1819 fileBufC,
1820 fileSize,
1821 &status);
1822 if (status == U_BUFFER_OVERFLOW_ERROR) {
1823 // Buffer Overflow is expected from the preflight operation.
1824 status = U_ZERO_ERROR;
1825
1826 retPtr = new UChar[ulen+1];
1827 ucnv_toUChars(conv,
1828 retPtr, // dest,
1829 ulen+1,
1830 fileBufC,
1831 fileSize,
1832 &status);
1833 }
1834
1835 cleanUpAndReturn:
1836 fclose(f);
1837 delete fileBuf;
1838 ucnv_close(conv);
1839 if (U_FAILURE(status)) {
1840 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1841 delete retPtr;
1842 retPtr = 0;
1843 ulen = 0;
1844 };
1845 return retPtr;
1846 }
1847
1848
1849 //--------------------------------------------------------------------------------------------
1850 //
1851 // Exhaustive Tests, using Unicode Data Files.
1852 //
1853 //--------------------------------------------------------------------------------------------
1854
1855 //
1856 // Token level scanner for the Unicode Line Break Test Data file.
1857 // Return the next token, as follows:
1858 // >= 0: a UChar32 character, scanned from hex in the file.
1859 // -1: a break position, a division sign in the file.
1860 // -2: end of rule. A new line in the file.
1861 // -3: end of file. No more rules.
1862 // -4: Error
1863 //
1864 // The scanner
1865 // strips comments, ('#' to end of line)
1866 // Recognizes CR, CR/LF and LF as new lines.
1867 // Skips over spaces and Xs (don't break here) in the data.
1868 //
1869 struct ScanState {
1870 int32_t fPeekChar;
1871 UBool fPeeked;
1872 int32_t fLineNum;
1873 FILE *fFile;
1874 ScanState() :fPeeked(FALSE), fLineNum(0), fFile(NULL) {};
1875 };
1876
// Character values the token scanner gives special treatment to.
// They are written as hex numbers, not character literals, to keep EBCDIC
// based machines happy; the data itself is latin-1 on all platforms.

// White space and line ends.
static const int32_t chTab    = 0x09;
static const int32_t chLF     = 0x0A;
static const int32_t chCR     = 0x0D;
static const int32_t chSpace  = 0x20;

// Start of a comment.
static const int32_t chHash   = 0x23;

// Multiplication sign and division sign.
static const int32_t chMult   = 0xD7;
static const int32_t chDivide = 0xF7;
1886
1887 static int32_t nextLBDToken(ScanState *s) {
1888 int32_t c;
1889
1890 // Read characters from the input file until we get something interesting
1891 // to return. The file is in latin-1 encoding.
1892 for (;;) {
1893 // Get the next character to look at,
1894 if (s->fPeeked) {
1895 c = s->fPeekChar;
1896 s->fPeeked = FALSE;
1897 } else {
1898 c = getc(s->fFile);
1899 }
1900
1901 // EOF. Return immediately.
1902 if (c == EOF) {
1903 return -3;
1904 }
1905
1906 // Spaces. Treat the multiply sign as a space - it indicates a no-break position
1907 // in the data, and the test program doesn't want to see them.
1908 // Continue the next char loop, looking for something significant.
1909 if (c == chSpace || c == chTab || c == chMult) {
1910 continue;
1911 }
1912
1913 // Divide sign. Indicates an expected break position.
1914 if (c == chDivide) {
1915 return -1;
1916 }
1917
1918 // New Line Handling. Keep track of line number in the file, which in turn
1919 // requires keeping track of CR/LF as a single new line.
1920 if (c == chCR) {
1921 s->fLineNum++;
1922 s->fPeekChar = getc(s->fFile);
1923 if (s->fPeekChar != chLF) {s->fPeeked = TRUE;};
1924 return -2;
1925 }
1926 if (c == chLF) {
1927 s->fLineNum++;
1928 return -2;
1929 }
1930
1931 // Comments. Consume everything up to the next new line.
1932 if (c == chHash) {
1933 do {
1934 c = getc(s->fFile);
1935 } while (!(c == EOF || c == chCR || c == chLF));
1936 s->fPeekChar = c;
1937 s->fPeeked = TRUE;
1938 return nextLBDToken(s);
1939 }
1940
1941 // Scan a hex character (UChar32) value.
1942 if (u_digit(c, 16) >= 0) {
1943 int32_t v = u_digit(c, 16);
1944 for (;;) {
1945 c = getc(s->fFile);
1946 if (u_digit(c, 16) < 0) {break;};
1947 v <<= 4;
1948 v += u_digit(c, 16);
1949 }
1950 s->fPeekChar = c;
1951 s->fPeeked = TRUE;
1952 return v;
1953 }
1954
1955 // Error. Character was something unexpected.
1956 return -4;
1957 }
1958 }
1959
1960
1961
1962 void RBBITest::TestLineBreakData() {
1963
1964 UErrorCode status = U_ZERO_ERROR;
1965 UnicodeString testString;
1966 UVector expectedBreaks(status);
1967 ScanState ss;
1968 int32_t tok;
1969
1970 BreakIterator *bi = BreakIterator::createLineInstance(Locale::getDefault(), status);
1971 if (U_FAILURE(status)) {
1972 errln("Failure creating break iterator");
1973 return;
1974 }
1975
1976 const char * lbdfName = "LBTest.txt";
1977
1978 // Open the test data file.
1979 // TODO: a proper way to handle this data.
1980 ss.fFile = fopen(lbdfName, "rb");
1981 if (ss.fFile == NULL) {
1982 logln("Unable to open Line Break Test Data file. Skipping test.");
1983 delete bi;
1984 return;
1985 }
1986
1987 // Loop once per line from the test data file.
1988 for (;;) {
1989 // Zero out test data from previous line.
1990 testString.truncate(0);
1991 expectedBreaks.removeAllElements();
1992
1993 // Read one test's (line's) worth of data from the file.
1994 // Loop once per token on the input file line.
1995 for(;;) {
1996 tok = nextLBDToken(&ss);
1997
1998 // If we scanned a character number in the file.
1999 // save it in the test data array.
2000 if (tok >= 0) {
2001 testString.append((UChar32)tok);
2002 continue;
2003 }
2004
2005 // If we scanned a break position in the data, record it.
2006 if (tok == -1) {
2007 expectedBreaks.addElement(testString.length(), status);
2008 continue;
2009 }
2010
2011 // If we scanned a new line, or EOF
2012 // drop out of scan loop and run the test case.
2013 if (tok == -2 || tok == -3) {break;};
2014
2015 // None of above. Error.
2016 errln("Failure: Unrecognized data format, test file line %d", ss.fLineNum);
2017 break;
2018 }
2019
2020 // If this line from the test data file actually contained test data,
2021 // run the test.
2022 if (testString.length() > 0) {
2023 int32_t pos; // Break Position in the test string
2024 int32_t expectedI = 0; // Index of expected break position in vector of same.
2025 int32_t expectedPos; // Expected break position (index into test string)
2026
2027 bi->setText(testString);
2028 pos = bi->first(); // TODO: break iterators always return a match at pos 0.
2029 pos = bi->next(); // Line Break TR says no match at position 0.
2030 // Resolve.
2031
2032 for (; pos != BreakIterator::DONE; ) {
2033 expectedPos = expectedBreaks.elementAti(expectedI);
2034 if (pos < expectedPos) {
2035 errln("Failure: Test file line %d, unexpected break found at position %d",
2036 ss.fLineNum, pos);
2037 break;
2038 }
2039 if (pos > expectedPos) {
2040 errln("Failure: Test file line %d, failed to find break at position %d",
2041 ss.fLineNum, expectedPos);
2042 break;
2043 }
2044 pos = bi->next();
2045 expectedI++;
2046 }
2047 }
2048
2049 // If we've hit EOF on the input file, we're done.
2050 if (tok == -3) {
2051 break;
2052 }
2053
2054 }
2055
2056 fclose(ss.fFile);
2057 delete bi;
2058
2059 }
2060
2061 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
2062
2063 //---------------------------------------------------------------------------------------
2064 //
2065 // classs RBBIMonkeyKind
2066 //
2067 // Monkey Test for Break Iteration
2068 // Abstract interface class. Concrete derived classes independently
2069 // implement the break rules for different iterator types.
2070 //
2071 // The Monkey Test itself uses doesn't know which type of break iterator it is
2072 // testing, but works purely in terms of the interface defined here.
2073 //
2074 //---------------------------------------------------------------------------------------
2075 class RBBIMonkeyKind {
2076 public:
2077 // Return a UVector of UnicodeSets, representing the character classes used
2078 // for this type of iterator.
2079 virtual UVector *charClasses() = 0;
2080
2081 // Set the test text on which subsequent calls to next() will operate
2082 virtual void setText(const UnicodeString &s) = 0;
2083
2084 // Find the next break postion, starting from the prev break position, or from zero.
2085 // Return -1 after reaching end of string.
2086 virtual int32_t next(int32_t i) = 0;
2087
2088 virtual ~RBBIMonkeyKind();
2089 UErrorCode deferredStatus;
2090
2091
2092 protected:
2093 RBBIMonkeyKind();
2094
2095 private:
2096 };
2097
2098 RBBIMonkeyKind::RBBIMonkeyKind() {
2099 deferredStatus = U_ZERO_ERROR;
2100 }
2101
2102 RBBIMonkeyKind::~RBBIMonkeyKind() {
2103 }
2104
2105
2106 //----------------------------------------------------------------------------------------
2107 //
2108 // Random Numbers. Similar to standard lib rand() and srand()
2109 // Not using library to
2110 // 1. Get same results on all platforms.
2111 // 2. Get access to current seed, to more easily reproduce failures.
2112 //
2113 //---------------------------------------------------------------------------------------
// Current state of the generator.  Assign to it to start a known sequence.
static uint32_t m_seed = 1;

// Advance the generator one step and return the next value, in the range 0..32767.
static uint32_t m_rand()
{
    const uint32_t  multiplier = 1103515245;
    const uint32_t  increment  = 12345;

    m_seed = m_seed * multiplier + increment;

    // Bits 16 to 30 of the new seed are the result.
    return (m_seed >> 16) & 0x7fff;
}
2121
2122
2123 //------------------------------------------------------------------------------------------
2124 //
2125 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
2126 // of RBBIMonkeyKind.
2127 //
2128 //------------------------------------------------------------------------------------------
2129 class RBBICharMonkey: public RBBIMonkeyKind {
2130 public:
2131 RBBICharMonkey();
2132 virtual ~RBBICharMonkey();
2133 virtual UVector *charClasses();
2134 virtual void setText(const UnicodeString &s);
2135 virtual int32_t next(int32_t i);
2136 private:
2137 UVector *fSets;
2138
2139 UnicodeSet *fCRLFSet;
2140 UnicodeSet *fControlSet;
2141 UnicodeSet *fExtendSet;
2142 UnicodeSet *fHangulSet;
2143 UnicodeSet *fAnySet;
2144
2145 RegexMatcher *fMatcher;
2146 const UnicodeString *fText;
2147 };
2148
2149
2150 RBBICharMonkey::RBBICharMonkey() {
2151 UErrorCode status = U_ZERO_ERROR;
2152
2153 fText = NULL;
2154 fMatcher = new RegexMatcher("\\X", 0, status); // Pattern to match a grampheme cluster
2155
2156 fCRLFSet = new UnicodeSet("[\\r\\n]", status);
2157 fControlSet = new UnicodeSet("[[\\p{Zl}\\p{Zp}\\p{Cc}\\p{Cf}]-[\\n]-[\\r]-\\p{Grapheme_Extend}]", status);
2158 fExtendSet = new UnicodeSet("[\\p{Grapheme_Extend}]", status);
2159 fHangulSet = new UnicodeSet(
2160 "[\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=T}"
2161 "\\p{Hangul_Syllable_Type=LV}\\p{Hangul_Syllable_Type=LVT}]", status);
2162 fAnySet = new UnicodeSet("[\\u0000-\\U0010ffff]", status);
2163
2164 fSets = new UVector(status);
2165 fSets->addElement(fCRLFSet, status);
2166 fSets->addElement(fControlSet, status);
2167 fSets->addElement(fExtendSet, status);
2168 fSets->addElement(fHangulSet, status);
2169 fSets->addElement(fAnySet, status);
2170 if (U_FAILURE(status)) {
2171 deferredStatus = status;
2172 }
2173 };
2174
2175
2176 void RBBICharMonkey::setText(const UnicodeString &s) {
2177 fText = &s;
2178 fMatcher->reset(s);
2179 }
2180
2181
2182 int32_t RBBICharMonkey::next(int32_t i) {
2183 UErrorCode status = U_ZERO_ERROR;
2184 int32_t retVal = -1;
2185
2186 if (fMatcher->find(i, status)) {
2187 retVal = fMatcher->end(status);
2188 }
2189 if (U_FAILURE(status)){
2190 retVal = -1;
2191 }
2192 return retVal;
2193 }
2194
2195
2196 UVector *RBBICharMonkey::charClasses() {
2197 return fSets;
2198 }
2199
2200
2201 RBBICharMonkey::~RBBICharMonkey() {
2202 delete fSets;
2203 delete fCRLFSet;
2204 delete fControlSet;
2205 delete fExtendSet;
2206 delete fHangulSet;
2207 delete fAnySet;
2208
2209 delete fMatcher;
2210 }
2211
2212 //------------------------------------------------------------------------------------------
2213 //
2214 // class RBBIWordMonkey Word Break specific implementation
2215 // of RBBIMonkeyKind.
2216 //
2217 //------------------------------------------------------------------------------------------
2218 class RBBIWordMonkey: public RBBIMonkeyKind {
2219 public:
2220 RBBIWordMonkey();
2221 virtual ~RBBIWordMonkey();
2222 virtual UVector *charClasses();
2223 virtual void setText(const UnicodeString &s);
2224 virtual int32_t next(int32_t i);
2225 private:
2226 UVector *fSets;
2227
2228 UnicodeSet *fKatakanaSet;
2229 UnicodeSet *fALetterSet;
2230 UnicodeSet *fMidLetterSet;
2231 UnicodeSet *fMidNumSet;
2232 UnicodeSet *fNumericSet;
2233 UnicodeSet *fFormatSet;
2234 UnicodeSet *fOtherSet;
2235 UnicodeSet *fExtendSet;
2236 UnicodeSet *fExtendNumLetSet;
2237
2238 RegexMatcher *fMatcher;
2239
2240 const UnicodeString *fText;
2241
2242 RegexMatcher *fGCFMatcher;
2243 RegexMatcher *fGCMatcher;
2244
2245 };
2246
2247
2248 RBBIWordMonkey::RBBIWordMonkey() : fGCFMatcher(0),
2249 fGCMatcher(0)
2250 {
2251 UErrorCode status = U_ZERO_ERROR;
2252
2253 fSets = new UVector(status);
2254
2255 fKatakanaSet = new UnicodeSet("[\\p{script=KATAKANA}"
2256 "\\u3031-\\u3035\\u309b\\u309c\\u30a0"
2257 "\\u30fc\\uff70\\uff9e\\uff9f]", status);
2258
2259 const UnicodeString ALetterStr( "[[\\p{Alphabetic}"
2260 "\\u00a0" // NBSP
2261 "\\u05f3]" // Hebrew punct Geresh
2262 "-[\\p{Ideographic}]"
2263 "-[\\p{Script=Lao}]"
2264 "-[\\p{Script=Hiragana}]"
2265 "-[\\p{Grapheme_Extend}]]");
2266 fALetterSet = new UnicodeSet(ALetterStr, status);
2267 fALetterSet->removeAll(*fKatakanaSet);
2268
2269 fMidLetterSet = new UnicodeSet("[\\u0027\\u00b7\\u05f4\\u2019\\u2027\\u003a]", status);
2270 fMidNumSet = new UnicodeSet("[[\\p{Line_Break=Infix_Numeric}]-[\\u003a]]", status);
2271 fNumericSet = new UnicodeSet("[\\p{Nd}\\u066b\\u066c]", status);
2272 fFormatSet = new UnicodeSet("[\\p{Format}-[\\u200c\\u200d]]", status);
2273 fExtendSet = new UnicodeSet("[\\p{Grapheme_Extend}]", status);
2274 fExtendNumLetSet = new UnicodeSet("[\\p{Pc}-[\\u30fb\\uff65]]", status);
2275 fOtherSet = new UnicodeSet();
2276 if(U_FAILURE(status)) {
2277 deferredStatus = status;
2278 return;
2279 }
2280
2281 fOtherSet->complement();
2282 fOtherSet->removeAll(*fKatakanaSet);
2283 fOtherSet->removeAll(*fALetterSet);
2284 fOtherSet->removeAll(*fMidLetterSet);
2285 fOtherSet->removeAll(*fMidNumSet);
2286 fOtherSet->removeAll(*fNumericSet);
2287 fOtherSet->removeAll(*fExtendNumLetSet);
2288
2289 fSets->addElement(fALetterSet, status);
2290 fSets->addElement(fKatakanaSet, status);
2291 fSets->addElement(fMidLetterSet, status);
2292 fSets->addElement(fMidNumSet, status);
2293 fSets->addElement(fNumericSet, status);
2294 fSets->addElement(fFormatSet, status);
2295 fSets->addElement(fOtherSet, status);
2296 fSets->addElement(fExtendNumLetSet, status);
2297
2298
2299 fGCFMatcher = new RegexMatcher("\\X(?:[\\p{Format}-\\p{Grapheme_Extend}])*", 0, status);
2300 fGCMatcher = new RegexMatcher("\\X", 0, status);
2301
2302 if (U_FAILURE(status)) {
2303 deferredStatus = status;
2304 }
2305 };
2306
2307 void RBBIWordMonkey::setText(const UnicodeString &s) {
2308 fText = &s;
2309 fGCMatcher->reset(*fText);
2310 fGCFMatcher->reset(*fText);
2311 }
2312
2313
2314 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2315 UErrorCode status = U_ZERO_ERROR;
2316
2317 int p0, p1, p2, p3; // Indices of the significant code points around the
2318 // break position being tested. The candidate break
2319 // location is before p2.
2320
2321 int breakPos = -1;
2322
2323 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2324
2325 // Prev break at end of string. return DONE.
2326 if (prevPos >= fText->length()) {
2327 return -1;
2328 }
2329 p0 = p1 = p2 = p3 = prevPos;
2330 c3 = fText->char32At(prevPos);
2331 c0 = c1 = c2 = 0;
2332
2333
2334 // Format char after prev break? Special case, see last Note for Word Boundaries TR.
2335 // break immdiately after the format char.
2336 if (fFormatSet->contains(c3)) {
2337 breakPos = fText->moveIndex32(prevPos, 1);
2338 return breakPos;
2339 }
2340
2341
2342 // Loop runs once per "significant" character position in the input text.
2343 for (;;) {
2344 // Move all of the positions forward in the input string.
2345 p0 = p1; c0 = c1;
2346 p1 = p2; c1 = c2;
2347 p2 = p3; c2 = c3;
2348 // Advancd p3 by (GC Format*) Rules 3, 4
2349 status = U_ZERO_ERROR;
2350 if (fGCFMatcher->find(p3, status) == FALSE) {
2351 p3 = fText->length();
2352 c3 = 0;
2353 } else {
2354 p3 = fGCFMatcher->end(0, status);
2355 U_ASSERT(U_SUCCESS(status));
2356 c3 = fText->char32At(p3);
2357 }
2358
2359 if (p1 == p2) {
2360 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2361 continue;
2362 }
2363 if (p2 == fText->length()) {
2364 // Reached end of string. Always a break position.
2365 break;
2366 }
2367
2368 // Rule (5). ALetter x ALetter
2369 if (fALetterSet->contains(c1) &&
2370 fALetterSet->contains(c2)) {
2371 continue;
2372 }
2373
2374 // Rule (6) ALetter x (MidLetter | MidNumLet) ALetter
2375 //
2376 // Also incorporates rule 7 by skipping pos ahead to position of the
2377 // terminating ALetter.
2378 if ( fALetterSet->contains(c1) &&
2379 fMidLetterSet->contains(c2) &&
2380 fALetterSet->contains(c3)) {
2381 continue;
2382 }
2383
2384
2385 // Rule (7) ALetter (MidLetter | MidNumLet) x ALetter
2386 if (fALetterSet->contains(c0) &&
2387 (fMidLetterSet->contains(c1) ) &&
2388 fALetterSet->contains(c2)) {
2389 continue;
2390 }
2391
2392 // Rule (8) Numeric x Numeric
2393 if (fNumericSet->contains(c1) &&
2394 fNumericSet->contains(c2)) {
2395 continue;
2396 }
2397
2398 // Rule (9) ALetter x Numeric
2399 if (fALetterSet->contains(c1) &&
2400 fNumericSet->contains(c2)) {
2401 continue;
2402 }
2403
2404 // Rule (10) Numeric x ALetter
2405 if (fNumericSet->contains(c1) &&
2406 fALetterSet->contains(c2)) {
2407 continue;
2408 }
2409
2410 // Rule (11) Numeric (MidNum | MidNumLet) x Numeric
2411 if ( fNumericSet->contains(c0) &&
2412 fMidNumSet->contains(c1) &&
2413 fNumericSet->contains(c2)) {
2414 continue;
2415 }
2416
2417 // Rule (12) Numeric x (MidNum | MidNumLet) Numeric
2418 if (fNumericSet->contains(c1) &&
2419 fMidNumSet->contains(c2) &&
2420 fNumericSet->contains(c3)) {
2421 continue;
2422 }
2423
2424 // Rule (13) Katakana x Katakana
2425 if (fKatakanaSet->contains(c1) &&
2426 fKatakanaSet->contains(c2)) {
2427 continue;
2428 }
2429
2430 // Rule 13a
2431 if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
2432 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2433 fExtendNumLetSet->contains(c2)) {
2434 continue;
2435 }
2436
2437 // Rule 13b
2438 if (fExtendNumLetSet->contains(c1) &&
2439 (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
2440 fKatakanaSet->contains(c2))) {
2441 continue;
2442 }
2443
2444
2445 // Rule 14. Break found here.
2446 break;
2447 }
2448
2449
2450 // Rule 4 fixup, back up before any trailing
2451 // format characters at the end of the word.
2452 breakPos = p2;
2453 status = U_ZERO_ERROR;
2454 if (fGCMatcher->find(p1, status)) {
2455 breakPos = fGCMatcher->end(0, status);
2456 U_ASSERT(U_SUCCESS(status));
2457 }
2458 return breakPos;
2459 }
2460
2461
2462 UVector *RBBIWordMonkey::charClasses() {
2463 return fSets;
2464 }
2465
2466
2467 RBBIWordMonkey::~RBBIWordMonkey() {
2468 delete fSets;
2469 delete fKatakanaSet;
2470 delete fALetterSet;
2471 delete fMidLetterSet;
2472 delete fMidNumSet;
2473 delete fNumericSet;
2474 delete fFormatSet;
2475 delete fExtendSet;
2476 delete fOtherSet;
2477
2478 delete fGCFMatcher;
2479 delete fGCMatcher;
2480 }
2481
2482
2483
2484
2485 //-------------------------------------------------------------------------------------------
2486 //
2487 // RBBILineMonkey
2488 //
2489 //-------------------------------------------------------------------------------------------
2490
2491 class RBBILineMonkey: public RBBIMonkeyKind {
2492 public:
2493 RBBILineMonkey();
2494 virtual ~RBBILineMonkey();
2495 virtual UVector *charClasses();
2496 virtual void setText(const UnicodeString &s);
2497 virtual int32_t next(int32_t i);
2498 virtual void rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2499 private:
2500 UVector *fSets;
2501
2502 UnicodeSet *fBK;
2503 UnicodeSet *fCR;
2504 UnicodeSet *fLF;
2505 UnicodeSet *fCM;
2506 UnicodeSet *fNL;
2507 UnicodeSet *fSG;
2508 UnicodeSet *fWJ;
2509 UnicodeSet *fZW;
2510 UnicodeSet *fGL;
2511 UnicodeSet *fCB;
2512 UnicodeSet *fSP;
2513 UnicodeSet *fB2;
2514 UnicodeSet *fBA;
2515 UnicodeSet *fBB;
2516 UnicodeSet *fHY;
2517 UnicodeSet *fCL;
2518 UnicodeSet *fEX;
2519 UnicodeSet *fIN;
2520 UnicodeSet *fNS;
2521 UnicodeSet *fOP;
2522 UnicodeSet *fQU;
2523 UnicodeSet *fIS;
2524 UnicodeSet *fNU;
2525 UnicodeSet *fPO;
2526 UnicodeSet *fPR;
2527 UnicodeSet *fSY;
2528 UnicodeSet *fAI;
2529 UnicodeSet *fAL;
2530 UnicodeSet *fID;
2531 UnicodeSet *fSA;
2532 UnicodeSet *fXX;
2533
2534 BreakIterator *fCharBI;
2535
2536 const UnicodeString *fText;
2537 int32_t *fOrigPositions;
2538
2539 RegexMatcher *fNumberMatcher;
2540 RegexMatcher *fLB10Matcher;
2541 RegexMatcher *fLB11Matcher;
2542 };
2543
2544
2545 RBBILineMonkey::RBBILineMonkey()
2546 {
2547 UErrorCode status = U_ZERO_ERROR;
2548
2549 fSets = new UVector(status);
2550
2551 fBK = new UnicodeSet("[\\p{Line_Break=BK}]", status);
2552 fCR = new UnicodeSet("[\\p{Line_break=CR}]", status);
2553 fLF = new UnicodeSet("[\\p{Line_break=LF}]", status);
2554 fCM = new UnicodeSet("[\\p{Line_break=CM}]", status);
2555 fNL = new UnicodeSet("[\\p{Line_break=NL}]", status);
2556 fWJ = new UnicodeSet("[\\p{Line_break=WJ}]", status);
2557 fZW = new UnicodeSet("[\\p{Line_break=ZW}]", status);
2558 fGL = new UnicodeSet("[\\p{Line_break=GL}]", status);
2559 fCB = new UnicodeSet("[\\p{Line_break=CB}]", status);
2560 fSP = new UnicodeSet("[\\p{Line_break=SP}]", status);
2561 fB2 = new UnicodeSet("[\\p{Line_break=B2}]", status);
2562 fBA = new UnicodeSet("[\\p{Line_break=BA}]", status);
2563 fBB = new UnicodeSet("[\\p{Line_break=BB}]", status);
2564 fHY = new UnicodeSet("[\\p{Line_break=HY}]", status);
2565 fCL = new UnicodeSet("[\\p{Line_break=CL}]", status);
2566 fEX = new UnicodeSet("[\\p{Line_break=EX}]", status);
2567 fIN = new UnicodeSet("[\\p{Line_break=IN}]", status);
2568 fNS = new UnicodeSet("[\\p{Line_break=NS}]", status);
2569 fOP = new UnicodeSet("[\\p{Line_break=OP}]", status);
2570 fQU = new UnicodeSet("[\\p{Line_break=QU}]", status);
2571 fIS = new UnicodeSet("[\\p{Line_break=IS}]", status);
2572 fNU = new UnicodeSet("[\\p{Line_break=NU}]", status);
2573 fPO = new UnicodeSet("[\\p{Line_break=PO}]", status);
2574 fPR = new UnicodeSet("[\\p{Line_break=PR}]", status);
2575 fSY = new UnicodeSet("[\\p{Line_break=SY}]", status);
2576 fAI = new UnicodeSet("[\\p{Line_break=AI}]", status);
2577 fAL = new UnicodeSet("[\\p{Line_break=AL}]", status);
2578 fID = new UnicodeSet("[\\p{Line_break=ID}]", status);
2579 fSA = new UnicodeSet("[\\p{Line_break=SA}]", status);
2580 fXX = new UnicodeSet("[\\p{Line_break=XX}]", status);
2581
2582 fAL->addAll(*fXX); // Default behavior for XX is identical to AL
2583 fAL->addAll(*fAI); // Default behavior for AI is identical to AL
2584 fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to AL
2585
2586
2587
2588 fSets->addElement(fBK, status);
2589 fSets->addElement(fCR, status);
2590 fSets->addElement(fLF, status);
2591 fSets->addElement(fCM, status);
2592 fSets->addElement(fNL, status);
2593 fSets->addElement(fWJ, status);
2594 fSets->addElement(fZW, status);
2595 fSets->addElement(fGL, status);
2596 fSets->addElement(fCB, status);
2597 fSets->addElement(fSP, status);
2598 fSets->addElement(fB2, status);
2599 fSets->addElement(fBA, status);
2600 fSets->addElement(fBB, status);
2601 fSets->addElement(fHY, status);
2602 fSets->addElement(fCL, status);
2603 fSets->addElement(fEX, status);
2604 fSets->addElement(fIN, status);
2605 fSets->addElement(fNS, status);
2606 fSets->addElement(fOP, status);
2607 fSets->addElement(fQU, status);
2608 fSets->addElement(fIS, status);
2609 fSets->addElement(fNU, status);
2610 fSets->addElement(fPO, status);
2611 fSets->addElement(fPR, status);
2612 fSets->addElement(fSY, status);
2613 fSets->addElement(fAI, status);
2614 fSets->addElement(fAL, status);
2615 fSets->addElement(fID, status);
2616 fSets->addElement(fWJ, status);
2617 fSets->addElement(fSA, status);
2618 // fSets->addElement(fXX, status);
2619
2620
2621
2622 fNumberMatcher = new RegexMatcher(
2623 "(\\p{Line_Break=PR}\\p{Line_Break=CM}*)?"
2624 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
2625 "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
2626 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
2627 "(\\p{Line_Break=CL}\\p{Line_Break=CM}*)?"
2628 "(\\p{Line_Break=PO}\\p{Line_Break=CM}*)?",
2629 0, status);
2630
2631 fLB10Matcher = new RegexMatcher(
2632 "\\p{Line_Break=QU}\\p{Line_Break=CM}*"
2633 "\\p{Line_Break=SP}*"
2634 "(\\p{Line_Break=OP})\\p{Line_Break=CM}*",
2635 0, status);
2636
2637 fLB11Matcher = new RegexMatcher(
2638 "\\p{Line_Break=CL}\\p{Line_Break=CM}*"
2639 "\\p{Line_Break=SP}*"
2640 "(\\p{Line_Break=NS})\\p{Line_Break=CM}*",
2641 0, status);
2642
2643 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2644
2645 if (U_FAILURE(status)) {
2646 deferredStatus = status;
2647 }
2648 };
2649
2650
2651 void RBBILineMonkey::setText(const UnicodeString &s) {
2652 fText = &s;
2653 fCharBI->setText(s);
2654 fNumberMatcher->reset(s);
2655 }
2656
2657 //
2658 // rule67Adjust
2659 // Line Break TR rules 6 and 7 implementation.
2660 // This deals with combining marks, Hangul Syllables, and other sequences that
2661 // that must be treated as if they were something other than what they actually are.
2662 //
2663 // This is factored out into a separate function because it must be applied twice for
2664 // each potential break, once to the chars before the position being checked, then
2665 // again to the text following the possible break.
2666 //
2667 void RBBILineMonkey::rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2668 if (pos == -1) {
2669 // Invalid initial position. Happens during the warmup iteration of the
2670 // main loop in next().
2671 return;
2672 }
2673
2674 int32_t nPos = *nextPos;
2675
2676 // LB 6 Treat Korean Syllables as a single unit
2677 int32_t hangultype = u_getIntPropertyValue(*posChar, UCHAR_HANGUL_SYLLABLE_TYPE);
2678 if (hangultype != U_HST_NOT_APPLICABLE) {
2679 nPos = fCharBI->following(pos); // Advance by grapheme cluster, which
2680 // contains the logic to locate Hangul syllables.
2681 // Grapheme Cluster Ugliness: some Grapheme_Extend chars, which are absorbed
2682 // into a grapheme cluster, are NOT Line Break CM. (Some are GL, for example.)
2683 // We don't want consume any of these. The Approach is
2684 // 1. Back nPos up, undoing the consumption of any
2685 // Grapheme_Extend chars by the char break iterator.
2686 // 2. Let the LB 7b logic below reconsume any Line Break CM chars.
2687 for (;;) {
2688 nPos = fText->moveIndex32(nPos, -1);
2689 UChar32 possiblyExtendChar = fText->char32At(nPos);
2690 if (fID->contains(possiblyExtendChar)) {
2691 // We hit into the Hangul Syllable itself, class is ID.
2692 nPos = fText->moveIndex32(nPos, +1);
2693 break;
2694 }
2695 }
2696 }
2697
2698 // LB 7b Keep combining sequences together.
2699 // advance over any CM class chars. (Line Break CM class is different from
2700 // grapheme cluster CM, so we need to do this even for HangulSyllables.
2701 // Line Break may eat additional stuff as combining, beyond what graphem cluster did.
2702 if (!(fBK->contains(*posChar) || fZW->contains(*posChar) || *posChar==0x0a
2703 || *posChar==0x0d || *posChar==0x85)) {
2704 for (;;) {
2705 *nextChar = fText->char32At(nPos);
2706 if (!fCM->contains(*nextChar)) {
2707 break;
2708 }
2709 nPos = fText->moveIndex32(nPos, 1);
2710 }
2711 }
2712
2713
2714 // LB 7a In a SP CM* sequence, treat the SP as an ID
2715 if (nPos != *nextPos && fSP->contains(*posChar)) {
2716 *posChar = 0x4e00; // 0x4e00 is a CJK Ideograph, linebreak type is ID.
2717 }
2718
2719 // LB 7b Treat X CM* as if it were x.
2720 // No explicit action required.
2721
2722 // LB 7c Treat any remaining combining mark as AL
2723 if (fCM->contains(*posChar)) {
2724 *posChar = 0x41; // thisChar = 'A';
2725 }
2726
2727 // Push the updated nextPos and nextChar back to our caller.
2728 // This only makes a difference if posChar got bigger, by slurping up a
2729 // combining sequence or Hangul syllable.
2730 *nextPos = nPos;
2731 *nextChar = fText->char32At(nPos);
2732 }
2733
2734
2735
2736 int32_t RBBILineMonkey::next(int32_t startPos) {
2737 UErrorCode status = U_ZERO_ERROR;
2738 int32_t pos; // Index of the char following a potential break position
2739 UChar32 thisChar; // Character at above position "pos"
2740
2741 int32_t prevPos; // Index of the char preceding a potential break position
2742 UChar32 prevChar; // Character at above position. Note that prevChar
2743 // and thisChar may not be adjacent because combining
2744 // characters between them will be ignored.
2745
2746 int32_t nextPos; // Index of the next character following pos.
2747 // Usually skips over combining marks.
2748 int32_t nextCPPos; // Index of the code point following "pos."
2749 // May point to a combining mark.
2750 int32_t tPos; // temp value.
2751 UChar32 c;
2752
2753 if (startPos >= fText->length()) {
2754 return -1;
2755 }
2756
2757
2758 // Initial values for loop. Loop will run the first time without finding breaks,
2759 // while the invalid values shift out and the "this" and
2760 // "prev" positions are filled in with good values.
2761 pos = prevPos = -1; // Invalid value, serves as flag for initial loop iteration.
2762 thisChar = prevChar = 0;
2763 nextPos = nextCPPos = startPos;
2764
2765
2766 // Loop runs once per position in the test text, until a break position
2767 // is found.
2768 for (;;) {
2769 prevPos = pos;
2770 prevChar = thisChar;
2771
2772 pos = nextPos;
2773 thisChar = fText->char32At(pos);
2774
2775 nextCPPos = fText->moveIndex32(pos, 1);
2776 nextPos = nextCPPos;
2777
2778 // Break at end of text.
2779 if (pos >= fText->length()) {
2780 break;
2781 }
2782
2783 // LB 3a Always break after hard line breaks,
2784 if (fBK->contains(prevChar)) {
2785 break;
2786 }
2787
2788 // LB 3b Break after CR, LF, NL, but not inside CR LF
2789 if (prevChar == 0x0d && thisChar == 0x0a) {
2790 continue;
2791 }
2792 if (prevChar == 0x0d ||
2793 prevChar == 0x0a ||
2794 prevChar == 0x85) {
2795 break;
2796 }
2797
2798 // LB 3c Don't break before hard line breaks
2799 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
2800 fBK->contains(thisChar)) {
2801 continue;
2802 }
2803
2804 // LB 10 QU SP* x OP
2805 if (prevPos >= 0) {
2806 UnicodeString subStr10(*fText, prevPos);
2807 fLB10Matcher->reset(subStr10);
2808 status = U_ZERO_ERROR;
2809 if (fLB10Matcher->lookingAt(status)) { // /QU CM* SP* (OP) CM*/;
2810 // TODO: Check status codes
2811 pos = prevPos + fLB10Matcher->start(1, status);
2812 nextPos = prevPos + fLB10Matcher->end(0, status);
2813 thisChar = fText->char32At(pos);
2814 continue;
2815 }
2816 }
2817
2818 // LB 11 CL SP* x NS
2819 if (prevPos >= 0) {
2820 UnicodeString subStr11(*fText, prevPos);
2821 fLB11Matcher->reset(subStr11);
2822 status = U_ZERO_ERROR;
2823 if (fLB11Matcher->lookingAt(status)) { // /QU CM* SP* (OP) CM*/;
2824 // TODO: Check status codes
2825 pos = prevPos + fLB11Matcher->start(1, status);
2826 nextPos = prevPos + fLB11Matcher->end(0, status);
2827 thisChar = fText->char32At(pos);
2828 continue;
2829 }
2830 }
2831
2832 // LB 4 Don't break before spaces or zero-width space.
2833 if (fSP->contains(thisChar)) {
2834 continue;
2835 }
2836
2837 if (fZW->contains(thisChar)) {
2838 continue;
2839 }
2840
2841 // LB 5 Break after zero width space
2842 if (fZW->contains(prevChar)) {
2843 break;
2844 }
2845
2846 // LB 6, LB 7
2847 /*int32_t oldpos = pos;*/
2848 rule67Adjust(prevPos, &prevChar, &pos, &thisChar);
2849
2850 nextCPPos = fText->moveIndex32(pos, 1);
2851 nextPos = nextCPPos;
2852 c = fText->char32At(nextPos);
2853 // another percularity of LB 4 - Dont break before space
2854 if (fSP->contains(thisChar)) {
2855 continue;
2856 }
2857 rule67Adjust(pos, &thisChar, &nextPos, &c);
2858
2859 // If the loop is still warming up - if we haven't shifted the initial
2860 // -1 positions out of prevPos yet - loop back to advance the
2861 // position in the input without any further looking for breaks.
2862 if (prevPos == -1) {
2863 continue;
2864 }
2865
2866 // Re-apply rules 3c, 4 because these could be affected by having
2867 // a new thisChar from doing rule 6 or 7.
2868 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 || // 3c
2869 fBK->contains(thisChar)) {
2870 continue;
2871 }
2872 if (fSP->contains(thisChar)) { // LB 4
2873 continue;
2874 }
2875 if (fZW->contains(thisChar)) { // LB 4
2876 continue;
2877 }
2878
2879
2880 // LB 8 Don't break before closings.
2881 // NU x CL and NU x IS are not matched here so that they will
2882 // fall into LB 17 and the more general number regular expression.
2883 //
2884 if (!fNU->contains(prevChar) && fCL->contains(thisChar) ||
2885 fEX->contains(thisChar) ||
2886 !fNU->contains(prevChar) && fIS->contains(thisChar) ||
2887 !fNU->contains(prevChar) && fSY->contains(thisChar)) {
2888 continue;
2889 }
2890
2891 // LB 9 Don't break after OP SP*
2892 // Scan backwards, checking for this sequence.
2893 // The OP char could include combining marks, so we acually check for
2894 // OP CM* SP*
2895 // Another Twist: The Rule 67 fixes may have changed a CP CM
2896 // sequence into a ID char, so before scanning back through spaces,
2897 // verify that prevChar is indeed a space. The prevChar variable
2898 // may differ from fText[prevPos]
2899 tPos = prevPos;
2900 if (fSP->contains(prevChar)) {
2901 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
2902 tPos=fText->moveIndex32(tPos, -1);
2903 }
2904 }
2905 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
2906 tPos=fText->moveIndex32(tPos, -1);
2907 }
2908 if (fOP->contains(fText->char32At(tPos))) {
2909 continue;
2910 }
2911
2912
2913 // LB 11a B2 x B2
2914 if (fB2->contains(thisChar) && fB2->contains(prevChar)) {
2915 continue;
2916 }
2917
2918 // LB 11b
2919 // x GL
2920 // GL x
2921 if (fGL->contains(thisChar) || fGL->contains(prevChar)) {
2922 continue;
2923 }
2924 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
2925 continue;
2926 }
2927
2928 // LB 12 break after space
2929 if (fSP->contains(prevChar)) {
2930 break;
2931 }
2932
2933 // LB 14
2934 // x QU
2935 // QU x
2936 if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
2937 continue;
2938 }
2939
2940 // LB 14a Break around a CB
2941 if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
2942 break;
2943 }
2944
2945 // LB 15
2946 if (fBA->contains(thisChar) ||
2947 fHY->contains(thisChar) ||
2948 fNS->contains(thisChar) ||
2949 fBB->contains(prevChar) ) {
2950 continue;
2951 }
2952
2953 // LB 16
2954 if (fAL->contains(prevChar) && fIN->contains(thisChar) ||
2955 fID->contains(prevChar) && fIN->contains(thisChar) ||
2956 fIN->contains(prevChar) && fIN->contains(thisChar) ||
2957 fNU->contains(prevChar) && fIN->contains(thisChar) ) {
2958 continue;
2959 }
2960
2961
2962 // LB 17 ID x PO (Note: Leading CM behaves like ID)
2963 // AL x NU
2964 // NU x AL
2965 if (fID->contains(prevChar) && fPO->contains(thisChar) ||
2966 fCM->contains(prevChar) && fPO->contains(thisChar) ||
2967 fAL->contains(prevChar) && fNU->contains(thisChar) ||
2968 fNU->contains(prevChar) && fAL->contains(thisChar) ) {
2969 continue;
2970 }
2971
2972 // LB 18 Numbers
2973 UnicodeString subStr18(*fText, prevPos);
2974 fNumberMatcher->reset(subStr18);
2975 if (fNumberMatcher->lookingAt(status)) {
2976 // TODO: Check status codes
2977 // Matched a number. But could have been just a single digit, which would
2978 // not represent a "no break here" between prevChar and thisChar
2979 int32_t numEndIdx = prevPos + fNumberMatcher->end(status); // idx of first char following num
2980 if (numEndIdx > pos) {
2981 // Number match includes at least our two chars being checked
2982 if (numEndIdx > nextPos) {
2983 // Number match includes additional chars. Update pos and nextPos
2984 // so that next loop iteration will continue at the end of the number,
2985 // checking for breaks between last char in number & whatever follows.
2986 nextPos = numEndIdx;
2987 pos = fCharBI->preceding(numEndIdx);
2988 thisChar = fText->char32At(pos);
2989 while (fCM->contains(thisChar)) {
2990 pos = fCharBI->preceding(pos);
2991 thisChar = fText->char32At(pos);
2992 }
2993 }
2994 continue;
2995 }
2996 }
2997
2998 if (fPR->contains(prevChar) && fAL->contains(thisChar)) {
2999 continue;
3000 }
3001
3002 if (fPR->contains(prevChar) && fID->contains(thisChar)) {
3003 continue;
3004 }
3005
3006 // LB 18b
3007 if (fHY->contains(prevChar) || fBB->contains(thisChar)) {
3008 break;
3009 }
3010
3011 // LB 19
3012 if (fAL->contains(prevChar) && fAL->contains(thisChar)) {
3013 continue;
3014 }
3015
3016 // LB 19b
3017 if (fIS->contains(prevChar) && fAL->contains(thisChar)) {
3018 continue;
3019 }
3020
3021 // LB 20 Break everywhere else
3022 break;
3023
3024 }
3025
3026 return pos;
3027 }
3028
3029
3030 UVector *RBBILineMonkey::charClasses() {
3031 return fSets;
3032 }
3033
3034
3035 RBBILineMonkey::~RBBILineMonkey() {
3036 delete fSets;
3037
3038 delete fBK;
3039 delete fCR;
3040 delete fLF;
3041 delete fCM;
3042 delete fNL;
3043 delete fWJ;
3044 delete fZW;
3045 delete fGL;
3046 delete fCB;
3047 delete fSP;
3048 delete fB2;
3049 delete fBA;
3050 delete fBB;
3051 delete fHY;
3052 delete fCL;
3053 delete fEX;
3054 delete fIN;
3055 delete fNS;
3056 delete fOP;
3057 delete fQU;
3058 delete fIS;
3059 delete fNU;
3060 delete fPO;
3061 delete fPR;
3062 delete fSY;
3063 delete fAI;
3064 delete fAL;
3065 delete fID;
3066 delete fSA;
3067 delete fXX;
3068
3069 delete fCharBI;
3070 delete fNumberMatcher;
3071 delete fLB10Matcher;
3072 delete fLB11Matcher;
3073 }
3074
3075
3076 //-------------------------------------------------------------------------------------------
3077 //
3078 // TestMonkey
3079 //
3080 // params
3081 // seed=nnnnn Random number starting seed.
3082 // Setting the seed allows errors to be reproduced.
3083 // loop=nnn Looping count. Controls running time.
3084 // -1: run forever.
3085 // 0 or greater: run length.
3086 //
3087 // type = char | word | line | sent | title
3088 //
3089 //-------------------------------------------------------------------------------------------
3090
3091 static int32_t getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3092 int32_t val = defaultVal;
3093 name.append(" *= *(-?\\d+)");
3094 UErrorCode status = U_ZERO_ERROR;
3095 RegexMatcher m(name, params, 0, status);
3096 if (m.find()) {
3097 // The param exists. Convert the string to an int.
3098 char valString[100];
3099 int32_t paramLength = m.end(1, status) - m.start(1, status);
3100 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3101 paramLength = (int32_t)(sizeof(valString)-2);
3102 }
3103 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3104 val = strtol(valString, NULL, 10);
3105
3106 // Delete this parameter from the params string.
3107 m.reset();
3108 params = m.replaceFirst("", status);
3109 }
3110 U_ASSERT(U_SUCCESS(status));
3111 return val;
3112 }
3113 #endif
3114
3115 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3116 BreakIterator *bi,
3117 int expected[],
3118 int expectedcount)
3119 {
3120 int count = 0;
3121 int i = 0;
3122 int forward[50];
3123 bi->setText(ustr);
3124 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3125 forward[count] = i;
3126 if (count < expectedcount && expected[count] != i) {
3127 test->errln("break forward test failed: expected %d but got %d",
3128 expected[count], i);
3129 break;
3130 }
3131 count ++;
3132 }
3133 if (count != expectedcount) {
3134 printStringBreaks(ustr, expected, expectedcount);
3135 test->errln("break test failed: missed %d match",
3136 expectedcount - count);
3137 return;
3138 }
3139 // testing boundaries
3140 for (i = 1; i < expectedcount; i ++) {
3141 int j = expected[i - 1];
3142 if (!bi->isBoundary(j)) {
3143 printStringBreaks(ustr, expected, expectedcount);
3144 test->errln("Expected boundary at position %d", j);
3145 return;
3146 }
3147 for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3148 if (bi->isBoundary(j)) {
3149 printStringBreaks(ustr, expected, expectedcount);
3150 test->errln("Not expecting boundary at position %d", j);
3151 return;
3152 }
3153 }
3154 }
3155
3156 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3157 count --;
3158 if (forward[count] != i) {
3159 test->errln("happy break test reverse failed: expected %d but got %d",
3160 forward[count], i);
3161 break;
3162 }
3163 }
3164 if (count != 0) {
3165 printStringBreaks(ustr, expected, expectedcount);
3166 test->errln("happy break test failed: missed a match");
3167 return;
3168 }
3169
3170 // testing preceding
3171 for (i = 0; i < expectedcount - 1; i ++) {
3172 int j = expected[i] + 1;
3173 for (; j <= expected[i + 1]; j ++) {
3174 if (bi->preceding(j) != expected[i]) {
3175 printStringBreaks(ustr, expected, expectedcount);
3176 test->errln("Not expecting backwards boundary at position %d", j);
3177 return;
3178 }
3179 }
3180 }
3181 }
3182
3183 void RBBITest::TestWordBreaks(void)
3184 {
3185 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3186
3187 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3188 Locale locale("en");
3189 UErrorCode status = U_ZERO_ERROR;
3190 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3191 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3192 UChar str[300];
3193 static const char *strlist[] =
3194 {
3195 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3196 "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
3197 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u179c\\u0027\\U000e0061\\u003a",
3198 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3199 "\\u90ca\\u3588\\u009c\\u0953\\u194b",
3200 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3201 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3202 "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
3203 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3204 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3205 "\\u2027\\U000e0067\\u0a47\\u00b7",
3206 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3207 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3208 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3209 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3210 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3211 "\\u0027\\u11af\\U000e0057\\u0602",
3212 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3213 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3214 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3215 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3216 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3217 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3218 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3219 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3220 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3221 "\\u58f4\\U000e0049\\u20e7\\u2027",
3222 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3223 "\\ua183\\u102d\\u0bec\\u003a",
3224 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3225 "\\u003a\\u0e57\\u0fad\\u002e",
3226 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3227 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3228 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3229 "\\u003a\\u0664\\u00b7\\u1fba",
3230 "\\u003b\\u0027\\u00b7\\u47a3",
3231 "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
3232 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3233 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3234 };
3235 int loop;
3236 if (U_FAILURE(status)) {
3237 errln("Creation of break iterator failed %s", u_errorName(status));
3238 return;
3239 }
3240 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3241 // printf("looping %d\n", loop);
3242 u_unescape(strlist[loop], str, 25);
3243 UnicodeString ustr(str);
3244 // RBBICharMonkey monkey;
3245 RBBIWordMonkey monkey;
3246
3247 int expected[50];
3248 int expectedcount = 0;
3249
3250 monkey.setText(ustr);
3251 int i;
3252 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3253 expected[expectedcount ++] = i;
3254 }
3255
3256 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3257 }
3258 delete bi;
3259 #endif
3260 }
3261
3262 void RBBITest::TestWordBoundary(void)
3263 {
3264 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3265 Locale locale("en");
3266 UErrorCode status = U_ZERO_ERROR;
3267 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3268 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3269 UChar str[50];
3270 static const char *strlist[] =
3271 {
3272 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3273 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3274 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3275 "\\u2027\\U000e0067\\u0a47\\u00b7",
3276 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3277 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3278 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3279 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3280 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3281 "\\u0027\\u11af\\U000e0057\\u0602",
3282 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3283 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3284 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3285 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3286 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3287 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3288 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3289 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3290 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3291 "\\u58f4\\U000e0049\\u20e7\\u2027",
3292 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3293 "\\ua183\\u102d\\u0bec\\u003a",
3294 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3295 "\\u003a\\u0e57\\u0fad\\u002e",
3296 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3297 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3298 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3299 "\\u003a\\u0664\\u00b7\\u1fba",
3300 "\\u003b\\u0027\\u00b7\\u47a3",
3301 };
3302 int loop;
3303 if (U_FAILURE(status)) {
3304 errln("Creation of break iterator failed %s", u_errorName(status));
3305 return;
3306 }
3307 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3308 // printf("looping %d\n", loop);
3309 u_unescape(strlist[loop], str, 20);
3310 UnicodeString ustr(str);
3311 int forward[50];
3312 int count = 0;
3313
3314 bi->setText(ustr);
3315 int prev = 0;
3316 int i;
3317 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3318 forward[count ++] = i;
3319 if (i > prev) {
3320 int j;
3321 for (j = prev + 1; j < i; j ++) {
3322 if (bi->isBoundary(j)) {
3323 printStringBreaks(ustr, forward, count);
3324 errln("happy boundary test failed: expected %d not a boundary",
3325 j);
3326 return;
3327 }
3328 }
3329 }
3330 if (!bi->isBoundary(i)) {
3331 printStringBreaks(ustr, forward, count);
3332 errln("happy boundary test failed: expected %d a boundary",
3333 i);
3334 return;
3335 }
3336 prev = i;
3337 }
3338 }
3339 delete bi;
3340 }
3341
3342 void RBBITest::TestLineBreaks(void)
3343 {
3344 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3345 Locale locale("en");
3346 UErrorCode status = U_ZERO_ERROR;
3347 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3348 UChar str[50];
3349 static const char *strlist[] =
3350 {
3351 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3352 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3353 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3354 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3355 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3356 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
3357 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3358 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3359 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3360 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3361 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3362 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3363 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3364 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3365 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3366 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3367 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3368 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3369 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3370 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3371 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3372 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
3373 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3374 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3375 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3376 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3377 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3378 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
3379 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3380 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3381 "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
3382 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3383 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3384 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3385 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3386 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3387 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3388 };
3389 int loop;
3390 if (U_FAILURE(status)) {
3391 errln("Creation of break iterator failed %s", u_errorName(status));
3392 return;
3393 }
3394 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3395 // printf("looping %d\n", loop);
3396 u_unescape(strlist[loop], str, 20);
3397 UnicodeString ustr(str);
3398 RBBILineMonkey monkey;
3399
3400 int expected[50];
3401 int expectedcount = 0;
3402
3403 monkey.setText(ustr);
3404 int i;
3405 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3406 expected[expectedcount ++] = i;
3407 }
3408
3409 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3410 }
3411 delete bi;
3412 #endif
3413 }
3414
3415 void RBBITest::TestSentBreaks(void)
3416 {
3417 Locale locale("en");
3418 UErrorCode status = U_ZERO_ERROR;
3419 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3420 UChar str[100];
3421 static const char *strlist[] =
3422 {
3423 "Now\ris\nthe\r\ntime\n\rfor\r\r",
3424 "This\n",
3425 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3426 "\"Sentence ending with a quote.\" Bye.",
3427 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
3428 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3429 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3430 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3431 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3432 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3433 };
3434 int loop;
3435 int forward[100];
3436 if (U_FAILURE(status)) {
3437 errln("Creation of break iterator failed %s", u_errorName(status));
3438 return;
3439 }
3440 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3441 u_unescape(strlist[loop], str, 100);
3442 UnicodeString ustr(str);
3443
3444 int count = 0;
3445 bi->setText(ustr);
3446 int i;
3447 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3448 forward[count ++] = i;
3449 }
3450 testBreakBoundPreceding(this, ustr, bi, forward, count);
3451 }
3452 delete bi;
3453 }
3454
3455 void RBBITest::TestMonkey(char *params) {
3456 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3457
3458 UErrorCode status = U_ZERO_ERROR;
3459 int32_t loopCount = 500;
3460 int32_t seed = 1;
3461 UnicodeString breakType = "all";
3462 Locale locale("en");
3463
3464 if (quick == FALSE) {
3465 loopCount = 10000;
3466 }
3467
3468 if (params) {
3469 UnicodeString p(params);
3470 loopCount = getIntParam("loop", p, loopCount);
3471 seed = getIntParam("seed", p, seed);
3472
3473 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3474 if (m.find()) {
3475 breakType = m.group(1, status);
3476 m.reset();
3477 p = m.replaceFirst("", status);
3478 }
3479
3480 m.reset(p);
3481 if (RegexMatcher("\\S", p, 0, status).find()) {
3482 // Each option is stripped out of the option string as it is processed.
3483 // All options have been checked. The option string should have been completely emptied..
3484 char buf[100];
3485 p.extract(buf, sizeof(buf), NULL, status);
3486 buf[sizeof(buf)-1] = 0;
3487 errln("Unrecognized or extra parameter: %s\n", buf);
3488 return;
3489 }
3490
3491 }
3492
3493 if (breakType == "char" || breakType == "all") {
3494 RBBICharMonkey m;
3495 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3496 if (U_SUCCESS(status)) {
3497 RunMonkey(bi, m, "char", seed, loopCount);
3498 }
3499 else {
3500 errln("Creation of character break iterator failed %s", u_errorName(status));
3501 }
3502 delete bi;
3503 }
3504
3505 if (breakType == "word" || breakType == "all") {
3506 logln("Word Break Monkey Test");
3507 RBBIWordMonkey m;
3508 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3509 if (U_SUCCESS(status)) {
3510 RunMonkey(bi, m, "word", seed, loopCount);
3511 }
3512 else {
3513 errln("Creation of word break iterator failed %s", u_errorName(status));
3514 }
3515 delete bi;
3516 }
3517
3518 if (breakType == "line" || breakType == "all") {
3519 logln("Line Break Monkey Test");
3520 RBBILineMonkey m;
3521 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3522 if (params == NULL) {
3523 loopCount = 50;
3524 }
3525 if (U_SUCCESS(status)) {
3526 RunMonkey(bi, m, "line", seed, loopCount);
3527 }
3528 else {
3529 errln("Creation of line break iterator failed %s", u_errorName(status));
3530 }
3531 delete bi;
3532 }
3533
3534
3535 #endif
3536 }
3537
3538 //
3539 // Run a RBBI monkey test. Common routine, for all break iterator types.
3540 // Parameters:
3541 // bi - the break iterator to use
3542 // mk - MonkeyKind, abstraction for obtaining expected results
3543 // name - Name of test (char, word, etc.) for use in error messages
3544 // seed - Seed for starting random number generator (parameter from user)
3545 // numIterations
3546 //
3547 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed, int32_t numIterations) {
3548
3549 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3550
3551 const int32_t TESTSTRINGLEN = 500;
3552 UnicodeString testText;
3553 int32_t numCharClasses;
3554 UVector *chClasses;
3555 int expected[TESTSTRINGLEN*2 + 1];
3556 int expectedCount = 0;
3557 char expectedBreaks[TESTSTRINGLEN*2 + 1];
3558 char forwardBreaks[TESTSTRINGLEN*2 + 1];
3559 char reverseBreaks[TESTSTRINGLEN*2+1];
3560 char isBoundaryBreaks[TESTSTRINGLEN*2+1];
3561 char followingBreaks[TESTSTRINGLEN*2+1];
3562 char precedingBreaks[TESTSTRINGLEN*2+1];
3563 int i;
3564 int loopCount = 0;
3565
3566 m_seed = seed;
3567
3568 numCharClasses = mk.charClasses()->size();
3569 chClasses = mk.charClasses();
3570
3571 // Check for errors that occured during the construction of the MonkeyKind object.
3572 // Can't report them where they occured because errln() is a method coming from intlTest,
3573 // and is not visible outside of RBBITest :-(
3574 if (U_FAILURE(mk.deferredStatus)) {
3575 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
3576 return;
3577 }
3578
3579 // Verify that the character classes all have at least one member.
3580 for (i=0; i<numCharClasses; i++) {
3581 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
3582 if (s == NULL || s->size() == 0) {
3583 errln("Character Class #%d is null or of zero size.", i);
3584 return;
3585 }
3586 }
3587
3588 while (loopCount < numIterations || numIterations == -1) {
3589 if (numIterations == -1 && loopCount % 10 == 0) {
3590 // If test is running in an infinite loop, display a periodic tic so
3591 // we can tell that it is making progress.
3592 fprintf(stderr, ".");
3593 }
3594 // Save current random number seed, so that we can recreate the random numbers
3595 // for this loop iteration in event of an error.
3596 seed = m_seed;
3597
3598 // Populate a test string with data.
3599 testText.truncate(0);
3600 for (i=0; i<TESTSTRINGLEN; i++) {
3601 int32_t aClassNum = m_rand() % numCharClasses;
3602 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
3603 int32_t charIdx = m_rand() % classSet->size();
3604 UChar32 c = classSet->charAt(charIdx);
3605 if (c < 0) { // TODO: deal with sets containing strings.
3606 errln("c < 0");
3607 }
3608 testText.append(c);
3609 }
3610
3611 // Calculate the expected results for this test string.
3612 mk.setText(testText);
3613 memset(expectedBreaks, 0, sizeof(expectedBreaks));
3614 expectedBreaks[0] = 1;
3615 int32_t breakPos = 0;
3616 expectedCount = 0;
3617 for (;;) {
3618 breakPos = mk.next(breakPos);
3619 if (breakPos == -1) {
3620 break;
3621 }
3622 if (breakPos > testText.length()) {
3623 errln("breakPos > testText.length()");
3624 }
3625 expectedBreaks[breakPos] = 1;
3626 expected[expectedCount ++] = breakPos;
3627 }
3628
3629 // Find the break positions using forward iteration
3630 memset(forwardBreaks, 0, sizeof(forwardBreaks));
3631 bi->setText(testText);
3632 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
3633 if (i < 0 || i > testText.length()) {
3634 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
3635 break;
3636 }
3637 forwardBreaks[i] = 1;
3638 }
3639
3640 // Find the break positions using reverse iteration
3641 memset(reverseBreaks, 0, sizeof(reverseBreaks));
3642 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
3643 if (i < 0 || i > testText.length()) {
3644 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
3645 break;
3646 }
3647 reverseBreaks[i] = 1;
3648 }
3649
3650 // Find the break positions using isBoundary() tests.
3651 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
3652 U_ASSERT(sizeof(isBoundaryBreaks) > testText.length());
3653 for (i=0; i<=testText.length(); i++) {
3654 isBoundaryBreaks[i] = bi->isBoundary(i);
3655 }
3656
3657
3658 // Find the break positions using the following() function.
3659 // printf(".");
3660 memset(followingBreaks, 0, sizeof(followingBreaks));
3661 int32_t lastBreakPos = 0;
3662 followingBreaks[0] = 1;
3663 for (i=0; i<testText.length(); i++) {
3664 breakPos = bi->following(i);
3665 if (breakPos <= i ||
3666 breakPos < lastBreakPos ||
3667 breakPos > testText.length() ||
3668 breakPos > lastBreakPos && lastBreakPos > i ) {
3669 errln("%s break monkey test: "
3670 "Out of range value returned by BreakIterator::following().\n"
3671 "Random seed=%d", name, seed);
3672 break;
3673 }
3674 followingBreaks[breakPos] = 1;
3675 lastBreakPos = breakPos;
3676 }
3677
3678 // Find the break positions using the preceding() function.
3679 memset(precedingBreaks, 0, sizeof(followingBreaks));
3680 lastBreakPos = testText.length();
3681 precedingBreaks[testText.length()] = 1;
3682 for (i=testText.length(); i>0; i--) {
3683 breakPos = bi->preceding(i);
3684 if (breakPos >= i ||
3685 breakPos > lastBreakPos ||
3686 breakPos < 0 ||
3687 breakPos < lastBreakPos && lastBreakPos < i ) {
3688 errln("%s break monkey test: "
3689 "Out of range value returned by BreakIterator::preceding().\n"
3690 "index=%d; prev returned %d; lastBreak=%d" ,
3691 name, i, breakPos, lastBreakPos);
3692 precedingBreaks[i] = 2; // Forces an error.
3693 } else {
3694 precedingBreaks[breakPos] = 1;
3695 lastBreakPos = breakPos;
3696 }
3697 }
3698
3699 // Compare the expected and actual results.
3700 for (i=0; i<=testText.length(); i++) {
3701 const char *errorType = NULL;
3702 if (forwardBreaks[i] != expectedBreaks[i]) {
3703 errorType = "next()";
3704 } else if (reverseBreaks[i] != forwardBreaks[i]) {
3705 errorType = "previous()";
3706 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
3707 errorType = "isBoundary()";
3708 } else if (followingBreaks[i] != expectedBreaks[i]) {
3709 errorType = "following()";
3710 } else if (precedingBreaks[i] != expectedBreaks[i]) {
3711 errorType = "preceding()";
3712 }
3713
3714
3715 if (errorType != NULL) {
3716 // Format a range of the test text that includes the failure as
3717 // a data item that can be included in the rbbi test data file.
3718
3719 // Start of the range is the last point where expected and actual results
3720 // both agreed that there was a break position.
3721 int startContext = i;
3722 int32_t count = 0;
3723 for (;;) {
3724 if (startContext==0) { break; }
3725 startContext --;
3726 if (expectedBreaks[startContext] != 0) {
3727 if (count == 2) break;
3728 count ++;
3729 }
3730 }
3731
3732 // End of range is two expected breaks past the start position.
3733 int endContext = i + 1;
3734 int ci;
3735 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
3736 for (;;) {
3737 if (endContext >= testText.length()) {break;}
3738 if (expectedBreaks[endContext-1] != 0) {
3739 if (count == 0) break;
3740 count --;
3741 }
3742 endContext ++;
3743 }
3744 }
3745
3746 // Format looks like "<data><>\uabcd\uabcd<>\U0001abcd...</data>"
3747 UnicodeString errorText = "<data>";
3748 /***if (strcmp(errorType, "next()") == 0) {
3749 startContext = 0;
3750 endContext = testText.length();
3751
3752 printStringBreaks(testText, expected, expectedCount);
3753 }***/
3754
3755 for (ci=startContext; ci<endContext;) {
3756 UnicodeString hexChars("0123456789abcdef");
3757 UChar32 c;
3758 int bn;
3759 c = testText.char32At(ci);
3760 if (ci == i) {
3761 // This is the location of the error.
3762 errorText.append("<?>");
3763 } else if (expectedBreaks[ci] != 0) {
3764 // This a non-error expected break position.
3765 errorText.append("<>");
3766 }
3767 if (c < 0x10000) {
3768 errorText.append("\\u");
3769 for (bn=12; bn>=0; bn-=4) {
3770 errorText.append(hexChars.charAt((c>>bn)&0xf));
3771 }
3772 } else {
3773 errorText.append("\\U");
3774 for (bn=28; bn>=0; bn-=4) {
3775 errorText.append(hexChars.charAt((c>>bn)&0xf));
3776 }
3777 }
3778 ci = testText.moveIndex32(ci, 1);
3779 }
3780 errorText.append("<>");
3781 errorText.append("</data>\n");
3782
3783 // Output the error
3784 char charErrorTxt[500];
3785 UErrorCode status = U_ZERO_ERROR;
3786 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
3787 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
3788 errln("%s break monkey test error. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
3789 name, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
3790 errorType, seed, i, charErrorTxt);
3791 break;
3792 }
3793 }
3794
3795 loopCount++;
3796 }
3797 #endif
3798 }
3799
3800
3801 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */