]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/intltest/rbbitst.cpp
ICU-6.2.13.tar.gz
[apple/icu.git] / icuSources / test / intltest / rbbitst.cpp
CommitLineData
374ca955
A
1/********************************************************************\r
2 * COPYRIGHT:\r
3 * Copyright (c) 1999-2004, International Business Machines Corporation and\r
4 * others. All Rights Reserved.\r
5 ********************************************************************/\r
6/************************************************************************\r
7* Date Name Description\r
8* 12/15/99 Madhu Creation.\r
9* 01/12/2000 Madhu Updated for changed API and added new tests\r
10************************************************************************/\r
11\r
12#include "unicode/utypes.h"\r
13\r
14#if !UCONFIG_NO_BREAK_ITERATION\r
15\r
16#include "unicode/utypes.h"\r
17#include "unicode/brkiter.h"\r
18#include "unicode/rbbi.h"\r
19#include "unicode/uchar.h"\r
20#include "unicode/utf16.h"\r
21#include "unicode/ucnv.h"\r
22#include "unicode/schriter.h"\r
23#include "unicode/uniset.h"\r
24#include "unicode/regex.h" // TODO: make conditional on regexp being built.\r
25#include "unicode/ustring.h"\r
26\r
27#include "intltest.h"\r
28#include "rbbitst.h"\r
29#include <string.h>\r
30#include "uvector.h"\r
31#include "uvectr32.h"\r
32#include <string.h>\r
33#include <stdio.h>\r
34#include <stdlib.h>\r
35\r
36\r
37\r
38//---------------------------------------------------------------------------\r
39//\r
40// class BITestData Holds a set of Break iterator test data and results\r
41// Includes\r
42// - the string data to be broken\r
43// - a vector of the expected break positions.\r
44// - a vector of source line numbers for the data,\r
//                           (to help see where errors occurred.)
46// - The expected break tag values.\r
47// - Vectors of actual break positions and tag values.\r
48// - Functions for comparing actual with expected and\r
49// reporting errors.\r
50//\r
51//----------------------------------------------------------------------------\r
52class BITestData {\r
53public:\r
54 UnicodeString fDataToBreak;\r
55 UVector fExpectedBreakPositions;\r
56 UVector fExpectedTags;\r
57 UVector fLineNum;\r
58 UVector fActualBreakPositions; // Test Results.\r
59 UVector fActualTags;\r
60\r
61 BITestData(UErrorCode &status);\r
62 void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);\r
63 void checkResults(const char *heading, RBBITest *test);\r
64 void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);\r
65 void clearResults();\r
66};\r
67\r
68//\r
69// Constructor.\r
70//\r
71BITestData::BITestData(UErrorCode &status)\r
72: fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status),\r
73 fActualTags(status)\r
74{\r
75};\r
76\r
77//\r
78// addDataChunk. Add a section (non-breaking) piece if data to the test data.\r
79// The macro form collects the line number, which is helpful\r
80// when tracking down failures.\r
81//\r
82// A null data item is inserted at the start of each test's data\r
83// to put the starting zero into the data list. The position saved for\r
84// each non-null item is its ending position.\r
85//\r
86#define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status);\r
87void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {\r
88 if (U_FAILURE(status)) {return;}\r
89 if (data != NULL) {\r
90 fDataToBreak.append(CharsToUnicodeString(data));\r
91 }\r
92 fExpectedBreakPositions.addElement(fDataToBreak.length(), status);\r
93 fExpectedTags.addElement(tag, status);\r
94 fLineNum.addElement(lineNum, status);\r
95};\r
96\r
97\r
98//\r
99// checkResults. Compare the actual and expected break positions, report any differences.\r
100//\r
101void BITestData::checkResults(const char *heading, RBBITest *test) {\r
102 int32_t expectedIndex = 0;\r
103 int32_t actualIndex = 0;\r
104\r
105 for (;;) {\r
106 // If we've run through both the expected and actual results vectors, we're done.\r
107 // break out of the loop.\r
108 if (expectedIndex >= fExpectedBreakPositions.size() &&\r
109 actualIndex >= fActualBreakPositions.size()) {\r
110 break;\r
111 }\r
112\r
113\r
114 if (expectedIndex >= fExpectedBreakPositions.size()) {\r
115 err(heading, test, expectedIndex-1, actualIndex);\r
116 actualIndex++;\r
117 continue;\r
118 }\r
119\r
120 if (actualIndex >= fActualBreakPositions.size()) {\r
121 err(heading, test, expectedIndex, actualIndex-1);\r
122 expectedIndex++;\r
123 continue;\r
124 }\r
125\r
126 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {\r
127 err(heading, test, expectedIndex, actualIndex);\r
128 // Try to resync the positions of the indices, to avoid a rash of spurious erros.\r
129 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {\r
130 actualIndex++;\r
131 } else {\r
132 expectedIndex++;\r
133 }\r
134 continue;\r
135 }\r
136\r
137 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {\r
138 test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",\r
139 heading, fLineNum.elementAt(expectedIndex),\r
140 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));\r
141 }\r
142\r
143 actualIndex++;\r
144 expectedIndex++;\r
145 }\r
146}\r
147\r
148//\r
149// err - An error was found. Report it, along with information about where the\r
150// incorrectly broken test data appeared in the source file.\r
151//\r
152void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)\r
153{\r
154 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx);\r
155 int32_t actual = fActualBreakPositions.elementAti(actualIdx);\r
156 int32_t o = 0;\r
157 int32_t line = fLineNum.elementAti(expectedIdx);\r
158 if (expectedIdx > 0) {\r
159 // The line numbers are off by one because a premature break occurs somewhere\r
160 // within the previous item, rather than at the start of the current (expected) item.\r
161 // We want to report the offset of the unexpected break from the start of\r
162 // this previous item.\r
163 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);\r
164 }\r
165 if (actual < expected) {\r
166 test->errln("%s unexpected break at offset %d in test item from line %d", heading, o, line);\r
167 } else {\r
168 test->errln("%s Failed to find break at end of item from line %d", heading, line);\r
169 }\r
170}\r
171\r
172\r
173void BITestData::clearResults() {\r
174 fActualBreakPositions.removeAllElements();\r
175 fActualTags.removeAllElements();\r
176}\r
177\r
178\r
179//-----------------------------------------------------------------------------------\r
180//\r
//    Canned Test Characters
182//\r
183//-----------------------------------------------------------------------------------\r
184\r
185static const UChar cannedTestArray[] = {\r
186 0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,\r
187 0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,\r
188 0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,\r
189 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,\r
190 0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,\r
191 0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,\r
192 0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,\r
193 0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000\r
194};\r
195\r
196static UnicodeString* cannedTestChars = 0;\r
197\r
198#define halfNA "\\u0928\\u094d\\u200d"\r
199#define halfSA "\\u0938\\u094d\\u200d"\r
200#define halfCHA "\\u091a\\u094d\\u200d"\r
201#define halfKA "\\u0915\\u094d\\u200d"\r
202#define deadTA "\\u0924\\u094d"\r
203\r
204//--------------------------------------------------------------------------------------\r
205//\r
206// RBBITest constructor and destructor\r
207//\r
208//--------------------------------------------------------------------------------------\r
209\r
210RBBITest::RBBITest() {\r
211 UnicodeString temp(cannedTestArray);\r
212 cannedTestChars = new UnicodeString();\r
213 *cannedTestChars += (UChar)0x0000;\r
214 *cannedTestChars += temp;\r
215}\r
216\r
217\r
218RBBITest::~RBBITest() {\r
219 delete cannedTestChars;\r
220}\r
221\r
222\r
// Tag values.
// NOTE(review): presumably these mirror the rule status values returned by the
// word break iterator for numbers, letters, hiragana/katakana and ideographs -
// confirm against the break rules.
static const int T_NUMBER = 100;
static const int T_LETTER = 200;
static const int T_H_OR_K = 300;
static const int T_IDEO   = 400;


//--------------------------------------------------------------------
// Testing the BreakIterator for Devanagari script
//--------------------------------------------------------------------

#define deadRA   "\\u0930\\u094d"         /* dead form RA  = Devanagari RA  + virama */
#define deadPHA  "\\u092b\\u094d"         /* dead form PHA = Devanagari PHA + virama */
#define deadTTHA "\\u0920\\u094d"
#define deadPA   "\\u092a\\u094d"
#define deadSA   "\\u0938\\u094d"
#define visarga  "\\u0903"                /* Devanagari visarga looks like an English colon */
243\r
244\r
245\r
246\r
247\r
248\r
249//-----------------------------------------------------------------------------------\r
250//\r
251// Test for status {tag} return value from break rules.\r
252// TODO: a more thorough test.\r
253//\r
254//-----------------------------------------------------------------------------------\r
255void RBBITest::TestStatusReturn() {\r
256 UnicodeString rulesString1 = "$Letters = [:L:];\n"\r
257 "$Numbers = [:N:];\n"\r
258 "$Letters+{1};\n"\r
259 "$Numbers+{2};\n"\r
260 "Help\\ {4}/me\\!;\n"\r
261 "[^$Letters $Numbers];\n"\r
262 "!.*;\n";\r
263 UnicodeString testString1 = "abc123..abc Help me Help me!";\r
264 // 01234567890123456789012345678\r
265 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};\r
266 int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};\r
267\r
268 UErrorCode status=U_ZERO_ERROR;\r
269 UParseError parseError;\r
270\r
271 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);\r
272 if(U_FAILURE(status)) {\r
273 errln("FAIL : in construction");\r
274 } else {\r
275 int32_t pos;\r
276 int32_t i = 0;\r
277 bi->setText(testString1);\r
278 for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {\r
279 if (pos != bounds1[i]) {\r
280 errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos);\r
281 break;\r
282 }\r
283\r
284 int tag = bi->getRuleStatus();\r
285 if (tag != brkStatus[i]) {\r
286 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);\r
287 break;\r
288 }\r
289 i++;\r
290 }\r
291 }\r
292 delete bi;\r
293}\r
294\r
295\r
296static void printStringBreaks(UnicodeString ustr, int expected[],\r
297 int expectedcount)\r
298{\r
299 UErrorCode status = U_ZERO_ERROR;\r
300 char name[100];\r
301 printf("code alpha extend alphanum type line name\n");\r
302 int j;\r
303 for (j = 0; j < ustr.length(); j ++) {\r
304 if (expectedcount > 0) {\r
305 int k;\r
306 for (k = 0; k < expectedcount; k ++) {\r
307 if (j == expected[k]) {\r
308 printf("------------------------------------------------ %d\n",\r
309 j);\r
310 }\r
311 }\r
312 }\r
313 UChar32 c = ustr.char32At(j);\r
314 if (c > 0xffff) {\r
315 j ++;\r
316 }\r
317 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);\r
318 printf("%7x %5d %6d %8d %4s %4s %s\n", (int)c, \r
319 u_isUAlphabetic(c), \r
320 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),\r
321 u_isalnum(c), \r
322 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY, \r
323 u_charType(c), \r
324 U_SHORT_PROPERTY_NAME), \r
325 u_getPropertyValueName(UCHAR_LINE_BREAK, \r
326 u_getIntPropertyValue(c, \r
327 UCHAR_LINE_BREAK), \r
328 U_SHORT_PROPERTY_NAME),\r
329 name);\r
330 }\r
331}\r
332\r
333void RBBITest::TestThaiLineBreak() {\r
334 UErrorCode status = U_ZERO_ERROR;\r
335 BITestData thaiLineSelection(status);\r
336\r
337 // \u0e2f-- the Thai paiyannoi character-- isn't a letter. It's a symbol that\r
338 // represents elided letters at the end of a long word. It should be bound to\r
339 // the end of the word and not treated as an independent punctuation mark.\r
340\r
341\r
342 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data\r
343 ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status);\r
344 ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status);\r
345 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status);\r
346 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status);\r
347// ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status);\r
348// ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);\r
349 ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status);\r
350 // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us\r
351 ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status);\r
352 ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status);\r
353 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status);\r
354 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status);\r
355 ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status);\r
356 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status);\r
357\r
358 // the one time where the paiyannoi occurs somewhere other than at the end\r
359 // of a word is in the Thai abbrevation for "etc.", which both begins and\r
360 // ends with a paiyannoi\r
361 ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status);\r
362 ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);\r
363 ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status);\r
364\r
365 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(\r
366 Locale("th"), status);\r
367 if (U_FAILURE(status))\r
368 {\r
369 errln("Failed to create the BreakIterator for Thai locale in TestThaiLineBreak.\n");\r
370 return;\r
371 }\r
372\r
373 generalIteratorTest(*e, thaiLineSelection);\r
374 delete e;\r
375}\r
376\r
377\r
378\r
379void RBBITest::TestMixedThaiLineBreak()\r
380{\r
381 UErrorCode status = U_ZERO_ERROR;\r
382 BITestData thaiLineSelection(status);\r
383\r
384 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data\r
385\r
386 // Arabic numerals should always be separated from surrounding Thai text\r
387/*\r
388 ADD_DATACHUNK(thaiLineSelection, "\\u0e04\\u0e48\\u0e32", 0, status);\r
389 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e07\\u0e34\\u0e19", 0, status);\r
390 ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17", 0, status);\r
391 ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e15\\u0e30", 0, status);\r
392 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e31\\u0e1a", 0, status);\r
393 thaiLineSelection->addElement("39");\r
394 ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17 ", 0, status);\r
395\r
396 // words in non-Thai scripts should always be separated from surrounding Thai text\r
397 ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e14", 0, status);\r
398 ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e2d\\u0e1a", 0, status);\r
399 thaiLineSelection->addElement("Java");\r
400 ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e19", 0, status);\r
401 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e04\\u0e23\\u0e37\\u0e48\\u0e2d\\u0e07", 0, status);\r
402 ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e2d\\u0e1a\\u0e35\\u0e40\\u0e2d\\u0e47\\u0e21 ", 0, status);\r
403\r
404 // Thai numerals should always be separated from the text surrounding them\r
405 ADD_DATACHUNK(thaiLineSelection, "\\u0e04\\u0e48\\u0e32", 0, status);\r
406 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e07\\u0e34\\u0e19", 0, status);\r
407 ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17", 0, status);\r
408 ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e15\\u0e30", 0, status);\r
409 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e31\\u0e1a", 0, status);\r
410 ADD_DATACHUNK(thaiLineSelection, "\\u0e53\\u0e59", 0, status);\r
411 ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17 ", 0, status);\r
412\r
413 // Thai text should interact correctly with punctuation and symbols\r
414 ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e2d\\u0e1a\\u0e35\\u0e40\\u0e2d\\u0e47\\u0e21", 0, status);\r
415// ADD_DATACHUNK(thaiLineSelection, "(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28", 0, status);\r
416// ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e17\\u0e22)", 0, status);\r
417ADD_DATACHUNK(thaiLineSelection, "(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28\\u0e44\\u0e17\\u0e22)", 0, status);\r
418// I believe the commented-out reading above to be the correct one, but this is what passes with our current dictionary\r
419 ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e33\\u0e01\\u0e31\\u0e14", 0, status);\r
420 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e1b\\u0e34\\u0e14", 0, status);\r
421 ADD_DATACHUNK(thaiLineSelection, "\\u0e15\\u0e31\\u0e27\"", 0, status);\r
422*/\r
423\r
424 // The Unicode Linebreak TR says do not break before or after quotes.\r
425 // So this test is changed ot not break around the quote.\r
426 // TODO: should Thai break around the around the quotes, like the original behavior here?\r
427// ADD_DATACHUNK(thaiLineSelection, "\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\"", 0, status);\r
428// ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e38\\u0e48\\u0e19", 0, status);\r
429 ADD_DATACHUNK(thaiLineSelection, "\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\""\r
430 "\\u0e23\\u0e38\\u0e48\\u0e19", 0, status);\r
431\r
432 ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);\r
433 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e14\\u0e37\\u0e2d\\u0e19\\u0e21\\u0e34.", 0, status);\r
434 ADD_DATACHUNK(thaiLineSelection, "\\u0e22.", 0, status);\r
435 ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e35\\u0e49", 0, status);\r
436 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e32\\u0e04\\u0e32", 0, status);\r
437 ADD_DATACHUNK(thaiLineSelection, "$200", 0, status);\r
438 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e48\\u0e32", 0, status);\r
439 ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19 ", 0, status);\r
440 ADD_DATACHUNK(thaiLineSelection, "(\"\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\").", 0, status);\r
441\r
442 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status);\r
443 if (U_FAILURE(status))\r
444 {\r
445 errln("Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak.\n");\r
446 return;\r
447 }\r
448\r
449\r
450 generalIteratorTest(*e, thaiLineSelection);\r
451 delete e;\r
452}\r
453\r
454\r
455void RBBITest::TestMaiyamok()\r
456{\r
457 UErrorCode status = U_ZERO_ERROR;\r
458 BITestData thaiLineSelection(status);\r
459 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data\r
460 // the Thai maiyamok character is a shorthand symbol that means "repeat the previous\r
461 // word". Instead of appearing as a word unto itself, however, it's kept together\r
462 // with the word before it\r
463 ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status);\r
464 ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status);\r
465 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status);\r
466 ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07\\u0e40\\u0e17\\u0e1e", 0, status);\r
467 ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status);\r
468 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35\\u0e22\\u0e07", 0, status);\r
469 ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);\r
470\r
471 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(\r
472 Locale("th"), status);\r
473\r
474 if (U_FAILURE(status))\r
475 {\r
476 errln("Failed to create the BreakIterator for Thai locale in TestMaiyamok.\n");\r
477 return;\r
478 }\r
479 generalIteratorTest(*e, thaiLineSelection);\r
480 delete e;\r
481}\r
482\r
483void RBBITest::TestThaiWordBreak() {\r
484 UErrorCode status = U_ZERO_ERROR;\r
485 BITestData thaiWordSelection(status);\r
486\r
487 ADD_DATACHUNK(thaiWordSelection, NULL, 0, status); // Break at start of data\r
488 ADD_DATACHUNK(thaiWordSelection, "\\u0E1A\\u0E17", 0, status); //2\r
489 ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E35\\u0E48", 0, status); //5\r
490 ADD_DATACHUNK(thaiWordSelection, "\\u0E51", 0, status); //6\r
491 ADD_DATACHUNK(thaiWordSelection, "\\u0E1E\\u0E32\\u0E22\\u0E38", 0, status); //10\r
492 ADD_DATACHUNK(thaiWordSelection, "\\u0E44\\u0E0B\\u0E42\\u0E04\\u0E25\\u0E19", 0, status); //16\r
493 ADD_DATACHUNK(thaiWordSelection, "\\u000D\\u000A", 0, status); //18\r
494\r
495 // This is the correct result\r
496 //ADD_DATACHUNK(thaiWordSelection, "\\u0E42\\u0E14\\u0E42\\u0E23\\u0E18\\u0E35", 0, status); //24\r
497 //ADD_DATACHUNK(thaiWordSelection, "\\u0E2D\\u0E32\\u0E28\\u0E31\\u0E22", 0, status); //29\r
498\r
499 // and this is what the dictionary does...\r
500 ADD_DATACHUNK(thaiWordSelection, "\\u0E42\\u0E14", 0, status); // 20\r
501 ADD_DATACHUNK(thaiWordSelection, "\\u0E42\\u0E23\\u0E18\\u0E35\\u0E2D\\u0E32\\u0E28\\u0E31\\u0E22", 0, status); //29\r
502\r
503 ADD_DATACHUNK(thaiWordSelection, "\\u0E2D\\u0E22\\u0E39\\u0E48", 0, status); //33\r
504\r
505 // This is the correct result\r
506 //ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E48\\u0E32\\u0E21", 0, status); //37\r
507 //ADD_DATACHUNK(thaiWordSelection, "\\u0E01\\u0E25\\u0E32\\u0E07", 0, status); //41\r
508\r
509 // and this is what the dictionary does\r
510 ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E48\\u0E32\\u0E21\\u0E01\\u0E25\\u0E32\\u0E07", 0, status); //41\r
511\r
512 ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E38\\u0E48\\u0E07", 0, status); //45\r
513 ADD_DATACHUNK(thaiWordSelection, "\\u0E43\\u0E2B\\u0E0D\\u0E48", 0, status); //49\r
514 ADD_DATACHUNK(thaiWordSelection, "\\u0E43\\u0E19", 0, status); //51\r
515\r
516 // This is the correct result\r
517 //ADD_DATACHUNK(thaiWordSelection, "\\u0E41\\u0E04\\u0E19\\u0E0B\\u0E31\\u0E2A", 0, status); //57\r
518 //ADD_DATACHUNK(thaiWordSelection, "\\u0E01\\u0E31\\u0E1A", 0, status); //60\r
519\r
520 // and this is what the dictionary does\r
521 ADD_DATACHUNK(thaiWordSelection, "\\u0E41\\u0E04\\u0E19", 0, status); // 54\r
522 ADD_DATACHUNK(thaiWordSelection, "\\u0E0B\\u0E31\\u0E2A\\u0E01\\u0E31\\u0E1A", 0, status); //60\r
523\r
524 ADD_DATACHUNK(thaiWordSelection, "\\u0E25\\u0E38\\u0E07", 0, status); //63\r
525\r
526 // This is the correct result\r
527 //ADD_DATACHUNK(thaiWordSelection, "\\u0E40\\u0E2E\\u0E19\\u0E23\\u0E35", 0, status); //68\r
528 //ADD_DATACHUNK(thaiWordSelection, "\\u0E0A\\u0E32\\u0E27", 0, status); //71\r
529 //ADD_DATACHUNK(thaiWordSelection, "\\u0E44\\u0E23\\u0E48", 0, status); //74\r
530 //ADD_DATACHUNK(thaiWordSelection, "\\u0E41\\u0E25\\u0E30", 0, status); //77\r
531\r
532 // and this is what the dictionary does\r
533 ADD_DATACHUNK(thaiWordSelection, "\\u0E40\\u0E2E", 0, status); // 65\r
534 ADD_DATACHUNK(thaiWordSelection, "\\u0E19\\u0E23\\u0E35\\u0E0A\\u0E32\\u0E27\\u0E44\\u0E23\\u0E48\\u0E41\\u0E25\\u0E30", 0, status); //77\r
535\r
536 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(\r
537 Locale("th"), status);\r
538 if (U_FAILURE(status))\r
539 {\r
540 errln("Failed to create the BreakIterator for Thai locale in TestThaiWordBreak.\n");\r
541 return;\r
542 }\r
543\r
544 generalIteratorTest(*e, thaiWordSelection);\r
545 delete e;\r
546}\r
547\r
548\r
549void RBBITest::TestBug3818() {\r
550 UErrorCode status = U_ZERO_ERROR;\r
551\r
552 // Four Thai words...\r
553 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, \r
554 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 }; \r
555 UnicodeString thaiStr(thaiWordData);\r
556\r
557 RuleBasedBreakIterator* bi = \r
558 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);\r
559 if (U_FAILURE(status) || bi == NULL) {\r
560 errln("Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));\r
561 return;\r
562 }\r
563 bi->setText(thaiStr);\r
564\r
565 int32_t startOfSecondWord = bi->following(1);\r
566 if (startOfSecondWord != 4) {\r
567 errln("Fail at file %s, line %d expected start of word at 4, got %d",\r
568 __FILE__, __LINE__, startOfSecondWord);\r
569 }\r
570 startOfSecondWord = bi->following(0);\r
571 if (startOfSecondWord != 4) {\r
572 errln("Fail at file %s, line %d expected start of word at 4, got %d",\r
573 __FILE__, __LINE__, startOfSecondWord);\r
574 }\r
575 delete bi;\r
576}\r
577\r
578\r
579void RBBITest::TestJapaneseWordBreak() {\r
580 UErrorCode status = U_ZERO_ERROR;\r
581 BITestData japaneseWordSelection(status);\r
582\r
583 ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status); // Break at start of data\r
584 ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2\r
585 ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5\r
586 ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7\r
587 ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10\r
588 ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11\r
589 ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12\r
590\r
591 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(\r
592 Locale("ja"), status);\r
593 if (U_FAILURE(status))\r
594 {\r
595 errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");\r
596 return;\r
597 }\r
598\r
599 generalIteratorTest(*e, japaneseWordSelection);\r
600 delete e;\r
601}\r
602\r
603//---------------------------------------------\r
604// runIndexedTest\r
605//---------------------------------------------\r
606\r
607void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )\r
608{\r
609 if (exec) logln("TestSuite RuleBasedBreakIterator: ");\r
610 \r
611 switch (index) {\r
612 case 0: name = "TestBug4153072";\r
613 if(exec) TestBug4153072(); break;\r
614 case 1: name = "TestJapaneseLineBreak";\r
615 if(exec) TestJapaneseLineBreak(); break;\r
616 case 2: name = "TestStatusReturn";\r
617 if(exec) TestStatusReturn(); break;\r
618\r
619 case 3: name = "TestLineBreakData";\r
620 if(exec) TestLineBreakData(); break;\r
621 case 4: name = "TestEmptyString";\r
622 if(exec) TestEmptyString(); break;\r
623\r
624 case 5: name = "TestGetAvailableLocales";\r
625 if(exec) TestGetAvailableLocales(); break;\r
626\r
627 case 6: name = "TestGetDisplayName";\r
628 if(exec) TestGetDisplayName(); break;\r
629\r
630 case 7: name = "TestEndBehaviour";\r
631 if(exec) TestEndBehaviour(); break;\r
632 case 8: name = "TestMixedThaiLineBreak";\r
633 if(exec) TestMixedThaiLineBreak(); break;\r
634 case 9: name = "TestThaiWordBreak";\r
635 if(exec) TestThaiWordBreak(); break;\r
636 case 10: name = "TestThaiLineBreak";\r
637 if(exec) TestThaiLineBreak(); break;\r
638 case 11: name = "TestMaiyamok";\r
639 if(exec) TestMaiyamok(); break;\r
640 case 12: name = "TestWordBreaks";\r
641 if(exec) TestWordBreaks(); break;\r
642 case 13: name = "TestWordBoundary";\r
643 if(exec) TestWordBoundary(); break;\r
644 case 14: name = "TestLineBreaks";\r
645 if(exec) TestLineBreaks(); break;\r
646 case 15: name = "TestSentBreaks";\r
647 if(exec) TestSentBreaks(); break;\r
648 case 16: name = "TestExtended";\r
649 if(exec) TestExtended(); break;\r
650 case 17: name = "TestMonkey";\r
651 if(exec) {\r
652 #if !UCONFIG_NO_REGULAR_EXPRESSIONS\r
653 TestMonkey(params);\r
654 #else\r
655 logln("skipping TestMonkey (UCONFIG_NO_REGULAR_EXPRESSIONS)");\r
656 #endif\r
657 }\r
658 break;\r
659 case 18: name = "TestBug3818";\r
660 if(exec) TestBug3818(); break;\r
661 case 19: name = "TestJapaneseWordBreak";\r
662 if(exec) TestJapaneseWordBreak(); break;\r
663\r
664 default: name = ""; break; //needed to end loop\r
665 }\r
666}\r
667\r
668\r
669//----------------------------------------------------------------------------\r
670//\r
671// generalIteratorTest Given a break iterator and a set of test data,\r
672// Run the tests and report the results.\r
673//\r
674//----------------------------------------------------------------------------\r
675void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)\r
676{\r
677\r
678 bi.setText(td.fDataToBreak);\r
679\r
680 testFirstAndNext(bi, td);\r
681\r
682 testLastAndPrevious(bi, td);\r
683\r
684 testFollowing(bi, td);\r
685 testPreceding(bi, td);\r
686 testIsBoundary(bi, td);\r
687 doMultipleSelectionTest(bi, td);\r
688}\r
689\r
690\r
691//\r
692// testFirstAndNext. Run the iterator forwards in the obvious first(), next()\r
693// kind of loop.\r
694//\r
695void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)\r
696{\r
697 UErrorCode status = U_ZERO_ERROR;\r
698 int32_t p;\r
699 int32_t lastP = -1;\r
700 int32_t tag;\r
701\r
702 logln("Test first and next");\r
703 bi.setText(td.fDataToBreak);\r
704 td.clearResults();\r
705\r
706 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {\r
707 td.fActualBreakPositions.addElement(p, status); // Save result.\r
708 tag = bi.getRuleStatus();\r
709 td.fActualTags.addElement(tag, status);\r
710 if (p <= lastP) {\r
711 // If the iterator is not making forward progress, stop.\r
712 // No need to raise an error here, it'll be detected in the normal check of results.\r
713 break;\r
714 }\r
715 lastP = p;\r
716 }\r
717 td.checkResults("testFirstAndNext", this);\r
718}\r
719\r
720\r
721//\r
722// TestLastAndPrevious. Run the iterator backwards, starting with last().\r
723//\r
724void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td)\r
725{\r
726 UErrorCode status = U_ZERO_ERROR;\r
727 int32_t p;\r
728 int32_t lastP = 0x7ffffffe;\r
729 int32_t tag;\r
730\r
731 logln("Test first and next");\r
732 bi.setText(td.fDataToBreak);\r
733 td.clearResults();\r
734\r
735 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {\r
736 // Save break position. Insert it at start of vector of results, shoving\r
737 // already-saved results further towards the end.\r
738 td.fActualBreakPositions.insertElementAt(p, 0, status);\r
739 // bi.previous(); // TODO: Why does this fix things up????\r
740 // bi.next();\r
741 tag = bi.getRuleStatus();\r
742 td.fActualTags.insertElementAt(tag, 0, status);\r
743 if (p >= lastP) {\r
744 // If the iterator is not making progress, stop.\r
745 // No need to raise an error here, it'll be detected in the normal check of results.\r
746 break;\r
747 }\r
748 lastP = p;\r
749 }\r
750 td.checkResults("testLastAndPrevious", this);\r
751}\r
752\r
753\r
754void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)\r
755{\r
756 UErrorCode status = U_ZERO_ERROR;\r
757 int32_t p;\r
758 int32_t tag;\r
759 int32_t lastP = -2; // A value that will never be returned as a break position.\r
760 // cannot be -1; that is returned for DONE.\r
761 int i;\r
762\r
763 logln("testFollowing():");\r
764 bi.setText(td.fDataToBreak);\r
765 td.clearResults();\r
766\r
767 // Save the starting point, since we won't get that out of following.\r
768 p = bi.first();\r
769 td.fActualBreakPositions.addElement(p, status); // Save result.\r
770 tag = bi.getRuleStatus();\r
771 td.fActualTags.addElement(tag, status);\r
772\r
773 for (i = 0; i <= td.fDataToBreak.length()+1; i++) {\r
774 p = bi.following(i);\r
775 if (p != lastP) {\r
776 if (p == RuleBasedBreakIterator::DONE) {\r
777 break;\r
778 }\r
779 // We've reached a new break position. Save it.\r
780 td.fActualBreakPositions.addElement(p, status); // Save result.\r
781 tag = bi.getRuleStatus();\r
782 td.fActualTags.addElement(tag, status);\r
783 lastP = p;\r
784 }\r
785 }\r
786 // The loop normally exits by means of the break in the middle.\r
787 // Make sure that the index was at the correct position for the break iterator to have\r
788 // returned DONE.\r
789 if (i != td.fDataToBreak.length()) {\r
790 errln("testFollowing(): iterator returned DONE prematurely.");\r
791 }\r
792\r
793 // Full check of all results.\r
794 td.checkResults("testFollowing", this);\r
795}\r
796\r
797\r
798\r
799void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) {\r
800 UErrorCode status = U_ZERO_ERROR;\r
801 int32_t p;\r
802 int32_t tag;\r
803 int32_t lastP = 0x7ffffffe;\r
804 int i;\r
805\r
806 logln("testPreceding():");\r
807 bi.setText(td.fDataToBreak);\r
808 td.clearResults();\r
809\r
810 p = bi.last();\r
811 td.fActualBreakPositions.addElement(p, status);\r
812 tag = bi.getRuleStatus();\r
813 td.fActualTags.addElement(tag, status);\r
814\r
815 for (i = td.fDataToBreak.length(); i>=-1; i--) {\r
816 p = bi.preceding(i);\r
817 if (p != lastP) {\r
818 if (p == RuleBasedBreakIterator::DONE) {\r
819 break;\r
820 }\r
821 // We've reached a new break position. Save it.\r
822 td.fActualBreakPositions.insertElementAt(p, 0, status);\r
823 lastP = p;\r
824 tag = bi.getRuleStatus();\r
825 td.fActualTags.insertElementAt(tag, 0, status);\r
826 }\r
827 }\r
828 // The loop normally exits by means of the break in the middle.\r
829 // Make sure that the index was at the correct position for the break iterator to have\r
830 // returned DONE.\r
831 if (i != 0) {\r
832 errln("testPreceding(): iterator returned DONE prematurely.");\r
833 }\r
834\r
835 // Full check of all results.\r
836 td.checkResults("testPreceding", this);\r
837}\r
838\r
839\r
840\r
841void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) {\r
842 UErrorCode status = U_ZERO_ERROR;\r
843 int i;\r
844 int32_t tag;\r
845\r
846 logln("testIsBoundary():");\r
847 bi.setText(td.fDataToBreak);\r
848 td.clearResults();\r
849\r
850 for (i = 0; i <= td.fDataToBreak.length(); i++) {\r
851 if (bi.isBoundary(i)) {\r
852 td.fActualBreakPositions.addElement(i, status); // Save result.\r
853 tag = bi.getRuleStatus();\r
854 td.fActualTags.addElement(tag, status);\r
855 }\r
856 }\r
857 td.checkResults("testIsBoundary: ", this);\r
858}\r
859\r
860\r
861\r
862void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)\r
863{\r
864 iterator.setText(td.fDataToBreak);\r
865\r
866 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();\r
867 int32_t offset = iterator.first();\r
868 int32_t testOffset;\r
869 int32_t count = 0;\r
870\r
871 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());\r
872\r
873 if (*testIterator != iterator)\r
874 errln("clone() or operator!= failed: two clones compared unequal");\r
875\r
876 do {\r
877 testOffset = testIterator->first();\r
878 testOffset = testIterator->next(count);\r
879 if (offset != testOffset)\r
880 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);\r
881\r
882 if (offset != RuleBasedBreakIterator::DONE) {\r
883 count++;\r
884 offset = iterator.next();\r
885\r
886 if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {\r
887 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);\r
888 if (count > 10000 || offset == -1) {\r
889 errln("operator== failed too many times. Stopping test.");\r
890 if (offset == -1) {\r
891 errln("Does (RuleBasedBreakIterator::DONE == -1)?");\r
892 }\r
893 return;\r
894 }\r
895 }\r
896 }\r
897 } while (offset != RuleBasedBreakIterator::DONE);\r
898\r
899 // now do it backwards...\r
900 offset = iterator.last();\r
901 count = 0;\r
902\r
903 do {\r
904 testOffset = testIterator->last();\r
905 testOffset = testIterator->next(count); // next() with a negative arg is same as previous\r
906 if (offset != testOffset)\r
907 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);\r
908\r
909 if (offset != RuleBasedBreakIterator::DONE) {\r
910 count--;\r
911 offset = iterator.previous();\r
912 }\r
913 } while (offset != RuleBasedBreakIterator::DONE);\r
914\r
915 delete testIterator;\r
916}\r
917\r
918\r
919\r
920//--------------------------------------------------------------------------------------------\r
921//\r
922// Break Iterator Invariants Tests\r
923//\r
924//--------------------------------------------------------------------------------------------\r
925\r
926void RBBITest::TestCharacterInvariants()\r
927{\r
928 UErrorCode status = U_ZERO_ERROR;\r
929 BreakIterator *e = BreakIterator::createCharacterInstance(Locale::getDefault(), status);\r
930 if (U_FAILURE(status))\r
931 {\r
932 errln("Failed to create the BreakIterator for default locale in TestCharacterInvariants.\n");\r
933 return;\r
934 }\r
935 UnicodeString s = *cannedTestChars + CharsToUnicodeString("\\u1100\\u1101\\u1102\\u1160\\u1161\\u1162\\u11a8\\u11a9\\u11aa");\r
936 doBreakInvariantTest(*e, s);\r
937 s = *cannedTestChars + CharsToUnicodeString("\\u1100\\u1101\\u1102\\u1160\\u1161\\u1162\\u11a8\\u11a9\\u11aa");\r
938 doOtherInvariantTest(*e, s);\r
939 delete e;\r
940}\r
941\r
942\r
943void RBBITest::TestWordInvariants()\r
944{\r
945 UErrorCode status = U_ZERO_ERROR;\r
946 BreakIterator *e = BreakIterator::createWordInstance(Locale::getDefault(), status);\r
947 if (U_FAILURE(status))\r
948 {\r
949 errln("Failed to create the BreakIterator for default locale in TestWordInvariants.\n");\r
950 return;\r
951 }\r
952 UnicodeString s = *cannedTestChars + CharsToUnicodeString("\',.\\u3041\\u3042\\u3043\\u309b\\u309c\\u30a1\\u30a2\\u30a3\\u4e00\\u4e01\\u4e02");\r
953 doBreakInvariantTest(*e, s);\r
954 s = *cannedTestChars + CharsToUnicodeString("\',.\\u3041\\u3042\\u3043\\u309b\\u309c\\u30a1\\u30a2\\u30a3\\u4e00\\u4e01\\u4e02");\r
955 doOtherInvariantTest(*e, s);\r
956 delete e;\r
957}\r
958\r
959\r
960void RBBITest::TestSentenceInvariants()\r
961{\r
962 UErrorCode status = U_ZERO_ERROR;\r
963 BreakIterator *e = BreakIterator::createSentenceInstance(Locale::getDefault(), status);\r
964 if (U_FAILURE(status))\r
965 {\r
966 errln("Failed to create the BreakIterator for default locale in TestSentenceInvariant.\n");\r
967 return;\r
968 }\r
969 UnicodeString s = *cannedTestChars + CharsToUnicodeString(".,\\u3001\\u3002\\u3041\\u3042\\u3043\\ufeff");\r
970 doOtherInvariantTest(*e, s);\r
971 delete e;\r
972}\r
973\r
974\r
975\r
976\r
977void RBBITest::doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars)\r
978{\r
979 UnicodeString work("aaa");\r
980 int32_t errCount = 0, testCharsLen = testChars.length(), breaksLen;\r
981\r
982 // a break should always occur after CR (unless followed by LF), LF, PS, and LS\r
983 UnicodeString breaks = CharsToUnicodeString("\r\n\\u2029\\u2028");\r
984 int32_t i, j;\r
985\r
986 breaksLen = breaks.length();\r
987 for (i = 0; i < breaksLen; i++) {\r
988 UChar c1 = breaks[i];\r
989 work.setCharAt(1, c1);\r
990 for (j = 0; j < testCharsLen; j++) {\r
991 UChar c0 = testChars[j];\r
992 work.setCharAt(0, c0);\r
993 int k;\r
994 for (k = 0; k < testCharsLen; k++) {\r
995 UChar c2 = testChars[k];\r
996 work.setCharAt(2, c2);\r
997\r
998 // if a cr is followed by lf, ps, ls or etx, don't do the check (that's\r
999 // not supposed to work)\r
1000 if (c1 == '\r' && (c2 == '\n' || c2 == 0x2029\r
1001 || c2 == 0x2028 || c2 == 0x0003))\r
1002 continue;\r
1003\r
1004 if (u_charType(c1) == U_CONTROL_CHAR &&\r
1005 (u_charType(c2) == U_NON_SPACING_MARK ||\r
1006 u_charType(c2) == U_ENCLOSING_MARK ||\r
1007 u_charType(c2) == U_COMBINING_SPACING_MARK)\r
1008 ) {\r
1009 // Combining marks don't combine with controls.\r
1010 // TODO: enhance test to verify that the break actually occurs,\r
1011 // not just ignore the case.\r
1012 continue;\r
1013 }\r
1014\r
1015\r
1016 tb.setText(work);\r
1017 UBool seen2 = FALSE;\r
1018 int l;\r
1019 for (l = tb.first(); l != BreakIterator::DONE; l = tb.next()) {\r
1020 if (l == 2) {\r
1021 seen2 = TRUE;\r
1022 break;\r
1023 }\r
1024 }\r
1025 if (!seen2) {\r
1026 printStringBreaks(work, NULL, 0); \r
1027 errln("No Break between \\U%04x and \\U%04x", c1, c2);\r
1028 errCount++;\r
1029 if (errCount >= 75)\r
1030 return;\r
1031 }\r
1032 }\r
1033 }\r
1034 }\r
1035}\r
1036\r
1037\r
1038\r
1039void RBBITest::doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars)\r
1040{\r
1041 UnicodeString work("a\r\na");\r
1042 int32_t errCount = 0, testCharsLen = testChars.length();\r
1043 int32_t i, j;\r
1044 int8_t type;\r
1045\r
1046 // a break should never occur between CR and LF\r
1047 for (i = 0; i < testCharsLen; i++) {\r
1048 work.setCharAt(0, testChars[i]);\r
1049 for (j = 0; j < testCharsLen; j++) {\r
1050 work.setCharAt(3, testChars[j]);\r
1051 tb.setText(work);\r
1052 int32_t k;\r
1053 for (k = tb.first(); k != BreakIterator::DONE; k = tb.next())\r
1054 if (k == 2) {\r
1055 errln("Break between CR and LF in string U\\%04x U\\%04x U\\%04x U\\%04x",\r
1056 work[0], work[1], work[2], work[3]);\r
1057 errCount++;\r
1058 if (errCount >= 75)\r
1059 return;\r
1060 }\r
1061 }\r
1062 }\r
1063\r
1064 // a break should never occur before a non-spacing mark, unless the preceding\r
1065 // character is CR, LF, PS, or LS\r
1066 // Or the general category == Control.\r
1067 work.remove();\r
1068 work += "aaaa";\r
1069 for (i = 0; i < testCharsLen; i++) {\r
1070 UChar c1 = testChars[i];\r
1071 if (c1 == '\n' || c1 == '\r' || c1 == 0x2029 || c1 == 0x2028 || c1 == 0x0003 ||\r
1072 u_charType(c1) == U_CONTROL_CHAR || u_charType(c1) == U_FORMAT_CHAR) {\r
1073 continue;\r
1074 }\r
1075 work.setCharAt(1, c1);\r
1076 for (j = 0; j < testCharsLen; j++) {\r
1077 UChar c2 = testChars[j];\r
1078 type = u_charType(c2);\r
1079 if ((type != U_NON_SPACING_MARK) &&\r
1080 (type != U_ENCLOSING_MARK)) {\r
1081 continue;\r
1082 }\r
1083 work.setCharAt(2, c2);\r
1084 tb.setText(work);\r
1085 int k;\r
1086 for (k = tb.first(); k != BreakIterator::DONE; k = tb.next())\r
1087 if (k == 2) {\r
1088 //errln("Break between U+" + UCharToUnicodeString(work[1])\r
1089 // + " and U+" + UCharToUnicodeString(work[2]));\r
1090 errln("Unexpected Break between %6x and %6x", c1, c2);\r
1091 errCount++;\r
1092 if (errCount >= 75)\r
1093 return;\r
1094 }\r
1095 }\r
1096 }\r
1097}\r
1098\r
1099\r
1100\r
1101\r
1102//---------------------------------------------\r
1103//\r
1104// other tests\r
1105//\r
1106//---------------------------------------------\r
1107void RBBITest::TestEmptyString()\r
1108{\r
1109 UnicodeString text = "";\r
1110 UErrorCode status = U_ZERO_ERROR;\r
1111\r
1112 BITestData x(status);\r
1113 ADD_DATACHUNK(x, "", 0, status); // Break at start of data\r
1114 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);\r
1115 if (U_FAILURE(status))\r
1116 {\r
1117 errln("Failed to create the BreakIterator for default locale in TestEmptyString.\n");\r
1118 return;\r
1119 }\r
1120 generalIteratorTest(*bi, x);\r
1121 delete bi;\r
1122}\r
1123\r
1124void RBBITest::TestGetAvailableLocales()\r
1125{\r
1126 int32_t locCount = 0;\r
1127 const Locale* locList = BreakIterator::getAvailableLocales(locCount);\r
1128\r
1129 if (locCount == 0)\r
1130 errln("getAvailableLocales() returned an empty list!");\r
1131 // Just make sure that it's returning good memory.\r
1132 int32_t i;\r
1133 for (i = 0; i < locCount; ++i) {\r
1134 logln(locList[i].getName());\r
1135 }\r
1136}\r
1137\r
1138//Testing the BreakIterator::getDisplayName() function\r
1139void RBBITest::TestGetDisplayName()\r
1140{\r
1141 UnicodeString result;\r
1142\r
1143 BreakIterator::getDisplayName(Locale::getUS(), result);\r
1144 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")\r
1145 errln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""\r
1146 + result);\r
1147\r
1148 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);\r
1149 if (result != "French (France)")\r
1150 errln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""\r
1151 + result);\r
1152}\r
1153/**\r
1154 * Test End Behaviour\r
1155 * @bug 4068137\r
1156 */\r
1157void RBBITest::TestEndBehaviour()\r
1158{\r
1159 UErrorCode status = U_ZERO_ERROR;\r
1160 UnicodeString testString("boo.");\r
1161 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);\r
1162 if (U_FAILURE(status))\r
1163 {\r
1164 errln("Failed to create the BreakIterator for default locale in TestEndBehaviour.\n");\r
1165 return;\r
1166 }\r
1167 wb->setText(testString);\r
1168\r
1169 if (wb->first() != 0)\r
1170 errln("Didn't get break at beginning of string.");\r
1171 if (wb->next() != 3)\r
1172 errln("Didn't get break before period in \"boo.\"");\r
1173 if (wb->current() != 4 && wb->next() != 4)\r
1174 errln("Didn't get break at end of string.");\r
1175 delete wb;\r
1176}\r
1177/*\r
1178 * @bug 4153072\r
1179 */\r
1180void RBBITest::TestBug4153072() {\r
1181 UErrorCode status = U_ZERO_ERROR;\r
1182 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);\r
1183 if (U_FAILURE(status))\r
1184 {\r
1185 errln("Failed to create the BreakIterator for default locale in TestBug4153072\n");\r
1186 return;\r
1187 }\r
1188 UnicodeString str("...Hello, World!...");\r
1189 int32_t begin = 3;\r
1190 int32_t end = str.length() - 3;\r
1191 UBool dummy;\r
1192\r
1193 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);\r
1194 iter->adoptText(textIterator);\r
1195 int index;\r
1196 for (index = -1; index < begin + 1; ++index) {\r
1197 dummy = iter->isBoundary(index);\r
1198 if (index < begin && dummy == TRUE) {\r
1199 errln((UnicodeString)"Didn't handle preceeding correctly with offset = " + index +\r
1200 " and begin index = " + begin);\r
1201 }\r
1202 }\r
1203 delete iter;\r
1204}\r
1205\r
1206\r
1207/**\r
1208 * Test Japanese Line Break\r
1209 * @bug 4095322\r
1210 */\r
1211void RBBITest::TestJapaneseLineBreak()\r
1212{\r
1213#if 0\r
1214 // Test needs updating some more... Dump it for now.\r
1215\r
1216\r
1217 // Change for Unicode TR 14: Punctuation characters with categories Pi and Pf do not count\r
1218 // as opening and closing punctuation for line breaking.\r
1219 // Also, \u30fc and \u30fe are not counted as hyphens. Remove these chars\r
1220 // from these tests. 6-13-2002\r
1221 //\r
1222 UErrorCode status = U_ZERO_ERROR;\r
1223 UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c");\r
1224 UnicodeString precedingChars = CharsToUnicodeString(\r
1225 //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");\r
1226 "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");\r
1227 UnicodeString followingChars = CharsToUnicodeString(\r
1228 // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"\r
1229 ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"\r
1230 // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"\r
1231 ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"\r
1232 "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");\r
1233 BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status);\r
1234\r
1235 int32_t i;\r
1236 if (U_FAILURE(status))\r
1237 {\r
1238 errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");\r
1239 return;\r
1240 }\r
1241\r
1242 for (i = 0; i < precedingChars.length(); i++) {\r
1243 testString.setCharAt(1, precedingChars[i]);\r
1244 iter->setText(testString);\r
1245 int32_t j = iter->first();\r
1246 if (j != 0)\r
1247 errln("ja line break failure: failed to start at 0");\r
1248 j = iter->next();\r
1249 if (j != 1)\r
1250 errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i])\r
1251 + "' (" + ((int)(precedingChars[i])) + ")");\r
1252 j = iter->next();\r
1253 if (j != 3)\r
1254 errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i])\r
1255 + "' (" + ((int)(precedingChars[i])) + ")");\r
1256 }\r
1257\r
1258 for (i = 0; i < followingChars.length(); i++) {\r
1259 testString.setCharAt(1, followingChars[i]);\r
1260 iter->setText(testString);\r
1261 int j = iter->first();\r
1262 if (j != 0)\r
1263 errln("ja line break failure: failed to start at 0");\r
1264 j = iter->next();\r
1265 if (j != 2)\r
1266 errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i])\r
1267 + "' (" + ((int)(followingChars[i])) + ")");\r
1268 j = iter->next();\r
1269 if (j != 3)\r
1270 errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i])\r
1271 + "' (" + ((int)(followingChars[i])) + ")");\r
1272 }\r
1273 delete iter;\r
1274#endif\r
1275}\r
1276\r
1277\r
1278//------------------------------------------------------------------------------\r
1279//\r
//   RBBITest::TestExtended    Run RBBI Tests from an external test data file
1281//\r
1282//------------------------------------------------------------------------------\r
1283\r
1284struct TestParams {\r
1285 BreakIterator *bi;\r
1286 UnicodeString dataToBreak;\r
1287 UVector32 *expectedBreaks;\r
1288 UVector32 *srcLine;\r
1289 UVector32 *srcCol;\r
1290};\r
1291\r
1292void RBBITest::executeTest(TestParams *t) {\r
1293 int32_t bp;\r
1294 int32_t prevBP;\r
1295 int32_t i;\r
1296\r
1297 t->bi->setText(t->dataToBreak);\r
1298 //\r
1299 // Run the iterator forward\r
1300 //\r
1301 prevBP = -1;\r
1302 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {\r
1303 if (prevBP == bp) {\r
1304 // Fail for lack of forward progress.\r
1305 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",\r
1306 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));\r
1307 break;\r
1308 }\r
1309\r
1310 // Check that there were we didn't miss an expected break between the last one\r
1311 // and this one.\r
1312 for (i=prevBP+1; i<bp; i++) {\r
1313 if (t->expectedBreaks->elementAti(i) != 0) {\r
1314 int expected[] = {0, i};\r
1315 printStringBreaks(t->dataToBreak, expected, 2);\r
1316 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",\r
1317 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));\r
1318 }\r
1319 }\r
1320\r
1321 // Check that the break we did find was expected\r
1322 if (t->expectedBreaks->elementAti(bp) == 0) {\r
1323 int expected[] = {0, bp};\r
1324 printStringBreaks(t->dataToBreak, expected, 2);\r
1325 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",\r
1326 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));\r
1327 } else {\r
1328 // The break was expected.\r
1329 // Check that the {nnn} tag value is correct.\r
1330 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);\r
1331 if (expectedTagVal == -1) {\r
1332 expectedTagVal = 0;\r
1333 }\r
1334 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();\r
1335 if (rs != expectedTagVal) {\r
1336 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"\r
1337 " Actual, Expected status = %4d, %4d",\r
1338 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp), rs, expectedTagVal);\r
1339 }\r
1340 }\r
1341\r
1342\r
1343 prevBP = bp;\r
1344 }\r
1345\r
1346 // Verify that there were no missed expected breaks after the last one found\r
1347 for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {\r
1348 if (t->expectedBreaks->elementAti(i) != 0) {\r
1349 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",\r
1350 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));\r
1351 }\r
1352 }\r
1353\r
1354 //\r
1355 // Run the iterator backwards, verify that the same breaks are found.\r
1356 //\r
1357 prevBP = t->dataToBreak.length()+2; // start with a phony value for the last break pos seen.\r
1358 for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {\r
1359 if (prevBP == bp) {\r
1360 // Fail for lack of progress.\r
1361 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",\r
1362 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));\r
1363 break;\r
1364 }\r
1365\r
1366 // Check that there were we didn't miss an expected break between the last one\r
1367 // and this one. (UVector returns zeros for index out of bounds.)\r
1368 for (i=prevBP-1; i>bp; i--) {\r
1369 if (t->expectedBreaks->elementAti(i) != 0) {\r
1370 errln("Reverse Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",\r
1371 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));\r
1372 }\r
1373 }\r
1374\r
1375 // Check that the break we did find was expected\r
1376 if (t->expectedBreaks->elementAti(bp) == 0) {\r
1377 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",\r
1378 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));\r
1379 } else {\r
1380 // The break was expected.\r
1381 // Check that the {nnn} tag value is correct.\r
1382 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);\r
1383 if (expectedTagVal == -1) {\r
1384 expectedTagVal = 0;\r
1385 }\r
1386 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();\r
1387 if (rs != expectedTagVal) {\r
1388 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"\r
1389 " Actual, Expected status = %4d, %4d",\r
1390 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp), rs, expectedTagVal);\r
1391 }\r
1392 }\r
1393\r
1394 prevBP = bp;\r
1395 }\r
1396\r
1397 // Verify that there were no missed breaks prior to the last one found\r
1398 for (i=prevBP-1; i>=0; i--) {\r
1399 if (t->expectedBreaks->elementAti(i) != 0) {\r
1400 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",\r
1401 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));\r
1402 }\r
1403 }\r
1404}\r
1405\r
1406\r
1407void RBBITest::TestExtended() {\r
1408 UErrorCode status = U_ZERO_ERROR;\r
1409 Locale locale = Locale::getDefault();\r
1410\r
1411 UnicodeString rules;\r
1412 TestParams tp;\r
1413 tp.bi = NULL;\r
1414 tp.expectedBreaks = new UVector32(status);\r
1415 tp.srcLine = new UVector32(status);\r
1416 tp.srcCol = new UVector32(status);\r
1417\r
1418\r
1419 //\r
1420 // Open and read the test data file.\r
1421 //\r
1422 const char *testDataDirectory = IntlTest::getSourceTestData(status);\r
1423 char testFileName[1000];\r
1424 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {\r
1425 errln("Can't open test data. Path too long.");\r
1426 return;\r
1427 }\r
1428 strcpy(testFileName, testDataDirectory);\r
1429 strcat(testFileName, "rbbitst.txt");\r
1430\r
1431 int len;\r
1432 UChar *testFile = ReadAndConvertFile(testFileName, len, status);\r
1433 if (U_FAILURE(status)) {\r
1434 return; /* something went wrong, error already output */\r
1435 }\r
1436\r
1437\r
1438\r
1439 //\r
1440 // Put the test data into a UnicodeString\r
1441 //\r
1442 UnicodeString testString(FALSE, testFile, len);\r
1443\r
1444 enum EParseState{\r
1445 PARSE_COMMENT,\r
1446 PARSE_TAG,\r
1447 PARSE_DATA,\r
1448 PARSE_NUM\r
1449 }\r
1450 parseState = PARSE_TAG;\r
1451\r
1452 EParseState savedState = PARSE_TAG;\r
1453\r
1454 static const UChar CH_LF = 0x0a;\r
1455 static const UChar CH_CR = 0x0d;\r
1456 static const UChar CH_HASH = 0x23;\r
1457 /*static const UChar CH_PERIOD = 0x2e;*/\r
1458 static const UChar CH_LT = 0x3c;\r
1459 static const UChar CH_GT = 0x3e;\r
1460 static const UChar CH_BACKSLASH = 0x5c;\r
1461 static const UChar CH_BULLET = 0x2022;\r
1462\r
1463 int32_t lineNum = 1;\r
1464 int32_t colStart = 0;\r
1465 int32_t column = 0;\r
1466 int32_t charIdx = 0;\r
1467\r
1468 int32_t tagValue = 0; // The numeric value of a <nnn> tag.\r
1469\r
1470 for (charIdx = 0; charIdx < len; ) {\r
1471 UChar c = testString.charAt(charIdx);\r
1472 charIdx++;\r
1473 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {\r
1474 // treat CRLF as a unit\r
1475 c = CH_LF;\r
1476 charIdx++;\r
1477 }\r
1478 if (c == CH_LF || c == CH_CR) {\r
1479 lineNum++;\r
1480 colStart = charIdx;\r
1481 }\r
1482 column = charIdx - colStart + 1;\r
1483\r
1484 switch (parseState) {\r
1485 case PARSE_COMMENT:\r
1486 if (c == 0x0a || c == 0x0d) {\r
1487 parseState = savedState;\r
1488 }\r
1489 break;\r
1490\r
1491 case PARSE_TAG:\r
1492 {\r
1493 if (c == CH_HASH) {\r
1494 parseState = PARSE_COMMENT;\r
1495 savedState = PARSE_TAG;\r
1496 break;\r
1497 }\r
1498 if (u_isUWhiteSpace(c)) {\r
1499 break;\r
1500 }\r
1501 if (testString.compare(charIdx-1, 6, "<word>") == 0) {\r
1502 delete tp.bi;\r
1503 tp.bi = BreakIterator::createWordInstance(locale, status);\r
1504 charIdx += 5;\r
1505 break;\r
1506 }\r
1507 if (testString.compare(charIdx-1, 6, "<char>") == 0) {\r
1508 delete tp.bi;\r
1509 tp.bi = BreakIterator::createCharacterInstance(locale, status);\r
1510 charIdx += 5;\r
1511 break;\r
1512 }\r
1513 if (testString.compare(charIdx-1, 6, "<line>") == 0) {\r
1514 delete tp.bi;\r
1515 tp.bi = BreakIterator::createLineInstance(locale, status);\r
1516 charIdx += 5;\r
1517 break;\r
1518 }\r
1519 if (testString.compare(charIdx-1, 6, "<sent>") == 0) {\r
1520 delete tp.bi;\r
1521 tp.bi = BreakIterator::createSentenceInstance(locale, status);\r
1522 charIdx += 5;\r
1523 break;\r
1524 }\r
1525 if (testString.compare(charIdx-1, 7, "<title>") == 0) {\r
1526 delete tp.bi;\r
1527 tp.bi = BreakIterator::createTitleInstance(locale, status);\r
1528 charIdx += 6;\r
1529 break;\r
1530 }\r
1531 if (testString.compare(charIdx-1, 6, "<data>") == 0) {\r
1532 parseState = PARSE_DATA;\r
1533 charIdx += 5;\r
1534 tp.dataToBreak = "";\r
1535 tp.expectedBreaks->removeAllElements();\r
1536 tp.srcCol ->removeAllElements();\r
1537 tp.srcLine->removeAllElements();\r
1538 break;\r
1539 }\r
1540\r
1541 errln("line %d: Tag expected in test file.", lineNum);\r
1542 goto end_test;\r
1543 parseState = PARSE_COMMENT;\r
1544 savedState = PARSE_DATA;\r
1545 }\r
1546 break;\r
1547\r
1548 case PARSE_DATA:\r
1549 if (c == CH_BULLET) {\r
1550 int32_t breakIdx = tp.dataToBreak.length();\r
1551 tp.expectedBreaks->setSize(breakIdx+1);\r
1552 tp.expectedBreaks->setElementAt(-1, breakIdx);\r
1553 tp.srcLine->setSize(breakIdx+1);\r
1554 tp.srcLine->setElementAt(lineNum, breakIdx);\r
1555 tp.srcCol ->setSize(breakIdx+1);\r
1556 tp.srcCol ->setElementAt(column, breakIdx);\r
1557 break;\r
1558 }\r
1559\r
1560 if (testString.compare(charIdx-1, 7, "</data>") == 0) {\r
1561 // Add final entry to mappings from break location to source file position.\r
1562 // Need one extra because last break position returned is after the\r
1563 // last char in the data, not at the last char.\r
1564 tp.srcLine->addElement(lineNum, status);\r
1565 tp.srcCol ->addElement(column, status);\r
1566\r
1567 parseState = PARSE_TAG;\r
1568 charIdx += 7;\r
1569\r
1570 // RUN THE TEST!\r
1571 executeTest(&tp);\r
1572 break;\r
1573 }\r
1574\r
1575 if (testString.compare(charIdx-1, 3, "\\N{") == 0) {\r
1576 // Named character, e.g. \N{COMBINING GRAVE ACCENT}\r
1577 // Get the code point from the name and insert it into the test data.\r
1578 // (Damn, no API takes names in Unicode !!!\r
1579 // we've got to take it back to char *)\r
1580 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);\r
1581 int32_t nameLength = nameEndIdx - (charIdx+2);\r
1582 char charNameBuf[200];\r
1583 UChar32 theChar = -1;\r
1584 if (nameEndIdx != -1) {\r
1585 UErrorCode status = U_ZERO_ERROR;\r
1586 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));\r
1587 charNameBuf[sizeof(charNameBuf)-1] = 0;\r
1588 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);\r
1589 if (U_FAILURE(status)) {\r
1590 theChar = -1;\r
1591 }\r
1592 }\r
1593 if (theChar == -1) {\r
1594 errln("Error in named character in test file at line %d, col %d",\r
1595 lineNum, column);\r
1596 } else {\r
1597 // Named code point was recognized. Insert it\r
1598 // into the test data.\r
1599 tp.dataToBreak.append(theChar);\r
1600 while (tp.dataToBreak.length() > tp.srcLine->size()) {\r
1601 tp.srcLine->addElement(lineNum, status);\r
1602 tp.srcCol ->addElement(column, status);\r
1603 }\r
1604 }\r
1605 if (nameEndIdx > charIdx) {\r
1606 charIdx = nameEndIdx+1;\r
1607 }\r
1608 break;\r
1609 }\r
1610\r
1611\r
1612\r
1613\r
1614 if (testString.compare(charIdx-1, 2, "<>") == 0) {\r
1615 charIdx++;\r
1616 int32_t breakIdx = tp.dataToBreak.length();\r
1617 tp.expectedBreaks->setSize(breakIdx+1);\r
1618 tp.expectedBreaks->setElementAt(-1, breakIdx);\r
1619 tp.srcLine->setSize(breakIdx+1);\r
1620 tp.srcLine->setElementAt(lineNum, breakIdx);\r
1621 tp.srcCol ->setSize(breakIdx+1);\r
1622 tp.srcCol ->setElementAt(column, breakIdx);\r
1623 break;\r
1624 }\r
1625\r
1626 if (c == CH_LT) {\r
1627 tagValue = 0;\r
1628 parseState = PARSE_NUM;\r
1629 break;\r
1630 }\r
1631\r
1632 if (c == CH_HASH && column==3) { // TODO: why is column off so far?\r
1633 parseState = PARSE_COMMENT;\r
1634 savedState = PARSE_DATA;\r
1635 break;\r
1636 }\r
1637\r
1638 if (c == CH_BACKSLASH) {\r
1639 // Check for \ at end of line, a line continuation.\r
1640 // Advance over (discard) the newline\r
1641 UChar32 cp = testString.char32At(charIdx);\r
1642 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {\r
1643 // We have a CR LF\r
1644 // Need an extra increment of the input ptr to move over both of them\r
1645 charIdx++;\r
1646 }\r
1647 if (cp == CH_LF || cp == CH_CR) {\r
1648 lineNum++;\r
1649 colStart = charIdx;\r
1650 charIdx++;\r
1651 break;\r
1652 }\r
1653\r
1654 // Let unescape handle the back slash.\r
1655 cp = testString.unescapeAt(charIdx);\r
1656 if (cp != -1) {\r
1657 // Escape sequence was recognized. Insert the char\r
1658 // into the test data.\r
1659 tp.dataToBreak.append(cp);\r
1660 while (tp.dataToBreak.length() > tp.srcLine->size()) {\r
1661 tp.srcLine->addElement(lineNum, status);\r
1662 tp.srcCol ->addElement(column, status);\r
1663 }\r
1664 break;\r
1665 }\r
1666\r
1667\r
1668 // Not a recognized backslash escape sequence.\r
1669 // Take the next char as a literal.\r
1670 // TODO: Should this be an error?\r
1671 c = testString.charAt(charIdx);\r
1672 charIdx = testString.moveIndex32(charIdx, 1);\r
1673 }\r
1674\r
1675 // Normal, non-escaped data char.\r
1676 tp.dataToBreak.append(c);\r
1677\r
1678 // Save the mapping from offset in the data to line/column numbers in\r
1679 // the original input file. Will be used for better error messages only.\r
1680 // If there's an expected break before this char, the slot in the mapping\r
1681 // vector will already be set for this char; don't overwrite it.\r
1682 if (tp.dataToBreak.length() > tp.srcLine->size()) {\r
1683 tp.srcLine->addElement(lineNum, status);\r
1684 tp.srcCol ->addElement(column, status);\r
1685 }\r
1686 break;\r
1687\r
1688\r
1689 case PARSE_NUM:\r
1690 // We are parsing an expected numeric tag value, like <1234>,\r
1691 // within a chunk of data.\r
1692 if (u_isUWhiteSpace(c)) {\r
1693 break;\r
1694 }\r
1695\r
1696 if (c == CH_GT) {\r
1697 // Finished the number. Add the info to the expected break data,\r
1698 // and switch parse state back to doing plain data.\r
1699 parseState = PARSE_DATA;\r
1700 if (tagValue == 0) {\r
1701 tagValue = -1;\r
1702 }\r
1703 int32_t breakIdx = tp.dataToBreak.length();\r
1704 tp.expectedBreaks->setSize(breakIdx+1);\r
1705 tp.expectedBreaks->setElementAt(tagValue, breakIdx);\r
1706 tp.srcLine->setSize(breakIdx+1);\r
1707 tp.srcLine->setElementAt(lineNum, breakIdx);\r
1708 tp.srcCol ->setSize(breakIdx+1);\r
1709 tp.srcCol ->setElementAt(column, breakIdx);\r
1710 break;\r
1711 }\r
1712\r
1713 if (u_isdigit(c)) {\r
1714 tagValue = tagValue*10 + u_charDigitValue(c);\r
1715 break;\r
1716 }\r
1717\r
1718 errln("Syntax Error in test file at line %d, col %d",\r
1719 lineNum, column);\r
1720 goto end_test;\r
1721 parseState = PARSE_COMMENT;\r
1722 break;\r
1723 }\r
1724\r
1725\r
1726 if (U_FAILURE(status)) {\r
1727 errln("ICU Error %s while parsing test file at line %d.",\r
1728 u_errorName(status), lineNum);\r
1729 goto end_test;\r
1730 status = U_ZERO_ERROR;\r
1731 }\r
1732\r
1733 }\r
1734\r
1735end_test:\r
1736 delete tp.bi;\r
1737 delete tp.expectedBreaks;\r
1738 delete tp.srcLine;\r
1739 delete tp.srcCol;\r
1740 delete [] testFile;\r
1741}\r
1742\r
1743\r
1744//-------------------------------------------------------------------------------\r
1745//\r
//  ReadAndConvertFile   Read a text data file, convert it to UChars, and
//      return the data in one big UChar * buffer, which the caller must delete.
1748//\r
1749// TODO: This is a clone of RegexTest::ReadAndConvertFile.\r
1750// Move this function to some common place.\r
1751//\r
1752//--------------------------------------------------------------------------------\r
1753UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, UErrorCode &status) {\r
1754 UChar *retPtr = NULL;\r
1755 char *fileBuf = NULL;\r
1756 UConverter* conv = NULL;\r
1757 FILE *f = NULL;\r
1758\r
1759 ulen = 0;\r
1760 if (U_FAILURE(status)) {\r
1761 return retPtr;\r
1762 }\r
1763\r
1764 //\r
1765 // Open the file.\r
1766 //\r
1767 f = fopen(fileName, "rb");\r
1768 if (f == 0) {\r
1769 errln("Error opening test data file %s\n", fileName);\r
1770 status = U_FILE_ACCESS_ERROR;\r
1771 return NULL;\r
1772 }\r
1773 //\r
1774 // Read it in\r
1775 //\r
1776 int fileSize;\r
1777 int amt_read;\r
1778\r
1779 fseek( f, 0, SEEK_END);\r
1780 fileSize = ftell(f);\r
1781 fileBuf = new char[fileSize];\r
1782 fseek(f, 0, SEEK_SET);\r
1783 amt_read = fread(fileBuf, 1, fileSize, f);\r
1784 if (amt_read != fileSize || fileSize <= 0) {\r
1785 errln("Error reading test data file.");\r
1786 goto cleanUpAndReturn;\r
1787 }\r
1788\r
1789 //\r
1790 // Look for a Unicode Signature (BOM) on the data just read\r
1791 //\r
1792 int32_t signatureLength;\r
1793 const char * fileBufC;\r
1794 const char* encoding;\r
1795\r
1796 fileBufC = fileBuf;\r
1797 encoding = ucnv_detectUnicodeSignature(\r
1798 fileBuf, fileSize, &signatureLength, &status);\r
1799 if(encoding!=NULL ){\r
1800 fileBufC += signatureLength;\r
1801 fileSize -= signatureLength;\r
1802 }\r
1803\r
1804 //\r
1805 // Open a converter to take the rule file to UTF-16\r
1806 //\r
1807 conv = ucnv_open(encoding, &status);\r
1808 if (U_FAILURE(status)) {\r
1809 goto cleanUpAndReturn;\r
1810 }\r
1811\r
1812 //\r
1813 // Convert the rules to UChar.\r
1814 // Preflight first to determine required buffer size.\r
1815 //\r
1816 ulen = ucnv_toUChars(conv,\r
1817 NULL, // dest,\r
1818 0, // destCapacity,\r
1819 fileBufC,\r
1820 fileSize,\r
1821 &status);\r
1822 if (status == U_BUFFER_OVERFLOW_ERROR) {\r
1823 // Buffer Overflow is expected from the preflight operation.\r
1824 status = U_ZERO_ERROR;\r
1825\r
1826 retPtr = new UChar[ulen+1];\r
1827 ucnv_toUChars(conv,\r
1828 retPtr, // dest,\r
1829 ulen+1,\r
1830 fileBufC,\r
1831 fileSize,\r
1832 &status);\r
1833 }\r
1834\r
1835cleanUpAndReturn:\r
1836 fclose(f);\r
1837 delete fileBuf;\r
1838 ucnv_close(conv);\r
1839 if (U_FAILURE(status)) {\r
1840 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));\r
1841 delete retPtr;\r
1842 retPtr = 0;\r
1843 ulen = 0;\r
1844 };\r
1845 return retPtr;\r
1846}\r
1847\r
1848\r
1849//--------------------------------------------------------------------------------------------\r
1850//\r
1851// Exhaustive Tests, using Unicode Data Files.\r
1852//\r
1853//--------------------------------------------------------------------------------------------\r
1854\r
1855//\r
1856// Token level scanner for the Unicode Line Break Test Data file.\r
1857// Return the next token, as follows:\r
1858// >= 0: a UChar32 character, scanned from hex in the file.\r
1859// -1: a break position, a division sign in the file.\r
1860// -2: end of rule. A new line in the file.\r
1861// -3: end of file. No more rules.\r
1862// -4: Error\r
1863//\r
1864// The scanner\r
1865// strips comments, ('#' to end of line)\r
1866// Recognizes CR, CR/LF and LF as new lines.\r
1867// Skips over spaces and Xs (don't break here) in the data.\r
1868//\r
1869struct ScanState {\r
1870 int32_t fPeekChar;\r
1871 UBool fPeeked;\r
1872 int32_t fLineNum;\r
1873 FILE *fFile;\r
1874 ScanState() :fPeeked(FALSE), fLineNum(0), fFile(NULL) {};\r
1875};\r
1876\r
1877// Literal characters that are of interest. In hex to keep EBCDIC based machines happy.\r
1878// The data itself is latin-1 on all platforms.\r
1879static const int32_t chSpace = 0x20;\r
1880static const int32_t chTab = 0x09;\r
1881static const int32_t chCR = 0x0D;\r
1882static const int32_t chLF = 0x0A;\r
1883static const int32_t chHash = 0x23;\r
1884static const int32_t chMult = 0xD7;\r
1885static const int32_t chDivide = 0xF7;\r
1886\r
1887static int32_t nextLBDToken(ScanState *s) {\r
1888 int32_t c;\r
1889\r
1890 // Read characters from the input file until we get something interesting\r
1891 // to return. The file is in latin-1 encoding.\r
1892 for (;;) {\r
1893 // Get the next character to look at,\r
1894 if (s->fPeeked) {\r
1895 c = s->fPeekChar;\r
1896 s->fPeeked = FALSE;\r
1897 } else {\r
1898 c = getc(s->fFile);\r
1899 }\r
1900\r
1901 // EOF. Return immediately.\r
1902 if (c == EOF) {\r
1903 return -3;\r
1904 }\r
1905\r
1906 // Spaces. Treat the multiply sign as a space - it indicates a no-break position\r
1907 // in the data, and the test program doesn't want to see them.\r
1908 // Continue the next char loop, looking for something significant.\r
1909 if (c == chSpace || c == chTab || c == chMult) {\r
1910 continue;\r
1911 }\r
1912\r
1913 // Divide sign. Indicates an expected break position.\r
1914 if (c == chDivide) {\r
1915 return -1;\r
1916 }\r
1917\r
1918 // New Line Handling. Keep track of line number in the file, which in turn\r
1919 // requires keeping track of CR/LF as a single new line.\r
1920 if (c == chCR) {\r
1921 s->fLineNum++;\r
1922 s->fPeekChar = getc(s->fFile);\r
1923 if (s->fPeekChar != chLF) {s->fPeeked = TRUE;};\r
1924 return -2;\r
1925 }\r
1926 if (c == chLF) {\r
1927 s->fLineNum++;\r
1928 return -2;\r
1929 }\r
1930\r
1931 // Comments. Consume everything up to the next new line.\r
1932 if (c == chHash) {\r
1933 do {\r
1934 c = getc(s->fFile);\r
1935 } while (!(c == EOF || c == chCR || c == chLF));\r
1936 s->fPeekChar = c;\r
1937 s->fPeeked = TRUE;\r
1938 return nextLBDToken(s);\r
1939 }\r
1940\r
1941 // Scan a hex character (UChar32) value.\r
1942 if (u_digit(c, 16) >= 0) {\r
1943 int32_t v = u_digit(c, 16);\r
1944 for (;;) {\r
1945 c = getc(s->fFile);\r
1946 if (u_digit(c, 16) < 0) {break;};\r
1947 v <<= 4;\r
1948 v += u_digit(c, 16);\r
1949 }\r
1950 s->fPeekChar = c;\r
1951 s->fPeeked = TRUE;\r
1952 return v;\r
1953 }\r
1954\r
1955 // Error. Character was something unexpected.\r
1956 return -4;\r
1957 }\r
1958}\r
1959\r
1960\r
1961\r
1962void RBBITest::TestLineBreakData() {\r
1963\r
1964 UErrorCode status = U_ZERO_ERROR;\r
1965 UnicodeString testString;\r
1966 UVector expectedBreaks(status);\r
1967 ScanState ss;\r
1968 int32_t tok;\r
1969\r
1970 BreakIterator *bi = BreakIterator::createLineInstance(Locale::getDefault(), status);\r
1971 if (U_FAILURE(status)) {\r
1972 errln("Failure creating break iterator");\r
1973 return;\r
1974 }\r
1975\r
1976 const char * lbdfName = "LBTest.txt";\r
1977\r
1978 // Open the test data file.\r
1979 // TODO: a proper way to handle this data.\r
1980 ss.fFile = fopen(lbdfName, "rb");\r
1981 if (ss.fFile == NULL) {\r
1982 logln("Unable to open Line Break Test Data file. Skipping test.");\r
1983 delete bi;\r
1984 return;\r
1985 }\r
1986\r
1987 // Loop once per line from the test data file.\r
1988 for (;;) {\r
1989 // Zero out test data from previous line.\r
1990 testString.truncate(0);\r
1991 expectedBreaks.removeAllElements();\r
1992\r
1993 // Read one test's (line's) worth of data from the file.\r
1994 // Loop once per token on the input file line.\r
1995 for(;;) {\r
1996 tok = nextLBDToken(&ss);\r
1997\r
1998 // If we scanned a character number in the file.\r
1999 // save it in the test data array.\r
2000 if (tok >= 0) {\r
2001 testString.append((UChar32)tok);\r
2002 continue;\r
2003 }\r
2004\r
2005 // If we scanned a break position in the data, record it.\r
2006 if (tok == -1) {\r
2007 expectedBreaks.addElement(testString.length(), status);\r
2008 continue;\r
2009 }\r
2010\r
2011 // If we scanned a new line, or EOF\r
2012 // drop out of scan loop and run the test case.\r
2013 if (tok == -2 || tok == -3) {break;};\r
2014\r
2015 // None of above. Error.\r
2016 errln("Failure: Unrecognized data format, test file line %d", ss.fLineNum);\r
2017 break;\r
2018 }\r
2019\r
2020 // If this line from the test data file actually contained test data,\r
2021 // run the test.\r
2022 if (testString.length() > 0) {\r
2023 int32_t pos; // Break Position in the test string\r
2024 int32_t expectedI = 0; // Index of expected break position in vector of same.\r
2025 int32_t expectedPos; // Expected break position (index into test string)\r
2026\r
2027 bi->setText(testString);\r
2028 pos = bi->first(); // TODO: break iterators always return a match at pos 0.\r
2029 pos = bi->next(); // Line Break TR says no match at position 0.\r
2030 // Resolve.\r
2031\r
2032 for (; pos != BreakIterator::DONE; ) {\r
2033 expectedPos = expectedBreaks.elementAti(expectedI);\r
2034 if (pos < expectedPos) {\r
2035 errln("Failure: Test file line %d, unexpected break found at position %d",\r
2036 ss.fLineNum, pos);\r
2037 break;\r
2038 }\r
2039 if (pos > expectedPos) {\r
2040 errln("Failure: Test file line %d, failed to find break at position %d",\r
2041 ss.fLineNum, expectedPos);\r
2042 break;\r
2043 }\r
2044 pos = bi->next();\r
2045 expectedI++;\r
2046 }\r
2047 }\r
2048\r
2049 // If we've hit EOF on the input file, we're done.\r
2050 if (tok == -3) {\r
2051 break;\r
2052 }\r
2053\r
2054 }\r
2055\r
2056 fclose(ss.fFile);\r
2057 delete bi;\r
2058\r
2059}\r
2060\r
2061#if !UCONFIG_NO_REGULAR_EXPRESSIONS\r
2062\r
2063//---------------------------------------------------------------------------------------\r
2064//\r
//  class RBBIMonkeyKind
2066//\r
2067// Monkey Test for Break Iteration\r
2068// Abstract interface class. Concrete derived classes independently\r
2069// implement the break rules for different iterator types.\r
2070//\r
//        The Monkey Test itself doesn't know which type of break iterator it is
//        testing, but works purely in terms of the interface defined here.
2073//\r
2074//---------------------------------------------------------------------------------------\r
2075class RBBIMonkeyKind {\r
2076public:\r
2077 // Return a UVector of UnicodeSets, representing the character classes used\r
2078 // for this type of iterator.\r
2079 virtual UVector *charClasses() = 0;\r
2080\r
2081 // Set the test text on which subsequent calls to next() will operate\r
2082 virtual void setText(const UnicodeString &s) = 0;\r
2083\r
2084 // Find the next break postion, starting from the prev break position, or from zero.\r
2085 // Return -1 after reaching end of string.\r
2086 virtual int32_t next(int32_t i) = 0;\r
2087\r
2088 virtual ~RBBIMonkeyKind();\r
2089 UErrorCode deferredStatus;\r
2090\r
2091\r
2092protected:\r
2093 RBBIMonkeyKind();\r
2094\r
2095private:\r
2096};\r
2097\r
2098RBBIMonkeyKind::RBBIMonkeyKind() {\r
2099 deferredStatus = U_ZERO_ERROR;\r
2100}\r
2101\r
2102RBBIMonkeyKind::~RBBIMonkeyKind() {\r
2103}\r
2104\r
2105\r
2106//----------------------------------------------------------------------------------------\r
2107//\r
2108// Random Numbers. Similar to standard lib rand() and srand()\r
2109// Not using library to\r
2110// 1. Get same results on all platforms.\r
2111// 2. Get access to current seed, to more easily reproduce failures.\r
2112//\r
2113//---------------------------------------------------------------------------------------\r
// Current state of the generator.  Exposed at file scope so that a failing
//   run can be reproduced by restoring the seed.
static uint32_t m_seed = 1;

// Advance the linear congruential generator one step and return the next
//   pseudo-random value, which is always in the range 0 .. 32767.
static uint32_t m_rand()
{
    const uint32_t multiplier = 1103515245;
    const uint32_t increment  = 12345;

    m_seed = m_seed * multiplier + increment;

    // Discard the low 16 bits of the state, then keep the next 15 bits.
    return (m_seed >> 16) & 0x7fff;
}
2121\r
2122\r
2123//------------------------------------------------------------------------------------------\r
2124//\r
2125// class RBBICharMonkey Character (Grapheme Cluster) specific implementation\r
2126// of RBBIMonkeyKind.\r
2127//\r
2128//------------------------------------------------------------------------------------------\r
2129class RBBICharMonkey: public RBBIMonkeyKind {\r
2130public:\r
2131 RBBICharMonkey();\r
2132 virtual ~RBBICharMonkey();\r
2133 virtual UVector *charClasses();\r
2134 virtual void setText(const UnicodeString &s);\r
2135 virtual int32_t next(int32_t i);\r
2136private:\r
2137 UVector *fSets;\r
2138\r
2139 UnicodeSet *fCRLFSet;\r
2140 UnicodeSet *fControlSet;\r
2141 UnicodeSet *fExtendSet;\r
2142 UnicodeSet *fHangulSet;\r
2143 UnicodeSet *fAnySet;\r
2144\r
2145 RegexMatcher *fMatcher;\r
2146 const UnicodeString *fText;\r
2147};\r
2148\r
2149\r
2150RBBICharMonkey::RBBICharMonkey() {\r
2151 UErrorCode status = U_ZERO_ERROR;\r
2152\r
2153 fText = NULL;\r
2154 fMatcher = new RegexMatcher("\\X", 0, status); // Pattern to match a grampheme cluster\r
2155\r
2156 fCRLFSet = new UnicodeSet("[\\r\\n]", status);\r
2157 fControlSet = new UnicodeSet("[[\\p{Zl}\\p{Zp}\\p{Cc}\\p{Cf}]-[\\n]-[\\r]-\\p{Grapheme_Extend}]", status);\r
2158 fExtendSet = new UnicodeSet("[\\p{Grapheme_Extend}]", status);\r
2159 fHangulSet = new UnicodeSet(\r
2160 "[\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=T}"\r
2161 "\\p{Hangul_Syllable_Type=LV}\\p{Hangul_Syllable_Type=LVT}]", status);\r
2162 fAnySet = new UnicodeSet("[\\u0000-\\U0010ffff]", status);\r
2163\r
2164 fSets = new UVector(status);\r
2165 fSets->addElement(fCRLFSet, status);\r
2166 fSets->addElement(fControlSet, status);\r
2167 fSets->addElement(fExtendSet, status);\r
2168 fSets->addElement(fHangulSet, status);\r
2169 fSets->addElement(fAnySet, status);\r
2170 if (U_FAILURE(status)) {\r
2171 deferredStatus = status;\r
2172 }\r
2173};\r
2174\r
2175\r
2176void RBBICharMonkey::setText(const UnicodeString &s) {\r
2177 fText = &s;\r
2178 fMatcher->reset(s);\r
2179}\r
2180\r
2181\r
2182int32_t RBBICharMonkey::next(int32_t i) {\r
2183 UErrorCode status = U_ZERO_ERROR;\r
2184 int32_t retVal = -1;\r
2185\r
2186 if (fMatcher->find(i, status)) {\r
2187 retVal = fMatcher->end(status);\r
2188 }\r
2189 if (U_FAILURE(status)){\r
2190 retVal = -1;\r
2191 }\r
2192 return retVal;\r
2193}\r
2194\r
2195\r
2196UVector *RBBICharMonkey::charClasses() {\r
2197 return fSets;\r
2198}\r
2199\r
2200\r
2201RBBICharMonkey::~RBBICharMonkey() {\r
2202 delete fSets;\r
2203 delete fCRLFSet;\r
2204 delete fControlSet;\r
2205 delete fExtendSet;\r
2206 delete fHangulSet;\r
2207 delete fAnySet;\r
2208\r
2209 delete fMatcher;\r
2210}\r
2211\r
2212//------------------------------------------------------------------------------------------\r
2213//\r
2214// class RBBIWordMonkey Word Break specific implementation\r
2215// of RBBIMonkeyKind.\r
2216//\r
2217//------------------------------------------------------------------------------------------\r
2218class RBBIWordMonkey: public RBBIMonkeyKind {\r
2219public:\r
2220 RBBIWordMonkey();\r
2221 virtual ~RBBIWordMonkey();\r
2222 virtual UVector *charClasses();\r
2223 virtual void setText(const UnicodeString &s);\r
2224 virtual int32_t next(int32_t i);\r
2225private:\r
2226 UVector *fSets;\r
2227\r
2228 UnicodeSet *fKatakanaSet;\r
2229 UnicodeSet *fALetterSet;\r
2230 UnicodeSet *fMidLetterSet;\r
2231 UnicodeSet *fMidNumSet;\r
2232 UnicodeSet *fNumericSet;\r
2233 UnicodeSet *fFormatSet;\r
2234 UnicodeSet *fOtherSet;\r
2235 UnicodeSet *fExtendSet;\r
2236 UnicodeSet *fExtendNumLetSet;\r
2237\r
2238 RegexMatcher *fMatcher;\r
2239\r
2240 const UnicodeString *fText;\r
2241\r
2242 RegexMatcher *fGCFMatcher;\r
2243 RegexMatcher *fGCMatcher;\r
2244\r
2245};\r
2246\r
2247\r
2248RBBIWordMonkey::RBBIWordMonkey() : fGCFMatcher(0),\r
2249 fGCMatcher(0)\r
2250{\r
2251 UErrorCode status = U_ZERO_ERROR;\r
2252\r
2253 fSets = new UVector(status);\r
2254\r
2255 fKatakanaSet = new UnicodeSet("[\\p{script=KATAKANA}"\r
2256 "\\u3031-\\u3035\\u309b\\u309c\\u30a0"\r
2257 "\\u30fc\\uff70\\uff9e\\uff9f]", status);\r
2258\r
2259 const UnicodeString ALetterStr( "[[\\p{Alphabetic}"\r
2260 "\\u00a0" // NBSP\r
2261 "\\u05f3]" // Hebrew punct Geresh\r
2262 "-[\\p{Ideographic}]"\r
2263 "-[\\p{Script=Lao}]"\r
2264 "-[\\p{Script=Hiragana}]"\r
2265 "-[\\p{Grapheme_Extend}]]");\r
2266 fALetterSet = new UnicodeSet(ALetterStr, status);\r
2267 fALetterSet->removeAll(*fKatakanaSet);\r
2268\r
2269 fMidLetterSet = new UnicodeSet("[\\u0027\\u00b7\\u05f4\\u2019\\u2027\\u003a]", status);\r
2270 fMidNumSet = new UnicodeSet("[[\\p{Line_Break=Infix_Numeric}]-[\\u003a]]", status);\r
2271 fNumericSet = new UnicodeSet("[\\p{Nd}\\u066b\\u066c]", status);\r
2272 fFormatSet = new UnicodeSet("[\\p{Format}-[\\u200c\\u200d]]", status);\r
2273 fExtendSet = new UnicodeSet("[\\p{Grapheme_Extend}]", status);\r
2274 fExtendNumLetSet = new UnicodeSet("[\\p{Pc}-[\\u30fb\\uff65]]", status);\r
2275 fOtherSet = new UnicodeSet();\r
2276 if(U_FAILURE(status)) {\r
2277 deferredStatus = status;\r
2278 return;\r
2279 }\r
2280\r
2281 fOtherSet->complement();\r
2282 fOtherSet->removeAll(*fKatakanaSet);\r
2283 fOtherSet->removeAll(*fALetterSet);\r
2284 fOtherSet->removeAll(*fMidLetterSet);\r
2285 fOtherSet->removeAll(*fMidNumSet);\r
2286 fOtherSet->removeAll(*fNumericSet);\r
2287 fOtherSet->removeAll(*fExtendNumLetSet);\r
2288\r
2289 fSets->addElement(fALetterSet, status);\r
2290 fSets->addElement(fKatakanaSet, status);\r
2291 fSets->addElement(fMidLetterSet, status);\r
2292 fSets->addElement(fMidNumSet, status);\r
2293 fSets->addElement(fNumericSet, status);\r
2294 fSets->addElement(fFormatSet, status);\r
2295 fSets->addElement(fOtherSet, status);\r
2296 fSets->addElement(fExtendNumLetSet, status);\r
2297\r
2298\r
2299 fGCFMatcher = new RegexMatcher("\\X(?:[\\p{Format}-\\p{Grapheme_Extend}])*", 0, status);\r
2300 fGCMatcher = new RegexMatcher("\\X", 0, status);\r
2301\r
2302 if (U_FAILURE(status)) {\r
2303 deferredStatus = status;\r
2304 }\r
2305};\r
2306\r
2307void RBBIWordMonkey::setText(const UnicodeString &s) {\r
2308 fText = &s;\r
2309 fGCMatcher->reset(*fText);\r
2310 fGCFMatcher->reset(*fText);\r
2311}\r
2312\r
2313\r
2314int32_t RBBIWordMonkey::next(int32_t prevPos) {\r
2315 UErrorCode status = U_ZERO_ERROR;\r
2316\r
2317 int p0, p1, p2, p3; // Indices of the significant code points around the \r
2318 // break position being tested. The candidate break\r
2319 // location is before p2.\r
2320\r
2321 int breakPos = -1;\r
2322\r
2323 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.\r
2324\r
2325 // Prev break at end of string. return DONE.\r
2326 if (prevPos >= fText->length()) {\r
2327 return -1;\r
2328 }\r
2329 p0 = p1 = p2 = p3 = prevPos;\r
2330 c3 = fText->char32At(prevPos);\r
2331 c0 = c1 = c2 = 0;\r
2332\r
2333\r
2334 // Format char after prev break? Special case, see last Note for Word Boundaries TR.\r
2335 // break immdiately after the format char.\r
2336 if (fFormatSet->contains(c3)) {\r
2337 breakPos = fText->moveIndex32(prevPos, 1);\r
2338 return breakPos;\r
2339 }\r
2340\r
2341\r
2342 // Loop runs once per "significant" character position in the input text.\r
2343 for (;;) {\r
2344 // Move all of the positions forward in the input string.\r
2345 p0 = p1; c0 = c1;\r
2346 p1 = p2; c1 = c2;\r
2347 p2 = p3; c2 = c3;\r
2348 // Advancd p3 by (GC Format*) Rules 3, 4\r
2349 status = U_ZERO_ERROR;\r
2350 if (fGCFMatcher->find(p3, status) == FALSE) {\r
2351 p3 = fText->length();\r
2352 c3 = 0;\r
2353 } else {\r
2354 p3 = fGCFMatcher->end(0, status);\r
2355 U_ASSERT(U_SUCCESS(status));\r
2356 c3 = fText->char32At(p3);\r
2357 }\r
2358 \r
2359 if (p1 == p2) {\r
2360 // Still warming up the loop. (won't work with zero length strings, but we don't care)\r
2361 continue;\r
2362 }\r
2363 if (p2 == fText->length()) {\r
2364 // Reached end of string. Always a break position.\r
2365 break;\r
2366 }\r
2367\r
2368 // Rule (5). ALetter x ALetter\r
2369 if (fALetterSet->contains(c1) &&\r
2370 fALetterSet->contains(c2)) {\r
2371 continue;\r
2372 }\r
2373\r
2374 // Rule (6) ALetter x (MidLetter | MidNumLet) ALetter\r
2375 //\r
2376 // Also incorporates rule 7 by skipping pos ahead to position of the\r
2377 // terminating ALetter.\r
2378 if ( fALetterSet->contains(c1) &&\r
2379 fMidLetterSet->contains(c2) &&\r
2380 fALetterSet->contains(c3)) {\r
2381 continue;\r
2382 }\r
2383\r
2384\r
2385 // Rule (7) ALetter (MidLetter | MidNumLet) x ALetter\r
2386 if (fALetterSet->contains(c0) &&\r
2387 (fMidLetterSet->contains(c1) ) &&\r
2388 fALetterSet->contains(c2)) {\r
2389 continue;\r
2390 }\r
2391\r
2392 // Rule (8) Numeric x Numeric\r
2393 if (fNumericSet->contains(c1) &&\r
2394 fNumericSet->contains(c2)) {\r
2395 continue;\r
2396 }\r
2397\r
2398 // Rule (9) ALetter x Numeric\r
2399 if (fALetterSet->contains(c1) &&\r
2400 fNumericSet->contains(c2)) {\r
2401 continue;\r
2402 }\r
2403\r
2404 // Rule (10) Numeric x ALetter\r
2405 if (fNumericSet->contains(c1) &&\r
2406 fALetterSet->contains(c2)) {\r
2407 continue;\r
2408 }\r
2409\r
2410 // Rule (11) Numeric (MidNum | MidNumLet) x Numeric\r
2411 if ( fNumericSet->contains(c0) &&\r
2412 fMidNumSet->contains(c1) && \r
2413 fNumericSet->contains(c2)) {\r
2414 continue;\r
2415 }\r
2416\r
2417 // Rule (12) Numeric x (MidNum | MidNumLet) Numeric\r
2418 if (fNumericSet->contains(c1) &&\r
2419 fMidNumSet->contains(c2) &&\r
2420 fNumericSet->contains(c3)) {\r
2421 continue;\r
2422 }\r
2423 \r
2424 // Rule (13) Katakana x Katakana\r
2425 if (fKatakanaSet->contains(c1) &&\r
2426 fKatakanaSet->contains(c2)) {\r
2427 continue;\r
2428 }\r
2429\r
2430 // Rule 13a\r
2431 if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||\r
2432 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&\r
2433 fExtendNumLetSet->contains(c2)) {\r
2434 continue;\r
2435 }\r
2436\r
2437 // Rule 13b\r
2438 if (fExtendNumLetSet->contains(c1) && \r
2439 (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||\r
2440 fKatakanaSet->contains(c2))) {\r
2441 continue;\r
2442 }\r
2443\r
2444\r
2445 // Rule 14. Break found here.\r
2446 break;\r
2447 }\r
2448\r
2449\r
2450 // Rule 4 fixup, back up before any trailing\r
2451 // format characters at the end of the word.\r
2452 breakPos = p2;\r
2453 status = U_ZERO_ERROR;\r
2454 if (fGCMatcher->find(p1, status)) {\r
2455 breakPos = fGCMatcher->end(0, status);\r
2456 U_ASSERT(U_SUCCESS(status));\r
2457 }\r
2458 return breakPos;\r
2459}\r
2460\r
2461\r
2462UVector *RBBIWordMonkey::charClasses() {\r
2463 return fSets;\r
2464}\r
2465\r
2466\r
2467RBBIWordMonkey::~RBBIWordMonkey() {\r
2468 delete fSets;\r
2469 delete fKatakanaSet;\r
2470 delete fALetterSet;\r
2471 delete fMidLetterSet;\r
2472 delete fMidNumSet;\r
2473 delete fNumericSet;\r
2474 delete fFormatSet;\r
2475 delete fExtendSet;\r
2476 delete fOtherSet;\r
2477\r
2478 delete fGCFMatcher;\r
2479 delete fGCMatcher;\r
2480}\r
2481\r
2482\r
2483\r
2484\r
2485//-------------------------------------------------------------------------------------------\r
2486//\r
2487// RBBILineMonkey\r
2488//\r
2489//-------------------------------------------------------------------------------------------\r
2490\r
2491class RBBILineMonkey: public RBBIMonkeyKind {\r
2492public:\r
2493 RBBILineMonkey();\r
2494 virtual ~RBBILineMonkey();\r
2495 virtual UVector *charClasses();\r
2496 virtual void setText(const UnicodeString &s);\r
2497 virtual int32_t next(int32_t i);\r
2498 virtual void rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);\r
2499private:\r
2500 UVector *fSets;\r
2501\r
2502 UnicodeSet *fBK;\r
2503 UnicodeSet *fCR;\r
2504 UnicodeSet *fLF;\r
2505 UnicodeSet *fCM;\r
2506 UnicodeSet *fNL;\r
2507 UnicodeSet *fSG;\r
2508 UnicodeSet *fWJ;\r
2509 UnicodeSet *fZW;\r
2510 UnicodeSet *fGL;\r
2511 UnicodeSet *fCB;\r
2512 UnicodeSet *fSP;\r
2513 UnicodeSet *fB2;\r
2514 UnicodeSet *fBA;\r
2515 UnicodeSet *fBB;\r
2516 UnicodeSet *fHY;\r
2517 UnicodeSet *fCL;\r
2518 UnicodeSet *fEX;\r
2519 UnicodeSet *fIN;\r
2520 UnicodeSet *fNS;\r
2521 UnicodeSet *fOP;\r
2522 UnicodeSet *fQU;\r
2523 UnicodeSet *fIS;\r
2524 UnicodeSet *fNU;\r
2525 UnicodeSet *fPO;\r
2526 UnicodeSet *fPR;\r
2527 UnicodeSet *fSY;\r
2528 UnicodeSet *fAI;\r
2529 UnicodeSet *fAL;\r
2530 UnicodeSet *fID;\r
2531 UnicodeSet *fSA;\r
2532 UnicodeSet *fXX;\r
2533\r
2534 BreakIterator *fCharBI;\r
2535\r
2536 const UnicodeString *fText;\r
2537 int32_t *fOrigPositions;\r
2538\r
2539 RegexMatcher *fNumberMatcher;\r
2540 RegexMatcher *fLB10Matcher;\r
2541 RegexMatcher *fLB11Matcher;\r
2542};\r
2543\r
2544\r
2545RBBILineMonkey::RBBILineMonkey() \r
2546{\r
2547 UErrorCode status = U_ZERO_ERROR;\r
2548\r
2549 fSets = new UVector(status);\r
2550\r
2551 fBK = new UnicodeSet("[\\p{Line_Break=BK}]", status);\r
2552 fCR = new UnicodeSet("[\\p{Line_break=CR}]", status);\r
2553 fLF = new UnicodeSet("[\\p{Line_break=LF}]", status);\r
2554 fCM = new UnicodeSet("[\\p{Line_break=CM}]", status);\r
2555 fNL = new UnicodeSet("[\\p{Line_break=NL}]", status);\r
2556 fWJ = new UnicodeSet("[\\p{Line_break=WJ}]", status);\r
2557 fZW = new UnicodeSet("[\\p{Line_break=ZW}]", status);\r
2558 fGL = new UnicodeSet("[\\p{Line_break=GL}]", status);\r
2559 fCB = new UnicodeSet("[\\p{Line_break=CB}]", status);\r
2560 fSP = new UnicodeSet("[\\p{Line_break=SP}]", status);\r
2561 fB2 = new UnicodeSet("[\\p{Line_break=B2}]", status);\r
2562 fBA = new UnicodeSet("[\\p{Line_break=BA}]", status);\r
2563 fBB = new UnicodeSet("[\\p{Line_break=BB}]", status);\r
2564 fHY = new UnicodeSet("[\\p{Line_break=HY}]", status);\r
2565 fCL = new UnicodeSet("[\\p{Line_break=CL}]", status);\r
2566 fEX = new UnicodeSet("[\\p{Line_break=EX}]", status);\r
2567 fIN = new UnicodeSet("[\\p{Line_break=IN}]", status);\r
2568 fNS = new UnicodeSet("[\\p{Line_break=NS}]", status);\r
2569 fOP = new UnicodeSet("[\\p{Line_break=OP}]", status);\r
2570 fQU = new UnicodeSet("[\\p{Line_break=QU}]", status);\r
2571 fIS = new UnicodeSet("[\\p{Line_break=IS}]", status);\r
2572 fNU = new UnicodeSet("[\\p{Line_break=NU}]", status);\r
2573 fPO = new UnicodeSet("[\\p{Line_break=PO}]", status);\r
2574 fPR = new UnicodeSet("[\\p{Line_break=PR}]", status);\r
2575 fSY = new UnicodeSet("[\\p{Line_break=SY}]", status);\r
2576 fAI = new UnicodeSet("[\\p{Line_break=AI}]", status);\r
2577 fAL = new UnicodeSet("[\\p{Line_break=AL}]", status);\r
2578 fID = new UnicodeSet("[\\p{Line_break=ID}]", status);\r
2579 fSA = new UnicodeSet("[\\p{Line_break=SA}]", status);\r
2580 fXX = new UnicodeSet("[\\p{Line_break=XX}]", status);\r
2581\r
2582 fAL->addAll(*fXX); // Default behavior for XX is identical to AL\r
2583 fAL->addAll(*fAI); // Default behavior for AI is identical to AL\r
2584 fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to AL\r
2585\r
2586\r
2587\r
2588 fSets->addElement(fBK, status);\r
2589 fSets->addElement(fCR, status);\r
2590 fSets->addElement(fLF, status);\r
2591 fSets->addElement(fCM, status);\r
2592 fSets->addElement(fNL, status);\r
2593 fSets->addElement(fWJ, status);\r
2594 fSets->addElement(fZW, status);\r
2595 fSets->addElement(fGL, status);\r
2596 fSets->addElement(fCB, status);\r
2597 fSets->addElement(fSP, status);\r
2598 fSets->addElement(fB2, status);\r
2599 fSets->addElement(fBA, status);\r
2600 fSets->addElement(fBB, status);\r
2601 fSets->addElement(fHY, status);\r
2602 fSets->addElement(fCL, status);\r
2603 fSets->addElement(fEX, status);\r
2604 fSets->addElement(fIN, status);\r
2605 fSets->addElement(fNS, status);\r
2606 fSets->addElement(fOP, status);\r
2607 fSets->addElement(fQU, status);\r
2608 fSets->addElement(fIS, status);\r
2609 fSets->addElement(fNU, status);\r
2610 fSets->addElement(fPO, status);\r
2611 fSets->addElement(fPR, status);\r
2612 fSets->addElement(fSY, status);\r
2613 fSets->addElement(fAI, status);\r
2614 fSets->addElement(fAL, status);\r
2615 fSets->addElement(fID, status);\r
2616 fSets->addElement(fWJ, status);\r
2617 fSets->addElement(fSA, status);\r
2618 // fSets->addElement(fXX, status);\r
2619\r
2620\r
2621\r
2622 fNumberMatcher = new RegexMatcher(\r
2623 "(\\p{Line_Break=PR}\\p{Line_Break=CM}*)?"\r
2624 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"\r
2625 "\\p{Line_Break=NU}\\p{Line_Break=CM}*"\r
2626 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"\r
2627 "(\\p{Line_Break=CL}\\p{Line_Break=CM}*)?"\r
2628 "(\\p{Line_Break=PO}\\p{Line_Break=CM}*)?", \r
2629 0, status);\r
2630\r
2631 fLB10Matcher = new RegexMatcher(\r
2632 "\\p{Line_Break=QU}\\p{Line_Break=CM}*"\r
2633 "\\p{Line_Break=SP}*"\r
2634 "(\\p{Line_Break=OP})\\p{Line_Break=CM}*", \r
2635 0, status);\r
2636\r
2637 fLB11Matcher = new RegexMatcher(\r
2638 "\\p{Line_Break=CL}\\p{Line_Break=CM}*"\r
2639 "\\p{Line_Break=SP}*"\r
2640 "(\\p{Line_Break=NS})\\p{Line_Break=CM}*", \r
2641 0, status);\r
2642\r
2643 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);\r
2644\r
2645 if (U_FAILURE(status)) {\r
2646 deferredStatus = status;\r
2647 }\r
2648};\r
2649\r
2650\r
2651void RBBILineMonkey::setText(const UnicodeString &s) {\r
2652 fText = &s;\r
2653 fCharBI->setText(s);\r
2654 fNumberMatcher->reset(s);\r
2655}\r
2656\r
2657//\r
2658// rule67Adjust\r
2659// Line Break TR rules 6 and 7 implementation.\r
//         This deals with combining marks, Hangul Syllables, and other sequences
//         that must be treated as if they were something other than what they actually are.
2662//\r
2663// This is factored out into a separate function because it must be applied twice for\r
2664// each potential break, once to the chars before the position being checked, then\r
2665// again to the text following the possible break.\r
2666//\r
2667void RBBILineMonkey::rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {\r
2668 if (pos == -1) {\r
2669 // Invalid initial position. Happens during the warmup iteration of the \r
2670 // main loop in next().\r
2671 return;\r
2672 }\r
2673\r
2674 int32_t nPos = *nextPos;\r
2675 \r
2676 // LB 6 Treat Korean Syllables as a single unit\r
2677 int32_t hangultype = u_getIntPropertyValue(*posChar, UCHAR_HANGUL_SYLLABLE_TYPE);\r
2678 if (hangultype != U_HST_NOT_APPLICABLE) {\r
2679 nPos = fCharBI->following(pos); // Advance by grapheme cluster, which\r
2680 // contains the logic to locate Hangul syllables.\r
2681 // Grapheme Cluster Ugliness: some Grapheme_Extend chars, which are absorbed\r
2682 // into a grapheme cluster, are NOT Line Break CM. (Some are GL, for example.)\r
2683 // We don't want consume any of these. The Approach is\r
2684 // 1. Back nPos up, undoing the consumption of any\r
2685 // Grapheme_Extend chars by the char break iterator.\r
2686 // 2. Let the LB 7b logic below reconsume any Line Break CM chars.\r
2687 for (;;) {\r
2688 nPos = fText->moveIndex32(nPos, -1);\r
2689 UChar32 possiblyExtendChar = fText->char32At(nPos);\r
2690 if (fID->contains(possiblyExtendChar)) {\r
2691 // We hit into the Hangul Syllable itself, class is ID.\r
2692 nPos = fText->moveIndex32(nPos, +1);\r
2693 break;\r
2694 }\r
2695 }\r
2696 }\r
2697 \r
2698 // LB 7b Keep combining sequences together.\r
2699 // advance over any CM class chars. (Line Break CM class is different from\r
2700 // grapheme cluster CM, so we need to do this even for HangulSyllables.\r
2701 // Line Break may eat additional stuff as combining, beyond what graphem cluster did.\r
2702 if (!(fBK->contains(*posChar) || fZW->contains(*posChar) || *posChar==0x0a \r
2703 || *posChar==0x0d || *posChar==0x85)) {\r
2704 for (;;) {\r
2705 *nextChar = fText->char32At(nPos);\r
2706 if (!fCM->contains(*nextChar)) {\r
2707 break;\r
2708 }\r
2709 nPos = fText->moveIndex32(nPos, 1);\r
2710 }\r
2711 }\r
2712 \r
2713 \r
2714 // LB 7a In a SP CM* sequence, treat the SP as an ID\r
2715 if (nPos != *nextPos && fSP->contains(*posChar)) {\r
2716 *posChar = 0x4e00; // 0x4e00 is a CJK Ideograph, linebreak type is ID.\r
2717 }\r
2718 \r
2719 // LB 7b Treat X CM* as if it were x.\r
2720 // No explicit action required. \r
2721 \r
2722 // LB 7c Treat any remaining combining mark as AL\r
2723 if (fCM->contains(*posChar)) {\r
2724 *posChar = 0x41; // thisChar = 'A';\r
2725 }\r
2726\r
2727 // Push the updated nextPos and nextChar back to our caller.\r
2728 // This only makes a difference if posChar got bigger, by slurping up a\r
2729 // combining sequence or Hangul syllable.\r
2730 *nextPos = nPos;\r
2731 *nextChar = fText->char32At(nPos);\r
2732}\r
2733\r
2734\r
2735\r
//
//  next    Find the next line break, scanning forward from startPos, by applying the
//          hand-coded sequence of line break rules below to the text given to setText().
//
//          startPos   Index in the text at which scanning starts.
//          Returns    -1 if startPos is at or beyond the end of the text.  Otherwise the
//                     index of the character following the break that was found; this is
//                     the text length when the scan reaches the end of the text.
//
//          Each pass of the main loop looks at the boundary between prevChar and thisChar.
//          Within the loop, "continue" means "no break here, move on to the next position"
//          and "break" means "a break was found in front of pos".
//
2736int32_t RBBILineMonkey::next(int32_t startPos) {\r
2737 UErrorCode status = U_ZERO_ERROR;\r
2738 int32_t pos; // Index of the char following a potential break position\r
2739 UChar32 thisChar; // Character at above position "pos"\r
2740\r
2741 int32_t prevPos; // Index of the char preceding a potential break position\r
2742 UChar32 prevChar; // Character at above position. Note that prevChar\r
2743 // and thisChar may not be adjacent because combining\r
2744 // characters between them will be ignored.\r
2745\r
2746 int32_t nextPos; // Index of the next character following pos.\r
2747 // Usually skips over combining marks.\r
2748 int32_t nextCPPos; // Index of the code point following "pos."\r
2749 // May point to a combining mark.\r
2750 int32_t tPos; // temp value.\r
2751 UChar32 c;\r
2752\r
2753 if (startPos >= fText->length()) {\r
2754 return -1;\r
2755 }\r
2756\r
2757\r
2758 // Initial values for loop. Loop will run the first time without finding breaks,\r
2759 // while the invalid values shift out and the "this" and\r
2760 // "prev" positions are filled in with good values.\r
2761 pos = prevPos = -1; // Invalid value, serves as flag for initial loop iteration.\r
2762 thisChar = prevChar = 0;\r
2763 nextPos = nextCPPos = startPos;\r
2764\r
2765\r
2766 // Loop runs once per position in the test text, until a break position\r
2767 // is found.\r
2768 for (;;) {\r
2769 prevPos = pos;\r
2770 prevChar = thisChar;\r
2771\r
2772 pos = nextPos;\r
2773 thisChar = fText->char32At(pos);\r
2774\r
2775 nextCPPos = fText->moveIndex32(pos, 1);\r
2776 nextPos = nextCPPos;\r
2777\r
2778 // Break at end of text.\r
2779 if (pos >= fText->length()) {\r
2780 break;\r
2781 }\r
2782\r
2783 // LB 3a Always break after hard line breaks,\r
2784 if (fBK->contains(prevChar)) {\r
2785 break;\r
2786 }\r
2787\r
2788 // LB 3b Break after CR, LF, NL, but not inside CR LF\r
2789 if (prevChar == 0x0d && thisChar == 0x0a) {\r
2790 continue;\r
2791 }\r
2792 if (prevChar == 0x0d ||\r
2793 prevChar == 0x0a ||\r
2794 prevChar == 0x85) {\r
2795 break;\r
2796 }\r
2797\r
2798 // LB 3c Don't break before hard line breaks\r
2799 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||\r
2800 fBK->contains(thisChar)) {\r
2801 continue;\r
2802 }\r
2803\r
2804 // LB 10 QU SP* x OP\r
// The matcher is run against the tail of the text starting at prevPos, so the
// match offsets it reports are relative to prevPos.
2805 if (prevPos >= 0) {\r
2806 UnicodeString subStr10(*fText, prevPos);\r
2807 fLB10Matcher->reset(subStr10);\r
2808 status = U_ZERO_ERROR;\r
2809 if (fLB10Matcher->lookingAt(status)) { // /QU CM* SP* (OP) CM*/;\r
2810 // TODO: Check status codes\r
2811 pos = prevPos + fLB10Matcher->start(1, status);\r
2812 nextPos = prevPos + fLB10Matcher->end(0, status);\r
2813 thisChar = fText->char32At(pos);\r
2814 continue;\r
2815 }\r
2816 }\r
2817\r
2818 // LB 11 CL SP* x NS\r
2819 if (prevPos >= 0) {\r
2820 UnicodeString subStr11(*fText, prevPos);\r
2821 fLB11Matcher->reset(subStr11);\r
2822 status = U_ZERO_ERROR;\r
2823 if (fLB11Matcher->lookingAt(status)) { // presumably /CL CM* SP* (NS) CM*/ - the pattern is compiled outside this function, TODO confirm\r
2824 // TODO: Check status codes\r
2825 pos = prevPos + fLB11Matcher->start(1, status);\r
2826 nextPos = prevPos + fLB11Matcher->end(0, status);\r
2827 thisChar = fText->char32At(pos);\r
2828 continue;\r
2829 }\r
2830 }\r
2831\r
2832 // LB 4 Don't break before spaces or zero-width space.\r
2833 if (fSP->contains(thisChar)) {\r
2834 continue;\r
2835 }\r
2836\r
2837 if (fZW->contains(thisChar)) {\r
2838 continue;\r
2839 }\r
2840\r
2841 // LB 5 Break after zero width space\r
2842 if (fZW->contains(prevChar)) {\r
2843 break;\r
2844 }\r
2845\r
2846 // LB 6, LB 7\r
// First application: adjust prevChar, which may move pos and replace thisChar.
2847 /*int32_t oldpos = pos;*/\r
2848 rule67Adjust(prevPos, &prevChar, &pos, &thisChar);\r
2849 \r
2850 nextCPPos = fText->moveIndex32(pos, 1);\r
2851 nextPos = nextCPPos;\r
2852 c = fText->char32At(nextPos);\r
2853 // another peculiarity of LB 4 - Don't break before space\r
2854 if (fSP->contains(thisChar)) {\r
2855 continue;\r
2856 }\r
// Second application: adjust thisChar, which may move nextPos.
2857 rule67Adjust(pos, &thisChar, &nextPos, &c);\r
2858\r
2859 // If the loop is still warming up - if we haven't shifted the initial\r
2860 // -1 positions out of prevPos yet - loop back to advance the\r
2861 // position in the input without any further looking for breaks.\r
2862 if (prevPos == -1) {\r
2863 continue;\r
2864 }\r
2865\r
2866 // Re-apply rules 3c, 4 because these could be affected by having\r
2867 // a new thisChar from doing rule 6 or 7.\r
2868 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 || // 3c\r
2869 fBK->contains(thisChar)) {\r
2870 continue;\r
2871 }\r
2872 if (fSP->contains(thisChar)) { // LB 4\r
2873 continue;\r
2874 }\r
2875 if (fZW->contains(thisChar)) { // LB 4\r
2876 continue;\r
2877 }\r
2878\r
2879\r
2880 // LB 8 Don't break before closings.\r
2881 // NU x CL and NU x IS are not matched here so that they will\r
2882 // fall into LB 17 and the more general number regular expression.\r
2883 //\r
2884 if (!fNU->contains(prevChar) && fCL->contains(thisChar) ||\r
2885 fEX->contains(thisChar) ||\r
2886 !fNU->contains(prevChar) && fIS->contains(thisChar) ||\r
2887 !fNU->contains(prevChar) && fSY->contains(thisChar)) {\r
2888 continue;\r
2889 }\r
2890\r
2891 // LB 9 Don't break after OP SP*\r
2892 // Scan backwards, checking for this sequence.\r
2893 // The OP char could include combining marks, so we actually check for\r
2894 // OP CM* SP*\r
2895 // Another Twist: The Rule 67 fixes may have changed a SP CM\r
2896 // sequence into an ID char, so before scanning back through spaces,\r
2897 // verify that prevChar is indeed a space. The prevChar variable\r
2898 // may differ from fText[prevPos]\r
2899 tPos = prevPos;\r
2900 if (fSP->contains(prevChar)) {\r
2901 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {\r
2902 tPos=fText->moveIndex32(tPos, -1);\r
2903 }\r
2904 }\r
2905 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {\r
2906 tPos=fText->moveIndex32(tPos, -1);\r
2907 }\r
2908 if (fOP->contains(fText->char32At(tPos))) {\r
2909 continue;\r
2910 }\r
2911\r
2912\r
2913 // LB 11a B2 x B2\r
2914 if (fB2->contains(thisChar) && fB2->contains(prevChar)) {\r
2915 continue;\r
2916 }\r
2917\r
2918 // LB 11b \r
2919 // x GL\r
2920 // GL x\r
2921 if (fGL->contains(thisChar) || fGL->contains(prevChar)) {\r
2922 continue;\r
2923 }\r
// The same "no break on either side" treatment is applied to the WJ set.
2924 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {\r
2925 continue;\r
2926 }\r
2927\r
2928 // LB 12 break after space\r
2929 if (fSP->contains(prevChar)) {\r
2930 break;\r
2931 }\r
2932\r
2933 // LB 14\r
2934 // x QU\r
2935 // QU x\r
2936 if (fQU->contains(thisChar) || fQU->contains(prevChar)) {\r
2937 continue;\r
2938 }\r
2939\r
2940 // LB 14a Break around a CB\r
2941 if (fCB->contains(thisChar) || fCB->contains(prevChar)) {\r
2942 break;\r
2943 }\r
2944\r
2945 // LB 15 \r
// No break before BA, HY or NS, and no break after BB.
2946 if (fBA->contains(thisChar) ||\r
2947 fHY->contains(thisChar) ||\r
2948 fNS->contains(thisChar) ||\r
2949 fBB->contains(prevChar) ) {\r
2950 continue;\r
2951 }\r
2952\r
2953 // LB 16\r
// No break before IN when it follows AL, ID, IN or NU.
2954 if (fAL->contains(prevChar) && fIN->contains(thisChar) ||\r
2955 fID->contains(prevChar) && fIN->contains(thisChar) ||\r
2956 fIN->contains(prevChar) && fIN->contains(thisChar) ||\r
2957 fNU->contains(prevChar) && fIN->contains(thisChar) ) {\r
2958 continue; \r
2959 }\r
2960\r
2961\r
2962 // LB 17 ID x PO (Note: Leading CM behaves like ID)\r
2963 // AL x NU\r
2964 // NU x AL\r
2965 if (fID->contains(prevChar) && fPO->contains(thisChar) ||\r
2966 fCM->contains(prevChar) && fPO->contains(thisChar) || \r
2967 fAL->contains(prevChar) && fNU->contains(thisChar) ||\r
2968 fNU->contains(prevChar) && fAL->contains(thisChar) ) {\r
2969 continue; \r
2970 }\r
2971\r
2972 // LB 18 Numbers\r
// NOTE(review): status is not reset before this lookingAt(); it still holds whatever
// the LB 11 block above left in it.
2973 UnicodeString subStr18(*fText, prevPos);\r
2974 fNumberMatcher->reset(subStr18);\r
2975 if (fNumberMatcher->lookingAt(status)) {\r
2976 // TODO: Check status codes\r
2977 // Matched a number. But could have been just a single digit, which would\r
2978 // not represent a "no break here" between prevChar and thisChar\r
2979 int32_t numEndIdx = prevPos + fNumberMatcher->end(status); // idx of first char following num\r
2980 if (numEndIdx > pos) {\r
2981 // Number match includes at least our two chars being checked\r
2982 if (numEndIdx > nextPos) {\r
2983 // Number match includes additional chars. Update pos and nextPos\r
2984 // so that next loop iteration will continue at the end of the number,\r
2985 // checking for breaks between last char in number & whatever follows.\r
2986 nextPos = numEndIdx;\r
2987 pos = fCharBI->preceding(numEndIdx); \r
2988 thisChar = fText->char32At(pos);\r
2989 while (fCM->contains(thisChar)) {\r
2990 pos = fCharBI->preceding(pos);\r
2991 thisChar = fText->char32At(pos);\r
2992 }\r
2993 }\r
2994 continue;\r
2995 }\r
2996 }\r
2997\r
// PR x AL
2998 if (fPR->contains(prevChar) && fAL->contains(thisChar)) {\r
2999 continue;\r
3000 }\r
3001\r
// PR x ID
3002 if (fPR->contains(prevChar) && fID->contains(thisChar)) {\r
3003 continue;\r
3004 }\r
3005\r
3006 // LB 18b\r
// Break after HY and before BB.
3007 if (fHY->contains(prevChar) || fBB->contains(thisChar)) {\r
3008 break;\r
3009 }\r
3010\r
3011 // LB 19\r
// AL x AL
3012 if (fAL->contains(prevChar) && fAL->contains(thisChar)) {\r
3013 continue;\r
3014 }\r
3015\r
3016 // LB 19b\r
// IS x AL
3017 if (fIS->contains(prevChar) && fAL->contains(thisChar)) {\r
3018 continue;\r
3019 }\r
3020\r
3021 // LB 20 Break everywhere else\r
3022 break;\r
3023 \r
3024 }\r
3025 \r
3026 return pos;\r
3027}\r
3028\r
3029\r
3030UVector *RBBILineMonkey::charClasses() {\r
3031 return fSets;\r
3032}\r
3033\r
3034\r
3035RBBILineMonkey::~RBBILineMonkey() {\r
3036 delete fSets;\r
3037\r
3038 delete fBK;\r
3039 delete fCR;\r
3040 delete fLF;\r
3041 delete fCM;\r
3042 delete fNL;\r
3043 delete fWJ;\r
3044 delete fZW;\r
3045 delete fGL;\r
3046 delete fCB;\r
3047 delete fSP;\r
3048 delete fB2;\r
3049 delete fBA;\r
3050 delete fBB;\r
3051 delete fHY;\r
3052 delete fCL;\r
3053 delete fEX;\r
3054 delete fIN;\r
3055 delete fNS;\r
3056 delete fOP;\r
3057 delete fQU;\r
3058 delete fIS;\r
3059 delete fNU;\r
3060 delete fPO;\r
3061 delete fPR;\r
3062 delete fSY;\r
3063 delete fAI;\r
3064 delete fAL;\r
3065 delete fID;\r
3066 delete fSA;\r
3067 delete fXX;\r
3068\r
3069 delete fCharBI;\r
3070 delete fNumberMatcher;\r
3071 delete fLB10Matcher;\r
3072 delete fLB11Matcher;\r
3073}\r
3074\r
3075\r
3076//-------------------------------------------------------------------------------------------\r
3077//\r
3078// TestMonkey\r
3079//\r
3080// params\r
3081// seed=nnnnn Random number starting seed.\r
3082// Setting the seed allows errors to be reproduced.\r
3083// loop=nnn Looping count. Controls running time.\r
3084// -1: run forever.\r
3085// 0 or greater: run length.\r
3086//\r
3087// type = char | word | line | sent | title\r
3088//\r
3089//-------------------------------------------------------------------------------------------\r
3090\r
3091static int32_t getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {\r
3092 int32_t val = defaultVal;\r
3093 name.append(" *= *(-?\\d+)");\r
3094 UErrorCode status = U_ZERO_ERROR;\r
3095 RegexMatcher m(name, params, 0, status);\r
3096 if (m.find()) {\r
3097 // The param exists. Convert the string to an int.\r
3098 char valString[100];\r
3099 int32_t paramLength = m.end(1, status) - m.start(1, status);\r
3100 if (paramLength >= (int32_t)(sizeof(valString)-1)) {\r
3101 paramLength = (int32_t)(sizeof(valString)-2);\r
3102 }\r
3103 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));\r
3104 val = strtol(valString, NULL, 10);\r
3105\r
3106 // Delete this parameter from the params string.\r
3107 m.reset();\r
3108 params = m.replaceFirst("", status);\r
3109 }\r
3110 U_ASSERT(U_SUCCESS(status));\r
3111 return val;\r
3112}\r
3113#endif\r
3114\r
3115static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr, \r
3116 BreakIterator *bi,\r
3117 int expected[], \r
3118 int expectedcount)\r
3119{\r
3120 int count = 0;\r
3121 int i = 0;\r
3122 int forward[50];\r
3123 bi->setText(ustr);\r
3124 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {\r
3125 forward[count] = i;\r
3126 if (count < expectedcount && expected[count] != i) {\r
3127 test->errln("break forward test failed: expected %d but got %d", \r
3128 expected[count], i);\r
3129 break;\r
3130 }\r
3131 count ++;\r
3132 }\r
3133 if (count != expectedcount) {\r
3134 printStringBreaks(ustr, expected, expectedcount);\r
3135 test->errln("break test failed: missed %d match", \r
3136 expectedcount - count);\r
3137 return;\r
3138 }\r
3139 // testing boundaries\r
3140 for (i = 1; i < expectedcount; i ++) {\r
3141 int j = expected[i - 1];\r
3142 if (!bi->isBoundary(j)) {\r
3143 printStringBreaks(ustr, expected, expectedcount);\r
3144 test->errln("Expected boundary at position %d", j);\r
3145 return;\r
3146 }\r
3147 for (j = expected[i - 1] + 1; j < expected[i]; j ++) {\r
3148 if (bi->isBoundary(j)) {\r
3149 printStringBreaks(ustr, expected, expectedcount);\r
3150 test->errln("Not expecting boundary at position %d", j);\r
3151 return;\r
3152 }\r
3153 }\r
3154 }\r
3155\r
3156 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {\r
3157 count --;\r
3158 if (forward[count] != i) {\r
3159 test->errln("happy break test reverse failed: expected %d but got %d", \r
3160 forward[count], i);\r
3161 break;\r
3162 }\r
3163 }\r
3164 if (count != 0) {\r
3165 printStringBreaks(ustr, expected, expectedcount);\r
3166 test->errln("happy break test failed: missed a match");\r
3167 return;\r
3168 }\r
3169\r
3170 // testing preceding\r
3171 for (i = 0; i < expectedcount - 1; i ++) {\r
3172 int j = expected[i] + 1;\r
3173 for (; j <= expected[i + 1]; j ++) {\r
3174 if (bi->preceding(j) != expected[i]) {\r
3175 printStringBreaks(ustr, expected, expectedcount);\r
3176 test->errln("Not expecting backwards boundary at position %d", j);\r
3177 return;\r
3178 }\r
3179 }\r
3180 } \r
3181}\r
3182\r
3183void RBBITest::TestWordBreaks(void)\r
3184{\r
3185#if !UCONFIG_NO_REGULAR_EXPRESSIONS\r
3186\r
3187 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>\r
3188 Locale locale("en");\r
3189 UErrorCode status = U_ZERO_ERROR;\r
3190 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);\r
3191 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);\r
3192 UChar str[300]; \r
3193 static const char *strlist[] = \r
3194 {\r
3195 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",\r
3196 "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",\r
3197 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u179c\\u0027\\U000e0061\\u003a",\r
3198 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",\r
3199 "\\u90ca\\u3588\\u009c\\u0953\\u194b",\r
3200 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",\r
3201 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",\r
3202 "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",\r
3203 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",\r
3204 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",\r
3205 "\\u2027\\U000e0067\\u0a47\\u00b7",\r
3206 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",\r
3207 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",\r
3208 "\\u0589\\U000e006e\\u0a42\\U000104a5",\r
3209 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",\r
3210 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",\r
3211 "\\u0027\\u11af\\U000e0057\\u0602",\r
3212 "\\U0001d7f2\\U000e007\\u0004\\u0589",\r
3213 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",\r
3214 "\\U0001d7f2\\U000e007d\\u0004\\u0589",\r
3215 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",\r
3216 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",\r
3217 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",\r
3218 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",\r
3219 "\\u0233\\U000e0020\\u0a69\\u0d6a",\r
3220 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",\r
3221 "\\u58f4\\U000e0049\\u20e7\\u2027",\r
3222 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",\r
3223 "\\ua183\\u102d\\u0bec\\u003a",\r
3224 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",\r
3225 "\\u003a\\u0e57\\u0fad\\u002e",\r
3226 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",\r
3227 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",\r
3228 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",\r
3229 "\\u003a\\u0664\\u00b7\\u1fba",\r
3230 "\\u003b\\u0027\\u00b7\\u47a3",\r
3231 "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",\r
3232 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",\r
3233 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",\r
3234 };\r
3235 int loop;\r
3236 if (U_FAILURE(status)) {\r
3237 errln("Creation of break iterator failed %s", u_errorName(status));\r
3238 return;\r
3239 }\r
3240 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {\r
3241 // printf("looping %d\n", loop);\r
3242 u_unescape(strlist[loop], str, 25);\r
3243 UnicodeString ustr(str);\r
3244 // RBBICharMonkey monkey;\r
3245 RBBIWordMonkey monkey;\r
3246\r
3247 int expected[50];\r
3248 int expectedcount = 0;\r
3249\r
3250 monkey.setText(ustr);\r
3251 int i;\r
3252 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {\r
3253 expected[expectedcount ++] = i;\r
3254 }\r
3255\r
3256 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);\r
3257 }\r
3258 delete bi;\r
3259#endif\r
3260}\r
3261\r
3262void RBBITest::TestWordBoundary(void)\r
3263{\r
3264 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>\r
3265 Locale locale("en");\r
3266 UErrorCode status = U_ZERO_ERROR;\r
3267 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);\r
3268 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);\r
3269 UChar str[50]; \r
3270 static const char *strlist[] = \r
3271 {\r
3272 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",\r
3273 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",\r
3274 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",\r
3275 "\\u2027\\U000e0067\\u0a47\\u00b7",\r
3276 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",\r
3277 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",\r
3278 "\\u0589\\U000e006e\\u0a42\\U000104a5",\r
3279 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",\r
3280 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",\r
3281 "\\u0027\\u11af\\U000e0057\\u0602",\r
3282 "\\U0001d7f2\\U000e007\\u0004\\u0589",\r
3283 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",\r
3284 "\\U0001d7f2\\U000e007d\\u0004\\u0589",\r
3285 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",\r
3286 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",\r
3287 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",\r
3288 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",\r
3289 "\\u0233\\U000e0020\\u0a69\\u0d6a",\r
3290 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",\r
3291 "\\u58f4\\U000e0049\\u20e7\\u2027",\r
3292 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",\r
3293 "\\ua183\\u102d\\u0bec\\u003a",\r
3294 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",\r
3295 "\\u003a\\u0e57\\u0fad\\u002e",\r
3296 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",\r
3297 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",\r
3298 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",\r
3299 "\\u003a\\u0664\\u00b7\\u1fba",\r
3300 "\\u003b\\u0027\\u00b7\\u47a3",\r
3301 };\r
3302 int loop;\r
3303 if (U_FAILURE(status)) {\r
3304 errln("Creation of break iterator failed %s", u_errorName(status));\r
3305 return;\r
3306 }\r
3307 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {\r
3308 // printf("looping %d\n", loop);\r
3309 u_unescape(strlist[loop], str, 20);\r
3310 UnicodeString ustr(str);\r
3311 int forward[50];\r
3312 int count = 0;\r
3313 \r
3314 bi->setText(ustr);\r
3315 int prev = 0;\r
3316 int i;\r
3317 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {\r
3318 forward[count ++] = i;\r
3319 if (i > prev) {\r
3320 int j;\r
3321 for (j = prev + 1; j < i; j ++) {\r
3322 if (bi->isBoundary(j)) {\r
3323 printStringBreaks(ustr, forward, count);\r
3324 errln("happy boundary test failed: expected %d not a boundary", \r
3325 j);\r
3326 return;\r
3327 }\r
3328 }\r
3329 }\r
3330 if (!bi->isBoundary(i)) {\r
3331 printStringBreaks(ustr, forward, count);\r
3332 errln("happy boundary test failed: expected %d a boundary", \r
3333 i);\r
3334 return;\r
3335 }\r
3336 prev = i;\r
3337 }\r
3338 }\r
3339 delete bi;\r
3340}\r
3341\r
3342void RBBITest::TestLineBreaks(void)\r
3343{\r
3344#if !UCONFIG_NO_REGULAR_EXPRESSIONS\r
3345 Locale locale("en");\r
3346 UErrorCode status = U_ZERO_ERROR;\r
3347 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);\r
3348 UChar str[50]; \r
3349 static const char *strlist[] = \r
3350 {\r
3351 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",\r
3352 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",\r
3353 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",\r
3354 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",\r
3355 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",\r
3356 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",\r
3357 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",\r
3358 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",\r
3359 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",\r
3360 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",\r
3361 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",\r
3362 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",\r
3363 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",\r
3364 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",\r
3365 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",\r
3366 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",\r
3367 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",\r
3368 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",\r
3369 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",\r
3370 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",\r
3371 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",\r
3372 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",\r
3373 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",\r
3374 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",\r
3375 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",\r
3376 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",\r
3377 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",\r
3378 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",\r
3379 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",\r
3380 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",\r
3381 "\\u2014\\u0020\\u000a\\u17c5\\u24fc",\r
3382 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",\r
3383 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",\r
3384 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",\r
3385 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",\r
3386 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",\r
3387 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",\r
3388 };\r
3389 int loop;\r
3390 if (U_FAILURE(status)) {\r
3391 errln("Creation of break iterator failed %s", u_errorName(status));\r
3392 return;\r
3393 }\r
3394 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {\r
3395 // printf("looping %d\n", loop);\r
3396 u_unescape(strlist[loop], str, 20);\r
3397 UnicodeString ustr(str);\r
3398 RBBILineMonkey monkey;\r
3399\r
3400 int expected[50];\r
3401 int expectedcount = 0;\r
3402\r
3403 monkey.setText(ustr);\r
3404 int i;\r
3405 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {\r
3406 expected[expectedcount ++] = i;\r
3407 }\r
3408\r
3409 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);\r
3410 }\r
3411 delete bi;\r
3412#endif\r
3413}\r
3414\r
3415void RBBITest::TestSentBreaks(void)\r
3416{\r
3417 Locale locale("en");\r
3418 UErrorCode status = U_ZERO_ERROR;\r
3419 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);\r
3420 UChar str[100]; \r
3421 static const char *strlist[] = \r
3422 {\r
3423 "Now\ris\nthe\r\ntime\n\rfor\r\r",\r
3424 "This\n",\r
3425 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",\r
3426 "\"Sentence ending with a quote.\" Bye.",\r
3427 " (This is it). Testing the sentence iterator. \"This isn't it.\"", \r
3428 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",\r
3429 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",\r
3430 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",\r
3431 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",\r
3432 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",\r
3433 };\r
3434 int loop;\r
3435 int forward[100];\r
3436 if (U_FAILURE(status)) {\r
3437 errln("Creation of break iterator failed %s", u_errorName(status));\r
3438 return;\r
3439 }\r
3440 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {\r
3441 u_unescape(strlist[loop], str, 100);\r
3442 UnicodeString ustr(str);\r
3443\r
3444 int count = 0;\r
3445 bi->setText(ustr);\r
3446 int i;\r
3447 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {\r
3448 forward[count ++] = i;\r
3449 }\r
3450 testBreakBoundPreceding(this, ustr, bi, forward, count);\r
3451 }\r
3452 delete bi;\r
3453}\r
3454\r
3455void RBBITest::TestMonkey(char *params) {\r
3456#if !UCONFIG_NO_REGULAR_EXPRESSIONS\r
3457\r
3458 UErrorCode status = U_ZERO_ERROR;\r
3459 int32_t loopCount = 500;\r
3460 int32_t seed = 1;\r
3461 UnicodeString breakType = "all";\r
3462 Locale locale("en");\r
3463\r
3464 if (quick == FALSE) {\r
3465 loopCount = 10000;\r
3466 }\r
3467\r
3468 if (params) {\r
3469 UnicodeString p(params);\r
3470 loopCount = getIntParam("loop", p, loopCount);\r
3471 seed = getIntParam("seed", p, seed);\r
3472\r
3473 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);\r
3474 if (m.find()) {\r
3475 breakType = m.group(1, status);\r
3476 m.reset();\r
3477 p = m.replaceFirst("", status);\r
3478 }\r
3479\r
3480 m.reset(p);\r
3481 if (RegexMatcher("\\S", p, 0, status).find()) {\r
3482 // Each option is stripped out of the option string as it is processed.\r
3483 // All options have been checked. The option string should have been completely emptied..\r
3484 char buf[100];\r
3485 p.extract(buf, sizeof(buf), NULL, status);\r
3486 buf[sizeof(buf)-1] = 0;\r
3487 errln("Unrecognized or extra parameter: %s\n", buf);\r
3488 return;\r
3489 }\r
3490\r
3491 }\r
3492\r
3493 if (breakType == "char" || breakType == "all") {\r
3494 RBBICharMonkey m;\r
3495 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);\r
3496 if (U_SUCCESS(status)) {\r
3497 RunMonkey(bi, m, "char", seed, loopCount);\r
3498 }\r
3499 else {\r
3500 errln("Creation of character break iterator failed %s", u_errorName(status));\r
3501 }\r
3502 delete bi;\r
3503 }\r
3504\r
3505 if (breakType == "word" || breakType == "all") {\r
3506 logln("Word Break Monkey Test");\r
3507 RBBIWordMonkey m;\r
3508 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);\r
3509 if (U_SUCCESS(status)) {\r
3510 RunMonkey(bi, m, "word", seed, loopCount);\r
3511 }\r
3512 else {\r
3513 errln("Creation of word break iterator failed %s", u_errorName(status));\r
3514 }\r
3515 delete bi;\r
3516 }\r
3517\r
3518 if (breakType == "line" || breakType == "all") {\r
3519 logln("Line Break Monkey Test");\r
3520 RBBILineMonkey m;\r
3521 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);\r
3522 if (params == NULL) {\r
3523 loopCount = 50;\r
3524 }\r
3525 if (U_SUCCESS(status)) {\r
3526 RunMonkey(bi, m, "line", seed, loopCount);\r
3527 }\r
3528 else {\r
3529 errln("Creation of line break iterator failed %s", u_errorName(status));\r
3530 }\r
3531 delete bi;\r
3532 }\r
3533\r
3534\r
3535#endif\r
3536}\r
3537\r
3538//\r
3539// Run a RBBI monkey test. Common routine, for all break iterator types.\r
3540// Parameters:\r
3541// bi - the break iterator to use\r
3542// mk - MonkeyKind, abstraction for obtaining expected results\r
3543// name - Name of test (char, word, etc.) for use in error messages\r
3544// seed - Seed for starting random number generator (parameter from user)\r
3545// numIterations\r
3546//\r
3547void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed, int32_t numIterations) {\r
3548\r
3549#if !UCONFIG_NO_REGULAR_EXPRESSIONS\r
3550\r
3551 const int32_t TESTSTRINGLEN = 500;\r
3552 UnicodeString testText;\r
3553 int32_t numCharClasses;\r
3554 UVector *chClasses;\r
3555 int expected[TESTSTRINGLEN*2 + 1];\r
3556 int expectedCount = 0;\r
3557 char expectedBreaks[TESTSTRINGLEN*2 + 1];\r
3558 char forwardBreaks[TESTSTRINGLEN*2 + 1];\r
3559 char reverseBreaks[TESTSTRINGLEN*2+1];\r
3560 char isBoundaryBreaks[TESTSTRINGLEN*2+1];\r
3561 char followingBreaks[TESTSTRINGLEN*2+1];\r
3562 char precedingBreaks[TESTSTRINGLEN*2+1];\r
3563 int i;\r
3564 int loopCount = 0;\r
3565\r
3566 m_seed = seed;\r
3567\r
3568 numCharClasses = mk.charClasses()->size();\r
3569 chClasses = mk.charClasses();\r
3570\r
3571 // Check for errors that occured during the construction of the MonkeyKind object.\r
3572 // Can't report them where they occured because errln() is a method coming from intlTest,\r
3573 // and is not visible outside of RBBITest :-(\r
3574 if (U_FAILURE(mk.deferredStatus)) {\r
3575 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));\r
3576 return;\r
3577 }\r
3578\r
3579 // Verify that the character classes all have at least one member.\r
3580 for (i=0; i<numCharClasses; i++) {\r
3581 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);\r
3582 if (s == NULL || s->size() == 0) {\r
3583 errln("Character Class #%d is null or of zero size.", i);\r
3584 return;\r
3585 }\r
3586 }\r
3587\r
3588 while (loopCount < numIterations || numIterations == -1) {\r
3589 if (numIterations == -1 && loopCount % 10 == 0) {\r
3590 // If test is running in an infinite loop, display a periodic tic so\r
3591 // we can tell that it is making progress.\r
3592 fprintf(stderr, ".");\r
3593 }\r
3594 // Save current random number seed, so that we can recreate the random numbers\r
3595 // for this loop iteration in event of an error.\r
3596 seed = m_seed;\r
3597\r
3598 // Populate a test string with data.\r
3599 testText.truncate(0);\r
3600 for (i=0; i<TESTSTRINGLEN; i++) {\r
3601 int32_t aClassNum = m_rand() % numCharClasses;\r
3602 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);\r
3603 int32_t charIdx = m_rand() % classSet->size();\r
3604 UChar32 c = classSet->charAt(charIdx);\r
3605 if (c < 0) { // TODO: deal with sets containing strings.\r
3606 errln("c < 0");\r
3607 }\r
3608 testText.append(c);\r
3609 }\r
3610\r
3611 // Calculate the expected results for this test string.\r
3612 mk.setText(testText);\r
3613 memset(expectedBreaks, 0, sizeof(expectedBreaks));\r
3614 expectedBreaks[0] = 1;\r
3615 int32_t breakPos = 0;\r
3616 expectedCount = 0;\r
3617 for (;;) {\r
3618 breakPos = mk.next(breakPos);\r
3619 if (breakPos == -1) {\r
3620 break;\r
3621 }\r
3622 if (breakPos > testText.length()) {\r
3623 errln("breakPos > testText.length()");\r
3624 }\r
3625 expectedBreaks[breakPos] = 1;\r
3626 expected[expectedCount ++] = breakPos;\r
3627 }\r
3628\r
3629 // Find the break positions using forward iteration\r
3630 memset(forwardBreaks, 0, sizeof(forwardBreaks));\r
3631 bi->setText(testText);\r
3632 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {\r
3633 if (i < 0 || i > testText.length()) {\r
3634 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);\r
3635 break;\r
3636 }\r
3637 forwardBreaks[i] = 1;\r
3638 }\r
3639\r
3640 // Find the break positions using reverse iteration\r
3641 memset(reverseBreaks, 0, sizeof(reverseBreaks));\r
3642 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {\r
3643 if (i < 0 || i > testText.length()) {\r
3644 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);\r
3645 break;\r
3646 }\r
3647 reverseBreaks[i] = 1;\r
3648 }\r
3649\r
3650 // Find the break positions using isBoundary() tests.\r
3651 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));\r
3652 U_ASSERT(sizeof(isBoundaryBreaks) > testText.length());\r
3653 for (i=0; i<=testText.length(); i++) {\r
3654 isBoundaryBreaks[i] = bi->isBoundary(i);\r
3655 }\r
3656\r
3657\r
3658 // Find the break positions using the following() function.\r
3659 // printf(".");\r
3660 memset(followingBreaks, 0, sizeof(followingBreaks));\r
3661 int32_t lastBreakPos = 0;\r
3662 followingBreaks[0] = 1;\r
3663 for (i=0; i<testText.length(); i++) {\r
3664 breakPos = bi->following(i);\r
3665 if (breakPos <= i ||\r
3666 breakPos < lastBreakPos ||\r
3667 breakPos > testText.length() ||\r
3668 breakPos > lastBreakPos && lastBreakPos > i ) {\r
3669 errln("%s break monkey test: "\r
3670 "Out of range value returned by BreakIterator::following().\n"\r
3671 "Random seed=%d", name, seed);\r
3672 break;\r
3673 }\r
3674 followingBreaks[breakPos] = 1;\r
3675 lastBreakPos = breakPos;\r
3676 }\r
3677\r
3678 // Find the break positions using the preceding() function.\r
3679 memset(precedingBreaks, 0, sizeof(followingBreaks));\r
3680 lastBreakPos = testText.length();\r
3681 precedingBreaks[testText.length()] = 1;\r
3682 for (i=testText.length(); i>0; i--) {\r
3683 breakPos = bi->preceding(i);\r
3684 if (breakPos >= i ||\r
3685 breakPos > lastBreakPos ||\r
3686 breakPos < 0 ||\r
3687 breakPos < lastBreakPos && lastBreakPos < i ) {\r
3688 errln("%s break monkey test: "\r
3689 "Out of range value returned by BreakIterator::preceding().\n"\r
3690 "index=%d; prev returned %d; lastBreak=%d" ,\r
3691 name, i, breakPos, lastBreakPos);\r
3692 precedingBreaks[i] = 2; // Forces an error.\r
3693 } else {\r
3694 precedingBreaks[breakPos] = 1;\r
3695 lastBreakPos = breakPos;\r
3696 }\r
3697 }\r
3698\r
3699 // Compare the expected and actual results.\r
3700 for (i=0; i<=testText.length(); i++) {\r
3701 const char *errorType = NULL;\r
3702 if (forwardBreaks[i] != expectedBreaks[i]) {\r
3703 errorType = "next()";\r
3704 } else if (reverseBreaks[i] != forwardBreaks[i]) {\r
3705 errorType = "previous()";\r
3706 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {\r
3707 errorType = "isBoundary()";\r
3708 } else if (followingBreaks[i] != expectedBreaks[i]) {\r
3709 errorType = "following()";\r
3710 } else if (precedingBreaks[i] != expectedBreaks[i]) {\r
3711 errorType = "preceding()";\r
3712 }\r
3713\r
3714\r
3715 if (errorType != NULL) {\r
3716 // Format a range of the test text that includes the failure as\r
3717 // a data item that can be included in the rbbi test data file.\r
3718\r
3719 // Start of the range is the last point where expected and actual results\r
3720 // both agreed that there was a break position.\r
3721 int startContext = i;\r
3722 int32_t count = 0;\r
3723 for (;;) {\r
3724 if (startContext==0) { break; }\r
3725 startContext --;\r
3726 if (expectedBreaks[startContext] != 0) {\r
3727 if (count == 2) break;\r
3728 count ++;\r
3729 }\r
3730 }\r
3731\r
3732 // End of range is two expected breaks past the start position.\r
3733 int endContext = i + 1;\r
3734 int ci;\r
3735 for (ci=0; ci<2; ci++) { // Number of items to include in error text.\r
3736 for (;;) {\r
3737 if (endContext >= testText.length()) {break;}\r
3738 if (expectedBreaks[endContext-1] != 0) { \r
3739 if (count == 0) break;\r
3740 count --;\r
3741 }\r
3742 endContext ++;\r
3743 }\r
3744 }\r
3745\r
3746 // Format looks like "<data><>\uabcd\uabcd<>\U0001abcd...</data>"\r
3747 UnicodeString errorText = "<data>";\r
3748 /***if (strcmp(errorType, "next()") == 0) {\r
3749 startContext = 0;\r
3750 endContext = testText.length();\r
3751 \r
3752 printStringBreaks(testText, expected, expectedCount);\r
3753 }***/\r
3754\r
3755 for (ci=startContext; ci<endContext;) {\r
3756 UnicodeString hexChars("0123456789abcdef");\r
3757 UChar32 c;\r
3758 int bn;\r
3759 c = testText.char32At(ci);\r
3760 if (ci == i) {\r
3761 // This is the location of the error.\r
3762 errorText.append("<?>");\r
3763 } else if (expectedBreaks[ci] != 0) {\r
3764 // This a non-error expected break position.\r
3765 errorText.append("<>");\r
3766 }\r
3767 if (c < 0x10000) {\r
3768 errorText.append("\\u");\r
3769 for (bn=12; bn>=0; bn-=4) {\r
3770 errorText.append(hexChars.charAt((c>>bn)&0xf));\r
3771 }\r
3772 } else {\r
3773 errorText.append("\\U");\r
3774 for (bn=28; bn>=0; bn-=4) {\r
3775 errorText.append(hexChars.charAt((c>>bn)&0xf));\r
3776 }\r
3777 }\r
3778 ci = testText.moveIndex32(ci, 1);\r
3779 }\r
3780 errorText.append("<>");\r
3781 errorText.append("</data>\n");\r
3782\r
3783 // Output the error\r
3784 char charErrorTxt[500];\r
3785 UErrorCode status = U_ZERO_ERROR;\r
3786 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);\r
3787 charErrorTxt[sizeof(charErrorTxt)-1] = 0;\r
3788 errln("%s break monkey test error. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",\r
3789 name, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),\r
3790 errorType, seed, i, charErrorTxt);\r
3791 break;\r
3792 }\r
3793 }\r
3794\r
3795 loopCount++;\r
3796 }\r
3797#endif\r
3798}\r
3799\r
3800\r
3801#endif /* #if !UCONFIG_NO_BREAK_ITERATION */\r