]>
Commit | Line | Data |
---|---|---|
374ca955 A |
1 | /********************************************************************\r |
2 | * COPYRIGHT:\r | |
3 | * Copyright (c) 1999-2004, International Business Machines Corporation and\r | |
4 | * others. All Rights Reserved.\r | |
5 | ********************************************************************/\r | |
6 | /************************************************************************\r | |
7 | * Date Name Description\r | |
8 | * 12/15/99 Madhu Creation.\r | |
9 | * 01/12/2000 Madhu Updated for changed API and added new tests\r | |
10 | ************************************************************************/\r | |
11 | \r | |
12 | #include "unicode/utypes.h"\r | |
13 | \r | |
14 | #if !UCONFIG_NO_BREAK_ITERATION\r | |
15 | \r | |
16 | #include "unicode/utypes.h"\r | |
17 | #include "unicode/brkiter.h"\r | |
18 | #include "unicode/rbbi.h"\r | |
19 | #include "unicode/uchar.h"\r | |
20 | #include "unicode/utf16.h"\r | |
21 | #include "unicode/ucnv.h"\r | |
22 | #include "unicode/schriter.h"\r | |
23 | #include "unicode/uniset.h"\r | |
24 | #include "unicode/regex.h" // TODO: make conditional on regexp being built.\r | |
25 | #include "unicode/ustring.h"\r | |
26 | \r | |
27 | #include "intltest.h"\r | |
28 | #include "rbbitst.h"\r | |
29 | #include <string.h>\r | |
30 | #include "uvector.h"\r | |
31 | #include "uvectr32.h"\r | |
32 | #include <string.h>\r | |
33 | #include <stdio.h>\r | |
34 | #include <stdlib.h>\r | |
35 | \r | |
36 | \r | |
37 | \r | |
38 | //---------------------------------------------------------------------------\r | |
39 | //\r | |
40 | // class BITestData Holds a set of Break iterator test data and results\r | |
41 | // Includes\r | |
42 | // - the string data to be broken\r | |
43 | // - a vector of the expected break positions.\r | |
44 | // - a vector of source line numbers for the data,\r | |
45 | // (to help see where errors occurred.)\r | |
46 | // - The expected break tag values.\r | |
47 | // - Vectors of actual break positions and tag values.\r | |
48 | // - Functions for comparing actual with expected and\r | |
49 | // reporting errors.\r | |
50 | //\r | |
51 | //----------------------------------------------------------------------------\r | |
52 | class BITestData {\r | |
53 | public:\r | |
54 | UnicodeString fDataToBreak;\r | |
55 | UVector fExpectedBreakPositions;\r | |
56 | UVector fExpectedTags;\r | |
57 | UVector fLineNum;\r | |
58 | UVector fActualBreakPositions; // Test Results.\r | |
59 | UVector fActualTags;\r | |
60 | \r | |
61 | BITestData(UErrorCode &status);\r | |
62 | void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);\r | |
63 | void checkResults(const char *heading, RBBITest *test);\r | |
64 | void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);\r | |
65 | void clearResults();\r | |
66 | };\r | |
67 | \r | |
68 | //\r | |
69 | // Constructor.\r | |
70 | //\r | |
71 | BITestData::BITestData(UErrorCode &status)\r | |
72 | : fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status),\r | |
73 | fActualTags(status)\r | |
74 | {\r | |
75 | };\r | |
76 | \r | |
77 | //\r | |
78 | // addDataChunk. Add a section (non-breaking) piece if data to the test data.\r | |
79 | // The macro form collects the line number, which is helpful\r | |
80 | // when tracking down failures.\r | |
81 | //\r | |
82 | // A null data item is inserted at the start of each test's data\r | |
83 | // to put the starting zero into the data list. The position saved for\r | |
84 | // each non-null item is its ending position.\r | |
85 | //\r | |
86 | #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status);\r | |
87 | void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {\r | |
88 | if (U_FAILURE(status)) {return;}\r | |
89 | if (data != NULL) {\r | |
90 | fDataToBreak.append(CharsToUnicodeString(data));\r | |
91 | }\r | |
92 | fExpectedBreakPositions.addElement(fDataToBreak.length(), status);\r | |
93 | fExpectedTags.addElement(tag, status);\r | |
94 | fLineNum.addElement(lineNum, status);\r | |
95 | };\r | |
96 | \r | |
97 | \r | |
98 | //\r | |
99 | // checkResults. Compare the actual and expected break positions, report any differences.\r | |
100 | //\r | |
101 | void BITestData::checkResults(const char *heading, RBBITest *test) {\r | |
102 | int32_t expectedIndex = 0;\r | |
103 | int32_t actualIndex = 0;\r | |
104 | \r | |
105 | for (;;) {\r | |
106 | // If we've run through both the expected and actual results vectors, we're done.\r | |
107 | // break out of the loop.\r | |
108 | if (expectedIndex >= fExpectedBreakPositions.size() &&\r | |
109 | actualIndex >= fActualBreakPositions.size()) {\r | |
110 | break;\r | |
111 | }\r | |
112 | \r | |
113 | \r | |
114 | if (expectedIndex >= fExpectedBreakPositions.size()) {\r | |
115 | err(heading, test, expectedIndex-1, actualIndex);\r | |
116 | actualIndex++;\r | |
117 | continue;\r | |
118 | }\r | |
119 | \r | |
120 | if (actualIndex >= fActualBreakPositions.size()) {\r | |
121 | err(heading, test, expectedIndex, actualIndex-1);\r | |
122 | expectedIndex++;\r | |
123 | continue;\r | |
124 | }\r | |
125 | \r | |
126 | if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {\r | |
127 | err(heading, test, expectedIndex, actualIndex);\r | |
128 | // Try to resync the positions of the indices, to avoid a rash of spurious erros.\r | |
129 | if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {\r | |
130 | actualIndex++;\r | |
131 | } else {\r | |
132 | expectedIndex++;\r | |
133 | }\r | |
134 | continue;\r | |
135 | }\r | |
136 | \r | |
137 | if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {\r | |
138 | test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",\r | |
139 | heading, fLineNum.elementAt(expectedIndex),\r | |
140 | fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));\r | |
141 | }\r | |
142 | \r | |
143 | actualIndex++;\r | |
144 | expectedIndex++;\r | |
145 | }\r | |
146 | }\r | |
147 | \r | |
148 | //\r | |
149 | // err - An error was found. Report it, along with information about where the\r | |
150 | // incorrectly broken test data appeared in the source file.\r | |
151 | //\r | |
152 | void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)\r | |
153 | {\r | |
154 | int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx);\r | |
155 | int32_t actual = fActualBreakPositions.elementAti(actualIdx);\r | |
156 | int32_t o = 0;\r | |
157 | int32_t line = fLineNum.elementAti(expectedIdx);\r | |
158 | if (expectedIdx > 0) {\r | |
159 | // The line numbers are off by one because a premature break occurs somewhere\r | |
160 | // within the previous item, rather than at the start of the current (expected) item.\r | |
161 | // We want to report the offset of the unexpected break from the start of\r | |
162 | // this previous item.\r | |
163 | o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);\r | |
164 | }\r | |
165 | if (actual < expected) {\r | |
166 | test->errln("%s unexpected break at offset %d in test item from line %d", heading, o, line);\r | |
167 | } else {\r | |
168 | test->errln("%s Failed to find break at end of item from line %d", heading, line);\r | |
169 | }\r | |
170 | }\r | |
171 | \r | |
172 | \r | |
173 | void BITestData::clearResults() {\r | |
174 | fActualBreakPositions.removeAllElements();\r | |
175 | fActualTags.removeAllElements();\r | |
176 | }\r | |
177 | \r | |
178 | \r | |
179 | //-----------------------------------------------------------------------------------\r | |
180 | //\r | |
181 | // Canned Test Characters\r | |
182 | //\r | |
183 | //-----------------------------------------------------------------------------------\r | |
184 | \r | |
185 | static const UChar cannedTestArray[] = {\r | |
186 | 0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,\r | |
187 | 0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,\r | |
188 | 0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,\r | |
189 | 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,\r | |
190 | 0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,\r | |
191 | 0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,\r | |
192 | 0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,\r | |
193 | 0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000\r | |
194 | };\r | |
195 | \r | |
196 | static UnicodeString* cannedTestChars = 0;\r | |
197 | \r | |
198 | #define halfNA "\\u0928\\u094d\\u200d"\r | |
199 | #define halfSA "\\u0938\\u094d\\u200d"\r | |
200 | #define halfCHA "\\u091a\\u094d\\u200d"\r | |
201 | #define halfKA "\\u0915\\u094d\\u200d"\r | |
202 | #define deadTA "\\u0924\\u094d"\r | |
203 | \r | |
204 | //--------------------------------------------------------------------------------------\r | |
205 | //\r | |
206 | // RBBITest constructor and destructor\r | |
207 | //\r | |
208 | //--------------------------------------------------------------------------------------\r | |
209 | \r | |
210 | RBBITest::RBBITest() {\r | |
211 | UnicodeString temp(cannedTestArray);\r | |
212 | cannedTestChars = new UnicodeString();\r | |
213 | *cannedTestChars += (UChar)0x0000;\r | |
214 | *cannedTestChars += temp;\r | |
215 | }\r | |
216 | \r | |
217 | \r | |
218 | RBBITest::~RBBITest() {\r | |
219 | delete cannedTestChars;\r | |
220 | }\r | |
221 | \r | |
222 | \r | |
// Tag values expected from word break iterators in the tests of this file.
// NOTE(review): these look like the lower bounds of the UWordBreak status
//               ranges declared in unicode/ubrk.h - confirm before relying on it.
static const int T_NUMBER = 100;     // numbers
static const int T_LETTER = 200;     // letters
static const int T_H_OR_K = 300;     // Hiragana or Katakana
static const int T_IDEO   = 400;     // ideographs
227 | \r | |
228 | \r | |
229 | \r | |
230 | \r | |
231 | \r | |
232 | \r | |
233 | //--------------------------------------------------------------------\r | |
234 | //Testing the BreakIterator for devanagari script\r | |
235 | //--------------------------------------------------------------------\r | |
236 | \r | |
// Dead forms are a Devanagari consonant followed by the virama, U+094D.
// The text is escape notation with doubled backslashes, so each C++ string
// holds a literal "\u....".
#define deadRA     "\\u0930\\u094d"         /* RA   + virama */
#define deadPHA    "\\u092b\\u094d"         /* PHA  + virama */
#define deadTTHA   "\\u0920\\u094d"         /* TTHA + virama */
#define deadPA     "\\u092a\\u094d"         /* PA   + virama */
#define deadSA     "\\u0938\\u094d"         /* SA   + virama */
#define visarga    "\\u0903"                /* devanagari visarga; looks like an English colon */
243 | \r | |
244 | \r | |
245 | \r | |
246 | \r | |
247 | \r | |
248 | \r | |
249 | //-----------------------------------------------------------------------------------\r | |
250 | //\r | |
251 | // Test for status {tag} return value from break rules.\r | |
252 | // TODO: a more thorough test.\r | |
253 | //\r | |
254 | //-----------------------------------------------------------------------------------\r | |
255 | void RBBITest::TestStatusReturn() {\r | |
256 | UnicodeString rulesString1 = "$Letters = [:L:];\n"\r | |
257 | "$Numbers = [:N:];\n"\r | |
258 | "$Letters+{1};\n"\r | |
259 | "$Numbers+{2};\n"\r | |
260 | "Help\\ {4}/me\\!;\n"\r | |
261 | "[^$Letters $Numbers];\n"\r | |
262 | "!.*;\n";\r | |
263 | UnicodeString testString1 = "abc123..abc Help me Help me!";\r | |
264 | // 01234567890123456789012345678\r | |
265 | int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};\r | |
266 | int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};\r | |
267 | \r | |
268 | UErrorCode status=U_ZERO_ERROR;\r | |
269 | UParseError parseError;\r | |
270 | \r | |
271 | RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);\r | |
272 | if(U_FAILURE(status)) {\r | |
273 | errln("FAIL : in construction");\r | |
274 | } else {\r | |
275 | int32_t pos;\r | |
276 | int32_t i = 0;\r | |
277 | bi->setText(testString1);\r | |
278 | for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {\r | |
279 | if (pos != bounds1[i]) {\r | |
280 | errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos);\r | |
281 | break;\r | |
282 | }\r | |
283 | \r | |
284 | int tag = bi->getRuleStatus();\r | |
285 | if (tag != brkStatus[i]) {\r | |
286 | errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);\r | |
287 | break;\r | |
288 | }\r | |
289 | i++;\r | |
290 | }\r | |
291 | }\r | |
292 | delete bi;\r | |
293 | }\r | |
294 | \r | |
295 | \r | |
296 | static void printStringBreaks(UnicodeString ustr, int expected[],\r | |
297 | int expectedcount)\r | |
298 | {\r | |
299 | UErrorCode status = U_ZERO_ERROR;\r | |
300 | char name[100];\r | |
301 | printf("code alpha extend alphanum type line name\n");\r | |
302 | int j;\r | |
303 | for (j = 0; j < ustr.length(); j ++) {\r | |
304 | if (expectedcount > 0) {\r | |
305 | int k;\r | |
306 | for (k = 0; k < expectedcount; k ++) {\r | |
307 | if (j == expected[k]) {\r | |
308 | printf("------------------------------------------------ %d\n",\r | |
309 | j);\r | |
310 | }\r | |
311 | }\r | |
312 | }\r | |
313 | UChar32 c = ustr.char32At(j);\r | |
314 | if (c > 0xffff) {\r | |
315 | j ++;\r | |
316 | }\r | |
317 | u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);\r | |
318 | printf("%7x %5d %6d %8d %4s %4s %s\n", (int)c, \r | |
319 | u_isUAlphabetic(c), \r | |
320 | u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),\r | |
321 | u_isalnum(c), \r | |
322 | u_getPropertyValueName(UCHAR_GENERAL_CATEGORY, \r | |
323 | u_charType(c), \r | |
324 | U_SHORT_PROPERTY_NAME), \r | |
325 | u_getPropertyValueName(UCHAR_LINE_BREAK, \r | |
326 | u_getIntPropertyValue(c, \r | |
327 | UCHAR_LINE_BREAK), \r | |
328 | U_SHORT_PROPERTY_NAME),\r | |
329 | name);\r | |
330 | }\r | |
331 | }\r | |
332 | \r | |
333 | void RBBITest::TestThaiLineBreak() {\r | |
334 | UErrorCode status = U_ZERO_ERROR;\r | |
335 | BITestData thaiLineSelection(status);\r | |
336 | \r | |
337 | // \u0e2f-- the Thai paiyannoi character-- isn't a letter. It's a symbol that\r | |
338 | // represents elided letters at the end of a long word. It should be bound to\r | |
339 | // the end of the word and not treated as an independent punctuation mark.\r | |
340 | \r | |
341 | \r | |
342 | ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data\r | |
343 | ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status);\r | |
344 | ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status);\r | |
345 | ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status);\r | |
346 | ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status);\r | |
347 | // ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status);\r | |
348 | // ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);\r | |
349 | ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status);\r | |
350 | // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us\r | |
351 | ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status);\r | |
352 | ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status);\r | |
353 | ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status);\r | |
354 | ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status);\r | |
355 | ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status);\r | |
356 | ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status);\r | |
357 | \r | |
358 | // the one time where the paiyannoi occurs somewhere other than at the end\r | |
359 | // of a word is in the Thai abbrevation for "etc.", which both begins and\r | |
360 | // ends with a paiyannoi\r | |
361 | ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status);\r | |
362 | ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);\r | |
363 | ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status);\r | |
364 | \r | |
365 | RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(\r | |
366 | Locale("th"), status);\r | |
367 | if (U_FAILURE(status))\r | |
368 | {\r | |
369 | errln("Failed to create the BreakIterator for Thai locale in TestThaiLineBreak.\n");\r | |
370 | return;\r | |
371 | }\r | |
372 | \r | |
373 | generalIteratorTest(*e, thaiLineSelection);\r | |
374 | delete e;\r | |
375 | }\r | |
376 | \r | |
377 | \r | |
378 | \r | |
379 | void RBBITest::TestMixedThaiLineBreak()\r | |
380 | {\r | |
381 | UErrorCode status = U_ZERO_ERROR;\r | |
382 | BITestData thaiLineSelection(status);\r | |
383 | \r | |
384 | ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data\r | |
385 | \r | |
386 | // Arabic numerals should always be separated from surrounding Thai text\r | |
387 | /*\r | |
388 | ADD_DATACHUNK(thaiLineSelection, "\\u0e04\\u0e48\\u0e32", 0, status);\r | |
389 | ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e07\\u0e34\\u0e19", 0, status);\r | |
390 | ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17", 0, status);\r | |
391 | ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e15\\u0e30", 0, status);\r | |
392 | ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e31\\u0e1a", 0, status);\r | |
393 | thaiLineSelection->addElement("39");\r | |
394 | ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17 ", 0, status);\r | |
395 | \r | |
396 | // words in non-Thai scripts should always be separated from surrounding Thai text\r | |
397 | ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e14", 0, status);\r | |
398 | ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e2d\\u0e1a", 0, status);\r | |
399 | thaiLineSelection->addElement("Java");\r | |
400 | ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e19", 0, status);\r | |
401 | ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e04\\u0e23\\u0e37\\u0e48\\u0e2d\\u0e07", 0, status);\r | |
402 | ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e2d\\u0e1a\\u0e35\\u0e40\\u0e2d\\u0e47\\u0e21 ", 0, status);\r | |
403 | \r | |
404 | // Thai numerals should always be separated from the text surrounding them\r | |
405 | ADD_DATACHUNK(thaiLineSelection, "\\u0e04\\u0e48\\u0e32", 0, status);\r | |
406 | ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e07\\u0e34\\u0e19", 0, status);\r | |
407 | ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17", 0, status);\r | |
408 | ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e15\\u0e30", 0, status);\r | |
409 | ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e31\\u0e1a", 0, status);\r | |
410 | ADD_DATACHUNK(thaiLineSelection, "\\u0e53\\u0e59", 0, status);\r | |
411 | ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17 ", 0, status);\r | |
412 | \r | |
413 | // Thai text should interact correctly with punctuation and symbols\r | |
414 | ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e2d\\u0e1a\\u0e35\\u0e40\\u0e2d\\u0e47\\u0e21", 0, status);\r | |
415 | // ADD_DATACHUNK(thaiLineSelection, "(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28", 0, status);\r | |
416 | // ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e17\\u0e22)", 0, status);\r | |
417 | ADD_DATACHUNK(thaiLineSelection, "(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28\\u0e44\\u0e17\\u0e22)", 0, status);\r | |
418 | // I believe the commented-out reading above to be the correct one, but this is what passes with our current dictionary\r | |
419 | ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e33\\u0e01\\u0e31\\u0e14", 0, status);\r | |
420 | ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e1b\\u0e34\\u0e14", 0, status);\r | |
421 | ADD_DATACHUNK(thaiLineSelection, "\\u0e15\\u0e31\\u0e27\"", 0, status);\r | |
422 | */\r | |
423 | \r | |
424 | // The Unicode Linebreak TR says do not break before or after quotes.\r | |
425 | // So this test is changed ot not break around the quote.\r | |
426 | // TODO: should Thai break around the around the quotes, like the original behavior here?\r | |
427 | // ADD_DATACHUNK(thaiLineSelection, "\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\"", 0, status);\r | |
428 | // ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e38\\u0e48\\u0e19", 0, status);\r | |
429 | ADD_DATACHUNK(thaiLineSelection, "\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\""\r | |
430 | "\\u0e23\\u0e38\\u0e48\\u0e19", 0, status);\r | |
431 | \r | |
432 | ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);\r | |
433 | ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e14\\u0e37\\u0e2d\\u0e19\\u0e21\\u0e34.", 0, status);\r | |
434 | ADD_DATACHUNK(thaiLineSelection, "\\u0e22.", 0, status);\r | |
435 | ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e35\\u0e49", 0, status);\r | |
436 | ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e32\\u0e04\\u0e32", 0, status);\r | |
437 | ADD_DATACHUNK(thaiLineSelection, "$200", 0, status);\r | |
438 | ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e48\\u0e32", 0, status);\r | |
439 | ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19 ", 0, status);\r | |
440 | ADD_DATACHUNK(thaiLineSelection, "(\"\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\").", 0, status);\r | |
441 | \r | |
442 | RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status);\r | |
443 | if (U_FAILURE(status))\r | |
444 | {\r | |
445 | errln("Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak.\n");\r | |
446 | return;\r | |
447 | }\r | |
448 | \r | |
449 | \r | |
450 | generalIteratorTest(*e, thaiLineSelection);\r | |
451 | delete e;\r | |
452 | }\r | |
453 | \r | |
454 | \r | |
455 | void RBBITest::TestMaiyamok()\r | |
456 | {\r | |
457 | UErrorCode status = U_ZERO_ERROR;\r | |
458 | BITestData thaiLineSelection(status);\r | |
459 | ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data\r | |
460 | // the Thai maiyamok character is a shorthand symbol that means "repeat the previous\r | |
461 | // word". Instead of appearing as a word unto itself, however, it's kept together\r | |
462 | // with the word before it\r | |
463 | ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status);\r | |
464 | ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status);\r | |
465 | ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status);\r | |
466 | ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07\\u0e40\\u0e17\\u0e1e", 0, status);\r | |
467 | ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status);\r | |
468 | ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35\\u0e22\\u0e07", 0, status);\r | |
469 | ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);\r | |
470 | \r | |
471 | RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(\r | |
472 | Locale("th"), status);\r | |
473 | \r | |
474 | if (U_FAILURE(status))\r | |
475 | {\r | |
476 | errln("Failed to create the BreakIterator for Thai locale in TestMaiyamok.\n");\r | |
477 | return;\r | |
478 | }\r | |
479 | generalIteratorTest(*e, thaiLineSelection);\r | |
480 | delete e;\r | |
481 | }\r | |
482 | \r | |
483 | void RBBITest::TestThaiWordBreak() {\r | |
484 | UErrorCode status = U_ZERO_ERROR;\r | |
485 | BITestData thaiWordSelection(status);\r | |
486 | \r | |
487 | ADD_DATACHUNK(thaiWordSelection, NULL, 0, status); // Break at start of data\r | |
488 | ADD_DATACHUNK(thaiWordSelection, "\\u0E1A\\u0E17", 0, status); //2\r | |
489 | ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E35\\u0E48", 0, status); //5\r | |
490 | ADD_DATACHUNK(thaiWordSelection, "\\u0E51", 0, status); //6\r | |
491 | ADD_DATACHUNK(thaiWordSelection, "\\u0E1E\\u0E32\\u0E22\\u0E38", 0, status); //10\r | |
492 | ADD_DATACHUNK(thaiWordSelection, "\\u0E44\\u0E0B\\u0E42\\u0E04\\u0E25\\u0E19", 0, status); //16\r | |
493 | ADD_DATACHUNK(thaiWordSelection, "\\u000D\\u000A", 0, status); //18\r | |
494 | \r | |
495 | // This is the correct result\r | |
496 | //ADD_DATACHUNK(thaiWordSelection, "\\u0E42\\u0E14\\u0E42\\u0E23\\u0E18\\u0E35", 0, status); //24\r | |
497 | //ADD_DATACHUNK(thaiWordSelection, "\\u0E2D\\u0E32\\u0E28\\u0E31\\u0E22", 0, status); //29\r | |
498 | \r | |
499 | // and this is what the dictionary does...\r | |
500 | ADD_DATACHUNK(thaiWordSelection, "\\u0E42\\u0E14", 0, status); // 20\r | |
501 | ADD_DATACHUNK(thaiWordSelection, "\\u0E42\\u0E23\\u0E18\\u0E35\\u0E2D\\u0E32\\u0E28\\u0E31\\u0E22", 0, status); //29\r | |
502 | \r | |
503 | ADD_DATACHUNK(thaiWordSelection, "\\u0E2D\\u0E22\\u0E39\\u0E48", 0, status); //33\r | |
504 | \r | |
505 | // This is the correct result\r | |
506 | //ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E48\\u0E32\\u0E21", 0, status); //37\r | |
507 | //ADD_DATACHUNK(thaiWordSelection, "\\u0E01\\u0E25\\u0E32\\u0E07", 0, status); //41\r | |
508 | \r | |
509 | // and this is what the dictionary does\r | |
510 | ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E48\\u0E32\\u0E21\\u0E01\\u0E25\\u0E32\\u0E07", 0, status); //41\r | |
511 | \r | |
512 | ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E38\\u0E48\\u0E07", 0, status); //45\r | |
513 | ADD_DATACHUNK(thaiWordSelection, "\\u0E43\\u0E2B\\u0E0D\\u0E48", 0, status); //49\r | |
514 | ADD_DATACHUNK(thaiWordSelection, "\\u0E43\\u0E19", 0, status); //51\r | |
515 | \r | |
516 | // This is the correct result\r | |
517 | //ADD_DATACHUNK(thaiWordSelection, "\\u0E41\\u0E04\\u0E19\\u0E0B\\u0E31\\u0E2A", 0, status); //57\r | |
518 | //ADD_DATACHUNK(thaiWordSelection, "\\u0E01\\u0E31\\u0E1A", 0, status); //60\r | |
519 | \r | |
520 | // and this is what the dictionary does\r | |
521 | ADD_DATACHUNK(thaiWordSelection, "\\u0E41\\u0E04\\u0E19", 0, status); // 54\r | |
522 | ADD_DATACHUNK(thaiWordSelection, "\\u0E0B\\u0E31\\u0E2A\\u0E01\\u0E31\\u0E1A", 0, status); //60\r | |
523 | \r | |
524 | ADD_DATACHUNK(thaiWordSelection, "\\u0E25\\u0E38\\u0E07", 0, status); //63\r | |
525 | \r | |
526 | // This is the correct result\r | |
527 | //ADD_DATACHUNK(thaiWordSelection, "\\u0E40\\u0E2E\\u0E19\\u0E23\\u0E35", 0, status); //68\r | |
528 | //ADD_DATACHUNK(thaiWordSelection, "\\u0E0A\\u0E32\\u0E27", 0, status); //71\r | |
529 | //ADD_DATACHUNK(thaiWordSelection, "\\u0E44\\u0E23\\u0E48", 0, status); //74\r | |
530 | //ADD_DATACHUNK(thaiWordSelection, "\\u0E41\\u0E25\\u0E30", 0, status); //77\r | |
531 | \r | |
532 | // and this is what the dictionary does\r | |
533 | ADD_DATACHUNK(thaiWordSelection, "\\u0E40\\u0E2E", 0, status); // 65\r | |
534 | ADD_DATACHUNK(thaiWordSelection, "\\u0E19\\u0E23\\u0E35\\u0E0A\\u0E32\\u0E27\\u0E44\\u0E23\\u0E48\\u0E41\\u0E25\\u0E30", 0, status); //77\r | |
535 | \r | |
536 | RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(\r | |
537 | Locale("th"), status);\r | |
538 | if (U_FAILURE(status))\r | |
539 | {\r | |
540 | errln("Failed to create the BreakIterator for Thai locale in TestThaiWordBreak.\n");\r | |
541 | return;\r | |
542 | }\r | |
543 | \r | |
544 | generalIteratorTest(*e, thaiWordSelection);\r | |
545 | delete e;\r | |
546 | }\r | |
547 | \r | |
548 | \r | |
549 | void RBBITest::TestBug3818() {\r | |
550 | UErrorCode status = U_ZERO_ERROR;\r | |
551 | \r | |
552 | // Four Thai words...\r | |
553 | static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, \r | |
554 | 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 }; \r | |
555 | UnicodeString thaiStr(thaiWordData);\r | |
556 | \r | |
557 | RuleBasedBreakIterator* bi = \r | |
558 | (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);\r | |
559 | if (U_FAILURE(status) || bi == NULL) {\r | |
560 | errln("Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));\r | |
561 | return;\r | |
562 | }\r | |
563 | bi->setText(thaiStr);\r | |
564 | \r | |
565 | int32_t startOfSecondWord = bi->following(1);\r | |
566 | if (startOfSecondWord != 4) {\r | |
567 | errln("Fail at file %s, line %d expected start of word at 4, got %d",\r | |
568 | __FILE__, __LINE__, startOfSecondWord);\r | |
569 | }\r | |
570 | startOfSecondWord = bi->following(0);\r | |
571 | if (startOfSecondWord != 4) {\r | |
572 | errln("Fail at file %s, line %d expected start of word at 4, got %d",\r | |
573 | __FILE__, __LINE__, startOfSecondWord);\r | |
574 | }\r | |
575 | delete bi;\r | |
576 | }\r | |
577 | \r | |
578 | \r | |
579 | void RBBITest::TestJapaneseWordBreak() {\r | |
580 | UErrorCode status = U_ZERO_ERROR;\r | |
581 | BITestData japaneseWordSelection(status);\r | |
582 | \r | |
583 | ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status); // Break at start of data\r | |
584 | ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2\r | |
585 | ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5\r | |
586 | ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7\r | |
587 | ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10\r | |
588 | ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11\r | |
589 | ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12\r | |
590 | \r | |
591 | RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(\r | |
592 | Locale("ja"), status);\r | |
593 | if (U_FAILURE(status))\r | |
594 | {\r | |
595 | errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");\r | |
596 | return;\r | |
597 | }\r | |
598 | \r | |
599 | generalIteratorTest(*e, japaneseWordSelection);\r | |
600 | delete e;\r | |
601 | }\r | |
602 | \r | |
603 | //---------------------------------------------\r | |
604 | // runIndexedTest\r | |
605 | //---------------------------------------------\r | |
606 | \r | |
607 | void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )\r | |
608 | {\r | |
609 | if (exec) logln("TestSuite RuleBasedBreakIterator: ");\r | |
610 | \r | |
611 | switch (index) {\r | |
612 | case 0: name = "TestBug4153072";\r | |
613 | if(exec) TestBug4153072(); break;\r | |
614 | case 1: name = "TestJapaneseLineBreak";\r | |
615 | if(exec) TestJapaneseLineBreak(); break;\r | |
616 | case 2: name = "TestStatusReturn";\r | |
617 | if(exec) TestStatusReturn(); break;\r | |
618 | \r | |
619 | case 3: name = "TestLineBreakData";\r | |
620 | if(exec) TestLineBreakData(); break;\r | |
621 | case 4: name = "TestEmptyString";\r | |
622 | if(exec) TestEmptyString(); break;\r | |
623 | \r | |
624 | case 5: name = "TestGetAvailableLocales";\r | |
625 | if(exec) TestGetAvailableLocales(); break;\r | |
626 | \r | |
627 | case 6: name = "TestGetDisplayName";\r | |
628 | if(exec) TestGetDisplayName(); break;\r | |
629 | \r | |
630 | case 7: name = "TestEndBehaviour";\r | |
631 | if(exec) TestEndBehaviour(); break;\r | |
632 | case 8: name = "TestMixedThaiLineBreak";\r | |
633 | if(exec) TestMixedThaiLineBreak(); break;\r | |
634 | case 9: name = "TestThaiWordBreak";\r | |
635 | if(exec) TestThaiWordBreak(); break;\r | |
636 | case 10: name = "TestThaiLineBreak";\r | |
637 | if(exec) TestThaiLineBreak(); break;\r | |
638 | case 11: name = "TestMaiyamok";\r | |
639 | if(exec) TestMaiyamok(); break;\r | |
640 | case 12: name = "TestWordBreaks";\r | |
641 | if(exec) TestWordBreaks(); break;\r | |
642 | case 13: name = "TestWordBoundary";\r | |
643 | if(exec) TestWordBoundary(); break;\r | |
644 | case 14: name = "TestLineBreaks";\r | |
645 | if(exec) TestLineBreaks(); break;\r | |
646 | case 15: name = "TestSentBreaks";\r | |
647 | if(exec) TestSentBreaks(); break;\r | |
648 | case 16: name = "TestExtended";\r | |
649 | if(exec) TestExtended(); break;\r | |
650 | case 17: name = "TestMonkey";\r | |
651 | if(exec) {\r | |
652 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS\r | |
653 | TestMonkey(params);\r | |
654 | #else\r | |
655 | logln("skipping TestMonkey (UCONFIG_NO_REGULAR_EXPRESSIONS)");\r | |
656 | #endif\r | |
657 | }\r | |
658 | break;\r | |
659 | case 18: name = "TestBug3818";\r | |
660 | if(exec) TestBug3818(); break;\r | |
661 | case 19: name = "TestJapaneseWordBreak";\r | |
662 | if(exec) TestJapaneseWordBreak(); break;\r | |
663 | \r | |
664 | default: name = ""; break; //needed to end loop\r | |
665 | }\r | |
666 | }\r | |
667 | \r | |
668 | \r | |
669 | //----------------------------------------------------------------------------\r | |
670 | //\r | |
671 | // generalIteratorTest Given a break iterator and a set of test data,\r | |
672 | // Run the tests and report the results.\r | |
673 | //\r | |
674 | //----------------------------------------------------------------------------\r | |
675 | void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)\r | |
676 | {\r | |
677 | \r | |
678 | bi.setText(td.fDataToBreak);\r | |
679 | \r | |
680 | testFirstAndNext(bi, td);\r | |
681 | \r | |
682 | testLastAndPrevious(bi, td);\r | |
683 | \r | |
684 | testFollowing(bi, td);\r | |
685 | testPreceding(bi, td);\r | |
686 | testIsBoundary(bi, td);\r | |
687 | doMultipleSelectionTest(bi, td);\r | |
688 | }\r | |
689 | \r | |
690 | \r | |
691 | //\r | |
692 | // testFirstAndNext. Run the iterator forwards in the obvious first(), next()\r | |
693 | // kind of loop.\r | |
694 | //\r | |
695 | void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)\r | |
696 | {\r | |
697 | UErrorCode status = U_ZERO_ERROR;\r | |
698 | int32_t p;\r | |
699 | int32_t lastP = -1;\r | |
700 | int32_t tag;\r | |
701 | \r | |
702 | logln("Test first and next");\r | |
703 | bi.setText(td.fDataToBreak);\r | |
704 | td.clearResults();\r | |
705 | \r | |
706 | for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {\r | |
707 | td.fActualBreakPositions.addElement(p, status); // Save result.\r | |
708 | tag = bi.getRuleStatus();\r | |
709 | td.fActualTags.addElement(tag, status);\r | |
710 | if (p <= lastP) {\r | |
711 | // If the iterator is not making forward progress, stop.\r | |
712 | // No need to raise an error here, it'll be detected in the normal check of results.\r | |
713 | break;\r | |
714 | }\r | |
715 | lastP = p;\r | |
716 | }\r | |
717 | td.checkResults("testFirstAndNext", this);\r | |
718 | }\r | |
719 | \r | |
720 | \r | |
721 | //\r | |
722 | // TestLastAndPrevious. Run the iterator backwards, starting with last().\r | |
723 | //\r | |
724 | void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td)\r | |
725 | {\r | |
726 | UErrorCode status = U_ZERO_ERROR;\r | |
727 | int32_t p;\r | |
728 | int32_t lastP = 0x7ffffffe;\r | |
729 | int32_t tag;\r | |
730 | \r | |
731 | logln("Test first and next");\r | |
732 | bi.setText(td.fDataToBreak);\r | |
733 | td.clearResults();\r | |
734 | \r | |
735 | for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {\r | |
736 | // Save break position. Insert it at start of vector of results, shoving\r | |
737 | // already-saved results further towards the end.\r | |
738 | td.fActualBreakPositions.insertElementAt(p, 0, status);\r | |
739 | // bi.previous(); // TODO: Why does this fix things up????\r | |
740 | // bi.next();\r | |
741 | tag = bi.getRuleStatus();\r | |
742 | td.fActualTags.insertElementAt(tag, 0, status);\r | |
743 | if (p >= lastP) {\r | |
744 | // If the iterator is not making progress, stop.\r | |
745 | // No need to raise an error here, it'll be detected in the normal check of results.\r | |
746 | break;\r | |
747 | }\r | |
748 | lastP = p;\r | |
749 | }\r | |
750 | td.checkResults("testLastAndPrevious", this);\r | |
751 | }\r | |
752 | \r | |
753 | \r | |
754 | void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)\r | |
755 | {\r | |
756 | UErrorCode status = U_ZERO_ERROR;\r | |
757 | int32_t p;\r | |
758 | int32_t tag;\r | |
759 | int32_t lastP = -2; // A value that will never be returned as a break position.\r | |
760 | // cannot be -1; that is returned for DONE.\r | |
761 | int i;\r | |
762 | \r | |
763 | logln("testFollowing():");\r | |
764 | bi.setText(td.fDataToBreak);\r | |
765 | td.clearResults();\r | |
766 | \r | |
767 | // Save the starting point, since we won't get that out of following.\r | |
768 | p = bi.first();\r | |
769 | td.fActualBreakPositions.addElement(p, status); // Save result.\r | |
770 | tag = bi.getRuleStatus();\r | |
771 | td.fActualTags.addElement(tag, status);\r | |
772 | \r | |
773 | for (i = 0; i <= td.fDataToBreak.length()+1; i++) {\r | |
774 | p = bi.following(i);\r | |
775 | if (p != lastP) {\r | |
776 | if (p == RuleBasedBreakIterator::DONE) {\r | |
777 | break;\r | |
778 | }\r | |
779 | // We've reached a new break position. Save it.\r | |
780 | td.fActualBreakPositions.addElement(p, status); // Save result.\r | |
781 | tag = bi.getRuleStatus();\r | |
782 | td.fActualTags.addElement(tag, status);\r | |
783 | lastP = p;\r | |
784 | }\r | |
785 | }\r | |
786 | // The loop normally exits by means of the break in the middle.\r | |
787 | // Make sure that the index was at the correct position for the break iterator to have\r | |
788 | // returned DONE.\r | |
789 | if (i != td.fDataToBreak.length()) {\r | |
790 | errln("testFollowing(): iterator returned DONE prematurely.");\r | |
791 | }\r | |
792 | \r | |
793 | // Full check of all results.\r | |
794 | td.checkResults("testFollowing", this);\r | |
795 | }\r | |
796 | \r | |
797 | \r | |
798 | \r | |
799 | void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) {\r | |
800 | UErrorCode status = U_ZERO_ERROR;\r | |
801 | int32_t p;\r | |
802 | int32_t tag;\r | |
803 | int32_t lastP = 0x7ffffffe;\r | |
804 | int i;\r | |
805 | \r | |
806 | logln("testPreceding():");\r | |
807 | bi.setText(td.fDataToBreak);\r | |
808 | td.clearResults();\r | |
809 | \r | |
810 | p = bi.last();\r | |
811 | td.fActualBreakPositions.addElement(p, status);\r | |
812 | tag = bi.getRuleStatus();\r | |
813 | td.fActualTags.addElement(tag, status);\r | |
814 | \r | |
815 | for (i = td.fDataToBreak.length(); i>=-1; i--) {\r | |
816 | p = bi.preceding(i);\r | |
817 | if (p != lastP) {\r | |
818 | if (p == RuleBasedBreakIterator::DONE) {\r | |
819 | break;\r | |
820 | }\r | |
821 | // We've reached a new break position. Save it.\r | |
822 | td.fActualBreakPositions.insertElementAt(p, 0, status);\r | |
823 | lastP = p;\r | |
824 | tag = bi.getRuleStatus();\r | |
825 | td.fActualTags.insertElementAt(tag, 0, status);\r | |
826 | }\r | |
827 | }\r | |
828 | // The loop normally exits by means of the break in the middle.\r | |
829 | // Make sure that the index was at the correct position for the break iterator to have\r | |
830 | // returned DONE.\r | |
831 | if (i != 0) {\r | |
832 | errln("testPreceding(): iterator returned DONE prematurely.");\r | |
833 | }\r | |
834 | \r | |
835 | // Full check of all results.\r | |
836 | td.checkResults("testPreceding", this);\r | |
837 | }\r | |
838 | \r | |
839 | \r | |
840 | \r | |
841 | void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) {\r | |
842 | UErrorCode status = U_ZERO_ERROR;\r | |
843 | int i;\r | |
844 | int32_t tag;\r | |
845 | \r | |
846 | logln("testIsBoundary():");\r | |
847 | bi.setText(td.fDataToBreak);\r | |
848 | td.clearResults();\r | |
849 | \r | |
850 | for (i = 0; i <= td.fDataToBreak.length(); i++) {\r | |
851 | if (bi.isBoundary(i)) {\r | |
852 | td.fActualBreakPositions.addElement(i, status); // Save result.\r | |
853 | tag = bi.getRuleStatus();\r | |
854 | td.fActualTags.addElement(tag, status);\r | |
855 | }\r | |
856 | }\r | |
857 | td.checkResults("testIsBoundary: ", this);\r | |
858 | }\r | |
859 | \r | |
860 | \r | |
861 | \r | |
862 | void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)\r | |
863 | {\r | |
864 | iterator.setText(td.fDataToBreak);\r | |
865 | \r | |
866 | RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();\r | |
867 | int32_t offset = iterator.first();\r | |
868 | int32_t testOffset;\r | |
869 | int32_t count = 0;\r | |
870 | \r | |
871 | logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());\r | |
872 | \r | |
873 | if (*testIterator != iterator)\r | |
874 | errln("clone() or operator!= failed: two clones compared unequal");\r | |
875 | \r | |
876 | do {\r | |
877 | testOffset = testIterator->first();\r | |
878 | testOffset = testIterator->next(count);\r | |
879 | if (offset != testOffset)\r | |
880 | errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);\r | |
881 | \r | |
882 | if (offset != RuleBasedBreakIterator::DONE) {\r | |
883 | count++;\r | |
884 | offset = iterator.next();\r | |
885 | \r | |
886 | if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {\r | |
887 | errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);\r | |
888 | if (count > 10000 || offset == -1) {\r | |
889 | errln("operator== failed too many times. Stopping test.");\r | |
890 | if (offset == -1) {\r | |
891 | errln("Does (RuleBasedBreakIterator::DONE == -1)?");\r | |
892 | }\r | |
893 | return;\r | |
894 | }\r | |
895 | }\r | |
896 | }\r | |
897 | } while (offset != RuleBasedBreakIterator::DONE);\r | |
898 | \r | |
899 | // now do it backwards...\r | |
900 | offset = iterator.last();\r | |
901 | count = 0;\r | |
902 | \r | |
903 | do {\r | |
904 | testOffset = testIterator->last();\r | |
905 | testOffset = testIterator->next(count); // next() with a negative arg is same as previous\r | |
906 | if (offset != testOffset)\r | |
907 | errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);\r | |
908 | \r | |
909 | if (offset != RuleBasedBreakIterator::DONE) {\r | |
910 | count--;\r | |
911 | offset = iterator.previous();\r | |
912 | }\r | |
913 | } while (offset != RuleBasedBreakIterator::DONE);\r | |
914 | \r | |
915 | delete testIterator;\r | |
916 | }\r | |
917 | \r | |
918 | \r | |
919 | \r | |
920 | //--------------------------------------------------------------------------------------------\r | |
921 | //\r | |
922 | // Break Iterator Invariants Tests\r | |
923 | //\r | |
924 | //--------------------------------------------------------------------------------------------\r | |
925 | \r | |
926 | void RBBITest::TestCharacterInvariants()\r | |
927 | {\r | |
928 | UErrorCode status = U_ZERO_ERROR;\r | |
929 | BreakIterator *e = BreakIterator::createCharacterInstance(Locale::getDefault(), status);\r | |
930 | if (U_FAILURE(status))\r | |
931 | {\r | |
932 | errln("Failed to create the BreakIterator for default locale in TestCharacterInvariants.\n");\r | |
933 | return;\r | |
934 | }\r | |
935 | UnicodeString s = *cannedTestChars + CharsToUnicodeString("\\u1100\\u1101\\u1102\\u1160\\u1161\\u1162\\u11a8\\u11a9\\u11aa");\r | |
936 | doBreakInvariantTest(*e, s);\r | |
937 | s = *cannedTestChars + CharsToUnicodeString("\\u1100\\u1101\\u1102\\u1160\\u1161\\u1162\\u11a8\\u11a9\\u11aa");\r | |
938 | doOtherInvariantTest(*e, s);\r | |
939 | delete e;\r | |
940 | }\r | |
941 | \r | |
942 | \r | |
943 | void RBBITest::TestWordInvariants()\r | |
944 | {\r | |
945 | UErrorCode status = U_ZERO_ERROR;\r | |
946 | BreakIterator *e = BreakIterator::createWordInstance(Locale::getDefault(), status);\r | |
947 | if (U_FAILURE(status))\r | |
948 | {\r | |
949 | errln("Failed to create the BreakIterator for default locale in TestWordInvariants.\n");\r | |
950 | return;\r | |
951 | }\r | |
952 | UnicodeString s = *cannedTestChars + CharsToUnicodeString("\',.\\u3041\\u3042\\u3043\\u309b\\u309c\\u30a1\\u30a2\\u30a3\\u4e00\\u4e01\\u4e02");\r | |
953 | doBreakInvariantTest(*e, s);\r | |
954 | s = *cannedTestChars + CharsToUnicodeString("\',.\\u3041\\u3042\\u3043\\u309b\\u309c\\u30a1\\u30a2\\u30a3\\u4e00\\u4e01\\u4e02");\r | |
955 | doOtherInvariantTest(*e, s);\r | |
956 | delete e;\r | |
957 | }\r | |
958 | \r | |
959 | \r | |
960 | void RBBITest::TestSentenceInvariants()\r | |
961 | {\r | |
962 | UErrorCode status = U_ZERO_ERROR;\r | |
963 | BreakIterator *e = BreakIterator::createSentenceInstance(Locale::getDefault(), status);\r | |
964 | if (U_FAILURE(status))\r | |
965 | {\r | |
966 | errln("Failed to create the BreakIterator for default locale in TestSentenceInvariant.\n");\r | |
967 | return;\r | |
968 | }\r | |
969 | UnicodeString s = *cannedTestChars + CharsToUnicodeString(".,\\u3001\\u3002\\u3041\\u3042\\u3043\\ufeff");\r | |
970 | doOtherInvariantTest(*e, s);\r | |
971 | delete e;\r | |
972 | }\r | |
973 | \r | |
974 | \r | |
975 | \r | |
976 | \r | |
977 | void RBBITest::doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars)\r | |
978 | {\r | |
979 | UnicodeString work("aaa");\r | |
980 | int32_t errCount = 0, testCharsLen = testChars.length(), breaksLen;\r | |
981 | \r | |
982 | // a break should always occur after CR (unless followed by LF), LF, PS, and LS\r | |
983 | UnicodeString breaks = CharsToUnicodeString("\r\n\\u2029\\u2028");\r | |
984 | int32_t i, j;\r | |
985 | \r | |
986 | breaksLen = breaks.length();\r | |
987 | for (i = 0; i < breaksLen; i++) {\r | |
988 | UChar c1 = breaks[i];\r | |
989 | work.setCharAt(1, c1);\r | |
990 | for (j = 0; j < testCharsLen; j++) {\r | |
991 | UChar c0 = testChars[j];\r | |
992 | work.setCharAt(0, c0);\r | |
993 | int k;\r | |
994 | for (k = 0; k < testCharsLen; k++) {\r | |
995 | UChar c2 = testChars[k];\r | |
996 | work.setCharAt(2, c2);\r | |
997 | \r | |
998 | // if a cr is followed by lf, ps, ls or etx, don't do the check (that's\r | |
999 | // not supposed to work)\r | |
1000 | if (c1 == '\r' && (c2 == '\n' || c2 == 0x2029\r | |
1001 | || c2 == 0x2028 || c2 == 0x0003))\r | |
1002 | continue;\r | |
1003 | \r | |
1004 | if (u_charType(c1) == U_CONTROL_CHAR &&\r | |
1005 | (u_charType(c2) == U_NON_SPACING_MARK ||\r | |
1006 | u_charType(c2) == U_ENCLOSING_MARK ||\r | |
1007 | u_charType(c2) == U_COMBINING_SPACING_MARK)\r | |
1008 | ) {\r | |
1009 | // Combining marks don't combine with controls.\r | |
1010 | // TODO: enhance test to verify that the break actually occurs,\r | |
1011 | // not just ignore the case.\r | |
1012 | continue;\r | |
1013 | }\r | |
1014 | \r | |
1015 | \r | |
1016 | tb.setText(work);\r | |
1017 | UBool seen2 = FALSE;\r | |
1018 | int l;\r | |
1019 | for (l = tb.first(); l != BreakIterator::DONE; l = tb.next()) {\r | |
1020 | if (l == 2) {\r | |
1021 | seen2 = TRUE;\r | |
1022 | break;\r | |
1023 | }\r | |
1024 | }\r | |
1025 | if (!seen2) {\r | |
1026 | printStringBreaks(work, NULL, 0); \r | |
1027 | errln("No Break between \\U%04x and \\U%04x", c1, c2);\r | |
1028 | errCount++;\r | |
1029 | if (errCount >= 75)\r | |
1030 | return;\r | |
1031 | }\r | |
1032 | }\r | |
1033 | }\r | |
1034 | }\r | |
1035 | }\r | |
1036 | \r | |
1037 | \r | |
1038 | \r | |
1039 | void RBBITest::doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars)\r | |
1040 | {\r | |
1041 | UnicodeString work("a\r\na");\r | |
1042 | int32_t errCount = 0, testCharsLen = testChars.length();\r | |
1043 | int32_t i, j;\r | |
1044 | int8_t type;\r | |
1045 | \r | |
1046 | // a break should never occur between CR and LF\r | |
1047 | for (i = 0; i < testCharsLen; i++) {\r | |
1048 | work.setCharAt(0, testChars[i]);\r | |
1049 | for (j = 0; j < testCharsLen; j++) {\r | |
1050 | work.setCharAt(3, testChars[j]);\r | |
1051 | tb.setText(work);\r | |
1052 | int32_t k;\r | |
1053 | for (k = tb.first(); k != BreakIterator::DONE; k = tb.next())\r | |
1054 | if (k == 2) {\r | |
1055 | errln("Break between CR and LF in string U\\%04x U\\%04x U\\%04x U\\%04x",\r | |
1056 | work[0], work[1], work[2], work[3]);\r | |
1057 | errCount++;\r | |
1058 | if (errCount >= 75)\r | |
1059 | return;\r | |
1060 | }\r | |
1061 | }\r | |
1062 | }\r | |
1063 | \r | |
1064 | // a break should never occur before a non-spacing mark, unless the preceding\r | |
1065 | // character is CR, LF, PS, or LS\r | |
1066 | // Or the general category == Control.\r | |
1067 | work.remove();\r | |
1068 | work += "aaaa";\r | |
1069 | for (i = 0; i < testCharsLen; i++) {\r | |
1070 | UChar c1 = testChars[i];\r | |
1071 | if (c1 == '\n' || c1 == '\r' || c1 == 0x2029 || c1 == 0x2028 || c1 == 0x0003 ||\r | |
1072 | u_charType(c1) == U_CONTROL_CHAR || u_charType(c1) == U_FORMAT_CHAR) {\r | |
1073 | continue;\r | |
1074 | }\r | |
1075 | work.setCharAt(1, c1);\r | |
1076 | for (j = 0; j < testCharsLen; j++) {\r | |
1077 | UChar c2 = testChars[j];\r | |
1078 | type = u_charType(c2);\r | |
1079 | if ((type != U_NON_SPACING_MARK) &&\r | |
1080 | (type != U_ENCLOSING_MARK)) {\r | |
1081 | continue;\r | |
1082 | }\r | |
1083 | work.setCharAt(2, c2);\r | |
1084 | tb.setText(work);\r | |
1085 | int k;\r | |
1086 | for (k = tb.first(); k != BreakIterator::DONE; k = tb.next())\r | |
1087 | if (k == 2) {\r | |
1088 | //errln("Break between U+" + UCharToUnicodeString(work[1])\r | |
1089 | // + " and U+" + UCharToUnicodeString(work[2]));\r | |
1090 | errln("Unexpected Break between %6x and %6x", c1, c2);\r | |
1091 | errCount++;\r | |
1092 | if (errCount >= 75)\r | |
1093 | return;\r | |
1094 | }\r | |
1095 | }\r | |
1096 | }\r | |
1097 | }\r | |
1098 | \r | |
1099 | \r | |
1100 | \r | |
1101 | \r | |
1102 | //---------------------------------------------\r | |
1103 | //\r | |
1104 | // other tests\r | |
1105 | //\r | |
1106 | //---------------------------------------------\r | |
1107 | void RBBITest::TestEmptyString()\r | |
1108 | {\r | |
1109 | UnicodeString text = "";\r | |
1110 | UErrorCode status = U_ZERO_ERROR;\r | |
1111 | \r | |
1112 | BITestData x(status);\r | |
1113 | ADD_DATACHUNK(x, "", 0, status); // Break at start of data\r | |
1114 | RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);\r | |
1115 | if (U_FAILURE(status))\r | |
1116 | {\r | |
1117 | errln("Failed to create the BreakIterator for default locale in TestEmptyString.\n");\r | |
1118 | return;\r | |
1119 | }\r | |
1120 | generalIteratorTest(*bi, x);\r | |
1121 | delete bi;\r | |
1122 | }\r | |
1123 | \r | |
1124 | void RBBITest::TestGetAvailableLocales()\r | |
1125 | {\r | |
1126 | int32_t locCount = 0;\r | |
1127 | const Locale* locList = BreakIterator::getAvailableLocales(locCount);\r | |
1128 | \r | |
1129 | if (locCount == 0)\r | |
1130 | errln("getAvailableLocales() returned an empty list!");\r | |
1131 | // Just make sure that it's returning good memory.\r | |
1132 | int32_t i;\r | |
1133 | for (i = 0; i < locCount; ++i) {\r | |
1134 | logln(locList[i].getName());\r | |
1135 | }\r | |
1136 | }\r | |
1137 | \r | |
1138 | //Testing the BreakIterator::getDisplayName() function\r | |
1139 | void RBBITest::TestGetDisplayName()\r | |
1140 | {\r | |
1141 | UnicodeString result;\r | |
1142 | \r | |
1143 | BreakIterator::getDisplayName(Locale::getUS(), result);\r | |
1144 | if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")\r | |
1145 | errln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""\r | |
1146 | + result);\r | |
1147 | \r | |
1148 | BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);\r | |
1149 | if (result != "French (France)")\r | |
1150 | errln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""\r | |
1151 | + result);\r | |
1152 | }\r | |
1153 | /**\r | |
1154 | * Test End Behaviour\r | |
1155 | * @bug 4068137\r | |
1156 | */\r | |
1157 | void RBBITest::TestEndBehaviour()\r | |
1158 | {\r | |
1159 | UErrorCode status = U_ZERO_ERROR;\r | |
1160 | UnicodeString testString("boo.");\r | |
1161 | BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);\r | |
1162 | if (U_FAILURE(status))\r | |
1163 | {\r | |
1164 | errln("Failed to create the BreakIterator for default locale in TestEndBehaviour.\n");\r | |
1165 | return;\r | |
1166 | }\r | |
1167 | wb->setText(testString);\r | |
1168 | \r | |
1169 | if (wb->first() != 0)\r | |
1170 | errln("Didn't get break at beginning of string.");\r | |
1171 | if (wb->next() != 3)\r | |
1172 | errln("Didn't get break before period in \"boo.\"");\r | |
1173 | if (wb->current() != 4 && wb->next() != 4)\r | |
1174 | errln("Didn't get break at end of string.");\r | |
1175 | delete wb;\r | |
1176 | }\r | |
1177 | /*\r | |
1178 | * @bug 4153072\r | |
1179 | */\r | |
1180 | void RBBITest::TestBug4153072() {\r | |
1181 | UErrorCode status = U_ZERO_ERROR;\r | |
1182 | BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);\r | |
1183 | if (U_FAILURE(status))\r | |
1184 | {\r | |
1185 | errln("Failed to create the BreakIterator for default locale in TestBug4153072\n");\r | |
1186 | return;\r | |
1187 | }\r | |
1188 | UnicodeString str("...Hello, World!...");\r | |
1189 | int32_t begin = 3;\r | |
1190 | int32_t end = str.length() - 3;\r | |
1191 | UBool dummy;\r | |
1192 | \r | |
1193 | StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);\r | |
1194 | iter->adoptText(textIterator);\r | |
1195 | int index;\r | |
1196 | for (index = -1; index < begin + 1; ++index) {\r | |
1197 | dummy = iter->isBoundary(index);\r | |
1198 | if (index < begin && dummy == TRUE) {\r | |
1199 | errln((UnicodeString)"Didn't handle preceeding correctly with offset = " + index +\r | |
1200 | " and begin index = " + begin);\r | |
1201 | }\r | |
1202 | }\r | |
1203 | delete iter;\r | |
1204 | }\r | |
1205 | \r | |
1206 | \r | |
1207 | /**\r | |
1208 | * Test Japanese Line Break\r | |
1209 | * @bug 4095322\r | |
1210 | */\r | |
1211 | void RBBITest::TestJapaneseLineBreak()\r | |
1212 | {\r | |
1213 | #if 0\r | |
1214 | // Test needs updating some more... Dump it for now.\r | |
1215 | \r | |
1216 | \r | |
1217 | // Change for Unicode TR 14: Punctuation characters with categories Pi and Pf do not count\r | |
1218 | // as opening and closing punctuation for line breaking.\r | |
1219 | // Also, \u30fc and \u30fe are not counted as hyphens. Remove these chars\r | |
1220 | // from these tests. 6-13-2002\r | |
1221 | //\r | |
1222 | UErrorCode status = U_ZERO_ERROR;\r | |
1223 | UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c");\r | |
1224 | UnicodeString precedingChars = CharsToUnicodeString(\r | |
1225 | //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");\r | |
1226 | "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");\r | |
1227 | UnicodeString followingChars = CharsToUnicodeString(\r | |
1228 | // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"\r | |
1229 | ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"\r | |
1230 | // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"\r | |
1231 | ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"\r | |
1232 | "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");\r | |
1233 | BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status);\r | |
1234 | \r | |
1235 | int32_t i;\r | |
1236 | if (U_FAILURE(status))\r | |
1237 | {\r | |
1238 | errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");\r | |
1239 | return;\r | |
1240 | }\r | |
1241 | \r | |
1242 | for (i = 0; i < precedingChars.length(); i++) {\r | |
1243 | testString.setCharAt(1, precedingChars[i]);\r | |
1244 | iter->setText(testString);\r | |
1245 | int32_t j = iter->first();\r | |
1246 | if (j != 0)\r | |
1247 | errln("ja line break failure: failed to start at 0");\r | |
1248 | j = iter->next();\r | |
1249 | if (j != 1)\r | |
1250 | errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i])\r | |
1251 | + "' (" + ((int)(precedingChars[i])) + ")");\r | |
1252 | j = iter->next();\r | |
1253 | if (j != 3)\r | |
1254 | errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i])\r | |
1255 | + "' (" + ((int)(precedingChars[i])) + ")");\r | |
1256 | }\r | |
1257 | \r | |
1258 | for (i = 0; i < followingChars.length(); i++) {\r | |
1259 | testString.setCharAt(1, followingChars[i]);\r | |
1260 | iter->setText(testString);\r | |
1261 | int j = iter->first();\r | |
1262 | if (j != 0)\r | |
1263 | errln("ja line break failure: failed to start at 0");\r | |
1264 | j = iter->next();\r | |
1265 | if (j != 2)\r | |
1266 | errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i])\r | |
1267 | + "' (" + ((int)(followingChars[i])) + ")");\r | |
1268 | j = iter->next();\r | |
1269 | if (j != 3)\r | |
1270 | errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i])\r | |
1271 | + "' (" + ((int)(followingChars[i])) + ")");\r | |
1272 | }\r | |
1273 | delete iter;\r | |
1274 | #endif\r | |
1275 | }\r | |
1276 | \r | |
1277 | \r | |
1278 | //------------------------------------------------------------------------------\r | |
1279 | //\r | |
1280 | // RBBITest::Extended Run RBBI Tests from an external test data file\r | |
1281 | //\r | |
1282 | //------------------------------------------------------------------------------\r | |
1283 | \r | |
1284 | struct TestParams {\r | |
1285 | BreakIterator *bi;\r | |
1286 | UnicodeString dataToBreak;\r | |
1287 | UVector32 *expectedBreaks;\r | |
1288 | UVector32 *srcLine;\r | |
1289 | UVector32 *srcCol;\r | |
1290 | };\r | |
1291 | \r | |
1292 | void RBBITest::executeTest(TestParams *t) {\r | |
1293 | int32_t bp;\r | |
1294 | int32_t prevBP;\r | |
1295 | int32_t i;\r | |
1296 | \r | |
1297 | t->bi->setText(t->dataToBreak);\r | |
1298 | //\r | |
1299 | // Run the iterator forward\r | |
1300 | //\r | |
1301 | prevBP = -1;\r | |
1302 | for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {\r | |
1303 | if (prevBP == bp) {\r | |
1304 | // Fail for lack of forward progress.\r | |
1305 | errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",\r | |
1306 | bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));\r | |
1307 | break;\r | |
1308 | }\r | |
1309 | \r | |
1310 | // Check that there were we didn't miss an expected break between the last one\r | |
1311 | // and this one.\r | |
1312 | for (i=prevBP+1; i<bp; i++) {\r | |
1313 | if (t->expectedBreaks->elementAti(i) != 0) {\r | |
1314 | int expected[] = {0, i};\r | |
1315 | printStringBreaks(t->dataToBreak, expected, 2);\r | |
1316 | errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",\r | |
1317 | i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));\r | |
1318 | }\r | |
1319 | }\r | |
1320 | \r | |
1321 | // Check that the break we did find was expected\r | |
1322 | if (t->expectedBreaks->elementAti(bp) == 0) {\r | |
1323 | int expected[] = {0, bp};\r | |
1324 | printStringBreaks(t->dataToBreak, expected, 2);\r | |
1325 | errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",\r | |
1326 | bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));\r | |
1327 | } else {\r | |
1328 | // The break was expected.\r | |
1329 | // Check that the {nnn} tag value is correct.\r | |
1330 | int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);\r | |
1331 | if (expectedTagVal == -1) {\r | |
1332 | expectedTagVal = 0;\r | |
1333 | }\r | |
1334 | int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();\r | |
1335 | if (rs != expectedTagVal) {\r | |
1336 | errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"\r | |
1337 | " Actual, Expected status = %4d, %4d",\r | |
1338 | bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp), rs, expectedTagVal);\r | |
1339 | }\r | |
1340 | }\r | |
1341 | \r | |
1342 | \r | |
1343 | prevBP = bp;\r | |
1344 | }\r | |
1345 | \r | |
1346 | // Verify that there were no missed expected breaks after the last one found\r | |
1347 | for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {\r | |
1348 | if (t->expectedBreaks->elementAti(i) != 0) {\r | |
1349 | errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",\r | |
1350 | i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));\r | |
1351 | }\r | |
1352 | }\r | |
1353 | \r | |
1354 | //\r | |
1355 | // Run the iterator backwards, verify that the same breaks are found.\r | |
1356 | //\r | |
1357 | prevBP = t->dataToBreak.length()+2; // start with a phony value for the last break pos seen.\r | |
1358 | for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {\r | |
1359 | if (prevBP == bp) {\r | |
1360 | // Fail for lack of progress.\r | |
1361 | errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",\r | |
1362 | bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));\r | |
1363 | break;\r | |
1364 | }\r | |
1365 | \r | |
1366 | // Check that there were we didn't miss an expected break between the last one\r | |
1367 | // and this one. (UVector returns zeros for index out of bounds.)\r | |
1368 | for (i=prevBP-1; i>bp; i--) {\r | |
1369 | if (t->expectedBreaks->elementAti(i) != 0) {\r | |
1370 | errln("Reverse Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",\r | |
1371 | i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));\r | |
1372 | }\r | |
1373 | }\r | |
1374 | \r | |
1375 | // Check that the break we did find was expected\r | |
1376 | if (t->expectedBreaks->elementAti(bp) == 0) {\r | |
1377 | errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",\r | |
1378 | bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));\r | |
1379 | } else {\r | |
1380 | // The break was expected.\r | |
1381 | // Check that the {nnn} tag value is correct.\r | |
1382 | int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);\r | |
1383 | if (expectedTagVal == -1) {\r | |
1384 | expectedTagVal = 0;\r | |
1385 | }\r | |
1386 | int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();\r | |
1387 | if (rs != expectedTagVal) {\r | |
1388 | errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"\r | |
1389 | " Actual, Expected status = %4d, %4d",\r | |
1390 | bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp), rs, expectedTagVal);\r | |
1391 | }\r | |
1392 | }\r | |
1393 | \r | |
1394 | prevBP = bp;\r | |
1395 | }\r | |
1396 | \r | |
1397 | // Verify that there were no missed breaks prior to the last one found\r | |
1398 | for (i=prevBP-1; i>=0; i--) {\r | |
1399 | if (t->expectedBreaks->elementAti(i) != 0) {\r | |
1400 | errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",\r | |
1401 | i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));\r | |
1402 | }\r | |
1403 | }\r | |
1404 | }\r | |
1405 | \r | |
1406 | \r | |
1407 | void RBBITest::TestExtended() {\r | |
1408 | UErrorCode status = U_ZERO_ERROR;\r | |
1409 | Locale locale = Locale::getDefault();\r | |
1410 | \r | |
1411 | UnicodeString rules;\r | |
1412 | TestParams tp;\r | |
1413 | tp.bi = NULL;\r | |
1414 | tp.expectedBreaks = new UVector32(status);\r | |
1415 | tp.srcLine = new UVector32(status);\r | |
1416 | tp.srcCol = new UVector32(status);\r | |
1417 | \r | |
1418 | \r | |
1419 | //\r | |
1420 | // Open and read the test data file.\r | |
1421 | //\r | |
1422 | const char *testDataDirectory = IntlTest::getSourceTestData(status);\r | |
1423 | char testFileName[1000];\r | |
1424 | if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {\r | |
1425 | errln("Can't open test data. Path too long.");\r | |
1426 | return;\r | |
1427 | }\r | |
1428 | strcpy(testFileName, testDataDirectory);\r | |
1429 | strcat(testFileName, "rbbitst.txt");\r | |
1430 | \r | |
1431 | int len;\r | |
1432 | UChar *testFile = ReadAndConvertFile(testFileName, len, status);\r | |
1433 | if (U_FAILURE(status)) {\r | |
1434 | return; /* something went wrong, error already output */\r | |
1435 | }\r | |
1436 | \r | |
1437 | \r | |
1438 | \r | |
1439 | //\r | |
1440 | // Put the test data into a UnicodeString\r | |
1441 | //\r | |
1442 | UnicodeString testString(FALSE, testFile, len);\r | |
1443 | \r | |
1444 | enum EParseState{\r | |
1445 | PARSE_COMMENT,\r | |
1446 | PARSE_TAG,\r | |
1447 | PARSE_DATA,\r | |
1448 | PARSE_NUM\r | |
1449 | }\r | |
1450 | parseState = PARSE_TAG;\r | |
1451 | \r | |
1452 | EParseState savedState = PARSE_TAG;\r | |
1453 | \r | |
1454 | static const UChar CH_LF = 0x0a;\r | |
1455 | static const UChar CH_CR = 0x0d;\r | |
1456 | static const UChar CH_HASH = 0x23;\r | |
1457 | /*static const UChar CH_PERIOD = 0x2e;*/\r | |
1458 | static const UChar CH_LT = 0x3c;\r | |
1459 | static const UChar CH_GT = 0x3e;\r | |
1460 | static const UChar CH_BACKSLASH = 0x5c;\r | |
1461 | static const UChar CH_BULLET = 0x2022;\r | |
1462 | \r | |
1463 | int32_t lineNum = 1;\r | |
1464 | int32_t colStart = 0;\r | |
1465 | int32_t column = 0;\r | |
1466 | int32_t charIdx = 0;\r | |
1467 | \r | |
1468 | int32_t tagValue = 0; // The numeric value of a <nnn> tag.\r | |
1469 | \r | |
1470 | for (charIdx = 0; charIdx < len; ) {\r | |
1471 | UChar c = testString.charAt(charIdx);\r | |
1472 | charIdx++;\r | |
1473 | if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {\r | |
1474 | // treat CRLF as a unit\r | |
1475 | c = CH_LF;\r | |
1476 | charIdx++;\r | |
1477 | }\r | |
1478 | if (c == CH_LF || c == CH_CR) {\r | |
1479 | lineNum++;\r | |
1480 | colStart = charIdx;\r | |
1481 | }\r | |
1482 | column = charIdx - colStart + 1;\r | |
1483 | \r | |
1484 | switch (parseState) {\r | |
1485 | case PARSE_COMMENT:\r | |
1486 | if (c == 0x0a || c == 0x0d) {\r | |
1487 | parseState = savedState;\r | |
1488 | }\r | |
1489 | break;\r | |
1490 | \r | |
1491 | case PARSE_TAG:\r | |
1492 | {\r | |
1493 | if (c == CH_HASH) {\r | |
1494 | parseState = PARSE_COMMENT;\r | |
1495 | savedState = PARSE_TAG;\r | |
1496 | break;\r | |
1497 | }\r | |
1498 | if (u_isUWhiteSpace(c)) {\r | |
1499 | break;\r | |
1500 | }\r | |
1501 | if (testString.compare(charIdx-1, 6, "<word>") == 0) {\r | |
1502 | delete tp.bi;\r | |
1503 | tp.bi = BreakIterator::createWordInstance(locale, status);\r | |
1504 | charIdx += 5;\r | |
1505 | break;\r | |
1506 | }\r | |
1507 | if (testString.compare(charIdx-1, 6, "<char>") == 0) {\r | |
1508 | delete tp.bi;\r | |
1509 | tp.bi = BreakIterator::createCharacterInstance(locale, status);\r | |
1510 | charIdx += 5;\r | |
1511 | break;\r | |
1512 | }\r | |
1513 | if (testString.compare(charIdx-1, 6, "<line>") == 0) {\r | |
1514 | delete tp.bi;\r | |
1515 | tp.bi = BreakIterator::createLineInstance(locale, status);\r | |
1516 | charIdx += 5;\r | |
1517 | break;\r | |
1518 | }\r | |
1519 | if (testString.compare(charIdx-1, 6, "<sent>") == 0) {\r | |
1520 | delete tp.bi;\r | |
1521 | tp.bi = BreakIterator::createSentenceInstance(locale, status);\r | |
1522 | charIdx += 5;\r | |
1523 | break;\r | |
1524 | }\r | |
1525 | if (testString.compare(charIdx-1, 7, "<title>") == 0) {\r | |
1526 | delete tp.bi;\r | |
1527 | tp.bi = BreakIterator::createTitleInstance(locale, status);\r | |
1528 | charIdx += 6;\r | |
1529 | break;\r | |
1530 | }\r | |
1531 | if (testString.compare(charIdx-1, 6, "<data>") == 0) {\r | |
1532 | parseState = PARSE_DATA;\r | |
1533 | charIdx += 5;\r | |
1534 | tp.dataToBreak = "";\r | |
1535 | tp.expectedBreaks->removeAllElements();\r | |
1536 | tp.srcCol ->removeAllElements();\r | |
1537 | tp.srcLine->removeAllElements();\r | |
1538 | break;\r | |
1539 | }\r | |
1540 | \r | |
1541 | errln("line %d: Tag expected in test file.", lineNum);\r | |
1542 | goto end_test;\r | |
1543 | parseState = PARSE_COMMENT;\r | |
1544 | savedState = PARSE_DATA;\r | |
1545 | }\r | |
1546 | break;\r | |
1547 | \r | |
1548 | case PARSE_DATA:\r | |
1549 | if (c == CH_BULLET) {\r | |
1550 | int32_t breakIdx = tp.dataToBreak.length();\r | |
1551 | tp.expectedBreaks->setSize(breakIdx+1);\r | |
1552 | tp.expectedBreaks->setElementAt(-1, breakIdx);\r | |
1553 | tp.srcLine->setSize(breakIdx+1);\r | |
1554 | tp.srcLine->setElementAt(lineNum, breakIdx);\r | |
1555 | tp.srcCol ->setSize(breakIdx+1);\r | |
1556 | tp.srcCol ->setElementAt(column, breakIdx);\r | |
1557 | break;\r | |
1558 | }\r | |
1559 | \r | |
1560 | if (testString.compare(charIdx-1, 7, "</data>") == 0) {\r | |
1561 | // Add final entry to mappings from break location to source file position.\r | |
1562 | // Need one extra because last break position returned is after the\r | |
1563 | // last char in the data, not at the last char.\r | |
1564 | tp.srcLine->addElement(lineNum, status);\r | |
1565 | tp.srcCol ->addElement(column, status);\r | |
1566 | \r | |
1567 | parseState = PARSE_TAG;\r | |
1568 | charIdx += 7;\r | |
1569 | \r | |
1570 | // RUN THE TEST!\r | |
1571 | executeTest(&tp);\r | |
1572 | break;\r | |
1573 | }\r | |
1574 | \r | |
1575 | if (testString.compare(charIdx-1, 3, "\\N{") == 0) {\r | |
1576 | // Named character, e.g. \N{COMBINING GRAVE ACCENT}\r | |
1577 | // Get the code point from the name and insert it into the test data.\r | |
1578 | // (Damn, no API takes names in Unicode !!!\r | |
1579 | // we've got to take it back to char *)\r | |
1580 | int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);\r | |
1581 | int32_t nameLength = nameEndIdx - (charIdx+2);\r | |
1582 | char charNameBuf[200];\r | |
1583 | UChar32 theChar = -1;\r | |
1584 | if (nameEndIdx != -1) {\r | |
1585 | UErrorCode status = U_ZERO_ERROR;\r | |
1586 | testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));\r | |
1587 | charNameBuf[sizeof(charNameBuf)-1] = 0;\r | |
1588 | theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);\r | |
1589 | if (U_FAILURE(status)) {\r | |
1590 | theChar = -1;\r | |
1591 | }\r | |
1592 | }\r | |
1593 | if (theChar == -1) {\r | |
1594 | errln("Error in named character in test file at line %d, col %d",\r | |
1595 | lineNum, column);\r | |
1596 | } else {\r | |
1597 | // Named code point was recognized. Insert it\r | |
1598 | // into the test data.\r | |
1599 | tp.dataToBreak.append(theChar);\r | |
1600 | while (tp.dataToBreak.length() > tp.srcLine->size()) {\r | |
1601 | tp.srcLine->addElement(lineNum, status);\r | |
1602 | tp.srcCol ->addElement(column, status);\r | |
1603 | }\r | |
1604 | }\r | |
1605 | if (nameEndIdx > charIdx) {\r | |
1606 | charIdx = nameEndIdx+1;\r | |
1607 | }\r | |
1608 | break;\r | |
1609 | }\r | |
1610 | \r | |
1611 | \r | |
1612 | \r | |
1613 | \r | |
1614 | if (testString.compare(charIdx-1, 2, "<>") == 0) {\r | |
1615 | charIdx++;\r | |
1616 | int32_t breakIdx = tp.dataToBreak.length();\r | |
1617 | tp.expectedBreaks->setSize(breakIdx+1);\r | |
1618 | tp.expectedBreaks->setElementAt(-1, breakIdx);\r | |
1619 | tp.srcLine->setSize(breakIdx+1);\r | |
1620 | tp.srcLine->setElementAt(lineNum, breakIdx);\r | |
1621 | tp.srcCol ->setSize(breakIdx+1);\r | |
1622 | tp.srcCol ->setElementAt(column, breakIdx);\r | |
1623 | break;\r | |
1624 | }\r | |
1625 | \r | |
1626 | if (c == CH_LT) {\r | |
1627 | tagValue = 0;\r | |
1628 | parseState = PARSE_NUM;\r | |
1629 | break;\r | |
1630 | }\r | |
1631 | \r | |
1632 | if (c == CH_HASH && column==3) { // TODO: why is column off so far?\r | |
1633 | parseState = PARSE_COMMENT;\r | |
1634 | savedState = PARSE_DATA;\r | |
1635 | break;\r | |
1636 | }\r | |
1637 | \r | |
1638 | if (c == CH_BACKSLASH) {\r | |
1639 | // Check for \ at end of line, a line continuation.\r | |
1640 | // Advance over (discard) the newline\r | |
1641 | UChar32 cp = testString.char32At(charIdx);\r | |
1642 | if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {\r | |
1643 | // We have a CR LF\r | |
1644 | // Need an extra increment of the input ptr to move over both of them\r | |
1645 | charIdx++;\r | |
1646 | }\r | |
1647 | if (cp == CH_LF || cp == CH_CR) {\r | |
1648 | lineNum++;\r | |
1649 | colStart = charIdx;\r | |
1650 | charIdx++;\r | |
1651 | break;\r | |
1652 | }\r | |
1653 | \r | |
1654 | // Let unescape handle the back slash.\r | |
1655 | cp = testString.unescapeAt(charIdx);\r | |
1656 | if (cp != -1) {\r | |
1657 | // Escape sequence was recognized. Insert the char\r | |
1658 | // into the test data.\r | |
1659 | tp.dataToBreak.append(cp);\r | |
1660 | while (tp.dataToBreak.length() > tp.srcLine->size()) {\r | |
1661 | tp.srcLine->addElement(lineNum, status);\r | |
1662 | tp.srcCol ->addElement(column, status);\r | |
1663 | }\r | |
1664 | break;\r | |
1665 | }\r | |
1666 | \r | |
1667 | \r | |
1668 | // Not a recognized backslash escape sequence.\r | |
1669 | // Take the next char as a literal.\r | |
1670 | // TODO: Should this be an error?\r | |
1671 | c = testString.charAt(charIdx);\r | |
1672 | charIdx = testString.moveIndex32(charIdx, 1);\r | |
1673 | }\r | |
1674 | \r | |
1675 | // Normal, non-escaped data char.\r | |
1676 | tp.dataToBreak.append(c);\r | |
1677 | \r | |
1678 | // Save the mapping from offset in the data to line/column numbers in\r | |
1679 | // the original input file. Will be used for better error messages only.\r | |
1680 | // If there's an expected break before this char, the slot in the mapping\r | |
1681 | // vector will already be set for this char; don't overwrite it.\r | |
1682 | if (tp.dataToBreak.length() > tp.srcLine->size()) {\r | |
1683 | tp.srcLine->addElement(lineNum, status);\r | |
1684 | tp.srcCol ->addElement(column, status);\r | |
1685 | }\r | |
1686 | break;\r | |
1687 | \r | |
1688 | \r | |
1689 | case PARSE_NUM:\r | |
1690 | // We are parsing an expected numeric tag value, like <1234>,\r | |
1691 | // within a chunk of data.\r | |
1692 | if (u_isUWhiteSpace(c)) {\r | |
1693 | break;\r | |
1694 | }\r | |
1695 | \r | |
1696 | if (c == CH_GT) {\r | |
1697 | // Finished the number. Add the info to the expected break data,\r | |
1698 | // and switch parse state back to doing plain data.\r | |
1699 | parseState = PARSE_DATA;\r | |
1700 | if (tagValue == 0) {\r | |
1701 | tagValue = -1;\r | |
1702 | }\r | |
1703 | int32_t breakIdx = tp.dataToBreak.length();\r | |
1704 | tp.expectedBreaks->setSize(breakIdx+1);\r | |
1705 | tp.expectedBreaks->setElementAt(tagValue, breakIdx);\r | |
1706 | tp.srcLine->setSize(breakIdx+1);\r | |
1707 | tp.srcLine->setElementAt(lineNum, breakIdx);\r | |
1708 | tp.srcCol ->setSize(breakIdx+1);\r | |
1709 | tp.srcCol ->setElementAt(column, breakIdx);\r | |
1710 | break;\r | |
1711 | }\r | |
1712 | \r | |
1713 | if (u_isdigit(c)) {\r | |
1714 | tagValue = tagValue*10 + u_charDigitValue(c);\r | |
1715 | break;\r | |
1716 | }\r | |
1717 | \r | |
1718 | errln("Syntax Error in test file at line %d, col %d",\r | |
1719 | lineNum, column);\r | |
1720 | goto end_test;\r | |
1721 | parseState = PARSE_COMMENT;\r | |
1722 | break;\r | |
1723 | }\r | |
1724 | \r | |
1725 | \r | |
1726 | if (U_FAILURE(status)) {\r | |
1727 | errln("ICU Error %s while parsing test file at line %d.",\r | |
1728 | u_errorName(status), lineNum);\r | |
1729 | goto end_test;\r | |
1730 | status = U_ZERO_ERROR;\r | |
1731 | }\r | |
1732 | \r | |
1733 | }\r | |
1734 | \r | |
1735 | end_test:\r | |
1736 | delete tp.bi;\r | |
1737 | delete tp.expectedBreaks;\r | |
1738 | delete tp.srcLine;\r | |
1739 | delete tp.srcCol;\r | |
1740 | delete [] testFile;\r | |
1741 | }\r | |
1742 | \r | |
1743 | \r | |
1744 | //-------------------------------------------------------------------------------\r | |
1745 | //\r | |
1746 | // ReadAndConvertFile Read a text data file, convert it to UChars, and\r | |
1747 | // return the data in one big UChar * buffer, which the caller must delete.\r | |
1748 | //\r | |
1749 | // TODO: This is a clone of RegexTest::ReadAndConvertFile.\r | |
1750 | // Move this function to some common place.\r | |
1751 | //\r | |
1752 | //--------------------------------------------------------------------------------\r | |
1753 | UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, UErrorCode &status) {\r | |
1754 | UChar *retPtr = NULL;\r | |
1755 | char *fileBuf = NULL;\r | |
1756 | UConverter* conv = NULL;\r | |
1757 | FILE *f = NULL;\r | |
1758 | \r | |
1759 | ulen = 0;\r | |
1760 | if (U_FAILURE(status)) {\r | |
1761 | return retPtr;\r | |
1762 | }\r | |
1763 | \r | |
1764 | //\r | |
1765 | // Open the file.\r | |
1766 | //\r | |
1767 | f = fopen(fileName, "rb");\r | |
1768 | if (f == 0) {\r | |
1769 | errln("Error opening test data file %s\n", fileName);\r | |
1770 | status = U_FILE_ACCESS_ERROR;\r | |
1771 | return NULL;\r | |
1772 | }\r | |
1773 | //\r | |
1774 | // Read it in\r | |
1775 | //\r | |
1776 | int fileSize;\r | |
1777 | int amt_read;\r | |
1778 | \r | |
1779 | fseek( f, 0, SEEK_END);\r | |
1780 | fileSize = ftell(f);\r | |
1781 | fileBuf = new char[fileSize];\r | |
1782 | fseek(f, 0, SEEK_SET);\r | |
1783 | amt_read = fread(fileBuf, 1, fileSize, f);\r | |
1784 | if (amt_read != fileSize || fileSize <= 0) {\r | |
1785 | errln("Error reading test data file.");\r | |
1786 | goto cleanUpAndReturn;\r | |
1787 | }\r | |
1788 | \r | |
1789 | //\r | |
1790 | // Look for a Unicode Signature (BOM) on the data just read\r | |
1791 | //\r | |
1792 | int32_t signatureLength;\r | |
1793 | const char * fileBufC;\r | |
1794 | const char* encoding;\r | |
1795 | \r | |
1796 | fileBufC = fileBuf;\r | |
1797 | encoding = ucnv_detectUnicodeSignature(\r | |
1798 | fileBuf, fileSize, &signatureLength, &status);\r | |
1799 | if(encoding!=NULL ){\r | |
1800 | fileBufC += signatureLength;\r | |
1801 | fileSize -= signatureLength;\r | |
1802 | }\r | |
1803 | \r | |
1804 | //\r | |
1805 | // Open a converter to take the rule file to UTF-16\r | |
1806 | //\r | |
1807 | conv = ucnv_open(encoding, &status);\r | |
1808 | if (U_FAILURE(status)) {\r | |
1809 | goto cleanUpAndReturn;\r | |
1810 | }\r | |
1811 | \r | |
1812 | //\r | |
1813 | // Convert the rules to UChar.\r | |
1814 | // Preflight first to determine required buffer size.\r | |
1815 | //\r | |
1816 | ulen = ucnv_toUChars(conv,\r | |
1817 | NULL, // dest,\r | |
1818 | 0, // destCapacity,\r | |
1819 | fileBufC,\r | |
1820 | fileSize,\r | |
1821 | &status);\r | |
1822 | if (status == U_BUFFER_OVERFLOW_ERROR) {\r | |
1823 | // Buffer Overflow is expected from the preflight operation.\r | |
1824 | status = U_ZERO_ERROR;\r | |
1825 | \r | |
1826 | retPtr = new UChar[ulen+1];\r | |
1827 | ucnv_toUChars(conv,\r | |
1828 | retPtr, // dest,\r | |
1829 | ulen+1,\r | |
1830 | fileBufC,\r | |
1831 | fileSize,\r | |
1832 | &status);\r | |
1833 | }\r | |
1834 | \r | |
1835 | cleanUpAndReturn:\r | |
1836 | fclose(f);\r | |
1837 | delete fileBuf;\r | |
1838 | ucnv_close(conv);\r | |
1839 | if (U_FAILURE(status)) {\r | |
1840 | errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));\r | |
1841 | delete retPtr;\r | |
1842 | retPtr = 0;\r | |
1843 | ulen = 0;\r | |
1844 | };\r | |
1845 | return retPtr;\r | |
1846 | }\r | |
1847 | \r | |
1848 | \r | |
1849 | //--------------------------------------------------------------------------------------------\r | |
1850 | //\r | |
1851 | // Exhaustive Tests, using Unicode Data Files.\r | |
1852 | //\r | |
1853 | //--------------------------------------------------------------------------------------------\r | |
1854 | \r | |
1855 | //\r | |
1856 | // Token level scanner for the Unicode Line Break Test Data file.\r | |
1857 | // Return the next token, as follows:\r | |
1858 | // >= 0: a UChar32 character, scanned from hex in the file.\r | |
1859 | // -1: a break position, a division sign in the file.\r | |
1860 | // -2: end of rule. A new line in the file.\r | |
1861 | // -3: end of file. No more rules.\r | |
1862 | // -4: Error\r | |
1863 | //\r | |
1864 | // The scanner\r | |
1865 | // strips comments, ('#' to end of line)\r | |
1866 | // Recognizes CR, CR/LF and LF as new lines.\r | |
1867 | // Skips over spaces and Xs (don't break here) in the data.\r | |
1868 | //\r | |
1869 | struct ScanState {\r | |
1870 | int32_t fPeekChar;\r | |
1871 | UBool fPeeked;\r | |
1872 | int32_t fLineNum;\r | |
1873 | FILE *fFile;\r | |
1874 | ScanState() :fPeeked(FALSE), fLineNum(0), fFile(NULL) {};\r | |
1875 | };\r | |
1876 | \r | |
1877 | // Literal characters that are of interest. In hex to keep EBCDIC based machines happy.\r | |
1878 | // The data itself is latin-1 on all platforms.\r | |
static const int32_t    chTab    = 0x09;    // horizontal tab
static const int32_t    chLF     = 0x0A;    // line feed
static const int32_t    chCR     = 0x0D;    // carriage return
static const int32_t    chSpace  = 0x20;    // space
static const int32_t    chHash   = 0x23;    // '#', starts a comment
static const int32_t    chMult   = 0xD7;    // multiplication sign, marks a no-break position
static const int32_t    chDivide = 0xF7;    // division sign, marks an expected break position
1886 | \r | |
1887 | static int32_t nextLBDToken(ScanState *s) {\r | |
1888 | int32_t c;\r | |
1889 | \r | |
1890 | // Read characters from the input file until we get something interesting\r | |
1891 | // to return. The file is in latin-1 encoding.\r | |
1892 | for (;;) {\r | |
1893 | // Get the next character to look at,\r | |
1894 | if (s->fPeeked) {\r | |
1895 | c = s->fPeekChar;\r | |
1896 | s->fPeeked = FALSE;\r | |
1897 | } else {\r | |
1898 | c = getc(s->fFile);\r | |
1899 | }\r | |
1900 | \r | |
1901 | // EOF. Return immediately.\r | |
1902 | if (c == EOF) {\r | |
1903 | return -3;\r | |
1904 | }\r | |
1905 | \r | |
1906 | // Spaces. Treat the multiply sign as a space - it indicates a no-break position\r | |
1907 | // in the data, and the test program doesn't want to see them.\r | |
1908 | // Continue the next char loop, looking for something significant.\r | |
1909 | if (c == chSpace || c == chTab || c == chMult) {\r | |
1910 | continue;\r | |
1911 | }\r | |
1912 | \r | |
1913 | // Divide sign. Indicates an expected break position.\r | |
1914 | if (c == chDivide) {\r | |
1915 | return -1;\r | |
1916 | }\r | |
1917 | \r | |
1918 | // New Line Handling. Keep track of line number in the file, which in turn\r | |
1919 | // requires keeping track of CR/LF as a single new line.\r | |
1920 | if (c == chCR) {\r | |
1921 | s->fLineNum++;\r | |
1922 | s->fPeekChar = getc(s->fFile);\r | |
1923 | if (s->fPeekChar != chLF) {s->fPeeked = TRUE;};\r | |
1924 | return -2;\r | |
1925 | }\r | |
1926 | if (c == chLF) {\r | |
1927 | s->fLineNum++;\r | |
1928 | return -2;\r | |
1929 | }\r | |
1930 | \r | |
1931 | // Comments. Consume everything up to the next new line.\r | |
1932 | if (c == chHash) {\r | |
1933 | do {\r | |
1934 | c = getc(s->fFile);\r | |
1935 | } while (!(c == EOF || c == chCR || c == chLF));\r | |
1936 | s->fPeekChar = c;\r | |
1937 | s->fPeeked = TRUE;\r | |
1938 | return nextLBDToken(s);\r | |
1939 | }\r | |
1940 | \r | |
1941 | // Scan a hex character (UChar32) value.\r | |
1942 | if (u_digit(c, 16) >= 0) {\r | |
1943 | int32_t v = u_digit(c, 16);\r | |
1944 | for (;;) {\r | |
1945 | c = getc(s->fFile);\r | |
1946 | if (u_digit(c, 16) < 0) {break;};\r | |
1947 | v <<= 4;\r | |
1948 | v += u_digit(c, 16);\r | |
1949 | }\r | |
1950 | s->fPeekChar = c;\r | |
1951 | s->fPeeked = TRUE;\r | |
1952 | return v;\r | |
1953 | }\r | |
1954 | \r | |
1955 | // Error. Character was something unexpected.\r | |
1956 | return -4;\r | |
1957 | }\r | |
1958 | }\r | |
1959 | \r | |
1960 | \r | |
1961 | \r | |
1962 | void RBBITest::TestLineBreakData() {\r | |
1963 | \r | |
1964 | UErrorCode status = U_ZERO_ERROR;\r | |
1965 | UnicodeString testString;\r | |
1966 | UVector expectedBreaks(status);\r | |
1967 | ScanState ss;\r | |
1968 | int32_t tok;\r | |
1969 | \r | |
1970 | BreakIterator *bi = BreakIterator::createLineInstance(Locale::getDefault(), status);\r | |
1971 | if (U_FAILURE(status)) {\r | |
1972 | errln("Failure creating break iterator");\r | |
1973 | return;\r | |
1974 | }\r | |
1975 | \r | |
1976 | const char * lbdfName = "LBTest.txt";\r | |
1977 | \r | |
1978 | // Open the test data file.\r | |
1979 | // TODO: a proper way to handle this data.\r | |
1980 | ss.fFile = fopen(lbdfName, "rb");\r | |
1981 | if (ss.fFile == NULL) {\r | |
1982 | logln("Unable to open Line Break Test Data file. Skipping test.");\r | |
1983 | delete bi;\r | |
1984 | return;\r | |
1985 | }\r | |
1986 | \r | |
1987 | // Loop once per line from the test data file.\r | |
1988 | for (;;) {\r | |
1989 | // Zero out test data from previous line.\r | |
1990 | testString.truncate(0);\r | |
1991 | expectedBreaks.removeAllElements();\r | |
1992 | \r | |
1993 | // Read one test's (line's) worth of data from the file.\r | |
1994 | // Loop once per token on the input file line.\r | |
1995 | for(;;) {\r | |
1996 | tok = nextLBDToken(&ss);\r | |
1997 | \r | |
1998 | // If we scanned a character number in the file.\r | |
1999 | // save it in the test data array.\r | |
2000 | if (tok >= 0) {\r | |
2001 | testString.append((UChar32)tok);\r | |
2002 | continue;\r | |
2003 | }\r | |
2004 | \r | |
2005 | // If we scanned a break position in the data, record it.\r | |
2006 | if (tok == -1) {\r | |
2007 | expectedBreaks.addElement(testString.length(), status);\r | |
2008 | continue;\r | |
2009 | }\r | |
2010 | \r | |
2011 | // If we scanned a new line, or EOF\r | |
2012 | // drop out of scan loop and run the test case.\r | |
2013 | if (tok == -2 || tok == -3) {break;};\r | |
2014 | \r | |
2015 | // None of above. Error.\r | |
2016 | errln("Failure: Unrecognized data format, test file line %d", ss.fLineNum);\r | |
2017 | break;\r | |
2018 | }\r | |
2019 | \r | |
2020 | // If this line from the test data file actually contained test data,\r | |
2021 | // run the test.\r | |
2022 | if (testString.length() > 0) {\r | |
2023 | int32_t pos; // Break Position in the test string\r | |
2024 | int32_t expectedI = 0; // Index of expected break position in vector of same.\r | |
2025 | int32_t expectedPos; // Expected break position (index into test string)\r | |
2026 | \r | |
2027 | bi->setText(testString);\r | |
2028 | pos = bi->first(); // TODO: break iterators always return a match at pos 0.\r | |
2029 | pos = bi->next(); // Line Break TR says no match at position 0.\r | |
2030 | // Resolve.\r | |
2031 | \r | |
2032 | for (; pos != BreakIterator::DONE; ) {\r | |
2033 | expectedPos = expectedBreaks.elementAti(expectedI);\r | |
2034 | if (pos < expectedPos) {\r | |
2035 | errln("Failure: Test file line %d, unexpected break found at position %d",\r | |
2036 | ss.fLineNum, pos);\r | |
2037 | break;\r | |
2038 | }\r | |
2039 | if (pos > expectedPos) {\r | |
2040 | errln("Failure: Test file line %d, failed to find break at position %d",\r | |
2041 | ss.fLineNum, expectedPos);\r | |
2042 | break;\r | |
2043 | }\r | |
2044 | pos = bi->next();\r | |
2045 | expectedI++;\r | |
2046 | }\r | |
2047 | }\r | |
2048 | \r | |
2049 | // If we've hit EOF on the input file, we're done.\r | |
2050 | if (tok == -3) {\r | |
2051 | break;\r | |
2052 | }\r | |
2053 | \r | |
2054 | }\r | |
2055 | \r | |
2056 | fclose(ss.fFile);\r | |
2057 | delete bi;\r | |
2058 | \r | |
2059 | }\r | |
2060 | \r | |
2061 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS\r | |
2062 | \r | |
2063 | //---------------------------------------------------------------------------------------\r | |
2064 | //\r | |
2065 | // class RBBIMonkeyKind\r | |
2066 | //\r | |
2067 | // Monkey Test for Break Iteration\r | |
2068 | // Abstract interface class. Concrete derived classes independently\r | |
2069 | // implement the break rules for different iterator types.\r | |
2070 | //\r | |
2071 | // The Monkey Test itself doesn't know which type of break iterator it is\r | |
2072 | // testing, but works purely in terms of the interface defined here.\r | |
2073 | //\r | |
2074 | //---------------------------------------------------------------------------------------\r | |
2075 | class RBBIMonkeyKind {\r | |
2076 | public:\r | |
2077 | // Return a UVector of UnicodeSets, representing the character classes used\r | |
2078 | // for this type of iterator.\r | |
2079 | virtual UVector *charClasses() = 0;\r | |
2080 | \r | |
2081 | // Set the test text on which subsequent calls to next() will operate\r | |
2082 | virtual void setText(const UnicodeString &s) = 0;\r | |
2083 | \r | |
2084 | // Find the next break postion, starting from the prev break position, or from zero.\r | |
2085 | // Return -1 after reaching end of string.\r | |
2086 | virtual int32_t next(int32_t i) = 0;\r | |
2087 | \r | |
2088 | virtual ~RBBIMonkeyKind();\r | |
2089 | UErrorCode deferredStatus;\r | |
2090 | \r | |
2091 | \r | |
2092 | protected:\r | |
2093 | RBBIMonkeyKind();\r | |
2094 | \r | |
2095 | private:\r | |
2096 | };\r | |
2097 | \r | |
2098 | RBBIMonkeyKind::RBBIMonkeyKind() {\r | |
2099 | deferredStatus = U_ZERO_ERROR;\r | |
2100 | }\r | |
2101 | \r | |
2102 | RBBIMonkeyKind::~RBBIMonkeyKind() {\r | |
2103 | }\r | |
2104 | \r | |
2105 | \r | |
2106 | //----------------------------------------------------------------------------------------\r | |
2107 | //\r | |
2108 | // Random Numbers. Similar to standard lib rand() and srand()\r | |
2109 | // Not using library to\r | |
2110 | // 1. Get same results on all platforms.\r | |
2111 | // 2. Get access to current seed, to more easily reproduce failures.\r | |
2112 | //\r | |
2113 | //---------------------------------------------------------------------------------------\r | |
// Current state of the generator.  Kept at file scope so that a test can read
// or set it to reproduce a run.
static uint32_t m_seed = 1;

// Advance the generator one step and return the next pseudo-random value,
// in the range 0 .. 32767.  All arithmetic is modulo 2^32.
static uint32_t m_rand()
{
    const uint32_t multiplier = 1103515245;
    const uint32_t increment  = 12345;

    m_seed = m_seed * multiplier + increment;

    // Bits 16..30 of the state: the same value as (m_seed/65536) % 32768.
    return (m_seed >> 16) & 0x7fff;
}
2121 | \r | |
2122 | \r | |
2123 | //------------------------------------------------------------------------------------------\r | |
2124 | //\r | |
2125 | // class RBBICharMonkey Character (Grapheme Cluster) specific implementation\r | |
2126 | // of RBBIMonkeyKind.\r | |
2127 | //\r | |
2128 | //------------------------------------------------------------------------------------------\r | |
2129 | class RBBICharMonkey: public RBBIMonkeyKind {\r | |
2130 | public:\r | |
2131 | RBBICharMonkey();\r | |
2132 | virtual ~RBBICharMonkey();\r | |
2133 | virtual UVector *charClasses();\r | |
2134 | virtual void setText(const UnicodeString &s);\r | |
2135 | virtual int32_t next(int32_t i);\r | |
2136 | private:\r | |
2137 | UVector *fSets;\r | |
2138 | \r | |
2139 | UnicodeSet *fCRLFSet;\r | |
2140 | UnicodeSet *fControlSet;\r | |
2141 | UnicodeSet *fExtendSet;\r | |
2142 | UnicodeSet *fHangulSet;\r | |
2143 | UnicodeSet *fAnySet;\r | |
2144 | \r | |
2145 | RegexMatcher *fMatcher;\r | |
2146 | const UnicodeString *fText;\r | |
2147 | };\r | |
2148 | \r | |
2149 | \r | |
2150 | RBBICharMonkey::RBBICharMonkey() {\r | |
2151 | UErrorCode status = U_ZERO_ERROR;\r | |
2152 | \r | |
2153 | fText = NULL;\r | |
2154 | fMatcher = new RegexMatcher("\\X", 0, status); // Pattern to match a grampheme cluster\r | |
2155 | \r | |
2156 | fCRLFSet = new UnicodeSet("[\\r\\n]", status);\r | |
2157 | fControlSet = new UnicodeSet("[[\\p{Zl}\\p{Zp}\\p{Cc}\\p{Cf}]-[\\n]-[\\r]-\\p{Grapheme_Extend}]", status);\r | |
2158 | fExtendSet = new UnicodeSet("[\\p{Grapheme_Extend}]", status);\r | |
2159 | fHangulSet = new UnicodeSet(\r | |
2160 | "[\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=T}"\r | |
2161 | "\\p{Hangul_Syllable_Type=LV}\\p{Hangul_Syllable_Type=LVT}]", status);\r | |
2162 | fAnySet = new UnicodeSet("[\\u0000-\\U0010ffff]", status);\r | |
2163 | \r | |
2164 | fSets = new UVector(status);\r | |
2165 | fSets->addElement(fCRLFSet, status);\r | |
2166 | fSets->addElement(fControlSet, status);\r | |
2167 | fSets->addElement(fExtendSet, status);\r | |
2168 | fSets->addElement(fHangulSet, status);\r | |
2169 | fSets->addElement(fAnySet, status);\r | |
2170 | if (U_FAILURE(status)) {\r | |
2171 | deferredStatus = status;\r | |
2172 | }\r | |
2173 | };\r | |
2174 | \r | |
2175 | \r | |
2176 | void RBBICharMonkey::setText(const UnicodeString &s) {\r | |
2177 | fText = &s;\r | |
2178 | fMatcher->reset(s);\r | |
2179 | }\r | |
2180 | \r | |
2181 | \r | |
2182 | int32_t RBBICharMonkey::next(int32_t i) {\r | |
2183 | UErrorCode status = U_ZERO_ERROR;\r | |
2184 | int32_t retVal = -1;\r | |
2185 | \r | |
2186 | if (fMatcher->find(i, status)) {\r | |
2187 | retVal = fMatcher->end(status);\r | |
2188 | }\r | |
2189 | if (U_FAILURE(status)){\r | |
2190 | retVal = -1;\r | |
2191 | }\r | |
2192 | return retVal;\r | |
2193 | }\r | |
2194 | \r | |
2195 | \r | |
2196 | UVector *RBBICharMonkey::charClasses() {\r | |
2197 | return fSets;\r | |
2198 | }\r | |
2199 | \r | |
2200 | \r | |
2201 | RBBICharMonkey::~RBBICharMonkey() {\r | |
2202 | delete fSets;\r | |
2203 | delete fCRLFSet;\r | |
2204 | delete fControlSet;\r | |
2205 | delete fExtendSet;\r | |
2206 | delete fHangulSet;\r | |
2207 | delete fAnySet;\r | |
2208 | \r | |
2209 | delete fMatcher;\r | |
2210 | }\r | |
2211 | \r | |
2212 | //------------------------------------------------------------------------------------------\r | |
2213 | //\r | |
2214 | // class RBBIWordMonkey Word Break specific implementation\r | |
2215 | // of RBBIMonkeyKind.\r | |
2216 | //\r | |
2217 | //------------------------------------------------------------------------------------------\r | |
2218 | class RBBIWordMonkey: public RBBIMonkeyKind {\r | |
2219 | public:\r | |
2220 | RBBIWordMonkey();\r | |
2221 | virtual ~RBBIWordMonkey();\r | |
2222 | virtual UVector *charClasses();\r | |
2223 | virtual void setText(const UnicodeString &s);\r | |
2224 | virtual int32_t next(int32_t i);\r | |
2225 | private:\r | |
2226 | UVector *fSets;\r | |
2227 | \r | |
2228 | UnicodeSet *fKatakanaSet;\r | |
2229 | UnicodeSet *fALetterSet;\r | |
2230 | UnicodeSet *fMidLetterSet;\r | |
2231 | UnicodeSet *fMidNumSet;\r | |
2232 | UnicodeSet *fNumericSet;\r | |
2233 | UnicodeSet *fFormatSet;\r | |
2234 | UnicodeSet *fOtherSet;\r | |
2235 | UnicodeSet *fExtendSet;\r | |
2236 | UnicodeSet *fExtendNumLetSet;\r | |
2237 | \r | |
2238 | RegexMatcher *fMatcher;\r | |
2239 | \r | |
2240 | const UnicodeString *fText;\r | |
2241 | \r | |
2242 | RegexMatcher *fGCFMatcher;\r | |
2243 | RegexMatcher *fGCMatcher;\r | |
2244 | \r | |
2245 | };\r | |
2246 | \r | |
2247 | \r | |
2248 | RBBIWordMonkey::RBBIWordMonkey() : fGCFMatcher(0),\r | |
2249 | fGCMatcher(0)\r | |
2250 | {\r | |
2251 | UErrorCode status = U_ZERO_ERROR;\r | |
2252 | \r | |
2253 | fSets = new UVector(status);\r | |
2254 | \r | |
2255 | fKatakanaSet = new UnicodeSet("[\\p{script=KATAKANA}"\r | |
2256 | "\\u3031-\\u3035\\u309b\\u309c\\u30a0"\r | |
2257 | "\\u30fc\\uff70\\uff9e\\uff9f]", status);\r | |
2258 | \r | |
2259 | const UnicodeString ALetterStr( "[[\\p{Alphabetic}"\r | |
2260 | "\\u00a0" // NBSP\r | |
2261 | "\\u05f3]" // Hebrew punct Geresh\r | |
2262 | "-[\\p{Ideographic}]"\r | |
2263 | "-[\\p{Script=Lao}]"\r | |
2264 | "-[\\p{Script=Hiragana}]"\r | |
2265 | "-[\\p{Grapheme_Extend}]]");\r | |
2266 | fALetterSet = new UnicodeSet(ALetterStr, status);\r | |
2267 | fALetterSet->removeAll(*fKatakanaSet);\r | |
2268 | \r | |
2269 | fMidLetterSet = new UnicodeSet("[\\u0027\\u00b7\\u05f4\\u2019\\u2027\\u003a]", status);\r | |
2270 | fMidNumSet = new UnicodeSet("[[\\p{Line_Break=Infix_Numeric}]-[\\u003a]]", status);\r | |
2271 | fNumericSet = new UnicodeSet("[\\p{Nd}\\u066b\\u066c]", status);\r | |
2272 | fFormatSet = new UnicodeSet("[\\p{Format}-[\\u200c\\u200d]]", status);\r | |
2273 | fExtendSet = new UnicodeSet("[\\p{Grapheme_Extend}]", status);\r | |
2274 | fExtendNumLetSet = new UnicodeSet("[\\p{Pc}-[\\u30fb\\uff65]]", status);\r | |
2275 | fOtherSet = new UnicodeSet();\r | |
2276 | if(U_FAILURE(status)) {\r | |
2277 | deferredStatus = status;\r | |
2278 | return;\r | |
2279 | }\r | |
2280 | \r | |
2281 | fOtherSet->complement();\r | |
2282 | fOtherSet->removeAll(*fKatakanaSet);\r | |
2283 | fOtherSet->removeAll(*fALetterSet);\r | |
2284 | fOtherSet->removeAll(*fMidLetterSet);\r | |
2285 | fOtherSet->removeAll(*fMidNumSet);\r | |
2286 | fOtherSet->removeAll(*fNumericSet);\r | |
2287 | fOtherSet->removeAll(*fExtendNumLetSet);\r | |
2288 | \r | |
2289 | fSets->addElement(fALetterSet, status);\r | |
2290 | fSets->addElement(fKatakanaSet, status);\r | |
2291 | fSets->addElement(fMidLetterSet, status);\r | |
2292 | fSets->addElement(fMidNumSet, status);\r | |
2293 | fSets->addElement(fNumericSet, status);\r | |
2294 | fSets->addElement(fFormatSet, status);\r | |
2295 | fSets->addElement(fOtherSet, status);\r | |
2296 | fSets->addElement(fExtendNumLetSet, status);\r | |
2297 | \r | |
2298 | \r | |
2299 | fGCFMatcher = new RegexMatcher("\\X(?:[\\p{Format}-\\p{Grapheme_Extend}])*", 0, status);\r | |
2300 | fGCMatcher = new RegexMatcher("\\X", 0, status);\r | |
2301 | \r | |
2302 | if (U_FAILURE(status)) {\r | |
2303 | deferredStatus = status;\r | |
2304 | }\r | |
2305 | };\r | |
2306 | \r | |
2307 | void RBBIWordMonkey::setText(const UnicodeString &s) {\r | |
2308 | fText = &s;\r | |
2309 | fGCMatcher->reset(*fText);\r | |
2310 | fGCFMatcher->reset(*fText);\r | |
2311 | }\r | |
2312 | \r | |
2313 | \r | |
2314 | int32_t RBBIWordMonkey::next(int32_t prevPos) {\r | |
2315 | UErrorCode status = U_ZERO_ERROR;\r | |
2316 | \r | |
2317 | int p0, p1, p2, p3; // Indices of the significant code points around the \r | |
2318 | // break position being tested. The candidate break\r | |
2319 | // location is before p2.\r | |
2320 | \r | |
2321 | int breakPos = -1;\r | |
2322 | \r | |
2323 | UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.\r | |
2324 | \r | |
2325 | // Prev break at end of string. return DONE.\r | |
2326 | if (prevPos >= fText->length()) {\r | |
2327 | return -1;\r | |
2328 | }\r | |
2329 | p0 = p1 = p2 = p3 = prevPos;\r | |
2330 | c3 = fText->char32At(prevPos);\r | |
2331 | c0 = c1 = c2 = 0;\r | |
2332 | \r | |
2333 | \r | |
2334 | // Format char after prev break? Special case, see last Note for Word Boundaries TR.\r | |
2335 | // break immdiately after the format char.\r | |
2336 | if (fFormatSet->contains(c3)) {\r | |
2337 | breakPos = fText->moveIndex32(prevPos, 1);\r | |
2338 | return breakPos;\r | |
2339 | }\r | |
2340 | \r | |
2341 | \r | |
2342 | // Loop runs once per "significant" character position in the input text.\r | |
2343 | for (;;) {\r | |
2344 | // Move all of the positions forward in the input string.\r | |
2345 | p0 = p1; c0 = c1;\r | |
2346 | p1 = p2; c1 = c2;\r | |
2347 | p2 = p3; c2 = c3;\r | |
2348 | // Advancd p3 by (GC Format*) Rules 3, 4\r | |
2349 | status = U_ZERO_ERROR;\r | |
2350 | if (fGCFMatcher->find(p3, status) == FALSE) {\r | |
2351 | p3 = fText->length();\r | |
2352 | c3 = 0;\r | |
2353 | } else {\r | |
2354 | p3 = fGCFMatcher->end(0, status);\r | |
2355 | U_ASSERT(U_SUCCESS(status));\r | |
2356 | c3 = fText->char32At(p3);\r | |
2357 | }\r | |
2358 | \r | |
2359 | if (p1 == p2) {\r | |
2360 | // Still warming up the loop. (won't work with zero length strings, but we don't care)\r | |
2361 | continue;\r | |
2362 | }\r | |
2363 | if (p2 == fText->length()) {\r | |
2364 | // Reached end of string. Always a break position.\r | |
2365 | break;\r | |
2366 | }\r | |
2367 | \r | |
2368 | // Rule (5). ALetter x ALetter\r | |
2369 | if (fALetterSet->contains(c1) &&\r | |
2370 | fALetterSet->contains(c2)) {\r | |
2371 | continue;\r | |
2372 | }\r | |
2373 | \r | |
2374 | // Rule (6) ALetter x (MidLetter | MidNumLet) ALetter\r | |
2375 | //\r | |
2376 | // Also incorporates rule 7 by skipping pos ahead to position of the\r | |
2377 | // terminating ALetter.\r | |
2378 | if ( fALetterSet->contains(c1) &&\r | |
2379 | fMidLetterSet->contains(c2) &&\r | |
2380 | fALetterSet->contains(c3)) {\r | |
2381 | continue;\r | |
2382 | }\r | |
2383 | \r | |
2384 | \r | |
2385 | // Rule (7) ALetter (MidLetter | MidNumLet) x ALetter\r | |
2386 | if (fALetterSet->contains(c0) &&\r | |
2387 | (fMidLetterSet->contains(c1) ) &&\r | |
2388 | fALetterSet->contains(c2)) {\r | |
2389 | continue;\r | |
2390 | }\r | |
2391 | \r | |
2392 | // Rule (8) Numeric x Numeric\r | |
2393 | if (fNumericSet->contains(c1) &&\r | |
2394 | fNumericSet->contains(c2)) {\r | |
2395 | continue;\r | |
2396 | }\r | |
2397 | \r | |
2398 | // Rule (9) ALetter x Numeric\r | |
2399 | if (fALetterSet->contains(c1) &&\r | |
2400 | fNumericSet->contains(c2)) {\r | |
2401 | continue;\r | |
2402 | }\r | |
2403 | \r | |
2404 | // Rule (10) Numeric x ALetter\r | |
2405 | if (fNumericSet->contains(c1) &&\r | |
2406 | fALetterSet->contains(c2)) {\r | |
2407 | continue;\r | |
2408 | }\r | |
2409 | \r | |
2410 | // Rule (11) Numeric (MidNum | MidNumLet) x Numeric\r | |
2411 | if ( fNumericSet->contains(c0) &&\r | |
2412 | fMidNumSet->contains(c1) && \r | |
2413 | fNumericSet->contains(c2)) {\r | |
2414 | continue;\r | |
2415 | }\r | |
2416 | \r | |
2417 | // Rule (12) Numeric x (MidNum | MidNumLet) Numeric\r | |
2418 | if (fNumericSet->contains(c1) &&\r | |
2419 | fMidNumSet->contains(c2) &&\r | |
2420 | fNumericSet->contains(c3)) {\r | |
2421 | continue;\r | |
2422 | }\r | |
2423 | \r | |
2424 | // Rule (13) Katakana x Katakana\r | |
2425 | if (fKatakanaSet->contains(c1) &&\r | |
2426 | fKatakanaSet->contains(c2)) {\r | |
2427 | continue;\r | |
2428 | }\r | |
2429 | \r | |
2430 | // Rule 13a\r | |
2431 | if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||\r | |
2432 | fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&\r | |
2433 | fExtendNumLetSet->contains(c2)) {\r | |
2434 | continue;\r | |
2435 | }\r | |
2436 | \r | |
2437 | // Rule 13b\r | |
2438 | if (fExtendNumLetSet->contains(c1) && \r | |
2439 | (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||\r | |
2440 | fKatakanaSet->contains(c2))) {\r | |
2441 | continue;\r | |
2442 | }\r | |
2443 | \r | |
2444 | \r | |
2445 | // Rule 14. Break found here.\r | |
2446 | break;\r | |
2447 | }\r | |
2448 | \r | |
2449 | \r | |
2450 | // Rule 4 fixup, back up before any trailing\r | |
2451 | // format characters at the end of the word.\r | |
2452 | breakPos = p2;\r | |
2453 | status = U_ZERO_ERROR;\r | |
2454 | if (fGCMatcher->find(p1, status)) {\r | |
2455 | breakPos = fGCMatcher->end(0, status);\r | |
2456 | U_ASSERT(U_SUCCESS(status));\r | |
2457 | }\r | |
2458 | return breakPos;\r | |
2459 | }\r | |
2460 | \r | |
2461 | \r | |
2462 | UVector *RBBIWordMonkey::charClasses() {\r | |
2463 | return fSets;\r | |
2464 | }\r | |
2465 | \r | |
2466 | \r | |
2467 | RBBIWordMonkey::~RBBIWordMonkey() {\r | |
2468 | delete fSets;\r | |
2469 | delete fKatakanaSet;\r | |
2470 | delete fALetterSet;\r | |
2471 | delete fMidLetterSet;\r | |
2472 | delete fMidNumSet;\r | |
2473 | delete fNumericSet;\r | |
2474 | delete fFormatSet;\r | |
2475 | delete fExtendSet;\r | |
2476 | delete fOtherSet;\r | |
2477 | \r | |
2478 | delete fGCFMatcher;\r | |
2479 | delete fGCMatcher;\r | |
2480 | }\r | |
2481 | \r | |
2482 | \r | |
2483 | \r | |
2484 | \r | |
2485 | //-------------------------------------------------------------------------------------------\r | |
2486 | //\r | |
2487 | // RBBILineMonkey\r | |
2488 | //\r | |
2489 | //-------------------------------------------------------------------------------------------\r | |
2490 | \r | |
2491 | class RBBILineMonkey: public RBBIMonkeyKind {\r | |
2492 | public:\r | |
2493 | RBBILineMonkey();\r | |
2494 | virtual ~RBBILineMonkey();\r | |
2495 | virtual UVector *charClasses();\r | |
2496 | virtual void setText(const UnicodeString &s);\r | |
2497 | virtual int32_t next(int32_t i);\r | |
2498 | virtual void rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);\r | |
2499 | private:\r | |
2500 | UVector *fSets;\r | |
2501 | \r | |
2502 | UnicodeSet *fBK;\r | |
2503 | UnicodeSet *fCR;\r | |
2504 | UnicodeSet *fLF;\r | |
2505 | UnicodeSet *fCM;\r | |
2506 | UnicodeSet *fNL;\r | |
2507 | UnicodeSet *fSG;\r | |
2508 | UnicodeSet *fWJ;\r | |
2509 | UnicodeSet *fZW;\r | |
2510 | UnicodeSet *fGL;\r | |
2511 | UnicodeSet *fCB;\r | |
2512 | UnicodeSet *fSP;\r | |
2513 | UnicodeSet *fB2;\r | |
2514 | UnicodeSet *fBA;\r | |
2515 | UnicodeSet *fBB;\r | |
2516 | UnicodeSet *fHY;\r | |
2517 | UnicodeSet *fCL;\r | |
2518 | UnicodeSet *fEX;\r | |
2519 | UnicodeSet *fIN;\r | |
2520 | UnicodeSet *fNS;\r | |
2521 | UnicodeSet *fOP;\r | |
2522 | UnicodeSet *fQU;\r | |
2523 | UnicodeSet *fIS;\r | |
2524 | UnicodeSet *fNU;\r | |
2525 | UnicodeSet *fPO;\r | |
2526 | UnicodeSet *fPR;\r | |
2527 | UnicodeSet *fSY;\r | |
2528 | UnicodeSet *fAI;\r | |
2529 | UnicodeSet *fAL;\r | |
2530 | UnicodeSet *fID;\r | |
2531 | UnicodeSet *fSA;\r | |
2532 | UnicodeSet *fXX;\r | |
2533 | \r | |
2534 | BreakIterator *fCharBI;\r | |
2535 | \r | |
2536 | const UnicodeString *fText;\r | |
2537 | int32_t *fOrigPositions;\r | |
2538 | \r | |
2539 | RegexMatcher *fNumberMatcher;\r | |
2540 | RegexMatcher *fLB10Matcher;\r | |
2541 | RegexMatcher *fLB11Matcher;\r | |
2542 | };\r | |
2543 | \r | |
2544 | \r | |
2545 | RBBILineMonkey::RBBILineMonkey() \r | |
2546 | {\r | |
2547 | UErrorCode status = U_ZERO_ERROR;\r | |
2548 | \r | |
2549 | fSets = new UVector(status);\r | |
2550 | \r | |
2551 | fBK = new UnicodeSet("[\\p{Line_Break=BK}]", status);\r | |
2552 | fCR = new UnicodeSet("[\\p{Line_break=CR}]", status);\r | |
2553 | fLF = new UnicodeSet("[\\p{Line_break=LF}]", status);\r | |
2554 | fCM = new UnicodeSet("[\\p{Line_break=CM}]", status);\r | |
2555 | fNL = new UnicodeSet("[\\p{Line_break=NL}]", status);\r | |
2556 | fWJ = new UnicodeSet("[\\p{Line_break=WJ}]", status);\r | |
2557 | fZW = new UnicodeSet("[\\p{Line_break=ZW}]", status);\r | |
2558 | fGL = new UnicodeSet("[\\p{Line_break=GL}]", status);\r | |
2559 | fCB = new UnicodeSet("[\\p{Line_break=CB}]", status);\r | |
2560 | fSP = new UnicodeSet("[\\p{Line_break=SP}]", status);\r | |
2561 | fB2 = new UnicodeSet("[\\p{Line_break=B2}]", status);\r | |
2562 | fBA = new UnicodeSet("[\\p{Line_break=BA}]", status);\r | |
2563 | fBB = new UnicodeSet("[\\p{Line_break=BB}]", status);\r | |
2564 | fHY = new UnicodeSet("[\\p{Line_break=HY}]", status);\r | |
2565 | fCL = new UnicodeSet("[\\p{Line_break=CL}]", status);\r | |
2566 | fEX = new UnicodeSet("[\\p{Line_break=EX}]", status);\r | |
2567 | fIN = new UnicodeSet("[\\p{Line_break=IN}]", status);\r | |
2568 | fNS = new UnicodeSet("[\\p{Line_break=NS}]", status);\r | |
2569 | fOP = new UnicodeSet("[\\p{Line_break=OP}]", status);\r | |
2570 | fQU = new UnicodeSet("[\\p{Line_break=QU}]", status);\r | |
2571 | fIS = new UnicodeSet("[\\p{Line_break=IS}]", status);\r | |
2572 | fNU = new UnicodeSet("[\\p{Line_break=NU}]", status);\r | |
2573 | fPO = new UnicodeSet("[\\p{Line_break=PO}]", status);\r | |
2574 | fPR = new UnicodeSet("[\\p{Line_break=PR}]", status);\r | |
2575 | fSY = new UnicodeSet("[\\p{Line_break=SY}]", status);\r | |
2576 | fAI = new UnicodeSet("[\\p{Line_break=AI}]", status);\r | |
2577 | fAL = new UnicodeSet("[\\p{Line_break=AL}]", status);\r | |
2578 | fID = new UnicodeSet("[\\p{Line_break=ID}]", status);\r | |
2579 | fSA = new UnicodeSet("[\\p{Line_break=SA}]", status);\r | |
2580 | fXX = new UnicodeSet("[\\p{Line_break=XX}]", status);\r | |
2581 | \r | |
2582 | fAL->addAll(*fXX); // Default behavior for XX is identical to AL\r | |
2583 | fAL->addAll(*fAI); // Default behavior for AI is identical to AL\r | |
2584 | fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to AL\r | |
2585 | \r | |
2586 | \r | |
2587 | \r | |
2588 | fSets->addElement(fBK, status);\r | |
2589 | fSets->addElement(fCR, status);\r | |
2590 | fSets->addElement(fLF, status);\r | |
2591 | fSets->addElement(fCM, status);\r | |
2592 | fSets->addElement(fNL, status);\r | |
2593 | fSets->addElement(fWJ, status);\r | |
2594 | fSets->addElement(fZW, status);\r | |
2595 | fSets->addElement(fGL, status);\r | |
2596 | fSets->addElement(fCB, status);\r | |
2597 | fSets->addElement(fSP, status);\r | |
2598 | fSets->addElement(fB2, status);\r | |
2599 | fSets->addElement(fBA, status);\r | |
2600 | fSets->addElement(fBB, status);\r | |
2601 | fSets->addElement(fHY, status);\r | |
2602 | fSets->addElement(fCL, status);\r | |
2603 | fSets->addElement(fEX, status);\r | |
2604 | fSets->addElement(fIN, status);\r | |
2605 | fSets->addElement(fNS, status);\r | |
2606 | fSets->addElement(fOP, status);\r | |
2607 | fSets->addElement(fQU, status);\r | |
2608 | fSets->addElement(fIS, status);\r | |
2609 | fSets->addElement(fNU, status);\r | |
2610 | fSets->addElement(fPO, status);\r | |
2611 | fSets->addElement(fPR, status);\r | |
2612 | fSets->addElement(fSY, status);\r | |
2613 | fSets->addElement(fAI, status);\r | |
2614 | fSets->addElement(fAL, status);\r | |
2615 | fSets->addElement(fID, status);\r | |
2616 | fSets->addElement(fWJ, status);\r | |
2617 | fSets->addElement(fSA, status);\r | |
2618 | // fSets->addElement(fXX, status);\r | |
2619 | \r | |
2620 | \r | |
2621 | \r | |
2622 | fNumberMatcher = new RegexMatcher(\r | |
2623 | "(\\p{Line_Break=PR}\\p{Line_Break=CM}*)?"\r | |
2624 | "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"\r | |
2625 | "\\p{Line_Break=NU}\\p{Line_Break=CM}*"\r | |
2626 | "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"\r | |
2627 | "(\\p{Line_Break=CL}\\p{Line_Break=CM}*)?"\r | |
2628 | "(\\p{Line_Break=PO}\\p{Line_Break=CM}*)?", \r | |
2629 | 0, status);\r | |
2630 | \r | |
2631 | fLB10Matcher = new RegexMatcher(\r | |
2632 | "\\p{Line_Break=QU}\\p{Line_Break=CM}*"\r | |
2633 | "\\p{Line_Break=SP}*"\r | |
2634 | "(\\p{Line_Break=OP})\\p{Line_Break=CM}*", \r | |
2635 | 0, status);\r | |
2636 | \r | |
2637 | fLB11Matcher = new RegexMatcher(\r | |
2638 | "\\p{Line_Break=CL}\\p{Line_Break=CM}*"\r | |
2639 | "\\p{Line_Break=SP}*"\r | |
2640 | "(\\p{Line_Break=NS})\\p{Line_Break=CM}*", \r | |
2641 | 0, status);\r | |
2642 | \r | |
2643 | fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);\r | |
2644 | \r | |
2645 | if (U_FAILURE(status)) {\r | |
2646 | deferredStatus = status;\r | |
2647 | }\r | |
2648 | };\r | |
2649 | \r | |
2650 | \r | |
2651 | void RBBILineMonkey::setText(const UnicodeString &s) {\r | |
2652 | fText = &s;\r | |
2653 | fCharBI->setText(s);\r | |
2654 | fNumberMatcher->reset(s);\r | |
2655 | }\r | |
2656 | \r | |
2657 | //\r | |
2658 | // rule67Adjust\r | |
2659 | // Line Break TR rules 6 and 7 implementation.\r | |
2660 | // This deals with combining marks, Hangul Syllables, and other sequences\r | |
2661 | // that must be treated as if they were something other than what they actually are.\r | |
2662 | //\r | |
2663 | // This is factored out into a separate function because it must be applied twice for\r | |
2664 | // each potential break, once to the chars before the position being checked, then\r | |
2665 | // again to the text following the possible break.\r | |
2666 | //\r | |
2667 | void RBBILineMonkey::rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {\r | |
2668 | if (pos == -1) {\r | |
2669 | // Invalid initial position. Happens during the warmup iteration of the \r | |
2670 | // main loop in next().\r | |
2671 | return;\r | |
2672 | }\r | |
2673 | \r | |
2674 | int32_t nPos = *nextPos;\r | |
2675 | \r | |
2676 | // LB 6 Treat Korean Syllables as a single unit\r | |
2677 | int32_t hangultype = u_getIntPropertyValue(*posChar, UCHAR_HANGUL_SYLLABLE_TYPE);\r | |
2678 | if (hangultype != U_HST_NOT_APPLICABLE) {\r | |
2679 | nPos = fCharBI->following(pos); // Advance by grapheme cluster, which\r | |
2680 | // contains the logic to locate Hangul syllables.\r | |
2681 | // Grapheme Cluster Ugliness: some Grapheme_Extend chars, which are absorbed\r | |
2682 | // into a grapheme cluster, are NOT Line Break CM. (Some are GL, for example.)\r | |
2683 | // We don't want consume any of these. The Approach is\r | |
2684 | // 1. Back nPos up, undoing the consumption of any\r | |
2685 | // Grapheme_Extend chars by the char break iterator.\r | |
2686 | // 2. Let the LB 7b logic below reconsume any Line Break CM chars.\r | |
2687 | for (;;) {\r | |
2688 | nPos = fText->moveIndex32(nPos, -1);\r | |
2689 | UChar32 possiblyExtendChar = fText->char32At(nPos);\r | |
2690 | if (fID->contains(possiblyExtendChar)) {\r | |
2691 | // We hit into the Hangul Syllable itself, class is ID.\r | |
2692 | nPos = fText->moveIndex32(nPos, +1);\r | |
2693 | break;\r | |
2694 | }\r | |
2695 | }\r | |
2696 | }\r | |
2697 | \r | |
2698 | // LB 7b Keep combining sequences together.\r | |
2699 | // advance over any CM class chars. (Line Break CM class is different from\r | |
2700 | // grapheme cluster CM, so we need to do this even for HangulSyllables.\r | |
2701 | // Line Break may eat additional stuff as combining, beyond what graphem cluster did.\r | |
2702 | if (!(fBK->contains(*posChar) || fZW->contains(*posChar) || *posChar==0x0a \r | |
2703 | || *posChar==0x0d || *posChar==0x85)) {\r | |
2704 | for (;;) {\r | |
2705 | *nextChar = fText->char32At(nPos);\r | |
2706 | if (!fCM->contains(*nextChar)) {\r | |
2707 | break;\r | |
2708 | }\r | |
2709 | nPos = fText->moveIndex32(nPos, 1);\r | |
2710 | }\r | |
2711 | }\r | |
2712 | \r | |
2713 | \r | |
2714 | // LB 7a In a SP CM* sequence, treat the SP as an ID\r | |
2715 | if (nPos != *nextPos && fSP->contains(*posChar)) {\r | |
2716 | *posChar = 0x4e00; // 0x4e00 is a CJK Ideograph, linebreak type is ID.\r | |
2717 | }\r | |
2718 | \r | |
2719 | // LB 7b Treat X CM* as if it were x.\r | |
2720 | // No explicit action required. \r | |
2721 | \r | |
2722 | // LB 7c Treat any remaining combining mark as AL\r | |
2723 | if (fCM->contains(*posChar)) {\r | |
2724 | *posChar = 0x41; // thisChar = 'A';\r | |
2725 | }\r | |
2726 | \r | |
2727 | // Push the updated nextPos and nextChar back to our caller.\r | |
2728 | // This only makes a difference if posChar got bigger, by slurping up a\r | |
2729 | // combining sequence or Hangul syllable.\r | |
2730 | *nextPos = nPos;\r | |
2731 | *nextChar = fText->char32At(nPos);\r | |
2732 | }\r | |
2733 | \r | |
2734 | \r | |
2735 | \r | |
2736 | int32_t RBBILineMonkey::next(int32_t startPos) {\r | |
2737 | UErrorCode status = U_ZERO_ERROR;\r | |
2738 | int32_t pos; // Index of the char following a potential break position\r | |
2739 | UChar32 thisChar; // Character at above position "pos"\r | |
2740 | \r | |
2741 | int32_t prevPos; // Index of the char preceding a potential break position\r | |
2742 | UChar32 prevChar; // Character at above position. Note that prevChar\r | |
2743 | // and thisChar may not be adjacent because combining\r | |
2744 | // characters between them will be ignored.\r | |
2745 | \r | |
2746 | int32_t nextPos; // Index of the next character following pos.\r | |
2747 | // Usually skips over combining marks.\r | |
2748 | int32_t nextCPPos; // Index of the code point following "pos."\r | |
2749 | // May point to a combining mark.\r | |
2750 | int32_t tPos; // temp value.\r | |
2751 | UChar32 c;\r | |
2752 | \r | |
2753 | if (startPos >= fText->length()) {\r | |
2754 | return -1;\r | |
2755 | }\r | |
2756 | \r | |
2757 | \r | |
2758 | // Initial values for loop. Loop will run the first time without finding breaks,\r | |
2759 | // while the invalid values shift out and the "this" and\r | |
2760 | // "prev" positions are filled in with good values.\r | |
2761 | pos = prevPos = -1; // Invalid value, serves as flag for initial loop iteration.\r | |
2762 | thisChar = prevChar = 0;\r | |
2763 | nextPos = nextCPPos = startPos;\r | |
2764 | \r | |
2765 | \r | |
2766 | // Loop runs once per position in the test text, until a break position\r | |
2767 | // is found.\r | |
2768 | for (;;) {\r | |
2769 | prevPos = pos;\r | |
2770 | prevChar = thisChar;\r | |
2771 | \r | |
2772 | pos = nextPos;\r | |
2773 | thisChar = fText->char32At(pos);\r | |
2774 | \r | |
2775 | nextCPPos = fText->moveIndex32(pos, 1);\r | |
2776 | nextPos = nextCPPos;\r | |
2777 | \r | |
2778 | // Break at end of text.\r | |
2779 | if (pos >= fText->length()) {\r | |
2780 | break;\r | |
2781 | }\r | |
2782 | \r | |
2783 | // LB 3a Always break after hard line breaks,\r | |
2784 | if (fBK->contains(prevChar)) {\r | |
2785 | break;\r | |
2786 | }\r | |
2787 | \r | |
2788 | // LB 3b Break after CR, LF, NL, but not inside CR LF\r | |
2789 | if (prevChar == 0x0d && thisChar == 0x0a) {\r | |
2790 | continue;\r | |
2791 | }\r | |
2792 | if (prevChar == 0x0d ||\r | |
2793 | prevChar == 0x0a ||\r | |
2794 | prevChar == 0x85) {\r | |
2795 | break;\r | |
2796 | }\r | |
2797 | \r | |
2798 | // LB 3c Don't break before hard line breaks\r | |
2799 | if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||\r | |
2800 | fBK->contains(thisChar)) {\r | |
2801 | continue;\r | |
2802 | }\r | |
2803 | \r | |
2804 | // LB 10 QU SP* x OP\r | |
2805 | if (prevPos >= 0) {\r | |
2806 | UnicodeString subStr10(*fText, prevPos);\r | |
2807 | fLB10Matcher->reset(subStr10);\r | |
2808 | status = U_ZERO_ERROR;\r | |
2809 | if (fLB10Matcher->lookingAt(status)) { // /QU CM* SP* (OP) CM*/;\r | |
2810 | // TODO: Check status codes\r | |
2811 | pos = prevPos + fLB10Matcher->start(1, status);\r | |
2812 | nextPos = prevPos + fLB10Matcher->end(0, status);\r | |
2813 | thisChar = fText->char32At(pos);\r | |
2814 | continue;\r | |
2815 | }\r | |
2816 | }\r | |
2817 | \r | |
2818 | // LB 11 CL SP* x NS\r | |
2819 | if (prevPos >= 0) {\r | |
2820 | UnicodeString subStr11(*fText, prevPos);\r | |
2821 | fLB11Matcher->reset(subStr11);\r | |
2822 | status = U_ZERO_ERROR;\r | |
2823 | if (fLB11Matcher->lookingAt(status)) { // /QU CM* SP* (OP) CM*/;\r | |
2824 | // TODO: Check status codes\r | |
2825 | pos = prevPos + fLB11Matcher->start(1, status);\r | |
2826 | nextPos = prevPos + fLB11Matcher->end(0, status);\r | |
2827 | thisChar = fText->char32At(pos);\r | |
2828 | continue;\r | |
2829 | }\r | |
2830 | }\r | |
2831 | \r | |
2832 | // LB 4 Don't break before spaces or zero-width space.\r | |
2833 | if (fSP->contains(thisChar)) {\r | |
2834 | continue;\r | |
2835 | }\r | |
2836 | \r | |
2837 | if (fZW->contains(thisChar)) {\r | |
2838 | continue;\r | |
2839 | }\r | |
2840 | \r | |
2841 | // LB 5 Break after zero width space\r | |
2842 | if (fZW->contains(prevChar)) {\r | |
2843 | break;\r | |
2844 | }\r | |
2845 | \r | |
2846 | // LB 6, LB 7\r | |
2847 | /*int32_t oldpos = pos;*/\r | |
2848 | rule67Adjust(prevPos, &prevChar, &pos, &thisChar);\r | |
2849 | \r | |
2850 | nextCPPos = fText->moveIndex32(pos, 1);\r | |
2851 | nextPos = nextCPPos;\r | |
2852 | c = fText->char32At(nextPos);\r | |
2853 | // another percularity of LB 4 - Dont break before space\r | |
2854 | if (fSP->contains(thisChar)) {\r | |
2855 | continue;\r | |
2856 | }\r | |
2857 | rule67Adjust(pos, &thisChar, &nextPos, &c);\r | |
2858 | \r | |
2859 | // If the loop is still warming up - if we haven't shifted the initial\r | |
2860 | // -1 positions out of prevPos yet - loop back to advance the\r | |
2861 | // position in the input without any further looking for breaks.\r | |
2862 | if (prevPos == -1) {\r | |
2863 | continue;\r | |
2864 | }\r | |
2865 | \r | |
2866 | // Re-apply rules 3c, 4 because these could be affected by having\r | |
2867 | // a new thisChar from doing rule 6 or 7.\r | |
2868 | if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 || // 3c\r | |
2869 | fBK->contains(thisChar)) {\r | |
2870 | continue;\r | |
2871 | }\r | |
2872 | if (fSP->contains(thisChar)) { // LB 4\r | |
2873 | continue;\r | |
2874 | }\r | |
2875 | if (fZW->contains(thisChar)) { // LB 4\r | |
2876 | continue;\r | |
2877 | }\r | |
2878 | \r | |
2879 | \r | |
2880 | // LB 8 Don't break before closings.\r | |
2881 | // NU x CL and NU x IS are not matched here so that they will\r | |
2882 | // fall into LB 17 and the more general number regular expression.\r | |
2883 | //\r | |
2884 | if (!fNU->contains(prevChar) && fCL->contains(thisChar) ||\r | |
2885 | fEX->contains(thisChar) ||\r | |
2886 | !fNU->contains(prevChar) && fIS->contains(thisChar) ||\r | |
2887 | !fNU->contains(prevChar) && fSY->contains(thisChar)) {\r | |
2888 | continue;\r | |
2889 | }\r | |
2890 | \r | |
2891 | // LB 9 Don't break after OP SP*\r | |
2892 | // Scan backwards, checking for this sequence.\r | |
2893 | // The OP char could include combining marks, so we acually check for\r | |
2894 | // OP CM* SP*\r | |
2895 | // Another Twist: The Rule 67 fixes may have changed a CP CM\r | |
2896 | // sequence into a ID char, so before scanning back through spaces,\r | |
2897 | // verify that prevChar is indeed a space. The prevChar variable\r | |
2898 | // may differ from fText[prevPos]\r | |
2899 | tPos = prevPos;\r | |
2900 | if (fSP->contains(prevChar)) {\r | |
2901 | while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {\r | |
2902 | tPos=fText->moveIndex32(tPos, -1);\r | |
2903 | }\r | |
2904 | }\r | |
2905 | while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {\r | |
2906 | tPos=fText->moveIndex32(tPos, -1);\r | |
2907 | }\r | |
2908 | if (fOP->contains(fText->char32At(tPos))) {\r | |
2909 | continue;\r | |
2910 | }\r | |
2911 | \r | |
2912 | \r | |
2913 | // LB 11a B2 x B2\r | |
2914 | if (fB2->contains(thisChar) && fB2->contains(prevChar)) {\r | |
2915 | continue;\r | |
2916 | }\r | |
2917 | \r | |
2918 | // LB 11b \r | |
2919 | // x GL\r | |
2920 | // GL x\r | |
2921 | if (fGL->contains(thisChar) || fGL->contains(prevChar)) {\r | |
2922 | continue;\r | |
2923 | }\r | |
2924 | if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {\r | |
2925 | continue;\r | |
2926 | }\r | |
2927 | \r | |
2928 | // LB 12 break after space\r | |
2929 | if (fSP->contains(prevChar)) {\r | |
2930 | break;\r | |
2931 | }\r | |
2932 | \r | |
2933 | // LB 14\r | |
2934 | // x QU\r | |
2935 | // QU x\r | |
2936 | if (fQU->contains(thisChar) || fQU->contains(prevChar)) {\r | |
2937 | continue;\r | |
2938 | }\r | |
2939 | \r | |
2940 | // LB 14a Break around a CB\r | |
2941 | if (fCB->contains(thisChar) || fCB->contains(prevChar)) {\r | |
2942 | break;\r | |
2943 | }\r | |
2944 | \r | |
2945 | // LB 15 \r | |
2946 | if (fBA->contains(thisChar) ||\r | |
2947 | fHY->contains(thisChar) ||\r | |
2948 | fNS->contains(thisChar) ||\r | |
2949 | fBB->contains(prevChar) ) {\r | |
2950 | continue;\r | |
2951 | }\r | |
2952 | \r | |
2953 | // LB 16\r | |
2954 | if (fAL->contains(prevChar) && fIN->contains(thisChar) ||\r | |
2955 | fID->contains(prevChar) && fIN->contains(thisChar) ||\r | |
2956 | fIN->contains(prevChar) && fIN->contains(thisChar) ||\r | |
2957 | fNU->contains(prevChar) && fIN->contains(thisChar) ) {\r | |
2958 | continue; \r | |
2959 | }\r | |
2960 | \r | |
2961 | \r | |
2962 | // LB 17 ID x PO (Note: Leading CM behaves like ID)\r | |
2963 | // AL x NU\r | |
2964 | // NU x AL\r | |
2965 | if (fID->contains(prevChar) && fPO->contains(thisChar) ||\r | |
2966 | fCM->contains(prevChar) && fPO->contains(thisChar) || \r | |
2967 | fAL->contains(prevChar) && fNU->contains(thisChar) ||\r | |
2968 | fNU->contains(prevChar) && fAL->contains(thisChar) ) {\r | |
2969 | continue; \r | |
2970 | }\r | |
2971 | \r | |
2972 | // LB 18 Numbers\r | |
2973 | UnicodeString subStr18(*fText, prevPos);\r | |
2974 | fNumberMatcher->reset(subStr18);\r | |
2975 | if (fNumberMatcher->lookingAt(status)) {\r | |
2976 | // TODO: Check status codes\r | |
2977 | // Matched a number. But could have been just a single digit, which would\r | |
2978 | // not represent a "no break here" between prevChar and thisChar\r | |
2979 | int32_t numEndIdx = prevPos + fNumberMatcher->end(status); // idx of first char following num\r | |
2980 | if (numEndIdx > pos) {\r | |
2981 | // Number match includes at least our two chars being checked\r | |
2982 | if (numEndIdx > nextPos) {\r | |
2983 | // Number match includes additional chars. Update pos and nextPos\r | |
2984 | // so that next loop iteration will continue at the end of the number,\r | |
2985 | // checking for breaks between last char in number & whatever follows.\r | |
2986 | nextPos = numEndIdx;\r | |
2987 | pos = fCharBI->preceding(numEndIdx); \r | |
2988 | thisChar = fText->char32At(pos);\r | |
2989 | while (fCM->contains(thisChar)) {\r | |
2990 | pos = fCharBI->preceding(pos);\r | |
2991 | thisChar = fText->char32At(pos);\r | |
2992 | }\r | |
2993 | }\r | |
2994 | continue;\r | |
2995 | }\r | |
2996 | }\r | |
2997 | \r | |
2998 | if (fPR->contains(prevChar) && fAL->contains(thisChar)) {\r | |
2999 | continue;\r | |
3000 | }\r | |
3001 | \r | |
3002 | if (fPR->contains(prevChar) && fID->contains(thisChar)) {\r | |
3003 | continue;\r | |
3004 | }\r | |
3005 | \r | |
3006 | // LB 18b\r | |
3007 | if (fHY->contains(prevChar) || fBB->contains(thisChar)) {\r | |
3008 | break;\r | |
3009 | }\r | |
3010 | \r | |
3011 | // LB 19\r | |
3012 | if (fAL->contains(prevChar) && fAL->contains(thisChar)) {\r | |
3013 | continue;\r | |
3014 | }\r | |
3015 | \r | |
3016 | // LB 19b\r | |
3017 | if (fIS->contains(prevChar) && fAL->contains(thisChar)) {\r | |
3018 | continue;\r | |
3019 | }\r | |
3020 | \r | |
3021 | // LB 20 Break everywhere else\r | |
3022 | break;\r | |
3023 | \r | |
3024 | }\r | |
3025 | \r | |
3026 | return pos;\r | |
3027 | }\r | |
3028 | \r | |
3029 | \r | |
3030 | UVector *RBBILineMonkey::charClasses() {\r | |
3031 | return fSets;\r | |
3032 | }\r | |
3033 | \r | |
3034 | \r | |
3035 | RBBILineMonkey::~RBBILineMonkey() {\r | |
3036 | delete fSets;\r | |
3037 | \r | |
3038 | delete fBK;\r | |
3039 | delete fCR;\r | |
3040 | delete fLF;\r | |
3041 | delete fCM;\r | |
3042 | delete fNL;\r | |
3043 | delete fWJ;\r | |
3044 | delete fZW;\r | |
3045 | delete fGL;\r | |
3046 | delete fCB;\r | |
3047 | delete fSP;\r | |
3048 | delete fB2;\r | |
3049 | delete fBA;\r | |
3050 | delete fBB;\r | |
3051 | delete fHY;\r | |
3052 | delete fCL;\r | |
3053 | delete fEX;\r | |
3054 | delete fIN;\r | |
3055 | delete fNS;\r | |
3056 | delete fOP;\r | |
3057 | delete fQU;\r | |
3058 | delete fIS;\r | |
3059 | delete fNU;\r | |
3060 | delete fPO;\r | |
3061 | delete fPR;\r | |
3062 | delete fSY;\r | |
3063 | delete fAI;\r | |
3064 | delete fAL;\r | |
3065 | delete fID;\r | |
3066 | delete fSA;\r | |
3067 | delete fXX;\r | |
3068 | \r | |
3069 | delete fCharBI;\r | |
3070 | delete fNumberMatcher;\r | |
3071 | delete fLB10Matcher;\r | |
3072 | delete fLB11Matcher;\r | |
3073 | }\r | |
3074 | \r | |
3075 | \r | |
3076 | //-------------------------------------------------------------------------------------------\r | |
3077 | //\r | |
3078 | // TestMonkey\r | |
3079 | //\r | |
3080 | // params\r | |
3081 | // seed=nnnnn Random number starting seed.\r | |
3082 | // Setting the seed allows errors to be reproduced.\r | |
3083 | // loop=nnn Looping count. Controls running time.\r | |
3084 | // -1: run forever.\r | |
3085 | // 0 or greater: run length.\r | |
3086 | //\r | |
3087 | // type = char | word | line | sent | title\r | |
3088 | //\r | |
3089 | //-------------------------------------------------------------------------------------------\r | |
3090 | \r | |
3091 | static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) {\r | |
3092 | int32_t val = defaultVal;\r | |
3093 | name.append(" *= *(-?\\d+)");\r | |
3094 | UErrorCode status = U_ZERO_ERROR;\r | |
3095 | RegexMatcher m(name, params, 0, status);\r | |
3096 | if (m.find()) {\r | |
3097 | // The param exists. Convert the string to an int.\r | |
3098 | char valString[100];\r | |
3099 | int32_t paramLength = m.end(1, status) - m.start(1, status);\r | |
3100 | if (paramLength >= (int32_t)(sizeof(valString)-1)) {\r | |
3101 | paramLength = (int32_t)(sizeof(valString)-2);\r | |
3102 | }\r | |
3103 | params.extract(m.start(1, status), paramLength, valString, sizeof(valString));\r | |
3104 | val = strtol(valString, NULL, 10);\r | |
3105 | \r | |
3106 | // Delete this parameter from the params string.\r | |
3107 | m.reset();\r | |
3108 | params = m.replaceFirst("", status);\r | |
3109 | }\r | |
3110 | U_ASSERT(U_SUCCESS(status));\r | |
3111 | return val;\r | |
3112 | }\r | |
3113 | #endif\r | |
3114 | \r | |
3115 | static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr, \r | |
3116 | BreakIterator *bi,\r | |
3117 | int expected[], \r | |
3118 | int expectedcount)\r | |
3119 | {\r | |
3120 | int count = 0;\r | |
3121 | int i = 0;\r | |
3122 | int forward[50];\r | |
3123 | bi->setText(ustr);\r | |
3124 | for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {\r | |
3125 | forward[count] = i;\r | |
3126 | if (count < expectedcount && expected[count] != i) {\r | |
3127 | test->errln("break forward test failed: expected %d but got %d", \r | |
3128 | expected[count], i);\r | |
3129 | break;\r | |
3130 | }\r | |
3131 | count ++;\r | |
3132 | }\r | |
3133 | if (count != expectedcount) {\r | |
3134 | printStringBreaks(ustr, expected, expectedcount);\r | |
3135 | test->errln("break test failed: missed %d match", \r | |
3136 | expectedcount - count);\r | |
3137 | return;\r | |
3138 | }\r | |
3139 | // testing boundaries\r | |
3140 | for (i = 1; i < expectedcount; i ++) {\r | |
3141 | int j = expected[i - 1];\r | |
3142 | if (!bi->isBoundary(j)) {\r | |
3143 | printStringBreaks(ustr, expected, expectedcount);\r | |
3144 | test->errln("Expected boundary at position %d", j);\r | |
3145 | return;\r | |
3146 | }\r | |
3147 | for (j = expected[i - 1] + 1; j < expected[i]; j ++) {\r | |
3148 | if (bi->isBoundary(j)) {\r | |
3149 | printStringBreaks(ustr, expected, expectedcount);\r | |
3150 | test->errln("Not expecting boundary at position %d", j);\r | |
3151 | return;\r | |
3152 | }\r | |
3153 | }\r | |
3154 | }\r | |
3155 | \r | |
3156 | for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {\r | |
3157 | count --;\r | |
3158 | if (forward[count] != i) {\r | |
3159 | test->errln("happy break test reverse failed: expected %d but got %d", \r | |
3160 | forward[count], i);\r | |
3161 | break;\r | |
3162 | }\r | |
3163 | }\r | |
3164 | if (count != 0) {\r | |
3165 | printStringBreaks(ustr, expected, expectedcount);\r | |
3166 | test->errln("happy break test failed: missed a match");\r | |
3167 | return;\r | |
3168 | }\r | |
3169 | \r | |
3170 | // testing preceding\r | |
3171 | for (i = 0; i < expectedcount - 1; i ++) {\r | |
3172 | int j = expected[i] + 1;\r | |
3173 | for (; j <= expected[i + 1]; j ++) {\r | |
3174 | if (bi->preceding(j) != expected[i]) {\r | |
3175 | printStringBreaks(ustr, expected, expectedcount);\r | |
3176 | test->errln("Not expecting backwards boundary at position %d", j);\r | |
3177 | return;\r | |
3178 | }\r | |
3179 | }\r | |
3180 | } \r | |
3181 | }\r | |
3182 | \r | |
3183 | void RBBITest::TestWordBreaks(void)\r | |
3184 | {\r | |
3185 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS\r | |
3186 | \r | |
3187 | // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>\r | |
3188 | Locale locale("en");\r | |
3189 | UErrorCode status = U_ZERO_ERROR;\r | |
3190 | // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);\r | |
3191 | BreakIterator *bi = BreakIterator::createWordInstance(locale, status);\r | |
3192 | UChar str[300]; \r | |
3193 | static const char *strlist[] = \r | |
3194 | {\r | |
3195 | "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",\r | |
3196 | "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",\r | |
3197 | "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u179c\\u0027\\U000e0061\\u003a",\r | |
3198 | "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",\r | |
3199 | "\\u90ca\\u3588\\u009c\\u0953\\u194b",\r | |
3200 | "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",\r | |
3201 | "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",\r | |
3202 | "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",\r | |
3203 | "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",\r | |
3204 | "\\u003b\\u024a\\u102e\\U000e0071\\u0600",\r | |
3205 | "\\u2027\\U000e0067\\u0a47\\u00b7",\r | |
3206 | "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",\r | |
3207 | "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",\r | |
3208 | "\\u0589\\U000e006e\\u0a42\\U000104a5",\r | |
3209 | "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",\r | |
3210 | "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",\r | |
3211 | "\\u0027\\u11af\\U000e0057\\u0602",\r | |
3212 | "\\U0001d7f2\\U000e007\\u0004\\u0589",\r | |
3213 | "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",\r | |
3214 | "\\U0001d7f2\\U000e007d\\u0004\\u0589",\r | |
3215 | "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",\r | |
3216 | "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",\r | |
3217 | "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",\r | |
3218 | "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",\r | |
3219 | "\\u0233\\U000e0020\\u0a69\\u0d6a",\r | |
3220 | "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",\r | |
3221 | "\\u58f4\\U000e0049\\u20e7\\u2027",\r | |
3222 | "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",\r | |
3223 | "\\ua183\\u102d\\u0bec\\u003a",\r | |
3224 | "\\u17e8\\u06e7\\u002e\\u096d\\u003b",\r | |
3225 | "\\u003a\\u0e57\\u0fad\\u002e",\r | |
3226 | "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",\r | |
3227 | "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",\r | |
3228 | "\\U000e005d\\u2044\\u0731\\u0650\\u0061",\r | |
3229 | "\\u003a\\u0664\\u00b7\\u1fba",\r | |
3230 | "\\u003b\\u0027\\u00b7\\u47a3",\r | |
3231 | "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",\r | |
3232 | "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",\r | |
3233 | "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",\r | |
3234 | };\r | |
3235 | int loop;\r | |
3236 | if (U_FAILURE(status)) {\r | |
3237 | errln("Creation of break iterator failed %s", u_errorName(status));\r | |
3238 | return;\r | |
3239 | }\r | |
3240 | for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {\r | |
3241 | // printf("looping %d\n", loop);\r | |
3242 | u_unescape(strlist[loop], str, 25);\r | |
3243 | UnicodeString ustr(str);\r | |
3244 | // RBBICharMonkey monkey;\r | |
3245 | RBBIWordMonkey monkey;\r | |
3246 | \r | |
3247 | int expected[50];\r | |
3248 | int expectedcount = 0;\r | |
3249 | \r | |
3250 | monkey.setText(ustr);\r | |
3251 | int i;\r | |
3252 | for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {\r | |
3253 | expected[expectedcount ++] = i;\r | |
3254 | }\r | |
3255 | \r | |
3256 | testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);\r | |
3257 | }\r | |
3258 | delete bi;\r | |
3259 | #endif\r | |
3260 | }\r | |
3261 | \r | |
3262 | void RBBITest::TestWordBoundary(void)\r | |
3263 | {\r | |
3264 | // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>\r | |
3265 | Locale locale("en");\r | |
3266 | UErrorCode status = U_ZERO_ERROR;\r | |
3267 | // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);\r | |
3268 | BreakIterator *bi = BreakIterator::createWordInstance(locale, status);\r | |
3269 | UChar str[50]; \r | |
3270 | static const char *strlist[] = \r | |
3271 | {\r | |
3272 | "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",\r | |
3273 | "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",\r | |
3274 | "\\u003b\\u024a\\u102e\\U000e0071\\u0600",\r | |
3275 | "\\u2027\\U000e0067\\u0a47\\u00b7",\r | |
3276 | "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",\r | |
3277 | "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",\r | |
3278 | "\\u0589\\U000e006e\\u0a42\\U000104a5",\r | |
3279 | "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",\r | |
3280 | "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",\r | |
3281 | "\\u0027\\u11af\\U000e0057\\u0602",\r | |
3282 | "\\U0001d7f2\\U000e007\\u0004\\u0589",\r | |
3283 | "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",\r | |
3284 | "\\U0001d7f2\\U000e007d\\u0004\\u0589",\r | |
3285 | "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",\r | |
3286 | "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",\r | |
3287 | "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",\r | |
3288 | "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",\r | |
3289 | "\\u0233\\U000e0020\\u0a69\\u0d6a",\r | |
3290 | "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",\r | |
3291 | "\\u58f4\\U000e0049\\u20e7\\u2027",\r | |
3292 | "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",\r | |
3293 | "\\ua183\\u102d\\u0bec\\u003a",\r | |
3294 | "\\u17e8\\u06e7\\u002e\\u096d\\u003b",\r | |
3295 | "\\u003a\\u0e57\\u0fad\\u002e",\r | |
3296 | "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",\r | |
3297 | "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",\r | |
3298 | "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",\r | |
3299 | "\\u003a\\u0664\\u00b7\\u1fba",\r | |
3300 | "\\u003b\\u0027\\u00b7\\u47a3",\r | |
3301 | };\r | |
3302 | int loop;\r | |
3303 | if (U_FAILURE(status)) {\r | |
3304 | errln("Creation of break iterator failed %s", u_errorName(status));\r | |
3305 | return;\r | |
3306 | }\r | |
3307 | for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {\r | |
3308 | // printf("looping %d\n", loop);\r | |
3309 | u_unescape(strlist[loop], str, 20);\r | |
3310 | UnicodeString ustr(str);\r | |
3311 | int forward[50];\r | |
3312 | int count = 0;\r | |
3313 | \r | |
3314 | bi->setText(ustr);\r | |
3315 | int prev = 0;\r | |
3316 | int i;\r | |
3317 | for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {\r | |
3318 | forward[count ++] = i;\r | |
3319 | if (i > prev) {\r | |
3320 | int j;\r | |
3321 | for (j = prev + 1; j < i; j ++) {\r | |
3322 | if (bi->isBoundary(j)) {\r | |
3323 | printStringBreaks(ustr, forward, count);\r | |
3324 | errln("happy boundary test failed: expected %d not a boundary", \r | |
3325 | j);\r | |
3326 | return;\r | |
3327 | }\r | |
3328 | }\r | |
3329 | }\r | |
3330 | if (!bi->isBoundary(i)) {\r | |
3331 | printStringBreaks(ustr, forward, count);\r | |
3332 | errln("happy boundary test failed: expected %d a boundary", \r | |
3333 | i);\r | |
3334 | return;\r | |
3335 | }\r | |
3336 | prev = i;\r | |
3337 | }\r | |
3338 | }\r | |
3339 | delete bi;\r | |
3340 | }\r | |
3341 | \r | |
3342 | void RBBITest::TestLineBreaks(void)\r | |
3343 | {\r | |
3344 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS\r | |
3345 | Locale locale("en");\r | |
3346 | UErrorCode status = U_ZERO_ERROR;\r | |
3347 | BreakIterator *bi = BreakIterator::createLineInstance(locale, status);\r | |
3348 | UChar str[50]; \r | |
3349 | static const char *strlist[] = \r | |
3350 | {\r | |
3351 | "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",\r | |
3352 | "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",\r | |
3353 | "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",\r | |
3354 | "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",\r | |
3355 | "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",\r | |
3356 | "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",\r | |
3357 | "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",\r | |
3358 | "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",\r | |
3359 | "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",\r | |
3360 | "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",\r | |
3361 | "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",\r | |
3362 | "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",\r | |
3363 | "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",\r | |
3364 | "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",\r | |
3365 | "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",\r | |
3366 | "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",\r | |
3367 | "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",\r | |
3368 | "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",\r | |
3369 | "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",\r | |
3370 | "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",\r | |
3371 | "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",\r | |
3372 | "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",\r | |
3373 | "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",\r | |
3374 | "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",\r | |
3375 | "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",\r | |
3376 | "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",\r | |
3377 | "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",\r | |
3378 | "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",\r | |
3379 | "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",\r | |
3380 | "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",\r | |
3381 | "\\u2014\\u0020\\u000a\\u17c5\\u24fc",\r | |
3382 | "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",\r | |
3383 | "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",\r | |
3384 | "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",\r | |
3385 | "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",\r | |
3386 | "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",\r | |
3387 | "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",\r | |
3388 | };\r | |
3389 | int loop;\r | |
3390 | if (U_FAILURE(status)) {\r | |
3391 | errln("Creation of break iterator failed %s", u_errorName(status));\r | |
3392 | return;\r | |
3393 | }\r | |
3394 | for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {\r | |
3395 | // printf("looping %d\n", loop);\r | |
3396 | u_unescape(strlist[loop], str, 20);\r | |
3397 | UnicodeString ustr(str);\r | |
3398 | RBBILineMonkey monkey;\r | |
3399 | \r | |
3400 | int expected[50];\r | |
3401 | int expectedcount = 0;\r | |
3402 | \r | |
3403 | monkey.setText(ustr);\r | |
3404 | int i;\r | |
3405 | for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {\r | |
3406 | expected[expectedcount ++] = i;\r | |
3407 | }\r | |
3408 | \r | |
3409 | testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);\r | |
3410 | }\r | |
3411 | delete bi;\r | |
3412 | #endif\r | |
3413 | }\r | |
3414 | \r | |
3415 | void RBBITest::TestSentBreaks(void)\r | |
3416 | {\r | |
3417 | Locale locale("en");\r | |
3418 | UErrorCode status = U_ZERO_ERROR;\r | |
3419 | BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);\r | |
3420 | UChar str[100]; \r | |
3421 | static const char *strlist[] = \r | |
3422 | {\r | |
3423 | "Now\ris\nthe\r\ntime\n\rfor\r\r",\r | |
3424 | "This\n",\r | |
3425 | "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",\r | |
3426 | "\"Sentence ending with a quote.\" Bye.",\r | |
3427 | " (This is it). Testing the sentence iterator. \"This isn't it.\"", \r | |
3428 | "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",\r | |
3429 | "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",\r | |
3430 | "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",\r | |
3431 | "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",\r | |
3432 | "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",\r | |
3433 | };\r | |
3434 | int loop;\r | |
3435 | int forward[100];\r | |
3436 | if (U_FAILURE(status)) {\r | |
3437 | errln("Creation of break iterator failed %s", u_errorName(status));\r | |
3438 | return;\r | |
3439 | }\r | |
3440 | for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {\r | |
3441 | u_unescape(strlist[loop], str, 100);\r | |
3442 | UnicodeString ustr(str);\r | |
3443 | \r | |
3444 | int count = 0;\r | |
3445 | bi->setText(ustr);\r | |
3446 | int i;\r | |
3447 | for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {\r | |
3448 | forward[count ++] = i;\r | |
3449 | }\r | |
3450 | testBreakBoundPreceding(this, ustr, bi, forward, count);\r | |
3451 | }\r | |
3452 | delete bi;\r | |
3453 | }\r | |
3454 | \r | |
3455 | void RBBITest::TestMonkey(char *params) {\r | |
3456 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS\r | |
3457 | \r | |
3458 | UErrorCode status = U_ZERO_ERROR;\r | |
3459 | int32_t loopCount = 500;\r | |
3460 | int32_t seed = 1;\r | |
3461 | UnicodeString breakType = "all";\r | |
3462 | Locale locale("en");\r | |
3463 | \r | |
3464 | if (quick == FALSE) {\r | |
3465 | loopCount = 10000;\r | |
3466 | }\r | |
3467 | \r | |
3468 | if (params) {\r | |
3469 | UnicodeString p(params);\r | |
3470 | loopCount = getIntParam("loop", p, loopCount);\r | |
3471 | seed = getIntParam("seed", p, seed);\r | |
3472 | \r | |
3473 | RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);\r | |
3474 | if (m.find()) {\r | |
3475 | breakType = m.group(1, status);\r | |
3476 | m.reset();\r | |
3477 | p = m.replaceFirst("", status);\r | |
3478 | }\r | |
3479 | \r | |
3480 | m.reset(p);\r | |
3481 | if (RegexMatcher("\\S", p, 0, status).find()) {\r | |
3482 | // Each option is stripped out of the option string as it is processed.\r | |
3483 | // All options have been checked. The option string should have been completely emptied..\r | |
3484 | char buf[100];\r | |
3485 | p.extract(buf, sizeof(buf), NULL, status);\r | |
3486 | buf[sizeof(buf)-1] = 0;\r | |
3487 | errln("Unrecognized or extra parameter: %s\n", buf);\r | |
3488 | return;\r | |
3489 | }\r | |
3490 | \r | |
3491 | }\r | |
3492 | \r | |
3493 | if (breakType == "char" || breakType == "all") {\r | |
3494 | RBBICharMonkey m;\r | |
3495 | BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);\r | |
3496 | if (U_SUCCESS(status)) {\r | |
3497 | RunMonkey(bi, m, "char", seed, loopCount);\r | |
3498 | }\r | |
3499 | else {\r | |
3500 | errln("Creation of character break iterator failed %s", u_errorName(status));\r | |
3501 | }\r | |
3502 | delete bi;\r | |
3503 | }\r | |
3504 | \r | |
3505 | if (breakType == "word" || breakType == "all") {\r | |
3506 | logln("Word Break Monkey Test");\r | |
3507 | RBBIWordMonkey m;\r | |
3508 | BreakIterator *bi = BreakIterator::createWordInstance(locale, status);\r | |
3509 | if (U_SUCCESS(status)) {\r | |
3510 | RunMonkey(bi, m, "word", seed, loopCount);\r | |
3511 | }\r | |
3512 | else {\r | |
3513 | errln("Creation of word break iterator failed %s", u_errorName(status));\r | |
3514 | }\r | |
3515 | delete bi;\r | |
3516 | }\r | |
3517 | \r | |
3518 | if (breakType == "line" || breakType == "all") {\r | |
3519 | logln("Line Break Monkey Test");\r | |
3520 | RBBILineMonkey m;\r | |
3521 | BreakIterator *bi = BreakIterator::createLineInstance(locale, status);\r | |
3522 | if (params == NULL) {\r | |
3523 | loopCount = 50;\r | |
3524 | }\r | |
3525 | if (U_SUCCESS(status)) {\r | |
3526 | RunMonkey(bi, m, "line", seed, loopCount);\r | |
3527 | }\r | |
3528 | else {\r | |
3529 | errln("Creation of line break iterator failed %s", u_errorName(status));\r | |
3530 | }\r | |
3531 | delete bi;\r | |
3532 | }\r | |
3533 | \r | |
3534 | \r | |
3535 | #endif\r | |
3536 | }\r | |
3537 | \r | |
3538 | //\r | |
3539 | // Run a RBBI monkey test. Common routine, for all break iterator types.\r | |
3540 | // Parameters:\r | |
3541 | // bi - the break iterator to use\r | |
3542 | // mk - MonkeyKind, abstraction for obtaining expected results\r | |
3543 | // name - Name of test (char, word, etc.) for use in error messages\r | |
3544 | // seed - Seed for starting random number generator (parameter from user)\r | |
3545 | // numIterations\r | |
3546 | //\r | |
3547 | void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed, int32_t numIterations) {\r | |
3548 | \r | |
3549 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS\r | |
3550 | \r | |
3551 | const int32_t TESTSTRINGLEN = 500;\r | |
3552 | UnicodeString testText;\r | |
3553 | int32_t numCharClasses;\r | |
3554 | UVector *chClasses;\r | |
3555 | int expected[TESTSTRINGLEN*2 + 1];\r | |
3556 | int expectedCount = 0;\r | |
3557 | char expectedBreaks[TESTSTRINGLEN*2 + 1];\r | |
3558 | char forwardBreaks[TESTSTRINGLEN*2 + 1];\r | |
3559 | char reverseBreaks[TESTSTRINGLEN*2+1];\r | |
3560 | char isBoundaryBreaks[TESTSTRINGLEN*2+1];\r | |
3561 | char followingBreaks[TESTSTRINGLEN*2+1];\r | |
3562 | char precedingBreaks[TESTSTRINGLEN*2+1];\r | |
3563 | int i;\r | |
3564 | int loopCount = 0;\r | |
3565 | \r | |
3566 | m_seed = seed;\r | |
3567 | \r | |
3568 | numCharClasses = mk.charClasses()->size();\r | |
3569 | chClasses = mk.charClasses();\r | |
3570 | \r | |
3571 | // Check for errors that occured during the construction of the MonkeyKind object.\r | |
3572 | // Can't report them where they occured because errln() is a method coming from intlTest,\r | |
3573 | // and is not visible outside of RBBITest :-(\r | |
3574 | if (U_FAILURE(mk.deferredStatus)) {\r | |
3575 | errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));\r | |
3576 | return;\r | |
3577 | }\r | |
3578 | \r | |
3579 | // Verify that the character classes all have at least one member.\r | |
3580 | for (i=0; i<numCharClasses; i++) {\r | |
3581 | UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);\r | |
3582 | if (s == NULL || s->size() == 0) {\r | |
3583 | errln("Character Class #%d is null or of zero size.", i);\r | |
3584 | return;\r | |
3585 | }\r | |
3586 | }\r | |
3587 | \r | |
3588 | while (loopCount < numIterations || numIterations == -1) {\r | |
3589 | if (numIterations == -1 && loopCount % 10 == 0) {\r | |
3590 | // If test is running in an infinite loop, display a periodic tic so\r | |
3591 | // we can tell that it is making progress.\r | |
3592 | fprintf(stderr, ".");\r | |
3593 | }\r | |
3594 | // Save current random number seed, so that we can recreate the random numbers\r | |
3595 | // for this loop iteration in event of an error.\r | |
3596 | seed = m_seed;\r | |
3597 | \r | |
3598 | // Populate a test string with data.\r | |
3599 | testText.truncate(0);\r | |
3600 | for (i=0; i<TESTSTRINGLEN; i++) {\r | |
3601 | int32_t aClassNum = m_rand() % numCharClasses;\r | |
3602 | UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);\r | |
3603 | int32_t charIdx = m_rand() % classSet->size();\r | |
3604 | UChar32 c = classSet->charAt(charIdx);\r | |
3605 | if (c < 0) { // TODO: deal with sets containing strings.\r | |
3606 | errln("c < 0");\r | |
3607 | }\r | |
3608 | testText.append(c);\r | |
3609 | }\r | |
3610 | \r | |
3611 | // Calculate the expected results for this test string.\r | |
3612 | mk.setText(testText);\r | |
3613 | memset(expectedBreaks, 0, sizeof(expectedBreaks));\r | |
3614 | expectedBreaks[0] = 1;\r | |
3615 | int32_t breakPos = 0;\r | |
3616 | expectedCount = 0;\r | |
3617 | for (;;) {\r | |
3618 | breakPos = mk.next(breakPos);\r | |
3619 | if (breakPos == -1) {\r | |
3620 | break;\r | |
3621 | }\r | |
3622 | if (breakPos > testText.length()) {\r | |
3623 | errln("breakPos > testText.length()");\r | |
3624 | }\r | |
3625 | expectedBreaks[breakPos] = 1;\r | |
3626 | expected[expectedCount ++] = breakPos;\r | |
3627 | }\r | |
3628 | \r | |
3629 | // Find the break positions using forward iteration\r | |
3630 | memset(forwardBreaks, 0, sizeof(forwardBreaks));\r | |
3631 | bi->setText(testText);\r | |
3632 | for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {\r | |
3633 | if (i < 0 || i > testText.length()) {\r | |
3634 | errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);\r | |
3635 | break;\r | |
3636 | }\r | |
3637 | forwardBreaks[i] = 1;\r | |
3638 | }\r | |
3639 | \r | |
3640 | // Find the break positions using reverse iteration\r | |
3641 | memset(reverseBreaks, 0, sizeof(reverseBreaks));\r | |
3642 | for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {\r | |
3643 | if (i < 0 || i > testText.length()) {\r | |
3644 | errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);\r | |
3645 | break;\r | |
3646 | }\r | |
3647 | reverseBreaks[i] = 1;\r | |
3648 | }\r | |
3649 | \r | |
3650 | // Find the break positions using isBoundary() tests.\r | |
3651 | memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));\r | |
3652 | U_ASSERT(sizeof(isBoundaryBreaks) > testText.length());\r | |
3653 | for (i=0; i<=testText.length(); i++) {\r | |
3654 | isBoundaryBreaks[i] = bi->isBoundary(i);\r | |
3655 | }\r | |
3656 | \r | |
3657 | \r | |
3658 | // Find the break positions using the following() function.\r | |
3659 | // printf(".");\r | |
3660 | memset(followingBreaks, 0, sizeof(followingBreaks));\r | |
3661 | int32_t lastBreakPos = 0;\r | |
3662 | followingBreaks[0] = 1;\r | |
3663 | for (i=0; i<testText.length(); i++) {\r | |
3664 | breakPos = bi->following(i);\r | |
3665 | if (breakPos <= i ||\r | |
3666 | breakPos < lastBreakPos ||\r | |
3667 | breakPos > testText.length() ||\r | |
3668 | breakPos > lastBreakPos && lastBreakPos > i ) {\r | |
3669 | errln("%s break monkey test: "\r | |
3670 | "Out of range value returned by BreakIterator::following().\n"\r | |
3671 | "Random seed=%d", name, seed);\r | |
3672 | break;\r | |
3673 | }\r | |
3674 | followingBreaks[breakPos] = 1;\r | |
3675 | lastBreakPos = breakPos;\r | |
3676 | }\r | |
3677 | \r | |
3678 | // Find the break positions using the preceding() function.\r | |
3679 | memset(precedingBreaks, 0, sizeof(followingBreaks));\r | |
3680 | lastBreakPos = testText.length();\r | |
3681 | precedingBreaks[testText.length()] = 1;\r | |
3682 | for (i=testText.length(); i>0; i--) {\r | |
3683 | breakPos = bi->preceding(i);\r | |
3684 | if (breakPos >= i ||\r | |
3685 | breakPos > lastBreakPos ||\r | |
3686 | breakPos < 0 ||\r | |
3687 | breakPos < lastBreakPos && lastBreakPos < i ) {\r | |
3688 | errln("%s break monkey test: "\r | |
3689 | "Out of range value returned by BreakIterator::preceding().\n"\r | |
3690 | "index=%d; prev returned %d; lastBreak=%d" ,\r | |
3691 | name, i, breakPos, lastBreakPos);\r | |
3692 | precedingBreaks[i] = 2; // Forces an error.\r | |
3693 | } else {\r | |
3694 | precedingBreaks[breakPos] = 1;\r | |
3695 | lastBreakPos = breakPos;\r | |
3696 | }\r | |
3697 | }\r | |
3698 | \r | |
3699 | // Compare the expected and actual results.\r | |
3700 | for (i=0; i<=testText.length(); i++) {\r | |
3701 | const char *errorType = NULL;\r | |
3702 | if (forwardBreaks[i] != expectedBreaks[i]) {\r | |
3703 | errorType = "next()";\r | |
3704 | } else if (reverseBreaks[i] != forwardBreaks[i]) {\r | |
3705 | errorType = "previous()";\r | |
3706 | } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {\r | |
3707 | errorType = "isBoundary()";\r | |
3708 | } else if (followingBreaks[i] != expectedBreaks[i]) {\r | |
3709 | errorType = "following()";\r | |
3710 | } else if (precedingBreaks[i] != expectedBreaks[i]) {\r | |
3711 | errorType = "preceding()";\r | |
3712 | }\r | |
3713 | \r | |
3714 | \r | |
3715 | if (errorType != NULL) {\r | |
3716 | // Format a range of the test text that includes the failure as\r | |
3717 | // a data item that can be included in the rbbi test data file.\r | |
3718 | \r | |
3719 | // Start of the range is the last point where expected and actual results\r | |
3720 | // both agreed that there was a break position.\r | |
3721 | int startContext = i;\r | |
3722 | int32_t count = 0;\r | |
3723 | for (;;) {\r | |
3724 | if (startContext==0) { break; }\r | |
3725 | startContext --;\r | |
3726 | if (expectedBreaks[startContext] != 0) {\r | |
3727 | if (count == 2) break;\r | |
3728 | count ++;\r | |
3729 | }\r | |
3730 | }\r | |
3731 | \r | |
3732 | // End of range is two expected breaks past the start position.\r | |
3733 | int endContext = i + 1;\r | |
3734 | int ci;\r | |
3735 | for (ci=0; ci<2; ci++) { // Number of items to include in error text.\r | |
3736 | for (;;) {\r | |
3737 | if (endContext >= testText.length()) {break;}\r | |
3738 | if (expectedBreaks[endContext-1] != 0) { \r | |
3739 | if (count == 0) break;\r | |
3740 | count --;\r | |
3741 | }\r | |
3742 | endContext ++;\r | |
3743 | }\r | |
3744 | }\r | |
3745 | \r | |
3746 | // Format looks like "<data><>\uabcd\uabcd<>\U0001abcd...</data>"\r | |
3747 | UnicodeString errorText = "<data>";\r | |
3748 | /***if (strcmp(errorType, "next()") == 0) {\r | |
3749 | startContext = 0;\r | |
3750 | endContext = testText.length();\r | |
3751 | \r | |
3752 | printStringBreaks(testText, expected, expectedCount);\r | |
3753 | }***/\r | |
3754 | \r | |
3755 | for (ci=startContext; ci<endContext;) {\r | |
3756 | UnicodeString hexChars("0123456789abcdef");\r | |
3757 | UChar32 c;\r | |
3758 | int bn;\r | |
3759 | c = testText.char32At(ci);\r | |
3760 | if (ci == i) {\r | |
3761 | // This is the location of the error.\r | |
3762 | errorText.append("<?>");\r | |
3763 | } else if (expectedBreaks[ci] != 0) {\r | |
3764 | // This a non-error expected break position.\r | |
3765 | errorText.append("<>");\r | |
3766 | }\r | |
3767 | if (c < 0x10000) {\r | |
3768 | errorText.append("\\u");\r | |
3769 | for (bn=12; bn>=0; bn-=4) {\r | |
3770 | errorText.append(hexChars.charAt((c>>bn)&0xf));\r | |
3771 | }\r | |
3772 | } else {\r | |
3773 | errorText.append("\\U");\r | |
3774 | for (bn=28; bn>=0; bn-=4) {\r | |
3775 | errorText.append(hexChars.charAt((c>>bn)&0xf));\r | |
3776 | }\r | |
3777 | }\r | |
3778 | ci = testText.moveIndex32(ci, 1);\r | |
3779 | }\r | |
3780 | errorText.append("<>");\r | |
3781 | errorText.append("</data>\n");\r | |
3782 | \r | |
3783 | // Output the error\r | |
3784 | char charErrorTxt[500];\r | |
3785 | UErrorCode status = U_ZERO_ERROR;\r | |
3786 | errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);\r | |
3787 | charErrorTxt[sizeof(charErrorTxt)-1] = 0;\r | |
3788 | errln("%s break monkey test error. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",\r | |
3789 | name, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),\r | |
3790 | errorType, seed, i, charErrorTxt);\r | |
3791 | break;\r | |
3792 | }\r | |
3793 | }\r | |
3794 | \r | |
3795 | loopCount++;\r | |
3796 | }\r | |
3797 | #endif\r | |
3798 | }\r | |
3799 | \r | |
3800 | \r | |
3801 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */\r |