]>
Commit | Line | Data |
---|---|---|
46f4442e A |
1 | /* |
2 | ********************************************************************** | |
4388f060 | 3 | * Copyright (C) 2005-2012, International Business Machines |
46f4442e A |
4 | * Corporation and others. All Rights Reserved. |
5 | ********************************************************************** | |
6 | */ | |
7 | ||
8 | ||
9 | #include "unicode/utypes.h" | |
10 | ||
11 | #if !UCONFIG_NO_COLLATION | |
12 | ||
13 | #include "unicode/unistr.h" | |
14 | #include "unicode/putil.h" | |
15 | #include "unicode/usearch.h" | |
16 | ||
17 | #include "cmemory.h" | |
18 | #include "unicode/coll.h" | |
19 | #include "unicode/tblcoll.h" | |
20 | #include "unicode/coleitr.h" | |
21 | #include "unicode/ucoleitr.h" | |
22 | ||
23 | #include "unicode/regex.h" // TODO: make conditional on regexp being built. | |
24 | ||
25 | #include "unicode/uniset.h" | |
26 | #include "unicode/uset.h" | |
27 | #include "unicode/ustring.h" | |
28 | #include "hash.h" | |
29 | #include "uhash.h" | |
30 | #include "ucol_imp.h" | |
31 | ||
32 | #include "intltest.h" | |
33 | #include "ssearch.h" | |
34 | ||
729e4ab9 A |
35 | #include "unicode/colldata.h" |
36 | #include "unicode/bmsearch.h" | |
37 | #include "unicode/bms.h" | |
38 | ||
46f4442e | 39 | #include "xmlparser.h" |
729e4ab9 | 40 | #include "ucbuf.h" |
46f4442e A |
41 | |
42 | #include <stdlib.h> | |
43 | #include <string.h> | |
44 | #include <stdio.h> | |
45 | ||
// Identifier of the test case currently being processed. The assertion
// macros below include it in their failure messages.
char testId[100];

// Reports a failure (file, line and current test ID) when x is false.
// Execution continues after the report.
// NOTE: expands to a brace block, not do/while(0); some call sites invoke it
// without a trailing semicolon, so the shape must stay as it is.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d, test ID = \"%s\"", __FILE__, __LINE__, testId);}}

// Reports a failure with the message m when x is false and then returns from
// the enclosing function (which therefore has to return void).
#define TEST_ASSERT_M(x, m) {if (!(x)) { \
    errln("Failure in file %s, line %d. \"%s\"", __FILE__, __LINE__, m);return;}}

// Reports a failure, including the ICU error name, when errcode is a failure
// code. Execution continues after the report.
#define TEST_ASSERT_SUCCESS(errcode) {if (U_FAILURE(errcode)) { \
    dataerrln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \
    __FILE__, __LINE__, testId, u_errorName(errcode));}}

// Number of elements in a true array (not a pointer). The argument is
// parenthesized so that arbitrary array-valued expressions expand correctly.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Allocates uninitialized storage for count objects of the given type with
// uprv_malloc(); release it with DELETE_ARRAY(). The result may be NULL.
#define NEW_ARRAY(type, count) ((type *) uprv_malloc((count) * sizeof(type)))

// Releases storage obtained from NEW_ARRAY().
#define DELETE_ARRAY(array) uprv_free((void *) (array))
46f4442e A |
61 | |
62 | //--------------------------------------------------------------------------- | |
63 | // | |
64 | // Test class boilerplate | |
65 | // | |
66 | //--------------------------------------------------------------------------- | |
67 | SSearchTest::SSearchTest() | |
68 | { | |
69 | } | |
70 | ||
71 | SSearchTest::~SSearchTest() | |
72 | { | |
73 | } | |
74 | ||
75 | void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char *params ) | |
76 | { | |
77 | if (exec) logln("TestSuite SSearchTest: "); | |
78 | switch (index) { | |
79 | #if !UCONFIG_NO_BREAK_ITERATION | |
80 | case 0: name = "searchTest"; | |
81 | if (exec) searchTest(); | |
82 | break; | |
83 | ||
84 | case 1: name = "offsetTest"; | |
85 | if (exec) offsetTest(); | |
86 | break; | |
87 | ||
88 | case 2: name = "monkeyTest"; | |
89 | if (exec) monkeyTest(params); | |
90 | break; | |
729e4ab9 A |
91 | |
92 | case 3: name = "bmMonkeyTest"; | |
93 | if (exec) bmMonkeyTest(params); | |
94 | break; | |
95 | ||
96 | case 4: name = "boyerMooreTest"; | |
97 | if (exec) boyerMooreTest(); | |
98 | break; | |
99 | ||
100 | case 5: name = "goodSuffixTest"; | |
101 | if (exec) goodSuffixTest(); | |
102 | break; | |
103 | ||
104 | case 6: name = "searchTime"; | |
105 | if (exec) searchTime(); | |
106 | break; | |
107 | ||
108 | case 7: name = "bmsTest"; | |
109 | if (exec) bmsTest(); | |
110 | break; | |
111 | ||
112 | case 8: name = "bmSearchTest"; | |
113 | if (exec) bmSearchTest(); | |
114 | break; | |
115 | ||
116 | case 9: name = "udhrTest"; | |
117 | if (exec) udhrTest(); | |
118 | break; | |
119 | case 10: name = "stringListTest"; | |
120 | if (exec) stringListTest(); | |
121 | break; | |
46f4442e A |
122 | #endif |
123 | default: name = ""; | |
124 | break; //needed to end loop | |
125 | } | |
126 | } | |
127 | ||
128 | ||
129 | #if !UCONFIG_NO_BREAK_ITERATION | |
130 | ||
131 | #define PATH_BUFFER_SIZE 2048 | |
132 | const char *SSearchTest::getPath(char buffer[2048], const char *filename) { | |
133 | UErrorCode status = U_ZERO_ERROR; | |
134 | const char *testDataDirectory = IntlTest::getSourceTestData(status); | |
135 | ||
136 | if (U_FAILURE(status) || strlen(testDataDirectory) + strlen(filename) + 1 >= PATH_BUFFER_SIZE) { | |
137 | errln("ERROR: getPath() failed - %s", u_errorName(status)); | |
138 | return NULL; | |
139 | } | |
140 | ||
141 | strcpy(buffer, testDataDirectory); | |
142 | strcat(buffer, filename); | |
143 | return buffer; | |
144 | } | |
145 | ||
146 | ||
147 | void SSearchTest::searchTest() | |
148 | { | |
729e4ab9 | 149 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO |
46f4442e A |
150 | UErrorCode status = U_ZERO_ERROR; |
151 | char path[PATH_BUFFER_SIZE]; | |
152 | const char *testFilePath = getPath(path, "ssearch.xml"); | |
153 | ||
154 | if (testFilePath == NULL) { | |
155 | return; /* Couldn't get path: error message already output. */ | |
156 | } | |
157 | ||
729e4ab9 | 158 | LocalPointer<UXMLParser> parser(UXMLParser::createParser(status)); |
46f4442e | 159 | TEST_ASSERT_SUCCESS(status); |
729e4ab9 | 160 | LocalPointer<UXMLElement> root(parser->parseFile(testFilePath, status)); |
46f4442e A |
161 | TEST_ASSERT_SUCCESS(status); |
162 | if (U_FAILURE(status)) { | |
163 | return; | |
164 | } | |
165 | ||
166 | const UnicodeString *debugTestCase = root->getAttribute("debug"); | |
167 | if (debugTestCase != NULL) { | |
168 | // setenv("USEARCH_DEBUG", "1", 1); | |
169 | } | |
170 | ||
171 | ||
172 | const UXMLElement *testCase; | |
173 | int32_t tc = 0; | |
174 | ||
175 | while((testCase = root->nextChildElement(tc)) != NULL) { | |
176 | ||
177 | if (testCase->getTagName().compare("test-case") != 0) { | |
178 | errln("ssearch, unrecognized XML Element in test file"); | |
179 | continue; | |
180 | } | |
181 | const UnicodeString *id = testCase->getAttribute("id"); | |
182 | *testId = 0; | |
183 | if (id != NULL) { | |
184 | id->extract(0, id->length(), testId, sizeof(testId), US_INV); | |
185 | } | |
186 | ||
187 | // If debugging test case has been specified and this is not it, skip to next. | |
188 | if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) { | |
189 | continue; | |
190 | } | |
191 | // | |
192 | // Get the requested collation strength. | |
193 | // Default is tertiary if the XML attribute is missing from the test case. | |
194 | // | |
195 | const UnicodeString *strength = testCase->getAttribute("strength"); | |
729e4ab9 | 196 | UColAttributeValue collatorStrength = UCOL_PRIMARY; |
46f4442e A |
197 | if (strength==NULL) { collatorStrength = UCOL_TERTIARY;} |
198 | else if (*strength=="PRIMARY") { collatorStrength = UCOL_PRIMARY;} | |
199 | else if (*strength=="SECONDARY") { collatorStrength = UCOL_SECONDARY;} | |
200 | else if (*strength=="TERTIARY") { collatorStrength = UCOL_TERTIARY;} | |
201 | else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;} | |
202 | else if (*strength=="IDENTICAL") { collatorStrength = UCOL_IDENTICAL;} | |
203 | else { | |
204 | // Bogus value supplied for strength. Shouldn't happen, even from | |
205 | // typos, if the XML source has been validated. | |
206 | // This assert is a little deceiving in that strength can be | |
207 | // any of the allowed values, not just TERTIARY, but it will | |
208 | // do the job of getting the error output. | |
209 | TEST_ASSERT(*strength=="TERTIARY") | |
210 | } | |
211 | ||
212 | // | |
213 | // Get the collator normalization flag. Default is UCOL_OFF. | |
214 | // | |
215 | UColAttributeValue normalize = UCOL_OFF; | |
216 | const UnicodeString *norm = testCase->getAttribute("norm"); | |
217 | TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF"); | |
218 | if (norm!=NULL && *norm=="ON") { | |
219 | normalize = UCOL_ON; | |
220 | } | |
221 | ||
729e4ab9 A |
222 | // |
223 | // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE. | |
224 | // | |
225 | UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE; | |
226 | const UnicodeString *alt = testCase->getAttribute("alternate_handling"); | |
227 | TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE"); | |
228 | if (alt != NULL && *alt == "SHIFTED") { | |
229 | alternateHandling = UCOL_SHIFTED; | |
230 | } | |
231 | ||
46f4442e A |
232 | const UnicodeString defLocale("en"); |
233 | char clocale[100]; | |
234 | const UnicodeString *locale = testCase->getAttribute("locale"); | |
235 | if (locale == NULL || locale->length()==0) { | |
236 | locale = &defLocale; | |
237 | }; | |
238 | locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL); | |
239 | ||
240 | ||
241 | UnicodeString text; | |
242 | UnicodeString target; | |
243 | UnicodeString pattern; | |
244 | int32_t expectedMatchStart = -1; | |
245 | int32_t expectedMatchLimit = -1; | |
246 | const UXMLElement *n; | |
729e4ab9 | 247 | int32_t nodeCount = 0; |
46f4442e A |
248 | |
249 | n = testCase->getChildElement("pattern"); | |
250 | TEST_ASSERT(n != NULL); | |
251 | if (n==NULL) { | |
252 | continue; | |
253 | } | |
254 | text = n->getText(FALSE); | |
255 | text = text.unescape(); | |
256 | pattern.append(text); | |
257 | nodeCount++; | |
258 | ||
259 | n = testCase->getChildElement("pre"); | |
260 | if (n!=NULL) { | |
261 | text = n->getText(FALSE); | |
262 | text = text.unescape(); | |
263 | target.append(text); | |
264 | nodeCount++; | |
265 | } | |
729e4ab9 | 266 | |
46f4442e A |
267 | n = testCase->getChildElement("m"); |
268 | if (n!=NULL) { | |
269 | expectedMatchStart = target.length(); | |
270 | text = n->getText(FALSE); | |
271 | text = text.unescape(); | |
272 | target.append(text); | |
273 | expectedMatchLimit = target.length(); | |
274 | nodeCount++; | |
275 | } | |
276 | ||
277 | n = testCase->getChildElement("post"); | |
278 | if (n!=NULL) { | |
279 | text = n->getText(FALSE); | |
280 | text = text.unescape(); | |
281 | target.append(text); | |
282 | nodeCount++; | |
283 | } | |
284 | ||
285 | // Check that there weren't extra things in the XML | |
286 | TEST_ASSERT(nodeCount == testCase->countChildren()); | |
287 | ||
729e4ab9 | 288 | // Open a collator and StringSearch based on the parameters |
46f4442e A |
289 | // obtained from the XML. |
290 | // | |
291 | status = U_ZERO_ERROR; | |
729e4ab9 A |
292 | LocalUCollatorPointer collator(ucol_open(clocale, &status)); |
293 | ucol_setStrength(collator.getAlias(), collatorStrength); | |
294 | ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status); | |
295 | ucol_setAttribute(collator.getAlias(), UCOL_ALTERNATE_HANDLING, alternateHandling, &status); | |
296 | LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(), | |
297 | target.getBuffer(), target.length(), | |
298 | collator.getAlias(), | |
299 | NULL, // the break iterator | |
300 | &status)); | |
301 | ||
46f4442e A |
302 | TEST_ASSERT_SUCCESS(status); |
303 | if (U_FAILURE(status)) { | |
46f4442e A |
304 | continue; |
305 | } | |
306 | ||
307 | int32_t foundStart = 0; | |
308 | int32_t foundLimit = 0; | |
309 | UBool foundMatch; | |
310 | ||
311 | // | |
312 | // Do the search, check the match result against the expected results. | |
313 | // | |
729e4ab9 | 314 | foundMatch= usearch_search(uss.getAlias(), 0, &foundStart, &foundLimit, &status); |
46f4442e | 315 | TEST_ASSERT_SUCCESS(status); |
729e4ab9 A |
316 | if ((foundMatch && expectedMatchStart<0) || |
317 | (foundStart != expectedMatchStart) || | |
318 | (foundLimit != expectedMatchLimit)) { | |
46f4442e A |
319 | TEST_ASSERT(FALSE); // ouput generic error position |
320 | infoln("Found, expected match start = %d, %d \n" | |
321 | "Found, expected match limit = %d, %d", | |
322 | foundStart, expectedMatchStart, foundLimit, expectedMatchLimit); | |
323 | } | |
324 | ||
325 | // In case there are other matches... | |
326 | // (should we only do this if the test case passed?) | |
327 | while (foundMatch) { | |
328 | expectedMatchStart = foundStart; | |
329 | expectedMatchLimit = foundLimit; | |
330 | ||
729e4ab9 | 331 | foundMatch = usearch_search(uss.getAlias(), foundLimit, &foundStart, &foundLimit, &status); |
46f4442e A |
332 | } |
333 | ||
729e4ab9 | 334 | uss.adoptInstead(usearch_openFromCollator(pattern.getBuffer(), pattern.length(), |
46f4442e | 335 | target.getBuffer(), target.length(), |
729e4ab9 | 336 | collator.getAlias(), |
46f4442e | 337 | NULL, |
729e4ab9 | 338 | &status)); |
46f4442e A |
339 | |
340 | // | |
341 | // Do the backwards search, check the match result against the expected results. | |
342 | // | |
729e4ab9 | 343 | foundMatch= usearch_searchBackwards(uss.getAlias(), target.length(), &foundStart, &foundLimit, &status); |
46f4442e | 344 | TEST_ASSERT_SUCCESS(status); |
729e4ab9 A |
345 | if ((foundMatch && expectedMatchStart<0) || |
346 | (foundStart != expectedMatchStart) || | |
347 | (foundLimit != expectedMatchLimit)) { | |
46f4442e A |
348 | TEST_ASSERT(FALSE); // ouput generic error position |
349 | infoln("Found, expected backwards match start = %d, %d \n" | |
350 | "Found, expected backwards match limit = %d, %d", | |
351 | foundStart, expectedMatchStart, foundLimit, expectedMatchLimit); | |
352 | } | |
729e4ab9 A |
353 | } |
354 | #endif | |
355 | } | |
356 | ||
// One entry of the udhrTest() table: a collation locale paired with the text
// file searched using that locale.
struct UdhrTestCase {
    const char *locale;  // locale ID used to open the collator
    const char *file;    // file name, resolved relative to the "udhr" test data path
};
362 | ||
363 | void SSearchTest::udhrTest() | |
364 | { | |
365 | UErrorCode status = U_ZERO_ERROR; | |
366 | char path[PATH_BUFFER_SIZE]; | |
367 | const char *udhrPath = getPath(path, "udhr"); | |
368 | ||
369 | if (udhrPath == NULL) { | |
370 | // couldn't get path: error message already output... | |
371 | return; | |
372 | } | |
373 | ||
374 | UdhrTestCase testCases[] = { | |
375 | {"en", "udhr_eng.txt"}, | |
376 | {"de", "udhr_deu_1996.txt"}, | |
377 | {"fr", "udhr_fra.txt"}, | |
378 | {"ru", "udhr_rus.txt"}, | |
379 | {"th", "udhr_tha.txt"}, | |
380 | {"ja", "udhr_jpn.txt"}, | |
381 | {"ko", "udhr_kor.txt"}, | |
382 | {"zh", "udhr_cmn_hans.txt"}, | |
383 | {"zh_Hant", "udhr_cmn_hant.txt"} | |
384 | }; | |
385 | ||
386 | int32_t testCount = ARRAY_SIZE(testCases); | |
387 | ||
388 | for (int32_t t = 0; t < testCount; t += 1) { | |
389 | int32_t len = 0; | |
390 | char *resolvedFileName = NULL; | |
391 | const char *encoding = NULL; | |
392 | UCHARBUF *ucharBuf = NULL; | |
393 | ||
394 | ucbuf_resolveFileName(udhrPath, testCases[t].file, NULL, &len, &status); | |
395 | resolvedFileName = NEW_ARRAY(char, len); | |
396 | ||
397 | if(resolvedFileName == NULL){ | |
398 | continue; | |
399 | } | |
400 | ||
401 | if(status == U_BUFFER_OVERFLOW_ERROR){ | |
402 | status = U_ZERO_ERROR; | |
403 | } | |
404 | ||
405 | ucbuf_resolveFileName(udhrPath, testCases[t].file, resolvedFileName, &len, &status); | |
406 | ucharBuf = ucbuf_open(resolvedFileName, &encoding, TRUE, FALSE, &status); | |
407 | ||
408 | DELETE_ARRAY(resolvedFileName); | |
409 | ||
410 | if(U_FAILURE(status)){ | |
411 | infoln("Could not open the input file %s. Test skipped\n", testCases[t].file); | |
412 | continue; | |
413 | } | |
414 | ||
415 | int32_t targetLen = 0; | |
416 | const UChar *target = ucbuf_getBuffer(ucharBuf, &targetLen, &status); | |
417 | ||
418 | /* The first line of the file contains the pattern */ | |
419 | int32_t start = 0, end = 0, plen = 0; | |
420 | ||
421 | for(end = start; ; end += 1) { | |
422 | UChar ch = target[end]; | |
423 | ||
424 | if (ch == 0x000A || ch == 0x000D || ch == 0x2028) { | |
425 | break; | |
426 | } | |
427 | } | |
428 | ||
429 | plen = end - start; | |
430 | ||
431 | UChar *pattern = NEW_ARRAY(UChar, plen); | |
432 | for (int32_t i = 0; i < plen; i += 1) { | |
433 | pattern[i] = target[start++]; | |
434 | } | |
435 | ||
436 | int32_t offset = 0; | |
437 | UCollator *coll = ucol_open(testCases[t].locale, &status); | |
438 | UCD *ucd = NULL; | |
439 | BMS *bms = NULL; | |
440 | ||
441 | if (U_FAILURE(status)) { | |
442 | errln("Could not open collator for %s", testCases[t].locale); | |
443 | goto delete_collator; | |
444 | } | |
445 | ||
446 | ucd = ucd_open(coll, &status); | |
447 | ||
448 | if (U_FAILURE(status)) { | |
449 | errln("Could not open CollData object for %s", testCases[t].locale); | |
450 | goto delete_ucd; | |
451 | } | |
452 | ||
453 | bms = bms_open(ucd, pattern, plen, target, targetLen, &status); | |
454 | ||
455 | if (U_FAILURE(status)) { | |
456 | errln("Could not open search object for %s", testCases[t].locale); | |
457 | goto delete_bms; | |
458 | } | |
459 | ||
460 | start = end = -1; | |
461 | while (bms_search(bms, offset, &start, &end)) { | |
462 | offset = end; | |
463 | } | |
464 | ||
465 | if (offset == 0) { | |
466 | errln("Could not find pattern - locale: %s, file: %s ", testCases[t].locale, testCases[t].file); | |
467 | } | |
468 | ||
469 | delete_bms: | |
470 | bms_close(bms); | |
471 | ||
472 | delete_ucd: | |
473 | ucd_close(ucd); | |
474 | ||
475 | delete_collator: | |
476 | ucol_close(coll); | |
477 | ||
478 | DELETE_ARRAY(pattern); | |
479 | ucbuf_close(ucharBuf); | |
480 | } | |
481 | ||
482 | ucd_flushCache(); | |
483 | } | |
484 | ||
485 | void SSearchTest::bmSearchTest() | |
486 | { | |
487 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS | |
488 | UErrorCode status = U_ZERO_ERROR; | |
489 | char path[PATH_BUFFER_SIZE]; | |
490 | const char *testFilePath = getPath(path, "ssearch.xml"); | |
491 | ||
492 | if (testFilePath == NULL) { | |
493 | return; /* Couldn't get path: error message already output. */ | |
494 | } | |
495 | ||
496 | UXMLParser *parser = UXMLParser::createParser(status); | |
497 | TEST_ASSERT_SUCCESS(status); | |
498 | UXMLElement *root = parser->parseFile(testFilePath, status); | |
499 | TEST_ASSERT_SUCCESS(status); | |
500 | if (U_FAILURE(status)) { | |
501 | return; | |
502 | } | |
503 | ||
504 | const UnicodeString *debugTestCase = root->getAttribute("debug"); | |
505 | if (debugTestCase != NULL) { | |
506 | // setenv("USEARCH_DEBUG", "1", 1); | |
507 | } | |
508 | ||
509 | ||
510 | const UXMLElement *testCase; | |
511 | int32_t tc = 0; | |
512 | ||
513 | while((testCase = root->nextChildElement(tc)) != NULL) { | |
514 | ||
515 | if (testCase->getTagName().compare("test-case") != 0) { | |
516 | errln("ssearch, unrecognized XML Element in test file"); | |
517 | continue; | |
518 | } | |
519 | const UnicodeString *id = testCase->getAttribute("id"); | |
520 | *testId = 0; | |
521 | if (id != NULL) { | |
522 | id->extract(0, id->length(), testId, sizeof(testId), US_INV); | |
523 | } | |
524 | ||
525 | // If debugging test case has been specified and this is not it, skip to next. | |
526 | if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) { | |
527 | continue; | |
528 | } | |
529 | // | |
530 | // Get the requested collation strength. | |
531 | // Default is tertiary if the XML attribute is missing from the test case. | |
532 | // | |
533 | const UnicodeString *strength = testCase->getAttribute("strength"); | |
534 | UColAttributeValue collatorStrength = UCOL_PRIMARY; | |
535 | if (strength==NULL) { collatorStrength = UCOL_TERTIARY;} | |
536 | else if (*strength=="PRIMARY") { collatorStrength = UCOL_PRIMARY;} | |
537 | else if (*strength=="SECONDARY") { collatorStrength = UCOL_SECONDARY;} | |
538 | else if (*strength=="TERTIARY") { collatorStrength = UCOL_TERTIARY;} | |
539 | else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;} | |
540 | else if (*strength=="IDENTICAL") { collatorStrength = UCOL_IDENTICAL;} | |
541 | else { | |
542 | // Bogus value supplied for strength. Shouldn't happen, even from | |
543 | // typos, if the XML source has been validated. | |
544 | // This assert is a little deceiving in that strength can be | |
545 | // any of the allowed values, not just TERTIARY, but it will | |
546 | // do the job of getting the error output. | |
547 | TEST_ASSERT(*strength=="TERTIARY") | |
548 | } | |
549 | ||
550 | // | |
551 | // Get the collator normalization flag. Default is UCOL_OFF. | |
552 | // | |
553 | UColAttributeValue normalize = UCOL_OFF; | |
554 | const UnicodeString *norm = testCase->getAttribute("norm"); | |
555 | TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF"); | |
556 | if (norm!=NULL && *norm=="ON") { | |
557 | normalize = UCOL_ON; | |
558 | } | |
559 | ||
560 | // | |
561 | // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE. | |
562 | // | |
563 | UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE; | |
564 | const UnicodeString *alt = testCase->getAttribute("alternate_handling"); | |
565 | TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE"); | |
566 | if (alt != NULL && *alt == "SHIFTED") { | |
567 | alternateHandling = UCOL_SHIFTED; | |
568 | } | |
569 | ||
570 | const UnicodeString defLocale("en"); | |
571 | char clocale[100]; | |
572 | const UnicodeString *locale = testCase->getAttribute("locale"); | |
573 | if (locale == NULL || locale->length()==0) { | |
574 | locale = &defLocale; | |
575 | }; | |
576 | locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL); | |
577 | ||
578 | ||
579 | UnicodeString text; | |
580 | UnicodeString target; | |
581 | UnicodeString pattern; | |
582 | int32_t expectedMatchStart = -1; | |
583 | int32_t expectedMatchLimit = -1; | |
584 | const UXMLElement *n; | |
585 | int32_t nodeCount = 0; | |
586 | ||
587 | n = testCase->getChildElement("pattern"); | |
588 | TEST_ASSERT(n != NULL); | |
589 | if (n==NULL) { | |
590 | continue; | |
591 | } | |
592 | text = n->getText(FALSE); | |
593 | text = text.unescape(); | |
594 | pattern.append(text); | |
595 | nodeCount++; | |
596 | ||
597 | n = testCase->getChildElement("pre"); | |
598 | if (n!=NULL) { | |
599 | text = n->getText(FALSE); | |
600 | text = text.unescape(); | |
601 | target.append(text); | |
602 | nodeCount++; | |
603 | } | |
604 | ||
605 | n = testCase->getChildElement("m"); | |
606 | if (n!=NULL) { | |
607 | expectedMatchStart = target.length(); | |
608 | text = n->getText(FALSE); | |
609 | text = text.unescape(); | |
610 | target.append(text); | |
611 | expectedMatchLimit = target.length(); | |
612 | nodeCount++; | |
613 | } | |
614 | ||
615 | n = testCase->getChildElement("post"); | |
616 | if (n!=NULL) { | |
617 | text = n->getText(FALSE); | |
618 | text = text.unescape(); | |
619 | target.append(text); | |
620 | nodeCount++; | |
621 | } | |
622 | ||
623 | // Check that there weren't extra things in the XML | |
624 | TEST_ASSERT(nodeCount == testCase->countChildren()); | |
625 | ||
626 | // Open a collator and StringSearch based on the parameters | |
627 | // obtained from the XML. | |
628 | // | |
629 | status = U_ZERO_ERROR; | |
630 | UCollator *collator = ucol_open(clocale, &status); | |
631 | ucol_setStrength(collator, collatorStrength); | |
632 | ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, normalize, &status); | |
633 | ucol_setAttribute(collator, UCOL_ALTERNATE_HANDLING, alternateHandling, &status); | |
634 | UCD *ucd = ucd_open(collator, &status); | |
635 | BMS *bms = bms_open(ucd, pattern.getBuffer(), pattern.length(), target.getBuffer(), target.length(), &status); | |
636 | ||
637 | TEST_ASSERT_SUCCESS(status); | |
638 | if (U_FAILURE(status)) { | |
639 | bms_close(bms); | |
640 | ucd_close(ucd); | |
641 | ucol_close(collator); | |
642 | continue; | |
643 | } | |
644 | ||
645 | int32_t foundStart = 0; | |
646 | int32_t foundLimit = 0; | |
647 | UBool foundMatch; | |
648 | ||
649 | // | |
650 | // Do the search, check the match result against the expected results. | |
651 | // | |
652 | foundMatch = bms_search(bms, 0, &foundStart, &foundLimit); | |
653 | //TEST_ASSERT_SUCCESS(status); | |
654 | if ((foundMatch && expectedMatchStart < 0) || | |
655 | (foundStart != expectedMatchStart) || | |
656 | (foundLimit != expectedMatchLimit)) { | |
657 | TEST_ASSERT(FALSE); // ouput generic error position | |
658 | infoln("Found, expected match start = %d, %d \n" | |
659 | "Found, expected match limit = %d, %d", | |
660 | foundStart, expectedMatchStart, foundLimit, expectedMatchLimit); | |
661 | } | |
46f4442e | 662 | |
729e4ab9 A |
663 | bms_close(bms); |
664 | ucd_close(ucd); | |
46f4442e A |
665 | ucol_close(collator); |
666 | } | |
667 | ||
729e4ab9 | 668 | ucd_flushCache(); |
46f4442e A |
669 | delete root; |
670 | delete parser; | |
671 | #endif | |
672 | } | |
673 | ||
// One collation element together with the string offsets recorded immediately
// before and after it was fetched from the collation element iterator.
struct Order {
    int32_t order;       // collation element value (masked to the collator strength)
    int32_t lowOffset;   // iterator offset before the element was read
    int32_t highOffset;  // iterator offset after the element was read
};
680 | ||
681 | class OrderList | |
682 | { | |
683 | public: | |
684 | OrderList(); | |
685 | OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset = 0); | |
686 | ~OrderList(); | |
687 | ||
688 | int32_t size(void) const; | |
689 | void add(int32_t order, int32_t low, int32_t high); | |
690 | const Order *get(int32_t index) const; | |
691 | int32_t getLowOffset(int32_t index) const; | |
692 | int32_t getHighOffset(int32_t index) const; | |
693 | int32_t getOrder(int32_t index) const; | |
694 | void reverse(void); | |
695 | UBool compare(const OrderList &other) const; | |
696 | UBool matchesAt(int32_t offset, const OrderList &other) const; | |
697 | ||
698 | private: | |
699 | Order *list; | |
700 | int32_t listMax; | |
701 | int32_t listSize; | |
702 | }; | |
703 | ||
704 | OrderList::OrderList() | |
729e4ab9 | 705 | : list(NULL), listMax(16), listSize(0) |
46f4442e A |
706 | { |
707 | list = new Order[listMax]; | |
708 | } | |
709 | ||
710 | OrderList::OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset) | |
711 | : list(NULL), listMax(16), listSize(0) | |
712 | { | |
713 | UErrorCode status = U_ZERO_ERROR; | |
714 | UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status); | |
715 | uint32_t strengthMask = 0; | |
716 | int32_t order, low, high; | |
717 | ||
729e4ab9 | 718 | switch (ucol_getStrength(coll)) |
46f4442e A |
719 | { |
720 | default: | |
721 | strengthMask |= UCOL_TERTIARYORDERMASK; | |
722 | /* fall through */ | |
723 | ||
724 | case UCOL_SECONDARY: | |
725 | strengthMask |= UCOL_SECONDARYORDERMASK; | |
726 | /* fall through */ | |
727 | ||
728 | case UCOL_PRIMARY: | |
729 | strengthMask |= UCOL_PRIMARYORDERMASK; | |
730 | } | |
731 | ||
732 | list = new Order[listMax]; | |
733 | ||
734 | ucol_setOffset(elems, stringOffset, &status); | |
735 | ||
736 | do { | |
737 | low = ucol_getOffset(elems); | |
738 | order = ucol_next(elems, &status); | |
739 | high = ucol_getOffset(elems); | |
740 | ||
741 | if (order != UCOL_NULLORDER) { | |
742 | order &= strengthMask; | |
743 | } | |
744 | ||
745 | if (order != UCOL_IGNORABLE) { | |
746 | add(order, low, high); | |
747 | } | |
748 | } while (order != UCOL_NULLORDER); | |
749 | ||
750 | ucol_closeElements(elems); | |
751 | } | |
752 | ||
753 | OrderList::~OrderList() | |
754 | { | |
755 | delete[] list; | |
756 | } | |
757 | ||
758 | void OrderList::add(int32_t order, int32_t low, int32_t high) | |
759 | { | |
760 | if (listSize >= listMax) { | |
761 | listMax *= 2; | |
762 | ||
763 | Order *newList = new Order[listMax]; | |
764 | ||
765 | uprv_memcpy(newList, list, listSize * sizeof(Order)); | |
766 | delete[] list; | |
767 | list = newList; | |
768 | } | |
769 | ||
770 | list[listSize].order = order; | |
771 | list[listSize].lowOffset = low; | |
772 | list[listSize].highOffset = high; | |
773 | ||
774 | listSize += 1; | |
775 | } | |
776 | ||
777 | const Order *OrderList::get(int32_t index) const | |
778 | { | |
779 | if (index >= listSize) { | |
780 | return NULL; | |
781 | } | |
782 | ||
783 | return &list[index]; | |
784 | } | |
785 | ||
786 | int32_t OrderList::getLowOffset(int32_t index) const | |
787 | { | |
788 | const Order *order = get(index); | |
789 | ||
790 | if (order != NULL) { | |
791 | return order->lowOffset; | |
792 | } | |
793 | ||
794 | return -1; | |
795 | } | |
796 | ||
797 | int32_t OrderList::getHighOffset(int32_t index) const | |
798 | { | |
799 | const Order *order = get(index); | |
800 | ||
801 | if (order != NULL) { | |
802 | return order->highOffset; | |
803 | } | |
804 | ||
805 | return -1; | |
806 | } | |
807 | ||
808 | int32_t OrderList::getOrder(int32_t index) const | |
809 | { | |
810 | const Order *order = get(index); | |
811 | ||
812 | if (order != NULL) { | |
813 | return order->order; | |
814 | } | |
815 | ||
816 | return UCOL_NULLORDER; | |
817 | } | |
818 | ||
819 | int32_t OrderList::size() const | |
820 | { | |
821 | return listSize; | |
822 | } | |
823 | ||
824 | void OrderList::reverse() | |
825 | { | |
826 | for(int32_t f = 0, b = listSize - 1; f < b; f += 1, b -= 1) { | |
827 | Order swap = list[b]; | |
828 | ||
829 | list[b] = list[f]; | |
830 | list[f] = swap; | |
831 | } | |
832 | } | |
833 | ||
834 | UBool OrderList::compare(const OrderList &other) const | |
835 | { | |
836 | if (listSize != other.listSize) { | |
837 | return FALSE; | |
838 | } | |
839 | ||
840 | for(int32_t i = 0; i < listSize; i += 1) { | |
841 | if (list[i].order != other.list[i].order || | |
842 | list[i].lowOffset != other.list[i].lowOffset || | |
843 | list[i].highOffset != other.list[i].highOffset) { | |
844 | return FALSE; | |
845 | } | |
846 | } | |
847 | ||
848 | return TRUE; | |
849 | } | |
850 | ||
851 | UBool OrderList::matchesAt(int32_t offset, const OrderList &other) const | |
852 | { | |
853 | // NOTE: sizes include the NULLORDER, which we don't want to compare. | |
854 | int32_t otherSize = other.size() - 1; | |
855 | ||
856 | if (listSize - 1 - offset < otherSize) { | |
857 | return FALSE; | |
858 | } | |
859 | ||
860 | for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) { | |
861 | if (getOrder(i) != other.getOrder(j)) { | |
862 | return FALSE; | |
863 | } | |
864 | } | |
865 | ||
866 | return TRUE; | |
867 | } | |
868 | ||
869 | static char *printOffsets(char *buffer, OrderList &list) | |
870 | { | |
871 | int32_t size = list.size(); | |
872 | char *s = buffer; | |
873 | ||
874 | for(int32_t i = 0; i < size; i += 1) { | |
875 | const Order *order = list.get(i); | |
876 | ||
877 | if (i != 0) { | |
878 | s += sprintf(s, ", "); | |
879 | } | |
880 | ||
881 | s += sprintf(s, "(%d, %d)", order->lowOffset, order->highOffset); | |
882 | } | |
883 | ||
884 | return buffer; | |
885 | } | |
886 | ||
887 | static char *printOrders(char *buffer, OrderList &list) | |
888 | { | |
889 | int32_t size = list.size(); | |
890 | char *s = buffer; | |
891 | ||
892 | for(int32_t i = 0; i < size; i += 1) { | |
893 | const Order *order = list.get(i); | |
894 | ||
895 | if (i != 0) { | |
896 | s += sprintf(s, ", "); | |
897 | } | |
898 | ||
899 | s += sprintf(s, "%8.8X", order->order); | |
900 | } | |
901 | ||
902 | return buffer; | |
903 | } | |
904 | ||
905 | void SSearchTest::offsetTest() | |
906 | { | |
907 | const char *test[] = { | |
729e4ab9 A |
908 | // The sequence \u0FB3\u0F71\u0F71\u0F80 contains a discontiguous |
909 | // contraction (\u0FB3\u0F71\u0F80) logically followed by \u0F71. | |
910 | "\\u1E33\\u0FB3\\u0F71\\u0F71\\u0F80\\uD835\\uDF6C\\u01B0", | |
911 | ||
46f4442e A |
912 | "\\ua191\\u16ef\\u2036\\u017a", |
913 | ||
914 | #if 0 | |
915 | // This results in a complex interaction between contraction, | |
916 | // expansion and normalization that confuses the backwards offset fixups. | |
917 | "\\u0F7F\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85", | |
918 | #endif | |
919 | ||
920 | "\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85", | |
921 | "\\u07E9\\u07EA\\u07F1\\u07F2\\u07F3", | |
922 | ||
923 | "\\u02FE\\u02FF" | |
924 | "\\u0300\\u0301\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u0309\\u030A\\u030B\\u030C\\u030D\\u030E\\u030F" | |
925 | "\\u0310\\u0311\\u0312\\u0313\\u0314\\u0315\\u0316\\u0317\\u0318\\u0319\\u031A\\u031B\\u031C\\u031D\\u031E\\u031F" | |
926 | "\\u0320\\u0321\\u0322\\u0323\\u0324\\u0325\\u0326\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F" | |
927 | "\\u0330\\u0331\\u0332\\u0333\\u0334\\u0335\\u0336\\u0337\\u0338\\u0339\\u033A\\u033B\\u033C\\u033D\\u033E\\u033F" | |
729e4ab9 | 928 | "\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\\u034A\\u034B\\u034C\\u034D\\u034E", // currently not working, see #8081 |
46f4442e | 929 | |
729e4ab9 A |
930 | "\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318", // currently not working, see #8081 |
931 | "a\\u02FF\\u0301\\u0316", // currently not working, see #8081 | |
932 | "a\\u02FF\\u0316\\u0301", | |
933 | "a\\u0430\\u0301\\u0316", | |
934 | "a\\u0430\\u0316\\u0301", | |
46f4442e | 935 | "abc\\u0E41\\u0301\\u0316", |
729e4ab9 A |
936 | "abc\\u0E41\\u0316\\u0301", |
937 | "\\u0E41\\u0301\\u0316", | |
938 | "\\u0E41\\u0316\\u0301", | |
939 | "a\\u0301\\u0316", | |
940 | "a\\u0316\\u0301", | |
941 | "\\uAC52\\uAC53", | |
942 | "\\u34CA\\u34CB", | |
943 | "\\u11ED\\u11EE", | |
944 | "\\u30C3\\u30D0", | |
945 | "p\\u00E9ch\\u00E9", | |
46f4442e A |
946 | "a\\u0301\\u0325", |
947 | "a\\u0300\\u0325", | |
948 | "a\\u0325\\u0300", | |
949 | "A\\u0323\\u0300B", | |
950 | "A\\u0300\\u0323B", | |
951 | "A\\u0301\\u0323B", | |
952 | "A\\u0302\\u0301\\u0323B", | |
953 | "abc", | |
954 | "ab\\u0300c", | |
955 | "ab\\u0300\\u0323c", | |
956 | " \\uD800\\uDC00\\uDC00", | |
957 | "a\\uD800\\uDC00\\uDC00", | |
958 | "A\\u0301\\u0301", | |
959 | "A\\u0301\\u0323", | |
960 | "A\\u0301\\u0323B", | |
961 | "B\\u0301\\u0323C", | |
962 | "A\\u0300\\u0323B", | |
963 | "\\u0301A\\u0301\\u0301", | |
964 | "abcd\\r\\u0301", | |
965 | "p\\u00EAche", | |
966 | "pe\\u0302che", | |
967 | }; | |
968 | ||
969 | int32_t testCount = ARRAY_SIZE(test); | |
970 | UErrorCode status = U_ZERO_ERROR; | |
971 | RuleBasedCollator *col = (RuleBasedCollator *) Collator::createInstance(Locale::getEnglish(), status); | |
972 | if (U_FAILURE(status)) { | |
729e4ab9 | 973 | errcheckln(status, "Failed to create collator in offsetTest! - %s", u_errorName(status)); |
46f4442e A |
974 | return; |
975 | } | |
976 | char buffer[4096]; // A bit of a hack... just happens to be long enough for all the test cases... | |
977 | // We could allocate one that's the right size by (CE_count * 10) + 2 | |
978 | // 10 chars is enough room for 8 hex digits plus ", ". 2 extra chars for "[" and "]" | |
979 | ||
980 | col->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status); | |
981 | ||
982 | for(int32_t i = 0; i < testCount; i += 1) { | |
4388f060 | 983 | if (!isICUVersionAtLeast(50, 0) && i>=4 && i<=6) { |
729e4ab9 A |
984 | continue; // timebomb until ticket #8080 is resolved |
985 | } | |
46f4442e A |
986 | UnicodeString ts = CharsToUnicodeString(test[i]); |
987 | CollationElementIterator *iter = col->createCollationElementIterator(ts); | |
988 | OrderList forwardList; | |
989 | OrderList backwardList; | |
990 | int32_t order, low, high; | |
991 | ||
992 | do { | |
993 | low = iter->getOffset(); | |
994 | order = iter->next(status); | |
995 | high = iter->getOffset(); | |
996 | ||
997 | forwardList.add(order, low, high); | |
998 | } while (order != CollationElementIterator::NULLORDER); | |
999 | ||
1000 | iter->reset(); | |
1001 | iter->setOffset(ts.length(), status); | |
1002 | ||
1003 | backwardList.add(CollationElementIterator::NULLORDER, iter->getOffset(), iter->getOffset()); | |
1004 | ||
1005 | do { | |
1006 | high = iter->getOffset(); | |
1007 | order = iter->previous(status); | |
1008 | low = iter->getOffset(); | |
1009 | ||
1010 | if (order == CollationElementIterator::NULLORDER) { | |
1011 | break; | |
1012 | } | |
1013 | ||
1014 | backwardList.add(order, low, high); | |
1015 | } while (TRUE); | |
1016 | ||
1017 | backwardList.reverse(); | |
1018 | ||
1019 | if (forwardList.compare(backwardList)) { | |
1020 | logln("Works with \"%s\"", test[i]); | |
1021 | logln("Forward offsets: [%s]", printOffsets(buffer, forwardList)); | |
1022 | // logln("Backward offsets: [%s]", printOffsets(buffer, backwardList)); | |
1023 | ||
1024 | logln("Forward CEs: [%s]", printOrders(buffer, forwardList)); | |
1025 | // logln("Backward CEs: [%s]", printOrders(buffer, backwardList)); | |
1026 | ||
1027 | logln(); | |
1028 | } else { | |
1029 | errln("Fails with \"%s\"", test[i]); | |
1030 | infoln("Forward offsets: [%s]", printOffsets(buffer, forwardList)); | |
1031 | infoln("Backward offsets: [%s]", printOffsets(buffer, backwardList)); | |
1032 | ||
1033 | infoln("Forward CEs: [%s]", printOrders(buffer, forwardList)); | |
1034 | infoln("Backward CEs: [%s]", printOrders(buffer, backwardList)); | |
1035 | ||
1036 | infoln(); | |
1037 | } | |
1038 | delete iter; | |
1039 | } | |
1040 | delete col; | |
1041 | } | |
1042 | ||
729e4ab9 A |
1043 | #if 0 |
1044 | static UnicodeString &escape(const UnicodeString &string, UnicodeString &buffer) | |
46f4442e | 1045 | { |
729e4ab9 A |
1046 | for(int32_t i = 0; i < string.length(); i += 1) { |
1047 | UChar32 ch = string.char32At(i); | |
46f4442e | 1048 | |
729e4ab9 A |
1049 | if (ch >= 0x0020 && ch <= 0x007F) { |
1050 | if (ch == 0x005C) { | |
1051 | buffer.append("\\\\"); | |
1052 | } else { | |
1053 | buffer.append(ch); | |
1054 | } | |
1055 | } else { | |
1056 | char cbuffer[12]; | |
46f4442e | 1057 | |
729e4ab9 A |
1058 | if (ch <= 0xFFFFL) { |
1059 | sprintf(cbuffer, "\\u%4.4X", ch); | |
1060 | } else { | |
1061 | sprintf(cbuffer, "\\U%8.8X", ch); | |
1062 | } | |
46f4442e | 1063 | |
729e4ab9 | 1064 | buffer.append(cbuffer); |
46f4442e A |
1065 | } |
1066 | ||
729e4ab9 A |
1067 | if (ch >= 0x10000L) { |
1068 | i += 1; | |
1069 | } | |
46f4442e A |
1070 | } |
1071 | ||
729e4ab9 | 1072 | return buffer; |
46f4442e | 1073 | } |
729e4ab9 | 1074 | #endif |
46f4442e | 1075 | |
729e4ab9 | 1076 | #if 1 |
46f4442e | 1077 | |
/**
 * One processed collation element together with the offset range reported
 * for it by the collation element iterator.
 */
struct PCE
{
    uint64_t ce;          // the processed collation element value
    int32_t  lowOffset;   // low end of the reported offset range
    int32_t  highOffset;  // high end of the reported offset range
};
46f4442e | 1084 | |
729e4ab9 | 1085 | class PCEList |
46f4442e | 1086 | { |
729e4ab9 A |
1087 | public: |
1088 | PCEList(UCollator *coll, const UnicodeString &string); | |
1089 | ~PCEList(); | |
46f4442e | 1090 | |
729e4ab9 | 1091 | int32_t size() const; |
46f4442e | 1092 | |
729e4ab9 | 1093 | const PCE *get(int32_t index) const; |
46f4442e | 1094 | |
729e4ab9 A |
1095 | int32_t getLowOffset(int32_t index) const; |
1096 | int32_t getHighOffset(int32_t index) const; | |
1097 | uint64_t getOrder(int32_t index) const; | |
46f4442e | 1098 | |
729e4ab9 | 1099 | UBool matchesAt(int32_t offset, const PCEList &other) const; |
46f4442e | 1100 | |
729e4ab9 | 1101 | uint64_t operator[](int32_t index) const; |
46f4442e A |
1102 | |
1103 | private: | |
729e4ab9 A |
1104 | void add(uint64_t ce, int32_t low, int32_t high); |
1105 | ||
1106 | PCE *list; | |
46f4442e A |
1107 | int32_t listMax; |
1108 | int32_t listSize; | |
1109 | }; | |
1110 | ||
729e4ab9 | 1111 | PCEList::PCEList(UCollator *coll, const UnicodeString &string) |
46f4442e | 1112 | { |
729e4ab9 A |
1113 | UErrorCode status = U_ZERO_ERROR; |
1114 | UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status); | |
1115 | uint64_t order; | |
1116 | int32_t low, high; | |
1117 | ||
1118 | list = new PCE[listMax]; | |
1119 | ||
1120 | ucol_setOffset(elems, 0, &status); | |
1121 | ||
1122 | do { | |
1123 | order = ucol_nextProcessed(elems, &low, &high, &status); | |
1124 | add(order, low, high); | |
1125 | } while (order != UCOL_PROCESSED_NULLORDER); | |
1126 | ||
1127 | ucol_closeElements(elems); | |
46f4442e A |
1128 | } |
1129 | ||
729e4ab9 | 1130 | PCEList::~PCEList() |
46f4442e | 1131 | { |
729e4ab9 | 1132 | delete[] list; |
46f4442e A |
1133 | } |
1134 | ||
729e4ab9 | 1135 | void PCEList::add(uint64_t order, int32_t low, int32_t high) |
46f4442e A |
1136 | { |
1137 | if (listSize >= listMax) { | |
1138 | listMax *= 2; | |
1139 | ||
729e4ab9 | 1140 | PCE *newList = new PCE[listMax]; |
46f4442e | 1141 | |
729e4ab9 A |
1142 | uprv_memcpy(newList, list, listSize * sizeof(Order)); |
1143 | delete[] list; | |
1144 | list = newList; | |
46f4442e A |
1145 | } |
1146 | ||
729e4ab9 A |
1147 | list[listSize].ce = order; |
1148 | list[listSize].lowOffset = low; | |
1149 | list[listSize].highOffset = high; | |
46f4442e | 1150 | |
729e4ab9 | 1151 | listSize += 1; |
46f4442e A |
1152 | } |
1153 | ||
729e4ab9 | 1154 | const PCE *PCEList::get(int32_t index) const |
46f4442e | 1155 | { |
729e4ab9 A |
1156 | if (index >= listSize) { |
1157 | return NULL; | |
46f4442e A |
1158 | } |
1159 | ||
729e4ab9 | 1160 | return &list[index]; |
46f4442e A |
1161 | } |
1162 | ||
729e4ab9 | 1163 | int32_t PCEList::getLowOffset(int32_t index) const |
46f4442e | 1164 | { |
729e4ab9 A |
1165 | const PCE *pce = get(index); |
1166 | ||
1167 | if (pce != NULL) { | |
1168 | return pce->lowOffset; | |
1169 | } | |
1170 | ||
1171 | return -1; | |
46f4442e A |
1172 | } |
1173 | ||
729e4ab9 | 1174 | int32_t PCEList::getHighOffset(int32_t index) const |
46f4442e | 1175 | { |
729e4ab9 | 1176 | const PCE *pce = get(index); |
46f4442e | 1177 | |
729e4ab9 A |
1178 | if (pce != NULL) { |
1179 | return pce->highOffset; | |
1180 | } | |
46f4442e | 1181 | |
729e4ab9 A |
1182 | return -1; |
1183 | } | |
46f4442e | 1184 | |
729e4ab9 | 1185 | uint64_t PCEList::getOrder(int32_t index) const |
46f4442e | 1186 | { |
729e4ab9 | 1187 | const PCE *pce = get(index); |
46f4442e | 1188 | |
729e4ab9 A |
1189 | if (pce != NULL) { |
1190 | return pce->ce; | |
1191 | } | |
46f4442e | 1192 | |
729e4ab9 | 1193 | return UCOL_PROCESSED_NULLORDER; |
46f4442e A |
1194 | } |
1195 | ||
729e4ab9 | 1196 | int32_t PCEList::size() const |
46f4442e | 1197 | { |
729e4ab9 | 1198 | return listSize; |
46f4442e A |
1199 | } |
1200 | ||
729e4ab9 | 1201 | UBool PCEList::matchesAt(int32_t offset, const PCEList &other) const |
46f4442e | 1202 | { |
729e4ab9 A |
1203 | // NOTE: sizes include the NULLORDER, which we don't want to compare. |
1204 | int32_t otherSize = other.size() - 1; | |
46f4442e | 1205 | |
729e4ab9 A |
1206 | if (listSize - 1 - offset < otherSize) { |
1207 | return FALSE; | |
1208 | } | |
1209 | ||
1210 | for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) { | |
1211 | if (getOrder(i) != other.getOrder(j)) { | |
1212 | return FALSE; | |
1213 | } | |
46f4442e A |
1214 | } |
1215 | ||
729e4ab9 | 1216 | return TRUE; |
46f4442e A |
1217 | } |
1218 | ||
729e4ab9 | 1219 | uint64_t PCEList::operator[](int32_t index) const |
46f4442e | 1220 | { |
729e4ab9 | 1221 | return getOrder(index); |
46f4442e A |
1222 | } |
1223 | ||
729e4ab9 | 1224 | void SSearchTest::boyerMooreTest() |
46f4442e A |
1225 | { |
1226 | UErrorCode status = U_ZERO_ERROR; | |
729e4ab9 A |
1227 | UCollator *coll = NULL; |
1228 | CollData *data = NULL; | |
1229 | const CEList* ce = NULL; | |
1230 | const CEList* ce1 = NULL; | |
1231 | UnicodeString lp = "fuss"; | |
1232 | UnicodeString sp = "fu\\u00DF"; | |
1233 | BoyerMooreSearch *longPattern = NULL; | |
1234 | BoyerMooreSearch *shortPattern = NULL; | |
1235 | UnicodeString targets[] = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball", | |
1236 | "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF", | |
1237 | "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"}; | |
1238 | int32_t start = -1, end = -1; | |
1239 | ||
1240 | coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status); | |
1241 | if (U_FAILURE(status)) { | |
1242 | errcheckln(status, "Could not open collator. - %s", u_errorName(status)); | |
1243 | return; | |
1244 | } | |
46f4442e | 1245 | |
729e4ab9 A |
1246 | data = CollData::open(coll, status); |
1247 | if (U_FAILURE(status)) { | |
1248 | errln("Could not open CollData object."); | |
1249 | goto close_data; | |
1250 | } | |
46f4442e | 1251 | |
729e4ab9 A |
1252 | data->getDynamicClassID(); |
1253 | if (U_FAILURE(status)) { | |
1254 | errln("Could not get dynamic class ID of CollData."); | |
1255 | goto close_patterns; | |
1256 | } | |
46f4442e | 1257 | |
729e4ab9 A |
1258 | data->getStaticClassID(); |
1259 | if (U_FAILURE(status)) { | |
1260 | errln("Could not get static class ID of CollData."); | |
1261 | goto close_patterns; | |
1262 | } | |
46f4442e | 1263 | |
729e4ab9 A |
1264 | longPattern = new BoyerMooreSearch(data, lp.unescape(), NULL, status); |
1265 | shortPattern = new BoyerMooreSearch(data, sp.unescape(), NULL, status); | |
1266 | if (U_FAILURE(status)) { | |
1267 | errln("Could not create pattern objects."); | |
1268 | goto close_patterns; | |
1269 | } | |
46f4442e | 1270 | |
729e4ab9 A |
1271 | longPattern->getBadCharacterTable(); |
1272 | shortPattern->getBadCharacterTable(); | |
1273 | if (U_FAILURE(status)) { | |
1274 | errln("Could not get bad character table."); | |
1275 | goto close_patterns; | |
1276 | } | |
46f4442e | 1277 | |
729e4ab9 A |
1278 | longPattern->getGoodSuffixTable(); |
1279 | shortPattern->getGoodSuffixTable(); | |
1280 | if (U_FAILURE(status)) { | |
1281 | errln("Could not get good suffix table."); | |
1282 | goto close_patterns; | |
1283 | } | |
46f4442e | 1284 | |
729e4ab9 A |
1285 | longPattern->getDynamicClassID(); |
1286 | shortPattern->getDynamicClassID(); | |
1287 | if (U_FAILURE(status)) { | |
1288 | errln("Could not get dynamic class ID of BoyerMooreSearch."); | |
1289 | goto close_patterns; | |
1290 | } | |
46f4442e | 1291 | |
729e4ab9 A |
1292 | longPattern->getStaticClassID(); |
1293 | shortPattern->getStaticClassID(); | |
1294 | if (U_FAILURE(status)) { | |
1295 | errln("Could not get static class ID of BoyerMooreSearch."); | |
1296 | goto close_patterns; | |
1297 | } | |
46f4442e | 1298 | |
729e4ab9 A |
1299 | longPattern->getData(); |
1300 | shortPattern->getData(); | |
1301 | if (U_FAILURE(status)) { | |
1302 | errln("Could not get collate data."); | |
1303 | goto close_patterns; | |
1304 | } | |
46f4442e | 1305 | |
729e4ab9 A |
1306 | ce = longPattern->getPatternCEs(); |
1307 | ce1 = shortPattern->getPatternCEs(); | |
1308 | if (U_FAILURE(status)) { | |
1309 | errln("Could not get pattern CEs."); | |
1310 | goto close_patterns; | |
1311 | } | |
46f4442e | 1312 | |
729e4ab9 A |
1313 | ce->getDynamicClassID(); |
1314 | ce1->getDynamicClassID(); | |
1315 | if (U_FAILURE(status)) { | |
1316 | errln("Could not get dynamic class ID of CEList."); | |
1317 | goto close_patterns; | |
1318 | } | |
46f4442e | 1319 | |
729e4ab9 A |
1320 | ce->getStaticClassID(); |
1321 | ce1->getStaticClassID(); | |
1322 | if (U_FAILURE(status)) { | |
1323 | errln("Could not get static class ID of CEList."); | |
1324 | goto close_patterns; | |
1325 | } | |
46f4442e | 1326 | |
729e4ab9 A |
1327 | if(data->minLengthInChars(ce,0) != 3){ |
1328 | errln("Minimal Length in Characters for 'data' with 'ce' was suppose to give 3."); | |
1329 | goto close_patterns; | |
1330 | } | |
46f4442e | 1331 | |
729e4ab9 A |
1332 | if(data->minLengthInChars(ce1,0) != 3){ |
1333 | errln("Minimal Length in Characters for 'data' with 'ce1' was suppose to give 3."); | |
1334 | goto close_patterns; | |
1335 | } | |
46f4442e | 1336 | |
729e4ab9 A |
1337 | for (uint32_t t = 0; t < (sizeof(targets)/sizeof(targets[0])); t += 1) { |
1338 | UnicodeString target = targets[t].unescape(); | |
46f4442e | 1339 | |
729e4ab9 A |
1340 | longPattern->setTargetString(&target, status); |
1341 | if (longPattern->search(0, start, end)) { | |
1342 | logln("Test %d: found long pattern at [%d, %d].", t, start, end); | |
1343 | } else { | |
1344 | errln("Test %d: did not find long pattern.", t); | |
1345 | } | |
46f4442e | 1346 | |
729e4ab9 A |
1347 | shortPattern->setTargetString(&target, status); |
1348 | if (shortPattern->search(0, start, end)) { | |
1349 | logln("Test %d: found short pattern at [%d, %d].", t, start, end); | |
1350 | } else { | |
1351 | errln("Test %d: did not find short pattern.", t); | |
1352 | } | |
46f4442e | 1353 | |
729e4ab9 A |
1354 | if(longPattern->empty()){ |
1355 | errln("Test %d: Long pattern should not have been empty."); | |
1356 | } | |
1357 | ||
1358 | if(shortPattern->empty()){ | |
1359 | errln("Test %d: Short pattern should not have been empty."); | |
1360 | } | |
1361 | } | |
1362 | ||
1363 | close_patterns: | |
1364 | delete shortPattern; | |
1365 | delete longPattern; | |
46f4442e | 1366 | |
729e4ab9 A |
1367 | close_data: |
1368 | CollData::close(data); | |
1369 | ucol_close(coll); | |
46f4442e A |
1370 | } |
1371 | ||
729e4ab9 | 1372 | void SSearchTest::bmsTest() |
46f4442e | 1373 | { |
46f4442e | 1374 | UErrorCode status = U_ZERO_ERROR; |
729e4ab9 A |
1375 | UCollator *coll = NULL; |
1376 | UCD *data = NULL; | |
1377 | UnicodeString lp = "fuss"; | |
1378 | UnicodeString lpu = lp.unescape(); | |
1379 | UnicodeString sp = "fu\\u00DF"; | |
1380 | UnicodeString spu = sp.unescape(); | |
1381 | BMS *longPattern = NULL; | |
1382 | BMS *shortPattern = NULL; | |
1383 | UnicodeString targets[] = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball", | |
1384 | "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF", | |
1385 | "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"}; | |
1386 | int32_t start = -1, end = -1; | |
1387 | ||
1388 | coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status); | |
1389 | if (U_FAILURE(status)) { | |
1390 | errcheckln(status, "Could not open collator. - %s", u_errorName(status)); | |
1391 | return; | |
1392 | } | |
46f4442e | 1393 | |
729e4ab9 A |
1394 | data = ucd_open(coll, &status); |
1395 | if (U_FAILURE(status)) { | |
1396 | errln("Could not open CollData object."); | |
1397 | goto close_data; | |
1398 | } | |
46f4442e | 1399 | |
729e4ab9 A |
1400 | longPattern = bms_open(data, lpu.getBuffer(), lpu.length(), NULL, 0, &status); |
1401 | shortPattern = bms_open(data, spu.getBuffer(), spu.length(), NULL, 0, &status); | |
1402 | if (U_FAILURE(status)) { | |
1403 | errln("Couldn't open pattern objects."); | |
1404 | goto close_patterns; | |
1405 | } | |
46f4442e | 1406 | |
729e4ab9 A |
1407 | for (uint32_t t = 0; t < (sizeof(targets)/sizeof(targets[0])); t += 1) { |
1408 | UnicodeString target = targets[t].unescape(); | |
46f4442e | 1409 | |
729e4ab9 A |
1410 | bms_setTargetString(longPattern, target.getBuffer(), target.length(), &status); |
1411 | if (bms_search(longPattern, 0, &start, &end)) { | |
1412 | logln("Test %d: found long pattern at [%d, %d].", t, start, end); | |
46f4442e | 1413 | } else { |
729e4ab9 | 1414 | errln("Test %d: did not find long pattern.", t); |
46f4442e | 1415 | } |
46f4442e | 1416 | |
729e4ab9 A |
1417 | bms_setTargetString(shortPattern, target.getBuffer(), target.length(), &status); |
1418 | if (bms_search(shortPattern, 0, &start, &end)) { | |
1419 | logln("Test %d: found short pattern at [%d, %d].", t, start, end); | |
46f4442e | 1420 | } else { |
729e4ab9 A |
1421 | errln("Test %d: did not find short pattern.", t); |
1422 | } | |
1423 | } | |
46f4442e | 1424 | |
729e4ab9 A |
1425 | /* Add better coverage for bms code. */ |
1426 | if(bms_empty(longPattern)) { | |
1427 | errln("FAIL: longgPattern is empty."); | |
1428 | } | |
46f4442e | 1429 | |
729e4ab9 A |
1430 | if (!bms_getData(longPattern)) { |
1431 | errln("FAIL: bms_getData returned NULL."); | |
1432 | } | |
46f4442e | 1433 | |
729e4ab9 A |
1434 | if (!ucd_getCollator(data)) { |
1435 | errln("FAIL: ucd_getCollator returned NULL."); | |
46f4442e A |
1436 | } |
1437 | ||
729e4ab9 A |
1438 | close_patterns: |
1439 | bms_close(shortPattern); | |
1440 | bms_close(longPattern); | |
1441 | ||
1442 | close_data: | |
1443 | ucd_close(data); | |
1444 | ucd_freeCache(); | |
1445 | ucol_close(coll); | |
46f4442e A |
1446 | } |
1447 | ||
729e4ab9 | 1448 | void SSearchTest::goodSuffixTest() |
46f4442e | 1449 | { |
729e4ab9 A |
1450 | UErrorCode status = U_ZERO_ERROR; |
1451 | UCollator *coll = NULL; | |
1452 | CollData *data = NULL; | |
1453 | UnicodeString pat = /*"gcagagag"*/ "fxeld"; | |
1454 | UnicodeString target = /*"gcatcgcagagagtatacagtacg"*/ "cloveldfxeld"; | |
1455 | BoyerMooreSearch *pattern = NULL; | |
1456 | int32_t start = -1, end = -1; | |
1457 | ||
1458 | coll = ucol_open(NULL, &status); | |
1459 | if (U_FAILURE(status)) { | |
1460 | errcheckln(status, "Couldn't open collator. - %s", u_errorName(status)); | |
1461 | return; | |
1462 | } | |
1463 | ||
1464 | data = CollData::open(coll, status); | |
1465 | if (U_FAILURE(status)) { | |
1466 | errln("Couldn't open CollData object."); | |
1467 | goto close_data; | |
1468 | } | |
1469 | ||
1470 | pattern = new BoyerMooreSearch(data, pat, &target, status); | |
1471 | if (U_FAILURE(status)) { | |
1472 | errln("Couldn't open pattern object."); | |
1473 | goto close_pattern; | |
46f4442e A |
1474 | } |
1475 | ||
729e4ab9 A |
1476 | if (pattern->search(0, start, end)) { |
1477 | logln("Found pattern at [%d, %d].", start, end); | |
1478 | } else { | |
1479 | errln("Did not find pattern."); | |
1480 | } | |
1481 | ||
1482 | close_pattern: | |
1483 | delete pattern; | |
1484 | ||
1485 | close_data: | |
1486 | CollData::close(data); | |
1487 | ucol_close(coll); | |
46f4442e A |
1488 | } |
1489 | ||
729e4ab9 A |
1490 | // |
1491 | // searchTime() A quick and dirty performance test for string search. | |
1492 | // Probably doesn't really belong as part of intltest, but it | |
1493 | // does check that the search succeeds, and gets the right result, | |
1494 | // so it serves as a functionality test also. | |
1495 | // | |
1496 | // To run as a perf test, up the loop count, select by commenting | |
1497 | // and uncommenting in the code the operation to be measured, | |
1498 | // rebuild, and measure the running time of this test alone. | |
1499 | // | |
1500 | // time LD_LIBRARY_PATH=whatever ./intltest collate/SSearchTest/searchTime | |
1501 | // | |
1502 | void SSearchTest::searchTime() { | |
1503 | static const char *longishText = | |
1504 | "Whylom, as olde stories tellen us,\n" | |
1505 | "Ther was a duk that highte Theseus:\n" | |
1506 | "Of Athenes he was lord and governour,\n" | |
1507 | "And in his tyme swich a conquerour,\n" | |
1508 | "That gretter was ther noon under the sonne.\n" | |
1509 | "Ful many a riche contree hadde he wonne;\n" | |
1510 | "What with his wisdom and his chivalrye,\n" | |
1511 | "He conquered al the regne of Femenye,\n" | |
1512 | "That whylom was y-cleped Scithia;\n" | |
1513 | "And weddede the quene Ipolita,\n" | |
1514 | "And broghte hir hoom with him in his contree\n" | |
1515 | "With muchel glorie and greet solempnitee,\n" | |
1516 | "And eek hir yonge suster Emelye.\n" | |
1517 | "And thus with victorie and with melodye\n" | |
1518 | "Lete I this noble duk to Athenes ryde,\n" | |
1519 | "And al his hoost, in armes, him bisyde.\n" | |
1520 | "And certes, if it nere to long to here,\n" | |
1521 | "I wolde han told yow fully the manere,\n" | |
1522 | "How wonnen was the regne of Femenye\n" | |
1523 | "By Theseus, and by his chivalrye;\n" | |
1524 | "And of the grete bataille for the nones\n" | |
1525 | "Bitwixen Athen's and Amazones;\n" | |
1526 | "And how asseged was Ipolita,\n" | |
1527 | "The faire hardy quene of Scithia;\n" | |
1528 | "And of the feste that was at hir weddinge,\n" | |
1529 | "And of the tempest at hir hoom-cominge;\n" | |
1530 | "But al that thing I moot as now forbere.\n" | |
1531 | "I have, God woot, a large feeld to ere,\n" | |
1532 | "And wayke been the oxen in my plough.\n" | |
1533 | "The remenant of the tale is long y-nough.\n" | |
1534 | "I wol nat letten eek noon of this route;\n" | |
1535 | "Lat every felawe telle his tale aboute,\n" | |
1536 | "And lat see now who shal the soper winne;\n" | |
1537 | "And ther I lefte, I wol ageyn biginne.\n" | |
1538 | "This duk, of whom I make mencioun,\n" | |
1539 | "When he was come almost unto the toun,\n" | |
1540 | "In al his wele and in his moste pryde,\n" | |
1541 | "He was war, as he caste his eye asyde,\n" | |
1542 | "Wher that ther kneled in the hye weye\n" | |
1543 | "A companye of ladies, tweye and tweye,\n" | |
1544 | "Ech after other, clad in clothes blake; \n" | |
1545 | "But swich a cry and swich a wo they make,\n" | |
1546 | "That in this world nis creature livinge,\n" | |
1547 | "That herde swich another weymentinge;\n" | |
1548 | "And of this cry they nolde never stenten,\n" | |
1549 | "Til they the reynes of his brydel henten.\n" | |
1550 | "'What folk ben ye, that at myn hoomcominge\n" | |
1551 | "Perturben so my feste with cryinge'?\n" | |
1552 | "Quod Theseus, 'have ye so greet envye\n" | |
1553 | "Of myn honour, that thus compleyne and crye? \n" | |
1554 | "Or who hath yow misboden, or offended?\n" | |
1555 | "And telleth me if it may been amended;\n" | |
1556 | "And why that ye ben clothed thus in blak'?\n" | |
1557 | "The eldest lady of hem alle spak,\n" | |
1558 | "When she hadde swowned with a deedly chere,\n" | |
1559 | "That it was routhe for to seen and here,\n" | |
1560 | "And seyde: 'Lord, to whom Fortune hath yiven\n" | |
1561 | "Victorie, and as a conquerour to liven,\n" | |
1562 | "Noght greveth us your glorie and your honour;\n" | |
1563 | "But we biseken mercy and socour.\n" | |
1564 | "Have mercy on our wo and our distresse.\n" | |
1565 | "Som drope of pitee, thurgh thy gentilesse,\n" | |
1566 | "Up-on us wrecched wommen lat thou falle.\n" | |
1567 | "For certes, lord, ther nis noon of us alle,\n" | |
1568 | "That she nath been a duchesse or a quene;\n" | |
1569 | "Now be we caitifs, as it is wel sene:\n" | |
1570 | "Thanked be Fortune, and hir false wheel,\n" | |
1571 | "That noon estat assureth to be weel.\n" | |
1572 | "And certes, lord, t'abyden your presence,\n" | |
1573 | "Here in the temple of the goddesse Clemence\n" | |
1574 | "We han ben waytinge al this fourtenight;\n" | |
1575 | "Now help us, lord, sith it is in thy might.\n" | |
1576 | "I wrecche, which that wepe and waille thus,\n" | |
1577 | "Was whylom wyf to king Capaneus,\n" | |
1578 | "That starf at Thebes, cursed be that day!\n" | |
1579 | "And alle we, that been in this array,\n" | |
1580 | "And maken al this lamentacioun,\n" | |
1581 | "We losten alle our housbondes at that toun,\n" | |
1582 | "Whyl that the sege ther-aboute lay.\n" | |
1583 | "And yet now th'olde Creon, weylaway!\n" | |
1584 | "The lord is now of Thebes the citee, \n" | |
1585 | "Fulfild of ire and of iniquitee,\n" | |
1586 | "He, for despyt, and for his tirannye,\n" | |
1587 | "To do the dede bodyes vileinye,\n" | |
1588 | "Of alle our lordes, whiche that ben slawe,\n" | |
1589 | "Hath alle the bodyes on an heep y-drawe,\n" | |
1590 | "And wol nat suffren hem, by noon assent,\n" | |
1591 | "Neither to been y-buried nor y-brent,\n" | |
1592 | "But maketh houndes ete hem in despyt. zet'\n"; | |
1593 | ||
1594 | #define TEST_BOYER_MOORE 1 | |
1595 | const char *cPattern = "maketh houndes ete hem"; | |
1596 | //const char *cPattern = "Whylom"; | |
1597 | //const char *cPattern = "zet"; | |
1598 | const char *testId = "searchTime()"; // for error macros. | |
1599 | UnicodeString target = longishText; | |
1600 | UErrorCode status = U_ZERO_ERROR; | |
46f4442e | 1601 | |
46f4442e | 1602 | |
729e4ab9 A |
1603 | LocalUCollatorPointer collator(ucol_open("en", &status)); |
1604 | CollData *data = CollData::open(collator.getAlias(), status); | |
1605 | if (U_FAILURE(status) || collator.isNull() || data == NULL) { | |
1606 | errcheckln(status, "Unable to open UCollator or CollData. - %s", u_errorName(status)); | |
1607 | return; | |
1608 | } | |
1609 | //ucol_setStrength(collator.getAlias(), collatorStrength); | |
1610 | //ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status); | |
1611 | UnicodeString uPattern = cPattern; | |
1612 | #ifndef TEST_BOYER_MOORE | |
1613 | LocalUStringSearchPointer uss(usearch_openFromCollator(uPattern.getBuffer(), uPattern.length(), | |
1614 | target.getBuffer(), target.length(), | |
1615 | collator.getAlias(), | |
1616 | NULL, // the break iterator | |
1617 | &status)); | |
1618 | TEST_ASSERT_SUCCESS(status); | |
1619 | #else | |
1620 | BoyerMooreSearch bms(data, uPattern, &target, status); | |
1621 | TEST_ASSERT_SUCCESS(status); | |
1622 | #endif | |
46f4442e | 1623 | |
729e4ab9 A |
1624 | // int32_t foundStart; |
1625 | // int32_t foundEnd; | |
1626 | UBool found; | |
1627 | ||
1628 | // Find the match position usgin strstr | |
1629 | const char *pm = strstr(longishText, cPattern); | |
1630 | TEST_ASSERT_M(pm!=NULL, "No pattern match with strstr"); | |
1631 | int32_t refMatchPos = (int32_t)(pm - longishText); | |
1632 | int32_t icuMatchPos; | |
1633 | int32_t icuMatchEnd; | |
1634 | #ifndef TEST_BOYER_MOORE | |
1635 | usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status); | |
1636 | TEST_ASSERT_SUCCESS(status); | |
1637 | #else | |
1638 | found = bms.search(0, icuMatchPos, icuMatchEnd); | |
1639 | #endif | |
1640 | TEST_ASSERT_M(refMatchPos == icuMatchPos, "strstr and icu give different match positions."); | |
46f4442e | 1641 | |
729e4ab9 | 1642 | int32_t i; |
4388f060 | 1643 | // int32_t j=0; |
729e4ab9 A |
1644 | |
1645 | // Try loopcounts around 100000 to some millions, depending on the operation, | |
1646 | // to get runtimes of at least several seconds. | |
1647 | for (i=0; i<10000; i++) { | |
1648 | #ifndef TEST_BOYER_MOORE | |
1649 | found = usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status); | |
1650 | #else | |
1651 | found = bms.search(0, icuMatchPos, icuMatchEnd); | |
1652 | #endif | |
1653 | //TEST_ASSERT_SUCCESS(status); | |
1654 | //TEST_ASSERT(found); | |
1655 | ||
1656 | // usearch_setOffset(uss.getAlias(), 0, &status); | |
1657 | // icuMatchPos = usearch_next(uss.getAlias(), &status); | |
46f4442e | 1658 | |
729e4ab9 A |
1659 | // The i+j stuff is to confuse the optimizer and get it to actually leave the |
1660 | // call to strstr in place. | |
1661 | //pm = strstr(longishText+j, cPattern); | |
1662 | //j = (j + i)%5; | |
46f4442e | 1663 | } |
729e4ab9 | 1664 | |
4388f060 | 1665 | //printf("%ld, %d\n", pm-longishText, j); |
729e4ab9 A |
1666 | #ifdef TEST_BOYER_MOORE |
1667 | CollData::close(data); | |
1668 | #endif | |
46f4442e | 1669 | } |
729e4ab9 | 1670 | #endif |
46f4442e A |
1671 | |
1672 | //---------------------------------------------------------------------------------------- | |
1673 | // | |
1674 | // Random Numbers. Similar to standard lib rand() and srand() | |
1675 | // Not using library to | |
1676 | // 1. Get same results on all platforms. | |
1677 | // 2. Get access to current seed, to more easily reproduce failures. | |
1678 | // | |
1679 | //--------------------------------------------------------------------------------------- | |
// Current generator state; each m_rand() call replaces it.
static uint32_t m_seed = 1;

/**
 * Advances the generator by one step and returns the next pseudo-random
 * value, in the range 0..32767.
 */
static uint32_t m_rand()
{
    const uint32_t multiplier = 1103515245;
    const uint32_t increment  = 12345;

    m_seed = m_seed * multiplier + increment;

    // Bits 16..30 of the new state.
    return (m_seed >> 16) & 0x7FFF;
}
1687 | ||
1688 | class Monkey | |
1689 | { | |
1690 | public: | |
1691 | virtual void append(UnicodeString &test, UnicodeString &alternate) = 0; | |
1692 | ||
1693 | protected: | |
1694 | Monkey(); | |
1695 | virtual ~Monkey(); | |
1696 | }; | |
1697 | ||
1698 | Monkey::Monkey() | |
1699 | { | |
1700 | // ook? | |
1701 | } | |
1702 | ||
1703 | Monkey::~Monkey() | |
1704 | { | |
1705 | // ook? | |
1706 | } | |
1707 | ||
1708 | class SetMonkey : public Monkey | |
1709 | { | |
1710 | public: | |
1711 | SetMonkey(const USet *theSet); | |
1712 | ~SetMonkey(); | |
1713 | ||
1714 | virtual void append(UnicodeString &test, UnicodeString &alternate); | |
1715 | ||
1716 | private: | |
1717 | const USet *set; | |
1718 | }; | |
1719 | ||
1720 | SetMonkey::SetMonkey(const USet *theSet) | |
1721 | : Monkey(), set(theSet) | |
1722 | { | |
1723 | // ook? | |
1724 | } | |
1725 | ||
1726 | SetMonkey::~SetMonkey() | |
1727 | { | |
1728 | //ook... | |
1729 | } | |
1730 | ||
1731 | void SetMonkey::append(UnicodeString &test, UnicodeString &alternate) | |
1732 | { | |
1733 | int32_t size = uset_size(set); | |
1734 | int32_t index = m_rand() % size; | |
1735 | UChar32 ch = uset_charAt(set, index); | |
1736 | UnicodeString str(ch); | |
1737 | ||
1738 | test.append(str); | |
1739 | alternate.append(str); // flip case, or some junk? | |
1740 | } | |
1741 | ||
1742 | class StringSetMonkey : public Monkey | |
1743 | { | |
1744 | public: | |
729e4ab9 | 1745 | StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData); |
46f4442e A |
1746 | ~StringSetMonkey(); |
1747 | ||
1748 | void append(UnicodeString &testCase, UnicodeString &alternate); | |
1749 | ||
1750 | private: | |
1751 | UnicodeString &generateAlternative(const UnicodeString &testCase, UnicodeString &alternate); | |
1752 | ||
1753 | const USet *set; | |
729e4ab9 A |
1754 | UCollator *coll; |
1755 | CollData *collData; | |
46f4442e A |
1756 | }; |
1757 | ||
729e4ab9 A |
1758 | StringSetMonkey::StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData) |
1759 | : Monkey(), set(theSet), coll(theCollator), collData(theCollData) | |
46f4442e A |
1760 | { |
1761 | // ook. | |
1762 | } | |
1763 | ||
1764 | StringSetMonkey::~StringSetMonkey() | |
1765 | { | |
1766 | // ook? | |
1767 | } | |
1768 | ||
1769 | void StringSetMonkey::append(UnicodeString &testCase, UnicodeString &alternate) | |
1770 | { | |
1771 | int32_t itemCount = uset_getItemCount(set), len = 0; | |
1772 | int32_t index = m_rand() % itemCount; | |
1773 | UChar32 rangeStart = 0, rangeEnd = 0; | |
1774 | UChar buffer[16]; | |
1775 | UErrorCode err = U_ZERO_ERROR; | |
1776 | ||
1777 | len = uset_getItem(set, index, &rangeStart, &rangeEnd, buffer, 16, &err); | |
1778 | ||
1779 | if (len == 0) { | |
1780 | int32_t offset = m_rand() % (rangeEnd - rangeStart + 1); | |
1781 | UChar32 ch = rangeStart + offset; | |
1782 | UnicodeString str(ch); | |
1783 | ||
1784 | testCase.append(str); | |
1785 | generateAlternative(str, alternate); | |
1786 | } else if (len > 0) { | |
1787 | // should check that len < 16... | |
1788 | UnicodeString str(buffer, len); | |
1789 | ||
1790 | testCase.append(str); | |
1791 | generateAlternative(str, alternate); | |
1792 | } else { | |
1793 | // shouldn't happen... | |
1794 | } | |
1795 | } | |
1796 | ||
1797 | UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCase, UnicodeString &alternate) | |
1798 | { | |
1799 | // find out shortest string for the longest sequence of ces. | |
1800 | // needs to be refined to use dynamic programming, but will be roughly right | |
729e4ab9 A |
1801 | UErrorCode status = U_ZERO_ERROR; |
1802 | CEList ceList(coll, testCase, status); | |
46f4442e A |
1803 | UnicodeString alt; |
1804 | int32_t offset = 0; | |
1805 | ||
1806 | if (ceList.size() == 0) { | |
1807 | return alternate.append(testCase); | |
1808 | } | |
1809 | ||
1810 | while (offset < ceList.size()) { | |
1811 | int32_t ce = ceList.get(offset); | |
729e4ab9 | 1812 | const StringList *strings = collData->getStringList(ce); |
46f4442e A |
1813 | |
1814 | if (strings == NULL) { | |
1815 | return alternate.append(testCase); | |
1816 | } | |
1817 | ||
1818 | int32_t stringCount = strings->size(); | |
1819 | int32_t tries = 0; | |
729e4ab9 | 1820 | |
46f4442e | 1821 | // find random string that generates the same CEList |
729e4ab9 A |
1822 | const CEList *ceList2 = NULL; |
1823 | const UnicodeString *string = NULL; | |
1824 | UBool matches = FALSE; | |
46f4442e A |
1825 | |
1826 | do { | |
1827 | int32_t s = m_rand() % stringCount; | |
1828 | ||
1829 | if (tries++ > stringCount) { | |
1830 | alternate.append(testCase); | |
1831 | return alternate; | |
1832 | } | |
1833 | ||
1834 | string = strings->get(s); | |
729e4ab9 A |
1835 | ceList2 = collData->getCEList(string); |
1836 | matches = ceList.matchesAt(offset, ceList2); | |
1837 | ||
1838 | if (! matches) { | |
1839 | collData->freeCEList((CEList *) ceList2); | |
1840 | } | |
1841 | } while (! matches); | |
46f4442e A |
1842 | |
1843 | alt.append(*string); | |
1844 | offset += ceList2->size(); | |
729e4ab9 | 1845 | collData->freeCEList(ceList2); |
46f4442e A |
1846 | } |
1847 | ||
729e4ab9 | 1848 | const CEList altCEs(coll, alt, status); |
46f4442e A |
1849 | |
1850 | if (ceList.matchesAt(0, &altCEs)) { | |
1851 | return alternate.append(alt); | |
1852 | } | |
1853 | ||
1854 | return alternate.append(testCase); | |
1855 | } | |
1856 | ||
1857 | static void generateTestCase(UCollator *coll, Monkey *monkeys[], int32_t monkeyCount, UnicodeString &testCase, UnicodeString &alternate) | |
1858 | { | |
1859 | int32_t pieces = (m_rand() % 4) + 1; | |
729e4ab9 | 1860 | UErrorCode status = U_ZERO_ERROR; |
46f4442e A |
1861 | UBool matches; |
1862 | ||
1863 | do { | |
1864 | testCase.remove(); | |
1865 | alternate.remove(); | |
1866 | monkeys[0]->append(testCase, alternate); | |
1867 | ||
1868 | for(int32_t piece = 0; piece < pieces; piece += 1) { | |
1869 | int32_t monkey = m_rand() % monkeyCount; | |
1870 | ||
1871 | monkeys[monkey]->append(testCase, alternate); | |
1872 | } | |
1873 | ||
729e4ab9 A |
1874 | const CEList ceTest(coll, testCase, status); |
1875 | const CEList ceAlt(coll, alternate, status); | |
46f4442e A |
1876 | |
1877 | matches = ceTest.matchesAt(0, &ceAlt); | |
1878 | } while (! matches); | |
1879 | } | |
1880 | ||
46f4442e A |
1881 | // |
1882 | // Find the next acceptable boundary following the specified starting index | |
1883 | // in the target text being searched. | |
1884 | // TODO: refine what is an acceptable boundary. For the moment, | |
1885 | // choose the next position not within a combining sequence. | |
1886 | // | |
729e4ab9 | 1887 | #if 0 |
46f4442e A |
1888 | static int32_t nextBoundaryAfter(const UnicodeString &string, int32_t startIndex) { |
1889 | const UChar *text = string.getBuffer(); | |
1890 | int32_t textLen = string.length(); | |
729e4ab9 | 1891 | |
46f4442e A |
1892 | if (startIndex >= textLen) { |
1893 | return startIndex; | |
1894 | } | |
1895 | ||
1896 | UChar32 c; | |
1897 | int32_t i = startIndex; | |
1898 | ||
1899 | U16_NEXT(text, i, textLen, c); | |
729e4ab9 | 1900 | |
46f4442e A |
1901 | // If we are on a control character, stop without looking for combining marks. |
1902 | // Control characters do not combine. | |
1903 | int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); | |
1904 | if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) { | |
1905 | return i; | |
1906 | } | |
729e4ab9 | 1907 | |
46f4442e A |
1908 | // The initial character was not a control, and can thus accept trailing |
1909 | // combining characters. Advance over however many of them there are. | |
1910 | int32_t indexOfLastCharChecked; | |
1911 | ||
1912 | for (;;) { | |
1913 | indexOfLastCharChecked = i; | |
1914 | ||
1915 | if (i>=textLen) { | |
1916 | break; | |
1917 | } | |
1918 | ||
1919 | U16_NEXT(text, i, textLen, c); | |
1920 | gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); | |
1921 | ||
1922 | if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) { | |
1923 | break; | |
1924 | } | |
1925 | } | |
1926 | ||
1927 | return indexOfLastCharChecked; | |
1928 | } | |
729e4ab9 A |
1929 | #endif |
1930 | ||
1931 | #if 0 | |
46f4442e A |
1932 | static UBool isInCombiningSequence(const UnicodeString &string, int32_t index) { |
1933 | const UChar *text = string.getBuffer(); | |
1934 | int32_t textLen = string.length(); | |
729e4ab9 | 1935 | |
46f4442e A |
1936 | if (index>=textLen || index<=0) { |
1937 | return FALSE; | |
1938 | } | |
729e4ab9 | 1939 | |
46f4442e A |
1940 | // If the character at the current index is not a GRAPHEME_EXTEND |
1941 | // then we can not be within a combining sequence. | |
1942 | UChar32 c; | |
1943 | U16_GET(text, 0, index, textLen, c); | |
1944 | int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); | |
1945 | if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) { | |
1946 | return FALSE; | |
1947 | } | |
729e4ab9 | 1948 | |
46f4442e A |
1949 | // We are at a combining mark. If the preceding character is anything |
1950 | // except a CONTROL, CR or LF, we are in a combining sequence. | |
729e4ab9 | 1951 | U16_PREV(text, 0, index, c); |
46f4442e A |
1952 | gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); |
1953 | ||
1954 | return !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR); | |
729e4ab9 A |
1955 | } |
1956 | #endif | |
1957 | ||
46f4442e A |
1958 | static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd) |
1959 | { | |
1960 | UErrorCode status = U_ZERO_ERROR; | |
1961 | OrderList targetOrders(coll, target, offset); | |
1962 | OrderList patternOrders(coll, pattern); | |
1963 | int32_t targetSize = targetOrders.size() - 1; | |
1964 | int32_t patternSize = patternOrders.size() - 1; | |
729e4ab9 A |
1965 | UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status), |
1966 | target.getBuffer(), target.length(), &status); | |
46f4442e A |
1967 | |
1968 | if (patternSize == 0) { | |
729e4ab9 A |
1969 | // Searching for an empty pattern always fails |
1970 | matchStart = matchEnd = -1; | |
1971 | ubrk_close(charBreakIterator); | |
46f4442e A |
1972 | return FALSE; |
1973 | } | |
1974 | ||
1975 | matchStart = matchEnd = -1; | |
1976 | ||
1977 | for(int32_t i = 0; i < targetSize; i += 1) { | |
1978 | if (targetOrders.matchesAt(i, patternOrders)) { | |
1979 | int32_t start = targetOrders.getLowOffset(i); | |
1980 | int32_t maxLimit = targetOrders.getLowOffset(i + patternSize); | |
1981 | int32_t minLimit = targetOrders.getLowOffset(i + patternSize - 1); | |
1982 | ||
1983 | // if the low and high offsets of the first CE in | |
1984 | // the match are the same, it means that the match | |
1985 | // starts in the middle of an expansion - all but | |
1986 | // the first CE of the expansion will have the offset | |
1987 | // of the following character. | |
1988 | if (start == targetOrders.getHighOffset(i)) { | |
1989 | continue; | |
1990 | } | |
1991 | ||
1992 | // Make sure match starts on a grapheme boundary | |
1993 | if (! ubrk_isBoundary(charBreakIterator, start)) { | |
1994 | continue; | |
1995 | } | |
1996 | ||
1997 | // If the low and high offsets of the CE after the match | |
1998 | // are the same, it means that the match ends in the middle | |
1999 | // of an expansion sequence. | |
2000 | if (maxLimit == targetOrders.getHighOffset(i + patternSize) && | |
2001 | targetOrders.getOrder(i + patternSize) != UCOL_NULLORDER) { | |
2002 | continue; | |
2003 | } | |
2004 | ||
2005 | int32_t mend = maxLimit; | |
2006 | ||
2007 | // Find the first grapheme break after the character index | |
2008 | // of the last CE in the match. If it's after character index | |
2009 | // that's after the last CE in the match, use that index | |
2010 | // as the end of the match. | |
2011 | if (minLimit < maxLimit) { | |
4388f060 A |
2012 | // When the last CE's low index is same with its high index, the CE is likely |
2013 | // a part of expansion. In this case, the index is located just after the | |
2014 | // character corresponding to the CEs compared above. If the index is right | |
2015 | // at the break boundary, move the position to the next boundary will result | |
2016 | // incorrect match length when there are ignorable characters exist between | |
2017 | // the position and the next character produces CE(s). See ticket#8482. | |
2018 | if (minLimit == targetOrders.getHighOffset(i + patternSize - 1) && ubrk_isBoundary(charBreakIterator, minLimit)) { | |
2019 | mend = minLimit; | |
2020 | } else { | |
2021 | int32_t nba = ubrk_following(charBreakIterator, minLimit); | |
2022 | ||
2023 | if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) { | |
2024 | mend = nba; | |
2025 | } | |
46f4442e A |
2026 | } |
2027 | } | |
2028 | ||
2029 | if (mend > maxLimit) { | |
2030 | continue; | |
2031 | } | |
2032 | ||
2033 | if (! ubrk_isBoundary(charBreakIterator, mend)) { | |
2034 | continue; | |
2035 | } | |
2036 | ||
2037 | matchStart = start; | |
2038 | matchEnd = mend; | |
2039 | ||
2040 | ubrk_close(charBreakIterator); | |
2041 | return TRUE; | |
2042 | } | |
2043 | } | |
2044 | ||
2045 | ubrk_close(charBreakIterator); | |
2046 | return FALSE; | |
2047 | } | |
2048 | ||
2049 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS | |
2050 | static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) { | |
2051 | int32_t val = defaultVal; | |
2052 | ||
2053 | name.append(" *= *(-?\\d+)"); | |
2054 | ||
2055 | UErrorCode status = U_ZERO_ERROR; | |
2056 | RegexMatcher m(name, params, 0, status); | |
2057 | ||
2058 | if (m.find()) { | |
2059 | // The param exists. Convert the string to an int. | |
2060 | char valString[100]; | |
2061 | int32_t paramLength = m.end(1, status) - m.start(1, status); | |
2062 | ||
2063 | if (paramLength >= (int32_t)(sizeof(valString)-1)) { | |
2064 | paramLength = (int32_t)(sizeof(valString)-2); | |
2065 | } | |
2066 | ||
2067 | params.extract(m.start(1, status), paramLength, valString, sizeof(valString)); | |
2068 | val = strtol(valString, NULL, 10); | |
2069 | ||
2070 | // Delete this parameter from the params string. | |
2071 | m.reset(); | |
2072 | params = m.replaceFirst("", status); | |
2073 | } | |
2074 | ||
2075 | //U_ASSERT(U_SUCCESS(status)); | |
2076 | if (! U_SUCCESS(status)) { | |
2077 | val = defaultVal; | |
2078 | } | |
2079 | ||
2080 | return val; | |
2081 | } | |
2082 | #endif | |
2083 | ||
2084 | #if !UCONFIG_NO_COLLATION | |
2085 | int32_t SSearchTest::monkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern, | |
2086 | const char *name, const char *strength, uint32_t seed) | |
2087 | { | |
2088 | UErrorCode status = U_ZERO_ERROR; | |
2089 | int32_t actualStart = -1, actualEnd = -1; | |
2090 | //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length(); | |
2091 | int32_t expectedStart = -1, expectedEnd = -1; | |
2092 | int32_t notFoundCount = 0; | |
729e4ab9 A |
2093 | LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(), |
2094 | testCase.getBuffer(), testCase.length(), | |
2095 | coll, | |
2096 | NULL, // the break iterator | |
2097 | &status)); | |
46f4442e A |
2098 | |
2099 | // **** TODO: find *all* matches, not just first one **** | |
2100 | simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd); | |
2101 | ||
729e4ab9 | 2102 | usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status); |
46f4442e | 2103 | |
729e4ab9 | 2104 | if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) { |
46f4442e A |
2105 | errln("Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n" |
2106 | " strength=%s seed=%d", | |
2107 | name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed); | |
2108 | } | |
2109 | ||
2110 | if (expectedStart == -1 && actualStart == -1) { | |
2111 | notFoundCount += 1; | |
2112 | } | |
2113 | ||
2114 | // **** TODO: find *all* matches, not just first one **** | |
2115 | simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd); | |
2116 | ||
729e4ab9 | 2117 | usearch_setPattern(uss.getAlias(), altPattern.getBuffer(), altPattern.length(), &status); |
46f4442e | 2118 | |
729e4ab9 | 2119 | usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status); |
46f4442e | 2120 | |
729e4ab9 | 2121 | if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) { |
46f4442e A |
2122 | errln("Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n" |
2123 | " strength=%s seed=%d", | |
2124 | name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed); | |
2125 | } | |
2126 | ||
2127 | if (expectedStart == -1 && actualStart == -1) { | |
2128 | notFoundCount += 1; | |
2129 | } | |
2130 | ||
729e4ab9 A |
2131 | return notFoundCount; |
2132 | } | |
2133 | ||
2134 | static void hexForUnicodeString(const UnicodeString &ustr, char * cbuf, int32_t cbuflen) | |
2135 | { | |
2136 | int32_t ustri, ustrlen = ustr.length(); | |
2137 | ||
2138 | for (ustri = 0; ustri < ustrlen; ++ustri) { | |
2139 | if (cbuflen >= 9 /* format width for single code unit(5) + terminating ellipsis(3) + null(1) */) { | |
2140 | int len = sprintf(cbuf, " %04X", ustr.charAt(ustri)); | |
2141 | cbuflen -= len; | |
2142 | cbuf += len; | |
2143 | } else { | |
2144 | if (cbuflen >= 4 /* terminating ellipsis(3) + null(1) */) { | |
2145 | sprintf(cbuf, "..."); | |
2146 | } else if (cbuflen >= 1) { | |
2147 | cbuf = 0; | |
2148 | } | |
2149 | break; | |
2150 | } | |
2151 | } | |
2152 | } | |
2153 | ||
2154 | int32_t SSearchTest::bmMonkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern, | |
2155 | BoyerMooreSearch *bms, BoyerMooreSearch *abms, | |
2156 | const char *name, const char *strength, uint32_t seed) | |
2157 | { | |
2158 | UErrorCode status = U_ZERO_ERROR; | |
2159 | int32_t actualStart = -1, actualEnd = -1; | |
2160 | //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length(); | |
2161 | int32_t expectedStart = -1, expectedEnd = -1; | |
2162 | int32_t notFoundCount = 0; | |
2163 | char hexbuf[128]; | |
2164 | ||
2165 | // **** TODO: find *all* matches, not just first one **** | |
2166 | simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd); | |
2167 | ||
2168 | bms->setTargetString(&testCase, status); | |
2169 | bms->search(0, actualStart, actualEnd); | |
2170 | ||
2171 | if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) { | |
2172 | hexForUnicodeString(pattern, hexbuf, sizeof(hexbuf)); | |
2173 | errln("Boyer-Moore Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n" | |
2174 | " strength=%s seed=%d <pattern>: %s", | |
2175 | name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed, hexbuf); | |
2176 | } | |
2177 | ||
2178 | if (expectedStart == -1 && actualStart == -1) { | |
2179 | notFoundCount += 1; | |
2180 | } | |
2181 | ||
2182 | // **** TODO: find *all* matches, not just first one **** | |
2183 | simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd); | |
2184 | ||
2185 | abms->setTargetString(&testCase, status); | |
2186 | abms->search(0, actualStart, actualEnd); | |
2187 | ||
2188 | if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) { | |
2189 | hexForUnicodeString(altPattern, hexbuf, sizeof(hexbuf)); | |
2190 | errln("Boyer-Moore Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n" | |
2191 | " strength=%s seed=%d <pattern>: %s", | |
2192 | name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed, hexbuf); | |
2193 | } | |
2194 | ||
2195 | if (expectedStart == -1 && actualStart == -1) { | |
2196 | notFoundCount += 1; | |
2197 | } | |
2198 | ||
46f4442e A |
2199 | |
2200 | return notFoundCount; | |
2201 | } | |
2202 | #endif | |
2203 | ||
2204 | void SSearchTest::monkeyTest(char *params) | |
2205 | { | |
2206 | // ook! | |
2207 | UErrorCode status = U_ZERO_ERROR; | |
729e4ab9 A |
2208 | //UCollator *coll = ucol_open(NULL, &status); |
2209 | UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status); | |
2210 | ||
46f4442e | 2211 | if (U_FAILURE(status)) { |
729e4ab9 | 2212 | errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status)); |
46f4442e A |
2213 | return; |
2214 | } | |
729e4ab9 A |
2215 | |
2216 | CollData *monkeyData = CollData::open(coll, status); | |
2217 | ||
46f4442e A |
2218 | USet *expansions = uset_openEmpty(); |
2219 | USet *contractions = uset_openEmpty(); | |
46f4442e A |
2220 | |
2221 | ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status); | |
2222 | ||
46f4442e A |
2223 | U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39); |
2224 | U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39); | |
2225 | USet *letters = uset_openPattern(letter_pattern, 39, &status); | |
2226 | SetMonkey letterMonkey(letters); | |
729e4ab9 A |
2227 | StringSetMonkey contractionMonkey(contractions, coll, monkeyData); |
2228 | StringSetMonkey expansionMonkey(expansions, coll, monkeyData); | |
46f4442e A |
2229 | UnicodeString testCase; |
2230 | UnicodeString alternate; | |
2231 | UnicodeString pattern, altPattern; | |
2232 | UnicodeString prefix, altPrefix; | |
2233 | UnicodeString suffix, altSuffix; | |
2234 | ||
2235 | Monkey *monkeys[] = { | |
2236 | &letterMonkey, | |
2237 | &contractionMonkey, | |
2238 | &expansionMonkey, | |
2239 | &contractionMonkey, | |
2240 | &expansionMonkey, | |
2241 | &contractionMonkey, | |
2242 | &expansionMonkey, | |
2243 | &contractionMonkey, | |
2244 | &expansionMonkey}; | |
2245 | int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]); | |
729e4ab9 | 2246 | // int32_t nonMatchCount = 0; |
46f4442e A |
2247 | |
2248 | UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY}; | |
2249 | const char *strengthNames[] = {"primary", "secondary", "tertiary"}; | |
2250 | int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]); | |
2251 | int32_t loopCount = quick? 1000 : 10000; | |
2252 | int32_t firstStrength = 0; | |
729e4ab9 | 2253 | int32_t lastStrength = strengthCount - 1; //*/ 0; |
46f4442e A |
2254 | |
2255 | if (params != NULL) { | |
2256 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS | |
2257 | UnicodeString p(params); | |
2258 | ||
2259 | loopCount = getIntParam("loop", p, loopCount); | |
2260 | m_seed = getIntParam("seed", p, m_seed); | |
2261 | ||
2262 | RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status); | |
2263 | if (m.find()) { | |
2264 | UnicodeString breakType = m.group(1, status); | |
2265 | ||
2266 | for (int32_t s = 0; s < strengthCount; s += 1) { | |
2267 | if (breakType == strengthNames[s]) { | |
2268 | firstStrength = lastStrength = s; | |
2269 | break; | |
2270 | } | |
2271 | } | |
2272 | ||
2273 | m.reset(); | |
2274 | p = m.replaceFirst("", status); | |
2275 | } | |
2276 | ||
2277 | if (RegexMatcher("\\S", p, 0, status).find()) { | |
2278 | // Each option is stripped out of the option string as it is processed. | |
2279 | // All options have been checked. The option string should have been completely emptied.. | |
2280 | char buf[100]; | |
2281 | p.extract(buf, sizeof(buf), NULL, status); | |
2282 | buf[sizeof(buf)-1] = 0; | |
2283 | errln("Unrecognized or extra parameter: %s\n", buf); | |
2284 | return; | |
2285 | } | |
2286 | #else | |
2287 | infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters."); | |
2288 | #endif | |
2289 | } | |
2290 | ||
2291 | for(int32_t s = firstStrength; s <= lastStrength; s += 1) { | |
2292 | int32_t notFoundCount = 0; | |
2293 | ||
729e4ab9 | 2294 | logln("Setting strength to %s.", strengthNames[s]); |
46f4442e A |
2295 | ucol_setStrength(coll, strengths[s]); |
2296 | ||
2297 | // TODO: try alternate prefix and suffix too? | |
2298 | // TODO: alterntaes are only equal at primary strength. Is this OK? | |
729e4ab9 | 2299 | for(int32_t t = 0; t < loopCount; t += 1) { |
46f4442e | 2300 | uint32_t seed = m_seed; |
729e4ab9 | 2301 | // int32_t nmc = 0; |
46f4442e A |
2302 | |
2303 | generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern); | |
2304 | generateTestCase(coll, monkeys, monkeyCount, prefix, altPrefix); | |
2305 | generateTestCase(coll, monkeys, monkeyCount, suffix, altSuffix); | |
2306 | ||
2307 | // pattern | |
2308 | notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern, "pattern", strengthNames[s], seed); | |
2309 | ||
2310 | testCase.remove(); | |
2311 | testCase.append(prefix); | |
2312 | testCase.append(/*alt*/pattern); | |
2313 | ||
2314 | // prefix + pattern | |
2315 | notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern", strengthNames[s], seed); | |
2316 | ||
2317 | testCase.append(suffix); | |
2318 | ||
2319 | // prefix + pattern + suffix | |
2320 | notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern + suffix", strengthNames[s], seed); | |
2321 | ||
2322 | testCase.remove(); | |
2323 | testCase.append(pattern); | |
2324 | testCase.append(suffix); | |
729e4ab9 | 2325 | |
46f4442e A |
2326 | // pattern + suffix |
2327 | notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "pattern + suffix", strengthNames[s], seed); | |
2328 | } | |
2329 | ||
729e4ab9 A |
2330 | logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount); |
2331 | } | |
2332 | ||
2333 | uset_close(contractions); | |
2334 | uset_close(expansions); | |
2335 | uset_close(letters); | |
2336 | ||
2337 | CollData::close(monkeyData); | |
2338 | ||
2339 | ucol_close(coll); | |
2340 | } | |
2341 | ||
2342 | void SSearchTest::bmMonkeyTest(char *params) | |
2343 | { | |
729e4ab9 A |
2344 | static const UChar skipChars[] = { 0x0E40, 0x0E41, 0x0E42, 0x0E43, 0x0E44, 0xAAB5, 0xAAB6, 0xAAB9, 0xAABB, 0xAABC, 0 }; // for timebomb |
2345 | // ook! | |
2346 | UErrorCode status = U_ZERO_ERROR; | |
2347 | UCollator *coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status); | |
2348 | ||
2349 | if (U_FAILURE(status)) { | |
2350 | errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status)); | |
2351 | return; | |
2352 | } | |
2353 | ||
2354 | CollData *monkeyData = CollData::open(coll, status); | |
2355 | ||
2356 | USet *expansions = uset_openEmpty(); | |
2357 | USet *contractions = uset_openEmpty(); | |
2358 | ||
2359 | ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status); | |
2360 | ||
2361 | U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39); | |
2362 | U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39); | |
2363 | USet *letters = uset_openPattern(letter_pattern, 39, &status); | |
2364 | SetMonkey letterMonkey(letters); | |
2365 | StringSetMonkey contractionMonkey(contractions, coll, monkeyData); | |
2366 | StringSetMonkey expansionMonkey(expansions, coll, monkeyData); | |
2367 | UnicodeString testCase; | |
2368 | UnicodeString alternate; | |
2369 | UnicodeString pattern, altPattern; | |
2370 | UnicodeString prefix, altPrefix; | |
2371 | UnicodeString suffix, altSuffix; | |
2372 | ||
2373 | Monkey *monkeys[] = { | |
2374 | &letterMonkey, | |
2375 | &contractionMonkey, | |
2376 | &expansionMonkey, | |
2377 | &contractionMonkey, | |
2378 | &expansionMonkey, | |
2379 | &contractionMonkey, | |
2380 | &expansionMonkey, | |
2381 | &contractionMonkey, | |
2382 | &expansionMonkey}; | |
2383 | int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]); | |
2384 | // int32_t nonMatchCount = 0; | |
2385 | ||
2386 | UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY}; | |
2387 | const char *strengthNames[] = {"primary", "secondary", "tertiary"}; | |
2388 | int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]); | |
2389 | int32_t loopCount = quick? 1000 : 10000; | |
2390 | int32_t firstStrength = 0; | |
2391 | int32_t lastStrength = strengthCount - 1; //*/ 0; | |
2392 | ||
2393 | if (params != NULL) { | |
2394 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS | |
2395 | UnicodeString p(params); | |
2396 | ||
2397 | loopCount = getIntParam("loop", p, loopCount); | |
2398 | m_seed = getIntParam("seed", p, m_seed); | |
2399 | ||
2400 | RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status); | |
2401 | if (m.find()) { | |
2402 | UnicodeString breakType = m.group(1, status); | |
2403 | ||
2404 | for (int32_t s = 0; s < strengthCount; s += 1) { | |
2405 | if (breakType == strengthNames[s]) { | |
2406 | firstStrength = lastStrength = s; | |
2407 | break; | |
2408 | } | |
2409 | } | |
2410 | ||
2411 | m.reset(); | |
2412 | p = m.replaceFirst("", status); | |
2413 | } | |
2414 | ||
2415 | if (RegexMatcher("\\S", p, 0, status).find()) { | |
2416 | // Each option is stripped out of the option string as it is processed. | |
2417 | // All options have been checked. The option string should have been completely emptied.. | |
2418 | char buf[100]; | |
2419 | p.extract(buf, sizeof(buf), NULL, status); | |
2420 | buf[sizeof(buf)-1] = 0; | |
2421 | errln("Unrecognized or extra parameter: %s\n", buf); | |
2422 | return; | |
2423 | } | |
2424 | #else | |
2425 | infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters."); | |
2426 | #endif | |
46f4442e A |
2427 | } |
2428 | ||
729e4ab9 A |
2429 | for(int32_t s = firstStrength; s <= lastStrength; s += 1) { |
2430 | int32_t notFoundCount = 0; | |
2431 | ||
2432 | logln("Setting strength to %s.", strengthNames[s]); | |
2433 | ucol_setStrength(coll, strengths[s]); | |
2434 | ||
2435 | CollData *data = CollData::open(coll, status); | |
2436 | ||
2437 | UnicodeString skipString(skipChars); // for timebomb | |
2438 | UnicodeSet* skipSet = UnicodeSet::createFromAll(skipString); // for timebomb | |
2439 | // TODO: try alternate prefix and suffix too? | |
2440 | // TODO: alterntaes are only equal at primary strength. Is this OK? | |
2441 | for(int32_t t = 0; t < loopCount; t += 1) { | |
2442 | uint32_t seed = m_seed; | |
2443 | // int32_t nmc = 0; | |
2444 | ||
2445 | generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern); | |
2446 | generateTestCase(coll, monkeys, monkeyCount, prefix, altPrefix); | |
2447 | generateTestCase(coll, monkeys, monkeyCount, suffix, altSuffix); | |
2448 | ||
4388f060 | 2449 | if (!isICUVersionAtLeast(50, 0) && skipSet->containsSome(pattern)) { |
729e4ab9 A |
2450 | continue; // timebomb until ticket #8080 is resolved |
2451 | } | |
2452 | ||
2453 | BoyerMooreSearch pat(data, pattern, NULL, status); | |
2454 | BoyerMooreSearch alt(data, altPattern, NULL, status); | |
2455 | ||
2456 | // **** need a better way to deal with this **** | |
2457 | #if 0 | |
2458 | if (pat.empty() || | |
2459 | alt.empty()) { | |
2460 | continue; | |
2461 | } | |
2462 | #endif | |
2463 | ||
2464 | // pattern | |
2465 | notFoundCount += bmMonkeyTestCase(coll, pattern, pattern, altPattern, &pat, &alt, "pattern", strengthNames[s], seed); | |
2466 | ||
2467 | testCase.remove(); | |
2468 | testCase.append(prefix); | |
2469 | testCase.append(/*alt*/pattern); | |
2470 | ||
2471 | // prefix + pattern | |
2472 | notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPattern, &pat, &alt, "prefix + pattern", strengthNames[s], seed); | |
2473 | ||
2474 | testCase.append(suffix); | |
2475 | ||
2476 | // prefix + pattern + suffix | |
2477 | notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPattern, &pat, &alt, "prefix + pattern + suffix", strengthNames[s], seed); | |
2478 | ||
2479 | testCase.remove(); | |
2480 | testCase.append(pattern); | |
2481 | testCase.append(suffix); | |
2482 | ||
2483 | // pattern + suffix | |
2484 | notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPattern, &pat, &alt, "pattern + suffix", strengthNames[s], seed); | |
2485 | } | |
2486 | delete skipSet; // for timebomb | |
2487 | ||
2488 | CollData::close(data); | |
2489 | ||
2490 | logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount); | |
2491 | } | |
46f4442e A |
2492 | |
2493 | uset_close(contractions); | |
2494 | uset_close(expansions); | |
46f4442e | 2495 | uset_close(letters); |
729e4ab9 A |
2496 | |
2497 | CollData::close(monkeyData); | |
2498 | ||
46f4442e A |
2499 | ucol_close(coll); |
2500 | } | |
2501 | ||
729e4ab9 A |
2502 | void SSearchTest::stringListTest(){ |
2503 | UErrorCode status = U_ZERO_ERROR; | |
2504 | StringList *sl = new StringList(status); | |
2505 | if(U_FAILURE(status)){ | |
2506 | errln("ERROR: stringListTest: Could not start StringList"); | |
2507 | } | |
2508 | ||
2509 | const UChar chars[] = { | |
2510 | 0x0000 | |
2511 | }; | |
2512 | sl->add(chars, (int32_t) 0, status); | |
2513 | if(U_FAILURE(status)){ | |
2514 | errln("ERROR: stringListTest: StringList::add"); | |
2515 | } | |
2516 | ||
2517 | if(sl->getDynamicClassID() != StringList::getStaticClassID()){ | |
2518 | errln("ERROR: stringListTest: getDynamicClassID and getStaticClassID does not match"); | |
2519 | } | |
2520 | delete sl; | |
2521 | } | |
2522 | ||
2523 | #endif | |
2524 | ||
46f4442e | 2525 | #endif |