]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
46f4442e A |
3 | /* |
4 | ********************************************************************** | |
2ca993e8 | 5 | * Copyright (C) 2005-2016, International Business Machines |
46f4442e A |
6 | * Corporation and others. All Rights Reserved. |
7 | ********************************************************************** | |
8 | */ | |
9 | ||
46f4442e A |
10 | #include "unicode/utypes.h" |
11 | ||
12 | #if !UCONFIG_NO_COLLATION | |
13 | ||
46f4442e | 14 | #include "cmemory.h" |
51004dcb | 15 | #include "cstring.h" |
57a6839d | 16 | #include "usrchimp.h" |
51004dcb | 17 | |
46f4442e A |
18 | #include "unicode/coll.h" |
19 | #include "unicode/tblcoll.h" | |
51004dcb | 20 | #include "unicode/usearch.h" |
46f4442e A |
21 | #include "unicode/uset.h" |
22 | #include "unicode/ustring.h" | |
46f4442e | 23 | |
51004dcb A |
24 | #include "unicode/coleitr.h" |
25 | #include "unicode/regex.h" // TODO: make conditional on regexp being built. | |
729e4ab9 | 26 | |
51004dcb A |
27 | #include "colldata.h" |
28 | #include "ssearch.h" | |
46f4442e A |
29 | #include "xmlparser.h" |
30 | ||
51004dcb | 31 | #include <stdio.h> // for sprintf |
46f4442e A |
32 | |
// Identifier of the test case currently being run. The assertion macros
// below embed it in their failure messages.
char testId[100];

// Reports a failure through errln() when the condition x is false.
// Execution continues after the report.
#define TEST_ASSERT(x) { \
    if (!(x)) { \
        errln("Failure in file %s, line %d, test ID = \"%s\"", __FILE__, __LINE__, testId); \
    } \
}

// Reports a failure through dataerrln(), including the message m, when the
// condition x is false, and then returns from the enclosing function.
#define TEST_ASSERT_M(x, m) { \
    if (!(x)) { \
        dataerrln("Failure in file %s, line %d. \"%s\"", __FILE__, __LINE__, m); \
        return; \
    } \
}

// Reports a failure through dataerrln() when errcode holds a failing
// UErrorCode. Execution continues after the report.
#define TEST_ASSERT_SUCCESS(errcode) { \
    if (U_FAILURE(errcode)) { \
        dataerrln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \
                  __FILE__, __LINE__, testId, u_errorName(errcode)); \
    } \
}
44 | ||
729e4ab9 A |
// Allocates uninitialized storage for `count` objects of `type` via
// uprv_malloc(). The expansion is wrapped in parentheses so the result can be
// used as an operand of a larger expression (for example, subscripted).
#define NEW_ARRAY(type, count) ((type *) uprv_malloc((count) * sizeof(type)))
// Releases storage that was obtained from NEW_ARRAY.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
46f4442e A |
47 | |
48 | //--------------------------------------------------------------------------- | |
49 | // | |
50 | // Test class boilerplate | |
51 | // | |
52 | //--------------------------------------------------------------------------- | |
53 | SSearchTest::SSearchTest() | |
54 | { | |
55 | } | |
56 | ||
57 | SSearchTest::~SSearchTest() | |
58 | { | |
59 | } | |
60 | ||
61 | void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char *params ) | |
62 | { | |
63 | if (exec) logln("TestSuite SSearchTest: "); | |
64 | switch (index) { | |
65 | #if !UCONFIG_NO_BREAK_ITERATION | |
66 | case 0: name = "searchTest"; | |
67 | if (exec) searchTest(); | |
68 | break; | |
69 | ||
70 | case 1: name = "offsetTest"; | |
71 | if (exec) offsetTest(); | |
72 | break; | |
73 | ||
74 | case 2: name = "monkeyTest"; | |
75 | if (exec) monkeyTest(params); | |
76 | break; | |
729e4ab9 | 77 | |
51004dcb A |
78 | case 3: name = "sharpSTest"; |
79 | if (exec) sharpSTest(); | |
729e4ab9 A |
80 | break; |
81 | ||
51004dcb | 82 | case 4: name = "goodSuffixTest"; |
729e4ab9 A |
83 | if (exec) goodSuffixTest(); |
84 | break; | |
85 | ||
51004dcb | 86 | case 5: name = "searchTime"; |
729e4ab9 A |
87 | if (exec) searchTime(); |
88 | break; | |
46f4442e A |
89 | #endif |
90 | default: name = ""; | |
91 | break; //needed to end loop | |
92 | } | |
93 | } | |
94 | ||
95 | ||
96 | #if !UCONFIG_NO_BREAK_ITERATION | |
97 | ||
98 | #define PATH_BUFFER_SIZE 2048 | |
99 | const char *SSearchTest::getPath(char buffer[2048], const char *filename) { | |
100 | UErrorCode status = U_ZERO_ERROR; | |
101 | const char *testDataDirectory = IntlTest::getSourceTestData(status); | |
102 | ||
103 | if (U_FAILURE(status) || strlen(testDataDirectory) + strlen(filename) + 1 >= PATH_BUFFER_SIZE) { | |
104 | errln("ERROR: getPath() failed - %s", u_errorName(status)); | |
105 | return NULL; | |
106 | } | |
107 | ||
108 | strcpy(buffer, testDataDirectory); | |
109 | strcat(buffer, filename); | |
110 | return buffer; | |
111 | } | |
112 | ||
113 | ||
114 | void SSearchTest::searchTest() | |
115 | { | |
729e4ab9 | 116 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO |
46f4442e A |
117 | UErrorCode status = U_ZERO_ERROR; |
118 | char path[PATH_BUFFER_SIZE]; | |
119 | const char *testFilePath = getPath(path, "ssearch.xml"); | |
120 | ||
121 | if (testFilePath == NULL) { | |
122 | return; /* Couldn't get path: error message already output. */ | |
123 | } | |
124 | ||
729e4ab9 | 125 | LocalPointer<UXMLParser> parser(UXMLParser::createParser(status)); |
46f4442e | 126 | TEST_ASSERT_SUCCESS(status); |
729e4ab9 | 127 | LocalPointer<UXMLElement> root(parser->parseFile(testFilePath, status)); |
46f4442e A |
128 | TEST_ASSERT_SUCCESS(status); |
129 | if (U_FAILURE(status)) { | |
130 | return; | |
131 | } | |
132 | ||
133 | const UnicodeString *debugTestCase = root->getAttribute("debug"); | |
134 | if (debugTestCase != NULL) { | |
135 | // setenv("USEARCH_DEBUG", "1", 1); | |
136 | } | |
137 | ||
138 | ||
139 | const UXMLElement *testCase; | |
140 | int32_t tc = 0; | |
141 | ||
142 | while((testCase = root->nextChildElement(tc)) != NULL) { | |
143 | ||
144 | if (testCase->getTagName().compare("test-case") != 0) { | |
145 | errln("ssearch, unrecognized XML Element in test file"); | |
146 | continue; | |
147 | } | |
148 | const UnicodeString *id = testCase->getAttribute("id"); | |
149 | *testId = 0; | |
150 | if (id != NULL) { | |
151 | id->extract(0, id->length(), testId, sizeof(testId), US_INV); | |
152 | } | |
153 | ||
154 | // If debugging test case has been specified and this is not it, skip to next. | |
155 | if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) { | |
156 | continue; | |
157 | } | |
158 | // | |
159 | // Get the requested collation strength. | |
160 | // Default is tertiary if the XML attribute is missing from the test case. | |
161 | // | |
162 | const UnicodeString *strength = testCase->getAttribute("strength"); | |
729e4ab9 | 163 | UColAttributeValue collatorStrength = UCOL_PRIMARY; |
46f4442e A |
164 | if (strength==NULL) { collatorStrength = UCOL_TERTIARY;} |
165 | else if (*strength=="PRIMARY") { collatorStrength = UCOL_PRIMARY;} | |
166 | else if (*strength=="SECONDARY") { collatorStrength = UCOL_SECONDARY;} | |
167 | else if (*strength=="TERTIARY") { collatorStrength = UCOL_TERTIARY;} | |
168 | else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;} | |
169 | else if (*strength=="IDENTICAL") { collatorStrength = UCOL_IDENTICAL;} | |
170 | else { | |
171 | // Bogus value supplied for strength. Shouldn't happen, even from | |
172 | // typos, if the XML source has been validated. | |
173 | // This assert is a little deceiving in that strength can be | |
174 | // any of the allowed values, not just TERTIARY, but it will | |
175 | // do the job of getting the error output. | |
176 | TEST_ASSERT(*strength=="TERTIARY") | |
177 | } | |
178 | ||
179 | // | |
180 | // Get the collator normalization flag. Default is UCOL_OFF. | |
181 | // | |
182 | UColAttributeValue normalize = UCOL_OFF; | |
183 | const UnicodeString *norm = testCase->getAttribute("norm"); | |
184 | TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF"); | |
185 | if (norm!=NULL && *norm=="ON") { | |
186 | normalize = UCOL_ON; | |
187 | } | |
188 | ||
729e4ab9 A |
189 | // |
190 | // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE. | |
191 | // | |
192 | UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE; | |
193 | const UnicodeString *alt = testCase->getAttribute("alternate_handling"); | |
194 | TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE"); | |
195 | if (alt != NULL && *alt == "SHIFTED") { | |
196 | alternateHandling = UCOL_SHIFTED; | |
197 | } | |
198 | ||
46f4442e A |
199 | const UnicodeString defLocale("en"); |
200 | char clocale[100]; | |
201 | const UnicodeString *locale = testCase->getAttribute("locale"); | |
202 | if (locale == NULL || locale->length()==0) { | |
203 | locale = &defLocale; | |
204 | }; | |
205 | locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL); | |
206 | ||
207 | ||
208 | UnicodeString text; | |
209 | UnicodeString target; | |
210 | UnicodeString pattern; | |
211 | int32_t expectedMatchStart = -1; | |
212 | int32_t expectedMatchLimit = -1; | |
213 | const UXMLElement *n; | |
729e4ab9 | 214 | int32_t nodeCount = 0; |
46f4442e A |
215 | |
216 | n = testCase->getChildElement("pattern"); | |
217 | TEST_ASSERT(n != NULL); | |
218 | if (n==NULL) { | |
219 | continue; | |
220 | } | |
221 | text = n->getText(FALSE); | |
222 | text = text.unescape(); | |
223 | pattern.append(text); | |
224 | nodeCount++; | |
225 | ||
226 | n = testCase->getChildElement("pre"); | |
227 | if (n!=NULL) { | |
228 | text = n->getText(FALSE); | |
229 | text = text.unescape(); | |
230 | target.append(text); | |
231 | nodeCount++; | |
232 | } | |
729e4ab9 | 233 | |
46f4442e A |
234 | n = testCase->getChildElement("m"); |
235 | if (n!=NULL) { | |
236 | expectedMatchStart = target.length(); | |
237 | text = n->getText(FALSE); | |
238 | text = text.unescape(); | |
239 | target.append(text); | |
240 | expectedMatchLimit = target.length(); | |
241 | nodeCount++; | |
242 | } | |
243 | ||
244 | n = testCase->getChildElement("post"); | |
245 | if (n!=NULL) { | |
246 | text = n->getText(FALSE); | |
247 | text = text.unescape(); | |
248 | target.append(text); | |
249 | nodeCount++; | |
250 | } | |
251 | ||
252 | // Check that there weren't extra things in the XML | |
253 | TEST_ASSERT(nodeCount == testCase->countChildren()); | |
254 | ||
729e4ab9 | 255 | // Open a collator and StringSearch based on the parameters |
46f4442e A |
256 | // obtained from the XML. |
257 | // | |
258 | status = U_ZERO_ERROR; | |
729e4ab9 A |
259 | LocalUCollatorPointer collator(ucol_open(clocale, &status)); |
260 | ucol_setStrength(collator.getAlias(), collatorStrength); | |
261 | ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status); | |
262 | ucol_setAttribute(collator.getAlias(), UCOL_ALTERNATE_HANDLING, alternateHandling, &status); | |
263 | LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(), | |
264 | target.getBuffer(), target.length(), | |
265 | collator.getAlias(), | |
266 | NULL, // the break iterator | |
267 | &status)); | |
268 | ||
46f4442e A |
269 | TEST_ASSERT_SUCCESS(status); |
270 | if (U_FAILURE(status)) { | |
46f4442e A |
271 | continue; |
272 | } | |
273 | ||
274 | int32_t foundStart = 0; | |
275 | int32_t foundLimit = 0; | |
276 | UBool foundMatch; | |
277 | ||
278 | // | |
279 | // Do the search, check the match result against the expected results. | |
280 | // | |
729e4ab9 | 281 | foundMatch= usearch_search(uss.getAlias(), 0, &foundStart, &foundLimit, &status); |
46f4442e | 282 | TEST_ASSERT_SUCCESS(status); |
729e4ab9 A |
283 | if ((foundMatch && expectedMatchStart<0) || |
284 | (foundStart != expectedMatchStart) || | |
285 | (foundLimit != expectedMatchLimit)) { | |
46f4442e A |
286 | TEST_ASSERT(FALSE); // ouput generic error position |
287 | infoln("Found, expected match start = %d, %d \n" | |
288 | "Found, expected match limit = %d, %d", | |
289 | foundStart, expectedMatchStart, foundLimit, expectedMatchLimit); | |
290 | } | |
291 | ||
292 | // In case there are other matches... | |
293 | // (should we only do this if the test case passed?) | |
294 | while (foundMatch) { | |
295 | expectedMatchStart = foundStart; | |
296 | expectedMatchLimit = foundLimit; | |
297 | ||
729e4ab9 | 298 | foundMatch = usearch_search(uss.getAlias(), foundLimit, &foundStart, &foundLimit, &status); |
46f4442e A |
299 | } |
300 | ||
729e4ab9 | 301 | uss.adoptInstead(usearch_openFromCollator(pattern.getBuffer(), pattern.length(), |
46f4442e | 302 | target.getBuffer(), target.length(), |
729e4ab9 | 303 | collator.getAlias(), |
46f4442e | 304 | NULL, |
729e4ab9 | 305 | &status)); |
46f4442e A |
306 | |
307 | // | |
308 | // Do the backwards search, check the match result against the expected results. | |
309 | // | |
729e4ab9 | 310 | foundMatch= usearch_searchBackwards(uss.getAlias(), target.length(), &foundStart, &foundLimit, &status); |
46f4442e | 311 | TEST_ASSERT_SUCCESS(status); |
729e4ab9 A |
312 | if ((foundMatch && expectedMatchStart<0) || |
313 | (foundStart != expectedMatchStart) || | |
314 | (foundLimit != expectedMatchLimit)) { | |
46f4442e A |
315 | TEST_ASSERT(FALSE); // ouput generic error position |
316 | infoln("Found, expected backwards match start = %d, %d \n" | |
317 | "Found, expected backwards match limit = %d, %d", | |
318 | foundStart, expectedMatchStart, foundLimit, expectedMatchLimit); | |
319 | } | |
729e4ab9 A |
320 | } |
321 | #endif | |
322 | } | |
323 | ||
46f4442e A |
// One entry of an OrderList: a collation order value together with the
// pair of text offsets recorded for it.
struct Order
{
    int32_t order;       // collation order value
    int32_t lowOffset;   // offset recorded before the element was read
    int32_t highOffset;  // offset recorded after the element was read
};
330 | ||
331 | class OrderList | |
332 | { | |
333 | public: | |
334 | OrderList(); | |
335 | OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset = 0); | |
336 | ~OrderList(); | |
337 | ||
338 | int32_t size(void) const; | |
339 | void add(int32_t order, int32_t low, int32_t high); | |
340 | const Order *get(int32_t index) const; | |
341 | int32_t getLowOffset(int32_t index) const; | |
342 | int32_t getHighOffset(int32_t index) const; | |
343 | int32_t getOrder(int32_t index) const; | |
344 | void reverse(void); | |
345 | UBool compare(const OrderList &other) const; | |
346 | UBool matchesAt(int32_t offset, const OrderList &other) const; | |
347 | ||
348 | private: | |
349 | Order *list; | |
350 | int32_t listMax; | |
351 | int32_t listSize; | |
352 | }; | |
353 | ||
354 | OrderList::OrderList() | |
729e4ab9 | 355 | : list(NULL), listMax(16), listSize(0) |
46f4442e A |
356 | { |
357 | list = new Order[listMax]; | |
358 | } | |
359 | ||
360 | OrderList::OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset) | |
361 | : list(NULL), listMax(16), listSize(0) | |
362 | { | |
363 | UErrorCode status = U_ZERO_ERROR; | |
364 | UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status); | |
365 | uint32_t strengthMask = 0; | |
366 | int32_t order, low, high; | |
367 | ||
729e4ab9 | 368 | switch (ucol_getStrength(coll)) |
46f4442e A |
369 | { |
370 | default: | |
371 | strengthMask |= UCOL_TERTIARYORDERMASK; | |
2ca993e8 | 372 | U_FALLTHROUGH; |
46f4442e A |
373 | case UCOL_SECONDARY: |
374 | strengthMask |= UCOL_SECONDARYORDERMASK; | |
2ca993e8 | 375 | U_FALLTHROUGH; |
46f4442e A |
376 | case UCOL_PRIMARY: |
377 | strengthMask |= UCOL_PRIMARYORDERMASK; | |
378 | } | |
379 | ||
380 | list = new Order[listMax]; | |
381 | ||
382 | ucol_setOffset(elems, stringOffset, &status); | |
383 | ||
384 | do { | |
385 | low = ucol_getOffset(elems); | |
386 | order = ucol_next(elems, &status); | |
387 | high = ucol_getOffset(elems); | |
388 | ||
389 | if (order != UCOL_NULLORDER) { | |
390 | order &= strengthMask; | |
391 | } | |
392 | ||
393 | if (order != UCOL_IGNORABLE) { | |
394 | add(order, low, high); | |
395 | } | |
396 | } while (order != UCOL_NULLORDER); | |
397 | ||
398 | ucol_closeElements(elems); | |
399 | } | |
400 | ||
401 | OrderList::~OrderList() | |
402 | { | |
403 | delete[] list; | |
404 | } | |
405 | ||
406 | void OrderList::add(int32_t order, int32_t low, int32_t high) | |
407 | { | |
408 | if (listSize >= listMax) { | |
409 | listMax *= 2; | |
410 | ||
411 | Order *newList = new Order[listMax]; | |
412 | ||
413 | uprv_memcpy(newList, list, listSize * sizeof(Order)); | |
414 | delete[] list; | |
415 | list = newList; | |
416 | } | |
417 | ||
418 | list[listSize].order = order; | |
419 | list[listSize].lowOffset = low; | |
420 | list[listSize].highOffset = high; | |
421 | ||
422 | listSize += 1; | |
423 | } | |
424 | ||
425 | const Order *OrderList::get(int32_t index) const | |
426 | { | |
427 | if (index >= listSize) { | |
428 | return NULL; | |
429 | } | |
430 | ||
431 | return &list[index]; | |
432 | } | |
433 | ||
434 | int32_t OrderList::getLowOffset(int32_t index) const | |
435 | { | |
436 | const Order *order = get(index); | |
437 | ||
438 | if (order != NULL) { | |
439 | return order->lowOffset; | |
440 | } | |
441 | ||
442 | return -1; | |
443 | } | |
444 | ||
445 | int32_t OrderList::getHighOffset(int32_t index) const | |
446 | { | |
447 | const Order *order = get(index); | |
448 | ||
449 | if (order != NULL) { | |
450 | return order->highOffset; | |
451 | } | |
452 | ||
453 | return -1; | |
454 | } | |
455 | ||
456 | int32_t OrderList::getOrder(int32_t index) const | |
457 | { | |
458 | const Order *order = get(index); | |
459 | ||
460 | if (order != NULL) { | |
461 | return order->order; | |
462 | } | |
463 | ||
464 | return UCOL_NULLORDER; | |
465 | } | |
466 | ||
467 | int32_t OrderList::size() const | |
468 | { | |
469 | return listSize; | |
470 | } | |
471 | ||
472 | void OrderList::reverse() | |
473 | { | |
474 | for(int32_t f = 0, b = listSize - 1; f < b; f += 1, b -= 1) { | |
475 | Order swap = list[b]; | |
476 | ||
477 | list[b] = list[f]; | |
478 | list[f] = swap; | |
479 | } | |
480 | } | |
481 | ||
482 | UBool OrderList::compare(const OrderList &other) const | |
483 | { | |
484 | if (listSize != other.listSize) { | |
485 | return FALSE; | |
486 | } | |
487 | ||
488 | for(int32_t i = 0; i < listSize; i += 1) { | |
489 | if (list[i].order != other.list[i].order || | |
490 | list[i].lowOffset != other.list[i].lowOffset || | |
491 | list[i].highOffset != other.list[i].highOffset) { | |
492 | return FALSE; | |
493 | } | |
494 | } | |
495 | ||
496 | return TRUE; | |
497 | } | |
498 | ||
499 | UBool OrderList::matchesAt(int32_t offset, const OrderList &other) const | |
500 | { | |
501 | // NOTE: sizes include the NULLORDER, which we don't want to compare. | |
502 | int32_t otherSize = other.size() - 1; | |
503 | ||
504 | if (listSize - 1 - offset < otherSize) { | |
505 | return FALSE; | |
506 | } | |
507 | ||
508 | for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) { | |
509 | if (getOrder(i) != other.getOrder(j)) { | |
510 | return FALSE; | |
511 | } | |
512 | } | |
513 | ||
514 | return TRUE; | |
515 | } | |
516 | ||
517 | static char *printOffsets(char *buffer, OrderList &list) | |
518 | { | |
519 | int32_t size = list.size(); | |
520 | char *s = buffer; | |
521 | ||
522 | for(int32_t i = 0; i < size; i += 1) { | |
523 | const Order *order = list.get(i); | |
524 | ||
525 | if (i != 0) { | |
526 | s += sprintf(s, ", "); | |
527 | } | |
528 | ||
529 | s += sprintf(s, "(%d, %d)", order->lowOffset, order->highOffset); | |
530 | } | |
531 | ||
532 | return buffer; | |
533 | } | |
534 | ||
535 | static char *printOrders(char *buffer, OrderList &list) | |
536 | { | |
537 | int32_t size = list.size(); | |
538 | char *s = buffer; | |
539 | ||
540 | for(int32_t i = 0; i < size; i += 1) { | |
541 | const Order *order = list.get(i); | |
542 | ||
543 | if (i != 0) { | |
544 | s += sprintf(s, ", "); | |
545 | } | |
546 | ||
547 | s += sprintf(s, "%8.8X", order->order); | |
548 | } | |
549 | ||
550 | return buffer; | |
551 | } | |
552 | ||
553 | void SSearchTest::offsetTest() | |
554 | { | |
555 | const char *test[] = { | |
729e4ab9 A |
556 | // The sequence \u0FB3\u0F71\u0F71\u0F80 contains a discontiguous |
557 | // contraction (\u0FB3\u0F71\u0F80) logically followed by \u0F71. | |
558 | "\\u1E33\\u0FB3\\u0F71\\u0F71\\u0F80\\uD835\\uDF6C\\u01B0", | |
559 | ||
46f4442e A |
560 | "\\ua191\\u16ef\\u2036\\u017a", |
561 | ||
562 | #if 0 | |
563 | // This results in a complex interaction between contraction, | |
564 | // expansion and normalization that confuses the backwards offset fixups. | |
565 | "\\u0F7F\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85", | |
566 | #endif | |
567 | ||
568 | "\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85", | |
569 | "\\u07E9\\u07EA\\u07F1\\u07F2\\u07F3", | |
570 | ||
571 | "\\u02FE\\u02FF" | |
572 | "\\u0300\\u0301\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u0309\\u030A\\u030B\\u030C\\u030D\\u030E\\u030F" | |
573 | "\\u0310\\u0311\\u0312\\u0313\\u0314\\u0315\\u0316\\u0317\\u0318\\u0319\\u031A\\u031B\\u031C\\u031D\\u031E\\u031F" | |
574 | "\\u0320\\u0321\\u0322\\u0323\\u0324\\u0325\\u0326\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F" | |
575 | "\\u0330\\u0331\\u0332\\u0333\\u0334\\u0335\\u0336\\u0337\\u0338\\u0339\\u033A\\u033B\\u033C\\u033D\\u033E\\u033F" | |
729e4ab9 | 576 | "\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\\u034A\\u034B\\u034C\\u034D\\u034E", // currently not working, see #8081 |
46f4442e | 577 | |
729e4ab9 A |
578 | "\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318", // currently not working, see #8081 |
579 | "a\\u02FF\\u0301\\u0316", // currently not working, see #8081 | |
580 | "a\\u02FF\\u0316\\u0301", | |
581 | "a\\u0430\\u0301\\u0316", | |
582 | "a\\u0430\\u0316\\u0301", | |
46f4442e | 583 | "abc\\u0E41\\u0301\\u0316", |
729e4ab9 A |
584 | "abc\\u0E41\\u0316\\u0301", |
585 | "\\u0E41\\u0301\\u0316", | |
586 | "\\u0E41\\u0316\\u0301", | |
587 | "a\\u0301\\u0316", | |
588 | "a\\u0316\\u0301", | |
589 | "\\uAC52\\uAC53", | |
590 | "\\u34CA\\u34CB", | |
591 | "\\u11ED\\u11EE", | |
592 | "\\u30C3\\u30D0", | |
593 | "p\\u00E9ch\\u00E9", | |
46f4442e A |
594 | "a\\u0301\\u0325", |
595 | "a\\u0300\\u0325", | |
596 | "a\\u0325\\u0300", | |
597 | "A\\u0323\\u0300B", | |
598 | "A\\u0300\\u0323B", | |
599 | "A\\u0301\\u0323B", | |
600 | "A\\u0302\\u0301\\u0323B", | |
601 | "abc", | |
602 | "ab\\u0300c", | |
603 | "ab\\u0300\\u0323c", | |
604 | " \\uD800\\uDC00\\uDC00", | |
605 | "a\\uD800\\uDC00\\uDC00", | |
606 | "A\\u0301\\u0301", | |
607 | "A\\u0301\\u0323", | |
608 | "A\\u0301\\u0323B", | |
609 | "B\\u0301\\u0323C", | |
610 | "A\\u0300\\u0323B", | |
611 | "\\u0301A\\u0301\\u0301", | |
612 | "abcd\\r\\u0301", | |
613 | "p\\u00EAche", | |
614 | "pe\\u0302che", | |
615 | }; | |
616 | ||
2ca993e8 | 617 | int32_t testCount = UPRV_LENGTHOF(test); |
46f4442e A |
618 | UErrorCode status = U_ZERO_ERROR; |
619 | RuleBasedCollator *col = (RuleBasedCollator *) Collator::createInstance(Locale::getEnglish(), status); | |
620 | if (U_FAILURE(status)) { | |
729e4ab9 | 621 | errcheckln(status, "Failed to create collator in offsetTest! - %s", u_errorName(status)); |
46f4442e A |
622 | return; |
623 | } | |
624 | char buffer[4096]; // A bit of a hack... just happens to be long enough for all the test cases... | |
625 | // We could allocate one that's the right size by (CE_count * 10) + 2 | |
626 | // 10 chars is enough room for 8 hex digits plus ", ". 2 extra chars for "[" and "]" | |
627 | ||
628 | col->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status); | |
629 | ||
630 | for(int32_t i = 0; i < testCount; i += 1) { | |
631 | UnicodeString ts = CharsToUnicodeString(test[i]); | |
632 | CollationElementIterator *iter = col->createCollationElementIterator(ts); | |
633 | OrderList forwardList; | |
634 | OrderList backwardList; | |
635 | int32_t order, low, high; | |
636 | ||
637 | do { | |
638 | low = iter->getOffset(); | |
639 | order = iter->next(status); | |
640 | high = iter->getOffset(); | |
641 | ||
642 | forwardList.add(order, low, high); | |
643 | } while (order != CollationElementIterator::NULLORDER); | |
644 | ||
645 | iter->reset(); | |
646 | iter->setOffset(ts.length(), status); | |
647 | ||
648 | backwardList.add(CollationElementIterator::NULLORDER, iter->getOffset(), iter->getOffset()); | |
649 | ||
650 | do { | |
651 | high = iter->getOffset(); | |
652 | order = iter->previous(status); | |
653 | low = iter->getOffset(); | |
654 | ||
655 | if (order == CollationElementIterator::NULLORDER) { | |
656 | break; | |
657 | } | |
658 | ||
659 | backwardList.add(order, low, high); | |
660 | } while (TRUE); | |
661 | ||
662 | backwardList.reverse(); | |
663 | ||
664 | if (forwardList.compare(backwardList)) { | |
665 | logln("Works with \"%s\"", test[i]); | |
666 | logln("Forward offsets: [%s]", printOffsets(buffer, forwardList)); | |
667 | // logln("Backward offsets: [%s]", printOffsets(buffer, backwardList)); | |
668 | ||
669 | logln("Forward CEs: [%s]", printOrders(buffer, forwardList)); | |
670 | // logln("Backward CEs: [%s]", printOrders(buffer, backwardList)); | |
671 | ||
672 | logln(); | |
673 | } else { | |
674 | errln("Fails with \"%s\"", test[i]); | |
675 | infoln("Forward offsets: [%s]", printOffsets(buffer, forwardList)); | |
676 | infoln("Backward offsets: [%s]", printOffsets(buffer, backwardList)); | |
677 | ||
678 | infoln("Forward CEs: [%s]", printOrders(buffer, forwardList)); | |
679 | infoln("Backward CEs: [%s]", printOrders(buffer, backwardList)); | |
680 | ||
681 | infoln(); | |
682 | } | |
683 | delete iter; | |
684 | } | |
685 | delete col; | |
686 | } | |
687 | ||
729e4ab9 A |
#if 0
// Disabled helper. Appends a rendering of string to buffer and returns
// buffer: code points in 0x20..0x7F are appended unchanged, except that a
// backslash is appended as the literal passed below; every other code point
// is appended as a \u or \U hexadecimal escape.
static UnicodeString &escape(const UnicodeString &string, UnicodeString &buffer)
{
    for(int32_t i = 0; i < string.length(); i += 1) {
        const UChar32 ch = string.char32At(i);
        const UBool inAsciiRange = (ch >= 0x0020 && ch <= 0x007F);

        if (!inAsciiRange) {
            char cbuffer[12];
            const char *format = (ch <= 0xFFFFL) ? "\\u%4.4X" : "\\U%8.8X";

            sprintf(cbuffer, format, ch);
            buffer.append(cbuffer);
        } else if (ch == 0x005C) {
            buffer.append("\\\\");
        } else {
            buffer.append(ch);
        }

        // Supplementary code points take two code units; skip the second.
        if (ch >= 0x10000L) {
            i += 1;
        }
    }

    return buffer;
}
#endif
46f4442e | 720 | |
51004dcb | 721 | void SSearchTest::sharpSTest() |
46f4442e A |
722 | { |
723 | UErrorCode status = U_ZERO_ERROR; | |
729e4ab9 | 724 | UCollator *coll = NULL; |
729e4ab9 A |
725 | UnicodeString lp = "fuss"; |
726 | UnicodeString sp = "fu\\u00DF"; | |
729e4ab9 A |
727 | UnicodeString targets[] = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball", |
728 | "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF", | |
729 | "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"}; | |
730 | int32_t start = -1, end = -1; | |
731 | ||
732 | coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status); | |
51004dcb | 733 | TEST_ASSERT_SUCCESS(status); |
46f4442e | 734 | |
51004dcb A |
735 | UnicodeString lpUnescaped = lp.unescape(); |
736 | UnicodeString spUnescaped = sp.unescape(); | |
729e4ab9 | 737 | |
51004dcb A |
738 | LocalUStringSearchPointer ussLong(usearch_openFromCollator(lpUnescaped.getBuffer(), lpUnescaped.length(), |
739 | lpUnescaped.getBuffer(), lpUnescaped.length(), // actual test data will be set later | |
740 | coll, | |
741 | NULL, // the break iterator | |
742 | &status)); | |
46f4442e | 743 | |
51004dcb A |
744 | LocalUStringSearchPointer ussShort(usearch_openFromCollator(spUnescaped.getBuffer(), spUnescaped.length(), |
745 | spUnescaped.getBuffer(), spUnescaped.length(), // actual test data will be set later | |
746 | coll, | |
747 | NULL, // the break iterator | |
748 | &status)); | |
749 | TEST_ASSERT_SUCCESS(status); | |
46f4442e | 750 | |
2ca993e8 | 751 | for (uint32_t t = 0; t < UPRV_LENGTHOF(targets); t += 1) { |
51004dcb | 752 | UBool bFound; |
729e4ab9 | 753 | UnicodeString target = targets[t].unescape(); |
46f4442e | 754 | |
51004dcb A |
755 | start = end = -1; |
756 | usearch_setText(ussLong.getAlias(), target.getBuffer(), target.length(), &status); | |
757 | bFound = usearch_search(ussLong.getAlias(), 0, &start, &end, &status); | |
758 | TEST_ASSERT_SUCCESS(status); | |
759 | if (bFound) { | |
729e4ab9 | 760 | logln("Test %d: found long pattern at [%d, %d].", t, start, end); |
46f4442e | 761 | } else { |
51004dcb | 762 | dataerrln("Test %d: did not find long pattern.", t); |
46f4442e | 763 | } |
46f4442e | 764 | |
51004dcb A |
765 | usearch_setText(ussShort.getAlias(), target.getBuffer(), target.length(), &status); |
766 | bFound = usearch_search(ussShort.getAlias(), 0, &start, &end, &status); | |
767 | TEST_ASSERT_SUCCESS(status); | |
768 | if (bFound) { | |
769 | logln("Test %d: found long pattern at [%d, %d].", t, start, end); | |
46f4442e | 770 | } else { |
51004dcb | 771 | dataerrln("Test %d: did not find long pattern.", t); |
729e4ab9 A |
772 | } |
773 | } | |
46f4442e | 774 | |
729e4ab9 | 775 | ucol_close(coll); |
46f4442e A |
776 | } |
777 | ||
729e4ab9 | 778 | void SSearchTest::goodSuffixTest() |
46f4442e | 779 | { |
729e4ab9 A |
780 | UErrorCode status = U_ZERO_ERROR; |
781 | UCollator *coll = NULL; | |
729e4ab9 A |
782 | UnicodeString pat = /*"gcagagag"*/ "fxeld"; |
783 | UnicodeString target = /*"gcatcgcagagagtatacagtacg"*/ "cloveldfxeld"; | |
729e4ab9 | 784 | int32_t start = -1, end = -1; |
51004dcb | 785 | UBool bFound; |
729e4ab9 A |
786 | |
787 | coll = ucol_open(NULL, &status); | |
51004dcb | 788 | TEST_ASSERT_SUCCESS(status); |
729e4ab9 | 789 | |
51004dcb A |
790 | LocalUStringSearchPointer ss(usearch_openFromCollator(pat.getBuffer(), pat.length(), |
791 | target.getBuffer(), target.length(), | |
792 | coll, | |
793 | NULL, // the break iterator | |
794 | &status)); | |
795 | TEST_ASSERT_SUCCESS(status); | |
46f4442e | 796 | |
51004dcb A |
797 | bFound = usearch_search(ss.getAlias(), 0, &start, &end, &status); |
798 | TEST_ASSERT_SUCCESS(status); | |
799 | if (bFound) { | |
729e4ab9 A |
800 | logln("Found pattern at [%d, %d].", start, end); |
801 | } else { | |
51004dcb | 802 | dataerrln("Did not find pattern."); |
729e4ab9 A |
803 | } |
804 | ||
729e4ab9 | 805 | ucol_close(coll); |
46f4442e A |
806 | } |
807 | ||
729e4ab9 A |
808 | // |
809 | // searchTime() A quick and dirty performance test for string search. | |
810 | // Probably doesn't really belong as part of intltest, but it | |
811 | // does check that the search succeeds, and gets the right result, | |
812 | // so it serves as a functionality test also. | |
813 | // | |
814 | // To run as a perf test, up the loop count, select by commenting | |
815 | // and uncommenting in the code the operation to be measured, | |
816 | // rebuild, and measure the running time of this test alone. | |
817 | // | |
818 | // time LD_LIBRARY_PATH=whatever ./intltest collate/SSearchTest/searchTime | |
819 | // | |
820 | void SSearchTest::searchTime() { | |
821 | static const char *longishText = | |
822 | "Whylom, as olde stories tellen us,\n" | |
823 | "Ther was a duk that highte Theseus:\n" | |
824 | "Of Athenes he was lord and governour,\n" | |
825 | "And in his tyme swich a conquerour,\n" | |
826 | "That gretter was ther noon under the sonne.\n" | |
827 | "Ful many a riche contree hadde he wonne;\n" | |
828 | "What with his wisdom and his chivalrye,\n" | |
829 | "He conquered al the regne of Femenye,\n" | |
830 | "That whylom was y-cleped Scithia;\n" | |
831 | "And weddede the quene Ipolita,\n" | |
832 | "And broghte hir hoom with him in his contree\n" | |
833 | "With muchel glorie and greet solempnitee,\n" | |
834 | "And eek hir yonge suster Emelye.\n" | |
835 | "And thus with victorie and with melodye\n" | |
836 | "Lete I this noble duk to Athenes ryde,\n" | |
837 | "And al his hoost, in armes, him bisyde.\n" | |
838 | "And certes, if it nere to long to here,\n" | |
839 | "I wolde han told yow fully the manere,\n" | |
840 | "How wonnen was the regne of Femenye\n" | |
841 | "By Theseus, and by his chivalrye;\n" | |
842 | "And of the grete bataille for the nones\n" | |
843 | "Bitwixen Athen's and Amazones;\n" | |
844 | "And how asseged was Ipolita,\n" | |
845 | "The faire hardy quene of Scithia;\n" | |
846 | "And of the feste that was at hir weddinge,\n" | |
847 | "And of the tempest at hir hoom-cominge;\n" | |
848 | "But al that thing I moot as now forbere.\n" | |
849 | "I have, God woot, a large feeld to ere,\n" | |
850 | "And wayke been the oxen in my plough.\n" | |
851 | "The remenant of the tale is long y-nough.\n" | |
852 | "I wol nat letten eek noon of this route;\n" | |
853 | "Lat every felawe telle his tale aboute,\n" | |
854 | "And lat see now who shal the soper winne;\n" | |
855 | "And ther I lefte, I wol ageyn biginne.\n" | |
856 | "This duk, of whom I make mencioun,\n" | |
857 | "When he was come almost unto the toun,\n" | |
858 | "In al his wele and in his moste pryde,\n" | |
859 | "He was war, as he caste his eye asyde,\n" | |
860 | "Wher that ther kneled in the hye weye\n" | |
861 | "A companye of ladies, tweye and tweye,\n" | |
862 | "Ech after other, clad in clothes blake; \n" | |
863 | "But swich a cry and swich a wo they make,\n" | |
864 | "That in this world nis creature livinge,\n" | |
865 | "That herde swich another weymentinge;\n" | |
866 | "And of this cry they nolde never stenten,\n" | |
867 | "Til they the reynes of his brydel henten.\n" | |
868 | "'What folk ben ye, that at myn hoomcominge\n" | |
869 | "Perturben so my feste with cryinge'?\n" | |
870 | "Quod Theseus, 'have ye so greet envye\n" | |
871 | "Of myn honour, that thus compleyne and crye? \n" | |
872 | "Or who hath yow misboden, or offended?\n" | |
873 | "And telleth me if it may been amended;\n" | |
874 | "And why that ye ben clothed thus in blak'?\n" | |
875 | "The eldest lady of hem alle spak,\n" | |
876 | "When she hadde swowned with a deedly chere,\n" | |
877 | "That it was routhe for to seen and here,\n" | |
878 | "And seyde: 'Lord, to whom Fortune hath yiven\n" | |
879 | "Victorie, and as a conquerour to liven,\n" | |
880 | "Noght greveth us your glorie and your honour;\n" | |
881 | "But we biseken mercy and socour.\n" | |
882 | "Have mercy on our wo and our distresse.\n" | |
883 | "Som drope of pitee, thurgh thy gentilesse,\n" | |
884 | "Up-on us wrecched wommen lat thou falle.\n" | |
885 | "For certes, lord, ther nis noon of us alle,\n" | |
886 | "That she nath been a duchesse or a quene;\n" | |
887 | "Now be we caitifs, as it is wel sene:\n" | |
888 | "Thanked be Fortune, and hir false wheel,\n" | |
889 | "That noon estat assureth to be weel.\n" | |
890 | "And certes, lord, t'abyden your presence,\n" | |
891 | "Here in the temple of the goddesse Clemence\n" | |
892 | "We han ben waytinge al this fourtenight;\n" | |
893 | "Now help us, lord, sith it is in thy might.\n" | |
894 | "I wrecche, which that wepe and waille thus,\n" | |
895 | "Was whylom wyf to king Capaneus,\n" | |
896 | "That starf at Thebes, cursed be that day!\n" | |
897 | "And alle we, that been in this array,\n" | |
898 | "And maken al this lamentacioun,\n" | |
899 | "We losten alle our housbondes at that toun,\n" | |
900 | "Whyl that the sege ther-aboute lay.\n" | |
901 | "And yet now th'olde Creon, weylaway!\n" | |
902 | "The lord is now of Thebes the citee, \n" | |
903 | "Fulfild of ire and of iniquitee,\n" | |
904 | "He, for despyt, and for his tirannye,\n" | |
905 | "To do the dede bodyes vileinye,\n" | |
906 | "Of alle our lordes, whiche that ben slawe,\n" | |
907 | "Hath alle the bodyes on an heep y-drawe,\n" | |
908 | "And wol nat suffren hem, by noon assent,\n" | |
909 | "Neither to been y-buried nor y-brent,\n" | |
910 | "But maketh houndes ete hem in despyt. zet'\n"; | |
911 | ||
729e4ab9 A |
912 | const char *cPattern = "maketh houndes ete hem"; |
913 | //const char *cPattern = "Whylom"; | |
914 | //const char *cPattern = "zet"; | |
915 | const char *testId = "searchTime()"; // for error macros. | |
916 | UnicodeString target = longishText; | |
917 | UErrorCode status = U_ZERO_ERROR; | |
46f4442e | 918 | |
46f4442e | 919 | |
729e4ab9 | 920 | LocalUCollatorPointer collator(ucol_open("en", &status)); |
729e4ab9 A |
921 | //ucol_setStrength(collator.getAlias(), collatorStrength); |
922 | //ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status); | |
923 | UnicodeString uPattern = cPattern; | |
729e4ab9 A |
924 | LocalUStringSearchPointer uss(usearch_openFromCollator(uPattern.getBuffer(), uPattern.length(), |
925 | target.getBuffer(), target.length(), | |
926 | collator.getAlias(), | |
927 | NULL, // the break iterator | |
928 | &status)); | |
929 | TEST_ASSERT_SUCCESS(status); | |
46f4442e | 930 | |
729e4ab9 A |
931 | // int32_t foundStart; |
932 | // int32_t foundEnd; | |
933 | UBool found; | |
934 | ||
935 | // Find the match position usgin strstr | |
936 | const char *pm = strstr(longishText, cPattern); | |
937 | TEST_ASSERT_M(pm!=NULL, "No pattern match with strstr"); | |
938 | int32_t refMatchPos = (int32_t)(pm - longishText); | |
939 | int32_t icuMatchPos; | |
940 | int32_t icuMatchEnd; | |
729e4ab9 A |
941 | usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status); |
942 | TEST_ASSERT_SUCCESS(status); | |
729e4ab9 | 943 | TEST_ASSERT_M(refMatchPos == icuMatchPos, "strstr and icu give different match positions."); |
46f4442e | 944 | |
729e4ab9 | 945 | int32_t i; |
4388f060 | 946 | // int32_t j=0; |
729e4ab9 A |
947 | |
948 | // Try loopcounts around 100000 to some millions, depending on the operation, | |
949 | // to get runtimes of at least several seconds. | |
950 | for (i=0; i<10000; i++) { | |
729e4ab9 | 951 | found = usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status); |
57a6839d | 952 | (void)found; // Suppress set but not used warning. |
729e4ab9 A |
953 | //TEST_ASSERT_SUCCESS(status); |
954 | //TEST_ASSERT(found); | |
955 | ||
956 | // usearch_setOffset(uss.getAlias(), 0, &status); | |
957 | // icuMatchPos = usearch_next(uss.getAlias(), &status); | |
46f4442e | 958 | |
729e4ab9 A |
959 | // The i+j stuff is to confuse the optimizer and get it to actually leave the |
960 | // call to strstr in place. | |
961 | //pm = strstr(longishText+j, cPattern); | |
962 | //j = (j + i)%5; | |
46f4442e | 963 | } |
729e4ab9 | 964 | |
4388f060 | 965 | //printf("%ld, %d\n", pm-longishText, j); |
46f4442e A |
966 | } |
967 | ||
//----------------------------------------------------------------------------------------
//
//   Random Numbers.  Similar to standard lib rand() and srand()
//                    The library versions are deliberately not used, in order to
//                      1.  Get same results on all platforms.
//                      2.  Get access to current seed, to more easily reproduce failures.
//
//---------------------------------------------------------------------------------------

// Current generator state. Tests may read it (to report the seed of a failing
// case) or overwrite it (to replay a case).
static uint32_t m_seed = 1;

// Advances the linear congruential generator by one step and returns a value
// in the range 0..32767, taken from bits 16..30 of the new state.
static uint32_t m_rand()
{
    const uint32_t multiplier = 1103515245;
    const uint32_t increment  = 12345;

    m_seed = m_seed * multiplier + increment;

    // Same as (m_seed / 65536) % 32768 for an unsigned value.
    return (uint32_t)((m_seed >> 16) & 0x7FFF);
}
983 | ||
984 | class Monkey | |
985 | { | |
986 | public: | |
987 | virtual void append(UnicodeString &test, UnicodeString &alternate) = 0; | |
988 | ||
989 | protected: | |
990 | Monkey(); | |
991 | virtual ~Monkey(); | |
992 | }; | |
993 | ||
994 | Monkey::Monkey() | |
995 | { | |
996 | // ook? | |
997 | } | |
998 | ||
999 | Monkey::~Monkey() | |
1000 | { | |
1001 | // ook? | |
1002 | } | |
1003 | ||
1004 | class SetMonkey : public Monkey | |
1005 | { | |
1006 | public: | |
1007 | SetMonkey(const USet *theSet); | |
1008 | ~SetMonkey(); | |
1009 | ||
1010 | virtual void append(UnicodeString &test, UnicodeString &alternate); | |
1011 | ||
1012 | private: | |
1013 | const USet *set; | |
1014 | }; | |
1015 | ||
1016 | SetMonkey::SetMonkey(const USet *theSet) | |
1017 | : Monkey(), set(theSet) | |
1018 | { | |
1019 | // ook? | |
1020 | } | |
1021 | ||
1022 | SetMonkey::~SetMonkey() | |
1023 | { | |
1024 | //ook... | |
1025 | } | |
1026 | ||
1027 | void SetMonkey::append(UnicodeString &test, UnicodeString &alternate) | |
1028 | { | |
1029 | int32_t size = uset_size(set); | |
1030 | int32_t index = m_rand() % size; | |
1031 | UChar32 ch = uset_charAt(set, index); | |
1032 | UnicodeString str(ch); | |
1033 | ||
1034 | test.append(str); | |
1035 | alternate.append(str); // flip case, or some junk? | |
1036 | } | |
1037 | ||
1038 | class StringSetMonkey : public Monkey | |
1039 | { | |
1040 | public: | |
729e4ab9 | 1041 | StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData); |
46f4442e A |
1042 | ~StringSetMonkey(); |
1043 | ||
1044 | void append(UnicodeString &testCase, UnicodeString &alternate); | |
1045 | ||
1046 | private: | |
1047 | UnicodeString &generateAlternative(const UnicodeString &testCase, UnicodeString &alternate); | |
1048 | ||
1049 | const USet *set; | |
729e4ab9 A |
1050 | UCollator *coll; |
1051 | CollData *collData; | |
46f4442e A |
1052 | }; |
1053 | ||
729e4ab9 A |
1054 | StringSetMonkey::StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData) |
1055 | : Monkey(), set(theSet), coll(theCollator), collData(theCollData) | |
46f4442e A |
1056 | { |
1057 | // ook. | |
1058 | } | |
1059 | ||
1060 | StringSetMonkey::~StringSetMonkey() | |
1061 | { | |
1062 | // ook? | |
1063 | } | |
1064 | ||
1065 | void StringSetMonkey::append(UnicodeString &testCase, UnicodeString &alternate) | |
1066 | { | |
1067 | int32_t itemCount = uset_getItemCount(set), len = 0; | |
1068 | int32_t index = m_rand() % itemCount; | |
1069 | UChar32 rangeStart = 0, rangeEnd = 0; | |
1070 | UChar buffer[16]; | |
1071 | UErrorCode err = U_ZERO_ERROR; | |
1072 | ||
1073 | len = uset_getItem(set, index, &rangeStart, &rangeEnd, buffer, 16, &err); | |
1074 | ||
1075 | if (len == 0) { | |
1076 | int32_t offset = m_rand() % (rangeEnd - rangeStart + 1); | |
1077 | UChar32 ch = rangeStart + offset; | |
1078 | UnicodeString str(ch); | |
1079 | ||
1080 | testCase.append(str); | |
1081 | generateAlternative(str, alternate); | |
1082 | } else if (len > 0) { | |
1083 | // should check that len < 16... | |
1084 | UnicodeString str(buffer, len); | |
1085 | ||
1086 | testCase.append(str); | |
1087 | generateAlternative(str, alternate); | |
1088 | } else { | |
1089 | // shouldn't happen... | |
1090 | } | |
1091 | } | |
1092 | ||
1093 | UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCase, UnicodeString &alternate) | |
1094 | { | |
1095 | // find out shortest string for the longest sequence of ces. | |
1096 | // needs to be refined to use dynamic programming, but will be roughly right | |
729e4ab9 A |
1097 | UErrorCode status = U_ZERO_ERROR; |
1098 | CEList ceList(coll, testCase, status); | |
46f4442e A |
1099 | UnicodeString alt; |
1100 | int32_t offset = 0; | |
1101 | ||
1102 | if (ceList.size() == 0) { | |
1103 | return alternate.append(testCase); | |
1104 | } | |
1105 | ||
1106 | while (offset < ceList.size()) { | |
1107 | int32_t ce = ceList.get(offset); | |
729e4ab9 | 1108 | const StringList *strings = collData->getStringList(ce); |
46f4442e A |
1109 | |
1110 | if (strings == NULL) { | |
1111 | return alternate.append(testCase); | |
1112 | } | |
1113 | ||
1114 | int32_t stringCount = strings->size(); | |
1115 | int32_t tries = 0; | |
729e4ab9 | 1116 | |
46f4442e | 1117 | // find random string that generates the same CEList |
729e4ab9 A |
1118 | const CEList *ceList2 = NULL; |
1119 | const UnicodeString *string = NULL; | |
1120 | UBool matches = FALSE; | |
46f4442e A |
1121 | |
1122 | do { | |
1123 | int32_t s = m_rand() % stringCount; | |
1124 | ||
1125 | if (tries++ > stringCount) { | |
1126 | alternate.append(testCase); | |
1127 | return alternate; | |
1128 | } | |
1129 | ||
1130 | string = strings->get(s); | |
729e4ab9 A |
1131 | ceList2 = collData->getCEList(string); |
1132 | matches = ceList.matchesAt(offset, ceList2); | |
1133 | ||
1134 | if (! matches) { | |
1135 | collData->freeCEList((CEList *) ceList2); | |
1136 | } | |
1137 | } while (! matches); | |
46f4442e A |
1138 | |
1139 | alt.append(*string); | |
1140 | offset += ceList2->size(); | |
729e4ab9 | 1141 | collData->freeCEList(ceList2); |
46f4442e A |
1142 | } |
1143 | ||
729e4ab9 | 1144 | const CEList altCEs(coll, alt, status); |
46f4442e A |
1145 | |
1146 | if (ceList.matchesAt(0, &altCEs)) { | |
1147 | return alternate.append(alt); | |
1148 | } | |
1149 | ||
1150 | return alternate.append(testCase); | |
1151 | } | |
1152 | ||
1153 | static void generateTestCase(UCollator *coll, Monkey *monkeys[], int32_t monkeyCount, UnicodeString &testCase, UnicodeString &alternate) | |
1154 | { | |
1155 | int32_t pieces = (m_rand() % 4) + 1; | |
729e4ab9 | 1156 | UErrorCode status = U_ZERO_ERROR; |
46f4442e A |
1157 | UBool matches; |
1158 | ||
1159 | do { | |
1160 | testCase.remove(); | |
1161 | alternate.remove(); | |
1162 | monkeys[0]->append(testCase, alternate); | |
1163 | ||
1164 | for(int32_t piece = 0; piece < pieces; piece += 1) { | |
1165 | int32_t monkey = m_rand() % monkeyCount; | |
1166 | ||
1167 | monkeys[monkey]->append(testCase, alternate); | |
1168 | } | |
1169 | ||
729e4ab9 A |
1170 | const CEList ceTest(coll, testCase, status); |
1171 | const CEList ceAlt(coll, alternate, status); | |
46f4442e A |
1172 | |
1173 | matches = ceTest.matchesAt(0, &ceAlt); | |
1174 | } while (! matches); | |
1175 | } | |
1176 | ||
46f4442e A |
1177 | static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd) |
1178 | { | |
1179 | UErrorCode status = U_ZERO_ERROR; | |
1180 | OrderList targetOrders(coll, target, offset); | |
1181 | OrderList patternOrders(coll, pattern); | |
1182 | int32_t targetSize = targetOrders.size() - 1; | |
1183 | int32_t patternSize = patternOrders.size() - 1; | |
729e4ab9 A |
1184 | UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status), |
1185 | target.getBuffer(), target.length(), &status); | |
46f4442e A |
1186 | |
1187 | if (patternSize == 0) { | |
729e4ab9 A |
1188 | // Searching for an empty pattern always fails |
1189 | matchStart = matchEnd = -1; | |
1190 | ubrk_close(charBreakIterator); | |
46f4442e A |
1191 | return FALSE; |
1192 | } | |
1193 | ||
1194 | matchStart = matchEnd = -1; | |
1195 | ||
1196 | for(int32_t i = 0; i < targetSize; i += 1) { | |
1197 | if (targetOrders.matchesAt(i, patternOrders)) { | |
1198 | int32_t start = targetOrders.getLowOffset(i); | |
1199 | int32_t maxLimit = targetOrders.getLowOffset(i + patternSize); | |
1200 | int32_t minLimit = targetOrders.getLowOffset(i + patternSize - 1); | |
1201 | ||
1202 | // if the low and high offsets of the first CE in | |
1203 | // the match are the same, it means that the match | |
1204 | // starts in the middle of an expansion - all but | |
1205 | // the first CE of the expansion will have the offset | |
1206 | // of the following character. | |
1207 | if (start == targetOrders.getHighOffset(i)) { | |
1208 | continue; | |
1209 | } | |
1210 | ||
1211 | // Make sure match starts on a grapheme boundary | |
1212 | if (! ubrk_isBoundary(charBreakIterator, start)) { | |
1213 | continue; | |
1214 | } | |
1215 | ||
1216 | // If the low and high offsets of the CE after the match | |
1217 | // are the same, it means that the match ends in the middle | |
1218 | // of an expansion sequence. | |
1219 | if (maxLimit == targetOrders.getHighOffset(i + patternSize) && | |
1220 | targetOrders.getOrder(i + patternSize) != UCOL_NULLORDER) { | |
1221 | continue; | |
1222 | } | |
1223 | ||
1224 | int32_t mend = maxLimit; | |
1225 | ||
1226 | // Find the first grapheme break after the character index | |
1227 | // of the last CE in the match. If it's after character index | |
1228 | // that's after the last CE in the match, use that index | |
1229 | // as the end of the match. | |
1230 | if (minLimit < maxLimit) { | |
4388f060 A |
1231 | // When the last CE's low index is same with its high index, the CE is likely |
1232 | // a part of expansion. In this case, the index is located just after the | |
1233 | // character corresponding to the CEs compared above. If the index is right | |
1234 | // at the break boundary, move the position to the next boundary will result | |
1235 | // incorrect match length when there are ignorable characters exist between | |
1236 | // the position and the next character produces CE(s). See ticket#8482. | |
1237 | if (minLimit == targetOrders.getHighOffset(i + patternSize - 1) && ubrk_isBoundary(charBreakIterator, minLimit)) { | |
1238 | mend = minLimit; | |
1239 | } else { | |
1240 | int32_t nba = ubrk_following(charBreakIterator, minLimit); | |
1241 | ||
1242 | if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) { | |
1243 | mend = nba; | |
1244 | } | |
46f4442e A |
1245 | } |
1246 | } | |
1247 | ||
1248 | if (mend > maxLimit) { | |
1249 | continue; | |
1250 | } | |
1251 | ||
1252 | if (! ubrk_isBoundary(charBreakIterator, mend)) { | |
1253 | continue; | |
1254 | } | |
1255 | ||
1256 | matchStart = start; | |
1257 | matchEnd = mend; | |
1258 | ||
1259 | ubrk_close(charBreakIterator); | |
1260 | return TRUE; | |
1261 | } | |
1262 | } | |
1263 | ||
1264 | ubrk_close(charBreakIterator); | |
1265 | return FALSE; | |
1266 | } | |
1267 | ||
1268 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS | |
1269 | static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) { | |
1270 | int32_t val = defaultVal; | |
1271 | ||
1272 | name.append(" *= *(-?\\d+)"); | |
1273 | ||
1274 | UErrorCode status = U_ZERO_ERROR; | |
1275 | RegexMatcher m(name, params, 0, status); | |
1276 | ||
1277 | if (m.find()) { | |
1278 | // The param exists. Convert the string to an int. | |
1279 | char valString[100]; | |
1280 | int32_t paramLength = m.end(1, status) - m.start(1, status); | |
1281 | ||
1282 | if (paramLength >= (int32_t)(sizeof(valString)-1)) { | |
1283 | paramLength = (int32_t)(sizeof(valString)-2); | |
1284 | } | |
1285 | ||
1286 | params.extract(m.start(1, status), paramLength, valString, sizeof(valString)); | |
51004dcb | 1287 | val = uprv_strtol(valString, NULL, 10); |
46f4442e A |
1288 | |
1289 | // Delete this parameter from the params string. | |
1290 | m.reset(); | |
1291 | params = m.replaceFirst("", status); | |
1292 | } | |
1293 | ||
1294 | //U_ASSERT(U_SUCCESS(status)); | |
1295 | if (! U_SUCCESS(status)) { | |
1296 | val = defaultVal; | |
1297 | } | |
1298 | ||
1299 | return val; | |
1300 | } | |
1301 | #endif | |
1302 | ||
1303 | #if !UCONFIG_NO_COLLATION | |
1304 | int32_t SSearchTest::monkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern, | |
1305 | const char *name, const char *strength, uint32_t seed) | |
1306 | { | |
1307 | UErrorCode status = U_ZERO_ERROR; | |
1308 | int32_t actualStart = -1, actualEnd = -1; | |
1309 | //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length(); | |
1310 | int32_t expectedStart = -1, expectedEnd = -1; | |
1311 | int32_t notFoundCount = 0; | |
729e4ab9 A |
1312 | LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(), |
1313 | testCase.getBuffer(), testCase.length(), | |
1314 | coll, | |
1315 | NULL, // the break iterator | |
1316 | &status)); | |
46f4442e A |
1317 | |
1318 | // **** TODO: find *all* matches, not just first one **** | |
1319 | simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd); | |
1320 | ||
729e4ab9 | 1321 | usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status); |
46f4442e | 1322 | |
729e4ab9 | 1323 | if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) { |
46f4442e A |
1324 | errln("Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n" |
1325 | " strength=%s seed=%d", | |
1326 | name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed); | |
1327 | } | |
1328 | ||
1329 | if (expectedStart == -1 && actualStart == -1) { | |
1330 | notFoundCount += 1; | |
1331 | } | |
1332 | ||
1333 | // **** TODO: find *all* matches, not just first one **** | |
1334 | simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd); | |
1335 | ||
729e4ab9 | 1336 | usearch_setPattern(uss.getAlias(), altPattern.getBuffer(), altPattern.length(), &status); |
46f4442e | 1337 | |
729e4ab9 | 1338 | usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status); |
46f4442e | 1339 | |
729e4ab9 | 1340 | if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) { |
46f4442e A |
1341 | errln("Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n" |
1342 | " strength=%s seed=%d", | |
1343 | name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed); | |
1344 | } | |
1345 | ||
1346 | if (expectedStart == -1 && actualStart == -1) { | |
1347 | notFoundCount += 1; | |
1348 | } | |
1349 | ||
46f4442e A |
1350 | return notFoundCount; |
1351 | } | |
1352 | #endif | |
1353 | ||
1354 | void SSearchTest::monkeyTest(char *params) | |
1355 | { | |
1356 | // ook! | |
1357 | UErrorCode status = U_ZERO_ERROR; | |
729e4ab9 A |
1358 | //UCollator *coll = ucol_open(NULL, &status); |
1359 | UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status); | |
1360 | ||
46f4442e | 1361 | if (U_FAILURE(status)) { |
729e4ab9 | 1362 | errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status)); |
46f4442e A |
1363 | return; |
1364 | } | |
729e4ab9 | 1365 | |
51004dcb | 1366 | CollData *monkeyData = new CollData(coll, status); |
729e4ab9 | 1367 | |
46f4442e A |
1368 | USet *expansions = uset_openEmpty(); |
1369 | USet *contractions = uset_openEmpty(); | |
46f4442e A |
1370 | |
1371 | ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status); | |
1372 | ||
46f4442e A |
1373 | U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39); |
1374 | U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39); | |
1375 | USet *letters = uset_openPattern(letter_pattern, 39, &status); | |
1376 | SetMonkey letterMonkey(letters); | |
729e4ab9 A |
1377 | StringSetMonkey contractionMonkey(contractions, coll, monkeyData); |
1378 | StringSetMonkey expansionMonkey(expansions, coll, monkeyData); | |
46f4442e A |
1379 | UnicodeString testCase; |
1380 | UnicodeString alternate; | |
1381 | UnicodeString pattern, altPattern; | |
1382 | UnicodeString prefix, altPrefix; | |
1383 | UnicodeString suffix, altSuffix; | |
1384 | ||
1385 | Monkey *monkeys[] = { | |
1386 | &letterMonkey, | |
1387 | &contractionMonkey, | |
1388 | &expansionMonkey, | |
1389 | &contractionMonkey, | |
1390 | &expansionMonkey, | |
1391 | &contractionMonkey, | |
1392 | &expansionMonkey, | |
1393 | &contractionMonkey, | |
1394 | &expansionMonkey}; | |
2ca993e8 | 1395 | int32_t monkeyCount = UPRV_LENGTHOF(monkeys); |
729e4ab9 | 1396 | // int32_t nonMatchCount = 0; |
46f4442e A |
1397 | |
1398 | UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY}; | |
1399 | const char *strengthNames[] = {"primary", "secondary", "tertiary"}; | |
2ca993e8 | 1400 | int32_t strengthCount = UPRV_LENGTHOF(strengths); |
46f4442e A |
1401 | int32_t loopCount = quick? 1000 : 10000; |
1402 | int32_t firstStrength = 0; | |
729e4ab9 | 1403 | int32_t lastStrength = strengthCount - 1; //*/ 0; |
46f4442e A |
1404 | |
1405 | if (params != NULL) { | |
1406 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS | |
1407 | UnicodeString p(params); | |
1408 | ||
1409 | loopCount = getIntParam("loop", p, loopCount); | |
1410 | m_seed = getIntParam("seed", p, m_seed); | |
1411 | ||
1412 | RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status); | |
1413 | if (m.find()) { | |
1414 | UnicodeString breakType = m.group(1, status); | |
1415 | ||
1416 | for (int32_t s = 0; s < strengthCount; s += 1) { | |
1417 | if (breakType == strengthNames[s]) { | |
1418 | firstStrength = lastStrength = s; | |
1419 | break; | |
1420 | } | |
1421 | } | |
1422 | ||
1423 | m.reset(); | |
1424 | p = m.replaceFirst("", status); | |
1425 | } | |
1426 | ||
1427 | if (RegexMatcher("\\S", p, 0, status).find()) { | |
1428 | // Each option is stripped out of the option string as it is processed. | |
1429 | // All options have been checked. The option string should have been completely emptied.. | |
1430 | char buf[100]; | |
1431 | p.extract(buf, sizeof(buf), NULL, status); | |
1432 | buf[sizeof(buf)-1] = 0; | |
1433 | errln("Unrecognized or extra parameter: %s\n", buf); | |
1434 | return; | |
1435 | } | |
1436 | #else | |
1437 | infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters."); | |
1438 | #endif | |
1439 | } | |
1440 | ||
1441 | for(int32_t s = firstStrength; s <= lastStrength; s += 1) { | |
1442 | int32_t notFoundCount = 0; | |
1443 | ||
729e4ab9 | 1444 | logln("Setting strength to %s.", strengthNames[s]); |
46f4442e A |
1445 | ucol_setStrength(coll, strengths[s]); |
1446 | ||
1447 | // TODO: try alternate prefix and suffix too? | |
57a6839d | 1448 | // TODO: alternates are only equal at primary strength. Is this OK? |
729e4ab9 | 1449 | for(int32_t t = 0; t < loopCount; t += 1) { |
46f4442e | 1450 | uint32_t seed = m_seed; |
729e4ab9 | 1451 | // int32_t nmc = 0; |
46f4442e A |
1452 | |
1453 | generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern); | |
1454 | generateTestCase(coll, monkeys, monkeyCount, prefix, altPrefix); | |
1455 | generateTestCase(coll, monkeys, monkeyCount, suffix, altSuffix); | |
1456 | ||
1457 | // pattern | |
1458 | notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern, "pattern", strengthNames[s], seed); | |
1459 | ||
1460 | testCase.remove(); | |
1461 | testCase.append(prefix); | |
1462 | testCase.append(/*alt*/pattern); | |
1463 | ||
1464 | // prefix + pattern | |
1465 | notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern", strengthNames[s], seed); | |
1466 | ||
1467 | testCase.append(suffix); | |
1468 | ||
1469 | // prefix + pattern + suffix | |
1470 | notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern + suffix", strengthNames[s], seed); | |
1471 | ||
1472 | testCase.remove(); | |
1473 | testCase.append(pattern); | |
1474 | testCase.append(suffix); | |
729e4ab9 | 1475 | |
46f4442e A |
1476 | // pattern + suffix |
1477 | notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "pattern + suffix", strengthNames[s], seed); | |
1478 | } | |
1479 | ||
729e4ab9 A |
1480 | logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount); |
1481 | } | |
1482 | ||
1483 | uset_close(contractions); | |
1484 | uset_close(expansions); | |
1485 | uset_close(letters); | |
51004dcb | 1486 | delete monkeyData; |
729e4ab9 | 1487 | |
46f4442e A |
1488 | ucol_close(coll); |
1489 | } | |
1490 | ||
729e4ab9 A |
1491 | #endif |
1492 | ||
46f4442e | 1493 | #endif |