]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/ssearch.cpp
ICU-57132.0.1.tar.gz
[apple/icu.git] / icuSources / test / intltest / ssearch.cpp
1 /*
2 **********************************************************************
3 * Copyright (C) 2005-2016, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_COLLATION
11
12 #include "cmemory.h"
13 #include "cstring.h"
14 #include "usrchimp.h"
15
16 #include "unicode/coll.h"
17 #include "unicode/tblcoll.h"
18 #include "unicode/usearch.h"
19 #include "unicode/uset.h"
20 #include "unicode/ustring.h"
21
22 #include "unicode/coleitr.h"
23 #include "unicode/regex.h" // TODO: make conditional on regexp being built.
24
25 #include "colldata.h"
26 #include "ssearch.h"
27 #include "xmlparser.h"
28
29 #include <stdio.h> // for sprintf
30
// Identifier of the test case currently being executed. The assertion macros
// below include it in their failure messages.
char testId[100];

// Reports a failure (file, line, current test ID) when x is false; execution
// continues afterwards.
// NOTE: these macros expand to a brace block rather than do { } while (0)
// because this file also invokes them without a trailing semicolon.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d, test ID = \"%s\"", __FILE__, __LINE__, testId);}}

// Reports a data failure carrying message m when x is false, then returns
// from the enclosing function (so it is only usable in void functions).
#define TEST_ASSERT_M(x, m) {if (!(x)) { \
    dataerrln("Failure in file %s, line %d. \"%s\"", __FILE__, __LINE__, m);return;}}

// Reports a data failure when errcode is a failing UErrorCode; execution
// continues afterwards, so callers must still test the status themselves.
#define TEST_ASSERT_SUCCESS(errcode) {if (U_FAILURE(errcode)) { \
    dataerrln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \
        __FILE__, __LINE__, testId, u_errorName(errcode));}}

// Raw array allocation helpers built on uprv_malloc()/uprv_free().
// The NEW_ARRAY expansion is fully parenthesized so that it behaves as a
// single primary expression (e.g. NEW_ARRAY(T, n)[0] indexes the T array
// instead of indexing the void * returned by uprv_malloc).
#define NEW_ARRAY(type, count) ((type *) uprv_malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) uprv_free((void *) (array))
45
46 //---------------------------------------------------------------------------
47 //
48 // Test class boilerplate
49 //
50 //---------------------------------------------------------------------------
51 SSearchTest::SSearchTest()
52 {
53 }
54
55 SSearchTest::~SSearchTest()
56 {
57 }
58
59 void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char *params )
60 {
61 if (exec) logln("TestSuite SSearchTest: ");
62 switch (index) {
63 #if !UCONFIG_NO_BREAK_ITERATION
64 case 0: name = "searchTest";
65 if (exec) searchTest();
66 break;
67
68 case 1: name = "offsetTest";
69 if (exec) offsetTest();
70 break;
71
72 case 2: name = "monkeyTest";
73 if (exec) monkeyTest(params);
74 break;
75
76 case 3: name = "sharpSTest";
77 if (exec) sharpSTest();
78 break;
79
80 case 4: name = "goodSuffixTest";
81 if (exec) goodSuffixTest();
82 break;
83
84 case 5: name = "searchTime";
85 if (exec) searchTime();
86 break;
87 #endif
88 default: name = "";
89 break; //needed to end loop
90 }
91 }
92
93
94 #if !UCONFIG_NO_BREAK_ITERATION
95
96 #define PATH_BUFFER_SIZE 2048
97 const char *SSearchTest::getPath(char buffer[2048], const char *filename) {
98 UErrorCode status = U_ZERO_ERROR;
99 const char *testDataDirectory = IntlTest::getSourceTestData(status);
100
101 if (U_FAILURE(status) || strlen(testDataDirectory) + strlen(filename) + 1 >= PATH_BUFFER_SIZE) {
102 errln("ERROR: getPath() failed - %s", u_errorName(status));
103 return NULL;
104 }
105
106 strcpy(buffer, testDataDirectory);
107 strcat(buffer, filename);
108 return buffer;
109 }
110
111
112 void SSearchTest::searchTest()
113 {
114 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
115 UErrorCode status = U_ZERO_ERROR;
116 char path[PATH_BUFFER_SIZE];
117 const char *testFilePath = getPath(path, "ssearch.xml");
118
119 if (testFilePath == NULL) {
120 return; /* Couldn't get path: error message already output. */
121 }
122
123 LocalPointer<UXMLParser> parser(UXMLParser::createParser(status));
124 TEST_ASSERT_SUCCESS(status);
125 LocalPointer<UXMLElement> root(parser->parseFile(testFilePath, status));
126 TEST_ASSERT_SUCCESS(status);
127 if (U_FAILURE(status)) {
128 return;
129 }
130
131 const UnicodeString *debugTestCase = root->getAttribute("debug");
132 if (debugTestCase != NULL) {
133 // setenv("USEARCH_DEBUG", "1", 1);
134 }
135
136
137 const UXMLElement *testCase;
138 int32_t tc = 0;
139
140 while((testCase = root->nextChildElement(tc)) != NULL) {
141
142 if (testCase->getTagName().compare("test-case") != 0) {
143 errln("ssearch, unrecognized XML Element in test file");
144 continue;
145 }
146 const UnicodeString *id = testCase->getAttribute("id");
147 *testId = 0;
148 if (id != NULL) {
149 id->extract(0, id->length(), testId, sizeof(testId), US_INV);
150 }
151
152 // If debugging test case has been specified and this is not it, skip to next.
153 if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) {
154 continue;
155 }
156 //
157 // Get the requested collation strength.
158 // Default is tertiary if the XML attribute is missing from the test case.
159 //
160 const UnicodeString *strength = testCase->getAttribute("strength");
161 UColAttributeValue collatorStrength = UCOL_PRIMARY;
162 if (strength==NULL) { collatorStrength = UCOL_TERTIARY;}
163 else if (*strength=="PRIMARY") { collatorStrength = UCOL_PRIMARY;}
164 else if (*strength=="SECONDARY") { collatorStrength = UCOL_SECONDARY;}
165 else if (*strength=="TERTIARY") { collatorStrength = UCOL_TERTIARY;}
166 else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;}
167 else if (*strength=="IDENTICAL") { collatorStrength = UCOL_IDENTICAL;}
168 else {
169 // Bogus value supplied for strength. Shouldn't happen, even from
170 // typos, if the XML source has been validated.
171 // This assert is a little deceiving in that strength can be
172 // any of the allowed values, not just TERTIARY, but it will
173 // do the job of getting the error output.
174 TEST_ASSERT(*strength=="TERTIARY")
175 }
176
177 //
178 // Get the collator normalization flag. Default is UCOL_OFF.
179 //
180 UColAttributeValue normalize = UCOL_OFF;
181 const UnicodeString *norm = testCase->getAttribute("norm");
182 TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF");
183 if (norm!=NULL && *norm=="ON") {
184 normalize = UCOL_ON;
185 }
186
187 //
188 // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE.
189 //
190 UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE;
191 const UnicodeString *alt = testCase->getAttribute("alternate_handling");
192 TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE");
193 if (alt != NULL && *alt == "SHIFTED") {
194 alternateHandling = UCOL_SHIFTED;
195 }
196
197 const UnicodeString defLocale("en");
198 char clocale[100];
199 const UnicodeString *locale = testCase->getAttribute("locale");
200 if (locale == NULL || locale->length()==0) {
201 locale = &defLocale;
202 };
203 locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL);
204
205
206 UnicodeString text;
207 UnicodeString target;
208 UnicodeString pattern;
209 int32_t expectedMatchStart = -1;
210 int32_t expectedMatchLimit = -1;
211 const UXMLElement *n;
212 int32_t nodeCount = 0;
213
214 n = testCase->getChildElement("pattern");
215 TEST_ASSERT(n != NULL);
216 if (n==NULL) {
217 continue;
218 }
219 text = n->getText(FALSE);
220 text = text.unescape();
221 pattern.append(text);
222 nodeCount++;
223
224 n = testCase->getChildElement("pre");
225 if (n!=NULL) {
226 text = n->getText(FALSE);
227 text = text.unescape();
228 target.append(text);
229 nodeCount++;
230 }
231
232 n = testCase->getChildElement("m");
233 if (n!=NULL) {
234 expectedMatchStart = target.length();
235 text = n->getText(FALSE);
236 text = text.unescape();
237 target.append(text);
238 expectedMatchLimit = target.length();
239 nodeCount++;
240 }
241
242 n = testCase->getChildElement("post");
243 if (n!=NULL) {
244 text = n->getText(FALSE);
245 text = text.unescape();
246 target.append(text);
247 nodeCount++;
248 }
249
250 // Check that there weren't extra things in the XML
251 TEST_ASSERT(nodeCount == testCase->countChildren());
252
253 // Open a collator and StringSearch based on the parameters
254 // obtained from the XML.
255 //
256 status = U_ZERO_ERROR;
257 LocalUCollatorPointer collator(ucol_open(clocale, &status));
258 ucol_setStrength(collator.getAlias(), collatorStrength);
259 ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status);
260 ucol_setAttribute(collator.getAlias(), UCOL_ALTERNATE_HANDLING, alternateHandling, &status);
261 LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
262 target.getBuffer(), target.length(),
263 collator.getAlias(),
264 NULL, // the break iterator
265 &status));
266
267 TEST_ASSERT_SUCCESS(status);
268 if (U_FAILURE(status)) {
269 continue;
270 }
271
272 int32_t foundStart = 0;
273 int32_t foundLimit = 0;
274 UBool foundMatch;
275
276 //
277 // Do the search, check the match result against the expected results.
278 //
279 foundMatch= usearch_search(uss.getAlias(), 0, &foundStart, &foundLimit, &status);
280 TEST_ASSERT_SUCCESS(status);
281 if ((foundMatch && expectedMatchStart<0) ||
282 (foundStart != expectedMatchStart) ||
283 (foundLimit != expectedMatchLimit)) {
284 TEST_ASSERT(FALSE); // ouput generic error position
285 infoln("Found, expected match start = %d, %d \n"
286 "Found, expected match limit = %d, %d",
287 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
288 }
289
290 // In case there are other matches...
291 // (should we only do this if the test case passed?)
292 while (foundMatch) {
293 expectedMatchStart = foundStart;
294 expectedMatchLimit = foundLimit;
295
296 foundMatch = usearch_search(uss.getAlias(), foundLimit, &foundStart, &foundLimit, &status);
297 }
298
299 uss.adoptInstead(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
300 target.getBuffer(), target.length(),
301 collator.getAlias(),
302 NULL,
303 &status));
304
305 //
306 // Do the backwards search, check the match result against the expected results.
307 //
308 foundMatch= usearch_searchBackwards(uss.getAlias(), target.length(), &foundStart, &foundLimit, &status);
309 TEST_ASSERT_SUCCESS(status);
310 if ((foundMatch && expectedMatchStart<0) ||
311 (foundStart != expectedMatchStart) ||
312 (foundLimit != expectedMatchLimit)) {
313 TEST_ASSERT(FALSE); // ouput generic error position
314 infoln("Found, expected backwards match start = %d, %d \n"
315 "Found, expected backwards match limit = %d, %d",
316 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
317 }
318 }
319 #endif
320 }
321
// One collation element together with the text offsets recorded around it.
struct Order
{
    int32_t order;       // collation element value
    int32_t lowOffset;   // iterator offset before the element was fetched
    int32_t highOffset;  // iterator offset after the element was fetched
};
328
329 class OrderList
330 {
331 public:
332 OrderList();
333 OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset = 0);
334 ~OrderList();
335
336 int32_t size(void) const;
337 void add(int32_t order, int32_t low, int32_t high);
338 const Order *get(int32_t index) const;
339 int32_t getLowOffset(int32_t index) const;
340 int32_t getHighOffset(int32_t index) const;
341 int32_t getOrder(int32_t index) const;
342 void reverse(void);
343 UBool compare(const OrderList &other) const;
344 UBool matchesAt(int32_t offset, const OrderList &other) const;
345
346 private:
347 Order *list;
348 int32_t listMax;
349 int32_t listSize;
350 };
351
352 OrderList::OrderList()
353 : list(NULL), listMax(16), listSize(0)
354 {
355 list = new Order[listMax];
356 }
357
358 OrderList::OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset)
359 : list(NULL), listMax(16), listSize(0)
360 {
361 UErrorCode status = U_ZERO_ERROR;
362 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
363 uint32_t strengthMask = 0;
364 int32_t order, low, high;
365
366 switch (ucol_getStrength(coll))
367 {
368 default:
369 strengthMask |= UCOL_TERTIARYORDERMASK;
370 U_FALLTHROUGH;
371 case UCOL_SECONDARY:
372 strengthMask |= UCOL_SECONDARYORDERMASK;
373 U_FALLTHROUGH;
374 case UCOL_PRIMARY:
375 strengthMask |= UCOL_PRIMARYORDERMASK;
376 }
377
378 list = new Order[listMax];
379
380 ucol_setOffset(elems, stringOffset, &status);
381
382 do {
383 low = ucol_getOffset(elems);
384 order = ucol_next(elems, &status);
385 high = ucol_getOffset(elems);
386
387 if (order != UCOL_NULLORDER) {
388 order &= strengthMask;
389 }
390
391 if (order != UCOL_IGNORABLE) {
392 add(order, low, high);
393 }
394 } while (order != UCOL_NULLORDER);
395
396 ucol_closeElements(elems);
397 }
398
399 OrderList::~OrderList()
400 {
401 delete[] list;
402 }
403
404 void OrderList::add(int32_t order, int32_t low, int32_t high)
405 {
406 if (listSize >= listMax) {
407 listMax *= 2;
408
409 Order *newList = new Order[listMax];
410
411 uprv_memcpy(newList, list, listSize * sizeof(Order));
412 delete[] list;
413 list = newList;
414 }
415
416 list[listSize].order = order;
417 list[listSize].lowOffset = low;
418 list[listSize].highOffset = high;
419
420 listSize += 1;
421 }
422
423 const Order *OrderList::get(int32_t index) const
424 {
425 if (index >= listSize) {
426 return NULL;
427 }
428
429 return &list[index];
430 }
431
432 int32_t OrderList::getLowOffset(int32_t index) const
433 {
434 const Order *order = get(index);
435
436 if (order != NULL) {
437 return order->lowOffset;
438 }
439
440 return -1;
441 }
442
443 int32_t OrderList::getHighOffset(int32_t index) const
444 {
445 const Order *order = get(index);
446
447 if (order != NULL) {
448 return order->highOffset;
449 }
450
451 return -1;
452 }
453
454 int32_t OrderList::getOrder(int32_t index) const
455 {
456 const Order *order = get(index);
457
458 if (order != NULL) {
459 return order->order;
460 }
461
462 return UCOL_NULLORDER;
463 }
464
465 int32_t OrderList::size() const
466 {
467 return listSize;
468 }
469
470 void OrderList::reverse()
471 {
472 for(int32_t f = 0, b = listSize - 1; f < b; f += 1, b -= 1) {
473 Order swap = list[b];
474
475 list[b] = list[f];
476 list[f] = swap;
477 }
478 }
479
480 UBool OrderList::compare(const OrderList &other) const
481 {
482 if (listSize != other.listSize) {
483 return FALSE;
484 }
485
486 for(int32_t i = 0; i < listSize; i += 1) {
487 if (list[i].order != other.list[i].order ||
488 list[i].lowOffset != other.list[i].lowOffset ||
489 list[i].highOffset != other.list[i].highOffset) {
490 return FALSE;
491 }
492 }
493
494 return TRUE;
495 }
496
497 UBool OrderList::matchesAt(int32_t offset, const OrderList &other) const
498 {
499 // NOTE: sizes include the NULLORDER, which we don't want to compare.
500 int32_t otherSize = other.size() - 1;
501
502 if (listSize - 1 - offset < otherSize) {
503 return FALSE;
504 }
505
506 for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) {
507 if (getOrder(i) != other.getOrder(j)) {
508 return FALSE;
509 }
510 }
511
512 return TRUE;
513 }
514
515 static char *printOffsets(char *buffer, OrderList &list)
516 {
517 int32_t size = list.size();
518 char *s = buffer;
519
520 for(int32_t i = 0; i < size; i += 1) {
521 const Order *order = list.get(i);
522
523 if (i != 0) {
524 s += sprintf(s, ", ");
525 }
526
527 s += sprintf(s, "(%d, %d)", order->lowOffset, order->highOffset);
528 }
529
530 return buffer;
531 }
532
533 static char *printOrders(char *buffer, OrderList &list)
534 {
535 int32_t size = list.size();
536 char *s = buffer;
537
538 for(int32_t i = 0; i < size; i += 1) {
539 const Order *order = list.get(i);
540
541 if (i != 0) {
542 s += sprintf(s, ", ");
543 }
544
545 s += sprintf(s, "%8.8X", order->order);
546 }
547
548 return buffer;
549 }
550
551 void SSearchTest::offsetTest()
552 {
553 const char *test[] = {
554 // The sequence \u0FB3\u0F71\u0F71\u0F80 contains a discontiguous
555 // contraction (\u0FB3\u0F71\u0F80) logically followed by \u0F71.
556 "\\u1E33\\u0FB3\\u0F71\\u0F71\\u0F80\\uD835\\uDF6C\\u01B0",
557
558 "\\ua191\\u16ef\\u2036\\u017a",
559
560 #if 0
561 // This results in a complex interaction between contraction,
562 // expansion and normalization that confuses the backwards offset fixups.
563 "\\u0F7F\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
564 #endif
565
566 "\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
567 "\\u07E9\\u07EA\\u07F1\\u07F2\\u07F3",
568
569 "\\u02FE\\u02FF"
570 "\\u0300\\u0301\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u0309\\u030A\\u030B\\u030C\\u030D\\u030E\\u030F"
571 "\\u0310\\u0311\\u0312\\u0313\\u0314\\u0315\\u0316\\u0317\\u0318\\u0319\\u031A\\u031B\\u031C\\u031D\\u031E\\u031F"
572 "\\u0320\\u0321\\u0322\\u0323\\u0324\\u0325\\u0326\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F"
573 "\\u0330\\u0331\\u0332\\u0333\\u0334\\u0335\\u0336\\u0337\\u0338\\u0339\\u033A\\u033B\\u033C\\u033D\\u033E\\u033F"
574 "\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\\u034A\\u034B\\u034C\\u034D\\u034E", // currently not working, see #8081
575
576 "\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318", // currently not working, see #8081
577 "a\\u02FF\\u0301\\u0316", // currently not working, see #8081
578 "a\\u02FF\\u0316\\u0301",
579 "a\\u0430\\u0301\\u0316",
580 "a\\u0430\\u0316\\u0301",
581 "abc\\u0E41\\u0301\\u0316",
582 "abc\\u0E41\\u0316\\u0301",
583 "\\u0E41\\u0301\\u0316",
584 "\\u0E41\\u0316\\u0301",
585 "a\\u0301\\u0316",
586 "a\\u0316\\u0301",
587 "\\uAC52\\uAC53",
588 "\\u34CA\\u34CB",
589 "\\u11ED\\u11EE",
590 "\\u30C3\\u30D0",
591 "p\\u00E9ch\\u00E9",
592 "a\\u0301\\u0325",
593 "a\\u0300\\u0325",
594 "a\\u0325\\u0300",
595 "A\\u0323\\u0300B",
596 "A\\u0300\\u0323B",
597 "A\\u0301\\u0323B",
598 "A\\u0302\\u0301\\u0323B",
599 "abc",
600 "ab\\u0300c",
601 "ab\\u0300\\u0323c",
602 " \\uD800\\uDC00\\uDC00",
603 "a\\uD800\\uDC00\\uDC00",
604 "A\\u0301\\u0301",
605 "A\\u0301\\u0323",
606 "A\\u0301\\u0323B",
607 "B\\u0301\\u0323C",
608 "A\\u0300\\u0323B",
609 "\\u0301A\\u0301\\u0301",
610 "abcd\\r\\u0301",
611 "p\\u00EAche",
612 "pe\\u0302che",
613 };
614
615 int32_t testCount = UPRV_LENGTHOF(test);
616 UErrorCode status = U_ZERO_ERROR;
617 RuleBasedCollator *col = (RuleBasedCollator *) Collator::createInstance(Locale::getEnglish(), status);
618 if (U_FAILURE(status)) {
619 errcheckln(status, "Failed to create collator in offsetTest! - %s", u_errorName(status));
620 return;
621 }
622 char buffer[4096]; // A bit of a hack... just happens to be long enough for all the test cases...
623 // We could allocate one that's the right size by (CE_count * 10) + 2
624 // 10 chars is enough room for 8 hex digits plus ", ". 2 extra chars for "[" and "]"
625
626 col->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status);
627
628 for(int32_t i = 0; i < testCount; i += 1) {
629 UnicodeString ts = CharsToUnicodeString(test[i]);
630 CollationElementIterator *iter = col->createCollationElementIterator(ts);
631 OrderList forwardList;
632 OrderList backwardList;
633 int32_t order, low, high;
634
635 do {
636 low = iter->getOffset();
637 order = iter->next(status);
638 high = iter->getOffset();
639
640 forwardList.add(order, low, high);
641 } while (order != CollationElementIterator::NULLORDER);
642
643 iter->reset();
644 iter->setOffset(ts.length(), status);
645
646 backwardList.add(CollationElementIterator::NULLORDER, iter->getOffset(), iter->getOffset());
647
648 do {
649 high = iter->getOffset();
650 order = iter->previous(status);
651 low = iter->getOffset();
652
653 if (order == CollationElementIterator::NULLORDER) {
654 break;
655 }
656
657 backwardList.add(order, low, high);
658 } while (TRUE);
659
660 backwardList.reverse();
661
662 if (forwardList.compare(backwardList)) {
663 logln("Works with \"%s\"", test[i]);
664 logln("Forward offsets: [%s]", printOffsets(buffer, forwardList));
665 // logln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
666
667 logln("Forward CEs: [%s]", printOrders(buffer, forwardList));
668 // logln("Backward CEs: [%s]", printOrders(buffer, backwardList));
669
670 logln();
671 } else {
672 errln("Fails with \"%s\"", test[i]);
673 infoln("Forward offsets: [%s]", printOffsets(buffer, forwardList));
674 infoln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
675
676 infoln("Forward CEs: [%s]", printOrders(buffer, forwardList));
677 infoln("Backward CEs: [%s]", printOrders(buffer, backwardList));
678
679 infoln();
680 }
681 delete iter;
682 }
683 delete col;
684 }
685
#if 0
// Currently compiled out.
// Appends a printable form of string to buffer: printable ASCII is copied
// (a backslash is doubled), everything else becomes a \uXXXX or \UXXXXXXXX
// escape. Returns buffer.
static UnicodeString &escape(const UnicodeString &string, UnicodeString &buffer)
{
    int32_t i = 0;

    while (i < string.length()) {
        const UChar32 ch = string.char32At(i);
        const UBool printableAscii = (ch >= 0x0020 && ch <= 0x007F);

        if (!printableAscii) {
            char cbuffer[12];

            if (ch <= 0xFFFFL) {
                sprintf(cbuffer, "\\u%4.4X", ch);
            } else {
                sprintf(cbuffer, "\\U%8.8X", ch);
            }

            buffer.append(cbuffer);
        } else if (ch == 0x005C) {
            buffer.append("\\\\");
        } else {
            buffer.append(ch);
        }

        // Supplementary code points take two code units in the string.
        i += (ch >= 0x10000L) ? 2 : 1;
    }

    return buffer;
}
#endif
718
719 void SSearchTest::sharpSTest()
720 {
721 UErrorCode status = U_ZERO_ERROR;
722 UCollator *coll = NULL;
723 UnicodeString lp = "fuss";
724 UnicodeString sp = "fu\\u00DF";
725 UnicodeString targets[] = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball",
726 "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF",
727 "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"};
728 int32_t start = -1, end = -1;
729
730 coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status);
731 TEST_ASSERT_SUCCESS(status);
732
733 UnicodeString lpUnescaped = lp.unescape();
734 UnicodeString spUnescaped = sp.unescape();
735
736 LocalUStringSearchPointer ussLong(usearch_openFromCollator(lpUnescaped.getBuffer(), lpUnescaped.length(),
737 lpUnescaped.getBuffer(), lpUnescaped.length(), // actual test data will be set later
738 coll,
739 NULL, // the break iterator
740 &status));
741
742 LocalUStringSearchPointer ussShort(usearch_openFromCollator(spUnescaped.getBuffer(), spUnescaped.length(),
743 spUnescaped.getBuffer(), spUnescaped.length(), // actual test data will be set later
744 coll,
745 NULL, // the break iterator
746 &status));
747 TEST_ASSERT_SUCCESS(status);
748
749 for (uint32_t t = 0; t < UPRV_LENGTHOF(targets); t += 1) {
750 UBool bFound;
751 UnicodeString target = targets[t].unescape();
752
753 start = end = -1;
754 usearch_setText(ussLong.getAlias(), target.getBuffer(), target.length(), &status);
755 bFound = usearch_search(ussLong.getAlias(), 0, &start, &end, &status);
756 TEST_ASSERT_SUCCESS(status);
757 if (bFound) {
758 logln("Test %d: found long pattern at [%d, %d].", t, start, end);
759 } else {
760 dataerrln("Test %d: did not find long pattern.", t);
761 }
762
763 usearch_setText(ussShort.getAlias(), target.getBuffer(), target.length(), &status);
764 bFound = usearch_search(ussShort.getAlias(), 0, &start, &end, &status);
765 TEST_ASSERT_SUCCESS(status);
766 if (bFound) {
767 logln("Test %d: found long pattern at [%d, %d].", t, start, end);
768 } else {
769 dataerrln("Test %d: did not find long pattern.", t);
770 }
771 }
772
773 ucol_close(coll);
774 }
775
776 void SSearchTest::goodSuffixTest()
777 {
778 UErrorCode status = U_ZERO_ERROR;
779 UCollator *coll = NULL;
780 UnicodeString pat = /*"gcagagag"*/ "fxeld";
781 UnicodeString target = /*"gcatcgcagagagtatacagtacg"*/ "cloveldfxeld";
782 int32_t start = -1, end = -1;
783 UBool bFound;
784
785 coll = ucol_open(NULL, &status);
786 TEST_ASSERT_SUCCESS(status);
787
788 LocalUStringSearchPointer ss(usearch_openFromCollator(pat.getBuffer(), pat.length(),
789 target.getBuffer(), target.length(),
790 coll,
791 NULL, // the break iterator
792 &status));
793 TEST_ASSERT_SUCCESS(status);
794
795 bFound = usearch_search(ss.getAlias(), 0, &start, &end, &status);
796 TEST_ASSERT_SUCCESS(status);
797 if (bFound) {
798 logln("Found pattern at [%d, %d].", start, end);
799 } else {
800 dataerrln("Did not find pattern.");
801 }
802
803 ucol_close(coll);
804 }
805
806 //
807 // searchTime() A quick and dirty performance test for string search.
808 // Probably doesn't really belong as part of intltest, but it
809 // does check that the search succeeds, and gets the right result,
810 // so it serves as a functionality test also.
811 //
812 // To run as a perf test, up the loop count, select by commenting
813 // and uncommenting in the code the operation to be measured,
814 // rebuild, and measure the running time of this test alone.
815 //
816 // time LD_LIBRARY_PATH=whatever ./intltest collate/SSearchTest/searchTime
817 //
818 void SSearchTest::searchTime() {
819 static const char *longishText =
820 "Whylom, as olde stories tellen us,\n"
821 "Ther was a duk that highte Theseus:\n"
822 "Of Athenes he was lord and governour,\n"
823 "And in his tyme swich a conquerour,\n"
824 "That gretter was ther noon under the sonne.\n"
825 "Ful many a riche contree hadde he wonne;\n"
826 "What with his wisdom and his chivalrye,\n"
827 "He conquered al the regne of Femenye,\n"
828 "That whylom was y-cleped Scithia;\n"
829 "And weddede the quene Ipolita,\n"
830 "And broghte hir hoom with him in his contree\n"
831 "With muchel glorie and greet solempnitee,\n"
832 "And eek hir yonge suster Emelye.\n"
833 "And thus with victorie and with melodye\n"
834 "Lete I this noble duk to Athenes ryde,\n"
835 "And al his hoost, in armes, him bisyde.\n"
836 "And certes, if it nere to long to here,\n"
837 "I wolde han told yow fully the manere,\n"
838 "How wonnen was the regne of Femenye\n"
839 "By Theseus, and by his chivalrye;\n"
840 "And of the grete bataille for the nones\n"
841 "Bitwixen Athen's and Amazones;\n"
842 "And how asseged was Ipolita,\n"
843 "The faire hardy quene of Scithia;\n"
844 "And of the feste that was at hir weddinge,\n"
845 "And of the tempest at hir hoom-cominge;\n"
846 "But al that thing I moot as now forbere.\n"
847 "I have, God woot, a large feeld to ere,\n"
848 "And wayke been the oxen in my plough.\n"
849 "The remenant of the tale is long y-nough.\n"
850 "I wol nat letten eek noon of this route;\n"
851 "Lat every felawe telle his tale aboute,\n"
852 "And lat see now who shal the soper winne;\n"
853 "And ther I lefte, I wol ageyn biginne.\n"
854 "This duk, of whom I make mencioun,\n"
855 "When he was come almost unto the toun,\n"
856 "In al his wele and in his moste pryde,\n"
857 "He was war, as he caste his eye asyde,\n"
858 "Wher that ther kneled in the hye weye\n"
859 "A companye of ladies, tweye and tweye,\n"
860 "Ech after other, clad in clothes blake; \n"
861 "But swich a cry and swich a wo they make,\n"
862 "That in this world nis creature livinge,\n"
863 "That herde swich another weymentinge;\n"
864 "And of this cry they nolde never stenten,\n"
865 "Til they the reynes of his brydel henten.\n"
866 "'What folk ben ye, that at myn hoomcominge\n"
867 "Perturben so my feste with cryinge'?\n"
868 "Quod Theseus, 'have ye so greet envye\n"
869 "Of myn honour, that thus compleyne and crye? \n"
870 "Or who hath yow misboden, or offended?\n"
871 "And telleth me if it may been amended;\n"
872 "And why that ye ben clothed thus in blak'?\n"
873 "The eldest lady of hem alle spak,\n"
874 "When she hadde swowned with a deedly chere,\n"
875 "That it was routhe for to seen and here,\n"
876 "And seyde: 'Lord, to whom Fortune hath yiven\n"
877 "Victorie, and as a conquerour to liven,\n"
878 "Noght greveth us your glorie and your honour;\n"
879 "But we biseken mercy and socour.\n"
880 "Have mercy on our wo and our distresse.\n"
881 "Som drope of pitee, thurgh thy gentilesse,\n"
882 "Up-on us wrecched wommen lat thou falle.\n"
883 "For certes, lord, ther nis noon of us alle,\n"
884 "That she nath been a duchesse or a quene;\n"
885 "Now be we caitifs, as it is wel sene:\n"
886 "Thanked be Fortune, and hir false wheel,\n"
887 "That noon estat assureth to be weel.\n"
888 "And certes, lord, t'abyden your presence,\n"
889 "Here in the temple of the goddesse Clemence\n"
890 "We han ben waytinge al this fourtenight;\n"
891 "Now help us, lord, sith it is in thy might.\n"
892 "I wrecche, which that wepe and waille thus,\n"
893 "Was whylom wyf to king Capaneus,\n"
894 "That starf at Thebes, cursed be that day!\n"
895 "And alle we, that been in this array,\n"
896 "And maken al this lamentacioun,\n"
897 "We losten alle our housbondes at that toun,\n"
898 "Whyl that the sege ther-aboute lay.\n"
899 "And yet now th'olde Creon, weylaway!\n"
900 "The lord is now of Thebes the citee, \n"
901 "Fulfild of ire and of iniquitee,\n"
902 "He, for despyt, and for his tirannye,\n"
903 "To do the dede bodyes vileinye,\n"
904 "Of alle our lordes, whiche that ben slawe,\n"
905 "Hath alle the bodyes on an heep y-drawe,\n"
906 "And wol nat suffren hem, by noon assent,\n"
907 "Neither to been y-buried nor y-brent,\n"
908 "But maketh houndes ete hem in despyt. zet'\n";
909
910 const char *cPattern = "maketh houndes ete hem";
911 //const char *cPattern = "Whylom";
912 //const char *cPattern = "zet";
913 const char *testId = "searchTime()"; // for error macros.
914 UnicodeString target = longishText;
915 UErrorCode status = U_ZERO_ERROR;
916
917
918 LocalUCollatorPointer collator(ucol_open("en", &status));
919 //ucol_setStrength(collator.getAlias(), collatorStrength);
920 //ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status);
921 UnicodeString uPattern = cPattern;
922 LocalUStringSearchPointer uss(usearch_openFromCollator(uPattern.getBuffer(), uPattern.length(),
923 target.getBuffer(), target.length(),
924 collator.getAlias(),
925 NULL, // the break iterator
926 &status));
927 TEST_ASSERT_SUCCESS(status);
928
929 // int32_t foundStart;
930 // int32_t foundEnd;
931 UBool found;
932
933 // Find the match position usgin strstr
934 const char *pm = strstr(longishText, cPattern);
935 TEST_ASSERT_M(pm!=NULL, "No pattern match with strstr");
936 int32_t refMatchPos = (int32_t)(pm - longishText);
937 int32_t icuMatchPos;
938 int32_t icuMatchEnd;
939 usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status);
940 TEST_ASSERT_SUCCESS(status);
941 TEST_ASSERT_M(refMatchPos == icuMatchPos, "strstr and icu give different match positions.");
942
943 int32_t i;
944 // int32_t j=0;
945
946 // Try loopcounts around 100000 to some millions, depending on the operation,
947 // to get runtimes of at least several seconds.
948 for (i=0; i<10000; i++) {
949 found = usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status);
950 (void)found; // Suppress set but not used warning.
951 //TEST_ASSERT_SUCCESS(status);
952 //TEST_ASSERT(found);
953
954 // usearch_setOffset(uss.getAlias(), 0, &status);
955 // icuMatchPos = usearch_next(uss.getAlias(), &status);
956
957 // The i+j stuff is to confuse the optimizer and get it to actually leave the
958 // call to strstr in place.
959 //pm = strstr(longishText+j, cPattern);
960 //j = (j + i)%5;
961 }
962
963 //printf("%ld, %d\n", pm-longishText, j);
964 }
965
966 //----------------------------------------------------------------------------------------
967 //
968 // Random Numbers. Similar to standard lib rand() and srand()
969 // Not using library to
970 // 1. Get same results on all platforms.
971 // 2. Get access to current seed, to more easily reproduce failures.
972 //
973 //---------------------------------------------------------------------------------------
// Current state of the generator; set it to replay a sequence.
static uint32_t m_seed = 1;

// Advances the generator one step and returns a value in [0, 32767].
static uint32_t m_rand()
{
    const uint32_t next = m_seed * 1103515245u + 12345u;

    m_seed = next;

    // Bits 16..30 of the new state.
    return (next >> 16) & 0x7FFFu;
}
981
982 class Monkey
983 {
984 public:
985 virtual void append(UnicodeString &test, UnicodeString &alternate) = 0;
986
987 protected:
988 Monkey();
989 virtual ~Monkey();
990 };
991
992 Monkey::Monkey()
993 {
994 // ook?
995 }
996
997 Monkey::~Monkey()
998 {
999 // ook?
1000 }
1001
1002 class SetMonkey : public Monkey
1003 {
1004 public:
1005 SetMonkey(const USet *theSet);
1006 ~SetMonkey();
1007
1008 virtual void append(UnicodeString &test, UnicodeString &alternate);
1009
1010 private:
1011 const USet *set;
1012 };
1013
1014 SetMonkey::SetMonkey(const USet *theSet)
1015 : Monkey(), set(theSet)
1016 {
1017 // ook?
1018 }
1019
1020 SetMonkey::~SetMonkey()
1021 {
1022 //ook...
1023 }
1024
1025 void SetMonkey::append(UnicodeString &test, UnicodeString &alternate)
1026 {
1027 int32_t size = uset_size(set);
1028 int32_t index = m_rand() % size;
1029 UChar32 ch = uset_charAt(set, index);
1030 UnicodeString str(ch);
1031
1032 test.append(str);
1033 alternate.append(str); // flip case, or some junk?
1034 }
1035
1036 class StringSetMonkey : public Monkey
1037 {
1038 public:
1039 StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData);
1040 ~StringSetMonkey();
1041
1042 void append(UnicodeString &testCase, UnicodeString &alternate);
1043
1044 private:
1045 UnicodeString &generateAlternative(const UnicodeString &testCase, UnicodeString &alternate);
1046
1047 const USet *set;
1048 UCollator *coll;
1049 CollData *collData;
1050 };
1051
1052 StringSetMonkey::StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData)
1053 : Monkey(), set(theSet), coll(theCollator), collData(theCollData)
1054 {
1055 // ook.
1056 }
1057
1058 StringSetMonkey::~StringSetMonkey()
1059 {
1060 // ook?
1061 }
1062
1063 void StringSetMonkey::append(UnicodeString &testCase, UnicodeString &alternate)
1064 {
1065 int32_t itemCount = uset_getItemCount(set), len = 0;
1066 int32_t index = m_rand() % itemCount;
1067 UChar32 rangeStart = 0, rangeEnd = 0;
1068 UChar buffer[16];
1069 UErrorCode err = U_ZERO_ERROR;
1070
1071 len = uset_getItem(set, index, &rangeStart, &rangeEnd, buffer, 16, &err);
1072
1073 if (len == 0) {
1074 int32_t offset = m_rand() % (rangeEnd - rangeStart + 1);
1075 UChar32 ch = rangeStart + offset;
1076 UnicodeString str(ch);
1077
1078 testCase.append(str);
1079 generateAlternative(str, alternate);
1080 } else if (len > 0) {
1081 // should check that len < 16...
1082 UnicodeString str(buffer, len);
1083
1084 testCase.append(str);
1085 generateAlternative(str, alternate);
1086 } else {
1087 // shouldn't happen...
1088 }
1089 }
1090
1091 UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCase, UnicodeString &alternate)
1092 {
1093 // find out shortest string for the longest sequence of ces.
1094 // needs to be refined to use dynamic programming, but will be roughly right
1095 UErrorCode status = U_ZERO_ERROR;
1096 CEList ceList(coll, testCase, status);
1097 UnicodeString alt;
1098 int32_t offset = 0;
1099
1100 if (ceList.size() == 0) {
1101 return alternate.append(testCase);
1102 }
1103
1104 while (offset < ceList.size()) {
1105 int32_t ce = ceList.get(offset);
1106 const StringList *strings = collData->getStringList(ce);
1107
1108 if (strings == NULL) {
1109 return alternate.append(testCase);
1110 }
1111
1112 int32_t stringCount = strings->size();
1113 int32_t tries = 0;
1114
1115 // find random string that generates the same CEList
1116 const CEList *ceList2 = NULL;
1117 const UnicodeString *string = NULL;
1118 UBool matches = FALSE;
1119
1120 do {
1121 int32_t s = m_rand() % stringCount;
1122
1123 if (tries++ > stringCount) {
1124 alternate.append(testCase);
1125 return alternate;
1126 }
1127
1128 string = strings->get(s);
1129 ceList2 = collData->getCEList(string);
1130 matches = ceList.matchesAt(offset, ceList2);
1131
1132 if (! matches) {
1133 collData->freeCEList((CEList *) ceList2);
1134 }
1135 } while (! matches);
1136
1137 alt.append(*string);
1138 offset += ceList2->size();
1139 collData->freeCEList(ceList2);
1140 }
1141
1142 const CEList altCEs(coll, alt, status);
1143
1144 if (ceList.matchesAt(0, &altCEs)) {
1145 return alternate.append(alt);
1146 }
1147
1148 return alternate.append(testCase);
1149 }
1150
1151 static void generateTestCase(UCollator *coll, Monkey *monkeys[], int32_t monkeyCount, UnicodeString &testCase, UnicodeString &alternate)
1152 {
1153 int32_t pieces = (m_rand() % 4) + 1;
1154 UErrorCode status = U_ZERO_ERROR;
1155 UBool matches;
1156
1157 do {
1158 testCase.remove();
1159 alternate.remove();
1160 monkeys[0]->append(testCase, alternate);
1161
1162 for(int32_t piece = 0; piece < pieces; piece += 1) {
1163 int32_t monkey = m_rand() % monkeyCount;
1164
1165 monkeys[monkey]->append(testCase, alternate);
1166 }
1167
1168 const CEList ceTest(coll, testCase, status);
1169 const CEList ceAlt(coll, alternate, status);
1170
1171 matches = ceTest.matchesAt(0, &ceAlt);
1172 } while (! matches);
1173 }
1174
1175 static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd)
1176 {
1177 UErrorCode status = U_ZERO_ERROR;
1178 OrderList targetOrders(coll, target, offset);
1179 OrderList patternOrders(coll, pattern);
1180 int32_t targetSize = targetOrders.size() - 1;
1181 int32_t patternSize = patternOrders.size() - 1;
1182 UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),
1183 target.getBuffer(), target.length(), &status);
1184
1185 if (patternSize == 0) {
1186 // Searching for an empty pattern always fails
1187 matchStart = matchEnd = -1;
1188 ubrk_close(charBreakIterator);
1189 return FALSE;
1190 }
1191
1192 matchStart = matchEnd = -1;
1193
1194 for(int32_t i = 0; i < targetSize; i += 1) {
1195 if (targetOrders.matchesAt(i, patternOrders)) {
1196 int32_t start = targetOrders.getLowOffset(i);
1197 int32_t maxLimit = targetOrders.getLowOffset(i + patternSize);
1198 int32_t minLimit = targetOrders.getLowOffset(i + patternSize - 1);
1199
1200 // if the low and high offsets of the first CE in
1201 // the match are the same, it means that the match
1202 // starts in the middle of an expansion - all but
1203 // the first CE of the expansion will have the offset
1204 // of the following character.
1205 if (start == targetOrders.getHighOffset(i)) {
1206 continue;
1207 }
1208
1209 // Make sure match starts on a grapheme boundary
1210 if (! ubrk_isBoundary(charBreakIterator, start)) {
1211 continue;
1212 }
1213
1214 // If the low and high offsets of the CE after the match
1215 // are the same, it means that the match ends in the middle
1216 // of an expansion sequence.
1217 if (maxLimit == targetOrders.getHighOffset(i + patternSize) &&
1218 targetOrders.getOrder(i + patternSize) != UCOL_NULLORDER) {
1219 continue;
1220 }
1221
1222 int32_t mend = maxLimit;
1223
1224 // Find the first grapheme break after the character index
1225 // of the last CE in the match. If it's after character index
1226 // that's after the last CE in the match, use that index
1227 // as the end of the match.
1228 if (minLimit < maxLimit) {
1229 // When the last CE's low index is same with its high index, the CE is likely
1230 // a part of expansion. In this case, the index is located just after the
1231 // character corresponding to the CEs compared above. If the index is right
1232 // at the break boundary, move the position to the next boundary will result
1233 // incorrect match length when there are ignorable characters exist between
1234 // the position and the next character produces CE(s). See ticket#8482.
1235 if (minLimit == targetOrders.getHighOffset(i + patternSize - 1) && ubrk_isBoundary(charBreakIterator, minLimit)) {
1236 mend = minLimit;
1237 } else {
1238 int32_t nba = ubrk_following(charBreakIterator, minLimit);
1239
1240 if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) {
1241 mend = nba;
1242 }
1243 }
1244 }
1245
1246 if (mend > maxLimit) {
1247 continue;
1248 }
1249
1250 if (! ubrk_isBoundary(charBreakIterator, mend)) {
1251 continue;
1252 }
1253
1254 matchStart = start;
1255 matchEnd = mend;
1256
1257 ubrk_close(charBreakIterator);
1258 return TRUE;
1259 }
1260 }
1261
1262 ubrk_close(charBreakIterator);
1263 return FALSE;
1264 }
1265
1266 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1267 static int32_t getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
1268 int32_t val = defaultVal;
1269
1270 name.append(" *= *(-?\\d+)");
1271
1272 UErrorCode status = U_ZERO_ERROR;
1273 RegexMatcher m(name, params, 0, status);
1274
1275 if (m.find()) {
1276 // The param exists. Convert the string to an int.
1277 char valString[100];
1278 int32_t paramLength = m.end(1, status) - m.start(1, status);
1279
1280 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
1281 paramLength = (int32_t)(sizeof(valString)-2);
1282 }
1283
1284 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
1285 val = uprv_strtol(valString, NULL, 10);
1286
1287 // Delete this parameter from the params string.
1288 m.reset();
1289 params = m.replaceFirst("", status);
1290 }
1291
1292 //U_ASSERT(U_SUCCESS(status));
1293 if (! U_SUCCESS(status)) {
1294 val = defaultVal;
1295 }
1296
1297 return val;
1298 }
1299 #endif
1300
1301 #if !UCONFIG_NO_COLLATION
1302 int32_t SSearchTest::monkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
1303 const char *name, const char *strength, uint32_t seed)
1304 {
1305 UErrorCode status = U_ZERO_ERROR;
1306 int32_t actualStart = -1, actualEnd = -1;
1307 //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length();
1308 int32_t expectedStart = -1, expectedEnd = -1;
1309 int32_t notFoundCount = 0;
1310 LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
1311 testCase.getBuffer(), testCase.length(),
1312 coll,
1313 NULL, // the break iterator
1314 &status));
1315
1316 // **** TODO: find *all* matches, not just first one ****
1317 simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd);
1318
1319 usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status);
1320
1321 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
1322 errln("Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
1323 " strength=%s seed=%d",
1324 name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
1325 }
1326
1327 if (expectedStart == -1 && actualStart == -1) {
1328 notFoundCount += 1;
1329 }
1330
1331 // **** TODO: find *all* matches, not just first one ****
1332 simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd);
1333
1334 usearch_setPattern(uss.getAlias(), altPattern.getBuffer(), altPattern.length(), &status);
1335
1336 usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status);
1337
1338 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
1339 errln("Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
1340 " strength=%s seed=%d",
1341 name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
1342 }
1343
1344 if (expectedStart == -1 && actualStart == -1) {
1345 notFoundCount += 1;
1346 }
1347
1348 return notFoundCount;
1349 }
1350 #endif
1351
1352 void SSearchTest::monkeyTest(char *params)
1353 {
1354 // ook!
1355 UErrorCode status = U_ZERO_ERROR;
1356 //UCollator *coll = ucol_open(NULL, &status);
1357 UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status);
1358
1359 if (U_FAILURE(status)) {
1360 errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status));
1361 return;
1362 }
1363
1364 CollData *monkeyData = new CollData(coll, status);
1365
1366 USet *expansions = uset_openEmpty();
1367 USet *contractions = uset_openEmpty();
1368
1369 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
1370
1371 U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
1372 U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
1373 USet *letters = uset_openPattern(letter_pattern, 39, &status);
1374 SetMonkey letterMonkey(letters);
1375 StringSetMonkey contractionMonkey(contractions, coll, monkeyData);
1376 StringSetMonkey expansionMonkey(expansions, coll, monkeyData);
1377 UnicodeString testCase;
1378 UnicodeString alternate;
1379 UnicodeString pattern, altPattern;
1380 UnicodeString prefix, altPrefix;
1381 UnicodeString suffix, altSuffix;
1382
1383 Monkey *monkeys[] = {
1384 &letterMonkey,
1385 &contractionMonkey,
1386 &expansionMonkey,
1387 &contractionMonkey,
1388 &expansionMonkey,
1389 &contractionMonkey,
1390 &expansionMonkey,
1391 &contractionMonkey,
1392 &expansionMonkey};
1393 int32_t monkeyCount = UPRV_LENGTHOF(monkeys);
1394 // int32_t nonMatchCount = 0;
1395
1396 UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY};
1397 const char *strengthNames[] = {"primary", "secondary", "tertiary"};
1398 int32_t strengthCount = UPRV_LENGTHOF(strengths);
1399 int32_t loopCount = quick? 1000 : 10000;
1400 int32_t firstStrength = 0;
1401 int32_t lastStrength = strengthCount - 1; //*/ 0;
1402
1403 if (params != NULL) {
1404 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1405 UnicodeString p(params);
1406
1407 loopCount = getIntParam("loop", p, loopCount);
1408 m_seed = getIntParam("seed", p, m_seed);
1409
1410 RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status);
1411 if (m.find()) {
1412 UnicodeString breakType = m.group(1, status);
1413
1414 for (int32_t s = 0; s < strengthCount; s += 1) {
1415 if (breakType == strengthNames[s]) {
1416 firstStrength = lastStrength = s;
1417 break;
1418 }
1419 }
1420
1421 m.reset();
1422 p = m.replaceFirst("", status);
1423 }
1424
1425 if (RegexMatcher("\\S", p, 0, status).find()) {
1426 // Each option is stripped out of the option string as it is processed.
1427 // All options have been checked. The option string should have been completely emptied..
1428 char buf[100];
1429 p.extract(buf, sizeof(buf), NULL, status);
1430 buf[sizeof(buf)-1] = 0;
1431 errln("Unrecognized or extra parameter: %s\n", buf);
1432 return;
1433 }
1434 #else
1435 infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
1436 #endif
1437 }
1438
1439 for(int32_t s = firstStrength; s <= lastStrength; s += 1) {
1440 int32_t notFoundCount = 0;
1441
1442 logln("Setting strength to %s.", strengthNames[s]);
1443 ucol_setStrength(coll, strengths[s]);
1444
1445 // TODO: try alternate prefix and suffix too?
1446 // TODO: alternates are only equal at primary strength. Is this OK?
1447 for(int32_t t = 0; t < loopCount; t += 1) {
1448 uint32_t seed = m_seed;
1449 // int32_t nmc = 0;
1450
1451 generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern);
1452 generateTestCase(coll, monkeys, monkeyCount, prefix, altPrefix);
1453 generateTestCase(coll, monkeys, monkeyCount, suffix, altSuffix);
1454
1455 // pattern
1456 notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern, "pattern", strengthNames[s], seed);
1457
1458 testCase.remove();
1459 testCase.append(prefix);
1460 testCase.append(/*alt*/pattern);
1461
1462 // prefix + pattern
1463 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern", strengthNames[s], seed);
1464
1465 testCase.append(suffix);
1466
1467 // prefix + pattern + suffix
1468 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern + suffix", strengthNames[s], seed);
1469
1470 testCase.remove();
1471 testCase.append(pattern);
1472 testCase.append(suffix);
1473
1474 // pattern + suffix
1475 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "pattern + suffix", strengthNames[s], seed);
1476 }
1477
1478 logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount);
1479 }
1480
1481 uset_close(contractions);
1482 uset_close(expansions);
1483 uset_close(letters);
1484 delete monkeyData;
1485
1486 ucol_close(coll);
1487 }
1488
1489 #endif
1490
1491 #endif