]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/ssearch.cpp
ICU-400.39.tar.gz
[apple/icu.git] / icuSources / test / intltest / ssearch.cpp
1 /*
2 **********************************************************************
3 * Copyright (C) 2005-2008, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8
9 #include "unicode/utypes.h"
10
11 #if !UCONFIG_NO_COLLATION
12
13 #include "unicode/unistr.h"
14 #include "unicode/putil.h"
15 #include "unicode/usearch.h"
16
17 #include "cmemory.h"
18 #include "unicode/coll.h"
19 #include "unicode/tblcoll.h"
20 #include "unicode/coleitr.h"
21 #include "unicode/ucoleitr.h"
22
23 #include "unicode/regex.h" // TODO: make conditional on regexp being built.
24
25 #include "unicode/uniset.h"
26 #include "unicode/uset.h"
27 #include "unicode/ustring.h"
28 #include "hash.h"
29 #include "uhash.h"
30 #include "ucol_imp.h"
31
32 #include "intltest.h"
33 #include "ssearch.h"
34
35 #include "xmlparser.h"
36
37 #include <stdlib.h>
38 #include <string.h>
39 #include <stdio.h>
40
// Identifier of the test case currently being run.  searchTest() fills it
// from the "id" attribute of each test case; the assertion macros echo it.
char testId[100];

// Report a failure (file, line, current test ID) when x is false.
// Expands to a bare brace block, so the trailing semicolon is optional
// at the call site.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d, test ID = \"%s\"", __FILE__, __LINE__, testId);}}

// Report a failure with message m when x is false, then return from the
// enclosing function (which therefore has to return void).
#define TEST_ASSERT_M(x, m) {if (!(x)) { \
    errln("Failure in file %s, line %d. \"%s\"", __FILE__, __LINE__, m);return;}}

// Report a failure (file, line, test ID, ICU error name) when errcode is a
// failing UErrorCode.  Does not return; callers test the status themselves.
#define TEST_ASSERT_SUCCESS(errcode) {if (U_FAILURE(errcode)) { \
    errln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \
          __FILE__, __LINE__, testId, u_errorName(errcode));}}

// Number of elements in a true array (not a pointer).  The argument is
// parenthesized so that any array-valued expression expands correctly.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))
54
55 //---------------------------------------------------------------------------
56 //
57 // Test class boilerplate
58 //
59 //---------------------------------------------------------------------------
60 SSearchTest::SSearchTest()
61 {
62 }
63
64 SSearchTest::~SSearchTest()
65 {
66 }
67
68 void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char *params )
69 {
70 if (exec) logln("TestSuite SSearchTest: ");
71 switch (index) {
72 #if !UCONFIG_NO_BREAK_ITERATION
73 case 0: name = "searchTest";
74 if (exec) searchTest();
75 break;
76
77 case 1: name = "offsetTest";
78 if (exec) offsetTest();
79 break;
80
81 case 2: name = "monkeyTest";
82 if (exec) monkeyTest(params);
83 break;
84 #endif
85 default: name = "";
86 break; //needed to end loop
87 }
88 }
89
90
91 #if !UCONFIG_NO_BREAK_ITERATION
92
93 #define PATH_BUFFER_SIZE 2048
94 const char *SSearchTest::getPath(char buffer[2048], const char *filename) {
95 UErrorCode status = U_ZERO_ERROR;
96 const char *testDataDirectory = IntlTest::getSourceTestData(status);
97
98 if (U_FAILURE(status) || strlen(testDataDirectory) + strlen(filename) + 1 >= PATH_BUFFER_SIZE) {
99 errln("ERROR: getPath() failed - %s", u_errorName(status));
100 return NULL;
101 }
102
103 strcpy(buffer, testDataDirectory);
104 strcat(buffer, filename);
105 return buffer;
106 }
107
108
109 void SSearchTest::searchTest()
110 {
111 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
112 UErrorCode status = U_ZERO_ERROR;
113 char path[PATH_BUFFER_SIZE];
114 const char *testFilePath = getPath(path, "ssearch.xml");
115
116 if (testFilePath == NULL) {
117 return; /* Couldn't get path: error message already output. */
118 }
119
120 UXMLParser *parser = UXMLParser::createParser(status);
121 TEST_ASSERT_SUCCESS(status);
122 UXMLElement *root = parser->parseFile(testFilePath, status);
123 TEST_ASSERT_SUCCESS(status);
124 if (U_FAILURE(status)) {
125 return;
126 }
127
128 const UnicodeString *debugTestCase = root->getAttribute("debug");
129 if (debugTestCase != NULL) {
130 // setenv("USEARCH_DEBUG", "1", 1);
131 }
132
133
134 const UXMLElement *testCase;
135 int32_t tc = 0;
136
137 while((testCase = root->nextChildElement(tc)) != NULL) {
138
139 if (testCase->getTagName().compare("test-case") != 0) {
140 errln("ssearch, unrecognized XML Element in test file");
141 continue;
142 }
143 const UnicodeString *id = testCase->getAttribute("id");
144 *testId = 0;
145 if (id != NULL) {
146 id->extract(0, id->length(), testId, sizeof(testId), US_INV);
147 }
148
149 // If debugging test case has been specified and this is not it, skip to next.
150 if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) {
151 continue;
152 }
153 //
154 // Get the requested collation strength.
155 // Default is tertiary if the XML attribute is missing from the test case.
156 //
157 const UnicodeString *strength = testCase->getAttribute("strength");
158 UColAttributeValue collatorStrength;
159 if (strength==NULL) { collatorStrength = UCOL_TERTIARY;}
160 else if (*strength=="PRIMARY") { collatorStrength = UCOL_PRIMARY;}
161 else if (*strength=="SECONDARY") { collatorStrength = UCOL_SECONDARY;}
162 else if (*strength=="TERTIARY") { collatorStrength = UCOL_TERTIARY;}
163 else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;}
164 else if (*strength=="IDENTICAL") { collatorStrength = UCOL_IDENTICAL;}
165 else {
166 // Bogus value supplied for strength. Shouldn't happen, even from
167 // typos, if the XML source has been validated.
168 // This assert is a little deceiving in that strength can be
169 // any of the allowed values, not just TERTIARY, but it will
170 // do the job of getting the error output.
171 TEST_ASSERT(*strength=="TERTIARY")
172 }
173
174 //
175 // Get the collator normalization flag. Default is UCOL_OFF.
176 //
177 UColAttributeValue normalize = UCOL_OFF;
178 const UnicodeString *norm = testCase->getAttribute("norm");
179 TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF");
180 if (norm!=NULL && *norm=="ON") {
181 normalize = UCOL_ON;
182 }
183
184 const UnicodeString defLocale("en");
185 char clocale[100];
186 const UnicodeString *locale = testCase->getAttribute("locale");
187 if (locale == NULL || locale->length()==0) {
188 locale = &defLocale;
189 };
190 locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL);
191
192
193 UnicodeString text;
194 UnicodeString target;
195 UnicodeString pattern;
196 int32_t expectedMatchStart = -1;
197 int32_t expectedMatchLimit = -1;
198 const UXMLElement *n;
199 int nodeCount = 0;
200
201 n = testCase->getChildElement("pattern");
202 TEST_ASSERT(n != NULL);
203 if (n==NULL) {
204 continue;
205 }
206 text = n->getText(FALSE);
207 text = text.unescape();
208 pattern.append(text);
209 nodeCount++;
210
211 n = testCase->getChildElement("pre");
212 if (n!=NULL) {
213 text = n->getText(FALSE);
214 text = text.unescape();
215 target.append(text);
216 nodeCount++;
217 }
218
219 n = testCase->getChildElement("m");
220 if (n!=NULL) {
221 expectedMatchStart = target.length();
222 text = n->getText(FALSE);
223 text = text.unescape();
224 target.append(text);
225 expectedMatchLimit = target.length();
226 nodeCount++;
227 }
228
229 n = testCase->getChildElement("post");
230 if (n!=NULL) {
231 text = n->getText(FALSE);
232 text = text.unescape();
233 target.append(text);
234 nodeCount++;
235 }
236
237 // Check that there weren't extra things in the XML
238 TEST_ASSERT(nodeCount == testCase->countChildren());
239
240 // Open a collotor and StringSearch based on the parameters
241 // obtained from the XML.
242 //
243 status = U_ZERO_ERROR;
244 UCollator *collator = ucol_open(clocale, &status);
245 ucol_setStrength(collator, collatorStrength);
246 ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, normalize, &status);
247 UStringSearch *uss = usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
248 target.getBuffer(), target.length(),
249 collator,
250 NULL, // the break iterator
251 &status);
252
253 TEST_ASSERT_SUCCESS(status);
254 if (U_FAILURE(status)) {
255 usearch_close(uss);
256 ucol_close(collator);
257 continue;
258 }
259
260 int32_t foundStart = 0;
261 int32_t foundLimit = 0;
262 UBool foundMatch;
263
264 //
265 // Do the search, check the match result against the expected results.
266 //
267 foundMatch= usearch_search(uss, 0, &foundStart, &foundLimit, &status);
268 TEST_ASSERT_SUCCESS(status);
269 if (foundMatch && expectedMatchStart<0 ||
270 foundStart != expectedMatchStart ||
271 foundLimit != expectedMatchLimit) {
272 TEST_ASSERT(FALSE); // ouput generic error position
273 infoln("Found, expected match start = %d, %d \n"
274 "Found, expected match limit = %d, %d",
275 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
276 }
277
278 // In case there are other matches...
279 // (should we only do this if the test case passed?)
280 while (foundMatch) {
281 expectedMatchStart = foundStart;
282 expectedMatchLimit = foundLimit;
283
284 foundMatch = usearch_search(uss, foundLimit, &foundStart, &foundLimit, &status);
285 }
286
287 usearch_close(uss);
288
289 uss = usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
290 target.getBuffer(), target.length(),
291 collator,
292 NULL,
293 &status);
294
295 //
296 // Do the backwards search, check the match result against the expected results.
297 //
298 foundMatch= usearch_searchBackwards(uss, target.length(), &foundStart, &foundLimit, &status);
299 TEST_ASSERT_SUCCESS(status);
300 if (foundMatch && expectedMatchStart<0 ||
301 foundStart != expectedMatchStart ||
302 foundLimit != expectedMatchLimit) {
303 TEST_ASSERT(FALSE); // ouput generic error position
304 infoln("Found, expected backwards match start = %d, %d \n"
305 "Found, expected backwards match limit = %d, %d",
306 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
307 }
308
309 usearch_close(uss);
310 ucol_close(collator);
311 }
312
313 delete root;
314 delete parser;
315 #endif
316 }
317
// One collation element together with the source offsets reported by the
// element iterator immediately before and after it was produced.
struct Order
{
    int32_t order;       // collation element value
    int32_t lowOffset;   // iterator offset before the element was read
    int32_t highOffset;  // iterator offset after the element was read
};
324
325 class OrderList
326 {
327 public:
328 OrderList();
329 OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset = 0);
330 ~OrderList();
331
332 int32_t size(void) const;
333 void add(int32_t order, int32_t low, int32_t high);
334 const Order *get(int32_t index) const;
335 int32_t getLowOffset(int32_t index) const;
336 int32_t getHighOffset(int32_t index) const;
337 int32_t getOrder(int32_t index) const;
338 void reverse(void);
339 UBool compare(const OrderList &other) const;
340 UBool matchesAt(int32_t offset, const OrderList &other) const;
341
342 private:
343 Order *list;
344 int32_t listMax;
345 int32_t listSize;
346 };
347
348 OrderList::OrderList()
349 : list(NULL), listSize(0), listMax(16)
350 {
351 list = new Order[listMax];
352 }
353
354 OrderList::OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset)
355 : list(NULL), listMax(16), listSize(0)
356 {
357 UErrorCode status = U_ZERO_ERROR;
358 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
359 uint32_t strengthMask = 0;
360 int32_t order, low, high;
361
362 switch (ucol_getStrength(coll))
363 {
364 default:
365 strengthMask |= UCOL_TERTIARYORDERMASK;
366 /* fall through */
367
368 case UCOL_SECONDARY:
369 strengthMask |= UCOL_SECONDARYORDERMASK;
370 /* fall through */
371
372 case UCOL_PRIMARY:
373 strengthMask |= UCOL_PRIMARYORDERMASK;
374 }
375
376 list = new Order[listMax];
377
378 ucol_setOffset(elems, stringOffset, &status);
379
380 do {
381 low = ucol_getOffset(elems);
382 order = ucol_next(elems, &status);
383 high = ucol_getOffset(elems);
384
385 if (order != UCOL_NULLORDER) {
386 order &= strengthMask;
387 }
388
389 if (order != UCOL_IGNORABLE) {
390 add(order, low, high);
391 }
392 } while (order != UCOL_NULLORDER);
393
394 ucol_closeElements(elems);
395 }
396
397 OrderList::~OrderList()
398 {
399 delete[] list;
400 }
401
402 void OrderList::add(int32_t order, int32_t low, int32_t high)
403 {
404 if (listSize >= listMax) {
405 listMax *= 2;
406
407 Order *newList = new Order[listMax];
408
409 uprv_memcpy(newList, list, listSize * sizeof(Order));
410 delete[] list;
411 list = newList;
412 }
413
414 list[listSize].order = order;
415 list[listSize].lowOffset = low;
416 list[listSize].highOffset = high;
417
418 listSize += 1;
419 }
420
421 const Order *OrderList::get(int32_t index) const
422 {
423 if (index >= listSize) {
424 return NULL;
425 }
426
427 return &list[index];
428 }
429
430 int32_t OrderList::getLowOffset(int32_t index) const
431 {
432 const Order *order = get(index);
433
434 if (order != NULL) {
435 return order->lowOffset;
436 }
437
438 return -1;
439 }
440
441 int32_t OrderList::getHighOffset(int32_t index) const
442 {
443 const Order *order = get(index);
444
445 if (order != NULL) {
446 return order->highOffset;
447 }
448
449 return -1;
450 }
451
452 int32_t OrderList::getOrder(int32_t index) const
453 {
454 const Order *order = get(index);
455
456 if (order != NULL) {
457 return order->order;
458 }
459
460 return UCOL_NULLORDER;
461 }
462
463 int32_t OrderList::size() const
464 {
465 return listSize;
466 }
467
468 void OrderList::reverse()
469 {
470 for(int32_t f = 0, b = listSize - 1; f < b; f += 1, b -= 1) {
471 Order swap = list[b];
472
473 list[b] = list[f];
474 list[f] = swap;
475 }
476 }
477
478 UBool OrderList::compare(const OrderList &other) const
479 {
480 if (listSize != other.listSize) {
481 return FALSE;
482 }
483
484 for(int32_t i = 0; i < listSize; i += 1) {
485 if (list[i].order != other.list[i].order ||
486 list[i].lowOffset != other.list[i].lowOffset ||
487 list[i].highOffset != other.list[i].highOffset) {
488 return FALSE;
489 }
490 }
491
492 return TRUE;
493 }
494
495 UBool OrderList::matchesAt(int32_t offset, const OrderList &other) const
496 {
497 // NOTE: sizes include the NULLORDER, which we don't want to compare.
498 int32_t otherSize = other.size() - 1;
499
500 if (listSize - 1 - offset < otherSize) {
501 return FALSE;
502 }
503
504 for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) {
505 if (getOrder(i) != other.getOrder(j)) {
506 return FALSE;
507 }
508 }
509
510 return TRUE;
511 }
512
513 static char *printOffsets(char *buffer, OrderList &list)
514 {
515 int32_t size = list.size();
516 char *s = buffer;
517
518 for(int32_t i = 0; i < size; i += 1) {
519 const Order *order = list.get(i);
520
521 if (i != 0) {
522 s += sprintf(s, ", ");
523 }
524
525 s += sprintf(s, "(%d, %d)", order->lowOffset, order->highOffset);
526 }
527
528 return buffer;
529 }
530
531 static char *printOrders(char *buffer, OrderList &list)
532 {
533 int32_t size = list.size();
534 char *s = buffer;
535
536 for(int32_t i = 0; i < size; i += 1) {
537 const Order *order = list.get(i);
538
539 if (i != 0) {
540 s += sprintf(s, ", ");
541 }
542
543 s += sprintf(s, "%8.8X", order->order);
544 }
545
546 return buffer;
547 }
548
549 void SSearchTest::offsetTest()
550 {
551 const char *test[] = {
552 "\\ua191\\u16ef\\u2036\\u017a",
553
554 #if 0
555 // This results in a complex interaction between contraction,
556 // expansion and normalization that confuses the backwards offset fixups.
557 "\\u0F7F\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
558 #endif
559
560 "\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
561 "\\u07E9\\u07EA\\u07F1\\u07F2\\u07F3",
562
563 "\\u02FE\\u02FF"
564 "\\u0300\\u0301\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u0309\\u030A\\u030B\\u030C\\u030D\\u030E\\u030F"
565 "\\u0310\\u0311\\u0312\\u0313\\u0314\\u0315\\u0316\\u0317\\u0318\\u0319\\u031A\\u031B\\u031C\\u031D\\u031E\\u031F"
566 "\\u0320\\u0321\\u0322\\u0323\\u0324\\u0325\\u0326\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F"
567 "\\u0330\\u0331\\u0332\\u0333\\u0334\\u0335\\u0336\\u0337\\u0338\\u0339\\u033A\\u033B\\u033C\\u033D\\u033E\\u033F"
568 "\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\\u034A\\u034B\\u034C\\u034D\\u034E",
569
570 "\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318",
571 "abc\\u0E41\\u0301\\u0316",
572 "abc\\u0E41\\u0316\\u0301",
573 "\\u0E41\\u0301\\u0316",
574 "\\u0E41\\u0316\\u0301",
575 "a\\u0301\\u0316",
576 "a\\u0316\\u0301",
577 "\\uAC52\\uAC53",
578 "\\u34CA\\u34CB",
579 "\\u11ED\\u11EE",
580 "\\u30C3\\u30D0",
581 "p\\u00E9ch\\u00E9",
582 "a\\u0301\\u0325",
583 "a\\u0300\\u0325",
584 "a\\u0325\\u0300",
585 "A\\u0323\\u0300B",
586 "A\\u0300\\u0323B",
587 "A\\u0301\\u0323B",
588 "A\\u0302\\u0301\\u0323B",
589 "abc",
590 "ab\\u0300c",
591 "ab\\u0300\\u0323c",
592 " \\uD800\\uDC00\\uDC00",
593 "a\\uD800\\uDC00\\uDC00",
594 "A\\u0301\\u0301",
595 "A\\u0301\\u0323",
596 "A\\u0301\\u0323B",
597 "B\\u0301\\u0323C",
598 "A\\u0300\\u0323B",
599 "\\u0301A\\u0301\\u0301",
600 "abcd\\r\\u0301",
601 "p\\u00EAche",
602 "pe\\u0302che",
603 };
604
605 int32_t testCount = ARRAY_SIZE(test);
606 UErrorCode status = U_ZERO_ERROR;
607 RuleBasedCollator *col = (RuleBasedCollator *) Collator::createInstance(Locale::getEnglish(), status);
608 if (U_FAILURE(status)) {
609 errln("Failed to create collator in offsetTest!");
610 return;
611 }
612 char buffer[4096]; // A bit of a hack... just happens to be long enough for all the test cases...
613 // We could allocate one that's the right size by (CE_count * 10) + 2
614 // 10 chars is enough room for 8 hex digits plus ", ". 2 extra chars for "[" and "]"
615
616 col->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status);
617
618 for(int32_t i = 0; i < testCount; i += 1) {
619 UnicodeString ts = CharsToUnicodeString(test[i]);
620 CollationElementIterator *iter = col->createCollationElementIterator(ts);
621 OrderList forwardList;
622 OrderList backwardList;
623 int32_t order, low, high;
624
625 do {
626 low = iter->getOffset();
627 order = iter->next(status);
628 high = iter->getOffset();
629
630 forwardList.add(order, low, high);
631 } while (order != CollationElementIterator::NULLORDER);
632
633 iter->reset();
634 iter->setOffset(ts.length(), status);
635
636 backwardList.add(CollationElementIterator::NULLORDER, iter->getOffset(), iter->getOffset());
637
638 do {
639 high = iter->getOffset();
640 order = iter->previous(status);
641 low = iter->getOffset();
642
643 if (order == CollationElementIterator::NULLORDER) {
644 break;
645 }
646
647 backwardList.add(order, low, high);
648 } while (TRUE);
649
650 backwardList.reverse();
651
652 if (forwardList.compare(backwardList)) {
653 logln("Works with \"%s\"", test[i]);
654 logln("Forward offsets: [%s]", printOffsets(buffer, forwardList));
655 // logln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
656
657 logln("Forward CEs: [%s]", printOrders(buffer, forwardList));
658 // logln("Backward CEs: [%s]", printOrders(buffer, backwardList));
659
660 logln();
661 } else {
662 errln("Fails with \"%s\"", test[i]);
663 infoln("Forward offsets: [%s]", printOffsets(buffer, forwardList));
664 infoln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
665
666 infoln("Forward CEs: [%s]", printOrders(buffer, forwardList));
667 infoln("Backward CEs: [%s]", printOrders(buffer, backwardList));
668
669 infoln();
670 }
671 delete iter;
672 }
673 delete col;
674 }
675
676 class CEList
677 {
678 public:
679 CEList(UCollator *coll, const UnicodeString &string);
680 ~CEList();
681
682 int32_t size() const;
683 int32_t get(int32_t index) const;
684 UBool matchesAt(int32_t offset, const CEList *other) const;
685
686 private:
687 void add(int32_t ce);
688
689 int32_t *ces;
690 int32_t listMax;
691 int32_t listSize;
692 };
693
694 CEList::CEList(UCollator *coll, const UnicodeString &string)
695 : ces(NULL), listMax(8), listSize(0)
696 {
697 UErrorCode status = U_ZERO_ERROR;
698 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
699 uint32_t strengthMask = 0;
700 int32_t order;
701
702 #if 0
703 switch (ucol_getStrength(coll))
704 {
705 default:
706 strengthMask |= UCOL_TERTIARYORDERMASK;
707 /* fall through */
708
709 case UCOL_SECONDARY:
710 strengthMask |= UCOL_SECONDARYORDERMASK;
711 /* fall through */
712
713 case UCOL_PRIMARY:
714 strengthMask |= UCOL_PRIMARYORDERMASK;
715 }
716 #else
717 strengthMask = UCOL_PRIMARYORDERMASK;
718 #endif
719
720 ces = new int32_t[listMax];
721
722 while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
723 order &= strengthMask;
724
725 if (order == UCOL_IGNORABLE) {
726 continue;
727 }
728
729 add(order);
730 }
731
732 ucol_closeElements(elems);
733 }
734
735 CEList::~CEList()
736 {
737 delete[] ces;
738 }
739
740 void CEList::add(int32_t ce)
741 {
742 if (listSize >= listMax) {
743 listMax *= 2;
744
745 int32_t *newCEs = new int32_t[listMax];
746
747 uprv_memcpy(newCEs, ces, listSize * sizeof(int32_t));
748 delete[] ces;
749 ces = newCEs;
750 }
751
752 ces[listSize++] = ce;
753 }
754
755 int32_t CEList::get(int32_t index) const
756 {
757 if (index >= 0 && index < listSize) {
758 return ces[index];
759 }
760
761 return -1;
762 }
763
764 UBool CEList::matchesAt(int32_t offset, const CEList *other) const
765 {
766 if (listSize - offset < other->size()) {
767 return FALSE;
768 }
769
770 for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) {
771 if (ces[i] != other->get(j)) {
772 return FALSE;
773 }
774 }
775
776 return TRUE;
777 }
778
779 int32_t CEList::size() const
780 {
781 return listSize;
782 }
783
784 class StringList
785 {
786 public:
787 StringList();
788 ~StringList();
789
790 void add(const UnicodeString *string);
791 void add(const UChar *chars, int32_t count);
792 const UnicodeString *get(int32_t index) const;
793 int32_t size() const;
794
795 private:
796 UnicodeString *strings;
797 int32_t listMax;
798 int32_t listSize;
799 };
800
801 StringList::StringList()
802 : strings(NULL), listMax(16), listSize(0)
803 {
804 strings = new UnicodeString [listMax];
805 }
806
807 StringList::~StringList()
808 {
809 delete[] strings;
810 }
811
812 void StringList::add(const UnicodeString *string)
813 {
814 if (listSize >= listMax) {
815 listMax *= 2;
816
817 UnicodeString *newStrings = new UnicodeString[listMax];
818
819 uprv_memcpy(newStrings, strings, listSize * sizeof(UnicodeString));
820
821 delete[] strings;
822 strings = newStrings;
823 }
824
825 // The ctor initialized all the strings in
826 // the array to empty strings, so this
827 // is the same as copying the source string.
828 strings[listSize++].append(*string);
829 }
830
831 void StringList::add(const UChar *chars, int32_t count)
832 {
833 const UnicodeString string(chars, count);
834
835 add(&string);
836 }
837
838 const UnicodeString *StringList::get(int32_t index) const
839 {
840 if (index >= 0 && index < listSize) {
841 return &strings[index];
842 }
843
844 return NULL;
845 }
846
847 int32_t StringList::size() const
848 {
849 return listSize;
850 }
851
852 class CEToStringsMap
853 {
854 public:
855
856 CEToStringsMap();
857 ~CEToStringsMap();
858
859 void put(int32_t ce, UnicodeString *string);
860 StringList *getStringList(int32_t ce) const;
861
862 private:
863
864 static void deleteStringList(void *obj);
865 void putStringList(int32_t ce, StringList *stringList);
866 UHashtable *map;
867 };
868
869 CEToStringsMap::CEToStringsMap()
870 {
871 UErrorCode status = U_ZERO_ERROR;
872
873 map = uhash_open(uhash_hashLong, uhash_compareLong,
874 uhash_compareCaselessUnicodeString,
875 &status);
876
877 uhash_setValueDeleter(map, deleteStringList);
878 }
879
880 CEToStringsMap::~CEToStringsMap()
881 {
882 uhash_close(map);
883 }
884
885 void CEToStringsMap::put(int32_t ce, UnicodeString *string)
886 {
887 StringList *strings = getStringList(ce);
888
889 if (strings == NULL) {
890 strings = new StringList();
891 putStringList(ce, strings);
892 }
893
894 strings->add(string);
895 }
896
897 StringList *CEToStringsMap::getStringList(int32_t ce) const
898 {
899 return (StringList *) uhash_iget(map, ce);
900 }
901
902 void CEToStringsMap::putStringList(int32_t ce, StringList *stringList)
903 {
904 UErrorCode status = U_ZERO_ERROR;
905
906 uhash_iput(map, ce, (void *) stringList, &status);
907 }
908
909 void CEToStringsMap::deleteStringList(void *obj)
910 {
911 StringList *strings = (StringList *) obj;
912
913 delete strings;
914 }
915
916 class StringToCEsMap
917 {
918 public:
919 StringToCEsMap();
920 ~StringToCEsMap();
921
922 void put(const UnicodeString *string, const CEList *ces);
923 const CEList *get(const UnicodeString *string);
924
925 private:
926
927 static void deleteCEList(void *obj);
928 static void deleteUnicodeStringKey(void *obj);
929
930 UHashtable *map;
931 };
932
933 StringToCEsMap::StringToCEsMap()
934 {
935 UErrorCode status = U_ZERO_ERROR;
936
937 map = uhash_open(uhash_hashCaselessUnicodeString,
938 uhash_compareCaselessUnicodeString,
939 uhash_compareLong,
940 &status);
941
942 uhash_setValueDeleter(map, deleteCEList);
943 uhash_setKeyDeleter(map, deleteUnicodeStringKey);
944 }
945
946 StringToCEsMap::~StringToCEsMap()
947 {
948 uhash_close(map);
949 }
950
951 void StringToCEsMap::put(const UnicodeString *string, const CEList *ces)
952 {
953 UErrorCode status = U_ZERO_ERROR;
954
955 uhash_put(map, (void *) string, (void *) ces, &status);
956 }
957
958 const CEList *StringToCEsMap::get(const UnicodeString *string)
959 {
960 return (const CEList *) uhash_get(map, string);
961 }
962
963 void StringToCEsMap::deleteCEList(void *obj)
964 {
965 CEList *list = (CEList *) obj;
966
967 delete list;
968 }
969
970 void StringToCEsMap::deleteUnicodeStringKey(void *obj)
971 {
972 UnicodeString *key = (UnicodeString *) obj;
973
974 delete key;
975 }
976
977 static void buildData(UCollator *coll, USet *charsToTest, StringToCEsMap *charsToCEList, CEToStringsMap *ceToCharsStartingWith)
978 {
979 int32_t itemCount = uset_getItemCount(charsToTest);
980 UErrorCode status = U_ZERO_ERROR;
981
982 for(int32_t item = 0; item < itemCount; item += 1) {
983 UChar32 start = 0, end = 0;
984 UChar buffer[16];
985 int32_t len = uset_getItem(charsToTest, item, &start, &end,
986 buffer, 16, &status);
987
988 if (len == 0) {
989 for (UChar32 ch = start; ch <= end; ch += 1) {
990 UnicodeString *st = new UnicodeString(ch);
991 CEList *ceList = new CEList(coll, *st);
992
993 charsToCEList->put(st, ceList);
994 ceToCharsStartingWith->put(ceList->get(0), st);
995 }
996 } else if (len > 0) {
997 UnicodeString *st = new UnicodeString(buffer, len);
998 CEList *ceList = new CEList(coll, *st);
999
1000 charsToCEList->put(st, ceList);
1001 ceToCharsStartingWith->put(ceList->get(0), st);
1002 } else {
1003 // shouldn't happen...
1004 }
1005 }
1006 }
1007
1008 static UnicodeString &escape(const UnicodeString &string, UnicodeString &buffer)
1009 {
1010 for(int32_t i = 0; i < string.length(); i += 1) {
1011 UChar32 ch = string.char32At(i);
1012
1013 if (ch >= 0x0020 && ch <= 0x007F) {
1014 if (ch == 0x005C) {
1015 buffer.append("\\\\");
1016 } else {
1017 buffer.append(ch);
1018 }
1019 } else {
1020 char cbuffer[12];
1021
1022 if (ch <= 0xFFFFL) {
1023 sprintf(cbuffer, "\\u%4.4X", ch);
1024 } else {
1025 sprintf(cbuffer, "\\U%8.8X", ch);
1026 }
1027
1028 buffer.append(cbuffer);
1029 }
1030
1031 if (ch >= 0x10000L) {
1032 i += 1;
1033 }
1034 }
1035
1036 return buffer;
1037 }
1038
1039 static int32_t minLengthInChars(const CEList *ceList, int32_t offset, StringToCEsMap *charsToCEList, CEToStringsMap *ceToCharsStartingWith,
1040 UnicodeString &debug)
1041 {
1042 // find out shortest string for the longest sequence of ces.
1043 // needs to be refined to use dynamic programming, but will be roughly right
1044 int32_t totalStringLength = 0;
1045
1046 while (offset < ceList->size()) {
1047 int32_t ce = ceList->get(offset);
1048 int32_t bestLength = INT32_MIN;
1049 const UnicodeString *bestString = NULL;
1050 int32_t bestCeLength = 0;
1051 const StringList *strings = ceToCharsStartingWith->getStringList(ce);
1052 int32_t stringCount = strings->size();
1053
1054 for (int32_t s = 0; s < stringCount; s += 1) {
1055 const UnicodeString *string = strings->get(s);
1056 const CEList *ceList2 = charsToCEList->get(string);
1057
1058 if (ceList->matchesAt(offset, ceList2)) {
1059 int32_t length = ceList2->size() - string->length();
1060
1061 if (bestLength < length) {
1062 bestLength = length;
1063 bestCeLength = ceList2->size();
1064 bestString = string;
1065 }
1066 }
1067 }
1068
1069 totalStringLength += bestString->length();
1070 escape(*bestString, debug).append("/");
1071 offset += bestCeLength;
1072 }
1073
1074 debug.append((UChar)0x0000);
1075 return totalStringLength;
1076 }
1077
1078 static void minLengthTest(UCollator *coll, StringToCEsMap *charsToCEList, CEToStringsMap *ceToCharsStartingWith)
1079 {
1080 UnicodeString examples[] = {"fuss", "fiss", "affliss", "VII"};
1081 UnicodeString debug;
1082 int32_t nExamples = sizeof(examples) / sizeof(examples[0]);
1083
1084 for (int32_t s = 0; s < nExamples; s += 1) {
1085 CEList *ceList = new CEList(coll, examples[s]);
1086
1087 //infoln("%S:", examples[s].getTerminatedBuffer());
1088
1089 for(int32_t i = 0; i < examples[s].length(); i += 1) {
1090 debug.remove();
1091
1092 int32_t minLength = minLengthInChars(ceList, i, charsToCEList, ceToCharsStartingWith, debug);
1093 //infoln("\t%d\t%S", minLength, debug.getTerminatedBuffer());
1094 }
1095
1096 //infoln();
1097 delete ceList;
1098 }
1099 }
1100
1101 //----------------------------------------------------------------------------------------
1102 //
1103 // Random Numbers. Similar to standard lib rand() and srand()
1104 // Not using library to
1105 // 1. Get same results on all platforms.
1106 // 2. Get access to current seed, to more easily reproduce failures.
1107 //
1108 //---------------------------------------------------------------------------------------
// Current state of the generator; assign to it to reproduce a sequence.
static uint32_t m_seed = 1;

// Advance the linear congruential generator and return a value in
// [0, 32767], taken from bits 16..30 of the new state.
static uint32_t m_rand()
{
    const uint32_t multiplier = 1103515245;
    const uint32_t increment  = 12345;

    m_seed = m_seed * multiplier + increment;

    // Same as (m_seed / 65536) % 32768 for an unsigned 32-bit value.
    return (m_seed >> 16) & 0x7FFF;
}
1116
1117 class Monkey
1118 {
1119 public:
1120 virtual void append(UnicodeString &test, UnicodeString &alternate) = 0;
1121
1122 protected:
1123 Monkey();
1124 virtual ~Monkey();
1125 };
1126
1127 Monkey::Monkey()
1128 {
1129 // ook?
1130 }
1131
1132 Monkey::~Monkey()
1133 {
1134 // ook?
1135 }
1136
1137 class SetMonkey : public Monkey
1138 {
1139 public:
1140 SetMonkey(const USet *theSet);
1141 ~SetMonkey();
1142
1143 virtual void append(UnicodeString &test, UnicodeString &alternate);
1144
1145 private:
1146 const USet *set;
1147 };
1148
1149 SetMonkey::SetMonkey(const USet *theSet)
1150 : Monkey(), set(theSet)
1151 {
1152 // ook?
1153 }
1154
1155 SetMonkey::~SetMonkey()
1156 {
1157 //ook...
1158 }
1159
1160 void SetMonkey::append(UnicodeString &test, UnicodeString &alternate)
1161 {
1162 int32_t size = uset_size(set);
1163 int32_t index = m_rand() % size;
1164 UChar32 ch = uset_charAt(set, index);
1165 UnicodeString str(ch);
1166
1167 test.append(str);
1168 alternate.append(str); // flip case, or some junk?
1169 }
1170
1171 class StringSetMonkey : public Monkey
1172 {
1173 public:
1174 StringSetMonkey(const USet *theSet, UCollator *theCollator, StringToCEsMap *theCharsToCEList, CEToStringsMap *theCeToCharsStartingWith);
1175 ~StringSetMonkey();
1176
1177 void append(UnicodeString &testCase, UnicodeString &alternate);
1178
1179 private:
1180 UnicodeString &generateAlternative(const UnicodeString &testCase, UnicodeString &alternate);
1181
1182 const USet *set;
1183 UCollator *coll;
1184 StringToCEsMap *charsToCEList;
1185 CEToStringsMap *ceToCharsStartingWith;
1186 };
1187
1188 StringSetMonkey::StringSetMonkey(const USet *theSet, UCollator *theCollator, StringToCEsMap *theCharsToCEList, CEToStringsMap *theCeToCharsStartingWith)
1189 : Monkey(), set(theSet), coll(theCollator), charsToCEList(theCharsToCEList), ceToCharsStartingWith(theCeToCharsStartingWith)
1190 {
1191 // ook.
1192 }
1193
1194 StringSetMonkey::~StringSetMonkey()
1195 {
1196 // ook?
1197 }
1198
1199 void StringSetMonkey::append(UnicodeString &testCase, UnicodeString &alternate)
1200 {
1201 int32_t itemCount = uset_getItemCount(set), len = 0;
1202 int32_t index = m_rand() % itemCount;
1203 UChar32 rangeStart = 0, rangeEnd = 0;
1204 UChar buffer[16];
1205 UErrorCode err = U_ZERO_ERROR;
1206
1207 len = uset_getItem(set, index, &rangeStart, &rangeEnd, buffer, 16, &err);
1208
1209 if (len == 0) {
1210 int32_t offset = m_rand() % (rangeEnd - rangeStart + 1);
1211 UChar32 ch = rangeStart + offset;
1212 UnicodeString str(ch);
1213
1214 testCase.append(str);
1215 generateAlternative(str, alternate);
1216 } else if (len > 0) {
1217 // should check that len < 16...
1218 UnicodeString str(buffer, len);
1219
1220 testCase.append(str);
1221 generateAlternative(str, alternate);
1222 } else {
1223 // shouldn't happen...
1224 }
1225 }
1226
1227 UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCase, UnicodeString &alternate)
1228 {
1229 // find out shortest string for the longest sequence of ces.
1230 // needs to be refined to use dynamic programming, but will be roughly right
1231 CEList ceList(coll, testCase);
1232 UnicodeString alt;
1233 int32_t offset = 0;
1234
1235 if (ceList.size() == 0) {
1236 return alternate.append(testCase);
1237 }
1238
1239 while (offset < ceList.size()) {
1240 int32_t ce = ceList.get(offset);
1241 const StringList *strings = ceToCharsStartingWith->getStringList(ce);
1242
1243 if (strings == NULL) {
1244 return alternate.append(testCase);
1245 }
1246
1247 int32_t stringCount = strings->size();
1248 int32_t tries = 0;
1249
1250 // find random string that generates the same CEList
1251 const CEList *ceList2;
1252 const UnicodeString *string;
1253
1254 do {
1255 int32_t s = m_rand() % stringCount;
1256
1257 if (tries++ > stringCount) {
1258 alternate.append(testCase);
1259 return alternate;
1260 }
1261
1262 string = strings->get(s);
1263 ceList2 = charsToCEList->get(string);
1264 } while (! ceList.matchesAt(offset, ceList2));
1265
1266 alt.append(*string);
1267 offset += ceList2->size();
1268 }
1269
1270 const CEList altCEs(coll, alt);
1271
1272 if (ceList.matchesAt(0, &altCEs)) {
1273 return alternate.append(alt);
1274 }
1275
1276 return alternate.append(testCase);
1277 }
1278
1279 static void generateTestCase(UCollator *coll, Monkey *monkeys[], int32_t monkeyCount, UnicodeString &testCase, UnicodeString &alternate)
1280 {
1281 int32_t pieces = (m_rand() % 4) + 1;
1282 UBool matches;
1283
1284 do {
1285 testCase.remove();
1286 alternate.remove();
1287 monkeys[0]->append(testCase, alternate);
1288
1289 for(int32_t piece = 0; piece < pieces; piece += 1) {
1290 int32_t monkey = m_rand() % monkeyCount;
1291
1292 monkeys[monkey]->append(testCase, alternate);
1293 }
1294
1295 const CEList ceTest(coll, testCase);
1296 const CEList ceAlt(coll, alternate);
1297
1298 matches = ceTest.matchesAt(0, &ceAlt);
1299 } while (! matches);
1300 }
1301
1302 static inline USet *uset_openEmpty()
1303 {
1304 return uset_open(1, 0);
1305 }
1306
1307 //
1308 // Find the next acceptable boundary following the specified starting index
1309 // in the target text being searched.
1310 // TODO: refine what is an acceptable boundary. For the moment,
1311 // choose the next position not within a combining sequence.
1312 //
1313 static int32_t nextBoundaryAfter(const UnicodeString &string, int32_t startIndex) {
1314 const UChar *text = string.getBuffer();
1315 int32_t textLen = string.length();
1316
1317 if (startIndex >= textLen) {
1318 return startIndex;
1319 }
1320
1321 UChar32 c;
1322 int32_t i = startIndex;
1323
1324 U16_NEXT(text, i, textLen, c);
1325
1326 // If we are on a control character, stop without looking for combining marks.
1327 // Control characters do not combine.
1328 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1329 if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
1330 return i;
1331 }
1332
1333 // The initial character was not a control, and can thus accept trailing
1334 // combining characters. Advance over however many of them there are.
1335 int32_t indexOfLastCharChecked;
1336
1337 for (;;) {
1338 indexOfLastCharChecked = i;
1339
1340 if (i>=textLen) {
1341 break;
1342 }
1343
1344 U16_NEXT(text, i, textLen, c);
1345 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1346
1347 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
1348 break;
1349 }
1350 }
1351
1352 return indexOfLastCharChecked;
1353 }
1354
1355 static UBool isInCombiningSequence(const UnicodeString &string, int32_t index) {
1356 const UChar *text = string.getBuffer();
1357 int32_t textLen = string.length();
1358
1359 if (index>=textLen || index<=0) {
1360 return FALSE;
1361 }
1362
1363 // If the character at the current index is not a GRAPHEME_EXTEND
1364 // then we can not be within a combining sequence.
1365 UChar32 c;
1366 U16_GET(text, 0, index, textLen, c);
1367 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1368 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
1369 return FALSE;
1370 }
1371
1372 // We are at a combining mark. If the preceding character is anything
1373 // except a CONTROL, CR or LF, we are in a combining sequence.
1374 U16_PREV(text, 0, index, c);
1375 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1376
1377 return !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
1378 }
1379
1380 static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd)
1381 {
1382 UErrorCode status = U_ZERO_ERROR;
1383 OrderList targetOrders(coll, target, offset);
1384 OrderList patternOrders(coll, pattern);
1385 int32_t targetSize = targetOrders.size() - 1;
1386 int32_t patternSize = patternOrders.size() - 1;
1387 UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocale(coll, ULOC_VALID_LOCALE, &status),
1388 target.getBuffer(), target.length(), &status);
1389
1390 if (patternSize == 0) {
1391 matchStart = matchEnd = 0;
1392 return FALSE;
1393 }
1394
1395 matchStart = matchEnd = -1;
1396
1397 for(int32_t i = 0; i < targetSize; i += 1) {
1398 if (targetOrders.matchesAt(i, patternOrders)) {
1399 int32_t start = targetOrders.getLowOffset(i);
1400 int32_t maxLimit = targetOrders.getLowOffset(i + patternSize);
1401 int32_t minLimit = targetOrders.getLowOffset(i + patternSize - 1);
1402
1403 // if the low and high offsets of the first CE in
1404 // the match are the same, it means that the match
1405 // starts in the middle of an expansion - all but
1406 // the first CE of the expansion will have the offset
1407 // of the following character.
1408 if (start == targetOrders.getHighOffset(i)) {
1409 continue;
1410 }
1411
1412 // Make sure match starts on a grapheme boundary
1413 if (! ubrk_isBoundary(charBreakIterator, start)) {
1414 continue;
1415 }
1416
1417 // If the low and high offsets of the CE after the match
1418 // are the same, it means that the match ends in the middle
1419 // of an expansion sequence.
1420 if (maxLimit == targetOrders.getHighOffset(i + patternSize) &&
1421 targetOrders.getOrder(i + patternSize) != UCOL_NULLORDER) {
1422 continue;
1423 }
1424
1425 int32_t mend = maxLimit;
1426
1427 // Find the first grapheme break after the character index
1428 // of the last CE in the match. If it's after character index
1429 // that's after the last CE in the match, use that index
1430 // as the end of the match.
1431 if (minLimit < maxLimit) {
1432 int32_t nba = ubrk_following(charBreakIterator, minLimit);
1433
1434 if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) {
1435 mend = nba;
1436 }
1437 }
1438
1439 if (mend > maxLimit) {
1440 continue;
1441 }
1442
1443 if (! ubrk_isBoundary(charBreakIterator, mend)) {
1444 continue;
1445 }
1446
1447 matchStart = start;
1448 matchEnd = mend;
1449
1450 ubrk_close(charBreakIterator);
1451 return TRUE;
1452 }
1453 }
1454
1455 ubrk_close(charBreakIterator);
1456 return FALSE;
1457 }
1458
1459 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1460 static int32_t getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
1461 int32_t val = defaultVal;
1462
1463 name.append(" *= *(-?\\d+)");
1464
1465 UErrorCode status = U_ZERO_ERROR;
1466 RegexMatcher m(name, params, 0, status);
1467
1468 if (m.find()) {
1469 // The param exists. Convert the string to an int.
1470 char valString[100];
1471 int32_t paramLength = m.end(1, status) - m.start(1, status);
1472
1473 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
1474 paramLength = (int32_t)(sizeof(valString)-2);
1475 }
1476
1477 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
1478 val = strtol(valString, NULL, 10);
1479
1480 // Delete this parameter from the params string.
1481 m.reset();
1482 params = m.replaceFirst("", status);
1483 }
1484
1485 //U_ASSERT(U_SUCCESS(status));
1486 if (! U_SUCCESS(status)) {
1487 val = defaultVal;
1488 }
1489
1490 return val;
1491 }
1492 #endif
1493
1494 #if !UCONFIG_NO_COLLATION
1495 int32_t SSearchTest::monkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
1496 const char *name, const char *strength, uint32_t seed)
1497 {
1498 UErrorCode status = U_ZERO_ERROR;
1499 int32_t actualStart = -1, actualEnd = -1;
1500 //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length();
1501 int32_t expectedStart = -1, expectedEnd = -1;
1502 int32_t notFoundCount = 0;
1503 UStringSearch *uss = usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
1504 testCase.getBuffer(), testCase.length(),
1505 coll,
1506 NULL, // the break iterator
1507 &status);
1508
1509 // **** TODO: find *all* matches, not just first one ****
1510 simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd);
1511
1512 #if 0
1513 usearch_search(uss, 0, &actualStart, &actualEnd, &status);
1514 #else
1515 actualStart = usearch_next(uss, &status);
1516 actualEnd = actualStart + usearch_getMatchedLength(uss);
1517 #endif
1518
1519 if (actualStart != expectedStart || actualEnd != expectedEnd) {
1520 errln("Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
1521 " strength=%s seed=%d",
1522 name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
1523 }
1524
1525 if (expectedStart == -1 && actualStart == -1) {
1526 notFoundCount += 1;
1527 }
1528
1529 // **** TODO: find *all* matches, not just first one ****
1530 simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd);
1531
1532 usearch_setPattern(uss, altPattern.getBuffer(), altPattern.length(), &status);
1533
1534 #if 0
1535 usearch_search(uss, 0, &actualStart, &actualEnd, &status);
1536 #else
1537 usearch_reset(uss);
1538 actualStart = usearch_next(uss, &status);
1539 actualEnd = actualStart + usearch_getMatchedLength(uss);
1540 #endif
1541
1542 if (actualStart != expectedStart || actualEnd != expectedEnd) {
1543 errln("Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
1544 " strength=%s seed=%d",
1545 name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
1546 }
1547
1548 if (expectedStart == -1 && actualStart == -1) {
1549 notFoundCount += 1;
1550 }
1551
1552 usearch_close(uss);
1553
1554 return notFoundCount;
1555 }
1556 #endif
1557
1558 void SSearchTest::monkeyTest(char *params)
1559 {
1560 // ook!
1561 UErrorCode status = U_ZERO_ERROR;
1562 U_STRING_DECL(test_pattern, "[[:assigned:]-[:ideographic:]-[:hangul:]-[:c:]]", 47);
1563 U_STRING_INIT(test_pattern, "[[:assigned:]-[:ideographic:]-[:hangul:]-[:c:]]", 47);
1564 UCollator *coll = ucol_open(NULL, &status);
1565 if (U_FAILURE(status)) {
1566 errln("Failed to create collator in MonkeyTest!");
1567 return;
1568 }
1569 USet *charsToTest = uset_openPattern(test_pattern, 47, &status);
1570 USet *expansions = uset_openEmpty();
1571 USet *contractions = uset_openEmpty();
1572 StringToCEsMap *charsToCEList = new StringToCEsMap();
1573 CEToStringsMap *ceToCharsStartingWith = new CEToStringsMap();
1574
1575 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
1576
1577 uset_addAll(charsToTest, contractions);
1578 uset_addAll(charsToTest, expansions);
1579
1580 // TODO: set strength to UCOL_PRIMARY, change CEList to use strength?
1581 buildData(coll, charsToTest, charsToCEList, ceToCharsStartingWith);
1582
1583 U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
1584 U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
1585 USet *letters = uset_openPattern(letter_pattern, 39, &status);
1586 SetMonkey letterMonkey(letters);
1587 StringSetMonkey contractionMonkey(contractions, coll, charsToCEList, ceToCharsStartingWith);
1588 StringSetMonkey expansionMonkey(expansions, coll, charsToCEList, ceToCharsStartingWith);
1589 UnicodeString testCase;
1590 UnicodeString alternate;
1591 UnicodeString pattern, altPattern;
1592 UnicodeString prefix, altPrefix;
1593 UnicodeString suffix, altSuffix;
1594
1595 Monkey *monkeys[] = {
1596 &letterMonkey,
1597 &contractionMonkey,
1598 &expansionMonkey,
1599 &contractionMonkey,
1600 &expansionMonkey,
1601 &contractionMonkey,
1602 &expansionMonkey,
1603 &contractionMonkey,
1604 &expansionMonkey};
1605 int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]);
1606 int32_t nonMatchCount = 0;
1607
1608 UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY};
1609 const char *strengthNames[] = {"primary", "secondary", "tertiary"};
1610 int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]);
1611 int32_t loopCount = quick? 1000 : 10000;
1612 int32_t firstStrength = 0;
1613 int32_t lastStrength = strengthCount - 1;
1614
1615 if (params != NULL) {
1616 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1617 UnicodeString p(params);
1618
1619 loopCount = getIntParam("loop", p, loopCount);
1620 m_seed = getIntParam("seed", p, m_seed);
1621
1622 RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status);
1623 if (m.find()) {
1624 UnicodeString breakType = m.group(1, status);
1625
1626 for (int32_t s = 0; s < strengthCount; s += 1) {
1627 if (breakType == strengthNames[s]) {
1628 firstStrength = lastStrength = s;
1629 break;
1630 }
1631 }
1632
1633 m.reset();
1634 p = m.replaceFirst("", status);
1635 }
1636
1637 if (RegexMatcher("\\S", p, 0, status).find()) {
1638 // Each option is stripped out of the option string as it is processed.
1639 // All options have been checked. The option string should have been completely emptied..
1640 char buf[100];
1641 p.extract(buf, sizeof(buf), NULL, status);
1642 buf[sizeof(buf)-1] = 0;
1643 errln("Unrecognized or extra parameter: %s\n", buf);
1644 return;
1645 }
1646 #else
1647 infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
1648 #endif
1649 }
1650
1651 for(int32_t s = firstStrength; s <= lastStrength; s += 1) {
1652 int32_t notFoundCount = 0;
1653
1654 ucol_setStrength(coll, strengths[s]);
1655
1656 // TODO: try alternate prefix and suffix too?
1657 // TODO: alterntaes are only equal at primary strength. Is this OK?
1658 for(int32_t t = 0; t < 10000; t += 1) {
1659 uint32_t seed = m_seed;
1660 int32_t nmc = 0;
1661
1662 generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern);
1663 generateTestCase(coll, monkeys, monkeyCount, prefix, altPrefix);
1664 generateTestCase(coll, monkeys, monkeyCount, suffix, altSuffix);
1665
1666 // pattern
1667 notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern, "pattern", strengthNames[s], seed);
1668
1669 testCase.remove();
1670 testCase.append(prefix);
1671 testCase.append(/*alt*/pattern);
1672
1673 // prefix + pattern
1674 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern", strengthNames[s], seed);
1675
1676 testCase.append(suffix);
1677
1678 // prefix + pattern + suffix
1679 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern + suffix", strengthNames[s], seed);
1680
1681 testCase.remove();
1682 testCase.append(pattern);
1683 testCase.append(suffix);
1684
1685 // pattern + suffix
1686 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "pattern + suffix", strengthNames[s], seed);
1687 }
1688
1689 logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount);
1690 }
1691
1692 delete ceToCharsStartingWith;
1693 delete charsToCEList;
1694
1695 uset_close(contractions);
1696 uset_close(expansions);
1697 uset_close(charsToTest);
1698 uset_close(letters);
1699
1700 ucol_close(coll);
1701 }
1702
1703 #endif
1704
1705 #endif