2 **********************************************************************
3 * Copyright (C) 2005-2010, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
9 #include "unicode/utypes.h"
11 #if !UCONFIG_NO_COLLATION
13 #include "unicode/unistr.h"
14 #include "unicode/putil.h"
15 #include "unicode/usearch.h"
18 #include "unicode/coll.h"
19 #include "unicode/tblcoll.h"
20 #include "unicode/coleitr.h"
21 #include "unicode/ucoleitr.h"
23 #include "unicode/regex.h" // TODO: make conditional on regexp being built.
25 #include "unicode/uniset.h"
26 #include "unicode/uset.h"
27 #include "unicode/ustring.h"
35 #include "unicode/colldata.h"
36 #include "unicode/bmsearch.h"
37 #include "unicode/bms.h"
39 #include "xmlparser.h"
// Reports a failure (file, line and the current testId) when x is false.
// Does not return from the calling function; testId must be in scope at the use site.
48 #define TEST_ASSERT(x) {if (!(x)) { \
49 errln("Failure in file %s, line %d, test ID = \"%s\"", __FILE__, __LINE__, testId);}}
// Like TEST_ASSERT but prints the caller-supplied message m and then
// returns from the enclosing (void) function.
51 #define TEST_ASSERT_M(x, m) {if (!(x)) { \
52 errln("Failure in file %s, line %d. \"%s\"", __FILE__, __LINE__, m);return;}}
// Reports via dataerrln when errcode is a failure code; does not return,
// so callers that must stop re-test the status themselves.
54 #define TEST_ASSERT_SUCCESS(errcode) {if (U_FAILURE(errcode)) { \
55 dataerrln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \
56 __FILE__, __LINE__, testId, u_errorName(errcode));}}
// Element count of a true array (not a pointer). The argument is not
// parenthesized, so pass a simple array name only.
58 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
// Raw uprv_malloc of count elements; no constructors are run.
59 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
// Releases memory obtained from NEW_ARRAY through uprv_free.
60 #define DELETE_ARRAY(array) uprv_free((void *) (array))
62 //---------------------------------------------------------------------------
64 // Test class boilerplate
66 //---------------------------------------------------------------------------
67 SSearchTest::SSearchTest()
71 SSearchTest::~SSearchTest()
// Test-suite dispatcher: maps a test index to its name and, when exec is
// true, runs the corresponding test method.
//   index  - which test is being queried / run
//   exec   - when false only 'name' is filled in
//   name   - receives the test name for this index
//   params - forwarded to monkeyTest and bmMonkeyTest
// The numbered cases sit behind !UCONFIG_NO_BREAK_ITERATION.
75 void SSearchTest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char *params
)
77 if (exec
) logln("TestSuite SSearchTest: ");
79 #if !UCONFIG_NO_BREAK_ITERATION
80 case 0: name
= "searchTest";
81 if (exec
) searchTest();
84 case 1: name
= "offsetTest";
85 if (exec
) offsetTest();
88 case 2: name
= "monkeyTest";
89 if (exec
) monkeyTest(params
);
92 case 3: name
= "bmMonkeyTest";
93 if (exec
) bmMonkeyTest(params
);
96 case 4: name
= "boyerMooreTest";
97 if (exec
) boyerMooreTest();
100 case 5: name
= "goodSuffixTest";
101 if (exec
) goodSuffixTest();
104 case 6: name
= "searchTime";
105 if (exec
) searchTime();
// NOTE(review): no call to bmsTest() is visible for this case in this
// excerpt - confirm whether it is actually executed.
108 case 7: name
= "bmsTest";
112 case 8: name
= "bmSearchTest";
113 if (exec
) bmSearchTest();
116 case 9: name
= "udhrTest";
117 if (exec
) udhrTest();
119 case 10: name
= "stringListTest";
120 if (exec
) stringListTest();
124 break; //needed to end loop
129 #if !UCONFIG_NO_BREAK_ITERATION
131 #define PATH_BUFFER_SIZE 2048
132 const char *SSearchTest::getPath(char buffer
[2048], const char *filename
) {
133 UErrorCode status
= U_ZERO_ERROR
;
134 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
136 if (U_FAILURE(status
) || strlen(testDataDirectory
) + strlen(filename
) + 1 >= PATH_BUFFER_SIZE
) {
137 errln("ERROR: getPath() failed - %s", u_errorName(status
));
141 strcpy(buffer
, testDataDirectory
);
142 strcat(buffer
, filename
);
// Data-driven test of the usearch C API.
// Parses ssearch.xml, and for every <test-case> element builds a pattern and
// a target string, opens a collator configured from the element's attributes,
// then checks the first forward match (usearch_search) and the first backward
// match (usearch_searchBackwards) against the expected match range.
// Compiled only when regular expressions and file I/O are available.
147 void SSearchTest::searchTest()
149 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
150 UErrorCode status
= U_ZERO_ERROR
;
151 char path
[PATH_BUFFER_SIZE
];
152 const char *testFilePath
= getPath(path
, "ssearch.xml");
154 if (testFilePath
== NULL
) {
155 return; /* Couldn't get path: error message already output. */
// Parser and document root are held in LocalPointer objects.
158 LocalPointer
<UXMLParser
> parser(UXMLParser::createParser(status
));
159 TEST_ASSERT_SUCCESS(status
);
160 LocalPointer
<UXMLElement
> root(parser
->parseFile(testFilePath
, status
));
161 TEST_ASSERT_SUCCESS(status
);
162 if (U_FAILURE(status
)) {
// A "debug" attribute on the root names a single test case id; the loop
// below compares each case's id against it.
166 const UnicodeString
*debugTestCase
= root
->getAttribute("debug");
167 if (debugTestCase
!= NULL
) {
168 // setenv("USEARCH_DEBUG", "1", 1);
172 const UXMLElement
*testCase
;
// Walk every child element of the root; each must be a <test-case>.
175 while((testCase
= root
->nextChildElement(tc
)) != NULL
) {
177 if (testCase
->getTagName().compare("test-case") != 0) {
178 errln("ssearch, unrecognized XML Element in test file");
// Copy the case id into testId, which the TEST_ASSERT macros print.
181 const UnicodeString
*id
= testCase
->getAttribute("id");
184 id
->extract(0, id
->length(), testId
, sizeof(testId
), US_INV
);
187 // If debugging test case has been specified and this is not it, skip to next.
188 if (id
!=NULL
&& debugTestCase
!=NULL
&& *id
!= *debugTestCase
) {
192 // Get the requested collation strength.
193 // Default is tertiary if the XML attribute is missing from the test case.
195 const UnicodeString
*strength
= testCase
->getAttribute("strength");
196 UColAttributeValue collatorStrength
= UCOL_PRIMARY
;
197 if (strength
==NULL
) { collatorStrength
= UCOL_TERTIARY
;}
198 else if (*strength
=="PRIMARY") { collatorStrength
= UCOL_PRIMARY
;}
199 else if (*strength
=="SECONDARY") { collatorStrength
= UCOL_SECONDARY
;}
200 else if (*strength
=="TERTIARY") { collatorStrength
= UCOL_TERTIARY
;}
201 else if (*strength
=="QUATERNARY") { collatorStrength
= UCOL_QUATERNARY
;}
202 else if (*strength
=="IDENTICAL") { collatorStrength
= UCOL_IDENTICAL
;}
204 // Bogus value supplied for strength. Shouldn't happen, even from
205 // typos, if the XML source has been validated.
206 // This assert is a little deceiving in that strength can be
207 // any of the allowed values, not just TERTIARY, but it will
208 // do the job of getting the error output.
209 TEST_ASSERT(*strength
=="TERTIARY")
213 // Get the collator normalization flag. Default is UCOL_OFF.
215 UColAttributeValue normalize
= UCOL_OFF
;
216 const UnicodeString
*norm
= testCase
->getAttribute("norm");
217 TEST_ASSERT (norm
==NULL
|| *norm
=="ON" || *norm
=="OFF");
218 if (norm
!=NULL
&& *norm
=="ON") {
223 // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE.
225 UColAttributeValue alternateHandling
= UCOL_NON_IGNORABLE
;
226 const UnicodeString
*alt
= testCase
->getAttribute("alternate_handling");
227 TEST_ASSERT (alt
== NULL
|| *alt
== "SHIFTED" || *alt
== "NON_IGNORABLE");
228 if (alt
!= NULL
&& *alt
== "SHIFTED") {
229 alternateHandling
= UCOL_SHIFTED
;
// Locale for the collator: read from the "locale" attribute and extracted
// into the char buffer clocale. defLocale ("en") is declared for the
// missing/empty case - NOTE(review): the fallback assignment itself is not
// visible in this excerpt.
232 const UnicodeString
defLocale("en");
234 const UnicodeString
*locale
= testCase
->getAttribute("locale");
235 if (locale
== NULL
|| locale
->length()==0) {
238 locale
->extract(0, locale
->length(), clocale
, sizeof(clocale
), NULL
);
// Pattern and target are assembled from the <pattern>, <pre>, <m> and <post>
// child elements; each element's text is unescaped before use.
242 UnicodeString target
;
243 UnicodeString pattern
;
244 int32_t expectedMatchStart
= -1;
245 int32_t expectedMatchLimit
= -1;
246 const UXMLElement
*n
;
247 int32_t nodeCount
= 0;
249 n
= testCase
->getChildElement("pattern");
250 TEST_ASSERT(n
!= NULL
);
254 text
= n
->getText(FALSE
);
255 text
= text
.unescape();
256 pattern
.append(text
);
259 n
= testCase
->getChildElement("pre");
261 text
= n
->getText(FALSE
);
262 text
= text
.unescape();
// The expected match range is the target length sampled before and after
// the <m> text is handled (presumably it is appended to target in between;
// the append is not visible here).
267 n
= testCase
->getChildElement("m");
269 expectedMatchStart
= target
.length();
270 text
= n
->getText(FALSE
);
271 text
= text
.unescape();
273 expectedMatchLimit
= target
.length();
277 n
= testCase
->getChildElement("post");
279 text
= n
->getText(FALSE
);
280 text
= text
.unescape();
285 // Check that there weren't extra things in the XML
286 TEST_ASSERT(nodeCount
== testCase
->countChildren());
288 // Open a collator and StringSearch based on the parameters
289 // obtained from the XML.
291 status
= U_ZERO_ERROR
;
292 LocalUCollatorPointer
collator(ucol_open(clocale
, &status
));
293 ucol_setStrength(collator
.getAlias(), collatorStrength
);
294 ucol_setAttribute(collator
.getAlias(), UCOL_NORMALIZATION_MODE
, normalize
, &status
);
295 ucol_setAttribute(collator
.getAlias(), UCOL_ALTERNATE_HANDLING
, alternateHandling
, &status
);
296 LocalUStringSearchPointer
uss(usearch_openFromCollator(pattern
.getBuffer(), pattern
.length(),
297 target
.getBuffer(), target
.length(),
299 NULL
, // the break iterator
302 TEST_ASSERT_SUCCESS(status
);
303 if (U_FAILURE(status
)) {
307 int32_t foundStart
= 0;
308 int32_t foundLimit
= 0;
312 // Do the search, check the match result against the expected results.
314 foundMatch
= usearch_search(uss
.getAlias(), 0, &foundStart
, &foundLimit
, &status
);
315 TEST_ASSERT_SUCCESS(status
);
// Fails when a match is found but none was expected, or when either
// reported boundary differs from the expected one.
316 if ((foundMatch
&& expectedMatchStart
<0) ||
317 (foundStart
!= expectedMatchStart
) ||
318 (foundLimit
!= expectedMatchLimit
)) {
319 TEST_ASSERT(FALSE
); // ouput generic error position
320 infoln("Found, expected match start = %d, %d \n"
321 "Found, expected match limit = %d, %d",
322 foundStart
, expectedMatchStart
, foundLimit
, expectedMatchLimit
);
325 // In case there are other matches...
326 // (should we only do this if the test case passed?)
// The expected range for the backward pass is replaced by whatever the
// forward search(es) last reported.
328 expectedMatchStart
= foundStart
;
329 expectedMatchLimit
= foundLimit
;
331 foundMatch
= usearch_search(uss
.getAlias(), foundLimit
, &foundStart
, &foundLimit
, &status
);
// A fresh search object is adopted before searching backwards.
334 uss
.adoptInstead(usearch_openFromCollator(pattern
.getBuffer(), pattern
.length(),
335 target
.getBuffer(), target
.length(),
341 // Do the backwards search, check the match result against the expected results.
343 foundMatch
= usearch_searchBackwards(uss
.getAlias(), target
.length(), &foundStart
, &foundLimit
, &status
);
344 TEST_ASSERT_SUCCESS(status
);
345 if ((foundMatch
&& expectedMatchStart
<0) ||
346 (foundStart
!= expectedMatchStart
) ||
347 (foundLimit
!= expectedMatchLimit
)) {
348 TEST_ASSERT(FALSE
); // ouput generic error position
349 infoln("Found, expected backwards match start = %d, %d \n"
350 "Found, expected backwards match limit = %d, %d",
351 foundStart
, expectedMatchStart
, foundLimit
, expectedMatchLimit
);
// Searches a set of localized text files from the "udhr" test-data directory
// with the BMS C API. For each (locale, file) pair the file is read through
// ucbuf, the first line is taken as the pattern, a collator / CollData / BMS
// object is opened for the locale, and bms_search is run over the whole buffer.
// A file that cannot be opened is reported with infoln and skipped.
363 void SSearchTest::udhrTest()
365 UErrorCode status
= U_ZERO_ERROR
;
366 char path
[PATH_BUFFER_SIZE
];
367 const char *udhrPath
= getPath(path
, "udhr");
369 if (udhrPath
== NULL
) {
370 // couldn't get path: error message already output...
// Locale id paired with the file that is searched using that locale.
374 UdhrTestCase testCases
[] = {
375 {"en", "udhr_eng.txt"},
376 {"de", "udhr_deu_1996.txt"},
377 {"fr", "udhr_fra.txt"},
378 {"ru", "udhr_rus.txt"},
379 {"th", "udhr_tha.txt"},
380 {"ja", "udhr_jpn.txt"},
381 {"ko", "udhr_kor.txt"},
382 {"zh", "udhr_cmn_hans.txt"},
383 {"zh_Hant", "udhr_cmn_hant.txt"}
386 int32_t testCount
= ARRAY_SIZE(testCases
);
388 for (int32_t t
= 0; t
< testCount
; t
+= 1) {
390 char *resolvedFileName
= NULL
;
391 const char *encoding
= NULL
;
392 UCHARBUF
*ucharBuf
= NULL
;
// Two-step file name resolution: the first call passes a NULL buffer to
// obtain the needed length in len, the second call fills the allocated buffer.
394 ucbuf_resolveFileName(udhrPath
, testCases
[t
].file
, NULL
, &len
, &status
);
395 resolvedFileName
= NEW_ARRAY(char, len
);
397 if(resolvedFileName
== NULL
){
// The sizing call is expected to report buffer overflow; reset before retrying.
401 if(status
== U_BUFFER_OVERFLOW_ERROR
){
402 status
= U_ZERO_ERROR
;
405 ucbuf_resolveFileName(udhrPath
, testCases
[t
].file
, resolvedFileName
, &len
, &status
);
406 ucharBuf
= ucbuf_open(resolvedFileName
, &encoding
, TRUE
, FALSE
, &status
);
408 DELETE_ARRAY(resolvedFileName
);
410 if(U_FAILURE(status
)){
411 infoln("Could not open the input file %s. Test skipped\n", testCases
[t
].file
);
// The whole file content serves as the search target.
415 int32_t targetLen
= 0;
416 const UChar
*target
= ucbuf_getBuffer(ucharBuf
, &targetLen
, &status
);
418 /* The first line of the file contains the pattern */
419 int32_t start
= 0, end
= 0, plen
= 0;
// Scan for the first line terminator (LF, CR or U+2028).
// NOTE(review): no bound against targetLen is visible in this loop header -
// confirm a file without a line break cannot run past the buffer.
421 for(end
= start
; ; end
+= 1) {
422 UChar ch
= target
[end
];
424 if (ch
== 0x000A || ch
== 0x000D || ch
== 0x2028) {
// Copy the first line into its own array to use as the pattern.
431 UChar
*pattern
= NEW_ARRAY(UChar
, plen
);
432 for (int32_t i
= 0; i
< plen
; i
+= 1) {
433 pattern
[i
] = target
[start
++];
437 UCollator
*coll
= ucol_open(testCases
[t
].locale
, &status
);
441 if (U_FAILURE(status
)) {
442 errln("Could not open collator for %s", testCases
[t
].locale
);
443 goto delete_collator
;
446 ucd
= ucd_open(coll
, &status
);
448 if (U_FAILURE(status
)) {
449 errln("Could not open CollData object for %s", testCases
[t
].locale
);
453 bms
= bms_open(ucd
, pattern
, plen
, target
, targetLen
, &status
);
455 if (U_FAILURE(status
)) {
456 errln("Could not open search object for %s", testCases
[t
].locale
);
// Iterate over matches starting at 'offset'.
461 while (bms_search(bms
, offset
, &start
, &end
)) {
466 errln("Could not find pattern - locale: %s, file: %s ", testCases
[t
].locale
, testCases
[t
].file
);
// Per-file cleanup of the pattern copy and the file buffer.
478 DELETE_ARRAY(pattern
);
479 ucbuf_close(ucharBuf
);
// Data-driven test of the BMS (Boyer-Moore search) C API.
// Reads the same ssearch.xml file as searchTest, builds pattern/target and the
// collator settings from each <test-case>, then checks the first forward match
// returned by bms_search against the expected match range.
// Unlike searchTest, parser, root and collator are raw pointers here.
// NOTE(review): confirm every exit path releases them; only the two
// ucol_close calls are visible in this excerpt.
485 void SSearchTest::bmSearchTest()
487 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
488 UErrorCode status
= U_ZERO_ERROR
;
489 char path
[PATH_BUFFER_SIZE
];
490 const char *testFilePath
= getPath(path
, "ssearch.xml");
492 if (testFilePath
== NULL
) {
493 return; /* Couldn't get path: error message already output. */
496 UXMLParser
*parser
= UXMLParser::createParser(status
);
497 TEST_ASSERT_SUCCESS(status
);
498 UXMLElement
*root
= parser
->parseFile(testFilePath
, status
);
499 TEST_ASSERT_SUCCESS(status
);
500 if (U_FAILURE(status
)) {
// A "debug" attribute on the root names a single test case id; the loop
// below compares each case's id against it.
504 const UnicodeString
*debugTestCase
= root
->getAttribute("debug");
505 if (debugTestCase
!= NULL
) {
506 // setenv("USEARCH_DEBUG", "1", 1);
510 const UXMLElement
*testCase
;
// Walk every child element of the root; each must be a <test-case>.
513 while((testCase
= root
->nextChildElement(tc
)) != NULL
) {
515 if (testCase
->getTagName().compare("test-case") != 0) {
516 errln("ssearch, unrecognized XML Element in test file");
// Copy the case id into testId, which the TEST_ASSERT macros print.
519 const UnicodeString
*id
= testCase
->getAttribute("id");
522 id
->extract(0, id
->length(), testId
, sizeof(testId
), US_INV
);
525 // If debugging test case has been specified and this is not it, skip to next.
526 if (id
!=NULL
&& debugTestCase
!=NULL
&& *id
!= *debugTestCase
) {
530 // Get the requested collation strength.
531 // Default is tertiary if the XML attribute is missing from the test case.
533 const UnicodeString
*strength
= testCase
->getAttribute("strength");
534 UColAttributeValue collatorStrength
= UCOL_PRIMARY
;
535 if (strength
==NULL
) { collatorStrength
= UCOL_TERTIARY
;}
536 else if (*strength
=="PRIMARY") { collatorStrength
= UCOL_PRIMARY
;}
537 else if (*strength
=="SECONDARY") { collatorStrength
= UCOL_SECONDARY
;}
538 else if (*strength
=="TERTIARY") { collatorStrength
= UCOL_TERTIARY
;}
539 else if (*strength
=="QUATERNARY") { collatorStrength
= UCOL_QUATERNARY
;}
540 else if (*strength
=="IDENTICAL") { collatorStrength
= UCOL_IDENTICAL
;}
542 // Bogus value supplied for strength. Shouldn't happen, even from
543 // typos, if the XML source has been validated.
544 // This assert is a little deceiving in that strength can be
545 // any of the allowed values, not just TERTIARY, but it will
546 // do the job of getting the error output.
547 TEST_ASSERT(*strength
=="TERTIARY")
551 // Get the collator normalization flag. Default is UCOL_OFF.
553 UColAttributeValue normalize
= UCOL_OFF
;
554 const UnicodeString
*norm
= testCase
->getAttribute("norm");
555 TEST_ASSERT (norm
==NULL
|| *norm
=="ON" || *norm
=="OFF");
556 if (norm
!=NULL
&& *norm
=="ON") {
561 // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE.
563 UColAttributeValue alternateHandling
= UCOL_NON_IGNORABLE
;
564 const UnicodeString
*alt
= testCase
->getAttribute("alternate_handling");
565 TEST_ASSERT (alt
== NULL
|| *alt
== "SHIFTED" || *alt
== "NON_IGNORABLE");
566 if (alt
!= NULL
&& *alt
== "SHIFTED") {
567 alternateHandling
= UCOL_SHIFTED
;
// Locale for the collator: read from the "locale" attribute and extracted
// into the char buffer clocale. defLocale ("en") is declared for the
// missing/empty case - NOTE(review): the fallback assignment itself is not
// visible in this excerpt.
570 const UnicodeString
defLocale("en");
572 const UnicodeString
*locale
= testCase
->getAttribute("locale");
573 if (locale
== NULL
|| locale
->length()==0) {
576 locale
->extract(0, locale
->length(), clocale
, sizeof(clocale
), NULL
);
// Pattern and target are assembled from the <pattern>, <pre>, <m> and <post>
// child elements; each element's text is unescaped before use.
580 UnicodeString target
;
581 UnicodeString pattern
;
582 int32_t expectedMatchStart
= -1;
583 int32_t expectedMatchLimit
= -1;
584 const UXMLElement
*n
;
585 int32_t nodeCount
= 0;
587 n
= testCase
->getChildElement("pattern");
588 TEST_ASSERT(n
!= NULL
);
592 text
= n
->getText(FALSE
);
593 text
= text
.unescape();
594 pattern
.append(text
);
597 n
= testCase
->getChildElement("pre");
599 text
= n
->getText(FALSE
);
600 text
= text
.unescape();
// The expected match range is the target length sampled before and after
// the <m> text is handled (presumably it is appended to target in between;
// the append is not visible here).
605 n
= testCase
->getChildElement("m");
607 expectedMatchStart
= target
.length();
608 text
= n
->getText(FALSE
);
609 text
= text
.unescape();
611 expectedMatchLimit
= target
.length();
615 n
= testCase
->getChildElement("post");
617 text
= n
->getText(FALSE
);
618 text
= text
.unescape();
623 // Check that there weren't extra things in the XML
624 TEST_ASSERT(nodeCount
== testCase
->countChildren());
626 // Open a collator and StringSearch based on the parameters
627 // obtained from the XML.
629 status
= U_ZERO_ERROR
;
630 UCollator
*collator
= ucol_open(clocale
, &status
);
631 ucol_setStrength(collator
, collatorStrength
);
632 ucol_setAttribute(collator
, UCOL_NORMALIZATION_MODE
, normalize
, &status
);
633 ucol_setAttribute(collator
, UCOL_ALTERNATE_HANDLING
, alternateHandling
, &status
);
// The CollData and BMS objects are both opened against the same status; a
// failure in any earlier call is caught by the single check that follows.
634 UCD
*ucd
= ucd_open(collator
, &status
);
635 BMS
*bms
= bms_open(ucd
, pattern
.getBuffer(), pattern
.length(), target
.getBuffer(), target
.length(), &status
);
637 TEST_ASSERT_SUCCESS(status
);
638 if (U_FAILURE(status
)) {
641 ucol_close(collator
);
645 int32_t foundStart
= 0;
646 int32_t foundLimit
= 0;
650 // Do the search, check the match result against the expected results.
// bms_search takes no status argument, hence the disabled assert below.
652 foundMatch
= bms_search(bms
, 0, &foundStart
, &foundLimit
);
653 //TEST_ASSERT_SUCCESS(status);
// Fails when a match is found but none was expected, or when either
// reported boundary differs from the expected one.
654 if ((foundMatch
&& expectedMatchStart
< 0) ||
655 (foundStart
!= expectedMatchStart
) ||
656 (foundLimit
!= expectedMatchLimit
)) {
657 TEST_ASSERT(FALSE
); // ouput generic error position
658 infoln("Found, expected match start = %d, %d \n"
659 "Found, expected match limit = %d, %d",
660 foundStart
, expectedMatchStart
, foundLimit
, expectedMatchLimit
);
665 ucol_close(collator
);
// Builds the list from the collation elements of 'string', starting the
// element iterator at stringOffset (default 0).
685 OrderList(UCollator
*coll
, const UnicodeString
&string
, int32_t stringOffset
= 0);
// Number of entries currently stored.
688 int32_t size(void) const;
// Appends one entry: a collation order plus the low/high source offsets.
689 void add(int32_t order
, int32_t low
, int32_t high
);
// Entry at 'index', or NULL when index is at or beyond size().
690 const Order
*get(int32_t index
) const;
// Field accessors for the entry at 'index'.
691 int32_t getLowOffset(int32_t index
) const;
692 int32_t getHighOffset(int32_t index
) const;
693 int32_t getOrder(int32_t index
) const;
// Element-wise equality of two lists (order and both offsets).
695 UBool
compare(const OrderList
&other
) const;
// Whether the orders of 'other' (minus its final entry) occur in this
// list starting at position 'offset'.
696 UBool
matchesAt(int32_t offset
, const OrderList
&other
) const;
704 OrderList::OrderList()
705 : list(NULL
), listMax(16), listSize(0)
707 list
= new Order
[listMax
];
710 OrderList::OrderList(UCollator
*coll
, const UnicodeString
&string
, int32_t stringOffset
)
711 : list(NULL
), listMax(16), listSize(0)
713 UErrorCode status
= U_ZERO_ERROR
;
714 UCollationElements
*elems
= ucol_openElements(coll
, string
.getBuffer(), string
.length(), &status
);
715 uint32_t strengthMask
= 0;
716 int32_t order
, low
, high
;
718 switch (ucol_getStrength(coll
))
721 strengthMask
|= UCOL_TERTIARYORDERMASK
;
725 strengthMask
|= UCOL_SECONDARYORDERMASK
;
729 strengthMask
|= UCOL_PRIMARYORDERMASK
;
732 list
= new Order
[listMax
];
734 ucol_setOffset(elems
, stringOffset
, &status
);
737 low
= ucol_getOffset(elems
);
738 order
= ucol_next(elems
, &status
);
739 high
= ucol_getOffset(elems
);
741 if (order
!= UCOL_NULLORDER
) {
742 order
&= strengthMask
;
745 if (order
!= UCOL_IGNORABLE
) {
746 add(order
, low
, high
);
748 } while (order
!= UCOL_NULLORDER
);
750 ucol_closeElements(elems
);
753 OrderList::~OrderList()
758 void OrderList::add(int32_t order
, int32_t low
, int32_t high
)
760 if (listSize
>= listMax
) {
763 Order
*newList
= new Order
[listMax
];
765 uprv_memcpy(newList
, list
, listSize
* sizeof(Order
));
770 list
[listSize
].order
= order
;
771 list
[listSize
].lowOffset
= low
;
772 list
[listSize
].highOffset
= high
;
777 const Order
*OrderList::get(int32_t index
) const
779 if (index
>= listSize
) {
786 int32_t OrderList::getLowOffset(int32_t index
) const
788 const Order
*order
= get(index
);
791 return order
->lowOffset
;
797 int32_t OrderList::getHighOffset(int32_t index
) const
799 const Order
*order
= get(index
);
802 return order
->highOffset
;
808 int32_t OrderList::getOrder(int32_t index
) const
810 const Order
*order
= get(index
);
816 return UCOL_NULLORDER
;
819 int32_t OrderList::size() const
824 void OrderList::reverse()
826 for(int32_t f
= 0, b
= listSize
- 1; f
< b
; f
+= 1, b
-= 1) {
827 Order swap
= list
[b
];
834 UBool
OrderList::compare(const OrderList
&other
) const
836 if (listSize
!= other
.listSize
) {
840 for(int32_t i
= 0; i
< listSize
; i
+= 1) {
841 if (list
[i
].order
!= other
.list
[i
].order
||
842 list
[i
].lowOffset
!= other
.list
[i
].lowOffset
||
843 list
[i
].highOffset
!= other
.list
[i
].highOffset
) {
851 UBool
OrderList::matchesAt(int32_t offset
, const OrderList
&other
) const
853 // NOTE: sizes include the NULLORDER, which we don't want to compare.
854 int32_t otherSize
= other
.size() - 1;
856 if (listSize
- 1 - offset
< otherSize
) {
860 for (int32_t i
= offset
, j
= 0; j
< otherSize
; i
+= 1, j
+= 1) {
861 if (getOrder(i
) != other
.getOrder(j
)) {
869 static char *printOffsets(char *buffer
, OrderList
&list
)
871 int32_t size
= list
.size();
874 for(int32_t i
= 0; i
< size
; i
+= 1) {
875 const Order
*order
= list
.get(i
);
878 s
+= sprintf(s
, ", ");
881 s
+= sprintf(s
, "(%d, %d)", order
->lowOffset
, order
->highOffset
);
887 static char *printOrders(char *buffer
, OrderList
&list
)
889 int32_t size
= list
.size();
892 for(int32_t i
= 0; i
< size
; i
+= 1) {
893 const Order
*order
= list
.get(i
);
896 s
+= sprintf(s
, ", ");
899 s
+= sprintf(s
, "%8.8X", order
->order
);
// Checks that CollationElementIterator reports consistent offsets in both
// directions. For each escaped test string it collects (order, low, high)
// triples iterating forward with next(), collects them again iterating
// backward with previous() from the end of the string, reverses the backward
// list and compares the two lists. A mismatch is reported with errln together
// with both offset lists and both CE lists.
905 void SSearchTest::offsetTest()
907 static const UVersionInfo icu47
= { 4, 7, 0, 0 };
// Test strings are written with \u escapes and converted with
// CharsToUnicodeString inside the loop.
908 const char *test
[] = {
909 // The sequence \u0FB3\u0F71\u0F71\u0F80 contains a discontiguous
910 // contraction (\u0FB3\u0F71\u0F80) logically followed by \u0F71.
911 "\\u1E33\\u0FB3\\u0F71\\u0F71\\u0F80\\uD835\\uDF6C\\u01B0",
913 "\\ua191\\u16ef\\u2036\\u017a",
916 // This results in a complex interaction between contraction,
917 // expansion and normalization that confuses the backwards offset fixups.
918 "\\u0F7F\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
921 "\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
922 "\\u07E9\\u07EA\\u07F1\\u07F2\\u07F3",
925 "\\u0300\\u0301\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u0309\\u030A\\u030B\\u030C\\u030D\\u030E\\u030F"
926 "\\u0310\\u0311\\u0312\\u0313\\u0314\\u0315\\u0316\\u0317\\u0318\\u0319\\u031A\\u031B\\u031C\\u031D\\u031E\\u031F"
927 "\\u0320\\u0321\\u0322\\u0323\\u0324\\u0325\\u0326\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F"
928 "\\u0330\\u0331\\u0332\\u0333\\u0334\\u0335\\u0336\\u0337\\u0338\\u0339\\u033A\\u033B\\u033C\\u033D\\u033E\\u033F"
929 "\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\\u034A\\u034B\\u034C\\u034D\\u034E", // currently not working, see #8081
931 "\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318", // currently not working, see #8081
932 "a\\u02FF\\u0301\\u0316", // currently not working, see #8081
933 "a\\u02FF\\u0316\\u0301",
934 "a\\u0430\\u0301\\u0316",
935 "a\\u0430\\u0316\\u0301",
936 "abc\\u0E41\\u0301\\u0316",
937 "abc\\u0E41\\u0316\\u0301",
938 "\\u0E41\\u0301\\u0316",
939 "\\u0E41\\u0316\\u0301",
953 "A\\u0302\\u0301\\u0323B",
957 " \\uD800\\uDC00\\uDC00",
958 "a\\uD800\\uDC00\\uDC00",
964 "\\u0301A\\u0301\\u0301",
970 int32_t testCount
= ARRAY_SIZE(test
);
971 UErrorCode status
= U_ZERO_ERROR
;
// English collator; normalization is switched on below before iterating.
972 RuleBasedCollator
*col
= (RuleBasedCollator
*) Collator::createInstance(Locale::getEnglish(), status
);
973 if (U_FAILURE(status
)) {
974 errcheckln(status
, "Failed to create collator in offsetTest! - %s", u_errorName(status
));
977 char buffer
[4096]; // A bit of a hack... just happens to be long enough for all the test cases...
978 // We could allocate one that's the right size by (CE_count * 10) + 2
979 // 10 chars is enough room for 8 hex digits plus ", ". 2 extra chars for "[" and "]"
981 col
->setAttribute(UCOL_NORMALIZATION_MODE
, UCOL_ON
, status
);
983 for(int32_t i
= 0; i
< testCount
; i
+= 1) {
// Entries 4..6 are skipped while the ICU version is below 4.7.
984 if (!isICUVersionAtLeast(icu47
) && i
>=4 && i
<=6) {
985 continue; // timebomb until ticket #8080 is resolved
987 UnicodeString ts
= CharsToUnicodeString(test
[i
]);
988 CollationElementIterator
*iter
= col
->createCollationElementIterator(ts
);
989 OrderList forwardList
;
990 OrderList backwardList
;
991 int32_t order
, low
, high
;
// Forward pass: every element, including the final NULLORDER, is recorded
// with the iterator offsets sampled before and after next().
994 low
= iter
->getOffset();
995 order
= iter
->next(status
);
996 high
= iter
->getOffset();
998 forwardList
.add(order
, low
, high
);
999 } while (order
!= CollationElementIterator::NULLORDER
);
// Backward pass: start at the end of the string and seed the list with a
// NULLORDER entry so that, once reversed, it lines up with the forward list.
1002 iter
->setOffset(ts
.length(), status
);
1004 backwardList
.add(CollationElementIterator::NULLORDER
, iter
->getOffset(), iter
->getOffset());
1007 high
= iter
->getOffset();
1008 order
= iter
->previous(status
);
1009 low
= iter
->getOffset();
1011 if (order
== CollationElementIterator::NULLORDER
) {
1015 backwardList
.add(order
, low
, high
);
1018 backwardList
.reverse();
// Success is only logged; failure prints both lists for comparison.
1020 if (forwardList
.compare(backwardList
)) {
1021 logln("Works with \"%s\"", test
[i
]);
1022 logln("Forward offsets: [%s]", printOffsets(buffer
, forwardList
));
1023 // logln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
1025 logln("Forward CEs: [%s]", printOrders(buffer
, forwardList
));
1026 // logln("Backward CEs: [%s]", printOrders(buffer, backwardList));
1030 errln("Fails with \"%s\"", test
[i
]);
1031 infoln("Forward offsets: [%s]", printOffsets(buffer
, forwardList
));
1032 infoln("Backward offsets: [%s]", printOffsets(buffer
, backwardList
));
1034 infoln("Forward CEs: [%s]", printOrders(buffer
, forwardList
));
1035 infoln("Backward CEs: [%s]", printOrders(buffer
, backwardList
));
1045 static UnicodeString
&escape(const UnicodeString
&string
, UnicodeString
&buffer
)
1047 for(int32_t i
= 0; i
< string
.length(); i
+= 1) {
1048 UChar32 ch
= string
.char32At(i
);
1050 if (ch
>= 0x0020 && ch
<= 0x007F) {
1052 buffer
.append("\\\\");
1059 if (ch
<= 0xFFFFL
) {
1060 sprintf(cbuffer
, "\\u%4.4X", ch
);
1062 sprintf(cbuffer
, "\\U%8.8X", ch
);
1065 buffer
.append(cbuffer
);
1068 if (ch
>= 0x10000L
) {
// Builds the list from the processed collation elements of 'string'.
1089 PCEList(UCollator
*coll
, const UnicodeString
&string
);
// Number of entries currently stored.
1092 int32_t size() const;
// Entry at 'index', or NULL when index is at or beyond size().
1094 const PCE
*get(int32_t index
) const;
// Field accessors for the entry at 'index'.
1096 int32_t getLowOffset(int32_t index
) const;
1097 int32_t getHighOffset(int32_t index
) const;
1098 uint64_t getOrder(int32_t index
) const;
// Whether the orders of 'other' (minus its final entry) occur in this
// list starting at position 'offset'.
1100 UBool
matchesAt(int32_t offset
, const PCEList
&other
) const;
// Same value as getOrder(index).
1102 uint64_t operator[](int32_t index
) const;
// Appends one entry: a processed CE plus the low/high source offsets.
1105 void add(uint64_t ce
, int32_t low
, int32_t high
);
// Fills the list with the processed collation elements of 'string', read with
// ucol_nextProcessed from offset 0. Every element, including the terminating
// UCOL_PROCESSED_NULLORDER, is added together with its low/high offsets.
1112 PCEList::PCEList(UCollator
*coll
, const UnicodeString
&string
)
1114 UErrorCode status
= U_ZERO_ERROR
;
1115 UCollationElements
*elems
= ucol_openElements(coll
, string
.getBuffer(), string
.length(), &status
);
// NOTE(review): no initialization of listMax / listSize is visible before
// this allocation in this excerpt - confirm both are set beforehand.
1119 list
= new PCE
[listMax
];
1121 ucol_setOffset(elems
, 0, &status
);
1124 order
= ucol_nextProcessed(elems
, &low
, &high
, &status
);
1125 add(order
, low
, high
);
1126 } while (order
!= UCOL_PROCESSED_NULLORDER
);
1128 ucol_closeElements(elems
);
1136 void PCEList::add(uint64_t order
, int32_t low
, int32_t high
)
1138 if (listSize
>= listMax
) {
1141 PCE
*newList
= new PCE
[listMax
];
1143 uprv_memcpy(newList
, list
, listSize
* sizeof(Order
));
1148 list
[listSize
].ce
= order
;
1149 list
[listSize
].lowOffset
= low
;
1150 list
[listSize
].highOffset
= high
;
1155 const PCE
*PCEList::get(int32_t index
) const
1157 if (index
>= listSize
) {
1161 return &list
[index
];
1164 int32_t PCEList::getLowOffset(int32_t index
) const
1166 const PCE
*pce
= get(index
);
1169 return pce
->lowOffset
;
1175 int32_t PCEList::getHighOffset(int32_t index
) const
1177 const PCE
*pce
= get(index
);
1180 return pce
->highOffset
;
1186 uint64_t PCEList::getOrder(int32_t index
) const
1188 const PCE
*pce
= get(index
);
1194 return UCOL_PROCESSED_NULLORDER
;
1197 int32_t PCEList::size() const
1202 UBool
PCEList::matchesAt(int32_t offset
, const PCEList
&other
) const
1204 // NOTE: sizes include the NULLORDER, which we don't want to compare.
1205 int32_t otherSize
= other
.size() - 1;
1207 if (listSize
- 1 - offset
< otherSize
) {
1211 for (int32_t i
= offset
, j
= 0; j
< otherSize
; i
+= 1, j
+= 1) {
1212 if (getOrder(i
) != other
.getOrder(j
)) {
1220 uint64_t PCEList::operator[](int32_t index
) const
1222 return getOrder(index
);
1225 void SSearchTest::boyerMooreTest()
1227 UErrorCode status
= U_ZERO_ERROR
;
1228 UCollator
*coll
= NULL
;
1229 CollData
*data
= NULL
;
1230 const CEList
* ce
= NULL
;
1231 const CEList
* ce1
= NULL
;
1232 UnicodeString lp
= "fuss";
1233 UnicodeString sp
= "fu\\u00DF";
1234 BoyerMooreSearch
*longPattern
= NULL
;
1235 BoyerMooreSearch
*shortPattern
= NULL
;
1236 UnicodeString targets
[] = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball",
1237 "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF",
1238 "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"};
1239 int32_t start
= -1, end
= -1;
1241 coll
= ucol_openFromShortString("LEN_S1", FALSE
, NULL
, &status
);
1242 if (U_FAILURE(status
)) {
1243 errcheckln(status
, "Could not open collator. - %s", u_errorName(status
));
1247 data
= CollData::open(coll
, status
);
1248 if (U_FAILURE(status
)) {
1249 errln("Could not open CollData object.");
1253 data
->getDynamicClassID();
1254 if (U_FAILURE(status
)) {
1255 errln("Could not get dynamic class ID of CollData.");
1256 goto close_patterns
;
1259 data
->getStaticClassID();
1260 if (U_FAILURE(status
)) {
1261 errln("Could not get static class ID of CollData.");
1262 goto close_patterns
;
1265 longPattern
= new BoyerMooreSearch(data
, lp
.unescape(), NULL
, status
);
1266 shortPattern
= new BoyerMooreSearch(data
, sp
.unescape(), NULL
, status
);
1267 if (U_FAILURE(status
)) {
1268 errln("Could not create pattern objects.");
1269 goto close_patterns
;
1272 longPattern
->getBadCharacterTable();
1273 shortPattern
->getBadCharacterTable();
1274 if (U_FAILURE(status
)) {
1275 errln("Could not get bad character table.");
1276 goto close_patterns
;
1279 longPattern
->getGoodSuffixTable();
1280 shortPattern
->getGoodSuffixTable();
1281 if (U_FAILURE(status
)) {
1282 errln("Could not get good suffix table.");
1283 goto close_patterns
;
1286 longPattern
->getDynamicClassID();
1287 shortPattern
->getDynamicClassID();
1288 if (U_FAILURE(status
)) {
1289 errln("Could not get dynamic class ID of BoyerMooreSearch.");
1290 goto close_patterns
;
1293 longPattern
->getStaticClassID();
1294 shortPattern
->getStaticClassID();
1295 if (U_FAILURE(status
)) {
1296 errln("Could not get static class ID of BoyerMooreSearch.");
1297 goto close_patterns
;
1300 longPattern
->getData();
1301 shortPattern
->getData();
1302 if (U_FAILURE(status
)) {
1303 errln("Could not get collate data.");
1304 goto close_patterns
;
1307 ce
= longPattern
->getPatternCEs();
1308 ce1
= shortPattern
->getPatternCEs();
1309 if (U_FAILURE(status
)) {
1310 errln("Could not get pattern CEs.");
1311 goto close_patterns
;
1314 ce
->getDynamicClassID();
1315 ce1
->getDynamicClassID();
1316 if (U_FAILURE(status
)) {
1317 errln("Could not get dynamic class ID of CEList.");
1318 goto close_patterns
;
1321 ce
->getStaticClassID();
1322 ce1
->getStaticClassID();
1323 if (U_FAILURE(status
)) {
1324 errln("Could not get static class ID of CEList.");
1325 goto close_patterns
;
1328 if(data
->minLengthInChars(ce
,0) != 3){
1329 errln("Minimal Length in Characters for 'data' with 'ce' was suppose to give 3.");
1330 goto close_patterns
;
1333 if(data
->minLengthInChars(ce1
,0) != 3){
1334 errln("Minimal Length in Characters for 'data' with 'ce1' was suppose to give 3.");
1335 goto close_patterns
;
1338 for (uint32_t t
= 0; t
< (sizeof(targets
)/sizeof(targets
[0])); t
+= 1) {
1339 UnicodeString target
= targets
[t
].unescape();
1341 longPattern
->setTargetString(&target
, status
);
1342 if (longPattern
->search(0, start
, end
)) {
1343 logln("Test %d: found long pattern at [%d, %d].", t
, start
, end
);
1345 errln("Test %d: did not find long pattern.", t
);
1348 shortPattern
->setTargetString(&target
, status
);
1349 if (shortPattern
->search(0, start
, end
)) {
1350 logln("Test %d: found short pattern at [%d, %d].", t
, start
, end
);
1352 errln("Test %d: did not find short pattern.", t
);
1355 if(longPattern
->empty()){
1356 errln("Test %d: Long pattern should not have been empty.");
1359 if(shortPattern
->empty()){
1360 errln("Test %d: Short pattern should not have been empty.");
1365 delete shortPattern
;
1369 CollData::close(data
);
// bmsTest: exercises the C-style bms_* / ucd_* search wrappers.
// Visible steps: open a collator from the short string "LEN_S1", open a
// collation-data object with ucd_open(), compile the patterns "fuss" and
// "fu\u00DF" (after unescaping) with bms_open(), then search every entry of
// targets[] for both patterns. A failed bms_search() is reported with errln(),
// a successful one is logged with its [start, end] range. Finally bms_empty(),
// bms_getData() and ucd_getCollator() are sanity-checked and both patterns
// are closed with bms_close().
// NOTE(review): this listing has dropped lines (braces, else-branches, the
// declaration of 'data', the close_patterns label and any collator / data
// cleanup); the comments below describe only what is visible.
1373 void SSearchTest::bmsTest()
1375 UErrorCode status
= U_ZERO_ERROR
;
1376 UCollator
*coll
= NULL
;
// Patterns are written in escaped form and converted with unescape().
1378 UnicodeString lp
= "fuss";
1379 UnicodeString lpu
= lp
.unescape();
1380 UnicodeString sp
= "fu\\u00DF";
1381 UnicodeString spu
= sp
.unescape();
1382 BMS
*longPattern
= NULL
;
1383 BMS
*shortPattern
= NULL
;
// Target strings, also stored escaped; each one is unescaped inside the loop.
1384 UnicodeString targets
[] = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball",
1385 "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF",
1386 "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"};
1387 int32_t start
= -1, end
= -1;
1389 coll
= ucol_openFromShortString("LEN_S1", FALSE
, NULL
, &status
);
1390 if (U_FAILURE(status
)) {
1391 errcheckln(status
, "Could not open collator. - %s", u_errorName(status
));
1395 data
= ucd_open(coll
, &status
);
1396 if (U_FAILURE(status
)) {
1397 errln("Could not open CollData object.");
// Both patterns share the same 'data' object; a single status check covers both opens.
1401 longPattern
= bms_open(data
, lpu
.getBuffer(), lpu
.length(), NULL
, 0, &status
);
1402 shortPattern
= bms_open(data
, spu
.getBuffer(), spu
.length(), NULL
, 0, &status
);
1403 if (U_FAILURE(status
)) {
1404 errln("Couldn't open pattern objects.");
1405 goto close_patterns
;
// Every target is expected to contain a match for both patterns.
1408 for (uint32_t t
= 0; t
< (sizeof(targets
)/sizeof(targets
[0])); t
+= 1) {
1409 UnicodeString target
= targets
[t
].unescape();
1411 bms_setTargetString(longPattern
, target
.getBuffer(), target
.length(), &status
);
1412 if (bms_search(longPattern
, 0, &start
, &end
)) {
1413 logln("Test %d: found long pattern at [%d, %d].", t
, start
, end
);
1415 errln("Test %d: did not find long pattern.", t
);
1418 bms_setTargetString(shortPattern
, target
.getBuffer(), target
.length(), &status
);
1419 if (bms_search(shortPattern
, 0, &start
, &end
)) {
1420 logln("Test %d: found short pattern at [%d, %d].", t
, start
, end
);
1422 errln("Test %d: did not find short pattern.", t
);
1426 /* Add better coverage for bms code. */
1427 if(bms_empty(longPattern
)) {
1428 errln("FAIL: longgPattern is empty.");
1431 if (!bms_getData(longPattern
)) {
1432 errln("FAIL: bms_getData returned NULL.");
1435 if (!ucd_getCollator(data
)) {
1436 errln("FAIL: ucd_getCollator returned NULL.");
// Release the pattern objects in reverse order of creation.
1440 bms_close(shortPattern
);
1441 bms_close(longPattern
);
// goodSuffixTest: searches the target "cloveldfxeld" for the pattern "fxeld"
// using a BoyerMooreSearch built on a collator opened with a NULL locale.
// A successful search is logged with its [start, end] range; a miss is
// reported with errln(). The commented-out literals next to 'pat' and
// 'target' are an alternative pattern/target pair.
// NOTE(review): the early-return / cleanup lines between the visible
// statements are not present in this listing; only CollData::close(data)
// is visible as cleanup.
1449 void SSearchTest::goodSuffixTest()
1451 UErrorCode status
= U_ZERO_ERROR
;
1452 UCollator
*coll
= NULL
;
1453 CollData
*data
= NULL
;
1454 UnicodeString pat
= /*"gcagagag"*/ "fxeld";
1455 UnicodeString target
= /*"gcatcgcagagagtatacagtacg"*/ "cloveldfxeld";
1456 BoyerMooreSearch
*pattern
= NULL
;
1457 int32_t start
= -1, end
= -1;
1459 coll
= ucol_open(NULL
, &status
);
1460 if (U_FAILURE(status
)) {
1461 errcheckln(status
, "Couldn't open collator. - %s", u_errorName(status
));
1465 data
= CollData::open(coll
, status
);
1466 if (U_FAILURE(status
)) {
1467 errln("Couldn't open CollData object.");
// The search object is heap-allocated and bound to 'target' at construction.
1471 pattern
= new BoyerMooreSearch(data
, pat
, &target
, status
);
1472 if (U_FAILURE(status
)) {
1473 errln("Couldn't open pattern object.");
1477 if (pattern
->search(0, start
, end
)) {
1478 logln("Found pattern at [%d, %d].", start
, end
);
1480 errln("Did not find pattern.");
1487 CollData::close(data
);
1492 // searchTime() A quick and dirty performance test for string search.
1493 // Probably doesn't really belong as part of intltest, but it
1494 // does check that the search succeeds, and gets the right result,
1495 // so it serves as a functionality test also.
1497 // To run as a perf test, up the loop count, select by commenting
1498 // and uncommenting in the code the operation to be measured,
1499 // rebuild, and measure the running time of this test alone.
1501 // time LD_LIBRARY_PATH=whatever ./intltest collate/SSearchTest/searchTime
//
// Visible flow: the pattern "maketh houndes ete hem" is located in
// longishText once with strstr() (the reference position) and once with the
// ICU search object; TEST_ASSERT_M requires both positions to agree. The
// search is then repeated 10000 times in a loop.
// NOTE(review): the declarations of 'found', 'i' and 'j' and the #else /
// #endif lines of the TEST_BOYER_MOORE conditionals are not present in this
// listing.
1503 void SSearchTest::searchTime() {
1504 static const char *longishText
=
1505 "Whylom, as olde stories tellen us,\n"
1506 "Ther was a duk that highte Theseus:\n"
1507 "Of Athenes he was lord and governour,\n"
1508 "And in his tyme swich a conquerour,\n"
1509 "That gretter was ther noon under the sonne.\n"
1510 "Ful many a riche contree hadde he wonne;\n"
1511 "What with his wisdom and his chivalrye,\n"
1512 "He conquered al the regne of Femenye,\n"
1513 "That whylom was y-cleped Scithia;\n"
1514 "And weddede the quene Ipolita,\n"
1515 "And broghte hir hoom with him in his contree\n"
1516 "With muchel glorie and greet solempnitee,\n"
1517 "And eek hir yonge suster Emelye.\n"
1518 "And thus with victorie and with melodye\n"
1519 "Lete I this noble duk to Athenes ryde,\n"
1520 "And al his hoost, in armes, him bisyde.\n"
1521 "And certes, if it nere to long to here,\n"
1522 "I wolde han told yow fully the manere,\n"
1523 "How wonnen was the regne of Femenye\n"
1524 "By Theseus, and by his chivalrye;\n"
1525 "And of the grete bataille for the nones\n"
1526 "Bitwixen Athen's and Amazones;\n"
1527 "And how asseged was Ipolita,\n"
1528 "The faire hardy quene of Scithia;\n"
1529 "And of the feste that was at hir weddinge,\n"
1530 "And of the tempest at hir hoom-cominge;\n"
1531 "But al that thing I moot as now forbere.\n"
1532 "I have, God woot, a large feeld to ere,\n"
1533 "And wayke been the oxen in my plough.\n"
1534 "The remenant of the tale is long y-nough.\n"
1535 "I wol nat letten eek noon of this route;\n"
1536 "Lat every felawe telle his tale aboute,\n"
1537 "And lat see now who shal the soper winne;\n"
1538 "And ther I lefte, I wol ageyn biginne.\n"
1539 "This duk, of whom I make mencioun,\n"
1540 "When he was come almost unto the toun,\n"
1541 "In al his wele and in his moste pryde,\n"
1542 "He was war, as he caste his eye asyde,\n"
1543 "Wher that ther kneled in the hye weye\n"
1544 "A companye of ladies, tweye and tweye,\n"
1545 "Ech after other, clad in clothes blake; \n"
1546 "But swich a cry and swich a wo they make,\n"
1547 "That in this world nis creature livinge,\n"
1548 "That herde swich another weymentinge;\n"
1549 "And of this cry they nolde never stenten,\n"
1550 "Til they the reynes of his brydel henten.\n"
1551 "'What folk ben ye, that at myn hoomcominge\n"
1552 "Perturben so my feste with cryinge'?\n"
1553 "Quod Theseus, 'have ye so greet envye\n"
1554 "Of myn honour, that thus compleyne and crye? \n"
1555 "Or who hath yow misboden, or offended?\n"
1556 "And telleth me if it may been amended;\n"
1557 "And why that ye ben clothed thus in blak'?\n"
1558 "The eldest lady of hem alle spak,\n"
1559 "When she hadde swowned with a deedly chere,\n"
1560 "That it was routhe for to seen and here,\n"
1561 "And seyde: 'Lord, to whom Fortune hath yiven\n"
1562 "Victorie, and as a conquerour to liven,\n"
1563 "Noght greveth us your glorie and your honour;\n"
1564 "But we biseken mercy and socour.\n"
1565 "Have mercy on our wo and our distresse.\n"
1566 "Som drope of pitee, thurgh thy gentilesse,\n"
1567 "Up-on us wrecched wommen lat thou falle.\n"
1568 "For certes, lord, ther nis noon of us alle,\n"
1569 "That she nath been a duchesse or a quene;\n"
1570 "Now be we caitifs, as it is wel sene:\n"
1571 "Thanked be Fortune, and hir false wheel,\n"
1572 "That noon estat assureth to be weel.\n"
1573 "And certes, lord, t'abyden your presence,\n"
1574 "Here in the temple of the goddesse Clemence\n"
1575 "We han ben waytinge al this fourtenight;\n"
1576 "Now help us, lord, sith it is in thy might.\n"
1577 "I wrecche, which that wepe and waille thus,\n"
1578 "Was whylom wyf to king Capaneus,\n"
1579 "That starf at Thebes, cursed be that day!\n"
1580 "And alle we, that been in this array,\n"
1581 "And maken al this lamentacioun,\n"
1582 "We losten alle our housbondes at that toun,\n"
1583 "Whyl that the sege ther-aboute lay.\n"
1584 "And yet now th'olde Creon, weylaway!\n"
1585 "The lord is now of Thebes the citee, \n"
1586 "Fulfild of ire and of iniquitee,\n"
1587 "He, for despyt, and for his tirannye,\n"
1588 "To do the dede bodyes vileinye,\n"
1589 "Of alle our lordes, whiche that ben slawe,\n"
1590 "Hath alle the bodyes on an heep y-drawe,\n"
1591 "And wol nat suffren hem, by noon assent,\n"
1592 "Neither to been y-buried nor y-brent,\n"
1593 "But maketh houndes ete hem in despyt. zet'\n";
// Defining TEST_BOYER_MOORE disables the usearch_* code guarded by
// "#ifndef TEST_BOYER_MOORE" below, leaving the BoyerMooreSearch path.
1595 #define TEST_BOYER_MOORE 1
1596 const char *cPattern
= "maketh houndes ete hem";
1597 //const char *cPattern = "Whylom";
1598 //const char *cPattern = "zet";
1599 const char *testId
= "searchTime()"; // for error macros.
1600 UnicodeString target
= longishText
;
1601 UErrorCode status
= U_ZERO_ERROR
;
// The collator is owned by a LocalUCollatorPointer; CollData is opened from its alias.
1604 LocalUCollatorPointer
collator(ucol_open("en", &status
));
1605 CollData
*data
= CollData::open(collator
.getAlias(), status
);
1606 if (U_FAILURE(status
) || collator
.isNull() || data
== NULL
) {
1607 errcheckln(status
, "Unable to open UCollator or CollData. - %s", u_errorName(status
));
1610 //ucol_setStrength(collator.getAlias(), collatorStrength);
1611 //ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status);
1612 UnicodeString uPattern
= cPattern
;
1613 #ifndef TEST_BOYER_MOORE
1614 LocalUStringSearchPointer
uss(usearch_openFromCollator(uPattern
.getBuffer(), uPattern
.length(),
1615 target
.getBuffer(), target
.length(),
1616 collator
.getAlias(),
1617 NULL
, // the break iterator
1619 TEST_ASSERT_SUCCESS(status
);
1621 BoyerMooreSearch
bms(data
, uPattern
, &target
, status
);
1622 TEST_ASSERT_SUCCESS(status
);
1625 // int32_t foundStart;
1626 // int32_t foundEnd;
1629 // Find the match position using strstr
1630 const char *pm
= strstr(longishText
, cPattern
);
1631 TEST_ASSERT_M(pm
!=NULL
, "No pattern match with strstr");
// Reference offset from strstr(); it is compared directly with the ICU match start below.
1632 int32_t refMatchPos
= (int32_t)(pm
- longishText
);
1633 int32_t icuMatchPos
;
1634 int32_t icuMatchEnd
;
1635 #ifndef TEST_BOYER_MOORE
1636 usearch_search(uss
.getAlias(), 0, &icuMatchPos
, &icuMatchEnd
, &status
);
1637 TEST_ASSERT_SUCCESS(status
);
1639 found
= bms
.search(0, icuMatchPos
, icuMatchEnd
);
1641 TEST_ASSERT_M(refMatchPos
== icuMatchPos
, "strstr and icu give different match positions.");
1646 // Try loopcounts around 100000 to some millions, depending on the operation,
1647 // to get runtimes of at least several seconds.
1648 for (i
=0; i
<10000; i
++) {
1649 #ifndef TEST_BOYER_MOORE
1650 found
= usearch_search(uss
.getAlias(), 0, &icuMatchPos
, &icuMatchEnd
, &status
);
1652 found
= bms
.search(0, icuMatchPos
, icuMatchEnd
);
1654 //TEST_ASSERT_SUCCESS(status);
1655 //TEST_ASSERT(found);
1657 // usearch_setOffset(uss.getAlias(), 0, &status);
1658 // icuMatchPos = usearch_next(uss.getAlias(), &status);
1660 // The i+j stuff is to confuse the optimizer and get it to actually leave the
1661 // call to strstr in place.
1662 //pm = strstr(longishText+j, cPattern);
// NOTE(review): "pm - longishText" has type ptrdiff_t, which is not 'long' on
// every platform, so "%ld" may be a format mismatch there - confirm.
1666 printf("%ld, %d\n", pm
-longishText
, j
);
1667 #ifdef TEST_BOYER_MOORE
1668 CollData::close(data
);
1673 //----------------------------------------------------------------------------------------
1675 // Random Numbers. Similar to standard lib rand() and srand()
1676 // Not using library to
1677 // 1. Get same results on all platforms.
1678 // 2. Get access to current seed, to more easily reproduce failures.
1680 //---------------------------------------------------------------------------------------
// Current state of the test-local pseudo-random generator. Tests may read it
// to record the seed of a failing case and assign it to replay that case.
static uint32_t m_seed = 1;

// Advances the generator one step and returns the next pseudo-random value.
//
// This is a linear congruential generator with multiplier 1103515245 and
// increment 12345; the state wraps modulo 2^32 through uint32_t arithmetic.
// The result is taken from bits 16..30 of the new state, so it always lies
// in the range [0, 32767].
static uint32_t m_rand()
{
    m_seed = m_seed * 1103515245 + 12345;
    return (uint32_t)(m_seed/65536) % 32768;
}
// Pure virtual generator hook of the Monkey base class (the class header is
// not visible in this listing). Overrides visible in this file add generated
// text to the two output strings - presumably a random piece goes to 'test'
// and an equivalent piece to 'alternate'; confirm against the implementations.
1692 virtual void append(UnicodeString
&test
, UnicodeString
&alternate
) = 0;
// SetMonkey: a Monkey that draws single code points from a USet.
// Only the constructor and the append() override are visible here; the
// access specifiers, destructor declaration and data members are missing
// from this listing.
1709 class SetMonkey
: public Monkey
// theSet: the set that append() samples from.
1712 SetMonkey(const USet
*theSet
);
1715 virtual void append(UnicodeString
&test
, UnicodeString
&alternate
);
// Constructor: stores the caller's USet pointer in the 'set' member.
// The pointer is only stored, not copied, by the visible initializer.
1721 SetMonkey::SetMonkey(const USet
*theSet
)
1722 : Monkey(), set(theSet
)
// Destructor: its body is not visible in this listing.
1727 SetMonkey::~SetMonkey()
// Picks one code point from 'set' using m_rand() and appends it, as a
// one-code-point string, to 'alternate'.
// NOTE(review): the statement that appends to 'test' is not visible in this
// listing (a line is missing before the alternate.append call).
// NOTE(review): "m_rand() % size" divides by zero if the set is empty -
// confirm callers never pass an empty set.
1732 void SetMonkey::append(UnicodeString
&test
, UnicodeString
&alternate
)
1734 int32_t size
= uset_size(set
);
1735 int32_t index
= m_rand() % size
;
1736 UChar32 ch
= uset_charAt(set
, index
);
1737 UnicodeString
str(ch
);
1740 alternate
.append(str
); // flip case, or some junk?
// StringSetMonkey: a Monkey that draws items (code-point ranges or strings)
// from a USet and builds an alternate spelling for each through collation
// data. Access specifiers, the destructor declaration and the data members
// are missing from this listing.
1743 class StringSetMonkey
: public Monkey
// theSet: items to sample; theCollator / theCollData: used by
// generateAlternative() to build and compare CE lists.
1746 StringSetMonkey(const USet
*theSet
, UCollator
*theCollator
, CollData
*theCollData
);
1749 void append(UnicodeString
&testCase
, UnicodeString
&alternate
);
// Appends to 'alternate' either a different string with the same collation
// elements as 'testCase' or, failing that, 'testCase' itself; returns 'alternate'.
1752 UnicodeString
&generateAlternative(const UnicodeString
&testCase
, UnicodeString
&alternate
);
// Constructor: stores the three caller-supplied pointers in the members
// 'set', 'coll' and 'collData'. Nothing is copied by the visible initializers.
1759 StringSetMonkey::StringSetMonkey(const USet
*theSet
, UCollator
*theCollator
, CollData
*theCollData
)
1760 : Monkey(), set(theSet
), coll(theCollator
), collData(theCollData
)
// Destructor: its body is not visible in this listing.
1765 StringSetMonkey::~StringSetMonkey()
// Picks a random item from 'set' with uset_getItem(). Two outcomes are
// visible: a branch that picks a random code point between rangeStart and
// rangeEnd, and a branch (len > 0) that takes the string copied into
// 'buffer'. In both, the chosen text is appended to 'testCase' and
// generateAlternative() appends a counterpart to 'alternate'.
// NOTE(review): the declaration of 'buffer' and the condition of the first
// branch are not visible in this listing. The call passes a capacity of 16,
// and the existing comment notes that len is never checked against it.
// NOTE(review): "m_rand() % itemCount" divides by zero for an empty set.
1770 void StringSetMonkey::append(UnicodeString
&testCase
, UnicodeString
&alternate
)
1772 int32_t itemCount
= uset_getItemCount(set
), len
= 0;
1773 int32_t index
= m_rand() % itemCount
;
1774 UChar32 rangeStart
= 0, rangeEnd
= 0;
1776 UErrorCode err
= U_ZERO_ERROR
;
1778 len
= uset_getItem(set
, index
, &rangeStart
, &rangeEnd
, buffer
, 16, &err
);
// Range item: choose one code point uniformly from [rangeStart, rangeEnd].
1781 int32_t offset
= m_rand() % (rangeEnd
- rangeStart
+ 1);
1782 UChar32 ch
= rangeStart
+ offset
;
1783 UnicodeString
str(ch
);
1785 testCase
.append(str
);
1786 generateAlternative(str
, alternate
);
1787 } else if (len
> 0) {
1788 // should check that len < 16...
1789 UnicodeString
str(buffer
, len
);
1791 testCase
.append(str
);
1792 generateAlternative(str
, alternate
);
1794 // shouldn't happen...
// Builds an alternate string whose collation elements match those of
// 'testCase' and appends it to 'alternate'; returns 'alternate'.
//
// Visible algorithm: walk the CE list of 'testCase'; at each offset fetch the
// strings collData associates with that CE, then randomly try candidates
// until one whose CE list matches at the current offset is found, append it
// to 'alt' and advance by that CE list's size. Fallbacks that append
// 'testCase' unchanged: an empty CE list, no string list for a CE, more than
// stringCount failed tries, or a final 'alt' whose CEs do not match.
// NOTE(review): the declarations of 'offset', 'alt' and 'tries', and the
// condition guarding the freeCEList() call inside the retry loop, are not
// visible in this listing.
1798 UnicodeString
&StringSetMonkey::generateAlternative(const UnicodeString
&testCase
, UnicodeString
&alternate
)
1800 // find out shortest string for the longest sequence of ces.
1801 // needs to be refined to use dynamic programming, but will be roughly right
1802 UErrorCode status
= U_ZERO_ERROR
;
1803 CEList
ceList(coll
, testCase
, status
);
1807 if (ceList
.size() == 0) {
1808 return alternate
.append(testCase
);
1811 while (offset
< ceList
.size()) {
1812 int32_t ce
= ceList
.get(offset
);
1813 const StringList
*strings
= collData
->getStringList(ce
);
1815 if (strings
== NULL
) {
1816 return alternate
.append(testCase
);
1819 int32_t stringCount
= strings
->size();
1822 // find random string that generates the same CEList
1823 const CEList
*ceList2
= NULL
;
1824 const UnicodeString
*string
= NULL
;
1825 UBool matches
= FALSE
;
1828 int32_t s
= m_rand() % stringCount
;
// Give up once more candidates than exist have been tried without a match.
1830 if (tries
++ > stringCount
) {
1831 alternate
.append(testCase
);
1835 string
= strings
->get(s
);
1836 ceList2
= collData
->getCEList(string
);
1837 matches
= ceList
.matchesAt(offset
, ceList2
);
1840 collData
->freeCEList((CEList
*) ceList2
);
1842 } while (! matches
);
// A matching candidate was found: use it and skip the CEs it covers.
1844 alt
.append(*string
);
1845 offset
+= ceList2
->size();
1846 collData
->freeCEList(ceList2
);
// Final check: the assembled alternate must produce the same CEs from offset 0.
1849 const CEList
altCEs(coll
, alt
, status
);
1851 if (ceList
.matchesAt(0, &altCEs
)) {
1852 return alternate
.append(alt
);
1855 return alternate
.append(testCase
);
// Generates a random test string and an alternate for it.
//
// Visible steps: monkeys[0] always contributes first, then between 1 and 4
// further pieces come from randomly chosen monkeys. CE lists are built for
// both results with 'coll', and the enclosing do/while repeats until the CE
// list of 'testCase' matches that of 'alternate' at offset 0.
// NOTE(review): the declaration of 'matches', the opening "do {" and any
// clearing of 'testCase' / 'alternate' between attempts are not visible in
// this listing.
1858 static void generateTestCase(UCollator
*coll
, Monkey
*monkeys
[], int32_t monkeyCount
, UnicodeString
&testCase
, UnicodeString
&alternate
)
1860 int32_t pieces
= (m_rand() % 4) + 1;
1861 UErrorCode status
= U_ZERO_ERROR
;
1867 monkeys
[0]->append(testCase
, alternate
);
1869 for(int32_t piece
= 0; piece
< pieces
; piece
+= 1) {
1870 int32_t monkey
= m_rand() % monkeyCount
;
1872 monkeys
[monkey
]->append(testCase
, alternate
);
1875 const CEList
ceTest(coll
, testCase
, status
);
1876 const CEList
ceAlt(coll
, alternate
, status
);
1878 matches
= ceTest
.matchesAt(0, &ceAlt
);
1879 } while (! matches
);
1883 // Find the next acceptable boundary following the specified starting index
1884 // in the target text being searched.
1885 // TODO: refine what is an acceptable boundary. For the moment,
1886 // choose the next position not within a combining sequence.
//
// Visible logic: read the code point at startIndex with U16_NEXT. If it is a
// control, LF or CR (by grapheme-cluster-break property) handling stops
// there; otherwise keep reading code points while they are Extend or
// SpacingMark, remembering the index reached before each read, and return
// that remembered index.
// NOTE(review): the declaration of 'c', the loop header and the statements
// inside the early-exit branches are not visible in this listing.
1889 static int32_t nextBoundaryAfter(const UnicodeString
&string
, int32_t startIndex
) {
1890 const UChar
*text
= string
.getBuffer();
1891 int32_t textLen
= string
.length();
1893 if (startIndex
>= textLen
) {
1898 int32_t i
= startIndex
;
1900 U16_NEXT(text
, i
, textLen
, c
);
1902 // If we are on a control character, stop without looking for combining marks.
1903 // Control characters do not combine.
1904 int32_t gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
1905 if (gcProperty
==U_GCB_CONTROL
|| gcProperty
==U_GCB_LF
|| gcProperty
==U_GCB_CR
) {
1909 // The initial character was not a control, and can thus accept trailing
1910 // combining characters. Advance over however many of them there are.
1911 int32_t indexOfLastCharChecked
;
// Remember the position before each U16_NEXT so it can be returned if the
// next code point turns out not to be a combining mark.
1914 indexOfLastCharChecked
= i
;
1920 U16_NEXT(text
, i
, textLen
, c
);
1921 gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
1923 if (gcProperty
!= U_GCB_EXTEND
&& gcProperty
!= U_GCB_SPACING_MARK
) {
1928 return indexOfLastCharChecked
;
// Reports whether 'index' falls inside a combining sequence of 'string'.
//
// Visible logic: indices at or beyond the end, or <= 0, are handled by an
// early exit. The code point at 'index' must have grapheme-cluster-break
// property Extend or SpacingMark; if so, the result is true unless the
// preceding code point is a control, LF or CR.
// NOTE(review): the declaration of 'c' and the bodies of the two early-exit
// branches are not visible in this listing (presumably they return FALSE -
// confirm).
1933 static UBool
isInCombiningSequence(const UnicodeString
&string
, int32_t index
) {
1934 const UChar
*text
= string
.getBuffer();
1935 int32_t textLen
= string
.length();
1937 if (index
>=textLen
|| index
<=0) {
1941 // If the character at the current index is not a GRAPHEME_EXTEND
1942 // then we can not be within a combining sequence.
1944 U16_GET(text
, 0, index
, textLen
, c
);
1945 int32_t gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
1946 if (gcProperty
!= U_GCB_EXTEND
&& gcProperty
!= U_GCB_SPACING_MARK
) {
1950 // We are at a combining mark. If the preceding character is anything
1951 // except a CONTROL, CR or LF, we are in a combining sequence.
// U16_PREV moves 'index' back and loads the preceding code point into 'c'.
1952 U16_PREV(text
, 0, index
, c
);
1953 gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
1955 return !(gcProperty
==U_GCB_CONTROL
|| gcProperty
==U_GCB_LF
|| gcProperty
==U_GCB_CR
);
// Reference (brute-force) collation search used to compute expected results.
//
// Parameters:
//   coll        collator used to build the order lists and pick the locale
//               for the character break iterator
//   target      text to search
//   offset      passed to the target OrderList constructor
//   pattern     text to look for
//   matchStart, matchEnd  outputs; both are set to -1 before scanning
//
// Visible logic: build OrderLists for target and pattern, then try every
// position i in the target orders. A candidate at i is examined for: starting
// in the middle of an expansion (low offset == high offset of the first CE),
// starting off a grapheme boundary, ending in the middle of an expansion, and
// ending off a grapheme boundary. An empty pattern never matches.
// NOTE(review): the statements inside those checks (presumably 'continue'),
// the assignments to matchStart / matchEnd and the return statements are not
// visible in this listing.
1959 static UBool
simpleSearch(UCollator
*coll
, const UnicodeString
&target
, int32_t offset
, const UnicodeString
&pattern
, int32_t &matchStart
, int32_t &matchEnd
)
1961 UErrorCode status
= U_ZERO_ERROR
;
1962 OrderList
targetOrders(coll
, target
, offset
);
1963 OrderList
patternOrders(coll
, pattern
);
// Both sizes are reduced by one; presumably the lists end with a terminator
// entry (see the UCOL_NULLORDER test below) - confirm against OrderList.
1964 int32_t targetSize
= targetOrders
.size() - 1;
1965 int32_t patternSize
= patternOrders
.size() - 1;
1966 UBreakIterator
*charBreakIterator
= ubrk_open(UBRK_CHARACTER
, ucol_getLocaleByType(coll
, ULOC_VALID_LOCALE
, &status
),
1967 target
.getBuffer(), target
.length(), &status
);
1969 if (patternSize
== 0) {
1970 // Searching for an empty pattern always fails
1971 matchStart
= matchEnd
= -1;
1972 ubrk_close(charBreakIterator
);
1976 matchStart
= matchEnd
= -1;
1978 for(int32_t i
= 0; i
< targetSize
; i
+= 1) {
1979 if (targetOrders
.matchesAt(i
, patternOrders
)) {
// start: offset of the first matched CE; maxLimit: offset of the CE just
// past the match; minLimit: offset of the last matched CE.
1980 int32_t start
= targetOrders
.getLowOffset(i
);
1981 int32_t maxLimit
= targetOrders
.getLowOffset(i
+ patternSize
);
1982 int32_t minLimit
= targetOrders
.getLowOffset(i
+ patternSize
- 1);
1984 // if the low and high offsets of the first CE in
1985 // the match are the same, it means that the match
1986 // starts in the middle of an expansion - all but
1987 // the first CE of the expansion will have the offset
1988 // of the following character.
1989 if (start
== targetOrders
.getHighOffset(i
)) {
1993 // Make sure match starts on a grapheme boundary
1994 if (! ubrk_isBoundary(charBreakIterator
, start
)) {
1998 // If the low and high offsets of the CE after the match
1999 // are the same, it means that the match ends in the middle
2000 // of an expansion sequence.
2001 if (maxLimit
== targetOrders
.getHighOffset(i
+ patternSize
) &&
2002 targetOrders
.getOrder(i
+ patternSize
) != UCOL_NULLORDER
) {
2006 int32_t mend
= maxLimit
;
2008 // Find the first grapheme break after the character index
2009 // of the last CE in the match. If it's after character index
2010 // that's after the last CE in the match, use that index
2011 // as the end of the match.
2012 if (minLimit
< maxLimit
) {
2013 int32_t nba
= ubrk_following(charBreakIterator
, minLimit
);
2015 if (nba
>= targetOrders
.getHighOffset(i
+ patternSize
- 1)) {
2020 if (mend
> maxLimit
) {
2024 if (! ubrk_isBoundary(charBreakIterator
, mend
)) {
// The break iterator is closed on each visible exit path.
2031 ubrk_close(charBreakIterator
);
2036 ubrk_close(charBreakIterator
);
// Extracts an integer option of the form "name = value" from a parameter
// string.
//
// Visible logic: 'name' (taken by value) is extended into the regular
// expression "<name> *= *(-?\d+)" and matched against 'params'. The text of
// capture group 1 is copied into a 100-byte buffer (its length clamped to fit)
// and converted with strtol() in base 10. The matched option is then removed
// from 'params' via replaceFirst(""). 'val' starts out as 'defaultVal'.
// NOTE(review): the second parameter appears as "¶ms" in this listing; that
// looks like a mis-encoded "&params" (an HTML "&para" entity) - restore it in
// the real source.
// NOTE(review): the "if (m.find())" guard, the body of the final status check
// and the return statement are not visible in this listing.
2040 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
2041 static int32_t getIntParam(UnicodeString name
, UnicodeString
¶ms
, int32_t defaultVal
) {
2042 int32_t val
= defaultVal
;
2044 name
.append(" *= *(-?\\d+)");
2046 UErrorCode status
= U_ZERO_ERROR
;
2047 RegexMatcher
m(name
, params
, 0, status
);
2050 // The param exists. Convert the string to an int.
2051 char valString
[100];
2052 int32_t paramLength
= m
.end(1, status
) - m
.start(1, status
);
// Clamp the digit count so the extracted text always fits in valString.
2054 if (paramLength
>= (int32_t)(sizeof(valString
)-1)) {
2055 paramLength
= (int32_t)(sizeof(valString
)-2);
2058 params
.extract(m
.start(1, status
), paramLength
, valString
, sizeof(valString
));
2059 val
= strtol(valString
, NULL
, 10);
2061 // Delete this parameter from the params string.
2063 params
= m
.replaceFirst("", status
);
2066 //U_ASSERT(U_SUCCESS(status));
2067 if (! U_SUCCESS(status
)) {
// Runs one monkey-test comparison with the usearch_* API.
//
// For 'pattern' and then for 'altPattern': compute the expected match in
// 'testCase' with simpleSearch(), run usearch_search() from offset 0, and
// call errln() when simpleSearch found a match (expectedStart >= 0) but the
// ICU result has a different start or end. 'name', 'strength' and 'seed' are
// used only in the failure message. Returns notFoundCount.
// NOTE(review): the bodies of the "expectedStart == -1 && actualStart == -1"
// branches (presumably incrementing notFoundCount) and the remaining
// arguments of usearch_openFromCollator are not visible in this listing.
2075 #if !UCONFIG_NO_COLLATION
2076 int32_t SSearchTest::monkeyTestCase(UCollator
*coll
, const UnicodeString
&testCase
, const UnicodeString
&pattern
, const UnicodeString
&altPattern
,
2077 const char *name
, const char *strength
, uint32_t seed
)
2079 UErrorCode status
= U_ZERO_ERROR
;
2080 int32_t actualStart
= -1, actualEnd
= -1;
2081 //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length();
2082 int32_t expectedStart
= -1, expectedEnd
= -1;
2083 int32_t notFoundCount
= 0;
2084 LocalUStringSearchPointer
uss(usearch_openFromCollator(pattern
.getBuffer(), pattern
.length(),
2085 testCase
.getBuffer(), testCase
.length(),
2087 NULL
, // the break iterator
2090 // **** TODO: find *all* matches, not just first one ****
2091 simpleSearch(coll
, testCase
, 0, pattern
, expectedStart
, expectedEnd
);
2093 usearch_search(uss
.getAlias(), 0, &actualStart
, &actualEnd
, &status
);
// Only a mismatch against a positive reference result counts as a failure.
2095 if (expectedStart
>= 0 && (actualStart
!= expectedStart
|| actualEnd
!= expectedEnd
)) {
2096 errln("Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
2097 " strength=%s seed=%d",
2098 name
, expectedStart
, expectedEnd
, actualStart
, actualEnd
, strength
, seed
);
2101 if (expectedStart
== -1 && actualStart
== -1) {
2105 // **** TODO: find *all* matches, not just first one ****
2106 simpleSearch(coll
, testCase
, 0, altPattern
, expectedStart
, expectedEnd
);
// Reuse the same search object for the alternate pattern.
2108 usearch_setPattern(uss
.getAlias(), altPattern
.getBuffer(), altPattern
.length(), &status
);
2110 usearch_search(uss
.getAlias(), 0, &actualStart
, &actualEnd
, &status
);
2112 if (expectedStart
>= 0 && (actualStart
!= expectedStart
|| actualEnd
!= expectedEnd
)) {
2113 errln("Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
2114 " strength=%s seed=%d",
2115 name
, expectedStart
, expectedEnd
, actualStart
, actualEnd
, strength
, seed
);
2118 if (expectedStart
== -1 && actualStart
== -1) {
2122 return notFoundCount
;
// Formats the UTF-16 code units of 'ustr' as hexadecimal text in 'cbuf'
// (capacity 'cbuflen' bytes) for use in failure messages.
//
// Visible logic: while at least 9 bytes remain, one code unit is written as
// " %04X"; otherwise "..." is written if at least 4 bytes remain, and a
// further case handles cbuflen >= 1.
// NOTE(review): the statements that advance 'cbuf' and reduce 'cbuflen' after
// each sprintf, the body of the "cbuflen >= 1" case and the loop exit are not
// visible in this listing.
2125 static void hexForUnicodeString(const UnicodeString
&ustr
, char * cbuf
, int32_t cbuflen
)
2127 int32_t ustri
, ustrlen
= ustr
.length();
2129 for (ustri
= 0; ustri
< ustrlen
; ++ustri
) {
2130 if (cbuflen
>= 9 /* format width for single code unit(5) + terminating ellipsis(3) + null(1) */) {
2131 int len
= sprintf(cbuf
, " %04X", ustr
.charAt(ustri
));
2135 if (cbuflen
>= 4 /* terminating ellipsis(3) + null(1) */) {
2136 sprintf(cbuf
, "...");
2137 } else if (cbuflen
>= 1) {
// Boyer-Moore counterpart of monkeyTestCase().
//
// For 'pattern' (searched with 'bms') and then 'altPattern' (searched with
// 'abms'): compute the expected match in 'testCase' with simpleSearch(), point
// the search object at 'testCase' with setTargetString(), search from offset
// 0, and call errln() when simpleSearch found a match but the Boyer-Moore
// result has a different start or end. The failing pattern is included in the
// message as hex via hexForUnicodeString(). Returns notFoundCount.
// NOTE(review): the declaration of 'hexbuf' and the bodies of the
// "expectedStart == -1 && actualStart == -1" branches are not visible in this
// listing.
2145 int32_t SSearchTest::bmMonkeyTestCase(UCollator
*coll
, const UnicodeString
&testCase
, const UnicodeString
&pattern
, const UnicodeString
&altPattern
,
2146 BoyerMooreSearch
*bms
, BoyerMooreSearch
*abms
,
2147 const char *name
, const char *strength
, uint32_t seed
)
2149 UErrorCode status
= U_ZERO_ERROR
;
2150 int32_t actualStart
= -1, actualEnd
= -1;
2151 //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length();
2152 int32_t expectedStart
= -1, expectedEnd
= -1;
2153 int32_t notFoundCount
= 0;
2156 // **** TODO: find *all* matches, not just first one ****
2157 simpleSearch(coll
, testCase
, 0, pattern
, expectedStart
, expectedEnd
);
2159 bms
->setTargetString(&testCase
, status
);
2160 bms
->search(0, actualStart
, actualEnd
);
// Only a mismatch against a positive reference result counts as a failure.
2162 if (expectedStart
>= 0 && (actualStart
!= expectedStart
|| actualEnd
!= expectedEnd
)) {
2163 hexForUnicodeString(pattern
, hexbuf
, sizeof(hexbuf
));
2164 errln("Boyer-Moore Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
2165 " strength=%s seed=%d <pattern>: %s",
2166 name
, expectedStart
, expectedEnd
, actualStart
, actualEnd
, strength
, seed
, hexbuf
);
2169 if (expectedStart
== -1 && actualStart
== -1) {
2173 // **** TODO: find *all* matches, not just first one ****
2174 simpleSearch(coll
, testCase
, 0, altPattern
, expectedStart
, expectedEnd
);
2176 abms
->setTargetString(&testCase
, status
);
2177 abms
->search(0, actualStart
, actualEnd
);
2179 if (expectedStart
>= 0 && (actualStart
!= expectedStart
|| actualEnd
!= expectedEnd
)) {
2180 hexForUnicodeString(altPattern
, hexbuf
, sizeof(hexbuf
));
2181 errln("Boyer-Moore Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
2182 " strength=%s seed=%d <pattern>: %s",
2183 name
, expectedStart
, expectedEnd
, actualStart
, actualEnd
, strength
, seed
, hexbuf
);
2186 if (expectedStart
== -1 && actualStart
== -1) {
2191 return notFoundCount
;
// monkeyTest: randomized comparison of usearch_* against simpleSearch().
//
// Setup: a collator opened from the short string "S1", its CollData, the
// collator's contraction and expansion sets, and a letter set (letters minus
// ideographs and hangul); one monkey is built per set.
// Options (in 'params', only when regular expressions are available):
//   loop=<n>      iterations per strength (default 1000 if 'quick', else 10000)
//   seed=<n>      initial value for m_seed
//   strength=primary|secondary|tertiary   restricts the run to one strength
// Anything left in the option string after these are stripped is reported as
// an error.
// Each iteration generates a random pattern, prefix and suffix and calls
// monkeyTestCase() on four texts: the pattern alone, prefix + pattern,
// prefix + pattern + suffix, and pattern + suffix.
// NOTE(review): the contents of monkeys[], the declaration of 'buf', the
// if-guards around the regex handling, the #else / #endif lines and any
// clearing of 'testCase' between sub-tests are not visible in this listing.
2195 void SSearchTest::monkeyTest(char *params
)
2198 UErrorCode status
= U_ZERO_ERROR
;
2199 //UCollator *coll = ucol_open(NULL, &status);
2200 UCollator
*coll
= ucol_openFromShortString("S1", FALSE
, NULL
, &status
);
2202 if (U_FAILURE(status
)) {
2203 errcheckln(status
, "Failed to create collator in MonkeyTest! - %s", u_errorName(status
));
2207 CollData
*monkeyData
= CollData::open(coll
, status
);
2209 USet
*expansions
= uset_openEmpty();
2210 USet
*contractions
= uset_openEmpty();
2212 ucol_getContractionsAndExpansions(coll
, contractions
, expansions
, FALSE
, &status
);
// 39 is the length of the pattern literal passed to U_STRING_INIT / uset_openPattern.
2214 U_STRING_DECL(letter_pattern
, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
2215 U_STRING_INIT(letter_pattern
, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
2216 USet
*letters
= uset_openPattern(letter_pattern
, 39, &status
);
2217 SetMonkey
letterMonkey(letters
);
2218 StringSetMonkey
contractionMonkey(contractions
, coll
, monkeyData
);
2219 StringSetMonkey
expansionMonkey(expansions
, coll
, monkeyData
);
2220 UnicodeString testCase
;
2221 UnicodeString alternate
;
2222 UnicodeString pattern
, altPattern
;
2223 UnicodeString prefix
, altPrefix
;
2224 UnicodeString suffix
, altSuffix
;
2226 Monkey
*monkeys
[] = {
2236 int32_t monkeyCount
= sizeof(monkeys
) / sizeof(monkeys
[0]);
2237 // int32_t nonMatchCount = 0;
2239 UCollationStrength strengths
[] = {UCOL_PRIMARY
, UCOL_SECONDARY
, UCOL_TERTIARY
};
2240 const char *strengthNames
[] = {"primary", "secondary", "tertiary"};
2241 int32_t strengthCount
= sizeof(strengths
) / sizeof(strengths
[0]);
2242 int32_t loopCount
= quick
? 1000 : 10000;
2243 int32_t firstStrength
= 0;
2244 int32_t lastStrength
= strengthCount
- 1; //*/ 0;
2246 if (params
!= NULL
) {
2247 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
2248 UnicodeString
p(params
);
// getIntParam() removes each recognised option from 'p' as it is parsed.
2250 loopCount
= getIntParam("loop", p
, loopCount
);
2251 m_seed
= getIntParam("seed", p
, m_seed
);
2253 RegexMatcher
m(" *strength *= *(primary|secondary|tertiary) *", p
, 0, status
);
2255 UnicodeString breakType
= m
.group(1, status
);
// Map the strength name to its index and run only that strength.
2257 for (int32_t s
= 0; s
< strengthCount
; s
+= 1) {
2258 if (breakType
== strengthNames
[s
]) {
2259 firstStrength
= lastStrength
= s
;
2265 p
= m
.replaceFirst("", status
);
2268 if (RegexMatcher("\\S", p
, 0, status
).find()) {
2269 // Each option is stripped out of the option string as it is processed.
2270 // All options have been checked. The option string should have been completely emptied..
2272 p
.extract(buf
, sizeof(buf
), NULL
, status
);
2273 buf
[sizeof(buf
)-1] = 0;
2274 errln("Unrecognized or extra parameter: %s\n", buf
);
2278 infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
2282 for(int32_t s
= firstStrength
; s
<= lastStrength
; s
+= 1) {
2283 int32_t notFoundCount
= 0;
2285 logln("Setting strength to %s.", strengthNames
[s
]);
2286 ucol_setStrength(coll
, strengths
[s
]);
2288 // TODO: try alternate prefix and suffix too?
2289 // TODO: alternates are only equal at primary strength. Is this OK?
2290 for(int32_t t
= 0; t
< loopCount
; t
+= 1) {
// Capture the seed before generating, so a failure message can reproduce this case.
2291 uint32_t seed
= m_seed
;
2294 generateTestCase(coll
, monkeys
, monkeyCount
, pattern
, altPattern
);
2295 generateTestCase(coll
, monkeys
, monkeyCount
, prefix
, altPrefix
);
2296 generateTestCase(coll
, monkeys
, monkeyCount
, suffix
, altSuffix
);
// Sub-test 1: the pattern searched within itself.
2299 notFoundCount
+= monkeyTestCase(coll
, pattern
, pattern
, altPattern
, "pattern", strengthNames
[s
], seed
);
2302 testCase
.append(prefix
);
2303 testCase
.append(/*alt*/pattern
);
2306 notFoundCount
+= monkeyTestCase(coll
, testCase
, pattern
, altPattern
, "prefix + pattern", strengthNames
[s
], seed
);
2308 testCase
.append(suffix
);
2310 // prefix + pattern + suffix
2311 notFoundCount
+= monkeyTestCase(coll
, testCase
, pattern
, altPattern
, "prefix + pattern + suffix", strengthNames
[s
], seed
);
2314 testCase
.append(pattern
);
2315 testCase
.append(suffix
);
2318 notFoundCount
+= monkeyTestCase(coll
, testCase
, pattern
, altPattern
, "pattern + suffix", strengthNames
[s
], seed
);
2321 logln("For strength %s the not found count is %d.", strengthNames
[s
], notFoundCount
);
// Release the sets and the collation data opened during setup.
2324 uset_close(contractions
);
2325 uset_close(expansions
);
2326 uset_close(letters
);
2328 CollData::close(monkeyData
);
2333 void SSearchTest::bmMonkeyTest(char *params
)
2335 static const UVersionInfo icu47
= { 4, 7, 0, 0 }; // for timebomb
2336 static const UChar skipChars
[] = { 0x0E40, 0x0E41, 0x0E42, 0x0E43, 0x0E44, 0xAAB5, 0xAAB6, 0xAAB9, 0xAABB, 0xAABC, 0 }; // for timebomb
2338 UErrorCode status
= U_ZERO_ERROR
;
2339 UCollator
*coll
= ucol_openFromShortString("LEN_S1", FALSE
, NULL
, &status
);
2341 if (U_FAILURE(status
)) {
2342 errcheckln(status
, "Failed to create collator in MonkeyTest! - %s", u_errorName(status
));
2346 CollData
*monkeyData
= CollData::open(coll
, status
);
2348 USet
*expansions
= uset_openEmpty();
2349 USet
*contractions
= uset_openEmpty();
2351 ucol_getContractionsAndExpansions(coll
, contractions
, expansions
, FALSE
, &status
);
2353 U_STRING_DECL(letter_pattern
, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
2354 U_STRING_INIT(letter_pattern
, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
2355 USet
*letters
= uset_openPattern(letter_pattern
, 39, &status
);
2356 SetMonkey
letterMonkey(letters
);
2357 StringSetMonkey
contractionMonkey(contractions
, coll
, monkeyData
);
2358 StringSetMonkey
expansionMonkey(expansions
, coll
, monkeyData
);
2359 UnicodeString testCase
;
2360 UnicodeString alternate
;
2361 UnicodeString pattern
, altPattern
;
2362 UnicodeString prefix
, altPrefix
;
2363 UnicodeString suffix
, altSuffix
;
2365 Monkey
*monkeys
[] = {
2375 int32_t monkeyCount
= sizeof(monkeys
) / sizeof(monkeys
[0]);
2376 // int32_t nonMatchCount = 0;
2378 UCollationStrength strengths
[] = {UCOL_PRIMARY
, UCOL_SECONDARY
, UCOL_TERTIARY
};
2379 const char *strengthNames
[] = {"primary", "secondary", "tertiary"};
2380 int32_t strengthCount
= sizeof(strengths
) / sizeof(strengths
[0]);
2381 int32_t loopCount
= quick
? 1000 : 10000;
2382 int32_t firstStrength
= 0;
2383 int32_t lastStrength
= strengthCount
- 1; //*/ 0;
2385 if (params
!= NULL
) {
2386 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
2387 UnicodeString
p(params
);
2389 loopCount
= getIntParam("loop", p
, loopCount
);
2390 m_seed
= getIntParam("seed", p
, m_seed
);
2392 RegexMatcher
m(" *strength *= *(primary|secondary|tertiary) *", p
, 0, status
);
2394 UnicodeString breakType
= m
.group(1, status
);
2396 for (int32_t s
= 0; s
< strengthCount
; s
+= 1) {
2397 if (breakType
== strengthNames
[s
]) {
2398 firstStrength
= lastStrength
= s
;
2404 p
= m
.replaceFirst("", status
);
2407 if (RegexMatcher("\\S", p
, 0, status
).find()) {
2408 // Each option is stripped out of the option string as it is processed.
2409 // All options have been checked. The option string should have been completely emptied..
2411 p
.extract(buf
, sizeof(buf
), NULL
, status
);
2412 buf
[sizeof(buf
)-1] = 0;
2413 errln("Unrecognized or extra parameter: %s\n", buf
);
2417 infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
2421 for(int32_t s
= firstStrength
; s
<= lastStrength
; s
+= 1) {
2422 int32_t notFoundCount
= 0;
2424 logln("Setting strength to %s.", strengthNames
[s
]);
2425 ucol_setStrength(coll
, strengths
[s
]);
2427 CollData
*data
= CollData::open(coll
, status
);
2429 UnicodeString
skipString(skipChars
); // for timebomb
2430 UnicodeSet
* skipSet
= UnicodeSet::createFromAll(skipString
); // for timebomb
2431 // TODO: try alternate prefix and suffix too?
2432 // TODO: alternates are only equal at primary strength. Is this OK?
2433 for(int32_t t
= 0; t
< loopCount
; t
+= 1) {
2434 uint32_t seed
= m_seed
;
2437 generateTestCase(coll
, monkeys
, monkeyCount
, pattern
, altPattern
);
2438 generateTestCase(coll
, monkeys
, monkeyCount
, prefix
, altPrefix
);
2439 generateTestCase(coll
, monkeys
, monkeyCount
, suffix
, altSuffix
);
2441 if (!isICUVersionAtLeast(icu47
) && skipSet
->containsSome(pattern
)) {
2442 continue; // timebomb until ticket #8080 is resolved
2445 BoyerMooreSearch
pat(data
, pattern
, NULL
, status
);
2446 BoyerMooreSearch
alt(data
, altPattern
, NULL
, status
);
2448 // **** need a better way to deal with this ****
2457 notFoundCount
+= bmMonkeyTestCase(coll
, pattern
, pattern
, altPattern
, &pat
, &alt
, "pattern", strengthNames
[s
], seed
);
2460 testCase
.append(prefix
);
2461 testCase
.append(/*alt*/pattern
);
2464 notFoundCount
+= bmMonkeyTestCase(coll
, testCase
, pattern
, altPattern
, &pat
, &alt
, "prefix + pattern", strengthNames
[s
], seed
);
2466 testCase
.append(suffix
);
2468 // prefix + pattern + suffix
2469 notFoundCount
+= bmMonkeyTestCase(coll
, testCase
, pattern
, altPattern
, &pat
, &alt
, "prefix + pattern + suffix", strengthNames
[s
], seed
);
2472 testCase
.append(pattern
);
2473 testCase
.append(suffix
);
2476 notFoundCount
+= bmMonkeyTestCase(coll
, testCase
, pattern
, altPattern
, &pat
, &alt
, "pattern + suffix", strengthNames
[s
], seed
);
2478 delete skipSet
; // for timebomb
2480 CollData::close(data
);
2482 logln("For strength %s the not found count is %d.", strengthNames
[s
], notFoundCount
);
2485 uset_close(contractions
);
2486 uset_close(expansions
);
2487 uset_close(letters
);
2489 CollData::close(monkeyData
);
2494 void SSearchTest::stringListTest(){
2495 UErrorCode status
= U_ZERO_ERROR
;
2496 StringList
*sl
= new StringList(status
);
2497 if(U_FAILURE(status
)){
2498 errln("ERROR: stringListTest: Could not start StringList");
2501 const UChar chars
[] = {
2504 sl
->add(chars
, (int32_t) 0, status
);
2505 if(U_FAILURE(status
)){
2506 errln("ERROR: stringListTest: StringList::add");
2509 if(sl
->getDynamicClassID() != StringList::getStaticClassID()){
2510 errln("ERROR: stringListTest: getDynamicClassID and getStaticClassID does not match");