2 **********************************************************************
3 * Copyright (C) 2005-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
9 #include "unicode/utypes.h"
11 #if !UCONFIG_NO_COLLATION
13 #include "unicode/unistr.h"
14 #include "unicode/putil.h"
15 #include "unicode/usearch.h"
18 #include "unicode/coll.h"
19 #include "unicode/tblcoll.h"
20 #include "unicode/coleitr.h"
21 #include "unicode/ucoleitr.h"
23 #include "unicode/regex.h" // TODO: make conditional on regexp being built.
25 #include "unicode/uniset.h"
26 #include "unicode/uset.h"
27 #include "unicode/ustring.h"
35 #include "unicode/colldata.h"
36 #include "unicode/bmsearch.h"
37 #include "unicode/bms.h"
39 #include "xmlparser.h"
// Reports a failure (file, line and current test ID) when x is false.
// Expands to a braced block, so existing call sites may omit the trailing semicolon.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d, test ID = \"%s\"", __FILE__, __LINE__, testId);}}

// Like TEST_ASSERT, but reports the message m and returns from the enclosing function.
#define TEST_ASSERT_M(x, m) {if (!(x)) { \
    errln("Failure in file %s, line %d. \"%s\"", __FILE__, __LINE__, m);return;}}

// Reports a data error (file, line, test ID and error name) when errcode is a failure code.
#define TEST_ASSERT_SUCCESS(errcode) {if (U_FAILURE(errcode)) { \
    dataerrln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \
        __FILE__, __LINE__, testId, u_errorName(errcode));}}

// Number of elements in a true array (not a pointer). The argument is fully
// parenthesised so that any array-valued expression can be passed safely.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Allocates room for count objects of the given type with uprv_malloc. The whole
// expansion is parenthesised so it composes safely inside larger expressions.
#define NEW_ARRAY(type, count) ((type *) uprv_malloc((count) * sizeof(type)))

// Releases memory obtained through NEW_ARRAY.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
62 //---------------------------------------------------------------------------
64 // Test class boilerplate
66 //---------------------------------------------------------------------------
67 SSearchTest::SSearchTest()
71 SSearchTest::~SSearchTest()
75 void SSearchTest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char *params
)
77 if (exec
) logln("TestSuite SSearchTest: ");
79 #if !UCONFIG_NO_BREAK_ITERATION
80 case 0: name
= "searchTest";
81 if (exec
) searchTest();
84 case 1: name
= "offsetTest";
85 if (exec
) offsetTest();
88 case 2: name
= "monkeyTest";
89 if (exec
) monkeyTest(params
);
92 case 3: name
= "bmMonkeyTest";
93 if (exec
) bmMonkeyTest(params
);
96 case 4: name
= "boyerMooreTest";
97 if (exec
) boyerMooreTest();
100 case 5: name
= "goodSuffixTest";
101 if (exec
) goodSuffixTest();
104 case 6: name
= "searchTime";
105 if (exec
) searchTime();
108 case 7: name
= "bmsTest";
112 case 8: name
= "bmSearchTest";
113 if (exec
) bmSearchTest();
116 case 9: name
= "udhrTest";
117 if (exec
) udhrTest();
119 case 10: name
= "stringListTest";
120 if (exec
) stringListTest();
124 break; //needed to end loop
129 #if !UCONFIG_NO_BREAK_ITERATION
131 #define PATH_BUFFER_SIZE 2048
132 const char *SSearchTest::getPath(char buffer
[2048], const char *filename
) {
133 UErrorCode status
= U_ZERO_ERROR
;
134 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
136 if (U_FAILURE(status
) || strlen(testDataDirectory
) + strlen(filename
) + 1 >= PATH_BUFFER_SIZE
) {
137 errln("ERROR: getPath() failed - %s", u_errorName(status
));
141 strcpy(buffer
, testDataDirectory
);
142 strcat(buffer
, filename
);
147 void SSearchTest::searchTest()
149 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
150 UErrorCode status
= U_ZERO_ERROR
;
151 char path
[PATH_BUFFER_SIZE
];
152 const char *testFilePath
= getPath(path
, "ssearch.xml");
154 if (testFilePath
== NULL
) {
155 return; /* Couldn't get path: error message already output. */
158 LocalPointer
<UXMLParser
> parser(UXMLParser::createParser(status
));
159 TEST_ASSERT_SUCCESS(status
);
160 LocalPointer
<UXMLElement
> root(parser
->parseFile(testFilePath
, status
));
161 TEST_ASSERT_SUCCESS(status
);
162 if (U_FAILURE(status
)) {
166 const UnicodeString
*debugTestCase
= root
->getAttribute("debug");
167 if (debugTestCase
!= NULL
) {
168 // setenv("USEARCH_DEBUG", "1", 1);
172 const UXMLElement
*testCase
;
175 while((testCase
= root
->nextChildElement(tc
)) != NULL
) {
177 if (testCase
->getTagName().compare("test-case") != 0) {
178 errln("ssearch, unrecognized XML Element in test file");
181 const UnicodeString
*id
= testCase
->getAttribute("id");
184 id
->extract(0, id
->length(), testId
, sizeof(testId
), US_INV
);
187 // If debugging test case has been specified and this is not it, skip to next.
188 if (id
!=NULL
&& debugTestCase
!=NULL
&& *id
!= *debugTestCase
) {
192 // Get the requested collation strength.
193 // Default is tertiary if the XML attribute is missing from the test case.
195 const UnicodeString
*strength
= testCase
->getAttribute("strength");
196 UColAttributeValue collatorStrength
= UCOL_PRIMARY
;
197 if (strength
==NULL
) { collatorStrength
= UCOL_TERTIARY
;}
198 else if (*strength
=="PRIMARY") { collatorStrength
= UCOL_PRIMARY
;}
199 else if (*strength
=="SECONDARY") { collatorStrength
= UCOL_SECONDARY
;}
200 else if (*strength
=="TERTIARY") { collatorStrength
= UCOL_TERTIARY
;}
201 else if (*strength
=="QUATERNARY") { collatorStrength
= UCOL_QUATERNARY
;}
202 else if (*strength
=="IDENTICAL") { collatorStrength
= UCOL_IDENTICAL
;}
204 // Bogus value supplied for strength. Shouldn't happen, even from
205 // typos, if the XML source has been validated.
206 // This assert is a little deceiving in that strength can be
207 // any of the allowed values, not just TERTIARY, but it will
208 // do the job of getting the error output.
209 TEST_ASSERT(*strength
=="TERTIARY")
213 // Get the collator normalization flag. Default is UCOL_OFF.
215 UColAttributeValue normalize
= UCOL_OFF
;
216 const UnicodeString
*norm
= testCase
->getAttribute("norm");
217 TEST_ASSERT (norm
==NULL
|| *norm
=="ON" || *norm
=="OFF");
218 if (norm
!=NULL
&& *norm
=="ON") {
223 // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE.
225 UColAttributeValue alternateHandling
= UCOL_NON_IGNORABLE
;
226 const UnicodeString
*alt
= testCase
->getAttribute("alternate_handling");
227 TEST_ASSERT (alt
== NULL
|| *alt
== "SHIFTED" || *alt
== "NON_IGNORABLE");
228 if (alt
!= NULL
&& *alt
== "SHIFTED") {
229 alternateHandling
= UCOL_SHIFTED
;
232 const UnicodeString
defLocale("en");
234 const UnicodeString
*locale
= testCase
->getAttribute("locale");
235 if (locale
== NULL
|| locale
->length()==0) {
238 locale
->extract(0, locale
->length(), clocale
, sizeof(clocale
), NULL
);
242 UnicodeString target
;
243 UnicodeString pattern
;
244 int32_t expectedMatchStart
= -1;
245 int32_t expectedMatchLimit
= -1;
246 const UXMLElement
*n
;
247 int32_t nodeCount
= 0;
249 n
= testCase
->getChildElement("pattern");
250 TEST_ASSERT(n
!= NULL
);
254 text
= n
->getText(FALSE
);
255 text
= text
.unescape();
256 pattern
.append(text
);
259 n
= testCase
->getChildElement("pre");
261 text
= n
->getText(FALSE
);
262 text
= text
.unescape();
267 n
= testCase
->getChildElement("m");
269 expectedMatchStart
= target
.length();
270 text
= n
->getText(FALSE
);
271 text
= text
.unescape();
273 expectedMatchLimit
= target
.length();
277 n
= testCase
->getChildElement("post");
279 text
= n
->getText(FALSE
);
280 text
= text
.unescape();
285 // Check that there weren't extra things in the XML
286 TEST_ASSERT(nodeCount
== testCase
->countChildren());
288 // Open a collator and StringSearch based on the parameters
289 // obtained from the XML.
291 status
= U_ZERO_ERROR
;
292 LocalUCollatorPointer
collator(ucol_open(clocale
, &status
));
293 ucol_setStrength(collator
.getAlias(), collatorStrength
);
294 ucol_setAttribute(collator
.getAlias(), UCOL_NORMALIZATION_MODE
, normalize
, &status
);
295 ucol_setAttribute(collator
.getAlias(), UCOL_ALTERNATE_HANDLING
, alternateHandling
, &status
);
296 LocalUStringSearchPointer
uss(usearch_openFromCollator(pattern
.getBuffer(), pattern
.length(),
297 target
.getBuffer(), target
.length(),
299 NULL
, // the break iterator
302 TEST_ASSERT_SUCCESS(status
);
303 if (U_FAILURE(status
)) {
307 int32_t foundStart
= 0;
308 int32_t foundLimit
= 0;
312 // Do the search, check the match result against the expected results.
314 foundMatch
= usearch_search(uss
.getAlias(), 0, &foundStart
, &foundLimit
, &status
);
315 TEST_ASSERT_SUCCESS(status
);
316 if ((foundMatch
&& expectedMatchStart
<0) ||
317 (foundStart
!= expectedMatchStart
) ||
318 (foundLimit
!= expectedMatchLimit
)) {
319 TEST_ASSERT(FALSE
); // ouput generic error position
320 infoln("Found, expected match start = %d, %d \n"
321 "Found, expected match limit = %d, %d",
322 foundStart
, expectedMatchStart
, foundLimit
, expectedMatchLimit
);
325 // In case there are other matches...
326 // (should we only do this if the test case passed?)
328 expectedMatchStart
= foundStart
;
329 expectedMatchLimit
= foundLimit
;
331 foundMatch
= usearch_search(uss
.getAlias(), foundLimit
, &foundStart
, &foundLimit
, &status
);
334 uss
.adoptInstead(usearch_openFromCollator(pattern
.getBuffer(), pattern
.length(),
335 target
.getBuffer(), target
.length(),
341 // Do the backwards search, check the match result against the expected results.
343 foundMatch
= usearch_searchBackwards(uss
.getAlias(), target
.length(), &foundStart
, &foundLimit
, &status
);
344 TEST_ASSERT_SUCCESS(status
);
345 if ((foundMatch
&& expectedMatchStart
<0) ||
346 (foundStart
!= expectedMatchStart
) ||
347 (foundLimit
!= expectedMatchLimit
)) {
348 TEST_ASSERT(FALSE
); // ouput generic error position
349 infoln("Found, expected backwards match start = %d, %d \n"
350 "Found, expected backwards match limit = %d, %d",
351 foundStart
, expectedMatchStart
, foundLimit
, expectedMatchLimit
);
363 void SSearchTest::udhrTest()
365 UErrorCode status
= U_ZERO_ERROR
;
366 char path
[PATH_BUFFER_SIZE
];
367 const char *udhrPath
= getPath(path
, "udhr");
369 if (udhrPath
== NULL
) {
370 // couldn't get path: error message already output...
374 UdhrTestCase testCases
[] = {
375 {"en", "udhr_eng.txt"},
376 {"de", "udhr_deu_1996.txt"},
377 {"fr", "udhr_fra.txt"},
378 {"ru", "udhr_rus.txt"},
379 {"th", "udhr_tha.txt"},
380 {"ja", "udhr_jpn.txt"},
381 {"ko", "udhr_kor.txt"},
382 {"zh", "udhr_cmn_hans.txt"},
383 {"zh_Hant", "udhr_cmn_hant.txt"}
386 int32_t testCount
= ARRAY_SIZE(testCases
);
388 for (int32_t t
= 0; t
< testCount
; t
+= 1) {
390 char *resolvedFileName
= NULL
;
391 const char *encoding
= NULL
;
392 UCHARBUF
*ucharBuf
= NULL
;
394 ucbuf_resolveFileName(udhrPath
, testCases
[t
].file
, NULL
, &len
, &status
);
395 resolvedFileName
= NEW_ARRAY(char, len
);
397 if(resolvedFileName
== NULL
){
401 if(status
== U_BUFFER_OVERFLOW_ERROR
){
402 status
= U_ZERO_ERROR
;
405 ucbuf_resolveFileName(udhrPath
, testCases
[t
].file
, resolvedFileName
, &len
, &status
);
406 ucharBuf
= ucbuf_open(resolvedFileName
, &encoding
, TRUE
, FALSE
, &status
);
408 DELETE_ARRAY(resolvedFileName
);
410 if(U_FAILURE(status
)){
411 infoln("Could not open the input file %s. Test skipped\n", testCases
[t
].file
);
415 int32_t targetLen
= 0;
416 const UChar
*target
= ucbuf_getBuffer(ucharBuf
, &targetLen
, &status
);
418 /* The first line of the file contains the pattern */
419 int32_t start
= 0, end
= 0, plen
= 0;
421 for(end
= start
; ; end
+= 1) {
422 UChar ch
= target
[end
];
424 if (ch
== 0x000A || ch
== 0x000D || ch
== 0x2028) {
431 UChar
*pattern
= NEW_ARRAY(UChar
, plen
);
432 for (int32_t i
= 0; i
< plen
; i
+= 1) {
433 pattern
[i
] = target
[start
++];
437 UCollator
*coll
= ucol_open(testCases
[t
].locale
, &status
);
441 if (U_FAILURE(status
)) {
442 errln("Could not open collator for %s", testCases
[t
].locale
);
443 goto delete_collator
;
446 ucd
= ucd_open(coll
, &status
);
448 if (U_FAILURE(status
)) {
449 errln("Could not open CollData object for %s", testCases
[t
].locale
);
453 bms
= bms_open(ucd
, pattern
, plen
, target
, targetLen
, &status
);
455 if (U_FAILURE(status
)) {
456 errln("Could not open search object for %s", testCases
[t
].locale
);
461 while (bms_search(bms
, offset
, &start
, &end
)) {
466 errln("Could not find pattern - locale: %s, file: %s ", testCases
[t
].locale
, testCases
[t
].file
);
478 DELETE_ARRAY(pattern
);
479 ucbuf_close(ucharBuf
);
485 void SSearchTest::bmSearchTest()
487 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
488 UErrorCode status
= U_ZERO_ERROR
;
489 char path
[PATH_BUFFER_SIZE
];
490 const char *testFilePath
= getPath(path
, "ssearch.xml");
492 if (testFilePath
== NULL
) {
493 return; /* Couldn't get path: error message already output. */
496 UXMLParser
*parser
= UXMLParser::createParser(status
);
497 TEST_ASSERT_SUCCESS(status
);
498 UXMLElement
*root
= parser
->parseFile(testFilePath
, status
);
499 TEST_ASSERT_SUCCESS(status
);
500 if (U_FAILURE(status
)) {
504 const UnicodeString
*debugTestCase
= root
->getAttribute("debug");
505 if (debugTestCase
!= NULL
) {
506 // setenv("USEARCH_DEBUG", "1", 1);
510 const UXMLElement
*testCase
;
513 while((testCase
= root
->nextChildElement(tc
)) != NULL
) {
515 if (testCase
->getTagName().compare("test-case") != 0) {
516 errln("ssearch, unrecognized XML Element in test file");
519 const UnicodeString
*id
= testCase
->getAttribute("id");
522 id
->extract(0, id
->length(), testId
, sizeof(testId
), US_INV
);
525 // If debugging test case has been specified and this is not it, skip to next.
526 if (id
!=NULL
&& debugTestCase
!=NULL
&& *id
!= *debugTestCase
) {
530 // Get the requested collation strength.
531 // Default is tertiary if the XML attribute is missing from the test case.
533 const UnicodeString
*strength
= testCase
->getAttribute("strength");
534 UColAttributeValue collatorStrength
= UCOL_PRIMARY
;
535 if (strength
==NULL
) { collatorStrength
= UCOL_TERTIARY
;}
536 else if (*strength
=="PRIMARY") { collatorStrength
= UCOL_PRIMARY
;}
537 else if (*strength
=="SECONDARY") { collatorStrength
= UCOL_SECONDARY
;}
538 else if (*strength
=="TERTIARY") { collatorStrength
= UCOL_TERTIARY
;}
539 else if (*strength
=="QUATERNARY") { collatorStrength
= UCOL_QUATERNARY
;}
540 else if (*strength
=="IDENTICAL") { collatorStrength
= UCOL_IDENTICAL
;}
542 // Bogus value supplied for strength. Shouldn't happen, even from
543 // typos, if the XML source has been validated.
544 // This assert is a little deceiving in that strength can be
545 // any of the allowed values, not just TERTIARY, but it will
546 // do the job of getting the error output.
547 TEST_ASSERT(*strength
=="TERTIARY")
551 // Get the collator normalization flag. Default is UCOL_OFF.
553 UColAttributeValue normalize
= UCOL_OFF
;
554 const UnicodeString
*norm
= testCase
->getAttribute("norm");
555 TEST_ASSERT (norm
==NULL
|| *norm
=="ON" || *norm
=="OFF");
556 if (norm
!=NULL
&& *norm
=="ON") {
561 // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE.
563 UColAttributeValue alternateHandling
= UCOL_NON_IGNORABLE
;
564 const UnicodeString
*alt
= testCase
->getAttribute("alternate_handling");
565 TEST_ASSERT (alt
== NULL
|| *alt
== "SHIFTED" || *alt
== "NON_IGNORABLE");
566 if (alt
!= NULL
&& *alt
== "SHIFTED") {
567 alternateHandling
= UCOL_SHIFTED
;
570 const UnicodeString
defLocale("en");
572 const UnicodeString
*locale
= testCase
->getAttribute("locale");
573 if (locale
== NULL
|| locale
->length()==0) {
576 locale
->extract(0, locale
->length(), clocale
, sizeof(clocale
), NULL
);
580 UnicodeString target
;
581 UnicodeString pattern
;
582 int32_t expectedMatchStart
= -1;
583 int32_t expectedMatchLimit
= -1;
584 const UXMLElement
*n
;
585 int32_t nodeCount
= 0;
587 n
= testCase
->getChildElement("pattern");
588 TEST_ASSERT(n
!= NULL
);
592 text
= n
->getText(FALSE
);
593 text
= text
.unescape();
594 pattern
.append(text
);
597 n
= testCase
->getChildElement("pre");
599 text
= n
->getText(FALSE
);
600 text
= text
.unescape();
605 n
= testCase
->getChildElement("m");
607 expectedMatchStart
= target
.length();
608 text
= n
->getText(FALSE
);
609 text
= text
.unescape();
611 expectedMatchLimit
= target
.length();
615 n
= testCase
->getChildElement("post");
617 text
= n
->getText(FALSE
);
618 text
= text
.unescape();
623 // Check that there weren't extra things in the XML
624 TEST_ASSERT(nodeCount
== testCase
->countChildren());
626 // Open a collator and StringSearch based on the parameters
627 // obtained from the XML.
629 status
= U_ZERO_ERROR
;
630 UCollator
*collator
= ucol_open(clocale
, &status
);
631 ucol_setStrength(collator
, collatorStrength
);
632 ucol_setAttribute(collator
, UCOL_NORMALIZATION_MODE
, normalize
, &status
);
633 ucol_setAttribute(collator
, UCOL_ALTERNATE_HANDLING
, alternateHandling
, &status
);
634 UCD
*ucd
= ucd_open(collator
, &status
);
635 BMS
*bms
= bms_open(ucd
, pattern
.getBuffer(), pattern
.length(), target
.getBuffer(), target
.length(), &status
);
637 TEST_ASSERT_SUCCESS(status
);
638 if (U_FAILURE(status
)) {
641 ucol_close(collator
);
645 int32_t foundStart
= 0;
646 int32_t foundLimit
= 0;
650 // Do the search, check the match result against the expected results.
652 foundMatch
= bms_search(bms
, 0, &foundStart
, &foundLimit
);
653 //TEST_ASSERT_SUCCESS(status);
654 if ((foundMatch
&& expectedMatchStart
< 0) ||
655 (foundStart
!= expectedMatchStart
) ||
656 (foundLimit
!= expectedMatchLimit
)) {
657 TEST_ASSERT(FALSE
); // ouput generic error position
658 infoln("Found, expected match start = %d, %d \n"
659 "Found, expected match limit = %d, %d",
660 foundStart
, expectedMatchStart
, foundLimit
, expectedMatchLimit
);
665 ucol_close(collator
);
// Member declarations of OrderList (the class header is not part of this excerpt).

// Builds the list from the collation elements that coll produces for string,
// with iteration positioned at stringOffset (0 by default).
685 OrderList(UCollator
*coll
, const UnicodeString
&string
, int32_t stringOffset
= 0);
// Number of entries in the list.
688 int32_t size(void) const;
// Appends an entry: a collation order plus the low and high source offsets
// recorded for it.
689 void add(int32_t order
, int32_t low
, int32_t high
);
// Entry access by index, followed by per-field accessors for the same entry.
690 const Order
*get(int32_t index
) const;
691 int32_t getLowOffset(int32_t index
) const;
692 int32_t getHighOffset(int32_t index
) const;
693 int32_t getOrder(int32_t index
) const;
// Entry-by-entry comparison of order, low offset and high offset with other.
695 UBool
compare(const OrderList
&other
) const;
// Compares the orders of other against this list starting at entry `offset`.
696 UBool
matchesAt(int32_t offset
, const OrderList
&other
) const;
704 OrderList::OrderList()
705 : list(NULL
), listMax(16), listSize(0)
707 list
= new Order
[listMax
];
710 OrderList::OrderList(UCollator
*coll
, const UnicodeString
&string
, int32_t stringOffset
)
711 : list(NULL
), listMax(16), listSize(0)
713 UErrorCode status
= U_ZERO_ERROR
;
714 UCollationElements
*elems
= ucol_openElements(coll
, string
.getBuffer(), string
.length(), &status
);
715 uint32_t strengthMask
= 0;
716 int32_t order
, low
, high
;
718 switch (ucol_getStrength(coll
))
721 strengthMask
|= UCOL_TERTIARYORDERMASK
;
725 strengthMask
|= UCOL_SECONDARYORDERMASK
;
729 strengthMask
|= UCOL_PRIMARYORDERMASK
;
732 list
= new Order
[listMax
];
734 ucol_setOffset(elems
, stringOffset
, &status
);
737 low
= ucol_getOffset(elems
);
738 order
= ucol_next(elems
, &status
);
739 high
= ucol_getOffset(elems
);
741 if (order
!= UCOL_NULLORDER
) {
742 order
&= strengthMask
;
745 if (order
!= UCOL_IGNORABLE
) {
746 add(order
, low
, high
);
748 } while (order
!= UCOL_NULLORDER
);
750 ucol_closeElements(elems
);
753 OrderList::~OrderList()
758 void OrderList::add(int32_t order
, int32_t low
, int32_t high
)
760 if (listSize
>= listMax
) {
763 Order
*newList
= new Order
[listMax
];
765 uprv_memcpy(newList
, list
, listSize
* sizeof(Order
));
770 list
[listSize
].order
= order
;
771 list
[listSize
].lowOffset
= low
;
772 list
[listSize
].highOffset
= high
;
777 const Order
*OrderList::get(int32_t index
) const
779 if (index
>= listSize
) {
786 int32_t OrderList::getLowOffset(int32_t index
) const
788 const Order
*order
= get(index
);
791 return order
->lowOffset
;
797 int32_t OrderList::getHighOffset(int32_t index
) const
799 const Order
*order
= get(index
);
802 return order
->highOffset
;
808 int32_t OrderList::getOrder(int32_t index
) const
810 const Order
*order
= get(index
);
816 return UCOL_NULLORDER
;
819 int32_t OrderList::size() const
824 void OrderList::reverse()
826 for(int32_t f
= 0, b
= listSize
- 1; f
< b
; f
+= 1, b
-= 1) {
827 Order swap
= list
[b
];
834 UBool
OrderList::compare(const OrderList
&other
) const
836 if (listSize
!= other
.listSize
) {
840 for(int32_t i
= 0; i
< listSize
; i
+= 1) {
841 if (list
[i
].order
!= other
.list
[i
].order
||
842 list
[i
].lowOffset
!= other
.list
[i
].lowOffset
||
843 list
[i
].highOffset
!= other
.list
[i
].highOffset
) {
851 UBool
OrderList::matchesAt(int32_t offset
, const OrderList
&other
) const
853 // NOTE: sizes include the NULLORDER, which we don't want to compare.
854 int32_t otherSize
= other
.size() - 1;
856 if (listSize
- 1 - offset
< otherSize
) {
860 for (int32_t i
= offset
, j
= 0; j
< otherSize
; i
+= 1, j
+= 1) {
861 if (getOrder(i
) != other
.getOrder(j
)) {
869 static char *printOffsets(char *buffer
, OrderList
&list
)
871 int32_t size
= list
.size();
874 for(int32_t i
= 0; i
< size
; i
+= 1) {
875 const Order
*order
= list
.get(i
);
878 s
+= sprintf(s
, ", ");
881 s
+= sprintf(s
, "(%d, %d)", order
->lowOffset
, order
->highOffset
);
887 static char *printOrders(char *buffer
, OrderList
&list
)
889 int32_t size
= list
.size();
892 for(int32_t i
= 0; i
< size
; i
+= 1) {
893 const Order
*order
= list
.get(i
);
896 s
+= sprintf(s
, ", ");
899 s
+= sprintf(s
, "%8.8X", order
->order
);
// Checks collation element iterator offsets: for every test string the
// (order, low offset, high offset) triples collected while iterating forwards
// must equal the triples collected while iterating backwards, once the
// backward list has been reversed.
905 void SSearchTest::offsetTest()
// Test strings, written with \u escapes that CharsToUnicodeString() expands below.
907 const char *test
[] = {
908 // The sequence \u0FB3\u0F71\u0F71\u0F80 contains a discontiguous
909 // contraction (\u0FB3\u0F71\u0F80) logically followed by \u0F71.
910 "\\u1E33\\u0FB3\\u0F71\\u0F71\\u0F80\\uD835\\uDF6C\\u01B0",
912 "\\ua191\\u16ef\\u2036\\u017a",
915 // This results in a complex interaction between contraction,
916 // expansion and normalization that confuses the backwards offset fixups.
917 "\\u0F7F\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
920 "\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
921 "\\u07E9\\u07EA\\u07F1\\u07F2\\u07F3",
924 "\\u0300\\u0301\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u0309\\u030A\\u030B\\u030C\\u030D\\u030E\\u030F"
925 "\\u0310\\u0311\\u0312\\u0313\\u0314\\u0315\\u0316\\u0317\\u0318\\u0319\\u031A\\u031B\\u031C\\u031D\\u031E\\u031F"
926 "\\u0320\\u0321\\u0322\\u0323\\u0324\\u0325\\u0326\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F"
927 "\\u0330\\u0331\\u0332\\u0333\\u0334\\u0335\\u0336\\u0337\\u0338\\u0339\\u033A\\u033B\\u033C\\u033D\\u033E\\u033F"
928 "\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\\u034A\\u034B\\u034C\\u034D\\u034E", // currently not working, see #8081
930 "\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318", // currently not working, see #8081
931 "a\\u02FF\\u0301\\u0316", // currently not working, see #8081
932 "a\\u02FF\\u0316\\u0301",
933 "a\\u0430\\u0301\\u0316",
934 "a\\u0430\\u0316\\u0301",
935 "abc\\u0E41\\u0301\\u0316",
936 "abc\\u0E41\\u0316\\u0301",
937 "\\u0E41\\u0301\\u0316",
938 "\\u0E41\\u0316\\u0301",
952 "A\\u0302\\u0301\\u0323B",
956 " \\uD800\\uDC00\\uDC00",
957 "a\\uD800\\uDC00\\uDC00",
963 "\\u0301A\\u0301\\u0301",
969 int32_t testCount
= ARRAY_SIZE(test
);
970 UErrorCode status
= U_ZERO_ERROR
;
// The test runs with an English collator.
971 RuleBasedCollator
*col
= (RuleBasedCollator
*) Collator::createInstance(Locale::getEnglish(), status
);
972 if (U_FAILURE(status
)) {
973 errcheckln(status
, "Failed to create collator in offsetTest! - %s", u_errorName(status
));
976 char buffer
[4096]; // A bit of a hack... just happens to be long enough for all the test cases...
977 // We could allocate one that's the right size by (CE_count * 10) + 2
978 // 10 chars is enough room for 8 hex digits plus ", ". 2 extra chars for "[" and "]"
// Normalization is switched on for the whole test.
980 col
->setAttribute(UCOL_NORMALIZATION_MODE
, UCOL_ON
, status
);
982 for(int32_t i
= 0; i
< testCount
; i
+= 1) {
// Entries 4..6 are skipped on ICU versions before 50.0 (see the ticket noted below).
983 if (!isICUVersionAtLeast(50, 0) && i
>=4 && i
<=6) {
984 continue; // timebomb until ticket #8080 is resolved
986 UnicodeString ts
= CharsToUnicodeString(test
[i
]);
987 CollationElementIterator
*iter
= col
->createCollationElementIterator(ts
);
988 OrderList forwardList
;
989 OrderList backwardList
;
990 int32_t order
, low
, high
;
// Forward pass: record each order with the iterator offsets before and after next().
993 low
= iter
->getOffset();
994 order
= iter
->next(status
);
995 high
= iter
->getOffset();
997 forwardList
.add(order
, low
, high
);
998 } while (order
!= CollationElementIterator::NULLORDER
);
// Backward pass: start from the end of the string.
1001 iter
->setOffset(ts
.length(), status
);
// Seed the backward list with a NULLORDER entry at the end offset so that,
// after reverse(), it lines up with the NULLORDER that ends the forward list.
1003 backwardList
.add(CollationElementIterator::NULLORDER
, iter
->getOffset(), iter
->getOffset());
1006 high
= iter
->getOffset();
1007 order
= iter
->previous(status
);
1008 low
= iter
->getOffset();
1010 if (order
== CollationElementIterator::NULLORDER
) {
1014 backwardList
.add(order
, low
, high
);
1017 backwardList
.reverse();
// Matching lists are only logged; mismatching lists are reported as errors,
// with both lists dumped.
1019 if (forwardList
.compare(backwardList
)) {
1020 logln("Works with \"%s\"", test
[i
]);
1021 logln("Forward offsets: [%s]", printOffsets(buffer
, forwardList
));
1022 // logln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
1024 logln("Forward CEs: [%s]", printOrders(buffer
, forwardList
));
1025 // logln("Backward CEs: [%s]", printOrders(buffer, backwardList));
1029 errln("Fails with \"%s\"", test
[i
]);
1030 infoln("Forward offsets: [%s]", printOffsets(buffer
, forwardList
));
1031 infoln("Backward offsets: [%s]", printOffsets(buffer
, backwardList
));
1033 infoln("Forward CEs: [%s]", printOrders(buffer
, forwardList
));
1034 infoln("Backward CEs: [%s]", printOrders(buffer
, backwardList
));
1044 static UnicodeString
&escape(const UnicodeString
&string
, UnicodeString
&buffer
)
1046 for(int32_t i
= 0; i
< string
.length(); i
+= 1) {
1047 UChar32 ch
= string
.char32At(i
);
1049 if (ch
>= 0x0020 && ch
<= 0x007F) {
1051 buffer
.append("\\\\");
1058 if (ch
<= 0xFFFFL
) {
1059 sprintf(cbuffer
, "\\u%4.4X", ch
);
1061 sprintf(cbuffer
, "\\U%8.8X", ch
);
1064 buffer
.append(cbuffer
);
1067 if (ch
>= 0x10000L
) {
// Member declarations of PCEList (the class header is not part of this excerpt).

// Builds the list from the processed collation elements that coll produces
// for string.
1088 PCEList(UCollator
*coll
, const UnicodeString
&string
);
// Number of entries in the list.
1091 int32_t size() const;
// Entry access by index, followed by per-field accessors for the same entry.
1093 const PCE
*get(int32_t index
) const;
1095 int32_t getLowOffset(int32_t index
) const;
1096 int32_t getHighOffset(int32_t index
) const;
1097 uint64_t getOrder(int32_t index
) const;
// Compares the orders of other against this list starting at entry `offset`.
1099 UBool
matchesAt(int32_t offset
, const PCEList
&other
) const;
// Same as getOrder(index).
1101 uint64_t operator[](int32_t index
) const;
// Appends an entry: a processed collation element plus its low and high
// source offsets.
1104 void add(uint64_t ce
, int32_t low
, int32_t high
);
1111 PCEList::PCEList(UCollator
*coll
, const UnicodeString
&string
)
1113 UErrorCode status
= U_ZERO_ERROR
;
1114 UCollationElements
*elems
= ucol_openElements(coll
, string
.getBuffer(), string
.length(), &status
);
1118 list
= new PCE
[listMax
];
1120 ucol_setOffset(elems
, 0, &status
);
1123 order
= ucol_nextProcessed(elems
, &low
, &high
, &status
);
1124 add(order
, low
, high
);
1125 } while (order
!= UCOL_PROCESSED_NULLORDER
);
1127 ucol_closeElements(elems
);
1135 void PCEList::add(uint64_t order
, int32_t low
, int32_t high
)
1137 if (listSize
>= listMax
) {
1140 PCE
*newList
= new PCE
[listMax
];
1142 uprv_memcpy(newList
, list
, listSize
* sizeof(Order
));
1147 list
[listSize
].ce
= order
;
1148 list
[listSize
].lowOffset
= low
;
1149 list
[listSize
].highOffset
= high
;
1154 const PCE
*PCEList::get(int32_t index
) const
1156 if (index
>= listSize
) {
1160 return &list
[index
];
1163 int32_t PCEList::getLowOffset(int32_t index
) const
1165 const PCE
*pce
= get(index
);
1168 return pce
->lowOffset
;
1174 int32_t PCEList::getHighOffset(int32_t index
) const
1176 const PCE
*pce
= get(index
);
1179 return pce
->highOffset
;
1185 uint64_t PCEList::getOrder(int32_t index
) const
1187 const PCE
*pce
= get(index
);
1193 return UCOL_PROCESSED_NULLORDER
;
1196 int32_t PCEList::size() const
1201 UBool
PCEList::matchesAt(int32_t offset
, const PCEList
&other
) const
1203 // NOTE: sizes include the NULLORDER, which we don't want to compare.
1204 int32_t otherSize
= other
.size() - 1;
1206 if (listSize
- 1 - offset
< otherSize
) {
1210 for (int32_t i
= offset
, j
= 0; j
< otherSize
; i
+= 1, j
+= 1) {
1211 if (getOrder(i
) != other
.getOrder(j
)) {
1219 uint64_t PCEList::operator[](int32_t index
) const
1221 return getOrder(index
);
1224 void SSearchTest::boyerMooreTest()
1226 UErrorCode status
= U_ZERO_ERROR
;
1227 UCollator
*coll
= NULL
;
1228 CollData
*data
= NULL
;
1229 const CEList
* ce
= NULL
;
1230 const CEList
* ce1
= NULL
;
1231 UnicodeString lp
= "fuss";
1232 UnicodeString sp
= "fu\\u00DF";
1233 BoyerMooreSearch
*longPattern
= NULL
;
1234 BoyerMooreSearch
*shortPattern
= NULL
;
1235 UnicodeString targets
[] = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball",
1236 "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF",
1237 "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"};
1238 int32_t start
= -1, end
= -1;
1240 coll
= ucol_openFromShortString("LEN_S1", FALSE
, NULL
, &status
);
1241 if (U_FAILURE(status
)) {
1242 errcheckln(status
, "Could not open collator. - %s", u_errorName(status
));
1246 data
= CollData::open(coll
, status
);
1247 if (U_FAILURE(status
)) {
1248 errln("Could not open CollData object.");
1252 data
->getDynamicClassID();
1253 if (U_FAILURE(status
)) {
1254 errln("Could not get dynamic class ID of CollData.");
1255 goto close_patterns
;
1258 data
->getStaticClassID();
1259 if (U_FAILURE(status
)) {
1260 errln("Could not get static class ID of CollData.");
1261 goto close_patterns
;
1264 longPattern
= new BoyerMooreSearch(data
, lp
.unescape(), NULL
, status
);
1265 shortPattern
= new BoyerMooreSearch(data
, sp
.unescape(), NULL
, status
);
1266 if (U_FAILURE(status
)) {
1267 errln("Could not create pattern objects.");
1268 goto close_patterns
;
1271 longPattern
->getBadCharacterTable();
1272 shortPattern
->getBadCharacterTable();
1273 if (U_FAILURE(status
)) {
1274 errln("Could not get bad character table.");
1275 goto close_patterns
;
1278 longPattern
->getGoodSuffixTable();
1279 shortPattern
->getGoodSuffixTable();
1280 if (U_FAILURE(status
)) {
1281 errln("Could not get good suffix table.");
1282 goto close_patterns
;
1285 longPattern
->getDynamicClassID();
1286 shortPattern
->getDynamicClassID();
1287 if (U_FAILURE(status
)) {
1288 errln("Could not get dynamic class ID of BoyerMooreSearch.");
1289 goto close_patterns
;
1292 longPattern
->getStaticClassID();
1293 shortPattern
->getStaticClassID();
1294 if (U_FAILURE(status
)) {
1295 errln("Could not get static class ID of BoyerMooreSearch.");
1296 goto close_patterns
;
1299 longPattern
->getData();
1300 shortPattern
->getData();
1301 if (U_FAILURE(status
)) {
1302 errln("Could not get collate data.");
1303 goto close_patterns
;
1306 ce
= longPattern
->getPatternCEs();
1307 ce1
= shortPattern
->getPatternCEs();
1308 if (U_FAILURE(status
)) {
1309 errln("Could not get pattern CEs.");
1310 goto close_patterns
;
1313 ce
->getDynamicClassID();
1314 ce1
->getDynamicClassID();
1315 if (U_FAILURE(status
)) {
1316 errln("Could not get dynamic class ID of CEList.");
1317 goto close_patterns
;
1320 ce
->getStaticClassID();
1321 ce1
->getStaticClassID();
1322 if (U_FAILURE(status
)) {
1323 errln("Could not get static class ID of CEList.");
1324 goto close_patterns
;
1327 if(data
->minLengthInChars(ce
,0) != 3){
1328 errln("Minimal Length in Characters for 'data' with 'ce' was suppose to give 3.");
1329 goto close_patterns
;
1332 if(data
->minLengthInChars(ce1
,0) != 3){
1333 errln("Minimal Length in Characters for 'data' with 'ce1' was suppose to give 3.");
1334 goto close_patterns
;
1337 for (uint32_t t
= 0; t
< (sizeof(targets
)/sizeof(targets
[0])); t
+= 1) {
1338 UnicodeString target
= targets
[t
].unescape();
1340 longPattern
->setTargetString(&target
, status
);
1341 if (longPattern
->search(0, start
, end
)) {
1342 logln("Test %d: found long pattern at [%d, %d].", t
, start
, end
);
1344 errln("Test %d: did not find long pattern.", t
);
1347 shortPattern
->setTargetString(&target
, status
);
1348 if (shortPattern
->search(0, start
, end
)) {
1349 logln("Test %d: found short pattern at [%d, %d].", t
, start
, end
);
1351 errln("Test %d: did not find short pattern.", t
);
1354 if(longPattern
->empty()){
1355 errln("Test %d: Long pattern should not have been empty.");
1358 if(shortPattern
->empty()){
1359 errln("Test %d: Short pattern should not have been empty.");
1364 delete shortPattern
;
1368 CollData::close(data
);
1372 void SSearchTest::bmsTest()
1374 UErrorCode status
= U_ZERO_ERROR
;
1375 UCollator
*coll
= NULL
;
1377 UnicodeString lp
= "fuss";
1378 UnicodeString lpu
= lp
.unescape();
1379 UnicodeString sp
= "fu\\u00DF";
1380 UnicodeString spu
= sp
.unescape();
1381 BMS
*longPattern
= NULL
;
1382 BMS
*shortPattern
= NULL
;
1383 UnicodeString targets
[] = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball",
1384 "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF",
1385 "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"};
1386 int32_t start
= -1, end
= -1;
1388 coll
= ucol_openFromShortString("LEN_S1", FALSE
, NULL
, &status
);
1389 if (U_FAILURE(status
)) {
1390 errcheckln(status
, "Could not open collator. - %s", u_errorName(status
));
1394 data
= ucd_open(coll
, &status
);
1395 if (U_FAILURE(status
)) {
1396 errln("Could not open CollData object.");
1400 longPattern
= bms_open(data
, lpu
.getBuffer(), lpu
.length(), NULL
, 0, &status
);
1401 shortPattern
= bms_open(data
, spu
.getBuffer(), spu
.length(), NULL
, 0, &status
);
1402 if (U_FAILURE(status
)) {
1403 errln("Couldn't open pattern objects.");
1404 goto close_patterns
;
1407 for (uint32_t t
= 0; t
< (sizeof(targets
)/sizeof(targets
[0])); t
+= 1) {
1408 UnicodeString target
= targets
[t
].unescape();
1410 bms_setTargetString(longPattern
, target
.getBuffer(), target
.length(), &status
);
1411 if (bms_search(longPattern
, 0, &start
, &end
)) {
1412 logln("Test %d: found long pattern at [%d, %d].", t
, start
, end
);
1414 errln("Test %d: did not find long pattern.", t
);
1417 bms_setTargetString(shortPattern
, target
.getBuffer(), target
.length(), &status
);
1418 if (bms_search(shortPattern
, 0, &start
, &end
)) {
1419 logln("Test %d: found short pattern at [%d, %d].", t
, start
, end
);
1421 errln("Test %d: did not find short pattern.", t
);
1425 /* Add better coverage for bms code. */
1426 if(bms_empty(longPattern
)) {
1427 errln("FAIL: longgPattern is empty.");
1430 if (!bms_getData(longPattern
)) {
1431 errln("FAIL: bms_getData returned NULL.");
1434 if (!ucd_getCollator(data
)) {
1435 errln("FAIL: ucd_getCollator returned NULL.");
1439 bms_close(shortPattern
);
1440 bms_close(longPattern
);
1448 void SSearchTest::goodSuffixTest()
1450 UErrorCode status
= U_ZERO_ERROR
;
1451 UCollator
*coll
= NULL
;
1452 CollData
*data
= NULL
;
1453 UnicodeString pat
= /*"gcagagag"*/ "fxeld";
1454 UnicodeString target
= /*"gcatcgcagagagtatacagtacg"*/ "cloveldfxeld";
1455 BoyerMooreSearch
*pattern
= NULL
;
1456 int32_t start
= -1, end
= -1;
1458 coll
= ucol_open(NULL
, &status
);
1459 if (U_FAILURE(status
)) {
1460 errcheckln(status
, "Couldn't open collator. - %s", u_errorName(status
));
1464 data
= CollData::open(coll
, status
);
1465 if (U_FAILURE(status
)) {
1466 errln("Couldn't open CollData object.");
1470 pattern
= new BoyerMooreSearch(data
, pat
, &target
, status
);
1471 if (U_FAILURE(status
)) {
1472 errln("Couldn't open pattern object.");
1476 if (pattern
->search(0, start
, end
)) {
1477 logln("Found pattern at [%d, %d].", start
, end
);
1479 errln("Did not find pattern.");
1486 CollData::close(data
);
1491 // searchTime() A quick and dirty performance test for string search.
1492 // Probably doesn't really belong as part of intltest, but it
1493 // does check that the search succeeds, and gets the right result,
1494 // so it serves as a functionality test also.
1496 // To run as a perf test, up the loop count, select by commenting
1497 // and uncommenting in the code the operation to be measured,
1498 // rebuild, and measure the running time of this test alone.
1500 // time LD_LIBRARY_PATH=whatever ./intltest collate/SSearchTest/searchTime
1502 void SSearchTest::searchTime() {
1503 static const char *longishText
=
1504 "Whylom, as olde stories tellen us,\n"
1505 "Ther was a duk that highte Theseus:\n"
1506 "Of Athenes he was lord and governour,\n"
1507 "And in his tyme swich a conquerour,\n"
1508 "That gretter was ther noon under the sonne.\n"
1509 "Ful many a riche contree hadde he wonne;\n"
1510 "What with his wisdom and his chivalrye,\n"
1511 "He conquered al the regne of Femenye,\n"
1512 "That whylom was y-cleped Scithia;\n"
1513 "And weddede the quene Ipolita,\n"
1514 "And broghte hir hoom with him in his contree\n"
1515 "With muchel glorie and greet solempnitee,\n"
1516 "And eek hir yonge suster Emelye.\n"
1517 "And thus with victorie and with melodye\n"
1518 "Lete I this noble duk to Athenes ryde,\n"
1519 "And al his hoost, in armes, him bisyde.\n"
1520 "And certes, if it nere to long to here,\n"
1521 "I wolde han told yow fully the manere,\n"
1522 "How wonnen was the regne of Femenye\n"
1523 "By Theseus, and by his chivalrye;\n"
1524 "And of the grete bataille for the nones\n"
1525 "Bitwixen Athen's and Amazones;\n"
1526 "And how asseged was Ipolita,\n"
1527 "The faire hardy quene of Scithia;\n"
1528 "And of the feste that was at hir weddinge,\n"
1529 "And of the tempest at hir hoom-cominge;\n"
1530 "But al that thing I moot as now forbere.\n"
1531 "I have, God woot, a large feeld to ere,\n"
1532 "And wayke been the oxen in my plough.\n"
1533 "The remenant of the tale is long y-nough.\n"
1534 "I wol nat letten eek noon of this route;\n"
1535 "Lat every felawe telle his tale aboute,\n"
1536 "And lat see now who shal the soper winne;\n"
1537 "And ther I lefte, I wol ageyn biginne.\n"
1538 "This duk, of whom I make mencioun,\n"
1539 "When he was come almost unto the toun,\n"
1540 "In al his wele and in his moste pryde,\n"
1541 "He was war, as he caste his eye asyde,\n"
1542 "Wher that ther kneled in the hye weye\n"
1543 "A companye of ladies, tweye and tweye,\n"
1544 "Ech after other, clad in clothes blake; \n"
1545 "But swich a cry and swich a wo they make,\n"
1546 "That in this world nis creature livinge,\n"
1547 "That herde swich another weymentinge;\n"
1548 "And of this cry they nolde never stenten,\n"
1549 "Til they the reynes of his brydel henten.\n"
1550 "'What folk ben ye, that at myn hoomcominge\n"
1551 "Perturben so my feste with cryinge'?\n"
1552 "Quod Theseus, 'have ye so greet envye\n"
1553 "Of myn honour, that thus compleyne and crye? \n"
1554 "Or who hath yow misboden, or offended?\n"
1555 "And telleth me if it may been amended;\n"
1556 "And why that ye ben clothed thus in blak'?\n"
1557 "The eldest lady of hem alle spak,\n"
1558 "When she hadde swowned with a deedly chere,\n"
1559 "That it was routhe for to seen and here,\n"
1560 "And seyde: 'Lord, to whom Fortune hath yiven\n"
1561 "Victorie, and as a conquerour to liven,\n"
1562 "Noght greveth us your glorie and your honour;\n"
1563 "But we biseken mercy and socour.\n"
1564 "Have mercy on our wo and our distresse.\n"
1565 "Som drope of pitee, thurgh thy gentilesse,\n"
1566 "Up-on us wrecched wommen lat thou falle.\n"
1567 "For certes, lord, ther nis noon of us alle,\n"
1568 "That she nath been a duchesse or a quene;\n"
1569 "Now be we caitifs, as it is wel sene:\n"
1570 "Thanked be Fortune, and hir false wheel,\n"
1571 "That noon estat assureth to be weel.\n"
1572 "And certes, lord, t'abyden your presence,\n"
1573 "Here in the temple of the goddesse Clemence\n"
1574 "We han ben waytinge al this fourtenight;\n"
1575 "Now help us, lord, sith it is in thy might.\n"
1576 "I wrecche, which that wepe and waille thus,\n"
1577 "Was whylom wyf to king Capaneus,\n"
1578 "That starf at Thebes, cursed be that day!\n"
1579 "And alle we, that been in this array,\n"
1580 "And maken al this lamentacioun,\n"
1581 "We losten alle our housbondes at that toun,\n"
1582 "Whyl that the sege ther-aboute lay.\n"
1583 "And yet now th'olde Creon, weylaway!\n"
1584 "The lord is now of Thebes the citee, \n"
1585 "Fulfild of ire and of iniquitee,\n"
1586 "He, for despyt, and for his tirannye,\n"
1587 "To do the dede bodyes vileinye,\n"
1588 "Of alle our lordes, whiche that ben slawe,\n"
1589 "Hath alle the bodyes on an heep y-drawe,\n"
1590 "And wol nat suffren hem, by noon assent,\n"
1591 "Neither to been y-buried nor y-brent,\n"
1592 "But maketh houndes ete hem in despyt. zet'\n";
1594 #define TEST_BOYER_MOORE 1
1595 const char *cPattern
= "maketh houndes ete hem";
1596 //const char *cPattern = "Whylom";
1597 //const char *cPattern = "zet";
1598 const char *testId
= "searchTime()"; // for error macros.
1599 UnicodeString target
= longishText
;
1600 UErrorCode status
= U_ZERO_ERROR
;
1603 LocalUCollatorPointer
collator(ucol_open("en", &status
));
1604 CollData
*data
= CollData::open(collator
.getAlias(), status
);
1605 if (U_FAILURE(status
) || collator
.isNull() || data
== NULL
) {
1606 errcheckln(status
, "Unable to open UCollator or CollData. - %s", u_errorName(status
));
1609 //ucol_setStrength(collator.getAlias(), collatorStrength);
1610 //ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status);
1611 UnicodeString uPattern
= cPattern
;
1612 #ifndef TEST_BOYER_MOORE
1613 LocalUStringSearchPointer
uss(usearch_openFromCollator(uPattern
.getBuffer(), uPattern
.length(),
1614 target
.getBuffer(), target
.length(),
1615 collator
.getAlias(),
1616 NULL
, // the break iterator
1618 TEST_ASSERT_SUCCESS(status
);
1620 BoyerMooreSearch
bms(data
, uPattern
, &target
, status
);
1621 TEST_ASSERT_SUCCESS(status
);
1624 // int32_t foundStart;
1625 // int32_t foundEnd;
1628 // Find the match position usgin strstr
1629 const char *pm
= strstr(longishText
, cPattern
);
1630 TEST_ASSERT_M(pm
!=NULL
, "No pattern match with strstr");
1631 int32_t refMatchPos
= (int32_t)(pm
- longishText
);
1632 int32_t icuMatchPos
;
1633 int32_t icuMatchEnd
;
1634 #ifndef TEST_BOYER_MOORE
1635 usearch_search(uss
.getAlias(), 0, &icuMatchPos
, &icuMatchEnd
, &status
);
1636 TEST_ASSERT_SUCCESS(status
);
1638 found
= bms
.search(0, icuMatchPos
, icuMatchEnd
);
1640 TEST_ASSERT_M(refMatchPos
== icuMatchPos
, "strstr and icu give different match positions.");
1645 // Try loopcounts around 100000 to some millions, depending on the operation,
1646 // to get runtimes of at least several seconds.
1647 for (i
=0; i
<10000; i
++) {
1648 #ifndef TEST_BOYER_MOORE
1649 found
= usearch_search(uss
.getAlias(), 0, &icuMatchPos
, &icuMatchEnd
, &status
);
1651 found
= bms
.search(0, icuMatchPos
, icuMatchEnd
);
1653 //TEST_ASSERT_SUCCESS(status);
1654 //TEST_ASSERT(found);
1656 // usearch_setOffset(uss.getAlias(), 0, &status);
1657 // icuMatchPos = usearch_next(uss.getAlias(), &status);
1659 // The i+j stuff is to confuse the optimizer and get it to actually leave the
1660 // call to strstr in place.
1661 //pm = strstr(longishText+j, cPattern);
1665 //printf("%ld, %d\n", pm-longishText, j);
1666 #ifdef TEST_BOYER_MOORE
1667 CollData::close(data
);
1672 //----------------------------------------------------------------------------------------
1674 // Random Numbers. Similar to standard lib rand() and srand()
1675 // Not using library to
1676 // 1. Get same results on all platforms.
1677 // 2. Get access to current seed, to more easily reproduce failures.
1679 //---------------------------------------------------------------------------------------
// Current state of the generator.  It is a file-level variable so that a
// test can read it to report the seed, or assign it to replay a sequence.
static uint32_t m_seed = 1;

// Advances the linear congruential generator by one step and returns a
// value in the range [0, 32767], taken from bits 16..30 of the new state.
static uint32_t m_rand()
{
    const uint32_t nextState = m_seed * 1103515245 + 12345;

    m_seed = nextState;

    // Same as (nextState / 65536) % 32768 for an unsigned 32-bit value.
    return (nextState >> 16) & 0x7FFF;
}
1691 virtual void append(UnicodeString
&test
, UnicodeString
&alternate
) = 0;
1708 class SetMonkey
: public Monkey
1711 SetMonkey(const USet
*theSet
);
1714 virtual void append(UnicodeString
&test
, UnicodeString
&alternate
);
1720 SetMonkey::SetMonkey(const USet
*theSet
)
1721 : Monkey(), set(theSet
)
1726 SetMonkey::~SetMonkey()
1731 void SetMonkey::append(UnicodeString
&test
, UnicodeString
&alternate
)
1733 int32_t size
= uset_size(set
);
1734 int32_t index
= m_rand() % size
;
1735 UChar32 ch
= uset_charAt(set
, index
);
1736 UnicodeString
str(ch
);
1739 alternate
.append(str
); // flip case, or some junk?
1742 class StringSetMonkey
: public Monkey
1745 StringSetMonkey(const USet
*theSet
, UCollator
*theCollator
, CollData
*theCollData
);
1748 void append(UnicodeString
&testCase
, UnicodeString
&alternate
);
1751 UnicodeString
&generateAlternative(const UnicodeString
&testCase
, UnicodeString
&alternate
);
1758 StringSetMonkey::StringSetMonkey(const USet
*theSet
, UCollator
*theCollator
, CollData
*theCollData
)
1759 : Monkey(), set(theSet
), coll(theCollator
), collData(theCollData
)
1764 StringSetMonkey::~StringSetMonkey()
1769 void StringSetMonkey::append(UnicodeString
&testCase
, UnicodeString
&alternate
)
1771 int32_t itemCount
= uset_getItemCount(set
), len
= 0;
1772 int32_t index
= m_rand() % itemCount
;
1773 UChar32 rangeStart
= 0, rangeEnd
= 0;
1775 UErrorCode err
= U_ZERO_ERROR
;
1777 len
= uset_getItem(set
, index
, &rangeStart
, &rangeEnd
, buffer
, 16, &err
);
1780 int32_t offset
= m_rand() % (rangeEnd
- rangeStart
+ 1);
1781 UChar32 ch
= rangeStart
+ offset
;
1782 UnicodeString
str(ch
);
1784 testCase
.append(str
);
1785 generateAlternative(str
, alternate
);
1786 } else if (len
> 0) {
1787 // should check that len < 16...
1788 UnicodeString
str(buffer
, len
);
1790 testCase
.append(str
);
1791 generateAlternative(str
, alternate
);
1793 // shouldn't happen...
1797 UnicodeString
&StringSetMonkey::generateAlternative(const UnicodeString
&testCase
, UnicodeString
&alternate
)
1799 // find out shortest string for the longest sequence of ces.
1800 // needs to be refined to use dynamic programming, but will be roughly right
1801 UErrorCode status
= U_ZERO_ERROR
;
1802 CEList
ceList(coll
, testCase
, status
);
1806 if (ceList
.size() == 0) {
1807 return alternate
.append(testCase
);
1810 while (offset
< ceList
.size()) {
1811 int32_t ce
= ceList
.get(offset
);
1812 const StringList
*strings
= collData
->getStringList(ce
);
1814 if (strings
== NULL
) {
1815 return alternate
.append(testCase
);
1818 int32_t stringCount
= strings
->size();
1821 // find random string that generates the same CEList
1822 const CEList
*ceList2
= NULL
;
1823 const UnicodeString
*string
= NULL
;
1824 UBool matches
= FALSE
;
1827 int32_t s
= m_rand() % stringCount
;
1829 if (tries
++ > stringCount
) {
1830 alternate
.append(testCase
);
1834 string
= strings
->get(s
);
1835 ceList2
= collData
->getCEList(string
);
1836 matches
= ceList
.matchesAt(offset
, ceList2
);
1839 collData
->freeCEList((CEList
*) ceList2
);
1841 } while (! matches
);
1843 alt
.append(*string
);
1844 offset
+= ceList2
->size();
1845 collData
->freeCEList(ceList2
);
1848 const CEList
altCEs(coll
, alt
, status
);
1850 if (ceList
.matchesAt(0, &altCEs
)) {
1851 return alternate
.append(alt
);
1854 return alternate
.append(testCase
);
1857 static void generateTestCase(UCollator
*coll
, Monkey
*monkeys
[], int32_t monkeyCount
, UnicodeString
&testCase
, UnicodeString
&alternate
)
1859 int32_t pieces
= (m_rand() % 4) + 1;
1860 UErrorCode status
= U_ZERO_ERROR
;
1866 monkeys
[0]->append(testCase
, alternate
);
1868 for(int32_t piece
= 0; piece
< pieces
; piece
+= 1) {
1869 int32_t monkey
= m_rand() % monkeyCount
;
1871 monkeys
[monkey
]->append(testCase
, alternate
);
1874 const CEList
ceTest(coll
, testCase
, status
);
1875 const CEList
ceAlt(coll
, alternate
, status
);
1877 matches
= ceTest
.matchesAt(0, &ceAlt
);
1878 } while (! matches
);
1882 // Find the next acceptable boundary following the specified starting index
1883 // in the target text being searched.
1884 // TODO: refine what is an acceptable boundary. For the moment,
1885 // choose the next position not within a combining sequence.
1888 static int32_t nextBoundaryAfter(const UnicodeString
&string
, int32_t startIndex
) {
1889 const UChar
*text
= string
.getBuffer();
1890 int32_t textLen
= string
.length();
1892 if (startIndex
>= textLen
) {
1897 int32_t i
= startIndex
;
1899 U16_NEXT(text
, i
, textLen
, c
);
1901 // If we are on a control character, stop without looking for combining marks.
1902 // Control characters do not combine.
1903 int32_t gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
1904 if (gcProperty
==U_GCB_CONTROL
|| gcProperty
==U_GCB_LF
|| gcProperty
==U_GCB_CR
) {
1908 // The initial character was not a control, and can thus accept trailing
1909 // combining characters. Advance over however many of them there are.
1910 int32_t indexOfLastCharChecked
;
1913 indexOfLastCharChecked
= i
;
1919 U16_NEXT(text
, i
, textLen
, c
);
1920 gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
1922 if (gcProperty
!= U_GCB_EXTEND
&& gcProperty
!= U_GCB_SPACING_MARK
) {
1927 return indexOfLastCharChecked
;
1932 static UBool
isInCombiningSequence(const UnicodeString
&string
, int32_t index
) {
1933 const UChar
*text
= string
.getBuffer();
1934 int32_t textLen
= string
.length();
1936 if (index
>=textLen
|| index
<=0) {
1940 // If the character at the current index is not a GRAPHEME_EXTEND
1941 // then we can not be within a combining sequence.
1943 U16_GET(text
, 0, index
, textLen
, c
);
1944 int32_t gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
1945 if (gcProperty
!= U_GCB_EXTEND
&& gcProperty
!= U_GCB_SPACING_MARK
) {
1949 // We are at a combining mark. If the preceding character is anything
1950 // except a CONTROL, CR or LF, we are in a combining sequence.
1951 U16_PREV(text
, 0, index
, c
);
1952 gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
1954 return !(gcProperty
==U_GCB_CONTROL
|| gcProperty
==U_GCB_LF
|| gcProperty
==U_GCB_CR
);
1958 static UBool
simpleSearch(UCollator
*coll
, const UnicodeString
&target
, int32_t offset
, const UnicodeString
&pattern
, int32_t &matchStart
, int32_t &matchEnd
)
1960 UErrorCode status
= U_ZERO_ERROR
;
1961 OrderList
targetOrders(coll
, target
, offset
);
1962 OrderList
patternOrders(coll
, pattern
);
1963 int32_t targetSize
= targetOrders
.size() - 1;
1964 int32_t patternSize
= patternOrders
.size() - 1;
1965 UBreakIterator
*charBreakIterator
= ubrk_open(UBRK_CHARACTER
, ucol_getLocaleByType(coll
, ULOC_VALID_LOCALE
, &status
),
1966 target
.getBuffer(), target
.length(), &status
);
1968 if (patternSize
== 0) {
1969 // Searching for an empty pattern always fails
1970 matchStart
= matchEnd
= -1;
1971 ubrk_close(charBreakIterator
);
1975 matchStart
= matchEnd
= -1;
1977 for(int32_t i
= 0; i
< targetSize
; i
+= 1) {
1978 if (targetOrders
.matchesAt(i
, patternOrders
)) {
1979 int32_t start
= targetOrders
.getLowOffset(i
);
1980 int32_t maxLimit
= targetOrders
.getLowOffset(i
+ patternSize
);
1981 int32_t minLimit
= targetOrders
.getLowOffset(i
+ patternSize
- 1);
1983 // if the low and high offsets of the first CE in
1984 // the match are the same, it means that the match
1985 // starts in the middle of an expansion - all but
1986 // the first CE of the expansion will have the offset
1987 // of the following character.
1988 if (start
== targetOrders
.getHighOffset(i
)) {
1992 // Make sure match starts on a grapheme boundary
1993 if (! ubrk_isBoundary(charBreakIterator
, start
)) {
1997 // If the low and high offsets of the CE after the match
1998 // are the same, it means that the match ends in the middle
1999 // of an expansion sequence.
2000 if (maxLimit
== targetOrders
.getHighOffset(i
+ patternSize
) &&
2001 targetOrders
.getOrder(i
+ patternSize
) != UCOL_NULLORDER
) {
2005 int32_t mend
= maxLimit
;
2007 // Find the first grapheme break after the character index
2008 // of the last CE in the match. If it's after character index
2009 // that's after the last CE in the match, use that index
2010 // as the end of the match.
2011 if (minLimit
< maxLimit
) {
2012 // When the last CE's low index is same with its high index, the CE is likely
2013 // a part of expansion. In this case, the index is located just after the
2014 // character corresponding to the CEs compared above. If the index is right
2015 // at the break boundary, move the position to the next boundary will result
2016 // incorrect match length when there are ignorable characters exist between
2017 // the position and the next character produces CE(s). See ticket#8482.
2018 if (minLimit
== targetOrders
.getHighOffset(i
+ patternSize
- 1) && ubrk_isBoundary(charBreakIterator
, minLimit
)) {
2021 int32_t nba
= ubrk_following(charBreakIterator
, minLimit
);
2023 if (nba
>= targetOrders
.getHighOffset(i
+ patternSize
- 1)) {
2029 if (mend
> maxLimit
) {
2033 if (! ubrk_isBoundary(charBreakIterator
, mend
)) {
2040 ubrk_close(charBreakIterator
);
2045 ubrk_close(charBreakIterator
);
2049 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
2050 static int32_t getIntParam(UnicodeString name
, UnicodeString
¶ms
, int32_t defaultVal
) {
2051 int32_t val
= defaultVal
;
2053 name
.append(" *= *(-?\\d+)");
2055 UErrorCode status
= U_ZERO_ERROR
;
2056 RegexMatcher
m(name
, params
, 0, status
);
2059 // The param exists. Convert the string to an int.
2060 char valString
[100];
2061 int32_t paramLength
= m
.end(1, status
) - m
.start(1, status
);
2063 if (paramLength
>= (int32_t)(sizeof(valString
)-1)) {
2064 paramLength
= (int32_t)(sizeof(valString
)-2);
2067 params
.extract(m
.start(1, status
), paramLength
, valString
, sizeof(valString
));
2068 val
= strtol(valString
, NULL
, 10);
2070 // Delete this parameter from the params string.
2072 params
= m
.replaceFirst("", status
);
2075 //U_ASSERT(U_SUCCESS(status));
2076 if (! U_SUCCESS(status
)) {
2084 #if !UCONFIG_NO_COLLATION
2085 int32_t SSearchTest::monkeyTestCase(UCollator
*coll
, const UnicodeString
&testCase
, const UnicodeString
&pattern
, const UnicodeString
&altPattern
,
2086 const char *name
, const char *strength
, uint32_t seed
)
2088 UErrorCode status
= U_ZERO_ERROR
;
2089 int32_t actualStart
= -1, actualEnd
= -1;
2090 //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length();
2091 int32_t expectedStart
= -1, expectedEnd
= -1;
2092 int32_t notFoundCount
= 0;
2093 LocalUStringSearchPointer
uss(usearch_openFromCollator(pattern
.getBuffer(), pattern
.length(),
2094 testCase
.getBuffer(), testCase
.length(),
2096 NULL
, // the break iterator
2099 // **** TODO: find *all* matches, not just first one ****
2100 simpleSearch(coll
, testCase
, 0, pattern
, expectedStart
, expectedEnd
);
2102 usearch_search(uss
.getAlias(), 0, &actualStart
, &actualEnd
, &status
);
2104 if (expectedStart
>= 0 && (actualStart
!= expectedStart
|| actualEnd
!= expectedEnd
)) {
2105 errln("Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
2106 " strength=%s seed=%d",
2107 name
, expectedStart
, expectedEnd
, actualStart
, actualEnd
, strength
, seed
);
2110 if (expectedStart
== -1 && actualStart
== -1) {
2114 // **** TODO: find *all* matches, not just first one ****
2115 simpleSearch(coll
, testCase
, 0, altPattern
, expectedStart
, expectedEnd
);
2117 usearch_setPattern(uss
.getAlias(), altPattern
.getBuffer(), altPattern
.length(), &status
);
2119 usearch_search(uss
.getAlias(), 0, &actualStart
, &actualEnd
, &status
);
2121 if (expectedStart
>= 0 && (actualStart
!= expectedStart
|| actualEnd
!= expectedEnd
)) {
2122 errln("Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
2123 " strength=%s seed=%d",
2124 name
, expectedStart
, expectedEnd
, actualStart
, actualEnd
, strength
, seed
);
2127 if (expectedStart
== -1 && actualStart
== -1) {
2131 return notFoundCount
;
2134 static void hexForUnicodeString(const UnicodeString
&ustr
, char * cbuf
, int32_t cbuflen
)
2136 int32_t ustri
, ustrlen
= ustr
.length();
2138 for (ustri
= 0; ustri
< ustrlen
; ++ustri
) {
2139 if (cbuflen
>= 9 /* format width for single code unit(5) + terminating ellipsis(3) + null(1) */) {
2140 int len
= sprintf(cbuf
, " %04X", ustr
.charAt(ustri
));
2144 if (cbuflen
>= 4 /* terminating ellipsis(3) + null(1) */) {
2145 sprintf(cbuf
, "...");
2146 } else if (cbuflen
>= 1) {
2154 int32_t SSearchTest::bmMonkeyTestCase(UCollator
*coll
, const UnicodeString
&testCase
, const UnicodeString
&pattern
, const UnicodeString
&altPattern
,
2155 BoyerMooreSearch
*bms
, BoyerMooreSearch
*abms
,
2156 const char *name
, const char *strength
, uint32_t seed
)
2158 UErrorCode status
= U_ZERO_ERROR
;
2159 int32_t actualStart
= -1, actualEnd
= -1;
2160 //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length();
2161 int32_t expectedStart
= -1, expectedEnd
= -1;
2162 int32_t notFoundCount
= 0;
2165 // **** TODO: find *all* matches, not just first one ****
2166 simpleSearch(coll
, testCase
, 0, pattern
, expectedStart
, expectedEnd
);
2168 bms
->setTargetString(&testCase
, status
);
2169 bms
->search(0, actualStart
, actualEnd
);
2171 if (expectedStart
>= 0 && (actualStart
!= expectedStart
|| actualEnd
!= expectedEnd
)) {
2172 hexForUnicodeString(pattern
, hexbuf
, sizeof(hexbuf
));
2173 errln("Boyer-Moore Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
2174 " strength=%s seed=%d <pattern>: %s",
2175 name
, expectedStart
, expectedEnd
, actualStart
, actualEnd
, strength
, seed
, hexbuf
);
2178 if (expectedStart
== -1 && actualStart
== -1) {
2182 // **** TODO: find *all* matches, not just first one ****
2183 simpleSearch(coll
, testCase
, 0, altPattern
, expectedStart
, expectedEnd
);
2185 abms
->setTargetString(&testCase
, status
);
2186 abms
->search(0, actualStart
, actualEnd
);
2188 if (expectedStart
>= 0 && (actualStart
!= expectedStart
|| actualEnd
!= expectedEnd
)) {
2189 hexForUnicodeString(altPattern
, hexbuf
, sizeof(hexbuf
));
2190 errln("Boyer-Moore Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
2191 " strength=%s seed=%d <pattern>: %s",
2192 name
, expectedStart
, expectedEnd
, actualStart
, actualEnd
, strength
, seed
, hexbuf
);
2195 if (expectedStart
== -1 && actualStart
== -1) {
2200 return notFoundCount
;
2204 void SSearchTest::monkeyTest(char *params
)
2207 UErrorCode status
= U_ZERO_ERROR
;
2208 //UCollator *coll = ucol_open(NULL, &status);
2209 UCollator
*coll
= ucol_openFromShortString("S1", FALSE
, NULL
, &status
);
2211 if (U_FAILURE(status
)) {
2212 errcheckln(status
, "Failed to create collator in MonkeyTest! - %s", u_errorName(status
));
2216 CollData
*monkeyData
= CollData::open(coll
, status
);
2218 USet
*expansions
= uset_openEmpty();
2219 USet
*contractions
= uset_openEmpty();
2221 ucol_getContractionsAndExpansions(coll
, contractions
, expansions
, FALSE
, &status
);
2223 U_STRING_DECL(letter_pattern
, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
2224 U_STRING_INIT(letter_pattern
, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
2225 USet
*letters
= uset_openPattern(letter_pattern
, 39, &status
);
2226 SetMonkey
letterMonkey(letters
);
2227 StringSetMonkey
contractionMonkey(contractions
, coll
, monkeyData
);
2228 StringSetMonkey
expansionMonkey(expansions
, coll
, monkeyData
);
2229 UnicodeString testCase
;
2230 UnicodeString alternate
;
2231 UnicodeString pattern
, altPattern
;
2232 UnicodeString prefix
, altPrefix
;
2233 UnicodeString suffix
, altSuffix
;
2235 Monkey
*monkeys
[] = {
2245 int32_t monkeyCount
= sizeof(monkeys
) / sizeof(monkeys
[0]);
2246 // int32_t nonMatchCount = 0;
2248 UCollationStrength strengths
[] = {UCOL_PRIMARY
, UCOL_SECONDARY
, UCOL_TERTIARY
};
2249 const char *strengthNames
[] = {"primary", "secondary", "tertiary"};
2250 int32_t strengthCount
= sizeof(strengths
) / sizeof(strengths
[0]);
2251 int32_t loopCount
= quick
? 1000 : 10000;
2252 int32_t firstStrength
= 0;
2253 int32_t lastStrength
= strengthCount
- 1; //*/ 0;
2255 if (params
!= NULL
) {
2256 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
2257 UnicodeString
p(params
);
2259 loopCount
= getIntParam("loop", p
, loopCount
);
2260 m_seed
= getIntParam("seed", p
, m_seed
);
2262 RegexMatcher
m(" *strength *= *(primary|secondary|tertiary) *", p
, 0, status
);
2264 UnicodeString breakType
= m
.group(1, status
);
2266 for (int32_t s
= 0; s
< strengthCount
; s
+= 1) {
2267 if (breakType
== strengthNames
[s
]) {
2268 firstStrength
= lastStrength
= s
;
2274 p
= m
.replaceFirst("", status
);
2277 if (RegexMatcher("\\S", p
, 0, status
).find()) {
2278 // Each option is stripped out of the option string as it is processed.
2279 // All options have been checked. The option string should have been completely emptied..
2281 p
.extract(buf
, sizeof(buf
), NULL
, status
);
2282 buf
[sizeof(buf
)-1] = 0;
2283 errln("Unrecognized or extra parameter: %s\n", buf
);
2287 infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
2291 for(int32_t s
= firstStrength
; s
<= lastStrength
; s
+= 1) {
2292 int32_t notFoundCount
= 0;
2294 logln("Setting strength to %s.", strengthNames
[s
]);
2295 ucol_setStrength(coll
, strengths
[s
]);
2297 // TODO: try alternate prefix and suffix too?
2298 // TODO: alterntaes are only equal at primary strength. Is this OK?
2299 for(int32_t t
= 0; t
< loopCount
; t
+= 1) {
2300 uint32_t seed
= m_seed
;
2303 generateTestCase(coll
, monkeys
, monkeyCount
, pattern
, altPattern
);
2304 generateTestCase(coll
, monkeys
, monkeyCount
, prefix
, altPrefix
);
2305 generateTestCase(coll
, monkeys
, monkeyCount
, suffix
, altSuffix
);
2308 notFoundCount
+= monkeyTestCase(coll
, pattern
, pattern
, altPattern
, "pattern", strengthNames
[s
], seed
);
2311 testCase
.append(prefix
);
2312 testCase
.append(/*alt*/pattern
);
2315 notFoundCount
+= monkeyTestCase(coll
, testCase
, pattern
, altPattern
, "prefix + pattern", strengthNames
[s
], seed
);
2317 testCase
.append(suffix
);
2319 // prefix + pattern + suffix
2320 notFoundCount
+= monkeyTestCase(coll
, testCase
, pattern
, altPattern
, "prefix + pattern + suffix", strengthNames
[s
], seed
);
2323 testCase
.append(pattern
);
2324 testCase
.append(suffix
);
2327 notFoundCount
+= monkeyTestCase(coll
, testCase
, pattern
, altPattern
, "pattern + suffix", strengthNames
[s
], seed
);
2330 logln("For strength %s the not found count is %d.", strengthNames
[s
], notFoundCount
);
2333 uset_close(contractions
);
2334 uset_close(expansions
);
2335 uset_close(letters
);
2337 CollData::close(monkeyData
);
2342 void SSearchTest::bmMonkeyTest(char *params
)
2344 static const UChar skipChars
[] = { 0x0E40, 0x0E41, 0x0E42, 0x0E43, 0x0E44, 0xAAB5, 0xAAB6, 0xAAB9, 0xAABB, 0xAABC, 0 }; // for timebomb
2346 UErrorCode status
= U_ZERO_ERROR
;
2347 UCollator
*coll
= ucol_openFromShortString("LEN_S1", FALSE
, NULL
, &status
);
2349 if (U_FAILURE(status
)) {
2350 errcheckln(status
, "Failed to create collator in MonkeyTest! - %s", u_errorName(status
));
2354 CollData
*monkeyData
= CollData::open(coll
, status
);
2356 USet
*expansions
= uset_openEmpty();
2357 USet
*contractions
= uset_openEmpty();
2359 ucol_getContractionsAndExpansions(coll
, contractions
, expansions
, FALSE
, &status
);
2361 U_STRING_DECL(letter_pattern
, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
2362 U_STRING_INIT(letter_pattern
, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
2363 USet
*letters
= uset_openPattern(letter_pattern
, 39, &status
);
2364 SetMonkey
letterMonkey(letters
);
2365 StringSetMonkey
contractionMonkey(contractions
, coll
, monkeyData
);
2366 StringSetMonkey
expansionMonkey(expansions
, coll
, monkeyData
);
2367 UnicodeString testCase
;
2368 UnicodeString alternate
;
2369 UnicodeString pattern
, altPattern
;
2370 UnicodeString prefix
, altPrefix
;
2371 UnicodeString suffix
, altSuffix
;
2373 Monkey
*monkeys
[] = {
2383 int32_t monkeyCount
= sizeof(monkeys
) / sizeof(monkeys
[0]);
2384 // int32_t nonMatchCount = 0;
2386 UCollationStrength strengths
[] = {UCOL_PRIMARY
, UCOL_SECONDARY
, UCOL_TERTIARY
};
2387 const char *strengthNames
[] = {"primary", "secondary", "tertiary"};
2388 int32_t strengthCount
= sizeof(strengths
) / sizeof(strengths
[0]);
2389 int32_t loopCount
= quick
? 1000 : 10000;
2390 int32_t firstStrength
= 0;
2391 int32_t lastStrength
= strengthCount
- 1; //*/ 0;
2393 if (params
!= NULL
) {
2394 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
2395 UnicodeString
p(params
);
2397 loopCount
= getIntParam("loop", p
, loopCount
);
2398 m_seed
= getIntParam("seed", p
, m_seed
);
2400 RegexMatcher
m(" *strength *= *(primary|secondary|tertiary) *", p
, 0, status
);
2402 UnicodeString breakType
= m
.group(1, status
);
2404 for (int32_t s
= 0; s
< strengthCount
; s
+= 1) {
2405 if (breakType
== strengthNames
[s
]) {
2406 firstStrength
= lastStrength
= s
;
2412 p
= m
.replaceFirst("", status
);
2415 if (RegexMatcher("\\S", p
, 0, status
).find()) {
2416 // Each option is stripped out of the option string as it is processed.
2417 // All options have been checked. The option string should have been completely emptied.
2419 p
.extract(buf
, sizeof(buf
), NULL
, status
);
2420 buf
[sizeof(buf
)-1] = 0;
2421 errln("Unrecognized or extra parameter: %s\n", buf
);
2425 infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
2429 for(int32_t s
= firstStrength
; s
<= lastStrength
; s
+= 1) {
2430 int32_t notFoundCount
= 0;
2432 logln("Setting strength to %s.", strengthNames
[s
]);
2433 ucol_setStrength(coll
, strengths
[s
]);
2435 CollData
*data
= CollData::open(coll
, status
);
2437 UnicodeString
skipString(skipChars
); // for timebomb
2438 UnicodeSet
* skipSet
= UnicodeSet::createFromAll(skipString
); // for timebomb
2439 // TODO: try alternate prefix and suffix too?
2440 // TODO: alternates are only equal at primary strength. Is this OK?
2441 for(int32_t t
= 0; t
< loopCount
; t
+= 1) {
2442 uint32_t seed
= m_seed
;
2445 generateTestCase(coll
, monkeys
, monkeyCount
, pattern
, altPattern
);
2446 generateTestCase(coll
, monkeys
, monkeyCount
, prefix
, altPrefix
);
2447 generateTestCase(coll
, monkeys
, monkeyCount
, suffix
, altSuffix
);
2449 if (!isICUVersionAtLeast(50, 0) && skipSet
->containsSome(pattern
)) {
2450 continue; // timebomb until ticket #8080 is resolved
2453 BoyerMooreSearch
pat(data
, pattern
, NULL
, status
);
2454 BoyerMooreSearch
alt(data
, altPattern
, NULL
, status
);
2456 // **** need a better way to deal with this ****
2465 notFoundCount
+= bmMonkeyTestCase(coll
, pattern
, pattern
, altPattern
, &pat
, &alt
, "pattern", strengthNames
[s
], seed
);
2468 testCase
.append(prefix
);
2469 testCase
.append(/*alt*/pattern
);
2472 notFoundCount
+= bmMonkeyTestCase(coll
, testCase
, pattern
, altPattern
, &pat
, &alt
, "prefix + pattern", strengthNames
[s
], seed
);
2474 testCase
.append(suffix
);
2476 // prefix + pattern + suffix
2477 notFoundCount
+= bmMonkeyTestCase(coll
, testCase
, pattern
, altPattern
, &pat
, &alt
, "prefix + pattern + suffix", strengthNames
[s
], seed
);
2480 testCase
.append(pattern
);
2481 testCase
.append(suffix
);
2484 notFoundCount
+= bmMonkeyTestCase(coll
, testCase
, pattern
, altPattern
, &pat
, &alt
, "pattern + suffix", strengthNames
[s
], seed
);
2486 delete skipSet
; // for timebomb
2488 CollData::close(data
);
2490 logln("For strength %s the not found count is %d.", strengthNames
[s
], notFoundCount
);
2493 uset_close(contractions
);
2494 uset_close(expansions
);
2495 uset_close(letters
);
2497 CollData::close(monkeyData
);
2502 void SSearchTest::stringListTest(){
2503 UErrorCode status
= U_ZERO_ERROR
;
2504 StringList
*sl
= new StringList(status
);
2505 if(U_FAILURE(status
)){
2506 errln("ERROR: stringListTest: Could not start StringList");
2509 const UChar chars
[] = {
2512 sl
->add(chars
, (int32_t) 0, status
);
2513 if(U_FAILURE(status
)){
2514 errln("ERROR: stringListTest: StringList::add");
2517 if(sl
->getDynamicClassID() != StringList::getStaticClassID()){
2518 errln("ERROR: stringListTest: getDynamicClassID and getStaticClassID does not match");