2 **********************************************************************
3 * Copyright (C) 2005-2008, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
9 #include "unicode/utypes.h"
11 #if !UCONFIG_NO_COLLATION
13 #include "unicode/unistr.h"
14 #include "unicode/putil.h"
15 #include "unicode/usearch.h"
18 #include "unicode/coll.h"
19 #include "unicode/tblcoll.h"
20 #include "unicode/coleitr.h"
21 #include "unicode/ucoleitr.h"
23 #include "unicode/regex.h" // TODO: make conditional on regexp being built.
25 #include "unicode/uniset.h"
26 #include "unicode/uset.h"
27 #include "unicode/ustring.h"
35 #include "xmlparser.h"
43 #define TEST_ASSERT(x) {if (!(x)) { \
44 errln("Failure in file %s, line %d, test ID = \"%s\"", __FILE__, __LINE__, testId);}}
46 #define TEST_ASSERT_M(x, m) {if (!(x)) { \
47 errln("Failure in file %s, line %d. \"%s\"", __FILE__, __LINE__, m);return;}}
49 #define TEST_ASSERT_SUCCESS(errcode) {if (U_FAILURE(errcode)) { \
50 errln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \
51 __FILE__, __LINE__, testId, u_errorName(errcode));}}
53 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
55 //---------------------------------------------------------------------------
57 // Test class boilerplate
59 //---------------------------------------------------------------------------
60 SSearchTest::SSearchTest()
64 SSearchTest::~SSearchTest()
68 void SSearchTest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char *params
)
70 if (exec
) logln("TestSuite SSearchTest: ");
72 #if !UCONFIG_NO_BREAK_ITERATION
73 case 0: name
= "searchTest";
74 if (exec
) searchTest();
77 case 1: name
= "offsetTest";
78 if (exec
) offsetTest();
81 case 2: name
= "monkeyTest";
82 if (exec
) monkeyTest(params
);
86 break; //needed to end loop
91 #if !UCONFIG_NO_BREAK_ITERATION
93 #define PATH_BUFFER_SIZE 2048
94 const char *SSearchTest::getPath(char buffer
[2048], const char *filename
) {
95 UErrorCode status
= U_ZERO_ERROR
;
96 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
98 if (U_FAILURE(status
) || strlen(testDataDirectory
) + strlen(filename
) + 1 >= PATH_BUFFER_SIZE
) {
99 errln("ERROR: getPath() failed - %s", u_errorName(status
));
103 strcpy(buffer
, testDataDirectory
);
104 strcat(buffer
, filename
);
109 void SSearchTest::searchTest()
111 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
112 UErrorCode status
= U_ZERO_ERROR
;
113 char path
[PATH_BUFFER_SIZE
];
114 const char *testFilePath
= getPath(path
, "ssearch.xml");
116 if (testFilePath
== NULL
) {
117 return; /* Couldn't get path: error message already output. */
120 UXMLParser
*parser
= UXMLParser::createParser(status
);
121 TEST_ASSERT_SUCCESS(status
);
122 UXMLElement
*root
= parser
->parseFile(testFilePath
, status
);
123 TEST_ASSERT_SUCCESS(status
);
124 if (U_FAILURE(status
)) {
128 const UnicodeString
*debugTestCase
= root
->getAttribute("debug");
129 if (debugTestCase
!= NULL
) {
130 // setenv("USEARCH_DEBUG", "1", 1);
134 const UXMLElement
*testCase
;
137 while((testCase
= root
->nextChildElement(tc
)) != NULL
) {
139 if (testCase
->getTagName().compare("test-case") != 0) {
140 errln("ssearch, unrecognized XML Element in test file");
143 const UnicodeString
*id
= testCase
->getAttribute("id");
146 id
->extract(0, id
->length(), testId
, sizeof(testId
), US_INV
);
149 // If debugging test case has been specified and this is not it, skip to next.
150 if (id
!=NULL
&& debugTestCase
!=NULL
&& *id
!= *debugTestCase
) {
154 // Get the requested collation strength.
155 // Default is tertiary if the XML attribute is missing from the test case.
157 const UnicodeString
*strength
= testCase
->getAttribute("strength");
158 UColAttributeValue collatorStrength
;
159 if (strength
==NULL
) { collatorStrength
= UCOL_TERTIARY
;}
160 else if (*strength
=="PRIMARY") { collatorStrength
= UCOL_PRIMARY
;}
161 else if (*strength
=="SECONDARY") { collatorStrength
= UCOL_SECONDARY
;}
162 else if (*strength
=="TERTIARY") { collatorStrength
= UCOL_TERTIARY
;}
163 else if (*strength
=="QUATERNARY") { collatorStrength
= UCOL_QUATERNARY
;}
164 else if (*strength
=="IDENTICAL") { collatorStrength
= UCOL_IDENTICAL
;}
166 // Bogus value supplied for strength. Shouldn't happen, even from
167 // typos, if the XML source has been validated.
168 // This assert is a little deceiving in that strength can be
169 // any of the allowed values, not just TERTIARY, but it will
170 // do the job of getting the error output.
171 TEST_ASSERT(*strength
=="TERTIARY")
175 // Get the collator normalization flag. Default is UCOL_OFF.
177 UColAttributeValue normalize
= UCOL_OFF
;
178 const UnicodeString
*norm
= testCase
->getAttribute("norm");
179 TEST_ASSERT (norm
==NULL
|| *norm
=="ON" || *norm
=="OFF");
180 if (norm
!=NULL
&& *norm
=="ON") {
184 const UnicodeString
defLocale("en");
186 const UnicodeString
*locale
= testCase
->getAttribute("locale");
187 if (locale
== NULL
|| locale
->length()==0) {
190 locale
->extract(0, locale
->length(), clocale
, sizeof(clocale
), NULL
);
194 UnicodeString target
;
195 UnicodeString pattern
;
196 int32_t expectedMatchStart
= -1;
197 int32_t expectedMatchLimit
= -1;
198 const UXMLElement
*n
;
201 n
= testCase
->getChildElement("pattern");
202 TEST_ASSERT(n
!= NULL
);
206 text
= n
->getText(FALSE
);
207 text
= text
.unescape();
208 pattern
.append(text
);
211 n
= testCase
->getChildElement("pre");
213 text
= n
->getText(FALSE
);
214 text
= text
.unescape();
219 n
= testCase
->getChildElement("m");
221 expectedMatchStart
= target
.length();
222 text
= n
->getText(FALSE
);
223 text
= text
.unescape();
225 expectedMatchLimit
= target
.length();
229 n
= testCase
->getChildElement("post");
231 text
= n
->getText(FALSE
);
232 text
= text
.unescape();
237 // Check that there weren't extra things in the XML
238 TEST_ASSERT(nodeCount
== testCase
->countChildren());
240 // Open a collotor and StringSearch based on the parameters
241 // obtained from the XML.
243 status
= U_ZERO_ERROR
;
244 UCollator
*collator
= ucol_open(clocale
, &status
);
245 ucol_setStrength(collator
, collatorStrength
);
246 ucol_setAttribute(collator
, UCOL_NORMALIZATION_MODE
, normalize
, &status
);
247 UStringSearch
*uss
= usearch_openFromCollator(pattern
.getBuffer(), pattern
.length(),
248 target
.getBuffer(), target
.length(),
250 NULL
, // the break iterator
253 TEST_ASSERT_SUCCESS(status
);
254 if (U_FAILURE(status
)) {
256 ucol_close(collator
);
260 int32_t foundStart
= 0;
261 int32_t foundLimit
= 0;
265 // Do the search, check the match result against the expected results.
267 foundMatch
= usearch_search(uss
, 0, &foundStart
, &foundLimit
, &status
);
268 TEST_ASSERT_SUCCESS(status
);
269 if (foundMatch
&& expectedMatchStart
<0 ||
270 foundStart
!= expectedMatchStart
||
271 foundLimit
!= expectedMatchLimit
) {
272 TEST_ASSERT(FALSE
); // ouput generic error position
273 infoln("Found, expected match start = %d, %d \n"
274 "Found, expected match limit = %d, %d",
275 foundStart
, expectedMatchStart
, foundLimit
, expectedMatchLimit
);
278 // In case there are other matches...
279 // (should we only do this if the test case passed?)
281 expectedMatchStart
= foundStart
;
282 expectedMatchLimit
= foundLimit
;
284 foundMatch
= usearch_search(uss
, foundLimit
, &foundStart
, &foundLimit
, &status
);
289 uss
= usearch_openFromCollator(pattern
.getBuffer(), pattern
.length(),
290 target
.getBuffer(), target
.length(),
296 // Do the backwards search, check the match result against the expected results.
298 foundMatch
= usearch_searchBackwards(uss
, target
.length(), &foundStart
, &foundLimit
, &status
);
299 TEST_ASSERT_SUCCESS(status
);
300 if (foundMatch
&& expectedMatchStart
<0 ||
301 foundStart
!= expectedMatchStart
||
302 foundLimit
!= expectedMatchLimit
) {
303 TEST_ASSERT(FALSE
); // ouput generic error position
304 infoln("Found, expected backwards match start = %d, %d \n"
305 "Found, expected backwards match limit = %d, %d",
306 foundStart
, expectedMatchStart
, foundLimit
, expectedMatchLimit
);
310 ucol_close(collator
);
329 OrderList(UCollator
*coll
, const UnicodeString
&string
, int32_t stringOffset
= 0);
332 int32_t size(void) const;
333 void add(int32_t order
, int32_t low
, int32_t high
);
334 const Order
*get(int32_t index
) const;
335 int32_t getLowOffset(int32_t index
) const;
336 int32_t getHighOffset(int32_t index
) const;
337 int32_t getOrder(int32_t index
) const;
339 UBool
compare(const OrderList
&other
) const;
340 UBool
matchesAt(int32_t offset
, const OrderList
&other
) const;
348 OrderList::OrderList()
349 : list(NULL
), listSize(0), listMax(16)
351 list
= new Order
[listMax
];
354 OrderList::OrderList(UCollator
*coll
, const UnicodeString
&string
, int32_t stringOffset
)
355 : list(NULL
), listMax(16), listSize(0)
357 UErrorCode status
= U_ZERO_ERROR
;
358 UCollationElements
*elems
= ucol_openElements(coll
, string
.getBuffer(), string
.length(), &status
);
359 uint32_t strengthMask
= 0;
360 int32_t order
, low
, high
;
362 switch (ucol_getStrength(coll
))
365 strengthMask
|= UCOL_TERTIARYORDERMASK
;
369 strengthMask
|= UCOL_SECONDARYORDERMASK
;
373 strengthMask
|= UCOL_PRIMARYORDERMASK
;
376 list
= new Order
[listMax
];
378 ucol_setOffset(elems
, stringOffset
, &status
);
381 low
= ucol_getOffset(elems
);
382 order
= ucol_next(elems
, &status
);
383 high
= ucol_getOffset(elems
);
385 if (order
!= UCOL_NULLORDER
) {
386 order
&= strengthMask
;
389 if (order
!= UCOL_IGNORABLE
) {
390 add(order
, low
, high
);
392 } while (order
!= UCOL_NULLORDER
);
394 ucol_closeElements(elems
);
397 OrderList::~OrderList()
402 void OrderList::add(int32_t order
, int32_t low
, int32_t high
)
404 if (listSize
>= listMax
) {
407 Order
*newList
= new Order
[listMax
];
409 uprv_memcpy(newList
, list
, listSize
* sizeof(Order
));
414 list
[listSize
].order
= order
;
415 list
[listSize
].lowOffset
= low
;
416 list
[listSize
].highOffset
= high
;
421 const Order
*OrderList::get(int32_t index
) const
423 if (index
>= listSize
) {
430 int32_t OrderList::getLowOffset(int32_t index
) const
432 const Order
*order
= get(index
);
435 return order
->lowOffset
;
441 int32_t OrderList::getHighOffset(int32_t index
) const
443 const Order
*order
= get(index
);
446 return order
->highOffset
;
452 int32_t OrderList::getOrder(int32_t index
) const
454 const Order
*order
= get(index
);
460 return UCOL_NULLORDER
;
463 int32_t OrderList::size() const
468 void OrderList::reverse()
470 for(int32_t f
= 0, b
= listSize
- 1; f
< b
; f
+= 1, b
-= 1) {
471 Order swap
= list
[b
];
478 UBool
OrderList::compare(const OrderList
&other
) const
480 if (listSize
!= other
.listSize
) {
484 for(int32_t i
= 0; i
< listSize
; i
+= 1) {
485 if (list
[i
].order
!= other
.list
[i
].order
||
486 list
[i
].lowOffset
!= other
.list
[i
].lowOffset
||
487 list
[i
].highOffset
!= other
.list
[i
].highOffset
) {
495 UBool
OrderList::matchesAt(int32_t offset
, const OrderList
&other
) const
497 // NOTE: sizes include the NULLORDER, which we don't want to compare.
498 int32_t otherSize
= other
.size() - 1;
500 if (listSize
- 1 - offset
< otherSize
) {
504 for (int32_t i
= offset
, j
= 0; j
< otherSize
; i
+= 1, j
+= 1) {
505 if (getOrder(i
) != other
.getOrder(j
)) {
513 static char *printOffsets(char *buffer
, OrderList
&list
)
515 int32_t size
= list
.size();
518 for(int32_t i
= 0; i
< size
; i
+= 1) {
519 const Order
*order
= list
.get(i
);
522 s
+= sprintf(s
, ", ");
525 s
+= sprintf(s
, "(%d, %d)", order
->lowOffset
, order
->highOffset
);
531 static char *printOrders(char *buffer
, OrderList
&list
)
533 int32_t size
= list
.size();
536 for(int32_t i
= 0; i
< size
; i
+= 1) {
537 const Order
*order
= list
.get(i
);
540 s
+= sprintf(s
, ", ");
543 s
+= sprintf(s
, "%8.8X", order
->order
);
549 void SSearchTest::offsetTest()
551 const char *test
[] = {
552 "\\ua191\\u16ef\\u2036\\u017a",
555 // This results in a complex interaction between contraction,
556 // expansion and normalization that confuses the backwards offset fixups.
557 "\\u0F7F\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
560 "\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
561 "\\u07E9\\u07EA\\u07F1\\u07F2\\u07F3",
564 "\\u0300\\u0301\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u0309\\u030A\\u030B\\u030C\\u030D\\u030E\\u030F"
565 "\\u0310\\u0311\\u0312\\u0313\\u0314\\u0315\\u0316\\u0317\\u0318\\u0319\\u031A\\u031B\\u031C\\u031D\\u031E\\u031F"
566 "\\u0320\\u0321\\u0322\\u0323\\u0324\\u0325\\u0326\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F"
567 "\\u0330\\u0331\\u0332\\u0333\\u0334\\u0335\\u0336\\u0337\\u0338\\u0339\\u033A\\u033B\\u033C\\u033D\\u033E\\u033F"
568 "\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\\u034A\\u034B\\u034C\\u034D\\u034E",
570 "\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318",
571 "abc\\u0E41\\u0301\\u0316",
572 "abc\\u0E41\\u0316\\u0301",
573 "\\u0E41\\u0301\\u0316",
574 "\\u0E41\\u0316\\u0301",
588 "A\\u0302\\u0301\\u0323B",
592 " \\uD800\\uDC00\\uDC00",
593 "a\\uD800\\uDC00\\uDC00",
599 "\\u0301A\\u0301\\u0301",
605 int32_t testCount
= ARRAY_SIZE(test
);
606 UErrorCode status
= U_ZERO_ERROR
;
607 RuleBasedCollator
*col
= (RuleBasedCollator
*) Collator::createInstance(Locale::getEnglish(), status
);
608 if (U_FAILURE(status
)) {
609 errln("Failed to create collator in offsetTest!");
612 char buffer
[4096]; // A bit of a hack... just happens to be long enough for all the test cases...
613 // We could allocate one that's the right size by (CE_count * 10) + 2
614 // 10 chars is enough room for 8 hex digits plus ", ". 2 extra chars for "[" and "]"
616 col
->setAttribute(UCOL_NORMALIZATION_MODE
, UCOL_ON
, status
);
618 for(int32_t i
= 0; i
< testCount
; i
+= 1) {
619 UnicodeString ts
= CharsToUnicodeString(test
[i
]);
620 CollationElementIterator
*iter
= col
->createCollationElementIterator(ts
);
621 OrderList forwardList
;
622 OrderList backwardList
;
623 int32_t order
, low
, high
;
626 low
= iter
->getOffset();
627 order
= iter
->next(status
);
628 high
= iter
->getOffset();
630 forwardList
.add(order
, low
, high
);
631 } while (order
!= CollationElementIterator::NULLORDER
);
634 iter
->setOffset(ts
.length(), status
);
636 backwardList
.add(CollationElementIterator::NULLORDER
, iter
->getOffset(), iter
->getOffset());
639 high
= iter
->getOffset();
640 order
= iter
->previous(status
);
641 low
= iter
->getOffset();
643 if (order
== CollationElementIterator::NULLORDER
) {
647 backwardList
.add(order
, low
, high
);
650 backwardList
.reverse();
652 if (forwardList
.compare(backwardList
)) {
653 logln("Works with \"%s\"", test
[i
]);
654 logln("Forward offsets: [%s]", printOffsets(buffer
, forwardList
));
655 // logln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
657 logln("Forward CEs: [%s]", printOrders(buffer
, forwardList
));
658 // logln("Backward CEs: [%s]", printOrders(buffer, backwardList));
662 errln("Fails with \"%s\"", test
[i
]);
663 infoln("Forward offsets: [%s]", printOffsets(buffer
, forwardList
));
664 infoln("Backward offsets: [%s]", printOffsets(buffer
, backwardList
));
666 infoln("Forward CEs: [%s]", printOrders(buffer
, forwardList
));
667 infoln("Backward CEs: [%s]", printOrders(buffer
, backwardList
));
679 CEList(UCollator
*coll
, const UnicodeString
&string
);
682 int32_t size() const;
683 int32_t get(int32_t index
) const;
684 UBool
matchesAt(int32_t offset
, const CEList
*other
) const;
687 void add(int32_t ce
);
694 CEList::CEList(UCollator
*coll
, const UnicodeString
&string
)
695 : ces(NULL
), listMax(8), listSize(0)
697 UErrorCode status
= U_ZERO_ERROR
;
698 UCollationElements
*elems
= ucol_openElements(coll
, string
.getBuffer(), string
.length(), &status
);
699 uint32_t strengthMask
= 0;
703 switch (ucol_getStrength(coll
))
706 strengthMask
|= UCOL_TERTIARYORDERMASK
;
710 strengthMask
|= UCOL_SECONDARYORDERMASK
;
714 strengthMask
|= UCOL_PRIMARYORDERMASK
;
717 strengthMask
= UCOL_PRIMARYORDERMASK
;
720 ces
= new int32_t[listMax
];
722 while ((order
= ucol_next(elems
, &status
)) != UCOL_NULLORDER
) {
723 order
&= strengthMask
;
725 if (order
== UCOL_IGNORABLE
) {
732 ucol_closeElements(elems
);
740 void CEList::add(int32_t ce
)
742 if (listSize
>= listMax
) {
745 int32_t *newCEs
= new int32_t[listMax
];
747 uprv_memcpy(newCEs
, ces
, listSize
* sizeof(int32_t));
752 ces
[listSize
++] = ce
;
755 int32_t CEList::get(int32_t index
) const
757 if (index
>= 0 && index
< listSize
) {
764 UBool
CEList::matchesAt(int32_t offset
, const CEList
*other
) const
766 if (listSize
- offset
< other
->size()) {
770 for (int32_t i
= offset
, j
= 0; j
< other
->size(); i
+= 1, j
+= 1) {
771 if (ces
[i
] != other
->get(j
)) {
779 int32_t CEList::size() const
790 void add(const UnicodeString
*string
);
791 void add(const UChar
*chars
, int32_t count
);
792 const UnicodeString
*get(int32_t index
) const;
793 int32_t size() const;
796 UnicodeString
*strings
;
801 StringList::StringList()
802 : strings(NULL
), listMax(16), listSize(0)
804 strings
= new UnicodeString
[listMax
];
807 StringList::~StringList()
812 void StringList::add(const UnicodeString
*string
)
814 if (listSize
>= listMax
) {
817 UnicodeString
*newStrings
= new UnicodeString
[listMax
];
819 uprv_memcpy(newStrings
, strings
, listSize
* sizeof(UnicodeString
));
822 strings
= newStrings
;
825 // The ctor initialized all the strings in
826 // the array to empty strings, so this
827 // is the same as copying the source string.
828 strings
[listSize
++].append(*string
);
831 void StringList::add(const UChar
*chars
, int32_t count
)
833 const UnicodeString
string(chars
, count
);
838 const UnicodeString
*StringList::get(int32_t index
) const
840 if (index
>= 0 && index
< listSize
) {
841 return &strings
[index
];
847 int32_t StringList::size() const
859 void put(int32_t ce
, UnicodeString
*string
);
860 StringList
*getStringList(int32_t ce
) const;
864 static void deleteStringList(void *obj
);
865 void putStringList(int32_t ce
, StringList
*stringList
);
869 CEToStringsMap::CEToStringsMap()
871 UErrorCode status
= U_ZERO_ERROR
;
873 map
= uhash_open(uhash_hashLong
, uhash_compareLong
,
874 uhash_compareCaselessUnicodeString
,
877 uhash_setValueDeleter(map
, deleteStringList
);
880 CEToStringsMap::~CEToStringsMap()
885 void CEToStringsMap::put(int32_t ce
, UnicodeString
*string
)
887 StringList
*strings
= getStringList(ce
);
889 if (strings
== NULL
) {
890 strings
= new StringList();
891 putStringList(ce
, strings
);
894 strings
->add(string
);
897 StringList
*CEToStringsMap::getStringList(int32_t ce
) const
899 return (StringList
*) uhash_iget(map
, ce
);
902 void CEToStringsMap::putStringList(int32_t ce
, StringList
*stringList
)
904 UErrorCode status
= U_ZERO_ERROR
;
906 uhash_iput(map
, ce
, (void *) stringList
, &status
);
909 void CEToStringsMap::deleteStringList(void *obj
)
911 StringList
*strings
= (StringList
*) obj
;
922 void put(const UnicodeString
*string
, const CEList
*ces
);
923 const CEList
*get(const UnicodeString
*string
);
927 static void deleteCEList(void *obj
);
928 static void deleteUnicodeStringKey(void *obj
);
933 StringToCEsMap::StringToCEsMap()
935 UErrorCode status
= U_ZERO_ERROR
;
937 map
= uhash_open(uhash_hashCaselessUnicodeString
,
938 uhash_compareCaselessUnicodeString
,
942 uhash_setValueDeleter(map
, deleteCEList
);
943 uhash_setKeyDeleter(map
, deleteUnicodeStringKey
);
946 StringToCEsMap::~StringToCEsMap()
951 void StringToCEsMap::put(const UnicodeString
*string
, const CEList
*ces
)
953 UErrorCode status
= U_ZERO_ERROR
;
955 uhash_put(map
, (void *) string
, (void *) ces
, &status
);
958 const CEList
*StringToCEsMap::get(const UnicodeString
*string
)
960 return (const CEList
*) uhash_get(map
, string
);
963 void StringToCEsMap::deleteCEList(void *obj
)
965 CEList
*list
= (CEList
*) obj
;
970 void StringToCEsMap::deleteUnicodeStringKey(void *obj
)
972 UnicodeString
*key
= (UnicodeString
*) obj
;
977 static void buildData(UCollator
*coll
, USet
*charsToTest
, StringToCEsMap
*charsToCEList
, CEToStringsMap
*ceToCharsStartingWith
)
979 int32_t itemCount
= uset_getItemCount(charsToTest
);
980 UErrorCode status
= U_ZERO_ERROR
;
982 for(int32_t item
= 0; item
< itemCount
; item
+= 1) {
983 UChar32 start
= 0, end
= 0;
985 int32_t len
= uset_getItem(charsToTest
, item
, &start
, &end
,
986 buffer
, 16, &status
);
989 for (UChar32 ch
= start
; ch
<= end
; ch
+= 1) {
990 UnicodeString
*st
= new UnicodeString(ch
);
991 CEList
*ceList
= new CEList(coll
, *st
);
993 charsToCEList
->put(st
, ceList
);
994 ceToCharsStartingWith
->put(ceList
->get(0), st
);
996 } else if (len
> 0) {
997 UnicodeString
*st
= new UnicodeString(buffer
, len
);
998 CEList
*ceList
= new CEList(coll
, *st
);
1000 charsToCEList
->put(st
, ceList
);
1001 ceToCharsStartingWith
->put(ceList
->get(0), st
);
1003 // shouldn't happen...
1008 static UnicodeString
&escape(const UnicodeString
&string
, UnicodeString
&buffer
)
1010 for(int32_t i
= 0; i
< string
.length(); i
+= 1) {
1011 UChar32 ch
= string
.char32At(i
);
1013 if (ch
>= 0x0020 && ch
<= 0x007F) {
1015 buffer
.append("\\\\");
1022 if (ch
<= 0xFFFFL
) {
1023 sprintf(cbuffer
, "\\u%4.4X", ch
);
1025 sprintf(cbuffer
, "\\U%8.8X", ch
);
1028 buffer
.append(cbuffer
);
1031 if (ch
>= 0x10000L
) {
1039 static int32_t minLengthInChars(const CEList
*ceList
, int32_t offset
, StringToCEsMap
*charsToCEList
, CEToStringsMap
*ceToCharsStartingWith
,
1040 UnicodeString
&debug
)
1042 // find out shortest string for the longest sequence of ces.
1043 // needs to be refined to use dynamic programming, but will be roughly right
1044 int32_t totalStringLength
= 0;
1046 while (offset
< ceList
->size()) {
1047 int32_t ce
= ceList
->get(offset
);
1048 int32_t bestLength
= INT32_MIN
;
1049 const UnicodeString
*bestString
= NULL
;
1050 int32_t bestCeLength
= 0;
1051 const StringList
*strings
= ceToCharsStartingWith
->getStringList(ce
);
1052 int32_t stringCount
= strings
->size();
1054 for (int32_t s
= 0; s
< stringCount
; s
+= 1) {
1055 const UnicodeString
*string
= strings
->get(s
);
1056 const CEList
*ceList2
= charsToCEList
->get(string
);
1058 if (ceList
->matchesAt(offset
, ceList2
)) {
1059 int32_t length
= ceList2
->size() - string
->length();
1061 if (bestLength
< length
) {
1062 bestLength
= length
;
1063 bestCeLength
= ceList2
->size();
1064 bestString
= string
;
1069 totalStringLength
+= bestString
->length();
1070 escape(*bestString
, debug
).append("/");
1071 offset
+= bestCeLength
;
1074 debug
.append((UChar
)0x0000);
1075 return totalStringLength
;
1078 static void minLengthTest(UCollator
*coll
, StringToCEsMap
*charsToCEList
, CEToStringsMap
*ceToCharsStartingWith
)
1080 UnicodeString examples
[] = {"fuss", "fiss", "affliss", "VII"};
1081 UnicodeString debug
;
1082 int32_t nExamples
= sizeof(examples
) / sizeof(examples
[0]);
1084 for (int32_t s
= 0; s
< nExamples
; s
+= 1) {
1085 CEList
*ceList
= new CEList(coll
, examples
[s
]);
1087 //infoln("%S:", examples[s].getTerminatedBuffer());
1089 for(int32_t i
= 0; i
< examples
[s
].length(); i
+= 1) {
1092 int32_t minLength
= minLengthInChars(ceList
, i
, charsToCEList
, ceToCharsStartingWith
, debug
);
1093 //infoln("\t%d\t%S", minLength, debug.getTerminatedBuffer());
1101 //----------------------------------------------------------------------------------------
1103 // Random Numbers. Similar to standard lib rand() and srand()
1104 // Not using library to
1105 // 1. Get same results on all platforms.
1106 // 2. Get access to current seed, to more easily reproduce failures.
1108 //---------------------------------------------------------------------------------------
1109 static uint32_t m_seed
= 1;
1111 static uint32_t m_rand()
1113 m_seed
= m_seed
* 1103515245 + 12345;
1114 return (uint32_t)(m_seed
/65536) % 32768;
1120 virtual void append(UnicodeString
&test
, UnicodeString
&alternate
) = 0;
1137 class SetMonkey
: public Monkey
1140 SetMonkey(const USet
*theSet
);
1143 virtual void append(UnicodeString
&test
, UnicodeString
&alternate
);
1149 SetMonkey::SetMonkey(const USet
*theSet
)
1150 : Monkey(), set(theSet
)
1155 SetMonkey::~SetMonkey()
1160 void SetMonkey::append(UnicodeString
&test
, UnicodeString
&alternate
)
1162 int32_t size
= uset_size(set
);
1163 int32_t index
= m_rand() % size
;
1164 UChar32 ch
= uset_charAt(set
, index
);
1165 UnicodeString
str(ch
);
1168 alternate
.append(str
); // flip case, or some junk?
1171 class StringSetMonkey
: public Monkey
1174 StringSetMonkey(const USet
*theSet
, UCollator
*theCollator
, StringToCEsMap
*theCharsToCEList
, CEToStringsMap
*theCeToCharsStartingWith
);
1177 void append(UnicodeString
&testCase
, UnicodeString
&alternate
);
1180 UnicodeString
&generateAlternative(const UnicodeString
&testCase
, UnicodeString
&alternate
);
1184 StringToCEsMap
*charsToCEList
;
1185 CEToStringsMap
*ceToCharsStartingWith
;
1188 StringSetMonkey::StringSetMonkey(const USet
*theSet
, UCollator
*theCollator
, StringToCEsMap
*theCharsToCEList
, CEToStringsMap
*theCeToCharsStartingWith
)
1189 : Monkey(), set(theSet
), coll(theCollator
), charsToCEList(theCharsToCEList
), ceToCharsStartingWith(theCeToCharsStartingWith
)
1194 StringSetMonkey::~StringSetMonkey()
1199 void StringSetMonkey::append(UnicodeString
&testCase
, UnicodeString
&alternate
)
1201 int32_t itemCount
= uset_getItemCount(set
), len
= 0;
1202 int32_t index
= m_rand() % itemCount
;
1203 UChar32 rangeStart
= 0, rangeEnd
= 0;
1205 UErrorCode err
= U_ZERO_ERROR
;
1207 len
= uset_getItem(set
, index
, &rangeStart
, &rangeEnd
, buffer
, 16, &err
);
1210 int32_t offset
= m_rand() % (rangeEnd
- rangeStart
+ 1);
1211 UChar32 ch
= rangeStart
+ offset
;
1212 UnicodeString
str(ch
);
1214 testCase
.append(str
);
1215 generateAlternative(str
, alternate
);
1216 } else if (len
> 0) {
1217 // should check that len < 16...
1218 UnicodeString
str(buffer
, len
);
1220 testCase
.append(str
);
1221 generateAlternative(str
, alternate
);
1223 // shouldn't happen...
1227 UnicodeString
&StringSetMonkey::generateAlternative(const UnicodeString
&testCase
, UnicodeString
&alternate
)
1229 // find out shortest string for the longest sequence of ces.
1230 // needs to be refined to use dynamic programming, but will be roughly right
1231 CEList
ceList(coll
, testCase
);
1235 if (ceList
.size() == 0) {
1236 return alternate
.append(testCase
);
1239 while (offset
< ceList
.size()) {
1240 int32_t ce
= ceList
.get(offset
);
1241 const StringList
*strings
= ceToCharsStartingWith
->getStringList(ce
);
1243 if (strings
== NULL
) {
1244 return alternate
.append(testCase
);
1247 int32_t stringCount
= strings
->size();
1250 // find random string that generates the same CEList
1251 const CEList
*ceList2
;
1252 const UnicodeString
*string
;
1255 int32_t s
= m_rand() % stringCount
;
1257 if (tries
++ > stringCount
) {
1258 alternate
.append(testCase
);
1262 string
= strings
->get(s
);
1263 ceList2
= charsToCEList
->get(string
);
1264 } while (! ceList
.matchesAt(offset
, ceList2
));
1266 alt
.append(*string
);
1267 offset
+= ceList2
->size();
1270 const CEList
altCEs(coll
, alt
);
1272 if (ceList
.matchesAt(0, &altCEs
)) {
1273 return alternate
.append(alt
);
1276 return alternate
.append(testCase
);
1279 static void generateTestCase(UCollator
*coll
, Monkey
*monkeys
[], int32_t monkeyCount
, UnicodeString
&testCase
, UnicodeString
&alternate
)
1281 int32_t pieces
= (m_rand() % 4) + 1;
1287 monkeys
[0]->append(testCase
, alternate
);
1289 for(int32_t piece
= 0; piece
< pieces
; piece
+= 1) {
1290 int32_t monkey
= m_rand() % monkeyCount
;
1292 monkeys
[monkey
]->append(testCase
, alternate
);
1295 const CEList
ceTest(coll
, testCase
);
1296 const CEList
ceAlt(coll
, alternate
);
1298 matches
= ceTest
.matchesAt(0, &ceAlt
);
1299 } while (! matches
);
1302 static inline USet
*uset_openEmpty()
1304 return uset_open(1, 0);
1308 // Find the next acceptable boundary following the specified starting index
1309 // in the target text being searched.
1310 // TODO: refine what is an acceptable boundary. For the moment,
1311 // choose the next position not within a combining sequence.
1313 static int32_t nextBoundaryAfter(const UnicodeString
&string
, int32_t startIndex
) {
1314 const UChar
*text
= string
.getBuffer();
1315 int32_t textLen
= string
.length();
1317 if (startIndex
>= textLen
) {
1322 int32_t i
= startIndex
;
1324 U16_NEXT(text
, i
, textLen
, c
);
1326 // If we are on a control character, stop without looking for combining marks.
1327 // Control characters do not combine.
1328 int32_t gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
1329 if (gcProperty
==U_GCB_CONTROL
|| gcProperty
==U_GCB_LF
|| gcProperty
==U_GCB_CR
) {
1333 // The initial character was not a control, and can thus accept trailing
1334 // combining characters. Advance over however many of them there are.
1335 int32_t indexOfLastCharChecked
;
1338 indexOfLastCharChecked
= i
;
1344 U16_NEXT(text
, i
, textLen
, c
);
1345 gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
1347 if (gcProperty
!= U_GCB_EXTEND
&& gcProperty
!= U_GCB_SPACING_MARK
) {
1352 return indexOfLastCharChecked
;
1355 static UBool
isInCombiningSequence(const UnicodeString
&string
, int32_t index
) {
1356 const UChar
*text
= string
.getBuffer();
1357 int32_t textLen
= string
.length();
1359 if (index
>=textLen
|| index
<=0) {
1363 // If the character at the current index is not a GRAPHEME_EXTEND
1364 // then we can not be within a combining sequence.
1366 U16_GET(text
, 0, index
, textLen
, c
);
1367 int32_t gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
1368 if (gcProperty
!= U_GCB_EXTEND
&& gcProperty
!= U_GCB_SPACING_MARK
) {
1372 // We are at a combining mark. If the preceding character is anything
1373 // except a CONTROL, CR or LF, we are in a combining sequence.
1374 U16_PREV(text
, 0, index
, c
);
1375 gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
1377 return !(gcProperty
==U_GCB_CONTROL
|| gcProperty
==U_GCB_LF
|| gcProperty
==U_GCB_CR
);
1380 static UBool
simpleSearch(UCollator
*coll
, const UnicodeString
&target
, int32_t offset
, const UnicodeString
&pattern
, int32_t &matchStart
, int32_t &matchEnd
)
1382 UErrorCode status
= U_ZERO_ERROR
;
1383 OrderList
targetOrders(coll
, target
, offset
);
1384 OrderList
patternOrders(coll
, pattern
);
1385 int32_t targetSize
= targetOrders
.size() - 1;
1386 int32_t patternSize
= patternOrders
.size() - 1;
1387 UBreakIterator
*charBreakIterator
= ubrk_open(UBRK_CHARACTER
, ucol_getLocale(coll
, ULOC_VALID_LOCALE
, &status
),
1388 target
.getBuffer(), target
.length(), &status
);
1390 if (patternSize
== 0) {
1391 matchStart
= matchEnd
= 0;
1395 matchStart
= matchEnd
= -1;
1397 for(int32_t i
= 0; i
< targetSize
; i
+= 1) {
1398 if (targetOrders
.matchesAt(i
, patternOrders
)) {
1399 int32_t start
= targetOrders
.getLowOffset(i
);
1400 int32_t maxLimit
= targetOrders
.getLowOffset(i
+ patternSize
);
1401 int32_t minLimit
= targetOrders
.getLowOffset(i
+ patternSize
- 1);
1403 // if the low and high offsets of the first CE in
1404 // the match are the same, it means that the match
1405 // starts in the middle of an expansion - all but
1406 // the first CE of the expansion will have the offset
1407 // of the following character.
1408 if (start
== targetOrders
.getHighOffset(i
)) {
1412 // Make sure match starts on a grapheme boundary
1413 if (! ubrk_isBoundary(charBreakIterator
, start
)) {
1417 // If the low and high offsets of the CE after the match
1418 // are the same, it means that the match ends in the middle
1419 // of an expansion sequence.
1420 if (maxLimit
== targetOrders
.getHighOffset(i
+ patternSize
) &&
1421 targetOrders
.getOrder(i
+ patternSize
) != UCOL_NULLORDER
) {
1425 int32_t mend
= maxLimit
;
1427 // Find the first grapheme break after the character index
1428 // of the last CE in the match. If it's after character index
1429 // that's after the last CE in the match, use that index
1430 // as the end of the match.
1431 if (minLimit
< maxLimit
) {
1432 int32_t nba
= ubrk_following(charBreakIterator
, minLimit
);
1434 if (nba
>= targetOrders
.getHighOffset(i
+ patternSize
- 1)) {
1439 if (mend
> maxLimit
) {
1443 if (! ubrk_isBoundary(charBreakIterator
, mend
)) {
1450 ubrk_close(charBreakIterator
);
1455 ubrk_close(charBreakIterator
);
1459 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1460 static int32_t getIntParam(UnicodeString name
, UnicodeString
¶ms
, int32_t defaultVal
) {
1461 int32_t val
= defaultVal
;
1463 name
.append(" *= *(-?\\d+)");
1465 UErrorCode status
= U_ZERO_ERROR
;
1466 RegexMatcher
m(name
, params
, 0, status
);
1469 // The param exists. Convert the string to an int.
1470 char valString
[100];
1471 int32_t paramLength
= m
.end(1, status
) - m
.start(1, status
);
1473 if (paramLength
>= (int32_t)(sizeof(valString
)-1)) {
1474 paramLength
= (int32_t)(sizeof(valString
)-2);
1477 params
.extract(m
.start(1, status
), paramLength
, valString
, sizeof(valString
));
1478 val
= strtol(valString
, NULL
, 10);
1480 // Delete this parameter from the params string.
1482 params
= m
.replaceFirst("", status
);
1485 //U_ASSERT(U_SUCCESS(status));
1486 if (! U_SUCCESS(status
)) {
1494 #if !UCONFIG_NO_COLLATION
1495 int32_t SSearchTest::monkeyTestCase(UCollator
*coll
, const UnicodeString
&testCase
, const UnicodeString
&pattern
, const UnicodeString
&altPattern
,
1496 const char *name
, const char *strength
, uint32_t seed
)
1498 UErrorCode status
= U_ZERO_ERROR
;
1499 int32_t actualStart
= -1, actualEnd
= -1;
1500 //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length();
1501 int32_t expectedStart
= -1, expectedEnd
= -1;
1502 int32_t notFoundCount
= 0;
1503 UStringSearch
*uss
= usearch_openFromCollator(pattern
.getBuffer(), pattern
.length(),
1504 testCase
.getBuffer(), testCase
.length(),
1506 NULL
, // the break iterator
1509 // **** TODO: find *all* matches, not just first one ****
1510 simpleSearch(coll
, testCase
, 0, pattern
, expectedStart
, expectedEnd
);
1513 usearch_search(uss
, 0, &actualStart
, &actualEnd
, &status
);
1515 actualStart
= usearch_next(uss
, &status
);
1516 actualEnd
= actualStart
+ usearch_getMatchedLength(uss
);
1519 if (actualStart
!= expectedStart
|| actualEnd
!= expectedEnd
) {
1520 errln("Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
1521 " strength=%s seed=%d",
1522 name
, expectedStart
, expectedEnd
, actualStart
, actualEnd
, strength
, seed
);
1525 if (expectedStart
== -1 && actualStart
== -1) {
1529 // **** TODO: find *all* matches, not just first one ****
1530 simpleSearch(coll
, testCase
, 0, altPattern
, expectedStart
, expectedEnd
);
1532 usearch_setPattern(uss
, altPattern
.getBuffer(), altPattern
.length(), &status
);
1535 usearch_search(uss
, 0, &actualStart
, &actualEnd
, &status
);
1538 actualStart
= usearch_next(uss
, &status
);
1539 actualEnd
= actualStart
+ usearch_getMatchedLength(uss
);
1542 if (actualStart
!= expectedStart
|| actualEnd
!= expectedEnd
) {
1543 errln("Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
1544 " strength=%s seed=%d",
1545 name
, expectedStart
, expectedEnd
, actualStart
, actualEnd
, strength
, seed
);
1548 if (expectedStart
== -1 && actualStart
== -1) {
1554 return notFoundCount
;
1558 void SSearchTest::monkeyTest(char *params
)
1561 UErrorCode status
= U_ZERO_ERROR
;
1562 U_STRING_DECL(test_pattern
, "[[:assigned:]-[:ideographic:]-[:hangul:]-[:c:]]", 47);
1563 U_STRING_INIT(test_pattern
, "[[:assigned:]-[:ideographic:]-[:hangul:]-[:c:]]", 47);
1564 UCollator
*coll
= ucol_open(NULL
, &status
);
1565 if (U_FAILURE(status
)) {
1566 errln("Failed to create collator in MonkeyTest!");
1569 USet
*charsToTest
= uset_openPattern(test_pattern
, 47, &status
);
1570 USet
*expansions
= uset_openEmpty();
1571 USet
*contractions
= uset_openEmpty();
1572 StringToCEsMap
*charsToCEList
= new StringToCEsMap();
1573 CEToStringsMap
*ceToCharsStartingWith
= new CEToStringsMap();
1575 ucol_getContractionsAndExpansions(coll
, contractions
, expansions
, FALSE
, &status
);
1577 uset_addAll(charsToTest
, contractions
);
1578 uset_addAll(charsToTest
, expansions
);
1580 // TODO: set strength to UCOL_PRIMARY, change CEList to use strength?
1581 buildData(coll
, charsToTest
, charsToCEList
, ceToCharsStartingWith
);
1583 U_STRING_DECL(letter_pattern
, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
1584 U_STRING_INIT(letter_pattern
, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
1585 USet
*letters
= uset_openPattern(letter_pattern
, 39, &status
);
1586 SetMonkey
letterMonkey(letters
);
1587 StringSetMonkey
contractionMonkey(contractions
, coll
, charsToCEList
, ceToCharsStartingWith
);
1588 StringSetMonkey
expansionMonkey(expansions
, coll
, charsToCEList
, ceToCharsStartingWith
);
1589 UnicodeString testCase
;
1590 UnicodeString alternate
;
1591 UnicodeString pattern
, altPattern
;
1592 UnicodeString prefix
, altPrefix
;
1593 UnicodeString suffix
, altSuffix
;
1595 Monkey
*monkeys
[] = {
1605 int32_t monkeyCount
= sizeof(monkeys
) / sizeof(monkeys
[0]);
1606 int32_t nonMatchCount
= 0;
1608 UCollationStrength strengths
[] = {UCOL_PRIMARY
, UCOL_SECONDARY
, UCOL_TERTIARY
};
1609 const char *strengthNames
[] = {"primary", "secondary", "tertiary"};
1610 int32_t strengthCount
= sizeof(strengths
) / sizeof(strengths
[0]);
1611 int32_t loopCount
= quick
? 1000 : 10000;
1612 int32_t firstStrength
= 0;
1613 int32_t lastStrength
= strengthCount
- 1;
1615 if (params
!= NULL
) {
1616 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1617 UnicodeString
p(params
);
1619 loopCount
= getIntParam("loop", p
, loopCount
);
1620 m_seed
= getIntParam("seed", p
, m_seed
);
1622 RegexMatcher
m(" *strength *= *(primary|secondary|tertiary) *", p
, 0, status
);
1624 UnicodeString breakType
= m
.group(1, status
);
1626 for (int32_t s
= 0; s
< strengthCount
; s
+= 1) {
1627 if (breakType
== strengthNames
[s
]) {
1628 firstStrength
= lastStrength
= s
;
1634 p
= m
.replaceFirst("", status
);
1637 if (RegexMatcher("\\S", p
, 0, status
).find()) {
1638 // Each option is stripped out of the option string as it is processed.
1639 // All options have been checked. The option string should have been completely emptied..
1641 p
.extract(buf
, sizeof(buf
), NULL
, status
);
1642 buf
[sizeof(buf
)-1] = 0;
1643 errln("Unrecognized or extra parameter: %s\n", buf
);
1647 infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
1651 for(int32_t s
= firstStrength
; s
<= lastStrength
; s
+= 1) {
1652 int32_t notFoundCount
= 0;
1654 ucol_setStrength(coll
, strengths
[s
]);
1656 // TODO: try alternate prefix and suffix too?
1657 // TODO: alterntaes are only equal at primary strength. Is this OK?
1658 for(int32_t t
= 0; t
< 10000; t
+= 1) {
1659 uint32_t seed
= m_seed
;
1662 generateTestCase(coll
, monkeys
, monkeyCount
, pattern
, altPattern
);
1663 generateTestCase(coll
, monkeys
, monkeyCount
, prefix
, altPrefix
);
1664 generateTestCase(coll
, monkeys
, monkeyCount
, suffix
, altSuffix
);
1667 notFoundCount
+= monkeyTestCase(coll
, pattern
, pattern
, altPattern
, "pattern", strengthNames
[s
], seed
);
1670 testCase
.append(prefix
);
1671 testCase
.append(/*alt*/pattern
);
1674 notFoundCount
+= monkeyTestCase(coll
, testCase
, pattern
, altPattern
, "prefix + pattern", strengthNames
[s
], seed
);
1676 testCase
.append(suffix
);
1678 // prefix + pattern + suffix
1679 notFoundCount
+= monkeyTestCase(coll
, testCase
, pattern
, altPattern
, "prefix + pattern + suffix", strengthNames
[s
], seed
);
1682 testCase
.append(pattern
);
1683 testCase
.append(suffix
);
1686 notFoundCount
+= monkeyTestCase(coll
, testCase
, pattern
, altPattern
, "pattern + suffix", strengthNames
[s
], seed
);
1689 logln("For strength %s the not found count is %d.", strengthNames
[s
], notFoundCount
);
1692 delete ceToCharsStartingWith
;
1693 delete charsToCEList
;
1695 uset_close(contractions
);
1696 uset_close(expansions
);
1697 uset_close(charsToTest
);
1698 uset_close(letters
);