/*
**********************************************************************
- * Copyright (C) 2005-2008, International Business Machines
+ * Copyright (C) 2005-2010, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
#include "intltest.h"
#include "ssearch.h"
+#include "unicode/colldata.h"
+#include "unicode/bmsearch.h"
+#include "unicode/bms.h"
+
#include "xmlparser.h"
+#include "ucbuf.h"
#include <stdlib.h>
#include <string.h>
errln("Failure in file %s, line %d. \"%s\"", __FILE__, __LINE__, m);return;}}
#define TEST_ASSERT_SUCCESS(errcode) {if (U_FAILURE(errcode)) { \
- errln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \
+ dataerrln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \
__FILE__, __LINE__, testId, u_errorName(errcode));}}
#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
+#define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
+#define DELETE_ARRAY(array) uprv_free((void *) (array))
//---------------------------------------------------------------------------
//
case 2: name = "monkeyTest";
if (exec) monkeyTest(params);
break;
+
+ case 3: name = "bmMonkeyTest";
+ if (exec) bmMonkeyTest(params);
+ break;
+
+ case 4: name = "boyerMooreTest";
+ if (exec) boyerMooreTest();
+ break;
+
+ case 5: name = "goodSuffixTest";
+ if (exec) goodSuffixTest();
+ break;
+
+ case 6: name = "searchTime";
+ if (exec) searchTime();
+ break;
+
+ case 7: name = "bmsTest";
+ if (exec) bmsTest();
+ break;
+
+ case 8: name = "bmSearchTest";
+ if (exec) bmSearchTest();
+ break;
+
+ case 9: name = "udhrTest";
+ if (exec) udhrTest();
+ break;
+ case 10: name = "stringListTest";
+ if (exec) stringListTest();
+ break;
#endif
default: name = "";
break; //needed to end loop
void SSearchTest::searchTest()
{
-#if !UCONFIG_NO_REGULAR_EXPRESSIONS
+#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
UErrorCode status = U_ZERO_ERROR;
char path[PATH_BUFFER_SIZE];
const char *testFilePath = getPath(path, "ssearch.xml");
return; /* Couldn't get path: error message already output. */
}
- UXMLParser *parser = UXMLParser::createParser(status);
+ LocalPointer<UXMLParser> parser(UXMLParser::createParser(status));
TEST_ASSERT_SUCCESS(status);
- UXMLElement *root = parser->parseFile(testFilePath, status);
+ LocalPointer<UXMLElement> root(parser->parseFile(testFilePath, status));
TEST_ASSERT_SUCCESS(status);
if (U_FAILURE(status)) {
return;
// Default is tertiary if the XML attribute is missing from the test case.
//
const UnicodeString *strength = testCase->getAttribute("strength");
- UColAttributeValue collatorStrength;
+ UColAttributeValue collatorStrength = UCOL_PRIMARY;
if (strength==NULL) { collatorStrength = UCOL_TERTIARY;}
else if (*strength=="PRIMARY") { collatorStrength = UCOL_PRIMARY;}
else if (*strength=="SECONDARY") { collatorStrength = UCOL_SECONDARY;}
normalize = UCOL_ON;
}
+ //
+ // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE.
+ //
+ UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE;
+ const UnicodeString *alt = testCase->getAttribute("alternate_handling");
+ TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE");
+ if (alt != NULL && *alt == "SHIFTED") {
+ alternateHandling = UCOL_SHIFTED;
+ }
+
const UnicodeString defLocale("en");
char clocale[100];
const UnicodeString *locale = testCase->getAttribute("locale");
int32_t expectedMatchStart = -1;
int32_t expectedMatchLimit = -1;
const UXMLElement *n;
- int nodeCount = 0;
+ int32_t nodeCount = 0;
n = testCase->getChildElement("pattern");
TEST_ASSERT(n != NULL);
target.append(text);
nodeCount++;
}
-
+
n = testCase->getChildElement("m");
if (n!=NULL) {
expectedMatchStart = target.length();
// Check that there weren't extra things in the XML
TEST_ASSERT(nodeCount == testCase->countChildren());
- // Open a collotor and StringSearch based on the parameters
+ // Open a collator and StringSearch based on the parameters
// obtained from the XML.
//
status = U_ZERO_ERROR;
- UCollator *collator = ucol_open(clocale, &status);
- ucol_setStrength(collator, collatorStrength);
- ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, normalize, &status);
- UStringSearch *uss = usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
- target.getBuffer(), target.length(),
- collator,
- NULL, // the break iterator
- &status);
-
+ LocalUCollatorPointer collator(ucol_open(clocale, &status));
+ ucol_setStrength(collator.getAlias(), collatorStrength);
+ ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status);
+ ucol_setAttribute(collator.getAlias(), UCOL_ALTERNATE_HANDLING, alternateHandling, &status);
+ LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
+ target.getBuffer(), target.length(),
+ collator.getAlias(),
+ NULL, // the break iterator
+ &status));
+
TEST_ASSERT_SUCCESS(status);
if (U_FAILURE(status)) {
- usearch_close(uss);
- ucol_close(collator);
continue;
}
//
// Do the search, check the match result against the expected results.
//
- foundMatch= usearch_search(uss, 0, &foundStart, &foundLimit, &status);
+ foundMatch= usearch_search(uss.getAlias(), 0, &foundStart, &foundLimit, &status);
TEST_ASSERT_SUCCESS(status);
- if (foundMatch && expectedMatchStart<0 ||
- foundStart != expectedMatchStart ||
- foundLimit != expectedMatchLimit) {
+ if ((foundMatch && expectedMatchStart<0) ||
+ (foundStart != expectedMatchStart) ||
+ (foundLimit != expectedMatchLimit)) {
TEST_ASSERT(FALSE); // ouput generic error position
infoln("Found, expected match start = %d, %d \n"
"Found, expected match limit = %d, %d",
expectedMatchStart = foundStart;
expectedMatchLimit = foundLimit;
- foundMatch = usearch_search(uss, foundLimit, &foundStart, &foundLimit, &status);
+ foundMatch = usearch_search(uss.getAlias(), foundLimit, &foundStart, &foundLimit, &status);
}
- usearch_close(uss);
-
- uss = usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
+ uss.adoptInstead(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
target.getBuffer(), target.length(),
- collator,
+ collator.getAlias(),
NULL,
- &status);
+ &status));
//
// Do the backwards search, check the match result against the expected results.
//
- foundMatch= usearch_searchBackwards(uss, target.length(), &foundStart, &foundLimit, &status);
+ foundMatch= usearch_searchBackwards(uss.getAlias(), target.length(), &foundStart, &foundLimit, &status);
TEST_ASSERT_SUCCESS(status);
- if (foundMatch && expectedMatchStart<0 ||
- foundStart != expectedMatchStart ||
- foundLimit != expectedMatchLimit) {
+ if ((foundMatch && expectedMatchStart<0) ||
+ (foundStart != expectedMatchStart) ||
+ (foundLimit != expectedMatchLimit)) {
TEST_ASSERT(FALSE); // ouput generic error position
infoln("Found, expected backwards match start = %d, %d \n"
"Found, expected backwards match limit = %d, %d",
foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
}
+ }
+#endif
+}
+
+struct UdhrTestCase  // One udhrTest() input: a locale paired with the sample text file searched under it.
+{
+ const char *locale;  // locale name passed to ucol_open()
+ const char *file;  // file name resolved against the path returned by getPath(path, "udhr")
+};
+
+// Exercises the BMS search API against the UDHR sample texts.
+//
+// For every (locale, file) pair the first line of the file is used as the
+// pattern and the whole file as the target.  bms_search() is called
+// repeatedly, and an error is reported if no match ever moves the search
+// offset past 0.  A file that cannot be opened is skipped with an
+// informational message.
+void SSearchTest::udhrTest()
+{
+    UErrorCode status = U_ZERO_ERROR;
+    char path[PATH_BUFFER_SIZE];
+    const char *udhrPath = getPath(path, "udhr");
+
+    if (udhrPath == NULL) {
+        // couldn't get path: error message already output...
+        return;
+    }
+
+    UdhrTestCase testCases[] = {
+        {"en", "udhr_eng.txt"},
+        {"de", "udhr_deu_1996.txt"},
+        {"fr", "udhr_fra.txt"},
+        {"ru", "udhr_rus.txt"},
+        {"th", "udhr_tha.txt"},
+        {"ja", "udhr_jpn.txt"},
+        {"ko", "udhr_kor.txt"},
+        {"zh", "udhr_cmn_hans.txt"},
+        {"zh_Hant", "udhr_cmn_hant.txt"}
+    };
+
+    int32_t testCount = ARRAY_SIZE(testCases);
+
+    for (int32_t t = 0; t < testCount; t += 1) {
+        // Start every file with a clean status.  Several paths below leave the
+        // loop iteration with a failing status, and ICU functions generally
+        // return without doing anything when handed one, which would cause
+        // every remaining file to be skipped as well.
+        status = U_ZERO_ERROR;
+
+        int32_t len = 0;
+        char *resolvedFileName = NULL;
+        const char *encoding = NULL;
+        UCHARBUF *ucharBuf = NULL;
+
+        // First call (NULL target) is only used to obtain the required length.
+        ucbuf_resolveFileName(udhrPath, testCases[t].file, NULL, &len, &status);
+        resolvedFileName = NEW_ARRAY(char, len);
+
+        if(resolvedFileName == NULL){
+            continue;
+        }
+
+        if(status == U_BUFFER_OVERFLOW_ERROR){
+            status = U_ZERO_ERROR;
+        }
+
+        ucbuf_resolveFileName(udhrPath, testCases[t].file, resolvedFileName, &len, &status);
+        ucharBuf = ucbuf_open(resolvedFileName, &encoding, TRUE, FALSE, &status);
+
+        DELETE_ARRAY(resolvedFileName);
+
+        if(U_FAILURE(status)){
+            infoln("Could not open the input file %s. Test skipped\n", testCases[t].file);
+            continue;
+        }
+
+        int32_t targetLen = 0;
+        const UChar *target = ucbuf_getBuffer(ucharBuf, &targetLen, &status);
+
+        // Do not scan or search a buffer we failed to obtain.
+        if (U_FAILURE(status) || target == NULL) {
+            errln("Could not read the input file %s", testCases[t].file);
+            ucbuf_close(ucharBuf);
+            continue;
+        }
+
+        /* The first line of the file contains the pattern */
+        int32_t start = 0, end = 0, plen = 0;
+
+        // The scan also stops at the end of the buffer, so a file without any
+        // line terminator cannot make it read past targetLen.
+        for(end = start; end < targetLen; end += 1) {
+            UChar ch = target[end];
+
+            if (ch == 0x000A || ch == 0x000D || ch == 0x2028) {
+                break;
+            }
+        }
+
+        plen = end - start;
+
+        UChar *pattern = NEW_ARRAY(UChar, plen);
+
+        if (pattern == NULL) {
+            errln("Could not allocate the pattern buffer for %s", testCases[t].file);
+            ucbuf_close(ucharBuf);
+            continue;
+        }
+
+        for (int32_t i = 0; i < plen; i += 1) {
+            pattern[i] = target[start++];
+        }
+
+        int32_t offset = 0;
+        UCollator *coll = ucol_open(testCases[t].locale, &status);
+        UCD *ucd = NULL;
+        BMS *bms = NULL;
+
+        if (U_FAILURE(status)) {
+            errln("Could not open collator for %s", testCases[t].locale);
+            goto delete_collator;
+        }
+
+        ucd = ucd_open(coll, &status);
+
+        if (U_FAILURE(status)) {
+            errln("Could not open CollData object for %s", testCases[t].locale);
+            goto delete_ucd;
+        }
+
+        bms = bms_open(ucd, pattern, plen, target, targetLen, &status);
+
+        if (U_FAILURE(status)) {
+            errln("Could not open search object for %s", testCases[t].locale);
+            goto delete_bms;
+        }
+
+        // Walk through all matches; offset ends up at the limit of the last one.
+        start = end = -1;
+        while (bms_search(bms, offset, &start, &end)) {
+            offset = end;
+        }
+
+        if (offset == 0) {
+            errln("Could not find pattern - locale: %s, file: %s ", testCases[t].locale, testCases[t].file);
+        }
+
+delete_bms:
+        bms_close(bms);
+
+delete_ucd:
+        ucd_close(ucd);
+
+delete_collator:
+        ucol_close(coll);
+
+        DELETE_ARRAY(pattern);
+        ucbuf_close(ucharBuf);
+    }
+
+    ucd_flushCache();
+}
+
+// Data-driven test of the BMS search API.
+//
+// Test cases are read from ssearch.xml.  Each <test-case> element supplies the
+// collator settings (strength, norm, alternate_handling, locale), a <pattern>
+// child, and a target assembled from the optional <pre>, <m> and <post>
+// children.  The first match reported by bms_search() is compared against the
+// span occupied by <m>; the expected start/limit stay at -1 when <m> is absent.
+void SSearchTest::bmSearchTest()
+{
+// UXMLParser reads the test file, so use the same guard as searchTest().
+#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
+    UErrorCode status = U_ZERO_ERROR;
+    char path[PATH_BUFFER_SIZE];
+    const char *testFilePath = getPath(path, "ssearch.xml");
+
+    if (testFilePath == NULL) {
+        return; /* Couldn't get path: error message already output. */
+    }
+
+    UXMLParser *parser = UXMLParser::createParser(status);
+    TEST_ASSERT_SUCCESS(status);
+    if (U_FAILURE(status)) {
+        // Do not call through a parser that failed to be created.
+        delete parser;
+        return;
+    }
+
+    UXMLElement *root = parser->parseFile(testFilePath, status);
+    TEST_ASSERT_SUCCESS(status);
+    if (U_FAILURE(status)) {
+        // Release what was allocated so far instead of leaking it.
+        delete root;
+        delete parser;
+        return;
+    }
+
+    const UnicodeString *debugTestCase = root->getAttribute("debug");
+    if (debugTestCase != NULL) {
+//       setenv("USEARCH_DEBUG", "1", 1);
+    }
+
+
+    const UXMLElement *testCase;
+    int32_t tc = 0;
+
+    while((testCase = root->nextChildElement(tc)) != NULL) {
+
+        if (testCase->getTagName().compare("test-case") != 0) {
+            errln("ssearch, unrecognized XML Element in test file");
+            continue;
+        }
+        const UnicodeString *id = testCase->getAttribute("id");
+        *testId = 0;
+        if (id != NULL) {
+            id->extract(0, id->length(), testId, sizeof(testId), US_INV);
+        }
+
+        // If debugging test case has been specified and this is not it, skip to next.
+        if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) {
+            continue;
+        }
+        //
+        //  Get the requested collation strength.
+        //    Default is tertiary if the XML attribute is missing from the test case.
+        //
+        const UnicodeString *strength = testCase->getAttribute("strength");
+        UColAttributeValue collatorStrength = UCOL_PRIMARY;
+        if      (strength==NULL)          { collatorStrength = UCOL_TERTIARY;}
+        else if (*strength=="PRIMARY")    { collatorStrength = UCOL_PRIMARY;}
+        else if (*strength=="SECONDARY")  { collatorStrength = UCOL_SECONDARY;}
+        else if (*strength=="TERTIARY")   { collatorStrength = UCOL_TERTIARY;}
+        else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;}
+        else if (*strength=="IDENTICAL")  { collatorStrength = UCOL_IDENTICAL;}
+        else {
+            // Bogus value supplied for strength.  Shouldn't happen, even from
+            //  typos, if the  XML source has been validated.
+            //  This assert is a little deceiving in that strength can be
+            //   any of the allowed values, not just TERTIARY, but it will
+            //   do the job of getting the error output.
+            TEST_ASSERT(*strength=="TERTIARY")
+        }
+
+        //
+        // Get the collator normalization flag.  Default is UCOL_OFF.
+        //
+        UColAttributeValue normalize = UCOL_OFF;
+        const UnicodeString *norm = testCase->getAttribute("norm");
+        TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF");
+        if (norm!=NULL && *norm=="ON") {
+            normalize = UCOL_ON;
+        }
+
+        //
+        // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE.
+        //
+        UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE;
+        const UnicodeString *alt = testCase->getAttribute("alternate_handling");
+        TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE");
+        if (alt != NULL && *alt == "SHIFTED") {
+            alternateHandling = UCOL_SHIFTED;
+        }
+
+        const UnicodeString defLocale("en");
+        char  clocale[100];
+        const UnicodeString *locale = testCase->getAttribute("locale");
+        if (locale == NULL || locale->length()==0) {
+            locale = &defLocale;
+        };
+        locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL);
+
+
+        UnicodeString text;
+        UnicodeString target;
+        UnicodeString pattern;
+        int32_t expectedMatchStart = -1;
+        int32_t expectedMatchLimit = -1;
+        const UXMLElement *n;
+        int32_t nodeCount = 0;
+
+        n = testCase->getChildElement("pattern");
+        TEST_ASSERT(n != NULL);
+        if (n==NULL) {
+            continue;
+        }
+        text = n->getText(FALSE);
+        text = text.unescape();
+        pattern.append(text);
+        nodeCount++;
+
+        n = testCase->getChildElement("pre");
+        if (n!=NULL) {
+            text = n->getText(FALSE);
+            text = text.unescape();
+            target.append(text);
+            nodeCount++;
+        }
+
+        // The <m> text marks the expected match: it starts where the target
+        // currently ends and its limit is the target length after appending it.
+        n = testCase->getChildElement("m");
+        if (n!=NULL) {
+            expectedMatchStart = target.length();
+            text = n->getText(FALSE);
+            text = text.unescape();
+            target.append(text);
+            expectedMatchLimit = target.length();
+            nodeCount++;
+        }
+
+        n = testCase->getChildElement("post");
+        if (n!=NULL) {
+            text = n->getText(FALSE);
+            text = text.unescape();
+            target.append(text);
+            nodeCount++;
+        }
+
+        //  Check that there weren't extra things in the XML
+        TEST_ASSERT(nodeCount == testCase->countChildren());
+
+        // Open a collator and StringSearch based on the parameters
+        //   obtained from the XML.
+        //
+        status = U_ZERO_ERROR;
+        UCollator *collator = ucol_open(clocale, &status);
+        ucol_setStrength(collator, collatorStrength);
+        ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, normalize, &status);
+        ucol_setAttribute(collator, UCOL_ALTERNATE_HANDLING, alternateHandling, &status);
+        UCD *ucd = ucd_open(collator, &status);
+        BMS *bms = bms_open(ucd, pattern.getBuffer(), pattern.length(), target.getBuffer(), target.length(), &status);
+
+        TEST_ASSERT_SUCCESS(status);
+        if (U_FAILURE(status)) {
+            bms_close(bms);
+            ucd_close(ucd);
+            ucol_close(collator);
+            continue;
+        }
+
+        int32_t foundStart = 0;
+        int32_t foundLimit = 0;
+        UBool   foundMatch;
+
+        //
+        // Do the search, check the match result against the expected results.
+        //
+        foundMatch = bms_search(bms, 0, &foundStart, &foundLimit);
+        //TEST_ASSERT_SUCCESS(status);
+        if ((foundMatch && expectedMatchStart < 0) ||
+            (foundStart != expectedMatchStart)     ||
+            (foundLimit != expectedMatchLimit)) {
+                TEST_ASSERT(FALSE);   //  output generic error position
+                infoln("Found, expected match start = %d, %d \n"
+                       "Found, expected match limit = %d, %d",
+                foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
+        }
-        usearch_close(uss);
+        bms_close(bms);
+        ucd_close(ucd);
        ucol_close(collator);
    }
+    ucd_flushCache();
    delete root;
    delete parser;
#endif
};
OrderList::OrderList()
- : list(NULL), listSize(0), listMax(16)
+ : list(NULL), listMax(16), listSize(0)  // same values as the removed line, only reordered; presumably to match the member declaration order (class not visible here, confirm)
{
list = new Order[listMax];  // initial storage sized by listMax, which was set to 16 above
}
uint32_t strengthMask = 0;
int32_t order, low, high;
- switch (ucol_getStrength(coll))
+ switch (ucol_getStrength(coll))
{
default:
strengthMask |= UCOL_TERTIARYORDERMASK;
void SSearchTest::offsetTest()
{
+ static const UVersionInfo icu47 = { 4, 7, 0, 0 };
const char *test[] = {
+ // The sequence \u0FB3\u0F71\u0F71\u0F80 contains a discontiguous
+ // contraction (\u0FB3\u0F71\u0F80) logically followed by \u0F71.
+ "\\u1E33\\u0FB3\\u0F71\\u0F71\\u0F80\\uD835\\uDF6C\\u01B0",
+
"\\ua191\\u16ef\\u2036\\u017a",
#if 0
"\\u0310\\u0311\\u0312\\u0313\\u0314\\u0315\\u0316\\u0317\\u0318\\u0319\\u031A\\u031B\\u031C\\u031D\\u031E\\u031F"
"\\u0320\\u0321\\u0322\\u0323\\u0324\\u0325\\u0326\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F"
"\\u0330\\u0331\\u0332\\u0333\\u0334\\u0335\\u0336\\u0337\\u0338\\u0339\\u033A\\u033B\\u033C\\u033D\\u033E\\u033F"
- "\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\\u034A\\u034B\\u034C\\u034D\\u034E",
+ "\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\\u034A\\u034B\\u034C\\u034D\\u034E", // currently not working, see #8081
- "\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318",
+ "\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318", // currently not working, see #8081
+ "a\\u02FF\\u0301\\u0316", // currently not working, see #8081
+ "a\\u02FF\\u0316\\u0301",
+ "a\\u0430\\u0301\\u0316",
+ "a\\u0430\\u0316\\u0301",
"abc\\u0E41\\u0301\\u0316",
- "abc\\u0E41\\u0316\\u0301",
- "\\u0E41\\u0301\\u0316",
- "\\u0E41\\u0316\\u0301",
- "a\\u0301\\u0316",
- "a\\u0316\\u0301",
- "\\uAC52\\uAC53",
- "\\u34CA\\u34CB",
- "\\u11ED\\u11EE",
- "\\u30C3\\u30D0",
- "p\\u00E9ch\\u00E9",
+ "abc\\u0E41\\u0316\\u0301",
+ "\\u0E41\\u0301\\u0316",
+ "\\u0E41\\u0316\\u0301",
+ "a\\u0301\\u0316",
+ "a\\u0316\\u0301",
+ "\\uAC52\\uAC53",
+ "\\u34CA\\u34CB",
+ "\\u11ED\\u11EE",
+ "\\u30C3\\u30D0",
+ "p\\u00E9ch\\u00E9",
"a\\u0301\\u0325",
"a\\u0300\\u0325",
"a\\u0325\\u0300",
UErrorCode status = U_ZERO_ERROR;
RuleBasedCollator *col = (RuleBasedCollator *) Collator::createInstance(Locale::getEnglish(), status);
if (U_FAILURE(status)) {
- errln("Failed to create collator in offsetTest!");
+ errcheckln(status, "Failed to create collator in offsetTest! - %s", u_errorName(status));
return;
}
char buffer[4096]; // A bit of a hack... just happens to be long enough for all the test cases...
col->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status);
for(int32_t i = 0; i < testCount; i += 1) {
+ if (!isICUVersionAtLeast(icu47) && i>=4 && i<=6) {
+ continue; // timebomb until ticket #8080 is resolved
+ }
UnicodeString ts = CharsToUnicodeString(test[i]);
CollationElementIterator *iter = col->createCollationElementIterator(ts);
OrderList forwardList;
delete col;
}
-class CEList
+#if 0
+static UnicodeString &escape(const UnicodeString &string, UnicodeString &buffer)  // Appends an escaped form of 'string' to 'buffer' and returns 'buffer'.
{
-public:
- CEList(UCollator *coll, const UnicodeString &string);
- ~CEList();
+ for(int32_t i = 0; i < string.length(); i += 1) {
+ UChar32 ch = string.char32At(i);
- int32_t size() const;
- int32_t get(int32_t index) const;
- UBool matchesAt(int32_t offset, const CEList *other) const;
-
-private:
- void add(int32_t ce);
-
- int32_t *ces;
- int32_t listMax;
- int32_t listSize;
-};
-
-CEList::CEList(UCollator *coll, const UnicodeString &string)
- : ces(NULL), listMax(8), listSize(0)
-{
- UErrorCode status = U_ZERO_ERROR;
- UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
- uint32_t strengthMask = 0;
- int32_t order;
-
-#if 0
- switch (ucol_getStrength(coll))
- {
- default:
- strengthMask |= UCOL_TERTIARYORDERMASK;
- /* fall through */
-
- case UCOL_SECONDARY:
- strengthMask |= UCOL_SECONDARYORDERMASK;
- /* fall through */
-
- case UCOL_PRIMARY:
- strengthMask |= UCOL_PRIMARYORDERMASK;
- }
-#else
- strengthMask = UCOL_PRIMARYORDERMASK;
-#endif
-
- ces = new int32_t[listMax];
+ if (ch >= 0x0020 && ch <= 0x007F) {  // code points 0x20..0x7F are appended unchanged, except the backslash
+ if (ch == 0x005C) {  // a backslash is appended as two backslashes
+ buffer.append("\\\\");
+ } else {
+ buffer.append(ch);
+ }
+ } else {
+ char cbuffer[12];
- while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
- order &= strengthMask;
+ if (ch <= 0xFFFFL) {  // up to 0xFFFF: backslash-u plus four hex digits
+ sprintf(cbuffer, "\\u%4.4X", ch);
+ } else {  // above 0xFFFF: backslash-U plus eight hex digits
+ sprintf(cbuffer, "\\U%8.8X", ch);
+ }
- if (order == UCOL_IGNORABLE) {
- continue;
+ buffer.append(cbuffer);
}
- add(order);
+ if (ch >= 0x10000L) {  // code points from 0x10000 up advance the index by one extra unit
+ i += 1;
+ }
}
- ucol_closeElements(elems);
-}
-
-CEList::~CEList()
-{
- delete[] ces;
+ return buffer;
}
+#endif
-void CEList::add(int32_t ce)
-{
- if (listSize >= listMax) {
- listMax *= 2;
-
- int32_t *newCEs = new int32_t[listMax];
-
- uprv_memcpy(newCEs, ces, listSize * sizeof(int32_t));
- delete[] ces;
- ces = newCEs;
- }
-
- ces[listSize++] = ce;
-}
+#if 1
-int32_t CEList::get(int32_t index) const
+struct PCE  // One PCEList entry: a value returned by ucol_nextProcessed() plus the two offsets it reported.
{
- if (index >= 0 && index < listSize) {
- return ces[index];
- }
-
- return -1;
-}
+ uint64_t ce;  // the order value returned by ucol_nextProcessed()
+ int32_t lowOffset;  // first offset output of ucol_nextProcessed() for this entry
+ int32_t highOffset;  // second offset output of ucol_nextProcessed() for this entry
+};
-UBool CEList::matchesAt(int32_t offset, const CEList *other) const
+class PCEList  // Growable array of PCE entries collected from a string with ucol_nextProcessed().
{
- if (listSize - offset < other->size()) {
- return FALSE;
- }
+public:
+ PCEList(UCollator *coll, const UnicodeString &string);  // collects the entries for 'string' using 'coll'
+ ~PCEList();
- for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) {
- if (ces[i] != other->get(j)) {
- return FALSE;
- }
- }
+ int32_t size() const;  // number of stored entries; this count includes the final NULLORDER entry
- return TRUE;
-}
+ const PCE *get(int32_t index) const;  // entry at 'index', or NULL when index >= size()
-int32_t CEList::size() const
-{
- return listSize;
-}
+ int32_t getLowOffset(int32_t index) const;  // lowOffset of the entry, or -1 when get() returns NULL
+ int32_t getHighOffset(int32_t index) const;  // highOffset of the entry, or -1 when get() returns NULL
+ uint64_t getOrder(int32_t index) const;  // ce of the entry, or UCOL_PROCESSED_NULLORDER when get() returns NULL
-class StringList
-{
-public:
- StringList();
- ~StringList();
+ UBool matchesAt(int32_t offset, const PCEList &other) const;  // TRUE when the orders of 'other' (its NULLORDER excluded) equal this list's orders starting at 'offset'
- void add(const UnicodeString *string);
- void add(const UChar *chars, int32_t count);
- const UnicodeString *get(int32_t index) const;
- int32_t size() const;
+ uint64_t operator[](int32_t index) const;  // same result as getOrder(index)
private:
- UnicodeString *strings;
+ void add(uint64_t ce, int32_t low, int32_t high);  // appends one entry, growing the array when it is full
+
+ PCE *list;  // array allocated with new[] and released in the destructor
int32_t listMax;
int32_t listSize;
};
-StringList::StringList()
- : strings(NULL), listMax(16), listSize(0)
+// Builds the list of processed collation elements for 'string' using 'coll'.
+//
+// Every value returned by ucol_nextProcessed() is stored together with its
+// low/high offsets, including the terminating UCOL_PROCESSED_NULLORDER.
+PCEList::PCEList(UCollator *coll, const UnicodeString &string)
+    // listMax and listSize must be set before the body runs: listMax sizes the
+    // first allocation below and add() reads both on every call.
+    : list(NULL), listMax(16), listSize(0)
{
-    strings = new UnicodeString [listMax];
+    UErrorCode status = U_ZERO_ERROR;
+    UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
+    uint64_t order;
+    // Given defined values in case a call stores nothing into them.
+    int32_t low = 0, high = 0;
+
+    list = new PCE[listMax];
+
+    ucol_setOffset(elems, 0, &status);
+
+    // do/while so that the final NULLORDER is recorded as the last entry.
+    do {
+        order = ucol_nextProcessed(elems, &low, &high, &status);
+        add(order, low, high);
+    } while (order != UCOL_PROCESSED_NULLORDER);
+
+    ucol_closeElements(elems);
}
-StringList::~StringList()
+PCEList::~PCEList()  // Releases the PCE array that the constructor / add() allocated with new[].
{
- delete[] strings;
+ delete[] list;
}
-void StringList::add(const UnicodeString *string)
+// Appends one entry (order value plus its low/high offsets) to the list,
+// doubling the capacity first when the array is full.
+void PCEList::add(uint64_t order, int32_t low, int32_t high)
{
    if (listSize >= listMax) {
        listMax *= 2;
-        UnicodeString *newStrings = new UnicodeString[listMax];
+        PCE *newList = new PCE[listMax];
-        uprv_memcpy(newStrings, strings, listSize * sizeof(UnicodeString));
-
-        delete[] strings;
-        strings = newStrings;
+        // The elements being copied are PCE objects, so the byte count has to
+        // be based on sizeof(PCE), not on the unrelated Order type.
+        uprv_memcpy(newList, list, listSize * sizeof(PCE));
+        delete[] list;
+        list = newList;
    }
-    // The ctor initialized all the strings in
-    // the array to empty strings, so this
-    // is the same as copying the source string.
-    strings[listSize++].append(*string);
-}
-
-void StringList::add(const UChar *chars, int32_t count)
-{
-    const UnicodeString string(chars, count);
+    list[listSize].ce = order;
+    list[listSize].lowOffset = low;
+    list[listSize].highOffset = high;
-    add(&string);
+    listSize += 1;
}
-const UnicodeString *StringList::get(int32_t index) const
+// Returns the entry at 'index', or NULL when 'index' is outside [0, listSize).
+const PCE *PCEList::get(int32_t index) const
{
-    if (index >= 0 && index < listSize) {
-        return &strings[index];
+    // Negative indices are rejected as well, so callers such as getOrder()
+    // never read in front of the array.
+    if (index < 0 || index >= listSize) {
+        return NULL;
    }
-    return NULL;
+    return &list[index];
}
-int32_t StringList::size() const
+// Low offset of the entry at 'index'; -1 when get() has no entry for it.
+int32_t PCEList::getLowOffset(int32_t index) const
{
-    return listSize;
+    const PCE *entry = get(index);
+
+    if (entry == NULL) {
+        return -1;
+    }
+
+    return entry->lowOffset;
}
-class CEToStringsMap
+// High offset of the entry at 'index'; -1 when get() has no entry for it.
+int32_t PCEList::getHighOffset(int32_t index) const
{
-public:
+    const PCE *entry = get(index);
-    CEToStringsMap();
-    ~CEToStringsMap();
-
-    void put(int32_t ce, UnicodeString *string);
-    StringList *getStringList(int32_t ce) const;
+    if (entry == NULL) {
+        return -1;
+    }
-private:
-
-    static void deleteStringList(void *obj);
-    void putStringList(int32_t ce, StringList *stringList);
-    UHashtable *map;
-};
+    return entry->highOffset;
+}
-CEToStringsMap::CEToStringsMap()
+// Order value of the entry at 'index'; UCOL_PROCESSED_NULLORDER when get()
+// has no entry for it.
+uint64_t PCEList::getOrder(int32_t index) const
{
-    UErrorCode status = U_ZERO_ERROR;
+    const PCE *entry = get(index);
-    map = uhash_open(uhash_hashLong, uhash_compareLong,
-                     uhash_compareCaselessUnicodeString,
-                     &status);
+    if (entry == NULL) {
+        return UCOL_PROCESSED_NULLORDER;
+    }
-    uhash_setValueDeleter(map, deleteStringList);
+    return entry->ce;
}
-CEToStringsMap::~CEToStringsMap()
+int32_t PCEList::size() const  // Number of entries stored so far (listSize).
{
- uhash_close(map);
+ return listSize;
}
-void CEToStringsMap::put(int32_t ce, UnicodeString *string)
+// Reports whether the orders in 'other' occur in this list starting at 'offset'.
+UBool PCEList::matchesAt(int32_t offset, const PCEList &other) const
{
-    StringList *strings = getStringList(ce);
+    // Both sizes count the trailing NULLORDER entry, which takes no part in
+    // the comparison, hence the "- 1" on each side.
+    int32_t wanted = other.size() - 1;
+    int32_t remaining = listSize - 1 - offset;
-    if (strings == NULL) {
-        strings = new StringList();
-        putStringList(ce, strings);
+
+    if (remaining < wanted) {
+        return FALSE;
+    }
+
+    for (int32_t k = 0; k < wanted; k += 1) {
+        if (getOrder(offset + k) != other.getOrder(k)) {
+            return FALSE;
+        }
    }
-    strings->add(string);
+    return TRUE;
}
-StringList *CEToStringsMap::getStringList(int32_t ce) const
+uint64_t PCEList::operator[](int32_t index) const  // Shorthand for getOrder(index).
{
- return (StringList *) uhash_iget(map, ce);
+ return getOrder(index);
}
-void CEToStringsMap::putStringList(int32_t ce, StringList *stringList)
+// Exercises the C++ CollData / BoyerMooreSearch / CEList API.
+//
+// A collator is opened from the short string "LEN_S1", the accessors of the
+// three classes are called, and then the patterns "fuss" and "fu\u00DF" are
+// searched for in each entry of 'targets'; every target is expected to
+// contain both patterns.
+void SSearchTest::boyerMooreTest()
{
    UErrorCode status = U_ZERO_ERROR;
+    UCollator *coll = NULL;
+    CollData *data = NULL;
+    const CEList* ce = NULL;
+    const CEList* ce1 = NULL;
+    UnicodeString lp = "fuss";
+    UnicodeString sp = "fu\\u00DF";
+    BoyerMooreSearch *longPattern = NULL;
+    BoyerMooreSearch *shortPattern = NULL;
+    UnicodeString targets[] = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball",
+                               "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF",
+                               "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"};
+    int32_t start = -1, end = -1;
+
+    coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status);
+    if (U_FAILURE(status)) {
+        errcheckln(status, "Could not open collator. - %s", u_errorName(status));
+        return;
+    }
-    uhash_iput(map, ce, (void *) stringList, &status);
-}
+    data = CollData::open(coll, status);
+    if (U_FAILURE(status)) {
+        errln("Could not open CollData object.");
+        goto close_data;
+    }
-void CEToStringsMap::deleteStringList(void *obj)
-{
-    StringList *strings = (StringList *) obj;
+    // NOTE(review): the accessor calls below are not passed 'status', so the
+    // U_FAILURE checks that follow them only re-test its earlier value.
+    data->getDynamicClassID();
+    if (U_FAILURE(status)) {
+        errln("Could not get dynamic class ID of CollData.");
+        goto close_patterns;
+    }
-    delete strings;
-}
+    data->getStaticClassID();
+    if (U_FAILURE(status)) {
+        errln("Could not get static class ID of CollData.");
+        goto close_patterns;
+    }
-class StringToCEsMap
-{
-public:
-    StringToCEsMap();
-    ~StringToCEsMap();
+    longPattern = new BoyerMooreSearch(data, lp.unescape(), NULL, status);
+    shortPattern = new BoyerMooreSearch(data, sp.unescape(), NULL, status);
+    if (U_FAILURE(status)) {
+        errln("Could not create pattern objects.");
+        goto close_patterns;
+    }
-    void put(const UnicodeString *string, const CEList *ces);
-    const CEList *get(const UnicodeString *string);
+    longPattern->getBadCharacterTable();
+    shortPattern->getBadCharacterTable();
+    if (U_FAILURE(status)) {
+        errln("Could not get bad character table.");
+        goto close_patterns;
+    }
-private:
+    longPattern->getGoodSuffixTable();
+    shortPattern->getGoodSuffixTable();
+    if (U_FAILURE(status)) {
+        errln("Could not get good suffix table.");
+        goto close_patterns;
+    }
-    static void deleteCEList(void *obj);
-    static void deleteUnicodeStringKey(void *obj);
+    longPattern->getDynamicClassID();
+    shortPattern->getDynamicClassID();
+    if (U_FAILURE(status)) {
+        errln("Could not get dynamic class ID of BoyerMooreSearch.");
+        goto close_patterns;
+    }
-    UHashtable *map;
-};
+    longPattern->getStaticClassID();
+    shortPattern->getStaticClassID();
+    if (U_FAILURE(status)) {
+        errln("Could not get static class ID of BoyerMooreSearch.");
+        goto close_patterns;
+    }
-StringToCEsMap::StringToCEsMap()
-{
-    UErrorCode status = U_ZERO_ERROR;
+    longPattern->getData();
+    shortPattern->getData();
+    if (U_FAILURE(status)) {
+        errln("Could not get collate data.");
+        goto close_patterns;
+    }
-    map = uhash_open(uhash_hashCaselessUnicodeString,
-                     uhash_compareCaselessUnicodeString,
-                     uhash_compareLong,
-                     &status);
+    ce = longPattern->getPatternCEs();
+    ce1 = shortPattern->getPatternCEs();
+    if (U_FAILURE(status)) {
+        errln("Could not get pattern CEs.");
+        goto close_patterns;
+    }
-    uhash_setValueDeleter(map, deleteCEList);
-    uhash_setKeyDeleter(map, deleteUnicodeStringKey);
-}
+    ce->getDynamicClassID();
+    ce1->getDynamicClassID();
+    if (U_FAILURE(status)) {
+        errln("Could not get dynamic class ID of CEList.");
+        goto close_patterns;
+    }
-StringToCEsMap::~StringToCEsMap()
-{
-    uhash_close(map);
-}
+    ce->getStaticClassID();
+    ce1->getStaticClassID();
+    if (U_FAILURE(status)) {
+        errln("Could not get static class ID of CEList.");
+        goto close_patterns;
+    }
-void StringToCEsMap::put(const UnicodeString *string, const CEList *ces)
-{
-    UErrorCode status = U_ZERO_ERROR;
+    if(data->minLengthInChars(ce,0) != 3){
+        errln("Minimal Length in Characters for 'data' with 'ce' was suppose to give 3.");
+        goto close_patterns;
+    }
-    uhash_put(map, (void *) string, (void *) ces, &status);
-}
+    if(data->minLengthInChars(ce1,0) != 3){
+        errln("Minimal Length in Characters for 'data' with 'ce1' was suppose to give 3.");
+        goto close_patterns;
+    }
-const CEList *StringToCEsMap::get(const UnicodeString *string)
-{
-    return (const CEList *) uhash_get(map, string);
-}
+    for (uint32_t t = 0; t < (sizeof(targets)/sizeof(targets[0])); t += 1) {
+        UnicodeString target = targets[t].unescape();
-void StringToCEsMap::deleteCEList(void *obj)
-{
-    CEList *list = (CEList *) obj;
+        longPattern->setTargetString(&target, status);
+        if (longPattern->search(0, start, end)) {
+            logln("Test %d: found long pattern at [%d, %d].", t, start, end);
+        } else {
+            errln("Test %d: did not find long pattern.", t);
+        }
-    delete list;
-}
+        shortPattern->setTargetString(&target, status);
+        if (shortPattern->search(0, start, end)) {
+            logln("Test %d: found short pattern at [%d, %d].", t, start, end);
+        } else {
+            errln("Test %d: did not find short pattern.", t);
+        }
-void StringToCEsMap::deleteUnicodeStringKey(void *obj)
-{
-    UnicodeString *key = (UnicodeString *) obj;
+        // The "%d" in these two messages needs the test index as its argument.
+        if(longPattern->empty()){
+            errln("Test %d: Long pattern should not have been empty.", t);
+        }
+
+        if(shortPattern->empty()){
+            errln("Test %d: Short pattern should not have been empty.", t);
+        }
+    }
+
+close_patterns:
+    delete shortPattern;
+    delete longPattern;
-    delete key;
+close_data:
+    CollData::close(data);
+    ucol_close(coll);
}
-static void buildData(UCollator *coll, USet *charsToTest, StringToCEsMap *charsToCEList, CEToStringsMap *ceToCharsStartingWith)
+void SSearchTest::bmsTest()
{
- int32_t itemCount = uset_getItemCount(charsToTest);
UErrorCode status = U_ZERO_ERROR;
+ UCollator *coll = NULL;
+ UCD *data = NULL;
+ UnicodeString lp = "fuss";
+ UnicodeString lpu = lp.unescape();
+ UnicodeString sp = "fu\\u00DF";
+ UnicodeString spu = sp.unescape();
+ BMS *longPattern = NULL;
+ BMS *shortPattern = NULL;
+ UnicodeString targets[] = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball",
+ "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF",
+ "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"};
+ int32_t start = -1, end = -1;
+
+ coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status);
+ if (U_FAILURE(status)) {
+ errcheckln(status, "Could not open collator. - %s", u_errorName(status));
+ return;
+ }
- for(int32_t item = 0; item < itemCount; item += 1) {
- UChar32 start = 0, end = 0;
- UChar buffer[16];
- int32_t len = uset_getItem(charsToTest, item, &start, &end,
- buffer, 16, &status);
+ data = ucd_open(coll, &status);
+ if (U_FAILURE(status)) {
+ errln("Could not open CollData object.");
+ goto close_data;
+ }
- if (len == 0) {
- for (UChar32 ch = start; ch <= end; ch += 1) {
- UnicodeString *st = new UnicodeString(ch);
- CEList *ceList = new CEList(coll, *st);
+ longPattern = bms_open(data, lpu.getBuffer(), lpu.length(), NULL, 0, &status);
+ shortPattern = bms_open(data, spu.getBuffer(), spu.length(), NULL, 0, &status);
+ if (U_FAILURE(status)) {
+ errln("Couldn't open pattern objects.");
+ goto close_patterns;
+ }
- charsToCEList->put(st, ceList);
- ceToCharsStartingWith->put(ceList->get(0), st);
- }
- } else if (len > 0) {
- UnicodeString *st = new UnicodeString(buffer, len);
- CEList *ceList = new CEList(coll, *st);
+ for (uint32_t t = 0; t < (sizeof(targets)/sizeof(targets[0])); t += 1) {
+ UnicodeString target = targets[t].unescape();
- charsToCEList->put(st, ceList);
- ceToCharsStartingWith->put(ceList->get(0), st);
+ bms_setTargetString(longPattern, target.getBuffer(), target.length(), &status);
+ if (bms_search(longPattern, 0, &start, &end)) {
+ logln("Test %d: found long pattern at [%d, %d].", t, start, end);
} else {
- // shouldn't happen...
+ errln("Test %d: did not find long pattern.", t);
}
- }
-}
-
-static UnicodeString &escape(const UnicodeString &string, UnicodeString &buffer)
-{
- for(int32_t i = 0; i < string.length(); i += 1) {
- UChar32 ch = string.char32At(i);
- if (ch >= 0x0020 && ch <= 0x007F) {
- if (ch == 0x005C) {
- buffer.append("\\\\");
- } else {
- buffer.append(ch);
- }
+ bms_setTargetString(shortPattern, target.getBuffer(), target.length(), &status);
+ if (bms_search(shortPattern, 0, &start, &end)) {
+ logln("Test %d: found short pattern at [%d, %d].", t, start, end);
} else {
- char cbuffer[12];
+ errln("Test %d: did not find short pattern.", t);
+ }
+ }
- if (ch <= 0xFFFFL) {
- sprintf(cbuffer, "\\u%4.4X", ch);
- } else {
- sprintf(cbuffer, "\\U%8.8X", ch);
- }
+ /* Add better coverage for bms code: a pattern opened from a non-empty string must not report empty. */
+ if (bms_empty(longPattern)) {
+     errln("FAIL: longPattern is empty.");
+ }
- buffer.append(cbuffer);
- }
+ if (!bms_getData(longPattern)) {
+ errln("FAIL: bms_getData returned NULL.");
+ }
- if (ch >= 0x10000L) {
- i += 1;
- }
+ if (!ucd_getCollator(data)) {
+ errln("FAIL: ucd_getCollator returned NULL.");
}
- return buffer;
+close_patterns:
+ bms_close(shortPattern);
+ bms_close(longPattern);
+
+close_data:
+ ucd_close(data);
+ ucd_freeCache();
+ ucol_close(coll);
}
-static int32_t minLengthInChars(const CEList *ceList, int32_t offset, StringToCEsMap *charsToCEList, CEToStringsMap *ceToCharsStartingWith,
- UnicodeString &debug)
+void SSearchTest::goodSuffixTest()
{
- // find out shortest string for the longest sequence of ces.
- // needs to be refined to use dynamic programming, but will be roughly right
- int32_t totalStringLength = 0;
-
- while (offset < ceList->size()) {
- int32_t ce = ceList->get(offset);
- int32_t bestLength = INT32_MIN;
- const UnicodeString *bestString = NULL;
- int32_t bestCeLength = 0;
- const StringList *strings = ceToCharsStartingWith->getStringList(ce);
- int32_t stringCount = strings->size();
-
- for (int32_t s = 0; s < stringCount; s += 1) {
- const UnicodeString *string = strings->get(s);
- const CEList *ceList2 = charsToCEList->get(string);
-
- if (ceList->matchesAt(offset, ceList2)) {
- int32_t length = ceList2->size() - string->length();
-
- if (bestLength < length) {
- bestLength = length;
- bestCeLength = ceList2->size();
- bestString = string;
- }
- }
- }
-
- totalStringLength += bestString->length();
- escape(*bestString, debug).append("/");
- offset += bestCeLength;
+ UErrorCode status = U_ZERO_ERROR;
+ UCollator *coll = NULL;
+ CollData *data = NULL;
+ UnicodeString pat = /*"gcagagag"*/ "fxeld";
+ UnicodeString target = /*"gcatcgcagagagtatacagtacg"*/ "cloveldfxeld";
+ BoyerMooreSearch *pattern = NULL;
+ int32_t start = -1, end = -1;
+
+ coll = ucol_open(NULL, &status);
+ if (U_FAILURE(status)) {
+ errcheckln(status, "Couldn't open collator. - %s", u_errorName(status));
+ return;
+ }
+
+ data = CollData::open(coll, status);
+ if (U_FAILURE(status)) {
+ errln("Couldn't open CollData object.");
+ goto close_data;
+ }
+
+ pattern = new BoyerMooreSearch(data, pat, &target, status);
+ if (U_FAILURE(status)) {
+ errln("Couldn't open pattern object.");
+ goto close_pattern;
}
- debug.append((UChar)0x0000);
- return totalStringLength;
+ if (pattern->search(0, start, end)) {
+ logln("Found pattern at [%d, %d].", start, end);
+ } else {
+ errln("Did not find pattern.");
+ }
+
+close_pattern:
+ delete pattern;
+
+close_data:
+ CollData::close(data);
+ ucol_close(coll);
}
-static void minLengthTest(UCollator *coll, StringToCEsMap *charsToCEList, CEToStringsMap *ceToCharsStartingWith)
-{
- UnicodeString examples[] = {"fuss", "fiss", "affliss", "VII"};
- UnicodeString debug;
- int32_t nExamples = sizeof(examples) / sizeof(examples[0]);
+//
+// searchTime() A quick and dirty performance test for string search.
+// Probably doesn't really belong as part of intltest, but it
+// does check that the search succeeds, and gets the right result,
+// so it serves as a functionality test also.
+//
+// To run as a perf test, up the loop count, select by commenting
+// and uncommenting in the code the operation to be measured,
+// rebuild, and measure the running time of this test alone.
+//
+// time LD_LIBRARY_PATH=whatever ./intltest collate/SSearchTest/searchTime
+//
+void SSearchTest::searchTime() {
+ static const char *longishText =
+"Whylom, as olde stories tellen us,\n"
+"Ther was a duk that highte Theseus:\n"
+"Of Athenes he was lord and governour,\n"
+"And in his tyme swich a conquerour,\n"
+"That gretter was ther noon under the sonne.\n"
+"Ful many a riche contree hadde he wonne;\n"
+"What with his wisdom and his chivalrye,\n"
+"He conquered al the regne of Femenye,\n"
+"That whylom was y-cleped Scithia;\n"
+"And weddede the quene Ipolita,\n"
+"And broghte hir hoom with him in his contree\n"
+"With muchel glorie and greet solempnitee,\n"
+"And eek hir yonge suster Emelye.\n"
+"And thus with victorie and with melodye\n"
+"Lete I this noble duk to Athenes ryde,\n"
+"And al his hoost, in armes, him bisyde.\n"
+"And certes, if it nere to long to here,\n"
+"I wolde han told yow fully the manere,\n"
+"How wonnen was the regne of Femenye\n"
+"By Theseus, and by his chivalrye;\n"
+"And of the grete bataille for the nones\n"
+"Bitwixen Athen's and Amazones;\n"
+"And how asseged was Ipolita,\n"
+"The faire hardy quene of Scithia;\n"
+"And of the feste that was at hir weddinge,\n"
+"And of the tempest at hir hoom-cominge;\n"
+"But al that thing I moot as now forbere.\n"
+"I have, God woot, a large feeld to ere,\n"
+"And wayke been the oxen in my plough.\n"
+"The remenant of the tale is long y-nough.\n"
+"I wol nat letten eek noon of this route;\n"
+"Lat every felawe telle his tale aboute,\n"
+"And lat see now who shal the soper winne;\n"
+"And ther I lefte, I wol ageyn biginne.\n"
+"This duk, of whom I make mencioun,\n"
+"When he was come almost unto the toun,\n"
+"In al his wele and in his moste pryde,\n"
+"He was war, as he caste his eye asyde,\n"
+"Wher that ther kneled in the hye weye\n"
+"A companye of ladies, tweye and tweye,\n"
+"Ech after other, clad in clothes blake; \n"
+"But swich a cry and swich a wo they make,\n"
+"That in this world nis creature livinge,\n"
+"That herde swich another weymentinge;\n"
+"And of this cry they nolde never stenten,\n"
+"Til they the reynes of his brydel henten.\n"
+"'What folk ben ye, that at myn hoomcominge\n"
+"Perturben so my feste with cryinge'?\n"
+"Quod Theseus, 'have ye so greet envye\n"
+"Of myn honour, that thus compleyne and crye? \n"
+"Or who hath yow misboden, or offended?\n"
+"And telleth me if it may been amended;\n"
+"And why that ye ben clothed thus in blak'?\n"
+"The eldest lady of hem alle spak,\n"
+"When she hadde swowned with a deedly chere,\n"
+"That it was routhe for to seen and here,\n"
+"And seyde: 'Lord, to whom Fortune hath yiven\n"
+"Victorie, and as a conquerour to liven,\n"
+"Noght greveth us your glorie and your honour;\n"
+"But we biseken mercy and socour.\n"
+"Have mercy on our wo and our distresse.\n"
+"Som drope of pitee, thurgh thy gentilesse,\n"
+"Up-on us wrecched wommen lat thou falle.\n"
+"For certes, lord, ther nis noon of us alle,\n"
+"That she nath been a duchesse or a quene;\n"
+"Now be we caitifs, as it is wel sene:\n"
+"Thanked be Fortune, and hir false wheel,\n"
+"That noon estat assureth to be weel.\n"
+"And certes, lord, t'abyden your presence,\n"
+"Here in the temple of the goddesse Clemence\n"
+"We han ben waytinge al this fourtenight;\n"
+"Now help us, lord, sith it is in thy might.\n"
+"I wrecche, which that wepe and waille thus,\n"
+"Was whylom wyf to king Capaneus,\n"
+"That starf at Thebes, cursed be that day!\n"
+"And alle we, that been in this array,\n"
+"And maken al this lamentacioun,\n"
+"We losten alle our housbondes at that toun,\n"
+"Whyl that the sege ther-aboute lay.\n"
+"And yet now th'olde Creon, weylaway!\n"
+"The lord is now of Thebes the citee, \n"
+"Fulfild of ire and of iniquitee,\n"
+"He, for despyt, and for his tirannye,\n"
+"To do the dede bodyes vileinye,\n"
+"Of alle our lordes, whiche that ben slawe,\n"
+"Hath alle the bodyes on an heep y-drawe,\n"
+"And wol nat suffren hem, by noon assent,\n"
+"Neither to been y-buried nor y-brent,\n"
+"But maketh houndes ete hem in despyt. zet'\n";
+
+#define TEST_BOYER_MOORE 1
+const char *cPattern = "maketh houndes ete hem";
+//const char *cPattern = "Whylom";
+//const char *cPattern = "zet";
+ const char *testId = "searchTime()"; // for error macros.
+ UnicodeString target = longishText;
+ UErrorCode status = U_ZERO_ERROR;
- for (int32_t s = 0; s < nExamples; s += 1) {
- CEList *ceList = new CEList(coll, examples[s]);
- //infoln("%S:", examples[s].getTerminatedBuffer());
+ LocalUCollatorPointer collator(ucol_open("en", &status));
+ CollData *data = CollData::open(collator.getAlias(), status);
+ if (U_FAILURE(status) || collator.isNull() || data == NULL) {
+ errcheckln(status, "Unable to open UCollator or CollData. - %s", u_errorName(status));
+ return;
+ }
+ //ucol_setStrength(collator.getAlias(), collatorStrength);
+ //ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status);
+ UnicodeString uPattern = cPattern;
+#ifndef TEST_BOYER_MOORE
+ LocalUStringSearchPointer uss(usearch_openFromCollator(uPattern.getBuffer(), uPattern.length(),
+ target.getBuffer(), target.length(),
+ collator.getAlias(),
+ NULL, // the break iterator
+ &status));
+ TEST_ASSERT_SUCCESS(status);
+#else
+ BoyerMooreSearch bms(data, uPattern, &target, status);
+ TEST_ASSERT_SUCCESS(status);
+#endif
- for(int32_t i = 0; i < examples[s].length(); i += 1) {
- debug.remove();
+// int32_t foundStart;
+// int32_t foundEnd;
+ UBool found;
+
+ // Find the match position using strstr
+ const char *pm = strstr(longishText, cPattern);
+ TEST_ASSERT_M(pm!=NULL, "No pattern match with strstr");
+ int32_t refMatchPos = (int32_t)(pm - longishText);
+ int32_t icuMatchPos;
+ int32_t icuMatchEnd;
+#ifndef TEST_BOYER_MOORE
+ usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status);
+ TEST_ASSERT_SUCCESS(status);
+#else
+ found = bms.search(0, icuMatchPos, icuMatchEnd);
+#endif
+ TEST_ASSERT_M(refMatchPos == icuMatchPos, "strstr and icu give different match positions.");
- int32_t minLength = minLengthInChars(ceList, i, charsToCEList, ceToCharsStartingWith, debug);
- //infoln("\t%d\t%S", minLength, debug.getTerminatedBuffer());
- }
+ int32_t i;
+ int32_t j=0;
+
+ // Try loopcounts around 100000 to some millions, depending on the operation,
+ // to get runtimes of at least several seconds.
+ for (i=0; i<10000; i++) {
+#ifndef TEST_BOYER_MOORE
+ found = usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status);
+#else
+ found = bms.search(0, icuMatchPos, icuMatchEnd);
+#endif
+ //TEST_ASSERT_SUCCESS(status);
+ //TEST_ASSERT(found);
+
+ // usearch_setOffset(uss.getAlias(), 0, &status);
+ // icuMatchPos = usearch_next(uss.getAlias(), &status);
- //infoln();
- delete ceList;
+ // The i+j stuff is to confuse the optimizer and get it to actually leave the
+ // call to strstr in place.
+ //pm = strstr(longishText+j, cPattern);
+ //j = (j + i)%5;
}
+
+ // pm - longishText is a ptrdiff_t, which is not the same width as long on every
+ // platform; cast explicitly so the arguments match the %ld and %d conversions.
+ printf("%ld, %d\n", (long)(pm - longishText), (int)j);
+#ifdef TEST_BOYER_MOORE
+ CollData::close(data);
+#endif
}
+#endif
//----------------------------------------------------------------------------------------
//
class StringSetMonkey : public Monkey
{
public:
- StringSetMonkey(const USet *theSet, UCollator *theCollator, StringToCEsMap *theCharsToCEList, CEToStringsMap *theCeToCharsStartingWith);
+ StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData);
~StringSetMonkey();
void append(UnicodeString &testCase, UnicodeString &alternate);
UnicodeString &generateAlternative(const UnicodeString &testCase, UnicodeString &alternate);
const USet *set;
- UCollator *coll;
- StringToCEsMap *charsToCEList;
- CEToStringsMap *ceToCharsStartingWith;
+ UCollator *coll;
+ CollData *collData;
};
-StringSetMonkey::StringSetMonkey(const USet *theSet, UCollator *theCollator, StringToCEsMap *theCharsToCEList, CEToStringsMap *theCeToCharsStartingWith)
-: Monkey(), set(theSet), coll(theCollator), charsToCEList(theCharsToCEList), ceToCharsStartingWith(theCeToCharsStartingWith)
+StringSetMonkey::StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData)
+: Monkey(), set(theSet), coll(theCollator), collData(theCollData)
{
// ook.
}
{
// find out shortest string for the longest sequence of ces.
// needs to be refined to use dynamic programming, but will be roughly right
- CEList ceList(coll, testCase);
+ UErrorCode status = U_ZERO_ERROR;
+ CEList ceList(coll, testCase, status);
UnicodeString alt;
int32_t offset = 0;
while (offset < ceList.size()) {
int32_t ce = ceList.get(offset);
- const StringList *strings = ceToCharsStartingWith->getStringList(ce);
+ const StringList *strings = collData->getStringList(ce);
if (strings == NULL) {
return alternate.append(testCase);
int32_t stringCount = strings->size();
int32_t tries = 0;
-
+
// find random string that generates the same CEList
- const CEList *ceList2;
- const UnicodeString *string;
+ const CEList *ceList2 = NULL;
+ const UnicodeString *string = NULL;
+ UBool matches = FALSE;
do {
int32_t s = m_rand() % stringCount;
}
string = strings->get(s);
- ceList2 = charsToCEList->get(string);
- } while (! ceList.matchesAt(offset, ceList2));
+ ceList2 = collData->getCEList(string);
+ matches = ceList.matchesAt(offset, ceList2);
+
+ if (! matches) {
+ collData->freeCEList((CEList *) ceList2);
+ }
+ } while (! matches);
alt.append(*string);
offset += ceList2->size();
+ collData->freeCEList(ceList2);
}
- const CEList altCEs(coll, alt);
+ const CEList altCEs(coll, alt, status);
if (ceList.matchesAt(0, &altCEs)) {
return alternate.append(alt);
static void generateTestCase(UCollator *coll, Monkey *monkeys[], int32_t monkeyCount, UnicodeString &testCase, UnicodeString &alternate)
{
int32_t pieces = (m_rand() % 4) + 1;
+ UErrorCode status = U_ZERO_ERROR;
UBool matches;
do {
monkeys[monkey]->append(testCase, alternate);
}
- const CEList ceTest(coll, testCase);
- const CEList ceAlt(coll, alternate);
+ const CEList ceTest(coll, testCase, status);
+ const CEList ceAlt(coll, alternate, status);
matches = ceTest.matchesAt(0, &ceAlt);
} while (! matches);
}
-static inline USet *uset_openEmpty()
-{
- return uset_open(1, 0);
-}
-
//
// Find the next acceptable boundary following the specified starting index
// in the target text being searched.
// TODO: refine what is an acceptable boundary. For the moment,
// choose the next position not within a combining sequence.
//
+#if 0
static int32_t nextBoundaryAfter(const UnicodeString &string, int32_t startIndex) {
const UChar *text = string.getBuffer();
int32_t textLen = string.length();
-
+
if (startIndex >= textLen) {
return startIndex;
}
int32_t i = startIndex;
U16_NEXT(text, i, textLen, c);
-
+
// If we are on a control character, stop without looking for combining marks.
// Control characters do not combine.
int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
return i;
}
-
+
// The initial character was not a control, and can thus accept trailing
// combining characters. Advance over however many of them there are.
int32_t indexOfLastCharChecked;
return indexOfLastCharChecked;
}
-
+#endif
+
+#if 0
static UBool isInCombiningSequence(const UnicodeString &string, int32_t index) {
const UChar *text = string.getBuffer();
int32_t textLen = string.length();
-
+
if (index>=textLen || index<=0) {
return FALSE;
}
-
+
// If the character at the current index is not a GRAPHEME_EXTEND
// then we can not be within a combining sequence.
UChar32 c;
if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
return FALSE;
}
-
+
// We are at a combining mark. If the preceding character is anything
// except a CONTROL, CR or LF, we are in a combining sequence.
- U16_PREV(text, 0, index, c);
+ U16_PREV(text, 0, index, c);
gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
return !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
-}
-
+}
+#endif
+
static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd)
{
UErrorCode status = U_ZERO_ERROR;
OrderList patternOrders(coll, pattern);
int32_t targetSize = targetOrders.size() - 1;
int32_t patternSize = patternOrders.size() - 1;
- UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocale(coll, ULOC_VALID_LOCALE, &status),
- target.getBuffer(), target.length(), &status);
+ UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),
+ target.getBuffer(), target.length(), &status);
if (patternSize == 0) {
- matchStart = matchEnd = 0;
+ // Searching for an empty pattern always fails
+ matchStart = matchEnd = -1;
+ ubrk_close(charBreakIterator);
return FALSE;
}
//int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length();
int32_t expectedStart = -1, expectedEnd = -1;
int32_t notFoundCount = 0;
- UStringSearch *uss = usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
- testCase.getBuffer(), testCase.length(),
- coll,
- NULL, // the break iterator
- &status);
+ LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
+ testCase.getBuffer(), testCase.length(),
+ coll,
+ NULL, // the break iterator
+ &status));
// **** TODO: find *all* matches, not just first one ****
simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd);
-#if 0
- usearch_search(uss, 0, &actualStart, &actualEnd, &status);
-#else
- actualStart = usearch_next(uss, &status);
- actualEnd = actualStart + usearch_getMatchedLength(uss);
-#endif
+ usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status);
- if (actualStart != expectedStart || actualEnd != expectedEnd) {
+ if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
errln("Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
" strength=%s seed=%d",
name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
// **** TODO: find *all* matches, not just first one ****
simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd);
- usearch_setPattern(uss, altPattern.getBuffer(), altPattern.length(), &status);
+ usearch_setPattern(uss.getAlias(), altPattern.getBuffer(), altPattern.length(), &status);
-#if 0
- usearch_search(uss, 0, &actualStart, &actualEnd, &status);
-#else
- usearch_reset(uss);
- actualStart = usearch_next(uss, &status);
- actualEnd = actualStart + usearch_getMatchedLength(uss);
-#endif
+ usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status);
- if (actualStart != expectedStart || actualEnd != expectedEnd) {
+ if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
errln("Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
" strength=%s seed=%d",
name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
notFoundCount += 1;
}
- usearch_close(uss);
+ return notFoundCount;
+}
+
+//
+// hexForUnicodeString()  Write the UTF-16 code units of ustr into cbuf as a
+//                        sequence of " XXXX" hex groups, for use in error messages.
+//
+//   ustr     the string to dump.
+//   cbuf     destination buffer; always left NUL-terminated when cbuflen >= 1.
+//   cbuflen  capacity of cbuf in chars, including the terminating NUL.
+//
+// If the buffer is too small for every code unit, output stops early and "..."
+// is appended when there is still room for it.
+//
+static void hexForUnicodeString(const UnicodeString &ustr, char * cbuf, int32_t cbuflen)
+{
+    int32_t ustri, ustrlen = ustr.length();
+
+    if (cbuf == NULL || cbuflen < 1) {
+        return;
+    }
+
+    // Start from an empty string so that an empty ustr still yields a valid,
+    // terminated buffer instead of leaving the caller's array uninitialized.
+    *cbuf = 0;
+
+    for (ustri = 0; ustri < ustrlen; ++ustri) {
+        if (cbuflen >= 9 /* format width for single code unit(5) + terminating ellipsis(3) + null(1) */) {
+            int len = sprintf(cbuf, " %04X", (unsigned int) ustr.charAt(ustri));
+            cbuflen -= len;
+            cbuf += len;
+        } else {
+            if (cbuflen >= 4 /* terminating ellipsis(3) + null(1) */) {
+                sprintf(cbuf, "...");
+            } else {
+                // No room for the ellipsis: terminate the buffer in place.
+                // (cbuflen >= 1 is guaranteed here.)
+                *cbuf = 0;
+            }
+            break;
+        }
+    }
+}
+
+//
+// bmMonkeyTestCase()  Run one Boyer-Moore monkey test case.
+//
+//   Searches testCase with the pre-built searchers bms (for pattern) and abms
+//   (for altPattern) and compares each result with the match range computed by
+//   simpleSearch(). A mismatch is reported through errln() only when
+//   simpleSearch() found a match (expectedStart >= 0).
+//
+//   name, strength and seed are used only to label error messages.
+//
+//   Returns the number of searches (0, 1 or 2) for which neither simpleSearch()
+//   nor the Boyer-Moore search found a match. If setting the target string
+//   fails, the failure is reported and the count accumulated so far is returned.
+//
+int32_t SSearchTest::bmMonkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
+                                      BoyerMooreSearch *bms, BoyerMooreSearch *abms,
+                                      const char *name, const char *strength, uint32_t seed)
+{
+    UErrorCode status = U_ZERO_ERROR;
+    int32_t actualStart = -1, actualEnd = -1;
+    //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length();
+    int32_t expectedStart = -1, expectedEnd = -1;
+    int32_t notFoundCount = 0;
+    char hexbuf[128];
+
+    // **** TODO: find *all* matches, not just first one ****
+    simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd);
+
+    bms->setTargetString(&testCase, status);
+    if (U_FAILURE(status)) {
+        // Searching after a failed setTargetString would examine stale state.
+        errln("Boyer-Moore setTargetString for <pattern> in <%s> failed: %s", name, u_errorName(status));
+        return notFoundCount;
+    }
+    bms->search(0, actualStart, actualEnd);
+
+    if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
+        hexForUnicodeString(pattern, hexbuf, sizeof(hexbuf));
+        errln("Boyer-Moore Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
+              "    strength=%s seed=%d <pattern>: %s",
+              name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed, hexbuf);
+    }
+
+    if (expectedStart == -1 && actualStart == -1) {
+        notFoundCount += 1;
+    }
+
+    // **** TODO: find *all* matches, not just first one ****
+    simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd);
+
+    abms->setTargetString(&testCase, status);
+    if (U_FAILURE(status)) {
+        errln("Boyer-Moore setTargetString for <alt_pattern> in <%s> failed: %s", name, u_errorName(status));
+        return notFoundCount;
+    }
+    abms->search(0, actualStart, actualEnd);
+
+    if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
+        hexForUnicodeString(altPattern, hexbuf, sizeof(hexbuf));
+        errln("Boyer-Moore Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
+              "    strength=%s seed=%d <pattern>: %s",
+              name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed, hexbuf);
+    }
+
+    if (expectedStart == -1 && actualStart == -1) {
+        notFoundCount += 1;
+    }
+
return notFoundCount;
}
{
// ook!
UErrorCode status = U_ZERO_ERROR;
- U_STRING_DECL(test_pattern, "[[:assigned:]-[:ideographic:]-[:hangul:]-[:c:]]", 47);
- U_STRING_INIT(test_pattern, "[[:assigned:]-[:ideographic:]-[:hangul:]-[:c:]]", 47);
- UCollator *coll = ucol_open(NULL, &status);
+ //UCollator *coll = ucol_open(NULL, &status);
+ UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status);
+
if (U_FAILURE(status)) {
- errln("Failed to create collator in MonkeyTest!");
+ errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status));
return;
}
- USet *charsToTest = uset_openPattern(test_pattern, 47, &status);
+
+ CollData *monkeyData = CollData::open(coll, status);
+
USet *expansions = uset_openEmpty();
USet *contractions = uset_openEmpty();
- StringToCEsMap *charsToCEList = new StringToCEsMap();
- CEToStringsMap *ceToCharsStartingWith = new CEToStringsMap();
ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
- uset_addAll(charsToTest, contractions);
- uset_addAll(charsToTest, expansions);
-
- // TODO: set strength to UCOL_PRIMARY, change CEList to use strength?
- buildData(coll, charsToTest, charsToCEList, ceToCharsStartingWith);
-
U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
USet *letters = uset_openPattern(letter_pattern, 39, &status);
SetMonkey letterMonkey(letters);
- StringSetMonkey contractionMonkey(contractions, coll, charsToCEList, ceToCharsStartingWith);
- StringSetMonkey expansionMonkey(expansions, coll, charsToCEList, ceToCharsStartingWith);
+ StringSetMonkey contractionMonkey(contractions, coll, monkeyData);
+ StringSetMonkey expansionMonkey(expansions, coll, monkeyData);
UnicodeString testCase;
UnicodeString alternate;
UnicodeString pattern, altPattern;
&contractionMonkey,
&expansionMonkey};
int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]);
- int32_t nonMatchCount = 0;
+ // int32_t nonMatchCount = 0;
UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY};
const char *strengthNames[] = {"primary", "secondary", "tertiary"};
int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]);
int32_t loopCount = quick? 1000 : 10000;
int32_t firstStrength = 0;
- int32_t lastStrength = strengthCount - 1;
+ int32_t lastStrength = strengthCount - 1; //*/ 0;
if (params != NULL) {
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
for(int32_t s = firstStrength; s <= lastStrength; s += 1) {
int32_t notFoundCount = 0;
+ logln("Setting strength to %s.", strengthNames[s]);
ucol_setStrength(coll, strengths[s]);
// TODO: try alternate prefix and suffix too?
// TODO: alterntaes are only equal at primary strength. Is this OK?
- for(int32_t t = 0; t < 10000; t += 1) {
+ for(int32_t t = 0; t < loopCount; t += 1) {
uint32_t seed = m_seed;
- int32_t nmc = 0;
+ // int32_t nmc = 0;
generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern);
generateTestCase(coll, monkeys, monkeyCount, prefix, altPrefix);
testCase.remove();
testCase.append(pattern);
testCase.append(suffix);
-
+
// pattern + suffix
notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "pattern + suffix", strengthNames[s], seed);
}
- logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount);
+ logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount);
+ }
+
+ uset_close(contractions);
+ uset_close(expansions);
+ uset_close(letters);
+
+ CollData::close(monkeyData);
+
+ ucol_close(coll);
+}
+
+void SSearchTest::bmMonkeyTest(char *params)
+{
+ static const UVersionInfo icu47 = { 4, 7, 0, 0 }; // for timebomb
+ static const UChar skipChars[] = { 0x0E40, 0x0E41, 0x0E42, 0x0E43, 0x0E44, 0xAAB5, 0xAAB6, 0xAAB9, 0xAABB, 0xAABC, 0 }; // for timebomb
+ // ook!
+ UErrorCode status = U_ZERO_ERROR;
+ UCollator *coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status);
+
+ if (U_FAILURE(status)) {
+ errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status));
+ return;
+ }
+
+ CollData *monkeyData = CollData::open(coll, status);
+
+ USet *expansions = uset_openEmpty();
+ USet *contractions = uset_openEmpty();
+
+ ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
+
+ U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
+ U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
+ USet *letters = uset_openPattern(letter_pattern, 39, &status);
+ SetMonkey letterMonkey(letters);
+ StringSetMonkey contractionMonkey(contractions, coll, monkeyData);
+ StringSetMonkey expansionMonkey(expansions, coll, monkeyData);
+ UnicodeString testCase;
+ UnicodeString alternate;
+ UnicodeString pattern, altPattern;
+ UnicodeString prefix, altPrefix;
+ UnicodeString suffix, altSuffix;
+
+ Monkey *monkeys[] = {
+ &letterMonkey,
+ &contractionMonkey,
+ &expansionMonkey,
+ &contractionMonkey,
+ &expansionMonkey,
+ &contractionMonkey,
+ &expansionMonkey,
+ &contractionMonkey,
+ &expansionMonkey};
+ int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]);
+ // int32_t nonMatchCount = 0;
+
+ UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY};
+ const char *strengthNames[] = {"primary", "secondary", "tertiary"};
+ int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]);
+ int32_t loopCount = quick? 1000 : 10000;
+ int32_t firstStrength = 0;
+ int32_t lastStrength = strengthCount - 1; //*/ 0;
+
+ if (params != NULL) {
+#if !UCONFIG_NO_REGULAR_EXPRESSIONS
+ UnicodeString p(params);
+
+ loopCount = getIntParam("loop", p, loopCount);
+ m_seed = getIntParam("seed", p, m_seed);
+
+ RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status);
+ if (m.find()) {
+ UnicodeString breakType = m.group(1, status);
+
+ for (int32_t s = 0; s < strengthCount; s += 1) {
+ if (breakType == strengthNames[s]) {
+ firstStrength = lastStrength = s;
+ break;
+ }
+ }
+
+ m.reset();
+ p = m.replaceFirst("", status);
+ }
+
+ if (RegexMatcher("\\S", p, 0, status).find()) {
+     // Each option is stripped out of the option string as it is processed.
+     // All options have been checked. The option string should have been completely emptied.
+     char buf[100];
+     p.extract(buf, sizeof(buf), NULL, status);
+     buf[sizeof(buf)-1] = 0;
+     errln("Unrecognized or extra parameter: %s\n", buf);
+
+     // Release everything opened earlier in this function before bailing out,
+     // in the same order as the normal exit path at the end of the function.
+     uset_close(contractions);
+     uset_close(expansions);
+     uset_close(letters);
+     CollData::close(monkeyData);
+     ucol_close(coll);
+     return;
+ }
+#else
+ infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
+#endif
}
- delete ceToCharsStartingWith;
- delete charsToCEList;
+ for(int32_t s = firstStrength; s <= lastStrength; s += 1) {
+ int32_t notFoundCount = 0;
+
+ logln("Setting strength to %s.", strengthNames[s]);
+ ucol_setStrength(coll, strengths[s]);
+
+ CollData *data = CollData::open(coll, status);
+
+ UnicodeString skipString(skipChars); // for timebomb
+ UnicodeSet* skipSet = UnicodeSet::createFromAll(skipString); // for timebomb
+ // TODO: try alternate prefix and suffix too?
+ // TODO: alternates are only equal at primary strength. Is this OK?
+ for(int32_t t = 0; t < loopCount; t += 1) {
+ uint32_t seed = m_seed;
+ // int32_t nmc = 0;
+
+ generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern);
+ generateTestCase(coll, monkeys, monkeyCount, prefix, altPrefix);
+ generateTestCase(coll, monkeys, monkeyCount, suffix, altSuffix);
+
+ if (!isICUVersionAtLeast(icu47) && skipSet->containsSome(pattern)) {
+ continue; // timebomb until ticket #8080 is resolved
+ }
+
+ BoyerMooreSearch pat(data, pattern, NULL, status);
+ BoyerMooreSearch alt(data, altPattern, NULL, status);
+
+ // **** need a better way to deal with this ****
+#if 0
+ if (pat.empty() ||
+ alt.empty()) {
+ continue;
+ }
+#endif
+
+ // pattern
+ notFoundCount += bmMonkeyTestCase(coll, pattern, pattern, altPattern, &pat, &alt, "pattern", strengthNames[s], seed);
+
+ testCase.remove();
+ testCase.append(prefix);
+ testCase.append(/*alt*/pattern);
+
+ // prefix + pattern
+ notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPattern, &pat, &alt, "prefix + pattern", strengthNames[s], seed);
+
+ testCase.append(suffix);
+
+ // prefix + pattern + suffix
+ notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPattern, &pat, &alt, "prefix + pattern + suffix", strengthNames[s], seed);
+
+ testCase.remove();
+ testCase.append(pattern);
+ testCase.append(suffix);
+
+ // pattern + suffix
+ notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPattern, &pat, &alt, "pattern + suffix", strengthNames[s], seed);
+ }
+ delete skipSet; // for timebomb
+
+ CollData::close(data);
+
+ logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount);
+ }
uset_close(contractions);
uset_close(expansions);
- uset_close(charsToTest);
uset_close(letters);
-
+
+ CollData::close(monkeyData);
+
ucol_close(coll);
}
-#endif
-
+//
+// stringListTest()  Minimal API coverage for StringList: construct one, add an
+//                   empty string to it, and check that the dynamic and static
+//                   class IDs agree.
+//
+void SSearchTest::stringListTest(){
+    UErrorCode status = U_ZERO_ERROR;
+    StringList *sl = new StringList(status);
+    if (sl == NULL || U_FAILURE(status)) {
+        errln("ERROR: stringListTest: Could not start StringList");
+        // Nothing further can be tested on a missing or failed list;
+        // deleting a NULL pointer is a no-op.
+        delete sl;
+        return;
+    }
+
+    const UChar chars[] = {
+        0x0000
+    };
+    // Add a zero-length string (length 0 is passed explicitly).
+    sl->add(chars, (int32_t) 0, status);
+    if (U_FAILURE(status)) {
+        errln("ERROR: stringListTest: StringList::add");
+    }
+
+    if (sl->getDynamicClassID() != StringList::getStaticClassID()) {
+        errln("ERROR: stringListTest: getDynamicClassID and getStaticClassID does not match");
+    }
+    delete sl;
+}
+
+#endif
+
#endif