X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/46f4442e9a5a4f3b98b7c1083586332f6a8a99a4..a01113dcd0f39d5da295ef82785beff9ed86fe38:/icuSources/test/cintltst/citertst.c diff --git a/icuSources/test/cintltst/citertst.c b/icuSources/test/cintltst/citertst.c index d8025025..c93c19c3 100644 --- a/icuSources/test/cintltst/citertst.c +++ b/icuSources/test/cintltst/citertst.c @@ -1,6 +1,8 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html /******************************************************************** * COPYRIGHT: - * Copyright (c) 1997-2008, International Business Machines Corporation and + * Copyright (c) 1997-2016, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ /******************************************************************************** @@ -22,9 +24,11 @@ #if !UCONFIG_NO_COLLATION #include "unicode/ucol.h" +#include "unicode/ucoleitr.h" #include "unicode/uloc.h" #include "unicode/uchar.h" #include "unicode/ustring.h" +#include "unicode/utf16.h" #include "unicode/putil.h" #include "callcoll.h" #include "cmemory.h" @@ -34,7 +38,7 @@ #include "filestrm.h" #include "cstring.h" #include "ucol_imp.h" -#include "ucol_tok.h" +#include "uparse.h" #include extern uint8_t ucol_uprv_getCaseBits(const UChar *, uint32_t, UErrorCode *); @@ -52,11 +56,8 @@ void addCollIterTest(TestNode** root) addTest(root, &TestBug672, "tscoll/citertst/TestBug672"); addTest(root, &TestBug672Normalize, "tscoll/citertst/TestBug672Normalize"); addTest(root, &TestSmallBuffer, "tscoll/citertst/TestSmallBuffer"); - addTest(root, &TestCEs, "tscoll/citertst/TestCEs"); addTest(root, &TestDiscontiguos, "tscoll/citertst/TestDiscontiguos"); - addTest(root, &TestCEBufferOverflow, "tscoll/citertst/TestCEBufferOverflow"); - addTest(root, &TestCEValidity, "tscoll/citertst/TestCEValidity"); - addTest(root, &TestSortKeyValidity, "tscoll/citertst/TestSortKeyValidity"); + addTest(root, &TestSearchCollatorElements, "tscoll/citertst/TestSearchCollatorElements"); } /* The locales we support */ @@ -80,7 +81,7 @@ static void TestBug672() { UCollationElements *titer = ucol_openElements(coll, text, -1, &status); if (U_FAILURE(status)) { - log_err("ERROR: in creation of either the collator or the collation iterator :%s\n", + log_err_status(status, "ERROR: in creation of either the collator or the collation iterator :%s\n", myErrorName(status)); return; } @@ -156,7 +157,7 @@ static void TestBug672Normalize() { pitr = ucol_openElements(coll, pattern, -1, &status); titer = ucol_openElements(coll, text, -1, &status); if (U_FAILURE(status)) { - log_err("ERROR: in creation of either the collator or the collation iterator :%s\n", + log_err_status(status, "ERROR: in creation of either the collator or the collation iterator :%s\n", myErrorName(status)); return; } @@ -226,7 +227,7 @@ static void TestUnicodeChar() UChar *test; en_us = ucol_open("en_US", &status); if (U_FAILURE(status)){ - log_err("ERROR: in creation of collation data using ucol_open()\n %s\n", + log_err_status(status, "ERROR: in creation of collation data using ucol_open()\n %s\n", myErrorName(status)); return; } @@ -293,7 +294,7 @@ static void TestNormalizedUnicodeChar() /* thai should have normalization on */ th_th = ucol_open("th_TH", &status); if (U_FAILURE(status)){ - log_err("ERROR: in creation of thai collation using ucol_open()\n %s\n", + log_err_status(status, "ERROR: in creation of thai collation using ucol_open()\n %s\n", myErrorName(status)); return; } @@ -369,7 +370,7 @@ static void TestNormalization() coll = ucol_openRules(rule, rulelen, UCOL_ON, UCOL_TERTIARY, NULL, &status); ucol_setAttribute(coll, UCOL_NORMALIZATION_MODE, UCOL_ON, &status); if (U_FAILURE(status)){ - log_err("ERROR: in creation of collator using ucol_openRules()\n %s\n", + log_err_status(status, "ERROR: in creation of collator using ucol_openRules()\n %s\n", myErrorName(status)); return; } @@ -434,7 +435,7 @@ static void TestPrevious() iter=ucol_openElements(coll, test1, u_strlen(test1), &status); log_verbose("English locale testing back and forth\n"); if(U_FAILURE(status)){ - log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", + log_err_status(status, "ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", myErrorName(status)); ucol_close(coll); return; @@ -574,7 +575,7 @@ static void TestOffset() log_verbose("Testing getOffset and setOffset for collations\n"); iter = ucol_openElements(en_us, test1, u_strlen(test1), &status); if(U_FAILURE(status)){ - log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", + log_err_status(status, "ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", myErrorName(status)); ucol_close(en_us); return; @@ -716,7 +717,7 @@ static void TestSetText() log_verbose("testing setText for Collation elements\n"); iter1=ucol_openElements(en_us, test1, u_strlen(test1), &status); if(U_FAILURE(status)){ - log_err("ERROR: in creation of collation element iterator1 using ucol_openElements()\n %s\n", + log_err_status(status, "ERROR: in creation of collation element iterator1 using ucol_openElements()\n %s\n", myErrorName(status)); ucol_close(en_us); return; @@ -760,15 +761,10 @@ static void TestSetText() /* Now set it to point to a null string with fake length*/ ucol_setText(iter2, NULL, 2, &status); - if (U_FAILURE(status)) - { - log_err("call to iter2->setText(null) failed. %s\n", myErrorName(status)); - } - else + if (status != U_ILLEGAL_ARGUMENT_ERROR) { - if (ucol_next(iter2, &status) != UCOL_NULLORDER) { - log_err("iter2 with null text expected to return UCOL_NULLORDER\n"); - } + log_err("call to iter2->setText(null, 2) should yield an illegal-argument-error - %s\n", + myErrorName(status)); } ucol_closeElements(iter2); @@ -786,7 +782,7 @@ static void TestMaxExpansion() UChar ch = 0; UChar32 unassigned = 0xEFFFD; UChar supplementary[2]; - uint32_t index = 0; + uint32_t stringOffset = 0; UBool isError = FALSE; uint32_t sorder = 0; UCollationElements *iter ;/*= ucol_openElements(coll, &ch, 1, &status);*/ @@ -858,7 +854,8 @@ static void TestMaxExpansion() ch, 3); } - U16_APPEND(supplementary, index, 2, unassigned, isError); + U16_APPEND(supplementary, stringOffset, 2, unassigned, isError); + (void)isError; /* Suppress set but not used warning. */ ucol_setText(iter, supplementary, 2, &status); sorder = ucol_previous(iter, &status); @@ -905,7 +902,7 @@ static void TestMaxExpansion() ucol_closeElements(iter); ucol_close(coll); } else { - log_data_err("Couldn't open collator\n"); + log_err_status(status, "Couldn't open collator -> %s\n", u_errorName(status)); } } @@ -994,11 +991,6 @@ static void TestSmallBuffer() free(orders); ucol_reset(testiter); - /* ensures that the writable buffer was cleared */ - if (testiter->iteratordata_.writableBuffer != - testiter->iteratordata_.stackWritableBuffer) { - log_err("Error Writable buffer in collation element iterator not reset\n"); - } /* ensures closing of elements done properly to clear writable buffer */ ucol_next(testiter, &status); @@ -1007,356 +999,8 @@ static void TestSmallBuffer() ucol_closeElements(iter); ucol_close(coll); } else { - log_data_err("Couldn't open collator\n"); - } -} - -/** -* Sniplets of code from genuca -*/ -static int32_t hex2num(char hex) { - if(hex>='0' && hex <='9') { - return hex-'0'; - } else if(hex>='a' && hex<='f') { - return hex-'a'+10; - } else if(hex>='A' && hex<='F') { - return hex-'A'+10; - } else { - return 0; - } -} - -/** -* Getting codepoints from a string -* @param str character string contain codepoints seperated by space and ended -* by a semicolon -* @param codepoints array for storage, assuming size > 5 -* @return position at the end of the codepoint section -*/ -static char * getCodePoints(char *str, UChar *codepoints, UChar *contextCPs) { - char *pStartCP = str; - char *pEndCP = str + 4; - - *codepoints = (UChar)((hex2num(*pStartCP) << 12) | - (hex2num(*(pStartCP + 1)) << 8) | - (hex2num(*(pStartCP + 2)) << 4) | - (hex2num(*(pStartCP + 3)))); - if (*pEndCP == '|' || *(pEndCP+1) == '|') { - /* pre-context rule */ - pStartCP = pEndCP; - while (*pStartCP==' ' || *pStartCP== '|' ) { - pStartCP++; - } - pEndCP = pStartCP+4; - *contextCPs = *codepoints; - *(++codepoints) = (UChar)((hex2num(*pStartCP) << 12) | - (hex2num(*(pStartCP + 1)) << 8) | - (hex2num(*(pStartCP + 2)) << 4) | - (hex2num(*(pStartCP + 3)))); - contextCPs++; - } - *contextCPs = 0; - codepoints ++; - while (*pEndCP != ';') { - pStartCP = pEndCP + 1; - *codepoints = (UChar)((hex2num(*pStartCP) << 12) | - (hex2num(*(pStartCP + 1)) << 8) | - (hex2num(*(pStartCP + 2)) << 4) | - (hex2num(*(pStartCP + 3)))); - codepoints ++; - pEndCP = pStartCP + 4; - } - *codepoints = 0; - return pEndCP + 1; -} - -/** -* Sniplets of code from genuca -*/ -static int32_t -readElement(char **from, char *to, char separator, UErrorCode *status) -{ - if (U_SUCCESS(*status)) { - char buffer[1024]; - int32_t i = 0; - while (**from != separator) { - if (**from != ' ') { - *(buffer+i++) = **from; - } - (*from)++; - } - (*from)++; - *(buffer + i) = 0; - strcpy(to, buffer); - return i/2; - } - - return 0; -} - -/** -* Sniplets of code from genuca -*/ -static uint32_t -getSingleCEValue(char *primary, char *secondary, char *tertiary, - UErrorCode *status) -{ - if (U_SUCCESS(*status)) { - uint32_t value = 0; - char primsave = '\0'; - char secsave = '\0'; - char tersave = '\0'; - char *primend = primary+4; - char *secend = secondary+2; - char *terend = tertiary+2; - uint32_t primvalue; - uint32_t secvalue; - uint32_t tervalue; - - if (uprv_strlen(primary) > 4) { - primsave = *primend; - *primend = '\0'; - } - - if (uprv_strlen(secondary) > 2) { - secsave = *secend; - *secend = '\0'; - } - - if (uprv_strlen(tertiary) > 2) { - tersave = *terend; - *terend = '\0'; - } - - primvalue = (*primary!='\0')?uprv_strtoul(primary, &primend, 16):0; - secvalue = (*secondary!='\0')?uprv_strtoul(secondary, &secend, 16):0; - tervalue = (*tertiary!='\0')?uprv_strtoul(tertiary, &terend, 16):0; - if(primvalue <= 0xFF) { - primvalue <<= 8; - } - - value = ((primvalue << UCOL_PRIMARYORDERSHIFT) & UCOL_PRIMARYORDERMASK) - | ((secvalue << UCOL_SECONDARYORDERSHIFT) & UCOL_SECONDARYORDERMASK) - | (tervalue & UCOL_TERTIARYORDERMASK); - - if(primsave!='\0') { - *primend = primsave; - } - if(secsave!='\0') { - *secend = secsave; - } - if(tersave!='\0') { - *terend = tersave; - } - return value; - } - return 0; -} - -/** -* Getting collation elements generated from a string -* @param str character string contain collation elements contained in [] and -* seperated by space -* @param ce array for storage, assuming size > 20 -* @param status error status -* @return position at the end of the codepoint section -*/ -static char * getCEs(char *str, uint32_t *ces, UErrorCode *status) { - char *pStartCP = uprv_strchr(str, '['); - int count = 0; - char *pEndCP; - char primary[100]; - char secondary[100]; - char tertiary[100]; - - while (*pStartCP == '[') { - uint32_t primarycount = 0; - uint32_t secondarycount = 0; - uint32_t tertiarycount = 0; - uint32_t CEi = 1; - pEndCP = strchr(pStartCP, ']'); - if(pEndCP == NULL) { - break; - } - pStartCP ++; - - primarycount = readElement(&pStartCP, primary, ',', status); - secondarycount = readElement(&pStartCP, secondary, ',', status); - tertiarycount = readElement(&pStartCP, tertiary, ']', status); - - /* I want to get the CEs entered right here, including continuation */ - ces[count ++] = getSingleCEValue(primary, secondary, tertiary, status); - if (U_FAILURE(*status)) { - break; - } - - while (2 * CEi < primarycount || CEi < secondarycount || - CEi < tertiarycount) { - uint32_t value = UCOL_CONTINUATION_MARKER; /* Continuation marker */ - if (2 * CEi < primarycount) { - value |= ((hex2num(*(primary + 4 * CEi)) & 0xF) << 28); - value |= ((hex2num(*(primary + 4 * CEi + 1)) & 0xF) << 24); - } - - if (2 * CEi + 1 < primarycount) { - value |= ((hex2num(*(primary + 4 * CEi + 2)) & 0xF) << 20); - value |= ((hex2num(*(primary + 4 * CEi + 3)) &0xF) << 16); - } - - if (CEi < secondarycount) { - value |= ((hex2num(*(secondary + 2 * CEi)) & 0xF) << 12); - value |= ((hex2num(*(secondary + 2 * CEi + 1)) & 0xF) << 8); - } - - if (CEi < tertiarycount) { - value |= ((hex2num(*(tertiary + 2 * CEi)) & 0x3) << 4); - value |= (hex2num(*(tertiary + 2 * CEi + 1)) & 0xF); - } - - CEi ++; - ces[count ++] = value; - } - - pStartCP = pEndCP + 1; - } - ces[count] = 0; - return pStartCP; -} - -/** -* Getting the FractionalUCA.txt file stream -*/ -static FileStream * getFractionalUCA(void) -{ - char newPath[256]; - char backupPath[256]; - FileStream *result = NULL; - - /* Look inside ICU_DATA first */ - uprv_strcpy(newPath, ctest_dataSrcDir()); - uprv_strcat(newPath, "unidata" U_FILE_SEP_STRING ); - uprv_strcat(newPath, "FractionalUCA.txt"); - - /* As a fallback, try to guess where the source data was located - * at the time ICU was built, and look there. - */ -#if defined (U_TOPSRCDIR) - strcpy(backupPath, U_TOPSRCDIR U_FILE_SEP_STRING "data"); -#else - { - UErrorCode errorCode = U_ZERO_ERROR; - strcpy(backupPath, loadTestData(&errorCode)); - strcat(backupPath, U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING "data"); - } -#endif - strcat(backupPath, U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING "FractionalUCA.txt"); - - result = T_FileStream_open(newPath, "rb"); - - if (result == NULL) { - result = T_FileStream_open(backupPath, "rb"); - if (result == NULL) { - log_err("Failed to open either %s or %s\n", newPath, backupPath); - } - } - return result; -} - -/** -* Testing the CEs returned by the iterator -*/ -static void TestCEs() { - FileStream *file = NULL; - char line[1024]; - char *str; - UChar codepoints[10]; - uint32_t ces[20]; - UErrorCode status = U_ZERO_ERROR; - UCollator *coll = ucol_open("", &status); - uint32_t lineNo = 0; - UChar contextCPs[5]; - - if (U_FAILURE(status)) { - log_err("Error in opening root collator\n"); - return; - } - - file = getFractionalUCA(); - - if (file == NULL) { - log_err("*** unable to open input FractionalUCA.txt file ***\n"); - return; - } - - - while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) { - int count = 0; - UCollationElements *iter; - int32_t preContextCeLen=0; - lineNo++; - /* skip this line if it is empty or a comment or is a return value - or start of some variable section */ - if(line[0] == 0 || line[0] == '#' || line[0] == '\n' || - line[0] == 0x000D || line[0] == '[') { - continue; - } - - str = getCodePoints(line, codepoints, contextCPs); - - /* these are 'fake' codepoints in the fractional UCA, and are used just - * for positioning of indirect values. They should not go through this - * test. - */ - if(*codepoints == 0xFDD0) { - continue; - } - if (*contextCPs != 0) { - iter = ucol_openElements(coll, contextCPs, -1, &status); - if (U_FAILURE(status)) { - log_err("Error in opening collation elements\n"); - break; - } - while((ces[preContextCeLen] = ucol_next(iter, &status)) != (uint32_t)UCOL_NULLORDER) { - preContextCeLen++; - } - ucol_closeElements(iter); - } - - getCEs(str, ces+preContextCeLen, &status); - if (U_FAILURE(status)) { - log_err("Error in parsing collation elements in FractionalUCA.txt\n"); - break; - } - iter = ucol_openElements(coll, codepoints, -1, &status); - if (U_FAILURE(status)) { - log_err("Error in opening collation elements\n"); - break; - } - for (;;) { - uint32_t ce = (uint32_t)ucol_next(iter, &status); - if (ce == 0xFFFFFFFF) { - ce = 0; - } - /* we now unconditionally reorder Thai/Lao prevowels, so this - * test would fail if we don't skip here. - */ - if(UCOL_ISTHAIPREVOWEL(*codepoints) && ce == 0 && count == 0) { - continue; - } - if (ce != ces[count] || U_FAILURE(status)) { - log_err("Collation elements in FractionalUCA.txt and iterators do not match!\n"); - break; - } - if (ces[count] == 0) { - break; - } - count ++; - } - ucol_closeElements(iter); + log_err_status(status, "Couldn't open collator -> %s\n", u_errorName(status)); } - - T_FileStream_close(file); - ucol_close(coll); } /** @@ -1410,7 +1054,7 @@ static void TestDiscontiguos() { resultiter = ucol_openElements(coll, rule, 1, &status); if (U_FAILURE(status)) { - log_err("Error opening collation rules\n"); + log_err_status(status, "Error opening collation rules -> %s\n", u_errorName(status)); return; } @@ -1469,489 +1113,149 @@ static void TestDiscontiguos() { ucol_close(coll); } -static void TestCEBufferOverflow() -{ - UChar str[UCOL_EXPAND_CE_BUFFER_SIZE + 1]; - UErrorCode status = U_ZERO_ERROR; - UChar rule[10]; - UCollator *coll; - UCollationElements *iter; - - u_uastrcpy(rule, "&z < AB"); - coll = ucol_openRules(rule, u_strlen(rule), UCOL_OFF, UCOL_DEFAULT_STRENGTH, NULL,&status); - if (U_FAILURE(status)) { - log_err("Rule based collator not created for testing ce buffer overflow\n"); - return; - } - - /* 0xDCDC is a trail surrogate hence deemed unsafe by the heuristic - test. this will cause an overflow in getPrev */ - str[0] = 0x0041; /* 'A' */ - /*uprv_memset(str + 1, 0xE0, sizeof(UChar) * UCOL_EXPAND_CE_BUFFER_SIZE);*/ - uprv_memset(str + 1, 0xDC, sizeof(UChar) * UCOL_EXPAND_CE_BUFFER_SIZE); - str[UCOL_EXPAND_CE_BUFFER_SIZE] = 0x0042; /* 'B' */ - iter = ucol_openElements(coll, str, UCOL_EXPAND_CE_BUFFER_SIZE + 1, - &status); - if (ucol_previous(iter, &status) == UCOL_NULLORDER || - status == U_BUFFER_OVERFLOW_ERROR) { - log_err("CE buffer should not overflow with long string of trail surrogates\n"); - } - ucol_closeElements(iter); - ucol_close(coll); -} - -/** -* Byte bounds checks. Checks if each byte in data is between upper and lower -* inclusive. -*/ -static UBool checkByteBounds(uint32_t data, char upper, char lower) -{ - int count = 4; - while (count > 0) { - char b = (char)(data & 0xFF); - if (b > upper || b < lower) { - return FALSE; - } - data = data >> 8; - count --; - } - return TRUE; -} - -/** -* Determines case of the string of codepoints. -* If it is a multiple codepoints it has to treated as a contraction. -*/ -#if 0 -static uint8_t getCase(const UChar *s, uint32_t len) { - UBool lower = FALSE; - UBool upper = FALSE; - UBool title = FALSE; - UErrorCode status = U_ZERO_ERROR; - UChar str[256]; - const UChar *ps = s; - - if (len == 0) { - return UCOL_LOWER_CASE; - } - - while (len > 0) { - UChar c = *ps ++; - - if (u_islower(c)) { - lower = TRUE; - } - if (u_isupper(c)) { - upper = TRUE; - } - if (u_istitle(c)) { - title = TRUE; - } - - len --; - } - if ((lower && !upper && !title) || (!lower && !upper && !title)){ - return UCOL_LOWER_CASE; - } - if (upper && !lower && !title) { - return UCOL_UPPER_CASE; - } - /* mix of cases here */ - /* len = unorm_normalize(s, len, UNORM_NFKD, 0, str, 256, &status); - if (U_FAILURE(status)) { - log_err("Error normalizing data string\n"); - return UCOL_LOWER_CASE; - }*/ - - if ((title && len >= 2) || (lower && upper)) { - return UCOL_MIXED_CASE; - } - if (u_isupper(s[0])) { - return UCOL_UPPER_CASE; - } - return UCOL_LOWER_CASE; -} -#endif - /** -* Checking collation element validity given the boundary arguments. -*/ -static UBool checkCEValidity(const UCollator *coll, const UChar *codepoints, - int length, uint32_t primarymax, - uint32_t secondarymax) -{ - UErrorCode status = U_ZERO_ERROR; - UCollationElements *iter = ucol_openElements(coll, codepoints, length, - &status); - uint32_t ce; - UBool first = TRUE; -/* - UBool upper = FALSE; - UBool lower = FALSE; -*/ - - if (U_FAILURE(status)) { - log_err("Error creating iterator for testing validity\n"); - } - - ce = ucol_next(iter, &status); - - while (ce != UCOL_NULLORDER) { - if (ce != 0) { - uint32_t primary = UCOL_PRIMARYORDER(ce); - uint32_t secondary = UCOL_SECONDARYORDER(ce); - uint32_t tertiary = UCOL_TERTIARYORDER(ce); -/* uint32_t scasebits = tertiary & 0xC0;*/ - - if ((tertiary == 0 && secondary != 0) || - (tertiary < 0xC0 && secondary == 0 && primary != 0)) { - /* n-1th level is not zero when the nth level is - except for continuations, this is wrong */ - log_err("Lower level weight not 0 when high level weight is 0\n"); - goto fail; - } - else { - /* checks if any byte is illegal ie = 01 02 03. */ - if (checkByteBounds(ce, 0x3, 0x1)) { - log_err("Byte range in CE lies in illegal bounds 0x1 - 0x3\n"); - goto fail; - } - } - if ((primary != 0 && primary < primarymax) - || ((primary & 0xFF) == 0xFF) || (((primary>>8) & 0xFF) == 0xFF) - || ((primary & 0xFF) && ((primary & 0xFF) <= 0x03)) - || (((primary>>8) & 0xFF) && ((primary>>8) & 0xFF) <= 0x03) - || (primary >= 0xFE00 && !isContinuation(ce))) { - log_err("UCA primary weight out of bounds: %04X for string starting with %04X\n", - primary, codepoints[0]); - goto fail; - } - /* case matching not done since data generated by ken */ - if (first) { - if (secondary >= 6 && secondary <= secondarymax) { - log_err("Secondary weight out of range\n"); - goto fail; - } - first = FALSE; - } - } - ce = ucol_next(iter, &status); - } - ucol_closeElements(iter); - return TRUE; -fail : - ucol_closeElements(iter); - return FALSE; -} - -static void TestCEValidity() -{ - /* testing UCA collation elements */ - UErrorCode status = U_ZERO_ERROR; - /* en_US has no tailorings */ - UCollator *coll = ucol_open("root", &status); - /* tailored locales */ - char locale[][11] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN", "zh__PINYIN"}; - const char *loc; - FileStream *file = NULL; - char line[1024]; - UChar codepoints[10]; - int count = 0; - int maxCount = 0; - UChar contextCPs[3]; - UParseError parseError; - if (U_FAILURE(status)) { - log_err("en_US collator creation failed\n"); - return; - } - log_verbose("Testing UCA elements\n"); - file = getFractionalUCA(); - if (file == NULL) { - log_err("Fractional UCA data can not be opened\n"); - return; - } - - while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) { - if(line[0] == 0 || line[0] == '#' || line[0] == '\n' || - line[0] == 0x000D || line[0] == '[') { - continue; - } - - getCodePoints(line, codepoints, contextCPs); - checkCEValidity(coll, codepoints, u_strlen(codepoints), 5, 86); - } - - log_verbose("Testing UCA elements for the whole range of unicode characters\n"); - codepoints[0] = 0; - while (codepoints[0] < 0xFFFF) { - if (u_isdefined((UChar32)codepoints[0])) { - checkCEValidity(coll, codepoints, 1, 5, 86); - } - codepoints[0] ++; - } - - ucol_close(coll); - - /* testing tailored collation elements */ - log_verbose("Testing tailored elements\n"); - if(QUICK) { - maxCount = sizeof(locale)/sizeof(locale[0]); - } else { - maxCount = uloc_countAvailable(); - } - while (count < maxCount) { - const UChar *rules = NULL, - *current = NULL; - UChar *rulesCopy = NULL; - int32_t ruleLen = 0; - - uint32_t chOffset = 0; - uint32_t chLen = 0; - uint32_t exOffset = 0; - uint32_t exLen = 0; - uint32_t prefixOffset = 0; - uint32_t prefixLen = 0; - UBool startOfRules = TRUE; - UColOptionSet opts; - - UColTokenParser src; - uint32_t strength = 0; - uint16_t specs = 0; - if(QUICK) { - loc = locale[count]; - } else { - loc = uloc_getAvailable(count); - if(!hasCollationElements(loc)) { - count++; - continue; - } - } - - log_verbose("Testing CEs for %s\n", loc); - - coll = ucol_open(loc, &status); - if (U_FAILURE(status)) { - log_err("%s collator creation failed\n", loc); - return; - } - - src.opts = &opts; - rules = ucol_getRules(coll, &ruleLen); - - if (ruleLen > 0) { - rulesCopy = (UChar *)malloc((ruleLen + - UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar)); - uprv_memcpy(rulesCopy, rules, ruleLen * sizeof(UChar)); - src.current = src.source = rulesCopy; - src.end = rulesCopy + ruleLen; - src.extraCurrent = src.end; - src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE; - - while ((current = ucol_tok_parseNextToken(&src, startOfRules, &parseError,&status)) != NULL) { - strength = src.parsedToken.strength; - chOffset = src.parsedToken.charsOffset; - chLen = src.parsedToken.charsLen; - exOffset = src.parsedToken.extensionOffset; - exLen = src.parsedToken.extensionLen; - prefixOffset = src.parsedToken.prefixOffset; - prefixLen = src.parsedToken.prefixLen; - specs = src.parsedToken.flags; - - startOfRules = FALSE; - uprv_memcpy(codepoints, src.source + chOffset, - chLen * sizeof(UChar)); - codepoints[chLen] = 0; - checkCEValidity(coll, codepoints, chLen, 4, 85); - } - free(rulesCopy); - } - - ucol_close(coll); - count ++; - } - T_FileStream_close(file); -} - -static void printSortKeyError(const UChar *codepoints, int length, - uint8_t *sortkey, int sklen) -{ - int count = 0; - log_err("Sortkey not valid for "); - while (length > 0) { - log_err("0x%04x ", *codepoints); - length --; - codepoints ++; - } - log_err("\nSortkey : "); - while (count < sklen) { - log_err("0x%02x ", sortkey[count]); - count ++; - } - log_err("\n"); -} - -/** -* Checking sort key validity for all levels +* TestSearchCollatorElements tests iterator behavior (forwards and backwards) with +* normalization on AND jamo tailoring, among other things. +* +* Note: This test is sensitive to changes of the root collator, +* for example whether the ae-ligature maps to three CEs (as in the DUCET) +* or to two CEs (as in the CLDR 24 FractionalUCA.txt). +* It is also sensitive to how those CEs map to the iterator's 32-bit CE encoding. +* For example, the DUCET's artificial secondary CE in the ae-ligature +* may map to two 32-bit iterator CEs (as it did until ICU 52). */ -static UBool checkSortKeyValidity(UCollator *coll, - const UChar *codepoints, - int length) +static const UChar tsceText[] = { /* Nothing in here should be ignorable */ + 0x0020, 0xAC00, /* simple LV Hangul */ + 0x0020, 0xAC01, /* simple LVT Hangul */ + 0x0020, 0xAC0F, /* LVTT, last jamo expands for search */ + 0x0020, 0xAFFF, /* LLVVVTT, every jamo expands for search */ + 0x0020, 0x1100, 0x1161, 0x11A8, /* 0xAC01 as conjoining jamo */ + 0x0020, 0x3131, 0x314F, 0x3131, /* 0xAC01 as compatibility jamo */ + 0x0020, 0x1100, 0x1161, 0x11B6, /* 0xAC0F as conjoining jamo; last expands for search */ + 0x0020, 0x1101, 0x1170, 0x11B6, /* 0xAFFF as conjoining jamo; all expand for search */ + 0x0020, 0x00E6, /* small letter ae, expands */ + 0x0020, 0x1E4D, /* small letter o with tilde and acute, decomposes */ + 0x0020 +}; +enum { kLen_tsceText = UPRV_LENGTHOF(tsceText) }; + +static const int32_t rootStandardOffsets[] = { + 0, 1,2, + 2, 3,4,4, + 4, 5,6,6, + 6, 7,8,8, + 8, 9,10,11, + 12, 13,14,15, + 16, 17,18,19, + 20, 21,22,23, + 24, 25,26, /* plus another 1-2 offset=26 if ae-ligature maps to three CEs */ + 26, 27,28,28, + 28, + 29 +}; +enum { kLen_rootStandardOffsets = UPRV_LENGTHOF(rootStandardOffsets) }; + +static const int32_t rootSearchOffsets[] = { + 0, 1,2, + 2, 3,4,4, + 4, 5,6,6,6, + 6, 7,8,8,8,8,8,8, + 8, 9,10,11, + 12, 13,14,15, + 16, 17,18,19,20, + 20, 21,22,22,23,23,23,24, + 24, 25,26, /* plus another 1-2 offset=26 if ae-ligature maps to three CEs */ + 26, 27,28,28, + 28, + 29 +}; +enum { kLen_rootSearchOffsets = UPRV_LENGTHOF(rootSearchOffsets) }; + +typedef struct { + const char * locale; + const int32_t * offsets; + int32_t offsetsLen; +} TSCEItem; + +static const TSCEItem tsceItems[] = { + { "root", rootStandardOffsets, kLen_rootStandardOffsets }, + { "root@collation=search", rootSearchOffsets, kLen_rootSearchOffsets }, + { NULL, NULL, 0 } +}; + +static void TestSearchCollatorElements(void) { - UErrorCode status = U_ZERO_ERROR; - UCollationStrength strength[5] = {UCOL_PRIMARY, UCOL_SECONDARY, - UCOL_TERTIARY, UCOL_QUATERNARY, - UCOL_IDENTICAL}; - int strengthlen = 5; - int index = 0; - int caselevel = 0; - - while (caselevel < 1) { - if (caselevel == 0) { - ucol_setAttribute(coll, UCOL_CASE_LEVEL, UCOL_OFF, &status); - } - else { - ucol_setAttribute(coll, UCOL_CASE_LEVEL, UCOL_ON, &status); - } - - while (index < strengthlen) { - int count01 = 0; - uint32_t count = 0; - uint8_t sortkey[128]; - uint32_t sklen; - - ucol_setStrength(coll, strength[index]); - sklen = ucol_getSortKey(coll, codepoints, length, sortkey, 128); - while (sortkey[count] != 0) { - if (sortkey[count] == 2 || (sortkey[count] == 3 && count01 > 0 && index != 4)) { - printSortKeyError(codepoints, length, sortkey, sklen); - return FALSE; - } - if (sortkey[count] == 1) { - count01 ++; + const TSCEItem * tsceItemPtr; + for (tsceItemPtr = tsceItems; tsceItemPtr->locale != NULL; tsceItemPtr++) { + UErrorCode status = U_ZERO_ERROR; + UCollator* ucol = ucol_open(tsceItemPtr->locale, &status); + if ( U_SUCCESS(status) ) { + UCollationElements * uce = ucol_openElements(ucol, tsceText, kLen_tsceText, &status); + if ( U_SUCCESS(status) ) { + int32_t offset, element; + const int32_t * nextOffsetPtr; + const int32_t * limitOffsetPtr; + + nextOffsetPtr = tsceItemPtr->offsets; + limitOffsetPtr = tsceItemPtr->offsets + tsceItemPtr->offsetsLen; + do { + offset = ucol_getOffset(uce); + element = ucol_next(uce, &status); + log_verbose("(%s) offset=%2d ce=%08x\n", tsceItemPtr->locale, offset, element); + if ( element == 0 ) { + log_err("error, locale %s, ucol_next returned element 0\n", tsceItemPtr->locale ); + } + if ( nextOffsetPtr < limitOffsetPtr ) { + if (offset != *nextOffsetPtr) { + log_err("error, locale %s, expected ucol_next -> ucol_getOffset %d, got %d\n", + tsceItemPtr->locale, *nextOffsetPtr, offset ); + nextOffsetPtr = limitOffsetPtr; + break; + } + nextOffsetPtr++; + } else { + log_err("error, locale %s, ucol_next returned more elements than expected\n", tsceItemPtr->locale ); + } + } while ( U_SUCCESS(status) && element != UCOL_NULLORDER ); + if ( nextOffsetPtr < limitOffsetPtr ) { + log_err("error, locale %s, ucol_next returned fewer elements than expected\n", tsceItemPtr->locale ); } - count ++; - } - - if (count + 1 != sklen || (count01 != index + caselevel)) { - printSortKeyError(codepoints, length, sortkey, sklen); - return FALSE; - } - index ++; - } - caselevel ++; - } - return TRUE; -} - -static void TestSortKeyValidity(void) -{ - /* testing UCA collation elements */ - UErrorCode status = U_ZERO_ERROR; - /* en_US has no tailorings */ - UCollator *coll = ucol_open("en_US", &status); - /* tailored locales */ - char locale[][6] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN"}; - FileStream *file = NULL; - char line[1024]; - UChar codepoints[10]; - int count = 0; - UChar contextCPs[5]; - UParseError parseError; - if (U_FAILURE(status)) { - log_err("en_US collator creation failed\n"); - return; - } - log_verbose("Testing UCA elements\n"); - file = getFractionalUCA(); - if (file == NULL) { - log_err("Fractional UCA data can not be opened\n"); - return; - } - - while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) { - if(line[0] == 0 || line[0] == '#' || line[0] == '\n' || - line[0] == 0x000D || line[0] == '[') { - continue; - } - getCodePoints(line, codepoints, contextCPs); - checkSortKeyValidity(coll, codepoints, u_strlen(codepoints)); - } - - log_verbose("Testing UCA elements for the whole range of unicode characters\n"); - codepoints[0] = 0; - - while (codepoints[0] < 0xFFFF) { - if (u_isdefined((UChar32)codepoints[0])) { - checkSortKeyValidity(coll, codepoints, 1); - } - codepoints[0] ++; - } - - ucol_close(coll); - - /* testing tailored collation elements */ - log_verbose("Testing tailored elements\n"); - while (count < 5) { - const UChar *rules = NULL, - *current = NULL; - UChar *rulesCopy = NULL; - int32_t ruleLen = 0; - - uint32_t chOffset = 0; - uint32_t chLen = 0; - uint32_t exOffset = 0; - uint32_t exLen = 0; - uint32_t prefixOffset = 0; - uint32_t prefixLen = 0; - UBool startOfRules = TRUE; - UColOptionSet opts; - - UColTokenParser src; - uint32_t strength = 0; - uint16_t specs = 0; - - coll = ucol_open(locale[count], &status); - if (U_FAILURE(status)) { - log_err("%s collator creation failed\n", locale[count]); - return; - } + ucol_setOffset(uce, kLen_tsceText, &status); + status = U_ZERO_ERROR; + nextOffsetPtr = tsceItemPtr->offsets + tsceItemPtr->offsetsLen; + limitOffsetPtr = tsceItemPtr->offsets; + do { + offset = ucol_getOffset(uce); + element = ucol_previous(uce, &status); + if ( element == 0 ) { + log_err("error, locale %s, ucol_previous returned element 0\n", tsceItemPtr->locale ); + } + if ( nextOffsetPtr > limitOffsetPtr ) { + nextOffsetPtr--; + if (offset != *nextOffsetPtr) { + log_err("error, locale %s, expected ucol_previous -> ucol_getOffset %d, got %d\n", + tsceItemPtr->locale, *nextOffsetPtr, offset ); + nextOffsetPtr = limitOffsetPtr; + break; + } + } else { + log_err("error, locale %s, ucol_previous returned more elements than expected\n", tsceItemPtr->locale ); + } + } while ( U_SUCCESS(status) && element != UCOL_NULLORDER ); + if ( nextOffsetPtr > limitOffsetPtr ) { + log_err("error, locale %s, ucol_previous returned fewer elements than expected\n", tsceItemPtr->locale ); + } - src.opts = &opts; - rules = ucol_getRules(coll, &ruleLen); - - if (ruleLen > 0) { - rulesCopy = (UChar *)malloc((ruleLen + - UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar)); - uprv_memcpy(rulesCopy, rules, ruleLen * sizeof(UChar)); - src.current = src.source = rulesCopy; - src.end = rulesCopy + ruleLen; - src.extraCurrent = src.end; - src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE; - - while ((current = ucol_tok_parseNextToken(&src, startOfRules,&parseError, &status)) != NULL) { - strength = src.parsedToken.strength; - chOffset = src.parsedToken.charsOffset; - chLen = src.parsedToken.charsLen; - exOffset = src.parsedToken.extensionOffset; - exLen = src.parsedToken.extensionLen; - prefixOffset = src.parsedToken.prefixOffset; - prefixLen = src.parsedToken.prefixLen; - specs = src.parsedToken.flags; - - startOfRules = FALSE; - uprv_memcpy(codepoints, src.source + chOffset, - chLen * sizeof(UChar)); - codepoints[chLen] = 0; - checkSortKeyValidity(coll, codepoints, chLen); + ucol_closeElements(uce); + } else { + log_err("error, locale %s, ucol_openElements failed: %s\n", tsceItemPtr->locale, u_errorName(status) ); } - free(rulesCopy); + ucol_close(ucol); + } else { + log_data_err("error, locale %s, ucol_open failed: %s\n", tsceItemPtr->locale, u_errorName(status) ); } - - ucol_close(coll); - count ++; } - T_FileStream_close(file); } #endif /* #if !UCONFIG_NO_COLLATION */