+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
/********************************************************************
* COPYRIGHT:
- * Copyright (c) 1997-2011, International Business Machines Corporation and
+ * Copyright (c) 1997-2016, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/********************************************************************************
#include "unicode/uloc.h"
#include "unicode/uchar.h"
#include "unicode/ustring.h"
+#include "unicode/utf16.h"
#include "unicode/putil.h"
#include "callcoll.h"
#include "cmemory.h"
#include "filestrm.h"
#include "cstring.h"
#include "ucol_imp.h"
-#include "ucol_tok.h"
#include "uparse.h"
#include <stdio.h>
addTest(root, &TestBug672, "tscoll/citertst/TestBug672");
addTest(root, &TestBug672Normalize, "tscoll/citertst/TestBug672Normalize");
addTest(root, &TestSmallBuffer, "tscoll/citertst/TestSmallBuffer");
- addTest(root, &TestCEs, "tscoll/citertst/TestCEs");
addTest(root, &TestDiscontiguos, "tscoll/citertst/TestDiscontiguos");
- addTest(root, &TestCEBufferOverflow, "tscoll/citertst/TestCEBufferOverflow");
- addTest(root, &TestCEValidity, "tscoll/citertst/TestCEValidity");
- addTest(root, &TestSortKeyValidity, "tscoll/citertst/TestSortKeyValidity");
addTest(root, &TestSearchCollatorElements, "tscoll/citertst/TestSearchCollatorElements");
}
/* Now set it to point to a null string with fake length*/
ucol_setText(iter2, NULL, 2, &status);
- if (U_FAILURE(status))
+ if (status != U_ILLEGAL_ARGUMENT_ERROR)
{
- log_err("call to iter2->setText(null) failed. %s\n", myErrorName(status));
- }
- else
- {
- if (ucol_next(iter2, &status) != UCOL_NULLORDER) {
- log_err("iter2 with null text expected to return UCOL_NULLORDER\n");
- }
+ log_err("call to iter2->setText(null, 2) should yield an illegal-argument-error - %s\n",
+ myErrorName(status));
}
ucol_closeElements(iter2);
}
U16_APPEND(supplementary, stringOffset, 2, unassigned, isError);
+ (void)isError; /* Suppress set but not used warning. */
ucol_setText(iter, supplementary, 2, &status);
sorder = ucol_previous(iter, &status);
}
}
-/**
-* Sniplets of code from genuca
-*/
-static int32_t hex2num(char hex) {
- if(hex>='0' && hex <='9') {
- return hex-'0';
- } else if(hex>='a' && hex<='f') {
- return hex-'a'+10;
- } else if(hex>='A' && hex<='F') {
- return hex-'A'+10;
- } else {
- return 0;
- }
-}
-
-/**
-* Getting codepoints from a string
-* @param str character string contain codepoints seperated by space and ended
-* by a semicolon
-* @param codepoints array for storage, assuming size > 5
-* @return position at the end of the codepoint section
-*/
-static char *getCodePoints(char *str, UChar *codepoints, UChar *contextCPs) {
- UErrorCode errorCode = U_ZERO_ERROR;
- char *semi = uprv_strchr(str, ';');
- char *pipe = uprv_strchr(str, '|');
- char *s;
- *codepoints = 0;
- *contextCPs = 0;
- if(semi == NULL) {
- log_err("expected semicolon after code point string in FractionalUCA.txt %s\n", str);
- return str;
- }
- if(pipe != NULL) {
- int32_t contextLength;
- *pipe = 0;
- contextLength = u_parseString(str, contextCPs, 99, NULL, &errorCode);
- *pipe = '|';
- if(U_FAILURE(errorCode)) {
- log_err("error parsing precontext string from FractionalUCA.txt %s\n", str);
- return str;
- }
- /* prepend the precontext string to the codepoints */
- u_memcpy(codepoints, contextCPs, contextLength);
- codepoints += contextLength;
- /* start of the code point string */
- s = pipe + 1;
- } else {
- s = str;
- }
- u_parseString(s, codepoints, 99, NULL, &errorCode);
- if(U_FAILURE(errorCode)) {
- log_err("error parsing code point string from FractionalUCA.txt %s\n", str);
- return str;
- }
- return semi + 1;
-}
-
-/**
-* Sniplets of code from genuca
-*/
-static int32_t
-readElement(char **from, char *to, char separator, UErrorCode *status)
-{
- if (U_SUCCESS(*status)) {
- char buffer[1024];
- int32_t i = 0;
- while (**from != separator) {
- if (**from != ' ') {
- *(buffer+i++) = **from;
- }
- (*from)++;
- }
- (*from)++;
- *(buffer + i) = 0;
- strcpy(to, buffer);
- return i/2;
- }
-
- return 0;
-}
-
-/**
-* Sniplets of code from genuca
-*/
-static uint32_t
-getSingleCEValue(char *primary, char *secondary, char *tertiary,
- UErrorCode *status)
-{
- if (U_SUCCESS(*status)) {
- uint32_t value = 0;
- char primsave = '\0';
- char secsave = '\0';
- char tersave = '\0';
- char *primend = primary+4;
- char *secend = secondary+2;
- char *terend = tertiary+2;
- uint32_t primvalue;
- uint32_t secvalue;
- uint32_t tervalue;
-
- if (uprv_strlen(primary) > 4) {
- primsave = *primend;
- *primend = '\0';
- }
-
- if (uprv_strlen(secondary) > 2) {
- secsave = *secend;
- *secend = '\0';
- }
-
- if (uprv_strlen(tertiary) > 2) {
- tersave = *terend;
- *terend = '\0';
- }
-
- primvalue = (*primary!='\0')?uprv_strtoul(primary, &primend, 16):0;
- secvalue = (*secondary!='\0')?uprv_strtoul(secondary, &secend, 16):0;
- tervalue = (*tertiary!='\0')?uprv_strtoul(tertiary, &terend, 16):0;
- if(primvalue <= 0xFF) {
- primvalue <<= 8;
- }
-
- value = ((primvalue << UCOL_PRIMARYORDERSHIFT) & UCOL_PRIMARYORDERMASK)
- | ((secvalue << UCOL_SECONDARYORDERSHIFT) & UCOL_SECONDARYORDERMASK)
- | (tervalue & UCOL_TERTIARYORDERMASK);
-
- if(primsave!='\0') {
- *primend = primsave;
- }
- if(secsave!='\0') {
- *secend = secsave;
- }
- if(tersave!='\0') {
- *terend = tersave;
- }
- return value;
- }
- return 0;
-}
-
-/**
-* Getting collation elements generated from a string
-* @param str character string contain collation elements contained in [] and
-* seperated by space
-* @param ce array for storage, assuming size > 20
-* @param status error status
-* @return position at the end of the codepoint section
-*/
-static char * getCEs(char *str, uint32_t *ces, UErrorCode *status) {
- char *pStartCP = uprv_strchr(str, '[');
- int count = 0;
- char *pEndCP;
- char primary[100];
- char secondary[100];
- char tertiary[100];
-
- while (*pStartCP == '[') {
- uint32_t primarycount = 0;
- uint32_t secondarycount = 0;
- uint32_t tertiarycount = 0;
- uint32_t CEi = 1;
- pEndCP = strchr(pStartCP, ']');
- if(pEndCP == NULL) {
- break;
- }
- pStartCP ++;
-
- primarycount = readElement(&pStartCP, primary, ',', status);
- secondarycount = readElement(&pStartCP, secondary, ',', status);
- tertiarycount = readElement(&pStartCP, tertiary, ']', status);
-
- /* I want to get the CEs entered right here, including continuation */
- ces[count ++] = getSingleCEValue(primary, secondary, tertiary, status);
- if (U_FAILURE(*status)) {
- break;
- }
-
- while (2 * CEi < primarycount || CEi < secondarycount ||
- CEi < tertiarycount) {
- uint32_t value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
- if (2 * CEi < primarycount) {
- value |= ((hex2num(*(primary + 4 * CEi)) & 0xF) << 28);
- value |= ((hex2num(*(primary + 4 * CEi + 1)) & 0xF) << 24);
- }
-
- if (2 * CEi + 1 < primarycount) {
- value |= ((hex2num(*(primary + 4 * CEi + 2)) & 0xF) << 20);
- value |= ((hex2num(*(primary + 4 * CEi + 3)) &0xF) << 16);
- }
-
- if (CEi < secondarycount) {
- value |= ((hex2num(*(secondary + 2 * CEi)) & 0xF) << 12);
- value |= ((hex2num(*(secondary + 2 * CEi + 1)) & 0xF) << 8);
- }
-
- if (CEi < tertiarycount) {
- value |= ((hex2num(*(tertiary + 2 * CEi)) & 0x3) << 4);
- value |= (hex2num(*(tertiary + 2 * CEi + 1)) & 0xF);
- }
-
- CEi ++;
- ces[count ++] = value;
- }
-
- pStartCP = pEndCP + 1;
- }
- ces[count] = 0;
- return pStartCP;
-}
-
-/**
-* Getting the FractionalUCA.txt file stream
-*/
-static FileStream * getFractionalUCA(void)
-{
- char newPath[256];
- char backupPath[256];
- FileStream *result = NULL;
-
- /* Look inside ICU_DATA first */
- uprv_strcpy(newPath, ctest_dataSrcDir());
- uprv_strcat(newPath, "unidata" U_FILE_SEP_STRING );
- uprv_strcat(newPath, "FractionalUCA.txt");
-
- /* As a fallback, try to guess where the source data was located
- * at the time ICU was built, and look there.
- */
-#if defined (U_TOPSRCDIR)
- strcpy(backupPath, U_TOPSRCDIR U_FILE_SEP_STRING "data");
-#else
- {
- UErrorCode errorCode = U_ZERO_ERROR;
- strcpy(backupPath, loadTestData(&errorCode));
- strcat(backupPath, U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING "data");
- }
-#endif
- strcat(backupPath, U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING "FractionalUCA.txt");
-
- result = T_FileStream_open(newPath, "rb");
-
- if (result == NULL) {
- result = T_FileStream_open(backupPath, "rb");
- if (result == NULL) {
- log_err("Failed to open either %s or %s\n", newPath, backupPath);
- }
- }
- return result;
-}
-
-/**
-* Testing the CEs returned by the iterator
-*/
-static void TestCEs() {
- FileStream *file = NULL;
- char line[2048];
- char *str;
- UChar codepoints[10];
- uint32_t ces[20];
- UErrorCode status = U_ZERO_ERROR;
- UCollator *coll = ucol_open("", &status);
- uint32_t lineNo = 0;
- UChar contextCPs[5];
-
- if (U_FAILURE(status)) {
- log_err_status(status, "Error in opening root collator -> %s\n", u_errorName(status));
- return;
- }
-
- file = getFractionalUCA();
-
- if (file == NULL) {
- log_err("*** unable to open input FractionalUCA.txt file ***\n");
- return;
- }
-
-
- while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) {
- int count = 0;
- UCollationElements *iter;
- int32_t preContextCeLen=0;
- lineNo++;
- /* skip this line if it is empty or a comment or is a return value
- or start of some variable section */
- if(line[0] == 0 || line[0] == '#' || line[0] == '\n' ||
- line[0] == 0x000D || line[0] == '[') {
- continue;
- }
-
- str = getCodePoints(line, codepoints, contextCPs);
-
- /* these are 'fake' codepoints in the fractional UCA, and are used just
- * for positioning of indirect values. They should not go through this
- * test.
- */
- if(*codepoints == 0xFDD0) {
- continue;
- }
- if (*contextCPs != 0) {
- iter = ucol_openElements(coll, contextCPs, -1, &status);
- if (U_FAILURE(status)) {
- log_err("Error in opening collation elements\n");
- break;
- }
- while((ces[preContextCeLen] = ucol_next(iter, &status)) != (uint32_t)UCOL_NULLORDER) {
- preContextCeLen++;
- }
- ucol_closeElements(iter);
- }
-
- getCEs(str, ces+preContextCeLen, &status);
- if (U_FAILURE(status)) {
- log_err("Error in parsing collation elements in FractionalUCA.txt\n");
- break;
- }
- iter = ucol_openElements(coll, codepoints, -1, &status);
- if (U_FAILURE(status)) {
- log_err("Error in opening collation elements\n");
- break;
- }
- for (;;) {
- uint32_t ce = (uint32_t)ucol_next(iter, &status);
- if (ce == 0xFFFFFFFF) {
- ce = 0;
- }
- /* we now unconditionally reorder Thai/Lao prevowels, so this
- * test would fail if we don't skip here.
- */
- if(UCOL_ISTHAIPREVOWEL(*codepoints) && ce == 0 && count == 0) {
- continue;
- }
- if (ce != ces[count] || U_FAILURE(status)) {
- log_err("Collation elements in FractionalUCA.txt and iterators do not match!\n");
- break;
- }
- if (ces[count] == 0) {
- break;
- }
- count ++;
- }
- ucol_closeElements(iter);
- }
-
- T_FileStream_close(file);
- ucol_close(coll);
-}
-
/**
* Testing the discontiguous contractions
*/
ucol_close(coll);
}
-static void TestCEBufferOverflow()
-{
- UChar str[UCOL_EXPAND_CE_BUFFER_SIZE + 1];
- UErrorCode status = U_ZERO_ERROR;
- UChar rule[10];
- UCollator *coll;
- UCollationElements *iter;
-
- u_uastrcpy(rule, "&z < AB");
- coll = ucol_openRules(rule, u_strlen(rule), UCOL_OFF, UCOL_DEFAULT_STRENGTH, NULL,&status);
- if (U_FAILURE(status)) {
- log_err_status(status, "Rule based collator not created for testing ce buffer overflow -> %s\n", u_errorName(status));
- return;
- }
-
- /* 0xDCDC is a trail surrogate hence deemed unsafe by the heuristic
- test. this will cause an overflow in getPrev */
- str[0] = 0x0041; /* 'A' */
- /*uprv_memset(str + 1, 0xE0, sizeof(UChar) * UCOL_EXPAND_CE_BUFFER_SIZE);*/
- uprv_memset(str + 1, 0xDC, sizeof(UChar) * UCOL_EXPAND_CE_BUFFER_SIZE);
- str[UCOL_EXPAND_CE_BUFFER_SIZE] = 0x0042; /* 'B' */
- iter = ucol_openElements(coll, str, UCOL_EXPAND_CE_BUFFER_SIZE + 1,
- &status);
- if (ucol_previous(iter, &status) == UCOL_NULLORDER ||
- status == U_BUFFER_OVERFLOW_ERROR) {
- log_err("CE buffer should not overflow with long string of trail surrogates\n");
- }
- ucol_closeElements(iter);
- ucol_close(coll);
-}
-
-/**
-* Checking collation element validity.
-*/
-#define MAX_CODEPOINTS_TO_SHOW 10
-static void showCodepoints(const UChar *codepoints, int length, char * codepointText) {
- int i, lengthToUse = length;
- if (lengthToUse > MAX_CODEPOINTS_TO_SHOW) {
- lengthToUse = MAX_CODEPOINTS_TO_SHOW;
- }
- for (i = 0; i < lengthToUse; ++i) {
- int bytesWritten = sprintf(codepointText, " %04X", *codepoints++);
- if (bytesWritten <= 0) {
- break;
- }
- codepointText += bytesWritten;
- }
- if (i < length) {
- sprintf(codepointText, " ...");
- }
-}
-
-static UBool checkCEValidity(const UCollator *coll, const UChar *codepoints,
- int length)
-{
- UErrorCode status = U_ZERO_ERROR;
- UCollationElements *iter = ucol_openElements(coll, codepoints, length,
- &status);
- UBool result = FALSE;
- UBool primaryDone = FALSE, secondaryDone = FALSE, tertiaryDone = FALSE;
- const char * collLocale;
-
- if (U_FAILURE(status)) {
- log_err("Error creating iterator for testing validity\n");
- return FALSE;
- }
- collLocale = ucol_getLocale(coll, ULOC_VALID_LOCALE, &status);
- if (U_FAILURE(status) || collLocale==NULL) {
- status = U_ZERO_ERROR;
- collLocale = "?";
- }
-
- for (;;) {
- uint32_t ce = ucol_next(iter, &status);
- uint32_t primary, p1, p2, secondary, tertiary;
- if (ce == UCOL_NULLORDER) {
- result = TRUE;
- break;
- }
- if (ce == 0) {
- continue;
- }
- if (ce == 0x02000202) {
- /* special CE for merge-sort character */
- if (*codepoints == 0xFFFE /* && length == 1 */) {
- /*
- * Note: We should check for length==1 but the token parser appears
- * to give us trailing NUL characters.
- * TODO: Ticket #8047: Change TestCEValidity to use ucol_getTailoredSet()
- * rather than the internal collation rule parser
- */
- continue;
- } else {
- log_err("Special 02/02/02 weight for code point U+%04X [len %d] != U+FFFE\n",
- (int)*codepoints, (int)length);
- break;
- }
- }
- primary = UCOL_PRIMARYORDER(ce);
- p1 = primary >> 8;
- p2 = primary & 0xFF;
- secondary = UCOL_SECONDARYORDER(ce);
- tertiary = UCOL_TERTIARYORDER(ce) & UCOL_REMOVE_CONTINUATION;
-
- if (!isContinuation(ce)) {
- if ((ce & UCOL_REMOVE_CONTINUATION) == 0) {
- log_err("Empty CE %08lX except for case bits\n", (long)ce);
- break;
- }
- if (p1 == 0) {
- if (p2 != 0) {
- log_err("Primary 00 xx in %08lX\n", (long)ce);
- break;
- }
- primaryDone = TRUE;
- } else {
- if (p1 <= 2 || p1 >= 0xF0) {
- /* Primary first bytes F0..FF are specials. */
- log_err("Primary first byte of %08lX out of range\n", (long)ce);
- break;
- }
- if (p2 == 0) {
- primaryDone = TRUE;
- } else {
- if (p2 <= 3 || p2 >= 0xFF) {
- /* Primary second bytes 03 and FF are sort key compression terminators. */
- log_err("Primary second byte of %08lX out of range\n", (long)ce);
- break;
- }
- primaryDone = FALSE;
- }
- }
- if (secondary == 0) {
- if (primary != 0) {
- log_err("Primary!=0 secondary==0 in %08lX\n", (long)ce);
- break;
- }
- secondaryDone = TRUE;
- } else {
- if (secondary <= 2 ||
- (UCOL_BYTE_COMMON < secondary && secondary <= (UCOL_BYTE_COMMON + 0x80))
- ) {
- /* Secondary first bytes common+1..+0x80 are used for sort key compression. */
- log_err("Secondary byte of %08lX out of range\n", (long)ce);
- break;
- }
- secondaryDone = FALSE;
- }
- if (tertiary == 0) {
- /* We know that ce != 0. */
- log_err("Primary!=0 or secondary!=0 but tertiary==0 in %08lX\n", (long)ce);
- break;
- }
- if (tertiary <= 2) {
- log_err("Tertiary byte of %08lX out of range\n", (long)ce);
- break;
- }
- tertiaryDone = FALSE;
- } else {
- if ((ce & UCOL_REMOVE_CONTINUATION) == 0) {
- log_err("Empty continuation %08lX\n", (long)ce);
- break;
- }
- if (primaryDone && primary != 0) {
- log_err("Primary was done but continues in %08lX\n", (long)ce);
- break;
- }
- if (p1 == 0) {
- if (p2 != 0) {
- log_err("Primary 00 xx in %08lX\n", (long)ce);
- break;
- }
- primaryDone = TRUE;
- } else {
- if (p1 <= 2) {
- log_err("Primary first byte of %08lX out of range\n", (long)ce);
- break;
- }
- if (p2 == 0) {
- primaryDone = TRUE;
- } else {
- if (p2 <= 3) {
- log_err("Primary second byte of %08lX out of range\n", (long)ce);
- break;
- }
- }
- }
- if (secondaryDone && secondary != 0) {
- log_err("Secondary was done but continues in %08lX\n", (long)ce);
- break;
- }
- if (secondary == 0) {
- secondaryDone = TRUE;
- } else {
- if (secondary <= 2) {
- log_err("Secondary byte of %08lX out of range\n", (long)ce);
- break;
- }
- }
- if (tertiaryDone && tertiary != 0) {
- log_err("Tertiary was done but continues in %08lX\n", (long)ce);
- break;
- }
- if (tertiary == 0) {
- tertiaryDone = TRUE;
- } else if (tertiary <= 2) {
- log_err("Tertiary byte of %08lX out of range\n", (long)ce);
- break;
- }
- }
- }
- if (!result) {
- char codepointText[5*MAX_CODEPOINTS_TO_SHOW + 5];
- showCodepoints(codepoints, length, codepointText);
- log_err("Locale: %s Code point string: %s\n", collLocale, codepointText);
- }
- ucol_closeElements(iter);
- return result;
-}
-
-static void TestCEValidity()
-{
- /* testing UCA collation elements */
- UErrorCode status = U_ZERO_ERROR;
- /* en_US has no tailorings */
- UCollator *coll = ucol_open("root", &status);
- /* tailored locales */
- char locale[][11] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN", "zh__PINYIN"};
- const char *loc;
- FileStream *file = NULL;
- char line[2048];
- UChar codepoints[11];
- int count = 0;
- int maxCount = 0;
- UChar contextCPs[3];
- UChar32 c;
- UParseError parseError;
- if (U_FAILURE(status)) {
- log_err_status(status, "en_US collator creation failed -> %s\n", u_errorName(status));
- return;
- }
- log_verbose("Testing UCA elements\n");
- file = getFractionalUCA();
- if (file == NULL) {
- log_err("Fractional UCA data can not be opened\n");
- return;
- }
-
- while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) {
- if(line[0] == 0 || line[0] == '#' || line[0] == '\n' ||
- line[0] == 0x000D || line[0] == '[') {
- continue;
- }
-
- getCodePoints(line, codepoints, contextCPs);
- checkCEValidity(coll, codepoints, u_strlen(codepoints));
- }
-
- log_verbose("Testing UCA elements for the whole range of unicode characters\n");
- for (c = 0; c <= 0xffff; ++c) {
- if (u_isdefined(c)) {
- codepoints[0] = (UChar)c;
- checkCEValidity(coll, codepoints, 1);
- }
- }
- for (; c <= 0x10ffff; ++c) {
- if (u_isdefined(c)) {
- int32_t i = 0;
- U16_APPEND_UNSAFE(codepoints, i, c);
- checkCEValidity(coll, codepoints, i);
- }
- }
-
- ucol_close(coll);
-
- /* testing tailored collation elements */
- log_verbose("Testing tailored elements\n");
- if(getTestOption(QUICK_OPTION)) {
- maxCount = sizeof(locale)/sizeof(locale[0]);
- } else {
- maxCount = uloc_countAvailable();
- }
- while (count < maxCount) {
- const UChar *rules = NULL,
- *current = NULL;
- UChar *rulesCopy = NULL;
- int32_t ruleLen = 0;
-
- uint32_t chOffset = 0;
- uint32_t chLen = 0;
- uint32_t exOffset = 0;
- uint32_t exLen = 0;
- uint32_t prefixOffset = 0;
- uint32_t prefixLen = 0;
- UBool startOfRules = TRUE;
- UColOptionSet opts;
-
- UColTokenParser src;
- uint32_t strength = 0;
- uint16_t specs = 0;
- if(getTestOption(QUICK_OPTION)) {
- loc = locale[count];
- } else {
- loc = uloc_getAvailable(count);
- if(!hasCollationElements(loc)) {
- count++;
- continue;
- }
- }
-
- uprv_memset(&src, 0, sizeof(UColTokenParser));
-
- log_verbose("Testing CEs for %s\n", loc);
-
- coll = ucol_open(loc, &status);
- if (U_FAILURE(status)) {
- log_err("%s collator creation failed\n", loc);
- return;
- }
-
- src.opts = &opts;
- rules = ucol_getRules(coll, &ruleLen);
-
- if (ruleLen > 0) {
- rulesCopy = (UChar *)uprv_malloc((ruleLen +
- UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar));
- uprv_memcpy(rulesCopy, rules, ruleLen * sizeof(UChar));
- src.current = src.source = rulesCopy;
- src.end = rulesCopy + ruleLen;
- src.extraCurrent = src.end;
- src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
-
- /* Note that as a result of tickets 7015 or 6912, ucol_tok_parseNextToken can cause the pointer to
- the rules copy in src.source to get reallocated, freeing the original pointer in rulesCopy */
- while ((current = ucol_tok_parseNextToken(&src, startOfRules, &parseError,&status)) != NULL) {
- strength = src.parsedToken.strength;
- chOffset = src.parsedToken.charsOffset;
- chLen = src.parsedToken.charsLen;
- exOffset = src.parsedToken.extensionOffset;
- exLen = src.parsedToken.extensionLen;
- prefixOffset = src.parsedToken.prefixOffset;
- prefixLen = src.parsedToken.prefixLen;
- specs = src.parsedToken.flags;
-
- startOfRules = FALSE;
- uprv_memcpy(codepoints, src.source + chOffset,
- chLen * sizeof(UChar));
- codepoints[chLen] = 0;
- checkCEValidity(coll, codepoints, chLen);
- }
- uprv_free(src.source);
- }
-
- ucol_close(coll);
- count ++;
- }
- T_FileStream_close(file);
-}
-
-static void printSortKeyError(const UChar *codepoints, int length,
- uint8_t *sortkey, int sklen)
-{
- int count = 0;
- log_err("Sortkey not valid for ");
- while (length > 0) {
- log_err("0x%04x ", *codepoints);
- length --;
- codepoints ++;
- }
- log_err("\nSortkey : ");
- while (count < sklen) {
- log_err("0x%02x ", sortkey[count]);
- count ++;
- }
- log_err("\n");
-}
-
-/**
-* Checking sort key validity for all levels
-*/
-static UBool checkSortKeyValidity(UCollator *coll,
- const UChar *codepoints,
- int length)
-{
- UErrorCode status = U_ZERO_ERROR;
- UCollationStrength strength[5] = {UCOL_PRIMARY, UCOL_SECONDARY,
- UCOL_TERTIARY, UCOL_QUATERNARY,
- UCOL_IDENTICAL};
- int strengthlen = 5;
- int strengthIndex = 0;
- int caselevel = 0;
-
- while (caselevel < 1) {
- if (caselevel == 0) {
- ucol_setAttribute(coll, UCOL_CASE_LEVEL, UCOL_OFF, &status);
- }
- else {
- ucol_setAttribute(coll, UCOL_CASE_LEVEL, UCOL_ON, &status);
- }
-
- while (strengthIndex < strengthlen) {
- int count01 = 0;
- uint32_t count = 0;
- uint8_t sortkey[128];
- uint32_t sklen;
-
- ucol_setStrength(coll, strength[strengthIndex]);
- sklen = ucol_getSortKey(coll, codepoints, length, sortkey, 128);
- while (sortkey[count] != 0) {
- if (sortkey[count] == 2 || (sortkey[count] == 3 && count01 > 0 && strengthIndex != 4)) {
- printSortKeyError(codepoints, length, sortkey, sklen);
- return FALSE;
- }
- if (sortkey[count] == 1) {
- count01 ++;
- }
- count ++;
- }
-
- if (count + 1 != sklen || (count01 != strengthIndex + caselevel)) {
- printSortKeyError(codepoints, length, sortkey, sklen);
- return FALSE;
- }
- strengthIndex ++;
- }
- caselevel ++;
- }
- return TRUE;
-}
-
-static void TestSortKeyValidity(void)
-{
- /* testing UCA collation elements */
- UErrorCode status = U_ZERO_ERROR;
- /* en_US has no tailorings */
- UCollator *coll = ucol_open("en_US", &status);
- /* tailored locales */
- char locale[][6] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN"};
- FileStream *file = NULL;
- char line[2048];
- UChar codepoints[10];
- int count = 0;
- UChar contextCPs[5];
- UParseError parseError;
- if (U_FAILURE(status)) {
- log_err_status(status, "en_US collator creation failed -> %s\n", u_errorName(status));
- return;
- }
- log_verbose("Testing UCA elements\n");
- file = getFractionalUCA();
- if (file == NULL) {
- log_err("Fractional UCA data can not be opened\n");
- return;
- }
-
- while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) {
- if(line[0] == 0 || line[0] == '#' || line[0] == '\n' ||
- line[0] == 0x000D || line[0] == '[') {
- continue;
- }
-
- getCodePoints(line, codepoints, contextCPs);
- if(codepoints[0] == 0xFFFE) {
- /* Skip special merge-sort character U+FFFE which has otherwise illegal 02 weight bytes. */
- continue;
- }
- checkSortKeyValidity(coll, codepoints, u_strlen(codepoints));
- }
-
- log_verbose("Testing UCA elements for the whole range of unicode characters\n");
- codepoints[0] = 0;
-
- while (codepoints[0] < 0xFFFF) {
- if (u_isdefined((UChar32)codepoints[0])) {
- checkSortKeyValidity(coll, codepoints, 1);
- }
- codepoints[0] ++;
- }
-
- ucol_close(coll);
-
- /* testing tailored collation elements */
- log_verbose("Testing tailored elements\n");
- while (count < 5) {
- const UChar *rules = NULL,
- *current = NULL;
- UChar *rulesCopy = NULL;
- int32_t ruleLen = 0;
-
- uint32_t chOffset = 0;
- uint32_t chLen = 0;
- uint32_t exOffset = 0;
- uint32_t exLen = 0;
- uint32_t prefixOffset = 0;
- uint32_t prefixLen = 0;
- UBool startOfRules = TRUE;
- UColOptionSet opts;
-
- UColTokenParser src;
- uint32_t strength = 0;
- uint16_t specs = 0;
-
- uprv_memset(&src, 0, sizeof(UColTokenParser));
-
- coll = ucol_open(locale[count], &status);
- if (U_FAILURE(status)) {
- log_err("%s collator creation failed\n", locale[count]);
- return;
- }
-
- src.opts = &opts;
- rules = ucol_getRules(coll, &ruleLen);
-
- if (ruleLen > 0) {
- rulesCopy = (UChar *)uprv_malloc((ruleLen +
- UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar));
- uprv_memcpy(rulesCopy, rules, ruleLen * sizeof(UChar));
- src.current = src.source = rulesCopy;
- src.end = rulesCopy + ruleLen;
- src.extraCurrent = src.end;
- src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
-
- /* Note that as a result of tickets 7015 or 6912, ucol_tok_parseNextToken can cause the pointer to
- the rules copy in src.source to get reallocated, freeing the original pointer in rulesCopy */
- while ((current = ucol_tok_parseNextToken(&src, startOfRules,&parseError, &status)) != NULL) {
- strength = src.parsedToken.strength;
- chOffset = src.parsedToken.charsOffset;
- chLen = src.parsedToken.charsLen;
- exOffset = src.parsedToken.extensionOffset;
- exLen = src.parsedToken.extensionLen;
- prefixOffset = src.parsedToken.prefixOffset;
- prefixLen = src.parsedToken.prefixLen;
- specs = src.parsedToken.flags;
-
- startOfRules = FALSE;
- uprv_memcpy(codepoints, src.source + chOffset,
- chLen * sizeof(UChar));
- codepoints[chLen] = 0;
- if(codepoints[0] == 0xFFFE) {
- /* Skip special merge-sort character U+FFFE which has otherwise illegal 02 weight bytes. */
- continue;
- }
- checkSortKeyValidity(coll, codepoints, chLen);
- }
- uprv_free(src.source);
- }
-
- ucol_close(coll);
- count ++;
- }
- T_FileStream_close(file);
-}
-
/**
* TestSearchCollatorElements tests iterator behavior (forwards and backwards) with
* normalization on AND jamo tailoring, among other things.
+*
+* Note: This test is sensitive to changes of the root collator,
+* for example whether the ae-ligature maps to three CEs (as in the DUCET)
+* or to two CEs (as in the CLDR 24 FractionalUCA.txt).
+* It is also sensitive to how those CEs map to the iterator's 32-bit CE encoding.
+* For example, the DUCET's artificial secondary CE in the ae-ligature
+* may map to two 32-bit iterator CEs (as it did until ICU 52).
*/
static const UChar tsceText[] = { /* Nothing in here should be ignorable */
0x0020, 0xAC00, /* simple LV Hangul */
0x0020, 0x1E4D, /* small letter o with tilde and acute, decomposes */
0x0020
};
-enum { kLen_tsceText = sizeof(tsceText)/sizeof(tsceText[0]) };
+enum { kLen_tsceText = UPRV_LENGTHOF(tsceText) };
static const int32_t rootStandardOffsets[] = {
0, 1,2,
12, 13,14,15,
16, 17,18,19,
20, 21,22,23,
- 24, 25,26,26,26,
+ 24, 25,26, /* plus one or two more entries with offset=26 if the ae-ligature maps to three CEs */
26, 27,28,28,
28,
29
};
-enum { kLen_rootStandardOffsets = sizeof(rootStandardOffsets)/sizeof(rootStandardOffsets[0]) };
+enum { kLen_rootStandardOffsets = UPRV_LENGTHOF(rootStandardOffsets) };
static const int32_t rootSearchOffsets[] = {
0, 1,2,
12, 13,14,15,
16, 17,18,19,20,
20, 21,22,22,23,23,23,24,
- 24, 25,26,26,26,
+ 24, 25,26, /* plus one or two more entries with offset=26 if the ae-ligature maps to three CEs */
26, 27,28,28,
28,
29
};
-enum { kLen_rootSearchOffsets = sizeof(rootSearchOffsets)/sizeof(rootSearchOffsets[0]) };
+enum { kLen_rootSearchOffsets = UPRV_LENGTHOF(rootSearchOffsets) };
typedef struct {
const char * locale;
do {
offset = ucol_getOffset(uce);
element = ucol_next(uce, &status);
+ log_verbose("(%s) offset=%2d ce=%08x\n", tsceItemPtr->locale, offset, element);
if ( element == 0 ) {
log_err("error, locale %s, ucol_next returned element 0\n", tsceItemPtr->locale );
}