+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
/********************************************************************
* COPYRIGHT:
- * Copyright (c) 1997-2008, International Business Machines Corporation and
+ * Copyright (c) 1997-2016, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/********************************************************************************
#if !UCONFIG_NO_COLLATION
#include "unicode/ucol.h"
+#include "unicode/ucoleitr.h"
#include "unicode/uloc.h"
#include "unicode/uchar.h"
#include "unicode/ustring.h"
+#include "unicode/utf16.h"
#include "unicode/putil.h"
#include "callcoll.h"
#include "cmemory.h"
#include "filestrm.h"
#include "cstring.h"
#include "ucol_imp.h"
-#include "ucol_tok.h"
+#include "uparse.h"
#include <stdio.h>
extern uint8_t ucol_uprv_getCaseBits(const UChar *, uint32_t, UErrorCode *);
addTest(root, &TestBug672, "tscoll/citertst/TestBug672");
addTest(root, &TestBug672Normalize, "tscoll/citertst/TestBug672Normalize");
addTest(root, &TestSmallBuffer, "tscoll/citertst/TestSmallBuffer");
- addTest(root, &TestCEs, "tscoll/citertst/TestCEs");
addTest(root, &TestDiscontiguos, "tscoll/citertst/TestDiscontiguos");
- addTest(root, &TestCEBufferOverflow, "tscoll/citertst/TestCEBufferOverflow");
- addTest(root, &TestCEValidity, "tscoll/citertst/TestCEValidity");
- addTest(root, &TestSortKeyValidity, "tscoll/citertst/TestSortKeyValidity");
+ addTest(root, &TestSearchCollatorElements, "tscoll/citertst/TestSearchCollatorElements");
}
/* The locales we support */
UCollationElements *titer = ucol_openElements(coll, text, -1,
&status);
if (U_FAILURE(status)) {
- log_err("ERROR: in creation of either the collator or the collation iterator :%s\n",
+ log_err_status(status, "ERROR: in creation of either the collator or the collation iterator :%s\n",
myErrorName(status));
return;
}
pitr = ucol_openElements(coll, pattern, -1, &status);
titer = ucol_openElements(coll, text, -1, &status);
if (U_FAILURE(status)) {
- log_err("ERROR: in creation of either the collator or the collation iterator :%s\n",
+ log_err_status(status, "ERROR: in creation of either the collator or the collation iterator :%s\n",
myErrorName(status));
return;
}
UChar *test;
en_us = ucol_open("en_US", &status);
if (U_FAILURE(status)){
- log_err("ERROR: in creation of collation data using ucol_open()\n %s\n",
+ log_err_status(status, "ERROR: in creation of collation data using ucol_open()\n %s\n",
myErrorName(status));
return;
}
/* thai should have normalization on */
th_th = ucol_open("th_TH", &status);
if (U_FAILURE(status)){
- log_err("ERROR: in creation of thai collation using ucol_open()\n %s\n",
+ log_err_status(status, "ERROR: in creation of thai collation using ucol_open()\n %s\n",
myErrorName(status));
return;
}
coll = ucol_openRules(rule, rulelen, UCOL_ON, UCOL_TERTIARY, NULL, &status);
ucol_setAttribute(coll, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
if (U_FAILURE(status)){
- log_err("ERROR: in creation of collator using ucol_openRules()\n %s\n",
+ log_err_status(status, "ERROR: in creation of collator using ucol_openRules()\n %s\n",
myErrorName(status));
return;
}
iter=ucol_openElements(coll, test1, u_strlen(test1), &status);
log_verbose("English locale testing back and forth\n");
if(U_FAILURE(status)){
- log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
+ log_err_status(status, "ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
myErrorName(status));
ucol_close(coll);
return;
log_verbose("Testing getOffset and setOffset for collations\n");
iter = ucol_openElements(en_us, test1, u_strlen(test1), &status);
if(U_FAILURE(status)){
- log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
+ log_err_status(status, "ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
myErrorName(status));
ucol_close(en_us);
return;
log_verbose("testing setText for Collation elements\n");
iter1=ucol_openElements(en_us, test1, u_strlen(test1), &status);
if(U_FAILURE(status)){
- log_err("ERROR: in creation of collation element iterator1 using ucol_openElements()\n %s\n",
+ log_err_status(status, "ERROR: in creation of collation element iterator1 using ucol_openElements()\n %s\n",
myErrorName(status));
ucol_close(en_us);
return;
/* Now set it to point to a null string with fake length*/
ucol_setText(iter2, NULL, 2, &status);
- if (U_FAILURE(status))
- {
- log_err("call to iter2->setText(null) failed. %s\n", myErrorName(status));
- }
- else
+ if (status != U_ILLEGAL_ARGUMENT_ERROR)
{
- if (ucol_next(iter2, &status) != UCOL_NULLORDER) {
- log_err("iter2 with null text expected to return UCOL_NULLORDER\n");
- }
+ log_err("call to iter2->setText(null, 2) should yield an illegal-argument-error - %s\n",
+ myErrorName(status));
}
ucol_closeElements(iter2);
UChar ch = 0;
UChar32 unassigned = 0xEFFFD;
UChar supplementary[2];
- uint32_t index = 0;
+ uint32_t stringOffset = 0;
UBool isError = FALSE;
uint32_t sorder = 0;
UCollationElements *iter ;/*= ucol_openElements(coll, &ch, 1, &status);*/
ch, 3);
}
- U16_APPEND(supplementary, index, 2, unassigned, isError);
+ U16_APPEND(supplementary, stringOffset, 2, unassigned, isError);
+ (void)isError; /* Suppress set but not used warning. */
ucol_setText(iter, supplementary, 2, &status);
sorder = ucol_previous(iter, &status);
ucol_closeElements(iter);
ucol_close(coll);
} else {
- log_data_err("Couldn't open collator\n");
+ log_err_status(status, "Couldn't open collator -> %s\n", u_errorName(status));
}
}
free(orders);
ucol_reset(testiter);
- /* ensures that the writable buffer was cleared */
- if (testiter->iteratordata_.writableBuffer !=
- testiter->iteratordata_.stackWritableBuffer) {
- log_err("Error Writable buffer in collation element iterator not reset\n");
- }
/* ensures closing of elements done properly to clear writable buffer */
ucol_next(testiter, &status);
ucol_closeElements(iter);
ucol_close(coll);
} else {
- log_data_err("Couldn't open collator\n");
- }
-}
-
-/**
-* Sniplets of code from genuca
-*/
-static int32_t hex2num(char hex) {
- if(hex>='0' && hex <='9') {
- return hex-'0';
- } else if(hex>='a' && hex<='f') {
- return hex-'a'+10;
- } else if(hex>='A' && hex<='F') {
- return hex-'A'+10;
- } else {
- return 0;
- }
-}
-
-/**
-* Getting codepoints from a string
-* @param str character string contain codepoints seperated by space and ended
-* by a semicolon
-* @param codepoints array for storage, assuming size > 5
-* @return position at the end of the codepoint section
-*/
-static char * getCodePoints(char *str, UChar *codepoints, UChar *contextCPs) {
- char *pStartCP = str;
- char *pEndCP = str + 4;
-
- *codepoints = (UChar)((hex2num(*pStartCP) << 12) |
- (hex2num(*(pStartCP + 1)) << 8) |
- (hex2num(*(pStartCP + 2)) << 4) |
- (hex2num(*(pStartCP + 3))));
- if (*pEndCP == '|' || *(pEndCP+1) == '|') {
- /* pre-context rule */
- pStartCP = pEndCP;
- while (*pStartCP==' ' || *pStartCP== '|' ) {
- pStartCP++;
- }
- pEndCP = pStartCP+4;
- *contextCPs = *codepoints;
- *(++codepoints) = (UChar)((hex2num(*pStartCP) << 12) |
- (hex2num(*(pStartCP + 1)) << 8) |
- (hex2num(*(pStartCP + 2)) << 4) |
- (hex2num(*(pStartCP + 3))));
- contextCPs++;
- }
- *contextCPs = 0;
- codepoints ++;
- while (*pEndCP != ';') {
- pStartCP = pEndCP + 1;
- *codepoints = (UChar)((hex2num(*pStartCP) << 12) |
- (hex2num(*(pStartCP + 1)) << 8) |
- (hex2num(*(pStartCP + 2)) << 4) |
- (hex2num(*(pStartCP + 3))));
- codepoints ++;
- pEndCP = pStartCP + 4;
- }
- *codepoints = 0;
- return pEndCP + 1;
-}
-
-/**
-* Sniplets of code from genuca
-*/
-static int32_t
-readElement(char **from, char *to, char separator, UErrorCode *status)
-{
- if (U_SUCCESS(*status)) {
- char buffer[1024];
- int32_t i = 0;
- while (**from != separator) {
- if (**from != ' ') {
- *(buffer+i++) = **from;
- }
- (*from)++;
- }
- (*from)++;
- *(buffer + i) = 0;
- strcpy(to, buffer);
- return i/2;
- }
-
- return 0;
-}
-
-/**
-* Sniplets of code from genuca
-*/
-static uint32_t
-getSingleCEValue(char *primary, char *secondary, char *tertiary,
- UErrorCode *status)
-{
- if (U_SUCCESS(*status)) {
- uint32_t value = 0;
- char primsave = '\0';
- char secsave = '\0';
- char tersave = '\0';
- char *primend = primary+4;
- char *secend = secondary+2;
- char *terend = tertiary+2;
- uint32_t primvalue;
- uint32_t secvalue;
- uint32_t tervalue;
-
- if (uprv_strlen(primary) > 4) {
- primsave = *primend;
- *primend = '\0';
- }
-
- if (uprv_strlen(secondary) > 2) {
- secsave = *secend;
- *secend = '\0';
- }
-
- if (uprv_strlen(tertiary) > 2) {
- tersave = *terend;
- *terend = '\0';
- }
-
- primvalue = (*primary!='\0')?uprv_strtoul(primary, &primend, 16):0;
- secvalue = (*secondary!='\0')?uprv_strtoul(secondary, &secend, 16):0;
- tervalue = (*tertiary!='\0')?uprv_strtoul(tertiary, &terend, 16):0;
- if(primvalue <= 0xFF) {
- primvalue <<= 8;
- }
-
- value = ((primvalue << UCOL_PRIMARYORDERSHIFT) & UCOL_PRIMARYORDERMASK)
- | ((secvalue << UCOL_SECONDARYORDERSHIFT) & UCOL_SECONDARYORDERMASK)
- | (tervalue & UCOL_TERTIARYORDERMASK);
-
- if(primsave!='\0') {
- *primend = primsave;
- }
- if(secsave!='\0') {
- *secend = secsave;
- }
- if(tersave!='\0') {
- *terend = tersave;
- }
- return value;
- }
- return 0;
-}
-
-/**
-* Getting collation elements generated from a string
-* @param str character string contain collation elements contained in [] and
-* seperated by space
-* @param ce array for storage, assuming size > 20
-* @param status error status
-* @return position at the end of the codepoint section
-*/
-static char * getCEs(char *str, uint32_t *ces, UErrorCode *status) {
- char *pStartCP = uprv_strchr(str, '[');
- int count = 0;
- char *pEndCP;
- char primary[100];
- char secondary[100];
- char tertiary[100];
-
- while (*pStartCP == '[') {
- uint32_t primarycount = 0;
- uint32_t secondarycount = 0;
- uint32_t tertiarycount = 0;
- uint32_t CEi = 1;
- pEndCP = strchr(pStartCP, ']');
- if(pEndCP == NULL) {
- break;
- }
- pStartCP ++;
-
- primarycount = readElement(&pStartCP, primary, ',', status);
- secondarycount = readElement(&pStartCP, secondary, ',', status);
- tertiarycount = readElement(&pStartCP, tertiary, ']', status);
-
- /* I want to get the CEs entered right here, including continuation */
- ces[count ++] = getSingleCEValue(primary, secondary, tertiary, status);
- if (U_FAILURE(*status)) {
- break;
- }
-
- while (2 * CEi < primarycount || CEi < secondarycount ||
- CEi < tertiarycount) {
- uint32_t value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
- if (2 * CEi < primarycount) {
- value |= ((hex2num(*(primary + 4 * CEi)) & 0xF) << 28);
- value |= ((hex2num(*(primary + 4 * CEi + 1)) & 0xF) << 24);
- }
-
- if (2 * CEi + 1 < primarycount) {
- value |= ((hex2num(*(primary + 4 * CEi + 2)) & 0xF) << 20);
- value |= ((hex2num(*(primary + 4 * CEi + 3)) &0xF) << 16);
- }
-
- if (CEi < secondarycount) {
- value |= ((hex2num(*(secondary + 2 * CEi)) & 0xF) << 12);
- value |= ((hex2num(*(secondary + 2 * CEi + 1)) & 0xF) << 8);
- }
-
- if (CEi < tertiarycount) {
- value |= ((hex2num(*(tertiary + 2 * CEi)) & 0x3) << 4);
- value |= (hex2num(*(tertiary + 2 * CEi + 1)) & 0xF);
- }
-
- CEi ++;
- ces[count ++] = value;
- }
-
- pStartCP = pEndCP + 1;
- }
- ces[count] = 0;
- return pStartCP;
-}
-
-/**
-* Getting the FractionalUCA.txt file stream
-*/
-static FileStream * getFractionalUCA(void)
-{
- char newPath[256];
- char backupPath[256];
- FileStream *result = NULL;
-
- /* Look inside ICU_DATA first */
- uprv_strcpy(newPath, ctest_dataSrcDir());
- uprv_strcat(newPath, "unidata" U_FILE_SEP_STRING );
- uprv_strcat(newPath, "FractionalUCA.txt");
-
- /* As a fallback, try to guess where the source data was located
- * at the time ICU was built, and look there.
- */
-#if defined (U_TOPSRCDIR)
- strcpy(backupPath, U_TOPSRCDIR U_FILE_SEP_STRING "data");
-#else
- {
- UErrorCode errorCode = U_ZERO_ERROR;
- strcpy(backupPath, loadTestData(&errorCode));
- strcat(backupPath, U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING "data");
- }
-#endif
- strcat(backupPath, U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING "FractionalUCA.txt");
-
- result = T_FileStream_open(newPath, "rb");
-
- if (result == NULL) {
- result = T_FileStream_open(backupPath, "rb");
- if (result == NULL) {
- log_err("Failed to open either %s or %s\n", newPath, backupPath);
- }
- }
- return result;
-}
-
-/**
-* Testing the CEs returned by the iterator
-*/
-static void TestCEs() {
- FileStream *file = NULL;
- char line[1024];
- char *str;
- UChar codepoints[10];
- uint32_t ces[20];
- UErrorCode status = U_ZERO_ERROR;
- UCollator *coll = ucol_open("", &status);
- uint32_t lineNo = 0;
- UChar contextCPs[5];
-
- if (U_FAILURE(status)) {
- log_err("Error in opening root collator\n");
- return;
- }
-
- file = getFractionalUCA();
-
- if (file == NULL) {
- log_err("*** unable to open input FractionalUCA.txt file ***\n");
- return;
- }
-
-
- while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) {
- int count = 0;
- UCollationElements *iter;
- int32_t preContextCeLen=0;
- lineNo++;
- /* skip this line if it is empty or a comment or is a return value
- or start of some variable section */
- if(line[0] == 0 || line[0] == '#' || line[0] == '\n' ||
- line[0] == 0x000D || line[0] == '[') {
- continue;
- }
-
- str = getCodePoints(line, codepoints, contextCPs);
-
- /* these are 'fake' codepoints in the fractional UCA, and are used just
- * for positioning of indirect values. They should not go through this
- * test.
- */
- if(*codepoints == 0xFDD0) {
- continue;
- }
- if (*contextCPs != 0) {
- iter = ucol_openElements(coll, contextCPs, -1, &status);
- if (U_FAILURE(status)) {
- log_err("Error in opening collation elements\n");
- break;
- }
- while((ces[preContextCeLen] = ucol_next(iter, &status)) != (uint32_t)UCOL_NULLORDER) {
- preContextCeLen++;
- }
- ucol_closeElements(iter);
- }
-
- getCEs(str, ces+preContextCeLen, &status);
- if (U_FAILURE(status)) {
- log_err("Error in parsing collation elements in FractionalUCA.txt\n");
- break;
- }
- iter = ucol_openElements(coll, codepoints, -1, &status);
- if (U_FAILURE(status)) {
- log_err("Error in opening collation elements\n");
- break;
- }
- for (;;) {
- uint32_t ce = (uint32_t)ucol_next(iter, &status);
- if (ce == 0xFFFFFFFF) {
- ce = 0;
- }
- /* we now unconditionally reorder Thai/Lao prevowels, so this
- * test would fail if we don't skip here.
- */
- if(UCOL_ISTHAIPREVOWEL(*codepoints) && ce == 0 && count == 0) {
- continue;
- }
- if (ce != ces[count] || U_FAILURE(status)) {
- log_err("Collation elements in FractionalUCA.txt and iterators do not match!\n");
- break;
- }
- if (ces[count] == 0) {
- break;
- }
- count ++;
- }
- ucol_closeElements(iter);
+ log_err_status(status, "Couldn't open collator -> %s\n", u_errorName(status));
}
-
- T_FileStream_close(file);
- ucol_close(coll);
}
/**
resultiter = ucol_openElements(coll, rule, 1, &status);
if (U_FAILURE(status)) {
- log_err("Error opening collation rules\n");
+ log_err_status(status, "Error opening collation rules -> %s\n", u_errorName(status));
return;
}
ucol_close(coll);
}
-static void TestCEBufferOverflow()
-{
- UChar str[UCOL_EXPAND_CE_BUFFER_SIZE + 1];
- UErrorCode status = U_ZERO_ERROR;
- UChar rule[10];
- UCollator *coll;
- UCollationElements *iter;
-
- u_uastrcpy(rule, "&z < AB");
- coll = ucol_openRules(rule, u_strlen(rule), UCOL_OFF, UCOL_DEFAULT_STRENGTH, NULL,&status);
- if (U_FAILURE(status)) {
- log_err("Rule based collator not created for testing ce buffer overflow\n");
- return;
- }
-
- /* 0xDCDC is a trail surrogate hence deemed unsafe by the heuristic
- test. this will cause an overflow in getPrev */
- str[0] = 0x0041; /* 'A' */
- /*uprv_memset(str + 1, 0xE0, sizeof(UChar) * UCOL_EXPAND_CE_BUFFER_SIZE);*/
- uprv_memset(str + 1, 0xDC, sizeof(UChar) * UCOL_EXPAND_CE_BUFFER_SIZE);
- str[UCOL_EXPAND_CE_BUFFER_SIZE] = 0x0042; /* 'B' */
- iter = ucol_openElements(coll, str, UCOL_EXPAND_CE_BUFFER_SIZE + 1,
- &status);
- if (ucol_previous(iter, &status) == UCOL_NULLORDER ||
- status == U_BUFFER_OVERFLOW_ERROR) {
- log_err("CE buffer should not overflow with long string of trail surrogates\n");
- }
- ucol_closeElements(iter);
- ucol_close(coll);
-}
-
-/**
-* Byte bounds checks. Checks if each byte in data is between upper and lower
-* inclusive.
-*/
-static UBool checkByteBounds(uint32_t data, char upper, char lower)
-{
- int count = 4;
- while (count > 0) {
- char b = (char)(data & 0xFF);
- if (b > upper || b < lower) {
- return FALSE;
- }
- data = data >> 8;
- count --;
- }
- return TRUE;
-}
-
-/**
-* Determines case of the string of codepoints.
-* If it is a multiple codepoints it has to treated as a contraction.
-*/
-#if 0
-static uint8_t getCase(const UChar *s, uint32_t len) {
- UBool lower = FALSE;
- UBool upper = FALSE;
- UBool title = FALSE;
- UErrorCode status = U_ZERO_ERROR;
- UChar str[256];
- const UChar *ps = s;
-
- if (len == 0) {
- return UCOL_LOWER_CASE;
- }
-
- while (len > 0) {
- UChar c = *ps ++;
-
- if (u_islower(c)) {
- lower = TRUE;
- }
- if (u_isupper(c)) {
- upper = TRUE;
- }
- if (u_istitle(c)) {
- title = TRUE;
- }
-
- len --;
- }
- if ((lower && !upper && !title) || (!lower && !upper && !title)){
- return UCOL_LOWER_CASE;
- }
- if (upper && !lower && !title) {
- return UCOL_UPPER_CASE;
- }
- /* mix of cases here */
- /* len = unorm_normalize(s, len, UNORM_NFKD, 0, str, 256, &status);
- if (U_FAILURE(status)) {
- log_err("Error normalizing data string\n");
- return UCOL_LOWER_CASE;
- }*/
-
- if ((title && len >= 2) || (lower && upper)) {
- return UCOL_MIXED_CASE;
- }
- if (u_isupper(s[0])) {
- return UCOL_UPPER_CASE;
- }
- return UCOL_LOWER_CASE;
-}
-#endif
-
/**
-* Checking collation element validity given the boundary arguments.
-*/
-static UBool checkCEValidity(const UCollator *coll, const UChar *codepoints,
- int length, uint32_t primarymax,
- uint32_t secondarymax)
-{
- UErrorCode status = U_ZERO_ERROR;
- UCollationElements *iter = ucol_openElements(coll, codepoints, length,
- &status);
- uint32_t ce;
- UBool first = TRUE;
-/*
- UBool upper = FALSE;
- UBool lower = FALSE;
-*/
-
- if (U_FAILURE(status)) {
- log_err("Error creating iterator for testing validity\n");
- }
-
- ce = ucol_next(iter, &status);
-
- while (ce != UCOL_NULLORDER) {
- if (ce != 0) {
- uint32_t primary = UCOL_PRIMARYORDER(ce);
- uint32_t secondary = UCOL_SECONDARYORDER(ce);
- uint32_t tertiary = UCOL_TERTIARYORDER(ce);
-/* uint32_t scasebits = tertiary & 0xC0;*/
-
- if ((tertiary == 0 && secondary != 0) ||
- (tertiary < 0xC0 && secondary == 0 && primary != 0)) {
- /* n-1th level is not zero when the nth level is
- except for continuations, this is wrong */
- log_err("Lower level weight not 0 when high level weight is 0\n");
- goto fail;
- }
- else {
- /* checks if any byte is illegal ie = 01 02 03. */
- if (checkByteBounds(ce, 0x3, 0x1)) {
- log_err("Byte range in CE lies in illegal bounds 0x1 - 0x3\n");
- goto fail;
- }
- }
- if ((primary != 0 && primary < primarymax)
- || ((primary & 0xFF) == 0xFF) || (((primary>>8) & 0xFF) == 0xFF)
- || ((primary & 0xFF) && ((primary & 0xFF) <= 0x03))
- || (((primary>>8) & 0xFF) && ((primary>>8) & 0xFF) <= 0x03)
- || (primary >= 0xFE00 && !isContinuation(ce))) {
- log_err("UCA primary weight out of bounds: %04X for string starting with %04X\n",
- primary, codepoints[0]);
- goto fail;
- }
- /* case matching not done since data generated by ken */
- if (first) {
- if (secondary >= 6 && secondary <= secondarymax) {
- log_err("Secondary weight out of range\n");
- goto fail;
- }
- first = FALSE;
- }
- }
- ce = ucol_next(iter, &status);
- }
- ucol_closeElements(iter);
- return TRUE;
-fail :
- ucol_closeElements(iter);
- return FALSE;
-}
-
-static void TestCEValidity()
-{
- /* testing UCA collation elements */
- UErrorCode status = U_ZERO_ERROR;
- /* en_US has no tailorings */
- UCollator *coll = ucol_open("root", &status);
- /* tailored locales */
- char locale[][11] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN", "zh__PINYIN"};
- const char *loc;
- FileStream *file = NULL;
- char line[1024];
- UChar codepoints[10];
- int count = 0;
- int maxCount = 0;
- UChar contextCPs[3];
- UParseError parseError;
- if (U_FAILURE(status)) {
- log_err("en_US collator creation failed\n");
- return;
- }
- log_verbose("Testing UCA elements\n");
- file = getFractionalUCA();
- if (file == NULL) {
- log_err("Fractional UCA data can not be opened\n");
- return;
- }
-
- while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) {
- if(line[0] == 0 || line[0] == '#' || line[0] == '\n' ||
- line[0] == 0x000D || line[0] == '[') {
- continue;
- }
-
- getCodePoints(line, codepoints, contextCPs);
- checkCEValidity(coll, codepoints, u_strlen(codepoints), 5, 86);
- }
-
- log_verbose("Testing UCA elements for the whole range of unicode characters\n");
- codepoints[0] = 0;
- while (codepoints[0] < 0xFFFF) {
- if (u_isdefined((UChar32)codepoints[0])) {
- checkCEValidity(coll, codepoints, 1, 5, 86);
- }
- codepoints[0] ++;
- }
-
- ucol_close(coll);
-
- /* testing tailored collation elements */
- log_verbose("Testing tailored elements\n");
- if(QUICK) {
- maxCount = sizeof(locale)/sizeof(locale[0]);
- } else {
- maxCount = uloc_countAvailable();
- }
- while (count < maxCount) {
- const UChar *rules = NULL,
- *current = NULL;
- UChar *rulesCopy = NULL;
- int32_t ruleLen = 0;
-
- uint32_t chOffset = 0;
- uint32_t chLen = 0;
- uint32_t exOffset = 0;
- uint32_t exLen = 0;
- uint32_t prefixOffset = 0;
- uint32_t prefixLen = 0;
- UBool startOfRules = TRUE;
- UColOptionSet opts;
-
- UColTokenParser src;
- uint32_t strength = 0;
- uint16_t specs = 0;
- if(QUICK) {
- loc = locale[count];
- } else {
- loc = uloc_getAvailable(count);
- if(!hasCollationElements(loc)) {
- count++;
- continue;
- }
- }
-
- log_verbose("Testing CEs for %s\n", loc);
-
- coll = ucol_open(loc, &status);
- if (U_FAILURE(status)) {
- log_err("%s collator creation failed\n", loc);
- return;
- }
-
- src.opts = &opts;
- rules = ucol_getRules(coll, &ruleLen);
-
- if (ruleLen > 0) {
- rulesCopy = (UChar *)malloc((ruleLen +
- UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar));
- uprv_memcpy(rulesCopy, rules, ruleLen * sizeof(UChar));
- src.current = src.source = rulesCopy;
- src.end = rulesCopy + ruleLen;
- src.extraCurrent = src.end;
- src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
-
- while ((current = ucol_tok_parseNextToken(&src, startOfRules, &parseError,&status)) != NULL) {
- strength = src.parsedToken.strength;
- chOffset = src.parsedToken.charsOffset;
- chLen = src.parsedToken.charsLen;
- exOffset = src.parsedToken.extensionOffset;
- exLen = src.parsedToken.extensionLen;
- prefixOffset = src.parsedToken.prefixOffset;
- prefixLen = src.parsedToken.prefixLen;
- specs = src.parsedToken.flags;
-
- startOfRules = FALSE;
- uprv_memcpy(codepoints, src.source + chOffset,
- chLen * sizeof(UChar));
- codepoints[chLen] = 0;
- checkCEValidity(coll, codepoints, chLen, 4, 85);
- }
- free(rulesCopy);
- }
-
- ucol_close(coll);
- count ++;
- }
- T_FileStream_close(file);
-}
-
-static void printSortKeyError(const UChar *codepoints, int length,
- uint8_t *sortkey, int sklen)
-{
- int count = 0;
- log_err("Sortkey not valid for ");
- while (length > 0) {
- log_err("0x%04x ", *codepoints);
- length --;
- codepoints ++;
- }
- log_err("\nSortkey : ");
- while (count < sklen) {
- log_err("0x%02x ", sortkey[count]);
- count ++;
- }
- log_err("\n");
-}
-
-/**
-* Checking sort key validity for all levels
+* TestSearchCollatorElements tests iterator behavior (forwards and backwards) with
+* normalization on AND jamo tailoring, among other things.
+*
+* Note: This test is sensitive to changes of the root collator,
+* for example whether the ae-ligature maps to three CEs (as in the DUCET)
+* or to two CEs (as in the CLDR 24 FractionalUCA.txt).
+* It is also sensitive to how those CEs map to the iterator's 32-bit CE encoding.
+* For example, the DUCET's artificial secondary CE in the ae-ligature
+* may map to two 32-bit iterator CEs (as it did until ICU 52).
*/
-static UBool checkSortKeyValidity(UCollator *coll,
- const UChar *codepoints,
- int length)
+static const UChar tsceText[] = { /* Text iterated by TestSearchCollatorElements. Nothing in here should be ignorable */
+ 0x0020, 0xAC00, /* simple LV Hangul */
+ 0x0020, 0xAC01, /* simple LVT Hangul */
+ 0x0020, 0xAC0F, /* LVTT, last jamo expands for search */
+ 0x0020, 0xAFFF, /* LLVVVTT, every jamo expands for search */
+ 0x0020, 0x1100, 0x1161, 0x11A8, /* 0xAC01 as conjoining jamo */
+ 0x0020, 0x3131, 0x314F, 0x3131, /* 0xAC01 as compatibility jamo */
+ 0x0020, 0x1100, 0x1161, 0x11B6, /* 0xAC0F as conjoining jamo; last expands for search */
+ 0x0020, 0x1101, 0x1170, 0x11B6, /* 0xAFFF as conjoining jamo; all expand for search */
+ 0x0020, 0x00E6, /* small letter ae, expands */
+ 0x0020, 0x1E4D, /* small letter o with tilde and acute, decomposes */
+ 0x0020 /* final U+0020; the initializer lists 29 UChars in total */
+};
+enum { kLen_tsceText = UPRV_LENGTHOF(tsceText) }; /* passed as the text length to ucol_openElements() and as the start offset of the backward pass */
+
+static const int32_t rootStandardOffsets[] = { /* Expected ucol_getOffset() results for locale "root" (see tsceItems), compared in order during the forward pass and in reverse during the backward pass */
+ 0, 1,2,
+ 2, 3,4,4,
+ 4, 5,6,6,
+ 6, 7,8,8,
+ 8, 9,10,11,
+ 12, 13,14,15,
+ 16, 17,18,19,
+ 20, 21,22,23,
+ 24, 25,26, /* plus another 1-2 offset=26 if ae-ligature maps to three CEs */
+ 26, 27,28,28,
+ 28,
+ 29 /* equals the 29-UChar length of tsceText; presumably the offset seen on the call that returns UCOL_NULLORDER - confirm */
+};
+enum { kLen_rootStandardOffsets = UPRV_LENGTHOF(rootStandardOffsets) }; /* stored as offsetsLen in the "root" entry of tsceItems */
+
+static const int32_t rootSearchOffsets[] = { /* Expected ucol_getOffset() results for locale "root@collation=search" (see tsceItems); has more entries than rootStandardOffsets on the rows whose tsceText comments say "expands for search" */
+ 0, 1,2,
+ 2, 3,4,4,
+ 4, 5,6,6,6,
+ 6, 7,8,8,8,8,8,8,
+ 8, 9,10,11,
+ 12, 13,14,15,
+ 16, 17,18,19,20,
+ 20, 21,22,22,23,23,23,24,
+ 24, 25,26, /* plus another 1-2 offset=26 if ae-ligature maps to three CEs */
+ 26, 27,28,28,
+ 28,
+ 29 /* equals the 29-UChar length of tsceText; presumably the offset seen on the call that returns UCOL_NULLORDER - confirm */
+};
+enum { kLen_rootSearchOffsets = UPRV_LENGTHOF(rootSearchOffsets) }; /* stored as offsetsLen in the "root@collation=search" entry of tsceItems */
+
+typedef struct { /* One TestSearchCollatorElements case: a collator locale plus the offsets expected for tsceText */
+ const char * locale; /* locale ID handed to ucol_open(); NULL marks the end of the tsceItems table */
+ const int32_t * offsets; /* expected ucol_getOffset() values, one per iterator call */
+ int32_t offsetsLen; /* number of entries in offsets */
+} TSCEItem;
+
+static const TSCEItem tsceItems[] = { /* Cases run by TestSearchCollatorElements, in order */
+ { "root", rootStandardOffsets, kLen_rootStandardOffsets },
+ { "root@collation=search", rootSearchOffsets, kLen_rootSearchOffsets },
+ { NULL, NULL, 0 } /* sentinel: the test loop stops at the entry whose locale is NULL */
+};
+
+static void TestSearchCollatorElements(void) /* For each tsceItems entry: open the collator, iterate tsceText forwards then backwards, and compare ucol_getOffset() with the expected offsets table */
{
- UErrorCode status = U_ZERO_ERROR;
- UCollationStrength strength[5] = {UCOL_PRIMARY, UCOL_SECONDARY,
- UCOL_TERTIARY, UCOL_QUATERNARY,
- UCOL_IDENTICAL};
- int strengthlen = 5;
- int index = 0;
- int caselevel = 0;
-
- while (caselevel < 1) {
- if (caselevel == 0) {
- ucol_setAttribute(coll, UCOL_CASE_LEVEL, UCOL_OFF, &status);
- }
- else {
- ucol_setAttribute(coll, UCOL_CASE_LEVEL, UCOL_ON, &status);
- }
-
- while (index < strengthlen) {
- int count01 = 0;
- uint32_t count = 0;
- uint8_t sortkey[128];
- uint32_t sklen;
-
- ucol_setStrength(coll, strength[index]);
- sklen = ucol_getSortKey(coll, codepoints, length, sortkey, 128);
- while (sortkey[count] != 0) {
- if (sortkey[count] == 2 || (sortkey[count] == 3 && count01 > 0 && index != 4)) {
- printSortKeyError(codepoints, length, sortkey, sklen);
- return FALSE;
- }
- if (sortkey[count] == 1) {
- count01 ++;
+ const TSCEItem * tsceItemPtr;
+ for (tsceItemPtr = tsceItems; tsceItemPtr->locale != NULL; tsceItemPtr++) { /* tsceItems ends with a { NULL, NULL, 0 } sentinel */
+ UErrorCode status = U_ZERO_ERROR; /* fresh status for every locale */
+ UCollator* ucol = ucol_open(tsceItemPtr->locale, &status);
+ if ( U_SUCCESS(status) ) {
+ UCollationElements * uce = ucol_openElements(ucol, tsceText, kLen_tsceText, &status);
+ if ( U_SUCCESS(status) ) {
+ int32_t offset, element;
+ const int32_t * nextOffsetPtr;
+ const int32_t * limitOffsetPtr;
+
+ nextOffsetPtr = tsceItemPtr->offsets; /* forward pass: walk the expected offsets first to last */
+ limitOffsetPtr = tsceItemPtr->offsets + tsceItemPtr->offsetsLen;
+ do {
+ offset = ucol_getOffset(uce); /* sampled before the iterator advances */
+ element = ucol_next(uce, &status);
+ log_verbose("(%s) offset=%2d ce=%08x\n", tsceItemPtr->locale, offset, element);
+ if ( element == 0 ) { /* a zero element is reported: tsceText is meant to contain nothing ignorable */
+ log_err("error, locale %s, ucol_next returned element 0\n", tsceItemPtr->locale );
+ }
+ if ( nextOffsetPtr < limitOffsetPtr ) {
+ if (offset != *nextOffsetPtr) {
+ log_err("error, locale %s, expected ucol_next -> ucol_getOffset %d, got %d\n",
+ tsceItemPtr->locale, *nextOffsetPtr, offset );
+ nextOffsetPtr = limitOffsetPtr; /* mark the table as consumed so the "fewer elements" check after the loop stays quiet */
+ break;
+ }
+ nextOffsetPtr++;
+ } else { /* expected table exhausted but the iterator is still being called */
+ log_err("error, locale %s, ucol_next returned more elements than expected\n", tsceItemPtr->locale );
+ }
+ } while ( U_SUCCESS(status) && element != UCOL_NULLORDER ); /* do-while: the call that yields UCOL_NULLORDER is checked against the table too */
+ if ( nextOffsetPtr < limitOffsetPtr ) {
+ log_err("error, locale %s, ucol_next returned fewer elements than expected\n", tsceItemPtr->locale );
 }
- count ++;
- }
-
- if (count + 1 != sklen || (count01 != index + caselevel)) {
- printSortKeyError(codepoints, length, sortkey, sklen);
- return FALSE;
- }
- index ++;
- }
- caselevel ++;
- }
- return TRUE;
-}
-
-static void TestSortKeyValidity(void)
-{
- /* testing UCA collation elements */
- UErrorCode status = U_ZERO_ERROR;
- /* en_US has no tailorings */
- UCollator *coll = ucol_open("en_US", &status);
- /* tailored locales */
- char locale[][6] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN"};
- FileStream *file = NULL;
- char line[1024];
- UChar codepoints[10];
- int count = 0;
- UChar contextCPs[5];
- UParseError parseError;
- if (U_FAILURE(status)) {
- log_err("en_US collator creation failed\n");
- return;
- }
- log_verbose("Testing UCA elements\n");
- file = getFractionalUCA();
- if (file == NULL) {
- log_err("Fractional UCA data can not be opened\n");
- return;
- }
-
- while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) {
- if(line[0] == 0 || line[0] == '#' || line[0] == '\n' ||
- line[0] == 0x000D || line[0] == '[') {
- continue;
- }
- getCodePoints(line, codepoints, contextCPs);
- checkSortKeyValidity(coll, codepoints, u_strlen(codepoints));
- }
-
- log_verbose("Testing UCA elements for the whole range of unicode characters\n");
- codepoints[0] = 0;
-
- while (codepoints[0] < 0xFFFF) {
- if (u_isdefined((UChar32)codepoints[0])) {
- checkSortKeyValidity(coll, codepoints, 1);
- }
- codepoints[0] ++;
- }
-
- ucol_close(coll);
-
- /* testing tailored collation elements */
- log_verbose("Testing tailored elements\n");
- while (count < 5) {
- const UChar *rules = NULL,
- *current = NULL;
- UChar *rulesCopy = NULL;
- int32_t ruleLen = 0;
-
- uint32_t chOffset = 0;
- uint32_t chLen = 0;
- uint32_t exOffset = 0;
- uint32_t exLen = 0;
- uint32_t prefixOffset = 0;
- uint32_t prefixLen = 0;
- UBool startOfRules = TRUE;
- UColOptionSet opts;
-
- UColTokenParser src;
- uint32_t strength = 0;
- uint16_t specs = 0;
-
- coll = ucol_open(locale[count], &status);
- if (U_FAILURE(status)) {
- log_err("%s collator creation failed\n", locale[count]);
- return;
- }
+ ucol_setOffset(uce, kLen_tsceText, &status); /* backward pass starts from the end of the text */
+ status = U_ZERO_ERROR; /* begin the backward pass with a clean status regardless of what came before */
+ nextOffsetPtr = tsceItemPtr->offsets + tsceItemPtr->offsetsLen; /* backward pass: walk the expected offsets last to first */
+ limitOffsetPtr = tsceItemPtr->offsets;
+ do {
+ offset = ucol_getOffset(uce); /* sampled before the iterator moves back */
+ element = ucol_previous(uce, &status);
+ if ( element == 0 ) {
+ log_err("error, locale %s, ucol_previous returned element 0\n", tsceItemPtr->locale );
+ }
+ if ( nextOffsetPtr > limitOffsetPtr ) {
+ nextOffsetPtr--; /* pre-decrement: the pointer starts one past the last expected offset */
+ if (offset != *nextOffsetPtr) {
+ log_err("error, locale %s, expected ucol_previous -> ucol_getOffset %d, got %d\n",
+ tsceItemPtr->locale, *nextOffsetPtr, offset );
+ nextOffsetPtr = limitOffsetPtr; /* mark the table as consumed so the "fewer elements" check after the loop stays quiet */
+ break;
+ }
+ } else { /* expected table exhausted but the iterator is still being called */
+ log_err("error, locale %s, ucol_previous returned more elements than expected\n", tsceItemPtr->locale );
+ }
+ } while ( U_SUCCESS(status) && element != UCOL_NULLORDER );
+ if ( nextOffsetPtr > limitOffsetPtr ) {
+ log_err("error, locale %s, ucol_previous returned fewer elements than expected\n", tsceItemPtr->locale );
+ }
- src.opts = &opts;
- rules = ucol_getRules(coll, &ruleLen);
-
- if (ruleLen > 0) {
- rulesCopy = (UChar *)malloc((ruleLen +
- UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar));
- uprv_memcpy(rulesCopy, rules, ruleLen * sizeof(UChar));
- src.current = src.source = rulesCopy;
- src.end = rulesCopy + ruleLen;
- src.extraCurrent = src.end;
- src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
-
- while ((current = ucol_tok_parseNextToken(&src, startOfRules,&parseError, &status)) != NULL) {
- strength = src.parsedToken.strength;
- chOffset = src.parsedToken.charsOffset;
- chLen = src.parsedToken.charsLen;
- exOffset = src.parsedToken.extensionOffset;
- exLen = src.parsedToken.extensionLen;
- prefixOffset = src.parsedToken.prefixOffset;
- prefixLen = src.parsedToken.prefixLen;
- specs = src.parsedToken.flags;
-
- startOfRules = FALSE;
- uprv_memcpy(codepoints, src.source + chOffset,
- chLen * sizeof(UChar));
- codepoints[chLen] = 0;
- checkSortKeyValidity(coll, codepoints, chLen);
+ ucol_closeElements(uce);
+ } else {
+ log_err("error, locale %s, ucol_openElements failed: %s\n", tsceItemPtr->locale, u_errorName(status) );
 }
- free(rulesCopy);
+ ucol_close(ucol); /* the collator is closed whether or not the iterator could be opened */
+ } else {
+ log_data_err("error, locale %s, ucol_open failed: %s\n", tsceItemPtr->locale, u_errorName(status) );
 }
-
- ucol_close(coll);
- count ++;
 }
- T_FileStream_close(file);
}
#endif /* #if !UCONFIG_NO_COLLATION */