/********************************************************************
* COPYRIGHT:
- * Copyright (c) 1997-2003, International Business Machines Corporation and
+ * Copyright (c) 1997-2011, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/********************************************************************************
#if !UCONFIG_NO_COLLATION
#include "unicode/ucol.h"
+#include "unicode/ucoleitr.h"
#include "unicode/uloc.h"
#include "unicode/uchar.h"
#include "unicode/ustring.h"
+#include "unicode/putil.h"
+#include "callcoll.h"
#include "cmemory.h"
#include "cintltst.h"
#include "citertst.h"
#include "cstring.h"
#include "ucol_imp.h"
#include "ucol_tok.h"
+#include "uparse.h"
#include <stdio.h>
extern uint8_t ucol_uprv_getCaseBits(const UChar *, uint32_t, UErrorCode *);
addTest(root, &TestCEBufferOverflow, "tscoll/citertst/TestCEBufferOverflow");
addTest(root, &TestCEValidity, "tscoll/citertst/TestCEValidity");
addTest(root, &TestSortKeyValidity, "tscoll/citertst/TestSortKeyValidity");
+ addTest(root, &TestSearchCollatorElements, "tscoll/citertst/TestSearchCollatorElements");
}
/* The locales we support */
UCollationElements *titer = ucol_openElements(coll, text, -1,
&status);
if (U_FAILURE(status)) {
- log_err("ERROR: in creation of either the collator or the collation iterator :%s\n",
+ log_err_status(status, "ERROR: in creation of either the collator or the collation iterator :%s\n",
myErrorName(status));
return;
}
pitr = ucol_openElements(coll, pattern, -1, &status);
titer = ucol_openElements(coll, text, -1, &status);
if (U_FAILURE(status)) {
- log_err("ERROR: in creation of either the collator or the collation iterator :%s\n",
+ log_err_status(status, "ERROR: in creation of either the collator or the collation iterator :%s\n",
myErrorName(status));
return;
}
UChar *test;
en_us = ucol_open("en_US", &status);
if (U_FAILURE(status)){
- log_err("ERROR: in creation of collation data using ucol_open()\n %s\n",
+ log_err_status(status, "ERROR: in creation of collation data using ucol_open()\n %s\n",
myErrorName(status));
return;
}
/* thai should have normalization on */
th_th = ucol_open("th_TH", &status);
if (U_FAILURE(status)){
- log_err("ERROR: in creation of thai collation using ucol_open()\n %s\n",
+ log_err_status(status, "ERROR: in creation of thai collation using ucol_open()\n %s\n",
myErrorName(status));
return;
}
coll = ucol_openRules(rule, rulelen, UCOL_ON, UCOL_TERTIARY, NULL, &status);
ucol_setAttribute(coll, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
if (U_FAILURE(status)){
- log_err("ERROR: in creation of collator using ucol_openRules()\n %s\n",
+ log_err_status(status, "ERROR: in creation of collator using ucol_openRules()\n %s\n",
myErrorName(status));
return;
}
UCollator *c1, *c2, *c3;
UCollationElements *iter;
UErrorCode status = U_ZERO_ERROR;
+ UChar test1[50];
+ UChar test2[50];
- test1=(UChar*)malloc(sizeof(UChar) * 50);
- test2=(UChar*)malloc(sizeof(UChar) * 50);
u_uastrcpy(test1, "What subset of all possible test cases?");
u_uastrcpy(test2, "has the highest probability of detecting");
coll = ucol_open("en_US", &status);
iter=ucol_openElements(coll, test1, u_strlen(test1), &status);
log_verbose("English locale testing back and forth\n");
if(U_FAILURE(status)){
- log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
+ log_err_status(status, "ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
myErrorName(status));
ucol_close(coll);
return;
ucol_close(coll);
free(source);
- free(test1);
- free(test2);
}
/**
UCollator *en_us=NULL;
UCollationElements *iter, *pristine;
int32_t offset;
- int32_t *orders;
+ OrderAndOffset *orders;
int32_t orderLength=0;
int count = 0;
- test1=(UChar*)malloc(sizeof(UChar) * 50);
- test2=(UChar*)malloc(sizeof(UChar) * 50);
+ UChar test1[50];
+ UChar test2[50];
+
u_uastrcpy(test1, "What subset of all possible test cases?");
u_uastrcpy(test2, "has the highest probability of detecting");
en_us = ucol_open("en_US", &status);
- log_verbose("Testing getOffset and setOffset for CollationElements\n");
+ log_verbose("Testing getOffset and setOffset for collations\n");
iter = ucol_openElements(en_us, test1, u_strlen(test1), &status);
if(U_FAILURE(status)){
- log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
+ log_err_status(status, "ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n",
myErrorName(status));
ucol_close(en_us);
return;
}
+
+ /* testing boundaries */
+ ucol_setOffset(iter, 0, &status);
+ if (U_FAILURE(status) || ucol_previous(iter, &status) != UCOL_NULLORDER) {
+ log_err("Error: After setting offset to 0, we should be at the end "
+ "of the backwards iteration");
+ }
+ ucol_setOffset(iter, u_strlen(test1), &status);
+ if (U_FAILURE(status) || ucol_next(iter, &status) != UCOL_NULLORDER) {
+ log_err("Error: After setting offset to end of the string, we should "
+ "be at the end of the backwards iteration");
+ }
+
/* Run all the way through the iterator, then get the offset */
orders = getOrders(iter, &orderLength);
switch (count) {
case 0:
if (ucol_getOffset(iter) != 1) {
- log_err("ERROR: Offset of iteration should be 0\n");
+ log_err("ERROR: Offset of iteration should be 1\n");
}
break;
case 3:
U_SUCCESS(status)) {
switch (count) {
case 0:
+ case 1:
if (ucol_getOffset(iter) != 3) {
log_err("ERROR: Offset of iteration should be 3\n");
}
break;
+ case 2:
+ if (ucol_getOffset(iter) != 1) {
+ log_err("ERROR: Offset of iteration should be 1\n");
+ }
+ break;
default:
if (ucol_getOffset(iter) != 0) {
log_err("ERROR: Offset of iteration should be 0\n");
ucol_closeElements(iter);
ucol_close(en_us);
- free(test1);
- free(test2);
}
/**
UErrorCode status = U_ZERO_ERROR;
UCollator *en_us=NULL;
UCollationElements *iter1, *iter2;
- test1=(UChar*)malloc(sizeof(UChar) * 50);
- test2=(UChar*)malloc(sizeof(UChar) * 50);
+ UChar test1[50];
+ UChar test2[50];
+
u_uastrcpy(test1, "What subset of all possible test cases?");
u_uastrcpy(test2, "has the highest probability of detecting");
en_us = ucol_open("en_US", &status);
log_verbose("testing setText for Collation elements\n");
iter1=ucol_openElements(en_us, test1, u_strlen(test1), &status);
if(U_FAILURE(status)){
- log_err("ERROR: in creation of collation element iterator1 using ucol_openElements()\n %s\n",
+ log_err_status(status, "ERROR: in creation of collation element iterator1 using ucol_openElements()\n %s\n",
myErrorName(status));
ucol_close(en_us);
return;
ucol_closeElements(iter2);
ucol_closeElements(iter1);
ucol_close(en_us);
- free(test1);
- free(test2);
-}
-
-
-
-static void backAndForth(UCollationElements *iter)
-{
- /* Run through the iterator forwards and stick it into an array */
- int32_t index, o;
- UErrorCode status = U_ZERO_ERROR;
- int32_t orderLength = 0;
- int32_t *orders;
- orders= getOrders(iter, &orderLength);
-
-
- /* Now go through it backwards and make sure we get the same values */
- index = orderLength;
- ucol_reset(iter);
-
- /* synwee : changed */
- while ((o = ucol_previous(iter, &status)) != UCOL_NULLORDER)
- {
- if (o != orders[-- index])
- {
- if (o == 0)
- index ++;
- else
- {
- while (index > 0 && orders[-- index] == 0)
- {
- }
- if (o != orders[index])
- {
- log_err("Mismatch at index : 0x%x\n", index);
- return;
- }
-
- }
- }
- }
-
- while (index != 0 && orders[index - 1] == 0) {
- index --;
- }
-
- if (index != 0)
- {
- log_err("Didn't get back to beginning - index is %d\n", index);
-
- ucol_reset(iter);
- log_err("\nnext: ");
- if ((o = ucol_next(iter, &status)) != UCOL_NULLORDER)
- {
- log_err("Error at %x\n", o);
- }
- log_err("\nprev: ");
- if ((o = ucol_previous(iter, &status)) != UCOL_NULLORDER)
- {
- log_err("Error at %x\n", o);
- }
- log_verbose("\n");
- }
-
- free(orders);
}
/** @bug 4108762
UErrorCode status = U_ZERO_ERROR;
UCollator *coll ;/*= ucol_open("en_US", &status);*/
UChar ch = 0;
- UChar supplementary[2] = {0xD800, 0xDC00};
+ UChar32 unassigned = 0xEFFFD;
+ UChar supplementary[2];
+ uint32_t stringOffset = 0;
+ UBool isError = FALSE;
uint32_t sorder = 0;
UCollationElements *iter ;/*= ucol_openElements(coll, &ch, 1, &status);*/
uint32_t temporder = 0;
ch, 3);
}
+ U16_APPEND(supplementary, stringOffset, 2, unassigned, isError);
ucol_setText(iter, supplementary, 2, &status);
sorder = ucol_previous(iter, &status);
ucol_closeElements(iter);
ucol_close(coll);
} else {
- log_data_err("Couldn't open collator\n");
- }
-
-}
-
-/**
- * Return an integer array containing all of the collation orders
- * returned by calls to next on the specified iterator
- */
-static int32_t* getOrders(UCollationElements *iter, int32_t *orderLength)
-{
- UErrorCode status;
- int32_t order;
- int32_t maxSize = 100;
- int32_t size = 0;
- int32_t *temp;
- int32_t *orders =(int32_t*)malloc(sizeof(int32_t) * maxSize);
- status= U_ZERO_ERROR;
-
-
- while ((order=ucol_next(iter, &status)) != UCOL_NULLORDER)
- {
- if (size == maxSize)
- {
- maxSize *= 2;
- temp = (int32_t*)malloc(sizeof(int32_t) * maxSize);
-
- memcpy(temp, orders, size * sizeof(int32_t));
- free(orders);
- orders = temp;
-
- }
-
- orders[size++] = order;
- }
-
- if (maxSize > size)
- {
- if (size == 0) {
- size = 1;
- temp = (int32_t*)malloc(sizeof(int32_t) * size);
- temp[0] = 0;
- }
- else {
- temp = (int32_t*)malloc(sizeof(int32_t) * size);
- memcpy(temp, orders, size * sizeof(int32_t));
- }
-
- free(orders);
- orders = temp;
+ log_err_status(status, "Couldn't open collator -> %s\n", u_errorName(status));
}
- *orderLength = size;
- return orders;
}
UCollationElements *testiter,
*iter;
int32_t count = 0;
- int32_t *testorders,
+ OrderAndOffset *testorders,
*orders;
UChar teststr[500];
while (count != 0) {
/* UCA collation element for 0x0F76 */
- if ((count > 250 && testorders[-- count] != orders[1]) ||
- (count <= 250 && testorders[-- count] != orders[0])) {
+ if ((count > 250 && testorders[-- count].order != orders[1].order) ||
+ (count <= 250 && testorders[-- count].order != orders[0].order)) {
log_err("Error decomposition does not give the right collation element at %d count\n", count);
break;
}
free(orders);
ucol_reset(testiter);
- /* ensures that the writable buffer was cleared */
- if (testiter->iteratordata_.writableBuffer !=
- testiter->iteratordata_.stackWritableBuffer) {
- log_err("Error Writable buffer in collation element iterator not reset\n");
- }
/* ensures closing of elements done properly to clear writable buffer */
ucol_next(testiter, &status);
ucol_closeElements(iter);
ucol_close(coll);
} else {
- log_data_err("Couldn't open collator\n");
+ log_err_status(status, "Couldn't open collator -> %s\n", u_errorName(status));
}
}
* @param codepoints array for storage, assuming size > 5
* @return position at the end of the codepoint section
*/
-static char * getCodePoints(char *str, UChar *codepoints) {
- char *pStartCP = str;
- char *pEndCP = str + 4;
-
- *codepoints = (UChar)((hex2num(*pStartCP) << 12) |
- (hex2num(*(pStartCP + 1)) << 8) |
- (hex2num(*(pStartCP + 2)) << 4) |
- (hex2num(*(pStartCP + 3))));
- codepoints ++;
- while (*pEndCP != ';') {
- pStartCP = pEndCP + 1;
- *codepoints = (UChar)((hex2num(*pStartCP) << 12) |
- (hex2num(*(pStartCP + 1)) << 8) |
- (hex2num(*(pStartCP + 2)) << 4) |
- (hex2num(*(pStartCP + 3))));
- codepoints ++;
- pEndCP = pStartCP + 4;
- }
+static char *getCodePoints(char *str, UChar *codepoints, UChar *contextCPs) { /* Parses the code point field (up to the first ';') of a FractionalUCA.txt line into codepoints; an optional "precontext|" prefix is also stored in contextCPs. */
+ UErrorCode errorCode = U_ZERO_ERROR;
+ char *semi = uprv_strchr(str, ';'); /* end of the code point field */
+ char *pipe = uprv_strchr(str, '|'); /* separator between precontext and code points, if any */
+ char *s;
*codepoints = 0;
- return pEndCP + 1;
+ *contextCPs = 0; /* empty precontext unless a '|' is found below */
+ if(semi == NULL) {
+ log_err("expected semicolon after code point string in FractionalUCA.txt %s\n", str);
+ return str; /* on any error the input pointer is returned unchanged */
+ }
+ if(pipe != NULL) {
+ int32_t contextLength;
+ *pipe = 0; /* temporarily cut the line at '|' so only the precontext is parsed */
+ contextLength = u_parseString(str, contextCPs, 99, NULL, &errorCode); /* NOTE(review): capacity 99 is hard-coded; the caller buffers visible in this file hold only 3-11 UChars - confirm the data can never exceed them */
+ *pipe = '|'; /* restore the line */
+ if(U_FAILURE(errorCode)) {
+ log_err("error parsing precontext string from FractionalUCA.txt %s\n", str);
+ return str;
+ }
+ /* prepend the precontext string to the codepoints */
+ u_memcpy(codepoints, contextCPs, contextLength);
+ codepoints += contextLength;
+ /* start of the code point string */
+ s = pipe + 1;
+ } else {
+ s = str;
+ }
+ u_parseString(s, codepoints, 99, NULL, &errorCode); /* NOTE(review): same hard-coded capacity 99 as above */
+ if(U_FAILURE(errorCode)) {
+ log_err("error parsing code point string from FractionalUCA.txt %s\n", str);
+ return str;
+ }
+ return semi + 1; /* success: first character after the ';' */
}
/**
FileStream *result = NULL;
/* Look inside ICU_DATA first */
- uprv_strcpy(newPath, u_getDataDirectory());
+ uprv_strcpy(newPath, ctest_dataSrcDir());
uprv_strcat(newPath, "unidata" U_FILE_SEP_STRING );
uprv_strcat(newPath, "FractionalUCA.txt");
*/
static void TestCEs() {
FileStream *file = NULL;
- char line[1024];
+ char line[2048];
char *str;
- UChar codepoints[5];
+ UChar codepoints[10];
uint32_t ces[20];
UErrorCode status = U_ZERO_ERROR;
UCollator *coll = ucol_open("", &status);
uint32_t lineNo = 0;
+ UChar contextCPs[5];
if (U_FAILURE(status)) {
- log_err("Error in opening root collator\n");
+ log_err_status(status, "Error in opening root collator -> %s\n", u_errorName(status));
return;
}
while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) {
int count = 0;
UCollationElements *iter;
+ int32_t preContextCeLen=0;
lineNo++;
/* skip this line if it is empty or a comment or is a return value
or start of some variable section */
continue;
}
- str = getCodePoints(line, codepoints);
+ str = getCodePoints(line, codepoints, contextCPs);
/* these are 'fake' codepoints in the fractional UCA, and are used just
* for positioning of indirect values. They should not go through this
if(*codepoints == 0xFDD0) {
continue;
}
+ if (*contextCPs != 0) {
+ iter = ucol_openElements(coll, contextCPs, -1, &status);
+ if (U_FAILURE(status)) {
+ log_err("Error in opening collation elements\n");
+ break;
+ }
+ while((ces[preContextCeLen] = ucol_next(iter, &status)) != (uint32_t)UCOL_NULLORDER) {
+ preContextCeLen++;
+ }
+ ucol_closeElements(iter);
+ }
- getCEs(str, ces, &status);
+ getCEs(str, ces+preContextCeLen, &status);
if (U_FAILURE(status)) {
log_err("Error in parsing collation elements in FractionalUCA.txt\n");
break;
resultiter = ucol_openElements(coll, rule, 1, &status);
if (U_FAILURE(status)) {
- log_err("Error opening collation rules\n");
+ log_err_status(status, "Error opening collation rules -> %s\n", u_errorName(status));
return;
}
u_uastrcpy(rule, "&z < AB");
coll = ucol_openRules(rule, u_strlen(rule), UCOL_OFF, UCOL_DEFAULT_STRENGTH, NULL,&status);
if (U_FAILURE(status)) {
- log_err("Rule based collator not created for testing ce buffer overflow\n");
+ log_err_status(status, "Rule based collator not created for testing ce buffer overflow -> %s\n", u_errorName(status));
return;
}
str[UCOL_EXPAND_CE_BUFFER_SIZE] = 0x0042; /* 'B' */
iter = ucol_openElements(coll, str, UCOL_EXPAND_CE_BUFFER_SIZE + 1,
&status);
- if (ucol_previous(iter, &status) != UCOL_NULLORDER ||
- status != U_BUFFER_OVERFLOW_ERROR) {
- log_err("CE buffer expected to overflow with long string of trail surrogates\n");
+ if (ucol_previous(iter, &status) == UCOL_NULLORDER ||
+ status == U_BUFFER_OVERFLOW_ERROR) {
+ log_err("CE buffer should not overflow with long string of trail surrogates\n");
}
ucol_closeElements(iter);
ucol_close(coll);
}
/**
-* Byte bounds checks. Checks if each byte in data is between upper and lower
-* inclusive.
-*/
-static UBool checkByteBounds(uint32_t data, char upper, char lower)
-{
- int count = 4;
- while (count > 0) {
- char b = (char)(data & 0xFF);
- if (b > upper || b < lower) {
- return FALSE;
- }
- data = data >> 8;
- count --;
- }
- return TRUE;
-}
-
-/**
-* Determines case of the string of codepoints.
-* If it is a multiple codepoints it has to treated as a contraction.
+* Checking collation element validity.
*/
-#if 0
-static uint8_t getCase(const UChar *s, uint32_t len) {
- UBool lower = FALSE;
- UBool upper = FALSE;
- UBool title = FALSE;
- UErrorCode status = U_ZERO_ERROR;
- UChar str[256];
- const UChar *ps = s;
-
- if (len == 0) {
- return UCOL_LOWER_CASE;
- }
-
- while (len > 0) {
- UChar c = *ps ++;
-
- if (u_islower(c)) {
- lower = TRUE;
- }
- if (u_isupper(c)) {
- upper = TRUE;
- }
- if (u_istitle(c)) {
- title = TRUE;
+#define MAX_CODEPOINTS_TO_SHOW 10 /* at most this many code units are formatted by showCodepoints */
+static void showCodepoints(const UChar *codepoints, int length, char * codepointText) { /* Formats up to MAX_CODEPOINTS_TO_SHOW code units as " XXXX" hex groups into codepointText, followed by " ..." when the input was truncated. */
+ int i, lengthToUse = length;
+ if (lengthToUse > MAX_CODEPOINTS_TO_SHOW) {
+ lengthToUse = MAX_CODEPOINTS_TO_SHOW;
+ }
+ for (i = 0; i < lengthToUse; ++i) { /* NOTE(review): when length <= 0 nothing is ever written, so codepointText is left unterminated - callers must pass length > 0 */
+ int bytesWritten = sprintf(codepointText, " %04X", *codepoints++); /* no size is passed: the caller's buffer must hold 5 chars per shown unit plus " ..." and the NUL */
+ if (bytesWritten <= 0) {
+ break;
}
-
- len --;
- }
- if ((lower && !upper && !title) || (!lower && !upper && !title)){
- return UCOL_LOWER_CASE;
- }
- if (upper && !lower && !title) {
- return UCOL_UPPER_CASE;
- }
- /* mix of cases here */
- /* len = unorm_normalize(s, len, UNORM_NFKD, 0, str, 256, &status);
- if (U_FAILURE(status)) {
- log_err("Error normalizing data string\n");
- return UCOL_LOWER_CASE;
- }*/
-
- if ((title && len >= 2) || (lower && upper)) {
- return UCOL_MIXED_CASE;
+ codepointText += bytesWritten; /* advance to the terminating NUL written by sprintf */
}
- if (u_isupper(s[0])) {
- return UCOL_UPPER_CASE;
+ if (i < length) { /* fewer units were formatted than were supplied */
+ sprintf(codepointText, " ...");
}
- return UCOL_LOWER_CASE;
}
-#endif
-/**
-* Checking collation element validity given the boundary arguments.
-*/
static UBool checkCEValidity(const UCollator *coll, const UChar *codepoints,
- int length, uint32_t primarymax,
- uint32_t secondarymax)
+ int length) /* Iterates over all CEs that coll produces for codepoints[0..length) and checks their weight bytes; returns TRUE only if iteration reaches UCOL_NULLORDER without a violation. */
{
UErrorCode status = U_ZERO_ERROR;
UCollationElements *iter = ucol_openElements(coll, codepoints, length,
&status);
- uint32_t ce;
- UBool first = TRUE;
-/*
- UBool upper = FALSE;
- UBool lower = FALSE;
-*/
+ UBool result = FALSE;
+ UBool primaryDone = FALSE, secondaryDone = FALSE, tertiaryDone = FALSE; /* set when a level's weight has ended; a following continuation CE must then be 0 on that level */
+ const char * collLocale;
if (U_FAILURE(status)) {
log_err("Error creating iterator for testing validity\n");
+ return FALSE;
+ }
+ collLocale = ucol_getLocale(coll, ULOC_VALID_LOCALE, &status);
+ if (U_FAILURE(status) || collLocale==NULL) { /* the locale name is used only in the diagnostic below, so fall back to "?" and clear the error */
+ status = U_ZERO_ERROR;
+ collLocale = "?";
}
- ce = ucol_next(iter, &status);
-
- while (ce != UCOL_NULLORDER) {
- if (ce != 0) {
- uint32_t primary = UCOL_PRIMARYORDER(ce);
- uint32_t secondary = UCOL_SECONDARYORDER(ce);
- uint32_t tertiary = UCOL_TERTIARYORDER(ce);
-/* uint32_t scasebits = tertiary & 0xC0;*/
-
- if ((tertiary == 0 && secondary != 0) ||
- (tertiary < 0xC0 && secondary == 0 && primary != 0)) {
- /* n-1th level is not zero when the nth level is
- except for continuations, this is wrong */
- log_err("Lower level weight not 0 when high level weight is 0\n");
- goto fail;
- }
- else {
- /* checks if any byte is illegal ie = 01 02 03. */
- if (checkByteBounds(ce, 0x3, 0x1)) {
- log_err("Byte range in CE lies in illegal bounds 0x1 - 0x3\n");
- goto fail;
- }
- }
- if ((primary != 0 && primary < primarymax) || (primary >= 0xFF00 && !isContinuation(ce))) {
- log_err("UCA primary weight out of bounds\n");
- goto fail;
- }
- /* case matching not done since data generated by ken */
- if (first) {
- if (secondary >= 6 && secondary <= secondarymax) {
- log_err("Secondary weight out of range\n");
- goto fail;
- }
- first = FALSE;
- }
- }
- ce = ucol_next(iter, &status);
- }
- ucol_closeElements(iter);
- return TRUE;
-fail :
- ucol_closeElements(iter);
- return FALSE;
+ for (;;) { /* every break below except the UCOL_NULLORDER one leaves result == FALSE */
+ uint32_t ce = ucol_next(iter, &status);
+ uint32_t primary, p1, p2, secondary, tertiary;
+ if (ce == UCOL_NULLORDER) {
+ result = TRUE;
+ break;
+ }
+ if (ce == 0) { /* all-zero CEs are skipped without any check */
+ continue;
+ }
+ if (ce == 0x02000202) {
+ /* special CE for merge-sort character */
+ if (*codepoints == 0xFFFE /* && length == 1 */) {
+ /*
+ * Note: We should check for length==1 but the token parser appears
+ * to give us trailing NUL characters.
+ * TODO: Ticket #8047: Change TestCEValidity to use ucol_getTailoredSet()
+ * rather than the internal collation rule parser
+ */
+ continue;
+ } else {
+ log_err("Special 02/02/02 weight for code point U+%04X [len %d] != U+FFFE\n",
+ (int)*codepoints, (int)length);
+ break;
+ }
+ }
+ primary = UCOL_PRIMARYORDER(ce);
+ p1 = primary >> 8;
+ p2 = primary & 0xFF;
+ secondary = UCOL_SECONDARYORDER(ce);
+ tertiary = UCOL_TERTIARYORDER(ce) & UCOL_REMOVE_CONTINUATION;
+
+ if (!isContinuation(ce)) { /* first CE of a sequence: restarts the per-level "done" tracking */
+ if ((ce & UCOL_REMOVE_CONTINUATION) == 0) {
+ log_err("Empty CE %08lX except for case bits\n", (long)ce);
+ break;
+ }
+ if (p1 == 0) {
+ if (p2 != 0) {
+ log_err("Primary 00 xx in %08lX\n", (long)ce);
+ break;
+ }
+ primaryDone = TRUE;
+ } else {
+ if (p1 <= 2 || p1 >= 0xF0) {
+ /* Primary first bytes F0..FF are specials. */
+ log_err("Primary first byte of %08lX out of range\n", (long)ce);
+ break;
+ }
+ if (p2 == 0) {
+ primaryDone = TRUE;
+ } else {
+ if (p2 <= 3 || p2 >= 0xFF) {
+ /* Primary second bytes 03 and FF are sort key compression terminators. */
+ log_err("Primary second byte of %08lX out of range\n", (long)ce);
+ break;
+ }
+ primaryDone = FALSE;
+ }
+ }
+ if (secondary == 0) {
+ if (primary != 0) {
+ log_err("Primary!=0 secondary==0 in %08lX\n", (long)ce);
+ break;
+ }
+ secondaryDone = TRUE;
+ } else {
+ if (secondary <= 2 ||
+ (UCOL_BYTE_COMMON < secondary && secondary <= (UCOL_BYTE_COMMON + 0x80))
+ ) {
+ /* Secondary first bytes common+1..+0x80 are used for sort key compression. */
+ log_err("Secondary byte of %08lX out of range\n", (long)ce);
+ break;
+ }
+ secondaryDone = FALSE;
+ }
+ if (tertiary == 0) {
+ /* We know that ce != 0. */
+ log_err("Primary!=0 or secondary!=0 but tertiary==0 in %08lX\n", (long)ce);
+ break;
+ }
+ if (tertiary <= 2) {
+ log_err("Tertiary byte of %08lX out of range\n", (long)ce);
+ break;
+ }
+ tertiaryDone = FALSE;
+ } else { /* continuation CE: may only extend levels that have not already ended */
+ if ((ce & UCOL_REMOVE_CONTINUATION) == 0) {
+ log_err("Empty continuation %08lX\n", (long)ce);
+ break;
+ }
+ if (primaryDone && primary != 0) {
+ log_err("Primary was done but continues in %08lX\n", (long)ce);
+ break;
+ }
+ if (p1 == 0) {
+ if (p2 != 0) {
+ log_err("Primary 00 xx in %08lX\n", (long)ce);
+ break;
+ }
+ primaryDone = TRUE;
+ } else {
+ if (p1 <= 2) {
+ log_err("Primary first byte of %08lX out of range\n", (long)ce);
+ break;
+ }
+ if (p2 == 0) {
+ primaryDone = TRUE;
+ } else {
+ if (p2 <= 3) {
+ log_err("Primary second byte of %08lX out of range\n", (long)ce);
+ break;
+ }
+ }
+ }
+ if (secondaryDone && secondary != 0) {
+ log_err("Secondary was done but continues in %08lX\n", (long)ce);
+ break;
+ }
+ if (secondary == 0) {
+ secondaryDone = TRUE;
+ } else {
+ if (secondary <= 2) {
+ log_err("Secondary byte of %08lX out of range\n", (long)ce);
+ break;
+ }
+ }
+ if (tertiaryDone && tertiary != 0) {
+ log_err("Tertiary was done but continues in %08lX\n", (long)ce);
+ break;
+ }
+ if (tertiary == 0) {
+ tertiaryDone = TRUE;
+ } else if (tertiary <= 2) {
+ log_err("Tertiary byte of %08lX out of range\n", (long)ce);
+ break;
+ }
+ }
+ }
+ if (!result) { /* a check failed: also report which locale and code points were being tested */
+ char codepointText[5*MAX_CODEPOINTS_TO_SHOW + 5]; /* 5 chars per " XXXX" group, plus " ..." and the NUL */
+ showCodepoints(codepoints, length, codepointText);
+ log_err("Locale: %s Code point string: %s\n", collLocale, codepointText);
+ }
+ ucol_closeElements(iter);
+ return result;
}
static void TestCEValidity()
/* testing UCA collation elements */
UErrorCode status = U_ZERO_ERROR;
/* en_US has no tailorings */
- UCollator *coll = ucol_open("en_US", &status);
+ UCollator *coll = ucol_open("root", &status);
/* tailored locales */
- char locale[][6] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN"};
- FileStream *file = getFractionalUCA();
- char line[1024];
- UChar codepoints[10];
+ char locale[][11] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN", "zh__PINYIN"};
+ const char *loc;
+ FileStream *file = NULL;
+ char line[2048];
+ UChar codepoints[11];
int count = 0;
+ int maxCount = 0;
+ UChar contextCPs[3];
+ UChar32 c;
UParseError parseError;
if (U_FAILURE(status)) {
- log_err("en_US collator creation failed\n");
+ log_err_status(status, "en_US collator creation failed -> %s\n", u_errorName(status));
return;
}
log_verbose("Testing UCA elements\n");
+ file = getFractionalUCA();
if (file == NULL) {
log_err("Fractional UCA data can not be opened\n");
return;
continue;
}
- getCodePoints(line, codepoints);
- checkCEValidity(coll, codepoints, u_strlen(codepoints), 5, 86);
+ getCodePoints(line, codepoints, contextCPs);
+ checkCEValidity(coll, codepoints, u_strlen(codepoints));
}
log_verbose("Testing UCA elements for the whole range of unicode characters\n");
- codepoints[0] = 0;
- while (codepoints[0] < 0xFFFF) {
- if (u_isdefined((UChar32)codepoints[0])) {
- checkCEValidity(coll, codepoints, 1, 5, 86);
+ for (c = 0; c <= 0xffff; ++c) {
+ if (u_isdefined(c)) {
+ codepoints[0] = (UChar)c;
+ checkCEValidity(coll, codepoints, 1);
+ }
+ }
+ for (; c <= 0x10ffff; ++c) {
+ if (u_isdefined(c)) {
+ int32_t i = 0;
+ U16_APPEND_UNSAFE(codepoints, i, c);
+ checkCEValidity(coll, codepoints, i);
}
- codepoints[0] ++;
}
ucol_close(coll);
/* testing tailored collation elements */
log_verbose("Testing tailored elements\n");
- while (count < 5) {
+ if(getTestOption(QUICK_OPTION)) {
+ maxCount = sizeof(locale)/sizeof(locale[0]);
+ } else {
+ maxCount = uloc_countAvailable();
+ }
+ while (count < maxCount) {
const UChar *rules = NULL,
*current = NULL;
UChar *rulesCopy = NULL;
UColTokenParser src;
uint32_t strength = 0;
uint16_t specs = 0;
+ if(getTestOption(QUICK_OPTION)) {
+ loc = locale[count];
+ } else {
+ loc = uloc_getAvailable(count);
+ if(!hasCollationElements(loc)) {
+ count++;
+ continue;
+ }
+ }
- coll = ucol_open(locale[count], &status);
+ uprv_memset(&src, 0, sizeof(UColTokenParser));
+
+ log_verbose("Testing CEs for %s\n", loc);
+
+ coll = ucol_open(loc, &status);
if (U_FAILURE(status)) {
- log_err("%s collator creation failed\n", locale[count]);
+ log_err("%s collator creation failed\n", loc);
return;
}
rules = ucol_getRules(coll, &ruleLen);
if (ruleLen > 0) {
- rulesCopy = (UChar *)malloc((ruleLen +
+ rulesCopy = (UChar *)uprv_malloc((ruleLen +
UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar));
uprv_memcpy(rulesCopy, rules, ruleLen * sizeof(UChar));
src.current = src.source = rulesCopy;
src.extraCurrent = src.end;
src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
+ /* Note that as a result of tickets 7015 or 6912, ucol_tok_parseNextToken can cause the pointer to
+ the rules copy in src.source to get reallocated, freeing the original pointer in rulesCopy */
while ((current = ucol_tok_parseNextToken(&src, startOfRules, &parseError,&status)) != NULL) {
strength = src.parsedToken.strength;
chOffset = src.parsedToken.charsOffset;
uprv_memcpy(codepoints, src.source + chOffset,
chLen * sizeof(UChar));
codepoints[chLen] = 0;
- checkCEValidity(coll, codepoints, chLen, 4, 85);
+ checkCEValidity(coll, codepoints, chLen);
}
- free(rulesCopy);
+ uprv_free(src.source);
}
ucol_close(coll);
UCOL_TERTIARY, UCOL_QUATERNARY,
UCOL_IDENTICAL};
int strengthlen = 5;
- int index = 0;
+ int strengthIndex = 0;
int caselevel = 0;
while (caselevel < 1) {
ucol_setAttribute(coll, UCOL_CASE_LEVEL, UCOL_ON, &status);
}
- while (index < strengthlen) {
+ while (strengthIndex < strengthlen) {
int count01 = 0;
uint32_t count = 0;
uint8_t sortkey[128];
uint32_t sklen;
- ucol_setStrength(coll, strength[index]);
+ ucol_setStrength(coll, strength[strengthIndex]);
sklen = ucol_getSortKey(coll, codepoints, length, sortkey, 128);
while (sortkey[count] != 0) {
- if (sortkey[count] == 2 || (sortkey[count] == 3 && count01 > 0 && index != 4)) {
+ if (sortkey[count] == 2 || (sortkey[count] == 3 && count01 > 0 && strengthIndex != 4)) {
printSortKeyError(codepoints, length, sortkey, sklen);
return FALSE;
}
count ++;
}
- if (count + 1 != sklen || (count01 != index + caselevel)) {
+ if (count + 1 != sklen || (count01 != strengthIndex + caselevel)) {
printSortKeyError(codepoints, length, sortkey, sklen);
return FALSE;
}
- index ++;
+ strengthIndex ++;
}
caselevel ++;
}
/* en_US has no tailorings */
UCollator *coll = ucol_open("en_US", &status);
/* tailored locales */
- char locale[][6] = {"fr_FR\0", "ko_KR\0", "sh_YU\0", "th_TH\0", "zh_CN\0"};
- FileStream *file = getFractionalUCA();
- char line[1024];
+ char locale[][6] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN"};
+ FileStream *file = NULL;
+ char line[2048];
UChar codepoints[10];
int count = 0;
+ UChar contextCPs[5];
UParseError parseError;
if (U_FAILURE(status)) {
- log_err("en_US collator creation failed\n");
+ log_err_status(status, "en_US collator creation failed -> %s\n", u_errorName(status));
return;
}
log_verbose("Testing UCA elements\n");
+ file = getFractionalUCA();
if (file == NULL) {
log_err("Fractional UCA data can not be opened\n");
return;
continue;
}
- getCodePoints(line, codepoints);
+ getCodePoints(line, codepoints, contextCPs);
+ if(codepoints[0] == 0xFFFE) {
+ /* Skip special merge-sort character U+FFFE which has otherwise illegal 02 weight bytes. */
+ continue;
+ }
checkSortKeyValidity(coll, codepoints, u_strlen(codepoints));
}
uint32_t strength = 0;
uint16_t specs = 0;
+ uprv_memset(&src, 0, sizeof(UColTokenParser));
+
coll = ucol_open(locale[count], &status);
if (U_FAILURE(status)) {
log_err("%s collator creation failed\n", locale[count]);
rules = ucol_getRules(coll, &ruleLen);
if (ruleLen > 0) {
- rulesCopy = (UChar *)malloc((ruleLen +
+ rulesCopy = (UChar *)uprv_malloc((ruleLen +
UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar));
uprv_memcpy(rulesCopy, rules, ruleLen * sizeof(UChar));
src.current = src.source = rulesCopy;
src.extraCurrent = src.end;
src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
+ /* Note that as a result of tickets 7015 or 6912, ucol_tok_parseNextToken can cause the pointer to
+ the rules copy in src.source to get reallocated, freeing the original pointer in rulesCopy */
while ((current = ucol_tok_parseNextToken(&src, startOfRules,&parseError, &status)) != NULL) {
strength = src.parsedToken.strength;
chOffset = src.parsedToken.charsOffset;
uprv_memcpy(codepoints, src.source + chOffset,
chLen * sizeof(UChar));
codepoints[chLen] = 0;
+ if(codepoints[0] == 0xFFFE) {
+ /* Skip special merge-sort character U+FFFE which has otherwise illegal 02 weight bytes. */
+ continue;
+ }
checkSortKeyValidity(coll, codepoints, chLen);
}
- free(rulesCopy);
+ uprv_free(src.source);
}
ucol_close(coll);
T_FileStream_close(file);
}
+/**
+* TestSearchCollatorElements tests iterator behavior (forwards and backwards) with
+* normalization on AND jamo tailoring, among other things.
+*
+* tsceText is the input string handed to ucol_openElements() by that test; the
+* expected-offset tables defined after it are compared against ucol_getOffset()
+* while the iterator walks this text.
+*/
+static const UChar tsceText[] = { /* Nothing in here should be ignorable */
+ 0x0020, 0xAC00, /* simple LV Hangul */
+ 0x0020, 0xAC01, /* simple LVT Hangul */
+ 0x0020, 0xAC0F, /* LVTT, last jamo expands for search */
+ 0x0020, 0xAFFF, /* LLVVVTT, every jamo expands for search */
+ 0x0020, 0x1100, 0x1161, 0x11A8, /* 0xAC01 as conjoining jamo */
+ 0x0020, 0x3131, 0x314F, 0x3131, /* 0xAC01 as compatibility jamo */
+ 0x0020, 0x1100, 0x1161, 0x11B6, /* 0xAC0F as conjoining jamo; last expands for search */
+ 0x0020, 0x1101, 0x1170, 0x11B6, /* 0xAFFF as conjoining jamo; all expand for search */
+ 0x0020, 0x00E6, /* small letter ae, expands */
+ 0x0020, 0x1E4D, /* small letter o with tilde and acute, decomposes */
+ 0x0020
+};
+enum { kLen_tsceText = sizeof(tsceText)/sizeof(tsceText[0]) }; /* number of UChars in tsceText; passed as the text length and as the end offset */
+
+static const int32_t rootStandardOffsets[] = { /* expected ucol_getOffset() value sampled before each ucol_next() call on tsceText; read in reverse for the ucol_previous() pass */
+ 0, 1,2,
+ 2, 3,4,4,
+ 4, 5,6,6,
+ 6, 7,8,8,
+ 8, 9,10,11,
+ 12, 13,14,15,
+ 16, 17,18,19,
+ 20, 21,22,23,
+ 24, 25,26,26,26,
+ 26, 27,28,28,
+ 28,
+ 29
+};
+enum { kLen_rootStandardOffsets = sizeof(rootStandardOffsets)/sizeof(rootStandardOffsets[0]) }; /* entry count of rootStandardOffsets */
+
+static const int32_t rootSearchOffsets[] = { /* expected offsets for a search collator with jamo tailorings; only referenced from the disabled #else branch of tsceItems */
+ 0, 1,2,
+ 2, 3,4,4,
+ 4, 5,6,6,6,
+ 6, 7,8,8,8,8,8,8,
+ 8, 9,10,11,
+ 12, 13,14,15,
+ 16, 17,18,19,20,
+ 20, 21,22,22,23,23,23,24,
+ 24, 25,26,26,26,
+ 26, 27,28,28,
+ 28,
+ 29
+};
+enum { kLen_rootSearchOffsets = sizeof(rootSearchOffsets)/sizeof(rootSearchOffsets[0]) }; /* entry count of rootSearchOffsets */
+
+typedef struct { /* one TestSearchCollatorElements case: a collator locale and its expected offsets */
+ const char * locale; /* locale ID passed to ucol_open(); NULL marks the end of the table */
+ const int32_t * offsets; /* expected ucol_getOffset() values, in forward iteration order */
+ int32_t offsetsLen; /* number of entries in offsets */
+} TSCEItem;
+
+static const TSCEItem tsceItems[] = { /* cases run by TestSearchCollatorElements, in order */
+ { "root", rootStandardOffsets, kLen_rootStandardOffsets },
+#if 1
+ /* No jamo tailorings in Apple version of search collator currently */
+ { "root@collation=search", rootStandardOffsets, kLen_rootStandardOffsets }, /* so the search collator is expected to give the same offsets as root */
+#else
+ /* Use this when we do have jamo tailorings */
+ { "root@collation=search", rootSearchOffsets, kLen_rootSearchOffsets },
+#endif
+ { NULL, NULL, 0 } /* terminator: iteration stops at locale == NULL */
+};
+
+/*
+ * For every entry of tsceItems: opens the collator and a collation element iterator
+ * over tsceText, then walks the text forwards with ucol_next() and backwards with
+ * ucol_previous(). Before each step ucol_getOffset() is sampled and compared with the
+ * entry's expected-offset table (front-to-back for the forward pass, back-to-front
+ * for the backward pass). A step that returns element 0 is reported as well.
+ *
+ * Failures to open the collator or the iterator are reported through
+ * log_err_status(), so the status code reaches the logger just as it does for the
+ * other open failures in this file.
+ */
+static void TestSearchCollatorElements(void)
+{
+    const TSCEItem * tsceItemPtr;
+    for (tsceItemPtr = tsceItems; tsceItemPtr->locale != NULL; tsceItemPtr++) {
+        UErrorCode status = U_ZERO_ERROR;
+        UCollator* ucol = ucol_open(tsceItemPtr->locale, &status);
+        if ( U_SUCCESS(status) ) {
+            UCollationElements * uce = ucol_openElements(ucol, tsceText, kLen_tsceText, &status);
+            if ( U_SUCCESS(status) ) {
+                int32_t offset, element;
+                const int32_t * nextOffsetPtr;
+                const int32_t * limitOffsetPtr;
+
+                /* Forward pass: expected offsets are consumed from the front of the table. */
+                nextOffsetPtr = tsceItemPtr->offsets;
+                limitOffsetPtr = tsceItemPtr->offsets + tsceItemPtr->offsetsLen;
+                do {
+                    offset = ucol_getOffset(uce);
+                    element = ucol_next(uce, &status);
+                    if ( element == 0 ) {
+                        log_err("error, locale %s, ucol_next returned element 0\n", tsceItemPtr->locale );
+                    }
+                    if ( nextOffsetPtr < limitOffsetPtr ) {
+                        if (offset != *nextOffsetPtr) {
+                            log_err("error, locale %s, expected ucol_next -> ucol_getOffset %d, got %d\n",
+                                                            tsceItemPtr->locale, *nextOffsetPtr, offset );
+                            /* mark the table as consumed so the "fewer elements" check below stays quiet */
+                            nextOffsetPtr = limitOffsetPtr;
+                            break;
+                        }
+                        nextOffsetPtr++;
+                    } else {
+                        log_err("error, locale %s, ucol_next returned more elements than expected\n", tsceItemPtr->locale );
+                    }
+                } while ( U_SUCCESS(status) && element != UCOL_NULLORDER );
+                if ( nextOffsetPtr < limitOffsetPtr ) {
+                    log_err("error, locale %s, ucol_next returned fewer elements than expected\n", tsceItemPtr->locale );
+                }
+
+                /* Backward pass: restart from the end of the text with a clean status and
+                   consume the expected offsets from the back of the table. */
+                ucol_setOffset(uce, kLen_tsceText, &status);
+                status = U_ZERO_ERROR;
+                nextOffsetPtr = tsceItemPtr->offsets + tsceItemPtr->offsetsLen;
+                limitOffsetPtr = tsceItemPtr->offsets;
+                do {
+                    offset = ucol_getOffset(uce);
+                    element = ucol_previous(uce, &status);
+                    if ( element == 0 ) {
+                        log_err("error, locale %s, ucol_previous returned element 0\n", tsceItemPtr->locale );
+                    }
+                    if ( nextOffsetPtr > limitOffsetPtr ) {
+                        nextOffsetPtr--;
+                        if (offset != *nextOffsetPtr) {
+                            log_err("error, locale %s, expected ucol_previous -> ucol_getOffset %d, got %d\n",
+                                                                tsceItemPtr->locale, *nextOffsetPtr, offset );
+                            /* mark the table as consumed so the "fewer elements" check below stays quiet */
+                            nextOffsetPtr = limitOffsetPtr;
+                            break;
+                        }
+                   } else {
+                        log_err("error, locale %s, ucol_previous returned more elements than expected\n", tsceItemPtr->locale );
+                    }
+                } while ( U_SUCCESS(status) && element != UCOL_NULLORDER );
+                if ( nextOffsetPtr > limitOffsetPtr ) {
+                    log_err("error, locale %s, ucol_previous returned fewer elements than expected\n", tsceItemPtr->locale );
+                }
+
+                ucol_closeElements(uce);
+            } else {
+                log_err_status(status, "error, locale %s, ucol_openElements failed: %s\n", tsceItemPtr->locale, u_errorName(status) );
+            }
+            ucol_close(ucol);
+        } else {
+            log_err_status(status, "error, locale %s, ucol_open failed: %s\n", tsceItemPtr->locale, u_errorName(status) );
+        }
+    }
+}
+
#endif /* #if !UCONFIG_NO_COLLATION */