-
/*
******************************************************************************
*
-* Copyright (C) 1999-2003, International Business Machines
+* Copyright (C) 1999-2007, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* created by: Markus W. Scherer
*/
-/* set import/export definitions */
-#ifndef U_COMMON_IMPLEMENTATION
-# define U_COMMON_IMPLEMENTATION
-#endif
-
#include "unicode/utypes.h"
+#include "unicode/putil.h"
#include "unicode/uchar.h"
#include "unicode/udata.h"
-#include "unicode/uset.h"
#include "ustr_imp.h"
#include "umutex.h"
#include "cmemory.h"
#include "cstring.h"
#include "ucln_cmn.h"
+#include "udataswp.h"
#include "uprops.h"
/* prototypes ------------------------------------------------------------- */
/*
* Maximum length of character names (regular & 1.0).
- * Maximum length of ISO comments.
*/
-static int32_t gMaxNameLength=0, gMaxISOCommentLength=0;
+static int32_t gMaxNameLength=0;
/*
* Set of chars used in character names (regular & 1.0).
- * Set of chars used in ISO comments.
* Chars are platform-dependent (can be EBCDIC).
*/
-static uint32_t gNameSet[8]={ 0 }, gISOCommentSet[8]={ 0 };
-
-static UBool
-isDataLoaded(UErrorCode *pErrorCode);
-
-static UBool U_CALLCONV
-isAcceptable(void *context,
- const char *type, const char *name,
- const UDataInfo *pInfo);
-
-static Group *
-getGroup(UCharNames *names, uint32_t code);
-
-static uint16_t
-getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice,
- char *buffer, uint16_t bufferLength);
-
-static const uint8_t *
-expandGroupLengths(const uint8_t *s,
- uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]);
+static uint32_t gNameSet[8]={ 0 };
-static uint16_t
-expandGroupName(UCharNames *names, Group *group,
- uint16_t lineNumber, UCharNameChoice nameChoice,
- char *buffer, uint16_t bufferLength);
-
-static uint16_t
-expandName(UCharNames *names,
- const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
- char *buffer, uint16_t bufferLength);
-
-static UBool
-compareName(UCharNames *names,
- const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
- const char *otherName);
+/*
+ * Extended "category" values that getCharCat() returns in addition to the
+ * regular general categories; they select the last three entries of
+ * charCatNames[].
+ * The replacement lists are parenthesized so that each macro stays a single
+ * operand when it is used inside a larger expression.
+ */
+#define U_NONCHARACTER_CODE_POINT U_CHAR_CATEGORY_COUNT
+#define U_LEAD_SURROGATE (U_CHAR_CATEGORY_COUNT + 1)
+#define U_TRAIL_SURROGATE (U_CHAR_CATEGORY_COUNT + 2)
-static UBool
-enumGroupNames(UCharNames *names, Group *group,
- UChar32 start, UChar32 end,
- UEnumCharNamesFn *fn, void *context,
- UCharNameChoice nameChoice);
+#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
-static UBool
-enumExtNames(UChar32 start, UChar32 end,
- UEnumCharNamesFn *fn, void *context);
+/*
+ * Lowercase names of the extended character categories.
+ * getCharCatName() indexes this table with the value returned by
+ * getCharCat(); getExtName() puts the selected name between '<' and the
+ * hexadecimal code point.
+ * The table has U_CHAR_EXTENDED_CATEGORY_COUNT entries: the last three
+ * ("noncharacter", "lead surrogate", "trail surrogate") belong to
+ * U_NONCHARACTER_CODE_POINT, U_LEAD_SURROGATE and U_TRAIL_SURROGATE.
+ */
+static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = {
+ "unassigned",
+ "uppercase letter",
+ "lowercase letter",
+ "titlecase letter",
+ "modifier letter",
+ "other letter",
+ "non spacing mark",
+ "enclosing mark",
+ "combining spacing mark",
+ "decimal digit number",
+ "letter number",
+ "other number",
+ "space separator",
+ "line separator",
+ "paragraph separator",
+ "control",
+ "format",
+ "private use area",
+ "surrogate",
+ "dash punctuation",
+ "start punctuation",
+ "end punctuation",
+ "connector punctuation",
+ "other punctuation",
+ "math symbol",
+ "currency symbol",
+ "modifier symbol",
+ "other symbol",
+ "initial punctuation",
+ "final punctuation",
+ "noncharacter",
+ "lead surrogate",
+ "trail surrogate"
+};
-static UBool
-enumNames(UCharNames *names,
- UChar32 start, UChar32 limit,
- UEnumCharNamesFn *fn, void *context,
- UCharNameChoice nameChoice);
+/* implementation ----------------------------------------------------------- */
-static uint16_t
-getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice,
- char *buffer, uint16_t bufferLength);
+/*
+ * Cleanup callback for the character names data.
+ * Closes the data memory if one is open, clears the cached names pointer
+ * and resets the cached maximum name length to 0.
+ * Always returns TRUE.
+ */
+static UBool U_CALLCONV unames_cleanup(void)
+{
+    if(uCharNamesData!=NULL) {
+        udata_close(uCharNamesData);
+        uCharNamesData=NULL;
+    }
+
+    /* clearing is unconditional: storing NULL into a NULL pointer changes nothing */
+    uCharNames=NULL;
+    gMaxNameLength=0;
+
+    return TRUE;
+}
-static uint16_t
-writeFactorSuffix(const uint16_t *factors, uint16_t count,
- const char *s, /* suffix elements */
- uint32_t code,
- uint16_t indexes[8], /* output fields from here */
- const char *elementBases[8], const char *elements[8],
- char *buffer, uint16_t bufferLength);
+/*
+ * Data acceptance callback passed to udata_openChoice().
+ * Accepts a data item only if its UDataInfo reports a size of at least 20,
+ * matches this platform's endianness and charset family, carries the
+ * data format "unam" and has major format version 1.
+ * context, type and name are not used.
+ */
+static UBool U_CALLCONV
+isAcceptable(void *context,
+             const char *type, const char *name,
+             const UDataInfo *pInfo) {
+    if(pInfo->size<20) {
+        return FALSE;
+    }
+
+    /* the data must have been built for this kind of platform */
+    if(pInfo->isBigEndian!=U_IS_BIG_ENDIAN ||
+       pInfo->charsetFamily!=U_CHARSET_FAMILY) {
+        return FALSE;
+    }
+
+    /* dataFormat="unam" */
+    if(pInfo->dataFormat[0]!=0x75 ||
+       pInfo->dataFormat[1]!=0x6e ||
+       pInfo->dataFormat[2]!=0x61 ||
+       pInfo->dataFormat[3]!=0x6d) {
+        return FALSE;
+    }
+
+    return (UBool)(pInfo->formatVersion[0]==1);
+}
static UBool
-enumAlgNames(AlgorithmicRange *range,
- UChar32 start, UChar32 limit,
- UEnumCharNamesFn *fn, void *context,
- UCharNameChoice nameChoice);
-
-static UChar32
-findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName);
-
-static uint16_t
-getExtName(uint32_t code, char *buffer, uint16_t bufferLength);
+isDataLoaded(UErrorCode *pErrorCode) {
+ /* load UCharNames from file if necessary */
+ UBool isCached;
-#define U_NONCHARACTER_CODE_POINT U_CHAR_CATEGORY_COUNT
-#define U_LEAD_SURROGATE U_CHAR_CATEGORY_COUNT + 1
-#define U_TRAIL_SURROGATE U_CHAR_CATEGORY_COUNT + 2
+ /* do this because double-checked locking is broken */
+ UMTX_CHECK(NULL, (uCharNames!=NULL), isCached);
-#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
+ if(!isCached) {
+ UCharNames *names;
+ UDataMemory *data;
-static const char * const
-charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT];
+ /* check error code from previous attempt */
+ if(U_FAILURE(gLoadErrorCode)) {
+ *pErrorCode=gLoadErrorCode;
+ return FALSE;
+ }
-static uint8_t
-getCharCat(UChar32 cp);
+ /* open the data outside the mutex block */
+ data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, pErrorCode);
+ if(U_FAILURE(*pErrorCode)) {
+ gLoadErrorCode=*pErrorCode;
+ return FALSE;
+ }
-static const char *
-getCharCatName(UChar32 cp);
-
-/* public API --------------------------------------------------------------- */
+ names=(UCharNames *)udata_getMemory(data);
-U_CAPI int32_t U_EXPORT2
-u_charName(UChar32 code, UCharNameChoice nameChoice,
- char *buffer, int32_t bufferLength,
- UErrorCode *pErrorCode) {
- AlgorithmicRange *algRange;
- uint32_t *p;
- uint32_t i;
- int32_t length;
+ /* in the mutex block, set the data for this process */
+ {
+ umtx_lock(NULL);
+ if(uCharNames==NULL) {
+ uCharNames=names;
+ uCharNamesData=data;
+ data=NULL;
+ names=NULL;
+ ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup);
+ }
+ umtx_unlock(NULL);
+ }
- /* check the argument values */
- if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
- return 0;
- } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT ||
- bufferLength<0 || (bufferLength>0 && buffer==NULL)
- ) {
- *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
- return 0;
+ /* if a different thread set it first, then close the extra data */
+ if(data!=NULL) {
+ udata_close(data); /* NULL if it was set correctly */
+ }
}
+ return TRUE;
+}
- if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
- return u_terminateChars(buffer, bufferLength, 0, pErrorCode);
- }
+/*
+ * Appends the character c to an output buffer with preflighting:
+ * c is stored (and buffer advanced, bufferLength decremented) only while
+ * bufferLength is greater than 0, but bufferPos is incremented in any case,
+ * so bufferPos keeps counting the full output length after the buffer is full.
+ * buffer, bufferLength and bufferPos are modified in place and are
+ * evaluated more than once; the macro expands to a braced block.
+ */
+#define WRITE_CHAR(buffer, bufferLength, bufferPos, c) { \
+ if((bufferLength)>0) { \
+ *(buffer)++=c; \
+ --(bufferLength); \
+ } \
+ ++(bufferPos); \
+}
- length=0;
+#define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
- /* try algorithmic names first */
- p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
- i=*p;
- algRange=(AlgorithmicRange *)(p+1);
- while(i>0) {
- if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) {
- length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
- break;
- }
- algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
- --i;
- }
+/*
+ * Important: expandName() and compareName() are almost the same -
+ * apply fixes to both.
+ *
+ * UnicodeData.txt uses ';' as a field separator, so no
+ * field can contain ';' as part of its contents.
+ * In unames.dat, it is marked as token[';']==-1 only if the
+ * semicolon is used in the data file - which is iff we
+ * have Unicode 1.0 names or ISO comments.
+ * So, it will be token[';']==-1 if we store U1.0 names/ISO comments
+ * although we know that it will never be part of a name.
+ */
+static uint16_t
+expandName(UCharNames *names,
+ const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
+ char *buffer, uint16_t bufferLength) {
+ uint16_t *tokens=(uint16_t *)names+8;
+ uint16_t token, tokenCount=*tokens++, bufferPos=0;
+ uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
+ uint8_t c;
- if(i==0) {
- if (nameChoice == U_EXTENDED_CHAR_NAME) {
- length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength);
- if (!length) {
- /* extended character name */
- length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength);
+ if(nameChoice==U_UNICODE_10_CHAR_NAME || nameChoice==U_ISO_COMMENT) {
+ /*
+ * skip the modern name if it is not requested _and_
+ * if the semicolon byte value is a character, not a token number
+ */
+ if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
+ while(nameLength>0) {
+ --nameLength;
+ if(*name++==';') {
+ break;
+ }
+ }
+ if(nameChoice==U_ISO_COMMENT) {
+ /* skip the Unicode 1.0 name as well to get the ISO comment */
+ while(nameLength>0) {
+ --nameLength;
+ if(*name++==';') {
+ break;
+ }
+ }
}
} else {
- /* normal character name */
- length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
+ /*
+ * the semicolon byte value is a token number, therefore
+ * only modern names are stored in unames.dat and there is no
+ * such requested Unicode 1.0 name here
+ */
+ nameLength=0;
}
}
- return u_terminateChars(buffer, bufferLength, length, pErrorCode);
-}
-
-#define _U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
-
-U_CAPI int32_t U_EXPORT2
-u_getISOComment(UChar32 c,
- char *dest, int32_t destCapacity,
- UErrorCode *pErrorCode) {
- int32_t length;
-
- /* check the argument values */
- if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
- return 0;
- } else if(destCapacity<0 || (destCapacity>0 && dest==NULL)) {
- *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
- return 0;
- }
-
- if((uint32_t)c>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
- return u_terminateChars(dest, destCapacity, 0, pErrorCode);
- }
-
- /* the ISO comment is stored like a normal character name */
- length=getName(uCharNames, (uint32_t)c, _U_ISO_COMMENT, dest, (uint16_t)destCapacity);
- return u_terminateChars(dest, destCapacity, length, pErrorCode);
-}
-
-U_CAPI UChar32 U_EXPORT2
-u_charFromName(UCharNameChoice nameChoice,
- const char *name,
- UErrorCode *pErrorCode) {
- char upper[120], lower[120];
- FindName findName;
- AlgorithmicRange *algRange;
- uint32_t *p;
- uint32_t i;
- UChar32 cp = 0;
- char c0;
- UChar32 error = 0xffff; /* Undefined, but use this for backwards compatibility. */
-
- if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
- return error;
- }
-
- if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==NULL || *name==0) {
- *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
- return error;
- }
-
- if(!isDataLoaded(pErrorCode)) {
- return error;
- }
-
- /* construct the uppercase and lowercase of the name first */
- for(i=0; i<sizeof(upper); ++i) {
- if((c0=*name++)!=0) {
- upper[i]=uprv_toupper(c0);
- lower[i]=uprv_tolower(c0);
- } else {
- upper[i]=lower[i]=0;
- break;
- }
- }
- if(i==sizeof(upper)) {
- /* name too long, there is no such character */
- *pErrorCode = U_ILLEGAL_CHAR_FOUND;
- return error;
- }
-
- /* try extended names first */
- if (lower[0] == '<') {
- if (nameChoice == U_EXTENDED_CHAR_NAME) {
- if (lower[--i] == '>') {
- for (--i; lower[i] && lower[i] != '-'; --i);
-
- if (lower[i] == '-') { /* We've got a category. */
- uint32_t cIdx;
-
- lower[i] = 0;
-
- for (++i; lower[i] != '>'; ++i) {
- if (lower[i] >= '0' && lower[i] <= '9') {
- cp = (cp << 4) + lower[i] - '0';
- } else if (lower[i] >= 'a' && lower[i] <= 'f') {
- cp = (cp << 4) + lower[i] - 'a' + 10;
- } else {
- *pErrorCode = U_ILLEGAL_CHAR_FOUND;
- return error;
- }
- }
-
- /* Now validate the category name.
- We could use a binary search, or a trie, if
- we really wanted to. */
-
- for (lower[i] = 0, cIdx = 0; cIdx < LENGTHOF(charCatNames); ++cIdx) {
-
- if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) {
- if (getCharCat(cp) == cIdx) {
- return cp;
- }
- break;
- }
- }
- }
- }
- }
-
- *pErrorCode = U_ILLEGAL_CHAR_FOUND;
- return error;
- }
-
- /* try algorithmic names now */
- p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
- i=*p;
- algRange=(AlgorithmicRange *)(p+1);
- while(i>0) {
- if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) {
- return cp;
- }
- algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
- --i;
- }
-
- /* normal character name */
- findName.otherName=upper;
- findName.code=error;
- enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice);
- if (findName.code == error) {
- *pErrorCode = U_ILLEGAL_CHAR_FOUND;
- }
- return findName.code;
-}
-
-U_CAPI void U_EXPORT2
-u_enumCharNames(UChar32 start, UChar32 limit,
- UEnumCharNamesFn *fn,
- void *context,
- UCharNameChoice nameChoice,
- UErrorCode *pErrorCode) {
- AlgorithmicRange *algRange;
- uint32_t *p;
- uint32_t i;
-
- if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
- return;
- }
-
- if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==NULL) {
- *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
- return;
- }
-
- if((uint32_t) limit > UCHAR_MAX_VALUE + 1) {
- limit = UCHAR_MAX_VALUE + 1;
- }
- if((uint32_t)start>=(uint32_t)limit) {
- return;
- }
-
- if(!isDataLoaded(pErrorCode)) {
- return;
- }
-
- /* interleave the data-driven ones with the algorithmic ones */
- /* iterate over all algorithmic ranges; assume that they are in ascending order */
- p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
- i=*p;
- algRange=(AlgorithmicRange *)(p+1);
- while(i>0) {
- /* enumerate the character names before the current algorithmic range */
- /* here: start<limit */
- if((uint32_t)start<algRange->start) {
- if((uint32_t)limit<=algRange->start) {
- enumNames(uCharNames, start, limit, fn, context, nameChoice);
- return;
- }
- if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) {
- return;
- }
- start=(UChar32)algRange->start;
- }
- /* enumerate the character names in the current algorithmic range */
- /* here: algRange->start<=start<limit */
- if((uint32_t)start<=algRange->end) {
- if((uint32_t)limit<=(algRange->end+1)) {
- enumAlgNames(algRange, start, limit, fn, context, nameChoice);
- return;
- }
- if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) {
- return;
- }
- start=(UChar32)algRange->end+1;
- }
- /* continue to the next algorithmic range (here: start<limit) */
- algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
- --i;
- }
- /* enumerate the character names after the last algorithmic range */
- enumNames(uCharNames, start, limit, fn, context, nameChoice);
-}
-
-/* implementation ----------------------------------------------------------- */
-
-UBool
-unames_cleanup()
-{
- if(uCharNamesData) {
- udata_close(uCharNamesData);
- uCharNamesData = NULL;
- }
- if(uCharNames) {
- uCharNames = NULL;
- }
- gMaxNameLength=0;
- return TRUE;
-}
-
-static UBool
-isDataLoaded(UErrorCode *pErrorCode) {
- /* load UCharNames from file if necessary */
- UBool isCached;
-
- /* do this because double-checked locking is broken */
- umtx_lock(NULL);
- isCached=uCharNames!=NULL;
- umtx_unlock(NULL);
-
- if(!isCached) {
- UCharNames *names;
- UDataMemory *data;
-
- /* check error code from previous attempt */
- if(U_FAILURE(gLoadErrorCode)) {
- *pErrorCode=gLoadErrorCode;
- return FALSE;
- }
-
- /* open the data outside the mutex block */
- data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, pErrorCode);
- if(U_FAILURE(*pErrorCode)) {
- gLoadErrorCode=*pErrorCode;
- return FALSE;
- }
-
- names=(UCharNames *)udata_getMemory(data);
-
- /* in the mutex block, set the data for this process */
- {
- umtx_lock(NULL);
- if(uCharNames==NULL) {
- uCharNames=names;
- uCharNamesData=data;
- data=NULL;
- names=NULL;
- }
- umtx_unlock(NULL);
- }
-
- /* if a different thread set it first, then close the extra data */
- if(data!=NULL) {
- udata_close(data); /* NULL if it was set correctly */
- }
- }
- return TRUE;
-}
-
-static UBool U_CALLCONV
-isAcceptable(void *context,
- const char *type, const char *name,
- const UDataInfo *pInfo) {
- return (UBool)(
- pInfo->size>=20 &&
- pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
- pInfo->charsetFamily==U_CHARSET_FAMILY &&
- pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */
- pInfo->dataFormat[1]==0x6e &&
- pInfo->dataFormat[2]==0x61 &&
- pInfo->dataFormat[3]==0x6d &&
- pInfo->formatVersion[0]==1);
-}
-
-/*
- * getGroup() does a binary search for the group that contains the
- * Unicode code point "code".
- * The return value is always a valid Group* that may contain "code"
- * or else is the highest group before "code".
- * If the lowest group is after "code", then that one is returned.
- */
-static Group *
-getGroup(UCharNames *names, uint32_t code) {
- uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT),
- start=0,
- limit=*(uint16_t *)((char *)names+names->groupsOffset),
- number;
- Group *groups=(Group *)((char *)names+names->groupsOffset+2);
-
- /* binary search for the group of names that contains the one for code */
- while(start<limit-1) {
- number=(uint16_t)((start+limit)/2);
- if(groupMSB<groups[number].groupMSB) {
- limit=number;
- } else {
- start=number;
- }
- }
-
- /* return this regardless of whether it is an exact match */
- return groups+start;
-}
-
-static uint16_t
-getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice,
- char *buffer, uint16_t bufferLength) {
- Group *group=getGroup(names, code);
- if((uint16_t)(code>>GROUP_SHIFT)==group->groupMSB) {
- return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice,
- buffer, bufferLength);
- } else {
- /* group not found */
- /* zero-terminate */
- if(bufferLength>0) {
- *buffer=0;
- }
- return 0;
- }
-}
-
-/*
- * expandGroupLengths() reads a block of compressed lengths of 32 strings and
- * expands them into offsets and lengths for each string.
- * Lengths are stored with a variable-width encoding in consecutive nibbles:
- * If a nibble<0xc, then it is the length itself (0=empty string).
- * If a nibble>=0xc, then it forms a length value with the following nibble.
- * Calculation see below.
- * The offsets and lengths arrays must be at least 33 (one more) long because
- * there is no check here at the end if the last nibble is still used.
- */
-static const uint8_t *
-expandGroupLengths(const uint8_t *s,
- uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) {
- /* read the lengths of the 32 strings in this group and get each string's offset */
- uint16_t i=0, offset=0, length=0;
- uint8_t lengthByte;
-
- /* all 32 lengths must be read to get the offset of the first group string */
- while(i<LINES_PER_GROUP) {
- lengthByte=*s++;
-
- /* read even nibble - MSBs of lengthByte */
- if(length>=12) {
- /* double-nibble length spread across two bytes */
- length=(uint16_t)(((length&0x3)<<4|lengthByte>>4)+12);
- lengthByte&=0xf;
- } else if((lengthByte /* &0xf0 */)>=0xc0) {
- /* double-nibble length spread across this one byte */
- length=(uint16_t)((lengthByte&0x3f)+12);
- } else {
- /* single-nibble length in MSBs */
- length=(uint16_t)(lengthByte>>4);
- lengthByte&=0xf;
- }
-
- *offsets++=offset;
- *lengths++=length;
-
- offset+=length;
- ++i;
-
- /* read odd nibble - LSBs of lengthByte */
- if((lengthByte&0xf0)==0) {
- /* this nibble was not consumed for a double-nibble length above */
- length=lengthByte;
- if(length<12) {
- /* single-nibble length in LSBs */
- *offsets++=offset;
- *lengths++=length;
-
- offset+=length;
- ++i;
- }
- } else {
- length=0; /* prevent double-nibble detection in the next iteration */
- }
- }
-
- /* now, s is at the first group string */
- return s;
-}
-
-static uint16_t
-expandGroupName(UCharNames *names, Group *group,
- uint16_t lineNumber, UCharNameChoice nameChoice,
- char *buffer, uint16_t bufferLength) {
- uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
- const uint8_t *s=(uint8_t *)names+names->groupStringOffset+
- (group->offsetHigh<<16|group->offsetLow);
- s=expandGroupLengths(s, offsets, lengths);
- return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice,
- buffer, bufferLength);
-}
-
-#define WRITE_CHAR(buffer, bufferLength, bufferPos, c) { \
- if((bufferLength)>0) { \
- *(buffer)++=c; \
- --(bufferLength); \
- } \
- ++(bufferPos); \
-}
-
-/*
- * Important: expandName() and compareName() are almost the same -
- * apply fixes to both.
- *
- * UnicodeData.txt uses ';' as a field separator, so no
- * field can contain ';' as part of its contents.
- * In unames.dat, it is marked as token[';']==-1 only if the
- * semicolon is used in the data file - which is iff we
- * have Unicode 1.0 names or ISO comments.
- * So, it will be token[';']==-1 if we store U1.0 names/ISO comments
- * although we know that it will never be part of a name.
- */
-static uint16_t
-expandName(UCharNames *names,
- const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
- char *buffer, uint16_t bufferLength) {
- uint16_t *tokens=(uint16_t *)names+8;
- uint16_t token, tokenCount=*tokens++, bufferPos=0;
- uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
- uint8_t c;
-
- if(nameChoice==U_UNICODE_10_CHAR_NAME || nameChoice==_U_ISO_COMMENT) {
- /*
- * skip the modern name if it is not requested _and_
- * if the semicolon byte value is a character, not a token number
- */
- if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
- while(nameLength>0) {
- --nameLength;
- if(*name++==';') {
- break;
- }
- }
- if(nameChoice==_U_ISO_COMMENT) {
- /* skip the Unicode 1.0 name as well to get the ISO comment */
- while(nameLength>0) {
- --nameLength;
- if(*name++==';') {
- break;
- }
- }
- }
- } else {
- /*
- * the semicolon byte value is a token number, therefore
- * only modern names are stored in unames.dat and there is no
- * such requested Unicode 1.0 name here
- */
- nameLength=0;
- }
- }
-
- /* write each letter directly, and write a token word per token */
- while(nameLength>0) {
- --nameLength;
- c=*name++;
+ /* write each letter directly, and write a token word per token */
+ while(nameLength>0) {
+ --nameLength;
+ c=*name++;
if(c>=tokenCount) {
if(c!=';') {
}
}
}
-
- /* complete match? */
- return (UBool)(*otherName==0);
+
+ /* complete match? */
+ return (UBool)(*otherName==0);
+}
+
+/*
+ * Returns the extended category of cp:
+ * U_NONCHARACTER_CODE_POINT for noncharacters, U_LEAD_SURROGATE or
+ * U_TRAIL_SURROGATE for surrogates, otherwise the value of u_charType(cp).
+ */
+static uint8_t getCharCat(UChar32 cp) {
+    uint8_t cat;
+
+    /* noncharacters are checked before the general category is looked up */
+    if (UTF_IS_UNICODE_NONCHAR(cp)) {
+        return U_NONCHARACTER_CODE_POINT;
+    }
+
+    cat = u_charType(cp);
+    if (cat != U_SURROGATE) {
+        return cat;
+    }
+
+    /* split the single surrogate category into lead and trail */
+    if (UTF_IS_LEAD(cp)) {
+        return U_LEAD_SURROGATE;
+    }
+    return U_TRAIL_SURROGATE;
+}
+
+/*
+ * Returns the name of the extended category of cp from charCatNames[],
+ * or "unknown" if the category value lies outside of that table
+ * (that is, if the table is not up to date).
+ */
+static const char *getCharCatName(UChar32 cp) {
+    uint8_t cat = getCharCat(cp);
+
+    return cat < LENGTHOF(charCatNames) ? charCatNames[cat] : "unknown";
+}
+
+/*
+ * Writes the extended name of a code point, "<category name-HHHH>", where
+ * the category name comes from getCharCatName() and HHHH is the code point
+ * in uppercase hexadecimal with at least 4 digits.
+ *
+ * code         the code point
+ * buffer       output buffer; not zero-terminated by this function
+ * bufferLength capacity of buffer
+ *
+ * Returns the full length of the extended name, even when it did not fit.
+ *
+ * Every character, including the hex digits, goes through WRITE_CHAR so
+ * that nothing is stored beyond bufferLength: a buffer that is too small
+ * receives a prefix of the name. The digits are written most significant
+ * first for that reason.
+ */
+static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) {
+    const char *catname = getCharCatName(code);
+    uint16_t length = 0;
+
+    uint32_t rest;
+    int ndigits, i;
+
+    WRITE_CHAR(buffer, bufferLength, length, '<');
+    /* length is 1 after the '<', so length-1 is the index into catname */
+    while (catname[length - 1]) {
+        WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]);
+    }
+    WRITE_CHAR(buffer, bufferLength, length, '-');
+
+    /* count the significant hex digits, and use at least 4 */
+    for (rest = code, ndigits = 0; rest != 0; ++ndigits, rest >>= 4) {
+    }
+    if (ndigits < 4) {
+        ndigits = 4;
+    }
+
+    /* write the digits from the most significant one down */
+    for (i = ndigits; i > 0;) {
+        uint8_t v;
+
+        --i;
+        v = (uint8_t)((code >> (4 * i)) & 0xf);
+        WRITE_CHAR(buffer, bufferLength, length, (char)(v < 10 ? '0' + v : 'A' + v - 10));
+    }
+    WRITE_CHAR(buffer, bufferLength, length, '>');
+
+    return length;
+}
+
+/*
+ * getGroup() finds, by binary search, the group that may contain the
+ * Unicode code point "code".
+ * The result is always a pointer into the groups table: either the group
+ * whose groupMSB matches code>>GROUP_SHIFT, or else the highest group
+ * before "code"; if even the lowest group is after "code", then that
+ * lowest group is returned.
+ * The caller must compare groupMSB to find out whether it is a real match.
+ */
+static Group *
+getGroup(UCharNames *names, uint32_t code) {
+    /* the table starts with a 16-bit group count, followed by the groups */
+    char *table=(char *)names+names->groupsOffset;
+    Group *groups=(Group *)(table+2);
+    uint16_t wantedMSB=(uint16_t)(code>>GROUP_SHIFT);
+    uint16_t low=0, high=*(uint16_t *)table, mid;
+
+    /* narrow [low, high[ down to a single candidate */
+    while(low<high-1) {
+        mid=(uint16_t)((low+high)/2);
+        if(groups[mid].groupMSB>wantedMSB) {
+            high=mid;
+        } else {
+            low=mid;
+        }
+    }
+
+    /* exact match or not, this is the answer */
+    return groups+low;
+}
+
+/*
+ * expandGroupLengths() reads a block of compressed lengths of 32 strings and
+ * expands them into offsets and lengths for each string.
+ * Lengths are stored with a variable-width encoding in consecutive nibbles:
+ * If a nibble<0xc, then it is the length itself (0=empty string).
+ * If a nibble>=0xc, then it forms a length value with the following nibble.
+ * Calculation see below.
+ * The offsets and lengths arrays must be at least 33 (one more) long because
+ * there is no check here at the end if the last nibble is still used.
+ *
+ * s       points to the first byte of the compressed lengths
+ * offsets receives, per string, the sum of the lengths of the strings before it
+ * lengths receives the length of each string
+ *
+ * Returns the pointer just behind the consumed length bytes, which is where
+ * the first string of the group starts.
+ */
+static const uint8_t *
+expandGroupLengths(const uint8_t *s,
+ uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) {
+ /* read the lengths of the 32 strings in this group and get each string's offset */
+ uint16_t i=0, offset=0, length=0;
+ uint8_t lengthByte;
+
+ /* all 32 lengths must be read to get the offset of the first group string */
+ while(i<LINES_PER_GROUP) {
+ lengthByte=*s++;
+
+ /* read even nibble - MSBs of lengthByte */
+ if(length>=12) {
+ /* double-nibble length spread across two bytes */
+ /* length still holds the low nibble (>=12) of the previous byte */
+ length=(uint16_t)(((length&0x3)<<4|lengthByte>>4)+12);
+ lengthByte&=0xf;
+ } else if((lengthByte /* &0xf0 */)>=0xc0) {
+ /* double-nibble length spread across this one byte */
+ /* lengthByte keeps its high nibble, which marks the low nibble as consumed below */
+ length=(uint16_t)((lengthByte&0x3f)+12);
+ } else {
+ /* single-nibble length in MSBs */
+ length=(uint16_t)(lengthByte>>4);
+ lengthByte&=0xf;
+ }
+
+ *offsets++=offset;
+ *lengths++=length;
+
+ offset+=length;
+ ++i;
+
+ /* read odd nibble - LSBs of lengthByte */
+ if((lengthByte&0xf0)==0) {
+ /* this nibble was not consumed for a double-nibble length above */
+ length=lengthByte;
+ if(length<12) {
+ /* single-nibble length in LSBs */
+ *offsets++=offset;
+ *lengths++=length;
+
+ offset+=length;
+ ++i;
+ }
+ /* otherwise length>=12 is carried into the next iteration as the first half of a double-nibble length */
+ } else {
+ length=0; /* prevent double-nibble detection in the next iteration */
+ }
+ }
+
+ /* now, s is at the first group string */
+ return s;
+}
+
+/*
+ * Expands the name stored on line "lineNumber" of a group into buffer.
+ * Locates the group's data from groupStringOffset plus the group's
+ * 32-bit offset (offsetHigh/offsetLow), expands the line lengths with
+ * expandGroupLengths() and passes the selected line to expandName().
+ * Returns what expandName() returns.
+ */
+static uint16_t
+expandGroupName(UCharNames *names, Group *group,
+                uint16_t lineNumber, UCharNameChoice nameChoice,
+                char *buffer, uint16_t bufferLength) {
+    uint16_t lineOffsets[LINES_PER_GROUP+2], lineLengths[LINES_PER_GROUP+2];
+    const uint8_t *groupData, *firstLine;
+
+    /* start of this group's compressed lengths */
+    groupData=(uint8_t *)names+names->groupStringOffset+
+              (group->offsetHigh<<16|group->offsetLow);
+
+    /* the strings follow the lengths */
+    firstLine=expandGroupLengths(groupData, lineOffsets, lineLengths);
+
+    return expandName(names,
+                      firstLine+lineOffsets[lineNumber], lineLengths[lineNumber],
+                      nameChoice, buffer, bufferLength);
+}
+
+/*
+ * Writes the stored name of a code point for the given name choice.
+ * Looks up the group for "code" and expands the name from it; if no group
+ * matches, the result is an empty string (zero-terminated if the buffer
+ * has room) and the return value is 0.
+ */
+static uint16_t
+getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice,
+        char *buffer, uint16_t bufferLength) {
+    Group *group=getGroup(names, code);
+
+    if((uint16_t)(code>>GROUP_SHIFT)!=group->groupMSB) {
+        /* group not found: zero-terminate and report an empty name */
+        if(bufferLength>0) {
+            *buffer=0;
+        }
+        return 0;
+    }
+
+    return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice,
+                           buffer, bufferLength);
}
/*
return TRUE;
}
+/*
+ * Writes the factorized suffix of an algorithmic name.
+ * "code" is decomposed into one index per factor (the last factor varies
+ * fastest); for each factor, s contains factors[i] consecutive
+ * NUL-terminated element strings, and the string selected by indexes[i]
+ * is appended to buffer.
+ *
+ * factors, count  the factors and their number
+ * s               the element strings of all factors, one factor after another
+ * code            the value to decompose
+ * indexes         output: the index calculated for each factor
+ * elementBases    output, may be NULL: start of each factor's element strings
+ * elements        output, may be NULL: the selected element string per factor
+ * buffer, bufferLength  output buffer and its capacity
+ *
+ * Returns the full length of the suffix, even when it did not fit;
+ * the output is zero-terminated if there is room left for the terminator.
+ */
+static uint16_t
+writeFactorSuffix(const uint16_t *factors, uint16_t count,
+ const char *s, /* suffix elements */
+ uint32_t code,
+ uint16_t indexes[8], /* output fields from here */
+ const char *elementBases[8], const char *elements[8],
+ char *buffer, uint16_t bufferLength) {
+ uint16_t i, factor, bufferPos=0;
+ char c;
+
+ /* write elements according to the factors */
+
+ /*
+ * the factorized elements are determined by modulo arithmetic
+ * with the factors of this algorithm
+ *
+ * note that for fewer operations, count is decremented here
+ */
+ --count;
+ for(i=count; i>0; --i) {
+ factor=factors[i];
+ indexes[i]=(uint16_t)(code%factor);
+ code/=factor;
+ }
+ /*
+ * we don't need to calculate the last modulus because start<=code<=end
+ * guarantees here that code<=factors[0]
+ */
+ indexes[0]=(uint16_t)code;
+
+ /* write each element */
+ /* i is 0 here, left over from the loop above; count is the last index */
+ for(;;) {
+ if(elementBases!=NULL) {
+ *elementBases++=s;
+ }
+
+ /* skip indexes[i] strings */
+ factor=indexes[i];
+ while(factor>0) {
+ while(*s++!=0) {}
+ --factor;
+ }
+ if(elements!=NULL) {
+ *elements++=s;
+ }
+
+ /* write element */
+ while((c=*s++)!=0) {
+ WRITE_CHAR(buffer, bufferLength, bufferPos, c);
+ }
+
+ /* we do not need to perform the rest of this loop for i==count - break here */
+ if(i>=count) {
+ break;
+ }
+
+ /* skip the rest of the strings for this factors[i] */
+ factor=(uint16_t)(factors[i]-indexes[i]-1);
+ while(factor>0) {
+ while(*s++!=0) {}
+ --factor;
+ }
+
+ ++i;
+ }
+
+ /* zero-terminate */
+ if(bufferLength>0) {
+ *buffer=0;
+ }
+
+ return bufferPos;
+}
+
/*
* Important:
* Parts of findAlgName() are almost the same as some of getAlgName().
return bufferPos;
}
-static uint16_t
-writeFactorSuffix(const uint16_t *factors, uint16_t count,
- const char *s, /* suffix elements */
- uint32_t code,
- uint16_t indexes[8], /* output fields from here */
- const char *elementBases[8], const char *elements[8],
- char *buffer, uint16_t bufferLength) {
- uint16_t i, factor, bufferPos=0;
- char c;
-
- /* write elements according to the factors */
-
- /*
- * the factorized elements are determined by modulo arithmetic
- * with the factors of this algorithm
- *
- * note that for fewer operations, count is decremented here
- */
- --count;
- for(i=count; i>0; --i) {
- factor=factors[i];
- indexes[i]=(uint16_t)(code%factor);
- code/=factor;
- }
- /*
- * we don't need to calculate the last modulus because start<=code<=end
- * guarantees here that code<=factors[0]
- */
- indexes[0]=(uint16_t)code;
-
- /* write each element */
- for(;;) {
- if(elementBases!=NULL) {
- *elementBases++=s;
- }
-
- /* skip indexes[i] strings */
- factor=indexes[i];
- while(factor>0) {
- while(*s++!=0) {}
- --factor;
- }
- if(elements!=NULL) {
- *elements++=s;
- }
-
- /* write element */
- while((c=*s++)!=0) {
- WRITE_CHAR(buffer, bufferLength, bufferPos, c);
- }
-
- /* we do not need to perform the rest of this loop for i==count - break here */
- if(i>=count) {
- break;
- }
-
- /* skip the rest of the strings for this factors[i] */
- factor=(uint16_t)(factors[i]-indexes[i]-1);
- while(factor>0) {
- while(*s++!=0) {}
- --factor;
- }
-
- ++i;
- }
-
- /* zero-terminate */
- if(bufferLength>0) {
- *buffer=0;
- }
-
- return bufferPos;
-}
-
/*
* Important: enumAlgNames() and findAlgName() are almost the same.
* Any fix must be applied to both.
i=count;
for (;;) {
index=(uint16_t)(indexes[--i]+1);
- if(index<factors[i]) {
- /* skip one index and its element string */
- indexes[i]=index;
- s=elements[i];
- while(*s++!=0) {}
- elements[i]=s;
- break;
- } else {
- /* reset this index to 0 and its element string to the first one */
- indexes[i]=0;
- elements[i]=elementBases[i];
- }
- }
-
- /* to make matters a little easier, just compare all elements of the suffix */
- t=otherName;
- for(i=0; i<count; ++i) {
- s=elements[i];
- while((c=*s++)!=0) {
- if(c!=*t++) {
- s=""; /* does not match */
- i=99;
- }
- }
- }
- if(i<99 && *t==0) {
- return start;
- }
- }
- break;
- }
- default:
- /* undefined type */
- break;
- }
-
- return 0xffff;
-}
-
-static uint8_t getCharCat(UChar32 cp) {
- uint8_t cat;
-
- if (UTF_IS_UNICODE_NONCHAR(cp)) {
- return U_NONCHARACTER_CODE_POINT;
- }
-
- if ((cat = u_charType(cp)) == U_SURROGATE) {
- cat = UTF_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE;
- }
-
- return cat;
-}
-
-static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = {
- "unassigned",
- "uppercase letter",
- "lowercase letter",
- "titlecase letter",
- "modifier letter",
- "other letter",
- "non spacing mark",
- "enclosing mark",
- "combining spacing mark",
- "decimal digit number",
- "letter number",
- "other number",
- "space separator",
- "line separator",
- "paragraph separator",
- "control",
- "format",
- "private use area",
- "surrogate",
- "dash punctuation",
- "start punctuation",
- "end punctuation",
- "connector punctuation",
- "other punctuation",
- "math symbol",
- "currency symbol",
- "modifier symbol",
- "other symbol",
- "initial punctuation",
- "final punctuation",
- "noncharacter",
- "lead surrogate",
- "trail surrogate"
-};
-
-static const char *getCharCatName(UChar32 cp) {
- uint8_t cat = getCharCat(cp);
-
- /* Return unknown if the table of names above is not up to
- date. */
-
- if (cat >= LENGTHOF(charCatNames)) {
- return "unknown";
- } else {
- return charCatNames[cat];
- }
-}
-
-static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) {
- const char *catname = getCharCatName(code);
- uint16_t length = 0;
+ if(index<factors[i]) {
+ /* skip one index and its element string */
+ indexes[i]=index;
+ s=elements[i];
+ while(*s++!=0) {}
+ elements[i]=s;
+ break;
+ } else {
+ /* reset this index to 0 and its element string to the first one */
+ indexes[i]=0;
+ elements[i]=elementBases[i];
+ }
+ }
- UChar32 cp;
- int ndigits, i;
-
- WRITE_CHAR(buffer, bufferLength, length, '<');
- while (catname[length - 1]) {
- WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]);
+ /* to make matters a little easier, just compare all elements of the suffix */
+ t=otherName;
+ for(i=0; i<count; ++i) {
+ s=elements[i];
+ while((c=*s++)!=0) {
+ if(c!=*t++) {
+ s=""; /* does not match */
+ i=99;
+ }
+ }
+ }
+ if(i<99 && *t==0) {
+ return start;
+ }
+ }
+ break;
}
- WRITE_CHAR(buffer, bufferLength, length, '-');
- for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4)
- ;
- if (ndigits < 4)
- ndigits = 4;
- for (cp = code, i = ndigits; (cp || i > 0) && bufferLength; cp >>= 4, bufferLength--) {
- uint8_t v = (uint8_t)(cp & 0xf);
- buffer[--i] = (v < 10 ? '0' + v : 'A' + v - 10);
+ default:
+ /* undefined type */
+ break;
}
- buffer += ndigits;
- length += ndigits;
- WRITE_CHAR(buffer, bufferLength, length, '>');
- return length;
+ return 0xffff;
}
/* sets of name characters, maximum name lengths ---------------------------- */
Group *group;
const uint8_t *s, *line, *lineLimit;
- int32_t maxISOCommentLength=0;
int32_t groupCount, lineNumber, length;
tokenLengths=(int8_t *)uprv_malloc(tokenCount);
lineLimit=line+length;
- /* read regular name */
- length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
- if(length>maxNameLength) {
- maxNameLength=length;
- }
- if(line==lineLimit) {
- continue;
- }
+ /* read regular name */
+ length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
+ if(length>maxNameLength) {
+ maxNameLength=length;
+ }
+ if(line==lineLimit) {
+ continue;
+ }
+
+ /* read Unicode 1.0 name */
+ length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
+ if(length>maxNameLength) {
+ maxNameLength=length;
+ }
+ if(line==lineLimit) {
+ continue;
+ }
+
+ /* read ISO comment */
+ /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/
+ }
+
+ ++group;
+ --groupCount;
+ }
+
+ if(tokenLengths!=NULL) {
+ uprv_free(tokenLengths);
+ }
+
+ /* set gMax... - name length last for threading */
+ gMaxNameLength=maxNameLength;
+}
+/*
+ * Calculates gNameSet and gMaxNameLength the first time it is called.
+ * Returns TRUE immediately if gMaxNameLength is already non-zero,
+ * FALSE if isDataLoaded() fails (pErrorCode is passed through to it),
+ * and TRUE once the algorithmic, extended and group names were processed.
+ */
+static UBool
+calcNameSetsLengths(UErrorCode *pErrorCode) {
+ static const char extChars[]="0123456789ABCDEF<>-";
+ int32_t i, maxNameLength;
+
+ if(gMaxNameLength!=0) { /* a non-zero maximum is treated as "already calculated" */
+ return TRUE;
+ }
+
+ if(!isDataLoaded(pErrorCode)) {
+ return FALSE;
+ }
+
+ /* set hex digits, used in various names, and <>-, used in extended names */
+ for(i=0; i<sizeof(extChars)-1; ++i) { /* the -1 leaves out the terminating NUL of extChars */
+ SET_ADD(gNameSet, extChars[i]);
+ }
+
+ /* set sets and lengths from algorithmic names; 0 is the starting maximum */
+ maxNameLength=calcAlgNameSetsLengths(0);
+
+ /* set sets and lengths from extended names, carrying the maximum forward */
+ maxNameLength=calcExtNameSetsLengths(maxNameLength);
+
+ /* set sets and lengths from group names, set global maximum values */
+ calcGroupNameSetsLengths(maxNameLength);
+
+ return TRUE;
+}
+
+/* public API --------------------------------------------------------------- */
+/*
+ * Writes the name of the code point "code", for the given nameChoice,
+ * into buffer and returns the value of u_terminateChars() for it.
+ * Returns 0 when pErrorCode is NULL or already a failure, and sets
+ * U_ILLEGAL_ARGUMENT_ERROR (returning 0) for an out-of-range nameChoice,
+ * a negative bufferLength, or a NULL buffer with a positive bufferLength.
+ * A code above UCHAR_MAX_VALUE, or a failed isDataLoaded(), yields an
+ * empty terminated result.
+ * NOTE(review): bufferLength is cast to uint16_t for the name helpers, so
+ * a capacity above 0xffff is truncated there - confirm this is intended.
+ */
+U_CAPI int32_t U_EXPORT2
+u_charName(UChar32 code, UCharNameChoice nameChoice,
+ char *buffer, int32_t bufferLength,
+ UErrorCode *pErrorCode) {
+ AlgorithmicRange *algRange;
+ uint32_t *p;
+ uint32_t i;
+ int32_t length;
+
+ /* check the argument values */
+ if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+ return 0;
+ } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT ||
+ bufferLength<0 || (bufferLength>0 && buffer==NULL)
+ ) {
+ *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+ return 0;
+ }
+
+ if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
+ return u_terminateChars(buffer, bufferLength, 0, pErrorCode);
+ }
+
+ length=0;
+
+ /* try algorithmic names first: a 32-bit range count at algNamesOffset is followed by the ranges */
+ p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
+ i=*p;
+ algRange=(AlgorithmicRange *)(p+1);
+ while(i>0) {
+ if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) {
+ length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
+ break;
+ }
+ algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size); /* ranges have variable size; step by the stored size */
+ --i;
+ }
+
+ if(i==0) { /* i reaches 0 only when no range contained code; a match leaves the loop with i>0 */
+ if (nameChoice == U_EXTENDED_CHAR_NAME) {
+ length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength);
+ if (!length) {
+ /* extended character name: used only when getName() returned length 0 */
+ length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength);
+ }
+ } else {
+ /* normal character name */
+ length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
+ }
+ }
+
+ return u_terminateChars(buffer, bufferLength, length, pErrorCode);
+}
+/*
+ * Writes the ISO comment of code point c into dest and returns the value
+ * of u_terminateChars() for it.
+ * Returns 0 when pErrorCode is NULL or already a failure, and sets
+ * U_ILLEGAL_ARGUMENT_ERROR (returning 0) for a negative destCapacity or a
+ * NULL dest with a positive destCapacity.
+ * A c above UCHAR_MAX_VALUE, or a failed isDataLoaded(), yields an empty
+ * terminated result.
+ */
+U_CAPI int32_t U_EXPORT2
+u_getISOComment(UChar32 c,
+ char *dest, int32_t destCapacity,
+ UErrorCode *pErrorCode) {
+ int32_t length;
+
+ /* check the argument values */
+ if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+ return 0;
+ } else if(destCapacity<0 || (destCapacity>0 && dest==NULL)) {
+ *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+ return 0;
+ }
+
+ if((uint32_t)c>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
+ return u_terminateChars(dest, destCapacity, 0, pErrorCode);
+ }
+
+ /* the ISO comment is stored like a normal character name, so getName() is asked for U_ISO_COMMENT */
+ length=getName(uCharNames, (uint32_t)c, U_ISO_COMMENT, dest, (uint16_t)destCapacity);
+ return u_terminateChars(dest, destCapacity, length, pErrorCode);
+}
+/*
+ * Looks up the code point whose name, for the given nameChoice, is "name".
+ * The name is copied into an uppercase and a lowercase buffer of 120 chars.
+ * A name starting with '<' is parsed as an extended name of the form
+ * <category-HEX> (accepted only for U_EXTENDED_CHAR_NAME); otherwise the
+ * algorithmic ranges are searched and then all remaining names via
+ * enumNames() with DO_FIND_NAME.
+ * Returns the code point, or 0xffff on failure. U_ILLEGAL_ARGUMENT_ERROR
+ * is set for a bad nameChoice or a NULL/empty name, U_ILLEGAL_CHAR_FOUND
+ * when the name is too long or no character has it.
+ */
+U_CAPI UChar32 U_EXPORT2
+u_charFromName(UCharNameChoice nameChoice,
+ const char *name,
+ UErrorCode *pErrorCode) {
+ char upper[120], lower[120];
+ FindName findName;
+ AlgorithmicRange *algRange;
+ uint32_t *p;
+ uint32_t i;
+ UChar32 cp = 0;
+ char c0;
+ UChar32 error = 0xffff; /* Undefined, but use this for backwards compatibility. */
+
+ if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+ return error;
+ }
+
+ if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==NULL || *name==0) {
+ *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+ return error;
+ }
+
+ if(!isDataLoaded(pErrorCode)) {
+ return error;
+ }
+
+ /* construct the uppercase and lowercase of the name first; on exit i is the name length */
+ for(i=0; i<sizeof(upper); ++i) {
+ if((c0=*name++)!=0) {
+ upper[i]=uprv_toupper(c0);
+ lower[i]=uprv_tolower(c0);
+ } else {
+ upper[i]=lower[i]=0;
+ break;
+ }
+ }
+ if(i==sizeof(upper)) {
+ /* name too long (no NUL within the buffer size), there is no such character */
+ *pErrorCode = U_ILLEGAL_CHAR_FOUND;
+ return error;
+ }
+
+ /* try extended names first; this branch always returns, either a match or an error */
+ if (lower[0] == '<') {
+ if (nameChoice == U_EXTENDED_CHAR_NAME) {
+ if (lower[--i] == '>') { /* i was the name length, so --i indexes the last character */
+ for (--i; lower[i] && lower[i] != '-'; --i) { /* scan backwards for the last '-'
+ * NOTE(review): i is unsigned and nothing stops this scan at index 0;
+ * for a name without '-' such as "<>" the index would wrap around
+ * and read outside lower[] - confirm and add a lower bound. */
+ }
+
+ if (lower[i] == '-') { /* We've got a category. */
+ uint32_t cIdx;
+
+ lower[i] = 0; /* terminate the category name that starts at lower+1 */
+
+ for (++i; lower[i] != '>'; ++i) { /* parse the hex digits between '-' and '>' into cp */
+ if (lower[i] >= '0' && lower[i] <= '9') {
+ cp = (cp << 4) + lower[i] - '0'; /* NOTE(review): the number of digits is not limited here - confirm overflow cannot matter */
+ } else if (lower[i] >= 'a' && lower[i] <= 'f') {
+ cp = (cp << 4) + lower[i] - 'a' + 10;
+ } else {
+ *pErrorCode = U_ILLEGAL_CHAR_FOUND;
+ return error;
+ }
+ }
+
+ /* Now validate the category name.
+ We could use a binary search, or a trie, if
+ we really wanted to.
+ The name is accepted only if it is in charCatNames and its index
+ equals getCharCat(cp). */
- /* read Unicode 1.0 name */
- length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
- if(length>maxNameLength) {
- maxNameLength=length;
- }
- if(line==lineLimit) {
- continue;
- }
+ for (lower[i] = 0, cIdx = 0; cIdx < LENGTHOF(charCatNames); ++cIdx) {
- /* read ISO comment */
- length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);
- if(length>maxISOCommentLength) {
- maxISOCommentLength=length;
+ if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) {
+ if (getCharCat(cp) == cIdx) {
+ return cp;
+ }
+ break;
+ }
+ }
+ }
}
}
- ++group;
- --groupCount;
+ *pErrorCode = U_ILLEGAL_CHAR_FOUND;
+ return error;
}
- if(tokenLengths!=NULL) {
- uprv_free(tokenLengths);
+ /* try algorithmic names now; findAlgName() returns 0xffff for "not found" */
+ p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
+ i=*p;
+ algRange=(AlgorithmicRange *)(p+1);
+ while(i>0) {
+ if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) {
+ return cp;
+ }
+ algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
+ --i;
}
- /* set gMax... - name length last for threading */
- gMaxISOCommentLength=maxISOCommentLength;
- gMaxNameLength=maxNameLength;
+ /* normal character name: findName.code stays at the error value unless the enumeration changes it */
+ findName.otherName=upper;
+ findName.code=error;
+ enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice);
+ if (findName.code == error) {
+ *pErrorCode = U_ILLEGAL_CHAR_FOUND;
+ }
+ return findName.code;
}
-static UBool
-calcNameSetsLengths(UErrorCode *pErrorCode) {
- static const char extChars[]="0123456789ABCDEF<>-";
- int32_t i, maxNameLength;
+U_CAPI void U_EXPORT2
+u_enumCharNames(UChar32 start, UChar32 limit,
+ UEnumCharNamesFn *fn,
+ void *context,
+ UCharNameChoice nameChoice,
+ UErrorCode *pErrorCode) { /* passes fn and context to enumNames()/enumAlgNames() for the code points from start up to, not including, limit */
+ AlgorithmicRange *algRange; /* current algorithmic range while walking the range table */
+ uint32_t *p; /* points at the 32-bit range count that precedes the ranges */
+ uint32_t i; /* number of algorithmic ranges still to visit */
- if(gMaxNameLength!=0) {
- return TRUE;
+ if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+ return;
}
- if(!isDataLoaded(pErrorCode)) {
- return FALSE;
+ if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==NULL) {
+ *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+ return;
}
- /* set hex digits, used in various names, and <>-, used in extended names */
- for(i=0; i<sizeof(extChars)-1; ++i) {
- SET_ADD(gNameSet, extChars[i]);
+ if((uint32_t) limit > UCHAR_MAX_VALUE + 1) { /* clamp limit; the unsigned cast also catches negative values */
+ limit = UCHAR_MAX_VALUE + 1;
+ }
+ if((uint32_t)start>=(uint32_t)limit) { /* empty range: nothing to enumerate */
+ return;
}
- /* set sets and lengths from algorithmic names */
- maxNameLength=calcAlgNameSetsLengths(0);
-
- /* set sets and lengths from extended names */
- maxNameLength=calcExtNameSetsLengths(maxNameLength);
-
- /* set sets and lengths from group names, set global maximum values */
- calcGroupNameSetsLengths(maxNameLength);
+ if(!isDataLoaded(pErrorCode)) {
+ return;
+ }
- return TRUE;
+ /* interleave the data-driven ones with the algorithmic ones: */
+ /* iterate over all algorithmic ranges; assume that they are in ascending order */
+ p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
+ i=*p;
+ algRange=(AlgorithmicRange *)(p+1);
+ while(i>0) {
+ /* enumerate the character names before the current algorithmic range */
+ /* here: start<limit */
+ if((uint32_t)start<algRange->start) {
+ if((uint32_t)limit<=algRange->start) { /* the whole remaining request lies before this range */
+ enumNames(uCharNames, start, limit, fn, context, nameChoice);
+ return;
+ }
+ if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) { /* a FALSE result ends the whole enumeration */
+ return;
+ }
+ start=(UChar32)algRange->start;
+ }
+ /* enumerate the character names in the current algorithmic range */
+ /* here: algRange->start<=start<limit */
+ if((uint32_t)start<=algRange->end) {
+ if((uint32_t)limit<=(algRange->end+1)) { /* the request ends inside this range */
+ enumAlgNames(algRange, start, limit, fn, context, nameChoice);
+ return;
+ }
+ if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) { /* a FALSE result ends the whole enumeration */
+ return;
+ }
+ start=(UChar32)algRange->end+1;
+ }
+ /* continue to the next algorithmic range (here: start<limit) */
+ algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
+ --i;
+ }
+ /* enumerate the character names after the last algorithmic range */
+ enumNames(uCharNames, start, limit, fn, context, nameChoice);
}
U_CAPI int32_t U_EXPORT2
}
}
-#if 0
-/*
-Currently not used but left for future use. Probably by UnicodeSet.
-urename.h and uprops.h changed accordingly.
-*/
-U_CAPI int32_t U_EXPORT2
-uprv_getMaxISOCommentLength() {
- UErrorCode errorCode=U_ZERO_ERROR;
- if(calcNameSetsLengths(&errorCode)) {
- return gMaxISOCommentLength;
- } else {
- return 0;
- }
-}
-#endif
-
/**
* Converts the chars in the char set cset to UChars and adds them through sa.
* @param cset Set of 256 bit flags corresponding to a set of chars.
* @param sa USetAdder whose add() function receives the characters.
*/
static void
-charSetToUSet(uint32_t cset[8], USet* uset) {
+charSetToUSet(uint32_t cset[8], const USetAdder *sa) {
UChar us[256];
char cs[256];
UErrorCode errorCode;
errorCode=U_ZERO_ERROR;
- uset_clear(uset);
if(!calcNameSetsLengths(&errorCode)) {
return;
/* add each UChar to the USet */
for(i=0; i<length; ++i) {
if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (UChar)0 */
- uset_add(uset, us[i]);
+ sa->add(sa->set, us[i]);
}
}
}
/**
* Fills set with characters that are used in Unicode character names.
- * @param set USet to receive characters. Existing contents are deleted.
+ * @param sa USetAdder that receives the characters (passed on to charSetToUSet with gNameSet).
*/
U_CAPI void U_EXPORT2
-uprv_getCharNameCharacters(USet* set) {
- charSetToUSet(gNameSet, set);
+uprv_getCharNameCharacters(const USetAdder *sa) {
+ charSetToUSet(gNameSet, sa);
}
-#if 0
-/*
-Currently not used but left for future use. Probably by UnicodeSet.
-urename.h and uprops.h changed accordingly.
-*/
-/**
- * Fills set with characters that are used in Unicode character names.
- * @param set USet to receive characters. Existing contents are deleted.
+/* data swapping ------------------------------------------------------------ */
+
+/*
+ * The token table contains non-negative entries for token bytes,
+ * and -1 for bytes that represent themselves in the data file's charset.
+ * -2 entries are used for lead bytes.
+ *
+ * Direct bytes (-1 entries) must be translated from the input charset family
+ * to the output charset family.
+ * makeTokenMap() writes a permutation mapping for this.
+ * Use it once for single-/lead-byte tokens and once more for all trail byte
+ * tokens. (';' is an unused trail byte marked with -1.)
+ *
+ * ds: supplies inCharset, outCharset and swapInvChars().
+ * tokens: token table entries; only the first min(tokenCount, 256) are read.
+ * tokenCount: number of entries in tokens[]; values above 256 are capped.
+ * map: receives the mapping from input byte value to output byte value.
+ * pErrorCode: nothing is done if it already indicates a failure; it is
+ * left as set by swapInvChars() when a direct byte cannot be converted.
 */
-U_CAPI void U_EXPORT2
-uprv_getISOCommentCharacters(USet* set) {
- charSetToUSet(gISOCommentSet, set);
+static void
+makeTokenMap(const UDataSwapper *ds,
+ int16_t tokens[], uint16_t tokenCount,
+ uint8_t map[256],
+ UErrorCode *pErrorCode) {
+ UBool usedOutChar[256]; /* marks output byte values already taken by a direct byte */
+ uint16_t i, j;
+ uint8_t c1, c2;
+
+ if(U_FAILURE(*pErrorCode)) {
+ return;
+ }
+
+ if(ds->inCharset==ds->outCharset) {
+ /* Same charset family: identity permutation */
+ for(i=0; i<256; ++i) {
+ map[i]=(uint8_t)i;
+ }
+ } else {
+ uprv_memset(map, 0, 256); /* 0 doubles as "not set yet" for the second loop below */
+ uprv_memset(usedOutChar, 0, 256);
+
+ if(tokenCount>256) {
+ tokenCount=256;
+ }
+
+ /* set the direct bytes (byte 0 always maps to itself) */
+ for(i=1; i<tokenCount; ++i) {
+ if(tokens[i]==-1) {
+ /* convert the direct byte character */
+ c1=(uint8_t)i;
+ ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode);
+ if(U_FAILURE(*pErrorCode)) {
+ udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
+ i, ds->inCharset);
+ return;
+ }
+
+ /* enter the converted character into the map and mark it used */
+ map[c1]=c2;
+ usedOutChar[c2]=TRUE;
+ }
+ }
+
+ /* set the mappings for the rest of the permutation; j walks the output values from 1 upwards */
+ for(i=j=1; i<tokenCount; ++i) {
+ /* set mappings that were not set for direct bytes */
+ if(map[i]==0) {
+ /* set an output byte value that was not used as an output byte above */
+ while(usedOutChar[j]) {
+ ++j;
+ }
+ map[i]=(uint8_t)j++;
+ }
+ }
+
+ /*
+ * leave mappings at tokenCount and above unset if tokenCount<256
+ * because they won't be used
+ */
+ }
+}
+/*
+ * Swaps a unames.icu data block from the input to the output platform
+ * properties described by ds: byte order through swapArray16/swapArray32,
+ * charset family through swapInvChars and the token maps.
+ * With length<0 (preflighting) the data after the header is only measured
+ * by walking the algorithmic ranges; otherwise it is swapped into outData.
+ * Returns headerSize plus the size of the data after the header, or 0
+ * after an error (with *pErrorCode set, when pErrorCode is not NULL).
+ */
+U_CAPI int32_t U_EXPORT2
+uchar_swapNames(const UDataSwapper *ds,
+ const void *inData, int32_t length, void *outData,
+ UErrorCode *pErrorCode) {
+ const UDataInfo *pInfo;
+ int32_t headerSize;
+
+ const uint8_t *inBytes;
+ uint8_t *outBytes;
+
+ uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset,
+ offset, i, count, stringsCount;
+
+ const AlgorithmicRange *inRange;
+ AlgorithmicRange *outRange;
+
+ /* udata_swapDataHeader checks the arguments */
+ headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
+ if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+ return 0;
+ }
+
+ /* check data format and format version; the UDataInfo is read 4 bytes into the header */
+ pInfo=(const UDataInfo *)((const char *)inData+4);
+ if(!(
+ pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */
+ pInfo->dataFormat[1]==0x6e &&
+ pInfo->dataFormat[2]==0x61 &&
+ pInfo->dataFormat[3]==0x6d &&
+ pInfo->formatVersion[0]==1
+ )) {
+ udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
+ pInfo->dataFormat[0], pInfo->dataFormat[1],
+ pInfo->dataFormat[2], pInfo->dataFormat[3],
+ pInfo->formatVersion[0]);
+ *pErrorCode=U_UNSUPPORTED_ERROR;
+ return 0;
+ }
+
+ inBytes=(const uint8_t *)inData+headerSize;
+ outBytes=(uint8_t *)outData+headerSize;
+ if(length<0) { /* preflighting: only the offset of the algorithmic names (4th 32-bit value) is needed */
+ algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]);
+ } else {
+ length-=headerSize; /* from here on, length counts only the bytes after the header */
+ if( length<20 ||
+ (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]))
+ ) {
+ udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
+ length);
+ *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+ }
+
+ if(length<0) {
+ /* preflighting: iterate through algorithmic ranges so that offset ends up after the last one */
+ offset=algNamesOffset;
+ count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
+ offset+=4;
+
+ for(i=0; i<count; ++i) {
+ inRange=(const AlgorithmicRange *)(inBytes+offset);
+ offset+=ds->readUInt16(inRange->size);
+ }
+ } else {
+ /* swap data */
+ const uint16_t *p;
+ uint16_t *q, *temp;
+
+ int16_t tokens[512]; /* [0..255] single-/lead-byte tokens, [256..511] passed to makeTokenMap() for the trail-byte map */
+ uint16_t tokenCount;
+
+ uint8_t map[256], trailMap[256];
+
+ /* copy the data for inaccessible bytes */
+ if(inBytes!=outBytes) {
+ uprv_memcpy(outBytes, inBytes, length);
+ }
+
+ /* the initial 4 offsets first; three are read here, the fourth (algNamesOffset) was read above */
+ tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]);
+ groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]);
+ groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]);
+ ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode);
+
+ /*
+ * now the tokens table
+ * it needs to be permutated along with the compressed name strings
+ */
+ p=(const uint16_t *)(inBytes+16);
+ q=(uint16_t *)(outBytes+16);
+
+ /* read and swap the tokenCount */
+ tokenCount=ds->readUInt16(*p);
+ ds->swapArray16(ds, p, 2, q, pErrorCode);
+ ++p;
+ ++q;
+
+ /* read the first 512 tokens and make the token maps */
+ if(tokenCount<=512) {
+ count=tokenCount;
+ } else {
+ count=512;
+ }
+ for(i=0; i<count; ++i) {
+ tokens[i]=udata_readInt16(ds, p[i]);
+ }
+ for(; i<512; ++i) {
+ tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */
+ }
+ makeTokenMap(ds, tokens, tokenCount, map, pErrorCode);
+ makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode);
+ if(U_FAILURE(*pErrorCode)) {
+ return 0;
+ }
+
+ /*
+ * swap and permutate the tokens
+ * go through a temporary array to support in-place swapping
+ * NOTE(review): temp holds tokenCount entries, while map[] values for
+ * direct bytes are converted byte values that are not bounded by
+ * tokenCount here - confirm temp+map[i] stays inside temp when
+ * tokenCount<256.
+ */
+ temp=(uint16_t *)uprv_malloc(tokenCount*2);
+ if(temp==NULL) {
+ udata_printError(ds, "out of memory swapping %u unames.icu tokens\n",
+ tokenCount);
+ *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
+ return 0;
+ }
+
+ /* swap and permutate single-/lead-byte tokens */
+ for(i=0; i<tokenCount && i<256; ++i) {
+ ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode);
+ }
+
+ /* swap and permutate trail-byte tokens: the upper index bits are kept, only the low byte goes through trailMap */
+ for(; i<tokenCount; ++i) {
+ ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode);
+ }
+
+ /* copy the result into the output and free the temporary array */
+ uprv_memcpy(q, temp, tokenCount*2);
+ uprv_free(temp);
+
+ /*
+ * swap the token strings but not a possible padding byte after
+ * the terminating NUL of the last string
+ */
+ udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset),
+ outBytes+tokenStringOffset, pErrorCode);
+ if(U_FAILURE(*pErrorCode)) {
+ udata_printError(ds, "uchar_swapNames(token strings) failed\n");
+ return 0;
+ }
+
+ /* swap the group table: one 16-bit count followed by count*3 16-bit values */
+ count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset)));
+ ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2),
+ outBytes+groupsOffset, pErrorCode);
+
+ /*
+ * swap the group strings
+ * swap the string bytes but not the nibble-encoded string lengths
+ * (nothing to do when both charset families are the same)
+ */
+ if(ds->inCharset!=ds->outCharset) {
+ uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1];
+
+ const uint8_t *inStrings, *nextInStrings;
+ uint8_t *outStrings;
+
+ uint8_t c;
+
+ inStrings=inBytes+groupStringOffset;
+ outStrings=outBytes+groupStringOffset;
+
+ stringsCount=algNamesOffset-groupStringOffset; /* the group strings end where the algorithmic names begin */
+
+ /* iterate through string groups until only a few padding bytes are left */
+ while(stringsCount>32) {
+ nextInStrings=expandGroupLengths(inStrings, offsets, lengths);
+
+ /* move past the length bytes */
+ stringsCount-=(uint32_t)(nextInStrings-inStrings);
+ outStrings+=nextInStrings-inStrings;
+ inStrings=nextInStrings;
+
+ count=offsets[31]+lengths[31]; /* total number of string bytes in this group; 31 is presumably LINES_PER_GROUP-1 - confirm */
+ stringsCount-=count;
+
+ /* swap the string bytes using map[] and trailMap[] */
+ while(count>0) {
+ c=*inStrings++;
+ *outStrings++=map[c];
+ if(tokens[c]!=-2) {
+ --count;
+ } else {
+ /* token lead byte: swap the trail byte, too */
+ *outStrings++=trailMap[*inStrings++];
+ count-=2;
+ }
+ }
+ }
+ }
+
+ /* swap the algorithmic ranges: a 32-bit count followed by variable-size ranges */
+ offset=algNamesOffset;
+ count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
+ ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode);
+ offset+=4;
+
+ for(i=0; i<count; ++i) {
+ if(offset>(uint32_t)length) {
+ udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
+ length, i);
+ *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+
+ inRange=(const AlgorithmicRange *)(inBytes+offset);
+ outRange=(AlgorithmicRange *)(outBytes+offset);
+ offset+=ds->readUInt16(inRange->size); /* offset now points past this range */
+
+ ds->swapArray32(ds, inRange, 8, outRange, pErrorCode); /* the first 8 bytes of the range as two 32-bit values */
+ ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode);
+ switch(inRange->type) {
+ case 0:
+ /* swap prefix string, which starts right after the range structure */
+ ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)),
+ outRange+1, pErrorCode);
+ if(U_FAILURE(*pErrorCode)) {
+ udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",
+ i);
+ return 0;
+ }
+ break;
+ case 1:
+ {
+ /* swap factors and the prefix and factor strings */
+ uint32_t factorsCount;
+
+ factorsCount=inRange->variant; /* for type 1 the variant field is used as the number of 16-bit factors */
+ p=(const uint16_t *)(inRange+1);
+ q=(uint16_t *)(outRange+1);
+ ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode);
+
+ /* swap the strings, up to the last terminating NUL */
+ p+=factorsCount;
+ q+=factorsCount;
+ stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p);
+ while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) { /* drop trailing non-NUL bytes after the last string */
+ --stringsCount;
+ }
+ ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode);
+ }
+ break;
+ default:
+ udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
+ inRange->type, i);
+ *pErrorCode=U_UNSUPPORTED_ERROR;
+ return 0;
+ }
+ }
+ }
+
+ return headerSize+(int32_t)offset;
}
-#endif
/*
* Hey, Emacs, please set the following: