/*
******************************************************************************
-* Copyright (c) 1996-2004, International Business Machines
+* Copyright (c) 1996-2007, International Business Machines
* Corporation and others. All Rights Reserved.
******************************************************************************
* File unorm.cpp
*/
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
+U_NAMESPACE_USE
+
/*
* This new implementation of the normalization code loads its data from
* unorm.dat, which is generated with the gennorm tool.
_NORM_OPTIONS_COMPOSE_CONTIGUOUS=0x2000
};
+U_CDECL_BEGIN
static inline UBool
isHangulWithoutJamoT(UChar c) {
c-=HANGUL_BASE;
* Given isNorm32HangulOrJamo(),
* is this a Hangul syllable or a Jamo?
*/
-static inline UBool
+/*static inline UBool
isHangulJamoNorm32HangulOrJamoL(uint32_t norm32) {
return norm32<_NORM_MIN_JAMO_V;
-}
+}*/
/*
* Given norm32 for Jamo V or T,
/* load unorm.dat ----------------------------------------------------------- */
+/*
+ * normTrie folding-offset callback.
+ * For a lead-surrogate norm32 the result is UTRIE_BMP_INDEX_LENGTH plus the
+ * 10-bit extraData index field, already scaled by
+ * 1<<UTRIE_SURROGATE_BLOCK_BITS; for every other norm32 the result is 0.
+ */
+static int32_t U_CALLCONV
+getFoldingNormOffset(uint32_t norm32) {
+    if(!isNorm32LeadSurrogate(norm32)) {
+        return 0;
+    }
+
+    /* shift by less than _NORM_EXTRA_SHIFT so the field lands pre-scaled, then mask it */
+    const int32_t scaledFieldMask=0x3ff<<UTRIE_SURROGATE_BLOCK_BITS;
+    const int32_t shifted=(int32_t)norm32>>(_NORM_EXTRA_SHIFT-UTRIE_SURROGATE_BLOCK_BITS);
+    return UTRIE_BMP_INDEX_LENGTH+(shifted&scaledFieldMask);
+}
+
+/*
+ * auxTrie folding-offset callback.
+ * The bits of the 16-bit trie result selected by _NORM_AUX_FNC_MASK
+ * (bits 9..0) hold the folding offset; it is scaled by
+ * 1<<UTRIE_SURROGATE_BLOCK_BITS before being returned.
+ */
+static int32_t U_CALLCONV
+getFoldingAuxOffset(uint32_t data) {
+    const int32_t foldingBits=(int32_t)(data&_NORM_AUX_FNC_MASK);
+    return foldingBits<<UTRIE_SURROGATE_BLOCK_BITS;
+}
+U_CDECL_END
+
+#define UNORM_HARDCODE_DATA 1
+
+#if UNORM_HARDCODE_DATA
+
+/* unorm_props_data.c is machine-generated by gennorm --csource */
+#include "unorm_props_data.c"
+
+static const UBool formatVersion_2_2=TRUE;
+
+#else
+
#define DATA_NAME "unorm"
#define DATA_TYPE "icu"
/* the Unicode version of the normalization data */
static UVersionInfo dataVersion={ 0, 0, 0, 0 };
+#endif
+
/* cache UnicodeSets for each combination of exclusion flags */
static UnicodeSet *nxCache[_NORM_OPTIONS_SETS_MASK+1]={ NULL };
U_CDECL_BEGIN
+/*
+ * Cleanup callback (registered with ucln_common_registerCleanup()).
+ * Closes runtime-loaded normalization data, if any, and deletes every
+ * cached exclusion set in nxCache[]. Always returns TRUE.
+ */
static UBool U_CALLCONV
-unorm_cleanup() {
+unorm_cleanup(void) {
int32_t i;
+#if !UNORM_HARDCODE_DATA
+/* only runtime-loaded data has to be closed and its load state reset */
if(normData!=NULL) {
udata_close(normData);
normData=NULL;
}
dataErrorCode=U_ZERO_ERROR;
haveNormData=0;
+#endif
for(i=0; i<(int32_t)LENGTHOF(nxCache); ++i) {
- delete nxCache[i];
+ if (nxCache[i]) {
+ delete nxCache[i];
+ /* clear the slot so that no pointer to the deleted set remains */
+ nxCache[i] = 0;
+ }
}
- uprv_memset(nxCache, 0, sizeof(nxCache));
return TRUE;
}
-/* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */
-static int32_t U_CALLCONV
-getFoldingNormOffset(uint32_t norm32) {
- if(isNorm32LeadSurrogate(norm32)) {
- return
- UTRIE_BMP_INDEX_LENGTH+
- (((int32_t)norm32>>(_NORM_EXTRA_SHIFT-UTRIE_SURROGATE_BLOCK_BITS))&
- (0x3ff<<UTRIE_SURROGATE_BLOCK_BITS));
- } else {
- return 0;
- }
-}
-
-/* fcdTrie: the folding offset is the lead FCD value itself */
-static int32_t U_CALLCONV
-getFoldingFCDOffset(uint32_t data) {
- return (int32_t)data;
-}
-
-/* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */
-static int32_t U_CALLCONV
-getFoldingAuxOffset(uint32_t data) {
- return (int32_t)(data&_NORM_AUX_FNC_MASK)<<UTRIE_SURROGATE_BLOCK_BITS;
-}
+#if !UNORM_HARDCODE_DATA
static UBool U_CALLCONV
isAcceptable(void * /* context */,
}
}
+#endif
+
+/*
+ * Range-enumeration callback.
+ * Adds the start code point of the enumerated range to the USetAdder that
+ * is passed in as context; the range limit and value are ignored.
+ * Always returns TRUE.
+ */
static UBool U_CALLCONV
_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*limit*/, uint32_t /*value*/) {
/* add the start code point to the USet */
- USetAdder *sa=(USetAdder *)context;
+ const USetAdder *sa=(const USetAdder *)context;
sa->add(sa->set, start);
return TRUE;
}
U_CDECL_END
+#if !UNORM_HARDCODE_DATA
+
static int8_t
loadNormData(UErrorCode &errorCode) {
/* load Unicode normalization data from file */
if(haveNormData==0) {
UTrie _normTrie={ 0,0,0,0,0,0,0 }, _fcdTrie={ 0,0,0,0,0,0,0 }, _auxTrie={ 0,0,0,0,0,0,0 };
UDataMemory *data;
+
const int32_t *p=NULL;
const uint8_t *pb;
_normTrie.getFoldingOffset=getFoldingNormOffset;
pb+=p[_NORM_INDEX_TRIE_SIZE]+p[_NORM_INDEX_UCHAR_COUNT]*2+p[_NORM_INDEX_COMBINE_DATA_COUNT]*2;
- utrie_unserialize(&_fcdTrie, pb, p[_NORM_INDEX_FCD_TRIE_SIZE], &errorCode);
- _fcdTrie.getFoldingOffset=getFoldingFCDOffset;
-
if(p[_NORM_INDEX_FCD_TRIE_SIZE]!=0) {
- pb+=p[_NORM_INDEX_FCD_TRIE_SIZE];
+ utrie_unserialize(&_fcdTrie, pb, p[_NORM_INDEX_FCD_TRIE_SIZE], &errorCode);
+ }
+ pb+=p[_NORM_INDEX_FCD_TRIE_SIZE];
+
+ if(p[_NORM_INDEX_AUX_TRIE_SIZE]!=0) {
utrie_unserialize(&_auxTrie, pb, p[_NORM_INDEX_AUX_TRIE_SIZE], &errorCode);
_auxTrie.getFoldingOffset=getFoldingAuxOffset;
}
return haveNormData;
}
+#endif
+
+/*
+ * Returns TRUE if the normalization data can be used.
+ * With UNORM_HARDCODE_DATA the result depends only on the incoming
+ * errorCode. Otherwise an incoming failure yields FALSE, a previous load
+ * outcome is reused (a previous failure copies dataErrorCode into
+ * errorCode), and the first call triggers loadNormData().
+ */
static inline UBool
_haveData(UErrorCode &errorCode) {
- if(haveNormData!=0) {
+#if UNORM_HARDCODE_DATA
+ return U_SUCCESS(errorCode);
+#else
+ if(U_FAILURE(errorCode)) {
+ return FALSE;
+ } else if(haveNormData>0) {
+ return TRUE;
+ } else if(haveNormData<0) {
+ /* report the error code that was recorded in dataErrorCode */
errorCode=dataErrorCode;
- return (UBool)(haveNormData>0);
- } else {
+ return FALSE;
+ } else /* haveNormData==0 */ {
return (UBool)(loadNormData(errorCode)>0);
}
+#endif
}
U_CAPI UBool U_EXPORT2
return extraData+(norm32>>_NORM_EXTRA_SHIFT);
}
+#if 0
+/*
+ * It is possible to get the FCD data from the main trie if unorm.icu
+ * was built without the FCD trie, although it is slower.
+ * This is not implemented because it is hard to test, and because it seems
+ * unusual to want to use FCD and not build the data file for it.
+ *
+ * Untested sample code:
+ */
+/*
+ * Derive a 16-bit FCD value for c from the main normalization trie.
+ *
+ * - If c has a regular NFD decomposition, the value is the lead/trail cc
+ *   word stored with the decomposition, or 0 when no such word is stored.
+ * - Otherwise the code point's own cc is used for both the lead cc (upper
+ *   byte) and the trail cc (lower byte); this assumes that norm32 carries
+ *   the cc in bits 15..8.
+ *
+ * Returns the FCD value truncated to 16 bits.
+ */
+static inline uint16_t
+_getFCD16FromNormData(UChar32 c) {
+    uint32_t norm32, fcd;
+
+    norm32=_getNorm32(c);
+    if((norm32&_NORM_QC_NFD) && isNorm32Regular(norm32)) {
+        /* get the lead/trail cc from the decomposition data */
+        const uint16_t *nfd=_getExtraData(norm32);
+        if(*nfd&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) {
+            fcd=nfd[1];
+        } else {
+            /*
+             * No cc word is stored with this decomposition.
+             * Use 0 for both lead and trail cc instead of returning an
+             * uninitialized value.
+             */
+            fcd=0;
+        }
+    } else {
+        fcd=norm32&_NORM_CC_MASK;
+        if(fcd!=0) {
+            /* use the code point cc value for both lead and trail cc's */
+            fcd|=fcd>>_NORM_CC_SHIFT; /* assume that the cc is in bits 15..8 */
+        }
+    }
+
+    return (uint16_t)fcd;
+}
+#endif
+
/* normalization exclusion sets --------------------------------------------- */
/*
errorCode=U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
+ // Compact the set for caching.
+ set->compact();
umtx_lock(NULL);
if(nxCache[UNORM_NX_HANGUL]==NULL) {
nxCache[UNORM_NX_HANGUL]=set;
set=NULL;
+ ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
}
umtx_unlock(NULL);
for(i=0; uset_getSerializedRange(&sset, i, &start, &end); ++i) {
set->add(start, end);
}
+ // Compact the set for caching.
+ set->compact();
umtx_lock(NULL);
if(nxCache[options]==NULL) {
nxCache[options]=set;
set=NULL;
+ ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
}
umtx_unlock(NULL);
delete set;
return NULL;
}
+ // Compact the set for caching.
+ set->compact();
umtx_lock(NULL);
if(nxCache[options]==NULL) {
nxCache[options]=set;
set=NULL;
+ ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
}
umtx_unlock(NULL);
/* uchar.h */
+/*
+ * Returns the combining class of c: the normTrie value for c shifted right
+ * by _NORM_CC_SHIFT and truncated to 8 bits.
+ * Without UNORM_HARDCODE_DATA, returns 0 when _haveData() reports that the
+ * data is not available.
+ */
U_CAPI uint8_t U_EXPORT2
u_getCombiningClass(UChar32 c) {
+#if !UNORM_HARDCODE_DATA
UErrorCode errorCode=U_ZERO_ERROR;
if(_haveData(errorCode)) {
+#endif
uint32_t norm32;
UTRIE_GET32(&normTrie, c, norm32);
return (uint8_t)(norm32>>_NORM_CC_SHIFT);
+#if !UNORM_HARDCODE_DATA
} else {
return 0;
}
+#endif
}
-U_CAPI UBool U_EXPORT2
+U_CFUNC UBool U_EXPORT2
unorm_internalIsFullCompositionExclusion(UChar32 c) {
+#if UNORM_HARDCODE_DATA
+ if(auxTrie.index!=NULL) {
+#else
UErrorCode errorCode=U_ZERO_ERROR;
- if(_haveData(errorCode) && formatVersion_2_1) {
+ if(_haveData(errorCode) && auxTrie.index!=NULL) {
+#endif
uint16_t aux;
UTRIE_GET16(&auxTrie, c, aux);
}
}
-U_CAPI UBool U_EXPORT2
+U_CFUNC UBool U_EXPORT2
unorm_isCanonSafeStart(UChar32 c) {
+#if UNORM_HARDCODE_DATA
+ if(auxTrie.index!=NULL) {
+#else
UErrorCode errorCode=U_ZERO_ERROR;
- if(_haveData(errorCode) && formatVersion_2_1) {
+ if(_haveData(errorCode) && auxTrie.index!=NULL) {
+#endif
uint16_t aux;
UTRIE_GET16(&auxTrie, c, aux);
U_CAPI UBool U_EXPORT2
unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet) {
+#if !UNORM_HARDCODE_DATA
UErrorCode errorCode=U_ZERO_ERROR;
+#endif
if( fillSet!=NULL && (uint32_t)c<=0x10ffff &&
- _haveData(errorCode) && canonStartSets!=NULL
+#if !UNORM_HARDCODE_DATA
+ _haveData(errorCode) &&
+#endif
+ canonStartSets!=NULL
) {
const uint16_t *table;
int32_t i, start, limit;
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
- if(!_haveData(*pErrorCode) || !formatVersion_2_1) {
+ if(!_haveData(*pErrorCode) || auxTrie.index==NULL) {
return 0;
}
/* Is c an NF<mode>-skippable code point? See unormimp.h. */
U_CAPI UBool U_EXPORT2
unorm_isNFSkippable(UChar32 c, UNormalizationMode mode) {
- UErrorCode errorCode;
uint32_t norm32, mask;
uint16_t aux, fcd;
- errorCode=U_ZERO_ERROR;
+#if !UNORM_HARDCODE_DATA
+ UErrorCode errorCode=U_ZERO_ERROR;
if(!_haveData(errorCode)) {
return FALSE;
}
+#endif
/* handle trivial cases; set the comparison mask for the normal ones */
switch(mode) {
break;
case UNORM_FCD:
/* FCD: skippable if lead cc==0 and trail cc<=1 */
- UTRIE_GET16(&fcdTrie, c, fcd);
- return fcd<=1;
+ if(fcdTrie.index!=NULL) {
+ UTRIE_GET16(&fcdTrie, c, fcd);
+ return fcd<=1;
+ } else {
+ return FALSE;
+ }
default:
return FALSE;
}
/* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */
/* NF*C, test (f) flag */
- if(!formatVersion_2_2) {
+ if(!formatVersion_2_2 || auxTrie.index==NULL) {
return FALSE; /* no (f) data, say not skippable to be safe */
}
}
U_CAPI void U_EXPORT2
-unorm_addPropertyStarts(USetAdder *sa, UErrorCode *pErrorCode) {
+unorm_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
UChar c;
- if(U_FAILURE(*pErrorCode) || !_haveData(*pErrorCode)) {
+ if(!_haveData(*pErrorCode)) {
return;
}
/* add the start code point of each same-value range of each trie */
utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, sa);
- utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, sa);
- if(formatVersion_2_1) {
+ if(fcdTrie.index!=NULL) {
+ utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, sa);
+ }
+ if(auxTrie.index!=NULL) {
utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, sa);
}
sa->add(sa->set, HANGUL_BASE+HANGUL_COUNT); /* add Hangul+1 to continue with other properties */
}
-U_CAPI UNormalizationCheckResult U_EXPORT2
+U_CFUNC UNormalizationCheckResult U_EXPORT2
unorm_getQuickCheck(UChar32 c, UNormalizationMode mode) {
static const uint32_t qcMask[UNORM_MODE_COUNT]={
0, 0, _NORM_QC_NFD, _NORM_QC_NFKD, _NORM_QC_NFC, _NORM_QC_NFKC
};
- UErrorCode errorCode;
uint32_t norm32;
- errorCode=U_ZERO_ERROR;
+#if !UNORM_HARDCODE_DATA
+ UErrorCode errorCode=U_ZERO_ERROR;
if(!_haveData(errorCode)) {
return UNORM_YES;
}
+#endif
UTRIE_GET32(&normTrie, c, norm32);
norm32&=qcMask[mode];
}
}
-U_CAPI uint16_t U_EXPORT2
+U_CFUNC uint16_t U_EXPORT2
unorm_getFCD16FromCodePoint(UChar32 c) {
- UErrorCode errorCode;
uint16_t fcd;
-
+#if !UNORM_HARDCODE_DATA
+ UErrorCode errorCode;
errorCode=U_ZERO_ERROR;
- if(!_haveData(errorCode)) {
+#endif
+
+ if(
+#if !UNORM_HARDCODE_DATA
+ !_haveData(errorCode) ||
+#endif
+ fcdTrie.index==NULL
+ ) {
return 0;
}
U_CAPI int32_t U_EXPORT2
unorm_getDecomposition(UChar32 c, UBool compat,
UChar *dest, int32_t destCapacity) {
+#if !UNORM_HARDCODE_DATA
UErrorCode errorCode=U_ZERO_ERROR;
+#endif
if( (uint32_t)c<=0x10ffff &&
+#if !UNORM_HARDCODE_DATA
_haveData(errorCode) &&
+#endif
((dest!=NULL && destCapacity>0) || destCapacity==0)
) {
uint32_t norm32, qcMask;
/* avoid compiler warnings */
norm32=0;
c=0;
+ cc=0;
+ trailCC=0;
if(srcLength>=0) {
/* string with length */
/* decompose [prevStarter..src[ */
length=_decompose(buffer, bufferCapacity,
- prevStarter, src-prevStarter,
+ prevStarter, (int32_t)(src-prevStarter),
compat, nx,
trailCC);
if(length>bufferCapacity) {
return NULL;
}
length=_decompose(buffer, bufferCapacity,
- prevStarter, src-prevStarter,
+ prevStarter, (int32_t)(src-prevStarter),
compat, nx,
trailCC);
}
}
/* return with a pointer to the recomposition and its length */
- length=recomposeLimit-buffer;
+ length=(int32_t)(recomposeLimit-buffer);
return buffer;
}
options=_NORM_OPTIONS_COMPAT;
break;
case UNORM_FCD:
+ if(fcdTrie.index==NULL) {
+ *pErrorCode=U_UNSUPPORTED_ERROR;
+ return UNORM_MAYBE;
+ }
return unorm_checkFCD(src, srcLength, nx) ? UNORM_YES : UNORM_NO;
default:
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
options|_NORM_OPTIONS_COMPAT, nx, pErrorCode);
break;
case UNORM_FCD:
+ if(fcdTrie.index==NULL) {
+ *pErrorCode=U_UNSUPPORTED_ERROR;
+ return 0;
+ }
return unorm_makeFCD(dest, destCapacity,
src, srcLength,
nx,
}
switch(mode) {
- case UNORM_NFD:
case UNORM_FCD:
+ if(fcdTrie.index==NULL) {
+ *pErrorCode=U_UNSUPPORTED_ERROR;
+ return 0;
+ }
+ /* fall through to NFD */
+ case UNORM_NFD:
isPreviousBoundary=_isPrevNFDSafe;
minC=_NORM_MIN_WITH_LEAD_CC;
mask=_NORM_CC_MASK|_NORM_QC_NFD;
}
switch(mode) {
- case UNORM_NFD:
case UNORM_FCD:
+ if(fcdTrie.index==NULL) {
+ *pErrorCode=U_UNSUPPORTED_ERROR;
+ return 0;
+ }
+ /* fall through to NFD */
+ case UNORM_NFD:
isNextBoundary=_isNextNFDSafe;
minC=_NORM_MIN_WITH_LEAD_CC;
mask=_NORM_CC_MASK|_NORM_QC_NFD;
return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
}
-/* data swapping ------------------------------------------------------------ */
-
-U_CAPI int32_t U_EXPORT2
-unorm_swap(const UDataSwapper *ds,
- const void *inData, int32_t length, void *outData,
- UErrorCode *pErrorCode) {
- const UDataInfo *pInfo;
- int32_t headerSize;
-
- const uint8_t *inBytes;
- uint8_t *outBytes;
-
- const int32_t *inIndexes;
- int32_t indexes[32];
-
- int32_t i, offset, count, size;
-
- /* udata_swapDataHeader checks the arguments */
- headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
- if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
- return 0;
- }
-
- /* check data format and format version */
- pInfo=(const UDataInfo *)((const char *)inData+4);
- if(!(
- pInfo->dataFormat[0]==0x4e && /* dataFormat="Norm" */
- pInfo->dataFormat[1]==0x6f &&
- pInfo->dataFormat[2]==0x72 &&
- pInfo->dataFormat[3]==0x6d &&
- pInfo->formatVersion[0]==2
- )) {
- udata_printError(ds, "unorm_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unorm.icu\n",
- pInfo->dataFormat[0], pInfo->dataFormat[1],
- pInfo->dataFormat[2], pInfo->dataFormat[3],
- pInfo->formatVersion[0]);
- *pErrorCode=U_UNSUPPORTED_ERROR;
- return 0;
- }
-
- inBytes=(const uint8_t *)inData+headerSize;
- outBytes=(uint8_t *)outData+headerSize;
-
- inIndexes=(const int32_t *)inBytes;
-
- if(length>=0) {
- length-=headerSize;
- if(length<32*4) {
- udata_printError(ds, "unorm_swap(): too few bytes (%d after header) for unorm.icu\n",
- length);
- *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
- return 0;
- }
- }
-
- /* read the first 32 indexes (ICU 2.8/format version 2.2: _NORM_INDEX_TOP==32, might grow) */
- for(i=0; i<32; ++i) {
- indexes[i]=udata_readInt32(ds, inIndexes[i]);
- }
-
- /* calculate the total length of the data */
- size=
- 32*4+ /* size of indexes[] */
- indexes[_NORM_INDEX_TRIE_SIZE]+
- indexes[_NORM_INDEX_UCHAR_COUNT]*2+
- indexes[_NORM_INDEX_COMBINE_DATA_COUNT]*2+
- indexes[_NORM_INDEX_FCD_TRIE_SIZE]+
- indexes[_NORM_INDEX_AUX_TRIE_SIZE]+
- indexes[_NORM_INDEX_CANON_SET_COUNT]*2;
-
- if(length>=0) {
- if(length<size) {
- udata_printError(ds, "unorm_swap(): too few bytes (%d after header) for all of unorm.icu\n",
- length);
- *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
- return 0;
- }
-
- /* copy the data for inaccessible bytes */
- if(inBytes!=outBytes) {
- uprv_memcpy(outBytes, inBytes, size);
- }
-
- offset=0;
-
- /* swap the indexes[] */
- count=32*4;
- ds->swapArray32(ds, inBytes, count, outBytes, pErrorCode);
- offset+=count;
-
- /* swap the main UTrie */
- count=indexes[_NORM_INDEX_TRIE_SIZE];
- utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
- offset+=count;
-
- /* swap the uint16_t extraData[] and the uint16_t combiningTable[] */
- count=(indexes[_NORM_INDEX_UCHAR_COUNT]+indexes[_NORM_INDEX_COMBINE_DATA_COUNT])*2;
- ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
- offset+=count;
-
- /* swap the FCD UTrie */
- count=indexes[_NORM_INDEX_FCD_TRIE_SIZE];
- if(count!=0) {
- utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
- offset+=count;
- }
-
- /* swap the aux UTrie */
- count=indexes[_NORM_INDEX_AUX_TRIE_SIZE];
- if(count!=0) {
- utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
- offset+=count;
- }
-
- /* swap the uint16_t combiningTable[] */
- count=indexes[_NORM_INDEX_CANON_SET_COUNT]*2;
- ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
- offset+=count;
- }
-
- return headerSize+size;
-}
-
#endif /* #if !UCONFIG_NO_NORMALIZATION */