X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/374ca955a76ecab1204ca8bfa63ff9238d998416..7393aa2fd2c40c89f12c2db881842a563afdb429:/icuSources/common/unorm.cpp diff --git a/icuSources/common/unorm.cpp b/icuSources/common/unorm.cpp index fc9cfb0b..00ee9ec3 100644 --- a/icuSources/common/unorm.cpp +++ b/icuSources/common/unorm.cpp @@ -1,6 +1,6 @@ /* ****************************************************************************** -* Copyright (c) 1996-2004, International Business Machines +* Copyright (c) 1996-2007, International Business Machines * Corporation and others. All Rights Reserved. ****************************************************************************** * File unorm.cpp @@ -78,6 +78,8 @@ */ #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) +U_NAMESPACE_USE + /* * This new implementation of the normalization code loads its data from * unorm.dat, which is generated with the gennorm tool. @@ -117,6 +119,7 @@ enum { _NORM_OPTIONS_COMPOSE_CONTIGUOUS=0x2000 }; +U_CDECL_BEGIN static inline UBool isHangulWithoutJamoT(UChar c) { c-=HANGUL_BASE; @@ -147,10 +150,10 @@ isNorm32HangulOrJamo(uint32_t norm32) { * Given isNorm32HangulOrJamo(), * is this a Hangul syllable or a Jamo? */ -static inline UBool +/*static inline UBool isHangulJamoNorm32HangulOrJamoL(uint32_t norm32) { return norm32<_NORM_MIN_JAMO_V; -} +}*/ /* * Given norm32 for Jamo V or T, @@ -163,6 +166,37 @@ isJamoVTNorm32JamoV(uint32_t norm32) { /* load unorm.dat ----------------------------------------------------------- */ +/* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */ +static int32_t U_CALLCONV +getFoldingNormOffset(uint32_t norm32) { + if(isNorm32LeadSurrogate(norm32)) { + return + UTRIE_BMP_INDEX_LENGTH+ + (((int32_t)norm32>>(_NORM_EXTRA_SHIFT-UTRIE_SURROGATE_BLOCK_BITS))& + (0x3ff<>(_NORM_EXTRA_SHIFT-UTRIE_SURROGATE_BLOCK_BITS))& - (0x3ff<add(sa->set, start); return TRUE; } U_CDECL_END +#if !UNORM_HARDCODE_DATA + static int8_t loadNormData(UErrorCode &errorCode) { /* load Unicode normalization data from file */ @@ -290,6 +311,7 @@ loadNormData(UErrorCode &errorCode) { if(haveNormData==0) { UTrie _normTrie={ 0,0,0,0,0,0,0 }, _fcdTrie={ 0,0,0,0,0,0,0 }, _auxTrie={ 0,0,0,0,0,0,0 }; UDataMemory *data; + const int32_t *p=NULL; const uint8_t *pb; @@ -310,11 +332,12 @@ loadNormData(UErrorCode &errorCode) { _normTrie.getFoldingOffset=getFoldingNormOffset; pb+=p[_NORM_INDEX_TRIE_SIZE]+p[_NORM_INDEX_UCHAR_COUNT]*2+p[_NORM_INDEX_COMBINE_DATA_COUNT]*2; - utrie_unserialize(&_fcdTrie, pb, p[_NORM_INDEX_FCD_TRIE_SIZE], &errorCode); - _fcdTrie.getFoldingOffset=getFoldingFCDOffset; - if(p[_NORM_INDEX_FCD_TRIE_SIZE]!=0) { - pb+=p[_NORM_INDEX_FCD_TRIE_SIZE]; + utrie_unserialize(&_fcdTrie, pb, p[_NORM_INDEX_FCD_TRIE_SIZE], &errorCode); + } + pb+=p[_NORM_INDEX_FCD_TRIE_SIZE]; + + if(p[_NORM_INDEX_AUX_TRIE_SIZE]!=0) { utrie_unserialize(&_auxTrie, pb, p[_NORM_INDEX_AUX_TRIE_SIZE], &errorCode); _auxTrie.getFoldingOffset=getFoldingAuxOffset; } @@ -362,14 +385,24 @@ loadNormData(UErrorCode &errorCode) { return haveNormData; } +#endif + static inline UBool _haveData(UErrorCode &errorCode) { - if(haveNormData!=0) { +#if UNORM_HARDCODE_DATA + return U_SUCCESS(errorCode); +#else + if(U_FAILURE(errorCode)) { + return FALSE; + } else if(haveNormData>0) { + return TRUE; + } else if(haveNormData<0) { errorCode=dataErrorCode; - return (UBool)(haveNormData>0); - } else { + return FALSE; + } else /* haveNormData==0 */ { return (UBool)(loadNormData(errorCode)>0); } +#endif } U_CAPI UBool U_EXPORT2 @@ -436,6 +469,38 @@ _getExtraData(uint32_t norm32) { return extraData+(norm32>>_NORM_EXTRA_SHIFT); } +#if 0 +/* + * It is possible to get the FCD data from the main trie if unorm.icu + * was built without the FCD trie, although it is slower. + * This is not implemented because it is hard to test, and because it seems + * unusual to want to use FCD and not build the data file for it. + * + * Untested sample code: + */ +static inline uint16_t +_getFCD16FromNormData(UChar32 c) { + uint32_t norm32, fcd; + + norm32=_getNorm32(c); + if((norm32&_NORM_QC_NFD) && isNorm32Regular(norm32)) { + /* get the lead/trail cc from the decomposition data */ + const uint16_t *nfd=_getExtraData(norm32); + if(*nfd&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) { + fcd=nfd[1]; + } + } else { + fcd=norm32&_NORM_CC_MASK; + if(fcd!=0) { + /* use the code point cc value for both lead and trail cc's */ + fcd|=fcd>>_NORM_CC_SHIFT; /* assume that the cc is in bits 15..8 */ + } + } + + return (uint16_t)fcd; +} +#endif + /* normalization exclusion sets --------------------------------------------- */ /* @@ -459,11 +524,14 @@ internalGetNXHangul(UErrorCode &errorCode) { errorCode=U_MEMORY_ALLOCATION_ERROR; return NULL; } + // Compact the set for caching. + set->compact(); umtx_lock(NULL); if(nxCache[UNORM_NX_HANGUL]==NULL) { nxCache[UNORM_NX_HANGUL]=set; set=NULL; + ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup); } umtx_unlock(NULL); @@ -515,11 +583,14 @@ internalGetSerializedNX(int32_t options, int32_t nxIndex, UErrorCode &errorCode) for(i=0; uset_getSerializedRange(&sset, i, &start, &end); ++i) { set->add(start, end); } + // Compact the set for caching. + set->compact(); umtx_lock(NULL); if(nxCache[options]==NULL) { nxCache[options]=set; set=NULL; + ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup); } umtx_unlock(NULL); @@ -605,11 +676,14 @@ internalGetNX(int32_t options, UErrorCode &errorCode) { delete set; return NULL; } + // Compact the set for caching. + set->compact(); umtx_lock(NULL); if(nxCache[options]==NULL) { nxCache[options]=set; set=NULL; + ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup); } umtx_unlock(NULL); @@ -882,21 +956,29 @@ _isTrueStarter(uint32_t norm32, uint32_t ccOrQCMask, uint32_t decompQCMask) { /* uchar.h */ U_CAPI uint8_t U_EXPORT2 u_getCombiningClass(UChar32 c) { +#if !UNORM_HARDCODE_DATA UErrorCode errorCode=U_ZERO_ERROR; if(_haveData(errorCode)) { +#endif uint32_t norm32; UTRIE_GET32(&normTrie, c, norm32); return (uint8_t)(norm32>>_NORM_CC_SHIFT); +#if !UNORM_HARDCODE_DATA } else { return 0; } +#endif } -U_CAPI UBool U_EXPORT2 +U_CFUNC UBool U_EXPORT2 unorm_internalIsFullCompositionExclusion(UChar32 c) { +#if UNORM_HARDCODE_DATA + if(auxTrie.index!=NULL) { +#else UErrorCode errorCode=U_ZERO_ERROR; - if(_haveData(errorCode) && formatVersion_2_1) { + if(_haveData(errorCode) && auxTrie.index!=NULL) { +#endif uint16_t aux; UTRIE_GET16(&auxTrie, c, aux); @@ -906,10 +988,14 @@ unorm_internalIsFullCompositionExclusion(UChar32 c) { } } -U_CAPI UBool U_EXPORT2 +U_CFUNC UBool U_EXPORT2 unorm_isCanonSafeStart(UChar32 c) { +#if UNORM_HARDCODE_DATA + if(auxTrie.index!=NULL) { +#else UErrorCode errorCode=U_ZERO_ERROR; - if(_haveData(errorCode) && formatVersion_2_1) { + if(_haveData(errorCode) && auxTrie.index!=NULL) { +#endif uint16_t aux; UTRIE_GET16(&auxTrie, c, aux); @@ -929,9 +1015,14 @@ unorm_getUnicodeVersion(UVersionInfo *versionInfo, UErrorCode *pErrorCode){ U_CAPI UBool U_EXPORT2 unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet) { +#if !UNORM_HARDCODE_DATA UErrorCode errorCode=U_ZERO_ERROR; +#endif if( fillSet!=NULL && (uint32_t)c<=0x10ffff && - _haveData(errorCode) && canonStartSets!=NULL +#if !UNORM_HARDCODE_DATA + _haveData(errorCode) && +#endif + canonStartSets!=NULL ) { const uint16_t *table; int32_t i, start, limit; @@ -1031,7 +1122,7 @@ u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *p *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; return 0; } - if(!_haveData(*pErrorCode) || !formatVersion_2_1) { + if(!_haveData(*pErrorCode) || auxTrie.index==NULL) { return 0; } @@ -1061,14 +1152,15 @@ u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *p /* Is c an NF-skippable code point? See unormimp.h. */ U_CAPI UBool U_EXPORT2 unorm_isNFSkippable(UChar32 c, UNormalizationMode mode) { - UErrorCode errorCode; uint32_t norm32, mask; uint16_t aux, fcd; - errorCode=U_ZERO_ERROR; +#if !UNORM_HARDCODE_DATA + UErrorCode errorCode=U_ZERO_ERROR; if(!_haveData(errorCode)) { return FALSE; } +#endif /* handle trivial cases; set the comparison mask for the normal ones */ switch(mode) { @@ -1089,8 +1181,12 @@ unorm_isNFSkippable(UChar32 c, UNormalizationMode mode) { break; case UNORM_FCD: /* FCD: skippable if lead cc==0 and trail cc<=1 */ - UTRIE_GET16(&fcdTrie, c, fcd); - return fcd<=1; + if(fcdTrie.index!=NULL) { + UTRIE_GET16(&fcdTrie, c, fcd); + return fcd<=1; + } else { + return FALSE; + } default: return FALSE; } @@ -1118,7 +1214,7 @@ unorm_isNFSkippable(UChar32 c, UNormalizationMode mode) { /* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */ /* NF*C, test (f) flag */ - if(!formatVersion_2_2) { + if(!formatVersion_2_2 || auxTrie.index==NULL) { return FALSE; /* no (f) data, say not skippable to be safe */ } @@ -1129,17 +1225,19 @@ unorm_isNFSkippable(UChar32 c, UNormalizationMode mode) { } U_CAPI void U_EXPORT2 -unorm_addPropertyStarts(USetAdder *sa, UErrorCode *pErrorCode) { +unorm_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) { UChar c; - if(U_FAILURE(*pErrorCode) || !_haveData(*pErrorCode)) { + if(!_haveData(*pErrorCode)) { return; } /* add the start code point of each same-value range of each trie */ utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, sa); - utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, sa); - if(formatVersion_2_1) { + if(fcdTrie.index!=NULL) { + utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, sa); + } + if(auxTrie.index!=NULL) { utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, sa); } @@ -1151,19 +1249,20 @@ unorm_addPropertyStarts(USetAdder *sa, UErrorCode *pErrorCode) { sa->add(sa->set, HANGUL_BASE+HANGUL_COUNT); /* add Hangul+1 to continue with other properties */ } -U_CAPI UNormalizationCheckResult U_EXPORT2 +U_CFUNC UNormalizationCheckResult U_EXPORT2 unorm_getQuickCheck(UChar32 c, UNormalizationMode mode) { static const uint32_t qcMask[UNORM_MODE_COUNT]={ 0, 0, _NORM_QC_NFD, _NORM_QC_NFKD, _NORM_QC_NFC, _NORM_QC_NFKC }; - UErrorCode errorCode; uint32_t norm32; - errorCode=U_ZERO_ERROR; +#if !UNORM_HARDCODE_DATA + UErrorCode errorCode=U_ZERO_ERROR; if(!_haveData(errorCode)) { return UNORM_YES; } +#endif UTRIE_GET32(&normTrie, c, norm32); norm32&=qcMask[mode]; @@ -1177,13 +1276,20 @@ unorm_getQuickCheck(UChar32 c, UNormalizationMode mode) { } } -U_CAPI uint16_t U_EXPORT2 +U_CFUNC uint16_t U_EXPORT2 unorm_getFCD16FromCodePoint(UChar32 c) { - UErrorCode errorCode; uint16_t fcd; - +#if !UNORM_HARDCODE_DATA + UErrorCode errorCode; errorCode=U_ZERO_ERROR; - if(!_haveData(errorCode)) { +#endif + + if( +#if !UNORM_HARDCODE_DATA + !_haveData(errorCode) || +#endif + fcdTrie.index==NULL + ) { return 0; } @@ -1411,9 +1517,13 @@ _findNextStarter(const UChar *src, const UChar *limit, U_CAPI int32_t U_EXPORT2 unorm_getDecomposition(UChar32 c, UBool compat, UChar *dest, int32_t destCapacity) { +#if !UNORM_HARDCODE_DATA UErrorCode errorCode=U_ZERO_ERROR; +#endif if( (uint32_t)c<=0x10ffff && +#if !UNORM_HARDCODE_DATA _haveData(errorCode) && +#endif ((dest!=NULL && destCapacity>0) || destCapacity==0) ) { uint32_t norm32, qcMask; @@ -1522,6 +1632,8 @@ _decompose(UChar *dest, int32_t destCapacity, /* avoid compiler warnings */ norm32=0; c=0; + cc=0; + trailCC=0; if(srcLength>=0) { /* string with length */ @@ -2153,7 +2265,7 @@ _composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_ /* decompose [prevStarter..src[ */ length=_decompose(buffer, bufferCapacity, - prevStarter, src-prevStarter, + prevStarter, (int32_t)(src-prevStarter), compat, nx, trailCC); if(length>bufferCapacity) { @@ -2162,7 +2274,7 @@ _composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_ return NULL; } length=_decompose(buffer, bufferCapacity, - prevStarter, src-prevStarter, + prevStarter, (int32_t)(src-prevStarter), compat, nx, trailCC); } @@ -2174,7 +2286,7 @@ _composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_ } /* return with a pointer to the recomposition and its length */ - length=recomposeLimit-buffer; + length=(int32_t)(recomposeLimit-buffer); return buffer; } @@ -3006,6 +3118,10 @@ _quickCheck(const UChar *src, options=_NORM_OPTIONS_COMPAT; break; case UNORM_FCD: + if(fcdTrie.index==NULL) { + *pErrorCode=U_UNSUPPORTED_ERROR; + return UNORM_MAYBE; + } return unorm_checkFCD(src, srcLength, nx) ? UNORM_YES : UNORM_NO; default: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; @@ -3218,6 +3334,10 @@ unorm_internalNormalizeWithNX(UChar *dest, int32_t destCapacity, options|_NORM_OPTIONS_COMPAT, nx, pErrorCode); break; case UNORM_FCD: + if(fcdTrie.index==NULL) { + *pErrorCode=U_UNSUPPORTED_ERROR; + return 0; + } return unorm_makeFCD(dest, destCapacity, src, srcLength, nx, @@ -3483,8 +3603,13 @@ unorm_previous(UCharIterator *src, } switch(mode) { - case UNORM_NFD: case UNORM_FCD: + if(fcdTrie.index==NULL) { + *pErrorCode=U_UNSUPPORTED_ERROR; + return 0; + } + /* fall through to NFD */ + case UNORM_NFD: isPreviousBoundary=_isPrevNFDSafe; minC=_NORM_MIN_WITH_LEAD_CC; mask=_NORM_CC_MASK|_NORM_QC_NFD; @@ -3731,8 +3856,13 @@ unorm_next(UCharIterator *src, } switch(mode) { - case UNORM_NFD: case UNORM_FCD: + if(fcdTrie.index==NULL) { + *pErrorCode=U_UNSUPPORTED_ERROR; + return 0; + } + /* fall through to NFD */ + case UNORM_NFD: isNextBoundary=_isNextNFDSafe; minC=_NORM_MIN_WITH_LEAD_CC; mask=_NORM_CC_MASK|_NORM_QC_NFD; @@ -3966,127 +4096,4 @@ unorm_concatenate(const UChar *left, int32_t leftLength, return u_terminateUChars(dest, destCapacity, destLength, pErrorCode); } -/* data swapping ------------------------------------------------------------ */ - -U_CAPI int32_t U_EXPORT2 -unorm_swap(const UDataSwapper *ds, - const void *inData, int32_t length, void *outData, - UErrorCode *pErrorCode) { - const UDataInfo *pInfo; - int32_t headerSize; - - const uint8_t *inBytes; - uint8_t *outBytes; - - const int32_t *inIndexes; - int32_t indexes[32]; - - int32_t i, offset, count, size; - - /* udata_swapDataHeader checks the arguments */ - headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { - return 0; - } - - /* check data format and format version */ - pInfo=(const UDataInfo *)((const char *)inData+4); - if(!( - pInfo->dataFormat[0]==0x4e && /* dataFormat="Norm" */ - pInfo->dataFormat[1]==0x6f && - pInfo->dataFormat[2]==0x72 && - pInfo->dataFormat[3]==0x6d && - pInfo->formatVersion[0]==2 - )) { - udata_printError(ds, "unorm_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unorm.icu\n", - pInfo->dataFormat[0], pInfo->dataFormat[1], - pInfo->dataFormat[2], pInfo->dataFormat[3], - pInfo->formatVersion[0]); - *pErrorCode=U_UNSUPPORTED_ERROR; - return 0; - } - - inBytes=(const uint8_t *)inData+headerSize; - outBytes=(uint8_t *)outData+headerSize; - - inIndexes=(const int32_t *)inBytes; - - if(length>=0) { - length-=headerSize; - if(length<32*4) { - udata_printError(ds, "unorm_swap(): too few bytes (%d after header) for unorm.icu\n", - length); - *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; - return 0; - } - } - - /* read the first 32 indexes (ICU 2.8/format version 2.2: _NORM_INDEX_TOP==32, might grow) */ - for(i=0; i<32; ++i) { - indexes[i]=udata_readInt32(ds, inIndexes[i]); - } - - /* calculate the total length of the data */ - size= - 32*4+ /* size of indexes[] */ - indexes[_NORM_INDEX_TRIE_SIZE]+ - indexes[_NORM_INDEX_UCHAR_COUNT]*2+ - indexes[_NORM_INDEX_COMBINE_DATA_COUNT]*2+ - indexes[_NORM_INDEX_FCD_TRIE_SIZE]+ - indexes[_NORM_INDEX_AUX_TRIE_SIZE]+ - indexes[_NORM_INDEX_CANON_SET_COUNT]*2; - - if(length>=0) { - if(lengthswapArray32(ds, inBytes, count, outBytes, pErrorCode); - offset+=count; - - /* swap the main UTrie */ - count=indexes[_NORM_INDEX_TRIE_SIZE]; - utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode); - offset+=count; - - /* swap the uint16_t extraData[] and the uint16_t combiningTable[] */ - count=(indexes[_NORM_INDEX_UCHAR_COUNT]+indexes[_NORM_INDEX_COMBINE_DATA_COUNT])*2; - ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode); - offset+=count; - - /* swap the FCD UTrie */ - count=indexes[_NORM_INDEX_FCD_TRIE_SIZE]; - if(count!=0) { - utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode); - offset+=count; - } - - /* swap the aux UTrie */ - count=indexes[_NORM_INDEX_AUX_TRIE_SIZE]; - if(count!=0) { - utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode); - offset+=count; - } - - /* swap the uint16_t combiningTable[] */ - count=indexes[_NORM_INDEX_CANON_SET_COUNT]*2; - ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode); - offset+=count; - } - - return headerSize+size; -} - #endif /* #if !UCONFIG_NO_NORMALIZATION */