X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/b75a7d8f3b4adbae880cab104ce2c6a50eee4db2..7393aa2fd2c40c89f12c2db881842a563afdb429:/icuSources/common/unorm.cpp diff --git a/icuSources/common/unorm.cpp b/icuSources/common/unorm.cpp index c2e05253..00ee9ec3 100644 --- a/icuSources/common/unorm.cpp +++ b/icuSources/common/unorm.cpp @@ -1,6 +1,6 @@ /* ****************************************************************************** -* Copyright (c) 1996-2003, International Business Machines +* Copyright (c) 1996-2007, International Business Machines * Corporation and others. All Rights Reserved. ****************************************************************************** * File unorm.cpp @@ -24,23 +24,24 @@ #include "unicode/utypes.h" -// moved up to make unorm_cmpEquivFold work without normalization -#include "unicode/ustring.h" -#include "unormimp.h" -#include "ustr_imp.h" - #if !UCONFIG_NO_NORMALIZATION #include "unicode/udata.h" #include "unicode/uchar.h" +#include "unicode/ustring.h" #include "unicode/uiter.h" #include "unicode/uniset.h" #include "unicode/usetiter.h" #include "unicode/unorm.h" +#include "ucln_cmn.h" +#include "unormimp.h" +#include "ucase.h" #include "cmemory.h" #include "umutex.h" #include "utrie.h" #include "unicode/uset.h" +#include "udataswp.h" +#include "putilimp.h" /* * Status of tailored normalization @@ -75,7 +76,9 @@ * except that this is not implemented for Jamo * - c is treated as having a combining class of 0 */ -#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0])) +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) + +U_NAMESPACE_USE /* * This new implementation of the normalization code loads its data from @@ -98,12 +101,25 @@ enum { */ enum { _NORM_OPTIONS_NX_MASK=0x1f, - _NORM_OPTIONS_UNICODE_MASK=0xe0, - _NORM_OPTIONS_SETS_MASK=0xff, + _NORM_OPTIONS_UNICODE_MASK=0x60, + _NORM_OPTIONS_SETS_MASK=0x7f, + + _NORM_OPTIONS_UNICODE_SHIFT=5, + + /* + * The following options are used only in some composition functions. + * They use bits 12 and up to preserve lower bits for the available options + * space in unorm_compare() - + * see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT. + */ - _NORM_OPTIONS_UNICODE_SHIFT=5 + /** Options bit 12, for compatibility vs. canonical decomposition. */ + _NORM_OPTIONS_COMPAT=0x1000, + /** Options bit 13, no discontiguous composition (FCC vs. NFC). */ + _NORM_OPTIONS_COMPOSE_CONTIGUOUS=0x2000 }; +U_CDECL_BEGIN static inline UBool isHangulWithoutJamoT(UChar c) { c-=HANGUL_BASE; @@ -134,10 +150,10 @@ isNorm32HangulOrJamo(uint32_t norm32) { * Given isNorm32HangulOrJamo(), * is this a Hangul syllable or a Jamo? */ -static inline UBool +/*static inline UBool isHangulJamoNorm32HangulOrJamoL(uint32_t norm32) { return norm32<_NORM_MIN_JAMO_V; -} +}*/ /* * Given norm32 for Jamo V or T, @@ -148,24 +164,38 @@ isJamoVTNorm32JamoV(uint32_t norm32) { return norm32<_NORM_JAMO_V_TOP; } -/* some prototypes ---------------------------------------------------------- */ +/* load unorm.dat ----------------------------------------------------------- */ -static const UChar * -_findPreviousStarter(const UChar *start, const UChar *src, - uint32_t ccOrQCMask, uint32_t decompQCMask, UChar minNoMaybe); +/* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */ +static int32_t U_CALLCONV +getFoldingNormOffset(uint32_t norm32) { + if(isNorm32LeadSurrogate(norm32)) { + return + UTRIE_BMP_INDEX_LENGTH+ + (((int32_t)norm32>>(_NORM_EXTRA_SHIFT-UTRIE_SURROGATE_BLOCK_BITS))& + (0x3ff<>(_NORM_EXTRA_SHIFT-UTRIE_SURROGATE_BLOCK_BITS))& - (0x3ff<add(sa->set, start); return TRUE; } U_CDECL_END +#if !UNORM_HARDCODE_DATA + static int8_t loadNormData(UErrorCode &errorCode) { /* load Unicode normalization data from file */ @@ -293,6 +311,7 @@ loadNormData(UErrorCode &errorCode) { if(haveNormData==0) { UTrie _normTrie={ 0,0,0,0,0,0,0 }, _fcdTrie={ 0,0,0,0,0,0,0 }, _auxTrie={ 0,0,0,0,0,0,0 }; UDataMemory *data; + const int32_t *p=NULL; const uint8_t *pb; @@ -313,11 +332,12 @@ loadNormData(UErrorCode &errorCode) { _normTrie.getFoldingOffset=getFoldingNormOffset; pb+=p[_NORM_INDEX_TRIE_SIZE]+p[_NORM_INDEX_UCHAR_COUNT]*2+p[_NORM_INDEX_COMBINE_DATA_COUNT]*2; - utrie_unserialize(&_fcdTrie, pb, p[_NORM_INDEX_FCD_TRIE_SIZE], &errorCode); - _fcdTrie.getFoldingOffset=getFoldingFCDOffset; - if(p[_NORM_INDEX_FCD_TRIE_SIZE]!=0) { - pb+=p[_NORM_INDEX_FCD_TRIE_SIZE]; + utrie_unserialize(&_fcdTrie, pb, p[_NORM_INDEX_FCD_TRIE_SIZE], &errorCode); + } + pb+=p[_NORM_INDEX_FCD_TRIE_SIZE]; + + if(p[_NORM_INDEX_AUX_TRIE_SIZE]!=0) { utrie_unserialize(&_auxTrie, pb, p[_NORM_INDEX_AUX_TRIE_SIZE], &errorCode); _auxTrie.getFoldingOffset=getFoldingAuxOffset; } @@ -341,7 +361,6 @@ loadNormData(UErrorCode &errorCode) { } else { p=(const int32_t *)udata_getMemory(normData); } - umtx_unlock(NULL); /* initialize some variables */ extraData=(uint16_t *)((uint8_t *)(p+_NORM_INDEX_TOP)+indexes[_NORM_INDEX_TRIE_SIZE]); @@ -354,6 +373,8 @@ loadNormData(UErrorCode &errorCode) { (indexes[_NORM_INDEX_FCD_TRIE_SIZE]+indexes[_NORM_INDEX_AUX_TRIE_SIZE])/2; } haveNormData=1; + ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup); + umtx_unlock(NULL); /* if a different thread set it first, then close the extra data */ if(data!=NULL) { @@ -364,14 +385,24 @@ loadNormData(UErrorCode &errorCode) { return haveNormData; } +#endif + static inline UBool _haveData(UErrorCode &errorCode) { - if(haveNormData!=0) { +#if UNORM_HARDCODE_DATA + return U_SUCCESS(errorCode); +#else + if(U_FAILURE(errorCode)) { + return FALSE; + } else if(haveNormData>0) { + return TRUE; + } else if(haveNormData<0) { errorCode=dataErrorCode; - return (UBool)(haveNormData>0); - } else { + return FALSE; + } else /* haveNormData==0 */ { return (UBool)(loadNormData(errorCode)>0); } +#endif } U_CAPI UBool U_EXPORT2 @@ -438,6 +469,38 @@ _getExtraData(uint32_t norm32) { return extraData+(norm32>>_NORM_EXTRA_SHIFT); } +#if 0 +/* + * It is possible to get the FCD data from the main trie if unorm.icu + * was built without the FCD trie, although it is slower. + * This is not implemented because it is hard to test, and because it seems + * unusual to want to use FCD and not build the data file for it. + * + * Untested sample code: + */ +static inline uint16_t +_getFCD16FromNormData(UChar32 c) { + uint32_t norm32, fcd; + + norm32=_getNorm32(c); + if((norm32&_NORM_QC_NFD) && isNorm32Regular(norm32)) { + /* get the lead/trail cc from the decomposition data */ + const uint16_t *nfd=_getExtraData(norm32); + if(*nfd&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) { + fcd=nfd[1]; + } + } else { + fcd=norm32&_NORM_CC_MASK; + if(fcd!=0) { + /* use the code point cc value for both lead and trail cc's */ + fcd|=fcd>>_NORM_CC_SHIFT; /* assume that the cc is in bits 15..8 */ + } + } + + return (uint16_t)fcd; +} +#endif + /* normalization exclusion sets --------------------------------------------- */ /* @@ -451,13 +514,9 @@ _getExtraData(uint32_t norm32) { static const UnicodeSet * internalGetNXHangul(UErrorCode &errorCode) { /* internal function, does not check for incoming U_FAILURE */ - UBool isCached; - /* do this because double-checked locking is broken */ - umtx_lock(NULL); - isCached=nxCache[UNORM_NX_HANGUL]!=NULL; - umtx_unlock(NULL); + UMTX_CHECK(NULL, (UBool)(nxCache[UNORM_NX_HANGUL]!=NULL), isCached); if(!isCached) { UnicodeSet *set=new UnicodeSet(0xac00, 0xd7a3); @@ -465,11 +524,14 @@ internalGetNXHangul(UErrorCode &errorCode) { errorCode=U_MEMORY_ALLOCATION_ERROR; return NULL; } + // Compact the set for caching. + set->compact(); umtx_lock(NULL); if(nxCache[UNORM_NX_HANGUL]==NULL) { nxCache[UNORM_NX_HANGUL]=set; set=NULL; + ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup); } umtx_unlock(NULL); @@ -479,113 +541,56 @@ internalGetNXHangul(UErrorCode &errorCode) { return nxCache[UNORM_NX_HANGUL]; } +/* unorm.cpp 1.116 had and used static const UnicodeSet * -internalGetNXCJKCompat(UErrorCode &errorCode) { - /* internal function, does not check for incoming U_FAILURE */ - - UBool isCached; - - /* do this because double-checked locking is broken */ - umtx_lock(NULL); - isCached=nxCache[UNORM_NX_CJK_COMPAT]!=NULL; - umtx_unlock(NULL); - - if(!isCached) { - /* build a set from [CJK Ideographs]&[has canonical decomposition] */ - UnicodeSet *set, *hasDecomp; - - set=new UnicodeSet(UNICODE_STRING("[:Ideographic:]", 15), errorCode); - if(set==NULL) { - errorCode=U_MEMORY_ALLOCATION_ERROR; - return NULL; - } - if(U_FAILURE(errorCode)) { - delete set; - return NULL; - } - - /* start with an empty set for [has canonical decomposition] */ - hasDecomp=new UnicodeSet(); - if(hasDecomp==NULL) { - delete set; - errorCode=U_MEMORY_ALLOCATION_ERROR; - return NULL; - } - - /* iterate over all ideographs and remember which canonically decompose */ - UnicodeSetIterator it(*set); - UChar32 start, end; - uint32_t norm32; - - while(it.nextRange() && !it.isString()) { - start=it.getCodepoint(); - end=it.getCodepointEnd(); - while(start<=end) { - UTRIE_GET32(&normTrie, start, norm32); - if(norm32&_NORM_QC_NFD) { - hasDecomp->add(start); - } - ++start; - } - } - - /* hasDecomp now contains all ideographs that decompose canonically */ - - umtx_lock(NULL); - if(nxCache[UNORM_NX_CJK_COMPAT]==NULL) { - nxCache[UNORM_NX_CJK_COMPAT]=hasDecomp; - hasDecomp=NULL; - } - umtx_unlock(NULL); - - delete hasDecomp; - delete set; - } - - return nxCache[UNORM_NX_CJK_COMPAT]; +internalGetNXFromPattern(int32_t options, const char *pattern, UErrorCode &errorCode) { + ... } +*/ +/* get and set an exclusion set from a serialized UnicodeSet */ static const UnicodeSet * -internalGetNXUnicode(uint32_t options, UErrorCode &errorCode) { +internalGetSerializedNX(int32_t options, int32_t nxIndex, UErrorCode &errorCode) { /* internal function, does not check for incoming U_FAILURE */ - options&=_NORM_OPTIONS_UNICODE_MASK; - if(options==0) { - return NULL; - } - UBool isCached; - /* do this because double-checked locking is broken */ - umtx_lock(NULL); - isCached=nxCache[options]!=NULL; - umtx_unlock(NULL); + UMTX_CHECK(NULL, (UBool)(nxCache[options]!=NULL), isCached); - if(!isCached) { - /* build a set with all code points that were not designated by the specified Unicode version */ + if( !isCached && + canonStartSets!=NULL && + canonStartSets[nxIndex]!=0 && canonStartSets[nxIndex+1]>canonStartSets[nxIndex] + ) { + USerializedSet sset; UnicodeSet *set; + UChar32 start, end; + int32_t i; - switch(options) { - case UNORM_UNICODE_3_2: - set=new UnicodeSet(UNICODE_STRING("[:^Age=3.2:]", 12), errorCode); - break; - default: - errorCode=U_ILLEGAL_ARGUMENT_ERROR; + if( !uset_getSerializedSet( + &sset, + canonStartSets+canonStartSets[nxIndex], + canonStartSets[nxIndex+1]-canonStartSets[nxIndex]) + ) { + errorCode=U_INVALID_FORMAT_ERROR; return NULL; } + /* turn the serialized set into a UnicodeSet */ + set=new UnicodeSet(); if(set==NULL) { errorCode=U_MEMORY_ALLOCATION_ERROR; return NULL; } - if(U_FAILURE(errorCode)) { - delete set; - return NULL; + for(i=0; uset_getSerializedRange(&sset, i, &start, &end); ++i) { + set->add(start, end); } + // Compact the set for caching. + set->compact(); umtx_lock(NULL); if(nxCache[options]==NULL) { nxCache[options]=set; set=NULL; + ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup); } umtx_unlock(NULL); @@ -595,6 +600,37 @@ internalGetNXUnicode(uint32_t options, UErrorCode &errorCode) { return nxCache[options]; } +static const UnicodeSet * +internalGetNXCJKCompat(UErrorCode &errorCode) { + /* build a set from [[:Ideographic:]&[:NFD_QC=No:]]=[CJK Ideographs]&[has canonical decomposition] */ + return internalGetSerializedNX( + UNORM_NX_CJK_COMPAT, + _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET, + errorCode); +} + +static const UnicodeSet * +internalGetNXUnicode(uint32_t options, UErrorCode &errorCode) { + /* internal function, does not check for incoming U_FAILURE */ + int32_t nxIndex; + + options&=_NORM_OPTIONS_UNICODE_MASK; + switch(options) { + case 0: + return NULL; + case UNORM_UNICODE_3_2: + /* [:^Age=3.2:] */ + nxIndex=_NORM_SET_INDEX_NX_UNICODE32_OFFSET; + break; + default: + errorCode=U_ILLEGAL_ARGUMENT_ERROR; + return NULL; + } + + /* build a set with all code points that were not designated by the specified Unicode version */ + return internalGetSerializedNX(options, nxIndex, errorCode); +} + /* Get a decomposition exclusion set. The data must be loaded. */ static const UnicodeSet * internalGetNX(int32_t options, UErrorCode &errorCode) { @@ -602,10 +638,7 @@ internalGetNX(int32_t options, UErrorCode &errorCode) { UBool isCached; - /* do this because double-checked locking is broken */ - umtx_lock(NULL); - isCached=nxCache[options]!=NULL; - umtx_unlock(NULL); + UMTX_CHECK(NULL, (UBool)(nxCache[options]!=NULL), isCached); if(!isCached) { /* return basic sets */ @@ -643,11 +676,14 @@ internalGetNX(int32_t options, UErrorCode &errorCode) { delete set; return NULL; } + // Compact the set for caching. + set->compact(); umtx_lock(NULL); if(nxCache[options]==NULL) { nxCache[options]=set; set=NULL; + ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup); } umtx_unlock(NULL); @@ -667,6 +703,11 @@ getNX(int32_t options, UErrorCode &errorCode) { } } +U_CFUNC const UnicodeSet * +unorm_getNX(int32_t options, UErrorCode *pErrorCode) { + return getNX(options, *pErrorCode); +} + static inline UBool nx_contains(const UnicodeSet *nx, UChar32 c) { return nx!=NULL && nx->contains(c); @@ -735,10 +776,15 @@ _decompose(uint32_t norm32, int32_t &length, * @return pointer to decomposition, or 0 if none * @internal */ -static const UChar * -_decompose(UChar32 c, UChar buffer[4], int32_t &length) { +U_CFUNC const UChar * +unorm_getCanonicalDecomposition(UChar32 c, UChar buffer[4], int32_t *pLength) { uint32_t norm32; + if(c0) { buffer[2]=(UChar)(JAMO_T_BASE+c2); - length=3; + *pLength=3; } else { - length=2; + *pLength=2; } buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT); @@ -762,7 +808,7 @@ _decompose(UChar32 c, UChar buffer[4], int32_t &length) { } else { /* normal decomposition */ uint8_t cc, trailCC; - return _decompose(norm32, length, cc, trailCC); + return _decompose(norm32, *pLength, cc, trailCC); } } else { return 0; @@ -910,21 +956,29 @@ _isTrueStarter(uint32_t norm32, uint32_t ccOrQCMask, uint32_t decompQCMask) { /* uchar.h */ U_CAPI uint8_t U_EXPORT2 u_getCombiningClass(UChar32 c) { +#if !UNORM_HARDCODE_DATA UErrorCode errorCode=U_ZERO_ERROR; if(_haveData(errorCode)) { +#endif uint32_t norm32; UTRIE_GET32(&normTrie, c, norm32); return (uint8_t)(norm32>>_NORM_CC_SHIFT); +#if !UNORM_HARDCODE_DATA } else { return 0; } +#endif } -U_CAPI UBool U_EXPORT2 +U_CFUNC UBool U_EXPORT2 unorm_internalIsFullCompositionExclusion(UChar32 c) { +#if UNORM_HARDCODE_DATA + if(auxTrie.index!=NULL) { +#else UErrorCode errorCode=U_ZERO_ERROR; - if(_haveData(errorCode) && formatVersion_2_1) { + if(_haveData(errorCode) && auxTrie.index!=NULL) { +#endif uint16_t aux; UTRIE_GET16(&auxTrie, c, aux); @@ -934,10 +988,14 @@ unorm_internalIsFullCompositionExclusion(UChar32 c) { } } -U_CAPI UBool U_EXPORT2 +U_CFUNC UBool U_EXPORT2 unorm_isCanonSafeStart(UChar32 c) { +#if UNORM_HARDCODE_DATA + if(auxTrie.index!=NULL) { +#else UErrorCode errorCode=U_ZERO_ERROR; - if(_haveData(errorCode) && formatVersion_2_1) { + if(_haveData(errorCode) && auxTrie.index!=NULL) { +#endif uint16_t aux; UTRIE_GET16(&auxTrie, c, aux); @@ -947,11 +1005,24 @@ unorm_isCanonSafeStart(UChar32 c) { } } +U_CAPI void U_EXPORT2 +unorm_getUnicodeVersion(UVersionInfo *versionInfo, UErrorCode *pErrorCode){ + if(unorm_haveData(pErrorCode)){ + uprv_memcpy(*versionInfo, dataVersion, 4); + } +} + + U_CAPI UBool U_EXPORT2 unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet) { +#if !UNORM_HARDCODE_DATA UErrorCode errorCode=U_ZERO_ERROR; +#endif if( fillSet!=NULL && (uint32_t)c<=0x10ffff && - _haveData(errorCode) && canonStartSets!=NULL +#if !UNORM_HARDCODE_DATA + _haveData(errorCode) && +#endif + canonStartSets!=NULL ) { const uint16_t *table; int32_t i, start, limit; @@ -1051,7 +1122,7 @@ u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *p *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; return 0; } - if(!_haveData(*pErrorCode) || !formatVersion_2_1) { + if(!_haveData(*pErrorCode) || auxTrie.index==NULL) { return 0; } @@ -1081,14 +1152,15 @@ u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *p /* Is c an NF-skippable code point? See unormimp.h. */ U_CAPI UBool U_EXPORT2 unorm_isNFSkippable(UChar32 c, UNormalizationMode mode) { - UErrorCode errorCode; uint32_t norm32, mask; uint16_t aux, fcd; - errorCode=U_ZERO_ERROR; +#if !UNORM_HARDCODE_DATA + UErrorCode errorCode=U_ZERO_ERROR; if(!_haveData(errorCode)) { return FALSE; } +#endif /* handle trivial cases; set the comparison mask for the normal ones */ switch(mode) { @@ -1109,8 +1181,12 @@ unorm_isNFSkippable(UChar32 c, UNormalizationMode mode) { break; case UNORM_FCD: /* FCD: skippable if lead cc==0 and trail cc<=1 */ - UTRIE_GET16(&fcdTrie, c, fcd); - return fcd<=1; + if(fcdTrie.index!=NULL) { + UTRIE_GET16(&fcdTrie, c, fcd); + return fcd<=1; + } else { + return FALSE; + } default: return FALSE; } @@ -1138,7 +1214,7 @@ unorm_isNFSkippable(UChar32 c, UNormalizationMode mode) { /* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */ /* NF*C, test (f) flag */ - if(!formatVersion_2_2) { + if(!formatVersion_2_2 || auxTrie.index==NULL) { return FALSE; /* no (f) data, say not skippable to be safe */ } @@ -1149,7 +1225,7 @@ unorm_isNFSkippable(UChar32 c, UNormalizationMode mode) { } U_CAPI void U_EXPORT2 -unorm_addPropertyStarts(USet *set, UErrorCode *pErrorCode) { +unorm_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) { UChar c; if(!_haveData(*pErrorCode)) { @@ -1157,18 +1233,68 @@ unorm_addPropertyStarts(USet *set, UErrorCode *pErrorCode) { } /* add the start code point of each same-value range of each trie */ - utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, set); - utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set); - if(formatVersion_2_1) { - utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, set); + utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, sa); + if(fcdTrie.index!=NULL) { + utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, sa); + } + if(auxTrie.index!=NULL) { + utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, sa); } /* add Hangul LV syllables and LV+1 because of skippables */ for(c=HANGUL_BASE; cadd(sa->set, c); + sa->add(sa->set, c+1); + } + sa->add(sa->set, HANGUL_BASE+HANGUL_COUNT); /* add Hangul+1 to continue with other properties */ +} + +U_CFUNC UNormalizationCheckResult U_EXPORT2 +unorm_getQuickCheck(UChar32 c, UNormalizationMode mode) { + static const uint32_t qcMask[UNORM_MODE_COUNT]={ + 0, 0, _NORM_QC_NFD, _NORM_QC_NFKD, _NORM_QC_NFC, _NORM_QC_NFKC + }; + + uint32_t norm32; + +#if !UNORM_HARDCODE_DATA + UErrorCode errorCode=U_ZERO_ERROR; + if(!_haveData(errorCode)) { + return UNORM_YES; + } +#endif + + UTRIE_GET32(&normTrie, c, norm32); + norm32&=qcMask[mode]; + + if(norm32==0) { + return UNORM_YES; + } else if(norm32&_NORM_QC_ANY_NO) { + return UNORM_NO; + } else /* _NORM_QC_ANY_MAYBE */ { + return UNORM_MAYBE; + } +} + +U_CFUNC uint16_t U_EXPORT2 +unorm_getFCD16FromCodePoint(UChar32 c) { + uint16_t fcd; +#if !UNORM_HARDCODE_DATA + UErrorCode errorCode; + errorCode=U_ZERO_ERROR; +#endif + + if( +#if !UNORM_HARDCODE_DATA + !_haveData(errorCode) || +#endif + fcdTrie.index==NULL + ) { + return 0; } - uset_add(set, HANGUL_BASE+HANGUL_COUNT); /* add Hangul+1 to continue with other properties */ + + UTRIE_GET16(&fcdTrie, c, fcd); + return fcd; } /* reorder UTF-16 in-place -------------------------------------------------- */ @@ -1313,177 +1439,202 @@ _mergeOrdered(UChar *start, UChar *current, } } -/* quick check functions ---------------------------------------------------- */ - -static UBool -unorm_checkFCD(const UChar *src, int32_t srcLength, const UnicodeSet *nx) { - const UChar *limit; +/* find the last true starter in [start..src[ and return the pointer to it */ +static const UChar * +_findPreviousStarter(const UChar *start, const UChar *src, + uint32_t ccOrQCMask, uint32_t decompQCMask, UChar minNoMaybe) { + uint32_t norm32; UChar c, c2; - uint16_t fcd16; - int16_t prevCC, cc; - - /* initialize */ - prevCC=0; - if(srcLength>=0) { - /* string with length */ - limit=src+srcLength; - } else /* srcLength==-1 */ { - /* zero-terminated string */ - limit=NULL; + while(start0) || destCapacity==0) + ) { + uint32_t norm32, qcMask; + UChar32 minNoMaybe; + int32_t length; + + /* initialize */ + if(!compat) { + minNoMaybe=(UChar32)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]; + qcMask=_NORM_QC_NFD; } else { - c2=0; + minNoMaybe=(UChar32)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]; + qcMask=_NORM_QC_NFKD; } - if(nx_contains(nx, c, c2)) { - prevCC=0; /* excluded: fcd16==0 */ - continue; + if(c0) { + dest[0]=(UChar)c; + } + return -1; } - /* - * prevCC has values from the following ranges: - * 0..0xff - the previous trail combining class - * <0 - the negative value of the previous code unit; - * that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16() - * was deferred so that average text is checked faster - */ + /* data lookup */ + UTRIE_GET32(&normTrie, c, norm32); + if((norm32&qcMask)==0) { + /* simple case: no decomposition */ + if(c<=0xffff) { + if(destCapacity>0) { + dest[0]=(UChar)c; + } + return -1; + } else { + if(destCapacity>=2) { + dest[0]=UTF16_LEAD(c); + dest[1]=UTF16_TRAIL(c); + } + return -2; + } + } else if(isNorm32HangulOrJamo(norm32)) { + /* Hangul syllable: decompose algorithmically */ + UChar c2; - /* check the combining order */ - cc=(int16_t)(fcd16>>8); - if(cc!=0) { - if(prevCC<0) { - /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */ - if(!nx_contains(nx, (UChar32)-prevCC)) { - prevCC=(int16_t)(_getFCD16((UChar)-prevCC)&0xff); - } else { - prevCC=0; /* excluded: fcd16==0 */ + c-=HANGUL_BASE; + + c2=(UChar)(c%JAMO_T_COUNT); + c/=JAMO_T_COUNT; + if(c2>0) { + if(destCapacity>=3) { + dest[2]=(UChar)(JAMO_T_BASE+c2); } + length=3; + } else { + length=2; } - if(cc=2) { + dest[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT); + dest[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT); } + return length; + } else { + /* c decomposes, get everything from the variable-length extra data */ + const UChar *p, *limit; + uint8_t cc, trailCC; + + p=_decompose(norm32, qcMask, length, cc, trailCC); + if(length<=destCapacity) { + limit=p+length; + do { + *dest++=*p++; + } while(p=0) { /* string with length */ limit=src+srcLength; @@ -1495,640 +1646,708 @@ _quickCheck(const UChar *src, U_ALIGN_CODE(16); for(;;) { - /* skip a run of code units below the minimum or with irrelevant data for the quick check */ + /* count code units below the minimum or with irrelevant data for the quick check */ + prevSrc=src; if(limit==NULL) { - for(;;) { - c=*src++; - if(c=minNoMaybe && ((norm32=_getNorm32(c))&ccOrQCMask)!=0) { - break; - } + while(src!=limit && ((c=*src)>_NORM_CC_SHIFT); - if(cc!=0 && cc0) { + buffer[2]=(UChar)(JAMO_T_BASE+c2); + length=3; + } else { + length=2; } - prevStarter=_findPreviousStarter(start, prevStarter, ccOrQCMask, decompQCMask, minNoMaybe); - /* find the next true starter in [src..limit[ - modifies src to point to the next starter */ - src=_findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe); - - /* decompose and recompose [prevStarter..src[ */ - _composePart(stackBuffer, buffer, bufferCapacity, - length, - prevStarter, - src, - qcMask, - prevCC, nx, pErrorCode); - if(U_FAILURE(*pErrorCode)) { - result=UNORM_MAYBE; /* error (out of memory) */ - break; + buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT); + buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT); + } + } else { + if(isNorm32Regular(norm32)) { + c2=0; + length=1; + } else { + /* c is a lead surrogate, get the real norm32 */ + if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) { + ++src; + length=2; + norm32=_getNorm32FromSurrogatePair(norm32, c2); + } else { + c2=0; + length=1; + norm32=0; } + } - /* compare the normalized version with the original */ - if(0!=uprv_strCompare(prevStarter, (int32_t)(src-prevStarter), buffer, length, FALSE, FALSE)) { - result=UNORM_NO; /* normalization differs */ - break; + /* get the decomposition and the lead and trail cc's */ + if(nx_contains(nx, c, c2)) { + /* excluded: norm32==0 */ + cc=trailCC=0; + p=NULL; + } else if((norm32&qcMask)==0) { + /* c does not decompose */ + cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT); + p=NULL; + } else { + /* c decomposes, get everything from the variable-length extra data */ + p=_decompose(norm32, qcMask, length, cc, trailCC); + if(length==1) { + /* fastpath a single code unit from decomposition */ + c=*p; + c2=0; + p=NULL; } + } + } - /* continue after the next starter */ + /* append the decomposition to the destination buffer, assume length>0 */ + if((destIndex+length)<=destCapacity) { + UChar *reorderSplit=dest+destIndex; + if(p==NULL) { + /* fastpath: single code point */ + if(cc!=0 && cc0); + } } + } else { + /* buffer overflow */ + /* keep incrementing the destIndex for preflighting */ + destIndex+=length; } - } -endloop: - if(buffer!=stackBuffer) { - uprv_free(buffer); + prevCC=trailCC; + if(prevCC==0) { + reorderStartIndex=destIndex; + } } - return result; + outTrailCC=prevCC; + return destIndex; } -U_CAPI UNormalizationCheckResult U_EXPORT2 -unorm_quickCheck(const UChar *src, - int32_t srcLength, - UNormalizationMode mode, - UErrorCode *pErrorCode) { - return _quickCheck(src, srcLength, mode, TRUE, NULL, pErrorCode); -} +U_CAPI int32_t U_EXPORT2 +unorm_decompose(UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + UBool compat, int32_t options, + UErrorCode *pErrorCode) { + const UnicodeSet *nx; + int32_t destIndex; + uint8_t trailCC; -U_CAPI UNormalizationCheckResult U_EXPORT2 -unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength, - UNormalizationMode mode, int32_t options, - UErrorCode *pErrorCode) { - return _quickCheck(src, srcLength, mode, TRUE, getNX(options, *pErrorCode), pErrorCode); -} + if(!_haveData(*pErrorCode)) { + return 0; + } -U_CAPI UBool U_EXPORT2 -unorm_isNormalized(const UChar *src, int32_t srcLength, - UNormalizationMode mode, - UErrorCode *pErrorCode) { - return (UBool)(UNORM_YES==_quickCheck(src, srcLength, mode, FALSE, NULL, pErrorCode)); -} + nx=getNX(options, *pErrorCode); + if(U_FAILURE(*pErrorCode)) { + return 0; + } -U_CAPI UBool U_EXPORT2 -unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength, - UNormalizationMode mode, int32_t options, - UErrorCode *pErrorCode) { - return (UBool)(UNORM_YES==_quickCheck(src, srcLength, mode, FALSE, getNX(options, *pErrorCode), pErrorCode)); + destIndex=_decompose(dest, destCapacity, + src, srcLength, + compat, nx, + trailCC); + + return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode); } -/* make NFD & NFKD ---------------------------------------------------------- */ +/* make NFC & NFKC ---------------------------------------------------------- */ -U_CAPI int32_t U_EXPORT2 -unorm_getDecomposition(UChar32 c, UBool compat, - UChar *dest, int32_t destCapacity) { - UErrorCode errorCode=U_ZERO_ERROR; - if( (uint32_t)c<=0x10ffff && - _haveData(errorCode) && - ((dest!=NULL && destCapacity>0) || destCapacity==0) - ) { - uint32_t norm32, qcMask; - UChar32 minNoMaybe; - int32_t length; +/* get the composition properties of the next character */ +static inline uint32_t +_getNextCombining(UChar *&p, const UChar *limit, + UChar &c, UChar &c2, + uint16_t &combiningIndex, uint8_t &cc, + const UnicodeSet *nx) { + uint32_t norm32, combineFlags; - /* initialize */ - if(!compat) { - minNoMaybe=(UChar32)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]; - qcMask=_NORM_QC_NFD; - } else { - minNoMaybe=(UChar32)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]; - qcMask=_NORM_QC_NFKD; - } + /* get properties */ + c=*p++; + norm32=_getNorm32(c); - if(c0) { - dest[0]=(UChar)c; - } - return -1; - } + /* preset output values for most characters */ + c2=0; + combiningIndex=0; + cc=0; - /* data lookup */ - UTRIE_GET32(&normTrie, c, norm32); - if((norm32&qcMask)==0) { - /* simple case: no decomposition */ - if(c<=0xffff) { - if(destCapacity>0) { - dest[0]=(UChar)c; - } - return -1; - } else { - if(destCapacity>=2) { - dest[0]=UTF16_LEAD(c); - dest[1]=UTF16_TRAIL(c); - } - return -2; - } + if((norm32&(_NORM_CC_MASK|_NORM_COMBINES_ANY))==0) { + return 0; + } else { + if(isNorm32Regular(norm32)) { + /* set cc etc. below */ } else if(isNorm32HangulOrJamo(norm32)) { - /* Hangul syllable: decompose algorithmically */ - UChar c2; - - c-=HANGUL_BASE; - - c2=(UChar)(c%JAMO_T_COUNT); - c/=JAMO_T_COUNT; - if(c2>0) { - if(destCapacity>=3) { - dest[2]=(UChar)(JAMO_T_BASE+c2); - } - length=3; + /* a compatibility decomposition contained Jamos */ + combiningIndex=(uint16_t)(0xfff0|(norm32>>_NORM_EXTRA_SHIFT)); + return norm32&_NORM_COMBINES_ANY; + } else { + /* c is a lead surrogate, get the real norm32 */ + if(p!=limit && UTF_IS_SECOND_SURROGATE(c2=*p)) { + ++p; + norm32=_getNorm32FromSurrogatePair(norm32, c2); } else { - length=2; + c2=0; + return 0; } + } - if(destCapacity>=2) { - dest[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT); - dest[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT); - } - return length; - } else { - /* c decomposes, get everything from the variable-length extra data */ - const UChar *p, *limit; - uint8_t cc, trailCC; + if(nx_contains(nx, c, c2)) { + return 0; /* excluded: norm32==0 */ + } - p=_decompose(norm32, qcMask, length, cc, trailCC); - if(length<=destCapacity) { - limit=p+length; - do { - *dest++=*p++; - } while(p>_NORM_CC_SHIFT); + + combineFlags=norm32&_NORM_COMBINES_ANY; + if(combineFlags!=0) { + combiningIndex=*(_getExtraData(norm32)-1); } - } else { - return 0; + return combineFlags; } } -static int32_t -_decompose(UChar *dest, int32_t destCapacity, - const UChar *src, int32_t srcLength, - UBool compat, const UnicodeSet *nx, - uint8_t &outTrailCC) { - UChar buffer[3]; - const UChar *limit, *prevSrc, *p; - uint32_t norm32, ccOrQCMask, qcMask; - int32_t destIndex, reorderStartIndex, length; - UChar c, c2, minNoMaybe; - uint8_t cc, prevCC, trailCC; +/* + * given a composition-result starter (c, c2) - which means its cc==0, + * it combines forward, it has extra data, its norm32!=0, + * it is not a Hangul or Jamo, + * get just its combineFwdIndex + * + * norm32(c) is special if and only if c2!=0 + */ +static inline uint16_t +_getCombiningIndexFromStarter(UChar c, UChar c2) { + uint32_t norm32; - if(!compat) { - minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]; - qcMask=_NORM_QC_NFD; - } else { - minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]; - qcMask=_NORM_QC_NFKD; + norm32=_getNorm32(c); + if(c2!=0) { + norm32=_getNorm32FromSurrogatePair(norm32, c2); } + return *(_getExtraData(norm32)-1); +} - /* initialize */ - ccOrQCMask=_NORM_CC_MASK|qcMask; - destIndex=reorderStartIndex=0; - prevCC=0; - - /* avoid compiler warnings */ - norm32=0; - c=0; +/* + * Find the recomposition result for + * a forward-combining character + * (specified with a pointer to its part of the combiningTable[]) + * and a backward-combining character + * (specified with its combineBackIndex). + * + * If these two characters combine, then set (value, value2) + * with the code unit(s) of the composition character. + * + * Return value: + * 0 do not combine + * 1 combine + * >1 combine, and the composition is a forward-combining starter + * + * See unormimp.h for a description of the composition table format. + */ +static inline uint16_t +_combine(const uint16_t *table, uint16_t combineBackIndex, + uint16_t &value, uint16_t &value2) { + uint16_t key; - if(srcLength>=0) { - /* string with length */ - limit=src+srcLength; - } else /* srcLength==-1 */ { - /* zero-terminated string */ - limit=NULL; + /* search in the starter's composition table */ + for(;;) { + key=*table++; + if(key>=combineBackIndex) { + break; + } + table+= *table&0x8000 ? 2 : 1; } - U_ALIGN_CODE(16); + /* mask off bit 15, the last-entry-in-the-list flag */ + if((key&0x7fff)==combineBackIndex) { + /* found! combine! */ + value=*table; - for(;;) { - /* count code units below the minimum or with irrelevant data for the quick check */ - prevSrc=src; - if(limit==NULL) { - while((c=*src)0) { - buffer[2]=(UChar)(JAMO_T_BASE+c2); - length=3; - } else { - length=2; + p=_decompose(norm32, _NORM_QC_NFKD, length, cc, trailCC); + if(length==1 && (t=(UChar)(*p-JAMO_T_BASE))>_NORM_CC_SHIFT); - p=NULL; - } else { - /* c decomposes, get everything from the variable-length extra data */ - p=_decompose(norm32, qcMask, length, cc, trailCC); - if(length==1) { - /* fastpath a single code unit from decomposition */ - c=*p; - c2=0; - p=NULL; - } + if(dest!=0) { + *dest=c; } + return TRUE; } - - /* append the decomposition to the destination buffer, assume length>0 */ - if((destIndex+length)<=destCapacity) { - UChar *reorderSplit=dest+destIndex; - if(p==NULL) { - /* fastpath: single code point */ - if(cc!=0 && cc0); - } - } - } else { - /* buffer overflow */ - /* keep incrementing the destIndex for preflighting */ - destIndex+=length; + } else if(isHangulWithoutJamoT(prev)) { + /* c is a Jamo T, compose with previous Hangul LV that does not contain a Jamo T */ + c=(UChar)(prev+(c-JAMO_T_BASE)); + if(nx_contains(nx, c)) { + return FALSE; } - - prevCC=trailCC; - if(prevCC==0) { - reorderStartIndex=destIndex; + if(dest!=0) { + *dest=c; } + return TRUE; } - - outTrailCC=prevCC; - return destIndex; + return FALSE; } -U_CAPI int32_t U_EXPORT2 -unorm_decompose(UChar *dest, int32_t destCapacity, - const UChar *src, int32_t srcLength, - UBool compat, int32_t options, - UErrorCode *pErrorCode) { - const UnicodeSet *nx; - int32_t destIndex; - uint8_t trailCC; +/* + * recompose the characters in [p..limit[ + * (which is in NFD - decomposed and canonically ordered), + * adjust limit, and return the trailing cc + * + * since for NFKC we may get Jamos in decompositions, we need to + * recompose those too + * + * note that recomposition never lengthens the text: + * any character consists of either one or two code units; + * a composition may contain at most one more code unit than the original starter, + * while the combining mark that is removed has at least one code unit + */ +static uint8_t +_recompose(UChar *p, UChar *&limit, int32_t options, const UnicodeSet *nx) { + UChar *starter, *pRemove, *q, *r; + uint32_t combineFlags; + UChar c, c2; + uint16_t combineFwdIndex, combineBackIndex; + uint16_t result, value, value2; + uint8_t cc, prevCC; + UBool starterIsSupplementary; - if(!_haveData(*pErrorCode)) { - return 0; - } + starter=NULL; /* no starter */ + combineFwdIndex=0; /* will not be used until starter!=NULL - avoid compiler warnings */ + combineBackIndex=0; /* will always be set if combineFlags!=0 - avoid compiler warnings */ + value=value2=0; /* always set by _combine() before used - avoid compiler warnings */ + starterIsSupplementary=FALSE; /* will not be used until starter!=NULL - avoid compiler warnings */ + prevCC=0; - nx=getNX(options, *pErrorCode); - if(U_FAILURE(*pErrorCode)) { - return 0; - } + for(;;) { + combineFlags=_getNextCombining(p, limit, c, c2, combineBackIndex, cc, nx); + if((combineFlags&_NORM_COMBINES_BACK) && starter!=NULL) { + if(combineBackIndex&0x8000) { + /* c is a Jamo V/T, see if we can compose it with the previous character */ + /* for the PRI #29 fix, check that there is no intervening combining mark */ + if((options&UNORM_BEFORE_PRI_29) || prevCC==0) { + pRemove=NULL; /* NULL while no Hangul composition */ + combineFlags=0; + c2=*starter; + if(combineBackIndex==0xfff2) { + /* Jamo V, compose with previous Jamo L and following Jamo T */ + c2=(UChar)(c2-JAMO_L_BASE); + if(c2 + * the rest of the loop body will reset starter to NULL; + * technically, a composed Hangul syllable is a starter, but it + * does not combine forward now that we have consumed all eligible Jamos; + * for Jamo V/T, combineFlags does not contain _NORM_COMBINES_FWD + */ - return src; -} + } else if( + /* the starter is not a Hangul LV or Jamo V/T and */ + !(combineFwdIndex&0x8000) && + /* the combining mark is not blocked and */ + ((options&UNORM_BEFORE_PRI_29) ? + (prevCC!=cc || prevCC==0) : + (prevCC>_NORM_CC_SHIFT); - p=NULL; - } else { - /* c decomposes, get everything from the variable-length extra data */ - p=_decompose(norm32, length, cc, trailCC); - if(length==1) { - /* fastpath a single code unit from decomposition */ - c=*p; - c2=0; - p=NULL; - } - } + /* done? */ + if(p==limit) { + return prevCC; + } - /* append the decomposition to the destination buffer, assume length>0 */ - if((destIndex+length)<=destCapacity) { - UChar *reorderSplit=dest+destIndex; - if(p==NULL) { - /* fastpath: single code point */ - if(cc!=0 && cc1) { + combineFwdIndex=_getCombiningIndexFromStarter((UChar)value, (UChar)value2); } else { - /* just append (c, c2) */ - dest[destIndex++]=c; - if(c2!=0) { - dest[destIndex++]=c2; - } + starter=NULL; } - } else { - /* general: multiple code points (ordered by themselves) from decomposition */ - if(cc!=0 && cc0); + starterIsSupplementary=TRUE; + starter=p-2; } + combineFwdIndex=combineBackIndex; + } else { + /* it will not combine with anything */ + starter=NULL; } - } else { - /* buffer overflow */ - /* keep incrementing the destIndex for preflighting */ - destIndex+=length; + } else if(options&_NORM_OPTIONS_COMPOSE_CONTIGUOUS) { + /* FCC: no discontiguous compositions; any intervening character blocks */ + starter=NULL; } + } +} - prevCC=trailCC; - if(prevCC==0) { - reorderStartIndex=destIndex; +/* decompose and recompose [prevStarter..src[ */ +static const UChar * +_composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_t &length, + const UChar *prevStarter, const UChar *src, + uint8_t &prevCC, + int32_t options, const UnicodeSet *nx, + UErrorCode *pErrorCode) { + UChar *recomposeLimit; + uint8_t trailCC; + UBool compat; + + compat=(UBool)((options&_NORM_OPTIONS_COMPAT)!=0); + + /* decompose [prevStarter..src[ */ + length=_decompose(buffer, bufferCapacity, + prevStarter, (int32_t)(src-prevStarter), + compat, nx, + trailCC); + if(length>bufferCapacity) { + if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*length, 0)) { + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; + return NULL; } + length=_decompose(buffer, bufferCapacity, + prevStarter, (int32_t)(src-prevStarter), + compat, nx, + trailCC); } - return prevCC; + /* recompose the decomposition */ + recomposeLimit=buffer+length; + if(length>=2) { + prevCC=_recompose(buffer, recomposeLimit, options, nx); + } + + /* return with a pointer to the recomposition and its length */ + length=(int32_t)(recomposeLimit-buffer); + return buffer; } static int32_t -unorm_makeFCD(UChar *dest, int32_t destCapacity, - const UChar *src, int32_t srcLength, - const UnicodeSet *nx, - UErrorCode *pErrorCode) { - const UChar *limit, *prevSrc, *decompStart; - int32_t destIndex, length; - UChar c, c2; - uint16_t fcd16; - int16_t prevCC, cc; +_compose(UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + int32_t options, const UnicodeSet *nx, + UErrorCode *pErrorCode) { + UChar stackBuffer[_STACK_BUFFER_CAPACITY]; + UChar *buffer; + int32_t bufferCapacity; - if(!_haveData(*pErrorCode)) { - return 0; + const UChar *limit, *prevSrc, *prevStarter; + uint32_t norm32, ccOrQCMask, qcMask; + int32_t destIndex, reorderStartIndex, length; + UChar c, c2, minNoMaybe; + uint8_t cc, prevCC; + + if(options&_NORM_OPTIONS_COMPAT) { + minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]; + qcMask=_NORM_QC_NFKC; + } else { + minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]; + qcMask=_NORM_QC_NFC; } /* initialize */ - decompStart=src; - destIndex=0; + buffer=stackBuffer; + bufferCapacity=_STACK_BUFFER_CAPACITY; + + /* + * prevStarter points to the last character before the current one + * that is a "true" starter with cc==0 and quick check "yes". + * + * prevStarter will be used instead of looking for a true starter + * while incrementally decomposing [prevStarter..prevSrc[ + * in _composePart(). Having a good prevStarter allows to just decompose + * the entire [prevStarter..prevSrc[. + * + * When _composePart() backs out from prevSrc back to prevStarter, + * then it also backs out destIndex by the same amount. + * Therefore, at all times, the (prevSrc-prevStarter) source units + * must correspond 1:1 to destination units counted with destIndex, + * except for reordering. + * This is true for the qc "yes" characters copied in the fast loop, + * and for pure reordering. + * prevStarter must be set forward to src when this is not true: + * In _composePart() and after composing a Hangul syllable. + * + * This mechanism relies on the assumption that the decomposition of a true starter + * also begins with a true starter. gennorm/store.c checks for this. + */ + prevStarter=src; + + ccOrQCMask=_NORM_CC_MASK|qcMask; + destIndex=reorderStartIndex=0; prevCC=0; /* avoid compiler warnings */ + norm32=0; c=0; - fcd16=0; if(srcLength>=0) { /* string with length */ @@ -2141,46 +2360,20 @@ unorm_makeFCD(UChar *dest, int32_t destCapacity, U_ALIGN_CODE(16); for(;;) { - /* skip a run of code units below the minimum or with irrelevant data for the FCD check */ + /* count code units below the minimum or with irrelevant data for the quick check */ prevSrc=src; if(limit==NULL) { - for(;;) { - c=*src; - if(c<_NORM_MIN_WITH_LEAD_CC) { - if(c==0) { - break; - } - prevCC=(int16_t)-c; - } else if((fcd16=_getFCD16(c))==0) { - prevCC=0; - } else { - break; - } + while((c=*src)0 && + _composeHangul( + *(prevSrc-1), c, norm32, src, limit, (UBool)((options&_NORM_OPTIONS_COMPAT)!=0), + destIndex<=destCapacity ? dest+(destIndex-1) : 0, + nx) + ) { + prevStarter=src; + continue; + } + + /* the Jamo V/T did not compose into a Hangul syllable, just append to dest */ + c2=0; + length=1; + prevStarter=prevSrc; + } else { + if(isNorm32Regular(norm32)) { + c2=0; + length=1; + } else { + /* c is a lead surrogate, get the real norm32 */ + if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) { + ++src; + length=2; + norm32=_getNorm32FromSurrogatePair(norm32, c2); } else { - prevCC=0; /* excluded: fcd16==0 */ + /* c is an unpaired lead surrogate, nothing to do */ + c2=0; + length=1; + norm32=0; } + } + + /* we are looking at the character (c, c2) at [prevSrc..src[ */ + if(nx_contains(nx, c, c2)) { + /* excluded: norm32==0 */ + cc=0; + } else if((norm32&qcMask)==0) { + cc=(uint8_t)(norm32>>_NORM_CC_SHIFT); + } else { + const UChar *p; + uint32_t decompQCMask; + + /* + * find appropriate boundaries around this character, + * decompose the source text from between the boundaries, + * and recompose it + * + * this puts the intermediate text into the side buffer because + * it might be longer than the recomposition end result, + * or the destination buffer may be too short or missing + * + * note that destIndex may be adjusted backwards to account + * for source text that passed the quick check but needed to + * take part in the recomposition + */ + decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */ /* - * set a pointer to this below-U+0300 character; - * if prevCC==0 then it will moved to after this character below + * find the last true starter in [prevStarter..src[ + * it is either the decomposition of the current character (at prevSrc), + * or prevStarter */ - decompStart=prevSrc-1; - } - } - /* - * now: - * prevSrc==src - used later to adjust destIndex before decomposition - * prevCC>=0 - */ + if(_isTrueStarter(norm32, ccOrQCMask, decompQCMask)) { + prevStarter=prevSrc; + } else { + /* adjust destIndex: back out what had been copied with qc "yes" */ + destIndex-=(int32_t)(prevSrc-prevStarter); + } - /* end of source reached? */ - if(limit==NULL ? c==0 : src==limit) { - break; - } + /* find the next true starter in [src..limit[ - modifies src to point to the next starter */ + src=_findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe); - /* set a pointer to after the last source position where prevCC==0 */ - if(prevCC==0) { - decompStart=prevSrc; - } + /* compose [prevStarter..src[ */ + p=_composePart(stackBuffer, buffer, bufferCapacity, + length, /* output */ + prevStarter, src, + prevCC, /* output */ + options, nx, + pErrorCode); - /* c already contains *src and fcd16 is set for it, increment src */ - ++src; + if(p==NULL) { + destIndex=0; /* an error occurred (out of memory) */ + break; + } - /* check one above-minimum, relevant code unit */ - if(UTF_IS_FIRST_SURROGATE(c)) { - /* c is a lead surrogate, get the real fcd16 */ - if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) { - ++src; - fcd16=_getFCD16FromSurrogatePair(fcd16, c2); - } else { - c2=0; - fcd16=0; - } - } else { - c2=0; - } + /* append the recomposed buffer contents to the destination buffer */ + if((destIndex+length)<=destCapacity) { + while(length>0) { + dest[destIndex++]=*p++; + --length; + } + } else { + /* buffer overflow */ + /* keep incrementing the destIndex for preflighting */ + destIndex+=length; + } - /* we are looking at the character (c, c2) at [prevSrc..src[ */ - if(nx_contains(nx, c, c2)) { - fcd16=0; /* excluded: fcd16==0 */ - } + /* set the next starter */ + prevStarter=src; - /* check the combining order, get the lead cc */ - cc=(int16_t)(fcd16>>8); - if(cc==0 || cc>=prevCC) { - /* the order is ok */ - if(cc==0) { - decompStart=prevSrc; + continue; } - prevCC=(int16_t)(fcd16&0xff); + } - /* just append (c, c2) */ - length= c2==0 ? 1 : 2; - if((destIndex+length)<=destCapacity) { + /* append the single code point (c, c2) to the destination buffer */ + if((destIndex+length)<=destCapacity) { + if(cc!=0 && cc>_NORM_EXTRA_SHIFT)); - return norm32&_NORM_COMBINES_ANY; - } else { - /* c is a lead surrogate, get the real norm32 */ - if(p!=limit && UTF_IS_SECOND_SURROGATE(c2=*p)) { - ++p; - norm32=_getNorm32FromSurrogatePair(norm32, c2); - } else { - c2=0; - return 0; - } - } + } - if(nx_contains(nx, c, c2)) { - return 0; /* excluded: norm32==0 */ - } + nx=getNX(options, *pErrorCode); + if(U_FAILURE(*pErrorCode)) { + return 0; + } - cc=(uint8_t)(norm32>>_NORM_CC_SHIFT); + /* reset options bits that should only be set here or inside _compose() */ + options&=~(_NORM_OPTIONS_SETS_MASK|_NORM_OPTIONS_COMPAT|_NORM_OPTIONS_COMPOSE_CONTIGUOUS); - combineFlags=norm32&_NORM_COMBINES_ANY; - if(combineFlags!=0) { - combiningIndex=*(_getExtraData(norm32)-1); - } - return combineFlags; + if(compat) { + options|=_NORM_OPTIONS_COMPAT; } -} -/* - * given a composition-result starter (c, c2) - which means its cc==0, - * it combines forward, it has extra data, its norm32!=0, - * it is not a Hangul or Jamo, - * get just its combineFwdIndex - * - * norm32(c) is special if and only if c2!=0 - */ -static inline uint16_t -_getCombiningIndexFromStarter(UChar c, UChar c2) { - uint32_t norm32; + destIndex=_compose(dest, destCapacity, + src, srcLength, + options, nx, + pErrorCode); - norm32=_getNorm32(c); - if(c2!=0) { - norm32=_getNorm32FromSurrogatePair(norm32, c2); - } - return *(_getExtraData(norm32)-1); + return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode); } -/* - * Find the recomposition result for - * a forward-combining character - * (specified with a pointer to its part of the combiningTable[]) - * and a backward-combining character - * (specified with its combineBackIndex). - * - * If these two characters combine, then set (value, value2) - * with the code unit(s) of the composition character. - * - * Return value: - * 0 do not combine - * 1 combine - * >1 combine, and the composition is a forward-combining starter - * - * See unormimp.h for a description of the composition table format. - */ -static inline uint16_t -_combine(const uint16_t *table, uint16_t combineBackIndex, - uint16_t &value, uint16_t &value2) { - uint16_t key; +/* make FCD ----------------------------------------------------------------- */ - /* search in the starter's composition table */ +static const UChar * +_findSafeFCD(const UChar *src, const UChar *limit, uint16_t fcd16) { + UChar c, c2; + + /* + * find the first position in [src..limit[ after some cc==0 according to FCD data + * + * at the beginning of the loop, we have fcd16 from before src + * + * stop at positions: + * - after trail cc==0 + * - at the end of the source + * - before lead cc==0 + */ for(;;) { - key=*table++; - if(key>=combineBackIndex) { + /* stop if trail cc==0 for the previous character */ + if((fcd16&0xff)==0) { break; } - table+= *table&0x8000 ? 2 : 1; - } - /* mask off bit 15, the last-entry-in-the-list flag */ - if((key&0x7fff)==combineBackIndex) { - /* found! combine! */ - value=*table; + /* get c=*src - stop at end of string */ + if(src==limit) { + break; + } + c=*src; - /* is the composition a starter that combines forward? */ - key=(uint16_t)((value&0x2000)+1); + /* stop if lead cc==0 for this character */ + if(c<_NORM_MIN_WITH_LEAD_CC || (fcd16=_getFCD16(c))==0) { + break; /* catches terminating NUL, too */ + } - /* get the composition result code point from the variable-length result value */ - if(value&0x8000) { - if(value&0x4000) { - /* surrogate pair composition result */ - value=(uint16_t)((value&0x3ff)|0xd800); - value2=*(table+1); - } else { - /* BMP composition result U+2000..U+ffff */ - value=*(table+1); - value2=0; + if(!UTF_IS_FIRST_SURROGATE(c)) { + if(fcd16<=0xff) { + break; + } + ++src; + } else if((src+1)!=limit && (c2=*(src+1), UTF_IS_SECOND_SURROGATE(c2))) { + /* c is a lead surrogate, get the real fcd16 */ + fcd16=_getFCD16FromSurrogatePair(fcd16, c2); + if(fcd16<=0xff) { + break; } + src+=2; } else { - /* BMP composition result U+0000..U+1fff */ - value&=0x1fff; - value2=0; + /* c is an unpaired first surrogate, lead cc==0 */ + break; } - - return key; - } else { - /* not found */ - return 0; } + + return src; } -/* - * recompose the characters in [p..limit[ - * (which is in NFD - decomposed and canonically ordered), - * adjust limit, and return the trailing cc - * - * since for NFKC we may get Jamos in decompositions, we need to - * recompose those too - * - * note that recomposition never lengthens the text: - * any character consists of either one or two code units; - * a composition may contain at most one more code unit than the original starter, - * while the combining mark that is removed has at least one code unit - */ static uint8_t -_recompose(UChar *p, UChar *&limit, const UnicodeSet *nx) { - UChar *starter, *pRemove, *q, *r; - uint32_t combineFlags; +_decomposeFCD(const UChar *src, const UChar *decompLimit, + UChar *dest, int32_t &destIndex, int32_t destCapacity, + const UnicodeSet *nx) { + const UChar *p; + uint32_t norm32; + int32_t reorderStartIndex, length; UChar c, c2; - uint16_t combineFwdIndex, combineBackIndex; - uint16_t result, value, value2; - uint8_t cc, prevCC; - UBool starterIsSupplementary; + uint8_t cc, prevCC, trailCC; - starter=NULL; /* no starter */ - combineFwdIndex=0; /* will not be used until starter!=NULL - avoid compiler warnings */ - combineBackIndex=0; /* will always be set if combineFlags!=0 - avoid compiler warnings */ - value=value2=0; /* always set by _combine() before used - avoid compiler warnings */ - starterIsSupplementary=FALSE; /* will not be used until starter!=NULL - avoid compiler warnings */ + /* + * canonically decompose [src..decompLimit[ + * + * all characters in this range have some non-zero cc, + * directly or in decomposition, + * so that we do not need to check in the following for quick-check limits etc. + * + * there _are_ _no_ Hangul syllables or Jamos in here because they are FCD-safe (cc==0)! + * + * we also do not need to check for c==0 because we have an established decompLimit + */ + reorderStartIndex=destIndex; prevCC=0; - for(;;) { - combineFlags=_getNextCombining(p, limit, c, c2, combineBackIndex, cc, nx); - if((combineFlags&_NORM_COMBINES_BACK) && starter!=NULL) { - if(combineBackIndex&0x8000) { - /* c is a Jamo V/T, see if we can compose it with the previous character */ - pRemove=NULL; /* NULL while no Hangul composition */ - c2=*starter; - if(combineBackIndex==0xfff2) { - /* Jamo V, compose with previous Jamo L and following Jamo T */ - c2=(UChar)(c2-JAMO_L_BASE); - if(c2 - * the rest of the loop body will reset starter to NULL; - * technically, a composed Hangul syllable is a starter, but it - * does not combine forward now that we have consumed all eligible Jamos; - * for Jamo V/T, combineFlags does not contain _NORM_COMBINES_FWD - */ - - } else if( - /* the starter is not a Jamo V/T and */ - !(combineFwdIndex&0x8000) && - /* the combining mark is not blocked and */ - (prevCC1) { - combineFwdIndex=_getCombiningIndexFromStarter((UChar)value, (UChar)value2); - } else { - starter=NULL; - } - - /* we combined and set prevCC, continue with looking for compositions */ - continue; + while(src>_NORM_CC_SHIFT); + p=NULL; + } else { + /* c decomposes, get everything from the variable-length extra data */ + p=_decompose(norm32, length, cc, trailCC); + if(length==1) { + /* fastpath a single code unit from decomposition */ + c=*p; + c2=0; + p=NULL; + } } - /* if (c, c2) did not combine, then check if it is a starter */ - if(cc==0) { - /* found a new starter; combineFlags==0 if (c, c2) is excluded */ - if(combineFlags&_NORM_COMBINES_FWD) { - /* it may combine with something, prepare for it */ - if(c2==0) { - starterIsSupplementary=FALSE; - starter=p-1; + /* append the decomposition to the destination buffer, assume length>0 */ + if((destIndex+length)<=destCapacity) { + UChar *reorderSplit=dest+destIndex; + if(p==NULL) { + /* fastpath: single code point */ + if(cc!=0 && cc0); + } } + } else { + /* buffer overflow */ + /* keep incrementing the destIndex for preflighting */ + destIndex+=length; + } + + prevCC=trailCC; + if(prevCC==0) { + reorderStartIndex=destIndex; } } + + return prevCC; } -/* find the last true starter in [start..src[ and return the pointer to it */ -static const UChar * -_findPreviousStarter(const UChar *start, const UChar *src, - uint32_t ccOrQCMask, uint32_t decompQCMask, UChar minNoMaybe) { - uint32_t norm32; +static int32_t +unorm_makeFCD(UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + const UnicodeSet *nx, + UErrorCode *pErrorCode) { + const UChar *limit, *prevSrc, *decompStart; + int32_t destIndex, length; UChar c, c2; + uint16_t fcd16; + int16_t prevCC, cc; - while(start=0) { + /* string with length */ + limit=src+srcLength; + } else /* srcLength==-1 */ { + /* zero-terminated string */ + limit=NULL; + } + + U_ALIGN_CODE(16); for(;;) { - if(src==limit) { - break; /* end of string */ - } - c=*src; - if(c=0 + */ - /* (c, c2) is not a true starter but its decomposition may be */ - if(norm32&decompQCMask) { - /* (c, c2) decomposes, get everything from the variable-length extra data */ - p=_decompose(norm32, decompQCMask, length, cc, trailCC); + /* end of source reached? */ + if(limit==NULL ? c==0 : src==limit) { + break; + } - /* get the first character's norm32 to check if it is a true starter */ - if(cc==0 && (_getNorm32(p, qcMask)&qcMask)==0) { - break; /* true starter */ + /* set a pointer to after the last source position where prevCC==0 */ + if(prevCC==0) { + decompStart=prevSrc; + } + + /* c already contains *src and fcd16 is set for it, increment src */ + ++src; + + /* check one above-minimum, relevant code unit */ + if(UTF_IS_FIRST_SURROGATE(c)) { + /* c is a lead surrogate, get the real fcd16 */ + if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) { + ++src; + fcd16=_getFCD16FromSurrogatePair(fcd16, c2); + } else { + c2=0; + fcd16=0; } + } else { + c2=0; } - src+= c2==0 ? 1 : 2; /* not a true starter, continue */ - } + /* we are looking at the character (c, c2) at [prevSrc..src[ */ + if(nx_contains(nx, c, c2)) { + fcd16=0; /* excluded: fcd16==0 */ + } - return src; -} + /* check the combining order, get the lead cc */ + cc=(int16_t)(fcd16>>8); + if(cc==0 || cc>=prevCC) { + /* the order is ok */ + if(cc==0) { + decompStart=prevSrc; + } + prevCC=(int16_t)(fcd16&0xff); -/* decompose and recompose [prevStarter..src[ */ -static const UChar * -_composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_t &length, - const UChar *prevStarter, const UChar *src, - uint32_t qcMask, uint8_t &prevCC, - const UnicodeSet *nx, - UErrorCode *pErrorCode) { - UChar *recomposeLimit; - uint8_t trailCC; - UBool compat; + /* just append (c, c2) */ + length= c2==0 ? 1 : 2; + if((destIndex+length)<=destCapacity) { + dest[destIndex++]=c; + if(c2!=0) { + dest[destIndex++]=c2; + } + } else { + destIndex+=length; + } + } else { + /* + * back out the part of the source that we copied already but + * is now going to be decomposed; + * prevSrc is set to after what was copied + */ + destIndex-=(int32_t)(prevSrc-decompStart); - compat=(UBool)((qcMask&_NORM_QC_NFKC)!=0); + /* + * find the part of the source that needs to be decomposed; + * to be safe and simple, decompose to before the next character with lead cc==0 + */ + src=_findSafeFCD(src, limit, fcd16); - /* decompose [prevStarter..src[ */ - length=_decompose(buffer, bufferCapacity, - prevStarter, src-prevStarter, - compat, nx, - trailCC); - if(length>bufferCapacity) { - if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*length, 0)) { - *pErrorCode=U_MEMORY_ALLOCATION_ERROR; - return NULL; + /* + * the source text does not fulfill the conditions for FCD; + * decompose and reorder a limited piece of the text + */ + prevCC=_decomposeFCD(decompStart, src, + dest, destIndex, destCapacity, + nx); + decompStart=src; } - length=_decompose(buffer, bufferCapacity, - prevStarter, src-prevStarter, - compat, nx, - trailCC); - } - - /* recompose the decomposition */ - recomposeLimit=buffer+length; - if(length>=2) { - prevCC=_recompose(buffer, recomposeLimit, nx); } - /* return with a pointer to the recomposition and its length */ - length=recomposeLimit-buffer; - return buffer; + return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode); } -static inline UBool -_composeHangul(UChar prev, UChar c, uint32_t norm32, const UChar *&src, const UChar *limit, - UBool compat, UChar *dest, const UnicodeSet *nx) { - if(isJamoVTNorm32JamoV(norm32)) { - /* c is a Jamo V, compose with previous Jamo L and following Jamo T */ - prev=(UChar)(prev-JAMO_L_BASE); - if(prev=0) { + /* string with length */ + limit=src+srcLength; + } else /* srcLength==-1 */ { + /* zero-terminated string */ + limit=NULL; + } + + U_ALIGN_CODE(16); + + for(;;) { + /* skip a run of code units below the minimum or with irrelevant data for the FCD check */ + if(limit==NULL) { + for(;;) { + c=*src++; + if(c<_NORM_MIN_WITH_LEAD_CC) { + if(c==0) { + return TRUE; } + /* + * delay _getFCD16(c) for any character <_NORM_MIN_WITH_LEAD_CC + * because chances are good that the next one will have + * a leading cc of 0; + * _getFCD16(-prevCC) is later called when necessary - + * -c fits into int16_t because it is <_NORM_MIN_WITH_LEAD_CC==0x300 + */ + prevCC=(int16_t)-c; + } else if((fcd16=_getFCD16(c))==0) { + prevCC=0; + } else { + break; } } - if(nx_contains(nx, c)) { - if(!isHangulWithoutJamoT(c)) { - --src; /* undo ++src from reading the Jamo T */ + } else { + for(;;) { + if(src==limit) { + return TRUE; + } else if((c=*src++)<_NORM_MIN_WITH_LEAD_CC) { + prevCC=(int16_t)-c; + } else if((fcd16=_getFCD16(c))==0) { + prevCC=0; + } else { + break; } - return FALSE; } - if(dest!=0) { - *dest=c; + } + + /* check one above-minimum, relevant code unit */ + if(UTF_IS_FIRST_SURROGATE(c)) { + /* c is a lead surrogate, get the real fcd16 */ + if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) { + ++src; + fcd16=_getFCD16FromSurrogatePair(fcd16, c2); + } else { + c2=0; + fcd16=0; } - return TRUE; + } else { + c2=0; } - } else if(isHangulWithoutJamoT(prev)) { - /* c is a Jamo T, compose with previous Hangul LV that does not contain a Jamo T */ - c=(UChar)(prev+(c-JAMO_T_BASE)); - if(nx_contains(nx, c)) { - return FALSE; + + if(nx_contains(nx, c, c2)) { + prevCC=0; /* excluded: fcd16==0 */ + continue; } - if(dest!=0) { - *dest=c; + + /* + * prevCC has values from the following ranges: + * 0..0xff - the previous trail combining class + * <0 - the negative value of the previous code unit; + * that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16() + * was deferred so that average text is checked faster + */ + + /* check the combining order */ + cc=(int16_t)(fcd16>>8); + if(cc!=0) { + if(prevCC<0) { + /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */ + if(!nx_contains(nx, (UChar32)-prevCC)) { + prevCC=(int16_t)(_getFCD16((UChar)-prevCC)&0xff); + } else { + prevCC=0; /* excluded: fcd16==0 */ + } + } + + if(cc=0) { /* string with length */ limit=src+srcLength; @@ -2864,244 +3148,152 @@ _compose(UChar *dest, int32_t destCapacity, U_ALIGN_CODE(16); for(;;) { - /* count code units below the minimum or with irrelevant data for the quick check */ - prevSrc=src; + /* skip a run of code units below the minimum or with irrelevant data for the quick check */ if(limit==NULL) { - while((c=*src)=minNoMaybe && ((norm32=_getNorm32(c))&ccOrQCMask)!=0) { + break; + } prevCC=0; - ++src; } } - /* copy these code units all at once */ - if(src!=prevSrc) { - length=(int32_t)(src-prevSrc); - if((destIndex+length)<=destCapacity) { - uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR); - } - destIndex+=length; - reorderStartIndex=destIndex; - - /* set prevStarter to the last character in the quick check loop */ - prevStarter=src-1; - if(UTF_IS_SECOND_SURROGATE(*prevStarter) && prevSrc>_NORM_CC_SHIFT); + if(cc!=0 && cc0 && - _composeHangul( - *(prevSrc-1), c, norm32, src, limit, compat, - destIndex<=destCapacity ? dest+(destIndex-1) : 0, - nx) - ) { - prevStarter=src; - continue; - } - - /* the Jamo V/T did not compose into a Hangul syllable, just append to dest */ - c2=0; - length=1; - prevStarter=prevSrc; - } else { - if(isNorm32Regular(norm32)) { - c2=0; - length=1; - } else { - /* c is a lead surrogate, get the real norm32 */ - if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) { - ++src; - length=2; - norm32=_getNorm32FromSurrogatePair(norm32, c2); - } else { - /* c is an unpaired lead surrogate, nothing to do */ - c2=0; - length=1; - norm32=0; - } - } - - /* we are looking at the character (c, c2) at [prevSrc..src[ */ - if(nx_contains(nx, c, c2)) { - /* excluded: norm32==0 */ - cc=0; - } else if((norm32&qcMask)==0) { - cc=(uint8_t)(norm32>>_NORM_CC_SHIFT); + /* check for "no" or "maybe" quick check flags */ + qcNorm32=norm32&qcMask; + if(qcNorm32&_NORM_QC_ANY_NO) { + result=UNORM_NO; + break; + } else if(qcNorm32!=0) { + /* "maybe" can only occur for NFC and NFKC */ + if(allowMaybe) { + result=UNORM_MAYBE; } else { - const UChar *p; + /* normalize a section around here to see if it is really normalized or not */ + const UChar *prevStarter; uint32_t decompQCMask; + int32_t length; - /* - * find appropriate boundaries around this character, - * decompose the source text from between the boundaries, - * and recompose it - * - * this puts the intermediate text into the side buffer because - * it might be longer than the recomposition end result, - * or the destination buffer may be too short or missing - * - * note that destIndex may be adjusted backwards to account - * for source text that passed the quick check but needed to - * take part in the recomposition - */ decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */ - /* - * find the last true starter in [prevStarter..src[ - * it is either the decomposition of the current character (at prevSrc), - * or prevStarter - */ - if(_isTrueStarter(norm32, ccOrQCMask, decompQCMask)) { - prevStarter=prevSrc; - } else { - /* adjust destIndex: back out what had been copied with qc "yes" */ - destIndex-=(int32_t)(prevSrc-prevStarter); + /* find the previous starter */ + prevStarter=src-1; /* set prevStarter to the beginning of the current character */ + if(UTF_IS_TRAIL(*prevStarter)) { + --prevStarter; /* safe because unpaired surrogates do not result in "maybe" */ } + prevStarter=_findPreviousStarter(start, prevStarter, ccOrQCMask, decompQCMask, minNoMaybe); /* find the next true starter in [src..limit[ - modifies src to point to the next starter */ src=_findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe); - /* compose [prevStarter..src[ */ - p=_composePart(stackBuffer, buffer, bufferCapacity, - length, /* output */ - prevStarter, src, - qcMask, - prevCC, /* output */ - nx, - pErrorCode); - - if(p==NULL) { - destIndex=0; /* an error occurred (out of memory) */ + /* decompose and recompose [prevStarter..src[ */ + _composePart(stackBuffer, buffer, bufferCapacity, + length, + prevStarter, + src, + prevCC, + options, nx, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + result=UNORM_MAYBE; /* error (out of memory) */ break; } - /* append the recomposed buffer contents to the destination buffer */ - if((destIndex+length)<=destCapacity) { - while(length>0) { - dest[destIndex++]=*p++; - --length; - } - } else { - /* buffer overflow */ - /* keep incrementing the destIndex for preflighting */ - destIndex+=length; + /* compare the normalized version with the original */ + if(0!=uprv_strCompare(prevStarter, (int32_t)(src-prevStarter), buffer, length, FALSE, FALSE)) { + result=UNORM_NO; /* normalization differs */ + break; } - /* set the next starter */ - prevStarter=src; - - continue; - } - } - - /* append the single code point (c, c2) to the destination buffer */ - if((destIndex+length)<=destCapacity) { - if(cc!=0 && cc=0 && c2>=0 - - // get complete code points for c1, c2 for lookups if either is a surrogate - cp1=c1; - if(UTF_IS_SURROGATE(c1)) { - UChar c; - - if(UTF_IS_SURROGATE_FIRST(c1)) { - if(s1!=limit1 && UTF_IS_TRAIL(c=*s1)) { - // advance ++s1; only below if cp1 decomposes/case-folds - cp1=UTF16_GET_PAIR_VALUE(c1, c); - } - } else /* isTrail(c1) */ { - if(start1<=(s1-2) && UTF_IS_LEAD(c=*(s1-2))) { - cp1=UTF16_GET_PAIR_VALUE(c, c1); - } - } - } - - cp2=c2; - if(UTF_IS_SURROGATE(c2)) { - UChar c; - - if(UTF_IS_SURROGATE_FIRST(c2)) { - if(s2!=limit2 && UTF_IS_TRAIL(c=*s2)) { - // advance ++s2; only below if cp2 decomposes/case-folds - cp2=UTF16_GET_PAIR_VALUE(c2, c); - } - } else /* isTrail(c2) */ { - if(start2<=(s2-2) && UTF_IS_LEAD(c=*(s2-2))) { - cp2=UTF16_GET_PAIR_VALUE(c, c2); - } - } - } - - // go down one level for each string - // continue with the main loop as soon as there is a real change - - if( level1==0 && (options&U_COMPARE_IGNORE_CASE) && - (length=u_internalFoldCase((UChar32)cp1, fold1, 32, options))>=0 - ) { - // cp1 case-folds to fold1[length] - if(UTF_IS_SURROGATE(c1)) { - if(UTF_IS_SURROGATE_FIRST(c1)) { - // advance beyond source surrogate pair if it case-folds - ++s1; - } else /* isTrail(c1) */ { - // we got a supplementary code point when hitting its trail surrogate, - // therefore the lead surrogate must have been the same as in the other string; - // compare this decomposition with the lead surrogate in the other string - // remember that this simulates bulk text replacement: - // the decomposition would replace the entire code point - --s2; - c2=*(s2-1); - } - } - - // push current level pointers - stack1[0].start=start1; - stack1[0].s=s1; - stack1[0].limit=limit1; - ++level1; - - // set next level pointers to case folding - start1=s1=fold1; - limit1=fold1+length; - - // get ready to read from decomposition, continue with loop - c1=-1; - continue; - } - - if( level2==0 && (options&U_COMPARE_IGNORE_CASE) && - (length=u_internalFoldCase((UChar32)cp2, fold2, 32, options))>=0 - ) { - // cp2 case-folds to fold2[length] - if(UTF_IS_SURROGATE(c2)) { - if(UTF_IS_SURROGATE_FIRST(c2)) { - // advance beyond source surrogate pair if it case-folds - ++s2; - } else /* isTrail(c2) */ { - // we got a supplementary code point when hitting its trail surrogate, - // therefore the lead surrogate must have been the same as in the other string; - // compare this decomposition with the lead surrogate in the other string - // remember that this simulates bulk text replacement: - // the decomposition would replace the entire code point - --s1; - c1=*(s1-1); - } - } - - // push current level pointers - stack2[0].start=start2; - stack2[0].s=s2; - stack2[0].limit=limit2; - ++level2; - - // set next level pointers to case folding - start2=s2=fold2; - limit2=fold2+length; - - // get ready to read from decomposition, continue with loop - c2=-1; - continue; - } - - if( level1<2 && (options&_COMPARE_EQUIV) && - 0!=(p=_decompose((UChar32)cp1, decomp1, length)) - ) { - // cp1 decomposes into p[length] - if(UTF_IS_SURROGATE(c1)) { - if(UTF_IS_SURROGATE_FIRST(c1)) { - // advance beyond source surrogate pair if it decomposes - ++s1; - } else /* isTrail(c1) */ { - // we got a supplementary code point when hitting its trail surrogate, - // therefore the lead surrogate must have been the same as in the other string; - // compare this decomposition with the lead surrogate in the other string - // remember that this simulates bulk text replacement: - // the decomposition would replace the entire code point - --s2; - c2=*(s2-1); - } - } - - // push current level pointers - stack1[level1].start=start1; - stack1[level1].s=s1; - stack1[level1].limit=limit1; - ++level1; - - // set empty intermediate level if skipped - if(level1<2) { - stack1[level1++].start=NULL; - } - - // set next level pointers to decomposition - start1=s1=p; - limit1=p+length; - - // get ready to read from decomposition, continue with loop - c1=-1; - continue; - } - - if( level2<2 && (options&_COMPARE_EQUIV) && - 0!=(p=_decompose((UChar32)cp2, decomp2, length)) - ) { - // cp2 decomposes into p[length] - if(UTF_IS_SURROGATE(c2)) { - if(UTF_IS_SURROGATE_FIRST(c2)) { - // advance beyond source surrogate pair if it decomposes - ++s2; - } else /* isTrail(c2) */ { - // we got a supplementary code point when hitting its trail surrogate, - // therefore the lead surrogate must have been the same as in the other string; - // compare this decomposition with the lead surrogate in the other string - // remember that this simulates bulk text replacement: - // the decomposition would replace the entire code point - --s1; - c1=*(s1-1); - } - } - - // push current level pointers - stack2[level2].start=start2; - stack2[level2].s=s2; - stack2[level2].limit=limit2; - ++level2; - - // set empty intermediate level if skipped - if(level2<2) { - stack2[level2++].start=NULL; - } - - // set next level pointers to decomposition - start2=s2=p; - limit2=p+length; - - // get ready to read from decomposition, continue with loop - c2=-1; - continue; - } - - // no decomposition/case folding, max level for both sides: - // return difference result - - // code point order comparison must not just return cp1-cp2 - // because when single surrogates are present then the surrogate pairs - // that formed cp1 and cp2 may be from different string indexes - - // example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units - // c1=d800 cp1=10001 c2=dc00 cp2=10000 - // cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 } - - // therefore, use same fix-up as in ustring.c/uprv_strCompare() - // except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++ - // so we have slightly different pointer/start/limit comparisons here - - if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) { - /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ - if( - (c1<=0xdbff && s1!=limit1 && UTF_IS_TRAIL(*s1)) || - (UTF_IS_TRAIL(c1) && start1!=(s1-1) && UTF_IS_LEAD(*(s1-2))) - ) { - /* part of a surrogate pair, leave >=d800 */ - } else { - /* BMP code point - may be surrogate code point - make =d800 */ - } else { - /* BMP code point - may be surrogate code point - make >UNORM_COMPARE_NORM_OPTIONS_SHIFT), *pErrorCode); - if(U_FAILURE(*pErrorCode)) { - return 0; - } - - d1=d2=0; - options|=_COMPARE_EQUIV; - result=0; - - /* - * UAX #21 Case Mappings, as fixed for Unicode version 4 - * (see Jitterbug 2021), defines a canonical caseless match as - * - * A string X is a canonical caseless match - * for a string Y if and only if - * NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y))) - * - * For better performance, we check for FCD (or let the caller tell us that - * both strings are in FCD) for the inner normalization. - * BasicNormalizerTest::FindFoldFCDExceptions() makes sure that - * case-folding preserves the FCD-ness of a string. - * The outer normalization is then only performed by unorm_cmpEquivFold() - * when there is a difference. - * - * Exception: When using the Turkic case-folding option, we do perform - * full NFD first. This is because in the Turkic case precomposed characters - * with 0049 capital I or 0069 small i fold differently whether they - * are first decomposed or not, so an FCD check - a check only for - * canonical order - is not sufficient. - */ - if(options&U_FOLD_CASE_EXCLUDE_SPECIAL_I) { - mode=UNORM_NFD; - options&=~UNORM_INPUT_IS_FCD; - } else { - mode=UNORM_FCD; - } - - if(!(options&UNORM_INPUT_IS_FCD)) { - int32_t _len1, _len2; - UBool isFCD1, isFCD2; - - // check if s1 and/or s2 fulfill the FCD conditions - isFCD1= UNORM_YES==_quickCheck(s1, length1, mode, TRUE, nx, pErrorCode); - isFCD2= UNORM_YES==_quickCheck(s2, length2, mode, TRUE, nx, pErrorCode); - if(U_FAILURE(*pErrorCode)) { - return 0; - } - - /* - * ICU 2.4 had a further optimization: - * If both strings were not in FCD, then they were both NFD'ed, - * and the _COMPARE_EQUIV option was turned off. - * It is not entirely clear that this is valid with the current - * definition of the canonical caseless match. - * Therefore, ICU 2.6 removes that optimization. - */ - - if(!isFCD1) { - _len1=unorm_internalNormalize(fcd1, LENGTHOF(fcd1), - s1, length1, - mode, nx, - pErrorCode); - if(*pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { - s1=fcd1; - } else { - d1=(UChar *)uprv_malloc(_len1*U_SIZEOF_UCHAR); - if(d1==0) { - *pErrorCode=U_MEMORY_ALLOCATION_ERROR; - goto cleanup; - } - - *pErrorCode=U_ZERO_ERROR; - _len1=unorm_internalNormalize(d1, _len1, - s1, length1, - mode, nx, - pErrorCode); - if(U_FAILURE(*pErrorCode)) { - goto cleanup; - } - - s1=d1; - } - length1=_len1; - } - - if(!isFCD2) { - _len2=unorm_internalNormalize(fcd2, LENGTHOF(fcd2), - s2, length2, - mode, nx, - pErrorCode); - if(*pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { - s2=fcd2; - } else { - d2=(UChar *)uprv_malloc(_len2*U_SIZEOF_UCHAR); - if(d2==0) { - *pErrorCode=U_MEMORY_ALLOCATION_ERROR; - goto cleanup; - } - - *pErrorCode=U_ZERO_ERROR; - _len2=unorm_internalNormalize(d2, _len2, - s2, length2, - mode, nx, - pErrorCode); - if(U_FAILURE(*pErrorCode)) { - goto cleanup; - } - - s2=d2; - } - length2=_len2; - } - } - - if(U_SUCCESS(*pErrorCode)) { - result=unorm_cmpEquivFold(s1, length1, s2, length2, options, pErrorCode); - } - -cleanup: - if(d1!=0) { - uprv_free(d1); - } - if(d2!=0) { - uprv_free(d2); - } - - return result; -} - #endif /* #if !UCONFIG_NO_NORMALIZATION */