X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/374ca955a76ecab1204ca8bfa63ff9238d998416..b331163bffd790ced0e88b73f44f86d49ccc48a5:/icuSources/common/unorm.cpp diff --git a/icuSources/common/unorm.cpp b/icuSources/common/unorm.cpp index fc9cfb0b..f0a026f6 100644 --- a/icuSources/common/unorm.cpp +++ b/icuSources/common/unorm.cpp @@ -1,6 +1,6 @@ /* ****************************************************************************** -* Copyright (c) 1996-2004, International Business Machines +* Copyright (c) 1996-2014, International Business Machines * Corporation and others. All Rights Reserved. ****************************************************************************** * File unorm.cpp @@ -20,6 +20,7 @@ * instead of just wrappers around normlzr.cpp, * load unorm.dat, support Unicode 3.1 with * supplementary code points, etc. +* 2009-nov..2010-jan Markus Scherer total rewrite, new Normalizer2 API & code */ #include "unicode/utypes.h" @@ -27,3255 +28,67 @@ #if !UCONFIG_NO_NORMALIZATION #include "unicode/udata.h" -#include "unicode/uchar.h" #include "unicode/ustring.h" #include "unicode/uiter.h" -#include "unicode/uniset.h" -#include "unicode/usetiter.h" #include "unicode/unorm.h" -#include "ucln_cmn.h" +#include "unicode/unorm2.h" +#include "normalizer2impl.h" #include "unormimp.h" -#include "ucase.h" -#include "cmemory.h" -#include "umutex.h" -#include "utrie.h" -#include "unicode/uset.h" -#include "udataswp.h" -#include "putilimp.h" +#include "uprops.h" +#include "ustr_imp.h" -/* - * Status of tailored normalization - * - * This was done initially for investigation on Unicode public review issue 7 - * (http://www.unicode.org/review/). See Jitterbug 2481. - * While the UTC at meeting #94 (2003mar) did not take up the issue, this is - * a permanent feature in ICU 2.6 in support of IDNA which requires true - * Unicode 3.2 normalization. - * (NormalizationCorrections are rolled into IDNA mapping tables.) - * - * Tailored normalization as implemented here allows to "normalize less" - * than full Unicode normalization would. - * Based internally on a UnicodeSet of code points that are - * "excluded from normalization", the normalization functions leave those - * code points alone ("inert"). This means that tailored normalization - * still transforms text into a canonically equivalent form. - * It does not add decompositions to code points that do not have any or - * change decomposition results. - * - * Any function that searches for a safe boundary has not been touched, - * which means that these functions will be over-pessimistic when - * exclusions are applied. - * This should not matter because subsequent checks and normalizations - * do apply the exclusions; only a little more of the text may be processed - * than necessary under exclusions. - * - * Normalization exclusions have the following effect on excluded code points c: - * - c is not decomposed - * - c is not a composition target - * - c does not combine forward or backward for composition - * except that this is not implemented for Jamo - * - c is treated as having a combining class of 0 - */ -#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) - -/* - * This new implementation of the normalization code loads its data from - * unorm.dat, which is generated with the gennorm tool. - * The format of that file is described in unormimp.h . - */ - -/* -------------------------------------------------------------------------- */ - -enum { - _STACK_BUFFER_CAPACITY=100 -}; - -/* - * Constants for the bit fields in the options bit set parameter. - * These need not be public. - * A user only needs to know the currently assigned values. - * The number and positions of reserved bits per field can remain private - * and may change in future implementations. - */ -enum { - _NORM_OPTIONS_NX_MASK=0x1f, - _NORM_OPTIONS_UNICODE_MASK=0x60, - _NORM_OPTIONS_SETS_MASK=0x7f, - - _NORM_OPTIONS_UNICODE_SHIFT=5, - - /* - * The following options are used only in some composition functions. - * They use bits 12 and up to preserve lower bits for the available options - * space in unorm_compare() - - * see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT. - */ - - /** Options bit 12, for compatibility vs. canonical decomposition. */ - _NORM_OPTIONS_COMPAT=0x1000, - /** Options bit 13, no discontiguous composition (FCC vs. NFC). */ - _NORM_OPTIONS_COMPOSE_CONTIGUOUS=0x2000 -}; - -static inline UBool -isHangulWithoutJamoT(UChar c) { - c-=HANGUL_BASE; - return c=_NORM_MIN_HANGUL; -} - -/* - * Given isNorm32HangulOrJamo(), - * is this a Hangul syllable or a Jamo? - */ -static inline UBool -isHangulJamoNorm32HangulOrJamoL(uint32_t norm32) { - return norm32<_NORM_MIN_JAMO_V; -} - -/* - * Given norm32 for Jamo V or T, - * is this a Jamo V? - */ -static inline UBool -isJamoVTNorm32JamoV(uint32_t norm32) { - return norm32<_NORM_JAMO_V_TOP; -} - -/* load unorm.dat ----------------------------------------------------------- */ - -#define DATA_NAME "unorm" -#define DATA_TYPE "icu" - -static UDataMemory *normData=NULL; -static UErrorCode dataErrorCode=U_ZERO_ERROR; -static int8_t haveNormData=0; - -static int32_t indexes[_NORM_INDEX_TOP]={ 0 }; -static UTrie normTrie={ 0,0,0,0,0,0,0 }, fcdTrie={ 0,0,0,0,0,0,0 }, auxTrie={ 0,0,0,0,0,0,0 }; - -/* - * pointers into the memory-mapped unorm.icu - */ -static const uint16_t *extraData=NULL, - *combiningTable=NULL, - *canonStartSets=NULL; - -static uint8_t formatVersion[4]={ 0, 0, 0, 0 }; -static UBool formatVersion_2_1=FALSE, formatVersion_2_2=FALSE; - -/* the Unicode version of the normalization data */ -static UVersionInfo dataVersion={ 0, 0, 0, 0 }; - -/* cache UnicodeSets for each combination of exclusion flags */ -static UnicodeSet *nxCache[_NORM_OPTIONS_SETS_MASK+1]={ NULL }; - -U_CDECL_BEGIN - -static UBool U_CALLCONV -unorm_cleanup() { - int32_t i; - - if(normData!=NULL) { - udata_close(normData); - normData=NULL; - } - dataErrorCode=U_ZERO_ERROR; - haveNormData=0; - - for(i=0; i<(int32_t)LENGTHOF(nxCache); ++i) { - delete nxCache[i]; - } - uprv_memset(nxCache, 0, sizeof(nxCache)); - - return TRUE; -} - -/* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */ -static int32_t U_CALLCONV -getFoldingNormOffset(uint32_t norm32) { - if(isNorm32LeadSurrogate(norm32)) { - return - UTRIE_BMP_INDEX_LENGTH+ - (((int32_t)norm32>>(_NORM_EXTRA_SHIFT-UTRIE_SURROGATE_BLOCK_BITS))& - (0x3ff<size>=20 && - pInfo->isBigEndian==U_IS_BIG_ENDIAN && - pInfo->charsetFamily==U_CHARSET_FAMILY && - pInfo->dataFormat[0]==0x4e && /* dataFormat="Norm" */ - pInfo->dataFormat[1]==0x6f && - pInfo->dataFormat[2]==0x72 && - pInfo->dataFormat[3]==0x6d && - pInfo->formatVersion[0]==2 && - pInfo->formatVersion[2]==UTRIE_SHIFT && - pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT - ) { - uprv_memcpy(formatVersion, pInfo->formatVersion, 4); - uprv_memcpy(dataVersion, pInfo->dataVersion, 4); - return TRUE; - } else { - return FALSE; - } -} - -static UBool U_CALLCONV -_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*limit*/, uint32_t /*value*/) { - /* add the start code point to the USet */ - USetAdder *sa=(USetAdder *)context; - sa->add(sa->set, start); - return TRUE; -} - -U_CDECL_END - -static int8_t -loadNormData(UErrorCode &errorCode) { - /* load Unicode normalization data from file */ - - /* - * This lazy intialization with double-checked locking (without mutex protection for - * haveNormData==0) is transiently unsafe under certain circumstances. - * Check the readme and use u_init() if necessary. - * - * While u_init() initializes the main normalization data via this functions, - * it does not do so for exclusion sets (which are fully mutexed). - * This is because - * - there can be many exclusion sets - * - they are rarely used - * - they are not usually used in execution paths that are - * as performance-sensitive as others - * (e.g., IDNA takes more time than unorm_quickCheck() anyway) - */ - if(haveNormData==0) { - UTrie _normTrie={ 0,0,0,0,0,0,0 }, _fcdTrie={ 0,0,0,0,0,0,0 }, _auxTrie={ 0,0,0,0,0,0,0 }; - UDataMemory *data; - const int32_t *p=NULL; - const uint8_t *pb; - - if(&errorCode==NULL || U_FAILURE(errorCode)) { - return 0; - } - - /* open the data outside the mutex block */ - data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode); - dataErrorCode=errorCode; - if(U_FAILURE(errorCode)) { - return haveNormData=-1; - } - - p=(const int32_t *)udata_getMemory(data); - pb=(const uint8_t *)(p+_NORM_INDEX_TOP); - utrie_unserialize(&_normTrie, pb, p[_NORM_INDEX_TRIE_SIZE], &errorCode); - _normTrie.getFoldingOffset=getFoldingNormOffset; - - pb+=p[_NORM_INDEX_TRIE_SIZE]+p[_NORM_INDEX_UCHAR_COUNT]*2+p[_NORM_INDEX_COMBINE_DATA_COUNT]*2; - utrie_unserialize(&_fcdTrie, pb, p[_NORM_INDEX_FCD_TRIE_SIZE], &errorCode); - _fcdTrie.getFoldingOffset=getFoldingFCDOffset; - - if(p[_NORM_INDEX_FCD_TRIE_SIZE]!=0) { - pb+=p[_NORM_INDEX_FCD_TRIE_SIZE]; - utrie_unserialize(&_auxTrie, pb, p[_NORM_INDEX_AUX_TRIE_SIZE], &errorCode); - _auxTrie.getFoldingOffset=getFoldingAuxOffset; - } - - if(U_FAILURE(errorCode)) { - dataErrorCode=errorCode; - udata_close(data); - return haveNormData=-1; - } - - /* in the mutex block, set the data for this process */ - umtx_lock(NULL); - if(normData==NULL) { - normData=data; - data=NULL; - - uprv_memcpy(&indexes, p, sizeof(indexes)); - uprv_memcpy(&normTrie, &_normTrie, sizeof(UTrie)); - uprv_memcpy(&fcdTrie, &_fcdTrie, sizeof(UTrie)); - uprv_memcpy(&auxTrie, &_auxTrie, sizeof(UTrie)); - } else { - p=(const int32_t *)udata_getMemory(normData); - } - - /* initialize some variables */ - extraData=(uint16_t *)((uint8_t *)(p+_NORM_INDEX_TOP)+indexes[_NORM_INDEX_TRIE_SIZE]); - combiningTable=extraData+indexes[_NORM_INDEX_UCHAR_COUNT]; - formatVersion_2_1=formatVersion[0]>2 || (formatVersion[0]==2 && formatVersion[1]>=1); - formatVersion_2_2=formatVersion[0]>2 || (formatVersion[0]==2 && formatVersion[1]>=2); - if(formatVersion_2_1) { - canonStartSets=combiningTable+ - indexes[_NORM_INDEX_COMBINE_DATA_COUNT]+ - (indexes[_NORM_INDEX_FCD_TRIE_SIZE]+indexes[_NORM_INDEX_AUX_TRIE_SIZE])/2; - } - haveNormData=1; - ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup); - umtx_unlock(NULL); - - /* if a different thread set it first, then close the extra data */ - if(data!=NULL) { - udata_close(data); /* NULL if it was set correctly */ - } - } - - return haveNormData; -} - -static inline UBool -_haveData(UErrorCode &errorCode) { - if(haveNormData!=0) { - errorCode=dataErrorCode; - return (UBool)(haveNormData>0); - } else { - return (UBool)(loadNormData(errorCode)>0); - } -} - -U_CAPI UBool U_EXPORT2 -unorm_haveData(UErrorCode *pErrorCode) { - return _haveData(*pErrorCode); -} - -U_CAPI const uint16_t * U_EXPORT2 -unorm_getFCDTrie(UErrorCode *pErrorCode) { - if(_haveData(*pErrorCode)) { - return fcdTrie.index; - } else { - return NULL; - } -} - -/* data access primitives --------------------------------------------------- */ - -static inline uint32_t -_getNorm32(UChar c) { - return UTRIE_GET32_FROM_LEAD(&normTrie, c); -} - -static inline uint32_t -_getNorm32FromSurrogatePair(uint32_t norm32, UChar c2) { - /* - * the surrogate index in norm32 stores only the number of the surrogate index block - * see gennorm/store.c/getFoldedNormValue() - */ - norm32= - UTRIE_BMP_INDEX_LENGTH+ - ((norm32>>(_NORM_EXTRA_SHIFT-UTRIE_SURROGATE_BLOCK_BITS))& - (0x3ff<>_NORM_EXTRA_SHIFT); -} - -/* normalization exclusion sets --------------------------------------------- */ - -/* - * Normalization exclusion UnicodeSets are used for tailored normalization; - * see the comment near the beginning of this file. - * - * By specifying one or several sets of code points, - * those code points become inert for normalization. - */ - -static const UnicodeSet * -internalGetNXHangul(UErrorCode &errorCode) { - /* internal function, does not check for incoming U_FAILURE */ - UBool isCached; - - UMTX_CHECK(NULL, (UBool)(nxCache[UNORM_NX_HANGUL]!=NULL), isCached); - - if(!isCached) { - UnicodeSet *set=new UnicodeSet(0xac00, 0xd7a3); - if(set==NULL) { - errorCode=U_MEMORY_ALLOCATION_ERROR; - return NULL; - } - - umtx_lock(NULL); - if(nxCache[UNORM_NX_HANGUL]==NULL) { - nxCache[UNORM_NX_HANGUL]=set; - set=NULL; - } - umtx_unlock(NULL); - - delete set; - } - - return nxCache[UNORM_NX_HANGUL]; -} - -/* unorm.cpp 1.116 had and used -static const UnicodeSet * -internalGetNXFromPattern(int32_t options, const char *pattern, UErrorCode &errorCode) { - ... -} -*/ - -/* get and set an exclusion set from a serialized UnicodeSet */ -static const UnicodeSet * -internalGetSerializedNX(int32_t options, int32_t nxIndex, UErrorCode &errorCode) { - /* internal function, does not check for incoming U_FAILURE */ - UBool isCached; - - UMTX_CHECK(NULL, (UBool)(nxCache[options]!=NULL), isCached); - - if( !isCached && - canonStartSets!=NULL && - canonStartSets[nxIndex]!=0 && canonStartSets[nxIndex+1]>canonStartSets[nxIndex] - ) { - USerializedSet sset; - UnicodeSet *set; - UChar32 start, end; - int32_t i; - - if( !uset_getSerializedSet( - &sset, - canonStartSets+canonStartSets[nxIndex], - canonStartSets[nxIndex+1]-canonStartSets[nxIndex]) - ) { - errorCode=U_INVALID_FORMAT_ERROR; - return NULL; - } - - /* turn the serialized set into a UnicodeSet */ - set=new UnicodeSet(); - if(set==NULL) { - errorCode=U_MEMORY_ALLOCATION_ERROR; - return NULL; - } - for(i=0; uset_getSerializedRange(&sset, i, &start, &end); ++i) { - set->add(start, end); - } - - umtx_lock(NULL); - if(nxCache[options]==NULL) { - nxCache[options]=set; - set=NULL; - } - umtx_unlock(NULL); - - delete set; - } - - return nxCache[options]; -} - -static const UnicodeSet * -internalGetNXCJKCompat(UErrorCode &errorCode) { - /* build a set from [[:Ideographic:]&[:NFD_QC=No:]]=[CJK Ideographs]&[has canonical decomposition] */ - return internalGetSerializedNX( - UNORM_NX_CJK_COMPAT, - _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET, - errorCode); -} - -static const UnicodeSet * -internalGetNXUnicode(uint32_t options, UErrorCode &errorCode) { - /* internal function, does not check for incoming U_FAILURE */ - int32_t nxIndex; - - options&=_NORM_OPTIONS_UNICODE_MASK; - switch(options) { - case 0: - return NULL; - case UNORM_UNICODE_3_2: - /* [:^Age=3.2:] */ - nxIndex=_NORM_SET_INDEX_NX_UNICODE32_OFFSET; - break; - default: - errorCode=U_ILLEGAL_ARGUMENT_ERROR; - return NULL; - } - - /* build a set with all code points that were not designated by the specified Unicode version */ - return internalGetSerializedNX(options, nxIndex, errorCode); -} - -/* Get a decomposition exclusion set. The data must be loaded. */ -static const UnicodeSet * -internalGetNX(int32_t options, UErrorCode &errorCode) { - options&=_NORM_OPTIONS_SETS_MASK; - - UBool isCached; - - UMTX_CHECK(NULL, (UBool)(nxCache[options]!=NULL), isCached); - - if(!isCached) { - /* return basic sets */ - if(options==UNORM_NX_HANGUL) { - return internalGetNXHangul(errorCode); - } - if(options==UNORM_NX_CJK_COMPAT) { - return internalGetNXCJKCompat(errorCode); - } - if((options&_NORM_OPTIONS_UNICODE_MASK)!=0 && (options&_NORM_OPTIONS_NX_MASK)==0) { - return internalGetNXUnicode(options, errorCode); - } - - /* build a set from multiple subsets */ - UnicodeSet *set; - const UnicodeSet *other; - - set=new UnicodeSet(); - if(set==NULL) { - errorCode=U_MEMORY_ALLOCATION_ERROR; - return NULL; - } - - if((options&UNORM_NX_HANGUL)!=0 && NULL!=(other=internalGetNXHangul(errorCode))) { - set->addAll(*other); - } - if((options&UNORM_NX_CJK_COMPAT)!=0 && NULL!=(other=internalGetNXCJKCompat(errorCode))) { - set->addAll(*other); - } - if((options&_NORM_OPTIONS_UNICODE_MASK)!=0 && NULL!=(other=internalGetNXUnicode(options, errorCode))) { - set->addAll(*other); - } - - if(U_FAILURE(errorCode)) { - delete set; - return NULL; - } - - umtx_lock(NULL); - if(nxCache[options]==NULL) { - nxCache[options]=set; - set=NULL; - } - umtx_unlock(NULL); - - delete set; - } - - return nxCache[options]; -} - -static inline const UnicodeSet * -getNX(int32_t options, UErrorCode &errorCode) { - if(U_FAILURE(errorCode) || (options&=_NORM_OPTIONS_SETS_MASK)==0) { - /* incoming failure, or no decomposition exclusions requested */ - return NULL; - } else { - return internalGetNX(options, errorCode); - } -} - -U_CFUNC const UnicodeSet * -unorm_getNX(int32_t options, UErrorCode *pErrorCode) { - return getNX(options, *pErrorCode); -} - -static inline UBool -nx_contains(const UnicodeSet *nx, UChar32 c) { - return nx!=NULL && nx->contains(c); -} - -static inline UBool -nx_contains(const UnicodeSet *nx, UChar c, UChar c2) { - return nx!=NULL && nx->contains(c2==0 ? c : U16_GET_SUPPLEMENTARY(c, c2)); -} - -/* other normalization primitives ------------------------------------------- */ - -/* get the canonical or compatibility decomposition for one character */ -static inline const UChar * -_decompose(uint32_t norm32, uint32_t qcMask, int32_t &length, - uint8_t &cc, uint8_t &trailCC) { - const UChar *p=(const UChar *)_getExtraData(norm32); - length=*p++; - - if((norm32&qcMask&_NORM_QC_NFKD)!=0 && length>=0x100) { - /* use compatibility decomposition, skip canonical data */ - p+=((length>>7)&1)+(length&_NORM_DECOMP_LENGTH_MASK); - length>>=8; - } - - if(length&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) { - /* get the lead and trail cc's */ - UChar bothCCs=*p++; - cc=(uint8_t)(bothCCs>>8); - trailCC=(uint8_t)bothCCs; - } else { - /* lead and trail cc's are both 0 */ - cc=trailCC=0; - } - - length&=_NORM_DECOMP_LENGTH_MASK; - return p; -} - -/* get the canonical decomposition for one character */ -static inline const UChar * -_decompose(uint32_t norm32, int32_t &length, - uint8_t &cc, uint8_t &trailCC) { - const UChar *p=(const UChar *)_getExtraData(norm32); - length=*p++; - - if(length&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) { - /* get the lead and trail cc's */ - UChar bothCCs=*p++; - cc=(uint8_t)(bothCCs>>8); - trailCC=(uint8_t)bothCCs; - } else { - /* lead and trail cc's are both 0 */ - cc=trailCC=0; - } - - length&=_NORM_DECOMP_LENGTH_MASK; - return p; -} - -/** - * Get the canonical decomposition for one code point. - * @param c code point - * @param buffer out-only buffer for algorithmic decompositions of Hangul - * @param length out-only, takes the length of the decomposition, if any - * @return pointer to decomposition, or 0 if none - * @internal - */ -U_CFUNC const UChar * -unorm_getCanonicalDecomposition(UChar32 c, UChar buffer[4], int32_t *pLength) { - uint32_t norm32; - - if(c0) { - buffer[2]=(UChar)(JAMO_T_BASE+c2); - *pLength=3; - } else { - *pLength=2; - } - - buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT); - buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT); - return buffer; - } else { - /* normal decomposition */ - uint8_t cc, trailCC; - return _decompose(norm32, *pLength, cc, trailCC); - } - } else { - return 0; - } -} - -/* - * get the combining class of (c, c2)=*p++ - * before: p>_NORM_CC_SHIFT); - } -} - -/* - * read backwards and get norm32 - * return 0 if the character is >_NORM_CC_SHIFT); -} - -/* - * is this a safe boundary character for NF*D? - * (lead cc==0) - */ -static inline UBool -_isNFDSafe(uint32_t norm32, uint32_t ccOrQCMask, uint32_t decompQCMask) { - if((norm32&ccOrQCMask)==0) { - return TRUE; /* cc==0 and no decomposition: this is NF*D safe */ - } - - /* inspect its decomposition - maybe a Hangul but not a surrogate here */ - if(isNorm32Regular(norm32) && (norm32&decompQCMask)!=0) { - int32_t length; - uint8_t cc, trailCC; - - /* decomposes, get everything from the variable-length extra data */ - _decompose(norm32, decompQCMask, length, cc, trailCC); - return cc==0; - } else { - /* no decomposition (or Hangul), test the cc directly */ - return (norm32&_NORM_CC_MASK)==0; - } -} - -/* - * is this (or does its decomposition begin with) a "true starter"? - * (cc==0 and NF*C_YES) - */ -static inline UBool -_isTrueStarter(uint32_t norm32, uint32_t ccOrQCMask, uint32_t decompQCMask) { - if((norm32&ccOrQCMask)==0) { - return TRUE; /* this is a true starter (could be Hangul or Jamo L) */ - } - - /* inspect its decomposition - not a Hangul or a surrogate here */ - if((norm32&decompQCMask)!=0) { - const UChar *p; - int32_t length; - uint8_t cc, trailCC; - - /* decomposes, get everything from the variable-length extra data */ - p=_decompose(norm32, decompQCMask, length, cc, trailCC); - if(cc==0) { - uint32_t qcMask=ccOrQCMask&_NORM_QC_MASK; - - /* does it begin with NFC_YES? */ - if((_getNorm32(p, qcMask)&qcMask)==0) { - /* yes, the decomposition begins with a true starter */ - return TRUE; - } - } - } - return FALSE; -} - -/* uchar.h */ -U_CAPI uint8_t U_EXPORT2 -u_getCombiningClass(UChar32 c) { - UErrorCode errorCode=U_ZERO_ERROR; - if(_haveData(errorCode)) { - uint32_t norm32; - - UTRIE_GET32(&normTrie, c, norm32); - return (uint8_t)(norm32>>_NORM_CC_SHIFT); - } else { - return 0; - } -} - -U_CAPI UBool U_EXPORT2 -unorm_internalIsFullCompositionExclusion(UChar32 c) { - UErrorCode errorCode=U_ZERO_ERROR; - if(_haveData(errorCode) && formatVersion_2_1) { - uint16_t aux; - - UTRIE_GET16(&auxTrie, c, aux); - return (UBool)((aux&_NORM_AUX_COMP_EX_MASK)!=0); - } else { - return FALSE; - } -} - -U_CAPI UBool U_EXPORT2 -unorm_isCanonSafeStart(UChar32 c) { - UErrorCode errorCode=U_ZERO_ERROR; - if(_haveData(errorCode) && formatVersion_2_1) { - uint16_t aux; - - UTRIE_GET16(&auxTrie, c, aux); - return (UBool)((aux&_NORM_AUX_UNSAFE_MASK)==0); - } else { - return FALSE; - } -} - -U_CAPI void U_EXPORT2 -unorm_getUnicodeVersion(UVersionInfo *versionInfo, UErrorCode *pErrorCode){ - if(unorm_haveData(pErrorCode)){ - uprv_memcpy(*versionInfo, dataVersion, 4); - } -} - - -U_CAPI UBool U_EXPORT2 -unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet) { - UErrorCode errorCode=U_ZERO_ERROR; - if( fillSet!=NULL && (uint32_t)c<=0x10ffff && - _haveData(errorCode) && canonStartSets!=NULL - ) { - const uint16_t *table; - int32_t i, start, limit; - - /* - * binary search for c - * - * There are two search tables, - * one for BMP code points and one for supplementary ones. - * See unormimp.h for details. - */ - if(c<=0xffff) { - table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]; - start=0; - limit=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]; - - /* each entry is a pair { c, result } */ - while(start>16); - low=(uint16_t)c; - - /* each entry is a triplet { high(c), low(c), result } */ - while(start0)) { - *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; - return 0; - } - if(!_haveData(*pErrorCode) || !formatVersion_2_1) { - return 0; - } - - UTRIE_GET16(&auxTrie, c, aux); - aux&=_NORM_AUX_FNC_MASK; - if(aux!=0) { - const UChar *s; - int32_t length; - - s=(const UChar *)(extraData+aux); - if(*s<0xff00) { - /* s points to the single-unit string */ - length=1; - } else { - length=*s&0xff; - ++s; - } - if(0-skippable code point? See unormimp.h. */ -U_CAPI UBool U_EXPORT2 -unorm_isNFSkippable(UChar32 c, UNormalizationMode mode) { - UErrorCode errorCode; - uint32_t norm32, mask; - uint16_t aux, fcd; - - errorCode=U_ZERO_ERROR; - if(!_haveData(errorCode)) { - return FALSE; - } - - /* handle trivial cases; set the comparison mask for the normal ones */ - switch(mode) { - case UNORM_NONE: - return TRUE; - case UNORM_NFD: - mask=_NORM_CC_MASK|_NORM_QC_NFD; - break; - case UNORM_NFKD: - mask=_NORM_CC_MASK|_NORM_QC_NFKD; - break; - case UNORM_NFC: - /* case UNORM_FCC: */ - mask=_NORM_CC_MASK|_NORM_COMBINES_ANY|(_NORM_QC_NFC&_NORM_QC_ANY_NO); - break; - case UNORM_NFKC: - mask=_NORM_CC_MASK|_NORM_COMBINES_ANY|(_NORM_QC_NFKC&_NORM_QC_ANY_NO); - break; - case UNORM_FCD: - /* FCD: skippable if lead cc==0 and trail cc<=1 */ - UTRIE_GET16(&fcdTrie, c, fcd); - return fcd<=1; - default: - return FALSE; - } - - /* check conditions (a)..(e), see unormimp.h */ - UTRIE_GET32(&normTrie, c, norm32); - if((norm32&mask)!=0) { - return FALSE; /* fails (a)..(e), not skippable */ - } - - if(modeadd(sa->set, c); - sa->add(sa->set, c+1); - } - sa->add(sa->set, HANGUL_BASE+HANGUL_COUNT); /* add Hangul+1 to continue with other properties */ -} - -U_CAPI UNormalizationCheckResult U_EXPORT2 -unorm_getQuickCheck(UChar32 c, UNormalizationMode mode) { - static const uint32_t qcMask[UNORM_MODE_COUNT]={ - 0, 0, _NORM_QC_NFD, _NORM_QC_NFKD, _NORM_QC_NFC, _NORM_QC_NFKC - }; - - UErrorCode errorCode; - uint32_t norm32; - - errorCode=U_ZERO_ERROR; - if(!_haveData(errorCode)) { - return UNORM_YES; - } - - UTRIE_GET32(&normTrie, c, norm32); - norm32&=qcMask[mode]; - - if(norm32==0) { - return UNORM_YES; - } else if(norm32&_NORM_QC_ANY_NO) { - return UNORM_NO; - } else /* _NORM_QC_ANY_MAYBE */ { - return UNORM_MAYBE; - } -} - -U_CAPI uint16_t U_EXPORT2 -unorm_getFCD16FromCodePoint(UChar32 c) { - UErrorCode errorCode; - uint16_t fcd; - - errorCode=U_ZERO_ERROR; - if(!_haveData(errorCode)) { - return 0; - } - - UTRIE_GET16(&fcdTrie, c, fcd); - return fcd; -} - -/* reorder UTF-16 in-place -------------------------------------------------- */ - -/* - * simpler, single-character version of _mergeOrdered() - - * bubble-insert one single code point into the preceding string - * which is already canonically ordered - * (c, c2) may or may not yet have been inserted at [current..p[ - * - * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2) - * - * before: [start..current[ is already ordered, and - * [current..p[ may or may not hold (c, c2) but - * must be exactly the same length as (c, c2) - * after: [start..p[ is ordered - * - * returns the trailing combining class - */ -static uint8_t -_insertOrdered(const UChar *start, UChar *current, UChar *p, - UChar c, UChar c2, uint8_t cc) { - const UChar *pBack, *pPreBack; - UChar *r; - uint8_t prevCC, trailCC=cc; - - if(start=prevCC */ - pPreBack=pBack=current; - prevCC=_getPrevCC(start, pPreBack); - if(cc=prevCC) { - break; - } - pBack=pPreBack; - } - - /* - * this is where we are right now with all these pointers: - * [start..pPreBack[ 0..? code points that we can ignore - * [pPreBack..pBack[ 0..1 code points with prevCC<=cc - * [pBack..current[ 0..n code points with >cc, move up to insert (c, c2) - * [current..p[ 1 code point (c, c2) with cc - */ - - /* move the code units in between up */ - r=p; - do { - *--r=*--current; - } while(pBack!=current); - } - } - - /* insert (c, c2) */ - *current=c; - if(c2!=0) { - *(current+1)=c2; - } - - /* we know the cc of the last code point */ - return trailCC; -} - -/* - * merge two UTF-16 string parts together - * to canonically order (order by combining classes) their concatenation - * - * the two strings may already be adjacent, so that the merging is done in-place - * if the two strings are not adjacent, then the buffer holding the first one - * must be large enough - * the second string may or may not be ordered in itself - * - * before: [start..current[ is already ordered, and - * [next..limit[ may be ordered in itself, but - * is not in relation to [start..current[ - * after: [start..current+(limit-next)[ is ordered - * - * the algorithm is a simple bubble-sort that takes the characters from *next++ - * and inserts them in correct combining class order into the preceding part - * of the string - * - * since this function is called much less often than the single-code point - * _insertOrdered(), it just uses that for easier maintenance - * (see file version from before 2001aug31 for a more optimized version) - * - * returns the trailing combining class - */ -static uint8_t -_mergeOrdered(UChar *start, UChar *current, - const UChar *next, const UChar *limit, UBool isOrdered=TRUE) { - UChar *r; - UChar c, c2; - uint8_t cc, trailCC=0; - UBool adjacent; - - adjacent= current==next; - - if(start!=current || !isOrdered) { - while(next0) || destCapacity==0) - ) { - uint32_t norm32, qcMask; - UChar32 minNoMaybe; - int32_t length; - - /* initialize */ - if(!compat) { - minNoMaybe=(UChar32)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]; - qcMask=_NORM_QC_NFD; - } else { - minNoMaybe=(UChar32)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]; - qcMask=_NORM_QC_NFKD; - } - - if(c0) { - dest[0]=(UChar)c; - } - return -1; - } - - /* data lookup */ - UTRIE_GET32(&normTrie, c, norm32); - if((norm32&qcMask)==0) { - /* simple case: no decomposition */ - if(c<=0xffff) { - if(destCapacity>0) { - dest[0]=(UChar)c; - } - return -1; - } else { - if(destCapacity>=2) { - dest[0]=UTF16_LEAD(c); - dest[1]=UTF16_TRAIL(c); - } - return -2; - } - } else if(isNorm32HangulOrJamo(norm32)) { - /* Hangul syllable: decompose algorithmically */ - UChar c2; - - c-=HANGUL_BASE; - - c2=(UChar)(c%JAMO_T_COUNT); - c/=JAMO_T_COUNT; - if(c2>0) { - if(destCapacity>=3) { - dest[2]=(UChar)(JAMO_T_BASE+c2); - } - length=3; - } else { - length=2; - } - - if(destCapacity>=2) { - dest[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT); - dest[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT); - } - return length; - } else { - /* c decomposes, get everything from the variable-length extra data */ - const UChar *p, *limit; - uint8_t cc, trailCC; - - p=_decompose(norm32, qcMask, length, cc, trailCC); - if(length<=destCapacity) { - limit=p+length; - do { - *dest++=*p++; - } while(p=0) { - /* string with length */ - limit=src+srcLength; - } else /* srcLength==-1 */ { - /* zero-terminated string */ - limit=NULL; - } - - U_ALIGN_CODE(16); - - for(;;) { - /* count code units below the minimum or with irrelevant data for the quick check */ - prevSrc=src; - if(limit==NULL) { - while((c=*src)0) { - buffer[2]=(UChar)(JAMO_T_BASE+c2); - length=3; - } else { - length=2; - } - - buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT); - buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT); - } - } else { - if(isNorm32Regular(norm32)) { - c2=0; - length=1; - } else { - /* c is a lead surrogate, get the real norm32 */ - if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) { - ++src; - length=2; - norm32=_getNorm32FromSurrogatePair(norm32, c2); - } else { - c2=0; - length=1; - norm32=0; - } - } - - /* get the decomposition and the lead and trail cc's */ - if(nx_contains(nx, c, c2)) { - /* excluded: norm32==0 */ - cc=trailCC=0; - p=NULL; - } else if((norm32&qcMask)==0) { - /* c does not decompose */ - cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT); - p=NULL; - } else { - /* c decomposes, get everything from the variable-length extra data */ - p=_decompose(norm32, qcMask, length, cc, trailCC); - if(length==1) { - /* fastpath a single code unit from decomposition */ - c=*p; - c2=0; - p=NULL; - } - } - } - - /* append the decomposition to the destination buffer, assume length>0 */ - if((destIndex+length)<=destCapacity) { - UChar *reorderSplit=dest+destIndex; - if(p==NULL) { - /* fastpath: single code point */ - if(cc!=0 && cc0); - } - } - } else { - /* buffer overflow */ - /* keep incrementing the destIndex for preflighting */ - destIndex+=length; - } - - prevCC=trailCC; - if(prevCC==0) { - reorderStartIndex=destIndex; - } - } - - outTrailCC=prevCC; - return destIndex; -} - -U_CAPI int32_t U_EXPORT2 -unorm_decompose(UChar *dest, int32_t destCapacity, - const UChar *src, int32_t srcLength, - UBool compat, int32_t options, - UErrorCode *pErrorCode) { - const UnicodeSet *nx; - int32_t destIndex; - uint8_t trailCC; - - if(!_haveData(*pErrorCode)) { - return 0; - } - - nx=getNX(options, *pErrorCode); - if(U_FAILURE(*pErrorCode)) { - return 0; - } - - destIndex=_decompose(dest, destCapacity, - src, srcLength, - compat, nx, - trailCC); - - return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode); -} - -/* make NFC & NFKC ---------------------------------------------------------- */ - -/* get the composition properties of the next character */ -static inline uint32_t -_getNextCombining(UChar *&p, const UChar *limit, - UChar &c, UChar &c2, - uint16_t &combiningIndex, uint8_t &cc, - const UnicodeSet *nx) { - uint32_t norm32, combineFlags; - - /* get properties */ - c=*p++; - norm32=_getNorm32(c); - - /* preset output values for most characters */ - c2=0; - combiningIndex=0; - cc=0; - - if((norm32&(_NORM_CC_MASK|_NORM_COMBINES_ANY))==0) { - return 0; - } else { - if(isNorm32Regular(norm32)) { - /* set cc etc. below */ - } else if(isNorm32HangulOrJamo(norm32)) { - /* a compatibility decomposition contained Jamos */ - combiningIndex=(uint16_t)(0xfff0|(norm32>>_NORM_EXTRA_SHIFT)); - return norm32&_NORM_COMBINES_ANY; - } else { - /* c is a lead surrogate, get the real norm32 */ - if(p!=limit && UTF_IS_SECOND_SURROGATE(c2=*p)) { - ++p; - norm32=_getNorm32FromSurrogatePair(norm32, c2); - } else { - c2=0; - return 0; - } - } - - if(nx_contains(nx, c, c2)) { - return 0; /* excluded: norm32==0 */ - } - - cc=(uint8_t)(norm32>>_NORM_CC_SHIFT); - - combineFlags=norm32&_NORM_COMBINES_ANY; - if(combineFlags!=0) { - combiningIndex=*(_getExtraData(norm32)-1); - } - return combineFlags; - } -} - -/* - * given a composition-result starter (c, c2) - which means its cc==0, - * it combines forward, it has extra data, its norm32!=0, - * it is not a Hangul or Jamo, - * get just its combineFwdIndex - * - * norm32(c) is special if and only if c2!=0 - */ -static inline uint16_t -_getCombiningIndexFromStarter(UChar c, UChar c2) { - uint32_t norm32; - - norm32=_getNorm32(c); - if(c2!=0) { - norm32=_getNorm32FromSurrogatePair(norm32, c2); - } - return *(_getExtraData(norm32)-1); -} - -/* - * Find the recomposition result for - * a forward-combining character - * (specified with a pointer to its part of the combiningTable[]) - * and a backward-combining character - * (specified with its combineBackIndex). - * - * If these two characters combine, then set (value, value2) - * with the code unit(s) of the composition character. - * - * Return value: - * 0 do not combine - * 1 combine - * >1 combine, and the composition is a forward-combining starter - * - * See unormimp.h for a description of the composition table format. - */ -static inline uint16_t -_combine(const uint16_t *table, uint16_t combineBackIndex, - uint16_t &value, uint16_t &value2) { - uint16_t key; - - /* search in the starter's composition table */ - for(;;) { - key=*table++; - if(key>=combineBackIndex) { - break; - } - table+= *table&0x8000 ? 2 : 1; - } - - /* mask off bit 15, the last-entry-in-the-list flag */ - if((key&0x7fff)==combineBackIndex) { - /* found! combine! */ - value=*table; - - /* is the composition a starter that combines forward? */ - key=(uint16_t)((value&0x2000)+1); - - /* get the composition result code point from the variable-length result value */ - if(value&0x8000) { - if(value&0x4000) { - /* surrogate pair composition result */ - value=(uint16_t)((value&0x3ff)|0xd800); - value2=*(table+1); - } else { - /* BMP composition result U+2000..U+ffff */ - value=*(table+1); - value2=0; - } - } else { - /* BMP composition result U+0000..U+1fff */ - value&=0x1fff; - value2=0; - } - - return key; - } else { - /* not found */ - return 0; - } -} - -static inline UBool -_composeHangul(UChar prev, UChar c, uint32_t norm32, const UChar *&src, const UChar *limit, - UBool compat, UChar *dest, const UnicodeSet *nx) { - if(isJamoVTNorm32JamoV(norm32)) { - /* c is a Jamo V, compose with previous Jamo L and following Jamo T */ - prev=(UChar)(prev-JAMO_L_BASE); - if(prev - * the rest of the loop body will reset starter to NULL; - * technically, a composed Hangul syllable is a starter, but it - * does not combine forward now that we have consumed all eligible Jamos; - * for Jamo V/T, combineFlags does not contain _NORM_COMBINES_FWD - */ - - } else if( - /* the starter is not a Hangul LV or Jamo V/T and */ - !(combineFwdIndex&0x8000) && - /* the combining mark is not blocked and */ - ((options&UNORM_BEFORE_PRI_29) ? - (prevCC!=cc || prevCC==0) : - (prevCC1) { - combineFwdIndex=_getCombiningIndexFromStarter((UChar)value, (UChar)value2); - } else { - starter=NULL; - } - - /* we combined; continue with looking for compositions */ - continue; - } - } - - /* no combination this time */ - prevCC=cc; - if(p==limit) { - return prevCC; - } - - /* if (c, c2) did not combine, then check if it is a starter */ - if(cc==0) { - /* found a new starter; combineFlags==0 if (c, c2) is excluded */ - if(combineFlags&_NORM_COMBINES_FWD) { - /* it may combine with something, prepare for it */ - if(c2==0) { - starterIsSupplementary=FALSE; - starter=p-1; - } else { - starterIsSupplementary=TRUE; - starter=p-2; - } - combineFwdIndex=combineBackIndex; - } else { - /* it will not combine with anything */ - starter=NULL; - } - } else if(options&_NORM_OPTIONS_COMPOSE_CONTIGUOUS) { - /* FCC: no discontiguous compositions; any intervening character blocks */ - starter=NULL; - } - } -} - -/* decompose and recompose [prevStarter..src[ */ -static const UChar * -_composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_t &length, - const UChar *prevStarter, const UChar *src, - uint8_t &prevCC, - int32_t options, const UnicodeSet *nx, - UErrorCode *pErrorCode) { - UChar *recomposeLimit; - uint8_t trailCC; - UBool compat; - - compat=(UBool)((options&_NORM_OPTIONS_COMPAT)!=0); - - /* decompose [prevStarter..src[ */ - length=_decompose(buffer, bufferCapacity, - prevStarter, src-prevStarter, - compat, nx, - trailCC); - if(length>bufferCapacity) { - if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*length, 0)) { - *pErrorCode=U_MEMORY_ALLOCATION_ERROR; - return NULL; - } - length=_decompose(buffer, bufferCapacity, - prevStarter, src-prevStarter, - compat, nx, - trailCC); - } - - /* recompose the decomposition */ - recomposeLimit=buffer+length; - if(length>=2) { - prevCC=_recompose(buffer, recomposeLimit, options, nx); - } - - /* return with a pointer to the recomposition and its length */ - length=recomposeLimit-buffer; - return buffer; -} - -static int32_t -_compose(UChar *dest, int32_t destCapacity, - const UChar *src, int32_t srcLength, - int32_t options, const UnicodeSet *nx, - UErrorCode *pErrorCode) { - UChar stackBuffer[_STACK_BUFFER_CAPACITY]; - UChar *buffer; - int32_t bufferCapacity; - - const UChar *limit, *prevSrc, *prevStarter; - uint32_t norm32, ccOrQCMask, qcMask; - int32_t destIndex, reorderStartIndex, length; - UChar c, c2, minNoMaybe; - uint8_t cc, prevCC; - - if(options&_NORM_OPTIONS_COMPAT) { - minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]; - qcMask=_NORM_QC_NFKC; - } else { - minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]; - qcMask=_NORM_QC_NFC; - } - - /* initialize */ - buffer=stackBuffer; - bufferCapacity=_STACK_BUFFER_CAPACITY; - - /* - * prevStarter points to the last character before the current one - * that is a "true" starter with cc==0 and quick check "yes". - * - * prevStarter will be used instead of looking for a true starter - * while incrementally decomposing [prevStarter..prevSrc[ - * in _composePart(). Having a good prevStarter allows to just decompose - * the entire [prevStarter..prevSrc[. - * - * When _composePart() backs out from prevSrc back to prevStarter, - * then it also backs out destIndex by the same amount. - * Therefore, at all times, the (prevSrc-prevStarter) source units - * must correspond 1:1 to destination units counted with destIndex, - * except for reordering. - * This is true for the qc "yes" characters copied in the fast loop, - * and for pure reordering. - * prevStarter must be set forward to src when this is not true: - * In _composePart() and after composing a Hangul syllable. - * - * This mechanism relies on the assumption that the decomposition of a true starter - * also begins with a true starter. gennorm/store.c checks for this. - */ - prevStarter=src; - - ccOrQCMask=_NORM_CC_MASK|qcMask; - destIndex=reorderStartIndex=0; - prevCC=0; - - /* avoid compiler warnings */ - norm32=0; - c=0; - - if(srcLength>=0) { - /* string with length */ - limit=src+srcLength; - } else /* srcLength==-1 */ { - /* zero-terminated string */ - limit=NULL; - } - - U_ALIGN_CODE(16); - - for(;;) { - /* count code units below the minimum or with irrelevant data for the quick check */ - prevSrc=src; - if(limit==NULL) { - while((c=*src)0 && - _composeHangul( - *(prevSrc-1), c, norm32, src, limit, (UBool)((options&_NORM_OPTIONS_COMPAT)!=0), - destIndex<=destCapacity ? dest+(destIndex-1) : 0, - nx) - ) { - prevStarter=src; - continue; - } - - /* the Jamo V/T did not compose into a Hangul syllable, just append to dest */ - c2=0; - length=1; - prevStarter=prevSrc; - } else { - if(isNorm32Regular(norm32)) { - c2=0; - length=1; - } else { - /* c is a lead surrogate, get the real norm32 */ - if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) { - ++src; - length=2; - norm32=_getNorm32FromSurrogatePair(norm32, c2); - } else { - /* c is an unpaired lead surrogate, nothing to do */ - c2=0; - length=1; - norm32=0; - } - } - - /* we are looking at the character (c, c2) at [prevSrc..src[ */ - if(nx_contains(nx, c, c2)) { - /* excluded: norm32==0 */ - cc=0; - } else if((norm32&qcMask)==0) { - cc=(uint8_t)(norm32>>_NORM_CC_SHIFT); - } else { - const UChar *p; - uint32_t decompQCMask; - - /* - * find appropriate boundaries around this character, - * decompose the source text from between the boundaries, - * and recompose it - * - * this puts the intermediate text into the side buffer because - * it might be longer than the recomposition end result, - * or the destination buffer may be too short or missing - * - * note that destIndex may be adjusted backwards to account - * for source text that passed the quick check but needed to - * take part in the recomposition - */ - decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */ - - /* - * find the last true starter in [prevStarter..src[ - * it is either the decomposition of the current character (at prevSrc), - * or prevStarter - */ - if(_isTrueStarter(norm32, ccOrQCMask, decompQCMask)) { - prevStarter=prevSrc; - } else { - /* adjust destIndex: back out what had been copied with qc "yes" */ - destIndex-=(int32_t)(prevSrc-prevStarter); - } - - /* find the next true starter in [src..limit[ - modifies src to point to the next starter */ - src=_findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe); - - /* compose [prevStarter..src[ */ - p=_composePart(stackBuffer, buffer, bufferCapacity, - length, /* output */ - prevStarter, src, - prevCC, /* output */ - options, nx, - pErrorCode); - - if(p==NULL) { - destIndex=0; /* an error occurred (out of memory) */ - break; - } - - /* append the recomposed buffer contents to the destination buffer */ - if((destIndex+length)<=destCapacity) { - while(length>0) { - dest[destIndex++]=*p++; - --length; - } - } else { - /* buffer overflow */ - /* keep incrementing the destIndex for preflighting */ - destIndex+=length; - } - - /* set the next starter */ - prevStarter=src; - - continue; - } - } - - /* append the single code point (c, c2) to the destination buffer */ - if((destIndex+length)<=destCapacity) { - if(cc!=0 && cc>_NORM_CC_SHIFT); - p=NULL; - } else { - /* c decomposes, get everything from the variable-length extra data */ - p=_decompose(norm32, length, cc, trailCC); - if(length==1) { - /* fastpath a single code unit from decomposition */ - c=*p; - c2=0; - p=NULL; - } - } - - /* append the decomposition to the destination buffer, assume length>0 */ - if((destIndex+length)<=destCapacity) { - UChar *reorderSplit=dest+destIndex; - if(p==NULL) { - /* fastpath: single code point */ - if(cc!=0 && cc0); - } - } - } else { - /* buffer overflow */ - /* keep incrementing the destIndex for preflighting */ - destIndex+=length; - } - - prevCC=trailCC; - if(prevCC==0) { - reorderStartIndex=destIndex; - } - } - - return prevCC; -} - -static int32_t -unorm_makeFCD(UChar *dest, int32_t destCapacity, - const UChar *src, int32_t srcLength, - const UnicodeSet *nx, - UErrorCode *pErrorCode) { - const UChar *limit, *prevSrc, *decompStart; - int32_t destIndex, length; - UChar c, c2; - uint16_t fcd16; - int16_t prevCC, cc; - - if(!_haveData(*pErrorCode)) { - return 0; - } - - /* initialize */ - decompStart=src; - destIndex=0; - prevCC=0; - - /* avoid compiler warnings */ - c=0; - fcd16=0; - - if(srcLength>=0) { - /* string with length */ - limit=src+srcLength; - } else /* srcLength==-1 */ { - /* zero-terminated string */ - limit=NULL; - } - - U_ALIGN_CODE(16); - - for(;;) { - /* skip a run of code units below the minimum or with irrelevant data for the FCD check */ - prevSrc=src; - if(limit==NULL) { - for(;;) { - c=*src; - if(c<_NORM_MIN_WITH_LEAD_CC) { - if(c==0) { - break; - } - prevCC=(int16_t)-c; - } else if((fcd16=_getFCD16(c))==0) { - prevCC=0; - } else { - break; - } - ++src; - } - } else { - for(;;) { - if(src==limit) { - break; - } else if((c=*src)<_NORM_MIN_WITH_LEAD_CC) { - prevCC=(int16_t)-c; - } else if((fcd16=_getFCD16(c))==0) { - prevCC=0; - } else { - break; - } - ++src; - } - } - - /* - * prevCC has values from the following ranges: - * 0..0xff - the previous trail combining class - * <0 - the negative value of the previous code unit; - * that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16() - * was deferred so that average text is checked faster - */ - - /* copy these code units all at once */ - if(src!=prevSrc) { - length=(int32_t)(src-prevSrc); - if((destIndex+length)<=destCapacity) { - uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR); - } - destIndex+=length; - prevSrc=src; - - /* prevCC<0 is only possible from the above loop, i.e., only if prevSrc=0 - */ - - /* end of source reached? */ - if(limit==NULL ? c==0 : src==limit) { - break; - } - - /* set a pointer to after the last source position where prevCC==0 */ - if(prevCC==0) { - decompStart=prevSrc; - } - - /* c already contains *src and fcd16 is set for it, increment src */ - ++src; - - /* check one above-minimum, relevant code unit */ - if(UTF_IS_FIRST_SURROGATE(c)) { - /* c is a lead surrogate, get the real fcd16 */ - if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) { - ++src; - fcd16=_getFCD16FromSurrogatePair(fcd16, c2); - } else { - c2=0; - fcd16=0; - } - } else { - c2=0; - } - - /* we are looking at the character (c, c2) at [prevSrc..src[ */ - if(nx_contains(nx, c, c2)) { - fcd16=0; /* excluded: fcd16==0 */ - } - - /* check the combining order, get the lead cc */ - cc=(int16_t)(fcd16>>8); - if(cc==0 || cc>=prevCC) { - /* the order is ok */ - if(cc==0) { - decompStart=prevSrc; - } - prevCC=(int16_t)(fcd16&0xff); - - /* just append (c, c2) */ - length= c2==0 ? 1 : 2; - if((destIndex+length)<=destCapacity) { - dest[destIndex++]=c; - if(c2!=0) { - dest[destIndex++]=c2; - } - } else { - destIndex+=length; - } - } else { - /* - * back out the part of the source that we copied already but - * is now going to be decomposed; - * prevSrc is set to after what was copied - */ - destIndex-=(int32_t)(prevSrc-decompStart); - - /* - * find the part of the source that needs to be decomposed; - * to be safe and simple, decompose to before the next character with lead cc==0 - */ - src=_findSafeFCD(src, limit, fcd16); - - /* - * the source text does not fulfill the conditions for FCD; - * decompose and reorder a limited piece of the text - */ - prevCC=_decomposeFCD(decompStart, src, - dest, destIndex, destCapacity, - nx); - decompStart=src; - } - } - - return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode); -} - -/* quick check functions ---------------------------------------------------- */ - -static UBool -unorm_checkFCD(const UChar *src, int32_t srcLength, const UnicodeSet *nx) { - const UChar *limit; - UChar c, c2; - uint16_t fcd16; - int16_t prevCC, cc; - - /* initialize */ - prevCC=0; - - if(srcLength>=0) { - /* string with length */ - limit=src+srcLength; - } else /* srcLength==-1 */ { - /* zero-terminated string */ - limit=NULL; - } - - U_ALIGN_CODE(16); - - for(;;) { - /* skip a run of code units below the minimum or with irrelevant data for the FCD check */ - if(limit==NULL) { - for(;;) { - c=*src++; - if(c<_NORM_MIN_WITH_LEAD_CC) { - if(c==0) { - return TRUE; - } - /* - * delay _getFCD16(c) for any character <_NORM_MIN_WITH_LEAD_CC - * because chances are good that the next one will have - * a leading cc of 0; - * _getFCD16(-prevCC) is later called when necessary - - * -c fits into int16_t because it is <_NORM_MIN_WITH_LEAD_CC==0x300 - */ - prevCC=(int16_t)-c; - } else if((fcd16=_getFCD16(c))==0) { - prevCC=0; - } else { - break; - } - } - } else { - for(;;) { - if(src==limit) { - return TRUE; - } else if((c=*src++)<_NORM_MIN_WITH_LEAD_CC) { - prevCC=(int16_t)-c; - } else if((fcd16=_getFCD16(c))==0) { - prevCC=0; - } else { - break; - } - } - } - - /* check one above-minimum, relevant code unit */ - if(UTF_IS_FIRST_SURROGATE(c)) { - /* c is a lead surrogate, get the real fcd16 */ - if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) { - ++src; - fcd16=_getFCD16FromSurrogatePair(fcd16, c2); - } else { - c2=0; - fcd16=0; - } - } else { - c2=0; - } - - if(nx_contains(nx, c, c2)) { - prevCC=0; /* excluded: fcd16==0 */ - continue; - } - - /* - * prevCC has values from the following ranges: - * 0..0xff - the previous trail combining class - * <0 - the negative value of the previous code unit; - * that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16() - * was deferred so that average text is checked faster - */ - - /* check the combining order */ - cc=(int16_t)(fcd16>>8); - if(cc!=0) { - if(prevCC<0) { - /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */ - if(!nx_contains(nx, (UChar32)-prevCC)) { - prevCC=(int16_t)(_getFCD16((UChar)-prevCC)&0xff); - } else { - prevCC=0; /* excluded: fcd16==0 */ - } - } - - if(cc=0) { - /* string with length */ - limit=src+srcLength; - } else /* srcLength==-1 */ { - /* zero-terminated string */ - limit=NULL; - } - - U_ALIGN_CODE(16); - - for(;;) { - /* skip a run of code units below the minimum or with irrelevant data for the quick check */ - if(limit==NULL) { - for(;;) { - c=*src++; - if(c=minNoMaybe && ((norm32=_getNorm32(c))&ccOrQCMask)!=0) { - break; - } - prevCC=0; - } - } - - /* check one above-minimum, relevant code unit */ - if(isNorm32LeadSurrogate(norm32)) { - /* c is a lead surrogate, get the real norm32 */ - if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) { - ++src; - norm32=_getNorm32FromSurrogatePair(norm32, c2); - } else { - c2=0; - norm32=0; - } - } else { - c2=0; - } - - if(nx_contains(nx, c, c2)) { - /* excluded: norm32==0 */ - norm32=0; - } - - /* check the combining order */ - cc=(uint8_t)(norm32>>_NORM_CC_SHIFT); - if(cc!=0 && cc(static_cast(&fn2)), + src, srcLength, pErrorCode); + } else { + return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode); + } } U_CAPI UBool U_EXPORT2 unorm_isNormalized(const UChar *src, int32_t srcLength, UNormalizationMode mode, UErrorCode *pErrorCode) { - return (UBool)(UNORM_YES==_quickCheck(src, srcLength, mode, FALSE, NULL, pErrorCode)); + const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); + return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode); } U_CAPI UBool U_EXPORT2 unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength, UNormalizationMode mode, int32_t options, UErrorCode *pErrorCode) { - return (UBool)(UNORM_YES==_quickCheck(src, srcLength, mode, FALSE, getNX(options, *pErrorCode), pErrorCode)); -} - -/* normalize() API ---------------------------------------------------------- */ - -/** - * Internal API for normalizing. - * Does not check for bad input. - * Requires _haveData() to be true. - * @internal - */ -U_CFUNC int32_t -unorm_internalNormalizeWithNX(UChar *dest, int32_t destCapacity, - const UChar *src, int32_t srcLength, - UNormalizationMode mode, int32_t options, const UnicodeSet *nx, - UErrorCode *pErrorCode) { - int32_t destLength; - uint8_t trailCC; - - switch(mode) { - case UNORM_NFD: - destLength=_decompose(dest, destCapacity, - src, srcLength, - FALSE, nx, trailCC); - break; - case UNORM_NFKD: - destLength=_decompose(dest, destCapacity, - src, srcLength, - TRUE, nx, trailCC); - break; - case UNORM_NFC: - destLength=_compose(dest, destCapacity, - src, srcLength, - options, nx, pErrorCode); - break; - case UNORM_NFKC: - destLength=_compose(dest, destCapacity, - src, srcLength, - options|_NORM_OPTIONS_COMPAT, nx, pErrorCode); - break; - case UNORM_FCD: - return unorm_makeFCD(dest, destCapacity, - src, srcLength, - nx, - pErrorCode); -#if 0 - case UNORM_FCC: - destLength=_compose(dest, destCapacity, - src, srcLength, - options|_NORM_OPTIONS_COMPOSE_CONTIGUOUS, nx, pErrorCode); - break; -#endif - case UNORM_NONE: - /* just copy the string */ - if(srcLength==-1) { - srcLength=u_strlen(src); - } - if(srcLength>0 && srcLength<=destCapacity) { - uprv_memcpy(dest, src, srcLength*U_SIZEOF_UCHAR); - } - destLength=srcLength; - break; - default: - *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; - return 0; + const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); + if(options&UNORM_UNICODE_3_2) { + FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); + return unorm2_isNormalized( + reinterpret_cast(static_cast(&fn2)), + src, srcLength, pErrorCode); + } else { + return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode); } - - return u_terminateUChars(dest, destCapacity, destLength, pErrorCode); } -/** - * Internal API for normalizing. - * Does not check for bad input. - * @internal - */ -U_CAPI int32_t U_EXPORT2 -unorm_internalNormalize(UChar *dest, int32_t destCapacity, - const UChar *src, int32_t srcLength, - UNormalizationMode mode, int32_t options, - UErrorCode *pErrorCode) { - const UnicodeSet *nx; - - if(!_haveData(*pErrorCode)) { - return 0; - } - - nx=getNX(options, *pErrorCode); - if(U_FAILURE(*pErrorCode)) { - return 0; - } - - /* reset options bits that should only be set inside unorm_internalNormalizeWithNX() */ - options&=~(_NORM_OPTIONS_SETS_MASK|_NORM_OPTIONS_COMPAT|_NORM_OPTIONS_COMPOSE_CONTIGUOUS); - - return unorm_internalNormalizeWithNX(dest, destCapacity, - src, srcLength, - mode, options, nx, - pErrorCode); -} +/* normalize() API ---------------------------------------------------------- */ /** Public API for normalizing. */ U_CAPI int32_t U_EXPORT2 @@ -3283,417 +96,112 @@ unorm_normalize(const UChar *src, int32_t srcLength, UNormalizationMode mode, int32_t options, UChar *dest, int32_t destCapacity, UErrorCode *pErrorCode) { - /* check argument values */ - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { - return 0; - } - - if( destCapacity<0 || (dest==NULL && destCapacity>0) || - src==NULL || srcLength<-1 - ) { - *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; - return 0; - } - - /* check for overlapping src and destination */ - if( dest!=NULL && - ((src>=dest && src<(dest+destCapacity)) || - (srcLength>0 && dest>=src && dest<(src+srcLength))) - ) { - *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; - return 0; - } - - return unorm_internalNormalize(dest, destCapacity, - src, srcLength, - mode, options, - pErrorCode); -} - - -/* iteration functions ------------------------------------------------------ */ - -/* - * These iteration functions are the core implementations of the - * Normalizer class iteration API. - * They read from a UCharIterator into their own buffer - * and normalize into the Normalizer iteration buffer. - * Normalizer itself then iterates over its buffer until that needs to be - * filled again. - */ - -/* - * ### TODO: - * Now that UCharIterator.next/previous return (int32_t)-1 not (UChar)0xffff - * if iteration bounds are reached, - * try to not call hasNext/hasPrevious and instead check for >=0. - */ - -/* backward iteration ------------------------------------------------------- */ - -/* - * read backwards and get norm32 - * return 0 if the character is (static_cast(&fn2)), + src, srcLength, dest, destCapacity, pErrorCode); } else { - /* unpaired second surrogate, undo the c2=src.previous() movement */ - src.move(&src, 1, UITER_CURRENT); - c2=0; - return 0; + return unorm2_normalize((const UNormalizer2 *)n2, + src, srcLength, dest, destCapacity, pErrorCode); } } -/* - * read backwards and check if the character is a previous-iteration boundary - * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!) - */ -typedef UBool -IsPrevBoundaryFn(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2); - -/* - * for NF*D: - * read backwards and check if the lead combining class is 0 - * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!) - */ -static UBool -_isPrevNFDSafe(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) { - return _isNFDSafe(_getPrevNorm32(src, minC, ccOrQCMask, c, c2), ccOrQCMask, ccOrQCMask&_NORM_QC_MASK); -} -/* - * read backwards and check if the character is (or its decomposition begins with) - * a "true starter" (cc==0 and NF*C_YES) - * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!) - */ -static UBool -_isPrevTrueStarter(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) { - uint32_t norm32, decompQCMask; - - decompQCMask=(ccOrQCMask<<2)&0xf; /* decomposition quick check mask */ - norm32=_getPrevNorm32(src, minC, ccOrQCMask|decompQCMask, c, c2); - return _isTrueStarter(norm32, ccOrQCMask, decompQCMask); -} +/* iteration functions ------------------------------------------------------ */ static int32_t -_findPreviousIterationBoundary(UCharIterator &src, - IsPrevBoundaryFn *isPrevBoundary, uint32_t minC, uint32_t mask, - UChar *&buffer, int32_t &bufferCapacity, - int32_t &startIndex, - UErrorCode *pErrorCode) { - UChar *stackBuffer; - UChar c, c2; - UBool isBoundary; - - /* initialize */ - stackBuffer=buffer; - startIndex=bufferCapacity; /* fill the buffer from the end backwards */ - - while(src.hasPrevious(&src)) { - isBoundary=isPrevBoundary(src, minC, mask, c, c2); - - /* always write this character to the front of the buffer */ - /* make sure there is enough space in the buffer */ - if(startIndex < (c2==0 ? 1 : 2)) { - int32_t bufferLength=bufferCapacity; - - if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*bufferCapacity, bufferLength)) { - *pErrorCode=U_MEMORY_ALLOCATION_ERROR; - src.move(&src, 0, UITER_START); - return 0; - } - - /* move the current buffer contents up */ - uprv_memmove(buffer+(bufferCapacity-bufferLength), buffer, bufferLength*U_SIZEOF_UCHAR); - startIndex+=bufferCapacity-bufferLength; - } - - buffer[--startIndex]=c; - if(c2!=0) { - buffer[--startIndex]=c2; - } - - /* stop if this just-copied character is a boundary */ - if(isBoundary) { - break; - } - } - - /* return the length of the buffer contents */ - return bufferCapacity-startIndex; -} - -U_CAPI int32_t U_EXPORT2 -unorm_previous(UCharIterator *src, - UChar *dest, int32_t destCapacity, - UNormalizationMode mode, int32_t options, - UBool doNormalize, UBool *pNeededToNormalize, - UErrorCode *pErrorCode) { - UChar stackBuffer[100]; - UChar *buffer=NULL; - IsPrevBoundaryFn *isPreviousBoundary=NULL; - uint32_t mask=0; - int32_t startIndex=0, bufferLength=0, bufferCapacity=0, destLength=0; - int32_t c=0, c2=0; - UChar minC=0; - - /* check argument values */ - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { +_iterate(UCharIterator *src, UBool forward, + UChar *dest, int32_t destCapacity, + const Normalizer2 *n2, + UBool doNormalize, UBool *pNeededToNormalize, + UErrorCode *pErrorCode) { + if(U_FAILURE(*pErrorCode)) { return 0; } - - if( destCapacity<0 || (dest==NULL && destCapacity>0) || - src==NULL - ) { + if(destCapacity<0 || (dest==NULL && destCapacity>0) || src==NULL) { *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; return 0; } - if(!_haveData(*pErrorCode)) { - return 0; - } - if(pNeededToNormalize!=NULL) { *pNeededToNormalize=FALSE; } + if(!(forward ? src->hasNext(src) : src->hasPrevious(src))) { + return u_terminateUChars(dest, destCapacity, 0, pErrorCode); + } - switch(mode) { - case UNORM_NFD: - case UNORM_FCD: - isPreviousBoundary=_isPrevNFDSafe; - minC=_NORM_MIN_WITH_LEAD_CC; - mask=_NORM_CC_MASK|_NORM_QC_NFD; - break; - case UNORM_NFKD: - isPreviousBoundary=_isPrevNFDSafe; - minC=_NORM_MIN_WITH_LEAD_CC; - mask=_NORM_CC_MASK|_NORM_QC_NFKD; - break; - case UNORM_NFC: - isPreviousBoundary=_isPrevTrueStarter; - minC=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]; - mask=_NORM_CC_MASK|_NORM_QC_NFC; - break; - case UNORM_NFKC: - isPreviousBoundary=_isPrevTrueStarter; - minC=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]; - mask=_NORM_CC_MASK|_NORM_QC_NFKC; - break; - case UNORM_NONE: - destLength=0; - if((c=src->previous(src))>=0) { - destLength=1; - if(UTF_IS_TRAIL(c) && (c2=src->previous(src))>=0) { - if(UTF_IS_LEAD(c2)) { - if(destCapacity>=2) { - dest[1]=(UChar)c; /* trail surrogate */ - destLength=2; - } - c=c2; /* lead surrogate to be written below */ - } else { - src->move(src, 1, UITER_CURRENT); - } + UnicodeString buffer; + UChar32 c; + if(forward) { + /* get one character and ignore its properties */ + buffer.append(uiter_next32(src)); + /* get all following characters until we see a boundary */ + while((c=uiter_next32(src))>=0) { + if(n2->hasBoundaryBefore(c)) { + /* back out the latest movement to stop at the boundary */ + src->move(src, -U16_LENGTH(c), UITER_CURRENT); + break; + } else { + buffer.append(c); } - - if(destCapacity>0) { - dest[0]=(UChar)c; + } + } else { + while((c=uiter_previous32(src))>=0) { + /* always write this character to the front of the buffer */ + buffer.insert(0, c); + /* stop if this just-copied character is a boundary */ + if(n2->hasBoundaryBefore(c)) { + break; } } - return u_terminateUChars(dest, destCapacity, destLength, pErrorCode); - default: - *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; - return 0; } - buffer=stackBuffer; - bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR); - bufferLength=_findPreviousIterationBoundary(*src, - isPreviousBoundary, minC, mask, - buffer, bufferCapacity, - startIndex, - pErrorCode); - if(bufferLength>0) { - if(doNormalize) { - destLength=unorm_internalNormalize(dest, destCapacity, - buffer+startIndex, bufferLength, - mode, options, - pErrorCode); - if(pNeededToNormalize!=0 && U_SUCCESS(*pErrorCode)) { - *pNeededToNormalize= - (UBool)(destLength!=bufferLength || - 0!=uprv_memcmp(dest, buffer+startIndex, destLength*U_SIZEOF_UCHAR)); - } - } else { - /* just copy the source characters */ - if(destCapacity>0) { - uprv_memcpy(dest, buffer+startIndex, uprv_min(bufferLength, destCapacity)*U_SIZEOF_UCHAR); - } - destLength=u_terminateUChars(dest, destCapacity, bufferLength, pErrorCode); + UnicodeString destString(dest, 0, destCapacity); + if(buffer.length()>0 && doNormalize) { + n2->normalize(buffer, destString, *pErrorCode).extract(dest, destCapacity, *pErrorCode); + if(pNeededToNormalize!=NULL && U_SUCCESS(*pErrorCode)) { + *pNeededToNormalize= destString!=buffer; } + return destString.length(); } else { - destLength=u_terminateUChars(dest, destCapacity, 0, pErrorCode); - } - - /* cleanup */ - if(buffer!=stackBuffer) { - uprv_free(buffer); + /* just copy the source characters */ + return buffer.extract(dest, destCapacity, *pErrorCode); } - - return destLength; } -/* forward iteration -------------------------------------------------------- */ - -/* - * read forward and get norm32 - * return 0 if the character is 0) || - src==NULL - ) { - *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; - return 0; - } - - if(!_haveData(*pErrorCode)) { - return 0; - } - - if(pNeededToNormalize!=NULL) { - *pNeededToNormalize=FALSE; - } - - switch(mode) { - case UNORM_NFD: - case UNORM_FCD: - isNextBoundary=_isNextNFDSafe; - minC=_NORM_MIN_WITH_LEAD_CC; - mask=_NORM_CC_MASK|_NORM_QC_NFD; - break; - case UNORM_NFKD: - isNextBoundary=_isNextNFDSafe; - minC=_NORM_MIN_WITH_LEAD_CC; - mask=_NORM_CC_MASK|_NORM_QC_NFKD; - break; - case UNORM_NFC: - isNextBoundary=_isNextTrueStarter; - minC=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]; - mask=_NORM_CC_MASK|_NORM_QC_NFC; - break; - case UNORM_NFKC: - isNextBoundary=_isNextTrueStarter; - minC=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]; - mask=_NORM_CC_MASK|_NORM_QC_NFKC; - break; - case UNORM_NONE: - destLength=0; - if((c=src->next(src))>=0) { - destLength=1; - if(UTF_IS_LEAD(c) && (c2=src->next(src))>=0) { - if(UTF_IS_TRAIL(c2)) { - if(destCapacity>=2) { - dest[1]=(UChar)c2; /* trail surrogate */ - destLength=2; - } - /* lead surrogate to be written below */ - } else { - src->move(src, -1, UITER_CURRENT); - } - } - - if(destCapacity>0) { - dest[0]=(UChar)c; - } - } - return u_terminateUChars(dest, destCapacity, destLength, pErrorCode); - default: - *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; - return 0; - } - - buffer=stackBuffer; - bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR); - bufferLength=_findNextIterationBoundary(*src, - isNextBoundary, minC, mask, - buffer, bufferCapacity, - pErrorCode); - if(bufferLength>0) { - if(doNormalize) { - destLength=unorm_internalNormalize(dest, destCapacity, - buffer, bufferLength, - mode, options, - pErrorCode); - if(pNeededToNormalize!=0 && U_SUCCESS(*pErrorCode)) { - *pNeededToNormalize= - (UBool)(destLength!=bufferLength || - 0!=uprv_memcmp(dest, buffer, destLength*U_SIZEOF_UCHAR)); - } - } else { - /* just copy the source characters */ - if(destCapacity>0) { - uprv_memcpy(dest, buffer, uprv_min(bufferLength, destCapacity)*U_SIZEOF_UCHAR); - } - destLength=u_terminateUChars(dest, destCapacity, bufferLength, pErrorCode); - } - } else { - destLength=u_terminateUChars(dest, destCapacity, 0, pErrorCode); - } - - /* cleanup */ - if(buffer!=stackBuffer) { - uprv_free(buffer); - } - - return destLength; + return unorm_iterate(src, TRUE, + dest, destCapacity, + mode, options, + doNormalize, pNeededToNormalize, + pErrorCode); } -/* - * ### TODO: check if NF*D and FCD iteration finds optimal boundaries - * and if not, how hard it would be to improve it. - * For example, see _findSafeFCD(). - */ - /* Concatenation of normalized strings -------------------------------------- */ -U_CAPI int32_t U_EXPORT2 -unorm_concatenate(const UChar *left, int32_t leftLength, +static int32_t +_concatenate(const UChar *left, int32_t leftLength, const UChar *right, int32_t rightLength, UChar *dest, int32_t destCapacity, - UNormalizationMode mode, int32_t options, + const Normalizer2 *n2, UErrorCode *pErrorCode) { - UChar stackBuffer[100]; - UChar *buffer; - int32_t bufferLength, bufferCapacity; - - UCharIterator iter; - int32_t leftBoundary, rightBoundary, destLength; - - /* check argument values */ - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + if(U_FAILURE(*pErrorCode)) { return 0; } - - if( destCapacity<0 || (dest==NULL && destCapacity>0) || - left==NULL || leftLength<-1 || - right==NULL || rightLength<-1 - ) { + if(destCapacity<0 || (dest==NULL && destCapacity>0) || + left==NULL || leftLength<-1 || right==NULL || rightLength<-1) { *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; return 0; } @@ -3858,235 +244,35 @@ unorm_concatenate(const UChar *left, int32_t leftLength, } /* allow left==dest */ - - /* set up intermediate buffer */ - buffer=stackBuffer; - bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR); - - /* - * Input: left[0..leftLength[ + right[0..rightLength[ - * - * Find normalization-safe boundaries leftBoundary and rightBoundary - * and copy the end parts together: - * buffer=left[leftBoundary..leftLength[ + right[0..rightBoundary[ - * - * dest=left[0..leftBoundary[ + - * normalize(buffer) + - * right[rightBoundary..rightLength[ - */ - - /* - * find a normalization boundary at the end of the left string - * and copy the end part into the buffer - */ - uiter_setString(&iter, left, leftLength); - iter.index=leftLength=iter.length; /* end of left string */ - - bufferLength=unorm_previous(&iter, buffer, bufferCapacity, - mode, options, - FALSE, NULL, - pErrorCode); - leftBoundary=iter.index; - if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { - *pErrorCode=U_ZERO_ERROR; - if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*bufferLength, 0)) { - *pErrorCode=U_MEMORY_ALLOCATION_ERROR; - /* dont need to cleanup here since - * u_growBufferFromStatic frees buffer if(buffer!=stackBuffer) - */ - return 0; - } - - /* just copy from the left string: we know the boundary already */ - uprv_memcpy(buffer, left+leftBoundary, bufferLength*U_SIZEOF_UCHAR); - } - - /* - * find a normalization boundary at the beginning of the right string - * and concatenate the beginning part to the buffer - */ - uiter_setString(&iter, right, rightLength); - rightLength=iter.length; /* in case it was -1 */ - - rightBoundary=unorm_next(&iter, buffer+bufferLength, bufferCapacity-bufferLength, - mode, options, - FALSE, NULL, - pErrorCode); - if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { - *pErrorCode=U_ZERO_ERROR; - if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, bufferLength+rightBoundary, 0)) { - *pErrorCode=U_MEMORY_ALLOCATION_ERROR; - /* dont need to cleanup here since - * u_growBufferFromStatic frees buffer if(buffer!=stackBuffer) - */ - return 0; - } - - /* just copy from the right string: we know the boundary already */ - uprv_memcpy(buffer+bufferLength, right, rightBoundary*U_SIZEOF_UCHAR); - } - - bufferLength+=rightBoundary; - - /* copy left[0..leftBoundary[ to dest */ - if(left!=dest && leftBoundary>0 && destCapacity>0) { - uprv_memcpy(dest, left, uprv_min(leftBoundary, destCapacity)*U_SIZEOF_UCHAR); - } - destLength=leftBoundary; - - /* concatenate the normalization of the buffer to dest */ - if(destCapacity>destLength) { - destLength+=unorm_internalNormalize(dest+destLength, destCapacity-destLength, - buffer, bufferLength, - mode, options, - pErrorCode); + UnicodeString destString; + if(left==dest) { + destString.setTo(dest, leftLength, destCapacity); } else { - destLength+=unorm_internalNormalize(NULL, 0, - buffer, bufferLength, - mode, options, - pErrorCode); - } - /* - * only errorCode that is expected is a U_BUFFER_OVERFLOW_ERROR - * so we dont check for the error code here..just let it pass through - */ - /* concatenate right[rightBoundary..rightLength[ to dest */ - right+=rightBoundary; - rightLength-=rightBoundary; - if(rightLength>0 && destCapacity>destLength) { - uprv_memcpy(dest+destLength, right, uprv_min(rightLength, destCapacity-destLength)*U_SIZEOF_UCHAR); - } - destLength+=rightLength; - - /* cleanup */ - if(buffer!=stackBuffer) { - uprv_free(buffer); + destString.setTo(dest, 0, destCapacity); + destString.append(left, leftLength); } - - return u_terminateUChars(dest, destCapacity, destLength, pErrorCode); + return n2->append(destString, UnicodeString(rightLength<0, right, rightLength), *pErrorCode). + extract(dest, destCapacity, *pErrorCode); } -/* data swapping ------------------------------------------------------------ */ - U_CAPI int32_t U_EXPORT2 -unorm_swap(const UDataSwapper *ds, - const void *inData, int32_t length, void *outData, - UErrorCode *pErrorCode) { - const UDataInfo *pInfo; - int32_t headerSize; - - const uint8_t *inBytes; - uint8_t *outBytes; - - const int32_t *inIndexes; - int32_t indexes[32]; - - int32_t i, offset, count, size; - - /* udata_swapDataHeader checks the arguments */ - headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { - return 0; - } - - /* check data format and format version */ - pInfo=(const UDataInfo *)((const char *)inData+4); - if(!( - pInfo->dataFormat[0]==0x4e && /* dataFormat="Norm" */ - pInfo->dataFormat[1]==0x6f && - pInfo->dataFormat[2]==0x72 && - pInfo->dataFormat[3]==0x6d && - pInfo->formatVersion[0]==2 - )) { - udata_printError(ds, "unorm_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unorm.icu\n", - pInfo->dataFormat[0], pInfo->dataFormat[1], - pInfo->dataFormat[2], pInfo->dataFormat[3], - pInfo->formatVersion[0]); - *pErrorCode=U_UNSUPPORTED_ERROR; - return 0; - } - - inBytes=(const uint8_t *)inData+headerSize; - outBytes=(uint8_t *)outData+headerSize; - - inIndexes=(const int32_t *)inBytes; - - if(length>=0) { - length-=headerSize; - if(length<32*4) { - udata_printError(ds, "unorm_swap(): too few bytes (%d after header) for unorm.icu\n", - length); - *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; - return 0; - } - } - - /* read the first 32 indexes (ICU 2.8/format version 2.2: _NORM_INDEX_TOP==32, might grow) */ - for(i=0; i<32; ++i) { - indexes[i]=udata_readInt32(ds, inIndexes[i]); - } - - /* calculate the total length of the data */ - size= - 32*4+ /* size of indexes[] */ - indexes[_NORM_INDEX_TRIE_SIZE]+ - indexes[_NORM_INDEX_UCHAR_COUNT]*2+ - indexes[_NORM_INDEX_COMBINE_DATA_COUNT]*2+ - indexes[_NORM_INDEX_FCD_TRIE_SIZE]+ - indexes[_NORM_INDEX_AUX_TRIE_SIZE]+ - indexes[_NORM_INDEX_CANON_SET_COUNT]*2; - - if(length>=0) { - if(lengthswapArray32(ds, inBytes, count, outBytes, pErrorCode); - offset+=count; - - /* swap the main UTrie */ - count=indexes[_NORM_INDEX_TRIE_SIZE]; - utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode); - offset+=count; - - /* swap the uint16_t extraData[] and the uint16_t combiningTable[] */ - count=(indexes[_NORM_INDEX_UCHAR_COUNT]+indexes[_NORM_INDEX_COMBINE_DATA_COUNT])*2; - ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode); - offset+=count; - - /* swap the FCD UTrie */ - count=indexes[_NORM_INDEX_FCD_TRIE_SIZE]; - if(count!=0) { - utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode); - offset+=count; - } - - /* swap the aux UTrie */ - count=indexes[_NORM_INDEX_AUX_TRIE_SIZE]; - if(count!=0) { - utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode); - offset+=count; - } - - /* swap the uint16_t combiningTable[] */ - count=indexes[_NORM_INDEX_CANON_SET_COUNT]*2; - ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode); - offset+=count; + FilteredNormalizer2 fn2(*n2, *uni32); + return _concatenate(left, leftLength, right, rightLength, + dest, destCapacity, &fn2, pErrorCode); } - - return headerSize+size; + return _concatenate(left, leftLength, right, rightLength, + dest, destCapacity, n2, pErrorCode); } #endif /* #if !UCONFIG_NO_NORMALIZATION */