X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/729e4ab9bc6618bc3d8a898e575df7f4019e29ca..ef6cf650f4a75c3f97de06b51fa104f2069b9ea2:/icuSources/common/normalizer2impl.h diff --git a/icuSources/common/normalizer2impl.h b/icuSources/common/normalizer2impl.h index ebcf9815..eb026dbe 100644 --- a/icuSources/common/normalizer2impl.h +++ b/icuSources/common/normalizer2impl.h @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 2009-2010, International Business Machines +* Copyright (C) 2009-2014, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -22,26 +22,30 @@ #if !UCONFIG_NO_NORMALIZATION #include "unicode/normalizer2.h" -#include "unicode/udata.h" #include "unicode/unistr.h" #include "unicode/unorm.h" +#include "unicode/utf16.h" #include "mutex.h" #include "uset_imp.h" #include "utrie2.h" U_NAMESPACE_BEGIN -class CanonIterData; +struct CanonIterData; -class Hangul { +class U_COMMON_API Hangul { public: /* Korean Hangul and Jamo constants */ enum { JAMO_L_BASE=0x1100, /* "lead" jamo */ + JAMO_L_END=0x1112, JAMO_V_BASE=0x1161, /* "vowel" jamo */ + JAMO_V_END=0x1175, JAMO_T_BASE=0x11a7, /* "trail" jamo */ + JAMO_T_END=0x11c2, HANGUL_BASE=0xac00, + HANGUL_END=0xd7a3, JAMO_L_COUNT=19, JAMO_V_COUNT=21, @@ -85,13 +89,31 @@ public: return 3; } } + + /** + * Decomposes c, which must be a Hangul syllable, into buffer. + * This is the raw, not recursive, decomposition. Its length is always 2. + */ + static inline void getRawDecomposition(UChar32 c, UChar buffer[2]) { + UChar32 orig=c; + c-=HANGUL_BASE; + UChar32 c2=c%JAMO_T_COUNT; + if(c2==0) { + c/=JAMO_T_COUNT; + buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT); + buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT); + } else { + buffer[0]=orig-c2; // LV syllable + buffer[1]=(UChar)(JAMO_T_BASE+c2); + } + } private: Hangul(); // no instantiation }; class Normalizer2Impl; -class ReorderingBuffer : public UMemory { +class U_COMMON_API ReorderingBuffer : public UMemory { public: ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest) : impl(ni), str(dest), @@ -151,6 +173,9 @@ public: reorderStart=limit=newLimit; lastCC=0; } + void copyReorderableSuffixTo(UnicodeString &s) const { + s.setTo(reorderStart, (int32_t)(limit-reorderStart)); + } private: /* * TODO: Revisit whether it makes sense to track reorderStart. @@ -191,23 +216,23 @@ private: UChar *codePointStart, *codePointLimit; }; -class U_COMMON_API Normalizer2Impl : public UMemory { +class U_COMMON_API Normalizer2Impl : public UObject { public: - Normalizer2Impl() : memory(NULL), normTrie(NULL) { - fcdTrieSingleton.fInstance=NULL; - canonIterDataSingleton.fInstance=NULL; + Normalizer2Impl() : normTrie(NULL), fCanonIterData(NULL) { + fCanonIterDataInitOnce.reset(); } - ~Normalizer2Impl(); + virtual ~Normalizer2Impl(); - void load(const char *packageName, const char *name, UErrorCode &errorCode); + void init(const int32_t *inIndexes, const UTrie2 *inTrie, + const uint16_t *inExtraData, const uint8_t *inSmallFCD); + void addLcccChars(UnicodeSet &set) const; void addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const; void addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const; // low-level properties ------------------------------------------------ *** const UTrie2 *getNormTrie() const { return normTrie; } - const UTrie2 *getFCDTrie(UErrorCode &errorCode) const ; UBool ensureCanonIterData(UErrorCode &errorCode) const; @@ -222,6 +247,7 @@ public: return UNORM_NO; } } + UBool isAlgorithmicNoNo(uint16_t norm16) const { return limitNoNo<=norm16 && norm16=MIN_NORMAL_MAYBE_YES ? (uint8_t)norm16 : 0; } - uint16_t getFCD16(UChar32 c) const { return UTRIE2_GET16(fcdTrie(), c); } - uint16_t getFCD16FromSingleLead(UChar c) const { - return UTRIE2_GET16_FROM_U16_SINGLE_LEAD(fcdTrie(), c); + /** + * Returns the FCD data for code point c. + * @param c A Unicode code point. + * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. + */ + uint16_t getFCD16(UChar32 c) const { + if(c<0) { + return 0; + } else if(c<0x180) { + return tccc180[c]; + } else if(c<=0xffff) { + if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } + } + return getFCD16FromNormData(c); } - uint16_t getFCD16FromSupplementary(UChar32 c) const { - return UTRIE2_GET16_FROM_SUPP(fcdTrie(), c); + /** + * Returns the FCD data for the next code point (post-increment). + * Might skip only a lead surrogate rather than the whole surrogate pair if none of + * the supplementary code points associated with the lead surrogate have non-zero FCD data. + * @param s A valid pointer into a string. Requires s!=limit. + * @param limit The end of the string, or NULL. + * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. + */ + uint16_t nextFCD16(const UChar *&s, const UChar *limit) const { + UChar32 c=*s++; + if(c<0x180) { + return tccc180[c]; + } else if(!singleLeadMightHaveNonZeroFCD16(c)) { + return 0; + } + UChar c2; + if(U16_IS_LEAD(c) && s!=limit && U16_IS_TRAIL(c2=*s)) { + c=U16_GET_SUPPLEMENTARY(c, c2); + ++s; + } + return getFCD16FromNormData(c); } - uint16_t getFCD16FromSurrogatePair(UChar c, UChar c2) const { - return getFCD16FromSupplementary(U16_GET_SUPPLEMENTARY(c, c2)); + /** + * Returns the FCD data for the previous code point (pre-decrement). + * @param start The start of the string. + * @param s A valid pointer into a string. Requires start>8]; + if(bits==0) { return false; } + return (UBool)((bits>>((lead>>5)&7))&1); + } + /** Returns the FCD value from the regular normalization data. */ + uint16_t getFCD16FromNormData(UChar32 c) const; void makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16, CanonIterData &newData, UErrorCode &errorCode) const; /** - * Get the decomposition for one code point. + * Gets the decomposition for one code point. * @param c code point * @param buffer out-only buffer for algorithmic decompositions * @param length out-only, takes the length of the decomposition, if any @@ -264,6 +350,17 @@ public: */ const UChar *getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const; + /** + * Gets the raw decomposition for one code point. + * @param c code point + * @param buffer out-only buffer for algorithmic decompositions + * @param length out-only, takes the length of the decomposition, if any + * @return pointer to the decomposition, or NULL if none + */ + const UChar *getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const; + + UChar32 composePair(UChar32 a, UChar32 b) const; + UBool isCanonSegmentStarter(UChar32 c) const; UBool getCanonStartSet(UChar32 c, UnicodeSet &set) const; @@ -283,7 +380,7 @@ public: // Byte offsets from the start of the data, after the generic header. IX_NORM_TRIE_OFFSET, IX_EXTRA_DATA_OFFSET, - IX_RESERVED2_OFFSET, + IX_SMALL_FCD_OFFSET, IX_RESERVED3_OFFSET, IX_RESERVED4_OFFSET, IX_RESERVED5_OFFSET, @@ -295,19 +392,20 @@ public: IX_MIN_COMP_NO_MAYBE_CP, // Norm16 value thresholds for quick check combinations and types of extra data. - IX_MIN_YES_NO, + IX_MIN_YES_NO, // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. IX_MIN_NO_NO, IX_LIMIT_NO_NO, IX_MIN_MAYBE_YES, - IX_RESERVED14, + IX_MIN_YES_NO_MAPPINGS_ONLY, // Mappings only in [minYesNoMappingsOnly..minNoNo[. + IX_RESERVED15, IX_COUNT }; enum { MAPPING_HAS_CCC_LCCC_WORD=0x80, - MAPPING_PLUS_COMPOSITION_LIST=0x40, + MAPPING_HAS_RAW_MAPPING=0x40, MAPPING_NO_COMP_BOUNDARY_AFTER=0x20, MAPPING_LENGTH_MASK=0x1f }; @@ -324,10 +422,23 @@ public: // higher-level functionality ------------------------------------------ *** + // NFD without an NFD Normalizer2 instance. + UnicodeString &decompose(const UnicodeString &src, UnicodeString &dest, + UErrorCode &errorCode) const; + /** + * Decomposes [src, limit[ and writes the result to dest. + * limit can be NULL if src is NUL-terminated. + * destLengthEstimate is the initial dest buffer capacity and can be -1. + */ + void decompose(const UChar *src, const UChar *limit, + UnicodeString &dest, int32_t destLengthEstimate, + UErrorCode &errorCode) const; + const UChar *decompose(const UChar *src, const UChar *limit, ReorderingBuffer *buffer, UErrorCode &errorCode) const; void decomposeAndAppend(const UChar *src, const UChar *limit, UBool doDecompose, + UnicodeString &safeMiddle, ReorderingBuffer &buffer, UErrorCode &errorCode) const; UBool compose(const UChar *src, const UChar *limit, @@ -341,12 +452,14 @@ public: void composeAndAppend(const UChar *src, const UChar *limit, UBool doCompose, UBool onlyContiguous, + UnicodeString &safeMiddle, ReorderingBuffer &buffer, UErrorCode &errorCode) const; const UChar *makeFCD(const UChar *src, const UChar *limit, ReorderingBuffer *buffer, UErrorCode &errorCode) const; void makeFCDAndAppend(const UChar *src, const UChar *limit, UBool doMakeFCD, + UnicodeString &safeMiddle, ReorderingBuffer &buffer, UErrorCode &errorCode) const; @@ -365,13 +478,10 @@ public: } UBool isFCDInert(UChar32 c) const { return getFCD16(c)<=1; } private: - static UBool U_CALLCONV - isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo); - UBool isMaybe(uint16_t norm16) const { return minMaybeYes<=norm16 && norm16<=JAMO_VT; } UBool isMaybeOrNonZeroCC(uint16_t norm16) const { return norm16>=minMaybeYes; } static UBool isInert(uint16_t norm16) { return norm16==0; } - // static UBool isJamoL(uint16_t norm16) const { return norm16==1; } + static UBool isJamoL(uint16_t norm16) { return norm16==1; } static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; } UBool isHangul(uint16_t norm16) const { return norm16==minYesNo; } UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16>7)&1); // +1 if MAPPING_HAS_CCC_LCCC_WORD + (*list&MAPPING_LENGTH_MASK); // + mapping length } /** * @param c code point must have compositions @@ -466,16 +575,13 @@ private: const UChar *findPreviousCompBoundary(const UChar *start, const UChar *p) const; const UChar *findNextCompBoundary(const UChar *p, const UChar *limit) const; - const UTrie2 *fcdTrie() const { return (const UTrie2 *)fcdTrieSingleton.fInstance; } - const UChar *findPreviousFCDBoundary(const UChar *start, const UChar *p) const; const UChar *findNextFCDBoundary(const UChar *p, const UChar *limit) const; int32_t getCanonValue(UChar32 c) const; const UnicodeSet &getCanonStartSet(int32_t n) const; - UDataMemory *memory; - UVersionInfo dataVersion; + // UVersionInfo dataVersion; // Code point thresholds for quick check codes. UChar32 minDecompNoCP; @@ -483,16 +589,20 @@ private: // Norm16 value thresholds for quick check combinations and types of extra data. uint16_t minYesNo; + uint16_t minYesNoMappingsOnly; uint16_t minNoNo; uint16_t limitNoNo; uint16_t minMaybeYes; - UTrie2 *normTrie; + const UTrie2 *normTrie; const uint16_t *maybeYesCompositions; const uint16_t *extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters + const uint8_t *smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0 + uint8_t tccc180[0x180]; // tccc values for U+0000..U+017F - SimpleSingleton fcdTrieSingleton; - SimpleSingleton canonIterDataSingleton; +public: // CanonIterData is public to allow access from C callback functions. + UInitOnce fCanonIterDataInitOnce; + CanonIterData *fCanonIterData; }; // bits in canonIterData @@ -506,13 +616,8 @@ private: */ class U_COMMON_API Normalizer2Factory { public: - static const Normalizer2 *getNFCInstance(UErrorCode &errorCode); - static const Normalizer2 *getNFDInstance(UErrorCode &errorCode); static const Normalizer2 *getFCDInstance(UErrorCode &errorCode); static const Normalizer2 *getFCCInstance(UErrorCode &errorCode); - static const Normalizer2 *getNFKCInstance(UErrorCode &errorCode); - static const Normalizer2 *getNFKDInstance(UErrorCode &errorCode); - static const Normalizer2 *getNFKC_CFInstance(UErrorCode &errorCode); static const Normalizer2 *getNoopInstance(UErrorCode &errorCode); static const Normalizer2 *getInstance(UNormalizationMode mode, UErrorCode &errorCode); @@ -524,8 +629,6 @@ public: // Get the Impl instance of the Normalizer2. // Must be used only when it is known that norm2 is a Normalizer2WithImpl instance. static const Normalizer2Impl *getImpl(const Normalizer2 *norm2); - - static const UTrie2 *getFCDTrie(UErrorCode &errorCode); private: Normalizer2Factory(); // No instantiation. }; @@ -541,102 +644,19 @@ unorm2_swap(const UDataSwapper *ds, * Get the NF*_QC property for a code point, for u_getIntPropertyValue(). * @internal */ -U_CFUNC UNormalizationCheckResult U_EXPORT2 +U_CFUNC UNormalizationCheckResult unorm_getQuickCheck(UChar32 c, UNormalizationMode mode); /** - * Internal API, used by collation code. - * Get access to the internal FCD trie table to be able to perform - * incremental, per-code unit, FCD checks in collation. - * One pointer is sufficient because the trie index values are offset - * by the index size, so that the same pointer is used to access the trie data. - * Code points at fcdHighStart and above have a zero FCD value. - * @internal - */ -U_CAPI const uint16_t * U_EXPORT2 -unorm_getFCDTrieIndex(UChar32 &fcdHighStart, UErrorCode *pErrorCode); - -/** - * Internal API, used by collation code. - * Get the FCD value for a code unit, with - * bits 15..8 lead combining class - * bits 7..0 trail combining class - * - * If c is a lead surrogate and the value is not 0, - * then some of c's associated supplementary code points have a non-zero FCD value. - * - * @internal - */ -static inline uint16_t -unorm_getFCD16(const uint16_t *fcdTrieIndex, UChar c) { - return fcdTrieIndex[_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(fcdTrieIndex, c)]; -} - -/** - * Internal API, used by collation code. - * Get the FCD value of the next code point (post-increment), with - * bits 15..8 lead combining class - * bits 7..0 trail combining class - * - * @internal - */ -static inline uint16_t -unorm_nextFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart, - const UChar *&s, const UChar *limit) { - UChar32 c=*s++; - uint16_t fcd=fcdTrieIndex[_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(fcdTrieIndex, c)]; - if(fcd!=0 && U16_IS_LEAD(c)) { - UChar c2; - if(s!=limit && U16_IS_TRAIL(c2=*s)) { - ++s; - c=U16_GET_SUPPLEMENTARY(c, c2); - if(c