X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/729e4ab9bc6618bc3d8a898e575df7f4019e29ca..ef6cf650f4a75c3f97de06b51fa104f2069b9ea2:/icuSources/common/normalizer2impl.cpp?ds=inline diff --git a/icuSources/common/normalizer2impl.cpp b/icuSources/common/normalizer2impl.cpp index 52459be7..ec4809c4 100644 --- a/icuSources/common/normalizer2impl.cpp +++ b/icuSources/common/normalizer2impl.cpp @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 2009-2010, International Business Machines +* Copyright (C) 2009-2014, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -21,11 +21,12 @@ #include "unicode/normalizer2.h" #include "unicode/udata.h" #include "unicode/ustring.h" +#include "unicode/utf16.h" #include "cmemory.h" #include "mutex.h" #include "normalizer2impl.h" +#include "putilimp.h" #include "uassert.h" -#include "uhash.h" #include "uset_imp.h" #include "utrie2.h" #include "uvector.h" @@ -252,71 +253,44 @@ struct CanonIterData : public UMemory { }; Normalizer2Impl::~Normalizer2Impl() { - udata_close(memory); - utrie2_close(normTrie); - UTrie2Singleton(fcdTrieSingleton).deleteInstance(); - delete (CanonIterData *)canonIterDataSingleton.fInstance; -} - -UBool U_CALLCONV -Normalizer2Impl::isAcceptable(void *context, - const char * /* type */, const char * /*name*/, - const UDataInfo *pInfo) { - if( - pInfo->size>=20 && - pInfo->isBigEndian==U_IS_BIG_ENDIAN && - pInfo->charsetFamily==U_CHARSET_FAMILY && - pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */ - pInfo->dataFormat[1]==0x72 && - pInfo->dataFormat[2]==0x6d && - pInfo->dataFormat[3]==0x32 && - pInfo->formatVersion[0]==1 - ) { - Normalizer2Impl *me=(Normalizer2Impl *)context; - uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4); - return TRUE; - } else { - return FALSE; - } + delete fCanonIterData; } void -Normalizer2Impl::load(const char *packageName, const char *name, UErrorCode &errorCode) { - if(U_FAILURE(errorCode)) { - return; - } - memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode); - if(U_FAILURE(errorCode)) { - return; - } - const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory); - const int32_t *inIndexes=(const int32_t *)inBytes; - int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4; - if(indexesLength<=IX_MIN_MAYBE_YES) { - errorCode=U_INVALID_FORMAT_ERROR; // Not enough indexes. - return; - } - +Normalizer2Impl::init(const int32_t *inIndexes, const UTrie2 *inTrie, + const uint16_t *inExtraData, const uint8_t *inSmallFCD) { minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; minYesNo=inIndexes[IX_MIN_YES_NO]; + minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; minNoNo=inIndexes[IX_MIN_NO_NO]; limitNoNo=inIndexes[IX_LIMIT_NO_NO]; minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; - int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET]; - int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; - normTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, - inBytes+offset, nextOffset-offset, NULL, - &errorCode); - if(U_FAILURE(errorCode)) { - return; - } + normTrie=inTrie; - offset=nextOffset; - maybeYesCompositions=(const uint16_t *)(inBytes+offset); + maybeYesCompositions=inExtraData; extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes); + + smallFCD=inSmallFCD; + + // Build tccc180[]. + // gennorm2 enforces lccc=0 for c>=1) { + if((c&0xff)==0) { + bits=smallFCD[c>>8]; // one byte per 0x100 code points + } + if(bits&1) { + for(int i=0; i<0x20; ++i, ++c) { + tccc180[c]=(uint8_t)getFCD16FromNormData(c); + } + } else { + uprv_memset(tccc180+c, 0, 0x20); + c+=0x20; + } + } } uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const { @@ -334,8 +308,70 @@ uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, co } } +namespace { + +class LcccContext { +public: + LcccContext(const Normalizer2Impl &ni, UnicodeSet &s) : impl(ni), set(s) {} + + void handleRange(UChar32 start, UChar32 end, uint16_t norm16) { + if(impl.isAlgorithmicNoNo(norm16)) { + // Range of code points with same-norm16-value algorithmic decompositions. + // They might have different non-zero FCD16 values. + do { + uint16_t fcd16=impl.getFCD16(start); + if(fcd16>0xff) { set.add(start); } + } while(++start<=end); + } else { + uint16_t fcd16=impl.getFCD16(start); + if(fcd16>0xff) { set.add(start, end); } + } + } + +private: + const Normalizer2Impl &impl; + UnicodeSet &set; +}; + +struct PropertyStartsContext { + PropertyStartsContext(const Normalizer2Impl &ni, const USetAdder *adder) + : impl(ni), sa(adder) {} + + const Normalizer2Impl &impl; + const USetAdder *sa; +}; + +} // namespace + U_CDECL_BEGIN +static UBool U_CALLCONV +enumLcccRange(const void *context, UChar32 start, UChar32 end, uint32_t value) { + ((LcccContext *)context)->handleRange(start, end, (uint16_t)value); + return TRUE; +} + +static UBool U_CALLCONV +enumNorm16PropertyStartsRange(const void *context, UChar32 start, UChar32 end, uint32_t value) { + /* add the start code point to the USet */ + const PropertyStartsContext *ctx=(const PropertyStartsContext *)context; + const USetAdder *sa=ctx->sa; + sa->add(sa->set, start); + if(start!=end && ctx->impl.isAlgorithmicNoNo((uint16_t)value)) { + // Range of code points with same-norm16-value algorithmic decompositions. + // They might have different non-zero FCD16 values. + uint16_t prevFCD16=ctx->impl.getFCD16(start); + while(++start<=end) { + uint16_t fcd16=ctx->impl.getFCD16(start); + if(fcd16!=prevFCD16) { + sa->add(sa->set, start); + prevFCD16=fcd16; + } + } + } + return TRUE; +} + static UBool U_CALLCONV enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) { /* add the start code point to the USet */ @@ -351,10 +387,18 @@ segmentStarterMapper(const void * /*context*/, uint32_t value) { U_CDECL_END +void +Normalizer2Impl::addLcccChars(UnicodeSet &set) const { + /* add the start code point of each same-value range of each trie */ + LcccContext context(*this, set); + utrie2_enum(normTrie, NULL, enumLcccRange, &context); +} + void Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const { /* add the start code point of each same-value range of each trie */ - utrie2_enum(normTrie, NULL, enumPropertyStartsRange, sa); + PropertyStartsContext context(*this, sa); + utrie2_enum(normTrie, NULL, enumNorm16PropertyStartsRange, &context); /* add Hangul LV syllables and LV+1 because of skippables */ for(UChar c=Hangul::HANGUL_BASE; ctrie, - segmentStarterMapper, enumPropertyStartsRange, sa); + utrie2_enum(fCanonIterData->trie, segmentStarterMapper, enumPropertyStartsRange, sa); } } @@ -397,6 +440,38 @@ Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src, return src; } +UnicodeString & +Normalizer2Impl::decompose(const UnicodeString &src, UnicodeString &dest, + UErrorCode &errorCode) const { + if(U_FAILURE(errorCode)) { + dest.setToBogus(); + return dest; + } + const UChar *sArray=src.getBuffer(); + if(&dest==&src || sArray==NULL) { + errorCode=U_ILLEGAL_ARGUMENT_ERROR; + dest.setToBogus(); + return dest; + } + decompose(sArray, sArray+src.length(), dest, src.length(), errorCode); + return dest; +} + +void +Normalizer2Impl::decompose(const UChar *src, const UChar *limit, + UnicodeString &dest, + int32_t destLengthEstimate, + UErrorCode &errorCode) const { + if(destLengthEstimate<0 && limit!=NULL) { + destLengthEstimate=(int32_t)(limit-src); + } + dest.remove(); + ReorderingBuffer buffer(*this, dest); + if(buffer.init(destLengthEstimate, errorCode)) { + decompose(src, limit, &buffer, errorCode); + } +} + // Dual functionality: // buffer!=NULL: normalize // buffer==NULL: isNormalized/spanQuickCheckYes @@ -524,16 +599,16 @@ UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16, } else { // c decomposes, get everything from the variable-length extra data const uint16_t *mapping=getMapping(norm16); - uint16_t firstUnit=*mapping++; + uint16_t firstUnit=*mapping; int32_t length=firstUnit&MAPPING_LENGTH_MASK; uint8_t leadCC, trailCC; trailCC=(uint8_t)(firstUnit>>8); if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { - leadCC=(uint8_t)(*mapping++>>8); + leadCC=(uint8_t)(*(mapping-1)>>8); } else { leadCC=0; } - return buffer.append((const UChar *)mapping, length, leadCC, trailCC, errorCode); + return buffer.append((const UChar *)mapping+1, length, leadCC, trailCC, errorCode); } } } @@ -558,20 +633,67 @@ Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) c } else { // c decomposes, get everything from the variable-length extra data const uint16_t *mapping=getMapping(norm16); - uint16_t firstUnit=*mapping++; - length=firstUnit&MAPPING_LENGTH_MASK; - if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { - ++mapping; + length=*mapping&MAPPING_LENGTH_MASK; + return (const UChar *)mapping+1; + } + } +} + +// The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1 +// so that a raw mapping fits that consists of one unit ("rm0") +// plus all but the first two code units of the normal mapping. +// The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK. +const UChar * +Normalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const { + // We do not loop in this method because an algorithmic mapping itself + // becomes a final result rather than having to be decomposed recursively. + uint16_t norm16; + if(c>7)&1)-1; + uint16_t rm0=*rawMapping; + if(rm0<=MAPPING_LENGTH_MASK) { + length=rm0; + return (const UChar *)rawMapping-rm0; + } else { + // Copy the normal mapping and replace its first two code units with rm0. + buffer[0]=(UChar)rm0; + u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2); + length=mLength-1; + return buffer; } - return (const UChar *)mapping; + } else { + length=mLength; + return (const UChar *)mapping+1; } } } void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit, UBool doDecompose, + UnicodeString &safeMiddle, ReorderingBuffer &buffer, UErrorCode &errorCode) const { + buffer.copyReorderableSuffixTo(safeMiddle); if(doDecompose) { decompose(src, limit, &buffer, errorCode); return; @@ -584,8 +706,13 @@ void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit, prevCC=cc; cc=getCC(iter.next16()); }; - buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode) && + if(limit==NULL) { // appendZeroCC() needs limit!=NULL + limit=u_strchr(iter.codePointStart, 0); + } + + if (buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode)) { buffer.appendZeroCC(iter.codePointStart, limit, errorCode); + } } // Note: hasDecompBoundary() could be implemented as aliases to @@ -606,7 +733,7 @@ UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const { } else { // c decomposes, get everything from the variable-length extra data const uint16_t *mapping=getMapping(norm16); - uint16_t firstUnit=*mapping++; + uint16_t firstUnit=*mapping; if((firstUnit&MAPPING_LENGTH_MASK)==0) { return FALSE; } @@ -622,7 +749,7 @@ UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const { // if(trailCC==1) test leadCC==0, same as checking for before-boundary } // TRUE if leadCC==0 (hasFCDBoundaryBefore()) - return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*mapping&0xff00)==0; + return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0; } } } @@ -895,6 +1022,55 @@ void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStart buffer.setReorderingLimit(limit); } +UChar32 +Normalizer2Impl::composePair(UChar32 a, UChar32 b) const { + uint16_t norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16=0 + const uint16_t *list; + if(isInert(norm16)) { + return U_SENTINEL; + } else if(norm16minYesNo) { // composite 'a' has both mapping & compositions list + list+= // mapping pointer + 1+ // +1 to skip the first unit with the mapping lenth + (*list&MAPPING_LENGTH_MASK); // + mapping length + } + } + } else if(norm16>1; +#else + int32_t compositeAndFwd=combine(list, b); + return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL; +#endif +} + // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. // doCompose: normalize // !doCompose: isNormalized (buffer must be empty and initialized) @@ -1271,6 +1447,7 @@ Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit, void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit, UBool doCompose, UBool onlyContiguous, + UnicodeString &safeMiddle, ReorderingBuffer &buffer, UErrorCode &errorCode) const { if(!buffer.isEmpty()) { @@ -1278,9 +1455,10 @@ void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit, if(src!=firstStarterInSrc) { const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(), buffer.getLimit()); - UnicodeString middle(lastStarterInDest, - (int32_t)(buffer.getLimit()-lastStarterInDest)); - buffer.removeSuffix((int32_t)(buffer.getLimit()-lastStarterInDest)); + int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest); + UnicodeString middle(lastStarterInDest, destSuffixLength); + buffer.removeSuffix(destSuffixLength); + safeMiddle=middle; middle.append(src, (int32_t)(firstStarterInSrc-src)); const UChar *middleStart=middle.getBuffer(); compose(middleStart, middleStart+middle.length(), onlyContiguous, @@ -1294,6 +1472,9 @@ void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit, if(doCompose) { compose(src, limit, onlyContiguous, TRUE, buffer, errorCode); } else { + if(limit==NULL) { // appendZeroCC() needs limit!=NULL + limit=u_strchr(src, 0); + } buffer.appendZeroCC(src, limit, errorCode); } } @@ -1317,14 +1498,14 @@ UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const { } else { // c decomposes, get everything from the variable-length extra data const uint16_t *mapping=getMapping(norm16); - uint16_t firstUnit=*mapping++; + uint16_t firstUnit=*mapping; if((firstUnit&MAPPING_LENGTH_MASK)==0) { return FALSE; } - if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*mapping++&0xff00)) { + if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*(mapping-1)&0xff00)) { return FALSE; // non-zero leadCC } - int32_t i=0; + int32_t i=1; // skip over the firstUnit UChar32 c; U16_NEXT_UNSAFE(mapping, i, c); return isCompYesAndZeroCC(getNorm16(c)); @@ -1338,7 +1519,8 @@ UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBo if(isInert(norm16)) { return TRUE; } else if(norm16<=minYesNo) { - // Hangul LVT (==minYesNo) has a boundary after it. + // Hangul: norm16==minYesNo + // Hangul LVT has a boundary after it. // Hangul LV and non-inert yesYes characters combine forward. return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c); } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) { @@ -1352,12 +1534,13 @@ UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBo const uint16_t *mapping=getMapping(norm16); uint16_t firstUnit=*mapping; // TRUE if - // c is not deleted, and - // it and its decomposition do not combine forward, and it has a starter, and - // if FCC then trailCC<=1 + // not MAPPING_NO_COMP_BOUNDARY_AFTER + // (which is set if + // c is not deleted, and + // it and its decomposition do not combine forward, and it has a starter) + // and if FCC then trailCC<=1 return - (firstUnit&MAPPING_LENGTH_MASK)!=0 && - (firstUnit&(MAPPING_PLUS_COMPOSITION_LIST|MAPPING_NO_COMP_BOUNDARY_AFTER))==0 && + (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 && (!onlyContiguous || firstUnit<=0x1ff); } } @@ -1383,95 +1566,30 @@ const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar * return iter.codePointStart; } -class FCDTrieSingleton : public UTrie2Singleton { -public: - FCDTrieSingleton(SimpleSingleton &s, Normalizer2Impl &ni, UErrorCode &ec) : - UTrie2Singleton(s), impl(ni), errorCode(ec) {} - UTrie2 *getInstance(UErrorCode &errorCode) { - return UTrie2Singleton::getInstance(createInstance, this, errorCode); - } - static void *createInstance(const void *context, UErrorCode &errorCode); - UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { - if(value!=0) { - impl.setFCD16FromNorm16(start, end, (uint16_t)value, newFCDTrie, errorCode); - } - return U_SUCCESS(errorCode); - } - - Normalizer2Impl &impl; - UTrie2 *newFCDTrie; - UErrorCode &errorCode; -}; - -U_CDECL_BEGIN - -// Set the FCD value for a range of same-norm16 characters. -static UBool U_CALLCONV -enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) { - return ((FCDTrieSingleton *)context)->rangeHandler(start, end, value); -} - -// Collect (OR together) the FCD values for a range of supplementary characters, -// for their lead surrogate code unit. -static UBool U_CALLCONV -enumRangeOrValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) { - *((uint32_t *)context)|=value; - return TRUE; -} - -U_CDECL_END - -void *FCDTrieSingleton::createInstance(const void *context, UErrorCode &errorCode) { - FCDTrieSingleton *me=(FCDTrieSingleton *)context; - me->newFCDTrie=utrie2_open(0, 0, &errorCode); - if(U_SUCCESS(errorCode)) { - utrie2_enum(me->impl.getNormTrie(), NULL, enumRangeHandler, me); - for(UChar lead=0xd800; lead<0xdc00; ++lead) { - uint32_t oredValue=utrie2_get32(me->newFCDTrie, lead); - utrie2_enumForLeadSurrogate(me->newFCDTrie, lead, NULL, enumRangeOrValue, &oredValue); - if(oredValue!=0) { - // Set a "bad" value for makeFCD() to break the quick check loop - // and look up the value for the supplementary code point. - // If there is any lccc, then set the worst-case lccc of 1. - // The ORed-together value's tccc is already the worst case. - if(oredValue>0xff) { - oredValue=0x100|(oredValue&0xff); - } - utrie2_set32ForLeadSurrogateCodeUnit(me->newFCDTrie, lead, oredValue, &errorCode); - } - } - utrie2_freeze(me->newFCDTrie, UTRIE2_16_VALUE_BITS, &errorCode); - if(U_SUCCESS(errorCode)) { - return me->newFCDTrie; - } - } - utrie2_close(me->newFCDTrie); - return NULL; -} +// Note: normalizer2impl.cpp r30982 (2011-nov-27) +// still had getFCDTrie() which built and cached an FCD trie. +// That provided faster access to FCD data than getFCD16FromNormData() +// but required synchronization and consumed some 10kB of heap memory +// in any process that uses FCD (e.g., via collation). +// tccc180[] and smallFCD[] are intended to help with any loss of performance, +// at least for Latin & CJK. -void Normalizer2Impl::setFCD16FromNorm16(UChar32 start, UChar32 end, uint16_t norm16, - UTrie2 *newFCDTrie, UErrorCode &errorCode) const { +// Gets the FCD value from the regular normalization data. +uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const { // Only loops for 1:1 algorithmic mappings. for(;;) { - if(norm16>=MIN_NORMAL_MAYBE_YES) { - norm16&=0xff; - norm16|=norm16<<8; - } else if(norm16<=minYesNo || minMaybeYes<=norm16) { + uint16_t norm16=getNorm16(c); + if(norm16<=minYesNo) { // no decomposition or Hangul syllable, all zeros - break; - } else if(limitNoNo<=norm16) { - int32_t delta=norm16-(minMaybeYes-MAX_DELTA-1); - if(start==end) { - start+=delta; - norm16=getNorm16(start); - } else { - // the same delta leads from different original characters to different mappings - do { - UChar32 c=start+delta; - setFCD16FromNorm16(c, c, getNorm16(c), newFCDTrie, errorCode); - } while(++start<=end); - break; - } + return 0; + } else if(norm16>=MIN_NORMAL_MAYBE_YES) { + // combining mark + norm16&=0xff; + return norm16|(norm16<<8); + } else if(norm16>=minMaybeYes) { + return 0; + } else if(isDecompNoAlgorithmic(norm16)) { + c=mapAlgorithmic(c, norm16); } else { // c decomposes, get everything from the variable-length extra data const uint16_t *mapping=getMapping(norm16); @@ -1480,27 +1598,18 @@ void Normalizer2Impl::setFCD16FromNorm16(UChar32 start, UChar32 end, uint16_t no // A character that is deleted (maps to an empty string) must // get the worst-case lccc and tccc values because arbitrary // characters on both sides will become adjacent. - norm16=0x1ff; + return 0x1ff; } else { + norm16=firstUnit>>8; // tccc if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { - norm16=mapping[1]&0xff00; // lccc - } else { - norm16=0; + norm16|=*(mapping-1)&0xff00; // lccc } - norm16|=firstUnit>>8; // tccc + return norm16; } } - utrie2_setRange32(newFCDTrie, start, end, norm16, TRUE, &errorCode); - break; } } -const UTrie2 *Normalizer2Impl::getFCDTrie(UErrorCode &errorCode) const { - // Logically const: Synchronized instantiation. - Normalizer2Impl *me=const_cast(this); - return FCDTrieSingleton(me->fcdTrieSingleton, *me, errorCode).getInstance(errorCode); -} - // Dual functionality: // buffer!=NULL: normalize // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes @@ -1521,7 +1630,7 @@ Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit, prevBoundary=src; // We know that the previous character's lccc==0. // Fetching the fcd16 value was deferred for this below-U+0300 code point. - prevFCD16=getFCD16FromSingleLead(*(src-1)); + prevFCD16=getFCD16(*(src-1)); if(prevFCD16>1) { --prevBoundary; } @@ -1535,8 +1644,6 @@ Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit, // The exception is the call to decomposeShort() which uses the buffer // in the normal way. - const UTrie2 *trie=fcdTrie(); - const UChar *prevSrc; UChar32 c=0; uint16_t fcd16=0; @@ -1547,24 +1654,24 @@ Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit, if((c=*src)1) { --prevBoundary; } @@ -1594,7 +1702,7 @@ Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit, --p; // Need to fetch the previous character's FCD value because // prevFCD16 was just for the trail surrogate code point. - prevFCD16=getFCD16FromSurrogatePair(p[0], p[1]); + prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1])); // Still known to have lccc==0 because its lead surrogate unit had lccc==0. } if(prevFCD16>1) { @@ -1650,6 +1758,7 @@ Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit, void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit, UBool doMakeFCD, + UnicodeString &safeMiddle, ReorderingBuffer &buffer, UErrorCode &errorCode) const { if(!buffer.isEmpty()) { @@ -1657,9 +1766,10 @@ void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit, if(src!=firstBoundaryInSrc) { const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(), buffer.getLimit()); - UnicodeString middle(lastBoundaryInDest, - (int32_t)(buffer.getLimit()-lastBoundaryInDest)); - buffer.removeSuffix((int32_t)(buffer.getLimit()-lastBoundaryInDest)); + int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest); + UnicodeString middle(lastBoundaryInDest, destSuffixLength); + buffer.removeSuffix(destSuffixLength); + safeMiddle=middle; middle.append(src, (int32_t)(firstBoundaryInSrc-src)); const UChar *middleStart=middle.getBuffer(); makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode); @@ -1672,33 +1782,33 @@ void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit, if(doMakeFCD) { makeFCD(src, limit, &buffer, errorCode); } else { + if(limit==NULL) { // appendZeroCC() needs limit!=NULL + limit=u_strchr(src, 0); + } buffer.appendZeroCC(src, limit, errorCode); } } const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const { - BackwardUTrie2StringIterator iter(fcdTrie(), start, p); - uint16_t fcd16; - do { - fcd16=iter.previous16(); - } while(fcd16>0xff); - return iter.codePointStart; + while(start

0xff) {} + return p; } const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const { - ForwardUTrie2StringIterator iter(fcdTrie(), p, limit); - uint16_t fcd16; - do { - fcd16=iter.next16(); - } while(fcd16>0xff); - return iter.codePointStart; + while(prangeHandler(start, end, value); + UErrorCode errorCode = U_ZERO_ERROR; + if (value != 0) { + Normalizer2Impl *impl = (Normalizer2Impl *)context; + impl->makeCanonIterDataFromNorm16( + start, end, (uint16_t)value, *impl->fCanonIterData, errorCode); + } + return U_SUCCESS(errorCode); } -U_CDECL_END -void *CanonIterDataSingleton::createInstance(const void *context, UErrorCode &errorCode) { - CanonIterDataSingleton *me=(CanonIterDataSingleton *)context; - me->newData=new CanonIterData(errorCode); - if(me->newData==NULL) { + +// UInitOnce instantiation function for CanonIterData + +static void U_CALLCONV +initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) { + U_ASSERT(impl->fCanonIterData == NULL); + impl->fCanonIterData = new CanonIterData(errorCode); + if (impl->fCanonIterData == NULL) { errorCode=U_MEMORY_ALLOCATION_ERROR; - return NULL; } - if(U_SUCCESS(errorCode)) { - utrie2_enum(me->impl.getNormTrie(), NULL, enumCIDRangeHandler, me); - utrie2_freeze(me->newData->trie, UTRIE2_32_VALUE_BITS, &errorCode); - if(U_SUCCESS(errorCode)) { - return me->newData; - } + if (U_SUCCESS(errorCode)) { + utrie2_enum(impl->getNormTrie(), NULL, enumCIDRangeHandler, impl); + utrie2_freeze(impl->fCanonIterData->trie, UTRIE2_32_VALUE_BITS, &errorCode); + } + if (U_FAILURE(errorCode)) { + delete impl->fCanonIterData; + impl->fCanonIterData = NULL; } - delete me->newData; - return NULL; } +U_CDECL_END + void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16, CanonIterData &newData, UErrorCode &errorCode) const { @@ -1821,16 +1915,16 @@ void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, ui if(minYesNo<=norm16_2 && norm16_2(this); - CanonIterDataSingleton(me->canonIterDataSingleton, *me, errorCode).getInstance(errorCode); + umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode); return U_SUCCESS(errorCode); } int32_t Normalizer2Impl::getCanonValue(UChar32 c) const { - return (int32_t)utrie2_get32(((CanonIterData *)canonIterDataSingleton.fInstance)->trie, c); + return (int32_t)utrie2_get32(fCanonIterData->trie, c); } const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const { - return *(const UnicodeSet *)( - ((CanonIterData *)canonIterDataSingleton.fInstance)->canonStartSets[n]); + return *(const UnicodeSet *)fCanonIterData->canonStartSets[n]; } UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const { @@ -1939,7 +2032,7 @@ unorm2_swap(const UDataSwapper *ds, pInfo->dataFormat[1]==0x72 && pInfo->dataFormat[2]==0x6d && pInfo->dataFormat[3]==0x32 && - pInfo->formatVersion[0]==1 + (pInfo->formatVersion[0]==1 || pInfo->formatVersion[0]==2) )) { udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n", pInfo->dataFormat[0], pInfo->dataFormat[1], @@ -1998,10 +2091,14 @@ unorm2_swap(const UDataSwapper *ds, offset=nextOffset; /* swap the uint16_t extraData[] */ - nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET+1]; + nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]; ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode); offset=nextOffset; + /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */ + nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1]; + offset=nextOffset; + U_ASSERT(offset==size); }