]> git.saurik.com Git - apple/icu.git/blobdiff - icuSources/common/normalizer2impl.cpp
ICU-57132.0.1.tar.gz
[apple/icu.git] / icuSources / common / normalizer2impl.cpp
index 52459be71904892f35c1cb9d142facc84a915d3b..ec4809c4630995c13a4bedbba3ca49b60be755aa 100644 (file)
@@ -1,7 +1,7 @@
 /*
 *******************************************************************************
 *
-*   Copyright (C) 2009-2010, International Business Machines
+*   Copyright (C) 2009-2014, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
 #include "unicode/normalizer2.h"
 #include "unicode/udata.h"
 #include "unicode/ustring.h"
+#include "unicode/utf16.h"
 #include "cmemory.h"
 #include "mutex.h"
 #include "normalizer2impl.h"
+#include "putilimp.h"
 #include "uassert.h"
-#include "uhash.h"
 #include "uset_imp.h"
 #include "utrie2.h"
 #include "uvector.h"
@@ -252,71 +253,44 @@ struct CanonIterData : public UMemory {
 };
 
 Normalizer2Impl::~Normalizer2Impl() {
-    udata_close(memory);
-    utrie2_close(normTrie);
-    UTrie2Singleton(fcdTrieSingleton).deleteInstance();
-    delete (CanonIterData *)canonIterDataSingleton.fInstance;
-}
-
-UBool U_CALLCONV
-Normalizer2Impl::isAcceptable(void *context,
-                              const char * /* type */, const char * /*name*/,
-                              const UDataInfo *pInfo) {
-    if(
-        pInfo->size>=20 &&
-        pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
-        pInfo->charsetFamily==U_CHARSET_FAMILY &&
-        pInfo->dataFormat[0]==0x4e &&    /* dataFormat="Nrm2" */
-        pInfo->dataFormat[1]==0x72 &&
-        pInfo->dataFormat[2]==0x6d &&
-        pInfo->dataFormat[3]==0x32 &&
-        pInfo->formatVersion[0]==1
-    ) {
-        Normalizer2Impl *me=(Normalizer2Impl *)context;
-        uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4);
-        return TRUE;
-    } else {
-        return FALSE;
-    }
+    delete fCanonIterData;
 }
 
 void
-Normalizer2Impl::load(const char *packageName, const char *name, UErrorCode &errorCode) {
-    if(U_FAILURE(errorCode)) {
-        return;
-    }
-    memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode);
-    if(U_FAILURE(errorCode)) {
-        return;
-    }
-    const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory);
-    const int32_t *inIndexes=(const int32_t *)inBytes;
-    int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4;
-    if(indexesLength<=IX_MIN_MAYBE_YES) {
-        errorCode=U_INVALID_FORMAT_ERROR;  // Not enough indexes.
-        return;
-    }
-
+Normalizer2Impl::init(const int32_t *inIndexes, const UTrie2 *inTrie,
+                      const uint16_t *inExtraData, const uint8_t *inSmallFCD) {
     minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
     minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
 
     minYesNo=inIndexes[IX_MIN_YES_NO];
+    minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
     minNoNo=inIndexes[IX_MIN_NO_NO];
     limitNoNo=inIndexes[IX_LIMIT_NO_NO];
     minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
 
-    int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET];
-    int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
-    normTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
-                                       inBytes+offset, nextOffset-offset, NULL,
-                                       &errorCode);
-    if(U_FAILURE(errorCode)) {
-        return;
-    }
+    normTrie=inTrie;
 
-    offset=nextOffset;
-    maybeYesCompositions=(const uint16_t *)(inBytes+offset);
+    maybeYesCompositions=inExtraData;
     extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes);
+
+    smallFCD=inSmallFCD;
+
+    // Build tccc180[].
+    // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.
+    uint8_t bits=0;
+    for(UChar c=0; c<0x180; bits>>=1) {
+        if((c&0xff)==0) {
+            bits=smallFCD[c>>8];  // one byte per 0x100 code points
+        }
+        if(bits&1) {
+            for(int i=0; i<0x20; ++i, ++c) {
+                tccc180[c]=(uint8_t)getFCD16FromNormData(c);
+            }
+        } else {
+            uprv_memset(tccc180+c, 0, 0x20);
+            c+=0x20;
+        }
+    }
 }
 
 uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const {
@@ -334,8 +308,70 @@ uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, co
     }
 }
 
+namespace {
+
+class LcccContext {
+public:
+    LcccContext(const Normalizer2Impl &ni, UnicodeSet &s) : impl(ni), set(s) {}
+
+    void handleRange(UChar32 start, UChar32 end, uint16_t norm16) {
+        if(impl.isAlgorithmicNoNo(norm16)) {
+            // Range of code points with same-norm16-value algorithmic decompositions.
+            // They might have different non-zero FCD16 values.
+            do {
+                uint16_t fcd16=impl.getFCD16(start);
+                if(fcd16>0xff) { set.add(start); }
+            } while(++start<=end);
+        } else {
+            uint16_t fcd16=impl.getFCD16(start);
+            if(fcd16>0xff) { set.add(start, end); }
+        }
+    }
+
+private:
+    const Normalizer2Impl &impl;
+    UnicodeSet &set;
+};
+
+struct PropertyStartsContext {
+    PropertyStartsContext(const Normalizer2Impl &ni, const USetAdder *adder)
+            : impl(ni), sa(adder) {}
+
+    const Normalizer2Impl &impl;
+    const USetAdder *sa;
+};
+
+}  // namespace
+
 U_CDECL_BEGIN
 
+static UBool U_CALLCONV
+enumLcccRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
+    ((LcccContext *)context)->handleRange(start, end, (uint16_t)value);
+    return TRUE;
+}
+
+static UBool U_CALLCONV
+enumNorm16PropertyStartsRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
+    /* add the start code point to the USet */
+    const PropertyStartsContext *ctx=(const PropertyStartsContext *)context;
+    const USetAdder *sa=ctx->sa;
+    sa->add(sa->set, start);
+    if(start!=end && ctx->impl.isAlgorithmicNoNo((uint16_t)value)) {
+        // Range of code points with same-norm16-value algorithmic decompositions.
+        // They might have different non-zero FCD16 values.
+        uint16_t prevFCD16=ctx->impl.getFCD16(start);
+        while(++start<=end) {
+            uint16_t fcd16=ctx->impl.getFCD16(start);
+            if(fcd16!=prevFCD16) {
+                sa->add(sa->set, start);
+                prevFCD16=fcd16;
+            }
+        }
+    }
+    return TRUE;
+}
+
 static UBool U_CALLCONV
 enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
     /* add the start code point to the USet */
@@ -351,10 +387,18 @@ segmentStarterMapper(const void * /*context*/, uint32_t value) {
 
 U_CDECL_END
 
+void
+Normalizer2Impl::addLcccChars(UnicodeSet &set) const {
+    /* add the start code point of each same-value range of each trie */
+    LcccContext context(*this, set);
+    utrie2_enum(normTrie, NULL, enumLcccRange, &context);
+}
+
 void
 Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {
     /* add the start code point of each same-value range of each trie */
-    utrie2_enum(normTrie, NULL, enumPropertyStartsRange, sa);
+    PropertyStartsContext context(*this, sa);
+    utrie2_enum(normTrie, NULL, enumNorm16PropertyStartsRange, &context);
 
     /* add Hangul LV syllables and LV+1 because of skippables */
     for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
@@ -369,8 +413,7 @@ Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &err
     /* add the start code point of each same-value range of the canonical iterator data trie */
     if(ensureCanonIterData(errorCode)) {
         // currently only used for the SEGMENT_STARTER property
-        utrie2_enum(((CanonIterData *)canonIterDataSingleton.fInstance)->trie,
-                    segmentStarterMapper, enumPropertyStartsRange, sa);
+        utrie2_enum(fCanonIterData->trie, segmentStarterMapper, enumPropertyStartsRange, sa);
     }
 }
 
@@ -397,6 +440,38 @@ Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,
     return src;
 }
 
+UnicodeString &
+Normalizer2Impl::decompose(const UnicodeString &src, UnicodeString &dest,
+                           UErrorCode &errorCode) const {
+    if(U_FAILURE(errorCode)) {
+        dest.setToBogus();
+        return dest;
+    }
+    const UChar *sArray=src.getBuffer();
+    if(&dest==&src || sArray==NULL) {
+        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        dest.setToBogus();
+        return dest;
+    }
+    decompose(sArray, sArray+src.length(), dest, src.length(), errorCode);
+    return dest;
+}
+
+void
+Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
+                           UnicodeString &dest,
+                           int32_t destLengthEstimate,
+                           UErrorCode &errorCode) const {
+    if(destLengthEstimate<0 && limit!=NULL) {
+        destLengthEstimate=(int32_t)(limit-src);
+    }
+    dest.remove();
+    ReorderingBuffer buffer(*this, dest);
+    if(buffer.init(destLengthEstimate, errorCode)) {
+        decompose(src, limit, &buffer, errorCode);
+    }
+}
+
 // Dual functionality:
 // buffer!=NULL: normalize
 // buffer==NULL: isNormalized/spanQuickCheckYes
@@ -524,16 +599,16 @@ UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
         } else {
             // c decomposes, get everything from the variable-length extra data
             const uint16_t *mapping=getMapping(norm16);
-            uint16_t firstUnit=*mapping++;
+            uint16_t firstUnit=*mapping;
             int32_t length=firstUnit&MAPPING_LENGTH_MASK;
             uint8_t leadCC, trailCC;
             trailCC=(uint8_t)(firstUnit>>8);
             if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
-                leadCC=(uint8_t)(*mapping++>>8);
+                leadCC=(uint8_t)(*(mapping-1)>>8);
             } else {
                 leadCC=0;
             }
-            return buffer.append((const UChar *)mapping, length, leadCC, trailCC, errorCode);
+            return buffer.append((const UChar *)mapping+1, length, leadCC, trailCC, errorCode);
         }
     }
 }
@@ -558,20 +633,67 @@ Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) c
         } else {
             // c decomposes, get everything from the variable-length extra data
             const uint16_t *mapping=getMapping(norm16);
-            uint16_t firstUnit=*mapping++;
-            length=firstUnit&MAPPING_LENGTH_MASK;
-            if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
-                ++mapping;
+            length=*mapping&MAPPING_LENGTH_MASK;
+            return (const UChar *)mapping+1;
+        }
+    }
+}
+
+// The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1
+// so that a raw mapping fits that consists of one unit ("rm0")
+// plus all but the first two code units of the normal mapping.
+// The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.
+const UChar *
+Normalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const {
+    // We do not loop in this method because an algorithmic mapping itself
+    // becomes a final result rather than having to be decomposed recursively.
+    uint16_t norm16;
+    if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
+        // c does not decompose
+        return NULL;
+    } else if(isHangul(norm16)) {
+        // Hangul syllable: decompose algorithmically
+        Hangul::getRawDecomposition(c, buffer);
+        length=2;
+        return buffer;
+    } else if(isDecompNoAlgorithmic(norm16)) {
+        c=mapAlgorithmic(c, norm16);
+        length=0;
+        U16_APPEND_UNSAFE(buffer, length, c);
+        return buffer;
+    } else {
+        // c decomposes, get everything from the variable-length extra data
+        const uint16_t *mapping=getMapping(norm16);
+        uint16_t firstUnit=*mapping;
+        int32_t mLength=firstUnit&MAPPING_LENGTH_MASK;  // length of normal mapping
+        if(firstUnit&MAPPING_HAS_RAW_MAPPING) {
+            // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
+            // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
+            const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1;
+            uint16_t rm0=*rawMapping;
+            if(rm0<=MAPPING_LENGTH_MASK) {
+                length=rm0;
+                return (const UChar *)rawMapping-rm0;
+            } else {
+                // Copy the normal mapping and replace its first two code units with rm0.
+                buffer[0]=(UChar)rm0;
+                u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2);
+                length=mLength-1;
+                return buffer;
             }
-            return (const UChar *)mapping;
+        } else {
+            length=mLength;
+            return (const UChar *)mapping+1;
         }
     }
 }
 
 void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit,
                                          UBool doDecompose,
+                                         UnicodeString &safeMiddle,
                                          ReorderingBuffer &buffer,
                                          UErrorCode &errorCode) const {
+    buffer.copyReorderableSuffixTo(safeMiddle);
     if(doDecompose) {
         decompose(src, limit, &buffer, errorCode);
         return;
@@ -584,8 +706,13 @@ void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit,
         prevCC=cc;
         cc=getCC(iter.next16());
     };
-    buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode) &&
+    if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
+        limit=u_strchr(iter.codePointStart, 0);
+    }
+
+    if (buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode)) {
         buffer.appendZeroCC(iter.codePointStart, limit, errorCode);
+    }
 }
 
 // Note: hasDecompBoundary() could be implemented as aliases to
@@ -606,7 +733,7 @@ UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const {
         } else {
             // c decomposes, get everything from the variable-length extra data
             const uint16_t *mapping=getMapping(norm16);
-            uint16_t firstUnit=*mapping++;
+            uint16_t firstUnit=*mapping;
             if((firstUnit&MAPPING_LENGTH_MASK)==0) {
                 return FALSE;
             }
@@ -622,7 +749,7 @@ UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const {
                 // if(trailCC==1) test leadCC==0, same as checking for before-boundary
             }
             // TRUE if leadCC==0 (hasFCDBoundaryBefore())
-            return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*mapping&0xff00)==0;
+            return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
         }
     }
 }
@@ -895,6 +1022,55 @@ void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStart
     buffer.setReorderingLimit(limit);
 }
 
+UChar32
+Normalizer2Impl::composePair(UChar32 a, UChar32 b) const {
+    uint16_t norm16=getNorm16(a);  // maps an out-of-range 'a' to inert norm16=0
+    const uint16_t *list;
+    if(isInert(norm16)) {
+        return U_SENTINEL;
+    } else if(norm16<minYesNoMappingsOnly) {
+        if(isJamoL(norm16)) {
+            b-=Hangul::JAMO_V_BASE;
+            if(0<=b && b<Hangul::JAMO_V_COUNT) {
+                return
+                    (Hangul::HANGUL_BASE+
+                     ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)*
+                     Hangul::JAMO_T_COUNT);
+            } else {
+                return U_SENTINEL;
+            }
+        } else if(isHangul(norm16)) {
+            b-=Hangul::JAMO_T_BASE;
+            if(Hangul::isHangulWithoutJamoT(a) && 0<b && b<Hangul::JAMO_T_COUNT) {  // not b==0!
+                return a+b;
+            } else {
+                return U_SENTINEL;
+            }
+        } else {
+            // 'a' has a compositions list in extraData
+            list=extraData+norm16;
+            if(norm16>minYesNo) {  // composite 'a' has both mapping & compositions list
+                list+=  // mapping pointer
+                    1+  // +1 to skip the first unit with the mapping lenth
+                    (*list&MAPPING_LENGTH_MASK);  // + mapping length
+            }
+        }
+    } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
+        return U_SENTINEL;
+    } else {
+        list=maybeYesCompositions+norm16-minMaybeYes;
+    }
+    if(b<0 || 0x10ffff<b) {  // combine(list, b) requires a valid code point b
+        return U_SENTINEL;
+    }
+#if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
+    return combine(list, b)>>1;
+#else
+    int32_t compositeAndFwd=combine(list, b);
+    return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL;
+#endif
+}
+
 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
 // doCompose: normalize
 // !doCompose: isNormalized (buffer must be empty and initialized)
@@ -1271,6 +1447,7 @@ Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
 void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,
                                        UBool doCompose,
                                        UBool onlyContiguous,
+                                       UnicodeString &safeMiddle,
                                        ReorderingBuffer &buffer,
                                        UErrorCode &errorCode) const {
     if(!buffer.isEmpty()) {
@@ -1278,9 +1455,10 @@ void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,
         if(src!=firstStarterInSrc) {
             const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
                                                                     buffer.getLimit());
-            UnicodeString middle(lastStarterInDest,
-                                 (int32_t)(buffer.getLimit()-lastStarterInDest));
-            buffer.removeSuffix((int32_t)(buffer.getLimit()-lastStarterInDest));
+            int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest);
+            UnicodeString middle(lastStarterInDest, destSuffixLength);
+            buffer.removeSuffix(destSuffixLength);
+            safeMiddle=middle;
             middle.append(src, (int32_t)(firstStarterInSrc-src));
             const UChar *middleStart=middle.getBuffer();
             compose(middleStart, middleStart+middle.length(), onlyContiguous,
@@ -1294,6 +1472,9 @@ void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,
     if(doCompose) {
         compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
     } else {
+        if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
+            limit=u_strchr(src, 0);
+        }
         buffer.appendZeroCC(src, limit, errorCode);
     }
 }
@@ -1317,14 +1498,14 @@ UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
         } else {
             // c decomposes, get everything from the variable-length extra data
             const uint16_t *mapping=getMapping(norm16);
-            uint16_t firstUnit=*mapping++;
+            uint16_t firstUnit=*mapping;
             if((firstUnit&MAPPING_LENGTH_MASK)==0) {
                 return FALSE;
             }
-            if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*mapping++&0xff00)) {
+            if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*(mapping-1)&0xff00)) {
                 return FALSE;  // non-zero leadCC
             }
-            int32_t i=0;
+            int32_t i=1;  // skip over the firstUnit
             UChar32 c;
             U16_NEXT_UNSAFE(mapping, i, c);
             return isCompYesAndZeroCC(getNorm16(c));
@@ -1338,7 +1519,8 @@ UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBo
         if(isInert(norm16)) {
             return TRUE;
         } else if(norm16<=minYesNo) {
-            // Hangul LVT (==minYesNo) has a boundary after it.
+            // Hangul: norm16==minYesNo
+            // Hangul LVT has a boundary after it.
             // Hangul LV and non-inert yesYes characters combine forward.
             return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c);
         } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {
@@ -1352,12 +1534,13 @@ UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBo
             const uint16_t *mapping=getMapping(norm16);
             uint16_t firstUnit=*mapping;
             // TRUE if
-            //      c is not deleted, and
-            //      it and its decomposition do not combine forward, and it has a starter, and
-            //      if FCC then trailCC<=1
+            //   not MAPPING_NO_COMP_BOUNDARY_AFTER
+            //     (which is set if
+            //       c is not deleted, and
+            //       it and its decomposition do not combine forward, and it has a starter)
+            //   and if FCC then trailCC<=1
             return
-                (firstUnit&MAPPING_LENGTH_MASK)!=0 &&
-                (firstUnit&(MAPPING_PLUS_COMPOSITION_LIST|MAPPING_NO_COMP_BOUNDARY_AFTER))==0 &&
+                (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 &&
                 (!onlyContiguous || firstUnit<=0x1ff);
         }
     }
@@ -1383,95 +1566,30 @@ const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *
     return iter.codePointStart;
 }
 
-class FCDTrieSingleton : public UTrie2Singleton {
-public:
-    FCDTrieSingleton(SimpleSingleton &s, Normalizer2Impl &ni, UErrorCode &ec) :
-        UTrie2Singleton(s), impl(ni), errorCode(ec) {}
-    UTrie2 *getInstance(UErrorCode &errorCode) {
-        return UTrie2Singleton::getInstance(createInstance, this, errorCode);
-    }
-    static void *createInstance(const void *context, UErrorCode &errorCode);
-    UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
-        if(value!=0) {
-            impl.setFCD16FromNorm16(start, end, (uint16_t)value, newFCDTrie, errorCode);
-        }
-        return U_SUCCESS(errorCode);
-    }
-
-    Normalizer2Impl &impl;
-    UTrie2 *newFCDTrie;
-    UErrorCode &errorCode;
-};
-
-U_CDECL_BEGIN
-
-// Set the FCD value for a range of same-norm16 characters.
-static UBool U_CALLCONV
-enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
-    return ((FCDTrieSingleton *)context)->rangeHandler(start, end, value);
-}
-
-// Collect (OR together) the FCD values for a range of supplementary characters,
-// for their lead surrogate code unit.
-static UBool U_CALLCONV
-enumRangeOrValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) {
-    *((uint32_t *)context)|=value;
-    return TRUE;
-}
-
-U_CDECL_END
-
-void *FCDTrieSingleton::createInstance(const void *context, UErrorCode &errorCode) {
-    FCDTrieSingleton *me=(FCDTrieSingleton *)context;
-    me->newFCDTrie=utrie2_open(0, 0, &errorCode);
-    if(U_SUCCESS(errorCode)) {
-        utrie2_enum(me->impl.getNormTrie(), NULL, enumRangeHandler, me);
-        for(UChar lead=0xd800; lead<0xdc00; ++lead) {
-            uint32_t oredValue=utrie2_get32(me->newFCDTrie, lead);
-            utrie2_enumForLeadSurrogate(me->newFCDTrie, lead, NULL, enumRangeOrValue, &oredValue);
-            if(oredValue!=0) {
-                // Set a "bad" value for makeFCD() to break the quick check loop
-                // and look up the value for the supplementary code point.
-                // If there is any lccc, then set the worst-case lccc of 1.
-                // The ORed-together value's tccc is already the worst case.
-                if(oredValue>0xff) {
-                    oredValue=0x100|(oredValue&0xff);
-                }
-                utrie2_set32ForLeadSurrogateCodeUnit(me->newFCDTrie, lead, oredValue, &errorCode);
-            }
-        }
-        utrie2_freeze(me->newFCDTrie, UTRIE2_16_VALUE_BITS, &errorCode);
-        if(U_SUCCESS(errorCode)) {
-            return me->newFCDTrie;
-        }
-    }
-    utrie2_close(me->newFCDTrie);
-    return NULL;
-}
+// Note: normalizer2impl.cpp r30982 (2011-nov-27)
+// still had getFCDTrie() which built and cached an FCD trie.
+// That provided faster access to FCD data than getFCD16FromNormData()
+// but required synchronization and consumed some 10kB of heap memory
+// in any process that uses FCD (e.g., via collation).
+// tccc180[] and smallFCD[] are intended to help with any loss of performance,
+// at least for Latin & CJK.
 
-void Normalizer2Impl::setFCD16FromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
-                                         UTrie2 *newFCDTrie, UErrorCode &errorCode) const {
+// Gets the FCD value from the regular normalization data.
+uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const {
     // Only loops for 1:1 algorithmic mappings.
     for(;;) {
-        if(norm16>=MIN_NORMAL_MAYBE_YES) {
-            norm16&=0xff;
-            norm16|=norm16<<8;
-        } else if(norm16<=minYesNo || minMaybeYes<=norm16) {
+        uint16_t norm16=getNorm16(c);
+        if(norm16<=minYesNo) {
             // no decomposition or Hangul syllable, all zeros
-            break;
-        } else if(limitNoNo<=norm16) {
-            int32_t delta=norm16-(minMaybeYes-MAX_DELTA-1);
-            if(start==end) {
-                start+=delta;
-                norm16=getNorm16(start);
-            } else {
-                // the same delta leads from different original characters to different mappings
-                do {
-                    UChar32 c=start+delta;
-                    setFCD16FromNorm16(c, c, getNorm16(c), newFCDTrie, errorCode);
-                } while(++start<=end);
-                break;
-            }
+            return 0;
+        } else if(norm16>=MIN_NORMAL_MAYBE_YES) {
+            // combining mark
+            norm16&=0xff;
+            return norm16|(norm16<<8);
+        } else if(norm16>=minMaybeYes) {
+            return 0;
+        } else if(isDecompNoAlgorithmic(norm16)) {
+            c=mapAlgorithmic(c, norm16);
         } else {
             // c decomposes, get everything from the variable-length extra data
             const uint16_t *mapping=getMapping(norm16);
@@ -1480,27 +1598,18 @@ void Normalizer2Impl::setFCD16FromNorm16(UChar32 start, UChar32 end, uint16_t no
                 // A character that is deleted (maps to an empty string) must
                 // get the worst-case lccc and tccc values because arbitrary
                 // characters on both sides will become adjacent.
-                norm16=0x1ff;
+                return 0x1ff;
             } else {
+                norm16=firstUnit>>8;  // tccc
                 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
-                    norm16=mapping[1]&0xff00;  // lccc
-                } else {
-                    norm16=0;
+                    norm16|=*(mapping-1)&0xff00;  // lccc
                 }
-                norm16|=firstUnit>>8;  // tccc
+                return norm16;
             }
         }
-        utrie2_setRange32(newFCDTrie, start, end, norm16, TRUE, &errorCode);
-        break;
     }
 }
 
-const UTrie2 *Normalizer2Impl::getFCDTrie(UErrorCode &errorCode) const {
-    // Logically const: Synchronized instantiation.
-    Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
-    return FCDTrieSingleton(me->fcdTrieSingleton, *me, errorCode).getInstance(errorCode);
-}
-
 // Dual functionality:
 // buffer!=NULL: normalize
 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
@@ -1521,7 +1630,7 @@ Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
             prevBoundary=src;
             // We know that the previous character's lccc==0.
             // Fetching the fcd16 value was deferred for this below-U+0300 code point.
-            prevFCD16=getFCD16FromSingleLead(*(src-1));
+            prevFCD16=getFCD16(*(src-1));
             if(prevFCD16>1) {
                 --prevBoundary;
             }
@@ -1535,8 +1644,6 @@ Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
     // The exception is the call to decomposeShort() which uses the buffer
     // in the normal way.
 
-    const UTrie2 *trie=fcdTrie();
-
     const UChar *prevSrc;
     UChar32 c=0;
     uint16_t fcd16=0;
@@ -1547,24 +1654,24 @@ Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
             if((c=*src)<MIN_CCC_LCCC_CP) {
                 prevFCD16=~c;
                 ++src;
-            } else if((fcd16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, c))<=0xff) {
-                prevFCD16=fcd16;
+            } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
+                prevFCD16=0;
                 ++src;
-            } else if(!U16_IS_SURROGATE(c)) {
-                break;
             } else {
-                UChar c2;
-                if(U16_IS_SURROGATE_LEAD(c)) {
-                    if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
-                        c=U16_GET_SUPPLEMENTARY(c, c2);
-                    }
-                } else /* trail surrogate */ {
-                    if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
-                        --src;
-                        c=U16_GET_SUPPLEMENTARY(c2, c);
+                if(U16_IS_SURROGATE(c)) {
+                    UChar c2;
+                    if(U16_IS_SURROGATE_LEAD(c)) {
+                        if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
+                            c=U16_GET_SUPPLEMENTARY(c, c2);
+                        }
+                    } else /* trail surrogate */ {
+                        if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
+                            --src;
+                            c=U16_GET_SUPPLEMENTARY(c2, c);
+                        }
                     }
                 }
-                if((fcd16=getFCD16(c))<=0xff) {
+                if((fcd16=getFCD16FromNormData(c))<=0xff) {
                     prevFCD16=fcd16;
                     src+=U16_LENGTH(c);
                 } else {
@@ -1584,7 +1691,8 @@ Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
             // We know that the previous character's lccc==0.
             if(prevFCD16<0) {
                 // Fetching the fcd16 value was deferred for this below-U+0300 code point.
-                prevFCD16=getFCD16FromSingleLead((UChar)~prevFCD16);
+                UChar32 prev=~prevFCD16;
+                prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);
                 if(prevFCD16>1) {
                     --prevBoundary;
                 }
@@ -1594,7 +1702,7 @@ Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
                     --p;
                     // Need to fetch the previous character's FCD value because
                     // prevFCD16 was just for the trail surrogate code point.
-                    prevFCD16=getFCD16FromSurrogatePair(p[0], p[1]);
+                    prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1]));
                     // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
                 }
                 if(prevFCD16>1) {
@@ -1650,6 +1758,7 @@ Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
 
 void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,
                                        UBool doMakeFCD,
+                                       UnicodeString &safeMiddle,
                                        ReorderingBuffer &buffer,
                                        UErrorCode &errorCode) const {
     if(!buffer.isEmpty()) {
@@ -1657,9 +1766,10 @@ void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,
         if(src!=firstBoundaryInSrc) {
             const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
                                                                     buffer.getLimit());
-            UnicodeString middle(lastBoundaryInDest,
-                                 (int32_t)(buffer.getLimit()-lastBoundaryInDest));
-            buffer.removeSuffix((int32_t)(buffer.getLimit()-lastBoundaryInDest));
+            int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest);
+            UnicodeString middle(lastBoundaryInDest, destSuffixLength);
+            buffer.removeSuffix(destSuffixLength);
+            safeMiddle=middle;
             middle.append(src, (int32_t)(firstBoundaryInSrc-src));
             const UChar *middleStart=middle.getBuffer();
             makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
@@ -1672,33 +1782,33 @@ void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,
     if(doMakeFCD) {
         makeFCD(src, limit, &buffer, errorCode);
     } else {
+        if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
+            limit=u_strchr(src, 0);
+        }
         buffer.appendZeroCC(src, limit, errorCode);
     }
 }
 
 const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const {
-    BackwardUTrie2StringIterator iter(fcdTrie(), start, p);
-    uint16_t fcd16;
-    do {
-        fcd16=iter.previous16();
-    } while(fcd16>0xff);
-    return iter.codePointStart;
+    while(start<p && previousFCD16(start, p)>0xff) {}
+    return p;
 }
 
 const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const {
-    ForwardUTrie2StringIterator iter(fcdTrie(), p, limit);
-    uint16_t fcd16;
-    do {
-        fcd16=iter.next16();
-    } while(fcd16>0xff);
-    return iter.codePointStart;
+    while(p<limit) {
+        const UChar *codePointStart=p;
+        if(nextFCD16(p, limit)<=0xff) {
+            return codePointStart;
+        }
+    }
+    return p;
 }
 
 // CanonicalIterator data -------------------------------------------------- ***
 
 CanonIterData::CanonIterData(UErrorCode &errorCode) :
         trie(utrie2_open(0, 0, &errorCode)),
-        canonStartSets(uhash_deleteUObject, NULL, errorCode) {}
+        canonStartSets(uprv_deleteUObject, NULL, errorCode) {}
 
 CanonIterData::~CanonIterData() {
     utrie2_close(trie);
@@ -1733,60 +1843,44 @@ void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode
     }
 }
 
-class CanonIterDataSingleton {
-public:
-    CanonIterDataSingleton(SimpleSingleton &s, Normalizer2Impl &ni, UErrorCode &ec) :
-        singleton(s), impl(ni), errorCode(ec) {}
-    CanonIterData *getInstance(UErrorCode &errorCode) {
-        void *duplicate;
-        CanonIterData *instance=
-            (CanonIterData *)singleton.getInstance(createInstance, this, duplicate, errorCode);
-        delete (CanonIterData *)duplicate;
-        return instance;
-    }
-    static void *createInstance(const void *context, UErrorCode &errorCode);
-    UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
-        if(value!=0) {
-            impl.makeCanonIterDataFromNorm16(start, end, (uint16_t)value, *newData, errorCode);
-        }
-        return U_SUCCESS(errorCode);
-    }
-
-private:
-    SimpleSingleton &singleton;
-    Normalizer2Impl &impl;
-    CanonIterData *newData;
-    UErrorCode &errorCode;
-};
-
 U_CDECL_BEGIN
 
 // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
+//     context: the Normalizer2Impl
 static UBool U_CALLCONV
 enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
-    return ((CanonIterDataSingleton *)context)->rangeHandler(start, end, value);
+    UErrorCode errorCode = U_ZERO_ERROR;
+    if (value != 0) {
+        Normalizer2Impl *impl = (Normalizer2Impl *)context;
+        impl->makeCanonIterDataFromNorm16(
+            start, end, (uint16_t)value, *impl->fCanonIterData, errorCode);
+    }
+    return U_SUCCESS(errorCode);
 }
 
-U_CDECL_END
 
-void *CanonIterDataSingleton::createInstance(const void *context, UErrorCode &errorCode) {
-    CanonIterDataSingleton *me=(CanonIterDataSingleton *)context;
-    me->newData=new CanonIterData(errorCode);
-    if(me->newData==NULL) {
+
+// UInitOnce instantiation function for CanonIterData
+
+static void U_CALLCONV 
+initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) {
+    U_ASSERT(impl->fCanonIterData == NULL);
+    impl->fCanonIterData = new CanonIterData(errorCode);
+    if (impl->fCanonIterData == NULL) {
         errorCode=U_MEMORY_ALLOCATION_ERROR;
-        return NULL;
     }
-    if(U_SUCCESS(errorCode)) {
-        utrie2_enum(me->impl.getNormTrie(), NULL, enumCIDRangeHandler, me);
-        utrie2_freeze(me->newData->trie, UTRIE2_32_VALUE_BITS, &errorCode);
-        if(U_SUCCESS(errorCode)) {
-            return me->newData;
-        }
+    if (U_SUCCESS(errorCode)) {
+        utrie2_enum(impl->getNormTrie(), NULL, enumCIDRangeHandler, impl);
+        utrie2_freeze(impl->fCanonIterData->trie, UTRIE2_32_VALUE_BITS, &errorCode);
+    }
+    if (U_FAILURE(errorCode)) {
+        delete impl->fCanonIterData;
+        impl->fCanonIterData = NULL;
     }
-    delete me->newData;
-    return NULL;
 }
 
+U_CDECL_END
+
 void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
                                                   CanonIterData &newData,
                                                   UErrorCode &errorCode) const {
@@ -1821,16 +1915,16 @@ void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, ui
             if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
                 // c decomposes, get everything from the variable-length extra data
                 const uint16_t *mapping=getMapping(norm16_2);
-                uint16_t firstUnit=*mapping++;
+                uint16_t firstUnit=*mapping;
                 int32_t length=firstUnit&MAPPING_LENGTH_MASK;
                 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
-                    if(c==c2 && (*mapping&0xff)!=0) {
+                    if(c==c2 && (*(mapping-1)&0xff)!=0) {
                         newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0
                     }
-                    ++mapping;
                 }
                 // Skip empty mappings (no characters in the decomposition).
                 if(length!=0) {
+                    ++mapping;  // skip over the firstUnit
                     // add c to first code point's start set
                     int32_t i=0;
                     U16_NEXT_UNSAFE(mapping, i, c2);
@@ -1863,17 +1957,16 @@ void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, ui
 UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {
     // Logically const: Synchronized instantiation.
     Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
-    CanonIterDataSingleton(me->canonIterDataSingleton, *me, errorCode).getInstance(errorCode);
+    umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode);
     return U_SUCCESS(errorCode);
 }
 
 int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {
-    return (int32_t)utrie2_get32(((CanonIterData *)canonIterDataSingleton.fInstance)->trie, c);
+    return (int32_t)utrie2_get32(fCanonIterData->trie, c);
 }
 
 const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {
-    return *(const UnicodeSet *)(
-        ((CanonIterData *)canonIterDataSingleton.fInstance)->canonStartSets[n]);
+    return *(const UnicodeSet *)fCanonIterData->canonStartSets[n];
 }
 
 UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {
@@ -1939,7 +2032,7 @@ unorm2_swap(const UDataSwapper *ds,
         pInfo->dataFormat[1]==0x72 &&
         pInfo->dataFormat[2]==0x6d &&
         pInfo->dataFormat[3]==0x32 &&
-        pInfo->formatVersion[0]==1
+        (pInfo->formatVersion[0]==1 || pInfo->formatVersion[0]==2)
     )) {
         udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
                          pInfo->dataFormat[0], pInfo->dataFormat[1],
@@ -1998,10 +2091,14 @@ unorm2_swap(const UDataSwapper *ds,
         offset=nextOffset;
 
         /* swap the uint16_t extraData[] */
-        nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET+1];
+        nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET];
         ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
         offset=nextOffset;
 
+        /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */
+        nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1];
+        offset=nextOffset;
+
         U_ASSERT(offset==size);
     }