ICU-62141.0.1.tar.gz

[apple/icu.git] / icuSources / i18n / uspoof_impl.cpp
diff --git a/icuSources/i18n/uspoof_impl.cpp b/icuSources/i18n/uspoof_impl.cpp

index 4e98db548a48efae90b4b64ff710ae60c541561f..2c1f088b12db24bb4746892396cbdd63780026b8 100644 (file)
--- a/icuSources/i18n/uspoof_impl.cpp
+++ b/icuSources/i18n/uspoof_impl.cpp
@@ -1,3 +1,5 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
  /*
  **********************************************************************
  *   Copyright (C) 2008-2016, International Business Machines
@@ -13,11 +15,11 @@
  #include "utrie2.h"
  #include "cmemory.h"
  #include "cstring.h"
-#include "identifier_info.h"
  #include "scriptset.h"
  #include "umutex.h"
  #include "udataswp.h"
  #include "uassert.h"
+#include "ucln_in.h"
  #include "uspoof_impl.h"
  
  #if !UCONFIG_NO_NORMALIZATION
@@ -27,41 +29,53 @@ U_NAMESPACE_BEGIN
  
  UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
  
-SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) :
-        fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(data), fAllowedCharsSet(NULL) , 
-        fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
-    if (U_FAILURE(status)) {
-        return;
-    }
+SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode& status) {
+    construct(status);
+    fSpoofData = data;
+}
+
+SpoofImpl::SpoofImpl(UErrorCode& status) {
+    construct(status);
+
+    // TODO: Call this method where it is actually needed, instead of in the
+    // constructor, to allow for lazy data loading.  See #12696.
+    fSpoofData = SpoofData::getDefault(status);
+}
+
+SpoofImpl::SpoofImpl() {
+    UErrorCode status = U_ZERO_ERROR;
+    construct(status);
+
+    // TODO: Call this method where it is actually needed, instead of in the
+    // constructor, to allow for lazy data loading.  See #12696.
+    fSpoofData = SpoofData::getDefault(status);
+}
+
+void SpoofImpl::construct(UErrorCode& status) {
+    fMagic = USPOOF_MAGIC;
+    fChecks = USPOOF_ALL_CHECKS;
+    fSpoofData = NULL;
+    fAllowedCharsSet = NULL;
+    fAllowedLocales = NULL;
      fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
  
+    if (U_FAILURE(status)) { return; }
+
      UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
-    allowedCharsSet->freeze();
      fAllowedCharsSet = allowedCharsSet;
      fAllowedLocales  = uprv_strdup("");
      if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
          status = U_MEMORY_ALLOCATION_ERROR;
          return;
      }
-    fMagic = USPOOF_MAGIC;
-}
-
-
-SpoofImpl::SpoofImpl() :
-        fMagic(USPOOF_MAGIC), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) , 
-        fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
-    UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
      allowedCharsSet->freeze();
-    fAllowedCharsSet = allowedCharsSet;
-    fAllowedLocales  = uprv_strdup("");
-    fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
  }
  
  
  // Copy Constructor, used by the user level clone() function.
  SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status)  :
          fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) , 
-        fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
+        fAllowedLocales(NULL) {
      if (U_FAILURE(status)) {
          return;
      }
@@ -71,10 +85,10 @@ SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status)  :
          fSpoofData = src.fSpoofData->addReference();
      }
      fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone());
-    if (fAllowedCharsSet == NULL) {
+    fAllowedLocales = uprv_strdup(src.fAllowedLocales);
+    if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
          status = U_MEMORY_ALLOCATION_ERROR;
      }
-    fAllowedLocales = uprv_strdup(src.fAllowedLocales);
      fRestrictionLevel = src.fRestrictionLevel;
  }
  
@@ -86,7 +100,11 @@ SpoofImpl::~SpoofImpl() {
      }
      delete fAllowedCharsSet;
      uprv_free((void *)fAllowedLocales);
-    delete fCachedIdentifierInfo;
+}
+
+//  Cast this instance as a USpoofChecker for the C API.
+USpoofChecker *SpoofImpl::asUSpoofChecker() {
+    return reinterpret_cast<USpoofChecker*>(this);
  }
  
  //
@@ -102,12 +120,11 @@ const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &st
          return NULL;
      }
      SpoofImpl *This = (SpoofImpl *)sc;
-    if (This->fMagic != USPOOF_MAGIC ||
-        This->fSpoofData == NULL) {
+    if (This->fMagic != USPOOF_MAGIC) {
          status = U_INVALID_FORMAT_ERROR;
          return NULL;
      }
-    if (!SpoofData::validateDataVersion(This->fSpoofData->fRawData, status)) {
+    if (This->fSpoofData != NULL && !This->fSpoofData->validateDataVersion(status)) {
          return NULL;
      }
      return This;
@@ -119,148 +136,6 @@ SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {
  }
  
  
-
-//--------------------------------------------------------------------------------------
-//
-//  confusableLookup()    This is the heart of the confusable skeleton generation
-//                        implementation.
-//
-//                        Given a source character, produce the corresponding
-//                        replacement character(s), appending them to the dest string.
-//
-//---------------------------------------------------------------------------------------
-int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &dest) const {
-
-    // Binary search the spoof data key table for the inChar
-    int32_t  *low   = fSpoofData->fCFUKeys;
-    int32_t  *mid   = NULL;
-    int32_t  *limit = low + fSpoofData->fRawData->fCFUKeysSize;
-    UChar32   midc;
-    do {
-        int32_t delta = ((int32_t)(limit-low))/2;
-        mid = low + delta;
-        midc = *mid & 0x1fffff;
-        if (inChar == midc) {
-            goto foundChar;
-        } else if (inChar < midc) {
-            limit = mid;
-        } else {
-            low = mid;
-        }
-    } while (low < limit-1);
-    mid = low;
-    midc = *mid & 0x1fffff;
-    if (inChar != midc) {
-        // Char not found.  It maps to itself.
-        int i = 0;
-        dest.append(inChar);
-        return i;
-    } 
-  foundChar:
-    int32_t keyFlags = *mid & 0xff000000;
-    if ((keyFlags & tableMask) == 0) {
-        // We found the right key char, but the entry doesn't pertain to the
-        //  table we need.  See if there is an adjacent key that does
-        if (keyFlags & USPOOF_KEY_MULTIPLE_VALUES) {
-            int32_t *altMid;
-            for (altMid = mid-1; (*altMid&0x00ffffff) == inChar; altMid--) {
-                keyFlags = *altMid & 0xff000000;
-                if (keyFlags & tableMask) {
-                    mid = altMid;
-                    goto foundKey;
-                }
-            }
-            for (altMid = mid+1; (*altMid&0x00ffffff) == inChar; altMid++) {
-                keyFlags = *altMid & 0xff000000;
-                if (keyFlags & tableMask) {
-                    mid = altMid;
-                    goto foundKey;
-                }
-            }
-        }
-        // No key entry for this char & table.
-        // The input char maps to itself.
-        int i = 0;
-        dest.append(inChar);
-        return i;
-    }
-
-  foundKey:
-    int32_t  stringLen = USPOOF_KEY_LENGTH_FIELD(keyFlags) + 1;
-    int32_t keyTableIndex = (int32_t)(mid - fSpoofData->fCFUKeys);
-
-    // Value is either a UChar  (for strings of length 1) or
-    //                 an index into the string table (for longer strings)
-    uint16_t value = fSpoofData->fCFUValues[keyTableIndex];
-    if (stringLen == 1) {
-        dest.append((UChar)value);
-        return 1;
-    }
-
-    // String length of 4 from the above lookup is used for all strings of length >= 4.
-    // For these, get the real length from the string lengths table,
-    //   which maps string table indexes to lengths.
-    //   All strings of the same length are stored contiguously in the string table.
-    //   'value' from the lookup above is the starting index for the desired string.
-
-    int32_t ix;
-    if (stringLen == 4) {
-        int32_t stringLengthsLimit = fSpoofData->fRawData->fCFUStringLengthsSize;
-        for (ix = 0; ix < stringLengthsLimit; ix++) {
-            if (fSpoofData->fCFUStringLengths[ix].fLastString >= value) {
-                stringLen = fSpoofData->fCFUStringLengths[ix].fStrLength;
-                break;
-            }
-        }
-        U_ASSERT(ix < stringLengthsLimit);
-    }
-
-    U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen);
-    UChar *src = &fSpoofData->fCFUStrings[value];
-    dest.append(src, stringLen);
-    return stringLen;
-}
-
-
-//---------------------------------------------------------------------------------------
-//
-//  wholeScriptCheck()
-//
-//      Input text is already normalized to NFD
-//      Return the set of scripts, each of which can represent something that is
-//             confusable with the input text.  The script of the input text
-//             is included; input consisting of characters from a single script will
-//             always produce a result consisting of a set containing that script.
-//
-//---------------------------------------------------------------------------------------
-void SpoofImpl::wholeScriptCheck(
-        const UnicodeString &text, ScriptSet *result, UErrorCode &status) const {
-
-    UTrie2 *table =
-        (fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie;
-    result->setAll();
-    int32_t length = text.length();
-    for (int32_t inputIdx=0; inputIdx < length;) {
-        UChar32 c = text.char32At(inputIdx);
-        inputIdx += U16_LENGTH(c);
-        uint32_t index = utrie2_get32(table, c);
-        if (index == 0) {
-            // No confusables in another script for this char.
-            // TODO:  we should change the data to have sets with just the single script
-            //        bit for the script of this char.  Gets rid of this special case.
-            //        Until then, grab the script from the char and intersect it with the set.
-            UScriptCode cpScript = uscript_getScript(c, &status);
-            U_ASSERT(cpScript > USCRIPT_INHERITED);
-            result->intersect(cpScript, status);
-        } else if (index == 1) {
-            // Script == Common or Inherited.  Nothing to do.
-        } else {
-            result->intersect(fSpoofData->fScriptSets[index]);
-        }
-    }
-}
-
-
  void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {
      UnicodeSet    allowedChars;
      UnicodeSet    *tmpSet = NULL;
@@ -372,6 +247,174 @@ void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UEr
      }
  }
  
+// Computes the augmented script set for a code point, according to UTS 39 section 5.1.
+void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) {
+    result.resetAll();
+    result.setScriptExtensions(codePoint, status);
+    if (U_FAILURE(status)) { return; }
+
+    // Section 5.1 step 1
+    if (result.test(USCRIPT_HAN, status)) {
+        result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
+        result.set(USCRIPT_JAPANESE, status);
+        result.set(USCRIPT_KOREAN, status);
+    }
+    if (result.test(USCRIPT_HIRAGANA, status)) {
+        result.set(USCRIPT_JAPANESE, status);
+    }
+    if (result.test(USCRIPT_KATAKANA, status)) {
+        result.set(USCRIPT_JAPANESE, status);
+    }
+    if (result.test(USCRIPT_HANGUL, status)) {
+        result.set(USCRIPT_KOREAN, status);
+    }
+    if (result.test(USCRIPT_BOPOMOFO, status)) {
+        result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
+    }
+
+    // Section 5.1 step 2
+    if (result.test(USCRIPT_COMMON, status) || result.test(USCRIPT_INHERITED, status)) {
+        result.setAll();
+    }
+}
+
+// Computes the resolved script set for a string, according to UTS 39 section 5.1.
+void SpoofImpl::getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const {
+    getResolvedScriptSetWithout(input, USCRIPT_CODE_LIMIT, result, status);
+}
+
+// Computes the resolved script set for a string, omitting characters having the specified script.
+// If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included.
+void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const {
+    result.setAll();
+
+    ScriptSet temp;
+    UChar32 codePoint;
+    for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
+        codePoint = input.char32At(i);
+
+        // Compute the augmented script set for the character
+        getAugmentedScriptSet(codePoint, temp, status);
+        if (U_FAILURE(status)) { return; }
+
+        // Intersect the augmented script set with the resolved script set, but only if the character doesn't
+        // have the script specified in the function call
+        if (script == USCRIPT_CODE_LIMIT || !temp.test(script, status)) {
+            result.intersect(temp);
+        }
+    }
+}
+
+// Computes the set of numerics for a string, according to UTS 39 section 5.3.
+void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const {
+    result.clear();
+
+    UChar32 codePoint;
+    for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
+        codePoint = input.char32At(i);
+
+        // Store a representative character for each kind of decimal digit
+        if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) {
+            // Store the zero character as a representative for comparison.
+            // Unicode guarantees it is codePoint - value
+            result.add(codePoint - (UChar32)u_getNumericValue(codePoint));
+        }
+    }
+}
+
+// Computes the restriction level of a string, according to UTS 39 section 5.2.
+URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const {
+    // Section 5.2 step 1:
+    if (!fAllowedCharsSet->containsAll(input)) {
+        return USPOOF_UNRESTRICTIVE;
+    }
+
+    // Section 5.2 step 2
+    // Java uses a static UnicodeSet for this test.  In C++, avoid the static variable
+    // and just do a simple for loop.
+    UBool allASCII = TRUE;
+    for (int32_t i=0, length=input.length(); i<length; i++) {
+        if (input.charAt(i) > 0x7f) {
+            allASCII = FALSE;
+            break;
+        }
+    }
+    if (allASCII) {
+        return USPOOF_ASCII;
+    }
+
+    // Section 5.2 step 3:
+    ScriptSet resolvedScriptSet;
+    getResolvedScriptSet(input, resolvedScriptSet, status);
+    if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
+
+    // Section 5.2 step 4:
+    if (!resolvedScriptSet.isEmpty()) {
+        return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
+    }
+
+    // Section 5.2 step 5:
+    ScriptSet resolvedNoLatn;
+    getResolvedScriptSetWithout(input, USCRIPT_LATIN, resolvedNoLatn, status);
+    if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
+
+    // Section 5.2 step 6:
+    if (resolvedNoLatn.test(USCRIPT_HAN_WITH_BOPOMOFO, status)
+            || resolvedNoLatn.test(USCRIPT_JAPANESE, status)
+            || resolvedNoLatn.test(USCRIPT_KOREAN, status)) {
+        return USPOOF_HIGHLY_RESTRICTIVE;
+    }
+
+    // Section 5.2 step 7:
+    if (!resolvedNoLatn.isEmpty()
+            && !resolvedNoLatn.test(USCRIPT_CYRILLIC, status)
+            && !resolvedNoLatn.test(USCRIPT_GREEK, status)
+            && !resolvedNoLatn.test(USCRIPT_CHEROKEE, status)) {
+        return USPOOF_MODERATELY_RESTRICTIVE;
+    }
+
+    // Section 5.2 step 8:
+    return USPOOF_MINIMALLY_RESTRICTIVE;
+}
+
+int32_t SpoofImpl::findHiddenOverlay(const UnicodeString& input, UErrorCode&) const {
+    bool sawLeadCharacter = false;
+    for (int32_t i=0; i<input.length();) {
+        UChar32 cp = input.char32At(i);
+        if (sawLeadCharacter && cp == 0x0307) {
+            return i;
+        }
+        uint8_t combiningClass = u_getCombiningClass(cp);
+        // Skip over characters except for those with combining class 0 (non-combining characters) or with
+        // combining class 230 (same class as U+0307)
+        U_ASSERT(u_getCombiningClass(0x0307) == 230);
+        if (combiningClass == 0 || combiningClass == 230) {
+            sawLeadCharacter = isIllegalCombiningDotLeadCharacter(cp);
+        }
+        i += U16_LENGTH(cp);
+    }
+    return -1;
+}
+
+static inline bool isIllegalCombiningDotLeadCharacterNoLookup(UChar32 cp) {
+    return cp == u'i' || cp == u'j' || cp == u'ı' || cp == u'ȷ' || cp == u'l' ||
+           u_hasBinaryProperty(cp, UCHAR_SOFT_DOTTED);
+}
+
+bool SpoofImpl::isIllegalCombiningDotLeadCharacter(UChar32 cp) const {
+    if (isIllegalCombiningDotLeadCharacterNoLookup(cp)) {
+        return true;
+    }
+    UnicodeString skelStr;
+    fSpoofData->confusableLookup(cp, skelStr);
+    UChar32 finalCp = skelStr.char32At(skelStr.moveIndex32(skelStr.length(), -1));
+    if (finalCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(finalCp)) {
+        return true;
+    }
+    return false;
+}
+
+
  
  // Convert a text format hex number.  Utility function used by builder code.  Static.
  // Input: UChar *string text.  Output: a UChar32
@@ -404,55 +447,60 @@ UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorC
      return (UChar32)val;
  }
  
-// IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create.
-//                       Maintain a one-element cache, which is sufficient to avoid repeatedly
-//                       creating new ones unless we get multi-thread concurrency in spoof
-//                       check operations, which should be statistically uncommon.
  
-// These functions are used in place of new & delete of an IdentifierInfo.
-// They will recycle the IdentifierInfo when possible.
-// They are logically const, and used within const functions that must be thread safe.
-IdentifierInfo *SpoofImpl::getIdentifierInfo(UErrorCode &status) const {
-    IdentifierInfo *returnIdInfo = NULL;
-    if (U_FAILURE(status)) {
-        return returnIdInfo;
-    }
-    SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this);
-    {
-        Mutex m;
-        returnIdInfo = nonConstThis->fCachedIdentifierInfo;
-        nonConstThis->fCachedIdentifierInfo = NULL;
-    }
-    if (returnIdInfo == NULL) {
-        returnIdInfo = new IdentifierInfo(status);
-        if (U_SUCCESS(status) && returnIdInfo == NULL) {
-            status = U_MEMORY_ALLOCATION_ERROR;
-        }
-        if (U_FAILURE(status) && returnIdInfo != NULL) {
-            delete returnIdInfo;
-            returnIdInfo = NULL;
-        }
-    }
-    return returnIdInfo;
+//-----------------------------------------
+//
+//   class CheckResult Implementation
+//
+//-----------------------------------------
+
+CheckResult::CheckResult() : fMagic(USPOOF_CHECK_MAGIC) {
+    clear();
  }
  
+USpoofCheckResult* CheckResult::asUSpoofCheckResult() {
+    return reinterpret_cast<USpoofCheckResult*>(this);
+}
  
-void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const {
-    if (idInfo != NULL) {
-        SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this);
-        {
-            Mutex m;
-            if (nonConstThis->fCachedIdentifierInfo == NULL) {
-                nonConstThis->fCachedIdentifierInfo = idInfo;
-                idInfo = NULL;
-            }
-        }
-        delete idInfo;
+//
+//  Incoming parameter check on Status and the CheckResult object
+//    received from the C API.
+//
+const CheckResult* CheckResult::validateThis(const USpoofCheckResult *ptr, UErrorCode &status) {
+    if (U_FAILURE(status)) { return NULL; }
+    if (ptr == NULL) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return NULL;
      }
+    CheckResult *This = (CheckResult*) ptr;
+    if (This->fMagic != USPOOF_CHECK_MAGIC) {
+        status = U_INVALID_FORMAT_ERROR;
+        return NULL;
+    }
+    return This;
  }
  
+CheckResult* CheckResult::validateThis(USpoofCheckResult *ptr, UErrorCode &status) {
+    return const_cast<CheckResult *>
+        (CheckResult::validateThis(const_cast<const USpoofCheckResult*>(ptr), status));
+}
  
+void CheckResult::clear() {
+    fChecks = 0;
+    fNumerics.clear();
+    fRestrictionLevel = USPOOF_UNDEFINED_RESTRICTIVE;
+}
+
+int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks) {
+    if ((enabledChecks & USPOOF_AUX_INFO) != 0 && fRestrictionLevel != USPOOF_UNDEFINED_RESTRICTIVE) {
+        return fChecks | fRestrictionLevel;
+    } else {
+        return fChecks;
+    }
+}
  
+CheckResult::~CheckResult() {
+}
  
  //----------------------------------------------------------------------------------------------
  //
@@ -461,12 +509,14 @@ void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const {
  //----------------------------------------------------------------------------------------------
  
  
-UBool SpoofData::validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status) {
+UBool SpoofData::validateDataVersion(UErrorCode &status) const {
      if (U_FAILURE(status) ||
-        rawData == NULL ||
-        rawData->fMagic != USPOOF_MAGIC ||
-        rawData->fFormatVersion[0] > 1 ||
-        rawData->fFormatVersion[1] > 0) {
+        fRawData == NULL ||
+        fRawData->fMagic != USPOOF_MAGIC ||
+        fRawData->fFormatVersion[0] != USPOOF_CONFUSABLE_DATA_FORMAT_VERSION ||
+        fRawData->fFormatVersion[1] != 0 ||
+        fRawData->fFormatVersion[2] != 0 ||
+        fRawData->fFormatVersion[3] != 0) {
              status = U_INVALID_FORMAT_ERROR;
              return FALSE;
      }
@@ -485,7 +535,7 @@ spoofDataIsAcceptable(void *context,
          pInfo->dataFormat[1] == 0x66 &&
          pInfo->dataFormat[2] == 0x75 &&
          pInfo->dataFormat[3] == 0x20 &&
-        pInfo->formatVersion[0] == 1
+        pInfo->formatVersion[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION
      ) {
          UVersionInfo *version = static_cast<UVersionInfo *>(context);
          if(version != NULL) {
@@ -497,32 +547,62 @@ spoofDataIsAcceptable(void *context,
      }
  }
  
+//  Methods for the loading of the default confusables data file.  The confusable
+//  data is loaded only when it is needed.
  //
-//  SpoofData::getDefault() - return a wrapper around the spoof data that is
-//                            baked into the default ICU data.
+//  SpoofData::getDefault() - Return the default confusables data, loading it via
+//                            initOnce() if it is not yet available.  Adds a reference
+//                            to the SpoofData that the caller is responsible for
+//                            decrementing when they are done with the data.
  //
-//               Called once, from the initOnce() function in uspoof_impl.cpp; the resulting
-//               SpoofData is shared by all spoof checkers using the default data.
+//  uspoof_loadDefaultData - Called once, from initOnce().  The resulting SpoofData
+//                           is shared by all spoof checkers using the default data.
  //
-SpoofData *SpoofData::getDefault(UErrorCode &status) {
-    UDataMemory *udm = udata_openChoice(NULL, "cfu", "confusables",
+//  uspoof_cleanupDefaultData - Called during cleanup.
+//
+
+static UInitOnce gSpoofInitDefaultOnce = U_INITONCE_INITIALIZER;
+static SpoofData* gDefaultSpoofData;
+
+static UBool U_CALLCONV
+uspoof_cleanupDefaultData(void) {
+    if (gDefaultSpoofData) {
+        // Will delete, assuming all user-level spoof checkers were closed.
+        gDefaultSpoofData->removeReference();
+        gDefaultSpoofData = nullptr;
+        gSpoofInitDefaultOnce.reset();
+    }
+    return TRUE;
+}
+
+static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) {
+    UDataMemory *udm = udata_openChoice(nullptr, "cfu", "confusables",
                                          spoofDataIsAcceptable, 
-                                        NULL,       // context, would receive dataVersion if supplied.
+                                        nullptr,       // context, would receive dataVersion if supplied.
                                          &status);
+    if (U_FAILURE(status)) { return; }
+    gDefaultSpoofData = new SpoofData(udm, status);
      if (U_FAILURE(status)) {
-        return NULL;
-    }
-    SpoofData *This = new SpoofData(udm, status);
-    if (U_FAILURE(status)) {
-        delete This;
-        return NULL;
+        delete gDefaultSpoofData;
+        gDefaultSpoofData = nullptr;
+        return;
      }
-    if (This == NULL) {
+    if (gDefaultSpoofData == nullptr) {
          status = U_MEMORY_ALLOCATION_ERROR;
+        return;
      }
-    return This;
+    ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA, uspoof_cleanupDefaultData);
  }
  
+SpoofData* SpoofData::getDefault(UErrorCode& status) {
+    umtx_initOnce(gSpoofInitDefaultOnce, &uspoof_loadDefaultData, status);
+    if (U_FAILURE(status)) { return NULL; }
+    gDefaultSpoofData->addReference();
+    return gDefaultSpoofData;
+}
+
+
+
  SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
  {
      reset();
@@ -533,7 +613,7 @@ SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
      // fRawData is non-const because it may be constructed by the data builder.
      fRawData = reinterpret_cast<SpoofDataHeader *>(
              const_cast<void *>(udata_getMemory(udm)));
-    validateDataVersion(fRawData, status);
+    validateDataVersion(status);
      initPtrs(status);
  }
  
@@ -548,13 +628,17 @@ SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)
          status = U_INVALID_FORMAT_ERROR;
          return;
      }
+    if (data == NULL) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
      void *ncData = const_cast<void *>(data);
      fRawData = static_cast<SpoofDataHeader *>(ncData);
      if (length < fRawData->fLength) {
          status = U_INVALID_FORMAT_ERROR;
          return;
      }
-    validateDataVersion(fRawData, status);
+    validateDataVersion(status);
      initPtrs(status);
  }
  
@@ -582,7 +666,7 @@ SpoofData::SpoofData(UErrorCode &status) {
      uprv_memset(fRawData, 0, initialSize);
  
      fRawData->fMagic = USPOOF_MAGIC;
-    fRawData->fFormatVersion[0] = 1;
+    fRawData->fFormatVersion[0] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION;
      fRawData->fFormatVersion[1] = 0;
      fRawData->fFormatVersion[2] = 0;
      fRawData->fFormatVersion[3] = 0;
@@ -600,11 +684,7 @@ void SpoofData::reset() {
     fRefCount = 1;
     fCFUKeys = NULL;
     fCFUValues = NULL;
-   fCFUStringLengths = NULL;
     fCFUStrings = NULL;
-   fAnyCaseTrie = NULL;
-   fLowerCaseTrie = NULL;
-   fScriptSets = NULL;
  }
  
  
@@ -626,7 +706,6 @@ void SpoofData::reset() {
  void SpoofData::initPtrs(UErrorCode &status) {
      fCFUKeys = NULL;
      fCFUValues = NULL;
-    fCFUStringLengths = NULL;
      fCFUStrings = NULL;
      if (U_FAILURE(status)) {
          return;
@@ -637,33 +716,13 @@ void SpoofData::initPtrs(UErrorCode &status) {
      if (fRawData->fCFUStringIndex != 0) {
          fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex);
      }
-    if (fRawData->fCFUStringLengths != 0) {
-        fCFUStringLengths = (SpoofStringLengthsElement *)((char *)fRawData + fRawData->fCFUStringLengths);
-    }
      if (fRawData->fCFUStringTable != 0) {
          fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable);
      }
-
-    if (fAnyCaseTrie ==  NULL && fRawData->fAnyCaseTrie != 0) {
-        fAnyCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
-            (char *)fRawData + fRawData->fAnyCaseTrie, fRawData->fAnyCaseTrieLength, NULL, &status);
-    }
-    if (fLowerCaseTrie ==  NULL && fRawData->fLowerCaseTrie != 0) {
-        fLowerCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
-            (char *)fRawData + fRawData->fLowerCaseTrie, fRawData->fLowerCaseTrieLength, NULL, &status);
-    }
-    
-    if (fRawData->fScriptSets != 0) {
-        fScriptSets = (ScriptSet *)((char *)fRawData + fRawData->fScriptSets);
-    }
  }
  
  
  SpoofData::~SpoofData() {
-    utrie2_close(fAnyCaseTrie);
-    fAnyCaseTrie = NULL;
-    utrie2_close(fLowerCaseTrie);
-    fLowerCaseTrie = NULL;
      if (fDataOwned) {
          uprv_free(fRawData);
      }
@@ -708,6 +767,78 @@ void *SpoofData::reserveSpace(int32_t numBytes,  UErrorCode &status) {
      return (char *)fRawData + returnOffset;
  }
  
+int32_t SpoofData::serialize(void *buf, int32_t capacity, UErrorCode &status) const {
+    int32_t dataSize = fRawData->fLength;
+    if (capacity < dataSize) {
+        status = U_BUFFER_OVERFLOW_ERROR;
+        return dataSize;
+    }
+    uprv_memcpy(buf, fRawData, dataSize);
+    return dataSize;
+}
+
+int32_t SpoofData::size() const {
+    return fRawData->fLength;
+}
+
+//-------------------------------
+//
+// Front-end APIs for SpoofData
+//
+//-------------------------------
+
+int32_t SpoofData::confusableLookup(UChar32 inChar, UnicodeString &dest) const {
+    // Perform a binary search.
+    // [lo, hi), i.e., lo is inclusive, hi is exclusive.
+    // The result after the loop will be in lo.
+    int32_t lo = 0;
+    int32_t hi = length();
+    do {
+        int32_t mid = (lo + hi) / 2;
+        if (codePointAt(mid) > inChar) {
+            hi = mid;
+        } else if (codePointAt(mid) < inChar) {
+            lo = mid;
+        } else {
+            // Found result.  Break early.
+            lo = mid;
+            break;
+        }
+    } while (hi - lo > 1);
+
+    // Did we find an entry?  If not, the char maps to itself.
+    if (codePointAt(lo) != inChar) {
+        dest.append(inChar);
+        return 1;
+    }
+
+    // Append the mapped value to dest and return its length.
+    return appendValueTo(lo, dest);
+}
+
+int32_t SpoofData::length() const {
+    return fRawData->fCFUKeysSize;
+}
+
+UChar32 SpoofData::codePointAt(int32_t index) const {
+    return ConfusableDataUtils::keyToCodePoint(fCFUKeys[index]);
+}
+
+int32_t SpoofData::appendValueTo(int32_t index, UnicodeString& dest) const {
+    int32_t stringLength = ConfusableDataUtils::keyToLength(fCFUKeys[index]);
+
+    // Value is either a char (for strings of length 1) or
+    // an index into the string table (for longer strings)
+    uint16_t value = fCFUValues[index];
+    if (stringLength == 1) {
+        dest.append((UChar)value);
+    } else {
+        dest.append(fCFUStrings + value, stringLength);
+    }
+
+    return stringLength;
+}
+
  
  U_NAMESPACE_END
  
@@ -739,7 +870,10 @@ uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *ou
             pInfo->dataFormat[1]==0x66 &&
             pInfo->dataFormat[2]==0x75 &&
             pInfo->dataFormat[3]==0x20 &&
-           pInfo->formatVersion[0]==1  )) {
+           pInfo->formatVersion[0]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION &&
+           pInfo->formatVersion[1]==0 &&
+           pInfo->formatVersion[2]==0 &&
+           pInfo->formatVersion[3]==0  )) {
          udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
                               "(format version %02x %02x %02x %02x) is not recognized\n",
                           pInfo->dataFormat[0], pInfo->dataFormat[1],
@@ -828,26 +962,6 @@ uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *ou
      sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;
      ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
  
-    // String Lengths Section
-    sectionStart  = ds->readUInt32(spoofDH->fCFUStringLengths);
-    sectionLength = ds->readUInt32(spoofDH->fCFUStringLengthsSize) * 4;
-    ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
-
-    // Any Case Trie
-    sectionStart  = ds->readUInt32(spoofDH->fAnyCaseTrie);
-    sectionLength = ds->readUInt32(spoofDH->fAnyCaseTrieLength);
-    utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
-
-    // Lower Case Trie
-    sectionStart  = ds->readUInt32(spoofDH->fLowerCaseTrie);
-    sectionLength = ds->readUInt32(spoofDH->fLowerCaseTrieLength);
-    utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
-
-    // Script Sets.  The data is an array of int32_t
-    sectionStart  = ds->readUInt32(spoofDH->fScriptSets);
-    sectionLength = ds->readUInt32(spoofDH->fScriptSetsLength) * sizeof(ScriptSet);
-    ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
-
      // And, last, swap the header itself.
      //   int32_t   fMagic             // swap this
      //   uint8_t   fFormatVersion[4]  // Do not swap this, just copy