X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/b331163bffd790ced0e88b73f44f86d49ccc48a5..249c4c5ea9376c24572daf9c2effa7484a282f14:/icuSources/i18n/uspoof_impl.cpp?ds=sidebyside diff --git a/icuSources/i18n/uspoof_impl.cpp b/icuSources/i18n/uspoof_impl.cpp index 9c662f80..2c1f088b 100644 --- a/icuSources/i18n/uspoof_impl.cpp +++ b/icuSources/i18n/uspoof_impl.cpp @@ -1,6 +1,8 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** -* Copyright (C) 2008-2014, International Business Machines +* Copyright (C) 2008-2016, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -13,11 +15,11 @@ #include "utrie2.h" #include "cmemory.h" #include "cstring.h" -#include "identifier_info.h" #include "scriptset.h" #include "umutex.h" #include "udataswp.h" #include "uassert.h" +#include "ucln_in.h" #include "uspoof_impl.h" #if !UCONFIG_NO_NORMALIZATION @@ -27,42 +29,53 @@ U_NAMESPACE_BEGIN UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl) -SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) : - fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) , - fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) { - if (U_FAILURE(status)) { - return; - } +SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode& status) { + construct(status); fSpoofData = data; +} + +SpoofImpl::SpoofImpl(UErrorCode& status) { + construct(status); + + // TODO: Call this method where it is actually needed, instead of in the + // constructor, to allow for lazy data loading. See #12696. + fSpoofData = SpoofData::getDefault(status); +} + +SpoofImpl::SpoofImpl() { + UErrorCode status = U_ZERO_ERROR; + construct(status); + + // TODO: Call this method where it is actually needed, instead of in the + // constructor, to allow for lazy data loading. See #12696. + fSpoofData = SpoofData::getDefault(status); +} + +void SpoofImpl::construct(UErrorCode& status) { + fMagic = USPOOF_MAGIC; + fChecks = USPOOF_ALL_CHECKS; + fSpoofData = NULL; + fAllowedCharsSet = NULL; + fAllowedLocales = NULL; fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE; + if (U_FAILURE(status)) { return; } + UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff); - allowedCharsSet->freeze(); fAllowedCharsSet = allowedCharsSet; fAllowedLocales = uprv_strdup(""); if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) { status = U_MEMORY_ALLOCATION_ERROR; return; } - fMagic = USPOOF_MAGIC; -} - - -SpoofImpl::SpoofImpl() : - fMagic(USPOOF_MAGIC), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) , - fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) { - UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff); allowedCharsSet->freeze(); - fAllowedCharsSet = allowedCharsSet; - fAllowedLocales = uprv_strdup(""); - fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE; } // Copy Constructor, used by the user level clone() function. SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) : fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) , - fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) { + fAllowedLocales(NULL) { if (U_FAILURE(status)) { return; } @@ -72,10 +85,10 @@ SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) : fSpoofData = src.fSpoofData->addReference(); } fAllowedCharsSet = static_cast(src.fAllowedCharsSet->clone()); - if (fAllowedCharsSet == NULL) { + fAllowedLocales = uprv_strdup(src.fAllowedLocales); + if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) { status = U_MEMORY_ALLOCATION_ERROR; } - fAllowedLocales = uprv_strdup(src.fAllowedLocales); fRestrictionLevel = src.fRestrictionLevel; } @@ -87,7 +100,11 @@ SpoofImpl::~SpoofImpl() { } delete fAllowedCharsSet; uprv_free((void *)fAllowedLocales); - delete fCachedIdentifierInfo; +} + +// Cast this instance as a USpoofChecker for the C API. +USpoofChecker *SpoofImpl::asUSpoofChecker() { + return reinterpret_cast(this); } // @@ -103,12 +120,11 @@ const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &st return NULL; } SpoofImpl *This = (SpoofImpl *)sc; - if (This->fMagic != USPOOF_MAGIC || - This->fSpoofData == NULL) { + if (This->fMagic != USPOOF_MAGIC) { status = U_INVALID_FORMAT_ERROR; return NULL; } - if (!SpoofData::validateDataVersion(This->fSpoofData->fRawData, status)) { + if (This->fSpoofData != NULL && !This->fSpoofData->validateDataVersion(status)) { return NULL; } return This; @@ -120,148 +136,6 @@ SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) { } - -//-------------------------------------------------------------------------------------- -// -// confusableLookup() This is the heart of the confusable skeleton generation -// implementation. -// -// Given a source character, produce the corresponding -// replacement character(s), appending them to the dest string. -// -//--------------------------------------------------------------------------------------- -int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &dest) const { - - // Binary search the spoof data key table for the inChar - int32_t *low = fSpoofData->fCFUKeys; - int32_t *mid = NULL; - int32_t *limit = low + fSpoofData->fRawData->fCFUKeysSize; - UChar32 midc; - do { - int32_t delta = ((int32_t)(limit-low))/2; - mid = low + delta; - midc = *mid & 0x1fffff; - if (inChar == midc) { - goto foundChar; - } else if (inChar < midc) { - limit = mid; - } else { - low = mid; - } - } while (low < limit-1); - mid = low; - midc = *mid & 0x1fffff; - if (inChar != midc) { - // Char not found. It maps to itself. - int i = 0; - dest.append(inChar); - return i; - } - foundChar: - int32_t keyFlags = *mid & 0xff000000; - if ((keyFlags & tableMask) == 0) { - // We found the right key char, but the entry doesn't pertain to the - // table we need. See if there is an adjacent key that does - if (keyFlags & USPOOF_KEY_MULTIPLE_VALUES) { - int32_t *altMid; - for (altMid = mid-1; (*altMid&0x00ffffff) == inChar; altMid--) { - keyFlags = *altMid & 0xff000000; - if (keyFlags & tableMask) { - mid = altMid; - goto foundKey; - } - } - for (altMid = mid+1; (*altMid&0x00ffffff) == inChar; altMid++) { - keyFlags = *altMid & 0xff000000; - if (keyFlags & tableMask) { - mid = altMid; - goto foundKey; - } - } - } - // No key entry for this char & table. - // The input char maps to itself. - int i = 0; - dest.append(inChar); - return i; - } - - foundKey: - int32_t stringLen = USPOOF_KEY_LENGTH_FIELD(keyFlags) + 1; - int32_t keyTableIndex = (int32_t)(mid - fSpoofData->fCFUKeys); - - // Value is either a UChar (for strings of length 1) or - // an index into the string table (for longer strings) - uint16_t value = fSpoofData->fCFUValues[keyTableIndex]; - if (stringLen == 1) { - dest.append((UChar)value); - return 1; - } - - // String length of 4 from the above lookup is used for all strings of length >= 4. - // For these, get the real length from the string lengths table, - // which maps string table indexes to lengths. - // All strings of the same length are stored contiguously in the string table. - // 'value' from the lookup above is the starting index for the desired string. - - int32_t ix; - if (stringLen == 4) { - int32_t stringLengthsLimit = fSpoofData->fRawData->fCFUStringLengthsSize; - for (ix = 0; ix < stringLengthsLimit; ix++) { - if (fSpoofData->fCFUStringLengths[ix].fLastString >= value) { - stringLen = fSpoofData->fCFUStringLengths[ix].fStrLength; - break; - } - } - U_ASSERT(ix < stringLengthsLimit); - } - - U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen); - UChar *src = &fSpoofData->fCFUStrings[value]; - dest.append(src, stringLen); - return stringLen; -} - - -//--------------------------------------------------------------------------------------- -// -// wholeScriptCheck() -// -// Input text is already normalized to NFD -// Return the set of scripts, each of which can represent something that is -// confusable with the input text. The script of the input text -// is included; input consisting of characters from a single script will -// always produce a result consisting of a set containing that script. -// -//--------------------------------------------------------------------------------------- -void SpoofImpl::wholeScriptCheck( - const UnicodeString &text, ScriptSet *result, UErrorCode &status) const { - - UTrie2 *table = - (fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie; - result->setAll(); - int32_t length = text.length(); - for (int32_t inputIdx=0; inputIdx < length;) { - UChar32 c = text.char32At(inputIdx); - inputIdx += U16_LENGTH(c); - uint32_t index = utrie2_get32(table, c); - if (index == 0) { - // No confusables in another script for this char. - // TODO: we should change the data to have sets with just the single script - // bit for the script of this char. Gets rid of this special case. - // Until then, grab the script from the char and intersect it with the set. - UScriptCode cpScript = uscript_getScript(c, &status); - U_ASSERT(cpScript > USCRIPT_INHERITED); - result->intersect(cpScript, status); - } else if (index == 1) { - // Script == Common or Inherited. Nothing to do. - } else { - result->intersect(fSpoofData->fScriptSets[index]); - } - } -} - - void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) { UnicodeSet allowedChars; UnicodeSet *tmpSet = NULL; @@ -357,7 +231,7 @@ const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) { void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) { UScriptCode scripts[30]; - int32_t numScripts = uscript_getCode(locale, scripts, sizeof(scripts)/sizeof(UScriptCode), &status); + int32_t numScripts = uscript_getCode(locale, scripts, UPRV_LENGTHOF(scripts), &status); if (U_FAILURE(status)) { return; } @@ -373,6 +247,174 @@ void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UEr } } +// Computes the augmented script set for a code point, according to UTS 39 section 5.1. +void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) { + result.resetAll(); + result.setScriptExtensions(codePoint, status); + if (U_FAILURE(status)) { return; } + + // Section 5.1 step 1 + if (result.test(USCRIPT_HAN, status)) { + result.set(USCRIPT_HAN_WITH_BOPOMOFO, status); + result.set(USCRIPT_JAPANESE, status); + result.set(USCRIPT_KOREAN, status); + } + if (result.test(USCRIPT_HIRAGANA, status)) { + result.set(USCRIPT_JAPANESE, status); + } + if (result.test(USCRIPT_KATAKANA, status)) { + result.set(USCRIPT_JAPANESE, status); + } + if (result.test(USCRIPT_HANGUL, status)) { + result.set(USCRIPT_KOREAN, status); + } + if (result.test(USCRIPT_BOPOMOFO, status)) { + result.set(USCRIPT_HAN_WITH_BOPOMOFO, status); + } + + // Section 5.1 step 2 + if (result.test(USCRIPT_COMMON, status) || result.test(USCRIPT_INHERITED, status)) { + result.setAll(); + } +} + +// Computes the resolved script set for a string, according to UTS 39 section 5.1. +void SpoofImpl::getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const { + getResolvedScriptSetWithout(input, USCRIPT_CODE_LIMIT, result, status); +} + +// Computes the resolved script set for a string, omitting characters having the specified script. +// If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included. +void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const { + result.setAll(); + + ScriptSet temp; + UChar32 codePoint; + for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) { + codePoint = input.char32At(i); + + // Compute the augmented script set for the character + getAugmentedScriptSet(codePoint, temp, status); + if (U_FAILURE(status)) { return; } + + // Intersect the augmented script set with the resolved script set, but only if the character doesn't + // have the script specified in the function call + if (script == USCRIPT_CODE_LIMIT || !temp.test(script, status)) { + result.intersect(temp); + } + } +} + +// Computes the set of numerics for a string, according to UTS 39 section 5.3. +void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const { + result.clear(); + + UChar32 codePoint; + for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) { + codePoint = input.char32At(i); + + // Store a representative character for each kind of decimal digit + if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) { + // Store the zero character as a representative for comparison. + // Unicode guarantees it is codePoint - value + result.add(codePoint - (UChar32)u_getNumericValue(codePoint)); + } + } +} + +// Computes the restriction level of a string, according to UTS 39 section 5.2. +URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const { + // Section 5.2 step 1: + if (!fAllowedCharsSet->containsAll(input)) { + return USPOOF_UNRESTRICTIVE; + } + + // Section 5.2 step 2 + // Java use a static UnicodeSet for this test. In C++, avoid the static variable + // and just do a simple for loop. + UBool allASCII = TRUE; + for (int32_t i=0, length=input.length(); i 0x7f) { + allASCII = FALSE; + break; + } + } + if (allASCII) { + return USPOOF_ASCII; + } + + // Section 5.2 steps 3: + ScriptSet resolvedScriptSet; + getResolvedScriptSet(input, resolvedScriptSet, status); + if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; } + + // Section 5.2 step 4: + if (!resolvedScriptSet.isEmpty()) { + return USPOOF_SINGLE_SCRIPT_RESTRICTIVE; + } + + // Section 5.2 step 5: + ScriptSet resolvedNoLatn; + getResolvedScriptSetWithout(input, USCRIPT_LATIN, resolvedNoLatn, status); + if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; } + + // Section 5.2 step 6: + if (resolvedNoLatn.test(USCRIPT_HAN_WITH_BOPOMOFO, status) + || resolvedNoLatn.test(USCRIPT_JAPANESE, status) + || resolvedNoLatn.test(USCRIPT_KOREAN, status)) { + return USPOOF_HIGHLY_RESTRICTIVE; + } + + // Section 5.2 step 7: + if (!resolvedNoLatn.isEmpty() + && !resolvedNoLatn.test(USCRIPT_CYRILLIC, status) + && !resolvedNoLatn.test(USCRIPT_GREEK, status) + && !resolvedNoLatn.test(USCRIPT_CHEROKEE, status)) { + return USPOOF_MODERATELY_RESTRICTIVE; + } + + // Section 5.2 step 8: + return USPOOF_MINIMALLY_RESTRICTIVE; +} + +int32_t SpoofImpl::findHiddenOverlay(const UnicodeString& input, UErrorCode&) const { + bool sawLeadCharacter = false; + for (int32_t i=0; iconfusableLookup(cp, skelStr); + UChar32 finalCp = skelStr.char32At(skelStr.moveIndex32(skelStr.length(), -1)); + if (finalCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(finalCp)) { + return true; + } + return false; +} + + // Convert a text format hex number. Utility function used by builder code. Static. // Input: UChar *string text. Output: a UChar32 @@ -405,55 +447,60 @@ UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorC return (UChar32)val; } -// IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create. -// Maintain a one-element cache, which is sufficient to avoid repeatedly -// creating new ones unless we get multi-thread concurrency in spoof -// check operations, which should be statistically uncommon. -// These functions are used in place of new & delete of an IdentifierInfo. -// They will recycle the IdentifierInfo when possible. -// They are logically const, and used within const functions that must be thread safe. -IdentifierInfo *SpoofImpl::getIdentifierInfo(UErrorCode &status) const { - IdentifierInfo *returnIdInfo = NULL; - if (U_FAILURE(status)) { - return returnIdInfo; - } - SpoofImpl *nonConstThis = const_cast(this); - { - Mutex m; - returnIdInfo = nonConstThis->fCachedIdentifierInfo; - nonConstThis->fCachedIdentifierInfo = NULL; - } - if (returnIdInfo == NULL) { - returnIdInfo = new IdentifierInfo(status); - if (U_SUCCESS(status) && returnIdInfo == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - } - if (U_FAILURE(status) && returnIdInfo != NULL) { - delete returnIdInfo; - returnIdInfo = NULL; - } - } - return returnIdInfo; +//----------------------------------------- +// +// class CheckResult Implementation +// +//----------------------------------------- + +CheckResult::CheckResult() : fMagic(USPOOF_CHECK_MAGIC) { + clear(); } +USpoofCheckResult* CheckResult::asUSpoofCheckResult() { + return reinterpret_cast(this); +} -void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const { - if (idInfo != NULL) { - SpoofImpl *nonConstThis = const_cast(this); - { - Mutex m; - if (nonConstThis->fCachedIdentifierInfo == NULL) { - nonConstThis->fCachedIdentifierInfo = idInfo; - idInfo = NULL; - } - } - delete idInfo; +// +// Incoming parameter check on Status and the CheckResult object +// received from the C API. +// +const CheckResult* CheckResult::validateThis(const USpoofCheckResult *ptr, UErrorCode &status) { + if (U_FAILURE(status)) { return NULL; } + if (ptr == NULL) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return NULL; } + CheckResult *This = (CheckResult*) ptr; + if (This->fMagic != USPOOF_CHECK_MAGIC) { + status = U_INVALID_FORMAT_ERROR; + return NULL; + } + return This; } +CheckResult* CheckResult::validateThis(USpoofCheckResult *ptr, UErrorCode &status) { + return const_cast + (CheckResult::validateThis(const_cast(ptr), status)); +} + +void CheckResult::clear() { + fChecks = 0; + fNumerics.clear(); + fRestrictionLevel = USPOOF_UNDEFINED_RESTRICTIVE; +} +int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks) { + if ((enabledChecks & USPOOF_AUX_INFO) != 0 && fRestrictionLevel != USPOOF_UNDEFINED_RESTRICTIVE) { + return fChecks | fRestrictionLevel; + } else { + return fChecks; + } +} +CheckResult::~CheckResult() { +} //---------------------------------------------------------------------------------------------- // @@ -462,12 +509,14 @@ void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const { //---------------------------------------------------------------------------------------------- -UBool SpoofData::validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status) { +UBool SpoofData::validateDataVersion(UErrorCode &status) const { if (U_FAILURE(status) || - rawData == NULL || - rawData->fMagic != USPOOF_MAGIC || - rawData->fFormatVersion[0] > 1 || - rawData->fFormatVersion[1] > 0) { + fRawData == NULL || + fRawData->fMagic != USPOOF_MAGIC || + fRawData->fFormatVersion[0] != USPOOF_CONFUSABLE_DATA_FORMAT_VERSION || + fRawData->fFormatVersion[1] != 0 || + fRawData->fFormatVersion[2] != 0 || + fRawData->fFormatVersion[3] != 0) { status = U_INVALID_FORMAT_ERROR; return FALSE; } @@ -486,7 +535,7 @@ spoofDataIsAcceptable(void *context, pInfo->dataFormat[1] == 0x66 && pInfo->dataFormat[2] == 0x75 && pInfo->dataFormat[3] == 0x20 && - pInfo->formatVersion[0] == 1 + pInfo->formatVersion[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION ) { UVersionInfo *version = static_cast(context); if(version != NULL) { @@ -498,31 +547,62 @@ spoofDataIsAcceptable(void *context, } } +// Methods for the loading of the default confusables data file. The confusable +// data is loaded only when it is needed. +// +// SpoofData::getDefault() - Return the default confusables data, and call the +// initOnce() if it is not available. Adds a reference +// to the SpoofData that the caller is responsible for +// decrementing when they are done with the data. // -// SpoofData::getDefault() - return a wrapper around the spoof data that is -// baked into the default ICU data. +// uspoof_loadDefaultData - Called once, from initOnce(). The resulting SpoofData +// is shared by all spoof checkers using the default data. // -SpoofData *SpoofData::getDefault(UErrorCode &status) { - // TODO: Cache it. Lazy create, keep until cleanup. +// uspoof_cleanupDefaultData - Called during cleanup. +// + +static UInitOnce gSpoofInitDefaultOnce = U_INITONCE_INITIALIZER; +static SpoofData* gDefaultSpoofData; - UDataMemory *udm = udata_openChoice(NULL, "cfu", "confusables", +static UBool U_CALLCONV +uspoof_cleanupDefaultData(void) { + if (gDefaultSpoofData) { + // Will delete, assuming all user-level spoof checkers were closed. + gDefaultSpoofData->removeReference(); + gDefaultSpoofData = nullptr; + gSpoofInitDefaultOnce.reset(); + } + return TRUE; +} + +static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) { + UDataMemory *udm = udata_openChoice(nullptr, "cfu", "confusables", spoofDataIsAcceptable, - NULL, // context, would receive dataVersion if supplied. + nullptr, // context, would receive dataVersion if supplied. &status); + if (U_FAILURE(status)) { return; } + gDefaultSpoofData = new SpoofData(udm, status); if (U_FAILURE(status)) { - return NULL; - } - SpoofData *This = new SpoofData(udm, status); - if (U_FAILURE(status)) { - delete This; - return NULL; + delete gDefaultSpoofData; + gDefaultSpoofData = nullptr; + return; } - if (This == NULL) { + if (gDefaultSpoofData == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; + return; } - return This; + ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA, uspoof_cleanupDefaultData); } +SpoofData* SpoofData::getDefault(UErrorCode& status) { + umtx_initOnce(gSpoofInitDefaultOnce, &uspoof_loadDefaultData, status); + if (U_FAILURE(status)) { return NULL; } + gDefaultSpoofData->addReference(); + return gDefaultSpoofData; +} + + + SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status) { reset(); @@ -533,7 +613,7 @@ SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status) // fRawData is non-const because it may be constructed by the data builder. fRawData = reinterpret_cast( const_cast(udata_getMemory(udm))); - validateDataVersion(fRawData, status); + validateDataVersion(status); initPtrs(status); } @@ -548,13 +628,17 @@ SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status) status = U_INVALID_FORMAT_ERROR; return; } + if (data == NULL) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } void *ncData = const_cast(data); fRawData = static_cast(ncData); if (length < fRawData->fLength) { status = U_INVALID_FORMAT_ERROR; return; } - validateDataVersion(fRawData, status); + validateDataVersion(status); initPtrs(status); } @@ -567,7 +651,6 @@ SpoofData::SpoofData(UErrorCode &status) { return; } fDataOwned = true; - fRefCount = 1; // The spoof header should already be sized to be a multiple of 16 bytes. // Just in case it's not, round it up. @@ -583,7 +666,7 @@ SpoofData::SpoofData(UErrorCode &status) { uprv_memset(fRawData, 0, initialSize); fRawData->fMagic = USPOOF_MAGIC; - fRawData->fFormatVersion[0] = 1; + fRawData->fFormatVersion[0] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION; fRawData->fFormatVersion[1] = 0; fRawData->fFormatVersion[2] = 0; fRawData->fFormatVersion[3] = 0; @@ -601,11 +684,7 @@ void SpoofData::reset() { fRefCount = 1; fCFUKeys = NULL; fCFUValues = NULL; - fCFUStringLengths = NULL; fCFUStrings = NULL; - fAnyCaseTrie = NULL; - fLowerCaseTrie = NULL; - fScriptSets = NULL; } @@ -627,7 +706,6 @@ void SpoofData::reset() { void SpoofData::initPtrs(UErrorCode &status) { fCFUKeys = NULL; fCFUValues = NULL; - fCFUStringLengths = NULL; fCFUStrings = NULL; if (U_FAILURE(status)) { return; @@ -638,33 +716,13 @@ void SpoofData::initPtrs(UErrorCode &status) { if (fRawData->fCFUStringIndex != 0) { fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex); } - if (fRawData->fCFUStringLengths != 0) { - fCFUStringLengths = (SpoofStringLengthsElement *)((char *)fRawData + fRawData->fCFUStringLengths); - } if (fRawData->fCFUStringTable != 0) { fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable); } - - if (fAnyCaseTrie == NULL && fRawData->fAnyCaseTrie != 0) { - fAnyCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, - (char *)fRawData + fRawData->fAnyCaseTrie, fRawData->fAnyCaseTrieLength, NULL, &status); - } - if (fLowerCaseTrie == NULL && fRawData->fLowerCaseTrie != 0) { - fLowerCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, - (char *)fRawData + fRawData->fLowerCaseTrie, fRawData->fLowerCaseTrieLength, NULL, &status); - } - - if (fRawData->fScriptSets != 0) { - fScriptSets = (ScriptSet *)((char *)fRawData + fRawData->fScriptSets); - } } SpoofData::~SpoofData() { - utrie2_close(fAnyCaseTrie); - fAnyCaseTrie = NULL; - utrie2_close(fLowerCaseTrie); - fLowerCaseTrie = NULL; if (fDataOwned) { uprv_free(fRawData); } @@ -709,6 +767,78 @@ void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) { return (char *)fRawData + returnOffset; } +int32_t SpoofData::serialize(void *buf, int32_t capacity, UErrorCode &status) const { + int32_t dataSize = fRawData->fLength; + if (capacity < dataSize) { + status = U_BUFFER_OVERFLOW_ERROR; + return dataSize; + } + uprv_memcpy(buf, fRawData, dataSize); + return dataSize; +} + +int32_t SpoofData::size() const { + return fRawData->fLength; +} + +//------------------------------- +// +// Front-end APIs for SpoofData +// +//------------------------------- + +int32_t SpoofData::confusableLookup(UChar32 inChar, UnicodeString &dest) const { + // Perform a binary search. + // [lo, hi), i.e lo is inclusive, hi is exclusive. + // The result after the loop will be in lo. + int32_t lo = 0; + int32_t hi = length(); + do { + int32_t mid = (lo + hi) / 2; + if (codePointAt(mid) > inChar) { + hi = mid; + } else if (codePointAt(mid) < inChar) { + lo = mid; + } else { + // Found result. Break early. + lo = mid; + break; + } + } while (hi - lo > 1); + + // Did we find an entry? If not, the char maps to itself. + if (codePointAt(lo) != inChar) { + dest.append(inChar); + return 1; + } + + // Add the element to the string builder and return. + return appendValueTo(lo, dest); +} + +int32_t SpoofData::length() const { + return fRawData->fCFUKeysSize; +} + +UChar32 SpoofData::codePointAt(int32_t index) const { + return ConfusableDataUtils::keyToCodePoint(fCFUKeys[index]); +} + +int32_t SpoofData::appendValueTo(int32_t index, UnicodeString& dest) const { + int32_t stringLength = ConfusableDataUtils::keyToLength(fCFUKeys[index]); + + // Value is either a char (for strings of length 1) or + // an index into the string table (for longer strings) + uint16_t value = fCFUValues[index]; + if (stringLength == 1) { + dest.append((UChar)value); + } else { + dest.append(fCFUStrings + value, stringLength); + } + + return stringLength; +} + U_NAMESPACE_END @@ -740,7 +870,10 @@ uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *ou pInfo->dataFormat[1]==0x66 && pInfo->dataFormat[2]==0x75 && pInfo->dataFormat[3]==0x20 && - pInfo->formatVersion[0]==1 )) { + pInfo->formatVersion[0]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION && + pInfo->formatVersion[1]==0 && + pInfo->formatVersion[2]==0 && + pInfo->formatVersion[3]==0 )) { udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x " "(format version %02x %02x %02x %02x) is not recognized\n", pInfo->dataFormat[0], pInfo->dataFormat[1], @@ -829,26 +962,6 @@ uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *ou sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2; ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); - // String Lengths Section - sectionStart = ds->readUInt32(spoofDH->fCFUStringLengths); - sectionLength = ds->readUInt32(spoofDH->fCFUStringLengthsSize) * 4; - ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); - - // Any Case Trie - sectionStart = ds->readUInt32(spoofDH->fAnyCaseTrie); - sectionLength = ds->readUInt32(spoofDH->fAnyCaseTrieLength); - utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); - - // Lower Case Trie - sectionStart = ds->readUInt32(spoofDH->fLowerCaseTrie); - sectionLength = ds->readUInt32(spoofDH->fLowerCaseTrieLength); - utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); - - // Script Sets. The data is an array of int32_t - sectionStart = ds->readUInt32(spoofDH->fScriptSets); - sectionLength = ds->readUInt32(spoofDH->fScriptSetsLength) * sizeof(ScriptSet); - ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); - // And, last, swap the header itself. // int32_t fMagic // swap this // uint8_t fFormatVersion[4] // Do not swap this, just copy