1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 2008-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
10 #include "unicode/utypes.h"
11 #include "unicode/uspoof.h"
12 #include "unicode/uchar.h"
13 #include "unicode/uniset.h"
14 #include "unicode/utf16.h"
18 #include "scriptset.h"
23 #include "uspoof_impl.h"
25 #if !UCONFIG_NO_NORMALIZATION
// Expands ICU's UOBJECT_DEFINE_RTTI_IMPLEMENTATION macro for the SpoofImpl class.
30 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl
)
// Constructor taking a caller-supplied SpoofData pointer.
//   data   - the spoof data this checker is to be built around.
//   status - ICU error code, passed by reference.
32 SpoofImpl::SpoofImpl(SpoofData
*data
, UErrorCode
& status
) {
// Constructor that uses the default spoof data.
//   status - ICU error code, passed by reference; it is handed on to
//            SpoofData::getDefault(), whose result is stored in fSpoofData.
37 SpoofImpl::SpoofImpl(UErrorCode
& status
) {
40 // TODO: Call this method where it is actually needed, instead of in the
41 // constructor, to allow for lazy data loading. See #12696.
42 fSpoofData
= SpoofData::getDefault(status
);
// Default constructor. It has no status parameter, so a local UErrorCode
// (initialised to U_ZERO_ERROR) is used for the SpoofData::getDefault() call.
45 SpoofImpl::SpoofImpl() {
46 UErrorCode status
= U_ZERO_ERROR
;
49 // TODO: Call this method where it is actually needed, instead of in the
50 // constructor, to allow for lazy data loading. See #12696.
51 fSpoofData
= SpoofData::getDefault(status
);
// Puts the checker's members into their initial state:
//   - fChecks           = USPOOF_ALL_CHECKS
//   - fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE
//   - fAllowedCharsSet  = a new UnicodeSet covering 0..0x10ffff
//   - fAllowedLocales   = a heap copy of the empty string
//   status - ICU error code; set to U_MEMORY_ALLOCATION_ERROR if either
//            allocation yields NULL.
54 void SpoofImpl::construct(UErrorCode
& status
) {
55 fChecks
= USPOOF_ALL_CHECKS
;
// The two pointer members are nulled before the status check below, so they
// hold a defined value even when the function returns early on failure.
57 fAllowedCharsSet
= NULL
;
58 fAllowedLocales
= NULL
;
59 fRestrictionLevel
= USPOOF_HIGHLY_RESTRICTIVE
;
61 if (U_FAILURE(status
)) { return; }
// A local pointer to the new set is kept alongside the member; freeze() is
// later called through this local pointer.
63 UnicodeSet
*allowedCharsSet
= new UnicodeSet(0, 0x10ffff);
64 fAllowedCharsSet
= allowedCharsSet
;
65 fAllowedLocales
= uprv_strdup("");
66 if (fAllowedCharsSet
== NULL
|| fAllowedLocales
== NULL
) {
67 status
= U_MEMORY_ALLOCATION_ERROR
;
70 allowedCharsSet
->freeze();
74 // Copy Constructor, used by the user level clone() function.
// Builds a new checker from the state of `src`.
//   src    - the checker whose settings are copied.
//   status - ICU error code; set to U_MEMORY_ALLOCATION_ERROR when cloning the
//            allowed-character set or duplicating the locales string yields NULL.
// The member initialisers give every pointer member a NULL value before the
// body runs.
75 SpoofImpl::SpoofImpl(const SpoofImpl
&src
, UErrorCode
&status
) :
76 fChecks(USPOOF_ALL_CHECKS
), fSpoofData(NULL
), fAllowedCharsSet(NULL
) ,
77 fAllowedLocales(NULL
) {
78 if (U_FAILURE(status
)) {
81 fChecks
= src
.fChecks
;
// The SpoofData is not copied: addReference() is called on src's object and
// the pointer it returns is stored.
82 if (src
.fSpoofData
!= NULL
) {
83 fSpoofData
= src
.fSpoofData
->addReference();
// The allowed-character set and the locales string are duplicated, so this
// object holds its own copies of both.
85 fAllowedCharsSet
= static_cast<const UnicodeSet
*>(src
.fAllowedCharsSet
->clone());
86 fAllowedLocales
= uprv_strdup(src
.fAllowedLocales
);
87 if (fAllowedCharsSet
== NULL
|| fAllowedLocales
== NULL
) {
88 status
= U_MEMORY_ALLOCATION_ERROR
;
90 fRestrictionLevel
= src
.fRestrictionLevel
;
// Destructor: drops this checker's reference on the SpoofData (if any),
// deletes the allowed-character set and frees the allowed-locales string.
93 SpoofImpl::~SpoofImpl() {
94 if (fSpoofData
!= NULL
) {
95 fSpoofData
->removeReference(); // Will delete if refCount goes to zero.
97 delete fAllowedCharsSet
;
// The cast to void* is needed because fAllowedLocales is a pointer to const.
98 uprv_free((void *)fAllowedLocales
);
101 // Cast this instance as a USpoofChecker for the C API.
102 USpoofChecker
*SpoofImpl::asUSpoofChecker() {
107 // Incoming parameter check on Status and the SpoofChecker object
108 // received from the C API.
// Const overload.
//   sc     - the C-API handle to check.
//   status - ICU error code.
// The handle is first passed to validate(). If that leaves status in a
// success state and the resulting object has SpoofData attached, the data's
// format version is also checked through validateDataVersion().
110 const SpoofImpl
*SpoofImpl::validateThis(const USpoofChecker
*sc
, UErrorCode
&status
) {
111 auto* This
= validate(sc
, status
);
112 if (U_FAILURE(status
)) {
115 if (This
->fSpoofData
!= NULL
&& !This
->fSpoofData
->validateDataVersion(status
)) {
121 SpoofImpl
*SpoofImpl::validateThis(USpoofChecker
*sc
, UErrorCode
&status
) {
122 return const_cast<SpoofImpl
*>
123 (SpoofImpl::validateThis(const_cast<const USpoofChecker
*>(sc
), status
));
// Replaces the checker's allowed-locales string and the matching
// allowed-character set.
//   localesList - comma separated list of locales; spaces around each entry
//                 are trimmed.
//   status      - ICU error code; set to U_MEMORY_ALLOCATION_ERROR when a
//                 copy of the set or of the string cannot be allocated.
// NOTE(review): localesList is passed straight to uprv_strlen(), so it is
// presumably required to be non-NULL - confirm against callers.
127 void SpoofImpl::setAllowedLocales(const char *localesList
, UErrorCode
&status
) {
128 UnicodeSet allowedChars
;
129 UnicodeSet
*tmpSet
= NULL
;
130 const char *locStart
= localesList
;
131 const char *locEnd
= NULL
;
132 const char *localesListEnd
= localesList
+ uprv_strlen(localesList
);
133 int32_t localeListCount
= 0; // Number of locales provided by caller.
135 // Loop runs once per locale from the localesList, a comma separated list of locales.
// locEnd marks the next comma, or the end of the list when no comma remains.
137 locEnd
= uprv_strchr(locStart
, ',');
138 if (locEnd
== NULL
) {
139 locEnd
= localesListEnd
;
// Leading spaces are examined from locStart; trailing spaces from trimmedEnd.
141 while (*locStart
== ' ') {
144 const char *trimmedEnd
= locEnd
-1;
145 while (trimmedEnd
> locStart
&& *trimmedEnd
== ' ') {
// NOTE(review): for a one-character entry trimmedEnd == locStart, so this
// condition is also true for it - confirm that is intended.
148 if (trimmedEnd
<= locStart
) {
// A NUL-terminated copy of the trimmed entry is made for addScriptChars()
// and freed immediately afterwards.
151 const char *locale
= uprv_strndup(locStart
, (int32_t)(trimmedEnd
+ 1 - locStart
));
154 // We have one locale from the locales list.
155 // Add the script chars for this locale to the accumulating set of allowed chars.
156 // If the locale is no good, we will be notified back via status.
157 addScriptChars(locale
, &allowedChars
, status
);
158 uprv_free((void *)locale
);
159 if (U_FAILURE(status
)) {
162 locStart
= locEnd
+ 1;
163 } while (locStart
< localesListEnd
);
165 // If our caller provided an empty list of locales, we disable the allowed characters checking
// In that case the locales string becomes "", the set becomes 0..0x10ffff and
// the USPOOF_CHAR_LIMIT bit is cleared from fChecks.
166 if (localeListCount
== 0) {
167 uprv_free((void *)fAllowedLocales
);
168 fAllowedLocales
= uprv_strdup("");
169 tmpSet
= new UnicodeSet(0, 0x10ffff);
170 if (fAllowedLocales
== NULL
|| tmpSet
== NULL
) {
171 status
= U_MEMORY_ALLOCATION_ERROR
;
175 delete fAllowedCharsSet
;
176 fAllowedCharsSet
= tmpSet
;
177 fChecks
&= ~USPOOF_CHAR_LIMIT
;
182 // Add all common and inherited characters to the set of allowed chars.
184 tempSet
.applyIntPropertyValue(UCHAR_SCRIPT
, USCRIPT_COMMON
, status
);
185 allowedChars
.addAll(tempSet
);
186 tempSet
.applyIntPropertyValue(UCHAR_SCRIPT
, USCRIPT_INHERITED
, status
);
187 allowedChars
.addAll(tempSet
);
189 // If anything went wrong, we bail out without changing
190 // the state of the spoof checker.
191 if (U_FAILURE(status
)) {
195 // Store the updated spoof checker state.
// Both replacement objects are allocated before either member is released.
196 tmpSet
= static_cast<UnicodeSet
*>(allowedChars
.clone());
197 const char *tmpLocalesList
= uprv_strdup(localesList
);
198 if (tmpSet
== NULL
|| tmpLocalesList
== NULL
) {
199 status
= U_MEMORY_ALLOCATION_ERROR
;
202 uprv_free((void *)fAllowedLocales
);
203 fAllowedLocales
= tmpLocalesList
;
205 delete fAllowedCharsSet
;
206 fAllowedCharsSet
= tmpSet
;
// A non-empty locale list switches the USPOOF_CHAR_LIMIT check on.
207 fChecks
|= USPOOF_CHAR_LIMIT
;
211 const char * SpoofImpl::getAllowedLocales(UErrorCode
&/*status*/) {
212 return fAllowedLocales
;
216 // Given a locale (a language), add all the characters from all of the scripts used with that language
217 // to the allowedChars UnicodeSet
//   locale       - locale name whose scripts are looked up with uscript_getCode().
//   allowedChars - set that receives the characters of every script found.
//   status       - ICU error code; a U_USING_DEFAULT_WARNING result from the
//                  lookup is turned into U_ILLEGAL_ARGUMENT_ERROR.
219 void SpoofImpl::addScriptChars(const char *locale
, UnicodeSet
*allowedChars
, UErrorCode
&status
) {
// At most 30 script codes can be returned for one locale by this buffer.
220 UScriptCode scripts
[30];
222 int32_t numScripts
= uscript_getCode(locale
, scripts
, UPRV_LENGTHOF(scripts
), &status
);
223 if (U_FAILURE(status
)) {
226 if (status
== U_USING_DEFAULT_WARNING
) {
227 status
= U_ILLEGAL_ARGUMENT_ERROR
;
// For each script, tmpSet is set to the characters whose UCHAR_SCRIPT
// property equals that script, and those are added to allowedChars.
232 for (i
=0; i
<numScripts
; i
++) {
233 tmpSet
.applyIntPropertyValue(UCHAR_SCRIPT
, scripts
[i
], status
);
234 allowedChars
->addAll(tmpSet
);
238 // Computes the augmented script set for a code point, according to UTS 39 section 5.1.
//   codePoint - the code point to examine.
//   result    - receives the script extensions of codePoint plus the
//               augmentations added below.
//   status    - ICU error code.
239 void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint
, ScriptSet
& result
, UErrorCode
& status
) {
241 result
.setScriptExtensions(codePoint
, status
);
242 if (U_FAILURE(status
)) { return; }
244 // Section 5.1 step 1
// Han adds Han-with-Bopomofo, Japanese and Korean.
245 if (result
.test(USCRIPT_HAN
, status
)) {
246 result
.set(USCRIPT_HAN_WITH_BOPOMOFO
, status
);
247 result
.set(USCRIPT_JAPANESE
, status
);
248 result
.set(USCRIPT_KOREAN
, status
);
// Hiragana and Katakana each add Japanese.
250 if (result
.test(USCRIPT_HIRAGANA
, status
)) {
251 result
.set(USCRIPT_JAPANESE
, status
);
253 if (result
.test(USCRIPT_KATAKANA
, status
)) {
254 result
.set(USCRIPT_JAPANESE
, status
);
// Hangul adds Korean.
256 if (result
.test(USCRIPT_HANGUL
, status
)) {
257 result
.set(USCRIPT_KOREAN
, status
);
// Bopomofo adds Han-with-Bopomofo.
259 if (result
.test(USCRIPT_BOPOMOFO
, status
)) {
260 result
.set(USCRIPT_HAN_WITH_BOPOMOFO
, status
);
263 // Section 5.1 step 2
// This step applies to code points whose set contains Common or Inherited.
264 if (result
.test(USCRIPT_COMMON
, status
) || result
.test(USCRIPT_INHERITED
, status
)) {
269 // Computes the resolved script set for a string, according to UTS 39 section 5.1.
270 void SpoofImpl::getResolvedScriptSet(const UnicodeString
& input
, ScriptSet
& result
, UErrorCode
& status
) const {
271 getResolvedScriptSetWithout(input
, USCRIPT_CODE_LIMIT
, result
, status
);
274 // Computes the resolved script set for a string, omitting characters having the specified script.
275 // If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included.
//   input  - the string to analyse.
//   script - characters whose augmented script set contains this script are
//            left out of the intersection; USCRIPT_CODE_LIMIT leaves nothing out.
//   result - the set that each included character's script set is intersected into.
//   status - ICU error code; the function returns as soon as it reports a failure.
276 void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString
& input
, UScriptCode script
, ScriptSet
& result
, UErrorCode
& status
) const {
// The index advances by U16_LENGTH(codePoint), so the loop steps over whole
// code points rather than individual UTF-16 code units.
281 for (int32_t i
= 0; i
< input
.length(); i
+= U16_LENGTH(codePoint
)) {
282 codePoint
= input
.char32At(i
);
284 // Compute the augmented script set for the character
285 getAugmentedScriptSet(codePoint
, temp
, status
);
286 if (U_FAILURE(status
)) { return; }
288 // Intersect the augmented script set with the resolved script set, but only if the character doesn't
289 // have the script specified in the function call
290 if (script
== USCRIPT_CODE_LIMIT
|| !temp
.test(script
, status
)) {
291 result
.intersect(temp
);
296 // Computes the set of numerics for a string, according to UTS 39 section 5.3.
//   input  - the string to scan.
//   result - receives one representative (the zero digit) for every kind of
//            decimal digit found in input.
// The status argument is unnamed and not used.
297 void SpoofImpl::getNumerics(const UnicodeString
& input
, UnicodeSet
& result
, UErrorCode
& /*status*/) const {
// Steps through input one code point at a time.
301 for (int32_t i
= 0; i
< input
.length(); i
+= U16_LENGTH(codePoint
)) {
302 codePoint
= input
.char32At(i
);
304 // Store a representative character for each kind of decimal digit
305 if (u_charType(codePoint
) == U_DECIMAL_DIGIT_NUMBER
) {
306 // Store the zero character as a representative for comparison.
307 // Unicode guarantees it is codePoint - value
308 result
.add(codePoint
- (UChar32
)u_getNumericValue(codePoint
));
313 // Computes the restriction level of a string, according to UTS 39 section 5.2.
//   input  - the string to classify.
//   status - ICU error code; when a script-set computation fails the function
//            returns USPOOF_UNRESTRICTIVE.
// Returns one of the URestrictionLevel values chosen by the numbered steps below.
314 URestrictionLevel
SpoofImpl::getRestrictionLevel(const UnicodeString
& input
, UErrorCode
& status
) const {
315 // Section 5.2 step 1:
// Any character outside the allowed set makes the string unrestrictive.
316 if (!fAllowedCharsSet
->containsAll(input
)) {
317 return USPOOF_UNRESTRICTIVE
;
320 // Section 5.2 step 2
321 // Java use a static UnicodeSet for this test. In C++, avoid the static variable
322 // and just do a simple for loop.
323 UBool allASCII
= TRUE
;
// Scans UTF-16 code units; a unit above 0x7f is outside ASCII.
324 for (int32_t i
=0, length
=input
.length(); i
<length
; i
++) {
325 if (input
.charAt(i
) > 0x7f) {
334 // Section 5.2 steps 3:
335 ScriptSet resolvedScriptSet
;
336 getResolvedScriptSet(input
, resolvedScriptSet
, status
);
337 if (U_FAILURE(status
)) { return USPOOF_UNRESTRICTIVE
; }
339 // Section 5.2 step 4:
// A non-empty resolved set means a single script covers every character.
340 if (!resolvedScriptSet
.isEmpty()) {
341 return USPOOF_SINGLE_SCRIPT_RESTRICTIVE
;
344 // Section 5.2 step 5:
// Same computation again, this time leaving out characters that have Latin.
345 ScriptSet resolvedNoLatn
;
346 getResolvedScriptSetWithout(input
, USCRIPT_LATIN
, resolvedNoLatn
, status
);
347 if (U_FAILURE(status
)) { return USPOOF_UNRESTRICTIVE
; }
349 // Section 5.2 step 6:
350 if (resolvedNoLatn
.test(USCRIPT_HAN_WITH_BOPOMOFO
, status
)
351 || resolvedNoLatn
.test(USCRIPT_JAPANESE
, status
)
352 || resolvedNoLatn
.test(USCRIPT_KOREAN
, status
)) {
353 return USPOOF_HIGHLY_RESTRICTIVE
;
356 // Section 5.2 step 7:
// Cyrillic, Greek and Cherokee are excluded from the moderately restrictive level.
357 if (!resolvedNoLatn
.isEmpty()
358 && !resolvedNoLatn
.test(USCRIPT_CYRILLIC
, status
)
359 && !resolvedNoLatn
.test(USCRIPT_GREEK
, status
)
360 && !resolvedNoLatn
.test(USCRIPT_CHEROKEE
, status
)) {
361 return USPOOF_MODERATELY_RESTRICTIVE
;
364 // Section 5.2 step 8:
365 return USPOOF_MINIMALLY_RESTRICTIVE
;
// Scans input for U+0307 occurring while sawLeadCharacter is set.
//   input - the string to scan, one code point at a time.
// The UErrorCode argument is unnamed and not used.
368 int32_t SpoofImpl::findHiddenOverlay(const UnicodeString
& input
, UErrorCode
&) const {
369 bool sawLeadCharacter
= false;
370 for (int32_t i
=0; i
<input
.length();) {
371 UChar32 cp
= input
.char32At(i
);
372 if (sawLeadCharacter
&& cp
== 0x0307) {
375 uint8_t combiningClass
= u_getCombiningClass(cp
);
376 // Skip over characters except for those with combining class 0 (non-combining characters) or with
377 // combining class 230 (same class as U+0307)
378 U_ASSERT(u_getCombiningClass(0x0307) == 230);
// Only characters of class 0 or 230 update the lead-character flag.
379 if (combiningClass
== 0 || combiningClass
== 230) {
380 sawLeadCharacter
= isIllegalCombiningDotLeadCharacter(cp
);
387 static inline bool isIllegalCombiningDotLeadCharacterNoLookup(UChar32 cp
) {
388 return cp
== u
'i' || cp
== u
'j' || cp
== u
'ı' || cp
== u
'ȷ' || cp
== u
'l' ||
389 u_hasBinaryProperty(cp
, UCHAR_SOFT_DOTTED
);
// Applies the no-lookup test to cp itself and, via the confusable data, to the
// last code point of cp's confusable mapping.
//   cp - the code point to test.
392 bool SpoofImpl::isIllegalCombiningDotLeadCharacter(UChar32 cp
) const {
393 if (isIllegalCombiningDotLeadCharacterNoLookup(cp
)) {
396 UnicodeString skelStr
;
397 fSpoofData
->confusableLookup(cp
, skelStr
);
// moveIndex32(length, -1) steps back one whole code point from the end, so
// finalCp is the last code point of the looked-up string.
398 UChar32 finalCp
= skelStr
.char32At(skelStr
.moveIndex32(skelStr
.length(), -1));
// The second test is skipped when the lookup gave back cp itself.
399 if (finalCp
!= cp
&& isIllegalCombiningDotLeadCharacterNoLookup(finalCp
)) {
407 // Convert a text format hex number. Utility function used by builder code. Static.
408 // Input: UChar *string text. Output: a UChar32
409 // Input has been pre-checked, and will have no non-hex chars.
410 // The number must fall in the code point range of 0..0x10ffff
//   s      - the UChar text holding the hex digits.
//   start  - index of the first digit.
//   limit  - index one past the last digit; limit - start must be positive.
//   status - ICU error code; set to U_PARSE_ERROR when the value exceeds 0x10ffff.
412 UChar32
SpoofImpl::ScanHex(const UChar
*s
, int32_t start
, int32_t limit
, UErrorCode
&status
) {
413 if (U_FAILURE(status
)) {
416 U_ASSERT(limit
-start
> 0);
419 for (i
=start
; i
<limit
; i
++) {
// 0x30 is '0', so this is the value of a decimal digit character.
420 int digitVal
= s
[i
] - 0x30;
422 digitVal
= 0xa + (s
[i
] - 0x41); // Upper Case 'A'
425 digitVal
= 0xa + (s
[i
] - 0x61); // Lower Case 'a'
427 U_ASSERT(digitVal
<= 0xf);
431 if (val
> 0x10ffff) {
432 status
= U_PARSE_ERROR
;
439 //-----------------------------------------
441 // class CheckResult Implementation
443 //-----------------------------------------
445 CheckResult::CheckResult() {
449 USpoofCheckResult
* CheckResult::asUSpoofCheckResult() {
454 // Incoming parameter check on Status and the CheckResult object
455 // received from the C API.
457 const CheckResult
* CheckResult::validateThis(const USpoofCheckResult
*ptr
, UErrorCode
&status
) {
458 return validate(ptr
, status
);
461 CheckResult
* CheckResult::validateThis(USpoofCheckResult
*ptr
, UErrorCode
&status
) {
462 return validate(ptr
, status
);
// Resets this result object; the restriction level goes back to
// USPOOF_UNDEFINED_RESTRICTIVE.
465 void CheckResult::clear() {
468 fRestrictionLevel
= USPOOF_UNDEFINED_RESTRICTIVE
;
//   enabledChecks - bit mask of the checks that were enabled.
// When USPOOF_AUX_INFO is among the enabled checks and a restriction level
// has been recorded, the level is OR-ed into the returned check bits.
471 int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks
) {
472 if ((enabledChecks
& USPOOF_AUX_INFO
) != 0 && fRestrictionLevel
!= USPOOF_UNDEFINED_RESTRICTIVE
) {
473 return fChecks
| fRestrictionLevel
;
479 CheckResult::~CheckResult() {
482 //----------------------------------------------------------------------------------------------
484 // class SpoofData Implementation
486 //----------------------------------------------------------------------------------------------
// Checks the raw data header: the magic number must be USPOOF_MAGIC and the
// four format-version bytes must be USPOOF_CONFUSABLE_DATA_FORMAT_VERSION, 0, 0, 0.
//   status - ICU error code; set to U_INVALID_FORMAT_ERROR when the condition
//            below holds (which includes status already being a failure).
489 UBool
SpoofData::validateDataVersion(UErrorCode
&status
) const {
490 if (U_FAILURE(status
) ||
492 fRawData
->fMagic
!= USPOOF_MAGIC
||
493 fRawData
->fFormatVersion
[0] != USPOOF_CONFUSABLE_DATA_FORMAT_VERSION
||
494 fRawData
->fFormatVersion
[1] != 0 ||
495 fRawData
->fFormatVersion
[2] != 0 ||
496 fRawData
->fFormatVersion
[3] != 0) {
497 status
= U_INVALID_FORMAT_ERROR
;
// Acceptance callback handed to udata_openChoice() for the confusables data.
//   context - when non-NULL it is treated as a UVersionInfo* that receives the
//             4-byte data version from pInfo.
//   pInfo   - header information of the candidate data item.
// The type and name arguments are unnamed and not used.
503 static UBool U_CALLCONV
504 spoofDataIsAcceptable(void *context
,
505 const char * /* type */, const char * /*name*/,
506 const UDataInfo
*pInfo
) {
// Endianness and charset family must match the running platform.
509 pInfo
->isBigEndian
== U_IS_BIG_ENDIAN
&&
510 pInfo
->charsetFamily
== U_CHARSET_FAMILY
&&
511 pInfo
->dataFormat
[0] == 0x43 && // dataFormat="Cfu "
512 pInfo
->dataFormat
[1] == 0x66 &&
513 pInfo
->dataFormat
[2] == 0x75 &&
514 pInfo
->dataFormat
[3] == 0x20 &&
515 pInfo
->formatVersion
[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION
517 UVersionInfo
*version
= static_cast<UVersionInfo
*>(context
);
518 if(version
!= NULL
) {
519 uprv_memcpy(version
, pInfo
->dataVersion
, 4);
527 // Methods for the loading of the default confusables data file. The confusable
528 // data is loaded only when it is needed.
530 // SpoofData::getDefault() - Return the default confusables data, and call the
531 // initOnce() if it is not available. Adds a reference
532 // to the SpoofData that the caller is responsible for
533 // decrementing when they are done with the data.
535 // uspoof_loadDefaultData - Called once, from initOnce(). The resulting SpoofData
536 // is shared by all spoof checkers using the default data.
538 // uspoof_cleanupDefaultData - Called during cleanup.
// File-local state for the default confusables data: the init-once control
// used by SpoofData::getDefault(), and the pointer to the loaded SpoofData.
541 static UInitOnce gSpoofInitDefaultOnce
= U_INITONCE_INITIALIZER
;
542 static SpoofData
* gDefaultSpoofData
;
// Cleanup callback registered through ucln_i18n_registerCleanup(). It drops
// the reference held on the default SpoofData, clears the global pointer and
// resets the init-once control.
544 static UBool U_CALLCONV
545 uspoof_cleanupDefaultData(void) {
546 if (gDefaultSpoofData
) {
547 // Will delete, assuming all user-level spoof checkers were closed.
548 gDefaultSpoofData
->removeReference();
549 gDefaultSpoofData
= nullptr;
550 gSpoofInitDefaultOnce
.reset();
// Init-once function: opens the "confusables" data item of type "cfu", wraps
// it in a new SpoofData stored in gDefaultSpoofData, and registers the
// cleanup callback.
//   status - ICU error code; set to U_MEMORY_ALLOCATION_ERROR when the global
//            pointer is null after construction.
555 static void U_CALLCONV
uspoof_loadDefaultData(UErrorCode
& status
) {
556 UDataMemory
*udm
= udata_openChoice(nullptr, "cfu", "confusables",
557 spoofDataIsAcceptable
,
558 nullptr, // context, would receive dataVersion if supplied.
560 if (U_FAILURE(status
)) { return; }
561 gDefaultSpoofData
= new SpoofData(udm
, status
);
// If the constructor reported a failure, the new object is discarded.
562 if (U_FAILURE(status
)) {
563 delete gDefaultSpoofData
;
564 gDefaultSpoofData
= nullptr;
567 if (gDefaultSpoofData
== nullptr) {
568 status
= U_MEMORY_ALLOCATION_ERROR
;
571 ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA
, uspoof_cleanupDefaultData
);
574 SpoofData
* SpoofData::getDefault(UErrorCode
& status
) {
575 umtx_initOnce(gSpoofInitDefaultOnce
, &uspoof_loadDefaultData
, status
);
576 if (U_FAILURE(status
)) { return NULL
; }
577 gDefaultSpoofData
->addReference();
578 return gDefaultSpoofData
;
// Constructor for data opened through the udata API.
//   udm    - the opened data item; its memory becomes fRawData.
//   status - ICU error code; validateDataVersion() may set it.
583 SpoofData::SpoofData(UDataMemory
*udm
, UErrorCode
&status
)
586 if (U_FAILURE(status
)) {
590 // fRawData is non-const because it may be constructed by the data builder.
591 fRawData
= reinterpret_cast<SpoofDataHeader
*>(
592 const_cast<void *>(udata_getMemory(udm
)));
593 validateDataVersion(status
);
// Constructor over a caller-supplied memory image.
//   data   - start of the image; it is used in place as fRawData.
//   length - number of bytes available at data.
//   status - ICU error code. U_INVALID_FORMAT_ERROR is set when length is
//            smaller than a SpoofDataHeader or smaller than the length field
//            stored in the header; U_ILLEGAL_ARGUMENT_ERROR is also set on
//            one path of this constructor.
598 SpoofData::SpoofData(const void *data
, int32_t length
, UErrorCode
&status
)
601 if (U_FAILURE(status
)) {
// The header must fit before any of its fields can be read.
604 if ((size_t)length
< sizeof(SpoofDataHeader
)) {
605 status
= U_INVALID_FORMAT_ERROR
;
609 status
= U_ILLEGAL_ARGUMENT_ERROR
;
612 void *ncData
= const_cast<void *>(data
);
613 fRawData
= static_cast<SpoofDataHeader
*>(ncData
);
614 if (length
< fRawData
->fLength
) {
615 status
= U_INVALID_FORMAT_ERROR
;
618 validateDataVersion(status
);
623 // Spoof Data constructor for use from data builder.
624 // Initializes a new, empty data area that will be populated later.
// Allocates a zero-filled header-sized buffer with uprv_malloc() and stamps
// it with the magic number and format version.
//   status - ICU error code; set to U_MEMORY_ALLOCATION_ERROR when the
//            allocation fails.
625 SpoofData::SpoofData(UErrorCode
&status
) {
627 if (U_FAILURE(status
)) {
632 // The spoof header should already be sized to be a multiple of 16 bytes.
633 // Just in case it's not, round it up.
634 uint32_t initialSize
= (sizeof(SpoofDataHeader
) + 15) & ~15;
635 U_ASSERT(initialSize
== sizeof(SpoofDataHeader
));
637 fRawData
= static_cast<SpoofDataHeader
*>(uprv_malloc(initialSize
));
// fMemLimit records the current size of the buffer in bytes.
638 fMemLimit
= initialSize
;
639 if (fRawData
== NULL
) {
640 status
= U_MEMORY_ALLOCATION_ERROR
;
643 uprv_memset(fRawData
, 0, initialSize
);
645 fRawData
->fMagic
= USPOOF_MAGIC
;
646 fRawData
->fFormatVersion
[0] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION
;
647 fRawData
->fFormatVersion
[1] = 0;
648 fRawData
->fFormatVersion
[2] = 0;
649 fRawData
->fFormatVersion
[3] = 0;
653 // reset() - initialize all fields.
654 // Should be updated if any new fields are added.
655 // Called by constructors to put things in a known initial state.
656 void SpoofData::reset() {
668 // SpoofData::initPtrs()
669 // Initialize the pointers to the various sections of the raw data.
671 // This function is used both during the Trie building process (multiple
672 // times, as the individual data sections are added), and
673 // during the opening of a Spoof Checker from prebuilt data.
675 // The pointers for non-existent data sections (identified by an offset of 0)
678 // Note: During building the data, adding each new data section
679 // reallocs the raw data area, which likely relocates it, which
680 // in turn requires reinitializing all of the pointers into it, hence
681 // multiple calls to this function during building.
//   status - ICU error code.
// Each section offset stored in the header is a byte offset from the start of
// fRawData; a section pointer is computed only when its offset is non-zero.
683 void SpoofData::initPtrs(UErrorCode
&status
) {
687 if (U_FAILURE(status
)) {
690 if (fRawData
->fCFUKeys
!= 0) {
691 fCFUKeys
= (int32_t *)((char *)fRawData
+ fRawData
->fCFUKeys
);
// Note the naming: the fCFUStringIndex offset feeds the fCFUValues pointer.
693 if (fRawData
->fCFUStringIndex
!= 0) {
694 fCFUValues
= (uint16_t *)((char *)fRawData
+ fRawData
->fCFUStringIndex
);
696 if (fRawData
->fCFUStringTable
!= 0) {
697 fCFUStrings
= (UChar
*)((char *)fRawData
+ fRawData
->fCFUStringTable
);
702 SpoofData::~SpoofData() {
// Atomically decrements fRefCount and tests whether it has reached zero.
714 void SpoofData::removeReference() {
715 if (umtx_atomic_dec(&fRefCount
) == 0) {
// Atomically increments fRefCount; returns a SpoofData pointer.
721 SpoofData
*SpoofData::addReference() {
722 umtx_atomic_inc(&fRefCount
);
// Grows the raw data buffer and returns a pointer to the newly added,
// zero-filled region.
//   numBytes - bytes requested; rounded up to a multiple of 16.
//   status   - ICU error code.
// The buffer is reallocated, so the returned pointer and fRawData may both
// refer to a new address after the call.
727 void *SpoofData::reserveSpace(int32_t numBytes
, UErrorCode
&status
) {
728 if (U_FAILURE(status
)) {
735 numBytes
= (numBytes
+ 15) & ~15; // Round up to a multiple of 16
// The new region starts where the old buffer ended.
736 uint32_t returnOffset
= fMemLimit
;
737 fMemLimit
+= numBytes
;
// NOTE(review): the uprv_realloc() result is dereferenced on the next
// statement with no NULL check visible here - confirm this is acceptable.
738 fRawData
= static_cast<SpoofDataHeader
*>(uprv_realloc(fRawData
, fMemLimit
));
739 fRawData
->fLength
= fMemLimit
;
740 uprv_memset((char *)fRawData
+ returnOffset
, 0, numBytes
);
742 return (char *)fRawData
+ returnOffset
;
// Copies the raw data image into a caller-supplied buffer.
//   buf      - destination buffer.
//   capacity - size of buf in bytes.
//   status   - ICU error code; set to U_BUFFER_OVERFLOW_ERROR when capacity is
//              smaller than the data size taken from the header's fLength.
745 int32_t SpoofData::serialize(void *buf
, int32_t capacity
, UErrorCode
&status
) const {
746 int32_t dataSize
= fRawData
->fLength
;
747 if (capacity
< dataSize
) {
748 status
= U_BUFFER_OVERFLOW_ERROR
;
751 uprv_memcpy(buf
, fRawData
, dataSize
);
755 int32_t SpoofData::size() const {
756 return fRawData
->fLength
;
759 //-------------------------------
761 // Front-end APIs for SpoofData
763 //-------------------------------
// Looks up inChar in the key table, which the binary search below assumes to
// be ordered by code point.
//   inChar - the code point to look up.
//   dest   - string that the mapped value is appended to.
// Returns the result of appendValueTo() when an entry is found.
765 int32_t SpoofData::confusableLookup(UChar32 inChar
, UnicodeString
&dest
) const {
766 // Perform a binary search.
767 // [lo, hi), i.e lo is inclusive, hi is exclusive.
768 // The result after the loop will be in lo.
770 int32_t hi
= length();
772 int32_t mid
= (lo
+ hi
) / 2;
773 if (codePointAt(mid
) > inChar
) {
775 } else if (codePointAt(mid
) < inChar
) {
778 // Found result. Break early.
// The search stops once the half-open range holds a single candidate.
782 } while (hi
- lo
> 1);
784 // Did we find an entry? If not, the char maps to itself.
785 if (codePointAt(lo
) != inChar
) {
790 // Add the element to the string builder and return.
791 return appendValueTo(lo
, dest
);
794 int32_t SpoofData::length() const {
795 return fRawData
->fCFUKeysSize
;
798 UChar32
SpoofData::codePointAt(int32_t index
) const {
799 return ConfusableDataUtils::keyToCodePoint(fCFUKeys
[index
]);
// Appends the mapped value of table entry `index` to dest.
//   index - position in the key/value tables.
//   dest  - string that receives the value.
// The value length is decoded from the key at the same index.
802 int32_t SpoofData::appendValueTo(int32_t index
, UnicodeString
& dest
) const {
803 int32_t stringLength
= ConfusableDataUtils::keyToLength(fCFUKeys
[index
]);
805 // Value is either a char (for strings of length 1) or
806 // an index into the string table (for longer strings)
807 uint16_t value
= fCFUValues
[index
];
808 if (stringLength
== 1) {
809 dest
.append((UChar
)value
);
811 dest
.append(fCFUStrings
+ value
, stringLength
);
822 //-----------------------------------------------------------------------------
824 // uspoof_swap - byte swap and char encoding swap of spoof data
826 //-----------------------------------------------------------------------------
// Data-swapping entry point for confusables ("Cfu ") data.
//   ds      - the swapper supplying read/write/swap callbacks.
//   inData  - input image, beginning with the generic ICU data header.
//   length  - size of inData in bytes; values below -1 are rejected.
//   outData - output buffer; must be non-NULL when length > 0.
//   status  - ICU error code; must be non-NULL and not already a failure.
827 U_CAPI
int32_t U_EXPORT2
828 uspoof_swap(const UDataSwapper
*ds
, const void *inData
, int32_t length
, void *outData
,
829 UErrorCode
*status
) {
831 if (status
== NULL
|| U_FAILURE(*status
)) {
834 if(ds
==NULL
|| inData
==NULL
|| length
<-1 || (length
>0 && outData
==NULL
)) {
835 *status
=U_ILLEGAL_ARGUMENT_ERROR
;
840 // Check that the data header is for spoof data.
841 // (Header contents are defined in gencfu.cpp)
// The UDataInfo structure is read from byte offset 4 of the input.
843 const UDataInfo
*pInfo
= (const UDataInfo
*)((const char *)inData
+4);
844 if(!( pInfo
->dataFormat
[0]==0x43 && /* dataFormat="Cfu " */
845 pInfo
->dataFormat
[1]==0x66 &&
846 pInfo
->dataFormat
[2]==0x75 &&
847 pInfo
->dataFormat
[3]==0x20 &&
848 pInfo
->formatVersion
[0]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION
&&
849 pInfo
->formatVersion
[1]==0 &&
850 pInfo
->formatVersion
[2]==0 &&
851 pInfo
->formatVersion
[3]==0 )) {
852 udata_printError(ds
, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
853 "(format version %02x %02x %02x %02x) is not recognized\n",
854 pInfo
->dataFormat
[0], pInfo
->dataFormat
[1],
855 pInfo
->dataFormat
[2], pInfo
->dataFormat
[3],
856 pInfo
->formatVersion
[0], pInfo
->formatVersion
[1],
857 pInfo
->formatVersion
[2], pInfo
->formatVersion
[3]);
858 *status
=U_UNSUPPORTED_ERROR
;
863 // Swap the data header. (This is the generic ICU Data Header, not the uspoof Specific
864 // header). This swap also conveniently gets us
865 // the size of the ICU d.h., which lets us locate the start
866 // of the uspoof specific data.
868 int32_t headerSize
=udata_swapDataHeader(ds
, inData
, length
, outData
, status
);
872 // Get the Spoof Data Header, and check that it appears to be OK.
875 const uint8_t *inBytes
=(const uint8_t *)inData
+headerSize
;
876 SpoofDataHeader
*spoofDH
= (SpoofDataHeader
*)inBytes
;
// Header fields are read through ds->readUInt32() so that they are
// interpreted in the input data's byte order.
877 if (ds
->readUInt32(spoofDH
->fMagic
) != USPOOF_MAGIC
||
878 ds
->readUInt32(spoofDH
->fLength
) < sizeof(SpoofDataHeader
))
880 udata_printError(ds
, "uspoof_swap(): Spoof Data header is invalid.\n");
881 *status
=U_UNSUPPORTED_ERROR
;
886 // Preflight operation? Just return the size
// NOTE(review): the uint32 length read from the header is stored in int32_t
// and added to headerSize - confirm very large values cannot reach here.
888 int32_t spoofDataLength
= ds
->readUInt32(spoofDH
->fLength
);
889 int32_t totalSize
= headerSize
+ spoofDataLength
;
895 // Check that length passed in is consistent with length from Spoof data header.
897 if (length
< totalSize
) {
898 udata_printError(ds
, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n",
900 *status
=U_INDEX_OUTOFBOUNDS_ERROR
;
906 // Swap the Data. Do the data itself first, then the Spoof Data Header, because
907 // we need to reference the header to locate the data, and an
908 // inplace swap of the header leaves it unusable.
910 uint8_t *outBytes
= (uint8_t *)outData
+ headerSize
;
911 SpoofDataHeader
*outputDH
= (SpoofDataHeader
*)outBytes
;
913 int32_t sectionStart
;
914 int32_t sectionLength
;
917 // If not swapping in place, zero out the output buffer before starting.
918 // Gaps may exist between the individual sections, and these must be zeroed in
919 // the output buffer. The simplest way to do that is to just zero the whole thing.
921 if (inBytes
!= outBytes
) {
922 uprv_memset(outBytes
, 0, spoofDataLength
);
925 // Confusables Keys Section (fCFUKeys)
// The section length is in bytes: the element count times 4.
926 sectionStart
= ds
->readUInt32(spoofDH
->fCFUKeys
);
927 sectionLength
= ds
->readUInt32(spoofDH
->fCFUKeysSize
) * 4;
928 ds
->swapArray32(ds
, inBytes
+sectionStart
, sectionLength
, outBytes
+sectionStart
, status
);
930 // String Index Section
// 16-bit elements: the element count times 2.
931 sectionStart
= ds
->readUInt32(spoofDH
->fCFUStringIndex
);
932 sectionLength
= ds
->readUInt32(spoofDH
->fCFUStringIndexSize
) * 2;
933 ds
->swapArray16(ds
, inBytes
+sectionStart
, sectionLength
, outBytes
+sectionStart
, status
);
935 // String Table Section
936 sectionStart
= ds
->readUInt32(spoofDH
->fCFUStringTable
);
937 sectionLength
= ds
->readUInt32(spoofDH
->fCFUStringTableLen
) * 2;
938 ds
->swapArray16(ds
, inBytes
+sectionStart
, sectionLength
, outBytes
+sectionStart
, status
);
940 // And, last, swap the header itself.
941 // int32_t fMagic // swap this
942 // uint8_t fFormatVersion[4] // Do not swap this, just copy
943 // int32_t fLength and all the rest // Swap the rest, all is 32 bit stuff.
945 uint32_t magic
= ds
->readUInt32(spoofDH
->fMagic
);
946 ds
->writeUInt32((uint32_t *)&outputDH
->fMagic
, magic
);
// The copy is skipped when input and output refer to the same bytes
// (in-place swap).
948 if (outputDH
->fFormatVersion
!= spoofDH
->fFormatVersion
) {
949 uprv_memcpy(outputDH
->fFormatVersion
, spoofDH
->fFormatVersion
, sizeof(spoofDH
->fFormatVersion
));
951 // swap starting at fLength
952 ds
->swapArray32(ds
, &spoofDH
->fLength
, sizeof(SpoofDataHeader
)-8 /* minus magic and fFormatVersion[4] */, &outputDH
->fLength
, status
);