1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 2008-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
10 #include "unicode/utypes.h"
11 #include "unicode/uspoof.h"
12 #include "unicode/uchar.h"
13 #include "unicode/uniset.h"
14 #include "unicode/utf16.h"
18 #include "scriptset.h"
23 #include "uspoof_impl.h"
25 #if !UCONFIG_NO_NORMALIZATION
30 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl
)
32 SpoofImpl::SpoofImpl(SpoofData
*data
, UErrorCode
& status
) {
37 SpoofImpl::SpoofImpl(UErrorCode
& status
) {
40 // TODO: Call this method where it is actually needed, instead of in the
41 // constructor, to allow for lazy data loading. See #12696.
42 fSpoofData
= SpoofData::getDefault(status
);
45 SpoofImpl::SpoofImpl() {
46 UErrorCode status
= U_ZERO_ERROR
;
49 // TODO: Call this method where it is actually needed, instead of in the
50 // constructor, to allow for lazy data loading. See #12696.
51 fSpoofData
= SpoofData::getDefault(status
);
54 void SpoofImpl::construct(UErrorCode
& status
) {
55 fMagic
= USPOOF_MAGIC
;
56 fChecks
= USPOOF_ALL_CHECKS
;
58 fAllowedCharsSet
= NULL
;
59 fAllowedLocales
= NULL
;
60 fRestrictionLevel
= USPOOF_HIGHLY_RESTRICTIVE
;
62 if (U_FAILURE(status
)) { return; }
64 UnicodeSet
*allowedCharsSet
= new UnicodeSet(0, 0x10ffff);
65 fAllowedCharsSet
= allowedCharsSet
;
66 fAllowedLocales
= uprv_strdup("");
67 if (fAllowedCharsSet
== NULL
|| fAllowedLocales
== NULL
) {
68 status
= U_MEMORY_ALLOCATION_ERROR
;
71 allowedCharsSet
->freeze();
75 // Copy Constructor, used by the user level clone() function.
76 SpoofImpl::SpoofImpl(const SpoofImpl
&src
, UErrorCode
&status
) :
77 fMagic(0), fChecks(USPOOF_ALL_CHECKS
), fSpoofData(NULL
), fAllowedCharsSet(NULL
) ,
78 fAllowedLocales(NULL
) {
79 if (U_FAILURE(status
)) {
83 fChecks
= src
.fChecks
;
84 if (src
.fSpoofData
!= NULL
) {
85 fSpoofData
= src
.fSpoofData
->addReference();
87 fAllowedCharsSet
= static_cast<const UnicodeSet
*>(src
.fAllowedCharsSet
->clone());
88 fAllowedLocales
= uprv_strdup(src
.fAllowedLocales
);
89 if (fAllowedCharsSet
== NULL
|| fAllowedLocales
== NULL
) {
90 status
= U_MEMORY_ALLOCATION_ERROR
;
92 fRestrictionLevel
= src
.fRestrictionLevel
;
95 SpoofImpl::~SpoofImpl() {
96 fMagic
= 0; // head off application errors by preventing use of
97 // of deleted objects.
98 if (fSpoofData
!= NULL
) {
99 fSpoofData
->removeReference(); // Will delete if refCount goes to zero.
101 delete fAllowedCharsSet
;
102 uprv_free((void *)fAllowedLocales
);
105 // Cast this instance as a USpoofChecker for the C API.
106 USpoofChecker
*SpoofImpl::asUSpoofChecker() {
107 return reinterpret_cast<USpoofChecker
*>(this);
111 // Incoming parameter check on Status and the SpoofChecker object
112 // received from the C API.
114 const SpoofImpl
*SpoofImpl::validateThis(const USpoofChecker
*sc
, UErrorCode
&status
) {
115 if (U_FAILURE(status
)) {
119 status
= U_ILLEGAL_ARGUMENT_ERROR
;
122 SpoofImpl
*This
= (SpoofImpl
*)sc
;
123 if (This
->fMagic
!= USPOOF_MAGIC
) {
124 status
= U_INVALID_FORMAT_ERROR
;
127 if (This
->fSpoofData
!= NULL
&& !This
->fSpoofData
->validateDataVersion(status
)) {
133 SpoofImpl
*SpoofImpl::validateThis(USpoofChecker
*sc
, UErrorCode
&status
) {
134 return const_cast<SpoofImpl
*>
135 (SpoofImpl::validateThis(const_cast<const USpoofChecker
*>(sc
), status
));
139 void SpoofImpl::setAllowedLocales(const char *localesList
, UErrorCode
&status
) {
140 UnicodeSet allowedChars
;
141 UnicodeSet
*tmpSet
= NULL
;
142 const char *locStart
= localesList
;
143 const char *locEnd
= NULL
;
144 const char *localesListEnd
= localesList
+ uprv_strlen(localesList
);
145 int32_t localeListCount
= 0; // Number of locales provided by caller.
147 // Loop runs once per locale from the localesList, a comma separated list of locales.
149 locEnd
= uprv_strchr(locStart
, ',');
150 if (locEnd
== NULL
) {
151 locEnd
= localesListEnd
;
153 while (*locStart
== ' ') {
156 const char *trimmedEnd
= locEnd
-1;
157 while (trimmedEnd
> locStart
&& *trimmedEnd
== ' ') {
160 if (trimmedEnd
<= locStart
) {
163 const char *locale
= uprv_strndup(locStart
, (int32_t)(trimmedEnd
+ 1 - locStart
));
166 // We have one locale from the locales list.
167 // Add the script chars for this locale to the accumulating set of allowed chars.
168 // If the locale is no good, we will be notified back via status.
169 addScriptChars(locale
, &allowedChars
, status
);
170 uprv_free((void *)locale
);
171 if (U_FAILURE(status
)) {
174 locStart
= locEnd
+ 1;
175 } while (locStart
< localesListEnd
);
177 // If our caller provided an empty list of locales, we disable the allowed characters checking
178 if (localeListCount
== 0) {
179 uprv_free((void *)fAllowedLocales
);
180 fAllowedLocales
= uprv_strdup("");
181 tmpSet
= new UnicodeSet(0, 0x10ffff);
182 if (fAllowedLocales
== NULL
|| tmpSet
== NULL
) {
183 status
= U_MEMORY_ALLOCATION_ERROR
;
187 delete fAllowedCharsSet
;
188 fAllowedCharsSet
= tmpSet
;
189 fChecks
&= ~USPOOF_CHAR_LIMIT
;
194 // Add all common and inherited characters to the set of allowed chars.
196 tempSet
.applyIntPropertyValue(UCHAR_SCRIPT
, USCRIPT_COMMON
, status
);
197 allowedChars
.addAll(tempSet
);
198 tempSet
.applyIntPropertyValue(UCHAR_SCRIPT
, USCRIPT_INHERITED
, status
);
199 allowedChars
.addAll(tempSet
);
201 // If anything went wrong, we bail out without changing
202 // the state of the spoof checker.
203 if (U_FAILURE(status
)) {
207 // Store the updated spoof checker state.
208 tmpSet
= static_cast<UnicodeSet
*>(allowedChars
.clone());
209 const char *tmpLocalesList
= uprv_strdup(localesList
);
210 if (tmpSet
== NULL
|| tmpLocalesList
== NULL
) {
211 status
= U_MEMORY_ALLOCATION_ERROR
;
214 uprv_free((void *)fAllowedLocales
);
215 fAllowedLocales
= tmpLocalesList
;
217 delete fAllowedCharsSet
;
218 fAllowedCharsSet
= tmpSet
;
219 fChecks
|= USPOOF_CHAR_LIMIT
;
223 const char * SpoofImpl::getAllowedLocales(UErrorCode
&/*status*/) {
224 return fAllowedLocales
;
228 // Given a locale (a language), add all the characters from all of the scripts used with that language
229 // to the allowedChars UnicodeSet
231 void SpoofImpl::addScriptChars(const char *locale
, UnicodeSet
*allowedChars
, UErrorCode
&status
) {
232 UScriptCode scripts
[30];
234 int32_t numScripts
= uscript_getCode(locale
, scripts
, UPRV_LENGTHOF(scripts
), &status
);
235 if (U_FAILURE(status
)) {
238 if (status
== U_USING_DEFAULT_WARNING
) {
239 status
= U_ILLEGAL_ARGUMENT_ERROR
;
244 for (i
=0; i
<numScripts
; i
++) {
245 tmpSet
.applyIntPropertyValue(UCHAR_SCRIPT
, scripts
[i
], status
);
246 allowedChars
->addAll(tmpSet
);
250 // Computes the augmented script set for a code point, according to UTS 39 section 5.1.
251 void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint
, ScriptSet
& result
, UErrorCode
& status
) {
253 result
.setScriptExtensions(codePoint
, status
);
254 if (U_FAILURE(status
)) { return; }
256 // Section 5.1 step 1
257 if (result
.test(USCRIPT_HAN
, status
)) {
258 result
.set(USCRIPT_HAN_WITH_BOPOMOFO
, status
);
259 result
.set(USCRIPT_JAPANESE
, status
);
260 result
.set(USCRIPT_KOREAN
, status
);
262 if (result
.test(USCRIPT_HIRAGANA
, status
)) {
263 result
.set(USCRIPT_JAPANESE
, status
);
265 if (result
.test(USCRIPT_KATAKANA
, status
)) {
266 result
.set(USCRIPT_JAPANESE
, status
);
268 if (result
.test(USCRIPT_HANGUL
, status
)) {
269 result
.set(USCRIPT_KOREAN
, status
);
271 if (result
.test(USCRIPT_BOPOMOFO
, status
)) {
272 result
.set(USCRIPT_HAN_WITH_BOPOMOFO
, status
);
275 // Section 5.1 step 2
276 if (result
.test(USCRIPT_COMMON
, status
) || result
.test(USCRIPT_INHERITED
, status
)) {
281 // Computes the resolved script set for a string, according to UTS 39 section 5.1.
282 void SpoofImpl::getResolvedScriptSet(const UnicodeString
& input
, ScriptSet
& result
, UErrorCode
& status
) const {
283 getResolvedScriptSetWithout(input
, USCRIPT_CODE_LIMIT
, result
, status
);
286 // Computes the resolved script set for a string, omitting characters having the specified script.
287 // If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included.
288 void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString
& input
, UScriptCode script
, ScriptSet
& result
, UErrorCode
& status
) const {
293 for (int32_t i
= 0; i
< input
.length(); i
+= U16_LENGTH(codePoint
)) {
294 codePoint
= input
.char32At(i
);
296 // Compute the augmented script set for the character
297 getAugmentedScriptSet(codePoint
, temp
, status
);
298 if (U_FAILURE(status
)) { return; }
300 // Intersect the augmented script set with the resolved script set, but only if the character doesn't
301 // have the script specified in the function call
302 if (script
== USCRIPT_CODE_LIMIT
|| !temp
.test(script
, status
)) {
303 result
.intersect(temp
);
308 // Computes the set of numerics for a string, according to UTS 39 section 5.3.
309 void SpoofImpl::getNumerics(const UnicodeString
& input
, UnicodeSet
& result
, UErrorCode
& /*status*/) const {
313 for (int32_t i
= 0; i
< input
.length(); i
+= U16_LENGTH(codePoint
)) {
314 codePoint
= input
.char32At(i
);
316 // Store a representative character for each kind of decimal digit
317 if (u_charType(codePoint
) == U_DECIMAL_DIGIT_NUMBER
) {
318 // Store the zero character as a representative for comparison.
319 // Unicode guarantees it is codePoint - value
320 result
.add(codePoint
- (UChar32
)u_getNumericValue(codePoint
));
325 // Computes the restriction level of a string, according to UTS 39 section 5.2.
326 URestrictionLevel
SpoofImpl::getRestrictionLevel(const UnicodeString
& input
, UErrorCode
& status
) const {
327 // Section 5.2 step 1:
328 if (!fAllowedCharsSet
->containsAll(input
)) {
329 return USPOOF_UNRESTRICTIVE
;
332 // Section 5.2 step 2
333 // Java use a static UnicodeSet for this test. In C++, avoid the static variable
334 // and just do a simple for loop.
335 UBool allASCII
= TRUE
;
336 for (int32_t i
=0, length
=input
.length(); i
<length
; i
++) {
337 if (input
.charAt(i
) > 0x7f) {
346 // Section 5.2 steps 3:
347 ScriptSet resolvedScriptSet
;
348 getResolvedScriptSet(input
, resolvedScriptSet
, status
);
349 if (U_FAILURE(status
)) { return USPOOF_UNRESTRICTIVE
; }
351 // Section 5.2 step 4:
352 if (!resolvedScriptSet
.isEmpty()) {
353 return USPOOF_SINGLE_SCRIPT_RESTRICTIVE
;
356 // Section 5.2 step 5:
357 ScriptSet resolvedNoLatn
;
358 getResolvedScriptSetWithout(input
, USCRIPT_LATIN
, resolvedNoLatn
, status
);
359 if (U_FAILURE(status
)) { return USPOOF_UNRESTRICTIVE
; }
361 // Section 5.2 step 6:
362 if (resolvedNoLatn
.test(USCRIPT_HAN_WITH_BOPOMOFO
, status
)
363 || resolvedNoLatn
.test(USCRIPT_JAPANESE
, status
)
364 || resolvedNoLatn
.test(USCRIPT_KOREAN
, status
)) {
365 return USPOOF_HIGHLY_RESTRICTIVE
;
368 // Section 5.2 step 7:
369 if (!resolvedNoLatn
.isEmpty()
370 && !resolvedNoLatn
.test(USCRIPT_CYRILLIC
, status
)
371 && !resolvedNoLatn
.test(USCRIPT_GREEK
, status
)
372 && !resolvedNoLatn
.test(USCRIPT_CHEROKEE
, status
)) {
373 return USPOOF_MODERATELY_RESTRICTIVE
;
376 // Section 5.2 step 8:
377 return USPOOF_MINIMALLY_RESTRICTIVE
;
380 int32_t SpoofImpl::findHiddenOverlay(const UnicodeString
& input
, UErrorCode
&) const {
381 bool sawLeadCharacter
= false;
382 for (int32_t i
=0; i
<input
.length();) {
383 UChar32 cp
= input
.char32At(i
);
384 if (sawLeadCharacter
&& cp
== 0x0307) {
387 uint8_t combiningClass
= u_getCombiningClass(cp
);
388 // Skip over characters except for those with combining class 0 (non-combining characters) or with
389 // combining class 230 (same class as U+0307)
390 U_ASSERT(u_getCombiningClass(0x0307) == 230);
391 if (combiningClass
== 0 || combiningClass
== 230) {
392 sawLeadCharacter
= isIllegalCombiningDotLeadCharacter(cp
);
399 static inline bool isIllegalCombiningDotLeadCharacterNoLookup(UChar32 cp
) {
400 return cp
== u
'i' || cp
== u
'j' || cp
== u
'ı' || cp
== u
'ȷ' || cp
== u
'l' ||
401 u_hasBinaryProperty(cp
, UCHAR_SOFT_DOTTED
);
404 bool SpoofImpl::isIllegalCombiningDotLeadCharacter(UChar32 cp
) const {
405 if (isIllegalCombiningDotLeadCharacterNoLookup(cp
)) {
408 UnicodeString skelStr
;
409 fSpoofData
->confusableLookup(cp
, skelStr
);
410 UChar32 finalCp
= skelStr
.char32At(skelStr
.moveIndex32(skelStr
.length(), -1));
411 if (finalCp
!= cp
&& isIllegalCombiningDotLeadCharacterNoLookup(finalCp
)) {
419 // Convert a text format hex number. Utility function used by builder code. Static.
420 // Input: UChar *string text. Output: a UChar32
421 // Input has been pre-checked, and will have no non-hex chars.
422 // The number must fall in the code point range of 0..0x10ffff
424 UChar32
SpoofImpl::ScanHex(const UChar
*s
, int32_t start
, int32_t limit
, UErrorCode
&status
) {
425 if (U_FAILURE(status
)) {
428 U_ASSERT(limit
-start
> 0);
431 for (i
=start
; i
<limit
; i
++) {
432 int digitVal
= s
[i
] - 0x30;
434 digitVal
= 0xa + (s
[i
] - 0x41); // Upper Case 'A'
437 digitVal
= 0xa + (s
[i
] - 0x61); // Lower Case 'a'
439 U_ASSERT(digitVal
<= 0xf);
443 if (val
> 0x10ffff) {
444 status
= U_PARSE_ERROR
;
451 //-----------------------------------------
453 // class CheckResult Implementation
455 //-----------------------------------------
457 CheckResult::CheckResult() : fMagic(USPOOF_CHECK_MAGIC
) {
461 USpoofCheckResult
* CheckResult::asUSpoofCheckResult() {
462 return reinterpret_cast<USpoofCheckResult
*>(this);
466 // Incoming parameter check on Status and the CheckResult object
467 // received from the C API.
469 const CheckResult
* CheckResult::validateThis(const USpoofCheckResult
*ptr
, UErrorCode
&status
) {
470 if (U_FAILURE(status
)) { return NULL
; }
472 status
= U_ILLEGAL_ARGUMENT_ERROR
;
475 CheckResult
*This
= (CheckResult
*) ptr
;
476 if (This
->fMagic
!= USPOOF_CHECK_MAGIC
) {
477 status
= U_INVALID_FORMAT_ERROR
;
483 CheckResult
* CheckResult::validateThis(USpoofCheckResult
*ptr
, UErrorCode
&status
) {
484 return const_cast<CheckResult
*>
485 (CheckResult::validateThis(const_cast<const USpoofCheckResult
*>(ptr
), status
));
488 void CheckResult::clear() {
491 fRestrictionLevel
= USPOOF_UNDEFINED_RESTRICTIVE
;
494 int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks
) {
495 if ((enabledChecks
& USPOOF_AUX_INFO
) != 0 && fRestrictionLevel
!= USPOOF_UNDEFINED_RESTRICTIVE
) {
496 return fChecks
| fRestrictionLevel
;
502 CheckResult::~CheckResult() {
505 //----------------------------------------------------------------------------------------------
507 // class SpoofData Implementation
509 //----------------------------------------------------------------------------------------------
512 UBool
SpoofData::validateDataVersion(UErrorCode
&status
) const {
513 if (U_FAILURE(status
) ||
515 fRawData
->fMagic
!= USPOOF_MAGIC
||
516 fRawData
->fFormatVersion
[0] != USPOOF_CONFUSABLE_DATA_FORMAT_VERSION
||
517 fRawData
->fFormatVersion
[1] != 0 ||
518 fRawData
->fFormatVersion
[2] != 0 ||
519 fRawData
->fFormatVersion
[3] != 0) {
520 status
= U_INVALID_FORMAT_ERROR
;
526 static UBool U_CALLCONV
527 spoofDataIsAcceptable(void *context
,
528 const char * /* type */, const char * /*name*/,
529 const UDataInfo
*pInfo
) {
532 pInfo
->isBigEndian
== U_IS_BIG_ENDIAN
&&
533 pInfo
->charsetFamily
== U_CHARSET_FAMILY
&&
534 pInfo
->dataFormat
[0] == 0x43 && // dataFormat="Cfu "
535 pInfo
->dataFormat
[1] == 0x66 &&
536 pInfo
->dataFormat
[2] == 0x75 &&
537 pInfo
->dataFormat
[3] == 0x20 &&
538 pInfo
->formatVersion
[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION
540 UVersionInfo
*version
= static_cast<UVersionInfo
*>(context
);
541 if(version
!= NULL
) {
542 uprv_memcpy(version
, pInfo
->dataVersion
, 4);
550 // Methods for the loading of the default confusables data file. The confusable
551 // data is loaded only when it is needed.
553 // SpoofData::getDefault() - Return the default confusables data, and call the
554 // initOnce() if it is not available. Adds a reference
555 // to the SpoofData that the caller is responsible for
556 // decrementing when they are done with the data.
558 // uspoof_loadDefaultData - Called once, from initOnce(). The resulting SpoofData
559 // is shared by all spoof checkers using the default data.
561 // uspoof_cleanupDefaultData - Called during cleanup.
564 static UInitOnce gSpoofInitDefaultOnce
= U_INITONCE_INITIALIZER
;
565 static SpoofData
* gDefaultSpoofData
;
567 static UBool U_CALLCONV
568 uspoof_cleanupDefaultData(void) {
569 if (gDefaultSpoofData
) {
570 // Will delete, assuming all user-level spoof checkers were closed.
571 gDefaultSpoofData
->removeReference();
572 gDefaultSpoofData
= nullptr;
573 gSpoofInitDefaultOnce
.reset();
578 static void U_CALLCONV
uspoof_loadDefaultData(UErrorCode
& status
) {
579 UDataMemory
*udm
= udata_openChoice(nullptr, "cfu", "confusables",
580 spoofDataIsAcceptable
,
581 nullptr, // context, would receive dataVersion if supplied.
583 if (U_FAILURE(status
)) { return; }
584 gDefaultSpoofData
= new SpoofData(udm
, status
);
585 if (U_FAILURE(status
)) {
586 delete gDefaultSpoofData
;
587 gDefaultSpoofData
= nullptr;
590 if (gDefaultSpoofData
== nullptr) {
591 status
= U_MEMORY_ALLOCATION_ERROR
;
594 ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA
, uspoof_cleanupDefaultData
);
597 SpoofData
* SpoofData::getDefault(UErrorCode
& status
) {
598 umtx_initOnce(gSpoofInitDefaultOnce
, &uspoof_loadDefaultData
, status
);
599 if (U_FAILURE(status
)) { return NULL
; }
600 gDefaultSpoofData
->addReference();
601 return gDefaultSpoofData
;
606 SpoofData::SpoofData(UDataMemory
*udm
, UErrorCode
&status
)
609 if (U_FAILURE(status
)) {
613 // fRawData is non-const because it may be constructed by the data builder.
614 fRawData
= reinterpret_cast<SpoofDataHeader
*>(
615 const_cast<void *>(udata_getMemory(udm
)));
616 validateDataVersion(status
);
621 SpoofData::SpoofData(const void *data
, int32_t length
, UErrorCode
&status
)
624 if (U_FAILURE(status
)) {
627 if ((size_t)length
< sizeof(SpoofDataHeader
)) {
628 status
= U_INVALID_FORMAT_ERROR
;
632 status
= U_ILLEGAL_ARGUMENT_ERROR
;
635 void *ncData
= const_cast<void *>(data
);
636 fRawData
= static_cast<SpoofDataHeader
*>(ncData
);
637 if (length
< fRawData
->fLength
) {
638 status
= U_INVALID_FORMAT_ERROR
;
641 validateDataVersion(status
);
646 // Spoof Data constructor for use from data builder.
647 // Initializes a new, empty data area that will be populated later.
648 SpoofData::SpoofData(UErrorCode
&status
) {
650 if (U_FAILURE(status
)) {
655 // The spoof header should already be sized to be a multiple of 16 bytes.
656 // Just in case it's not, round it up.
657 uint32_t initialSize
= (sizeof(SpoofDataHeader
) + 15) & ~15;
658 U_ASSERT(initialSize
== sizeof(SpoofDataHeader
));
660 fRawData
= static_cast<SpoofDataHeader
*>(uprv_malloc(initialSize
));
661 fMemLimit
= initialSize
;
662 if (fRawData
== NULL
) {
663 status
= U_MEMORY_ALLOCATION_ERROR
;
666 uprv_memset(fRawData
, 0, initialSize
);
668 fRawData
->fMagic
= USPOOF_MAGIC
;
669 fRawData
->fFormatVersion
[0] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION
;
670 fRawData
->fFormatVersion
[1] = 0;
671 fRawData
->fFormatVersion
[2] = 0;
672 fRawData
->fFormatVersion
[3] = 0;
676 // reset() - initialize all fields.
677 // Should be updated if any new fields are added.
678 // Called by constructors to put things in a known initial state.
679 void SpoofData::reset() {
691 // SpoofData::initPtrs()
692 // Initialize the pointers to the various sections of the raw data.
694 // This function is used both during the Trie building process (multiple
695 // times, as the individual data sections are added), and
696 // during the opening of a Spoof Checker from prebuilt data.
698 // The pointers for non-existent data sections (identified by an offset of 0)
701 // Note: During building the data, adding each new data section
702 // reallocs the raw data area, which likely relocates it, which
703 // in turn requires reinitializing all of the pointers into it, hence
704 // multiple calls to this function during building.
706 void SpoofData::initPtrs(UErrorCode
&status
) {
710 if (U_FAILURE(status
)) {
713 if (fRawData
->fCFUKeys
!= 0) {
714 fCFUKeys
= (int32_t *)((char *)fRawData
+ fRawData
->fCFUKeys
);
716 if (fRawData
->fCFUStringIndex
!= 0) {
717 fCFUValues
= (uint16_t *)((char *)fRawData
+ fRawData
->fCFUStringIndex
);
719 if (fRawData
->fCFUStringTable
!= 0) {
720 fCFUStrings
= (UChar
*)((char *)fRawData
+ fRawData
->fCFUStringTable
);
725 SpoofData::~SpoofData() {
737 void SpoofData::removeReference() {
738 if (umtx_atomic_dec(&fRefCount
) == 0) {
744 SpoofData
*SpoofData::addReference() {
745 umtx_atomic_inc(&fRefCount
);
750 void *SpoofData::reserveSpace(int32_t numBytes
, UErrorCode
&status
) {
751 if (U_FAILURE(status
)) {
756 status
= U_INTERNAL_PROGRAM_ERROR
;
760 numBytes
= (numBytes
+ 15) & ~15; // Round up to a multiple of 16
761 uint32_t returnOffset
= fMemLimit
;
762 fMemLimit
+= numBytes
;
763 fRawData
= static_cast<SpoofDataHeader
*>(uprv_realloc(fRawData
, fMemLimit
));
764 fRawData
->fLength
= fMemLimit
;
765 uprv_memset((char *)fRawData
+ returnOffset
, 0, numBytes
);
767 return (char *)fRawData
+ returnOffset
;
770 int32_t SpoofData::serialize(void *buf
, int32_t capacity
, UErrorCode
&status
) const {
771 int32_t dataSize
= fRawData
->fLength
;
772 if (capacity
< dataSize
) {
773 status
= U_BUFFER_OVERFLOW_ERROR
;
776 uprv_memcpy(buf
, fRawData
, dataSize
);
780 int32_t SpoofData::size() const {
781 return fRawData
->fLength
;
784 //-------------------------------
786 // Front-end APIs for SpoofData
788 //-------------------------------
790 int32_t SpoofData::confusableLookup(UChar32 inChar
, UnicodeString
&dest
) const {
791 // Perform a binary search.
792 // [lo, hi), i.e lo is inclusive, hi is exclusive.
793 // The result after the loop will be in lo.
795 int32_t hi
= length();
797 int32_t mid
= (lo
+ hi
) / 2;
798 if (codePointAt(mid
) > inChar
) {
800 } else if (codePointAt(mid
) < inChar
) {
803 // Found result. Break early.
807 } while (hi
- lo
> 1);
809 // Did we find an entry? If not, the char maps to itself.
810 if (codePointAt(lo
) != inChar
) {
815 // Add the element to the string builder and return.
816 return appendValueTo(lo
, dest
);
819 int32_t SpoofData::length() const {
820 return fRawData
->fCFUKeysSize
;
823 UChar32
SpoofData::codePointAt(int32_t index
) const {
824 return ConfusableDataUtils::keyToCodePoint(fCFUKeys
[index
]);
827 int32_t SpoofData::appendValueTo(int32_t index
, UnicodeString
& dest
) const {
828 int32_t stringLength
= ConfusableDataUtils::keyToLength(fCFUKeys
[index
]);
830 // Value is either a char (for strings of length 1) or
831 // an index into the string table (for longer strings)
832 uint16_t value
= fCFUValues
[index
];
833 if (stringLength
== 1) {
834 dest
.append((UChar
)value
);
836 dest
.append(fCFUStrings
+ value
, stringLength
);
847 //-----------------------------------------------------------------------------
849 // uspoof_swap - byte swap and char encoding swap of spoof data
851 //-----------------------------------------------------------------------------
852 U_CAPI
int32_t U_EXPORT2
853 uspoof_swap(const UDataSwapper
*ds
, const void *inData
, int32_t length
, void *outData
,
854 UErrorCode
*status
) {
856 if (status
== NULL
|| U_FAILURE(*status
)) {
859 if(ds
==NULL
|| inData
==NULL
|| length
<-1 || (length
>0 && outData
==NULL
)) {
860 *status
=U_ILLEGAL_ARGUMENT_ERROR
;
865 // Check that the data header is for spoof data.
866 // (Header contents are defined in gencfu.cpp)
868 const UDataInfo
*pInfo
= (const UDataInfo
*)((const char *)inData
+4);
869 if(!( pInfo
->dataFormat
[0]==0x43 && /* dataFormat="Cfu " */
870 pInfo
->dataFormat
[1]==0x66 &&
871 pInfo
->dataFormat
[2]==0x75 &&
872 pInfo
->dataFormat
[3]==0x20 &&
873 pInfo
->formatVersion
[0]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION
&&
874 pInfo
->formatVersion
[1]==0 &&
875 pInfo
->formatVersion
[2]==0 &&
876 pInfo
->formatVersion
[3]==0 )) {
877 udata_printError(ds
, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
878 "(format version %02x %02x %02x %02x) is not recognized\n",
879 pInfo
->dataFormat
[0], pInfo
->dataFormat
[1],
880 pInfo
->dataFormat
[2], pInfo
->dataFormat
[3],
881 pInfo
->formatVersion
[0], pInfo
->formatVersion
[1],
882 pInfo
->formatVersion
[2], pInfo
->formatVersion
[3]);
883 *status
=U_UNSUPPORTED_ERROR
;
888 // Swap the data header. (This is the generic ICU Data Header, not the uspoof Specific
889 // header). This swap also conveniently gets us
890 // the size of the ICU d.h., which lets us locate the start
891 // of the uspoof specific data.
893 int32_t headerSize
=udata_swapDataHeader(ds
, inData
, length
, outData
, status
);
897 // Get the Spoof Data Header, and check that it appears to be OK.
900 const uint8_t *inBytes
=(const uint8_t *)inData
+headerSize
;
901 SpoofDataHeader
*spoofDH
= (SpoofDataHeader
*)inBytes
;
902 if (ds
->readUInt32(spoofDH
->fMagic
) != USPOOF_MAGIC
||
903 ds
->readUInt32(spoofDH
->fLength
) < sizeof(SpoofDataHeader
))
905 udata_printError(ds
, "uspoof_swap(): Spoof Data header is invalid.\n");
906 *status
=U_UNSUPPORTED_ERROR
;
911 // Preflight operation? Just return the size
913 int32_t spoofDataLength
= ds
->readUInt32(spoofDH
->fLength
);
914 int32_t totalSize
= headerSize
+ spoofDataLength
;
920 // Check that length passed in is consistent with length from Spoof data header.
922 if (length
< totalSize
) {
923 udata_printError(ds
, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n",
925 *status
=U_INDEX_OUTOFBOUNDS_ERROR
;
931 // Swap the Data. Do the data itself first, then the Spoof Data Header, because
932 // we need to reference the header to locate the data, and an
933 // inplace swap of the header leaves it unusable.
935 uint8_t *outBytes
= (uint8_t *)outData
+ headerSize
;
936 SpoofDataHeader
*outputDH
= (SpoofDataHeader
*)outBytes
;
938 int32_t sectionStart
;
939 int32_t sectionLength
;
942 // If not swapping in place, zero out the output buffer before starting.
943 // Gaps may exist between the individual sections, and these must be zeroed in
944 // the output buffer. The simplest way to do that is to just zero the whole thing.
946 if (inBytes
!= outBytes
) {
947 uprv_memset(outBytes
, 0, spoofDataLength
);
950 // Confusables Keys Section (fCFUKeys)
951 sectionStart
= ds
->readUInt32(spoofDH
->fCFUKeys
);
952 sectionLength
= ds
->readUInt32(spoofDH
->fCFUKeysSize
) * 4;
953 ds
->swapArray32(ds
, inBytes
+sectionStart
, sectionLength
, outBytes
+sectionStart
, status
);
955 // String Index Section
956 sectionStart
= ds
->readUInt32(spoofDH
->fCFUStringIndex
);
957 sectionLength
= ds
->readUInt32(spoofDH
->fCFUStringIndexSize
) * 2;
958 ds
->swapArray16(ds
, inBytes
+sectionStart
, sectionLength
, outBytes
+sectionStart
, status
);
960 // String Table Section
961 sectionStart
= ds
->readUInt32(spoofDH
->fCFUStringTable
);
962 sectionLength
= ds
->readUInt32(spoofDH
->fCFUStringTableLen
) * 2;
963 ds
->swapArray16(ds
, inBytes
+sectionStart
, sectionLength
, outBytes
+sectionStart
, status
);
965 // And, last, swap the header itself.
966 // int32_t fMagic // swap this
967 // uint8_t fFormatVersion[4] // Do not swap this, just copy
968 // int32_t fLength and all the rest // Swap the rest, all is 32 bit stuff.
970 uint32_t magic
= ds
->readUInt32(spoofDH
->fMagic
);
971 ds
->writeUInt32((uint32_t *)&outputDH
->fMagic
, magic
);
973 if (outputDH
->fFormatVersion
!= spoofDH
->fFormatVersion
) {
974 uprv_memcpy(outputDH
->fFormatVersion
, spoofDH
->fFormatVersion
, sizeof(spoofDH
->fFormatVersion
));
976 // swap starting at fLength
977 ds
->swapArray32(ds
, &spoofDH
->fLength
, sizeof(SpoofDataHeader
)-8 /* minus magic and fFormatVersion[4] */, &outputDH
->fLength
, status
);