2 **********************************************************************
3 * Copyright (C) 2008-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
8 #include "unicode/utypes.h"
9 #include "unicode/uspoof.h"
10 #include "unicode/unorm.h"
11 #include "unicode/uchar.h"
12 #include "unicode/uniset.h"
20 #include "uspoof_impl.h"
22 #if !UCONFIG_NO_NORMALIZATION
// Invokes ICU's UObject RTTI implementation macro for the SpoofImpl class.
27 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl
)
29 SpoofImpl::SpoofImpl(SpoofData
*data
, UErrorCode
&status
) :
30 fMagic(0), fSpoofData(NULL
), fAllowedCharsSet(NULL
) , fAllowedLocales(NULL
) {
31 if (U_FAILURE(status
)) {
34 fMagic
= USPOOF_MAGIC
;
36 fChecks
= USPOOF_ALL_CHECKS
;
37 UnicodeSet
*allowedCharsSet
= new UnicodeSet(0, 0x10ffff);
38 if (allowedCharsSet
== NULL
) {
39 status
= U_MEMORY_ALLOCATION_ERROR
;
41 allowedCharsSet
->freeze();
42 fAllowedCharsSet
= allowedCharsSet
;
43 fAllowedLocales
= uprv_strdup("");
// Default constructor.  Sets the magic number, sets fChecks to
// USPOOF_ALL_CHECKS, installs a frozen allowed-characters set covering
// U+0000..U+10FFFF, and sets the allowed-locales string to "".
// NOTE(review): there is no status parameter, and the result of
// `new UnicodeSet` is used by freeze() without a NULL check.
47 SpoofImpl::SpoofImpl() {
48 fMagic
= USPOOF_MAGIC
;
50 fChecks
= USPOOF_ALL_CHECKS
;
51 UnicodeSet
*allowedCharsSet
= new UnicodeSet(0, 0x10ffff);
52 allowedCharsSet
->freeze();
53 fAllowedCharsSet
= allowedCharsSet
;
54 fAllowedLocales
= uprv_strdup("");
58 // Copy Constructor, used by the user level clone() function.
59 SpoofImpl::SpoofImpl(const SpoofImpl
&src
, UErrorCode
&status
) :
60 fMagic(0), fSpoofData(NULL
), fAllowedCharsSet(NULL
) {
61 if (U_FAILURE(status
)) {
65 fChecks
= src
.fChecks
;
66 if (src
.fSpoofData
!= NULL
) {
67 fSpoofData
= src
.fSpoofData
->addReference();
69 fCheckMask
= src
.fCheckMask
;
70 fAllowedCharsSet
= static_cast<const UnicodeSet
*>(src
.fAllowedCharsSet
->clone());
71 if (fAllowedCharsSet
== NULL
) {
72 status
= U_MEMORY_ALLOCATION_ERROR
;
74 fAllowedLocales
= uprv_strdup(src
.fAllowedLocales
);
// Destructor.  Clears the magic number, releases this object's reference on the
// spoof data when there is one, deletes the allowed-characters set and frees
// the allowed-locales string.
77 SpoofImpl::~SpoofImpl() {
78 fMagic
= 0; // head off application errors by preventing use
79 // of deleted objects.
80 if (fSpoofData
!= NULL
) {
81 fSpoofData
->removeReference(); // Will delete if refCount goes to zero.
83 delete fAllowedCharsSet
;
84 uprv_free((void *)fAllowedLocales
);
88 // Incoming parameter check on Status and the SpoofChecker object
89 // received from the C API.
// Checks an incoming USpoofChecker handle and views it as a SpoofImpl.
//   sc      handle received from the C API
//   status  in/out ICU error code; tested for failure on entry
// Visible checks: the object's fMagic must equal USPOOF_MAGIC and fSpoofData
// must be non-NULL (otherwise status becomes U_INVALID_FORMAT_ERROR), and
// SpoofData::validateDataVersion() must accept the raw data header.
// NOTE(review): the condition guarding U_ILLEGAL_ARGUMENT_ERROR is not visible
// here; presumably it is a NULL check on sc - confirm.
91 const SpoofImpl
*SpoofImpl::validateThis(const USpoofChecker
*sc
, UErrorCode
&status
) {
92 if (U_FAILURE(status
)) {
96 status
= U_ILLEGAL_ARGUMENT_ERROR
;
// The opaque C handle is reinterpreted as the implementation class.
99 SpoofImpl
*This
= (SpoofImpl
*)sc
;
100 if (This
->fMagic
!= USPOOF_MAGIC
||
101 This
->fSpoofData
== NULL
) {
102 status
= U_INVALID_FORMAT_ERROR
;
105 if (!SpoofData::validateDataVersion(This
->fSpoofData
->fRawData
, status
)) {
// Non-const overload.  Forwards to the const validateThis() above and casts
// the constness away from its result.
111 SpoofImpl
*SpoofImpl::validateThis(USpoofChecker
*sc
, UErrorCode
&status
) {
112 return const_cast<SpoofImpl
*>
113 (SpoofImpl::validateThis(const_cast<const USpoofChecker
*>(sc
), status
));
118 //--------------------------------------------------------------------------------------
120 // confusableLookup() This is the heart of the confusable skeleton generation
123 // Given a source character, produce the corresponding
124 // replacement character(s)
126 //---------------------------------------------------------------------------------------
// Looks up inChar in the confusable key table and writes its replacement
// UChars into destBuf.
//   inChar     code point to map
//   tableMask  flag bits selecting the table; tested against the top byte of a key
//   destBuf    output buffer; no capacity is passed, so the caller has to
//              supply one that is large enough
// Each key is an int32_t whose low bits hold a code point and whose top byte
// (0xff000000) holds flags, including the string length field read by
// USPOOF_KEY_LENGTH_FIELD.  A character with no matching key is appended to
// destBuf unchanged.
127 int32_t SpoofImpl::confusableLookup(UChar32 inChar
, int32_t tableMask
, UChar
*destBuf
) const {
129 // Binary search the spoof data key table for the inChar
130 int32_t *low
= fSpoofData
->fCFUKeys
;
132 int32_t *limit
= low
+ fSpoofData
->fRawData
->fCFUKeysSize
;
135 int32_t delta
= ((int32_t)(limit
-low
))/2;
137 midc
= *mid
& 0x1fffff;
138 if (inChar
== midc
) {
140 } else if (inChar
< midc
) {
145 } while (low
< limit
-1);
147 midc
= *mid
& 0x1fffff;
148 if (inChar
!= midc
) {
149 // Char not found. It maps to itself.
151 U16_APPEND_UNSAFE(destBuf
, i
, inChar
)
155 int32_t keyFlags
= *mid
& 0xff000000;
156 if ((keyFlags
& tableMask
) == 0) {
157 // We found the right key char, but the entry doesn't pertain to the
158 // table we need. See if there is an adjacent key that does
159 if (keyFlags
& USPOOF_KEY_MULTIPLE_VALUES
) {
// NOTE(review): both scans below dereference altMid with no visible check
// against the start or the end of the key table.  They also mask keys with
// 0x00ffffff while the binary search above uses 0x1fffff - confirm that the
// difference is intended.
161 for (altMid
= mid
-1; (*altMid
&0x00ffffff) == inChar
; altMid
--) {
162 keyFlags
= *altMid
& 0xff000000;
163 if (keyFlags
& tableMask
) {
168 for (altMid
= mid
+1; (*altMid
&0x00ffffff) == inChar
; altMid
++) {
169 keyFlags
= *altMid
& 0xff000000;
170 if (keyFlags
& tableMask
) {
176 // No key entry for this char & table.
177 // The input char maps to itself.
179 U16_APPEND_UNSAFE(destBuf
, i
, inChar
)
184 int32_t stringLen
= USPOOF_KEY_LENGTH_FIELD(keyFlags
) + 1;
// The value table is indexed in parallel with the key table.
185 int32_t keyTableIndex
= (int32_t)(mid
- fSpoofData
->fCFUKeys
);
187 // Value is either a UChar (for strings of length 1) or
188 // an index into the string table (for longer strings)
189 uint16_t value
= fSpoofData
->fCFUValues
[keyTableIndex
];
190 if (stringLen
== 1) {
195 // String length of 4 from the above lookup is used for all strings of length >= 4.
196 // For these, get the real length from the string lengths table,
197 // which maps string table indexes to lengths.
198 // All strings of the same length are stored contiguously in the string table.
199 // 'value' from the lookup above is the starting index for the desired string.
202 if (stringLen
== 4) {
203 int32_t stringLengthsLimit
= fSpoofData
->fRawData
->fCFUStringLengthsSize
;
204 for (ix
= 0; ix
< stringLengthsLimit
; ix
++) {
205 if (fSpoofData
->fCFUStringLengths
[ix
].fLastString
>= value
) {
206 stringLen
= fSpoofData
->fCFUStringLengths
[ix
].fStrLength
;
210 U_ASSERT(ix
< stringLengthsLimit
);
213 U_ASSERT(value
+ stringLen
<= fSpoofData
->fRawData
->fCFUStringTableLen
);
// Copy the replacement string from the string table into the output buffer.
214 UChar
*src
= &fSpoofData
->fCFUStrings
[value
];
215 for (ix
=0; ix
<stringLen
; ix
++) {
216 destBuf
[ix
] = src
[ix
];
222 //---------------------------------------------------------------------------------------
224 // wholeScriptCheck()
226 // Input text is already normalized to NFD
227 // Return the set of scripts, each of which can represent something that is
228 // confusable with the input text. The script of the input text
229 // is included; input consisting of characters from a single script will
230 // always produce a result consisting of a set containing that script.
232 //---------------------------------------------------------------------------------------
// Narrows *result to the scripts shared by every character of text.
//   text, length  input UChars (already NFD, per the banner comment)
//   result        script set that is intersected once per code point
//   status        passed on to uscript_getScript()
// The trie is fAnyCaseTrie when USPOOF_ANY_CASE is set in fChecks, otherwise
// fLowerCaseTrie.  The value read from the trie for each code point is used
// as an index into fScriptSets, with 1 handled as a special case.
233 void SpoofImpl::wholeScriptCheck(
234 const UChar
*text
, int32_t length
, ScriptSet
*result
, UErrorCode
&status
) const {
236 int32_t inputIdx
= 0;
240 (fChecks
& USPOOF_ANY_CASE
) ? fSpoofData
->fAnyCaseTrie
: fSpoofData
->fLowerCaseTrie
;
242 while (inputIdx
< length
) {
243 U16_NEXT(text
, inputIdx
, length
, c
);
244 uint32_t index
= utrie2_get32(table
, c
);
246 // No confusables in another script for this char.
247 // TODO: we should change the data to have sets with just the single script
248 // bit for the script of this char. Gets rid of this special case.
249 // Until then, grab the script from the char and intersect it with the set.
250 UScriptCode cpScript
= uscript_getScript(c
, &status
);
251 U_ASSERT(cpScript
> USCRIPT_INHERITED
);
252 result
->intersect(cpScript
);
253 } else if (index
== 1) {
254 // Script == Common or Inherited. Nothing to do.
256 result
->intersect(fSpoofData
->fScriptSets
[index
]);
// Replaces the allowed-locales string and the allowed-characters set that is
// derived from it.
//   localesList  comma separated list of locale names; spaces around each name
//                are trimmed
//   status       in/out ICU error code
// When no locale was counted, the allowed-locales string becomes "", the
// allowed-characters set becomes U+0000..U+10FFFF and USPOOF_CHAR_LIMIT is
// cleared from fCheckMask.  Otherwise the characters of the Common and
// Inherited scripts are added to the accumulated set, a clone of that set and
// a copy of localesList are stored, and USPOOF_CHAR_LIMIT is set in fCheckMask.
262 void SpoofImpl::setAllowedLocales(const char *localesList
, UErrorCode
&status
) {
263 UnicodeSet allowedChars
;
264 UnicodeSet
*tmpSet
= NULL
;
265 const char *locStart
= localesList
;
266 const char *locEnd
= NULL
;
267 const char *localesListEnd
= localesList
+ uprv_strlen(localesList
);
268 int32_t localeListCount
= 0; // Number of locales provided by caller.
270 // Loop runs once per locale from the localesList, a comma separated list of locales.
272 locEnd
= uprv_strchr(locStart
, ',');
273 if (locEnd
== NULL
) {
274 locEnd
= localesListEnd
;
276 while (*locStart
== ' ') {
279 const char *trimmedEnd
= locEnd
-1;
280 while (trimmedEnd
> locStart
&& *trimmedEnd
== ' ') {
283 if (trimmedEnd
<= locStart
) {
// A private copy of the trimmed locale name; it is freed right after use below.
286 const char *locale
= uprv_strndup(locStart
, (int32_t)(trimmedEnd
+ 1 - locStart
));
289 // We have one locale from the locales list.
290 // Add the script chars for this locale to the accumulating set of allowed chars.
291 // If the locale is no good, we will be notified back via status.
292 addScriptChars(locale
, &allowedChars
, status
);
293 uprv_free((void *)locale
);
294 if (U_FAILURE(status
)) {
297 locStart
= locEnd
+ 1;
298 } while (locStart
< localesListEnd
);
300 // If our caller provided an empty list of locales, we disable the allowed characters checking
301 if (localeListCount
== 0) {
// NOTE(review): fAllowedLocales is freed and reassigned here before the
// allocation results are checked on the lines that follow.
302 uprv_free((void *)fAllowedLocales
);
303 fAllowedLocales
= uprv_strdup("");
304 tmpSet
= new UnicodeSet(0, 0x10ffff);
305 if (fAllowedLocales
== NULL
|| tmpSet
== NULL
) {
306 status
= U_MEMORY_ALLOCATION_ERROR
;
310 delete fAllowedCharsSet
;
311 fAllowedCharsSet
= tmpSet
;
312 fCheckMask
&= ~USPOOF_CHAR_LIMIT
;
317 // Add all common and inherited characters to the set of allowed chars.
319 tempSet
.applyIntPropertyValue(UCHAR_SCRIPT
, USCRIPT_COMMON
, status
);
320 allowedChars
.addAll(tempSet
);
321 tempSet
.applyIntPropertyValue(UCHAR_SCRIPT
, USCRIPT_INHERITED
, status
);
322 allowedChars
.addAll(tempSet
);
324 // If anything went wrong, we bail out without changing
325 // the state of the spoof checker.
326 if (U_FAILURE(status
)) {
330 // Store the updated spoof checker state.
331 tmpSet
= static_cast<UnicodeSet
*>(allowedChars
.clone());
332 const char *tmpLocalesList
= uprv_strdup(localesList
);
333 if (tmpSet
== NULL
|| tmpLocalesList
== NULL
) {
334 status
= U_MEMORY_ALLOCATION_ERROR
;
337 uprv_free((void *)fAllowedLocales
);
338 fAllowedLocales
= tmpLocalesList
;
340 delete fAllowedCharsSet
;
341 fAllowedCharsSet
= tmpSet
;
342 fCheckMask
|= USPOOF_CHAR_LIMIT
;
// Returns the stored allowed-locales string.  The pointer refers to memory
// owned by this object; the status parameter is unused.
346 const char * SpoofImpl::getAllowedLocales(UErrorCode
&/*status*/) {
347 return fAllowedLocales
;
351 // Given a locale (a language), add all the characters from all of the scripts used with that language
352 // to the allowedChars UnicodeSet
// Adds to *allowedChars every character whose script is one of the scripts
// that uscript_getCode() reports for locale.
//   locale        locale name to look up
//   allowedChars  set that receives the characters
//   status        in/out ICU error code; U_USING_DEFAULT_WARNING from the
//                 lookup is turned into U_ILLEGAL_ARGUMENT_ERROR
// At most 30 scripts per locale are requested (the size of the local array).
354 void SpoofImpl::addScriptChars(const char *locale
, UnicodeSet
*allowedChars
, UErrorCode
&status
) {
355 UScriptCode scripts
[30];
357 int32_t numScripts
= uscript_getCode(locale
, scripts
, sizeof(scripts
)/sizeof(UScriptCode
), &status
);
358 if (U_FAILURE(status
)) {
361 if (status
== U_USING_DEFAULT_WARNING
) {
362 status
= U_ILLEGAL_ARGUMENT_ERROR
;
367 for (i
=0; i
<numScripts
; i
++) {
368 tmpSet
.applyIntPropertyValue(UCHAR_SCRIPT
, scripts
[i
], status
);
369 allowedChars
->addAll(tmpSet
);
// Walks the code points of text (length UChars, or NUL-terminated when length
// is -1) and looks up the script of each with uscript_getScript().  The loop
// continues only while scriptCount is below 2.
// NOTE(review): the statements that update scriptCount, lastScript and pos are
// not visible here; presumably the function counts distinct scripts while
// ignoring Common, Inherited and Unknown - confirm.
374 int32_t SpoofImpl::scriptScan
375 (const UChar
*text
, int32_t length
, int32_t &pos
, UErrorCode
&status
) const {
376 if (U_FAILURE(status
)) {
379 int32_t inputIdx
= 0;
381 int32_t scriptCount
= 0;
382 UScriptCode lastScript
= USCRIPT_INVALID_CODE
;
383 UScriptCode sc
= USCRIPT_INVALID_CODE
;
384 while ((inputIdx
< length
|| length
== -1) && scriptCount
< 2) {
385 U16_NEXT(text
, inputIdx
, length
, c
);
// A NUL code point is tested for only in the NUL-terminated (length == -1) case.
386 if (c
== 0 && length
== -1) {
389 sc
= uscript_getScript(c
, &status
);
390 if (sc
== USCRIPT_COMMON
|| sc
== USCRIPT_INHERITED
|| sc
== USCRIPT_UNKNOWN
) {
393 if (sc
!= lastScript
) {
398 if (scriptCount
== 2) {
405 // Convert a text format hex number. Utility function used by builder code. Static.
406 // Input: UChar *string text. Output: a UChar32
407 // Input has been pre-checked, and will have no non-hex chars.
408 // The number must fall in the code point range of 0..0x10ffff
// Converts the hex digits s[start..limit) to a UChar32.
//   s             text holding the digits
//   start, limit  half-open index range of the digits; asserted to be non-empty
//   status        in/out ICU error code; set to U_PARSE_ERROR when the value
//                 exceeds 0x10ffff
// Digits '0'..'9', 'A'..'F' and 'a'..'f' are handled by subtracting 0x30, 0x41
// and 0x61 respectively.
410 UChar32
SpoofImpl::ScanHex(const UChar
*s
, int32_t start
, int32_t limit
, UErrorCode
&status
) {
411 if (U_FAILURE(status
)) {
414 U_ASSERT(limit
-start
> 0);
417 for (i
=start
; i
<limit
; i
++) {
418 int digitVal
= s
[i
] - 0x30;
420 digitVal
= 0xa + (s
[i
] - 0x41); // Upper Case 'A'
423 digitVal
= 0xa + (s
[i
] - 0x61); // Lower Case 'a'
425 U_ASSERT(digitVal
<= 0xf);
429 if (val
> 0x10ffff) {
430 status
= U_PARSE_ERROR
;
438 //----------------------------------------------------------------------------------------------
440 // class SpoofData Implementation
442 //----------------------------------------------------------------------------------------------
// Checks a spoof data header.  status is set to U_INVALID_FORMAT_ERROR when
// status already holds a failure, when the header's fMagic is not
// USPOOF_MAGIC, or when fFormatVersion[0] is above 1 or fFormatVersion[1] is
// above 0.
// NOTE(review): one more condition of the test, and the return statements,
// are not visible here.
445 UBool
SpoofData::validateDataVersion(const SpoofDataHeader
*rawData
, UErrorCode
&status
) {
446 if (U_FAILURE(status
) ||
448 rawData
->fMagic
!= USPOOF_MAGIC
||
449 rawData
->fFormatVersion
[0] > 1 ||
450 rawData
->fFormatVersion
[1] > 0) {
451 status
= U_INVALID_FORMAT_ERROR
;
458 // SpoofData::getDefault() - return a wrapper around the spoof data that is
459 // baked into the default ICU data.
// Opens the "confusables" item of type "cfu" with udata_open() and wraps it in
// a newly allocated SpoofData.
//   status  in/out ICU error code; U_MEMORY_ALLOCATION_ERROR is one of the
//           values assigned on failure
461 SpoofData
*SpoofData::getDefault(UErrorCode
&status
) {
462 // TODO: Cache it. Lazy create, keep until cleanup.
464 UDataMemory
*udm
= udata_open(NULL
, "cfu", "confusables", &status
);
465 if (U_FAILURE(status
)) {
468 SpoofData
*This
= new SpoofData(udm
, status
);
469 if (U_FAILURE(status
)) {
474 status
= U_MEMORY_ALLOCATION_ERROR
;
// Constructs a SpoofData over an opened UDataMemory.  fRawData is pointed just
// past the generic ICU data header (pHeader plus its headerSize), and the
// spoof header found there is checked with validateDataVersion().
480 SpoofData::SpoofData(UDataMemory
*udm
, UErrorCode
&status
)
483 if (U_FAILURE(status
)) {
486 fRawData
= reinterpret_cast<SpoofDataHeader
*>
487 ((char *)(udm
->pHeader
) + udm
->pHeader
->dataHeader
.headerSize
);
489 validateDataVersion(fRawData
, status
);
// Constructs a SpoofData over a caller-supplied block of memory.
//   data    start of the spoof data; treated as a SpoofDataHeader
//   length  number of bytes available at data
//   status  in/out ICU error code; set to U_INVALID_FORMAT_ERROR when length is
//           smaller than the header, or smaller than the header's fLength
494 SpoofData::SpoofData(const void *data
, int32_t length
, UErrorCode
&status
)
497 if (U_FAILURE(status
)) {
500 if ((size_t)length
< sizeof(SpoofDataHeader
)) {
501 status
= U_INVALID_FORMAT_ERROR
;
// Constness is cast away only so the pointer can be stored in fRawData.
504 void *ncData
= const_cast<void *>(data
);
505 fRawData
= static_cast<SpoofDataHeader
*>(ncData
);
506 if (length
< fRawData
->fLength
) {
507 status
= U_INVALID_FORMAT_ERROR
;
510 validateDataVersion(fRawData
, status
);
515 // Spoof Data constructor for use from data builder.
516 // Initializes a new, empty data area that will be populated later.
// Allocates a zero-filled block the size of a SpoofDataHeader (rounded up to a
// multiple of 16) with uprv_malloc(), records that size in fMemLimit, and
// fills in the magic number and format version 1.0.0.0.
//   status  in/out ICU error code; set to U_MEMORY_ALLOCATION_ERROR when the
//           allocation fails
517 SpoofData::SpoofData(UErrorCode
&status
) {
519 if (U_FAILURE(status
)) {
525 // The spoof header should already be sized to be a multiple of 16 bytes.
526 // Just in case it's not, round it up.
527 uint32_t initialSize
= (sizeof(SpoofDataHeader
) + 15) & ~15;
528 U_ASSERT(initialSize
== sizeof(SpoofDataHeader
));
530 fRawData
= static_cast<SpoofDataHeader
*>(uprv_malloc(initialSize
));
531 fMemLimit
= initialSize
;
532 if (fRawData
== NULL
) {
533 status
= U_MEMORY_ALLOCATION_ERROR
;
536 uprv_memset(fRawData
, 0, initialSize
);
538 fRawData
->fMagic
= USPOOF_MAGIC
;
539 fRawData
->fFormatVersion
[0] = 1;
540 fRawData
->fFormatVersion
[1] = 0;
541 fRawData
->fFormatVersion
[2] = 0;
542 fRawData
->fFormatVersion
[3] = 0;
546 // reset() - initialize all fields.
547 // Should be updated if any new fields are added.
548 // Called by constructors to put things in a known initial state.
// Puts the object's fields into their initial state.  The assignments visible
// here set fCFUStringLengths and fLowerCaseTrie to NULL.
549 void SpoofData::reset() {
557 fCFUStringLengths
= NULL
;
560 fLowerCaseTrie
= NULL
;
565 // SpoofData::initPtrs()
566 // Initialize the pointers to the various sections of the raw data.
568 // This function is used both during the Trie building process (multiple
569 // times, as the individual data sections are added), and
570 // during the opening of a Spoof Checker from prebuilt data.
572 // The pointers for non-existent data sections (identified by an offset of 0)
575 // Note: During building the data, adding each new data section
576 // reallocs the raw data area, which likely relocates it, which
577 // in turn requires reinitializing all of the pointers into it, hence
578 // multiple calls to this function during building.
// Recomputes the section pointers from the byte offsets stored in the raw data
// header.  Each section pointer is fRawData plus the corresponding offset, and
// is assigned only when that offset is non-zero.  The two tries are opened with
// utrie2_openFromSerialized() only while their pointer is still NULL, so
// repeated calls do not reopen them.
//   status  in/out ICU error code; also receives errors from opening the tries
580 void SpoofData::initPtrs(UErrorCode
&status
) {
583 fCFUStringLengths
= NULL
;
585 if (U_FAILURE(status
)) {
588 if (fRawData
->fCFUKeys
!= 0) {
589 fCFUKeys
= (int32_t *)((char *)fRawData
+ fRawData
->fCFUKeys
);
591 if (fRawData
->fCFUStringIndex
!= 0) {
592 fCFUValues
= (uint16_t *)((char *)fRawData
+ fRawData
->fCFUStringIndex
);
594 if (fRawData
->fCFUStringLengths
!= 0) {
595 fCFUStringLengths
= (SpoofStringLengthsElement
*)((char *)fRawData
+ fRawData
->fCFUStringLengths
);
597 if (fRawData
->fCFUStringTable
!= 0) {
598 fCFUStrings
= (UChar
*)((char *)fRawData
+ fRawData
->fCFUStringTable
);
601 if (fAnyCaseTrie
== NULL
&& fRawData
->fAnyCaseTrie
!= 0) {
602 fAnyCaseTrie
= utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS
,
603 (char *)fRawData
+ fRawData
->fAnyCaseTrie
, fRawData
->fAnyCaseTrieLength
, NULL
, &status
);
605 if (fLowerCaseTrie
== NULL
&& fRawData
->fLowerCaseTrie
!= 0) {
606 fLowerCaseTrie
= utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS
,
607 (char *)fRawData
+ fRawData
->fLowerCaseTrie
, fRawData
->fLowerCaseTrieLength
, NULL
, &status
);
610 if (fRawData
->fScriptSets
!= 0) {
611 fScriptSets
= (ScriptSet
*)((char *)fRawData
+ fRawData
->fScriptSets
);
// Destructor.  Closes both tries with utrie2_close(); fLowerCaseTrie is then
// set to NULL.
616 SpoofData::~SpoofData() {
617 utrie2_close(fAnyCaseTrie
);
619 utrie2_close(fLowerCaseTrie
);
620 fLowerCaseTrie
= NULL
;
// Atomically decrements fRefCount and takes a special action when the result
// is zero.
// NOTE(review): that action is not visible here; presumably the object deletes
// itself - confirm.
632 void SpoofData::removeReference() {
633 if (umtx_atomic_dec(&fRefCount
) == 0) {
// Atomically increments fRefCount.  The return type is SpoofData *, and the
// copy constructor of SpoofImpl stores the returned pointer.
639 SpoofData
*SpoofData::addReference() {
640 umtx_atomic_inc(&fRefCount
);
// Grows the raw data area by numBytes (rounded up to a multiple of 16) and
// returns a pointer to the new, zero-filled space.
//   numBytes  number of bytes wanted
//   status    in/out ICU error code; U_INTERNAL_PROGRAM_ERROR is assigned on
//             one path whose condition is not visible here
// The area is resized with uprv_realloc(), so it may move; the header's fLength
// is updated to the new total size.
645 void *SpoofData::reserveSpace(int32_t numBytes
, UErrorCode
&status
) {
646 if (U_FAILURE(status
)) {
651 status
= U_INTERNAL_PROGRAM_ERROR
;
655 numBytes
= (numBytes
+ 15) & ~15; // Round up to a multiple of 16
656 uint32_t returnOffset
= fMemLimit
;
657 fMemLimit
+= numBytes
;
// NOTE(review): the uprv_realloc() result is dereferenced by the next
// statement without a NULL check, and the old pointer is overwritten.
658 fRawData
= static_cast<SpoofDataHeader
*>(uprv_realloc(fRawData
, fMemLimit
));
659 fRawData
->fLength
= fMemLimit
;
660 uprv_memset((char *)fRawData
+ returnOffset
, 0, numBytes
);
662 return (char *)fRawData
+ returnOffset
;
666 //----------------------------------------------------------------------------
668 // ScriptSet implementation
670 //----------------------------------------------------------------------------
// Constructor.  Loops over every uint32_t word of the bits array.
// NOTE(review): the loop body is not visible here; presumably each word is
// cleared - confirm.
671 ScriptSet::ScriptSet() {
672 for (uint32_t i
=0; i
<sizeof(bits
)/sizeof(uint32_t); i
++) {
// Destructor for ScriptSet.
677 ScriptSet::~ScriptSet() {
// Compares this set with other one uint32_t word at a time, testing each pair
// of words for inequality.
680 UBool
ScriptSet::operator == (const ScriptSet
&other
) {
681 for (uint32_t i
=0; i
<sizeof(bits
)/sizeof(uint32_t); i
++) {
682 if (bits
[i
] != other
.bits
[i
]) {
689 void ScriptSet::Union(UScriptCode script
) {
690 uint32_t index
= script
/ 32;
691 uint32_t bit
= 1 << (script
& 31);
692 U_ASSERT(index
< sizeof(bits
)*4);
697 void ScriptSet::Union(const ScriptSet
&other
) {
698 for (uint32_t i
=0; i
<sizeof(bits
)/sizeof(uint32_t); i
++) {
699 bits
[i
] |= other
.bits
[i
];
703 void ScriptSet::intersect(const ScriptSet
&other
) {
704 for (uint32_t i
=0; i
<sizeof(bits
)/sizeof(uint32_t); i
++) {
705 bits
[i
] &= other
.bits
[i
];
709 void ScriptSet::intersect(UScriptCode script
) {
710 uint32_t index
= script
/ 32;
711 uint32_t bit
= 1 << (script
& 31);
712 U_ASSERT(index
< sizeof(bits
)*4);
714 for (i
=0; i
<index
; i
++) {
718 for (i
=index
+1; i
<sizeof(bits
)/sizeof(uint32_t); i
++) {
724 ScriptSet
& ScriptSet::operator =(const ScriptSet
&other
) {
725 for (uint32_t i
=0; i
<sizeof(bits
)/sizeof(uint32_t); i
++) {
726 bits
[i
] = other
.bits
[i
];
732 void ScriptSet::setAll() {
733 for (uint32_t i
=0; i
<sizeof(bits
)/sizeof(uint32_t); i
++) {
734 bits
[i
] = 0xffffffffu
;
// Loops over every uint32_t word of the bits array.
// NOTE(review): the loop body is not visible here; presumably each word is
// cleared, the opposite of setAll() - confirm.
739 void ScriptSet::resetAll() {
740 for (uint32_t i
=0; i
<sizeof(bits
)/sizeof(uint32_t); i
++) {
// Counts the bits that are set across all words of bits.  For each word, the
// expression x &= (x - 1) removes the lowest set bit, so the work done is
// proportional to the number of set bits rather than to the word width.
745 int32_t ScriptSet::countMembers() {
746 // This bit counter is good for sparse numbers of '1's, which is
747 // very much the case that we will usually have.
749 for (uint32_t i
=0; i
<sizeof(bits
)/sizeof(uint32_t); i
++) {
750 uint32_t x
= bits
[i
];
753 x
&= (x
- 1); // and off the least significant one bit.
761 //-----------------------------------------------------------------------------
763 // NFDBuffer Implementation.
765 //-----------------------------------------------------------------------------
// Normalizes text to NFD with unorm_normalize() and keeps the result.
//   text, length  the input string; the pointer is also saved in fOriginalText
//   status        in/out ICU error code; set to U_MEMORY_ALLOCATION_ERROR when
//                 the heap buffer cannot be allocated
// The first attempt writes into the member buffer fSmallBuf
// (USPOOF_STACK_BUFFER_SIZE UChars).  On U_BUFFER_OVERFLOW_ERROR the status is
// reset, a buffer of the reported length plus one is allocated with
// uprv_malloc(), and the normalization is run again into it.
767 NFDBuffer::NFDBuffer(const UChar
*text
, int32_t length
, UErrorCode
&status
) {
768 fNormalizedText
= NULL
;
769 fNormalizedTextLength
= 0;
770 fOriginalText
= text
;
771 if (U_FAILURE(status
)) {
774 fNormalizedText
= fSmallBuf
;
775 fNormalizedTextLength
= unorm_normalize(
776 text
, length
, UNORM_NFD
, 0, fNormalizedText
, USPOOF_STACK_BUFFER_SIZE
, &status
);
777 if (status
== U_BUFFER_OVERFLOW_ERROR
) {
778 status
= U_ZERO_ERROR
;
779 fNormalizedText
= (UChar
*)uprv_malloc((fNormalizedTextLength
+1)*sizeof(UChar
));
780 if (fNormalizedText
== NULL
) {
781 status
= U_MEMORY_ALLOCATION_ERROR
;
783 fNormalizedTextLength
= unorm_normalize(text
, length
, UNORM_NFD
, 0,
784 fNormalizedText
, fNormalizedTextLength
+1, &status
);
// Destructor.  Frees the normalized text only when it does not point at the
// member buffer fSmallBuf, i.e. when it was allocated with uprv_malloc().
790 NFDBuffer::~NFDBuffer() {
791 if (fNormalizedText
!= fSmallBuf
) {
792 uprv_free(fNormalizedText
);
// Returns the pointer to the normalized text held by this object.
797 const UChar
*NFDBuffer::getBuffer() {
798 return fNormalizedText
;
// Returns the length that unorm_normalize() reported for the normalized text.
801 int32_t NFDBuffer::getLength() {
802 return fNormalizedTextLength
;
813 //-----------------------------------------------------------------------------
815 // uspoof_swap - byte swap and char encoding swap of spoof data
817 //-----------------------------------------------------------------------------
// Byte-swaps a spoof data item for a platform with different endianness.
//   ds       swapper describing the input and output platforms
//   inData   start of the input item, beginning with the generic ICU data header
//   length   number of input bytes; values below -1 are rejected
//   outData  output buffer; must be non-NULL when length is positive
//   status   in/out ICU error code; must not be NULL
// The item is accepted only when its data format is "Cfu " with format
// version 1 and the spoof header carries USPOOF_MAGIC and a plausible length.
// Each data section is swapped first and the spoof header last, because the
// header is needed to locate the sections.
818 U_CAPI
int32_t U_EXPORT2
819 uspoof_swap(const UDataSwapper
*ds
, const void *inData
, int32_t length
, void *outData
,
820 UErrorCode
*status
) {
822 if (status
== NULL
|| U_FAILURE(*status
)) {
825 if(ds
==NULL
|| inData
==NULL
|| length
<-1 || (length
>0 && outData
==NULL
)) {
826 *status
=U_ILLEGAL_ARGUMENT_ERROR
;
831 // Check that the data header is for spoof data.
832 // (Header contents are defined in gencfu.cpp)
834 const UDataInfo
*pInfo
= (const UDataInfo
*)((const char *)inData
+4);
835 if(!( pInfo
->dataFormat
[0]==0x43 && /* dataFormat="Cfu " */
836 pInfo
->dataFormat
[1]==0x66 &&
837 pInfo
->dataFormat
[2]==0x75 &&
838 pInfo
->dataFormat
[3]==0x20 &&
839 pInfo
->formatVersion
[0]==1 )) {
840 udata_printError(ds
, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
841 "(format version %02x %02x %02x %02x) is not recognized\n",
842 pInfo
->dataFormat
[0], pInfo
->dataFormat
[1],
843 pInfo
->dataFormat
[2], pInfo
->dataFormat
[3],
844 pInfo
->formatVersion
[0], pInfo
->formatVersion
[1],
845 pInfo
->formatVersion
[2], pInfo
->formatVersion
[3]);
846 *status
=U_UNSUPPORTED_ERROR
;
851 // Swap the data header. (This is the generic ICU Data Header, not the uspoof Specific
852 // header). This swap also conveniently gets us
853 // the size of the ICU d.h., which lets us locate the start
854 // of the uspoof specific data.
856 int32_t headerSize
=udata_swapDataHeader(ds
, inData
, length
, outData
, status
);
860 // Get the Spoof Data Header, and check that it appears to be OK.
863 const uint8_t *inBytes
=(const uint8_t *)inData
+headerSize
;
864 SpoofDataHeader
*spoofDH
= (SpoofDataHeader
*)inBytes
;
865 if (ds
->readUInt32(spoofDH
->fMagic
) != USPOOF_MAGIC
||
866 ds
->readUInt32(spoofDH
->fLength
) < sizeof(SpoofDataHeader
))
868 udata_printError(ds
, "uspoof_swap(): Spoof Data header is invalid.\n");
869 *status
=U_UNSUPPORTED_ERROR
;
874 // Preflight operation? Just return the size
876 int32_t spoofDataLength
= ds
->readUInt32(spoofDH
->fLength
);
877 int32_t totalSize
= headerSize
+ spoofDataLength
;
883 // Check that length passed in is consistent with length from Spoof data header.
885 if (length
< totalSize
) {
886 udata_printError(ds
, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n",
888 *status
=U_INDEX_OUTOFBOUNDS_ERROR
;
894 // Swap the Data. Do the data itself first, then the Spoof Data Header, because
895 // we need to reference the header to locate the data, and an
896 // inplace swap of the header leaves it unusable.
898 uint8_t *outBytes
= (uint8_t *)outData
+ headerSize
;
899 SpoofDataHeader
*outputDH
= (SpoofDataHeader
*)outBytes
;
901 int32_t sectionStart
;
902 int32_t sectionLength
;
905 // If not swapping in place, zero out the output buffer before starting.
906 // Gaps may exist between the individual sections, and these must be zeroed in
907 // the output buffer. The simplest way to do that is to just zero the whole thing.
909 if (inBytes
!= outBytes
) {
910 uprv_memset(outBytes
, 0, spoofDataLength
);
913 // Confusables Keys Section (fCFUKeys)
914 sectionStart
= ds
->readUInt32(spoofDH
->fCFUKeys
);
915 sectionLength
= ds
->readUInt32(spoofDH
->fCFUKeysSize
) * 4;
916 ds
->swapArray32(ds
, inBytes
+sectionStart
, sectionLength
, outBytes
+sectionStart
, status
);
918 // String Index Section
919 sectionStart
= ds
->readUInt32(spoofDH
->fCFUStringIndex
);
920 sectionLength
= ds
->readUInt32(spoofDH
->fCFUStringIndexSize
) * 2;
921 ds
->swapArray16(ds
, inBytes
+sectionStart
, sectionLength
, outBytes
+sectionStart
, status
);
923 // String Table Section
924 sectionStart
= ds
->readUInt32(spoofDH
->fCFUStringTable
);
925 sectionLength
= ds
->readUInt32(spoofDH
->fCFUStringTableLen
) * 2;
926 ds
->swapArray16(ds
, inBytes
+sectionStart
, sectionLength
, outBytes
+sectionStart
, status
);
928 // String Lengths Section
929 sectionStart
= ds
->readUInt32(spoofDH
->fCFUStringLengths
);
930 sectionLength
= ds
->readUInt32(spoofDH
->fCFUStringLengthsSize
) * 4;
931 ds
->swapArray16(ds
, inBytes
+sectionStart
, sectionLength
, outBytes
+sectionStart
, status
);
934 sectionStart
= ds
->readUInt32(spoofDH
->fAnyCaseTrie
);
935 sectionLength
= ds
->readUInt32(spoofDH
->fAnyCaseTrieLength
);
936 utrie2_swap(ds
, inBytes
+sectionStart
, sectionLength
, outBytes
+sectionStart
, status
);
939 sectionStart
= ds
->readUInt32(spoofDH
->fLowerCaseTrie
);
940 sectionLength
= ds
->readUInt32(spoofDH
->fLowerCaseTrieLength
);
941 utrie2_swap(ds
, inBytes
+sectionStart
, sectionLength
, outBytes
+sectionStart
, status
);
943 // Script Sets. The data is an array of int32_t
944 sectionStart
= ds
->readUInt32(spoofDH
->fScriptSets
);
945 sectionLength
= ds
->readUInt32(spoofDH
->fScriptSetsLength
) * sizeof(ScriptSet
);
946 ds
->swapArray32(ds
, inBytes
+sectionStart
, sectionLength
, outBytes
+sectionStart
, status
);
948 // And, last, swap the header itself.
949 // int32_t fMagic // swap this
950 // uint8_t fFormatVersion[4] // Do not swap this, just copy
951 // int32_t fLength and all the rest // Swap the rest, all is 32 bit stuff.
953 uint32_t magic
= ds
->readUInt32(spoofDH
->fMagic
);
954 ds
->writeUInt32((uint32_t *)&outputDH
->fMagic
, magic
);
955 uprv_memcpy(outputDH
->fFormatVersion
, spoofDH
->fFormatVersion
, sizeof(spoofDH
->fFormatVersion
));
956 // swap starting at fLength
957 ds
->swapArray32(ds
, &spoofDH
->fLength
, sizeof(SpoofDataHeader
)-8 /* minus magic and fFormatVersion[4] */, &outputDH
->fLength
, status
);