2 ***************************************************************************
3 * Copyright (C) 2008-2013, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 ***************************************************************************
6 * file name: uspoof.cpp
8 * tab size: 8 (not used)
11 * created on: 2008Feb13
12 * created by: Andy Heninger
14 * Unicode Spoof Detection
16 #include "unicode/utypes.h"
17 #include "unicode/normalizer2.h"
18 #include "unicode/uspoof.h"
19 #include "unicode/ustring.h"
20 #include "unicode/utf16.h"
23 #include "identifier_info.h"
25 #include "scriptset.h"
28 #include "uspoof_impl.h"
32 #if !UCONFIG_NO_NORMALIZATION
38 // Static Objects used by the spoof impl, their thread safe initialization and their cleanup.
// Set that initializeStatics() builds from a UnicodeSet pattern; handed out by
// uspoof_getInclusionSet() and uspoof_getInclusionUnicodeSet().
40 static UnicodeSet
*gInclusionSet
= NULL
;
// Set that initializeStatics() builds from a UnicodeSet pattern; handed out by
// uspoof_getRecommendedSet() and uspoof_getRecommendedUnicodeSet().
41 static UnicodeSet
*gRecommendedSet
= NULL
;
// NFD normalizer fetched with Normalizer2::getNFDInstance() in initializeStatics().
// uspoof_cleanup() only resets this pointer to NULL; it does not delete it.
42 static const Normalizer2
*gNfdNormalizer
= NULL
;
// NOTE(review): no use of this mutex is visible in this listing; presumably it
// serializes initializeStatics() -- confirm against the full file.
43 static UMutex gInitMutex
= U_MUTEX_INITIALIZER
;
// Cleanup callback that initializeStatics() registers through
// ucln_i18n_registerCleanup(UCLN_I18N_SPOOF, ...).
// Visible effect: deletes gRecommendedSet, then resets gRecommendedSet and
// gNfdNormalizer to NULL.
// NOTE(review): the embedded numbering jumps from 46 to 49 and stops at 51, so
// the treatment of gInclusionSet and the UBool return value are not visible in
// this listing -- confirm against the full file.
45 static UBool U_CALLCONV
46 uspoof_cleanup(void) {
49 delete gRecommendedSet
;
50 gRecommendedSet
= NULL
;
// The normalizer is not deleted here, only forgotten.
51 gNfdNormalizer
= NULL
;
// One-time setup of the file-scope statics: builds gInclusionSet and
// gRecommendedSet from hard-coded UnicodeSet patterns, fetches the NFD
// normalizer, and registers uspoof_cleanup() as the cleanup callback.
// Errors are collected in a local UErrorCode that is not examined in the
// visible code.
// NOTE(review): embedded line 56 is missing from this listing; no locking of
// gInitMutex is visible here -- confirm against the full file.
55 static void initializeStatics() {
57 UErrorCode status
= U_ZERO_ERROR
;
// gInclusionSet doubles as the "already initialized" flag for all three statics.
58 if (gInclusionSet
== NULL
) {
59 gInclusionSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\
60 \\-.\\u00B7\\u05F3\\u05F4\\u0F0B\\u200C\\u200D\\u2019]"), status
);
// The pattern below is a literal range list minus [:Cn:], [:nfkcqc=n:] and [:XIDC=n:].
// NOTE(review): the embedded numbering jumps from 82 to 84 inside the literal,
// so one line of the pattern appears to be missing from this listing.
61 gRecommendedSet
= new UnicodeSet(UNICODE_STRING_SIMPLE("[\
62 [0-z\\u00C0-\\u017E\\u01A0\\u01A1\\u01AF\\u01B0\\u01CD-\
63 \\u01DC\\u01DE-\\u01E3\\u01E6-\\u01F5\\u01F8-\\u021B\\u021E\
64 \\u021F\\u0226-\\u0233\\u02BB\\u02BC\\u02EC\\u0300-\\u0304\
65 \\u0306-\\u030C\\u030F-\\u0311\\u0313\\u0314\\u031B\\u0323-\
66 \\u0328\\u032D\\u032E\\u0330\\u0331\\u0335\\u0338\\u0339\
67 \\u0342-\\u0345\\u037B-\\u03CE\\u03FC-\\u045F\\u048A-\\u0525\
68 \\u0531-\\u0586\\u05D0-\\u05F2\\u0621-\\u063F\\u0641-\\u0655\
69 \\u0660-\\u0669\\u0670-\\u068D\\u068F-\\u06D5\\u06E5\\u06E6\
70 \\u06EE-\\u06FF\\u0750-\\u07B1\\u0901-\\u0939\\u093C-\\u094D\
71 \\u0950\\u0960-\\u0972\\u0979-\\u0A4D\\u0A5C-\\u0A74\\u0A81-\
72 \\u0B43\\u0B47-\\u0B61\\u0B66-\\u0C56\\u0C60\\u0C61\\u0C66-\
73 \\u0CD6\\u0CE0-\\u0CEF\\u0D02-\\u0D28\\u0D2A-\\u0D39\\u0D3D-\
74 \\u0D43\\u0D46-\\u0D4D\\u0D57-\\u0D61\\u0D66-\\u0D8E\\u0D91-\
75 \\u0DA5\\u0DA7-\\u0DDE\\u0DF2\\u0E01-\\u0ED9\\u0F00\\u0F20-\
76 \\u0F8B\\u0F90-\\u109D\\u10D0-\\u10F0\\u10F7-\\u10FA\\u1200-\
77 \\u135A\\u135F\\u1380-\\u138F\\u1401-\\u167F\\u1780-\\u17A2\
78 \\u17A5-\\u17A7\\u17A9-\\u17B3\\u17B6-\\u17CA\\u17D2\\u17D7-\
79 \\u17DC\\u17E0-\\u17E9\\u1810-\\u18A8\\u18AA-\\u18F5\\u1E00-\
80 \\u1E99\\u1F00-\\u1FFC\\u2D30-\\u2D65\\u2D80-\\u2DDE\\u3005-\
81 \\u3007\\u3041-\\u31B7\\u3400-\\u9FCB\\uA000-\\uA48C\\uA67F\
82 \\uA717-\\uA71F\\uA788\\uAA60-\\uAA7B\\uAC00-\\uD7A3\\uFA0E-\
84 \\U0002B734]-[[:Cn:][:nfkcqc=n:][:XIDC=n:]]]"), status
);
85 gNfdNormalizer
= Normalizer2::getNFDInstance(status
);
// Register the callback that tears the statics down again.
87 ucln_i18n_registerCleanup(UCLN_I18N_SPOOF
, uspoof_cleanup
);
// Creates a spoof checker backed by the default spoof data
// (SpoofData::getDefault) and returns it as an opaque USpoofChecker pointer.
//   status - ICU in/out error code; tested on entry and again after the
//            SpoofImpl has been constructed.
// NOTE(review): the embedded numbering skips 96-98 and 101-103, so the bodies
// of both U_FAILURE branches are not visible in this listing.
93 U_CAPI USpoofChecker
* U_EXPORT2
94 uspoof_open(UErrorCode
*status
) {
95 if (U_FAILURE(*status
)) {
99 SpoofImpl
*si
= new SpoofImpl(SpoofData::getDefault(*status
), *status
);
100 if (U_FAILURE(*status
)) {
// The implementation object is exposed to C callers as an opaque handle.
104 return reinterpret_cast<USpoofChecker
*>(si
);
// Creates a spoof checker from a serialized data image.
//   data, length  - passed to the SpoofData constructor.
//   pActualLength - optional out-parameter; when non-NULL it receives
//                   sd->fRawData->fLength.
//   status        - ICU in/out error code.
// Returns the new SpoofImpl as an opaque USpoofChecker pointer.
// NOTE(review): sd and si are compared against NULL only after sd has already
// been passed to the SpoofImpl constructor and after the U_FAILURE test;
// checking sd right after its allocation would be safer.
// NOTE(review): the branch bodies (embedded lines 112-114, 118-121, 124-128)
// are not visible in this listing.
108 U_CAPI USpoofChecker
* U_EXPORT2
109 uspoof_openFromSerialized(const void *data
, int32_t length
, int32_t *pActualLength
,
110 UErrorCode
*status
) {
111 if (U_FAILURE(*status
)) {
115 SpoofData
*sd
= new SpoofData(data
, length
, *status
);
116 SpoofImpl
*si
= new SpoofImpl(sd
, *status
);
117 if (U_FAILURE(*status
)) {
// Either allocation having produced NULL is reported as an out-of-memory error.
122 if (sd
== NULL
|| si
== NULL
) {
123 *status
= U_MEMORY_ALLOCATION_ERROR
;
// Report the length recorded in the raw data header back to the caller.
129 if (pActualLength
!= NULL
) {
130 *pActualLength
= sd
->fRawData
->fLength
;
132 return reinterpret_cast<USpoofChecker
*>(si
);
// Returns a copy of an existing spoof checker, made with the SpoofImpl copy
// constructor, as an opaque USpoofChecker pointer.
//   sc     - checker to copy; resolved through SpoofImpl::validateThis().
//   status - ICU in/out error code; tested after the copy is constructed.
// NOTE(review): embedded lines 139-141 and 144-146 are missing from this
// listing, so the handling of a failed validateThis() and of a failed copy is
// not visible here.
136 U_CAPI USpoofChecker
* U_EXPORT2
137 uspoof_clone(const USpoofChecker
*sc
, UErrorCode
*status
) {
138 const SpoofImpl
*src
= SpoofImpl::validateThis(sc
, *status
);
142 SpoofImpl
*result
= new SpoofImpl(*src
, *status
); // copy constructor
143 if (U_FAILURE(*status
)) {
147 return reinterpret_cast<USpoofChecker
*>(result
);
// Closes a spoof checker. The function has no status parameter, so a local
// UErrorCode is used for the SpoofImpl::validateThis() call.
// NOTE(review): the statements after validateThis() (embedded lines 155-157)
// are missing from this listing; presumably they release `This` -- confirm.
151 U_CAPI
void U_EXPORT2
152 uspoof_close(USpoofChecker
*sc
) {
153 UErrorCode status
= U_ZERO_ERROR
;
154 SpoofImpl
*This
= SpoofImpl::validateThis(sc
, status
);
// Sets the bit mask of checks the checker will perform.
//   sc     - checker to modify; resolved through SpoofImpl::validateThis().
//   checks - bit set of USPOOF_* check flags; any bit outside
//            USPOOF_ALL_CHECKS sets *status to U_ILLEGAL_ARGUMENT_ERROR.
//   status - ICU in/out error code.
// NOTE(review): embedded lines 162-165 and 170-172 are missing from this
// listing, so the early exits are not visible here.
159 U_CAPI
void U_EXPORT2
160 uspoof_setChecks(USpoofChecker
*sc
, int32_t checks
, UErrorCode
*status
) {
161 SpoofImpl
*This
= SpoofImpl::validateThis(sc
, *status
);
166 // Verify that the requested checks are all ones (bits) that
167 // are acceptable, known values.
168 if (checks
& ~USPOOF_ALL_CHECKS
) {
169 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
173 This
->fChecks
= checks
;
// Returns the checker's current check mask (This->fChecks).
//   sc     - checker to query; resolved through SpoofImpl::validateThis().
//   status - ICU in/out error code.
// NOTE(review): embedded lines 180-182 are missing from this listing, so the
// value returned when validation fails is not visible here.
177 U_CAPI
int32_t U_EXPORT2
178 uspoof_getChecks(const USpoofChecker
*sc
, UErrorCode
*status
) {
179 const SpoofImpl
*This
= SpoofImpl::validateThis(sc
, *status
);
183 return This
->fChecks
;
// Stores restrictionLevel in the checker's fRestrictionLevel field.
// The function has no status parameter, so a local UErrorCode is used for the
// SpoofImpl::validateThis() call.
// NOTE(review): embedded line 190 is missing from this listing; presumably it
// guards the assignment on the validation result -- confirm.
186 U_CAPI
void U_EXPORT2
187 uspoof_setRestrictionLevel(USpoofChecker
*sc
, URestrictionLevel restrictionLevel
) {
188 UErrorCode status
= U_ZERO_ERROR
;
189 SpoofImpl
*This
= SpoofImpl::validateThis(sc
, status
);
191 This
->fRestrictionLevel
= restrictionLevel
;
// Returns the checker's fRestrictionLevel; a second return path yields
// USPOOF_UNRESTRICTIVE.
// The function has no status parameter, so a local UErrorCode is used for the
// SpoofImpl::validateThis() call.
// NOTE(review): embedded lines 199 and 201 are missing from this listing;
// presumably the USPOOF_UNRESTRICTIVE return is taken only when validation
// fails -- confirm.
195 U_CAPI URestrictionLevel U_EXPORT2
196 uspoof_getRestrictionLevel(const USpoofChecker
*sc
) {
197 UErrorCode status
= U_ZERO_ERROR
;
198 const SpoofImpl
*This
= SpoofImpl::validateThis(sc
, status
);
200 return USPOOF_UNRESTRICTIVE
;
202 return This
->fRestrictionLevel
;
// Forwards localesList to SpoofImpl::setAllowedLocales() on the validated
// checker.
//   sc          - checker to modify; resolved through SpoofImpl::validateThis().
//   localesList - passed through unchanged.
//   status      - ICU in/out error code, shared with the callee.
// NOTE(review): embedded lines 208-210 are missing from this listing, so the
// early exit on failed validation is not visible here.
205 U_CAPI
void U_EXPORT2
206 uspoof_setAllowedLocales(USpoofChecker
*sc
, const char *localesList
, UErrorCode
*status
) {
207 SpoofImpl
*This
= SpoofImpl::validateThis(sc
, *status
);
211 This
->setAllowedLocales(localesList
, *status
);
// Returns whatever SpoofImpl::getAllowedLocales() reports for the validated
// checker.
//   sc     - checker to query; resolved through SpoofImpl::validateThis().
//   status - ICU in/out error code, shared with the callee.
// NOTE(review): embedded lines 217-219 are missing from this listing, so the
// value returned when validation fails is not visible here.
214 U_CAPI
const char * U_EXPORT2
215 uspoof_getAllowedLocales(USpoofChecker
*sc
, UErrorCode
*status
) {
216 SpoofImpl
*This
= SpoofImpl::validateThis(sc
, *status
);
220 return This
->getAllowedLocales(*status
);
224 U_CAPI
const USet
* U_EXPORT2
225 uspoof_getAllowedChars(const USpoofChecker
*sc
, UErrorCode
*status
) {
226 const UnicodeSet
*result
= uspoof_getAllowedUnicodeSet(sc
, status
);
227 return result
->toUSet();
// Returns the checker's fAllowedCharsSet pointer (the C++ counterpart of
// uspoof_getAllowedChars()).
//   sc     - checker to query; resolved through SpoofImpl::validateThis().
//   status - ICU in/out error code.
// NOTE(review): embedded lines 233-235 are missing from this listing, so the
// value returned when validation fails is not visible here.
230 U_CAPI
const UnicodeSet
* U_EXPORT2
231 uspoof_getAllowedUnicodeSet(const USpoofChecker
*sc
, UErrorCode
*status
) {
232 const SpoofImpl
*This
= SpoofImpl::validateThis(sc
, *status
);
236 return This
->fAllowedCharsSet
;
240 U_CAPI
void U_EXPORT2
241 uspoof_setAllowedChars(USpoofChecker
*sc
, const USet
*chars
, UErrorCode
*status
) {
242 const UnicodeSet
*set
= UnicodeSet::fromUSet(chars
);
243 uspoof_setAllowedUnicodeSet(sc
, set
, status
);
// Replaces the checker's allowed-characters set with a clone of `chars` and
// switches the USPOOF_CHAR_LIMIT check on.
//   sc     - checker to modify; resolved through SpoofImpl::validateThis().
//   chars  - set to copy; a bogus set sets *status to U_ILLEGAL_ARGUMENT_ERROR.
//   status - ICU in/out error code; U_MEMORY_ALLOCATION_ERROR is set when the
//            clone is NULL or bogus.
// NOTE(review): embedded lines 250-252, 255-256 and 260-262 are missing from
// this listing, so the early exits (and any cleanup of a bogus clone) are not
// visible here.
247 U_CAPI
void U_EXPORT2
248 uspoof_setAllowedUnicodeSet(USpoofChecker
*sc
, const UnicodeSet
*chars
, UErrorCode
*status
) {
249 SpoofImpl
*This
= SpoofImpl::validateThis(sc
, *status
);
253 if (chars
->isBogus()) {
254 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
// Work on a private copy so the caller's set is not retained.
257 UnicodeSet
*clonedSet
= static_cast<UnicodeSet
*>(chars
->clone());
258 if (clonedSet
== NULL
|| clonedSet
->isBogus()) {
259 *status
= U_MEMORY_ALLOCATION_ERROR
;
// Drop the previous set, install the copy, and enable the character-limit check.
263 delete This
->fAllowedCharsSet
;
264 This
->fAllowedCharsSet
= clonedSet
;
265 This
->fChecks
|= USPOOF_CHAR_LIMIT
;
// UTF-16 entry point for checking an identifier: wraps `id` in a UnicodeString
// and delegates to uspoof_checkUnicodeString().
//   sc     - checker to use; resolved through SpoofImpl::validateThis().
//   id     - UTF-16 text; length == -1 selects the NUL-terminated form of the
//            aliasing UnicodeString constructor.
//   length - number of UChars in id, or -1.
//   status - ICU in/out error code; an U_ILLEGAL_ARGUMENT_ERROR assignment is
//            visible, but its condition is not.
// NOTE(review): the `position` argument forwarded below is not declared in the
// visible parameter list (embedded line 272 is missing), and the return
// statement is also absent from this listing.
269 U_CAPI
int32_t U_EXPORT2
270 uspoof_check(const USpoofChecker
*sc
,
271 const UChar
*id
, int32_t length
,
273 UErrorCode
*status
) {
275 const SpoofImpl
*This
= SpoofImpl::validateThis(sc
, *status
);
280 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
283 UnicodeString
idStr((length
== -1), id
, length
); // Aliasing constructor.
284 int32_t result
= uspoof_checkUnicodeString(sc
, idStr
, position
, status
);
// UTF-8 entry point for checking an identifier: converts `id` to a
// UnicodeString and delegates to uspoof_checkUnicodeString().
//   sc     - checker to use.
//   id     - UTF-8 text.
//   length - byte length of id; a negative value makes the code use
//            uprv_strlen(id) instead.
//   status - ICU in/out error code; tested on entry.
// NOTE(review): the `position` argument forwarded below is not declared in the
// visible parameter list (embedded line 292 is missing), and the return
// statement is also absent from this listing.
289 U_CAPI
int32_t U_EXPORT2
290 uspoof_checkUTF8(const USpoofChecker
*sc
,
291 const char *id
, int32_t length
,
293 UErrorCode
*status
) {
295 if (U_FAILURE(*status
)) {
298 UnicodeString idStr
= UnicodeString::fromUTF8(StringPiece(id
, length
>=0 ? length
: uprv_strlen(id
)));
299 int32_t result
= uspoof_checkUnicodeString(sc
, idStr
, position
, status
);
// UTF-16 entry point for the confusability test: wraps both inputs in
// UnicodeStrings and delegates to uspoof_areConfusableUnicodeString().
//   sc               - checker to use; validated via SpoofImpl::validateThis().
//   id1, length1     - first identifier; length -1 means NUL-terminated.
//   id2, length2     - second identifier; length -1 means NUL-terminated.
//   status           - ICU in/out error code; a length below -1 sets
//                      U_ILLEGAL_ARGUMENT_ERROR.
// Returns the value produced by uspoof_areConfusableUnicodeString().
// NOTE(review): embedded lines 311-312 and 315-317 are missing from this
// listing, so the early-exit return values are not visible here.
304 U_CAPI
int32_t U_EXPORT2
305 uspoof_areConfusable(const USpoofChecker
*sc
,
306 const UChar
*id1
, int32_t length1
,
307 const UChar
*id2
, int32_t length2
,
308 UErrorCode
*status
) {
309 SpoofImpl::validateThis(sc
, *status
);
310 if (U_FAILURE(*status
)) {
313 if (length1
< -1 || length2
< -1) {
314 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
318 UnicodeString
id1Str((length1
==-1), id1
, length1
); // Aliasing constructor
319 UnicodeString
id2Str((length2
==-1), id2
, length2
); // Aliasing constructor
320 return uspoof_areConfusableUnicodeString(sc
, id1Str
, id2Str
, status
);
// UTF-8 entry point for the confusability test: converts both inputs to
// UnicodeStrings and delegates to uspoof_areConfusableUnicodeString().
//   sc               - checker to use; validated via SpoofImpl::validateThis().
//   id1, length1     - first identifier; a negative length makes the code use
//                      uprv_strlen(id1).
//   id2, length2     - second identifier; same length convention.
//   status           - ICU in/out error code; a length below -1 sets
//                      U_ILLEGAL_ARGUMENT_ERROR.
// NOTE(review): embedded lines 331-332, 335-336 and 340-342 are missing from
// this listing, so the early exits and the final return are not visible here.
324 U_CAPI
int32_t U_EXPORT2
325 uspoof_areConfusableUTF8(const USpoofChecker
*sc
,
326 const char *id1
, int32_t length1
,
327 const char *id2
, int32_t length2
,
328 UErrorCode
*status
) {
329 SpoofImpl::validateThis(sc
, *status
);
330 if (U_FAILURE(*status
)) {
333 if (length1
< -1 || length2
< -1) {
334 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
337 UnicodeString id1Str
= UnicodeString::fromUTF8(StringPiece(id1
, length1
>=0? length1
: uprv_strlen(id1
)));
338 UnicodeString id2Str
= UnicodeString::fromUTF8(StringPiece(id2
, length2
>=0? length2
: uprv_strlen(id2
)));
339 int32_t results
= uspoof_areConfusableUnicodeString(sc
, id1Str
, id2Str
, status
);
// Core confusability test for two identifiers.
//   sc       - checker to use; resolved through SpoofImpl::validateThis().
//   id1, id2 - the identifiers to compare.
//   status   - ICU in/out error code; set to U_INVALID_STATE_ERROR when none
//              of the three *_CONFUSABLE checks is enabled in This->fChecks.
// The result is a bit set built from USPOOF_SINGLE_SCRIPT_CONFUSABLE,
// USPOOF_MIXED_SCRIPT_CONFUSABLE and USPOOF_WHOLE_SCRIPT_CONFUSABLE. Each test
// compares the skeletons of the two identifiers, as produced by
// uspoof_getSkeletonUnicodeString(), for equality.
// NOTE(review): the declaration of `result`, the early-exit bodies and the
// final return are missing from this listing (embedded numbering has gaps).
344 U_CAPI
int32_t U_EXPORT2
345 uspoof_areConfusableUnicodeString(const USpoofChecker
*sc
,
346 const icu::UnicodeString
&id1
,
347 const icu::UnicodeString
&id2
,
348 UErrorCode
*status
) {
349 const SpoofImpl
*This
= SpoofImpl::validateThis(sc
, *status
);
350 if (U_FAILURE(*status
)) {
354 // See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable,
355 // and for definitions of the types (single, whole, mixed-script) of confusables.
357 // We only care about a few of the check flags. Ignore the others.
358 // If no tests relevant to this function have been specified, return an error.
359 // TODO: is this really the right thing to do? It's probably an error on the caller's part,
360 // but logically we would just return 0 (no error).
361 if ((This
->fChecks
& (USPOOF_SINGLE_SCRIPT_CONFUSABLE
| USPOOF_MIXED_SCRIPT_CONFUSABLE
|
362 USPOOF_WHOLE_SCRIPT_CONFUSABLE
)) == 0) {
363 *status
= U_INVALID_STATE_ERROR
;
// Only the USPOOF_ANY_CASE bit of the checker's flags is carried over here.
366 int32_t flagsForSkeleton
= This
->fChecks
& USPOOF_ANY_CASE
;
// Borrow an IdentifierInfo to obtain the script count of each identifier,
// then hand it back before the skeleton comparisons.
369 IdentifierInfo
*identifierInfo
= This
->getIdentifierInfo(*status
);
370 if (U_FAILURE(*status
)) {
373 identifierInfo
->setIdentifier(id1
, *status
);
374 int32_t id1ScriptCount
= identifierInfo
->getScriptCount();
375 identifierInfo
->setIdentifier(id2
, *status
);
376 int32_t id2ScriptCount
= identifierInfo
->getScriptCount();
377 This
->releaseIdentifierInfo(identifierInfo
);
378 identifierInfo
= NULL
;
// Single-script test: attempted only when both identifiers have at most one script.
380 if (This
->fChecks
& USPOOF_SINGLE_SCRIPT_CONFUSABLE
) {
381 UnicodeString id1Skeleton
;
382 UnicodeString id2Skeleton
;
383 if (id1ScriptCount
<= 1 && id2ScriptCount
<= 1) {
384 flagsForSkeleton
|= USPOOF_SINGLE_SCRIPT_CONFUSABLE
;
385 uspoof_getSkeletonUnicodeString(sc
, flagsForSkeleton
, id1
, id1Skeleton
, status
);
386 uspoof_getSkeletonUnicodeString(sc
, flagsForSkeleton
, id2
, id2Skeleton
, status
);
387 if (id1Skeleton
== id2Skeleton
) {
388 result
|= USPOOF_SINGLE_SCRIPT_CONFUSABLE
;
393 if (result
& USPOOF_SINGLE_SCRIPT_CONFUSABLE
) {
394 // If the two inputs are single script confusable they cannot also be
395 // mixed or whole script confusable, according to the UAX39 definitions.
396 // So we can skip those tests.
400 // Two identifiers are whole script confusable if each is of a single script
401 // and they are mixed script confusable.
402 UBool possiblyWholeScriptConfusables
=
403 id1ScriptCount
<= 1 && id2ScriptCount
<= 1 && (This
->fChecks
& USPOOF_WHOLE_SCRIPT_CONFUSABLE
);
406 // Mixed Script Check
408 if ((This
->fChecks
& USPOOF_MIXED_SCRIPT_CONFUSABLE
) || possiblyWholeScriptConfusables
) {
409 // For getSkeleton(), resetting the USPOOF_SINGLE_SCRIPT_CONFUSABLE flag will get us
410 // the mixed script table skeleton, which is what we want.
411 // The Any Case / Lower Case bit in the skeleton flags was set at the top of the function.
412 UnicodeString id1Skeleton
;
413 UnicodeString id2Skeleton
;
414 flagsForSkeleton
&= ~USPOOF_SINGLE_SCRIPT_CONFUSABLE
;
415 uspoof_getSkeletonUnicodeString(sc
, flagsForSkeleton
, id1
, id1Skeleton
, status
);
416 uspoof_getSkeletonUnicodeString(sc
, flagsForSkeleton
, id2
, id2Skeleton
, status
);
417 if (id1Skeleton
== id2Skeleton
) {
418 result
|= USPOOF_MIXED_SCRIPT_CONFUSABLE
;
419 if (possiblyWholeScriptConfusables
) {
420 result
|= USPOOF_WHOLE_SCRIPT_CONFUSABLE
;
// Runs the checks enabled in This->fChecks against `id` and accumulates the
// bits of the failing checks in `result`.
//   sc     - checker to use; resolved through SpoofImpl::validateThis().
//   id     - identifier to test.
//   status - ICU in/out error code.
// Visible stages:
//   1. USPOOF_RESTRICTION_LEVEL / USPOOF_MIXED_NUMBERS, using an IdentifierInfo
//      borrowed from This->getIdentifierInfo().
//   2. USPOOF_CHAR_LIMIT, testing characters against This->fAllowedCharsSet.
//   3. Checks on the NFD form of id: USPOOF_INVISIBLE and the whole/mixed
//      script confusable tests.
// The borrowed IdentifierInfo is handed back with releaseIdentifierInfo() near
// the end; failures while obtaining it jump to the cleanupAndReturn label.
// NOTE(review): the `position` parameter, the declarations of `result`, `i`,
// `c` and `scripts`, the cleanupAndReturn label itself and the final return
// are missing from this listing (embedded numbering has gaps).
431 U_CAPI
int32_t U_EXPORT2
432 uspoof_checkUnicodeString(const USpoofChecker
*sc
,
433 const icu::UnicodeString
&id
,
435 UErrorCode
*status
) {
436 const SpoofImpl
*This
= SpoofImpl::validateThis(sc
, *status
);
442 IdentifierInfo
*identifierInfo
= NULL
;
// Stage 1 setup: only fetch an IdentifierInfo when a check that needs it is enabled.
443 if ((This
->fChecks
) & (USPOOF_RESTRICTION_LEVEL
| USPOOF_MIXED_NUMBERS
)) {
444 identifierInfo
= This
->getIdentifierInfo(*status
);
445 if (U_FAILURE(*status
)) {
446 goto cleanupAndReturn
;
448 identifierInfo
->setIdentifier(id
, *status
);
449 identifierInfo
->setIdentifierProfile(*This
->fAllowedCharsSet
);
453 if ((This
->fChecks
) & USPOOF_RESTRICTION_LEVEL
) {
454 URestrictionLevel idRestrictionLevel
= identifierInfo
->getRestrictionLevel(*status
);
// Fails when the identifier's level compares greater than the checker's configured level.
455 if (idRestrictionLevel
> This
->fRestrictionLevel
) {
456 result
|= USPOOF_RESTRICTION_LEVEL
;
// With USPOOF_AUX_INFO the restriction level value itself is OR-ed into the result.
458 if (This
->fChecks
& USPOOF_AUX_INFO
) {
459 result
|= idRestrictionLevel
;
463 if ((This
->fChecks
) & USPOOF_MIXED_NUMBERS
) {
464 const UnicodeSet
*numerics
= identifierInfo
->getNumerics();
465 if (numerics
->size() > 1) {
466 result
|= USPOOF_MIXED_NUMBERS
;
469 // TODO: ICU4J returns the UnicodeSet of the numerics found in the identifier.
470 // We have no easy way to do the same in C.
471 // if (checkResult != null) {
472 // checkResult.numerics = numerics;
// Stage 2: any character outside fAllowedCharsSet flags USPOOF_CHAR_LIMIT.
477 if (This
->fChecks
& (USPOOF_CHAR_LIMIT
)) {
480 int32_t length
= id
.length();
481 for (i
=0; i
<length
;) {
484 if (!This
->fAllowedCharsSet
->contains(c
)) {
485 result
|= USPOOF_CHAR_LIMIT
;
492 (USPOOF_WHOLE_SCRIPT_CONFUSABLE
| USPOOF_MIXED_SCRIPT_CONFUSABLE
| USPOOF_INVISIBLE
)) {
493 // These are the checks that need to be done on NFD input
494 UnicodeString nfdText
;
495 gNfdNormalizer
->normalize(id
, nfdText
, *status
);
496 int32_t nfdLength
= nfdText
.length();
498 if (This
->fChecks
& USPOOF_INVISIBLE
) {
500 // scan for more than one occurrence of the same non-spacing mark
501 // in a sequence of non-spacing marks.
504 UChar32 firstNonspacingMark
= 0;
505 UBool haveMultipleMarks
= FALSE
;
506 UnicodeSet marksSeenSoFar
; // Set of combining marks in a single combining sequence.
508 for (i
=0; i
<nfdLength
;) {
509 c
= nfdText
.char32At(i
);
// A character that is not a non-spacing mark ends the current sequence and resets the tracking state.
511 if (u_charType(c
) != U_NON_SPACING_MARK
) {
512 firstNonspacingMark
= 0;
513 if (haveMultipleMarks
) {
514 marksSeenSoFar
.clear();
515 haveMultipleMarks
= FALSE
;
519 if (firstNonspacingMark
== 0) {
520 firstNonspacingMark
= c
;
// The set is only populated once a second mark shows up in the sequence.
523 if (!haveMultipleMarks
) {
524 marksSeenSoFar
.add(firstNonspacingMark
);
525 haveMultipleMarks
= TRUE
;
527 if (marksSeenSoFar
.contains(c
)) {
528 // report the error, and stop scanning.
529 // No need to find more than the first failure.
530 result
|= USPOOF_INVISIBLE
;
533 marksSeenSoFar
.add(c
);
538 if (This
->fChecks
& (USPOOF_WHOLE_SCRIPT_CONFUSABLE
| USPOOF_MIXED_SCRIPT_CONFUSABLE
)) {
539 // The basic test is the same for both whole and mixed script confusables.
540 // Compute the set of scripts that every input character has a confusable in.
541 // For this computation an input character is always considered to be
542 // confusable with itself in its own script.
544 // If the number of such scripts is two or more, and the input consisted of
545 // characters all from a single script, we have a whole script confusable.
546 // (The two scripts will be the original script and the one that is confusable)
548 // If the number of such scripts >= one, and the original input contained characters from
549 // more than one script, we have a mixed script confusable. (We can transform
550 // some of the characters, and end up with a visually similar string all in
// Fetch the IdentifierInfo now if stage 1 did not already do so.
553 if (identifierInfo
== NULL
) {
554 identifierInfo
= This
->getIdentifierInfo(*status
);
555 if (U_FAILURE(*status
)) {
556 goto cleanupAndReturn
;
558 identifierInfo
->setIdentifier(id
, *status
);
561 int32_t scriptCount
= identifierInfo
->getScriptCount();
564 This
->wholeScriptCheck(nfdText
, &scripts
, *status
);
565 int32_t confusableScriptCount
= scripts
.countMembers();
566 //printf("confusableScriptCount = %d\n", confusableScriptCount);
// NOTE(review): the last operand of each condition below (embedded lines 570
// and 576) is missing from this listing.
568 if ((This
->fChecks
& USPOOF_WHOLE_SCRIPT_CONFUSABLE
) &&
569 confusableScriptCount
>= 2 &&
571 result
|= USPOOF_WHOLE_SCRIPT_CONFUSABLE
;
574 if ((This
->fChecks
& USPOOF_MIXED_SCRIPT_CONFUSABLE
) &&
575 confusableScriptCount
>= 1 &&
577 result
|= USPOOF_MIXED_SCRIPT_CONFUSABLE
;
// Hand the borrowed IdentifierInfo back.
583 This
->releaseIdentifierInfo(identifierInfo
);
584 if (position
!= NULL
) {
// UTF-16 entry point for computing a skeleton: wraps `id` in a UnicodeString,
// delegates to uspoof_getSkeletonUnicodeString(), and extracts the result
// into the caller's buffer.
//   sc           - checker to use; validated via SpoofImpl::validateThis().
//   id, length   - input text; length -1 means NUL-terminated.
//   dest         - output buffer.
//   destCapacity - capacity of dest in UChars.
//   status       - ICU in/out error code; U_ILLEGAL_ARGUMENT_ERROR is set when
//                  length < -1, destCapacity < 0, or destCapacity == 0 with a
//                  non-NULL dest.
// Returns the length of the skeleton string (destStr.length()).
// NOTE(review): the `type` argument forwarded below is not declared in the
// visible parameter list (embedded line 593 is missing), and the early-exit
// bodies are also absent from this listing.
591 U_CAPI
int32_t U_EXPORT2
592 uspoof_getSkeleton(const USpoofChecker
*sc
,
594 const UChar
*id
, int32_t length
,
595 UChar
*dest
, int32_t destCapacity
,
596 UErrorCode
*status
) {
598 SpoofImpl::validateThis(sc
, *status
);
599 if (U_FAILURE(*status
)) {
602 if (length
<-1 || destCapacity
<0 || (destCapacity
==0 && dest
!=NULL
)) {
603 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
607 UnicodeString
idStr((length
==-1), id
, length
); // Aliasing constructor
608 UnicodeString destStr
;
609 uspoof_getSkeletonUnicodeString(sc
, type
, idStr
, destStr
, status
);
610 destStr
.extract(dest
, destCapacity
, *status
);
611 return destStr
.length();
// Computes the skeleton of `id` into `dest`.
//   sc     - checker to use; resolved through SpoofImpl::validateThis().
//   id     - input identifier.
//   status - ICU in/out error code; U_ILLEGAL_ARGUMENT_ERROR is set on one
//            visible path of the table selection.
// Visible steps: choose a confusable-table mask, NFD-normalize id into nfdId,
// map every code point through This->confusableLookup() into skelStr, then
// NFD-normalize skelStr into dest.
// NOTE(review): the `type` and `dest` parameters, the switch header, its
// break/default lines, the declaration of nfdId and the final return are
// missing from this listing (embedded numbering has gaps).
616 U_I18N_API UnicodeString
& U_EXPORT2
617 uspoof_getSkeletonUnicodeString(const USpoofChecker
*sc
,
619 const UnicodeString
&id
,
621 UErrorCode
*status
) {
622 const SpoofImpl
*This
= SpoofImpl::validateThis(sc
, *status
);
623 if (U_FAILURE(*status
)) {
// Table selection: one table flag per combination of the
// USPOOF_SINGLE_SCRIPT_CONFUSABLE and USPOOF_ANY_CASE bits.
627 int32_t tableMask
= 0;
630 tableMask
= USPOOF_ML_TABLE_FLAG
;
632 case USPOOF_SINGLE_SCRIPT_CONFUSABLE
:
633 tableMask
= USPOOF_SL_TABLE_FLAG
;
635 case USPOOF_ANY_CASE
:
636 tableMask
= USPOOF_MA_TABLE_FLAG
;
638 case USPOOF_SINGLE_SCRIPT_CONFUSABLE
| USPOOF_ANY_CASE
:
639 tableMask
= USPOOF_SA_TABLE_FLAG
;
642 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
647 gNfdNormalizer
->normalize(id
, nfdId
, *status
);
649 // Apply the skeleton mapping to the NFD normalized input string
650 // Accumulate the skeleton, possibly unnormalized, in a UnicodeString.
651 int32_t inputIndex
= 0;
652 UnicodeString skelStr
;
653 int32_t normalizedLen
= nfdId
.length();
// Walk the string one code point at a time; the index advances by U16_LENGTH(c).
654 for (inputIndex
=0; inputIndex
< normalizedLen
; ) {
655 UChar32 c
= nfdId
.char32At(inputIndex
);
656 inputIndex
+= U16_LENGTH(c
);
657 This
->confusableLookup(c
, tableMask
, skelStr
);
// The accumulated mapping may not be normalized, so normalize once more into dest.
660 gNfdNormalizer
->normalize(skelStr
, dest
, *status
);
// UTF-8 entry point for computing a skeleton: converts `id` to a
// UnicodeString, delegates to uspoof_getSkeletonUnicodeString(), and converts
// the result back to UTF-8 with u_strToUTF8().
//   sc           - checker to use; validated via SpoofImpl::validateThis().
//   id, length   - UTF-8 input; a negative length makes the code use
//                  uprv_strlen(id).
//   dest         - output buffer.
//   destCapacity - capacity of dest in bytes.
//   status       - ICU in/out error code; U_ILLEGAL_ARGUMENT_ERROR is set when
//                  length < -1, destCapacity < 0, or destCapacity == 0 with a
//                  non-NULL dest.
// NOTE(review): the `type` argument forwarded below is not declared in the
// visible parameter list (embedded line 667 is missing), and the early-exit
// bodies and the final return are also absent from this listing.
665 U_CAPI
int32_t U_EXPORT2
666 uspoof_getSkeletonUTF8(const USpoofChecker
*sc
,
668 const char *id
, int32_t length
,
669 char *dest
, int32_t destCapacity
,
670 UErrorCode
*status
) {
671 SpoofImpl::validateThis(sc
, *status
);
672 if (U_FAILURE(*status
)) {
675 if (length
<-1 || destCapacity
<0 || (destCapacity
==0 && dest
!=NULL
)) {
676 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
680 UnicodeString srcStr
= UnicodeString::fromUTF8(StringPiece(id
, length
>=0 ? length
: uprv_strlen(id
)));
681 UnicodeString destStr
;
682 uspoof_getSkeletonUnicodeString(sc
, type
, srcStr
, destStr
, status
);
683 if (U_FAILURE(*status
)) {
// Convert the UTF-16 skeleton to UTF-8; the converted length is written to lengthInUTF8.
687 int32_t lengthInUTF8
= 0;
688 u_strToUTF8(dest
, destCapacity
, &lengthInUTF8
,
689 destStr
.getBuffer(), destStr
.length(), status
);
// Copies the checker's raw spoof data into a caller-supplied buffer.
//   sc       - checker to serialize; resolved through SpoofImpl::validateThis().
//   buf      - destination buffer.
//   capacity - size of buf in bytes; when smaller than the data size, *status
//              is set to U_BUFFER_OVERFLOW_ERROR.
//   status   - ICU in/out error code.
// The data size is taken from This->fSpoofData->fRawData->fLength.
// NOTE(review): embedded lines 697, 699-700, 704-705 and 707-708 are missing
// from this listing, so the guard around the U_ASSERT, the early exits and the
// returned value are not visible here.
694 U_CAPI
int32_t U_EXPORT2
695 uspoof_serialize(USpoofChecker
*sc
,void *buf
, int32_t capacity
, UErrorCode
*status
) {
696 SpoofImpl
*This
= SpoofImpl::validateThis(sc
, *status
);
// Asserts that status already reports a failure at this point.
698 U_ASSERT(U_FAILURE(*status
));
701 int32_t dataSize
= This
->fSpoofData
->fRawData
->fLength
;
702 if (capacity
< dataSize
) {
703 *status
= U_BUFFER_OVERFLOW_ERROR
;
706 uprv_memcpy(buf
, This
->fSpoofData
->fRawData
, dataSize
);
// Returns the file-scope gInclusionSet as a C-API USet.
// The UErrorCode parameter is unnamed and therefore unused.
// NOTE(review): embedded line 712 is missing from this listing; presumably it
// makes sure the statics are initialized before the set is returned -- confirm.
710 U_CAPI
const USet
* U_EXPORT2
711 uspoof_getInclusionSet(UErrorCode
*) {
713 return gInclusionSet
->toUSet();
// Returns the file-scope gRecommendedSet as a C-API USet.
// The UErrorCode parameter is unnamed and therefore unused.
// NOTE(review): embedded line 718 is missing from this listing; presumably it
// makes sure the statics are initialized before the set is returned -- confirm.
716 U_CAPI
const USet
* U_EXPORT2
717 uspoof_getRecommendedSet(UErrorCode
*) {
719 return gRecommendedSet
->toUSet();
// C++ variant of uspoof_getInclusionSet(): returns the file-scope
// gInclusionSet pointer directly.
// The UErrorCode parameter is unnamed and therefore unused.
// NOTE(review): embedded line 724 is missing from this listing; presumably it
// makes sure the statics are initialized before the set is returned -- confirm.
722 U_I18N_API
const UnicodeSet
* U_EXPORT2
723 uspoof_getInclusionUnicodeSet(UErrorCode
*) {
725 return gInclusionSet
;
// C++ variant of uspoof_getRecommendedSet(): returns the file-scope
// gRecommendedSet pointer directly.
// The UErrorCode parameter is unnamed and therefore unused.
// NOTE(review): embedded line 730 is missing from this listing; presumably it
// makes sure the statics are initialized before the set is returned -- confirm.
728 U_I18N_API
const UnicodeSet
* U_EXPORT2
729 uspoof_getRecommendedUnicodeSet(UErrorCode
*) {
731 return gRecommendedSet
;
736 #endif // !UCONFIG_NO_NORMALIZATION