/*
***************************************************************************
* Copyright (C) 2008-2011, International Business Machines Corporation
* and others. All Rights Reserved.
***************************************************************************
*   file name:  uspoof.cpp
*   tab size:   8 (not used)
*
*   created on: 2008Feb13
*   created by: Andy Heninger
*
*   Unicode Spoof Detection
*/
16 #include "unicode/utypes.h"
17 #include "unicode/uspoof.h"
18 #include "unicode/unorm.h"
19 #include "unicode/ustring.h"
21 #include "uspoof_impl.h"
25 #if !UCONFIG_NO_NORMALIZATION
28 #include <stdio.h> // debug
33 U_CAPI USpoofChecker
* U_EXPORT2
34 uspoof_open(UErrorCode
*status
) {
35 if (U_FAILURE(*status
)) {
38 SpoofImpl
*si
= new SpoofImpl(SpoofData::getDefault(*status
), *status
);
39 if (U_FAILURE(*status
)) {
43 return (USpoofChecker
*)si
;
47 U_CAPI USpoofChecker
* U_EXPORT2
48 uspoof_openFromSerialized(const void *data
, int32_t length
, int32_t *pActualLength
,
50 if (U_FAILURE(*status
)) {
53 SpoofData
*sd
= new SpoofData(data
, length
, *status
);
54 SpoofImpl
*si
= new SpoofImpl(sd
, *status
);
55 if (U_FAILURE(*status
)) {
60 if (sd
== NULL
|| si
== NULL
) {
61 *status
= U_MEMORY_ALLOCATION_ERROR
;
67 if (pActualLength
!= NULL
) {
68 *pActualLength
= sd
->fRawData
->fLength
;
70 return reinterpret_cast<USpoofChecker
*>(si
);
74 U_CAPI USpoofChecker
* U_EXPORT2
75 uspoof_clone(const USpoofChecker
*sc
, UErrorCode
*status
) {
76 const SpoofImpl
*src
= SpoofImpl::validateThis(sc
, *status
);
80 SpoofImpl
*result
= new SpoofImpl(*src
, *status
); // copy constructor
81 if (U_FAILURE(*status
)) {
85 return (USpoofChecker
*)result
;
90 uspoof_close(USpoofChecker
*sc
) {
91 UErrorCode status
= U_ZERO_ERROR
;
92 SpoofImpl
*This
= SpoofImpl::validateThis(sc
, status
);
98 uspoof_setChecks(USpoofChecker
*sc
, int32_t checks
, UErrorCode
*status
) {
99 SpoofImpl
*This
= SpoofImpl::validateThis(sc
, *status
);
104 // Verify that the requested checks are all ones (bits) that
105 // are acceptable, known values.
106 if (checks
& ~USPOOF_ALL_CHECKS
) {
107 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
111 This
->fChecks
= checks
;
115 U_CAPI
int32_t U_EXPORT2
116 uspoof_getChecks(const USpoofChecker
*sc
, UErrorCode
*status
) {
117 const SpoofImpl
*This
= SpoofImpl::validateThis(sc
, *status
);
121 return This
->fChecks
;
124 U_CAPI
void U_EXPORT2
125 uspoof_setAllowedLocales(USpoofChecker
*sc
, const char *localesList
, UErrorCode
*status
) {
126 SpoofImpl
*This
= SpoofImpl::validateThis(sc
, *status
);
130 This
->setAllowedLocales(localesList
, *status
);
133 U_CAPI
const char * U_EXPORT2
134 uspoof_getAllowedLocales(USpoofChecker
*sc
, UErrorCode
*status
) {
135 SpoofImpl
*This
= SpoofImpl::validateThis(sc
, *status
);
139 return This
->getAllowedLocales(*status
);
143 U_CAPI
const USet
* U_EXPORT2
144 uspoof_getAllowedChars(const USpoofChecker
*sc
, UErrorCode
*status
) {
145 const UnicodeSet
*result
= uspoof_getAllowedUnicodeSet(sc
, status
);
146 return reinterpret_cast<const USet
*>(result
);
149 U_CAPI
const UnicodeSet
* U_EXPORT2
150 uspoof_getAllowedUnicodeSet(const USpoofChecker
*sc
, UErrorCode
*status
) {
151 const SpoofImpl
*This
= SpoofImpl::validateThis(sc
, *status
);
155 return This
->fAllowedCharsSet
;
159 U_CAPI
void U_EXPORT2
160 uspoof_setAllowedChars(USpoofChecker
*sc
, const USet
*chars
, UErrorCode
*status
) {
161 const UnicodeSet
*set
= reinterpret_cast<const UnicodeSet
*>(chars
);
162 uspoof_setAllowedUnicodeSet(sc
, set
, status
);
166 U_CAPI
void U_EXPORT2
167 uspoof_setAllowedUnicodeSet(USpoofChecker
*sc
, const UnicodeSet
*chars
, UErrorCode
*status
) {
168 SpoofImpl
*This
= SpoofImpl::validateThis(sc
, *status
);
172 if (chars
->isBogus()) {
173 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
176 UnicodeSet
*clonedSet
= static_cast<UnicodeSet
*>(chars
->clone());
177 if (clonedSet
== NULL
|| clonedSet
->isBogus()) {
178 *status
= U_MEMORY_ALLOCATION_ERROR
;
182 delete This
->fAllowedCharsSet
;
183 This
->fAllowedCharsSet
= clonedSet
;
184 This
->fChecks
|= USPOOF_CHAR_LIMIT
;
188 U_CAPI
int32_t U_EXPORT2
189 uspoof_check(const USpoofChecker
*sc
,
190 const UChar
*text
, int32_t length
,
192 UErrorCode
*status
) {
194 const SpoofImpl
*This
= SpoofImpl::validateThis(sc
, *status
);
199 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
203 // It's not worth the bother to handle nul terminated strings everywhere.
204 // Just get the length and be done with it.
205 length
= u_strlen(text
);
209 int32_t failPos
= 0x7fffffff; // TODO: do we have a #define for max int32?
211 // A count of the number of non-Common or inherited scripts.
212 // Needed for both the SINGLE_SCRIPT and the WHOLE/MIXED_SCIRPT_CONFUSABLE tests.
213 // Share the computation when possible. scriptCount == -1 means that we haven't
215 int32_t scriptCount
= -1;
217 if ((This
->fChecks
) & USPOOF_SINGLE_SCRIPT
) {
218 scriptCount
= This
->scriptScan(text
, length
, failPos
, *status
);
219 // printf("scriptCount (clipped to 2) = %d\n", scriptCount);
220 if ( scriptCount
>= 2) {
221 // Note: scriptCount == 2 covers all cases of the number of scripts >= 2
222 result
|= USPOOF_SINGLE_SCRIPT
;
226 if (This
->fChecks
& USPOOF_CHAR_LIMIT
) {
229 for (i
=0; i
<length
;) {
230 U16_NEXT(text
, i
, length
, c
);
231 if (!This
->fAllowedCharsSet
->contains(c
)) {
232 result
|= USPOOF_CHAR_LIMIT
;
242 (USPOOF_WHOLE_SCRIPT_CONFUSABLE
| USPOOF_MIXED_SCRIPT_CONFUSABLE
| USPOOF_INVISIBLE
)) {
243 // These are the checks that need to be done on NFD input
244 NFDBuffer
normalizedInput(text
, length
, *status
);
245 const UChar
*nfdText
= normalizedInput
.getBuffer();
246 int32_t nfdLength
= normalizedInput
.getLength();
248 if (This
->fChecks
& USPOOF_INVISIBLE
) {
250 // scan for more than one occurence of the same non-spacing mark
251 // in a sequence of non-spacing marks.
254 UChar32 firstNonspacingMark
= 0;
255 UBool haveMultipleMarks
= FALSE
;
256 UnicodeSet marksSeenSoFar
; // Set of combining marks in a single combining sequence.
258 for (i
=0; i
<length
;) {
259 U16_NEXT(nfdText
, i
, nfdLength
, c
);
260 if (u_charType(c
) != U_NON_SPACING_MARK
) {
261 firstNonspacingMark
= 0;
262 if (haveMultipleMarks
) {
263 marksSeenSoFar
.clear();
264 haveMultipleMarks
= FALSE
;
268 if (firstNonspacingMark
== 0) {
269 firstNonspacingMark
= c
;
272 if (!haveMultipleMarks
) {
273 marksSeenSoFar
.add(firstNonspacingMark
);
274 haveMultipleMarks
= TRUE
;
276 if (marksSeenSoFar
.contains(c
)) {
277 // report the error, and stop scanning.
278 // No need to find more than the first failure.
279 result
|= USPOOF_INVISIBLE
;
283 marksSeenSoFar
.add(c
);
288 if (This
->fChecks
& (USPOOF_WHOLE_SCRIPT_CONFUSABLE
| USPOOF_MIXED_SCRIPT_CONFUSABLE
)) {
289 // The basic test is the same for both whole and mixed script confusables.
290 // Compute the set of scripts that every input character has a confusable in.
291 // For this computation an input character is always considered to be
292 // confusable with itself in its own script.
293 // If the number of such scripts is two or more, and the input consisted of
294 // characters all from a single script, we have a whole script confusable.
295 // (The two scripts will be the original script and the one that is confusable)
296 // If the number of such scripts >= one, and the original input contained characters from
297 // more than one script, we have a mixed script confusable. (We can transform
298 // some of the characters, and end up with a visually similar string all in
301 if (scriptCount
== -1) {
303 scriptCount
= This
->scriptScan(text
, length
, t
, *status
);
307 This
->wholeScriptCheck(nfdText
, nfdLength
, &scripts
, *status
);
308 int32_t confusableScriptCount
= scripts
.countMembers();
309 //printf("confusableScriptCount = %d\n", confusableScriptCount);
311 if ((This
->fChecks
& USPOOF_WHOLE_SCRIPT_CONFUSABLE
) &&
312 confusableScriptCount
>= 2 &&
314 result
|= USPOOF_WHOLE_SCRIPT_CONFUSABLE
;
317 if ((This
->fChecks
& USPOOF_MIXED_SCRIPT_CONFUSABLE
) &&
318 confusableScriptCount
>= 1 &&
320 result
|= USPOOF_MIXED_SCRIPT_CONFUSABLE
;
324 if (position
!= NULL
&& failPos
!= 0x7fffffff) {
331 U_CAPI
int32_t U_EXPORT2
332 uspoof_checkUTF8(const USpoofChecker
*sc
,
333 const char *text
, int32_t length
,
335 UErrorCode
*status
) {
337 if (U_FAILURE(*status
)) {
340 UChar stackBuf
[USPOOF_STACK_BUFFER_SIZE
];
341 UChar
* text16
= stackBuf
;
344 u_strFromUTF8(text16
, USPOOF_STACK_BUFFER_SIZE
, &len16
, text
, length
, status
);
345 if (U_FAILURE(*status
) && *status
!= U_BUFFER_OVERFLOW_ERROR
) {
348 if (*status
== U_BUFFER_OVERFLOW_ERROR
) {
349 text16
= static_cast<UChar
*>(uprv_malloc(len16
* sizeof(UChar
) + 2));
350 if (text16
== NULL
) {
351 *status
= U_MEMORY_ALLOCATION_ERROR
;
354 *status
= U_ZERO_ERROR
;
355 u_strFromUTF8(text16
, len16
+1, NULL
, text
, length
, status
);
358 int32_t position16
= -1;
359 int32_t result
= uspoof_check(sc
, text16
, len16
, &position16
, status
);
360 if (U_FAILURE(*status
)) {
364 if (position16
> 0) {
365 // Translate a UTF-16 based error position back to a UTF-8 offset.
366 // u_strToUTF8() in preflight mode is an easy way to do it.
367 U_ASSERT(position16
<= len16
);
368 u_strToUTF8(NULL
, 0, position
, text16
, position16
, status
);
370 // position is the required buffer length from u_strToUTF8, which includes
371 // space for a terminating NULL, which we don't want, hence the -1.
374 *status
= U_ZERO_ERROR
; // u_strToUTF8, above sets BUFFER_OVERFLOW_ERROR.
377 if (text16
!= stackBuf
) {
384 /* A convenience wrapper around the public uspoof_getSkeleton that handles
385 * allocating a larger buffer than provided if the original is too small.
387 static UChar
*getSkeleton(const USpoofChecker
*sc
, uint32_t type
, const UChar
*s
, int32_t inputLength
,
388 UChar
*dest
, int32_t destCapacity
, int32_t *outputLength
, UErrorCode
*status
) {
389 int32_t requiredCapacity
= 0;
392 if (U_FAILURE(*status
)) {
395 requiredCapacity
= uspoof_getSkeleton(sc
, type
, s
, inputLength
, dest
, destCapacity
, status
);
396 if (*status
== U_BUFFER_OVERFLOW_ERROR
) {
397 buf
= static_cast<UChar
*>(uprv_malloc(requiredCapacity
* sizeof(UChar
)));
399 *status
= U_MEMORY_ALLOCATION_ERROR
;
402 *status
= U_ZERO_ERROR
;
403 uspoof_getSkeleton(sc
, type
, s
, inputLength
, buf
, requiredCapacity
, status
);
405 *outputLength
= requiredCapacity
;
410 U_CAPI
int32_t U_EXPORT2
411 uspoof_areConfusable(const USpoofChecker
*sc
,
412 const UChar
*s1
, int32_t length1
,
413 const UChar
*s2
, int32_t length2
,
414 UErrorCode
*status
) {
415 const SpoofImpl
*This
= SpoofImpl::validateThis(sc
, *status
);
416 if (U_FAILURE(*status
)) {
420 // See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable,
421 // and for definitions of the types (single, whole, mixed-script) of confusables.
423 // We only care about a few of the check flags. Ignore the others.
424 // If no tests relavant to this function have been specified, return an error.
425 // TODO: is this really the right thing to do? It's probably an error on the caller's part,
426 // but logically we would just return 0 (no error).
427 if ((This
->fChecks
& (USPOOF_SINGLE_SCRIPT_CONFUSABLE
| USPOOF_MIXED_SCRIPT_CONFUSABLE
|
428 USPOOF_WHOLE_SCRIPT_CONFUSABLE
)) == 0) {
429 *status
= U_INVALID_STATE_ERROR
;
432 int32_t flagsForSkeleton
= This
->fChecks
& USPOOF_ANY_CASE
;
433 UChar s1SkeletonBuf
[USPOOF_STACK_BUFFER_SIZE
];
435 int32_t s1SkeletonLength
= 0;
437 UChar s2SkeletonBuf
[USPOOF_STACK_BUFFER_SIZE
];
439 int32_t s2SkeletonLength
= 0;
443 int32_t s1ScriptCount
= This
->scriptScan(s1
, length1
, t
, *status
);
444 int32_t s2ScriptCount
= This
->scriptScan(s2
, length2
, t
, *status
);
446 if (This
->fChecks
& USPOOF_SINGLE_SCRIPT_CONFUSABLE
) {
447 // Do the Single Script compare.
448 if (s1ScriptCount
<= 1 && s2ScriptCount
<= 1) {
449 flagsForSkeleton
|= USPOOF_SINGLE_SCRIPT_CONFUSABLE
;
450 s1Skeleton
= getSkeleton(sc
, flagsForSkeleton
, s1
, length1
, s1SkeletonBuf
,
451 sizeof(s1SkeletonBuf
)/sizeof(UChar
), &s1SkeletonLength
, status
);
452 s2Skeleton
= getSkeleton(sc
, flagsForSkeleton
, s2
, length2
, s2SkeletonBuf
,
453 sizeof(s2SkeletonBuf
)/sizeof(UChar
), &s2SkeletonLength
, status
);
454 if (s1SkeletonLength
== s2SkeletonLength
&& u_strncmp(s1Skeleton
, s2Skeleton
, s1SkeletonLength
) == 0) {
455 result
|= USPOOF_SINGLE_SCRIPT_CONFUSABLE
;
457 if (s1Skeleton
!= s1SkeletonBuf
) {
458 uprv_free(s1Skeleton
);
460 if (s2Skeleton
!= s2SkeletonBuf
) {
461 uprv_free(s2Skeleton
);
466 if (result
& USPOOF_SINGLE_SCRIPT_CONFUSABLE
) {
467 // If the two inputs are single script confusable they cannot also be
468 // mixed or whole script confusable, according to the UAX39 definitions.
469 // So we can skip those tests.
473 // Optimization for whole script confusables test: two identifiers are whole script confusable if
474 // each is of a single script and they are mixed script confusable.
475 UBool possiblyWholeScriptConfusables
=
476 s1ScriptCount
<= 1 && s2ScriptCount
<= 1 && (This
->fChecks
& USPOOF_WHOLE_SCRIPT_CONFUSABLE
);
479 // Mixed Script Check
481 if ((This
->fChecks
& USPOOF_MIXED_SCRIPT_CONFUSABLE
) || possiblyWholeScriptConfusables
) {
482 // For getSkeleton(), resetting the USPOOF_SINGLE_SCRIPT_CONFUSABLE flag will get us
483 // the mixed script table skeleton, which is what we want.
484 // The Any Case / Lower Case bit in the skelton flags was set at the top of the function.
485 flagsForSkeleton
&= ~USPOOF_SINGLE_SCRIPT_CONFUSABLE
;
486 s1Skeleton
= getSkeleton(sc
, flagsForSkeleton
, s1
, length1
, s1SkeletonBuf
,
487 sizeof(s1SkeletonBuf
)/sizeof(UChar
), &s1SkeletonLength
, status
);
488 s2Skeleton
= getSkeleton(sc
, flagsForSkeleton
, s2
, length2
, s2SkeletonBuf
,
489 sizeof(s2SkeletonBuf
)/sizeof(UChar
), &s2SkeletonLength
, status
);
490 if (s1SkeletonLength
== s2SkeletonLength
&& u_strncmp(s1Skeleton
, s2Skeleton
, s1SkeletonLength
) == 0) {
491 result
|= USPOOF_MIXED_SCRIPT_CONFUSABLE
;
492 if (possiblyWholeScriptConfusables
) {
493 result
|= USPOOF_WHOLE_SCRIPT_CONFUSABLE
;
496 if (s1Skeleton
!= s1SkeletonBuf
) {
497 uprv_free(s1Skeleton
);
499 if (s2Skeleton
!= s2SkeletonBuf
) {
500 uprv_free(s2Skeleton
);
508 // Convenience function for converting a UTF-8 input to a UChar * string, including
509 // reallocating a buffer when required. Parameters and their interpretation mostly
510 // match u_strFromUTF8.
512 static UChar
* convertFromUTF8(UChar
*outBuf
, int32_t outBufCapacity
, int32_t *outputLength
,
513 const char *in
, int32_t inLength
, UErrorCode
*status
) {
514 if (U_FAILURE(*status
)) {
517 UChar
*dest
= outBuf
;
518 u_strFromUTF8(dest
, outBufCapacity
, outputLength
, in
, inLength
, status
);
519 if (*status
== U_BUFFER_OVERFLOW_ERROR
) {
520 dest
= static_cast<UChar
*>(uprv_malloc(*outputLength
* sizeof(UChar
)));
522 *status
= U_MEMORY_ALLOCATION_ERROR
;
525 *status
= U_ZERO_ERROR
;
526 u_strFromUTF8(dest
, *outputLength
, NULL
, in
, inLength
, status
);
533 U_CAPI
int32_t U_EXPORT2
534 uspoof_areConfusableUTF8(const USpoofChecker
*sc
,
535 const char *s1
, int32_t length1
,
536 const char *s2
, int32_t length2
,
537 UErrorCode
*status
) {
539 SpoofImpl::validateThis(sc
, *status
);
540 if (U_FAILURE(*status
)) {
544 UChar s1Buf
[USPOOF_STACK_BUFFER_SIZE
];
546 UChar
*s1U
= convertFromUTF8(s1Buf
, USPOOF_STACK_BUFFER_SIZE
, &lengthS1U
, s1
, length1
, status
);
548 UChar s2Buf
[USPOOF_STACK_BUFFER_SIZE
];
550 UChar
*s2U
= convertFromUTF8(s2Buf
, USPOOF_STACK_BUFFER_SIZE
, &lengthS2U
, s2
, length2
, status
);
552 int32_t results
= uspoof_areConfusable(sc
, s1U
, lengthS1U
, s2U
, lengthS2U
, status
);
564 U_CAPI
int32_t U_EXPORT2
565 uspoof_areConfusableUnicodeString(const USpoofChecker
*sc
,
566 const U_NAMESPACE_QUALIFIER UnicodeString
&s1
,
567 const U_NAMESPACE_QUALIFIER UnicodeString
&s2
,
568 UErrorCode
*status
) {
570 const UChar
*u1
= s1
.getBuffer();
571 int32_t length1
= s1
.length();
572 const UChar
*u2
= s2
.getBuffer();
573 int32_t length2
= s2
.length();
575 int32_t results
= uspoof_areConfusable(sc
, u1
, length1
, u2
, length2
, status
);
582 U_CAPI
int32_t U_EXPORT2
583 uspoof_checkUnicodeString(const USpoofChecker
*sc
,
584 const U_NAMESPACE_QUALIFIER UnicodeString
&text
,
586 UErrorCode
*status
) {
587 int32_t result
= uspoof_check(sc
, text
.getBuffer(), text
.length(), position
, status
);
592 U_CAPI
int32_t U_EXPORT2
593 uspoof_getSkeleton(const USpoofChecker
*sc
,
595 const UChar
*s
, int32_t length
,
596 UChar
*dest
, int32_t destCapacity
,
597 UErrorCode
*status
) {
599 // TODO: this function could be sped up a bit
600 // Skip the input normalization when not needed, work from callers data.
601 // Put the initial skeleton straight into the caller's destination buffer.
602 // It probably won't need normalization.
603 // But these would make the structure more complicated.
605 const SpoofImpl
*This
= SpoofImpl::validateThis(sc
, *status
);
606 if (U_FAILURE(*status
)) {
609 if (length
<-1 || destCapacity
<0 || (destCapacity
==0 && dest
!=NULL
) ||
610 (type
& ~(USPOOF_SINGLE_SCRIPT_CONFUSABLE
| USPOOF_ANY_CASE
)) != 0) {
611 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
615 int32_t tableMask
= 0;
618 tableMask
= USPOOF_ML_TABLE_FLAG
;
620 case USPOOF_SINGLE_SCRIPT_CONFUSABLE
:
621 tableMask
= USPOOF_SL_TABLE_FLAG
;
623 case USPOOF_ANY_CASE
:
624 tableMask
= USPOOF_MA_TABLE_FLAG
;
626 case USPOOF_SINGLE_SCRIPT_CONFUSABLE
| USPOOF_ANY_CASE
:
627 tableMask
= USPOOF_SA_TABLE_FLAG
;
630 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
634 // NFD transform of the user supplied input
636 UChar nfdStackBuf
[USPOOF_STACK_BUFFER_SIZE
];
637 UChar
*nfdInput
= nfdStackBuf
;
638 int32_t normalizedLen
= unorm_normalize(
639 s
, length
, UNORM_NFD
, 0, nfdInput
, USPOOF_STACK_BUFFER_SIZE
, status
);
640 if (*status
== U_BUFFER_OVERFLOW_ERROR
) {
641 nfdInput
= (UChar
*)uprv_malloc((normalizedLen
+1)*sizeof(UChar
));
642 if (nfdInput
== NULL
) {
643 *status
= U_MEMORY_ALLOCATION_ERROR
;
646 *status
= U_ZERO_ERROR
;
647 normalizedLen
= unorm_normalize(s
, length
, UNORM_NFD
, 0,
648 nfdInput
, normalizedLen
+1, status
);
650 if (U_FAILURE(*status
)) {
651 if (nfdInput
!= nfdStackBuf
) {
657 // buffer to hold the Unicode defined skeleton mappings for a single code point
658 UChar buf
[USPOOF_MAX_SKELETON_EXPANSION
];
660 // Apply the skeleton mapping to the NFD normalized input string
661 // Accumulate the skeleton, possibly unnormalized, in a UnicodeString.
662 int32_t inputIndex
= 0;
663 UnicodeString skelStr
;
664 while (inputIndex
< normalizedLen
) {
666 U16_NEXT(nfdInput
, inputIndex
, normalizedLen
, c
);
667 int32_t replaceLen
= This
->confusableLookup(c
, tableMask
, buf
);
668 skelStr
.append(buf
, replaceLen
);
671 if (nfdInput
!= nfdStackBuf
) {
675 const UChar
*result
= skelStr
.getBuffer();
676 int32_t resultLen
= skelStr
.length();
677 UChar
*normedResult
= NULL
;
679 // Check the skeleton for NFD, normalize it if needed.
680 // Unnormalized results should be very rare.
681 if (!unorm_isNormalized(result
, resultLen
, UNORM_NFD
, status
)) {
682 normalizedLen
= unorm_normalize(result
, resultLen
, UNORM_NFD
, 0, NULL
, 0, status
);
683 normedResult
= static_cast<UChar
*>(uprv_malloc((normalizedLen
+1)*sizeof(UChar
)));
684 if (normedResult
== NULL
) {
685 *status
= U_MEMORY_ALLOCATION_ERROR
;
688 *status
= U_ZERO_ERROR
;
689 unorm_normalize(result
, resultLen
, UNORM_NFD
, 0, normedResult
, normalizedLen
+1, status
);
690 result
= normedResult
;
691 resultLen
= normalizedLen
;
694 // Copy the skeleton to the caller's buffer
695 if (U_SUCCESS(*status
)) {
696 if (destCapacity
== 0 || resultLen
> destCapacity
) {
697 *status
= resultLen
>destCapacity
? U_BUFFER_OVERFLOW_ERROR
: U_STRING_NOT_TERMINATED_WARNING
;
699 u_memcpy(dest
, result
, resultLen
);
700 if (destCapacity
> resultLen
) {
703 *status
= U_STRING_NOT_TERMINATED_WARNING
;
707 uprv_free(normedResult
);
713 U_CAPI UnicodeString
& U_EXPORT2
714 uspoof_getSkeletonUnicodeString(const USpoofChecker
*sc
,
716 const UnicodeString
&s
,
718 UErrorCode
*status
) {
719 if (U_FAILURE(*status
)) {
724 const UChar
*str
= s
.getBuffer();
725 int32_t strLen
= s
.length();
726 UChar smallBuf
[USPOOF_STACK_BUFFER_SIZE
];
727 UChar
*buf
= smallBuf
;
728 int32_t outputSize
= uspoof_getSkeleton(sc
, type
, str
, strLen
, smallBuf
, USPOOF_STACK_BUFFER_SIZE
, status
);
729 if (*status
== U_BUFFER_OVERFLOW_ERROR
) {
730 buf
= static_cast<UChar
*>(uprv_malloc((outputSize
+1)*sizeof(UChar
)));
732 *status
= U_MEMORY_ALLOCATION_ERROR
;
735 *status
= U_ZERO_ERROR
;
736 uspoof_getSkeleton(sc
, type
, str
, strLen
, buf
, outputSize
+1, status
);
738 if (U_SUCCESS(*status
)) {
739 dest
.setTo(buf
, outputSize
);
742 if (buf
!= smallBuf
) {
749 U_CAPI
int32_t U_EXPORT2
750 uspoof_getSkeletonUTF8(const USpoofChecker
*sc
,
752 const char *s
, int32_t length
,
753 char *dest
, int32_t destCapacity
,
754 UErrorCode
*status
) {
755 // Lacking a UTF-8 normalization API, just converting the input to
756 // UTF-16 seems as good an approach as any. In typical use, input will
757 // be an identifier, which is to say not too long for stack buffers.
758 if (U_FAILURE(*status
)) {
761 // Buffers for the UChar form of the input and skeleton strings.
762 UChar smallInBuf
[USPOOF_STACK_BUFFER_SIZE
];
763 UChar
*inBuf
= smallInBuf
;
764 UChar smallOutBuf
[USPOOF_STACK_BUFFER_SIZE
];
765 UChar
*outBuf
= smallOutBuf
;
767 int32_t lengthInUChars
= 0;
768 int32_t skelLengthInUChars
= 0;
769 int32_t skelLengthInUTF8
= 0;
771 u_strFromUTF8(inBuf
, USPOOF_STACK_BUFFER_SIZE
, &lengthInUChars
,
773 if (*status
== U_BUFFER_OVERFLOW_ERROR
) {
774 inBuf
= static_cast<UChar
*>(uprv_malloc((lengthInUChars
+1)*sizeof(UChar
)));
776 *status
= U_MEMORY_ALLOCATION_ERROR
;
779 *status
= U_ZERO_ERROR
;
780 u_strFromUTF8(inBuf
, lengthInUChars
+1, &lengthInUChars
,
784 skelLengthInUChars
= uspoof_getSkeleton(sc
, type
, inBuf
, lengthInUChars
,
785 outBuf
, USPOOF_STACK_BUFFER_SIZE
, status
);
786 if (*status
== U_BUFFER_OVERFLOW_ERROR
) {
787 outBuf
= static_cast<UChar
*>(uprv_malloc((skelLengthInUChars
+1)*sizeof(UChar
)));
788 if (outBuf
== NULL
) {
789 *status
= U_MEMORY_ALLOCATION_ERROR
;
792 *status
= U_ZERO_ERROR
;
793 skelLengthInUChars
= uspoof_getSkeleton(sc
, type
, inBuf
, lengthInUChars
,
794 outBuf
, skelLengthInUChars
+1, status
);
797 u_strToUTF8(dest
, destCapacity
, &skelLengthInUTF8
,
798 outBuf
, skelLengthInUChars
, status
);
801 if (inBuf
!= smallInBuf
) {
804 if (outBuf
!= smallOutBuf
) {
807 return skelLengthInUTF8
;
811 U_CAPI
int32_t U_EXPORT2
812 uspoof_serialize(USpoofChecker
*sc
,void *buf
, int32_t capacity
, UErrorCode
*status
) {
813 SpoofImpl
*This
= SpoofImpl::validateThis(sc
, *status
);
815 U_ASSERT(U_FAILURE(*status
));
818 int32_t dataSize
= This
->fSpoofData
->fRawData
->fLength
;
819 if (capacity
< dataSize
) {
820 *status
= U_BUFFER_OVERFLOW_ERROR
;
823 uprv_memcpy(buf
, This
->fSpoofData
->fRawData
, dataSize
);