]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/uspoof.cpp
ICU-491.11.3.tar.gz
[apple/icu.git] / icuSources / i18n / uspoof.cpp
CommitLineData
729e4ab9
A
1/*
2***************************************************************************
3* Copyright (C) 2008-2011, International Business Machines Corporation
4* and others. All Rights Reserved.
5***************************************************************************
6* file name: uspoof.cpp
7* encoding: US-ASCII
8* tab size: 8 (not used)
9* indentation:4
10*
11* created on: 2008Feb13
12* created by: Andy Heninger
13*
14* Unicode Spoof Detection
15*/
16#include "unicode/utypes.h"
17#include "unicode/uspoof.h"
18#include "unicode/unorm.h"
19#include "unicode/ustring.h"
4388f060 20#include "unicode/utf16.h"
729e4ab9
A
21#include "cmemory.h"
22#include "uspoof_impl.h"
23#include "uassert.h"
24
25
26#if !UCONFIG_NO_NORMALIZATION
27
729e4ab9
A
28U_NAMESPACE_USE
29
30
31U_CAPI USpoofChecker * U_EXPORT2
32uspoof_open(UErrorCode *status) {
33 if (U_FAILURE(*status)) {
34 return NULL;
35 }
36 SpoofImpl *si = new SpoofImpl(SpoofData::getDefault(*status), *status);
37 if (U_FAILURE(*status)) {
38 delete si;
39 si = NULL;
40 }
41 return (USpoofChecker *)si;
42}
43
44
45U_CAPI USpoofChecker * U_EXPORT2
46uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLength,
47 UErrorCode *status) {
48 if (U_FAILURE(*status)) {
49 return NULL;
50 }
51 SpoofData *sd = new SpoofData(data, length, *status);
52 SpoofImpl *si = new SpoofImpl(sd, *status);
53 if (U_FAILURE(*status)) {
54 delete sd;
55 delete si;
56 return NULL;
57 }
58 if (sd == NULL || si == NULL) {
59 *status = U_MEMORY_ALLOCATION_ERROR;
60 delete sd;
61 delete si;
62 return NULL;
63 }
64
65 if (pActualLength != NULL) {
66 *pActualLength = sd->fRawData->fLength;
67 }
68 return reinterpret_cast<USpoofChecker *>(si);
69}
70
71
72U_CAPI USpoofChecker * U_EXPORT2
73uspoof_clone(const USpoofChecker *sc, UErrorCode *status) {
74 const SpoofImpl *src = SpoofImpl::validateThis(sc, *status);
75 if (src == NULL) {
76 return NULL;
77 }
78 SpoofImpl *result = new SpoofImpl(*src, *status); // copy constructor
79 if (U_FAILURE(*status)) {
80 delete result;
81 result = NULL;
82 }
83 return (USpoofChecker *)result;
84}
85
86
87U_CAPI void U_EXPORT2
88uspoof_close(USpoofChecker *sc) {
89 UErrorCode status = U_ZERO_ERROR;
90 SpoofImpl *This = SpoofImpl::validateThis(sc, status);
91 delete This;
92}
93
94
95U_CAPI void U_EXPORT2
96uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status) {
97 SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
98 if (This == NULL) {
99 return;
100 }
101
102 // Verify that the requested checks are all ones (bits) that
103 // are acceptable, known values.
104 if (checks & ~USPOOF_ALL_CHECKS) {
105 *status = U_ILLEGAL_ARGUMENT_ERROR;
106 return;
107 }
108
109 This->fChecks = checks;
110}
111
112
113U_CAPI int32_t U_EXPORT2
114uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status) {
115 const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
116 if (This == NULL) {
117 return 0;
118 }
119 return This->fChecks;
120}
121
122U_CAPI void U_EXPORT2
123uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode *status) {
124 SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
125 if (This == NULL) {
126 return;
127 }
128 This->setAllowedLocales(localesList, *status);
129}
130
131U_CAPI const char * U_EXPORT2
132uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status) {
133 SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
134 if (This == NULL) {
135 return NULL;
136 }
137 return This->getAllowedLocales(*status);
138}
139
140
141U_CAPI const USet * U_EXPORT2
142uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status) {
143 const UnicodeSet *result = uspoof_getAllowedUnicodeSet(sc, status);
144 return reinterpret_cast<const USet *>(result);
145}
146
147U_CAPI const UnicodeSet * U_EXPORT2
148uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status) {
149 const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
150 if (This == NULL) {
151 return NULL;
152 }
153 return This->fAllowedCharsSet;
154}
155
156
157U_CAPI void U_EXPORT2
158uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status) {
159 const UnicodeSet *set = reinterpret_cast<const UnicodeSet *>(chars);
160 uspoof_setAllowedUnicodeSet(sc, set, status);
161}
162
163
164U_CAPI void U_EXPORT2
165uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const UnicodeSet *chars, UErrorCode *status) {
166 SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
167 if (This == NULL) {
168 return;
169 }
170 if (chars->isBogus()) {
171 *status = U_ILLEGAL_ARGUMENT_ERROR;
172 return;
173 }
174 UnicodeSet *clonedSet = static_cast<UnicodeSet *>(chars->clone());
175 if (clonedSet == NULL || clonedSet->isBogus()) {
176 *status = U_MEMORY_ALLOCATION_ERROR;
177 return;
178 }
179 clonedSet->freeze();
180 delete This->fAllowedCharsSet;
181 This->fAllowedCharsSet = clonedSet;
182 This->fChecks |= USPOOF_CHAR_LIMIT;
183}
184
185
186U_CAPI int32_t U_EXPORT2
187uspoof_check(const USpoofChecker *sc,
188 const UChar *text, int32_t length,
189 int32_t *position,
190 UErrorCode *status) {
191
192 const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
193 if (This == NULL) {
194 return 0;
195 }
196 if (length < -1) {
197 *status = U_ILLEGAL_ARGUMENT_ERROR;
198 return 0;
199 }
200 if (length == -1) {
201 // It's not worth the bother to handle nul terminated strings everywhere.
202 // Just get the length and be done with it.
203 length = u_strlen(text);
204 }
205
206 int32_t result = 0;
207 int32_t failPos = 0x7fffffff; // TODO: do we have a #define for max int32?
208
209 // A count of the number of non-Common or inherited scripts.
210 // Needed for both the SINGLE_SCRIPT and the WHOLE/MIXED_SCIRPT_CONFUSABLE tests.
211 // Share the computation when possible. scriptCount == -1 means that we haven't
212 // done it yet.
213 int32_t scriptCount = -1;
214
215 if ((This->fChecks) & USPOOF_SINGLE_SCRIPT) {
216 scriptCount = This->scriptScan(text, length, failPos, *status);
217 // printf("scriptCount (clipped to 2) = %d\n", scriptCount);
218 if ( scriptCount >= 2) {
219 // Note: scriptCount == 2 covers all cases of the number of scripts >= 2
220 result |= USPOOF_SINGLE_SCRIPT;
221 }
222 }
223
224 if (This->fChecks & USPOOF_CHAR_LIMIT) {
225 int32_t i;
226 UChar32 c;
227 for (i=0; i<length ;) {
228 U16_NEXT(text, i, length, c);
229 if (!This->fAllowedCharsSet->contains(c)) {
230 result |= USPOOF_CHAR_LIMIT;
231 if (i < failPos) {
232 failPos = i;
233 }
234 break;
235 }
236 }
237 }
238
239 if (This->fChecks &
240 (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) {
241 // These are the checks that need to be done on NFD input
242 NFDBuffer normalizedInput(text, length, *status);
243 const UChar *nfdText = normalizedInput.getBuffer();
244 int32_t nfdLength = normalizedInput.getLength();
245
246 if (This->fChecks & USPOOF_INVISIBLE) {
247
248 // scan for more than one occurence of the same non-spacing mark
249 // in a sequence of non-spacing marks.
250 int32_t i;
251 UChar32 c;
252 UChar32 firstNonspacingMark = 0;
253 UBool haveMultipleMarks = FALSE;
254 UnicodeSet marksSeenSoFar; // Set of combining marks in a single combining sequence.
255
4388f060 256 for (i=0; i<nfdLength ;) {
729e4ab9
A
257 U16_NEXT(nfdText, i, nfdLength, c);
258 if (u_charType(c) != U_NON_SPACING_MARK) {
259 firstNonspacingMark = 0;
260 if (haveMultipleMarks) {
261 marksSeenSoFar.clear();
262 haveMultipleMarks = FALSE;
263 }
264 continue;
265 }
266 if (firstNonspacingMark == 0) {
267 firstNonspacingMark = c;
268 continue;
269 }
270 if (!haveMultipleMarks) {
271 marksSeenSoFar.add(firstNonspacingMark);
272 haveMultipleMarks = TRUE;
273 }
274 if (marksSeenSoFar.contains(c)) {
275 // report the error, and stop scanning.
276 // No need to find more than the first failure.
277 result |= USPOOF_INVISIBLE;
278 failPos = i;
4388f060
A
279 // TODO: Bug 8655: failPos is the position in the NFD buffer, but what we want
280 // to give back to our caller is a position in the original input string.
281 if (failPos > length) {
282 failPos = length;
283 }
729e4ab9
A
284 break;
285 }
286 marksSeenSoFar.add(c);
287 }
288 }
289
290
291 if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) {
292 // The basic test is the same for both whole and mixed script confusables.
293 // Compute the set of scripts that every input character has a confusable in.
294 // For this computation an input character is always considered to be
295 // confusable with itself in its own script.
296 // If the number of such scripts is two or more, and the input consisted of
297 // characters all from a single script, we have a whole script confusable.
298 // (The two scripts will be the original script and the one that is confusable)
299 // If the number of such scripts >= one, and the original input contained characters from
300 // more than one script, we have a mixed script confusable. (We can transform
301 // some of the characters, and end up with a visually similar string all in
302 // one script.)
303
304 if (scriptCount == -1) {
305 int32_t t;
306 scriptCount = This->scriptScan(text, length, t, *status);
307 }
308
309 ScriptSet scripts;
310 This->wholeScriptCheck(nfdText, nfdLength, &scripts, *status);
311 int32_t confusableScriptCount = scripts.countMembers();
312 //printf("confusableScriptCount = %d\n", confusableScriptCount);
313
314 if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&
315 confusableScriptCount >= 2 &&
316 scriptCount == 1) {
317 result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
318 }
319
320 if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&
321 confusableScriptCount >= 1 &&
322 scriptCount > 1) {
323 result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
324 }
325 }
326 }
327 if (position != NULL && failPos != 0x7fffffff) {
328 *position = failPos;
329 }
330 return result;
331}
332
333
334U_CAPI int32_t U_EXPORT2
335uspoof_checkUTF8(const USpoofChecker *sc,
336 const char *text, int32_t length,
337 int32_t *position,
338 UErrorCode *status) {
339
340 if (U_FAILURE(*status)) {
341 return 0;
342 }
343 UChar stackBuf[USPOOF_STACK_BUFFER_SIZE];
344 UChar* text16 = stackBuf;
345 int32_t len16;
346
347 u_strFromUTF8(text16, USPOOF_STACK_BUFFER_SIZE, &len16, text, length, status);
348 if (U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) {
349 return 0;
350 }
351 if (*status == U_BUFFER_OVERFLOW_ERROR) {
352 text16 = static_cast<UChar *>(uprv_malloc(len16 * sizeof(UChar) + 2));
353 if (text16 == NULL) {
354 *status = U_MEMORY_ALLOCATION_ERROR;
355 return 0;
356 }
357 *status = U_ZERO_ERROR;
358 u_strFromUTF8(text16, len16+1, NULL, text, length, status);
359 }
360
361 int32_t position16 = -1;
362 int32_t result = uspoof_check(sc, text16, len16, &position16, status);
363 if (U_FAILURE(*status)) {
364 return 0;
365 }
366
367 if (position16 > 0) {
368 // Translate a UTF-16 based error position back to a UTF-8 offset.
369 // u_strToUTF8() in preflight mode is an easy way to do it.
370 U_ASSERT(position16 <= len16);
371 u_strToUTF8(NULL, 0, position, text16, position16, status);
372 if (position > 0) {
373 // position is the required buffer length from u_strToUTF8, which includes
374 // space for a terminating NULL, which we don't want, hence the -1.
375 *position -= 1;
376 }
377 *status = U_ZERO_ERROR; // u_strToUTF8, above sets BUFFER_OVERFLOW_ERROR.
378 }
379
380 if (text16 != stackBuf) {
381 uprv_free(text16);
382 }
383 return result;
384
385}
386
387/* A convenience wrapper around the public uspoof_getSkeleton that handles
388 * allocating a larger buffer than provided if the original is too small.
389 */
390static UChar *getSkeleton(const USpoofChecker *sc, uint32_t type, const UChar *s, int32_t inputLength,
391 UChar *dest, int32_t destCapacity, int32_t *outputLength, UErrorCode *status) {
392 int32_t requiredCapacity = 0;
393 UChar *buf = dest;
394
395 if (U_FAILURE(*status)) {
396 return NULL;
397 }
398 requiredCapacity = uspoof_getSkeleton(sc, type, s, inputLength, dest, destCapacity, status);
399 if (*status == U_BUFFER_OVERFLOW_ERROR) {
400 buf = static_cast<UChar *>(uprv_malloc(requiredCapacity * sizeof(UChar)));
401 if (buf == NULL) {
402 *status = U_MEMORY_ALLOCATION_ERROR;
403 return NULL;
404 }
405 *status = U_ZERO_ERROR;
406 uspoof_getSkeleton(sc, type, s, inputLength, buf, requiredCapacity, status);
407 }
408 *outputLength = requiredCapacity;
409 return buf;
410}
411
412
413U_CAPI int32_t U_EXPORT2
414uspoof_areConfusable(const USpoofChecker *sc,
415 const UChar *s1, int32_t length1,
416 const UChar *s2, int32_t length2,
417 UErrorCode *status) {
418 const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
419 if (U_FAILURE(*status)) {
420 return 0;
421 }
422 //
423 // See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable,
424 // and for definitions of the types (single, whole, mixed-script) of confusables.
425
426 // We only care about a few of the check flags. Ignore the others.
427 // If no tests relavant to this function have been specified, return an error.
428 // TODO: is this really the right thing to do? It's probably an error on the caller's part,
429 // but logically we would just return 0 (no error).
430 if ((This->fChecks & (USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE |
431 USPOOF_WHOLE_SCRIPT_CONFUSABLE)) == 0) {
432 *status = U_INVALID_STATE_ERROR;
433 return 0;
434 }
435 int32_t flagsForSkeleton = This->fChecks & USPOOF_ANY_CASE;
436 UChar s1SkeletonBuf[USPOOF_STACK_BUFFER_SIZE];
437 UChar *s1Skeleton;
438 int32_t s1SkeletonLength = 0;
439
440 UChar s2SkeletonBuf[USPOOF_STACK_BUFFER_SIZE];
441 UChar *s2Skeleton;
442 int32_t s2SkeletonLength = 0;
443
444 int32_t result = 0;
445 int32_t t;
446 int32_t s1ScriptCount = This->scriptScan(s1, length1, t, *status);
447 int32_t s2ScriptCount = This->scriptScan(s2, length2, t, *status);
448
449 if (This->fChecks & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
450 // Do the Single Script compare.
451 if (s1ScriptCount <= 1 && s2ScriptCount <= 1) {
452 flagsForSkeleton |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
453 s1Skeleton = getSkeleton(sc, flagsForSkeleton, s1, length1, s1SkeletonBuf,
454 sizeof(s1SkeletonBuf)/sizeof(UChar), &s1SkeletonLength, status);
455 s2Skeleton = getSkeleton(sc, flagsForSkeleton, s2, length2, s2SkeletonBuf,
456 sizeof(s2SkeletonBuf)/sizeof(UChar), &s2SkeletonLength, status);
457 if (s1SkeletonLength == s2SkeletonLength && u_strncmp(s1Skeleton, s2Skeleton, s1SkeletonLength) == 0) {
458 result |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
459 }
460 if (s1Skeleton != s1SkeletonBuf) {
461 uprv_free(s1Skeleton);
462 }
463 if (s2Skeleton != s2SkeletonBuf) {
464 uprv_free(s2Skeleton);
465 }
466 }
467 }
468
469 if (result & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
470 // If the two inputs are single script confusable they cannot also be
471 // mixed or whole script confusable, according to the UAX39 definitions.
472 // So we can skip those tests.
473 return result;
474 }
475
476 // Optimization for whole script confusables test: two identifiers are whole script confusable if
477 // each is of a single script and they are mixed script confusable.
478 UBool possiblyWholeScriptConfusables =
479 s1ScriptCount <= 1 && s2ScriptCount <= 1 && (This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE);
480
481 //
482 // Mixed Script Check
483 //
484 if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) || possiblyWholeScriptConfusables ) {
485 // For getSkeleton(), resetting the USPOOF_SINGLE_SCRIPT_CONFUSABLE flag will get us
486 // the mixed script table skeleton, which is what we want.
487 // The Any Case / Lower Case bit in the skelton flags was set at the top of the function.
488 flagsForSkeleton &= ~USPOOF_SINGLE_SCRIPT_CONFUSABLE;
489 s1Skeleton = getSkeleton(sc, flagsForSkeleton, s1, length1, s1SkeletonBuf,
490 sizeof(s1SkeletonBuf)/sizeof(UChar), &s1SkeletonLength, status);
491 s2Skeleton = getSkeleton(sc, flagsForSkeleton, s2, length2, s2SkeletonBuf,
492 sizeof(s2SkeletonBuf)/sizeof(UChar), &s2SkeletonLength, status);
493 if (s1SkeletonLength == s2SkeletonLength && u_strncmp(s1Skeleton, s2Skeleton, s1SkeletonLength) == 0) {
494 result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
495 if (possiblyWholeScriptConfusables) {
496 result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
497 }
498 }
499 if (s1Skeleton != s1SkeletonBuf) {
500 uprv_free(s1Skeleton);
501 }
502 if (s2Skeleton != s2SkeletonBuf) {
503 uprv_free(s2Skeleton);
504 }
505 }
506
507 return result;
508}
509
510
511// Convenience function for converting a UTF-8 input to a UChar * string, including
512// reallocating a buffer when required. Parameters and their interpretation mostly
513// match u_strFromUTF8.
514
515static UChar * convertFromUTF8(UChar *outBuf, int32_t outBufCapacity, int32_t *outputLength,
516 const char *in, int32_t inLength, UErrorCode *status) {
517 if (U_FAILURE(*status)) {
518 return NULL;
519 }
520 UChar *dest = outBuf;
521 u_strFromUTF8(dest, outBufCapacity, outputLength, in, inLength, status);
522 if (*status == U_BUFFER_OVERFLOW_ERROR) {
523 dest = static_cast<UChar *>(uprv_malloc(*outputLength * sizeof(UChar)));
524 if (dest == NULL) {
525 *status = U_MEMORY_ALLOCATION_ERROR;
526 return NULL;
527 }
528 *status = U_ZERO_ERROR;
529 u_strFromUTF8(dest, *outputLength, NULL, in, inLength, status);
530 }
531 return dest;
532}
533
534
535
536U_CAPI int32_t U_EXPORT2
537uspoof_areConfusableUTF8(const USpoofChecker *sc,
538 const char *s1, int32_t length1,
539 const char *s2, int32_t length2,
540 UErrorCode *status) {
541
542 SpoofImpl::validateThis(sc, *status);
543 if (U_FAILURE(*status)) {
544 return 0;
545 }
546
547 UChar s1Buf[USPOOF_STACK_BUFFER_SIZE];
548 int32_t lengthS1U;
549 UChar *s1U = convertFromUTF8(s1Buf, USPOOF_STACK_BUFFER_SIZE, &lengthS1U, s1, length1, status);
550
551 UChar s2Buf[USPOOF_STACK_BUFFER_SIZE];
552 int32_t lengthS2U;
553 UChar *s2U = convertFromUTF8(s2Buf, USPOOF_STACK_BUFFER_SIZE, &lengthS2U, s2, length2, status);
554
555 int32_t results = uspoof_areConfusable(sc, s1U, lengthS1U, s2U, lengthS2U, status);
556
557 if (s1U != s1Buf) {
558 uprv_free(s1U);
559 }
560 if (s2U != s2Buf) {
561 uprv_free(s2U);
562 }
563 return results;
564}
565
566
567U_CAPI int32_t U_EXPORT2
568uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
4388f060
A
569 const icu::UnicodeString &s1,
570 const icu::UnicodeString &s2,
729e4ab9
A
571 UErrorCode *status) {
572
573 const UChar *u1 = s1.getBuffer();
574 int32_t length1 = s1.length();
575 const UChar *u2 = s2.getBuffer();
576 int32_t length2 = s2.length();
577
578 int32_t results = uspoof_areConfusable(sc, u1, length1, u2, length2, status);
579 return results;
580}
581
582
583
584
585U_CAPI int32_t U_EXPORT2
586uspoof_checkUnicodeString(const USpoofChecker *sc,
4388f060 587 const icu::UnicodeString &text,
729e4ab9
A
588 int32_t *position,
589 UErrorCode *status) {
590 int32_t result = uspoof_check(sc, text.getBuffer(), text.length(), position, status);
591 return result;
592}
593
594
595U_CAPI int32_t U_EXPORT2
596uspoof_getSkeleton(const USpoofChecker *sc,
597 uint32_t type,
598 const UChar *s, int32_t length,
599 UChar *dest, int32_t destCapacity,
600 UErrorCode *status) {
601
602 // TODO: this function could be sped up a bit
603 // Skip the input normalization when not needed, work from callers data.
604 // Put the initial skeleton straight into the caller's destination buffer.
605 // It probably won't need normalization.
606 // But these would make the structure more complicated.
607
608 const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
609 if (U_FAILURE(*status)) {
610 return 0;
611 }
612 if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL) ||
613 (type & ~(USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE)) != 0) {
614 *status = U_ILLEGAL_ARGUMENT_ERROR;
615 return 0;
616 }
617
618 int32_t tableMask = 0;
619 switch (type) {
620 case 0:
621 tableMask = USPOOF_ML_TABLE_FLAG;
622 break;
623 case USPOOF_SINGLE_SCRIPT_CONFUSABLE:
624 tableMask = USPOOF_SL_TABLE_FLAG;
625 break;
626 case USPOOF_ANY_CASE:
627 tableMask = USPOOF_MA_TABLE_FLAG;
628 break;
629 case USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE:
630 tableMask = USPOOF_SA_TABLE_FLAG;
631 break;
632 default:
633 *status = U_ILLEGAL_ARGUMENT_ERROR;
634 return 0;
635 }
636
637 // NFD transform of the user supplied input
638
639 UChar nfdStackBuf[USPOOF_STACK_BUFFER_SIZE];
640 UChar *nfdInput = nfdStackBuf;
641 int32_t normalizedLen = unorm_normalize(
642 s, length, UNORM_NFD, 0, nfdInput, USPOOF_STACK_BUFFER_SIZE, status);
643 if (*status == U_BUFFER_OVERFLOW_ERROR) {
644 nfdInput = (UChar *)uprv_malloc((normalizedLen+1)*sizeof(UChar));
645 if (nfdInput == NULL) {
646 *status = U_MEMORY_ALLOCATION_ERROR;
647 return 0;
648 }
649 *status = U_ZERO_ERROR;
650 normalizedLen = unorm_normalize(s, length, UNORM_NFD, 0,
651 nfdInput, normalizedLen+1, status);
652 }
653 if (U_FAILURE(*status)) {
654 if (nfdInput != nfdStackBuf) {
655 uprv_free(nfdInput);
656 }
657 return 0;
658 }
659
660 // buffer to hold the Unicode defined skeleton mappings for a single code point
661 UChar buf[USPOOF_MAX_SKELETON_EXPANSION];
662
663 // Apply the skeleton mapping to the NFD normalized input string
664 // Accumulate the skeleton, possibly unnormalized, in a UnicodeString.
665 int32_t inputIndex = 0;
666 UnicodeString skelStr;
667 while (inputIndex < normalizedLen) {
668 UChar32 c;
669 U16_NEXT(nfdInput, inputIndex, normalizedLen, c);
670 int32_t replaceLen = This->confusableLookup(c, tableMask, buf);
671 skelStr.append(buf, replaceLen);
672 }
673
674 if (nfdInput != nfdStackBuf) {
675 uprv_free(nfdInput);
676 }
677
678 const UChar *result = skelStr.getBuffer();
679 int32_t resultLen = skelStr.length();
680 UChar *normedResult = NULL;
681
682 // Check the skeleton for NFD, normalize it if needed.
683 // Unnormalized results should be very rare.
684 if (!unorm_isNormalized(result, resultLen, UNORM_NFD, status)) {
685 normalizedLen = unorm_normalize(result, resultLen, UNORM_NFD, 0, NULL, 0, status);
686 normedResult = static_cast<UChar *>(uprv_malloc((normalizedLen+1)*sizeof(UChar)));
687 if (normedResult == NULL) {
688 *status = U_MEMORY_ALLOCATION_ERROR;
689 return 0;
690 }
691 *status = U_ZERO_ERROR;
692 unorm_normalize(result, resultLen, UNORM_NFD, 0, normedResult, normalizedLen+1, status);
693 result = normedResult;
694 resultLen = normalizedLen;
695 }
696
697 // Copy the skeleton to the caller's buffer
698 if (U_SUCCESS(*status)) {
699 if (destCapacity == 0 || resultLen > destCapacity) {
700 *status = resultLen>destCapacity ? U_BUFFER_OVERFLOW_ERROR : U_STRING_NOT_TERMINATED_WARNING;
701 } else {
702 u_memcpy(dest, result, resultLen);
703 if (destCapacity > resultLen) {
704 dest[resultLen] = 0;
705 } else {
706 *status = U_STRING_NOT_TERMINATED_WARNING;
707 }
708 }
709 }
710 uprv_free(normedResult);
711 return resultLen;
712}
713
714
715
716U_CAPI UnicodeString & U_EXPORT2
717uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
718 uint32_t type,
719 const UnicodeString &s,
720 UnicodeString &dest,
721 UErrorCode *status) {
722 if (U_FAILURE(*status)) {
723 return dest;
724 }
725 dest.remove();
726
727 const UChar *str = s.getBuffer();
728 int32_t strLen = s.length();
729 UChar smallBuf[USPOOF_STACK_BUFFER_SIZE];
730 UChar *buf = smallBuf;
731 int32_t outputSize = uspoof_getSkeleton(sc, type, str, strLen, smallBuf, USPOOF_STACK_BUFFER_SIZE, status);
732 if (*status == U_BUFFER_OVERFLOW_ERROR) {
733 buf = static_cast<UChar *>(uprv_malloc((outputSize+1)*sizeof(UChar)));
734 if (buf == NULL) {
735 *status = U_MEMORY_ALLOCATION_ERROR;
736 return dest;
737 }
738 *status = U_ZERO_ERROR;
739 uspoof_getSkeleton(sc, type, str, strLen, buf, outputSize+1, status);
740 }
741 if (U_SUCCESS(*status)) {
742 dest.setTo(buf, outputSize);
743 }
744
745 if (buf != smallBuf) {
746 uprv_free(buf);
747 }
748 return dest;
749}
750
751
752U_CAPI int32_t U_EXPORT2
753uspoof_getSkeletonUTF8(const USpoofChecker *sc,
754 uint32_t type,
755 const char *s, int32_t length,
756 char *dest, int32_t destCapacity,
757 UErrorCode *status) {
758 // Lacking a UTF-8 normalization API, just converting the input to
759 // UTF-16 seems as good an approach as any. In typical use, input will
760 // be an identifier, which is to say not too long for stack buffers.
761 if (U_FAILURE(*status)) {
762 return 0;
763 }
764 // Buffers for the UChar form of the input and skeleton strings.
765 UChar smallInBuf[USPOOF_STACK_BUFFER_SIZE];
766 UChar *inBuf = smallInBuf;
767 UChar smallOutBuf[USPOOF_STACK_BUFFER_SIZE];
768 UChar *outBuf = smallOutBuf;
769
770 int32_t lengthInUChars = 0;
771 int32_t skelLengthInUChars = 0;
772 int32_t skelLengthInUTF8 = 0;
773
774 u_strFromUTF8(inBuf, USPOOF_STACK_BUFFER_SIZE, &lengthInUChars,
775 s, length, status);
776 if (*status == U_BUFFER_OVERFLOW_ERROR) {
777 inBuf = static_cast<UChar *>(uprv_malloc((lengthInUChars+1)*sizeof(UChar)));
778 if (inBuf == NULL) {
779 *status = U_MEMORY_ALLOCATION_ERROR;
780 goto cleanup;
781 }
782 *status = U_ZERO_ERROR;
783 u_strFromUTF8(inBuf, lengthInUChars+1, &lengthInUChars,
784 s, length, status);
785 }
786
787 skelLengthInUChars = uspoof_getSkeleton(sc, type, inBuf, lengthInUChars,
788 outBuf, USPOOF_STACK_BUFFER_SIZE, status);
789 if (*status == U_BUFFER_OVERFLOW_ERROR) {
790 outBuf = static_cast<UChar *>(uprv_malloc((skelLengthInUChars+1)*sizeof(UChar)));
791 if (outBuf == NULL) {
792 *status = U_MEMORY_ALLOCATION_ERROR;
793 goto cleanup;
794 }
795 *status = U_ZERO_ERROR;
796 skelLengthInUChars = uspoof_getSkeleton(sc, type, inBuf, lengthInUChars,
797 outBuf, skelLengthInUChars+1, status);
798 }
799
800 u_strToUTF8(dest, destCapacity, &skelLengthInUTF8,
801 outBuf, skelLengthInUChars, status);
802
803 cleanup:
804 if (inBuf != smallInBuf) {
805 uprv_free(inBuf);
806 }
807 if (outBuf != smallOutBuf) {
808 uprv_free(outBuf);
809 }
810 return skelLengthInUTF8;
811}
812
813
814U_CAPI int32_t U_EXPORT2
815uspoof_serialize(USpoofChecker *sc,void *buf, int32_t capacity, UErrorCode *status) {
816 SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
817 if (This == NULL) {
818 U_ASSERT(U_FAILURE(*status));
819 return 0;
820 }
821 int32_t dataSize = This->fSpoofData->fRawData->fLength;
822 if (capacity < dataSize) {
823 *status = U_BUFFER_OVERFLOW_ERROR;
824 return dataSize;
825 }
826 uprv_memcpy(buf, This->fSpoofData->fRawData, dataSize);
827 return dataSize;
828}
829
830#endif