]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/uspoof.cpp
ICU-461.18.tar.gz
[apple/icu.git] / icuSources / i18n / uspoof.cpp
1 /*
2 ***************************************************************************
3 * Copyright (C) 2008-2011, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 ***************************************************************************
6 * file name: uspoof.cpp
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2008Feb13
12 * created by: Andy Heninger
13 *
14 * Unicode Spoof Detection
15 */
16 #include "unicode/utypes.h"
17 #include "unicode/uspoof.h"
18 #include "unicode/unorm.h"
19 #include "unicode/ustring.h"
20 #include "cmemory.h"
21 #include "uspoof_impl.h"
22 #include "uassert.h"
23
24
25 #if !UCONFIG_NO_NORMALIZATION
26
27
28 #include <stdio.h> // debug
29
30 U_NAMESPACE_USE
31
32
33 U_CAPI USpoofChecker * U_EXPORT2
34 uspoof_open(UErrorCode *status) {
35 if (U_FAILURE(*status)) {
36 return NULL;
37 }
38 SpoofImpl *si = new SpoofImpl(SpoofData::getDefault(*status), *status);
39 if (U_FAILURE(*status)) {
40 delete si;
41 si = NULL;
42 }
43 return (USpoofChecker *)si;
44 }
45
46
47 U_CAPI USpoofChecker * U_EXPORT2
48 uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLength,
49 UErrorCode *status) {
50 if (U_FAILURE(*status)) {
51 return NULL;
52 }
53 SpoofData *sd = new SpoofData(data, length, *status);
54 SpoofImpl *si = new SpoofImpl(sd, *status);
55 if (U_FAILURE(*status)) {
56 delete sd;
57 delete si;
58 return NULL;
59 }
60 if (sd == NULL || si == NULL) {
61 *status = U_MEMORY_ALLOCATION_ERROR;
62 delete sd;
63 delete si;
64 return NULL;
65 }
66
67 if (pActualLength != NULL) {
68 *pActualLength = sd->fRawData->fLength;
69 }
70 return reinterpret_cast<USpoofChecker *>(si);
71 }
72
73
74 U_CAPI USpoofChecker * U_EXPORT2
75 uspoof_clone(const USpoofChecker *sc, UErrorCode *status) {
76 const SpoofImpl *src = SpoofImpl::validateThis(sc, *status);
77 if (src == NULL) {
78 return NULL;
79 }
80 SpoofImpl *result = new SpoofImpl(*src, *status); // copy constructor
81 if (U_FAILURE(*status)) {
82 delete result;
83 result = NULL;
84 }
85 return (USpoofChecker *)result;
86 }
87
88
89 U_CAPI void U_EXPORT2
90 uspoof_close(USpoofChecker *sc) {
91 UErrorCode status = U_ZERO_ERROR;
92 SpoofImpl *This = SpoofImpl::validateThis(sc, status);
93 delete This;
94 }
95
96
97 U_CAPI void U_EXPORT2
98 uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status) {
99 SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
100 if (This == NULL) {
101 return;
102 }
103
104 // Verify that the requested checks are all ones (bits) that
105 // are acceptable, known values.
106 if (checks & ~USPOOF_ALL_CHECKS) {
107 *status = U_ILLEGAL_ARGUMENT_ERROR;
108 return;
109 }
110
111 This->fChecks = checks;
112 }
113
114
115 U_CAPI int32_t U_EXPORT2
116 uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status) {
117 const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
118 if (This == NULL) {
119 return 0;
120 }
121 return This->fChecks;
122 }
123
124 U_CAPI void U_EXPORT2
125 uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode *status) {
126 SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
127 if (This == NULL) {
128 return;
129 }
130 This->setAllowedLocales(localesList, *status);
131 }
132
133 U_CAPI const char * U_EXPORT2
134 uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status) {
135 SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
136 if (This == NULL) {
137 return NULL;
138 }
139 return This->getAllowedLocales(*status);
140 }
141
142
143 U_CAPI const USet * U_EXPORT2
144 uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status) {
145 const UnicodeSet *result = uspoof_getAllowedUnicodeSet(sc, status);
146 return reinterpret_cast<const USet *>(result);
147 }
148
149 U_CAPI const UnicodeSet * U_EXPORT2
150 uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status) {
151 const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
152 if (This == NULL) {
153 return NULL;
154 }
155 return This->fAllowedCharsSet;
156 }
157
158
159 U_CAPI void U_EXPORT2
160 uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status) {
161 const UnicodeSet *set = reinterpret_cast<const UnicodeSet *>(chars);
162 uspoof_setAllowedUnicodeSet(sc, set, status);
163 }
164
165
166 U_CAPI void U_EXPORT2
167 uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const UnicodeSet *chars, UErrorCode *status) {
168 SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
169 if (This == NULL) {
170 return;
171 }
172 if (chars->isBogus()) {
173 *status = U_ILLEGAL_ARGUMENT_ERROR;
174 return;
175 }
176 UnicodeSet *clonedSet = static_cast<UnicodeSet *>(chars->clone());
177 if (clonedSet == NULL || clonedSet->isBogus()) {
178 *status = U_MEMORY_ALLOCATION_ERROR;
179 return;
180 }
181 clonedSet->freeze();
182 delete This->fAllowedCharsSet;
183 This->fAllowedCharsSet = clonedSet;
184 This->fChecks |= USPOOF_CHAR_LIMIT;
185 }
186
187
188 U_CAPI int32_t U_EXPORT2
189 uspoof_check(const USpoofChecker *sc,
190 const UChar *text, int32_t length,
191 int32_t *position,
192 UErrorCode *status) {
193
194 const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
195 if (This == NULL) {
196 return 0;
197 }
198 if (length < -1) {
199 *status = U_ILLEGAL_ARGUMENT_ERROR;
200 return 0;
201 }
202 if (length == -1) {
203 // It's not worth the bother to handle nul terminated strings everywhere.
204 // Just get the length and be done with it.
205 length = u_strlen(text);
206 }
207
208 int32_t result = 0;
209 int32_t failPos = 0x7fffffff; // TODO: do we have a #define for max int32?
210
211 // A count of the number of non-Common or inherited scripts.
212 // Needed for both the SINGLE_SCRIPT and the WHOLE/MIXED_SCIRPT_CONFUSABLE tests.
213 // Share the computation when possible. scriptCount == -1 means that we haven't
214 // done it yet.
215 int32_t scriptCount = -1;
216
217 if ((This->fChecks) & USPOOF_SINGLE_SCRIPT) {
218 scriptCount = This->scriptScan(text, length, failPos, *status);
219 // printf("scriptCount (clipped to 2) = %d\n", scriptCount);
220 if ( scriptCount >= 2) {
221 // Note: scriptCount == 2 covers all cases of the number of scripts >= 2
222 result |= USPOOF_SINGLE_SCRIPT;
223 }
224 }
225
226 if (This->fChecks & USPOOF_CHAR_LIMIT) {
227 int32_t i;
228 UChar32 c;
229 for (i=0; i<length ;) {
230 U16_NEXT(text, i, length, c);
231 if (!This->fAllowedCharsSet->contains(c)) {
232 result |= USPOOF_CHAR_LIMIT;
233 if (i < failPos) {
234 failPos = i;
235 }
236 break;
237 }
238 }
239 }
240
241 if (This->fChecks &
242 (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) {
243 // These are the checks that need to be done on NFD input
244 NFDBuffer normalizedInput(text, length, *status);
245 const UChar *nfdText = normalizedInput.getBuffer();
246 int32_t nfdLength = normalizedInput.getLength();
247
248 if (This->fChecks & USPOOF_INVISIBLE) {
249
250 // scan for more than one occurence of the same non-spacing mark
251 // in a sequence of non-spacing marks.
252 int32_t i;
253 UChar32 c;
254 UChar32 firstNonspacingMark = 0;
255 UBool haveMultipleMarks = FALSE;
256 UnicodeSet marksSeenSoFar; // Set of combining marks in a single combining sequence.
257
258 for (i=0; i<length ;) {
259 U16_NEXT(nfdText, i, nfdLength, c);
260 if (u_charType(c) != U_NON_SPACING_MARK) {
261 firstNonspacingMark = 0;
262 if (haveMultipleMarks) {
263 marksSeenSoFar.clear();
264 haveMultipleMarks = FALSE;
265 }
266 continue;
267 }
268 if (firstNonspacingMark == 0) {
269 firstNonspacingMark = c;
270 continue;
271 }
272 if (!haveMultipleMarks) {
273 marksSeenSoFar.add(firstNonspacingMark);
274 haveMultipleMarks = TRUE;
275 }
276 if (marksSeenSoFar.contains(c)) {
277 // report the error, and stop scanning.
278 // No need to find more than the first failure.
279 result |= USPOOF_INVISIBLE;
280 failPos = i;
281 break;
282 }
283 marksSeenSoFar.add(c);
284 }
285 }
286
287
288 if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) {
289 // The basic test is the same for both whole and mixed script confusables.
290 // Compute the set of scripts that every input character has a confusable in.
291 // For this computation an input character is always considered to be
292 // confusable with itself in its own script.
293 // If the number of such scripts is two or more, and the input consisted of
294 // characters all from a single script, we have a whole script confusable.
295 // (The two scripts will be the original script and the one that is confusable)
296 // If the number of such scripts >= one, and the original input contained characters from
297 // more than one script, we have a mixed script confusable. (We can transform
298 // some of the characters, and end up with a visually similar string all in
299 // one script.)
300
301 if (scriptCount == -1) {
302 int32_t t;
303 scriptCount = This->scriptScan(text, length, t, *status);
304 }
305
306 ScriptSet scripts;
307 This->wholeScriptCheck(nfdText, nfdLength, &scripts, *status);
308 int32_t confusableScriptCount = scripts.countMembers();
309 //printf("confusableScriptCount = %d\n", confusableScriptCount);
310
311 if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&
312 confusableScriptCount >= 2 &&
313 scriptCount == 1) {
314 result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
315 }
316
317 if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&
318 confusableScriptCount >= 1 &&
319 scriptCount > 1) {
320 result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
321 }
322 }
323 }
324 if (position != NULL && failPos != 0x7fffffff) {
325 *position = failPos;
326 }
327 return result;
328 }
329
330
331 U_CAPI int32_t U_EXPORT2
332 uspoof_checkUTF8(const USpoofChecker *sc,
333 const char *text, int32_t length,
334 int32_t *position,
335 UErrorCode *status) {
336
337 if (U_FAILURE(*status)) {
338 return 0;
339 }
340 UChar stackBuf[USPOOF_STACK_BUFFER_SIZE];
341 UChar* text16 = stackBuf;
342 int32_t len16;
343
344 u_strFromUTF8(text16, USPOOF_STACK_BUFFER_SIZE, &len16, text, length, status);
345 if (U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) {
346 return 0;
347 }
348 if (*status == U_BUFFER_OVERFLOW_ERROR) {
349 text16 = static_cast<UChar *>(uprv_malloc(len16 * sizeof(UChar) + 2));
350 if (text16 == NULL) {
351 *status = U_MEMORY_ALLOCATION_ERROR;
352 return 0;
353 }
354 *status = U_ZERO_ERROR;
355 u_strFromUTF8(text16, len16+1, NULL, text, length, status);
356 }
357
358 int32_t position16 = -1;
359 int32_t result = uspoof_check(sc, text16, len16, &position16, status);
360 if (U_FAILURE(*status)) {
361 return 0;
362 }
363
364 if (position16 > 0) {
365 // Translate a UTF-16 based error position back to a UTF-8 offset.
366 // u_strToUTF8() in preflight mode is an easy way to do it.
367 U_ASSERT(position16 <= len16);
368 u_strToUTF8(NULL, 0, position, text16, position16, status);
369 if (position > 0) {
370 // position is the required buffer length from u_strToUTF8, which includes
371 // space for a terminating NULL, which we don't want, hence the -1.
372 *position -= 1;
373 }
374 *status = U_ZERO_ERROR; // u_strToUTF8, above sets BUFFER_OVERFLOW_ERROR.
375 }
376
377 if (text16 != stackBuf) {
378 uprv_free(text16);
379 }
380 return result;
381
382 }
383
384 /* A convenience wrapper around the public uspoof_getSkeleton that handles
385 * allocating a larger buffer than provided if the original is too small.
386 */
387 static UChar *getSkeleton(const USpoofChecker *sc, uint32_t type, const UChar *s, int32_t inputLength,
388 UChar *dest, int32_t destCapacity, int32_t *outputLength, UErrorCode *status) {
389 int32_t requiredCapacity = 0;
390 UChar *buf = dest;
391
392 if (U_FAILURE(*status)) {
393 return NULL;
394 }
395 requiredCapacity = uspoof_getSkeleton(sc, type, s, inputLength, dest, destCapacity, status);
396 if (*status == U_BUFFER_OVERFLOW_ERROR) {
397 buf = static_cast<UChar *>(uprv_malloc(requiredCapacity * sizeof(UChar)));
398 if (buf == NULL) {
399 *status = U_MEMORY_ALLOCATION_ERROR;
400 return NULL;
401 }
402 *status = U_ZERO_ERROR;
403 uspoof_getSkeleton(sc, type, s, inputLength, buf, requiredCapacity, status);
404 }
405 *outputLength = requiredCapacity;
406 return buf;
407 }
408
409
410 U_CAPI int32_t U_EXPORT2
411 uspoof_areConfusable(const USpoofChecker *sc,
412 const UChar *s1, int32_t length1,
413 const UChar *s2, int32_t length2,
414 UErrorCode *status) {
415 const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
416 if (U_FAILURE(*status)) {
417 return 0;
418 }
419 //
420 // See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable,
421 // and for definitions of the types (single, whole, mixed-script) of confusables.
422
423 // We only care about a few of the check flags. Ignore the others.
424 // If no tests relavant to this function have been specified, return an error.
425 // TODO: is this really the right thing to do? It's probably an error on the caller's part,
426 // but logically we would just return 0 (no error).
427 if ((This->fChecks & (USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE |
428 USPOOF_WHOLE_SCRIPT_CONFUSABLE)) == 0) {
429 *status = U_INVALID_STATE_ERROR;
430 return 0;
431 }
432 int32_t flagsForSkeleton = This->fChecks & USPOOF_ANY_CASE;
433 UChar s1SkeletonBuf[USPOOF_STACK_BUFFER_SIZE];
434 UChar *s1Skeleton;
435 int32_t s1SkeletonLength = 0;
436
437 UChar s2SkeletonBuf[USPOOF_STACK_BUFFER_SIZE];
438 UChar *s2Skeleton;
439 int32_t s2SkeletonLength = 0;
440
441 int32_t result = 0;
442 int32_t t;
443 int32_t s1ScriptCount = This->scriptScan(s1, length1, t, *status);
444 int32_t s2ScriptCount = This->scriptScan(s2, length2, t, *status);
445
446 if (This->fChecks & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
447 // Do the Single Script compare.
448 if (s1ScriptCount <= 1 && s2ScriptCount <= 1) {
449 flagsForSkeleton |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
450 s1Skeleton = getSkeleton(sc, flagsForSkeleton, s1, length1, s1SkeletonBuf,
451 sizeof(s1SkeletonBuf)/sizeof(UChar), &s1SkeletonLength, status);
452 s2Skeleton = getSkeleton(sc, flagsForSkeleton, s2, length2, s2SkeletonBuf,
453 sizeof(s2SkeletonBuf)/sizeof(UChar), &s2SkeletonLength, status);
454 if (s1SkeletonLength == s2SkeletonLength && u_strncmp(s1Skeleton, s2Skeleton, s1SkeletonLength) == 0) {
455 result |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
456 }
457 if (s1Skeleton != s1SkeletonBuf) {
458 uprv_free(s1Skeleton);
459 }
460 if (s2Skeleton != s2SkeletonBuf) {
461 uprv_free(s2Skeleton);
462 }
463 }
464 }
465
466 if (result & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
467 // If the two inputs are single script confusable they cannot also be
468 // mixed or whole script confusable, according to the UAX39 definitions.
469 // So we can skip those tests.
470 return result;
471 }
472
473 // Optimization for whole script confusables test: two identifiers are whole script confusable if
474 // each is of a single script and they are mixed script confusable.
475 UBool possiblyWholeScriptConfusables =
476 s1ScriptCount <= 1 && s2ScriptCount <= 1 && (This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE);
477
478 //
479 // Mixed Script Check
480 //
481 if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) || possiblyWholeScriptConfusables ) {
482 // For getSkeleton(), resetting the USPOOF_SINGLE_SCRIPT_CONFUSABLE flag will get us
483 // the mixed script table skeleton, which is what we want.
484 // The Any Case / Lower Case bit in the skelton flags was set at the top of the function.
485 flagsForSkeleton &= ~USPOOF_SINGLE_SCRIPT_CONFUSABLE;
486 s1Skeleton = getSkeleton(sc, flagsForSkeleton, s1, length1, s1SkeletonBuf,
487 sizeof(s1SkeletonBuf)/sizeof(UChar), &s1SkeletonLength, status);
488 s2Skeleton = getSkeleton(sc, flagsForSkeleton, s2, length2, s2SkeletonBuf,
489 sizeof(s2SkeletonBuf)/sizeof(UChar), &s2SkeletonLength, status);
490 if (s1SkeletonLength == s2SkeletonLength && u_strncmp(s1Skeleton, s2Skeleton, s1SkeletonLength) == 0) {
491 result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
492 if (possiblyWholeScriptConfusables) {
493 result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
494 }
495 }
496 if (s1Skeleton != s1SkeletonBuf) {
497 uprv_free(s1Skeleton);
498 }
499 if (s2Skeleton != s2SkeletonBuf) {
500 uprv_free(s2Skeleton);
501 }
502 }
503
504 return result;
505 }
506
507
508 // Convenience function for converting a UTF-8 input to a UChar * string, including
509 // reallocating a buffer when required. Parameters and their interpretation mostly
510 // match u_strFromUTF8.
511
512 static UChar * convertFromUTF8(UChar *outBuf, int32_t outBufCapacity, int32_t *outputLength,
513 const char *in, int32_t inLength, UErrorCode *status) {
514 if (U_FAILURE(*status)) {
515 return NULL;
516 }
517 UChar *dest = outBuf;
518 u_strFromUTF8(dest, outBufCapacity, outputLength, in, inLength, status);
519 if (*status == U_BUFFER_OVERFLOW_ERROR) {
520 dest = static_cast<UChar *>(uprv_malloc(*outputLength * sizeof(UChar)));
521 if (dest == NULL) {
522 *status = U_MEMORY_ALLOCATION_ERROR;
523 return NULL;
524 }
525 *status = U_ZERO_ERROR;
526 u_strFromUTF8(dest, *outputLength, NULL, in, inLength, status);
527 }
528 return dest;
529 }
530
531
532
533 U_CAPI int32_t U_EXPORT2
534 uspoof_areConfusableUTF8(const USpoofChecker *sc,
535 const char *s1, int32_t length1,
536 const char *s2, int32_t length2,
537 UErrorCode *status) {
538
539 SpoofImpl::validateThis(sc, *status);
540 if (U_FAILURE(*status)) {
541 return 0;
542 }
543
544 UChar s1Buf[USPOOF_STACK_BUFFER_SIZE];
545 int32_t lengthS1U;
546 UChar *s1U = convertFromUTF8(s1Buf, USPOOF_STACK_BUFFER_SIZE, &lengthS1U, s1, length1, status);
547
548 UChar s2Buf[USPOOF_STACK_BUFFER_SIZE];
549 int32_t lengthS2U;
550 UChar *s2U = convertFromUTF8(s2Buf, USPOOF_STACK_BUFFER_SIZE, &lengthS2U, s2, length2, status);
551
552 int32_t results = uspoof_areConfusable(sc, s1U, lengthS1U, s2U, lengthS2U, status);
553
554 if (s1U != s1Buf) {
555 uprv_free(s1U);
556 }
557 if (s2U != s2Buf) {
558 uprv_free(s2U);
559 }
560 return results;
561 }
562
563
564 U_CAPI int32_t U_EXPORT2
565 uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
566 const U_NAMESPACE_QUALIFIER UnicodeString &s1,
567 const U_NAMESPACE_QUALIFIER UnicodeString &s2,
568 UErrorCode *status) {
569
570 const UChar *u1 = s1.getBuffer();
571 int32_t length1 = s1.length();
572 const UChar *u2 = s2.getBuffer();
573 int32_t length2 = s2.length();
574
575 int32_t results = uspoof_areConfusable(sc, u1, length1, u2, length2, status);
576 return results;
577 }
578
579
580
581
582 U_CAPI int32_t U_EXPORT2
583 uspoof_checkUnicodeString(const USpoofChecker *sc,
584 const U_NAMESPACE_QUALIFIER UnicodeString &text,
585 int32_t *position,
586 UErrorCode *status) {
587 int32_t result = uspoof_check(sc, text.getBuffer(), text.length(), position, status);
588 return result;
589 }
590
591
592 U_CAPI int32_t U_EXPORT2
593 uspoof_getSkeleton(const USpoofChecker *sc,
594 uint32_t type,
595 const UChar *s, int32_t length,
596 UChar *dest, int32_t destCapacity,
597 UErrorCode *status) {
598
599 // TODO: this function could be sped up a bit
600 // Skip the input normalization when not needed, work from callers data.
601 // Put the initial skeleton straight into the caller's destination buffer.
602 // It probably won't need normalization.
603 // But these would make the structure more complicated.
604
605 const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
606 if (U_FAILURE(*status)) {
607 return 0;
608 }
609 if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL) ||
610 (type & ~(USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE)) != 0) {
611 *status = U_ILLEGAL_ARGUMENT_ERROR;
612 return 0;
613 }
614
615 int32_t tableMask = 0;
616 switch (type) {
617 case 0:
618 tableMask = USPOOF_ML_TABLE_FLAG;
619 break;
620 case USPOOF_SINGLE_SCRIPT_CONFUSABLE:
621 tableMask = USPOOF_SL_TABLE_FLAG;
622 break;
623 case USPOOF_ANY_CASE:
624 tableMask = USPOOF_MA_TABLE_FLAG;
625 break;
626 case USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE:
627 tableMask = USPOOF_SA_TABLE_FLAG;
628 break;
629 default:
630 *status = U_ILLEGAL_ARGUMENT_ERROR;
631 return 0;
632 }
633
634 // NFD transform of the user supplied input
635
636 UChar nfdStackBuf[USPOOF_STACK_BUFFER_SIZE];
637 UChar *nfdInput = nfdStackBuf;
638 int32_t normalizedLen = unorm_normalize(
639 s, length, UNORM_NFD, 0, nfdInput, USPOOF_STACK_BUFFER_SIZE, status);
640 if (*status == U_BUFFER_OVERFLOW_ERROR) {
641 nfdInput = (UChar *)uprv_malloc((normalizedLen+1)*sizeof(UChar));
642 if (nfdInput == NULL) {
643 *status = U_MEMORY_ALLOCATION_ERROR;
644 return 0;
645 }
646 *status = U_ZERO_ERROR;
647 normalizedLen = unorm_normalize(s, length, UNORM_NFD, 0,
648 nfdInput, normalizedLen+1, status);
649 }
650 if (U_FAILURE(*status)) {
651 if (nfdInput != nfdStackBuf) {
652 uprv_free(nfdInput);
653 }
654 return 0;
655 }
656
657 // buffer to hold the Unicode defined skeleton mappings for a single code point
658 UChar buf[USPOOF_MAX_SKELETON_EXPANSION];
659
660 // Apply the skeleton mapping to the NFD normalized input string
661 // Accumulate the skeleton, possibly unnormalized, in a UnicodeString.
662 int32_t inputIndex = 0;
663 UnicodeString skelStr;
664 while (inputIndex < normalizedLen) {
665 UChar32 c;
666 U16_NEXT(nfdInput, inputIndex, normalizedLen, c);
667 int32_t replaceLen = This->confusableLookup(c, tableMask, buf);
668 skelStr.append(buf, replaceLen);
669 }
670
671 if (nfdInput != nfdStackBuf) {
672 uprv_free(nfdInput);
673 }
674
675 const UChar *result = skelStr.getBuffer();
676 int32_t resultLen = skelStr.length();
677 UChar *normedResult = NULL;
678
679 // Check the skeleton for NFD, normalize it if needed.
680 // Unnormalized results should be very rare.
681 if (!unorm_isNormalized(result, resultLen, UNORM_NFD, status)) {
682 normalizedLen = unorm_normalize(result, resultLen, UNORM_NFD, 0, NULL, 0, status);
683 normedResult = static_cast<UChar *>(uprv_malloc((normalizedLen+1)*sizeof(UChar)));
684 if (normedResult == NULL) {
685 *status = U_MEMORY_ALLOCATION_ERROR;
686 return 0;
687 }
688 *status = U_ZERO_ERROR;
689 unorm_normalize(result, resultLen, UNORM_NFD, 0, normedResult, normalizedLen+1, status);
690 result = normedResult;
691 resultLen = normalizedLen;
692 }
693
694 // Copy the skeleton to the caller's buffer
695 if (U_SUCCESS(*status)) {
696 if (destCapacity == 0 || resultLen > destCapacity) {
697 *status = resultLen>destCapacity ? U_BUFFER_OVERFLOW_ERROR : U_STRING_NOT_TERMINATED_WARNING;
698 } else {
699 u_memcpy(dest, result, resultLen);
700 if (destCapacity > resultLen) {
701 dest[resultLen] = 0;
702 } else {
703 *status = U_STRING_NOT_TERMINATED_WARNING;
704 }
705 }
706 }
707 uprv_free(normedResult);
708 return resultLen;
709 }
710
711
712
713 U_CAPI UnicodeString & U_EXPORT2
714 uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
715 uint32_t type,
716 const UnicodeString &s,
717 UnicodeString &dest,
718 UErrorCode *status) {
719 if (U_FAILURE(*status)) {
720 return dest;
721 }
722 dest.remove();
723
724 const UChar *str = s.getBuffer();
725 int32_t strLen = s.length();
726 UChar smallBuf[USPOOF_STACK_BUFFER_SIZE];
727 UChar *buf = smallBuf;
728 int32_t outputSize = uspoof_getSkeleton(sc, type, str, strLen, smallBuf, USPOOF_STACK_BUFFER_SIZE, status);
729 if (*status == U_BUFFER_OVERFLOW_ERROR) {
730 buf = static_cast<UChar *>(uprv_malloc((outputSize+1)*sizeof(UChar)));
731 if (buf == NULL) {
732 *status = U_MEMORY_ALLOCATION_ERROR;
733 return dest;
734 }
735 *status = U_ZERO_ERROR;
736 uspoof_getSkeleton(sc, type, str, strLen, buf, outputSize+1, status);
737 }
738 if (U_SUCCESS(*status)) {
739 dest.setTo(buf, outputSize);
740 }
741
742 if (buf != smallBuf) {
743 uprv_free(buf);
744 }
745 return dest;
746 }
747
748
749 U_CAPI int32_t U_EXPORT2
750 uspoof_getSkeletonUTF8(const USpoofChecker *sc,
751 uint32_t type,
752 const char *s, int32_t length,
753 char *dest, int32_t destCapacity,
754 UErrorCode *status) {
755 // Lacking a UTF-8 normalization API, just converting the input to
756 // UTF-16 seems as good an approach as any. In typical use, input will
757 // be an identifier, which is to say not too long for stack buffers.
758 if (U_FAILURE(*status)) {
759 return 0;
760 }
761 // Buffers for the UChar form of the input and skeleton strings.
762 UChar smallInBuf[USPOOF_STACK_BUFFER_SIZE];
763 UChar *inBuf = smallInBuf;
764 UChar smallOutBuf[USPOOF_STACK_BUFFER_SIZE];
765 UChar *outBuf = smallOutBuf;
766
767 int32_t lengthInUChars = 0;
768 int32_t skelLengthInUChars = 0;
769 int32_t skelLengthInUTF8 = 0;
770
771 u_strFromUTF8(inBuf, USPOOF_STACK_BUFFER_SIZE, &lengthInUChars,
772 s, length, status);
773 if (*status == U_BUFFER_OVERFLOW_ERROR) {
774 inBuf = static_cast<UChar *>(uprv_malloc((lengthInUChars+1)*sizeof(UChar)));
775 if (inBuf == NULL) {
776 *status = U_MEMORY_ALLOCATION_ERROR;
777 goto cleanup;
778 }
779 *status = U_ZERO_ERROR;
780 u_strFromUTF8(inBuf, lengthInUChars+1, &lengthInUChars,
781 s, length, status);
782 }
783
784 skelLengthInUChars = uspoof_getSkeleton(sc, type, inBuf, lengthInUChars,
785 outBuf, USPOOF_STACK_BUFFER_SIZE, status);
786 if (*status == U_BUFFER_OVERFLOW_ERROR) {
787 outBuf = static_cast<UChar *>(uprv_malloc((skelLengthInUChars+1)*sizeof(UChar)));
788 if (outBuf == NULL) {
789 *status = U_MEMORY_ALLOCATION_ERROR;
790 goto cleanup;
791 }
792 *status = U_ZERO_ERROR;
793 skelLengthInUChars = uspoof_getSkeleton(sc, type, inBuf, lengthInUChars,
794 outBuf, skelLengthInUChars+1, status);
795 }
796
797 u_strToUTF8(dest, destCapacity, &skelLengthInUTF8,
798 outBuf, skelLengthInUChars, status);
799
800 cleanup:
801 if (inBuf != smallInBuf) {
802 uprv_free(inBuf);
803 }
804 if (outBuf != smallOutBuf) {
805 uprv_free(outBuf);
806 }
807 return skelLengthInUTF8;
808 }
809
810
811 U_CAPI int32_t U_EXPORT2
812 uspoof_serialize(USpoofChecker *sc,void *buf, int32_t capacity, UErrorCode *status) {
813 SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
814 if (This == NULL) {
815 U_ASSERT(U_FAILURE(*status));
816 return 0;
817 }
818 int32_t dataSize = This->fSpoofData->fRawData->fLength;
819 if (capacity < dataSize) {
820 *status = U_BUFFER_OVERFLOW_ERROR;
821 return dataSize;
822 }
823 uprv_memcpy(buf, This->fSpoofData->fRawData, dataSize);
824 return dataSize;
825 }
826
827 #endif