]>
Commit | Line | Data |
---|---|---|
729e4ab9 A |
1 | /* |
2 | *************************************************************************** | |
3 | * Copyright (C) 2008-2011, International Business Machines Corporation | |
4 | * and others. All Rights Reserved. | |
5 | *************************************************************************** | |
6 | * file name: uspoof.cpp | |
7 | * encoding: US-ASCII | |
8 | * tab size: 8 (not used) | |
9 | * indentation:4 | |
10 | * | |
11 | * created on: 2008Feb13 | |
12 | * created by: Andy Heninger | |
13 | * | |
14 | * Unicode Spoof Detection | |
15 | */ | |
16 | #include "unicode/utypes.h" | |
17 | #include "unicode/uspoof.h" | |
18 | #include "unicode/unorm.h" | |
19 | #include "unicode/ustring.h" | |
4388f060 | 20 | #include "unicode/utf16.h" |
729e4ab9 A |
21 | #include "cmemory.h" |
22 | #include "uspoof_impl.h" | |
23 | #include "uassert.h" | |
24 | ||
25 | ||
26 | #if !UCONFIG_NO_NORMALIZATION | |
27 | ||
729e4ab9 A |
28 | U_NAMESPACE_USE |
29 | ||
30 | ||
31 | U_CAPI USpoofChecker * U_EXPORT2 | |
32 | uspoof_open(UErrorCode *status) { | |
33 | if (U_FAILURE(*status)) { | |
34 | return NULL; | |
35 | } | |
36 | SpoofImpl *si = new SpoofImpl(SpoofData::getDefault(*status), *status); | |
37 | if (U_FAILURE(*status)) { | |
38 | delete si; | |
39 | si = NULL; | |
40 | } | |
41 | return (USpoofChecker *)si; | |
42 | } | |
43 | ||
44 | ||
45 | U_CAPI USpoofChecker * U_EXPORT2 | |
46 | uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLength, | |
47 | UErrorCode *status) { | |
48 | if (U_FAILURE(*status)) { | |
49 | return NULL; | |
50 | } | |
51 | SpoofData *sd = new SpoofData(data, length, *status); | |
52 | SpoofImpl *si = new SpoofImpl(sd, *status); | |
53 | if (U_FAILURE(*status)) { | |
54 | delete sd; | |
55 | delete si; | |
56 | return NULL; | |
57 | } | |
58 | if (sd == NULL || si == NULL) { | |
59 | *status = U_MEMORY_ALLOCATION_ERROR; | |
60 | delete sd; | |
61 | delete si; | |
62 | return NULL; | |
63 | } | |
64 | ||
65 | if (pActualLength != NULL) { | |
66 | *pActualLength = sd->fRawData->fLength; | |
67 | } | |
68 | return reinterpret_cast<USpoofChecker *>(si); | |
69 | } | |
70 | ||
71 | ||
72 | U_CAPI USpoofChecker * U_EXPORT2 | |
73 | uspoof_clone(const USpoofChecker *sc, UErrorCode *status) { | |
74 | const SpoofImpl *src = SpoofImpl::validateThis(sc, *status); | |
75 | if (src == NULL) { | |
76 | return NULL; | |
77 | } | |
78 | SpoofImpl *result = new SpoofImpl(*src, *status); // copy constructor | |
79 | if (U_FAILURE(*status)) { | |
80 | delete result; | |
81 | result = NULL; | |
82 | } | |
83 | return (USpoofChecker *)result; | |
84 | } | |
85 | ||
86 | ||
87 | U_CAPI void U_EXPORT2 | |
88 | uspoof_close(USpoofChecker *sc) { | |
89 | UErrorCode status = U_ZERO_ERROR; | |
90 | SpoofImpl *This = SpoofImpl::validateThis(sc, status); | |
91 | delete This; | |
92 | } | |
93 | ||
94 | ||
95 | U_CAPI void U_EXPORT2 | |
96 | uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status) { | |
97 | SpoofImpl *This = SpoofImpl::validateThis(sc, *status); | |
98 | if (This == NULL) { | |
99 | return; | |
100 | } | |
101 | ||
102 | // Verify that the requested checks are all ones (bits) that | |
103 | // are acceptable, known values. | |
104 | if (checks & ~USPOOF_ALL_CHECKS) { | |
105 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
106 | return; | |
107 | } | |
108 | ||
109 | This->fChecks = checks; | |
110 | } | |
111 | ||
112 | ||
113 | U_CAPI int32_t U_EXPORT2 | |
114 | uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status) { | |
115 | const SpoofImpl *This = SpoofImpl::validateThis(sc, *status); | |
116 | if (This == NULL) { | |
117 | return 0; | |
118 | } | |
119 | return This->fChecks; | |
120 | } | |
121 | ||
122 | U_CAPI void U_EXPORT2 | |
123 | uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode *status) { | |
124 | SpoofImpl *This = SpoofImpl::validateThis(sc, *status); | |
125 | if (This == NULL) { | |
126 | return; | |
127 | } | |
128 | This->setAllowedLocales(localesList, *status); | |
129 | } | |
130 | ||
131 | U_CAPI const char * U_EXPORT2 | |
132 | uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status) { | |
133 | SpoofImpl *This = SpoofImpl::validateThis(sc, *status); | |
134 | if (This == NULL) { | |
135 | return NULL; | |
136 | } | |
137 | return This->getAllowedLocales(*status); | |
138 | } | |
139 | ||
140 | ||
141 | U_CAPI const USet * U_EXPORT2 | |
142 | uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status) { | |
143 | const UnicodeSet *result = uspoof_getAllowedUnicodeSet(sc, status); | |
144 | return reinterpret_cast<const USet *>(result); | |
145 | } | |
146 | ||
147 | U_CAPI const UnicodeSet * U_EXPORT2 | |
148 | uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status) { | |
149 | const SpoofImpl *This = SpoofImpl::validateThis(sc, *status); | |
150 | if (This == NULL) { | |
151 | return NULL; | |
152 | } | |
153 | return This->fAllowedCharsSet; | |
154 | } | |
155 | ||
156 | ||
157 | U_CAPI void U_EXPORT2 | |
158 | uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status) { | |
159 | const UnicodeSet *set = reinterpret_cast<const UnicodeSet *>(chars); | |
160 | uspoof_setAllowedUnicodeSet(sc, set, status); | |
161 | } | |
162 | ||
163 | ||
164 | U_CAPI void U_EXPORT2 | |
165 | uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const UnicodeSet *chars, UErrorCode *status) { | |
166 | SpoofImpl *This = SpoofImpl::validateThis(sc, *status); | |
167 | if (This == NULL) { | |
168 | return; | |
169 | } | |
170 | if (chars->isBogus()) { | |
171 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
172 | return; | |
173 | } | |
174 | UnicodeSet *clonedSet = static_cast<UnicodeSet *>(chars->clone()); | |
175 | if (clonedSet == NULL || clonedSet->isBogus()) { | |
176 | *status = U_MEMORY_ALLOCATION_ERROR; | |
177 | return; | |
178 | } | |
179 | clonedSet->freeze(); | |
180 | delete This->fAllowedCharsSet; | |
181 | This->fAllowedCharsSet = clonedSet; | |
182 | This->fChecks |= USPOOF_CHAR_LIMIT; | |
183 | } | |
184 | ||
185 | ||
186 | U_CAPI int32_t U_EXPORT2 | |
187 | uspoof_check(const USpoofChecker *sc, | |
188 | const UChar *text, int32_t length, | |
189 | int32_t *position, | |
190 | UErrorCode *status) { | |
191 | ||
192 | const SpoofImpl *This = SpoofImpl::validateThis(sc, *status); | |
193 | if (This == NULL) { | |
194 | return 0; | |
195 | } | |
196 | if (length < -1) { | |
197 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
198 | return 0; | |
199 | } | |
200 | if (length == -1) { | |
201 | // It's not worth the bother to handle nul terminated strings everywhere. | |
202 | // Just get the length and be done with it. | |
203 | length = u_strlen(text); | |
204 | } | |
205 | ||
206 | int32_t result = 0; | |
207 | int32_t failPos = 0x7fffffff; // TODO: do we have a #define for max int32? | |
208 | ||
209 | // A count of the number of non-Common or inherited scripts. | |
210 | // Needed for both the SINGLE_SCRIPT and the WHOLE/MIXED_SCIRPT_CONFUSABLE tests. | |
211 | // Share the computation when possible. scriptCount == -1 means that we haven't | |
212 | // done it yet. | |
213 | int32_t scriptCount = -1; | |
214 | ||
215 | if ((This->fChecks) & USPOOF_SINGLE_SCRIPT) { | |
216 | scriptCount = This->scriptScan(text, length, failPos, *status); | |
217 | // printf("scriptCount (clipped to 2) = %d\n", scriptCount); | |
218 | if ( scriptCount >= 2) { | |
219 | // Note: scriptCount == 2 covers all cases of the number of scripts >= 2 | |
220 | result |= USPOOF_SINGLE_SCRIPT; | |
221 | } | |
222 | } | |
223 | ||
224 | if (This->fChecks & USPOOF_CHAR_LIMIT) { | |
225 | int32_t i; | |
226 | UChar32 c; | |
227 | for (i=0; i<length ;) { | |
228 | U16_NEXT(text, i, length, c); | |
229 | if (!This->fAllowedCharsSet->contains(c)) { | |
230 | result |= USPOOF_CHAR_LIMIT; | |
231 | if (i < failPos) { | |
232 | failPos = i; | |
233 | } | |
234 | break; | |
235 | } | |
236 | } | |
237 | } | |
238 | ||
239 | if (This->fChecks & | |
240 | (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) { | |
241 | // These are the checks that need to be done on NFD input | |
242 | NFDBuffer normalizedInput(text, length, *status); | |
243 | const UChar *nfdText = normalizedInput.getBuffer(); | |
244 | int32_t nfdLength = normalizedInput.getLength(); | |
245 | ||
246 | if (This->fChecks & USPOOF_INVISIBLE) { | |
247 | ||
248 | // scan for more than one occurence of the same non-spacing mark | |
249 | // in a sequence of non-spacing marks. | |
250 | int32_t i; | |
251 | UChar32 c; | |
252 | UChar32 firstNonspacingMark = 0; | |
253 | UBool haveMultipleMarks = FALSE; | |
254 | UnicodeSet marksSeenSoFar; // Set of combining marks in a single combining sequence. | |
255 | ||
4388f060 | 256 | for (i=0; i<nfdLength ;) { |
729e4ab9 A |
257 | U16_NEXT(nfdText, i, nfdLength, c); |
258 | if (u_charType(c) != U_NON_SPACING_MARK) { | |
259 | firstNonspacingMark = 0; | |
260 | if (haveMultipleMarks) { | |
261 | marksSeenSoFar.clear(); | |
262 | haveMultipleMarks = FALSE; | |
263 | } | |
264 | continue; | |
265 | } | |
266 | if (firstNonspacingMark == 0) { | |
267 | firstNonspacingMark = c; | |
268 | continue; | |
269 | } | |
270 | if (!haveMultipleMarks) { | |
271 | marksSeenSoFar.add(firstNonspacingMark); | |
272 | haveMultipleMarks = TRUE; | |
273 | } | |
274 | if (marksSeenSoFar.contains(c)) { | |
275 | // report the error, and stop scanning. | |
276 | // No need to find more than the first failure. | |
277 | result |= USPOOF_INVISIBLE; | |
278 | failPos = i; | |
4388f060 A |
279 | // TODO: Bug 8655: failPos is the position in the NFD buffer, but what we want |
280 | // to give back to our caller is a position in the original input string. | |
281 | if (failPos > length) { | |
282 | failPos = length; | |
283 | } | |
729e4ab9 A |
284 | break; |
285 | } | |
286 | marksSeenSoFar.add(c); | |
287 | } | |
288 | } | |
289 | ||
290 | ||
291 | if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) { | |
292 | // The basic test is the same for both whole and mixed script confusables. | |
293 | // Compute the set of scripts that every input character has a confusable in. | |
294 | // For this computation an input character is always considered to be | |
295 | // confusable with itself in its own script. | |
296 | // If the number of such scripts is two or more, and the input consisted of | |
297 | // characters all from a single script, we have a whole script confusable. | |
298 | // (The two scripts will be the original script and the one that is confusable) | |
299 | // If the number of such scripts >= one, and the original input contained characters from | |
300 | // more than one script, we have a mixed script confusable. (We can transform | |
301 | // some of the characters, and end up with a visually similar string all in | |
302 | // one script.) | |
303 | ||
304 | if (scriptCount == -1) { | |
305 | int32_t t; | |
306 | scriptCount = This->scriptScan(text, length, t, *status); | |
307 | } | |
308 | ||
309 | ScriptSet scripts; | |
310 | This->wholeScriptCheck(nfdText, nfdLength, &scripts, *status); | |
311 | int32_t confusableScriptCount = scripts.countMembers(); | |
312 | //printf("confusableScriptCount = %d\n", confusableScriptCount); | |
313 | ||
314 | if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) && | |
315 | confusableScriptCount >= 2 && | |
316 | scriptCount == 1) { | |
317 | result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE; | |
318 | } | |
319 | ||
320 | if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) && | |
321 | confusableScriptCount >= 1 && | |
322 | scriptCount > 1) { | |
323 | result |= USPOOF_MIXED_SCRIPT_CONFUSABLE; | |
324 | } | |
325 | } | |
326 | } | |
327 | if (position != NULL && failPos != 0x7fffffff) { | |
328 | *position = failPos; | |
329 | } | |
330 | return result; | |
331 | } | |
332 | ||
333 | ||
334 | U_CAPI int32_t U_EXPORT2 | |
335 | uspoof_checkUTF8(const USpoofChecker *sc, | |
336 | const char *text, int32_t length, | |
337 | int32_t *position, | |
338 | UErrorCode *status) { | |
339 | ||
340 | if (U_FAILURE(*status)) { | |
341 | return 0; | |
342 | } | |
343 | UChar stackBuf[USPOOF_STACK_BUFFER_SIZE]; | |
344 | UChar* text16 = stackBuf; | |
345 | int32_t len16; | |
346 | ||
347 | u_strFromUTF8(text16, USPOOF_STACK_BUFFER_SIZE, &len16, text, length, status); | |
348 | if (U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) { | |
349 | return 0; | |
350 | } | |
351 | if (*status == U_BUFFER_OVERFLOW_ERROR) { | |
352 | text16 = static_cast<UChar *>(uprv_malloc(len16 * sizeof(UChar) + 2)); | |
353 | if (text16 == NULL) { | |
354 | *status = U_MEMORY_ALLOCATION_ERROR; | |
355 | return 0; | |
356 | } | |
357 | *status = U_ZERO_ERROR; | |
358 | u_strFromUTF8(text16, len16+1, NULL, text, length, status); | |
359 | } | |
360 | ||
361 | int32_t position16 = -1; | |
362 | int32_t result = uspoof_check(sc, text16, len16, &position16, status); | |
363 | if (U_FAILURE(*status)) { | |
364 | return 0; | |
365 | } | |
366 | ||
367 | if (position16 > 0) { | |
368 | // Translate a UTF-16 based error position back to a UTF-8 offset. | |
369 | // u_strToUTF8() in preflight mode is an easy way to do it. | |
370 | U_ASSERT(position16 <= len16); | |
371 | u_strToUTF8(NULL, 0, position, text16, position16, status); | |
372 | if (position > 0) { | |
373 | // position is the required buffer length from u_strToUTF8, which includes | |
374 | // space for a terminating NULL, which we don't want, hence the -1. | |
375 | *position -= 1; | |
376 | } | |
377 | *status = U_ZERO_ERROR; // u_strToUTF8, above sets BUFFER_OVERFLOW_ERROR. | |
378 | } | |
379 | ||
380 | if (text16 != stackBuf) { | |
381 | uprv_free(text16); | |
382 | } | |
383 | return result; | |
384 | ||
385 | } | |
386 | ||
387 | /* A convenience wrapper around the public uspoof_getSkeleton that handles | |
388 | * allocating a larger buffer than provided if the original is too small. | |
389 | */ | |
390 | static UChar *getSkeleton(const USpoofChecker *sc, uint32_t type, const UChar *s, int32_t inputLength, | |
391 | UChar *dest, int32_t destCapacity, int32_t *outputLength, UErrorCode *status) { | |
392 | int32_t requiredCapacity = 0; | |
393 | UChar *buf = dest; | |
394 | ||
395 | if (U_FAILURE(*status)) { | |
396 | return NULL; | |
397 | } | |
398 | requiredCapacity = uspoof_getSkeleton(sc, type, s, inputLength, dest, destCapacity, status); | |
399 | if (*status == U_BUFFER_OVERFLOW_ERROR) { | |
400 | buf = static_cast<UChar *>(uprv_malloc(requiredCapacity * sizeof(UChar))); | |
401 | if (buf == NULL) { | |
402 | *status = U_MEMORY_ALLOCATION_ERROR; | |
403 | return NULL; | |
404 | } | |
405 | *status = U_ZERO_ERROR; | |
406 | uspoof_getSkeleton(sc, type, s, inputLength, buf, requiredCapacity, status); | |
407 | } | |
408 | *outputLength = requiredCapacity; | |
409 | return buf; | |
410 | } | |
411 | ||
412 | ||
413 | U_CAPI int32_t U_EXPORT2 | |
414 | uspoof_areConfusable(const USpoofChecker *sc, | |
415 | const UChar *s1, int32_t length1, | |
416 | const UChar *s2, int32_t length2, | |
417 | UErrorCode *status) { | |
418 | const SpoofImpl *This = SpoofImpl::validateThis(sc, *status); | |
419 | if (U_FAILURE(*status)) { | |
420 | return 0; | |
421 | } | |
422 | // | |
423 | // See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable, | |
424 | // and for definitions of the types (single, whole, mixed-script) of confusables. | |
425 | ||
426 | // We only care about a few of the check flags. Ignore the others. | |
427 | // If no tests relavant to this function have been specified, return an error. | |
428 | // TODO: is this really the right thing to do? It's probably an error on the caller's part, | |
429 | // but logically we would just return 0 (no error). | |
430 | if ((This->fChecks & (USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | | |
431 | USPOOF_WHOLE_SCRIPT_CONFUSABLE)) == 0) { | |
432 | *status = U_INVALID_STATE_ERROR; | |
433 | return 0; | |
434 | } | |
435 | int32_t flagsForSkeleton = This->fChecks & USPOOF_ANY_CASE; | |
436 | UChar s1SkeletonBuf[USPOOF_STACK_BUFFER_SIZE]; | |
437 | UChar *s1Skeleton; | |
438 | int32_t s1SkeletonLength = 0; | |
439 | ||
440 | UChar s2SkeletonBuf[USPOOF_STACK_BUFFER_SIZE]; | |
441 | UChar *s2Skeleton; | |
442 | int32_t s2SkeletonLength = 0; | |
443 | ||
444 | int32_t result = 0; | |
445 | int32_t t; | |
446 | int32_t s1ScriptCount = This->scriptScan(s1, length1, t, *status); | |
447 | int32_t s2ScriptCount = This->scriptScan(s2, length2, t, *status); | |
448 | ||
449 | if (This->fChecks & USPOOF_SINGLE_SCRIPT_CONFUSABLE) { | |
450 | // Do the Single Script compare. | |
451 | if (s1ScriptCount <= 1 && s2ScriptCount <= 1) { | |
452 | flagsForSkeleton |= USPOOF_SINGLE_SCRIPT_CONFUSABLE; | |
453 | s1Skeleton = getSkeleton(sc, flagsForSkeleton, s1, length1, s1SkeletonBuf, | |
454 | sizeof(s1SkeletonBuf)/sizeof(UChar), &s1SkeletonLength, status); | |
455 | s2Skeleton = getSkeleton(sc, flagsForSkeleton, s2, length2, s2SkeletonBuf, | |
456 | sizeof(s2SkeletonBuf)/sizeof(UChar), &s2SkeletonLength, status); | |
457 | if (s1SkeletonLength == s2SkeletonLength && u_strncmp(s1Skeleton, s2Skeleton, s1SkeletonLength) == 0) { | |
458 | result |= USPOOF_SINGLE_SCRIPT_CONFUSABLE; | |
459 | } | |
460 | if (s1Skeleton != s1SkeletonBuf) { | |
461 | uprv_free(s1Skeleton); | |
462 | } | |
463 | if (s2Skeleton != s2SkeletonBuf) { | |
464 | uprv_free(s2Skeleton); | |
465 | } | |
466 | } | |
467 | } | |
468 | ||
469 | if (result & USPOOF_SINGLE_SCRIPT_CONFUSABLE) { | |
470 | // If the two inputs are single script confusable they cannot also be | |
471 | // mixed or whole script confusable, according to the UAX39 definitions. | |
472 | // So we can skip those tests. | |
473 | return result; | |
474 | } | |
475 | ||
476 | // Optimization for whole script confusables test: two identifiers are whole script confusable if | |
477 | // each is of a single script and they are mixed script confusable. | |
478 | UBool possiblyWholeScriptConfusables = | |
479 | s1ScriptCount <= 1 && s2ScriptCount <= 1 && (This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE); | |
480 | ||
481 | // | |
482 | // Mixed Script Check | |
483 | // | |
484 | if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) || possiblyWholeScriptConfusables ) { | |
485 | // For getSkeleton(), resetting the USPOOF_SINGLE_SCRIPT_CONFUSABLE flag will get us | |
486 | // the mixed script table skeleton, which is what we want. | |
487 | // The Any Case / Lower Case bit in the skelton flags was set at the top of the function. | |
488 | flagsForSkeleton &= ~USPOOF_SINGLE_SCRIPT_CONFUSABLE; | |
489 | s1Skeleton = getSkeleton(sc, flagsForSkeleton, s1, length1, s1SkeletonBuf, | |
490 | sizeof(s1SkeletonBuf)/sizeof(UChar), &s1SkeletonLength, status); | |
491 | s2Skeleton = getSkeleton(sc, flagsForSkeleton, s2, length2, s2SkeletonBuf, | |
492 | sizeof(s2SkeletonBuf)/sizeof(UChar), &s2SkeletonLength, status); | |
493 | if (s1SkeletonLength == s2SkeletonLength && u_strncmp(s1Skeleton, s2Skeleton, s1SkeletonLength) == 0) { | |
494 | result |= USPOOF_MIXED_SCRIPT_CONFUSABLE; | |
495 | if (possiblyWholeScriptConfusables) { | |
496 | result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE; | |
497 | } | |
498 | } | |
499 | if (s1Skeleton != s1SkeletonBuf) { | |
500 | uprv_free(s1Skeleton); | |
501 | } | |
502 | if (s2Skeleton != s2SkeletonBuf) { | |
503 | uprv_free(s2Skeleton); | |
504 | } | |
505 | } | |
506 | ||
507 | return result; | |
508 | } | |
509 | ||
510 | ||
511 | // Convenience function for converting a UTF-8 input to a UChar * string, including | |
512 | // reallocating a buffer when required. Parameters and their interpretation mostly | |
513 | // match u_strFromUTF8. | |
514 | ||
515 | static UChar * convertFromUTF8(UChar *outBuf, int32_t outBufCapacity, int32_t *outputLength, | |
516 | const char *in, int32_t inLength, UErrorCode *status) { | |
517 | if (U_FAILURE(*status)) { | |
518 | return NULL; | |
519 | } | |
520 | UChar *dest = outBuf; | |
521 | u_strFromUTF8(dest, outBufCapacity, outputLength, in, inLength, status); | |
522 | if (*status == U_BUFFER_OVERFLOW_ERROR) { | |
523 | dest = static_cast<UChar *>(uprv_malloc(*outputLength * sizeof(UChar))); | |
524 | if (dest == NULL) { | |
525 | *status = U_MEMORY_ALLOCATION_ERROR; | |
526 | return NULL; | |
527 | } | |
528 | *status = U_ZERO_ERROR; | |
529 | u_strFromUTF8(dest, *outputLength, NULL, in, inLength, status); | |
530 | } | |
531 | return dest; | |
532 | } | |
533 | ||
534 | ||
535 | ||
536 | U_CAPI int32_t U_EXPORT2 | |
537 | uspoof_areConfusableUTF8(const USpoofChecker *sc, | |
538 | const char *s1, int32_t length1, | |
539 | const char *s2, int32_t length2, | |
540 | UErrorCode *status) { | |
541 | ||
542 | SpoofImpl::validateThis(sc, *status); | |
543 | if (U_FAILURE(*status)) { | |
544 | return 0; | |
545 | } | |
546 | ||
547 | UChar s1Buf[USPOOF_STACK_BUFFER_SIZE]; | |
548 | int32_t lengthS1U; | |
549 | UChar *s1U = convertFromUTF8(s1Buf, USPOOF_STACK_BUFFER_SIZE, &lengthS1U, s1, length1, status); | |
550 | ||
551 | UChar s2Buf[USPOOF_STACK_BUFFER_SIZE]; | |
552 | int32_t lengthS2U; | |
553 | UChar *s2U = convertFromUTF8(s2Buf, USPOOF_STACK_BUFFER_SIZE, &lengthS2U, s2, length2, status); | |
554 | ||
555 | int32_t results = uspoof_areConfusable(sc, s1U, lengthS1U, s2U, lengthS2U, status); | |
556 | ||
557 | if (s1U != s1Buf) { | |
558 | uprv_free(s1U); | |
559 | } | |
560 | if (s2U != s2Buf) { | |
561 | uprv_free(s2U); | |
562 | } | |
563 | return results; | |
564 | } | |
565 | ||
566 | ||
567 | U_CAPI int32_t U_EXPORT2 | |
568 | uspoof_areConfusableUnicodeString(const USpoofChecker *sc, | |
4388f060 A |
569 | const icu::UnicodeString &s1, |
570 | const icu::UnicodeString &s2, | |
729e4ab9 A |
571 | UErrorCode *status) { |
572 | ||
573 | const UChar *u1 = s1.getBuffer(); | |
574 | int32_t length1 = s1.length(); | |
575 | const UChar *u2 = s2.getBuffer(); | |
576 | int32_t length2 = s2.length(); | |
577 | ||
578 | int32_t results = uspoof_areConfusable(sc, u1, length1, u2, length2, status); | |
579 | return results; | |
580 | } | |
581 | ||
582 | ||
583 | ||
584 | ||
585 | U_CAPI int32_t U_EXPORT2 | |
586 | uspoof_checkUnicodeString(const USpoofChecker *sc, | |
4388f060 | 587 | const icu::UnicodeString &text, |
729e4ab9 A |
588 | int32_t *position, |
589 | UErrorCode *status) { | |
590 | int32_t result = uspoof_check(sc, text.getBuffer(), text.length(), position, status); | |
591 | return result; | |
592 | } | |
593 | ||
594 | ||
595 | U_CAPI int32_t U_EXPORT2 | |
596 | uspoof_getSkeleton(const USpoofChecker *sc, | |
597 | uint32_t type, | |
598 | const UChar *s, int32_t length, | |
599 | UChar *dest, int32_t destCapacity, | |
600 | UErrorCode *status) { | |
601 | ||
602 | // TODO: this function could be sped up a bit | |
603 | // Skip the input normalization when not needed, work from callers data. | |
604 | // Put the initial skeleton straight into the caller's destination buffer. | |
605 | // It probably won't need normalization. | |
606 | // But these would make the structure more complicated. | |
607 | ||
608 | const SpoofImpl *This = SpoofImpl::validateThis(sc, *status); | |
609 | if (U_FAILURE(*status)) { | |
610 | return 0; | |
611 | } | |
612 | if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL) || | |
613 | (type & ~(USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE)) != 0) { | |
614 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
615 | return 0; | |
616 | } | |
617 | ||
618 | int32_t tableMask = 0; | |
619 | switch (type) { | |
620 | case 0: | |
621 | tableMask = USPOOF_ML_TABLE_FLAG; | |
622 | break; | |
623 | case USPOOF_SINGLE_SCRIPT_CONFUSABLE: | |
624 | tableMask = USPOOF_SL_TABLE_FLAG; | |
625 | break; | |
626 | case USPOOF_ANY_CASE: | |
627 | tableMask = USPOOF_MA_TABLE_FLAG; | |
628 | break; | |
629 | case USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE: | |
630 | tableMask = USPOOF_SA_TABLE_FLAG; | |
631 | break; | |
632 | default: | |
633 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
634 | return 0; | |
635 | } | |
636 | ||
637 | // NFD transform of the user supplied input | |
638 | ||
639 | UChar nfdStackBuf[USPOOF_STACK_BUFFER_SIZE]; | |
640 | UChar *nfdInput = nfdStackBuf; | |
641 | int32_t normalizedLen = unorm_normalize( | |
642 | s, length, UNORM_NFD, 0, nfdInput, USPOOF_STACK_BUFFER_SIZE, status); | |
643 | if (*status == U_BUFFER_OVERFLOW_ERROR) { | |
644 | nfdInput = (UChar *)uprv_malloc((normalizedLen+1)*sizeof(UChar)); | |
645 | if (nfdInput == NULL) { | |
646 | *status = U_MEMORY_ALLOCATION_ERROR; | |
647 | return 0; | |
648 | } | |
649 | *status = U_ZERO_ERROR; | |
650 | normalizedLen = unorm_normalize(s, length, UNORM_NFD, 0, | |
651 | nfdInput, normalizedLen+1, status); | |
652 | } | |
653 | if (U_FAILURE(*status)) { | |
654 | if (nfdInput != nfdStackBuf) { | |
655 | uprv_free(nfdInput); | |
656 | } | |
657 | return 0; | |
658 | } | |
659 | ||
660 | // buffer to hold the Unicode defined skeleton mappings for a single code point | |
661 | UChar buf[USPOOF_MAX_SKELETON_EXPANSION]; | |
662 | ||
663 | // Apply the skeleton mapping to the NFD normalized input string | |
664 | // Accumulate the skeleton, possibly unnormalized, in a UnicodeString. | |
665 | int32_t inputIndex = 0; | |
666 | UnicodeString skelStr; | |
667 | while (inputIndex < normalizedLen) { | |
668 | UChar32 c; | |
669 | U16_NEXT(nfdInput, inputIndex, normalizedLen, c); | |
670 | int32_t replaceLen = This->confusableLookup(c, tableMask, buf); | |
671 | skelStr.append(buf, replaceLen); | |
672 | } | |
673 | ||
674 | if (nfdInput != nfdStackBuf) { | |
675 | uprv_free(nfdInput); | |
676 | } | |
677 | ||
678 | const UChar *result = skelStr.getBuffer(); | |
679 | int32_t resultLen = skelStr.length(); | |
680 | UChar *normedResult = NULL; | |
681 | ||
682 | // Check the skeleton for NFD, normalize it if needed. | |
683 | // Unnormalized results should be very rare. | |
684 | if (!unorm_isNormalized(result, resultLen, UNORM_NFD, status)) { | |
685 | normalizedLen = unorm_normalize(result, resultLen, UNORM_NFD, 0, NULL, 0, status); | |
686 | normedResult = static_cast<UChar *>(uprv_malloc((normalizedLen+1)*sizeof(UChar))); | |
687 | if (normedResult == NULL) { | |
688 | *status = U_MEMORY_ALLOCATION_ERROR; | |
689 | return 0; | |
690 | } | |
691 | *status = U_ZERO_ERROR; | |
692 | unorm_normalize(result, resultLen, UNORM_NFD, 0, normedResult, normalizedLen+1, status); | |
693 | result = normedResult; | |
694 | resultLen = normalizedLen; | |
695 | } | |
696 | ||
697 | // Copy the skeleton to the caller's buffer | |
698 | if (U_SUCCESS(*status)) { | |
699 | if (destCapacity == 0 || resultLen > destCapacity) { | |
700 | *status = resultLen>destCapacity ? U_BUFFER_OVERFLOW_ERROR : U_STRING_NOT_TERMINATED_WARNING; | |
701 | } else { | |
702 | u_memcpy(dest, result, resultLen); | |
703 | if (destCapacity > resultLen) { | |
704 | dest[resultLen] = 0; | |
705 | } else { | |
706 | *status = U_STRING_NOT_TERMINATED_WARNING; | |
707 | } | |
708 | } | |
709 | } | |
710 | uprv_free(normedResult); | |
711 | return resultLen; | |
712 | } | |
713 | ||
714 | ||
715 | ||
716 | U_CAPI UnicodeString & U_EXPORT2 | |
717 | uspoof_getSkeletonUnicodeString(const USpoofChecker *sc, | |
718 | uint32_t type, | |
719 | const UnicodeString &s, | |
720 | UnicodeString &dest, | |
721 | UErrorCode *status) { | |
722 | if (U_FAILURE(*status)) { | |
723 | return dest; | |
724 | } | |
725 | dest.remove(); | |
726 | ||
727 | const UChar *str = s.getBuffer(); | |
728 | int32_t strLen = s.length(); | |
729 | UChar smallBuf[USPOOF_STACK_BUFFER_SIZE]; | |
730 | UChar *buf = smallBuf; | |
731 | int32_t outputSize = uspoof_getSkeleton(sc, type, str, strLen, smallBuf, USPOOF_STACK_BUFFER_SIZE, status); | |
732 | if (*status == U_BUFFER_OVERFLOW_ERROR) { | |
733 | buf = static_cast<UChar *>(uprv_malloc((outputSize+1)*sizeof(UChar))); | |
734 | if (buf == NULL) { | |
735 | *status = U_MEMORY_ALLOCATION_ERROR; | |
736 | return dest; | |
737 | } | |
738 | *status = U_ZERO_ERROR; | |
739 | uspoof_getSkeleton(sc, type, str, strLen, buf, outputSize+1, status); | |
740 | } | |
741 | if (U_SUCCESS(*status)) { | |
742 | dest.setTo(buf, outputSize); | |
743 | } | |
744 | ||
745 | if (buf != smallBuf) { | |
746 | uprv_free(buf); | |
747 | } | |
748 | return dest; | |
749 | } | |
750 | ||
751 | ||
752 | U_CAPI int32_t U_EXPORT2 | |
753 | uspoof_getSkeletonUTF8(const USpoofChecker *sc, | |
754 | uint32_t type, | |
755 | const char *s, int32_t length, | |
756 | char *dest, int32_t destCapacity, | |
757 | UErrorCode *status) { | |
758 | // Lacking a UTF-8 normalization API, just converting the input to | |
759 | // UTF-16 seems as good an approach as any. In typical use, input will | |
760 | // be an identifier, which is to say not too long for stack buffers. | |
761 | if (U_FAILURE(*status)) { | |
762 | return 0; | |
763 | } | |
764 | // Buffers for the UChar form of the input and skeleton strings. | |
765 | UChar smallInBuf[USPOOF_STACK_BUFFER_SIZE]; | |
766 | UChar *inBuf = smallInBuf; | |
767 | UChar smallOutBuf[USPOOF_STACK_BUFFER_SIZE]; | |
768 | UChar *outBuf = smallOutBuf; | |
769 | ||
770 | int32_t lengthInUChars = 0; | |
771 | int32_t skelLengthInUChars = 0; | |
772 | int32_t skelLengthInUTF8 = 0; | |
773 | ||
774 | u_strFromUTF8(inBuf, USPOOF_STACK_BUFFER_SIZE, &lengthInUChars, | |
775 | s, length, status); | |
776 | if (*status == U_BUFFER_OVERFLOW_ERROR) { | |
777 | inBuf = static_cast<UChar *>(uprv_malloc((lengthInUChars+1)*sizeof(UChar))); | |
778 | if (inBuf == NULL) { | |
779 | *status = U_MEMORY_ALLOCATION_ERROR; | |
780 | goto cleanup; | |
781 | } | |
782 | *status = U_ZERO_ERROR; | |
783 | u_strFromUTF8(inBuf, lengthInUChars+1, &lengthInUChars, | |
784 | s, length, status); | |
785 | } | |
786 | ||
787 | skelLengthInUChars = uspoof_getSkeleton(sc, type, inBuf, lengthInUChars, | |
788 | outBuf, USPOOF_STACK_BUFFER_SIZE, status); | |
789 | if (*status == U_BUFFER_OVERFLOW_ERROR) { | |
790 | outBuf = static_cast<UChar *>(uprv_malloc((skelLengthInUChars+1)*sizeof(UChar))); | |
791 | if (outBuf == NULL) { | |
792 | *status = U_MEMORY_ALLOCATION_ERROR; | |
793 | goto cleanup; | |
794 | } | |
795 | *status = U_ZERO_ERROR; | |
796 | skelLengthInUChars = uspoof_getSkeleton(sc, type, inBuf, lengthInUChars, | |
797 | outBuf, skelLengthInUChars+1, status); | |
798 | } | |
799 | ||
800 | u_strToUTF8(dest, destCapacity, &skelLengthInUTF8, | |
801 | outBuf, skelLengthInUChars, status); | |
802 | ||
803 | cleanup: | |
804 | if (inBuf != smallInBuf) { | |
805 | uprv_free(inBuf); | |
806 | } | |
807 | if (outBuf != smallOutBuf) { | |
808 | uprv_free(outBuf); | |
809 | } | |
810 | return skelLengthInUTF8; | |
811 | } | |
812 | ||
813 | ||
814 | U_CAPI int32_t U_EXPORT2 | |
815 | uspoof_serialize(USpoofChecker *sc,void *buf, int32_t capacity, UErrorCode *status) { | |
816 | SpoofImpl *This = SpoofImpl::validateThis(sc, *status); | |
817 | if (This == NULL) { | |
818 | U_ASSERT(U_FAILURE(*status)); | |
819 | return 0; | |
820 | } | |
821 | int32_t dataSize = This->fSpoofData->fRawData->fLength; | |
822 | if (capacity < dataSize) { | |
823 | *status = U_BUFFER_OVERFLOW_ERROR; | |
824 | return dataSize; | |
825 | } | |
826 | uprv_memcpy(buf, This->fSpoofData->fRawData, dataSize); | |
827 | return dataSize; | |
828 | } | |
829 | ||
830 | #endif |