icuSources/i18n/uspoof.cpp

   1 /*
   2 ***************************************************************************
   3 * Copyright (C) 2008-2011, International Business Machines Corporation
   4 * and others. All Rights Reserved.
   5 ***************************************************************************
   6 *   file name:  uspoof.cpp
   7 *   encoding:   US-ASCII
   8 *   tab size:   8 (not used)
   9 *   indentation:4
  10 *
  11 *   created on: 2008Feb13
  12 *   created by: Andy Heninger
  13 *
  14 *   Unicode Spoof Detection
  15 */
  16 #include "unicode/utypes.h"
  17 #include "unicode/uspoof.h"
  18 #include "unicode/unorm.h"
  19 #include "unicode/ustring.h"
  20 #include "cmemory.h"
  21 #include "uspoof_impl.h"
  22 #include "uassert.h"
  23
  24
  25 #if !UCONFIG_NO_NORMALIZATION
  26
  27
  28 #include <stdio.h>      // debug
  29
  30 U_NAMESPACE_USE
  31
  32
  33 U_CAPI USpoofChecker * U_EXPORT2
  34 uspoof_open(UErrorCode *status) {
  35     if (U_FAILURE(*status)) {
  36         return NULL;
  37     }
  38     SpoofImpl *si = new SpoofImpl(SpoofData::getDefault(*status), *status);
  39     if (U_FAILURE(*status)) {
  40         delete si;
  41         si = NULL;
  42     }
  43     return (USpoofChecker *)si;
  44 }
  45
  46
  47 U_CAPI USpoofChecker * U_EXPORT2
  48 uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLength,
  49                           UErrorCode *status) {
  50     if (U_FAILURE(*status)) {
  51         return NULL;
  52     }
  53     SpoofData *sd = new SpoofData(data, length, *status);
  54     SpoofImpl *si = new SpoofImpl(sd, *status);
  55     if (U_FAILURE(*status)) {
  56         delete sd;
  57         delete si;
  58         return NULL;
  59     }
  60     if (sd == NULL || si == NULL) {
  61         *status = U_MEMORY_ALLOCATION_ERROR;
  62         delete sd;
  63         delete si;
  64         return NULL;
  65     }
  66
  67     if (pActualLength != NULL) {
  68         *pActualLength = sd->fRawData->fLength;
  69     }
  70     return reinterpret_cast<USpoofChecker *>(si);
  71 }
  72
  73
  74 U_CAPI USpoofChecker * U_EXPORT2
  75 uspoof_clone(const USpoofChecker *sc, UErrorCode *status) {
  76     const SpoofImpl *src = SpoofImpl::validateThis(sc, *status);
  77     if (src == NULL) {
  78         return NULL;
  79     }
  80     SpoofImpl *result = new SpoofImpl(*src, *status);   // copy constructor
  81     if (U_FAILURE(*status)) {
  82         delete result;
  83         result = NULL;
  84     }
  85     return (USpoofChecker *)result;
  86 }
  87
  88
  89 U_CAPI void U_EXPORT2
  90 uspoof_close(USpoofChecker *sc) {
  91     UErrorCode status = U_ZERO_ERROR;
  92     SpoofImpl *This = SpoofImpl::validateThis(sc, status);
  93     delete This;
  94 }
  95
  96
  97 U_CAPI void U_EXPORT2
  98 uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status) {
  99     SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
 100     if (This == NULL) {
 101         return;
 102     }
 103
 104     // Verify that the requested checks are all ones (bits) that
 105     //   are acceptable, known values.
 106     if (checks & ~USPOOF_ALL_CHECKS) {
 107         *status = U_ILLEGAL_ARGUMENT_ERROR;
 108         return;
 109     }
 110
 111     This->fChecks = checks;
 112 }
 113
 114
 115 U_CAPI int32_t U_EXPORT2
 116 uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status) {
 117     const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
 118     if (This == NULL) {
 119         return 0;
 120     }
 121     return This->fChecks;
 122 }
 123
 124 U_CAPI void U_EXPORT2
 125 uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode *status) {
 126     SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
 127     if (This == NULL) {
 128         return;
 129     }
 130     This->setAllowedLocales(localesList, *status);
 131 }
 132
 133 U_CAPI const char * U_EXPORT2
 134 uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status) {
 135     SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
 136     if (This == NULL) {
 137         return NULL;
 138     }
 139     return This->getAllowedLocales(*status);
 140 }
 141
 142
 143 U_CAPI const USet * U_EXPORT2
 144 uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status) {
 145     const UnicodeSet *result = uspoof_getAllowedUnicodeSet(sc, status);
 146     return reinterpret_cast<const USet *>(result);
 147 }
 148
 149 U_CAPI const UnicodeSet * U_EXPORT2
 150 uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status) {
 151     const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
 152     if (This == NULL) {
 153         return NULL;
 154     }
 155     return This->fAllowedCharsSet;
 156 }
 157
 158
 159 U_CAPI void U_EXPORT2
 160 uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status) {
 161     const UnicodeSet *set = reinterpret_cast<const UnicodeSet *>(chars);
 162     uspoof_setAllowedUnicodeSet(sc, set, status);
 163 }
 164
 165
 166 U_CAPI void U_EXPORT2
 167 uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const UnicodeSet *chars, UErrorCode *status) {
 168     SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
 169     if (This == NULL) {
 170         return;
 171     }
 172     if (chars->isBogus()) {
 173         *status = U_ILLEGAL_ARGUMENT_ERROR;
 174         return;
 175     }
 176     UnicodeSet *clonedSet = static_cast<UnicodeSet *>(chars->clone());
 177     if (clonedSet == NULL || clonedSet->isBogus()) {
 178         *status = U_MEMORY_ALLOCATION_ERROR;
 179         return;
 180     }
 181     clonedSet->freeze();
 182     delete This->fAllowedCharsSet;
 183     This->fAllowedCharsSet = clonedSet;
 184     This->fChecks |= USPOOF_CHAR_LIMIT;
 185 }
 186
 187
 188 U_CAPI int32_t U_EXPORT2
 189 uspoof_check(const USpoofChecker *sc,
 190              const UChar *text, int32_t length,
 191              int32_t *position,
 192              UErrorCode *status) {
 193
 194     const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
 195     if (This == NULL) {
 196         return 0;
 197     }
 198     if (length < -1) {
 199         *status = U_ILLEGAL_ARGUMENT_ERROR;
 200         return 0;
 201     }
 202     if (length == -1) {
 203         // It's not worth the bother to handle nul terminated strings everywhere.
 204         //   Just get the length and be done with it.
 205         length = u_strlen(text);
 206     }
 207
 208     int32_t result = 0;
 209     int32_t failPos = 0x7fffffff;   // TODO: do we have a #define for max int32?
 210
 211     // A count of the number of non-Common or inherited scripts.
 212     // Needed for both the SINGLE_SCRIPT and the WHOLE/MIXED_SCIRPT_CONFUSABLE tests.
 213     // Share the computation when possible.  scriptCount == -1 means that we haven't
 214     // done it yet.
 215     int32_t scriptCount = -1;
 216
 217     if ((This->fChecks) & USPOOF_SINGLE_SCRIPT) {
 218         scriptCount = This->scriptScan(text, length, failPos, *status);
 219         // printf("scriptCount (clipped to 2) = %d\n", scriptCount);
 220         if ( scriptCount >= 2) {
 221             // Note: scriptCount == 2 covers all cases of the number of scripts >= 2
 222             result |= USPOOF_SINGLE_SCRIPT;
 223         }
 224     }
 225
 226     if (This->fChecks & USPOOF_CHAR_LIMIT) {
 227         int32_t i;
 228         UChar32 c;
 229         for (i=0; i<length ;) {
 230             U16_NEXT(text, i, length, c);
 231             if (!This->fAllowedCharsSet->contains(c)) {
 232                 result |= USPOOF_CHAR_LIMIT;
 233                 if (i < failPos) {
 234                     failPos = i;
 235                 }
 236                 break;
 237             }
 238         }
 239     }
 240
 241     if (This->fChecks &
 242         (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) {
 243         // These are the checks that need to be done on NFD input
 244         NFDBuffer   normalizedInput(text, length, *status);
 245         const UChar  *nfdText = normalizedInput.getBuffer();
 246         int32_t      nfdLength = normalizedInput.getLength();
 247
 248         if (This->fChecks & USPOOF_INVISIBLE) {
 249
 250             // scan for more than one occurence of the same non-spacing mark
 251             // in a sequence of non-spacing marks.
 252             int32_t     i;
 253             UChar32     c;
 254             UChar32     firstNonspacingMark = 0;
 255             UBool       haveMultipleMarks = FALSE;
 256             UnicodeSet  marksSeenSoFar;   // Set of combining marks in a single combining sequence.
 257
 258             for (i=0; i<length ;) {
 259                 U16_NEXT(nfdText, i, nfdLength, c);
 260                 if (u_charType(c) != U_NON_SPACING_MARK) {
 261                     firstNonspacingMark = 0;
 262                     if (haveMultipleMarks) {
 263                         marksSeenSoFar.clear();
 264                         haveMultipleMarks = FALSE;
 265                     }
 266                     continue;
 267                 }
 268                 if (firstNonspacingMark == 0) {
 269                     firstNonspacingMark = c;
 270                     continue;
 271                 }
 272                 if (!haveMultipleMarks) {
 273                     marksSeenSoFar.add(firstNonspacingMark);
 274                     haveMultipleMarks = TRUE;
 275                 }
 276                 if (marksSeenSoFar.contains(c)) {
 277                     // report the error, and stop scanning.
 278                     // No need to find more than the first failure.
 279                     result |= USPOOF_INVISIBLE;
 280                     failPos = i;
 281                     break;
 282                 }
 283                 marksSeenSoFar.add(c);
 284             }
 285         }
 286
 287
 288         if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) {
 289             // The basic test is the same for both whole and mixed script confusables.
 290             // Compute the set of scripts that every input character has a confusable in.
 291             // For this computation an input character is always considered to be
 292             //    confusable with itself in its own script.
 293             // If the number of such scripts is two or more, and the input consisted of
 294             //   characters all from a single script, we have a whole script confusable.
 295             //   (The two scripts will be the original script and the one that is confusable)
 296             // If the number of such scripts >= one, and the original input contained characters from
 297             //   more than one script, we have a mixed script confusable.  (We can transform
 298             //   some of the characters, and end up with a visually similar string all in
 299             //   one script.)
 300
 301             if (scriptCount == -1) {
 302                 int32_t t;
 303                 scriptCount = This->scriptScan(text, length, t, *status);
 304             }
 305
 306             ScriptSet scripts;
 307             This->wholeScriptCheck(nfdText, nfdLength, &scripts, *status);
 308             int32_t confusableScriptCount = scripts.countMembers();
 309             //printf("confusableScriptCount = %d\n", confusableScriptCount);
 310
 311             if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&
 312                 confusableScriptCount >= 2 &&
 313                 scriptCount == 1) {
 314                 result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
 315             }
 316
 317             if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&
 318                 confusableScriptCount >= 1 &&
 319                 scriptCount > 1) {
 320                 result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
 321             }
 322         }
 323     }
 324     if (position != NULL && failPos != 0x7fffffff) {
 325         *position = failPos;
 326     }
 327     return result;
 328 }
 329
 330
 331 U_CAPI int32_t U_EXPORT2
 332 uspoof_checkUTF8(const USpoofChecker *sc,
 333                  const char *text, int32_t length,
 334                  int32_t *position,
 335                  UErrorCode *status) {
 336
 337     if (U_FAILURE(*status)) {
 338         return 0;
 339     }
 340     UChar stackBuf[USPOOF_STACK_BUFFER_SIZE];
 341     UChar* text16 = stackBuf;
 342     int32_t len16;
 343
 344     u_strFromUTF8(text16, USPOOF_STACK_BUFFER_SIZE, &len16, text, length, status);
 345     if (U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) {
 346         return 0;
 347     }
 348     if (*status == U_BUFFER_OVERFLOW_ERROR) {
 349         text16 = static_cast<UChar *>(uprv_malloc(len16 * sizeof(UChar) + 2));
 350         if (text16 == NULL) {
 351             *status = U_MEMORY_ALLOCATION_ERROR;
 352             return 0;
 353         }
 354         *status = U_ZERO_ERROR;
 355         u_strFromUTF8(text16, len16+1, NULL, text, length, status);
 356     }
 357
 358     int32_t position16 = -1;
 359     int32_t result = uspoof_check(sc, text16, len16, &position16, status);
 360     if (U_FAILURE(*status)) {
 361         return 0;
 362     }
 363
 364     if (position16 > 0) {
 365         // Translate a UTF-16 based error position back to a UTF-8 offset.
 366         // u_strToUTF8() in preflight mode is an easy way to do it.
 367         U_ASSERT(position16 <= len16);
 368         u_strToUTF8(NULL, 0, position, text16, position16, status);
 369         if (position > 0) {
 370             // position is the required buffer length from u_strToUTF8, which includes
 371             // space for a terminating NULL, which we don't want, hence the -1.
 372             *position -= 1;
 373         }
 374         *status = U_ZERO_ERROR;   // u_strToUTF8, above sets BUFFER_OVERFLOW_ERROR.
 375     }
 376
 377     if (text16 != stackBuf) {
 378         uprv_free(text16);
 379     }
 380     return result;
 381
 382 }
 383
 384 /*  A convenience wrapper around the public uspoof_getSkeleton that handles
 385  *  allocating a larger buffer than provided if the original is too small.
 386  */
 387 static UChar *getSkeleton(const USpoofChecker *sc, uint32_t type, const UChar *s, int32_t inputLength,
 388                          UChar *dest, int32_t destCapacity, int32_t *outputLength, UErrorCode *status) {
 389     int32_t requiredCapacity = 0;
 390     UChar *buf = dest;
 391
 392     if (U_FAILURE(*status)) {
 393         return NULL;
 394     }
 395     requiredCapacity = uspoof_getSkeleton(sc, type, s, inputLength, dest, destCapacity, status);
 396     if (*status == U_BUFFER_OVERFLOW_ERROR) {
 397         buf = static_cast<UChar *>(uprv_malloc(requiredCapacity * sizeof(UChar)));
 398         if (buf == NULL) {
 399             *status = U_MEMORY_ALLOCATION_ERROR;
 400             return NULL;
 401         }
 402         *status = U_ZERO_ERROR;
 403         uspoof_getSkeleton(sc, type, s, inputLength, buf, requiredCapacity, status);
 404     }
 405     *outputLength = requiredCapacity;
 406     return buf;
 407 }
 408
 409
 410 U_CAPI int32_t U_EXPORT2
 411 uspoof_areConfusable(const USpoofChecker *sc,
 412                      const UChar *s1, int32_t length1,
 413                      const UChar *s2, int32_t length2,
 414                      UErrorCode *status) {
 415     const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
 416     if (U_FAILURE(*status)) {
 417         return 0;
 418     }
 419     //
 420     // See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable,
 421     //   and for definitions of the types (single, whole, mixed-script) of confusables.
 422
 423     // We only care about a few of the check flags.  Ignore the others.
 424     // If no tests relavant to this function have been specified, return an error.
 425     // TODO:  is this really the right thing to do?  It's probably an error on the caller's part,
 426     //        but logically we would just return 0 (no error).
 427     if ((This->fChecks & (USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE |
 428                           USPOOF_WHOLE_SCRIPT_CONFUSABLE)) == 0) {
 429         *status = U_INVALID_STATE_ERROR;
 430         return 0;
 431     }
 432     int32_t  flagsForSkeleton = This->fChecks & USPOOF_ANY_CASE;
 433     UChar    s1SkeletonBuf[USPOOF_STACK_BUFFER_SIZE];
 434     UChar   *s1Skeleton;
 435     int32_t  s1SkeletonLength = 0;
 436
 437     UChar    s2SkeletonBuf[USPOOF_STACK_BUFFER_SIZE];
 438     UChar   *s2Skeleton;
 439     int32_t  s2SkeletonLength = 0;
 440
 441     int32_t  result = 0;
 442     int32_t  t;
 443     int32_t  s1ScriptCount = This->scriptScan(s1, length1, t, *status);
 444     int32_t  s2ScriptCount = This->scriptScan(s2, length2, t, *status);
 445
 446     if (This->fChecks & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
 447         // Do the Single Script compare.
 448         if (s1ScriptCount <= 1 && s2ScriptCount <= 1) {
 449             flagsForSkeleton |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
 450             s1Skeleton = getSkeleton(sc, flagsForSkeleton, s1, length1, s1SkeletonBuf,
 451                                      sizeof(s1SkeletonBuf)/sizeof(UChar), &s1SkeletonLength, status);
 452             s2Skeleton = getSkeleton(sc, flagsForSkeleton, s2, length2, s2SkeletonBuf,
 453                                      sizeof(s2SkeletonBuf)/sizeof(UChar), &s2SkeletonLength, status);
 454             if (s1SkeletonLength == s2SkeletonLength && u_strncmp(s1Skeleton, s2Skeleton, s1SkeletonLength) == 0) {
 455                 result |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
 456             }
 457             if (s1Skeleton != s1SkeletonBuf) {
 458                 uprv_free(s1Skeleton);
 459             }
 460             if (s2Skeleton != s2SkeletonBuf) {
 461                 uprv_free(s2Skeleton);
 462             }
 463         }
 464     }
 465
 466     if (result & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
 467          // If the two inputs are single script confusable they cannot also be
 468          // mixed or whole script confusable, according to the UAX39 definitions.
 469          // So we can skip those tests.
 470          return result;
 471     }
 472
 473     // Optimization for whole script confusables test:  two identifiers are whole script confusable if
 474     // each is of a single script and they are mixed script confusable.
 475     UBool possiblyWholeScriptConfusables =
 476         s1ScriptCount <= 1 && s2ScriptCount <= 1 && (This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE);
 477
 478     //
 479     // Mixed Script Check
 480     //
 481     if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) || possiblyWholeScriptConfusables ) {
 482         // For getSkeleton(), resetting the USPOOF_SINGLE_SCRIPT_CONFUSABLE flag will get us
 483         // the mixed script table skeleton, which is what we want.
 484         // The Any Case / Lower Case bit in the skelton flags was set at the top of the function.
 485         flagsForSkeleton &= ~USPOOF_SINGLE_SCRIPT_CONFUSABLE;
 486         s1Skeleton = getSkeleton(sc, flagsForSkeleton, s1, length1, s1SkeletonBuf,
 487                                  sizeof(s1SkeletonBuf)/sizeof(UChar), &s1SkeletonLength, status);
 488         s2Skeleton = getSkeleton(sc, flagsForSkeleton, s2, length2, s2SkeletonBuf,
 489                                  sizeof(s2SkeletonBuf)/sizeof(UChar), &s2SkeletonLength, status);
 490         if (s1SkeletonLength == s2SkeletonLength && u_strncmp(s1Skeleton, s2Skeleton, s1SkeletonLength) == 0) {
 491             result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
 492             if (possiblyWholeScriptConfusables) {
 493                 result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
 494             }
 495         }
 496         if (s1Skeleton != s1SkeletonBuf) {
 497             uprv_free(s1Skeleton);
 498         }
 499         if (s2Skeleton != s2SkeletonBuf) {
 500             uprv_free(s2Skeleton);
 501         }
 502     }
 503
 504     return result;
 505 }
 506
 507
 508 // Convenience function for converting a UTF-8 input to a UChar * string, including
 509 //          reallocating a buffer when required.  Parameters and their interpretation mostly
 510 //          match u_strFromUTF8.
 511
 512 static UChar * convertFromUTF8(UChar *outBuf, int32_t outBufCapacity, int32_t *outputLength,
 513                                const char *in, int32_t inLength, UErrorCode *status) {
 514     if (U_FAILURE(*status)) {
 515         return NULL;
 516     }
 517     UChar *dest = outBuf;
 518     u_strFromUTF8(dest, outBufCapacity, outputLength, in, inLength, status);
 519     if (*status == U_BUFFER_OVERFLOW_ERROR) {
 520         dest = static_cast<UChar *>(uprv_malloc(*outputLength * sizeof(UChar)));
 521         if (dest == NULL) {
 522             *status = U_MEMORY_ALLOCATION_ERROR;
 523             return NULL;
 524         }
 525         *status = U_ZERO_ERROR;
 526         u_strFromUTF8(dest, *outputLength, NULL, in, inLength, status);
 527     }
 528     return dest;
 529 }
 530
 531
 532
 533 U_CAPI int32_t U_EXPORT2
 534 uspoof_areConfusableUTF8(const USpoofChecker *sc,
 535                          const char *s1, int32_t length1,
 536                          const char *s2, int32_t length2,
 537                          UErrorCode *status) {
 538
 539     SpoofImpl::validateThis(sc, *status);
 540     if (U_FAILURE(*status)) {
 541         return 0;
 542     }
 543
 544     UChar    s1Buf[USPOOF_STACK_BUFFER_SIZE];
 545     int32_t  lengthS1U;
 546     UChar   *s1U = convertFromUTF8(s1Buf, USPOOF_STACK_BUFFER_SIZE, &lengthS1U, s1, length1, status);
 547
 548     UChar    s2Buf[USPOOF_STACK_BUFFER_SIZE];
 549     int32_t  lengthS2U;
 550     UChar   *s2U = convertFromUTF8(s2Buf, USPOOF_STACK_BUFFER_SIZE, &lengthS2U, s2, length2, status);
 551
 552     int32_t results = uspoof_areConfusable(sc, s1U, lengthS1U, s2U, lengthS2U, status);
 553
 554     if (s1U != s1Buf) {
 555         uprv_free(s1U);
 556     }
 557     if (s2U != s2Buf) {
 558         uprv_free(s2U);
 559     }
 560     return results;
 561 }
 562
 563
 564 U_CAPI int32_t U_EXPORT2
 565 uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
 566                                   const U_NAMESPACE_QUALIFIER UnicodeString &s1,
 567                                   const U_NAMESPACE_QUALIFIER UnicodeString &s2,
 568                                   UErrorCode *status) {
 569
 570     const UChar *u1  = s1.getBuffer();
 571     int32_t  length1 = s1.length();
 572     const UChar *u2  = s2.getBuffer();
 573     int32_t  length2 = s2.length();
 574
 575     int32_t results  = uspoof_areConfusable(sc, u1, length1, u2, length2, status);
 576     return results;
 577 }
 578
 579
 580
 581
 582 U_CAPI int32_t U_EXPORT2
 583 uspoof_checkUnicodeString(const USpoofChecker *sc,
 584                           const U_NAMESPACE_QUALIFIER UnicodeString &text,
 585                           int32_t *position,
 586                           UErrorCode *status) {
 587     int32_t result = uspoof_check(sc, text.getBuffer(), text.length(), position, status);
 588     return result;
 589 }
 590
 591
 592 U_CAPI int32_t U_EXPORT2
 593 uspoof_getSkeleton(const USpoofChecker *sc,
 594                    uint32_t type,
 595                    const UChar *s,  int32_t length,
 596                    UChar *dest, int32_t destCapacity,
 597                    UErrorCode *status) {
 598
 599     // TODO:  this function could be sped up a bit
 600     //        Skip the input normalization when not needed, work from callers data.
 601     //        Put the initial skeleton straight into the caller's destination buffer.
 602     //        It probably won't need normalization.
 603     //        But these would make the structure more complicated.
 604
 605     const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
 606     if (U_FAILURE(*status)) {
 607         return 0;
 608     }
 609     if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL) ||
 610         (type & ~(USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE)) != 0) {
 611         *status = U_ILLEGAL_ARGUMENT_ERROR;
 612         return 0;
 613     }
 614
 615    int32_t tableMask = 0;
 616    switch (type) {
 617       case 0:
 618         tableMask = USPOOF_ML_TABLE_FLAG;
 619         break;
 620       case USPOOF_SINGLE_SCRIPT_CONFUSABLE:
 621         tableMask = USPOOF_SL_TABLE_FLAG;
 622         break;
 623       case USPOOF_ANY_CASE:
 624         tableMask = USPOOF_MA_TABLE_FLAG;
 625         break;
 626       case USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE:
 627         tableMask = USPOOF_SA_TABLE_FLAG;
 628         break;
 629       default:
 630         *status = U_ILLEGAL_ARGUMENT_ERROR;
 631         return 0;
 632     }
 633
 634     // NFD transform of the user supplied input
 635
 636     UChar nfdStackBuf[USPOOF_STACK_BUFFER_SIZE];
 637     UChar *nfdInput = nfdStackBuf;
 638     int32_t normalizedLen = unorm_normalize(
 639         s, length, UNORM_NFD, 0, nfdInput, USPOOF_STACK_BUFFER_SIZE, status);
 640     if (*status == U_BUFFER_OVERFLOW_ERROR) {
 641         nfdInput = (UChar *)uprv_malloc((normalizedLen+1)*sizeof(UChar));
 642         if (nfdInput == NULL) {
 643             *status = U_MEMORY_ALLOCATION_ERROR;
 644             return 0;
 645         }
 646         *status = U_ZERO_ERROR;
 647         normalizedLen = unorm_normalize(s, length, UNORM_NFD, 0,
 648                                         nfdInput, normalizedLen+1, status);
 649     }
 650     if (U_FAILURE(*status)) {
 651         if (nfdInput != nfdStackBuf) {
 652             uprv_free(nfdInput);
 653         }
 654         return 0;
 655     }
 656
 657     // buffer to hold the Unicode defined skeleton mappings for a single code point
 658     UChar buf[USPOOF_MAX_SKELETON_EXPANSION];
 659
 660     // Apply the skeleton mapping to the NFD normalized input string
 661     // Accumulate the skeleton, possibly unnormalized, in a UnicodeString.
 662     int32_t inputIndex = 0;
 663     UnicodeString skelStr;
 664     while (inputIndex < normalizedLen) {
 665         UChar32 c;
 666         U16_NEXT(nfdInput, inputIndex, normalizedLen, c);
 667         int32_t replaceLen = This->confusableLookup(c, tableMask, buf);
 668         skelStr.append(buf, replaceLen);
 669     }
 670
 671     if (nfdInput != nfdStackBuf) {
 672         uprv_free(nfdInput);
 673     }
 674
 675     const UChar *result = skelStr.getBuffer();
 676     int32_t  resultLen  = skelStr.length();
 677     UChar   *normedResult = NULL;
 678
 679     // Check the skeleton for NFD, normalize it if needed.
 680     // Unnormalized results should be very rare.
 681     if (!unorm_isNormalized(result, resultLen, UNORM_NFD, status)) {
 682         normalizedLen = unorm_normalize(result, resultLen, UNORM_NFD, 0, NULL, 0, status);
 683         normedResult = static_cast<UChar *>(uprv_malloc((normalizedLen+1)*sizeof(UChar)));
 684         if (normedResult == NULL) {
 685             *status = U_MEMORY_ALLOCATION_ERROR;
 686             return 0;
 687         }
 688         *status = U_ZERO_ERROR;
 689         unorm_normalize(result, resultLen, UNORM_NFD, 0, normedResult, normalizedLen+1, status);
 690         result = normedResult;
 691         resultLen = normalizedLen;
 692     }
 693
 694     // Copy the skeleton to the caller's buffer
 695     if (U_SUCCESS(*status)) {
 696         if (destCapacity == 0 || resultLen > destCapacity) {
 697             *status = resultLen>destCapacity ? U_BUFFER_OVERFLOW_ERROR : U_STRING_NOT_TERMINATED_WARNING;
 698         } else {
 699             u_memcpy(dest, result, resultLen);
 700             if (destCapacity > resultLen) {
 701                 dest[resultLen] = 0;
 702             } else {
 703                 *status = U_STRING_NOT_TERMINATED_WARNING;
 704             }
 705         }
 706      }
 707      uprv_free(normedResult);
 708      return resultLen;
 709 }
 710
 711
 712
 713 U_CAPI UnicodeString &  U_EXPORT2
 714 uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
 715                                 uint32_t type,
 716                                 const UnicodeString &s,
 717                                 UnicodeString &dest,
 718                                 UErrorCode *status) {
 719     if (U_FAILURE(*status)) {
 720         return dest;
 721     }
 722     dest.remove();
 723
 724     const UChar *str = s.getBuffer();
 725     int32_t      strLen = s.length();
 726     UChar        smallBuf[USPOOF_STACK_BUFFER_SIZE];
 727     UChar       *buf = smallBuf;
 728     int32_t outputSize = uspoof_getSkeleton(sc, type, str, strLen, smallBuf, USPOOF_STACK_BUFFER_SIZE, status);
 729     if (*status == U_BUFFER_OVERFLOW_ERROR) {
 730         buf = static_cast<UChar *>(uprv_malloc((outputSize+1)*sizeof(UChar)));
 731         if (buf == NULL) {
 732             *status = U_MEMORY_ALLOCATION_ERROR;
 733             return dest;
 734         }
 735         *status = U_ZERO_ERROR;
 736         uspoof_getSkeleton(sc, type, str, strLen, buf, outputSize+1, status);
 737     }
 738     if (U_SUCCESS(*status)) {
 739         dest.setTo(buf, outputSize);
 740     }
 741
 742     if (buf != smallBuf) {
 743         uprv_free(buf);
 744     }
 745     return dest;
 746 }
 747
 748
 749 U_CAPI int32_t U_EXPORT2
 750 uspoof_getSkeletonUTF8(const USpoofChecker *sc,
 751                        uint32_t type,
 752                        const char *s,  int32_t length,
 753                        char *dest, int32_t destCapacity,
 754                        UErrorCode *status) {
 755     // Lacking a UTF-8 normalization API, just converting the input to
 756     // UTF-16 seems as good an approach as any.  In typical use, input will
 757     // be an identifier, which is to say not too long for stack buffers.
 758     if (U_FAILURE(*status)) {
 759         return 0;
 760     }
 761     // Buffers for the UChar form of the input and skeleton strings.
 762     UChar    smallInBuf[USPOOF_STACK_BUFFER_SIZE];
 763     UChar   *inBuf = smallInBuf;
 764     UChar    smallOutBuf[USPOOF_STACK_BUFFER_SIZE];
 765     UChar   *outBuf = smallOutBuf;
 766
 767     int32_t  lengthInUChars = 0;
 768     int32_t  skelLengthInUChars = 0;
 769     int32_t  skelLengthInUTF8 = 0;
 770
 771     u_strFromUTF8(inBuf, USPOOF_STACK_BUFFER_SIZE, &lengthInUChars,
 772                   s, length, status);
 773     if (*status == U_BUFFER_OVERFLOW_ERROR) {
 774         inBuf = static_cast<UChar *>(uprv_malloc((lengthInUChars+1)*sizeof(UChar)));
 775         if (inBuf == NULL) {
 776             *status = U_MEMORY_ALLOCATION_ERROR;
 777             goto cleanup;
 778         }
 779         *status = U_ZERO_ERROR;
 780         u_strFromUTF8(inBuf, lengthInUChars+1, &lengthInUChars,
 781                       s, length, status);
 782     }
 783
 784     skelLengthInUChars = uspoof_getSkeleton(sc, type, inBuf, lengthInUChars,
 785                                          outBuf, USPOOF_STACK_BUFFER_SIZE, status);
 786     if (*status == U_BUFFER_OVERFLOW_ERROR) {
 787         outBuf = static_cast<UChar *>(uprv_malloc((skelLengthInUChars+1)*sizeof(UChar)));
 788         if (outBuf == NULL) {
 789             *status = U_MEMORY_ALLOCATION_ERROR;
 790             goto cleanup;
 791         }
 792         *status = U_ZERO_ERROR;
 793         skelLengthInUChars = uspoof_getSkeleton(sc, type, inBuf, lengthInUChars,
 794                                          outBuf, skelLengthInUChars+1, status);
 795     }
 796
 797     u_strToUTF8(dest, destCapacity, &skelLengthInUTF8,
 798                 outBuf, skelLengthInUChars, status);
 799
 800   cleanup:
 801     if (inBuf != smallInBuf) {
 802         uprv_free(inBuf);
 803     }
 804     if (outBuf != smallOutBuf) {
 805         uprv_free(outBuf);
 806     }
 807     return skelLengthInUTF8;
 808 }
 809
 810
 811 U_CAPI int32_t U_EXPORT2
 812 uspoof_serialize(USpoofChecker *sc,void *buf, int32_t capacity, UErrorCode *status) {
 813     SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
 814     if (This == NULL) {
 815         U_ASSERT(U_FAILURE(*status));
 816         return 0;
 817     }
 818     int32_t dataSize = This->fSpoofData->fRawData->fLength;
 819     if (capacity < dataSize) {
 820         *status = U_BUFFER_OVERFLOW_ERROR;
 821         return dataSize;
 822     }
 823     uprv_memcpy(buf, This->fSpoofData->fRawData, dataSize);
 824     return dataSize;
 825 }
 826
 827 #endif