icuSources/test/intltest/itspoof.cpp

   1 /*
   2 **********************************************************************
   3 * Copyright (C) 2011, International Business Machines Corporation
   4 * and others.  All Rights Reserved.
   5 **********************************************************************
   6 */
   7 /**
   8  * IntlTestSpoof tests for USpoofDetector
   9  */
  10
  11 #include "unicode/utypes.h"
  12
  13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO
  14
  15 #include "itspoof.h"
  16 #include "unicode/uspoof.h"
  17 #include "unicode/unistr.h"
  18 #include "unicode/regex.h"
  19 #include "unicode/normlzr.h"
  20 #include "cstring.h"
  21 #include <stdlib.h>
  22 #include <stdio.h>
  23
  24 #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
  25     errcheckln(status, "Failure at file %s, line %d, error = %s", __FILE__, __LINE__, u_errorName(status));}}
  26
  27 #define TEST_ASSERT(expr) {if ((expr)==FALSE) { \
  28     errln("Test Failure at file %s, line %d: \"%s\" is false.\n", __FILE__, __LINE__, #expr);};}
  29
  30 #define TEST_ASSERT_EQ(a, b) { if ((a) != (b)) { \
  31     errln("Test Failure at file %s, line %d: \"%s\" (%d) != \"%s\" (%d) \n", \
  32              __FILE__, __LINE__, #a, (a), #b, (b)); }}
  33
  34 #define TEST_ASSERT_NE(a, b) { if ((a) == (b)) { \
  35     errln("Test Failure at file %s, line %d: \"%s\" (%d) == \"%s\" (%d) \n", \
  36              __FILE__, __LINE__, #a, (a), #b, (b)); }}
  37
  38 /*
  39  *   TEST_SETUP and TEST_TEARDOWN
  40  *         macros to handle the boilerplate around setting up test case.
  41  *         Put arbitrary test code between SETUP and TEARDOWN.
  42  *         "sc" is the ready-to-go  SpoofChecker for use in the tests.
  43  */
  44 #define TEST_SETUP {  \
  45     UErrorCode status = U_ZERO_ERROR; \
  46     USpoofChecker *sc;     \
  47     sc = uspoof_open(&status);  \
  48     TEST_ASSERT_SUCCESS(status);   \
  49     if (U_SUCCESS(status)){
  50
  51 #define TEST_TEARDOWN  \
  52     }  \
  53     TEST_ASSERT_SUCCESS(status);  \
  54     uspoof_close(sc);  \
  55 }
  56
  57
  58
  59
  60 void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
  61 {
  62     if (exec) logln("TestSuite spoof: ");
  63     switch (index) {
  64         case 0:
  65             name = "TestSpoofAPI";
  66             if (exec) {
  67                 testSpoofAPI();
  68             }
  69             break;
  70          case 1:
  71             name = "TestSkeleton";
  72             if (exec) {
  73                 testSkeleton();
  74             }
  75             break;
  76          case 2:
  77             name = "TestAreConfusable";
  78             if (exec) {
  79                 testAreConfusable();
  80             }
  81             break;
  82           case 3:
  83             name = "TestInvisible";
  84             if (exec) {
  85                 testInvisible();
  86             }
  87             break;
  88           case 4:
  89             name = "testConfData";
  90             if (exec) {
  91                 testConfData();
  92             }
  93             break;
  94           case 5:
  95             name = "testBug8654";
  96             if (exec) {
  97                 testBug8654();
  98             }
  99             break;
 100          default: name=""; break;
 101     }
 102 }
 103
 104 void IntlTestSpoof::testSpoofAPI() {
 105
 106     TEST_SETUP
 107         UnicodeString s("xyz");  // Many latin ranges are whole-script confusable with other scripts.
 108                                  // If this test starts failing, consult confusablesWholeScript.txt
 109         int32_t position = 666;
 110         int32_t checkResults = uspoof_checkUnicodeString(sc, s, &position, &status);
 111         TEST_ASSERT_SUCCESS(status);
 112         TEST_ASSERT_EQ(0, checkResults);
 113         TEST_ASSERT_EQ(666, position);
 114     TEST_TEARDOWN;
 115
 116     TEST_SETUP
 117         UnicodeString s1("cxs");
 118         UnicodeString s2 = UnicodeString("\\u0441\\u0445\\u0455").unescape();  // Cyrillic "cxs"
 119         int32_t checkResults = uspoof_areConfusableUnicodeString(sc, s1, s2, &status);
 120         TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults);
 121
 122     TEST_TEARDOWN;
 123
 124     TEST_SETUP
 125         UnicodeString s("I1l0O");
 126         UnicodeString dest;
 127         UnicodeString &retStr = uspoof_getSkeletonUnicodeString(sc, USPOOF_ANY_CASE, s, dest, &status);
 128         TEST_ASSERT_SUCCESS(status);
 129         TEST_ASSERT(UnicodeString("lllOO") == dest);
 130         TEST_ASSERT(&dest == &retStr);
 131     TEST_TEARDOWN;
 132 }
 133
 134
 135 #define CHECK_SKELETON(type, input, expected) { \
 136     checkSkeleton(sc, type, input, expected, __LINE__); \
 137     }
 138
 139
 140 // testSkeleton.   Spot check a number of confusable skeleton substitutions from the
 141 //                 Unicode data file confusables.txt
 142 //                 Test cases chosen for substitutions of various lengths, and
 143 //                 membership in different mapping tables.
 144 void IntlTestSpoof::testSkeleton() {
 145     const uint32_t ML = 0;
 146     const uint32_t SL = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
 147     const uint32_t MA = USPOOF_ANY_CASE;
 148     const uint32_t SA = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE;
 149
 150     TEST_SETUP
 151         // A long "identifier" that will overflow implementation stack buffers, forcing heap allocations.
 152         CHECK_SKELETON(SL, " A 1ong \\u02b9identifier' that will overflow implementation stack buffers, forcing heap allocations."
 153                            " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
 154                            " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
 155                            " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations.",
 156
 157                " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
 158                " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
 159                " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
 160                " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations.")
 161
 162         CHECK_SKELETON(SL, "nochange", "nochange");
 163         CHECK_SKELETON(MA, "love", "love");
 164         CHECK_SKELETON(MA, "1ove", "love");   // Digit 1 to letter l
 165         CHECK_SKELETON(ML, "OOPS", "OOPS");
 166         CHECK_SKELETON(ML, "00PS", "00PS");   // Digit 0 unchanged in lower case mode.
 167         CHECK_SKELETON(MA, "OOPS", "OOPS");
 168         CHECK_SKELETON(MA, "00PS", "OOPS");   // Digit 0 to letter O in any case mode only
 169         CHECK_SKELETON(SL, "\\u059c", "\\u0301");
 170         CHECK_SKELETON(SL, "\\u2A74", "\\u003A\\u003A\\u003D");
 171         CHECK_SKELETON(SL, "\\u247E", "\\u0028\\u006C\\u006C\\u0029");  // "(ll)"
 172         CHECK_SKELETON(SL, "\\uFDFB", "\\u062C\\u0644\\u0020\\u062C\\u0644\\u0627\\u0644\\u0647");
 173
 174         // This mapping exists in the ML and MA tables, does not exist in SL, SA
 175         //0C83 ;        0C03 ;
 176         CHECK_SKELETON(SL, "\\u0C83", "\\u0C83");
 177         CHECK_SKELETON(SA, "\\u0C83", "\\u0C83");
 178         CHECK_SKELETON(ML, "\\u0C83", "\\u0983");
 179         CHECK_SKELETON(MA, "\\u0C83", "\\u0983");
 180
 181         // 0391 ; 0041 ;
 182         // This mapping exists only in the MA table.
 183         CHECK_SKELETON(MA, "\\u0391", "A");
 184         CHECK_SKELETON(SA, "\\u0391", "\\u0391");
 185         CHECK_SKELETON(ML, "\\u0391", "\\u0391");
 186         CHECK_SKELETON(SL, "\\u0391", "\\u0391");
 187
 188         // 13CF ;  0062 ;
 189         // This mapping exists in the ML and MA tables
 190         CHECK_SKELETON(ML, "\\u13CF", "b");
 191         CHECK_SKELETON(MA, "\\u13CF", "b");
 192         CHECK_SKELETON(SL, "\\u13CF", "\\u13CF");
 193         CHECK_SKELETON(SA, "\\u13CF", "\\u13CF");
 194
 195         // 0022 ;  0027 0027 ;
 196         // all tables.
 197         CHECK_SKELETON(SL, "\\u0022", "\\u0027\\u0027");
 198         CHECK_SKELETON(SA, "\\u0022", "\\u0027\\u0027");
 199         CHECK_SKELETON(ML, "\\u0022", "\\u0027\\u0027");
 200         CHECK_SKELETON(MA, "\\u0022", "\\u0027\\u0027");
 201
 202         // 017F ;  0066 ;
 203         // This mapping exists in the SA and MA tables
 204         CHECK_SKELETON(MA, "\\u017F", "f");
 205         CHECK_SKELETON(SA, "\\u017F", "f");
 206
 207     TEST_TEARDOWN;
 208 }
 209
 210
 211 //
 212 //  Run a single confusable skeleton transformation test case.
 213 //
 214 void IntlTestSpoof::checkSkeleton(const USpoofChecker *sc, uint32_t type,
 215                                   const char *input, const char *expected, int32_t lineNum) {
 216     UnicodeString uInput = UnicodeString(input).unescape();
 217     UnicodeString uExpected = UnicodeString(expected).unescape();
 218
 219     UErrorCode status = U_ZERO_ERROR;
 220     UnicodeString actual;
 221     uspoof_getSkeletonUnicodeString(sc, type, uInput, actual, &status);
 222     if (U_FAILURE(status)) {
 223         errln("File %s, Line %d, Test case from line %d, status is %s", __FILE__, __LINE__, lineNum,
 224               u_errorName(status));
 225         return;
 226     }
 227     if (uExpected != actual) {
 228         errln("File %s, Line %d, Test case from line %d, Actual and Expected skeletons differ.",
 229                __FILE__, __LINE__, lineNum);
 230         errln(UnicodeString(" Actual   Skeleton: \"") + actual + UnicodeString("\"\n") +
 231               UnicodeString(" Expected Skeleton: \"") + uExpected + UnicodeString("\""));
 232     }
 233 }
 234
 235 void IntlTestSpoof::testAreConfusable() {
 236     TEST_SETUP
 237         UnicodeString s1("A long string that will overflow stack buffers.  A long string that will overflow stack buffers. "
 238                          "A long string that will overflow stack buffers.  A long string that will overflow stack buffers. ");
 239         UnicodeString s2("A long string that wi11 overflow stack buffers.  A long string that will overflow stack buffers. "
 240                          "A long string that wi11 overflow stack buffers.  A long string that will overflow stack buffers. ");
 241         TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, uspoof_areConfusableUnicodeString(sc, s1, s2, &status));
 242         TEST_ASSERT_SUCCESS(status);
 243
 244     TEST_TEARDOWN;
 245 }
 246
 247 void IntlTestSpoof::testInvisible() {
 248     TEST_SETUP
 249         UnicodeString  s = UnicodeString("abcd\\u0301ef").unescape();
 250         int32_t position = -42;
 251         TEST_ASSERT_EQ(0, uspoof_checkUnicodeString(sc, s, &position, &status));
 252         TEST_ASSERT_SUCCESS(status);
 253         TEST_ASSERT(position == -42);
 254
 255         UnicodeString  s2 = UnicodeString("abcd\\u0301\\u0302\\u0301ef").unescape();
 256         TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s2, &position, &status));
 257         TEST_ASSERT_SUCCESS(status);
 258         TEST_ASSERT_EQ(7, position);
 259
 260         // Two acute accents, one from the composed a with acute accent, \u00e1,
 261         // and one separate.
 262         position = -42;
 263         UnicodeString  s3 = UnicodeString("abcd\\u00e1\\u0301xyz").unescape();
 264         TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s3, &position, &status));
 265         TEST_ASSERT_SUCCESS(status);
 266         TEST_ASSERT_EQ(7, position);
 267     TEST_TEARDOWN;
 268 }
 269
 270 void IntlTestSpoof::testBug8654() {
 271     TEST_SETUP
 272         UnicodeString s = UnicodeString("B\\u00c1\\u0301").unescape();
 273         int32_t position = -42;
 274         TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s, &position, &status) & USPOOF_INVISIBLE );
 275         TEST_ASSERT_SUCCESS(status);
 276         TEST_ASSERT_EQ(3, position);
 277     TEST_TEARDOWN;
 278 }
 279
 280 static UnicodeString parseHex(const UnicodeString &in) {
 281     // Convert a series of hex numbers in a Unicode String to a string with the
 282     // corresponding characters.
 283     // The conversion is _really_ annoying.  There must be some function to just do it.
 284     UnicodeString result;
 285     UChar32 cc = 0;
 286     for (int32_t i=0; i<in.length(); i++) {
 287         UChar c = in.charAt(i);
 288         if (c == 0x20) {   // Space
 289             if (cc > 0) {
 290                result.append(cc);
 291                cc = 0;
 292             }
 293         } else if (c>=0x30 && c<=0x39) {
 294             cc = (cc<<4) + (c - 0x30);
 295         } else if ((c>=0x41 && c<=0x46) || (c>=0x61 && c<=0x66)) {
 296             cc = (cc<<4) + (c & 0x0f)+9;
 297         }
 298         // else do something with bad input.
 299     }
 300     if (cc > 0) {
 301         result.append(cc);
 302     }
 303     return result;
 304 }
 305
 306
 307 //
 308 // Append the hex form of a UChar32 to a UnicodeString.
 309 // Used in formatting error messages.
 310 // Match the formatting of numbers in confusables.txt
 311 // Minimum of 4 digits, no leading zeroes for positions 5 and up.
 312 //
 313 static void appendHexUChar(UnicodeString &dest, UChar32 c) {
 314     UBool   doZeroes = FALSE;
 315     for (int bitNum=28; bitNum>=0; bitNum-=4) {
 316         if (bitNum <= 12) {
 317             doZeroes = TRUE;
 318         }
 319         int hexDigit = (c>>bitNum) & 0x0f;
 320         if (hexDigit != 0 || doZeroes) {
 321             doZeroes = TRUE;
 322             dest.append((UChar)(hexDigit<=9? hexDigit + 0x30: hexDigit -10 + 0x41));
 323         }
 324     }
 325     dest.append((UChar)0x20);
 326 }
 327
 328 U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
 329
 330 //  testConfData - Check each data item from the Unicode confusables.txt file,
 331 //                 verify that it transforms correctly in a skeleton.
 332 //
 333 void IntlTestSpoof::testConfData() {
 334     UErrorCode status = U_ZERO_ERROR;
 335
 336     const char *testDataDir = IntlTest::getSourceTestData(status);
 337     TEST_ASSERT_SUCCESS(status);
 338     char buffer[2000];
 339     uprv_strcpy(buffer, testDataDir);
 340     uprv_strcat(buffer, "confusables.txt");
 341
 342     LocalStdioFilePointer f(fopen(buffer, "rb"));
 343     if (f.isNull()) {
 344         errln("Skipping test spoof/testConfData.  File confusables.txt not accessible.");
 345         return;
 346     }
 347     fseek(f.getAlias(), 0, SEEK_END);
 348     int32_t  fileSize = ftell(f.getAlias());
 349     LocalArray<char> fileBuf(new char[fileSize]);
 350     fseek(f.getAlias(), 0, SEEK_SET);
 351     int32_t amt_read = fread(fileBuf.getAlias(), 1, fileSize, f.getAlias());
 352     TEST_ASSERT_EQ(amt_read, fileSize);
 353     TEST_ASSERT(fileSize>0);
 354     if (amt_read != fileSize || fileSize <=0) {
 355         return;
 356     }
 357     UnicodeString confusablesTxt = UnicodeString::fromUTF8(StringPiece(fileBuf.getAlias(), fileSize));
 358
 359     LocalUSpoofCheckerPointer sc(uspoof_open(&status));
 360     TEST_ASSERT_SUCCESS(status);
 361
 362     // Parse lines from the confusables.txt file.  Example Line:
 363     // FF44 ;   0064 ;  SL      # ( d -> d ) FULLWIDTH ....
 364     // Three fields.  The hex fields can contain more than one character,
 365     //                and each character may be more than 4 digits (for supplemntals)
 366     // This regular expression matches lines and splits the fields into capture groups.
 367     RegexMatcher parseLine("(?m)^([0-9A-F]{4}[^#;]*?);([^#;]*?);([^#]*)", confusablesTxt, 0, status);
 368     TEST_ASSERT_SUCCESS(status);
 369     while (parseLine.find()) {
 370         UnicodeString from = parseHex(parseLine.group(1, status));
 371         if (!Normalizer::isNormalized(from, UNORM_NFD, status)) {
 372             // The source character was not NFD.
 373             // Skip this case; the first step in obtaining a skeleton is to NFD the input,
 374             //  so the mapping in this line of confusables.txt will never be applied.
 375             continue;
 376         }
 377
 378         UnicodeString rawExpected = parseHex(parseLine.group(2, status));
 379         UnicodeString expected;
 380         Normalizer::decompose(rawExpected, FALSE /*NFD*/, 0, expected, status);
 381         TEST_ASSERT_SUCCESS(status);
 382
 383         int32_t skeletonType = 0;
 384         UnicodeString tableType = parseLine.group(3, status);
 385         TEST_ASSERT_SUCCESS(status);
 386         if (tableType.indexOf("SL") >= 0) {
 387             skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
 388         } else if (tableType.indexOf("SA") >= 0) {
 389             skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE;
 390         } else if (tableType.indexOf("ML") >= 0) {
 391             skeletonType = 0;
 392         } else if (tableType.indexOf("MA") >= 0) {
 393             skeletonType = USPOOF_ANY_CASE;
 394         }
 395
 396         UnicodeString actual;
 397         uspoof_getSkeletonUnicodeString(sc.getAlias(), skeletonType, from, actual, &status);
 398         TEST_ASSERT_SUCCESS(status);
 399         TEST_ASSERT(actual == expected);
 400         if (actual != expected) {
 401             errln(parseLine.group(0, status));
 402             UnicodeString line = "Actual: ";
 403             int i = 0;
 404             while (i < actual.length()) {
 405                 appendHexUChar(line, actual.char32At(i));
 406                 i = actual.moveIndex32(i, 1);
 407             }
 408             errln(line);
 409         }
 410         if (U_FAILURE(status)) {
 411             break;
 412         }
 413     }
 414 }
 415 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS
 416