icuSources/test/intltest/itspoof.cpp

   1 /*
   2 **********************************************************************
   3 * Copyright (C) 2011, International Business Machines Corporation
   4 * and others.  All Rights Reserved.
   5 **********************************************************************
   6 */
   7 /**
   8  * IntlTestSpoof tests for USpoofDetector
   9  */
  10
  11 #include "unicode/utypes.h"
  12
  13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO
  14
  15 #include "itspoof.h"
  16 #include "unicode/uspoof.h"
  17 #include "unicode/unistr.h"
  18 #include "unicode/regex.h"
  19 #include "unicode/normlzr.h"
  20 #include "cstring.h"
  21 #include <stdlib.h>
  22 #include <stdio.h>
  23
  24 #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
  25     errcheckln(status, "Failure at file %s, line %d, error = %s", __FILE__, __LINE__, u_errorName(status));}}
  26
  27 #define TEST_ASSERT(expr) {if ((expr)==FALSE) { \
  28     errln("Test Failure at file %s, line %d: \"%s\" is false.\n", __FILE__, __LINE__, #expr);};}
  29
  30 #define TEST_ASSERT_EQ(a, b) { if ((a) != (b)) { \
  31     errln("Test Failure at file %s, line %d: \"%s\" (%d) != \"%s\" (%d) \n", \
  32              __FILE__, __LINE__, #a, (a), #b, (b)); }}
  33
  34 #define TEST_ASSERT_NE(a, b) { if ((a) == (b)) { \
  35     errln("Test Failure at file %s, line %d: \"%s\" (%d) == \"%s\" (%d) \n", \
  36              __FILE__, __LINE__, #a, (a), #b, (b)); }}
  37
  38 /*
  39  *   TEST_SETUP and TEST_TEARDOWN
  40  *         macros to handle the boilerplate around setting up test case.
  41  *         Put arbitrary test code between SETUP and TEARDOWN.
  42  *         "sc" is the ready-to-go  SpoofChecker for use in the tests.
  43  */
  44 #define TEST_SETUP {  \
  45     UErrorCode status = U_ZERO_ERROR; \
  46     USpoofChecker *sc;     \
  47     sc = uspoof_open(&status);  \
  48     TEST_ASSERT_SUCCESS(status);   \
  49     if (U_SUCCESS(status)){
  50
  51 #define TEST_TEARDOWN  \
  52     }  \
  53     TEST_ASSERT_SUCCESS(status);  \
  54     uspoof_close(sc);  \
  55 }
  56
  57
  58
  59
  60 void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
  61 {
  62     if (exec) logln("TestSuite spoof: ");
  63     switch (index) {
  64         case 0:
  65             name = "TestSpoofAPI";
  66             if (exec) {
  67                 testSpoofAPI();
  68             }
  69             break;
  70          case 1:
  71             name = "TestSkeleton";
  72             if (exec) {
  73                 testSkeleton();
  74             }
  75             break;
  76          case 2:
  77             name = "TestAreConfusable";
  78             if (exec) {
  79                 testAreConfusable();
  80             }
  81             break;
  82           case 3:
  83             name = "TestInvisible";
  84             if (exec) {
  85                 testInvisible();
  86             }
  87             break;
  88           case 4:
  89             name = "testConfData";
  90             if (exec) {
  91                 testConfData();
  92             }
  93             break;
  94         default: name=""; break;
  95     }
  96 }
  97
  98 void IntlTestSpoof::testSpoofAPI() {
  99
 100     TEST_SETUP
 101         UnicodeString s("xyz");  // Many latin ranges are whole-script confusable with other scripts.
 102                                  // If this test starts failing, consult confusablesWholeScript.txt
 103         int32_t position = 666;
 104         int32_t checkResults = uspoof_checkUnicodeString(sc, s, &position, &status);
 105         TEST_ASSERT_SUCCESS(status);
 106         TEST_ASSERT_EQ(0, checkResults);
 107         TEST_ASSERT_EQ(666, position);
 108     TEST_TEARDOWN;
 109
 110     TEST_SETUP
 111         UnicodeString s1("cxs");
 112         UnicodeString s2 = UnicodeString("\\u0441\\u0445\\u0455").unescape();  // Cyrillic "cxs"
 113         int32_t checkResults = uspoof_areConfusableUnicodeString(sc, s1, s2, &status);
 114         TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults);
 115
 116     TEST_TEARDOWN;
 117
 118     TEST_SETUP
 119         UnicodeString s("I1l0O");
 120         UnicodeString dest;
 121         UnicodeString &retStr = uspoof_getSkeletonUnicodeString(sc, USPOOF_ANY_CASE, s, dest, &status);
 122         TEST_ASSERT_SUCCESS(status);
 123         TEST_ASSERT(UnicodeString("lllOO") == dest);
 124         TEST_ASSERT(&dest == &retStr);
 125     TEST_TEARDOWN;
 126 }
 127
 128
 129 #define CHECK_SKELETON(type, input, expected) { \
 130     checkSkeleton(sc, type, input, expected, __LINE__); \
 131     }
 132
 133
 134 // testSkeleton.   Spot check a number of confusable skeleton substitutions from the
 135 //                 Unicode data file confusables.txt
 136 //                 Test cases chosen for substitutions of various lengths, and
 137 //                 membership in different mapping tables.
 138 void IntlTestSpoof::testSkeleton() {
 139     const uint32_t ML = 0;
 140     const uint32_t SL = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
 141     const uint32_t MA = USPOOF_ANY_CASE;
 142     const uint32_t SA = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE;
 143
 144     TEST_SETUP
 145         // A long "identifier" that will overflow implementation stack buffers, forcing heap allocations.
 146         CHECK_SKELETON(SL, " A 1ong \\u02b9identifier' that will overflow implementation stack buffers, forcing heap allocations."
 147                            " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
 148                            " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
 149                            " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations.",
 150
 151                " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
 152                " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
 153                " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
 154                " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations.")
 155
 156         CHECK_SKELETON(SL, "nochange", "nochange");
 157         CHECK_SKELETON(MA, "love", "love");
 158         CHECK_SKELETON(MA, "1ove", "love");   // Digit 1 to letter l
 159         CHECK_SKELETON(ML, "OOPS", "OOPS");
 160         CHECK_SKELETON(ML, "00PS", "00PS");   // Digit 0 unchanged in lower case mode.
 161         CHECK_SKELETON(MA, "OOPS", "OOPS");
 162         CHECK_SKELETON(MA, "00PS", "OOPS");   // Digit 0 to letter O in any case mode only
 163         CHECK_SKELETON(SL, "\\u059c", "\\u0301");
 164         CHECK_SKELETON(SL, "\\u2A74", "\\u003A\\u003A\\u003D");
 165         CHECK_SKELETON(SL, "\\u247E", "\\u0028\\u006C\\u006C\\u0029");  // "(ll)"
 166         CHECK_SKELETON(SL, "\\uFDFB", "\\u062C\\u0644\\u0020\\u062C\\u0644\\u0627\\u0644\\u0647");
 167
 168         // This mapping exists in the ML and MA tables, does not exist in SL, SA
 169         //0C83 ;        0C03 ;
 170         CHECK_SKELETON(SL, "\\u0C83", "\\u0C83");
 171         CHECK_SKELETON(SA, "\\u0C83", "\\u0C83");
 172         CHECK_SKELETON(ML, "\\u0C83", "\\u0983");
 173         CHECK_SKELETON(MA, "\\u0C83", "\\u0983");
 174
 175         // 0391 ; 0041 ;
 176         // This mapping exists only in the MA table.
 177         CHECK_SKELETON(MA, "\\u0391", "A");
 178         CHECK_SKELETON(SA, "\\u0391", "\\u0391");
 179         CHECK_SKELETON(ML, "\\u0391", "\\u0391");
 180         CHECK_SKELETON(SL, "\\u0391", "\\u0391");
 181
 182         // 13CF ;  0062 ;
 183         // This mapping exists in the ML and MA tables
 184         CHECK_SKELETON(ML, "\\u13CF", "b");
 185         CHECK_SKELETON(MA, "\\u13CF", "b");
 186         CHECK_SKELETON(SL, "\\u13CF", "\\u13CF");
 187         CHECK_SKELETON(SA, "\\u13CF", "\\u13CF");
 188
 189         // 0022 ;  0027 0027 ;
 190         // all tables.
 191         CHECK_SKELETON(SL, "\\u0022", "\\u0027\\u0027");
 192         CHECK_SKELETON(SA, "\\u0022", "\\u0027\\u0027");
 193         CHECK_SKELETON(ML, "\\u0022", "\\u0027\\u0027");
 194         CHECK_SKELETON(MA, "\\u0022", "\\u0027\\u0027");
 195
 196         // 017F ;  0066 ;
 197         // This mapping exists in the SA and MA tables
 198         CHECK_SKELETON(MA, "\\u017F", "f");
 199         CHECK_SKELETON(SA, "\\u017F", "f");
 200
 201     TEST_TEARDOWN;
 202 }
 203
 204
 205 //
 206 //  Run a single confusable skeleton transformation test case.
 207 //
 208 void IntlTestSpoof::checkSkeleton(const USpoofChecker *sc, uint32_t type,
 209                                   const char *input, const char *expected, int32_t lineNum) {
 210     UnicodeString uInput = UnicodeString(input).unescape();
 211     UnicodeString uExpected = UnicodeString(expected).unescape();
 212
 213     UErrorCode status = U_ZERO_ERROR;
 214     UnicodeString actual;
 215     uspoof_getSkeletonUnicodeString(sc, type, uInput, actual, &status);
 216     if (U_FAILURE(status)) {
 217         errln("File %s, Line %d, Test case from line %d, status is %s", __FILE__, __LINE__, lineNum,
 218               u_errorName(status));
 219         return;
 220     }
 221     if (uExpected != actual) {
 222         errln("File %s, Line %d, Test case from line %d, Actual and Expected skeletons differ.",
 223                __FILE__, __LINE__, lineNum);
 224         errln(UnicodeString(" Actual   Skeleton: \"") + actual + UnicodeString("\"\n") +
 225               UnicodeString(" Expected Skeleton: \"") + uExpected + UnicodeString("\""));
 226     }
 227 }
 228
 229 void IntlTestSpoof::testAreConfusable() {
 230     TEST_SETUP
 231         UnicodeString s1("A long string that will overflow stack buffers.  A long string that will overflow stack buffers. "
 232                          "A long string that will overflow stack buffers.  A long string that will overflow stack buffers. ");
 233         UnicodeString s2("A long string that wi11 overflow stack buffers.  A long string that will overflow stack buffers. "
 234                          "A long string that wi11 overflow stack buffers.  A long string that will overflow stack buffers. ");
 235         TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, uspoof_areConfusableUnicodeString(sc, s1, s2, &status));
 236         TEST_ASSERT_SUCCESS(status);
 237
 238     TEST_TEARDOWN;
 239 }
 240
 241 void IntlTestSpoof::testInvisible() {
 242     TEST_SETUP
 243         UnicodeString  s = UnicodeString("abcd\\u0301ef").unescape();
 244         int32_t position = -42;
 245         TEST_ASSERT_EQ(0, uspoof_checkUnicodeString(sc, s, &position, &status));
 246         TEST_ASSERT_SUCCESS(status);
 247         TEST_ASSERT(position == -42);
 248
 249         UnicodeString  s2 = UnicodeString("abcd\\u0301\\u0302\\u0301ef").unescape();
 250         TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s2, &position, &status));
 251         TEST_ASSERT_SUCCESS(status);
 252         TEST_ASSERT_EQ(7, position);
 253
 254         // Tow acute accents, one from the composed a with acute accent, \u00e1,
 255         // and one separate.
 256         position = -42;
 257         UnicodeString  s3 = UnicodeString("abcd\\u00e1\\u0301xyz").unescape();
 258         TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s3, &position, &status));
 259         TEST_ASSERT_SUCCESS(status);
 260         TEST_ASSERT_EQ(7, position);
 261     TEST_TEARDOWN;
 262 }
 263
 264
 265 static UnicodeString parseHex(const UnicodeString &in) {
 266     // Convert a series of hex numbers in a Unicode String to a string with the
 267     // corresponding characters.
 268     // The conversion is _really_ annoying.  There must be some function to just do it.
 269     UnicodeString result;
 270     UChar32 cc = 0;
 271     for (int32_t i=0; i<in.length(); i++) {
 272         UChar c = in.charAt(i);
 273         if (c == 0x20) {   // Space
 274             if (cc > 0) {
 275                result.append(cc);
 276                cc = 0;
 277             }
 278         } else if (c>=0x30 && c<=0x39) {
 279             cc = (cc<<4) + (c - 0x30);
 280         } else if ((c>=0x41 && c<=0x46) || (c>=0x61 && c<=0x66)) {
 281             cc = (cc<<4) + (c & 0x0f)+9;
 282         }
 283         // else do something with bad input.
 284     }
 285     if (cc > 0) {
 286         result.append(cc);
 287     }
 288     return result;
 289 }
 290
 291
 292 //
 293 // Append the hex form of a UChar32 to a UnicodeString.
 294 // Used in formatting error messages.
 295 // Match the formatting of numbers in confusables.txt
 296 // Minimum of 4 digits, no leading zeroes for positions 5 and up.
 297 //
 298 static void appendHexUChar(UnicodeString &dest, UChar32 c) {
 299     UBool   doZeroes = FALSE;
 300     for (int bitNum=28; bitNum>=0; bitNum-=4) {
 301         if (bitNum <= 12) {
 302             doZeroes = TRUE;
 303         }
 304         int hexDigit = (c>>bitNum) & 0x0f;
 305         if (hexDigit != 0 || doZeroes) {
 306             doZeroes = TRUE;
 307             dest.append((UChar)(hexDigit<=9? hexDigit + 0x30: hexDigit -10 + 0x41));
 308         }
 309     }
 310     dest.append((UChar)0x20);
 311 }
 312
 313 U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
 314
 315 //  testConfData - Check each data item from the Unicode confusables.txt file,
 316 //                 verify that it transforms correctly in a skeleton.
 317 //
 318 void IntlTestSpoof::testConfData() {
 319     UErrorCode status = U_ZERO_ERROR;
 320
 321     const char *testDataDir = IntlTest::getSourceTestData(status);
 322     TEST_ASSERT_SUCCESS(status);
 323     char buffer[2000];
 324     uprv_strcpy(buffer, testDataDir);
 325     uprv_strcat(buffer, "confusables.txt");
 326
 327     LocalStdioFilePointer f(fopen(buffer, "rb"));
 328     if (f.isNull()) {
 329         errln("Skipping test spoof/testConfData.  File confusables.txt not accessible.");
 330         return;
 331     }
 332     fseek(f.getAlias(), 0, SEEK_END);
 333     int32_t  fileSize = ftell(f.getAlias());
 334     LocalArray<char> fileBuf(new char[fileSize]);
 335     fseek(f.getAlias(), 0, SEEK_SET);
 336     int32_t amt_read = fread(fileBuf.getAlias(), 1, fileSize, f.getAlias());
 337     TEST_ASSERT_EQ(amt_read, fileSize);
 338     TEST_ASSERT(fileSize>0);
 339     if (amt_read != fileSize || fileSize <=0) {
 340         return;
 341     }
 342     UnicodeString confusablesTxt = UnicodeString::fromUTF8(StringPiece(fileBuf.getAlias(), fileSize));
 343
 344     LocalUSpoofCheckerPointer sc(uspoof_open(&status));
 345     TEST_ASSERT_SUCCESS(status);
 346
 347     // Parse lines from the confusables.txt file.  Example Line:
 348     // FF44 ;   0064 ;  SL      # ( d -> d ) FULLWIDTH ....
 349     // Three fields.  The hex fields can contain more than one character,
 350     //                and each character may be more than 4 digits (for supplemntals)
 351     // This regular expression matches lines and splits the fields into capture groups.
 352     RegexMatcher parseLine("(?m)^([0-9A-F]{4}[^#;]*?);([^#;]*?);([^#]*)", confusablesTxt, 0, status);
 353     TEST_ASSERT_SUCCESS(status);
 354     while (parseLine.find()) {
 355         UnicodeString from = parseHex(parseLine.group(1, status));
 356         if (!Normalizer::isNormalized(from, UNORM_NFD, status)) {
 357             // The source character was not NFD.
 358             // Skip this case; the first step in obtaining a skeleton is to NFD the input,
 359             //  so the mapping in this line of confusables.txt will never be applied.
 360             continue;
 361         }
 362
 363         UnicodeString rawExpected = parseHex(parseLine.group(2, status));
 364         UnicodeString expected;
 365         Normalizer::decompose(rawExpected, FALSE /*NFD*/, 0, expected, status);
 366         TEST_ASSERT_SUCCESS(status);
 367
 368         int32_t skeletonType = 0;
 369         UnicodeString tableType = parseLine.group(3, status);
 370         TEST_ASSERT_SUCCESS(status);
 371         if (tableType.indexOf("SL") >= 0) {
 372             skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
 373         } else if (tableType.indexOf("SA") >= 0) {
 374             skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE;
 375         } else if (tableType.indexOf("ML") >= 0) {
 376             skeletonType = 0;
 377         } else if (tableType.indexOf("MA") >= 0) {
 378             skeletonType = USPOOF_ANY_CASE;
 379         }
 380
 381         UnicodeString actual;
 382         uspoof_getSkeletonUnicodeString(sc.getAlias(), skeletonType, from, actual, &status);
 383         TEST_ASSERT_SUCCESS(status);
 384         TEST_ASSERT(actual == expected);
 385         if (actual != expected) {
 386             errln(parseLine.group(0, status));
 387             UnicodeString line = "Actual: ";
 388             int i = 0;
 389             while (i < actual.length()) {
 390                 appendHexUChar(line, actual.char32At(i));
 391                 i = actual.moveIndex32(i, 1);
 392             }
 393             errln(line);
 394         }
 395         if (U_FAILURE(status)) {
 396             break;
 397         }
 398     }
 399 }
 400 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS
 401