]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/itspoof.cpp
ICU-461.18.tar.gz
[apple/icu.git] / icuSources / test / intltest / itspoof.cpp
1 /*
2 **********************************************************************
3 * Copyright (C) 2011, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 **********************************************************************
6 */
7 /**
8 * IntlTestSpoof tests for USpoofDetector
9 */
10
11 #include "unicode/utypes.h"
12
13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO
14
15 #include "itspoof.h"
16 #include "unicode/uspoof.h"
17 #include "unicode/unistr.h"
18 #include "unicode/regex.h"
19 #include "unicode/normlzr.h"
20 #include "cstring.h"
21 #include <stdlib.h>
22 #include <stdio.h>
23
// Assertion helpers. Each one reports through the test object's error methods
// (errcheckln / errln) and then lets the test continue; none of them returns
// or aborts.  They must be used inside IntlTestSpoof member functions.

// Reports the ICU error name if 'status' holds a failure code.
#define TEST_ASSERT_SUCCESS(status) { \
    if (U_FAILURE(status)) { \
        errcheckln(status, "Failure at file %s, line %d, error = %s", \
                   __FILE__, __LINE__, u_errorName(status)); \
    } \
}

// Reports the text of 'expr' if it evaluates to false.
#define TEST_ASSERT(expr) { \
    if (!(expr)) { \
        errln("Test Failure at file %s, line %d: \"%s\" is false.\n", \
              __FILE__, __LINE__, #expr); \
    } \
}

// Reports both expressions and their values if they differ.
// The values are printed with %d, so use with integer arguments only.
// Note: on failure each argument is evaluated a second time for the message.
#define TEST_ASSERT_EQ(a, b) { \
    if ((a) != (b)) { \
        errln("Test Failure at file %s, line %d: \"%s\" (%d) != \"%s\" (%d) \n", \
              __FILE__, __LINE__, #a, (a), #b, (b)); \
    } \
}

// Reports both expressions and their values if they are equal.
// Same %d / double-evaluation caveats as TEST_ASSERT_EQ.
#define TEST_ASSERT_NE(a, b) { \
    if ((a) == (b)) { \
        errln("Test Failure at file %s, line %d: \"%s\" (%d) == \"%s\" (%d) \n", \
              __FILE__, __LINE__, #a, (a), #b, (b)); \
    } \
}
37
/*
 *   TEST_SETUP and TEST_TEARDOWN
 *         Paired macros wrapping the boilerplate shared by the test cases.
 *         TEST_SETUP opens a new scope, declares a local UErrorCode "status",
 *         opens a USpoofChecker named "sc", and starts a block that is only
 *         entered when the open succeeded.  Arbitrary test code goes between
 *         the two macros.  TEST_TEARDOWN closes that block, reports any failure
 *         left in "status", closes "sc" and ends the scope.
 */
#define TEST_SETUP { \
    UErrorCode status = U_ZERO_ERROR; \
    USpoofChecker *sc = uspoof_open(&status); \
    TEST_ASSERT_SUCCESS(status); \
    if (U_SUCCESS(status)) {

#define TEST_TEARDOWN \
    } \
    TEST_ASSERT_SUCCESS(status); \
    uspoof_close(sc); \
}
56
57
58
59
60 void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
61 {
62 if (exec) logln("TestSuite spoof: ");
63 switch (index) {
64 case 0:
65 name = "TestSpoofAPI";
66 if (exec) {
67 testSpoofAPI();
68 }
69 break;
70 case 1:
71 name = "TestSkeleton";
72 if (exec) {
73 testSkeleton();
74 }
75 break;
76 case 2:
77 name = "TestAreConfusable";
78 if (exec) {
79 testAreConfusable();
80 }
81 break;
82 case 3:
83 name = "TestInvisible";
84 if (exec) {
85 testInvisible();
86 }
87 break;
88 case 4:
89 name = "testConfData";
90 if (exec) {
91 testConfData();
92 }
93 break;
94 default: name=""; break;
95 }
96 }
97
98 void IntlTestSpoof::testSpoofAPI() {
99
100 TEST_SETUP
101 UnicodeString s("xyz"); // Many latin ranges are whole-script confusable with other scripts.
102 // If this test starts failing, consult confusablesWholeScript.txt
103 int32_t position = 666;
104 int32_t checkResults = uspoof_checkUnicodeString(sc, s, &position, &status);
105 TEST_ASSERT_SUCCESS(status);
106 TEST_ASSERT_EQ(0, checkResults);
107 TEST_ASSERT_EQ(666, position);
108 TEST_TEARDOWN;
109
110 TEST_SETUP
111 UnicodeString s1("cxs");
112 UnicodeString s2 = UnicodeString("\\u0441\\u0445\\u0455").unescape(); // Cyrillic "cxs"
113 int32_t checkResults = uspoof_areConfusableUnicodeString(sc, s1, s2, &status);
114 TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults);
115
116 TEST_TEARDOWN;
117
118 TEST_SETUP
119 UnicodeString s("I1l0O");
120 UnicodeString dest;
121 UnicodeString &retStr = uspoof_getSkeletonUnicodeString(sc, USPOOF_ANY_CASE, s, dest, &status);
122 TEST_ASSERT_SUCCESS(status);
123 TEST_ASSERT(UnicodeString("lllOO") == dest);
124 TEST_ASSERT(&dest == &retStr);
125 TEST_TEARDOWN;
126 }
127
128
// Forwards one skeleton test case to checkSkeleton(), adding the line number of
// the invocation so that a failure can be traced back to its test case.
// Expects a USpoofChecker named "sc" in the enclosing scope (see TEST_SETUP).
#define CHECK_SKELETON(type, input, expected) { \
    checkSkeleton(sc, type, input, expected, __LINE__); \
}
132
133
134 // testSkeleton. Spot check a number of confusable skeleton substitutions from the
135 // Unicode data file confusables.txt
136 // Test cases chosen for substitutions of various lengths, and
137 // membership in different mapping tables.
138 void IntlTestSpoof::testSkeleton() {
139 const uint32_t ML = 0;
140 const uint32_t SL = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
141 const uint32_t MA = USPOOF_ANY_CASE;
142 const uint32_t SA = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE;
143
144 TEST_SETUP
145 // A long "identifier" that will overflow implementation stack buffers, forcing heap allocations.
146 CHECK_SKELETON(SL, " A 1ong \\u02b9identifier' that will overflow implementation stack buffers, forcing heap allocations."
147 " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
148 " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
149 " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations.",
150
151 " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
152 " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
153 " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
154 " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations.")
155
156 CHECK_SKELETON(SL, "nochange", "nochange");
157 CHECK_SKELETON(MA, "love", "love");
158 CHECK_SKELETON(MA, "1ove", "love"); // Digit 1 to letter l
159 CHECK_SKELETON(ML, "OOPS", "OOPS");
160 CHECK_SKELETON(ML, "00PS", "00PS"); // Digit 0 unchanged in lower case mode.
161 CHECK_SKELETON(MA, "OOPS", "OOPS");
162 CHECK_SKELETON(MA, "00PS", "OOPS"); // Digit 0 to letter O in any case mode only
163 CHECK_SKELETON(SL, "\\u059c", "\\u0301");
164 CHECK_SKELETON(SL, "\\u2A74", "\\u003A\\u003A\\u003D");
165 CHECK_SKELETON(SL, "\\u247E", "\\u0028\\u006C\\u006C\\u0029"); // "(ll)"
166 CHECK_SKELETON(SL, "\\uFDFB", "\\u062C\\u0644\\u0020\\u062C\\u0644\\u0627\\u0644\\u0647");
167
168 // This mapping exists in the ML and MA tables, does not exist in SL, SA
169 //0C83 ; 0C03 ;
170 CHECK_SKELETON(SL, "\\u0C83", "\\u0C83");
171 CHECK_SKELETON(SA, "\\u0C83", "\\u0C83");
172 CHECK_SKELETON(ML, "\\u0C83", "\\u0983");
173 CHECK_SKELETON(MA, "\\u0C83", "\\u0983");
174
175 // 0391 ; 0041 ;
176 // This mapping exists only in the MA table.
177 CHECK_SKELETON(MA, "\\u0391", "A");
178 CHECK_SKELETON(SA, "\\u0391", "\\u0391");
179 CHECK_SKELETON(ML, "\\u0391", "\\u0391");
180 CHECK_SKELETON(SL, "\\u0391", "\\u0391");
181
182 // 13CF ; 0062 ;
183 // This mapping exists in the ML and MA tables
184 CHECK_SKELETON(ML, "\\u13CF", "b");
185 CHECK_SKELETON(MA, "\\u13CF", "b");
186 CHECK_SKELETON(SL, "\\u13CF", "\\u13CF");
187 CHECK_SKELETON(SA, "\\u13CF", "\\u13CF");
188
189 // 0022 ; 0027 0027 ;
190 // all tables.
191 CHECK_SKELETON(SL, "\\u0022", "\\u0027\\u0027");
192 CHECK_SKELETON(SA, "\\u0022", "\\u0027\\u0027");
193 CHECK_SKELETON(ML, "\\u0022", "\\u0027\\u0027");
194 CHECK_SKELETON(MA, "\\u0022", "\\u0027\\u0027");
195
196 // 017F ; 0066 ;
197 // This mapping exists in the SA and MA tables
198 CHECK_SKELETON(MA, "\\u017F", "f");
199 CHECK_SKELETON(SA, "\\u017F", "f");
200
201 TEST_TEARDOWN;
202 }
203
204
205 //
206 // Run a single confusable skeleton transformation test case.
207 //
208 void IntlTestSpoof::checkSkeleton(const USpoofChecker *sc, uint32_t type,
209 const char *input, const char *expected, int32_t lineNum) {
210 UnicodeString uInput = UnicodeString(input).unescape();
211 UnicodeString uExpected = UnicodeString(expected).unescape();
212
213 UErrorCode status = U_ZERO_ERROR;
214 UnicodeString actual;
215 uspoof_getSkeletonUnicodeString(sc, type, uInput, actual, &status);
216 if (U_FAILURE(status)) {
217 errln("File %s, Line %d, Test case from line %d, status is %s", __FILE__, __LINE__, lineNum,
218 u_errorName(status));
219 return;
220 }
221 if (uExpected != actual) {
222 errln("File %s, Line %d, Test case from line %d, Actual and Expected skeletons differ.",
223 __FILE__, __LINE__, lineNum);
224 errln(UnicodeString(" Actual Skeleton: \"") + actual + UnicodeString("\"\n") +
225 UnicodeString(" Expected Skeleton: \"") + uExpected + UnicodeString("\""));
226 }
227 }
228
229 void IntlTestSpoof::testAreConfusable() {
230 TEST_SETUP
231 UnicodeString s1("A long string that will overflow stack buffers. A long string that will overflow stack buffers. "
232 "A long string that will overflow stack buffers. A long string that will overflow stack buffers. ");
233 UnicodeString s2("A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. "
234 "A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. ");
235 TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, uspoof_areConfusableUnicodeString(sc, s1, s2, &status));
236 TEST_ASSERT_SUCCESS(status);
237
238 TEST_TEARDOWN;
239 }
240
241 void IntlTestSpoof::testInvisible() {
242 TEST_SETUP
243 UnicodeString s = UnicodeString("abcd\\u0301ef").unescape();
244 int32_t position = -42;
245 TEST_ASSERT_EQ(0, uspoof_checkUnicodeString(sc, s, &position, &status));
246 TEST_ASSERT_SUCCESS(status);
247 TEST_ASSERT(position == -42);
248
249 UnicodeString s2 = UnicodeString("abcd\\u0301\\u0302\\u0301ef").unescape();
250 TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s2, &position, &status));
251 TEST_ASSERT_SUCCESS(status);
252 TEST_ASSERT_EQ(7, position);
253
254 // Tow acute accents, one from the composed a with acute accent, \u00e1,
255 // and one separate.
256 position = -42;
257 UnicodeString s3 = UnicodeString("abcd\\u00e1\\u0301xyz").unescape();
258 TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s3, &position, &status));
259 TEST_ASSERT_SUCCESS(status);
260 TEST_ASSERT_EQ(7, position);
261 TEST_TEARDOWN;
262 }
263
264
265 static UnicodeString parseHex(const UnicodeString &in) {
266 // Convert a series of hex numbers in a Unicode String to a string with the
267 // corresponding characters.
268 // The conversion is _really_ annoying. There must be some function to just do it.
269 UnicodeString result;
270 UChar32 cc = 0;
271 for (int32_t i=0; i<in.length(); i++) {
272 UChar c = in.charAt(i);
273 if (c == 0x20) { // Space
274 if (cc > 0) {
275 result.append(cc);
276 cc = 0;
277 }
278 } else if (c>=0x30 && c<=0x39) {
279 cc = (cc<<4) + (c - 0x30);
280 } else if ((c>=0x41 && c<=0x46) || (c>=0x61 && c<=0x66)) {
281 cc = (cc<<4) + (c & 0x0f)+9;
282 }
283 // else do something with bad input.
284 }
285 if (cc > 0) {
286 result.append(cc);
287 }
288 return result;
289 }
290
291
292 //
293 // Append the hex form of a UChar32 to a UnicodeString.
294 // Used in formatting error messages.
295 // Match the formatting of numbers in confusables.txt
296 // Minimum of 4 digits, no leading zeroes for positions 5 and up.
297 //
298 static void appendHexUChar(UnicodeString &dest, UChar32 c) {
299 UBool doZeroes = FALSE;
300 for (int bitNum=28; bitNum>=0; bitNum-=4) {
301 if (bitNum <= 12) {
302 doZeroes = TRUE;
303 }
304 int hexDigit = (c>>bitNum) & 0x0f;
305 if (hexDigit != 0 || doZeroes) {
306 doZeroes = TRUE;
307 dest.append((UChar)(hexDigit<=9? hexDigit + 0x30: hexDigit -10 + 0x41));
308 }
309 }
310 dest.append((UChar)0x20);
311 }
312
313 U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
314
315 // testConfData - Check each data item from the Unicode confusables.txt file,
316 // verify that it transforms correctly in a skeleton.
317 //
318 void IntlTestSpoof::testConfData() {
319 UErrorCode status = U_ZERO_ERROR;
320
321 const char *testDataDir = IntlTest::getSourceTestData(status);
322 TEST_ASSERT_SUCCESS(status);
323 char buffer[2000];
324 uprv_strcpy(buffer, testDataDir);
325 uprv_strcat(buffer, "confusables.txt");
326
327 LocalStdioFilePointer f(fopen(buffer, "rb"));
328 if (f.isNull()) {
329 errln("Skipping test spoof/testConfData. File confusables.txt not accessible.");
330 return;
331 }
332 fseek(f.getAlias(), 0, SEEK_END);
333 int32_t fileSize = ftell(f.getAlias());
334 LocalArray<char> fileBuf(new char[fileSize]);
335 fseek(f.getAlias(), 0, SEEK_SET);
336 int32_t amt_read = fread(fileBuf.getAlias(), 1, fileSize, f.getAlias());
337 TEST_ASSERT_EQ(amt_read, fileSize);
338 TEST_ASSERT(fileSize>0);
339 if (amt_read != fileSize || fileSize <=0) {
340 return;
341 }
342 UnicodeString confusablesTxt = UnicodeString::fromUTF8(StringPiece(fileBuf.getAlias(), fileSize));
343
344 LocalUSpoofCheckerPointer sc(uspoof_open(&status));
345 TEST_ASSERT_SUCCESS(status);
346
347 // Parse lines from the confusables.txt file. Example Line:
348 // FF44 ; 0064 ; SL # ( d -> d ) FULLWIDTH ....
349 // Three fields. The hex fields can contain more than one character,
350 // and each character may be more than 4 digits (for supplemntals)
351 // This regular expression matches lines and splits the fields into capture groups.
352 RegexMatcher parseLine("(?m)^([0-9A-F]{4}[^#;]*?);([^#;]*?);([^#]*)", confusablesTxt, 0, status);
353 TEST_ASSERT_SUCCESS(status);
354 while (parseLine.find()) {
355 UnicodeString from = parseHex(parseLine.group(1, status));
356 if (!Normalizer::isNormalized(from, UNORM_NFD, status)) {
357 // The source character was not NFD.
358 // Skip this case; the first step in obtaining a skeleton is to NFD the input,
359 // so the mapping in this line of confusables.txt will never be applied.
360 continue;
361 }
362
363 UnicodeString rawExpected = parseHex(parseLine.group(2, status));
364 UnicodeString expected;
365 Normalizer::decompose(rawExpected, FALSE /*NFD*/, 0, expected, status);
366 TEST_ASSERT_SUCCESS(status);
367
368 int32_t skeletonType = 0;
369 UnicodeString tableType = parseLine.group(3, status);
370 TEST_ASSERT_SUCCESS(status);
371 if (tableType.indexOf("SL") >= 0) {
372 skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
373 } else if (tableType.indexOf("SA") >= 0) {
374 skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE;
375 } else if (tableType.indexOf("ML") >= 0) {
376 skeletonType = 0;
377 } else if (tableType.indexOf("MA") >= 0) {
378 skeletonType = USPOOF_ANY_CASE;
379 }
380
381 UnicodeString actual;
382 uspoof_getSkeletonUnicodeString(sc.getAlias(), skeletonType, from, actual, &status);
383 TEST_ASSERT_SUCCESS(status);
384 TEST_ASSERT(actual == expected);
385 if (actual != expected) {
386 errln(parseLine.group(0, status));
387 UnicodeString line = "Actual: ";
388 int i = 0;
389 while (i < actual.length()) {
390 appendHexUChar(line, actual.char32At(i));
391 i = actual.moveIndex32(i, 1);
392 }
393 errln(line);
394 }
395 if (U_FAILURE(status)) {
396 break;
397 }
398 }
399 }
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO */
401