2 **********************************************************************
3 * Copyright (C) 2011, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 **********************************************************************
8 * IntlTestSpoof tests for USpoofChecker
11 #include "unicode/utypes.h"
13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO
16 #include "unicode/uspoof.h"
17 #include "unicode/unistr.h"
18 #include "unicode/regex.h"
19 #include "unicode/normlzr.h"
// Reports a test failure through errcheckln() when `status` holds an ICU failure
// code. The message carries the source file, the line and u_errorName(status).
24 #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
25 errcheckln(status, "Failure at file %s, line %d, error = %s", __FILE__, __LINE__, u_errorName(status));}}
// Reports a test failure through errln() when `expr` compares equal to FALSE.
// The message carries the source file, the line and the stringified expression.
27 #define TEST_ASSERT(expr) {if ((expr)==FALSE) { \
28 errln("Test Failure at file %s, line %d: \"%s\" is false.\n", __FILE__, __LINE__, #expr);};}
// Reports a test failure through errln() when `a` and `b` differ.
// The message shows the text of both operands and their values formatted with %d.
// On failure each operand is evaluated a second time to build the message.
30 #define TEST_ASSERT_EQ(a, b) { if ((a) != (b)) { \
31 errln("Test Failure at file %s, line %d: \"%s\" (%d) != \"%s\" (%d) \n", \
32 __FILE__, __LINE__, #a, (a), #b, (b)); }}
// Reports a test failure through errln() when `a` and `b` are equal.
// The message shows the text of both operands and their values formatted with %d.
// On failure each operand is evaluated a second time to build the message.
34 #define TEST_ASSERT_NE(a, b) { if ((a) == (b)) { \
35 errln("Test Failure at file %s, line %d: \"%s\" (%d) == \"%s\" (%d) \n", \
36 __FILE__, __LINE__, #a, (a), #b, (b)); }}
39 * TEST_SETUP and TEST_TEARDOWN
40 * macros to handle the boilerplate around setting up test case.
41 * Put arbitrary test code between SETUP and TEARDOWN.
42 * "sc" is the ready-to-go SpoofChecker for use in the tests.
// Opens the scope that a test body runs in: declares `status`, opens a spoof
// checker into `sc`, asserts that the open succeeded, and enters an
// `if (U_SUCCESS(status))` block. Both braces opened here are left unclosed
// by this macro.
// NOTE(review): the declaration of `sc` is not visible in this listing - confirm.
44 #define TEST_SETUP { \
45 UErrorCode status = U_ZERO_ERROR; \
47 sc = uspoof_open(&status); \
48 TEST_ASSERT_SUCCESS(status); \
49 if (U_SUCCESS(status)){
// Companion to TEST_SETUP. The visible part checks `status` again after the test code.
// NOTE(review): the rest of this macro is not visible in this listing; presumably it
// releases `sc` and closes the scopes opened by TEST_SETUP - confirm.
51 #define TEST_TEARDOWN \
53 TEST_ASSERT_SUCCESS(status); \
// Test-suite dispatcher: reports a test name through `name` and, when `exec`
// is set, logs the suite banner.
// Names reported: TestSpoofAPI, TestSkeleton, TestAreConfusable, TestInvisible
// and testConfData; the default case reports an empty name.
// NOTE(review): the switch on `index` and the calls that run each test are not
// visible in this listing - confirm against the full file.
60 void IntlTestSpoof::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* /*par*/ )
62 if (exec
) logln("TestSuite spoof: ");
65 name
= "TestSpoofAPI";
71 name
= "TestSkeleton";
77 name
= "TestAreConfusable";
83 name
= "TestInvisible";
89 name
= "testConfData";
94 default: name
=""; break;
// Smoke test of three spoof-checker entry points: a string check, a
// confusability comparison and a skeleton computation.
// NOTE(review): `sc`, `status` and `dest` are declared outside the visible
// lines (the first two presumably by TEST_SETUP) - confirm.
98 void IntlTestSpoof::testSpoofAPI() {
// Check 1: "xyz" must produce a check result of 0, and the caller's
// position value (666) must be left unchanged.
101 UnicodeString
s("xyz"); // Many latin ranges are whole-script confusable with other scripts.
102 // If this test starts failing, consult confusablesWholeScript.txt
103 int32_t position
= 666;
104 int32_t checkResults
= uspoof_checkUnicodeString(sc
, s
, &position
, &status
);
105 TEST_ASSERT_SUCCESS(status
);
106 TEST_ASSERT_EQ(0, checkResults
);
107 TEST_ASSERT_EQ(666, position
);
// Check 2: Latin "cxs" against its Cyrillic look-alike must be reported with
// both the mixed-script and the whole-script confusable bits set.
111 UnicodeString
s1("cxs");
112 UnicodeString s2
= UnicodeString("\\u0441\\u0445\\u0455").unescape(); // Cyrillic "cxs"
113 int32_t checkResults
= uspoof_areConfusableUnicodeString(sc
, s1
, s2
, &status
);
114 TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE
| USPOOF_WHOLE_SCRIPT_CONFUSABLE
, checkResults
);
// Check 3: the USPOOF_ANY_CASE skeleton of "I1l0O" must be "lllOO", and the
// returned reference must be the destination string that was passed in.
119 UnicodeString
s("I1l0O");
121 UnicodeString
&retStr
= uspoof_getSkeletonUnicodeString(sc
, USPOOF_ANY_CASE
, s
, dest
, &status
);
122 TEST_ASSERT_SUCCESS(status
);
123 TEST_ASSERT(UnicodeString("lllOO") == dest
);
124 TEST_ASSERT(&dest
== &retStr
);
// Runs one skeleton test case through checkSkeleton() using the in-scope `sc`.
// __LINE__ is passed along so a failure message can name the line of the test case.
129 #define CHECK_SKELETON(type, input, expected) { \
130 checkSkeleton(sc, type, input, expected, __LINE__); \
134 // testSkeleton. Spot check a number of confusable skeleton substitutions from the
135 // Unicode data file confusables.txt
136 // Test cases chosen for substitutions of various lengths, and
137 // membership in different mapping tables.
138 void IntlTestSpoof::testSkeleton() {
// Skeleton type flags, named after the mapping tables referred to below.
// First letter:  S = USPOOF_SINGLE_SCRIPT_CONFUSABLE set, M = not set.
// Second letter: A = USPOOF_ANY_CASE set, L = not set.
139 const uint32_t ML
= 0;
140 const uint32_t SL
= USPOOF_SINGLE_SCRIPT_CONFUSABLE
;
141 const uint32_t MA
= USPOOF_ANY_CASE
;
142 const uint32_t SA
= USPOOF_SINGLE_SCRIPT_CONFUSABLE
| USPOOF_ANY_CASE
;
145 // A long "identifier" that will overflow implementation stack buffers, forcing heap allocations.
// The expected text shows the substitutions being checked: digit 1 -> l,
// U+02B9 -> apostrophe, w -> vv and m -> rn.
146 CHECK_SKELETON(SL
, " A 1ong \\u02b9identifier' that will overflow implementation stack buffers, forcing heap allocations."
147 " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
148 " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
149 " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations.",
151 " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
152 " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
153 " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
154 " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations.")
// Short ASCII cases: unchanged input, and digit/letter look-alikes.
156 CHECK_SKELETON(SL
, "nochange", "nochange");
157 CHECK_SKELETON(MA
, "love", "love");
158 CHECK_SKELETON(MA
, "1ove", "love"); // Digit 1 to letter l
159 CHECK_SKELETON(ML
, "OOPS", "OOPS");
160 CHECK_SKELETON(ML
, "00PS", "00PS"); // Digit 0 unchanged in lower case mode.
161 CHECK_SKELETON(MA
, "OOPS", "OOPS");
162 CHECK_SKELETON(MA
, "00PS", "OOPS"); // Digit 0 to letter O in any case mode only
// Single code point inputs whose expected skeletons are 1, 3, 4 and 8 code points long.
163 CHECK_SKELETON(SL
, "\\u059c", "\\u0301");
164 CHECK_SKELETON(SL
, "\\u2A74", "\\u003A\\u003A\\u003D");
165 CHECK_SKELETON(SL
, "\\u247E", "\\u0028\\u006C\\u006C\\u0029"); // "(ll)"
166 CHECK_SKELETON(SL
, "\\uFDFB", "\\u062C\\u0644\\u0020\\u062C\\u0644\\u0627\\u0644\\u0647");
168 // This mapping exists in the ML and MA tables, does not exist in SL, SA
170 CHECK_SKELETON(SL
, "\\u0C83", "\\u0C83");
171 CHECK_SKELETON(SA
, "\\u0C83", "\\u0C83");
172 CHECK_SKELETON(ML
, "\\u0C83", "\\u0983");
173 CHECK_SKELETON(MA
, "\\u0C83", "\\u0983");
176 // This mapping exists only in the MA table.
177 CHECK_SKELETON(MA
, "\\u0391", "A");
178 CHECK_SKELETON(SA
, "\\u0391", "\\u0391");
179 CHECK_SKELETON(ML
, "\\u0391", "\\u0391");
180 CHECK_SKELETON(SL
, "\\u0391", "\\u0391");
183 // This mapping exists in the ML and MA tables
184 CHECK_SKELETON(ML
, "\\u13CF", "b");
185 CHECK_SKELETON(MA
, "\\u13CF", "b");
186 CHECK_SKELETON(SL
, "\\u13CF", "\\u13CF");
187 CHECK_SKELETON(SA
, "\\u13CF", "\\u13CF");
189 // 0022 ; 0027 0027 ;
// The same expected skeleton is asserted for all four table types.
191 CHECK_SKELETON(SL
, "\\u0022", "\\u0027\\u0027");
192 CHECK_SKELETON(SA
, "\\u0022", "\\u0027\\u0027");
193 CHECK_SKELETON(ML
, "\\u0022", "\\u0027\\u0027");
194 CHECK_SKELETON(MA
, "\\u0022", "\\u0027\\u0027");
197 // This mapping exists in the SA and MA tables
198 CHECK_SKELETON(MA
, "\\u017F", "f");
199 CHECK_SKELETON(SA
, "\\u017F", "f");
206 // Run a single confusable skeleton transformation test case.
// Parameters:
//   sc       - spoof checker used to compute the skeleton.
//   type     - skeleton type flags, passed through to uspoof_getSkeletonUnicodeString().
//   input    - source string; escape sequences are expanded with unescape().
//   expected - expected skeleton; escape sequences are expanded with unescape().
//   lineNum  - line of the calling test case, echoed in failure messages.
208 void IntlTestSpoof::checkSkeleton(const USpoofChecker
*sc
, uint32_t type
,
209 const char *input
, const char *expected
, int32_t lineNum
) {
210 UnicodeString uInput
= UnicodeString(input
).unescape();
211 UnicodeString uExpected
= UnicodeString(expected
).unescape();
213 UErrorCode status
= U_ZERO_ERROR
;
214 UnicodeString actual
;
215 uspoof_getSkeletonUnicodeString(sc
, type
, uInput
, actual
, &status
);
// An API failure is reported with this file and line, the caller's line and the error name.
216 if (U_FAILURE(status
)) {
217 errln("File %s, Line %d, Test case from line %d, status is %s", __FILE__
, __LINE__
, lineNum
,
218 u_errorName(status
));
// A mismatch is reported first by location, then with the actual and expected skeleton text.
221 if (uExpected
!= actual
) {
222 errln("File %s, Line %d, Test case from line %d, Actual and Expected skeletons differ.",
223 __FILE__
, __LINE__
, lineNum
);
224 errln(UnicodeString(" Actual Skeleton: \"") + actual
+ UnicodeString("\"\n") +
225 UnicodeString(" Expected Skeleton: \"") + uExpected
+ UnicodeString("\""));
// Confusability check on long inputs (the string text itself says they are meant
// to overflow stack buffers). s2 differs from s1 only where "wi11", written with
// two digit ones, replaces "will" in two places. The pair must be reported as
// USPOOF_SINGLE_SCRIPT_CONFUSABLE.
// NOTE(review): `sc` and `status` are declared outside the visible lines - confirm.
229 void IntlTestSpoof::testAreConfusable() {
231 UnicodeString
s1("A long string that will overflow stack buffers. A long string that will overflow stack buffers. "
232 "A long string that will overflow stack buffers. A long string that will overflow stack buffers. ");
233 UnicodeString
s2("A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. "
234 "A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. ");
235 TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE
, uspoof_areConfusableUnicodeString(sc
, s1
, s2
, &status
));
236 TEST_ASSERT_SUCCESS(status
);
// Exercises the USPOOF_INVISIBLE result of uspoof_checkUnicodeString() on
// strings that contain combining marks.
// NOTE(review): `sc` and `status` are declared outside the visible lines, and
// the checks enabled on `sc` are not visible in this listing - confirm.
241 void IntlTestSpoof::testInvisible() {
// Case 1: a single U+0301 after the base letter. Expected result is 0 and the
// caller's position value (-42) must be left unchanged.
243 UnicodeString s
= UnicodeString("abcd\\u0301ef").unescape();
244 int32_t position
= -42;
245 TEST_ASSERT_EQ(0, uspoof_checkUnicodeString(sc
, s
, &position
, &status
));
246 TEST_ASSERT_SUCCESS(status
);
247 TEST_ASSERT(position
== -42);
// Case 2: U+0301 appears twice in one combining sequence (U+0301 U+0302 U+0301).
// Expected result is USPOOF_INVISIBLE with position set to 7.
249 UnicodeString s2
= UnicodeString("abcd\\u0301\\u0302\\u0301ef").unescape();
250 TEST_ASSERT_EQ(USPOOF_INVISIBLE
, uspoof_checkUnicodeString(sc
, s2
, &position
, &status
));
251 TEST_ASSERT_SUCCESS(status
);
252 TEST_ASSERT_EQ(7, position
);
254 // Two acute accents, one from the composed a with acute accent, \u00e1,
// Case 3: expected result is USPOOF_INVISIBLE with position set to 7.
257 UnicodeString s3
= UnicodeString("abcd\\u00e1\\u0301xyz").unescape();
258 TEST_ASSERT_EQ(USPOOF_INVISIBLE
, uspoof_checkUnicodeString(sc
, s3
, &position
, &status
));
259 TEST_ASSERT_SUCCESS(status
);
260 TEST_ASSERT_EQ(7, position
);
// NOTE(review): the declaration of `cc`, the body of the space branch and the
// tail of this function (including its return) are not visible in this listing.
265 static UnicodeString
parseHex(const UnicodeString
&in
) {
266 // Convert a series of hex numbers in a Unicode String to a string with the
267 // corresponding characters.
268 // The conversion is _really_ annoying. There must be some function to just do it.
269 UnicodeString result
;
// Scan the input one UTF-16 code unit at a time.
271 for (int32_t i
=0; i
<in
.length(); i
++) {
272 UChar c
= in
.charAt(i
);
273 if (c
== 0x20) { // Space
// '0'..'9': shift the accumulator left by one hex digit and add the digit value.
278 } else if (c
>=0x30 && c
<=0x39) {
279 cc
= (cc
<<4) + (c
- 0x30);
// 'A'..'F' or 'a'..'f': the low nibble of these letters is 1..6, so adding 9
// yields the hex digit values 10..15.
280 } else if ((c
>=0x41 && c
<=0x46) || (c
>=0x61 && c
<=0x66)) {
281 cc
= (cc
<<4) + (c
& 0x0f)+9;
283 // else do something with bad input.
293 // Append the hex form of a UChar32 to a UnicodeString.
294 // Used in formatting error messages.
295 // Match the formatting of numbers in confusables.txt
296 // Minimum of 4 digits, no leading zeroes for positions 5 and up.
// Parameters:
//   dest - string that receives the hex digits and a space (U+0020).
//   c    - code point to format.
// NOTE(review): the statements that set `doZeroes` are not visible in this listing.
298 static void appendHexUChar(UnicodeString
&dest
, UChar32 c
) {
299 UBool doZeroes
= FALSE
;
// Walk the eight 4-bit groups of c, from bit 28 down to bit 0.
300 for (int bitNum
=28; bitNum
>=0; bitNum
-=4) {
304 int hexDigit
= (c
>>bitNum
) & 0x0f;
// A digit is emitted when it is non-zero or when doZeroes is set.
// Values 0..9 map to '0'..'9' and 10..15 map to 'A'..'F'.
305 if (hexDigit
!= 0 || doZeroes
) {
307 dest
.append((UChar
)(hexDigit
<=9? hexDigit
+ 0x30: hexDigit
-10 + 0x41));
310 dest
.append((UChar
)0x20);
// Declares LocalStdioFilePointer, a local-pointer wrapper type for FILE that is
// built with U_DEFINE_LOCAL_OPEN_POINTER and tied to fclose.
// NOTE(review): presumably fclose() is called when the wrapper goes out of
// scope - confirm against unicode/localpointer.h.
313 U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer
, FILE, fclose
);
315 // testConfData - Check each data item from the Unicode confusables.txt file,
316 // verify that it transforms correctly in a skeleton.
// Steps visible here: read confusables.txt from the source test data directory,
// split each mapping line into fields with a regular expression, and compare
// the output of uspoof_getSkeletonUnicodeString() with the NFD form of the
// expected mapping.
// NOTE(review): several lines are not visible in this listing, including the
// declaration of `buffer`, the check that guards the "Skipping test" message,
// and the bodies of some branches - confirm against the full file.
318 void IntlTestSpoof::testConfData() {
319 UErrorCode status
= U_ZERO_ERROR
;
321 const char *testDataDir
= IntlTest::getSourceTestData(status
);
322 TEST_ASSERT_SUCCESS(status
);
// Build the path of confusables.txt by appending the file name to the test data directory.
324 uprv_strcpy(buffer
, testDataDir
);
325 uprv_strcat(buffer
, "confusables.txt");
327 LocalStdioFilePointer
f(fopen(buffer
, "rb"));
329 errln("Skipping test spoof/testConfData. File confusables.txt not accessible.");
// Find the file size by seeking to the end, then rewind and read the whole
// file into a heap buffer.
332 fseek(f
.getAlias(), 0, SEEK_END
);
333 int32_t fileSize
= ftell(f
.getAlias());
334 LocalArray
<char> fileBuf(new char[fileSize
]);
335 fseek(f
.getAlias(), 0, SEEK_SET
);
336 int32_t amt_read
= fread(fileBuf
.getAlias(), 1, fileSize
, f
.getAlias());
337 TEST_ASSERT_EQ(amt_read
, fileSize
);
338 TEST_ASSERT(fileSize
>0);
339 if (amt_read
!= fileSize
|| fileSize
<=0) {
// The file bytes are interpreted as UTF-8.
342 UnicodeString confusablesTxt
= UnicodeString::fromUTF8(StringPiece(fileBuf
.getAlias(), fileSize
));
344 LocalUSpoofCheckerPointer
sc(uspoof_open(&status
));
345 TEST_ASSERT_SUCCESS(status
);
347 // Parse lines from the confusables.txt file. Example Line:
348 // FF44 ; 0064 ; SL # ( d -> d ) FULLWIDTH ....
349 // Three fields. The hex fields can contain more than one character,
350 // and each character may be more than 4 digits (for supplementals)
351 // This regular expression matches lines and splits the fields into capture groups.
352 RegexMatcher
parseLine("(?m)^([0-9A-F]{4}[^#;]*?);([^#;]*?);([^#]*)", confusablesTxt
, 0, status
);
353 TEST_ASSERT_SUCCESS(status
);
354 while (parseLine
.find()) {
// Group 1: the source code point(s), written as hex numbers.
355 UnicodeString from
= parseHex(parseLine
.group(1, status
));
356 if (!Normalizer::isNormalized(from
, UNORM_NFD
, status
)) {
357 // The source character was not NFD.
358 // Skip this case; the first step in obtaining a skeleton is to NFD the input,
359 // so the mapping in this line of confusables.txt will never be applied.
// Group 2: the expected mapping as hex numbers; its NFD form is what gets compared.
363 UnicodeString rawExpected
= parseHex(parseLine
.group(2, status
));
364 UnicodeString expected
;
365 Normalizer::decompose(rawExpected
, FALSE
/*NFD*/, 0, expected
, status
);
366 TEST_ASSERT_SUCCESS(status
);
// Group 3: the table name (SL, SA, ML or MA), translated to skeleton type flags.
// skeletonType starts at 0; the body of the ML branch is not visible in this listing.
368 int32_t skeletonType
= 0;
369 UnicodeString tableType
= parseLine
.group(3, status
);
370 TEST_ASSERT_SUCCESS(status
);
371 if (tableType
.indexOf("SL") >= 0) {
372 skeletonType
= USPOOF_SINGLE_SCRIPT_CONFUSABLE
;
373 } else if (tableType
.indexOf("SA") >= 0) {
374 skeletonType
= USPOOF_SINGLE_SCRIPT_CONFUSABLE
| USPOOF_ANY_CASE
;
375 } else if (tableType
.indexOf("ML") >= 0) {
377 } else if (tableType
.indexOf("MA") >= 0) {
378 skeletonType
= USPOOF_ANY_CASE
;
// Compute the skeleton of the source and compare it with the expected value.
381 UnicodeString actual
;
382 uspoof_getSkeletonUnicodeString(sc
.getAlias(), skeletonType
, from
, actual
, &status
);
383 TEST_ASSERT_SUCCESS(status
);
384 TEST_ASSERT(actual
== expected
);
// On a mismatch, log the matched line, then build a line holding the
// actual skeleton as hex code points.
385 if (actual
!= expected
) {
386 errln(parseLine
.group(0, status
));
387 UnicodeString line
= "Actual: ";
// char32At() and moveIndex32() step through the string by code point.
389 while (i
< actual
.length()) {
390 appendHexUChar(line
, actual
.char32At(i
));
391 i
= actual
.moveIndex32(i
, 1);
395 if (U_FAILURE(status
)) {
400 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO