2 **********************************************************************
3 * Copyright (C) 2011, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 **********************************************************************
8 * IntlTestSpoof tests for USpoofDetector
11 #include "unicode/utypes.h"
13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO
16 #include "unicode/uspoof.h"
17 #include "unicode/unistr.h"
18 #include "unicode/regex.h"
19 #include "unicode/normlzr.h"
// TEST_ASSERT_SUCCESS(status): when U_FAILURE(status) is true, reports the failure
// through errcheckln() with the current file, line and u_errorName(status).
// The macro itself does not return, so the statements that follow it still execute.
24 #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
25 errcheckln(status, "Failure at file %s, line %d, error = %s", __FILE__, __LINE__, u_errorName(status));}}
// TEST_ASSERT(expr): reports a failure through errln() when expr compares equal to FALSE.
// The message carries the file, the line and the stringified expression (#expr).
// The macro itself does not return, so the statements that follow it still execute.
27 #define TEST_ASSERT(expr) {if ((expr)==FALSE) { \
28 errln("Test Failure at file %s, line %d: \"%s\" is false.\n", __FILE__, __LINE__, #expr);};}
// TEST_ASSERT_EQ(a, b): reports a failure through errln() when (a) != (b).
// Both values are printed with %d, so the operands need to be int-compatible.
// Note that a and b appear in the comparison and again in the errln() arguments,
// so an argument with side effects (e.g. a function call) is evaluated a second
// time when the assertion fails.
30 #define TEST_ASSERT_EQ(a, b) { if ((a) != (b)) { \
31 errln("Test Failure at file %s, line %d: \"%s\" (%d) != \"%s\" (%d) \n", \
32 __FILE__, __LINE__, #a, (a), #b, (b)); }}
// TEST_ASSERT_NE(a, b): reports a failure through errln() when (a) == (b).
// Both values are printed with %d, so the operands need to be int-compatible.
// As a and b appear in the comparison and again in the errln() arguments, an
// argument with side effects is evaluated a second time when the assertion fails.
34 #define TEST_ASSERT_NE(a, b) { if ((a) == (b)) { \
35 errln("Test Failure at file %s, line %d: \"%s\" (%d) == \"%s\" (%d) \n", \
36 __FILE__, __LINE__, #a, (a), #b, (b)); }}
39 * TEST_SETUP and TEST_TEARDOWN
40 * macros to handle the boilerplate around setting up test case.
41 * Put arbitrary test code between SETUP and TEARDOWN.
42 * "sc" is the ready-to-go SpoofChecker for use in the tests.
// TEST_SETUP: opens a brace scope, declares a fresh UErrorCode named `status`,
// assigns `sc` from uspoof_open(&status), checks the status, and then opens an
// `if (U_SUCCESS(status)){` block that is left open for the test code that follows.
// NOTE(review): the declaration of `sc` is not visible in this listing - confirm
// against the original file.
44 #define TEST_SETUP { \
45 UErrorCode status = U_ZERO_ERROR; \
47 sc = uspoof_open(&status); \
48 TEST_ASSERT_SUCCESS(status); \
49 if (U_SUCCESS(status)){
// TEST_TEARDOWN: counterpart of TEST_SETUP. The visible part re-checks `status`
// after the test code has run.
// NOTE(review): the remaining lines of this macro are not visible in this listing;
// presumably they close the scopes opened by TEST_SETUP and release `sc` - confirm
// against the original file.
51 #define TEST_TEARDOWN \
53 TEST_ASSERT_SUCCESS(status); \
// Test dispatcher for this suite.
//   index - selects which test is addressed.
//   exec  - when true, the suite banner is logged (see the logln() call below).
//   name  - output: set to the name of the test selected by index, or to "" for
//           an index that matches none of the tests (the default branch).
//   par   - unused.
// NOTE(review): the switch statement, its case labels and the calls to the
// individual test functions are not visible in this listing; only the name
// assignments are.
60 void IntlTestSpoof::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* /*par*/ )
62 if (exec
) logln("TestSuite spoof: ");
// One name assignment per test in the suite.
65 name
= "TestSpoofAPI";
71 name
= "TestSkeleton";
77 name
= "TestAreConfusable";
83 name
= "TestInvisible";
89 name
= "testConfData";
// Any other index: report an empty name.
100 default: name
=""; break;
// Exercises the UnicodeString flavours of the spoof-checker API:
// uspoof_checkUnicodeString, uspoof_areConfusableUnicodeString and
// uspoof_getSkeletonUnicodeString.
// NOTE(review): the lines that scope each case (presumably TEST_SETUP /
// TEST_TEARDOWN, which would supply `sc` and `status`) and the declaration of
// `dest` are not visible in this listing.
104 void IntlTestSpoof::testSpoofAPI() {
// Case 1: checking "xyz" is expected to return 0 and to leave `position` at its
// initial sentinel value 666.
107 UnicodeString
s("xyz"); // Many latin ranges are whole-script confusable with other scripts.
108 // If this test starts failing, consult confusablesWholeScript.txt
109 int32_t position
= 666;
110 int32_t checkResults
= uspoof_checkUnicodeString(sc
, s
, &position
, &status
);
111 TEST_ASSERT_SUCCESS(status
);
112 TEST_ASSERT_EQ(0, checkResults
);
113 TEST_ASSERT_EQ(666, position
);
// Case 2: Latin "cxs" compared with its Cyrillic look-alike is expected to be
// reported as both mixed-script and whole-script confusable.
117 UnicodeString
s1("cxs");
118 UnicodeString s2
= UnicodeString("\\u0441\\u0445\\u0455").unescape(); // Cyrillic "cxs"
119 int32_t checkResults
= uspoof_areConfusableUnicodeString(sc
, s1
, s2
, &status
);
120 TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE
| USPOOF_WHOLE_SCRIPT_CONFUSABLE
, checkResults
);
// Case 3: the any-case skeleton of "I1l0O" is expected to be "lllOO", and the
// reference returned by the API must be the `dest` argument itself.
125 UnicodeString
s("I1l0O");
127 UnicodeString
&retStr
= uspoof_getSkeletonUnicodeString(sc
, USPOOF_ANY_CASE
, s
, dest
, &status
);
128 TEST_ASSERT_SUCCESS(status
);
129 TEST_ASSERT(UnicodeString("lllOO") == dest
);
130 TEST_ASSERT(&dest
== &retStr
);
// CHECK_SKELETON(type, input, expected): forwards to checkSkeleton() using the
// `sc` that is in scope at the point of use, and passes __LINE__ so that a
// failure message can name the line of the test case that produced it.
135 #define CHECK_SKELETON(type, input, expected) { \
136 checkSkeleton(sc, type, input, expected, __LINE__); \
140 // testSkeleton. Spot check a number of confusable skeleton substitutions from the
141 // Unicode data file confusables.txt
142 // Test cases chosen for substitutions of various lengths, and
143 // membership in different mapping tables.
144 void IntlTestSpoof::testSkeleton() {
// Shorthands for the four skeleton-type flag combinations passed to CHECK_SKELETON.
// The names match the table tags used in the section comments below:
//   ML = no flags, SL = single-script, MA = any-case, SA = single-script + any-case.
145 const uint32_t ML
= 0;
146 const uint32_t SL
= USPOOF_SINGLE_SCRIPT_CONFUSABLE
;
147 const uint32_t MA
= USPOOF_ANY_CASE
;
148 const uint32_t SA
= USPOOF_SINGLE_SCRIPT_CONFUSABLE
| USPOOF_ANY_CASE
;
151 // A long "identifier" that will overflow implementation stack buffers, forcing heap allocations.
152 CHECK_SKELETON(SL
, " A 1ong \\u02b9identifier' that will overflow implementation stack buffers, forcing heap allocations."
153 " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
154 " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
155 " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations.",
157 " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
158 " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
159 " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
160 " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations.")
// Short spot checks; each line gives (type, input, expected skeleton).
162 CHECK_SKELETON(SL
, "nochange", "nochange");
163 CHECK_SKELETON(MA
, "love", "love");
164 CHECK_SKELETON(MA
, "1ove", "love"); // Digit 1 to letter l
165 CHECK_SKELETON(ML
, "OOPS", "OOPS");
166 CHECK_SKELETON(ML
, "00PS", "00PS"); // Digit 0 unchanged in lower case mode.
167 CHECK_SKELETON(MA
, "OOPS", "OOPS");
168 CHECK_SKELETON(MA
, "00PS", "OOPS"); // Digit 0 to letter O in any case mode only
// Substitutions whose expected result is one, three, four and eight code points long.
169 CHECK_SKELETON(SL
, "\\u059c", "\\u0301");
170 CHECK_SKELETON(SL
, "\\u2A74", "\\u003A\\u003A\\u003D");
171 CHECK_SKELETON(SL
, "\\u247E", "\\u0028\\u006C\\u006C\\u0029"); // "(ll)"
172 CHECK_SKELETON(SL
, "\\uFDFB", "\\u062C\\u0644\\u0020\\u062C\\u0644\\u0627\\u0644\\u0647");
174 // This mapping exists in the ML and MA tables, does not exist in SL, SA
176 CHECK_SKELETON(SL
, "\\u0C83", "\\u0C83");
177 CHECK_SKELETON(SA
, "\\u0C83", "\\u0C83");
178 CHECK_SKELETON(ML
, "\\u0C83", "\\u0983");
179 CHECK_SKELETON(MA
, "\\u0C83", "\\u0983");
182 // This mapping exists only in the MA table.
183 CHECK_SKELETON(MA
, "\\u0391", "A");
184 CHECK_SKELETON(SA
, "\\u0391", "\\u0391");
185 CHECK_SKELETON(ML
, "\\u0391", "\\u0391");
186 CHECK_SKELETON(SL
, "\\u0391", "\\u0391");
189 // This mapping exists in the ML and MA tables
190 CHECK_SKELETON(ML
, "\\u13CF", "b");
191 CHECK_SKELETON(MA
, "\\u13CF", "b");
192 CHECK_SKELETON(SL
, "\\u13CF", "\\u13CF");
193 CHECK_SKELETON(SA
, "\\u13CF", "\\u13CF");
195 // 0022 ; 0027 0027 ;
// The same expected result is asserted for all four types.
197 CHECK_SKELETON(SL
, "\\u0022", "\\u0027\\u0027");
198 CHECK_SKELETON(SA
, "\\u0022", "\\u0027\\u0027");
199 CHECK_SKELETON(ML
, "\\u0022", "\\u0027\\u0027");
200 CHECK_SKELETON(MA
, "\\u0022", "\\u0027\\u0027");
203 // This mapping exists in the SA and MA tables
204 CHECK_SKELETON(MA
, "\\u017F", "f");
205 CHECK_SKELETON(SA
, "\\u017F", "f");
212 // Run a single confusable skeleton transformation test case.
//   sc       - spoof checker handed to uspoof_getSkeletonUnicodeString().
//   type     - skeleton type flags handed to the same call.
//   input    - test string; passed through UnicodeString::unescape(), so it may
//              contain backslash-u escapes.
//   expected - expected skeleton; unescaped the same way as input.
//   lineNum  - source line of the test case, included in every failure message.
214 void IntlTestSpoof::checkSkeleton(const USpoofChecker
*sc
, uint32_t type
,
215 const char *input
, const char *expected
, int32_t lineNum
) {
216 UnicodeString uInput
= UnicodeString(input
).unescape();
217 UnicodeString uExpected
= UnicodeString(expected
).unescape();
// This function keeps its own status, independent of any status at the call site.
219 UErrorCode status
= U_ZERO_ERROR
;
220 UnicodeString actual
;
221 uspoof_getSkeletonUnicodeString(sc
, type
, uInput
, actual
, &status
);
// An API failure is reported together with the line of the originating test case.
222 if (U_FAILURE(status
)) {
223 errln("File %s, Line %d, Test case from line %d, status is %s", __FILE__
, __LINE__
, lineNum
,
224 u_errorName(status
));
// On a mismatch, report where it happened and then print both skeletons.
227 if (uExpected
!= actual
) {
228 errln("File %s, Line %d, Test case from line %d, Actual and Expected skeletons differ.",
229 __FILE__
, __LINE__
, lineNum
);
230 errln(UnicodeString(" Actual Skeleton: \"") + actual
+ UnicodeString("\"\n") +
231 UnicodeString(" Expected Skeleton: \"") + uExpected
+ UnicodeString("\""));
// Checks uspoof_areConfusableUnicodeString() on two long strings that differ only
// in "will" versus "wi11" (digit one in place of the letter l, two occurrences);
// the expected result is USPOOF_SINGLE_SCRIPT_CONFUSABLE.
// NOTE(review): `sc` and `status` are not declared in the visible lines;
// presumably they come from TEST_SETUP - confirm against the original file.
235 void IntlTestSpoof::testAreConfusable() {
237 UnicodeString
s1("A long string that will overflow stack buffers. A long string that will overflow stack buffers. "
238 "A long string that will overflow stack buffers. A long string that will overflow stack buffers. ");
239 UnicodeString
s2("A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. "
240 "A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. ");
// TEST_ASSERT_EQ expands its arguments in both the comparison and the message,
// so the API call below is repeated if the assertion fails.
241 TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE
, uspoof_areConfusableUnicodeString(sc
, s1
, s2
, &status
));
242 TEST_ASSERT_SUCCESS(status
);
// Checks the USPOOF_INVISIBLE result of uspoof_checkUnicodeString() on three strings.
// NOTE(review): `sc` and `status` are not declared in the visible lines;
// presumably they come from TEST_SETUP - confirm against the original file.
247 void IntlTestSpoof::testInvisible() {
// Case 1: a single \u0301 after "abcd". Expected result is 0, and `position`
// must keep its initial sentinel value -42.
249 UnicodeString s
= UnicodeString("abcd\\u0301ef").unescape();
250 int32_t position
= -42;
251 TEST_ASSERT_EQ(0, uspoof_checkUnicodeString(sc
, s
, &position
, &status
));
252 TEST_ASSERT_SUCCESS(status
);
253 TEST_ASSERT(position
== -42);
// Case 2: the sequence \u0301 \u0302 \u0301, in which \u0301 occurs twice.
// Expected result is USPOOF_INVISIBLE with `position` set to 7.
255 UnicodeString s2
= UnicodeString("abcd\\u0301\\u0302\\u0301ef").unescape();
256 TEST_ASSERT_EQ(USPOOF_INVISIBLE
, uspoof_checkUnicodeString(sc
, s2
, &position
, &status
));
257 TEST_ASSERT_SUCCESS(status
);
258 TEST_ASSERT_EQ(7, position
);
// Case 3: expected result is again USPOOF_INVISIBLE with `position` set to 7.
260 // Two acute accents, one from the composed a with acute accent, \u00e1,
// and one from the \u0301 that directly follows it in the test string.
263 UnicodeString s3
= UnicodeString("abcd\\u00e1\\u0301xyz").unescape();
264 TEST_ASSERT_EQ(USPOOF_INVISIBLE
, uspoof_checkUnicodeString(sc
, s3
, &position
, &status
));
265 TEST_ASSERT_SUCCESS(status
);
266 TEST_ASSERT_EQ(7, position
);
// Regression check on the string "B" + \u00c1 + \u0301: uspoof_checkUnicodeString()
// is expected to set the USPOOF_INVISIBLE bit and to set `position` to 3.
// NOTE(review): `sc` and `status` are not declared in the visible lines;
// presumably they come from TEST_SETUP - confirm against the original file.
270 void IntlTestSpoof::testBug8654() {
272 UnicodeString s
= UnicodeString("B\\u00c1\\u0301").unescape();
273 int32_t position
= -42;
// The result is masked with USPOOF_INVISIBLE before the comparison, so any other
// bits set in the check result do not affect this assertion.
274 TEST_ASSERT_EQ(USPOOF_INVISIBLE
, uspoof_checkUnicodeString(sc
, s
, &position
, &status
) & USPOOF_INVISIBLE
);
275 TEST_ASSERT_SUCCESS(status
);
276 TEST_ASSERT_EQ(3, position
);
// File-local helper.
//   in - text holding hex numbers; the loop below recognises spaces, the digits
//        0-9 and the letters A-F / a-f.
// NOTE(review): the declaration of the accumulator `cc`, the body of the space
// branch and the return statement are not visible in this listing; presumably a
// space flushes `cc` into `result` - confirm against the original file.
280 static UnicodeString
parseHex(const UnicodeString
&in
) {
281 // Convert a series of hex numbers in a Unicode String to a string with the
282 // corresponding characters.
283 // The conversion is _really_ annoying. There must be some function to just do it.
284 UnicodeString result
;
// Walk the input one UTF-16 code unit at a time.
286 for (int32_t i
=0; i
<in
.length(); i
++) {
287 UChar c
= in
.charAt(i
);
288 if (c
== 0x20) { // Space
// '0'..'9': shift the accumulator left by one hex digit and add the digit value.
293 } else if (c
>=0x30 && c
<=0x39) {
294 cc
= (cc
<<4) + (c
- 0x30);
// 'A'..'F' or 'a'..'f': the low nibble of the character code is 1..6 for either
// case, so adding 9 yields the values 10..15.
295 } else if ((c
>=0x41 && c
<=0x46) || (c
>=0x61 && c
<=0x66)) {
296 cc
= (cc
<<4) + (c
& 0x0f)+9;
298 // else do something with bad input.
308 // Append the hex form of a UChar32 to a UnicodeString.
309 // Used in formatting error messages.
310 // Match the formatting of numbers in confusables.txt
311 // Minimum of 4 digits, no leading zeroes for positions 5 and up.
//   dest - string that receives the hex digits, followed by one space (0x20).
//   c    - code point to format.
// NOTE(review): the lines that update `doZeroes` are not visible in this listing;
// presumably it is switched on at the latest for the low four digits, as the
// "minimum of 4 digits" rule requires - confirm against the original file.
313 static void appendHexUChar(UnicodeString
&dest
, UChar32 c
) {
314 UBool doZeroes
= FALSE
;
// Visit the eight 4-bit groups of c, from the most significant (bit 28) downwards.
315 for (int bitNum
=28; bitNum
>=0; bitNum
-=4) {
319 int hexDigit
= (c
>>bitNum
) & 0x0f;
// A digit is emitted when it is non-zero or when doZeroes is set.
320 if (hexDigit
!= 0 || doZeroes
) {
// 0..9 map to '0'..'9' (0x30 + n); 10..15 map to upper-case 'A'..'F' (0x41 + n - 10).
322 dest
.append((UChar
)(hexDigit
<=9? hexDigit
+ 0x30: hexDigit
-10 + 0x41));
// A single space (0x20) is appended after the digits.
325 dest
.append((UChar
)0x20);
// Defines the LocalStdioFilePointer type from FILE and fclose. testConfData() uses
// it to hold the result of fopen() and reads the raw pointer through getAlias().
// Presumably the wrapper calls fclose on the held FILE when it goes out of
// scope - confirm against the U_DEFINE_LOCAL_OPEN_POINTER documentation.
328 U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer
, FILE, fclose
);
330 // testConfData - Check each data item from the Unicode confusables.txt file,
331 // verify that it transforms correctly in a skeleton.
// Reads confusables.txt from the source test-data directory, parses each mapping
// line with a regular expression, and compares the skeleton produced by
// uspoof_getSkeletonUnicodeString() with the NFD form of the mapping's target.
// NOTE(review): several lines of this function (the declaration of `buffer`, the
// file-open check, early exits, the declaration of `i` and the closing braces)
// are not visible in this listing.
333 void IntlTestSpoof::testConfData() {
334 UErrorCode status
= U_ZERO_ERROR
;
// Build the path of confusables.txt inside the test-data directory.
336 const char *testDataDir
= IntlTest::getSourceTestData(status
);
337 TEST_ASSERT_SUCCESS(status
);
339 uprv_strcpy(buffer
, testDataDir
);
340 uprv_strcat(buffer
, "confusables.txt");
342 LocalStdioFilePointer
f(fopen(buffer
, "rb"));
// NOTE(review): the condition that guards this message is not visible here;
// presumably it is the check for a failed fopen().
344 errln("Skipping test spoof/testConfData. File confusables.txt not accessible.");
// Determine the file size by seeking to the end, then read the whole file into memory.
// NOTE(review): fileSize is used for the allocation below before it is validated
// (the `fileSize > 0` checks only come after the read), and ftell() can return -1
// on failure - consider validating the size before `new char[fileSize]`.
347 fseek(f
.getAlias(), 0, SEEK_END
);
348 int32_t fileSize
= ftell(f
.getAlias());
349 LocalArray
<char> fileBuf(new char[fileSize
]);
350 fseek(f
.getAlias(), 0, SEEK_SET
);
351 int32_t amt_read
= fread(fileBuf
.getAlias(), 1, fileSize
, f
.getAlias());
352 TEST_ASSERT_EQ(amt_read
, fileSize
);
353 TEST_ASSERT(fileSize
>0);
// The assertions above only report; this explicit test handles a short read or an
// empty file (the body of the branch is not visible in this listing).
354 if (amt_read
!= fileSize
|| fileSize
<=0) {
// The file contents are converted from UTF-8.
357 UnicodeString confusablesTxt
= UnicodeString::fromUTF8(StringPiece(fileBuf
.getAlias(), fileSize
));
359 LocalUSpoofCheckerPointer
sc(uspoof_open(&status
));
360 TEST_ASSERT_SUCCESS(status
);
362 // Parse lines from the confusables.txt file. Example Line:
363 // FF44 ; 0064 ; SL # ( d -> d ) FULLWIDTH ....
364 // Three fields. The hex fields can contain more than one character,
365 // and each character may be more than 4 digits (for supplementals)
366 // This regular expression matches lines and splits the fields into capture groups.
// Group 1 is the source field, group 2 the target field, group 3 the table type.
367 RegexMatcher
parseLine("(?m)^([0-9A-F]{4}[^#;]*?);([^#;]*?);([^#]*)", confusablesTxt
, 0, status
);
368 TEST_ASSERT_SUCCESS(status
);
369 while (parseLine
.find()) {
370 UnicodeString from
= parseHex(parseLine
.group(1, status
));
371 if (!Normalizer::isNormalized(from
, UNORM_NFD
, status
)) {
372 // The source character was not NFD.
373 // Skip this case; the first step in obtaining a skeleton is to NFD the input,
374 // so the mapping in this line of confusables.txt will never be applied.
// The expected skeleton is the NFD decomposition of the target field.
378 UnicodeString rawExpected
= parseHex(parseLine
.group(2, status
));
379 UnicodeString expected
;
380 Normalizer::decompose(rawExpected
, FALSE
/*NFD*/, 0, expected
, status
);
381 TEST_ASSERT_SUCCESS(status
);
// Translate the table tag in field 3 into skeleton type flags. skeletonType starts
// at 0; the body of the "ML" branch is not visible in this listing.
383 int32_t skeletonType
= 0;
384 UnicodeString tableType
= parseLine
.group(3, status
);
385 TEST_ASSERT_SUCCESS(status
);
386 if (tableType
.indexOf("SL") >= 0) {
387 skeletonType
= USPOOF_SINGLE_SCRIPT_CONFUSABLE
;
388 } else if (tableType
.indexOf("SA") >= 0) {
389 skeletonType
= USPOOF_SINGLE_SCRIPT_CONFUSABLE
| USPOOF_ANY_CASE
;
390 } else if (tableType
.indexOf("ML") >= 0) {
392 } else if (tableType
.indexOf("MA") >= 0) {
393 skeletonType
= USPOOF_ANY_CASE
;
// Compute the skeleton of the source field and compare it with the expectation.
396 UnicodeString actual
;
397 uspoof_getSkeletonUnicodeString(sc
.getAlias(), skeletonType
, from
, actual
, &status
);
398 TEST_ASSERT_SUCCESS(status
);
399 TEST_ASSERT(actual
== expected
);
// On a mismatch, print the offending input line, then build an "Actual: " line with
// the hex form of each code point of the computed skeleton.
400 if (actual
!= expected
) {
401 errln(parseLine
.group(0, status
));
402 UnicodeString line
= "Actual: ";
// Step by code point, not by code unit: char32At() reads one code point and
// moveIndex32() advances the index past it.
404 while (i
< actual
.length()) {
405 appendHexUChar(line
, actual
.char32At(i
));
406 i
= actual
.moveIndex32(i
, 1);
410 if (U_FAILURE(status
)) {
.getAlias());
352 TEST_ASSERT_EQ(amt_read
, fileSize
);
353 TEST_ASSERT(fileSize
>0);
354 if (amt_read
!= fileSize
|| fileSize
<=0) {
357 UnicodeString confusablesTxt
= UnicodeString::fromUTF8(StringPiece(fileBuf
.getAlias(), fileSize
));
359 LocalUSpoofCheckerPointer
sc(uspoof_open(&status
));
360 TEST_ASSERT_SUCCESS(status
);
362 // Parse lines from the confusables.txt file. Example Line:
363 // FF44 ; 0064 ; SL # ( d -> d ) FULLWIDTH ....
364 // Three fields. The hex fields can contain more than one character,
365 // and each character may be more than 4 digits (for supplemntals)
366 // This regular expression matches lines and splits the fields into capture groups.
367 RegexMatcher
parseLine("(?m)^([0-9A-F]{4}[^#;]*?);([^#;]*?);([^#]*)", confusablesTxt
, 0, status
);
368 TEST_ASSERT_SUCCESS(status
);
369 while (parseLine
.find()) {
370 UnicodeString from
= parseHex(parseLine
.group(1, status
));
371 if (!Normalizer::isNormalized(from
, UNORM_NFD
, status
)) {
372 // The source character was not NFD.
373 // Skip this case; the first step in obtaining a skeleton is to NFD the input,
374 // so the mapping in this line of confusables.txt will never be applied.
378 UnicodeString rawExpected
= parseHex(parseLine
.group(2, status
));
379 UnicodeString expected
;
380 Normalizer::decompose(rawExpected
, FALSE
/*NFD*/, 0, expected
, status
);
381 TEST_ASSERT_SUCCESS(status
);
383 int32_t skeletonType
= 0;
384 UnicodeString tableType
= parseLine
.group(3, status
);
385 TEST_ASSERT_SUCCESS(status
);
386 if (tableType
.indexOf("SL") >= 0) {
387 skeletonType
= USPOOF_SINGLE_SCRIPT_CONFUSABLE
;
388 } else if (tableType
.indexOf("SA") >= 0) {
389 skeletonType
= USPOOF_SINGLE_SCRIPT_CONFUSABLE
| USPOOF_ANY_CASE
;
390 } else if (tableType
.indexOf("ML") >= 0) {
392 } else if (tableType
.indexOf("MA") >= 0) {
393 skeletonType
= USPOOF_ANY_CASE
;
396 UnicodeString actual
;
397 uspoof_getSkeletonUnicodeString(sc
.getAlias(), skeletonType
, from
, actual
, &status
);
398 TEST_ASSERT_SUCCESS(status
);
399 TEST_ASSERT(actual
== expected
);
400 if (actual
!= expected
) {
401 errln(parseLine
.group(0, status
));
402 UnicodeString line
= "Actual: ";
404 while (i
< actual
.length()) {
405 appendHexUChar(line
, actual
.char32At(i
));
406 i
= actual
.moveIndex32(i
, 1);
410 if (U_FAILURE(status
)) {
415 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS