]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
729e4ab9 A |
3 | /* |
4 | ********************************************************************** | |
b331163b | 5 | * Copyright (C) 2011-2015, International Business Machines Corporation |
729e4ab9 A |
6 | * and others. All Rights Reserved. |
7 | ********************************************************************** | |
8 | */ | |
9 | /** | |
10 | * IntlTestSpoof tests for USpoofDetector | |
11 | */ | |
12 | ||
13 | #include "unicode/utypes.h" | |
14 | ||
15 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO | |
16 | ||
17 | #include "itspoof.h" | |
51004dcb | 18 | |
729e4ab9 | 19 | #include "unicode/normlzr.h" |
51004dcb A |
20 | #include "unicode/regex.h" |
21 | #include "unicode/unistr.h" | |
22 | #include "unicode/uscript.h" | |
23 | #include "unicode/uspoof.h" | |
24 | ||
729e4ab9 | 25 | #include "cstring.h" |
51004dcb A |
26 | #include "scriptset.h" |
27 | #include "uhash.h" | |
28 | ||
729e4ab9 A |
29 | #include <stdlib.h> |
30 | #include <stdio.h> | |
31 | ||
32 | #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \ | |
33 | errcheckln(status, "Failure at file %s, line %d, error = %s", __FILE__, __LINE__, u_errorName(status));}} | |
34 | ||
35 | #define TEST_ASSERT(expr) {if ((expr)==FALSE) { \ | |
51004dcb A |
36 | errln("Test Failure at file %s, line %d: \"%s\" is false.", __FILE__, __LINE__, #expr);};} |
37 | ||
38 | #define TEST_ASSERT_MSG(expr, msg) {if ((expr)==FALSE) { \ | |
39 | dataerrln("Test Failure at file %s, line %d, %s: \"%s\" is false.", __FILE__, __LINE__, msg, #expr);};} | |
729e4ab9 A |
40 | |
41 | #define TEST_ASSERT_EQ(a, b) { if ((a) != (b)) { \ | |
51004dcb | 42 | errln("Test Failure at file %s, line %d: \"%s\" (%d) != \"%s\" (%d)", \ |
729e4ab9 A |
43 | __FILE__, __LINE__, #a, (a), #b, (b)); }} |
44 | ||
45 | #define TEST_ASSERT_NE(a, b) { if ((a) == (b)) { \ | |
51004dcb | 46 | errln("Test Failure at file %s, line %d: \"%s\" (%d) == \"%s\" (%d)", \ |
729e4ab9 A |
47 | __FILE__, __LINE__, #a, (a), #b, (b)); }} |
48 | ||
49 | /* | |
50 | * TEST_SETUP and TEST_TEARDOWN | |
51 | * macros to handle the boilerplate around setting up test case. | |
52 | * Put arbitrary test code between SETUP and TEARDOWN. | |
53 | * "sc" is the ready-to-go SpoofChecker for use in the tests. | |
54 | */ | |
55 | #define TEST_SETUP { \ | |
56 | UErrorCode status = U_ZERO_ERROR; \ | |
57 | USpoofChecker *sc; \ | |
58 | sc = uspoof_open(&status); \ | |
59 | TEST_ASSERT_SUCCESS(status); \ | |
f3c0d7a5 A |
60 | USpoofCheckResult *checkResult; \ |
61 | checkResult = uspoof_openCheckResult(&status); \ | |
62 | TEST_ASSERT_SUCCESS(status); \ | |
729e4ab9 A |
63 | if (U_SUCCESS(status)){ |
64 | ||
65 | #define TEST_TEARDOWN \ | |
66 | } \ | |
67 | TEST_ASSERT_SUCCESS(status); \ | |
f3c0d7a5 | 68 | uspoof_closeCheckResult(checkResult); \ |
729e4ab9 A |
69 | uspoof_close(sc); \ |
70 | } | |
71 | ||
72 | ||
73 | ||
74 | ||
75 | void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) | |
76 | { | |
f3c0d7a5 A |
77 | if (exec) { |
78 | logln("TestSuite spoof: "); | |
729e4ab9 | 79 | } |
f3c0d7a5 A |
80 | TESTCASE_AUTO_BEGIN; |
81 | TESTCASE_AUTO(testSpoofAPI); | |
82 | TESTCASE_AUTO(testSkeleton); | |
83 | TESTCASE_AUTO(testAreConfusable); | |
84 | TESTCASE_AUTO(testInvisible); | |
85 | TESTCASE_AUTO(testConfData); | |
86 | TESTCASE_AUTO(testBug8654); | |
87 | TESTCASE_AUTO(testScriptSet); | |
88 | TESTCASE_AUTO(testRestrictionLevel); | |
89 | TESTCASE_AUTO(testMixedNumbers); | |
90 | TESTCASE_AUTO(testBug12153); | |
91 | TESTCASE_AUTO(testBug12825); | |
92 | TESTCASE_AUTO(testBug12815); | |
0f5d89e8 A |
93 | TESTCASE_AUTO(testBug13314_MixedNumbers); |
94 | TESTCASE_AUTO(testBug13328_MixedCombiningMarks); | |
95 | TESTCASE_AUTO(testCombiningDot); | |
f3c0d7a5 | 96 | TESTCASE_AUTO_END; |
729e4ab9 A |
97 | } |
98 | ||
99 | void IntlTestSpoof::testSpoofAPI() { | |
100 | ||
101 | TEST_SETUP | |
102 | UnicodeString s("xyz"); // Many latin ranges are whole-script confusable with other scripts. | |
103 | // If this test starts failing, consult confusablesWholeScript.txt | |
104 | int32_t position = 666; | |
105 | int32_t checkResults = uspoof_checkUnicodeString(sc, s, &position, &status); | |
106 | TEST_ASSERT_SUCCESS(status); | |
107 | TEST_ASSERT_EQ(0, checkResults); | |
51004dcb | 108 | TEST_ASSERT_EQ(0, position); |
729e4ab9 A |
109 | TEST_TEARDOWN; |
110 | ||
111 | TEST_SETUP | |
112 | UnicodeString s1("cxs"); | |
113 | UnicodeString s2 = UnicodeString("\\u0441\\u0445\\u0455").unescape(); // Cyrillic "cxs" | |
114 | int32_t checkResults = uspoof_areConfusableUnicodeString(sc, s1, s2, &status); | |
f3c0d7a5 | 115 | TEST_ASSERT_SUCCESS(status); |
729e4ab9 A |
116 | TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults); |
117 | ||
118 | TEST_TEARDOWN; | |
119 | ||
120 | TEST_SETUP | |
121 | UnicodeString s("I1l0O"); | |
122 | UnicodeString dest; | |
123 | UnicodeString &retStr = uspoof_getSkeletonUnicodeString(sc, USPOOF_ANY_CASE, s, dest, &status); | |
124 | TEST_ASSERT_SUCCESS(status); | |
125 | TEST_ASSERT(UnicodeString("lllOO") == dest); | |
126 | TEST_ASSERT(&dest == &retStr); | |
127 | TEST_TEARDOWN; | |
128 | } | |
129 | ||
130 | ||
131 | #define CHECK_SKELETON(type, input, expected) { \ | |
132 | checkSkeleton(sc, type, input, expected, __LINE__); \ | |
133 | } | |
134 | ||
135 | ||
136 | // testSkeleton. Spot check a number of confusable skeleton substitutions from the | |
137 | // Unicode data file confusables.txt | |
138 | // Test cases chosen for substitutions of various lengths, and | |
139 | // membership in different mapping tables. | |
b331163b A |
140 | // Note: for ICU 55, all tables collapsed to the MA table data. |
141 | // TODO: for ICU 56 with Unicode 8, revisit this test. | |
142 | // | |
729e4ab9 A |
143 | void IntlTestSpoof::testSkeleton() { |
144 | const uint32_t ML = 0; | |
145 | const uint32_t SL = USPOOF_SINGLE_SCRIPT_CONFUSABLE; | |
146 | const uint32_t MA = USPOOF_ANY_CASE; | |
147 | const uint32_t SA = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE; | |
148 | ||
149 | TEST_SETUP | |
729e4ab9 | 150 | CHECK_SKELETON(SL, "nochange", "nochange"); |
b331163b A |
151 | CHECK_SKELETON(SA, "nochange", "nochange"); |
152 | CHECK_SKELETON(ML, "nochange", "nochange"); | |
153 | CHECK_SKELETON(MA, "nochange", "nochange"); | |
729e4ab9 A |
154 | CHECK_SKELETON(MA, "love", "love"); |
155 | CHECK_SKELETON(MA, "1ove", "love"); // Digit 1 to letter l | |
156 | CHECK_SKELETON(ML, "OOPS", "OOPS"); | |
b331163b | 157 | CHECK_SKELETON(ML, "00PS", "OOPS"); |
729e4ab9 A |
158 | CHECK_SKELETON(MA, "OOPS", "OOPS"); |
159 | CHECK_SKELETON(MA, "00PS", "OOPS"); // Digit 0 to letter O in any case mode only | |
160 | CHECK_SKELETON(SL, "\\u059c", "\\u0301"); | |
161 | CHECK_SKELETON(SL, "\\u2A74", "\\u003A\\u003A\\u003D"); | |
162 | CHECK_SKELETON(SL, "\\u247E", "\\u0028\\u006C\\u006C\\u0029"); // "(ll)" | |
b331163b | 163 | CHECK_SKELETON(SL, "\\uFDFB", "\\u062C\\u0644\\u0020\\u062C\\u0644\\u006c\\u0644\\u006f"); |
729e4ab9 A |
164 | |
165 | // This mapping exists in the ML and MA tables, does not exist in SL, SA | |
b331163b A |
166 | // 0C83 ; 0983 ; ML |
167 | // 0C83 ; 0983 ; MA | |
168 | // | |
169 | ||
170 | CHECK_SKELETON(SL, "\\u0C83", "\\u0983"); | |
171 | CHECK_SKELETON(SA, "\\u0C83", "\\u0983"); | |
729e4ab9 A |
172 | CHECK_SKELETON(ML, "\\u0C83", "\\u0983"); |
173 | CHECK_SKELETON(MA, "\\u0C83", "\\u0983"); | |
174 | ||
b331163b | 175 | // 0391 mappings exist only in MA and SA tables. |
729e4ab9 | 176 | CHECK_SKELETON(MA, "\\u0391", "A"); |
b331163b A |
177 | CHECK_SKELETON(SA, "\\u0391", "A"); |
178 | CHECK_SKELETON(ML, "\\u0391", "A"); | |
179 | CHECK_SKELETON(SL, "\\u0391", "A"); | |
729e4ab9 | 180 | |
b331163b | 181 | // 13CF Mappings in all four tables, different in MA. |
729e4ab9 A |
182 | CHECK_SKELETON(ML, "\\u13CF", "b"); |
183 | CHECK_SKELETON(MA, "\\u13CF", "b"); | |
b331163b A |
184 | CHECK_SKELETON(SL, "\\u13CF", "b"); |
185 | CHECK_SKELETON(SA, "\\u13CF", "b"); | |
729e4ab9 A |
186 | |
187 | // 0022 ; 0027 0027 ; | |
188 | // all tables. | |
189 | CHECK_SKELETON(SL, "\\u0022", "\\u0027\\u0027"); | |
190 | CHECK_SKELETON(SA, "\\u0022", "\\u0027\\u0027"); | |
191 | CHECK_SKELETON(ML, "\\u0022", "\\u0027\\u0027"); | |
192 | CHECK_SKELETON(MA, "\\u0022", "\\u0027\\u0027"); | |
193 | ||
b331163b | 194 | // 017F mappings exist only in MA and SA tables. |
729e4ab9 A |
195 | CHECK_SKELETON(MA, "\\u017F", "f"); |
196 | CHECK_SKELETON(SA, "\\u017F", "f"); | |
b331163b A |
197 | CHECK_SKELETON(ML, "\\u017F", "f"); |
198 | CHECK_SKELETON(SL, "\\u017F", "f"); | |
729e4ab9 A |
199 | |
200 | TEST_TEARDOWN; | |
201 | } | |
202 | ||
203 | ||
204 | // | |
205 | // Run a single confusable skeleton transformation test case. | |
206 | // | |
207 | void IntlTestSpoof::checkSkeleton(const USpoofChecker *sc, uint32_t type, | |
208 | const char *input, const char *expected, int32_t lineNum) { | |
209 | UnicodeString uInput = UnicodeString(input).unescape(); | |
210 | UnicodeString uExpected = UnicodeString(expected).unescape(); | |
211 | ||
212 | UErrorCode status = U_ZERO_ERROR; | |
213 | UnicodeString actual; | |
214 | uspoof_getSkeletonUnicodeString(sc, type, uInput, actual, &status); | |
215 | if (U_FAILURE(status)) { | |
216 | errln("File %s, Line %d, Test case from line %d, status is %s", __FILE__, __LINE__, lineNum, | |
217 | u_errorName(status)); | |
218 | return; | |
219 | } | |
220 | if (uExpected != actual) { | |
221 | errln("File %s, Line %d, Test case from line %d, Actual and Expected skeletons differ.", | |
222 | __FILE__, __LINE__, lineNum); | |
223 | errln(UnicodeString(" Actual Skeleton: \"") + actual + UnicodeString("\"\n") + | |
224 | UnicodeString(" Expected Skeleton: \"") + uExpected + UnicodeString("\"")); | |
225 | } | |
226 | } | |
227 | ||
228 | void IntlTestSpoof::testAreConfusable() { | |
229 | TEST_SETUP | |
230 | UnicodeString s1("A long string that will overflow stack buffers. A long string that will overflow stack buffers. " | |
231 | "A long string that will overflow stack buffers. A long string that will overflow stack buffers. "); | |
232 | UnicodeString s2("A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. " | |
233 | "A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. "); | |
f3c0d7a5 | 234 | int32_t result = uspoof_areConfusableUnicodeString(sc, s1, s2, &status); |
729e4ab9 | 235 | TEST_ASSERT_SUCCESS(status); |
f3c0d7a5 | 236 | TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, result); |
729e4ab9 A |
237 | |
238 | TEST_TEARDOWN; | |
239 | } | |
240 | ||
241 | void IntlTestSpoof::testInvisible() { | |
242 | TEST_SETUP | |
243 | UnicodeString s = UnicodeString("abcd\\u0301ef").unescape(); | |
244 | int32_t position = -42; | |
245 | TEST_ASSERT_EQ(0, uspoof_checkUnicodeString(sc, s, &position, &status)); | |
246 | TEST_ASSERT_SUCCESS(status); | |
51004dcb | 247 | TEST_ASSERT(0 == position); |
729e4ab9 A |
248 | |
249 | UnicodeString s2 = UnicodeString("abcd\\u0301\\u0302\\u0301ef").unescape(); | |
250 | TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s2, &position, &status)); | |
251 | TEST_ASSERT_SUCCESS(status); | |
51004dcb | 252 | TEST_ASSERT_EQ(0, position); |
729e4ab9 | 253 | |
4388f060 | 254 | // Two acute accents, one from the composed a with acute accent, \u00e1, |
729e4ab9 A |
255 | // and one separate. |
256 | position = -42; | |
257 | UnicodeString s3 = UnicodeString("abcd\\u00e1\\u0301xyz").unescape(); | |
258 | TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s3, &position, &status)); | |
259 | TEST_ASSERT_SUCCESS(status); | |
51004dcb | 260 | TEST_ASSERT_EQ(0, position); |
729e4ab9 A |
261 | TEST_TEARDOWN; |
262 | } | |
263 | ||
4388f060 A |
264 | void IntlTestSpoof::testBug8654() { |
265 | TEST_SETUP | |
266 | UnicodeString s = UnicodeString("B\\u00c1\\u0301").unescape(); | |
267 | int32_t position = -42; | |
268 | TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s, &position, &status) & USPOOF_INVISIBLE ); | |
269 | TEST_ASSERT_SUCCESS(status); | |
51004dcb | 270 | TEST_ASSERT_EQ(0, position); |
4388f060 A |
271 | TEST_TEARDOWN; |
272 | } | |
729e4ab9 A |
273 | |
274 | static UnicodeString parseHex(const UnicodeString &in) { | |
275 | // Convert a series of hex numbers in a Unicode String to a string with the | |
276 | // corresponding characters. | |
277 | // The conversion is _really_ annoying. There must be some function to just do it. | |
278 | UnicodeString result; | |
279 | UChar32 cc = 0; | |
280 | for (int32_t i=0; i<in.length(); i++) { | |
281 | UChar c = in.charAt(i); | |
282 | if (c == 0x20) { // Space | |
283 | if (cc > 0) { | |
284 | result.append(cc); | |
285 | cc = 0; | |
286 | } | |
287 | } else if (c>=0x30 && c<=0x39) { | |
288 | cc = (cc<<4) + (c - 0x30); | |
289 | } else if ((c>=0x41 && c<=0x46) || (c>=0x61 && c<=0x66)) { | |
290 | cc = (cc<<4) + (c & 0x0f)+9; | |
291 | } | |
292 | // else do something with bad input. | |
293 | } | |
294 | if (cc > 0) { | |
295 | result.append(cc); | |
296 | } | |
297 | return result; | |
298 | } | |
299 | ||
300 | ||
301 | // | |
302 | // Append the hex form of a UChar32 to a UnicodeString. | |
303 | // Used in formatting error messages. | |
304 | // Match the formatting of numbers in confusables.txt | |
305 | // Minimum of 4 digits, no leading zeroes for positions 5 and up. | |
306 | // | |
307 | static void appendHexUChar(UnicodeString &dest, UChar32 c) { | |
308 | UBool doZeroes = FALSE; | |
309 | for (int bitNum=28; bitNum>=0; bitNum-=4) { | |
310 | if (bitNum <= 12) { | |
311 | doZeroes = TRUE; | |
312 | } | |
313 | int hexDigit = (c>>bitNum) & 0x0f; | |
314 | if (hexDigit != 0 || doZeroes) { | |
315 | doZeroes = TRUE; | |
316 | dest.append((UChar)(hexDigit<=9? hexDigit + 0x30: hexDigit -10 + 0x41)); | |
317 | } | |
318 | } | |
319 | dest.append((UChar)0x20); | |
320 | } | |
321 | ||
322 | U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose); | |
323 | ||
324 | // testConfData - Check each data item from the Unicode confusables.txt file, | |
325 | // verify that it transforms correctly in a skeleton. | |
326 | // | |
327 | void IntlTestSpoof::testConfData() { | |
729e4ab9 | 328 | char buffer[2000]; |
b331163b A |
329 | if (getUnidataPath(buffer) == NULL) { |
330 | errln("Skipping test spoof/testConfData. Unable to find path to source/data/unidata/."); | |
331 | return; | |
332 | } | |
729e4ab9 A |
333 | uprv_strcat(buffer, "confusables.txt"); |
334 | ||
335 | LocalStdioFilePointer f(fopen(buffer, "rb")); | |
336 | if (f.isNull()) { | |
337 | errln("Skipping test spoof/testConfData. File confusables.txt not accessible."); | |
338 | return; | |
339 | } | |
340 | fseek(f.getAlias(), 0, SEEK_END); | |
341 | int32_t fileSize = ftell(f.getAlias()); | |
342 | LocalArray<char> fileBuf(new char[fileSize]); | |
343 | fseek(f.getAlias(), 0, SEEK_SET); | |
3d1f044b | 344 | int32_t amt_read = static_cast<int32_t>(fread(fileBuf.getAlias(), 1, fileSize, f.getAlias())); |
729e4ab9 A |
345 | TEST_ASSERT_EQ(amt_read, fileSize); |
346 | TEST_ASSERT(fileSize>0); | |
347 | if (amt_read != fileSize || fileSize <=0) { | |
348 | return; | |
349 | } | |
350 | UnicodeString confusablesTxt = UnicodeString::fromUTF8(StringPiece(fileBuf.getAlias(), fileSize)); | |
351 | ||
b331163b | 352 | UErrorCode status = U_ZERO_ERROR; |
729e4ab9 A |
353 | LocalUSpoofCheckerPointer sc(uspoof_open(&status)); |
354 | TEST_ASSERT_SUCCESS(status); | |
355 | ||
356 | // Parse lines from the confusables.txt file. Example Line: | |
357 | // FF44 ; 0064 ; SL # ( d -> d ) FULLWIDTH .... | |
358 | // Three fields. The hex fields can contain more than one character, | |
359 | // and each character may be more than 4 digits (for supplemntals) | |
360 | // This regular expression matches lines and splits the fields into capture groups. | |
361 | RegexMatcher parseLine("(?m)^([0-9A-F]{4}[^#;]*?);([^#;]*?);([^#]*)", confusablesTxt, 0, status); | |
362 | TEST_ASSERT_SUCCESS(status); | |
363 | while (parseLine.find()) { | |
364 | UnicodeString from = parseHex(parseLine.group(1, status)); | |
365 | if (!Normalizer::isNormalized(from, UNORM_NFD, status)) { | |
366 | // The source character was not NFD. | |
367 | // Skip this case; the first step in obtaining a skeleton is to NFD the input, | |
368 | // so the mapping in this line of confusables.txt will never be applied. | |
369 | continue; | |
370 | } | |
371 | ||
372 | UnicodeString rawExpected = parseHex(parseLine.group(2, status)); | |
373 | UnicodeString expected; | |
374 | Normalizer::decompose(rawExpected, FALSE /*NFD*/, 0, expected, status); | |
375 | TEST_ASSERT_SUCCESS(status); | |
376 | ||
377 | int32_t skeletonType = 0; | |
378 | UnicodeString tableType = parseLine.group(3, status); | |
379 | TEST_ASSERT_SUCCESS(status); | |
380 | if (tableType.indexOf("SL") >= 0) { | |
381 | skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE; | |
382 | } else if (tableType.indexOf("SA") >= 0) { | |
383 | skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE; | |
384 | } else if (tableType.indexOf("ML") >= 0) { | |
385 | skeletonType = 0; | |
386 | } else if (tableType.indexOf("MA") >= 0) { | |
387 | skeletonType = USPOOF_ANY_CASE; | |
388 | } | |
389 | ||
390 | UnicodeString actual; | |
391 | uspoof_getSkeletonUnicodeString(sc.getAlias(), skeletonType, from, actual, &status); | |
392 | TEST_ASSERT_SUCCESS(status); | |
393 | TEST_ASSERT(actual == expected); | |
394 | if (actual != expected) { | |
395 | errln(parseLine.group(0, status)); | |
396 | UnicodeString line = "Actual: "; | |
397 | int i = 0; | |
398 | while (i < actual.length()) { | |
399 | appendHexUChar(line, actual.char32At(i)); | |
400 | i = actual.moveIndex32(i, 1); | |
401 | } | |
402 | errln(line); | |
403 | } | |
404 | if (U_FAILURE(status)) { | |
405 | break; | |
406 | } | |
407 | } | |
408 | } | |
729e4ab9 | 409 | |
51004dcb A |
410 | |
411 | void IntlTestSpoof::testScriptSet() { | |
412 | ScriptSet s1; | |
413 | ScriptSet s2; | |
414 | UErrorCode status = U_ZERO_ERROR; | |
415 | ||
416 | TEST_ASSERT(s1 == s2); | |
417 | s1.set(USCRIPT_ARABIC,status); | |
418 | TEST_ASSERT_SUCCESS(status); | |
419 | TEST_ASSERT(!(s1 == s2)); | |
420 | TEST_ASSERT(s1.test(USCRIPT_ARABIC, status)); | |
421 | TEST_ASSERT(s1.test(USCRIPT_GREEK, status) == FALSE); | |
422 | ||
423 | status = U_ZERO_ERROR; | |
424 | s1.reset(USCRIPT_ARABIC, status); | |
425 | TEST_ASSERT(s1 == s2); | |
426 | ||
427 | status = U_ZERO_ERROR; | |
428 | s1.setAll(); | |
429 | TEST_ASSERT(s1.test(USCRIPT_COMMON, status)); | |
430 | TEST_ASSERT(s1.test(USCRIPT_ETHIOPIC, status)); | |
431 | TEST_ASSERT(s1.test(USCRIPT_CODE_LIMIT, status)); | |
432 | s1.resetAll(); | |
433 | TEST_ASSERT(!s1.test(USCRIPT_COMMON, status)); | |
434 | TEST_ASSERT(!s1.test(USCRIPT_ETHIOPIC, status)); | |
435 | TEST_ASSERT(!s1.test(USCRIPT_CODE_LIMIT, status)); | |
436 | ||
437 | status = U_ZERO_ERROR; | |
438 | s1.set(USCRIPT_TAKRI, status); | |
439 | s1.set(USCRIPT_BLISSYMBOLS, status); | |
440 | s2.setAll(); | |
441 | TEST_ASSERT(s2.contains(s1)); | |
442 | TEST_ASSERT(!s1.contains(s2)); | |
443 | TEST_ASSERT(s2.intersects(s1)); | |
444 | TEST_ASSERT(s1.intersects(s2)); | |
445 | s2.reset(USCRIPT_TAKRI, status); | |
446 | TEST_ASSERT(!s2.contains(s1)); | |
447 | TEST_ASSERT(!s1.contains(s2)); | |
448 | TEST_ASSERT(s1.intersects(s2)); | |
449 | TEST_ASSERT(s2.intersects(s1)); | |
450 | TEST_ASSERT_SUCCESS(status); | |
451 | ||
452 | status = U_ZERO_ERROR; | |
453 | s1.resetAll(); | |
454 | s1.set(USCRIPT_NKO, status); | |
455 | s1.set(USCRIPT_COMMON, status); | |
456 | s2 = s1; | |
457 | TEST_ASSERT(s2 == s1); | |
458 | TEST_ASSERT_EQ(2, s2.countMembers()); | |
459 | s2.intersect(s1); | |
460 | TEST_ASSERT(s2 == s1); | |
461 | s2.setAll(); | |
462 | TEST_ASSERT(!(s2 == s1)); | |
463 | TEST_ASSERT(s2.countMembers() >= USCRIPT_CODE_LIMIT); | |
464 | s2.intersect(s1); | |
465 | TEST_ASSERT(s2 == s1); | |
466 | ||
467 | s2.setAll(); | |
468 | s2.reset(USCRIPT_COMMON, status); | |
469 | s2.intersect(s1); | |
470 | TEST_ASSERT(s2.countMembers() == 1); | |
471 | ||
f3c0d7a5 A |
472 | s1.resetAll(); |
473 | TEST_ASSERT(s1.isEmpty()); | |
474 | s1.set(USCRIPT_LATIN, status); | |
475 | TEST_ASSERT(!s1.isEmpty()); | |
476 | s1.setAll(); | |
477 | TEST_ASSERT(!s1.isEmpty()); | |
478 | TEST_ASSERT_SUCCESS(status); | |
479 | ||
51004dcb A |
480 | s1.resetAll(); |
481 | s1.set(USCRIPT_AFAKA, status); | |
482 | s1.set(USCRIPT_VAI, status); | |
483 | s1.set(USCRIPT_INHERITED, status); | |
484 | int32_t n = -1; | |
485 | for (int32_t i=0; i<4; i++) { | |
486 | n = s1.nextSetBit(n+1); | |
487 | switch (i) { | |
488 | case 0: TEST_ASSERT_EQ(USCRIPT_INHERITED, n); break; | |
489 | case 1: TEST_ASSERT_EQ(USCRIPT_VAI, n); break; | |
490 | case 2: TEST_ASSERT_EQ(USCRIPT_AFAKA, n); break; | |
491 | case 3: TEST_ASSERT_EQ(-1, (int32_t)n); break; | |
492 | default: TEST_ASSERT(FALSE); | |
493 | } | |
494 | } | |
495 | TEST_ASSERT_SUCCESS(status); | |
f3c0d7a5 A |
496 | |
497 | // Script extensions. Depends on data. | |
498 | s1.resetAll(); | |
499 | s1.setScriptExtensions(0x67, status); | |
500 | TEST_ASSERT(s1.countMembers() == 1); | |
501 | TEST_ASSERT(s1.test(USCRIPT_LATIN, status)); | |
502 | TEST_ASSERT_SUCCESS(status); | |
503 | ||
504 | s1.resetAll(); | |
505 | s1.setScriptExtensions(0x303C, status); | |
506 | TEST_ASSERT(s1.countMembers() == 3); | |
507 | TEST_ASSERT(s1.test(USCRIPT_HAN, status)); | |
508 | TEST_ASSERT(s1.test(USCRIPT_HIRAGANA, status)); | |
509 | TEST_ASSERT(s1.test(USCRIPT_KATAKANA, status)); | |
510 | TEST_ASSERT_SUCCESS(status); | |
511 | ||
512 | // Additional tests | |
513 | ScriptSet bitset12; bitset12.set(USCRIPT_LATIN, status).set(USCRIPT_HANGUL, status); | |
514 | ScriptSet bitset2; bitset2.set(USCRIPT_HANGUL, status); | |
515 | TEST_ASSERT(bitset12.contains(bitset2)); | |
516 | TEST_ASSERT(bitset12.contains(bitset12)); | |
517 | TEST_ASSERT(!bitset2.contains(bitset12)); | |
518 | ||
519 | ScriptSet arabSet; arabSet.set(USCRIPT_ARABIC, status); | |
520 | ScriptSet latinSet; latinSet.set(USCRIPT_LATIN, status); | |
521 | UElement arabEl; arabEl.pointer = &arabSet; | |
522 | UElement latinEl; latinEl.pointer = &latinSet; | |
523 | TEST_ASSERT(uhash_compareScriptSet(arabEl, latinEl) < 0); | |
524 | TEST_ASSERT(uhash_compareScriptSet(latinEl, arabEl) > 0); | |
525 | ||
526 | UnicodeString scriptString; | |
527 | bitset12.displayScripts(scriptString); | |
528 | TEST_ASSERT(UNICODE_STRING_SIMPLE("Hang Latn") == scriptString); | |
51004dcb A |
529 | } |
530 | ||
531 | ||
532 | void IntlTestSpoof::testRestrictionLevel() { | |
533 | struct Test { | |
534 | const char *fId; | |
535 | URestrictionLevel fExpectedRestrictionLevel; | |
536 | } tests[] = { | |
537 | {"\\u0061\\u03B3\\u2665", USPOOF_UNRESTRICTIVE}, | |
538 | {"a", USPOOF_ASCII}, | |
57a6839d | 539 | {"\\u03B3", USPOOF_SINGLE_SCRIPT_RESTRICTIVE}, |
51004dcb A |
540 | {"\\u0061\\u30A2\\u30FC", USPOOF_HIGHLY_RESTRICTIVE}, |
541 | {"\\u0061\\u0904", USPOOF_MODERATELY_RESTRICTIVE}, | |
f3c0d7a5 A |
542 | {"\\u0061\\u03B3", USPOOF_MINIMALLY_RESTRICTIVE}, |
543 | {"\\u0061\\u2665", USPOOF_UNRESTRICTIVE}, | |
544 | {"\\u0061\\u303C", USPOOF_HIGHLY_RESTRICTIVE}, | |
545 | {"\\u0061\\u30FC\\u303C", USPOOF_HIGHLY_RESTRICTIVE}, | |
546 | {"\\u0061\\u30FC\\u303C\\u30A2", USPOOF_HIGHLY_RESTRICTIVE}, | |
547 | {"\\u30A2\\u0061\\u30FC\\u303C", USPOOF_HIGHLY_RESTRICTIVE}, | |
548 | {"\\u0061\\u0031\\u0661", USPOOF_MODERATELY_RESTRICTIVE}, | |
549 | {"\\u0061\\u0031\\u0661\\u06F1", USPOOF_MODERATELY_RESTRICTIVE}, | |
550 | {"\\u0661\\u30FC\\u303C\\u0061\\u30A2\\u0031\\u0967\\u06F1", USPOOF_MINIMALLY_RESTRICTIVE}, | |
551 | {"\\u0061\\u30A2\\u30FC\\u303C\\u0031\\u0967\\u0661\\u06F1", USPOOF_MINIMALLY_RESTRICTIVE} | |
51004dcb A |
552 | }; |
553 | char msgBuffer[100]; | |
57a6839d | 554 | URestrictionLevel restrictionLevels[] = { USPOOF_ASCII, USPOOF_SINGLE_SCRIPT_RESTRICTIVE, |
f3c0d7a5 A |
555 | USPOOF_HIGHLY_RESTRICTIVE, USPOOF_MODERATELY_RESTRICTIVE, USPOOF_MINIMALLY_RESTRICTIVE, |
556 | USPOOF_UNRESTRICTIVE}; | |
557 | ||
51004dcb | 558 | UErrorCode status = U_ZERO_ERROR; |
f3c0d7a5 A |
559 | UnicodeSet allowedChars; |
560 | // Allowed Identifier Characters. In addition to the Recommended Set, | |
561 | // allow u303c, which has an interesting script extension of Hani Hira Kana. | |
562 | allowedChars.addAll(*uspoof_getRecommendedUnicodeSet(&status)).add(0x303C); | |
563 | ||
b331163b | 564 | for (int32_t testNum=0; testNum < UPRV_LENGTHOF(tests); testNum++) { |
51004dcb A |
565 | status = U_ZERO_ERROR; |
566 | const Test &test = tests[testNum]; | |
567 | UnicodeString testString = UnicodeString(test.fId).unescape(); | |
568 | URestrictionLevel expectedLevel = test.fExpectedRestrictionLevel; | |
b331163b | 569 | for (int levelIndex=0; levelIndex<UPRV_LENGTHOF(restrictionLevels); levelIndex++) { |
51004dcb A |
570 | status = U_ZERO_ERROR; |
571 | URestrictionLevel levelSetInSpoofChecker = restrictionLevels[levelIndex]; | |
572 | USpoofChecker *sc = uspoof_open(&status); | |
f3c0d7a5 | 573 | uspoof_setAllowedChars(sc, allowedChars.toUSet(), &status); |
51004dcb | 574 | uspoof_setRestrictionLevel(sc, levelSetInSpoofChecker); |
f3c0d7a5 | 575 | uspoof_setChecks(sc, USPOOF_RESTRICTION_LEVEL, &status); |
57a6839d A |
576 | int32_t actualValue = uspoof_checkUnicodeString(sc, testString, NULL, &status); |
577 | ||
51004dcb | 578 | // we want to fail if the text is (say) MODERATE and the testLevel is ASCII |
57a6839d A |
579 | int32_t expectedValue = 0; |
580 | if (expectedLevel > levelSetInSpoofChecker) { | |
581 | expectedValue |= USPOOF_RESTRICTION_LEVEL; | |
582 | } | |
57a6839d A |
583 | sprintf(msgBuffer, "testNum = %d, levelIndex = %d, expected = %#x, actual = %#x", |
584 | testNum, levelIndex, expectedValue, actualValue); | |
585 | TEST_ASSERT_MSG(expectedValue == actualValue, msgBuffer); | |
586 | TEST_ASSERT_SUCCESS(status); | |
587 | ||
588 | // Run the same check again, with the Spoof Checker configured to return | |
589 | // the actual restriction level. | |
f3c0d7a5 | 590 | uspoof_setAllowedChars(sc, allowedChars.toUSet(), &status); |
57a6839d | 591 | uspoof_setRestrictionLevel(sc, levelSetInSpoofChecker); |
f3c0d7a5 | 592 | uspoof_setChecks(sc, USPOOF_AUX_INFO | USPOOF_RESTRICTION_LEVEL, &status); |
57a6839d | 593 | int32_t result = uspoof_checkUnicodeString(sc, testString, NULL, &status); |
51004dcb | 594 | TEST_ASSERT_SUCCESS(status); |
57a6839d A |
595 | if (U_SUCCESS(status)) { |
596 | TEST_ASSERT_EQ(expectedLevel, result & USPOOF_RESTRICTION_LEVEL_MASK); | |
597 | TEST_ASSERT_EQ(expectedValue, result & USPOOF_ALL_CHECKS); | |
598 | } | |
51004dcb A |
599 | uspoof_close(sc); |
600 | } | |
601 | } | |
51004dcb | 602 | |
f3c0d7a5 | 603 | } |
51004dcb A |
604 | |
605 | void IntlTestSpoof::testMixedNumbers() { | |
606 | struct Test { | |
607 | const char *fTestString; | |
608 | const char *fExpectedSet; | |
609 | } tests[] = { | |
610 | {"1", "[0]"}, | |
611 | {"\\u0967", "[\\u0966]"}, | |
612 | {"1\\u0967", "[0\\u0966]"}, | |
f3c0d7a5 A |
613 | {"\\u0661\\u06F1", "[\\u0660\\u06F0]"}, |
614 | {"\\u0061\\u2665", "[]"}, | |
615 | {"\\u0061\\u303C", "[]"}, | |
616 | {"\\u0061\\u30FC\\u303C", "[]"}, | |
617 | {"\\u0061\\u30FC\\u303C\\u30A2", "[]"}, | |
618 | {"\\u30A2\\u0061\\u30FC\\u303C", "[]"}, | |
619 | {"\\u0061\\u0031\\u0661", "[\\u0030\\u0660]"}, | |
620 | {"\\u0061\\u0031\\u0661\\u06F1", "[\\u0030\\u0660\\u06F0]"}, | |
621 | {"\\u0661\\u30FC\\u303C\\u0061\\u30A2\\u0031\\u0967\\u06F1", "[\\u0030\\u0660\\u06F0\\u0966]"}, | |
622 | {"\\u0061\\u30A2\\u30FC\\u303C\\u0031\\u0967\\u0661\\u06F1", "[\\u0030\\u0660\\u06F0\\u0966]"} | |
51004dcb A |
623 | }; |
624 | UErrorCode status = U_ZERO_ERROR; | |
b331163b | 625 | for (int32_t testNum=0; testNum < UPRV_LENGTHOF(tests); testNum++) { |
51004dcb A |
626 | char msgBuf[100]; |
627 | sprintf(msgBuf, "testNum = %d ", testNum); | |
628 | Test &test = tests[testNum]; | |
629 | ||
630 | status = U_ZERO_ERROR; | |
631 | UnicodeString testString = UnicodeString(test.fTestString).unescape(); | |
632 | UnicodeSet expectedSet(UnicodeString(test.fExpectedSet).unescape(), status); | |
51004dcb A |
633 | |
634 | status = U_ZERO_ERROR; | |
f3c0d7a5 A |
635 | TEST_SETUP |
636 | uspoof_setChecks(sc, USPOOF_MIXED_NUMBERS, &status); // only check this | |
637 | uspoof_check2UnicodeString(sc, testString, checkResult, &status); | |
638 | UBool mixedNumberFailure = ((uspoof_getCheckResultChecks(checkResult, &status) & USPOOF_MIXED_NUMBERS) != 0); | |
639 | TEST_ASSERT_MSG((expectedSet.size() > 1) == mixedNumberFailure, msgBuf); | |
640 | const UnicodeSet* actualSet = UnicodeSet::fromUSet(uspoof_getCheckResultNumerics(checkResult, &status)); | |
641 | TEST_ASSERT_MSG(expectedSet == *actualSet, msgBuf); | |
642 | TEST_TEARDOWN | |
51004dcb A |
643 | } |
644 | } | |
645 | ||
f3c0d7a5 A |
646 | // Bug #12153 - uspoof_setRestrictionLevel() should enable restriction level testing. |
647 | // | |
648 | void IntlTestSpoof::testBug12153() { | |
649 | UErrorCode status = U_ZERO_ERROR; | |
650 | LocalUSpoofCheckerPointer sc(uspoof_open(&status)); | |
0f5d89e8 | 651 | if (!assertSuccess("", status, true, __FILE__, __LINE__)) { return; } |
f3c0d7a5 A |
652 | int32_t checks = uspoof_getChecks(sc.getAlias(), &status); |
653 | TEST_ASSERT((checks & USPOOF_RESTRICTION_LEVEL) != 0); | |
654 | checks &= ~USPOOF_RESTRICTION_LEVEL; | |
655 | uspoof_setChecks(sc.getAlias(), checks, &status); | |
656 | checks = uspoof_getChecks(sc.getAlias(), &status); | |
657 | TEST_ASSERT((checks & USPOOF_RESTRICTION_LEVEL) == 0); | |
658 | ||
659 | uspoof_setRestrictionLevel(sc.getAlias(), USPOOF_MODERATELY_RESTRICTIVE); | |
660 | checks = uspoof_getChecks(sc.getAlias(), &status); | |
661 | TEST_ASSERT((checks & USPOOF_RESTRICTION_LEVEL) != 0); | |
662 | TEST_ASSERT_SUCCESS(status); | |
663 | } | |
664 | ||
665 | // uspoof_checkUnicodeString should NOT have an infinite loop. | |
666 | void IntlTestSpoof::testBug12825() { | |
667 | UErrorCode status = U_ZERO_ERROR; | |
668 | LocalUSpoofCheckerPointer sc(uspoof_open(&status)); | |
669 | TEST_ASSERT_SUCCESS(status); | |
670 | uspoof_setChecks(sc.getAlias(), USPOOF_ALL_CHECKS | USPOOF_AUX_INFO, &status); | |
671 | TEST_ASSERT_SUCCESS(status); | |
672 | uspoof_checkUnicodeString(sc.getAlias(), UnicodeString("\\u30FB").unescape(), NULL, &status); | |
673 | TEST_ASSERT_SUCCESS(status); | |
674 | } | |
675 | ||
676 | // uspoof_getSkeleton should NOT set an ILLEGAL_ARGUMENT_EXCEPTION. | |
677 | void IntlTestSpoof::testBug12815() { | |
678 | UErrorCode status = U_ZERO_ERROR; | |
679 | LocalUSpoofCheckerPointer sc(uspoof_open(&status)); | |
680 | TEST_ASSERT_SUCCESS(status); | |
681 | uspoof_setChecks(sc.getAlias(), USPOOF_RESTRICTION_LEVEL, &status); | |
682 | TEST_ASSERT_SUCCESS(status); | |
683 | UnicodeString result; | |
684 | uspoof_getSkeletonUnicodeString(sc.getAlias(), 0, UnicodeString("hello world"), result, &status); | |
685 | TEST_ASSERT_SUCCESS(status); | |
686 | } | |
687 | ||
0f5d89e8 A |
688 | void IntlTestSpoof::testBug13314_MixedNumbers() { |
689 | UErrorCode status = U_ZERO_ERROR; | |
690 | LocalUSpoofCheckerPointer sc(uspoof_open(&status)); | |
691 | if (!assertSuccess("", status, true, __FILE__, __LINE__)) { return; } | |
692 | uspoof_setChecks(sc.getAlias(), USPOOF_ALL_CHECKS, &status); | |
693 | TEST_ASSERT_SUCCESS(status); | |
694 | int32_t failedChecks = uspoof_areConfusableUnicodeString(sc.getAlias(), u"列", u"列", &status); | |
695 | TEST_ASSERT_SUCCESS(status); | |
696 | assertEquals("The CJK strings should be confusable", USPOOF_SINGLE_SCRIPT_CONFUSABLE, failedChecks); | |
697 | failedChecks = uspoof_check2UnicodeString(sc.getAlias(), u"3Ȝ", nullptr, &status); | |
698 | TEST_ASSERT_SUCCESS(status); | |
699 | assertEquals("The '33' string does not fail spoof", 0, failedChecks); | |
700 | } | |
701 | ||
702 | void IntlTestSpoof::testBug13328_MixedCombiningMarks() { | |
703 | UErrorCode status = U_ZERO_ERROR; | |
704 | LocalUSpoofCheckerPointer sc(uspoof_open(&status)); | |
705 | if (!assertSuccess("", status, true, __FILE__, __LINE__)) { return; } | |
706 | int32_t failedChecks = uspoof_check2UnicodeString(sc.getAlias(), u"\u0061\u0F84", nullptr, &status); | |
707 | TEST_ASSERT_SUCCESS(status); | |
708 | assertEquals( | |
709 | "The mismatched combining marks string fails spoof", | |
710 | USPOOF_RESTRICTION_LEVEL, | |
711 | failedChecks); | |
712 | } | |
713 | ||
714 | void IntlTestSpoof::testCombiningDot() { | |
715 | UErrorCode status = U_ZERO_ERROR; | |
716 | LocalUSpoofCheckerPointer sc(uspoof_open(&status)); | |
717 | if (!assertSuccess("", status, true, __FILE__, __LINE__)) { return; } | |
718 | uspoof_setChecks(sc.getAlias(), USPOOF_HIDDEN_OVERLAY, &status); | |
719 | TEST_ASSERT_SUCCESS(status); | |
720 | ||
721 | static const struct TestCase { | |
722 | bool shouldFail; | |
723 | const char16_t* input; | |
724 | } cases[] = { | |
725 | {false, u"i"}, | |
726 | {false, u"j"}, | |
727 | {false, u"l"}, | |
728 | {true, u"i\u0307"}, | |
729 | {true, u"j\u0307"}, | |
730 | {true, u"l\u0307"}, | |
731 | {true, u"ı\u0307"}, | |
732 | {true, u"ȷ\u0307"}, | |
733 | {true, u"𝚤\u0307"}, | |
734 | {true, u"𝑗\u0307"}, | |
735 | {false, u"m\u0307"}, | |
736 | {true, u"1\u0307"}, | |
737 | {true, u"ij\u0307"}, | |
738 | {true, u"i\u0307\u0307"}, | |
739 | {true, u"abci\u0307def"}, | |
740 | {false, u"i\u0301\u0307"}, // U+0301 has combining class ABOVE (230) | |
741 | {true, u"i\u0320\u0307"}, // U+0320 has combining class BELOW | |
742 | {true, u"i\u0320\u0321\u0307"}, // U+0321 also has combining class BELOW | |
743 | {false, u"i\u0320\u0301\u0307"}, | |
744 | {false, u"iz\u0307"}, | |
745 | }; | |
746 | ||
747 | for (auto& cas : cases) { | |
748 | int32_t failedChecks = uspoof_check2(sc.getAlias(), cas.input, -1, nullptr, &status); | |
749 | TEST_ASSERT_SUCCESS(status); | |
750 | int32_t expected = cas.shouldFail ? USPOOF_HIDDEN_OVERLAY : 0; | |
751 | assertEquals(cas.input, expected, failedChecks); | |
752 | } | |
753 | } | |
754 | ||
51004dcb | 755 | #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO */ |