]>
Commit | Line | Data |
---|---|---|
1 | // © 2016 and later: Unicode, Inc. and others. | |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
3 | /* | |
4 | ********************************************************************** | |
5 | * Copyright (C) 2011-2015, International Business Machines Corporation | |
6 | * and others. All Rights Reserved. | |
7 | ********************************************************************** | |
8 | */ | |
9 | /** | |
10 | * IntlTestSpoof tests for USpoofDetector | |
11 | */ | |
12 | ||
13 | #include "unicode/utypes.h" | |
14 | ||
15 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO | |
16 | ||
17 | #include "itspoof.h" | |
18 | ||
19 | #include "unicode/normlzr.h" | |
20 | #include "unicode/regex.h" | |
21 | #include "unicode/unistr.h" | |
22 | #include "unicode/uscript.h" | |
23 | #include "unicode/uspoof.h" | |
24 | ||
25 | #include "cstring.h" | |
26 | #include "scriptset.h" | |
27 | #include "uhash.h" | |
28 | ||
29 | #include <stdlib.h> | |
30 | #include <stdio.h> | |
31 | ||
// Reports (via errcheckln) when `status` holds an ICU failure code.
// The message carries the file, line and symbolic error name; execution continues.
#define TEST_ASSERT_SUCCESS(status) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(status)) { \
        errcheckln(status, "Failure at file %s, line %d, error = %s", __FILE__, __LINE__, u_errorName(status)); \
    } \
} UPRV_BLOCK_MACRO_END

// Reports (via errln) when `expr` evaluates to false/zero.
// The stringified expression is included in the message; execution continues.
#define TEST_ASSERT(expr) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(expr)) { \
        errln("Test Failure at file %s, line %d: \"%s\" is false.", __FILE__, __LINE__, #expr); \
    } \
} UPRV_BLOCK_MACRO_END

// Same check as TEST_ASSERT, but reports through dataerrln and adds the caller-supplied
// `msg` (a C string) to the output.
#define TEST_ASSERT_MSG(expr, msg) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(expr)) { \
        dataerrln("Test Failure at file %s, line %d, %s: \"%s\" is false.", __FILE__, __LINE__, msg, #expr); \
    } \
} UPRV_BLOCK_MACRO_END
49 | ||
// Reports (via errln) when `a` and `b` compare unequal.
// Each operand is evaluated exactly once, so operands with side effects (for example a
// call that updates `status` or an out-parameter) are not re-run while formatting the
// failure message, and the values printed are the values that were compared.
// Both operands are printed with %d, so they are expected to be int-compatible
// (integers or unscoped enums).
#define TEST_ASSERT_EQ(a, b) UPRV_BLOCK_MACRO_BEGIN { \
    const auto testAssertEqLhs_ = (a); \
    const auto testAssertEqRhs_ = (b); \
    if (testAssertEqLhs_ != testAssertEqRhs_) { \
        errln("Test Failure at file %s, line %d: \"%s\" (%d) != \"%s\" (%d)", \
              __FILE__, __LINE__, #a, testAssertEqLhs_, #b, testAssertEqRhs_); \
    } \
} UPRV_BLOCK_MACRO_END
56 | ||
// Reports (via errln) when `a` and `b` compare equal.
// Each operand is evaluated exactly once, so operands with side effects are not re-run
// while formatting the failure message, and the values printed are the values compared.
// Both operands are printed with %d, so they are expected to be int-compatible
// (integers or unscoped enums).
#define TEST_ASSERT_NE(a, b) UPRV_BLOCK_MACRO_BEGIN { \
    const auto testAssertNeLhs_ = (a); \
    const auto testAssertNeRhs_ = (b); \
    if (testAssertNeLhs_ == testAssertNeRhs_) { \
        errln("Test Failure at file %s, line %d: \"%s\" (%d) == \"%s\" (%d)", \
              __FILE__, __LINE__, #a, testAssertNeLhs_, #b, testAssertNeRhs_); \
    } \
} UPRV_BLOCK_MACRO_END
63 | ||
/*
 * TEST_SETUP and TEST_TEARDOWN
 *     Paired macros that supply the boilerplate around a test case; arbitrary test code
 *     goes between them.
 *
 *     TEST_SETUP opens a scope that declares:
 *        status       a UErrorCode, initially U_ZERO_ERROR
 *        sc           the ready-to-go USpoofChecker for use in the tests
 *        checkResult  a USpoofCheckResult
 *     The test code only runs when both objects were opened successfully.
 *
 *     TEST_TEARDOWN reports any failure left in status, closes both objects,
 *     and closes the scope opened by TEST_SETUP.
 */
#define TEST_SETUP UPRV_BLOCK_MACRO_BEGIN { \
    UErrorCode status = U_ZERO_ERROR; \
    USpoofChecker *sc = uspoof_open(&status); \
    TEST_ASSERT_SUCCESS(status); \
    USpoofCheckResult *checkResult = uspoof_openCheckResult(&status); \
    TEST_ASSERT_SUCCESS(status); \
    if (U_SUCCESS(status)){

#define TEST_TEARDOWN \
    } \
    TEST_ASSERT_SUCCESS(status); \
    uspoof_closeCheckResult(checkResult); \
    uspoof_close(sc); \
} UPRV_BLOCK_MACRO_END
86 | ||
87 | ||
88 | ||
89 | ||
90 | void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) | |
91 | { | |
92 | if (exec) { | |
93 | logln("TestSuite spoof: "); | |
94 | } | |
95 | TESTCASE_AUTO_BEGIN; | |
96 | TESTCASE_AUTO(testSpoofAPI); | |
97 | TESTCASE_AUTO(testSkeleton); | |
98 | TESTCASE_AUTO(testAreConfusable); | |
99 | TESTCASE_AUTO(testInvisible); | |
100 | TESTCASE_AUTO(testConfData); | |
101 | TESTCASE_AUTO(testBug8654); | |
102 | TESTCASE_AUTO(testScriptSet); | |
103 | TESTCASE_AUTO(testRestrictionLevel); | |
104 | TESTCASE_AUTO(testMixedNumbers); | |
105 | TESTCASE_AUTO(testBug12153); | |
106 | TESTCASE_AUTO(testBug12825); | |
107 | TESTCASE_AUTO(testBug12815); | |
108 | TESTCASE_AUTO(testBug13314_MixedNumbers); | |
109 | TESTCASE_AUTO(testBug13328_MixedCombiningMarks); | |
110 | TESTCASE_AUTO(testCombiningDot); | |
111 | TESTCASE_AUTO_END; | |
112 | } | |
113 | ||
114 | void IntlTestSpoof::testSpoofAPI() { | |
115 | ||
116 | TEST_SETUP | |
117 | UnicodeString s("xyz"); // Many latin ranges are whole-script confusable with other scripts. | |
118 | // If this test starts failing, consult confusablesWholeScript.txt | |
119 | int32_t position = 666; | |
120 | int32_t checkResults = uspoof_checkUnicodeString(sc, s, &position, &status); | |
121 | TEST_ASSERT_SUCCESS(status); | |
122 | TEST_ASSERT_EQ(0, checkResults); | |
123 | TEST_ASSERT_EQ(0, position); | |
124 | TEST_TEARDOWN; | |
125 | ||
126 | TEST_SETUP | |
127 | UnicodeString s1("cxs"); | |
128 | UnicodeString s2 = UnicodeString("\\u0441\\u0445\\u0455").unescape(); // Cyrillic "cxs" | |
129 | int32_t checkResults = uspoof_areConfusableUnicodeString(sc, s1, s2, &status); | |
130 | TEST_ASSERT_SUCCESS(status); | |
131 | TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults); | |
132 | ||
133 | TEST_TEARDOWN; | |
134 | ||
135 | TEST_SETUP | |
136 | UnicodeString s("I1l0O"); | |
137 | UnicodeString dest; | |
138 | UnicodeString &retStr = uspoof_getSkeletonUnicodeString(sc, USPOOF_ANY_CASE, s, dest, &status); | |
139 | TEST_ASSERT_SUCCESS(status); | |
140 | TEST_ASSERT(UnicodeString("lllOO") == dest); | |
141 | TEST_ASSERT(&dest == &retStr); | |
142 | TEST_TEARDOWN; | |
143 | } | |
144 | ||
145 | ||
146 | #define CHECK_SKELETON(type, input, expected) UPRV_BLOCK_MACRO_BEGIN { \ | |
147 | checkSkeleton(sc, type, input, expected, __LINE__); \ | |
148 | } UPRV_BLOCK_MACRO_END | |
149 | ||
150 | ||
151 | // testSkeleton. Spot check a number of confusable skeleton substitutions from the | |
152 | // Unicode data file confusables.txt | |
153 | // Test cases chosen for substitutions of various lengths, and | |
154 | // membership in different mapping tables. | |
155 | // Note: for ICU 55, all tables collapsed to the MA table data. | |
156 | // TODO: for ICU 56 with Unicode 8, revisit this test. | |
157 | // | |
158 | void IntlTestSpoof::testSkeleton() { | |
159 | const uint32_t ML = 0; | |
160 | const uint32_t SL = USPOOF_SINGLE_SCRIPT_CONFUSABLE; | |
161 | const uint32_t MA = USPOOF_ANY_CASE; | |
162 | const uint32_t SA = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE; | |
163 | ||
164 | TEST_SETUP | |
165 | CHECK_SKELETON(SL, "nochange", "nochange"); | |
166 | CHECK_SKELETON(SA, "nochange", "nochange"); | |
167 | CHECK_SKELETON(ML, "nochange", "nochange"); | |
168 | CHECK_SKELETON(MA, "nochange", "nochange"); | |
169 | CHECK_SKELETON(MA, "love", "love"); | |
170 | CHECK_SKELETON(MA, "1ove", "love"); // Digit 1 to letter l | |
171 | CHECK_SKELETON(ML, "OOPS", "OOPS"); | |
172 | CHECK_SKELETON(ML, "00PS", "OOPS"); | |
173 | CHECK_SKELETON(MA, "OOPS", "OOPS"); | |
174 | CHECK_SKELETON(MA, "00PS", "OOPS"); // Digit 0 to letter O in any case mode only | |
175 | CHECK_SKELETON(SL, "\\u059c", "\\u0301"); | |
176 | CHECK_SKELETON(SL, "\\u2A74", "\\u003A\\u003A\\u003D"); | |
177 | CHECK_SKELETON(SL, "\\u247E", "\\u0028\\u006C\\u006C\\u0029"); // "(ll)" | |
178 | CHECK_SKELETON(SL, "\\uFDFB", "\\u062C\\u0644\\u0020\\u062C\\u0644\\u006c\\u0644\\u006f"); | |
179 | ||
180 | // This mapping exists in the ML and MA tables, does not exist in SL, SA | |
181 | // 0C83 ; 0983 ; ML | |
182 | // 0C83 ; 0983 ; MA | |
183 | // | |
184 | ||
185 | CHECK_SKELETON(SL, "\\u0C83", "\\u0983"); | |
186 | CHECK_SKELETON(SA, "\\u0C83", "\\u0983"); | |
187 | CHECK_SKELETON(ML, "\\u0C83", "\\u0983"); | |
188 | CHECK_SKELETON(MA, "\\u0C83", "\\u0983"); | |
189 | ||
190 | // 0391 mappings exist only in MA and SA tables. | |
191 | CHECK_SKELETON(MA, "\\u0391", "A"); | |
192 | CHECK_SKELETON(SA, "\\u0391", "A"); | |
193 | CHECK_SKELETON(ML, "\\u0391", "A"); | |
194 | CHECK_SKELETON(SL, "\\u0391", "A"); | |
195 | ||
196 | // 13CF Mappings in all four tables, different in MA. | |
197 | CHECK_SKELETON(ML, "\\u13CF", "b"); | |
198 | CHECK_SKELETON(MA, "\\u13CF", "b"); | |
199 | CHECK_SKELETON(SL, "\\u13CF", "b"); | |
200 | CHECK_SKELETON(SA, "\\u13CF", "b"); | |
201 | ||
202 | // 0022 ; 0027 0027 ; | |
203 | // all tables. | |
204 | CHECK_SKELETON(SL, "\\u0022", "\\u0027\\u0027"); | |
205 | CHECK_SKELETON(SA, "\\u0022", "\\u0027\\u0027"); | |
206 | CHECK_SKELETON(ML, "\\u0022", "\\u0027\\u0027"); | |
207 | CHECK_SKELETON(MA, "\\u0022", "\\u0027\\u0027"); | |
208 | ||
209 | // 017F mappings exist only in MA and SA tables. | |
210 | CHECK_SKELETON(MA, "\\u017F", "f"); | |
211 | CHECK_SKELETON(SA, "\\u017F", "f"); | |
212 | CHECK_SKELETON(ML, "\\u017F", "f"); | |
213 | CHECK_SKELETON(SL, "\\u017F", "f"); | |
214 | ||
215 | TEST_TEARDOWN; | |
216 | } | |
217 | ||
218 | ||
219 | // | |
220 | // Run a single confusable skeleton transformation test case. | |
221 | // | |
222 | void IntlTestSpoof::checkSkeleton(const USpoofChecker *sc, uint32_t type, | |
223 | const char *input, const char *expected, int32_t lineNum) { | |
224 | UnicodeString uInput = UnicodeString(input).unescape(); | |
225 | UnicodeString uExpected = UnicodeString(expected).unescape(); | |
226 | ||
227 | UErrorCode status = U_ZERO_ERROR; | |
228 | UnicodeString actual; | |
229 | uspoof_getSkeletonUnicodeString(sc, type, uInput, actual, &status); | |
230 | if (U_FAILURE(status)) { | |
231 | errln("File %s, Line %d, Test case from line %d, status is %s", __FILE__, __LINE__, lineNum, | |
232 | u_errorName(status)); | |
233 | return; | |
234 | } | |
235 | if (uExpected != actual) { | |
236 | errln("File %s, Line %d, Test case from line %d, Actual and Expected skeletons differ.", | |
237 | __FILE__, __LINE__, lineNum); | |
238 | errln(UnicodeString(" Actual Skeleton: \"") + actual + UnicodeString("\"\n") + | |
239 | UnicodeString(" Expected Skeleton: \"") + uExpected + UnicodeString("\"")); | |
240 | } | |
241 | } | |
242 | ||
243 | void IntlTestSpoof::testAreConfusable() { | |
244 | TEST_SETUP | |
245 | UnicodeString s1("A long string that will overflow stack buffers. A long string that will overflow stack buffers. " | |
246 | "A long string that will overflow stack buffers. A long string that will overflow stack buffers. "); | |
247 | UnicodeString s2("A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. " | |
248 | "A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. "); | |
249 | int32_t result = uspoof_areConfusableUnicodeString(sc, s1, s2, &status); | |
250 | TEST_ASSERT_SUCCESS(status); | |
251 | TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, result); | |
252 | ||
253 | TEST_TEARDOWN; | |
254 | } | |
255 | ||
256 | void IntlTestSpoof::testInvisible() { | |
257 | TEST_SETUP | |
258 | UnicodeString s = UnicodeString("abcd\\u0301ef").unescape(); | |
259 | int32_t position = -42; | |
260 | TEST_ASSERT_EQ(0, uspoof_checkUnicodeString(sc, s, &position, &status)); | |
261 | TEST_ASSERT_SUCCESS(status); | |
262 | TEST_ASSERT(0 == position); | |
263 | ||
264 | UnicodeString s2 = UnicodeString("abcd\\u0301\\u0302\\u0301ef").unescape(); | |
265 | TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s2, &position, &status)); | |
266 | TEST_ASSERT_SUCCESS(status); | |
267 | TEST_ASSERT_EQ(0, position); | |
268 | ||
269 | // Two acute accents, one from the composed a with acute accent, \u00e1, | |
270 | // and one separate. | |
271 | position = -42; | |
272 | UnicodeString s3 = UnicodeString("abcd\\u00e1\\u0301xyz").unescape(); | |
273 | TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s3, &position, &status)); | |
274 | TEST_ASSERT_SUCCESS(status); | |
275 | TEST_ASSERT_EQ(0, position); | |
276 | TEST_TEARDOWN; | |
277 | } | |
278 | ||
279 | void IntlTestSpoof::testBug8654() { | |
280 | TEST_SETUP | |
281 | UnicodeString s = UnicodeString("B\\u00c1\\u0301").unescape(); | |
282 | int32_t position = -42; | |
283 | TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s, &position, &status) & USPOOF_INVISIBLE ); | |
284 | TEST_ASSERT_SUCCESS(status); | |
285 | TEST_ASSERT_EQ(0, position); | |
286 | TEST_TEARDOWN; | |
287 | } | |
288 | ||
289 | static UnicodeString parseHex(const UnicodeString &in) { | |
290 | // Convert a series of hex numbers in a Unicode String to a string with the | |
291 | // corresponding characters. | |
292 | // The conversion is _really_ annoying. There must be some function to just do it. | |
293 | UnicodeString result; | |
294 | UChar32 cc = 0; | |
295 | for (int32_t i=0; i<in.length(); i++) { | |
296 | UChar c = in.charAt(i); | |
297 | if (c == 0x20) { // Space | |
298 | if (cc > 0) { | |
299 | result.append(cc); | |
300 | cc = 0; | |
301 | } | |
302 | } else if (c>=0x30 && c<=0x39) { | |
303 | cc = (cc<<4) + (c - 0x30); | |
304 | } else if ((c>=0x41 && c<=0x46) || (c>=0x61 && c<=0x66)) { | |
305 | cc = (cc<<4) + (c & 0x0f)+9; | |
306 | } | |
307 | // else do something with bad input. | |
308 | } | |
309 | if (cc > 0) { | |
310 | result.append(cc); | |
311 | } | |
312 | return result; | |
313 | } | |
314 | ||
315 | ||
316 | // | |
317 | // Append the hex form of a UChar32 to a UnicodeString. | |
318 | // Used in formatting error messages. | |
319 | // Match the formatting of numbers in confusables.txt | |
320 | // Minimum of 4 digits, no leading zeroes for positions 5 and up. | |
321 | // | |
322 | static void appendHexUChar(UnicodeString &dest, UChar32 c) { | |
323 | UBool doZeroes = FALSE; | |
324 | for (int bitNum=28; bitNum>=0; bitNum-=4) { | |
325 | if (bitNum <= 12) { | |
326 | doZeroes = TRUE; | |
327 | } | |
328 | int hexDigit = (c>>bitNum) & 0x0f; | |
329 | if (hexDigit != 0 || doZeroes) { | |
330 | doZeroes = TRUE; | |
331 | dest.append((UChar)(hexDigit<=9? hexDigit + 0x30: hexDigit -10 + 0x41)); | |
332 | } | |
333 | } | |
334 | dest.append((UChar)0x20); | |
335 | } | |
336 | ||
337 | U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose); | |
338 | ||
339 | // testConfData - Check each data item from the Unicode confusables.txt file, | |
340 | // verify that it transforms correctly in a skeleton. | |
341 | // | |
342 | void IntlTestSpoof::testConfData() { | |
343 | char buffer[2000]; | |
344 | if (getUnidataPath(buffer) == NULL) { | |
345 | errln("Skipping test spoof/testConfData. Unable to find path to source/data/unidata/."); | |
346 | return; | |
347 | } | |
348 | uprv_strcat(buffer, "confusables.txt"); | |
349 | ||
350 | LocalStdioFilePointer f(fopen(buffer, "rb")); | |
351 | if (f.isNull()) { | |
352 | errln("Skipping test spoof/testConfData. File confusables.txt not accessible."); | |
353 | return; | |
354 | } | |
355 | fseek(f.getAlias(), 0, SEEK_END); | |
356 | int32_t fileSize = ftell(f.getAlias()); | |
357 | LocalArray<char> fileBuf(new char[fileSize]); | |
358 | fseek(f.getAlias(), 0, SEEK_SET); | |
359 | int32_t amt_read = static_cast<int32_t>(fread(fileBuf.getAlias(), 1, fileSize, f.getAlias())); | |
360 | TEST_ASSERT_EQ(amt_read, fileSize); | |
361 | TEST_ASSERT(fileSize>0); | |
362 | if (amt_read != fileSize || fileSize <=0) { | |
363 | return; | |
364 | } | |
365 | UnicodeString confusablesTxt = UnicodeString::fromUTF8(StringPiece(fileBuf.getAlias(), fileSize)); | |
366 | ||
367 | UErrorCode status = U_ZERO_ERROR; | |
368 | LocalUSpoofCheckerPointer sc(uspoof_open(&status)); | |
369 | TEST_ASSERT_SUCCESS(status); | |
370 | ||
371 | // Parse lines from the confusables.txt file. Example Line: | |
372 | // FF44 ; 0064 ; SL # ( d -> d ) FULLWIDTH .... | |
373 | // Three fields. The hex fields can contain more than one character, | |
374 | // and each character may be more than 4 digits (for supplemntals) | |
375 | // This regular expression matches lines and splits the fields into capture groups. | |
376 | RegexMatcher parseLine("(?m)^([0-9A-F]{4}[^#;]*?);([^#;]*?);([^#]*)", confusablesTxt, 0, status); | |
377 | TEST_ASSERT_SUCCESS(status); | |
378 | while (parseLine.find()) { | |
379 | UnicodeString from = parseHex(parseLine.group(1, status)); | |
380 | if (!Normalizer::isNormalized(from, UNORM_NFD, status)) { | |
381 | // The source character was not NFD. | |
382 | // Skip this case; the first step in obtaining a skeleton is to NFD the input, | |
383 | // so the mapping in this line of confusables.txt will never be applied. | |
384 | continue; | |
385 | } | |
386 | ||
387 | UnicodeString rawExpected = parseHex(parseLine.group(2, status)); | |
388 | UnicodeString expected; | |
389 | Normalizer::decompose(rawExpected, FALSE /*NFD*/, 0, expected, status); | |
390 | TEST_ASSERT_SUCCESS(status); | |
391 | ||
392 | int32_t skeletonType = 0; | |
393 | UnicodeString tableType = parseLine.group(3, status); | |
394 | TEST_ASSERT_SUCCESS(status); | |
395 | if (tableType.indexOf("SL") >= 0) { | |
396 | skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE; | |
397 | } else if (tableType.indexOf("SA") >= 0) { | |
398 | skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE; | |
399 | } else if (tableType.indexOf("ML") >= 0) { | |
400 | skeletonType = 0; | |
401 | } else if (tableType.indexOf("MA") >= 0) { | |
402 | skeletonType = USPOOF_ANY_CASE; | |
403 | } | |
404 | ||
405 | UnicodeString actual; | |
406 | uspoof_getSkeletonUnicodeString(sc.getAlias(), skeletonType, from, actual, &status); | |
407 | TEST_ASSERT_SUCCESS(status); | |
408 | TEST_ASSERT(actual == expected); | |
409 | if (actual != expected) { | |
410 | errln(parseLine.group(0, status)); | |
411 | UnicodeString line = "Actual: "; | |
412 | int i = 0; | |
413 | while (i < actual.length()) { | |
414 | appendHexUChar(line, actual.char32At(i)); | |
415 | i = actual.moveIndex32(i, 1); | |
416 | } | |
417 | errln(line); | |
418 | } | |
419 | if (U_FAILURE(status)) { | |
420 | break; | |
421 | } | |
422 | } | |
423 | } | |
424 | ||
425 | ||
426 | void IntlTestSpoof::testScriptSet() { | |
427 | // ScriptSet::SCRIPT_LIMIT is hardcoded. | |
428 | // Increase it by multiples of 32 if there are too many script codes. | |
429 | TEST_ASSERT(USCRIPT_CODE_LIMIT <= ScriptSet::SCRIPT_LIMIT); | |
430 | // USCRIPT_CODE_LIMIT should include all script codes, | |
431 | // but theoretically the data may define more. | |
432 | TEST_ASSERT(u_getIntPropertyMaxValue(UCHAR_SCRIPT) < ScriptSet::SCRIPT_LIMIT); | |
433 | ||
434 | ScriptSet s1; | |
435 | ScriptSet s2; | |
436 | UErrorCode status = U_ZERO_ERROR; | |
437 | ||
438 | TEST_ASSERT(s1 == s2); | |
439 | s1.set(USCRIPT_ARABIC,status); | |
440 | TEST_ASSERT_SUCCESS(status); | |
441 | TEST_ASSERT(!(s1 == s2)); | |
442 | TEST_ASSERT(s1.test(USCRIPT_ARABIC, status)); | |
443 | TEST_ASSERT(s1.test(USCRIPT_GREEK, status) == FALSE); | |
444 | ||
445 | status = U_ZERO_ERROR; | |
446 | s1.reset(USCRIPT_ARABIC, status); | |
447 | TEST_ASSERT(s1 == s2); | |
448 | ||
449 | static constexpr UScriptCode LAST_SCRIPT_CODE = (UScriptCode)(USCRIPT_CODE_LIMIT - 1); | |
450 | status = U_ZERO_ERROR; | |
451 | s1.setAll(); | |
452 | TEST_ASSERT(s1.test(USCRIPT_COMMON, status)); | |
453 | TEST_ASSERT(s1.test(USCRIPT_ETHIOPIC, status)); | |
454 | TEST_ASSERT(s1.test(LAST_SCRIPT_CODE, status)); | |
455 | s1.resetAll(); | |
456 | TEST_ASSERT(!s1.test(USCRIPT_COMMON, status)); | |
457 | TEST_ASSERT(!s1.test(USCRIPT_ETHIOPIC, status)); | |
458 | TEST_ASSERT(!s1.test(LAST_SCRIPT_CODE, status)); | |
459 | ||
460 | status = U_ZERO_ERROR; | |
461 | s1.set(USCRIPT_TAKRI, status); | |
462 | s1.set(USCRIPT_BLISSYMBOLS, status); | |
463 | s2.setAll(); | |
464 | TEST_ASSERT(s2.contains(s1)); | |
465 | TEST_ASSERT(!s1.contains(s2)); | |
466 | TEST_ASSERT(s2.intersects(s1)); | |
467 | TEST_ASSERT(s1.intersects(s2)); | |
468 | s2.reset(USCRIPT_TAKRI, status); | |
469 | TEST_ASSERT(!s2.contains(s1)); | |
470 | TEST_ASSERT(!s1.contains(s2)); | |
471 | TEST_ASSERT(s1.intersects(s2)); | |
472 | TEST_ASSERT(s2.intersects(s1)); | |
473 | TEST_ASSERT_SUCCESS(status); | |
474 | ||
475 | status = U_ZERO_ERROR; | |
476 | s1.resetAll(); | |
477 | s1.set(USCRIPT_NKO, status); | |
478 | s1.set(USCRIPT_COMMON, status); | |
479 | s2 = s1; | |
480 | TEST_ASSERT(s2 == s1); | |
481 | TEST_ASSERT_EQ(2, s2.countMembers()); | |
482 | s2.intersect(s1); | |
483 | TEST_ASSERT(s2 == s1); | |
484 | s2.setAll(); | |
485 | TEST_ASSERT(!(s2 == s1)); | |
486 | TEST_ASSERT(s2.countMembers() >= USCRIPT_CODE_LIMIT); | |
487 | s2.intersect(s1); | |
488 | TEST_ASSERT(s2 == s1); | |
489 | ||
490 | s2.setAll(); | |
491 | s2.reset(USCRIPT_COMMON, status); | |
492 | s2.intersect(s1); | |
493 | TEST_ASSERT(s2.countMembers() == 1); | |
494 | ||
495 | s1.resetAll(); | |
496 | TEST_ASSERT(s1.isEmpty()); | |
497 | s1.set(USCRIPT_LATIN, status); | |
498 | TEST_ASSERT(!s1.isEmpty()); | |
499 | s1.setAll(); | |
500 | TEST_ASSERT(!s1.isEmpty()); | |
501 | TEST_ASSERT_SUCCESS(status); | |
502 | ||
503 | s1.resetAll(); | |
504 | s1.set(USCRIPT_AFAKA, status); | |
505 | s1.set(USCRIPT_VAI, status); | |
506 | s1.set(USCRIPT_INHERITED, status); | |
507 | int32_t n = -1; | |
508 | for (int32_t i=0; i<4; i++) { | |
509 | n = s1.nextSetBit(n+1); | |
510 | switch (i) { | |
511 | case 0: TEST_ASSERT_EQ(USCRIPT_INHERITED, n); break; | |
512 | case 1: TEST_ASSERT_EQ(USCRIPT_VAI, n); break; | |
513 | case 2: TEST_ASSERT_EQ(USCRIPT_AFAKA, n); break; | |
514 | case 3: TEST_ASSERT_EQ(-1, (int32_t)n); break; | |
515 | default: TEST_ASSERT(FALSE); | |
516 | } | |
517 | } | |
518 | TEST_ASSERT_SUCCESS(status); | |
519 | ||
520 | // Script extensions. Depends on data. | |
521 | s1.resetAll(); | |
522 | s1.setScriptExtensions(0x67, status); | |
523 | TEST_ASSERT(s1.countMembers() == 1); | |
524 | TEST_ASSERT(s1.test(USCRIPT_LATIN, status)); | |
525 | TEST_ASSERT_SUCCESS(status); | |
526 | ||
527 | s1.resetAll(); | |
528 | s1.setScriptExtensions(0x303C, status); | |
529 | TEST_ASSERT(s1.countMembers() == 3); | |
530 | TEST_ASSERT(s1.test(USCRIPT_HAN, status)); | |
531 | TEST_ASSERT(s1.test(USCRIPT_HIRAGANA, status)); | |
532 | TEST_ASSERT(s1.test(USCRIPT_KATAKANA, status)); | |
533 | TEST_ASSERT_SUCCESS(status); | |
534 | ||
535 | // Additional tests | |
536 | ScriptSet bitset12; bitset12.set(USCRIPT_LATIN, status).set(USCRIPT_HANGUL, status); | |
537 | ScriptSet bitset2; bitset2.set(USCRIPT_HANGUL, status); | |
538 | TEST_ASSERT(bitset12.contains(bitset2)); | |
539 | TEST_ASSERT(bitset12.contains(bitset12)); | |
540 | TEST_ASSERT(!bitset2.contains(bitset12)); | |
541 | ||
542 | ScriptSet arabSet; arabSet.set(USCRIPT_ARABIC, status); | |
543 | ScriptSet latinSet; latinSet.set(USCRIPT_LATIN, status); | |
544 | UElement arabEl; arabEl.pointer = &arabSet; | |
545 | UElement latinEl; latinEl.pointer = &latinSet; | |
546 | TEST_ASSERT(uhash_compareScriptSet(arabEl, latinEl) < 0); | |
547 | TEST_ASSERT(uhash_compareScriptSet(latinEl, arabEl) > 0); | |
548 | ||
549 | UnicodeString scriptString; | |
550 | bitset12.displayScripts(scriptString); | |
551 | TEST_ASSERT(UNICODE_STRING_SIMPLE("Hang Latn") == scriptString); | |
552 | } | |
553 | ||
554 | ||
555 | void IntlTestSpoof::testRestrictionLevel() { | |
556 | struct Test { | |
557 | const char *fId; | |
558 | URestrictionLevel fExpectedRestrictionLevel; | |
559 | } tests[] = { | |
560 | {"\\u0061\\u03B3\\u2665", USPOOF_UNRESTRICTIVE}, | |
561 | {"a", USPOOF_ASCII}, | |
562 | {"\\u03B3", USPOOF_SINGLE_SCRIPT_RESTRICTIVE}, | |
563 | {"\\u0061\\u30A2\\u30FC", USPOOF_HIGHLY_RESTRICTIVE}, | |
564 | {"\\u0061\\u0904", USPOOF_MODERATELY_RESTRICTIVE}, | |
565 | {"\\u0061\\u03B3", USPOOF_MINIMALLY_RESTRICTIVE}, | |
566 | {"\\u0061\\u2665", USPOOF_UNRESTRICTIVE}, | |
567 | {"\\u0061\\u303C", USPOOF_HIGHLY_RESTRICTIVE}, | |
568 | {"\\u0061\\u30FC\\u303C", USPOOF_HIGHLY_RESTRICTIVE}, | |
569 | {"\\u0061\\u30FC\\u303C\\u30A2", USPOOF_HIGHLY_RESTRICTIVE}, | |
570 | {"\\u30A2\\u0061\\u30FC\\u303C", USPOOF_HIGHLY_RESTRICTIVE}, | |
571 | {"\\u0061\\u0031\\u0661", USPOOF_MODERATELY_RESTRICTIVE}, | |
572 | {"\\u0061\\u0031\\u0661\\u06F1", USPOOF_MODERATELY_RESTRICTIVE}, | |
573 | {"\\u0661\\u30FC\\u303C\\u0061\\u30A2\\u0031\\u0967\\u06F1", USPOOF_MINIMALLY_RESTRICTIVE}, | |
574 | {"\\u0061\\u30A2\\u30FC\\u303C\\u0031\\u0967\\u0661\\u06F1", USPOOF_MINIMALLY_RESTRICTIVE} | |
575 | }; | |
576 | char msgBuffer[100]; | |
577 | URestrictionLevel restrictionLevels[] = { USPOOF_ASCII, USPOOF_SINGLE_SCRIPT_RESTRICTIVE, | |
578 | USPOOF_HIGHLY_RESTRICTIVE, USPOOF_MODERATELY_RESTRICTIVE, USPOOF_MINIMALLY_RESTRICTIVE, | |
579 | USPOOF_UNRESTRICTIVE}; | |
580 | ||
581 | UErrorCode status = U_ZERO_ERROR; | |
582 | UnicodeSet allowedChars; | |
583 | // Allowed Identifier Characters. In addition to the Recommended Set, | |
584 | // allow u303c, which has an interesting script extension of Hani Hira Kana. | |
585 | allowedChars.addAll(*uspoof_getRecommendedUnicodeSet(&status)).add(0x303C); | |
586 | ||
587 | for (int32_t testNum=0; testNum < UPRV_LENGTHOF(tests); testNum++) { | |
588 | status = U_ZERO_ERROR; | |
589 | const Test &test = tests[testNum]; | |
590 | UnicodeString testString = UnicodeString(test.fId).unescape(); | |
591 | URestrictionLevel expectedLevel = test.fExpectedRestrictionLevel; | |
592 | for (int levelIndex=0; levelIndex<UPRV_LENGTHOF(restrictionLevels); levelIndex++) { | |
593 | status = U_ZERO_ERROR; | |
594 | URestrictionLevel levelSetInSpoofChecker = restrictionLevels[levelIndex]; | |
595 | USpoofChecker *sc = uspoof_open(&status); | |
596 | uspoof_setAllowedChars(sc, allowedChars.toUSet(), &status); | |
597 | uspoof_setRestrictionLevel(sc, levelSetInSpoofChecker); | |
598 | uspoof_setChecks(sc, USPOOF_RESTRICTION_LEVEL, &status); | |
599 | int32_t actualValue = uspoof_checkUnicodeString(sc, testString, NULL, &status); | |
600 | ||
601 | // we want to fail if the text is (say) MODERATE and the testLevel is ASCII | |
602 | int32_t expectedValue = 0; | |
603 | if (expectedLevel > levelSetInSpoofChecker) { | |
604 | expectedValue |= USPOOF_RESTRICTION_LEVEL; | |
605 | } | |
606 | sprintf(msgBuffer, "testNum = %d, levelIndex = %d, expected = %#x, actual = %#x", | |
607 | testNum, levelIndex, expectedValue, actualValue); | |
608 | TEST_ASSERT_MSG(expectedValue == actualValue, msgBuffer); | |
609 | TEST_ASSERT_SUCCESS(status); | |
610 | ||
611 | // Run the same check again, with the Spoof Checker configured to return | |
612 | // the actual restriction level. | |
613 | uspoof_setAllowedChars(sc, allowedChars.toUSet(), &status); | |
614 | uspoof_setRestrictionLevel(sc, levelSetInSpoofChecker); | |
615 | uspoof_setChecks(sc, USPOOF_AUX_INFO | USPOOF_RESTRICTION_LEVEL, &status); | |
616 | int32_t result = uspoof_checkUnicodeString(sc, testString, NULL, &status); | |
617 | TEST_ASSERT_SUCCESS(status); | |
618 | if (U_SUCCESS(status)) { | |
619 | TEST_ASSERT_EQ(expectedLevel, result & USPOOF_RESTRICTION_LEVEL_MASK); | |
620 | TEST_ASSERT_EQ(expectedValue, result & USPOOF_ALL_CHECKS); | |
621 | } | |
622 | uspoof_close(sc); | |
623 | } | |
624 | } | |
625 | ||
626 | } | |
627 | ||
628 | void IntlTestSpoof::testMixedNumbers() { | |
629 | struct Test { | |
630 | const char *fTestString; | |
631 | const char *fExpectedSet; | |
632 | } tests[] = { | |
633 | {"1", "[0]"}, | |
634 | {"\\u0967", "[\\u0966]"}, | |
635 | {"1\\u0967", "[0\\u0966]"}, | |
636 | {"\\u0661\\u06F1", "[\\u0660\\u06F0]"}, | |
637 | {"\\u0061\\u2665", "[]"}, | |
638 | {"\\u0061\\u303C", "[]"}, | |
639 | {"\\u0061\\u30FC\\u303C", "[]"}, | |
640 | {"\\u0061\\u30FC\\u303C\\u30A2", "[]"}, | |
641 | {"\\u30A2\\u0061\\u30FC\\u303C", "[]"}, | |
642 | {"\\u0061\\u0031\\u0661", "[\\u0030\\u0660]"}, | |
643 | {"\\u0061\\u0031\\u0661\\u06F1", "[\\u0030\\u0660\\u06F0]"}, | |
644 | {"\\u0661\\u30FC\\u303C\\u0061\\u30A2\\u0031\\u0967\\u06F1", "[\\u0030\\u0660\\u06F0\\u0966]"}, | |
645 | {"\\u0061\\u30A2\\u30FC\\u303C\\u0031\\u0967\\u0661\\u06F1", "[\\u0030\\u0660\\u06F0\\u0966]"} | |
646 | }; | |
647 | UErrorCode status = U_ZERO_ERROR; | |
648 | for (int32_t testNum=0; testNum < UPRV_LENGTHOF(tests); testNum++) { | |
649 | char msgBuf[100]; | |
650 | sprintf(msgBuf, "testNum = %d ", testNum); | |
651 | Test &test = tests[testNum]; | |
652 | ||
653 | status = U_ZERO_ERROR; | |
654 | UnicodeString testString = UnicodeString(test.fTestString).unescape(); | |
655 | UnicodeSet expectedSet(UnicodeString(test.fExpectedSet).unescape(), status); | |
656 | ||
657 | status = U_ZERO_ERROR; | |
658 | TEST_SETUP | |
659 | uspoof_setChecks(sc, USPOOF_MIXED_NUMBERS, &status); // only check this | |
660 | uspoof_check2UnicodeString(sc, testString, checkResult, &status); | |
661 | UBool mixedNumberFailure = ((uspoof_getCheckResultChecks(checkResult, &status) & USPOOF_MIXED_NUMBERS) != 0); | |
662 | TEST_ASSERT_MSG((expectedSet.size() > 1) == mixedNumberFailure, msgBuf); | |
663 | const UnicodeSet* actualSet = UnicodeSet::fromUSet(uspoof_getCheckResultNumerics(checkResult, &status)); | |
664 | TEST_ASSERT_MSG(expectedSet == *actualSet, msgBuf); | |
665 | TEST_TEARDOWN; | |
666 | } | |
667 | } | |
668 | ||
669 | // Bug #12153 - uspoof_setRestrictionLevel() should enable restriction level testing. | |
670 | // | |
671 | void IntlTestSpoof::testBug12153() { | |
672 | UErrorCode status = U_ZERO_ERROR; | |
673 | LocalUSpoofCheckerPointer sc(uspoof_open(&status)); | |
674 | if (!assertSuccess("", status, true, __FILE__, __LINE__)) { return; } | |
675 | int32_t checks = uspoof_getChecks(sc.getAlias(), &status); | |
676 | TEST_ASSERT((checks & USPOOF_RESTRICTION_LEVEL) != 0); | |
677 | checks &= ~USPOOF_RESTRICTION_LEVEL; | |
678 | uspoof_setChecks(sc.getAlias(), checks, &status); | |
679 | checks = uspoof_getChecks(sc.getAlias(), &status); | |
680 | TEST_ASSERT((checks & USPOOF_RESTRICTION_LEVEL) == 0); | |
681 | ||
682 | uspoof_setRestrictionLevel(sc.getAlias(), USPOOF_MODERATELY_RESTRICTIVE); | |
683 | checks = uspoof_getChecks(sc.getAlias(), &status); | |
684 | TEST_ASSERT((checks & USPOOF_RESTRICTION_LEVEL) != 0); | |
685 | TEST_ASSERT_SUCCESS(status); | |
686 | } | |
687 | ||
688 | // uspoof_checkUnicodeString should NOT have an infinite loop. | |
689 | void IntlTestSpoof::testBug12825() { | |
690 | UErrorCode status = U_ZERO_ERROR; | |
691 | LocalUSpoofCheckerPointer sc(uspoof_open(&status)); | |
692 | TEST_ASSERT_SUCCESS(status); | |
693 | uspoof_setChecks(sc.getAlias(), USPOOF_ALL_CHECKS | USPOOF_AUX_INFO, &status); | |
694 | TEST_ASSERT_SUCCESS(status); | |
695 | uspoof_checkUnicodeString(sc.getAlias(), UnicodeString("\\u30FB").unescape(), NULL, &status); | |
696 | TEST_ASSERT_SUCCESS(status); | |
697 | } | |
698 | ||
699 | // uspoof_getSkeleton should NOT set an ILLEGAL_ARGUMENT_EXCEPTION. | |
700 | void IntlTestSpoof::testBug12815() { | |
701 | UErrorCode status = U_ZERO_ERROR; | |
702 | LocalUSpoofCheckerPointer sc(uspoof_open(&status)); | |
703 | TEST_ASSERT_SUCCESS(status); | |
704 | uspoof_setChecks(sc.getAlias(), USPOOF_RESTRICTION_LEVEL, &status); | |
705 | TEST_ASSERT_SUCCESS(status); | |
706 | UnicodeString result; | |
707 | uspoof_getSkeletonUnicodeString(sc.getAlias(), 0, UnicodeString("hello world"), result, &status); | |
708 | TEST_ASSERT_SUCCESS(status); | |
709 | } | |
710 | ||
711 | void IntlTestSpoof::testBug13314_MixedNumbers() { | |
712 | UErrorCode status = U_ZERO_ERROR; | |
713 | LocalUSpoofCheckerPointer sc(uspoof_open(&status)); | |
714 | if (!assertSuccess("", status, true, __FILE__, __LINE__)) { return; } | |
715 | uspoof_setChecks(sc.getAlias(), USPOOF_ALL_CHECKS, &status); | |
716 | TEST_ASSERT_SUCCESS(status); | |
717 | int32_t failedChecks = uspoof_areConfusableUnicodeString(sc.getAlias(), u"列", u"列", &status); | |
718 | TEST_ASSERT_SUCCESS(status); | |
719 | assertEquals("The CJK strings should be confusable", USPOOF_SINGLE_SCRIPT_CONFUSABLE, failedChecks); | |
720 | failedChecks = uspoof_check2UnicodeString(sc.getAlias(), u"3Ȝ", nullptr, &status); | |
721 | TEST_ASSERT_SUCCESS(status); | |
722 | assertEquals("The '33' string does not fail spoof", 0, failedChecks); | |
723 | } | |
724 | ||
725 | void IntlTestSpoof::testBug13328_MixedCombiningMarks() { | |
726 | UErrorCode status = U_ZERO_ERROR; | |
727 | LocalUSpoofCheckerPointer sc(uspoof_open(&status)); | |
728 | if (!assertSuccess("", status, true, __FILE__, __LINE__)) { return; } | |
729 | int32_t failedChecks = uspoof_check2UnicodeString(sc.getAlias(), u"\u0061\u0F84", nullptr, &status); | |
730 | TEST_ASSERT_SUCCESS(status); | |
731 | assertEquals( | |
732 | "The mismatched combining marks string fails spoof", | |
733 | USPOOF_RESTRICTION_LEVEL, | |
734 | failedChecks); | |
735 | } | |
736 | ||
737 | void IntlTestSpoof::testCombiningDot() { | |
738 | UErrorCode status = U_ZERO_ERROR; | |
739 | LocalUSpoofCheckerPointer sc(uspoof_open(&status)); | |
740 | if (!assertSuccess("", status, true, __FILE__, __LINE__)) { return; } | |
741 | uspoof_setChecks(sc.getAlias(), USPOOF_HIDDEN_OVERLAY, &status); | |
742 | TEST_ASSERT_SUCCESS(status); | |
743 | ||
744 | static const struct TestCase { | |
745 | bool shouldFail; | |
746 | const char16_t* input; | |
747 | } cases[] = { | |
748 | {false, u"i"}, | |
749 | {false, u"j"}, | |
750 | {false, u"l"}, | |
751 | {true, u"i\u0307"}, | |
752 | {true, u"j\u0307"}, | |
753 | {true, u"l\u0307"}, | |
754 | {true, u"ı\u0307"}, | |
755 | {true, u"ȷ\u0307"}, | |
756 | {true, u"𝚤\u0307"}, | |
757 | {true, u"𝑗\u0307"}, | |
758 | {false, u"m\u0307"}, | |
759 | {true, u"1\u0307"}, | |
760 | {true, u"ij\u0307"}, | |
761 | {true, u"i\u0307\u0307"}, | |
762 | {true, u"abci\u0307def"}, | |
763 | {false, u"i\u0301\u0307"}, // U+0301 has combining class ABOVE (230) | |
764 | {true, u"i\u0320\u0307"}, // U+0320 has combining class BELOW | |
765 | {true, u"i\u0320\u0321\u0307"}, // U+0321 also has combining class BELOW | |
766 | {false, u"i\u0320\u0301\u0307"}, | |
767 | {false, u"iz\u0307"}, | |
768 | }; | |
769 | ||
770 | for (auto& cas : cases) { | |
771 | int32_t failedChecks = uspoof_check2(sc.getAlias(), cas.input, -1, nullptr, &status); | |
772 | TEST_ASSERT_SUCCESS(status); | |
773 | int32_t expected = cas.shouldFail ? USPOOF_HIDDEN_OVERLAY : 0; | |
774 | assertEquals(cas.input, expected, failedChecks); | |
775 | } | |
776 | } | |
777 | ||
778 | #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO */ |