1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 2011-2015, International Business Machines Corporation
6 * and others. All Rights Reserved.
7 **********************************************************************
10 * IntlTestSpoof tests for USpoofDetector
13 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO
19 #include "unicode/normlzr.h"
20 #include "unicode/regex.h"
21 #include "unicode/unistr.h"
22 #include "unicode/uscript.h"
23 #include "unicode/uspoof.h"
26 #include "scriptset.h"
// TEST_ASSERT_SUCCESS(status): when U_FAILURE(status) is true, reports the failure through
// errcheckln() together with the current file, line and the error name from u_errorName().
32 #define TEST_ASSERT_SUCCESS(status) UPRV_BLOCK_MACRO_BEGIN { \
33 if (U_FAILURE(status)) { \
34 errcheckln(status, "Failure at file %s, line %d, error = %s", __FILE__, __LINE__, u_errorName(status)); \
36 } UPRV_BLOCK_MACRO_END
// TEST_ASSERT(expr): when (expr) compares equal to FALSE, reports a test failure through errln()
// with the current file, line and the stringized expression.
38 #define TEST_ASSERT(expr) UPRV_BLOCK_MACRO_BEGIN { \
39 if ((expr)==FALSE) { \
40 errln("Test Failure at file %s, line %d: \"%s\" is false.", __FILE__, __LINE__, #expr); \
42 } UPRV_BLOCK_MACRO_END
// TEST_ASSERT_MSG(expr, msg): like TEST_ASSERT, but reports through dataerrln() and includes the
// caller-supplied C string `msg` in the failure text.
44 #define TEST_ASSERT_MSG(expr, msg) UPRV_BLOCK_MACRO_BEGIN { \
45 if ((expr)==FALSE) { \
46 dataerrln("Test Failure at file %s, line %d, %s: \"%s\" is false.", __FILE__, __LINE__, msg, #expr); \
48 } UPRV_BLOCK_MACRO_END
// TEST_ASSERT_EQ(a, b): reports a failure through errln(), printing both expressions and their
// values with %d, so both arguments must be int-compatible.
// NOTE(review): the guarding condition is not visible in this view; presumably (a) != (b) — confirm.
50 #define TEST_ASSERT_EQ(a, b) UPRV_BLOCK_MACRO_BEGIN { \
52 errln("Test Failure at file %s, line %d: \"%s\" (%d) != \"%s\" (%d)", \
53 __FILE__, __LINE__, #a, (a), #b, (b)); \
55 } UPRV_BLOCK_MACRO_END
// TEST_ASSERT_NE(a, b): counterpart of TEST_ASSERT_EQ; reports a failure through errln(), printing
// both expressions and their values with %d.
// NOTE(review): the guarding condition is not visible in this view; presumably (a) == (b) — confirm.
57 #define TEST_ASSERT_NE(a, b) UPRV_BLOCK_MACRO_BEGIN { \
59 errln("Test Failure at file %s, line %d: \"%s\" (%d) == \"%s\" (%d)", \
60 __FILE__, __LINE__, #a, (a), #b, (b)); \
62 } UPRV_BLOCK_MACRO_END
65 * TEST_SETUP and TEST_TEARDOWN
66 * macros to handle the boilerplate around setting up test case.
67 * Put arbitrary test code between SETUP and TEARDOWN.
68 * "sc" is the ready-to-go SpoofChecker for use in the tests.
// TEST_SETUP: opens a block that declares `status`, opens the spoof checker `sc` with uspoof_open()
// and a USpoofCheckResult `checkResult` with uspoof_openCheckResult(), asserting success after each.
// The macro ends with an unclosed `if (U_SUCCESS(status)){`, so the test code that follows only runs
// when both opens succeeded; TEST_TEARDOWN is expected to close what this macro leaves open.
// NOTE(review): the declaration of `sc` is not visible in this view.
70 #define TEST_SETUP UPRV_BLOCK_MACRO_BEGIN { \
71 UErrorCode status = U_ZERO_ERROR; \
73 sc = uspoof_open(&status); \
74 TEST_ASSERT_SUCCESS(status); \
75 USpoofCheckResult *checkResult; \
76 checkResult = uspoof_openCheckResult(&status); \
77 TEST_ASSERT_SUCCESS(status); \
78 if (U_SUCCESS(status)){
// TEST_TEARDOWN: asserts that `status` is still a success code, releases `checkResult` with
// uspoof_closeCheckResult() and ends the block macro started by TEST_SETUP.
// NOTE(review): the lines that close the `if` block and release `sc` are not visible in this view.
80 #define TEST_TEARDOWN \
82 TEST_ASSERT_SUCCESS(status); \
83 uspoof_closeCheckResult(checkResult); \
85 } UPRV_BLOCK_MACRO_END
// Test-suite dispatcher: logs the suite name and lists every spoof test method through
// TESTCASE_AUTO. The fourth parameter (par) is unused, as its commented-out name shows.
// NOTE(review): the surrounding braces and any TESTCASE_AUTO_BEGIN/END lines are not visible here.
90 void IntlTestSpoof::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* /*par*/ )
93 logln("TestSuite spoof: ");
96 TESTCASE_AUTO(testSpoofAPI
);
97 TESTCASE_AUTO(testSkeleton
);
98 TESTCASE_AUTO(testAreConfusable
);
99 TESTCASE_AUTO(testInvisible
);
100 TESTCASE_AUTO(testConfData
);
101 TESTCASE_AUTO(testBug8654
);
102 TESTCASE_AUTO(testScriptSet
);
103 TESTCASE_AUTO(testRestrictionLevel
);
104 TESTCASE_AUTO(testMixedNumbers
);
105 TESTCASE_AUTO(testBug12153
);
106 TESTCASE_AUTO(testBug12825
);
107 TESTCASE_AUTO(testBug12815
);
108 TESTCASE_AUTO(testBug13314_MixedNumbers
);
109 TESTCASE_AUTO(testBug13328_MixedCombiningMarks
);
110 TESTCASE_AUTO(testCombiningDot
);
// Smoke test of the UnicodeString flavours of the spoof API. Three independent checks:
//   1. uspoof_checkUnicodeString() on "xyz" returns 0 and resets `position` to 0.
//   2. Latin "cxs" and Cyrillic "cxs" are reported as both mixed-script and whole-script confusable.
//   3. The any-case skeleton of "I1l0O" is "lllOO", and the returned reference is `dest` itself.
// NOTE(review): `sc`, `status` and `dest` are declared outside the lines visible here
// (presumably by TEST_SETUP and a local declaration) — confirm.
114 void IntlTestSpoof::testSpoofAPI() {
117 UnicodeString
s("xyz"); // Many latin ranges are whole-script confusable with other scripts.
118 // If this test starts failing, consult confusablesWholeScript.txt
// Start from a sentinel value so the assertion below proves the API wrote to `position`.
119 int32_t position
= 666;
120 int32_t checkResults
= uspoof_checkUnicodeString(sc
, s
, &position
, &status
);
121 TEST_ASSERT_SUCCESS(status
);
122 TEST_ASSERT_EQ(0, checkResults
);
123 TEST_ASSERT_EQ(0, position
);
127 UnicodeString
s1("cxs");
128 UnicodeString s2
= UnicodeString("\\u0441\\u0445\\u0455").unescape(); // Cyrillic "cxs"
129 int32_t checkResults
= uspoof_areConfusableUnicodeString(sc
, s1
, s2
, &status
);
130 TEST_ASSERT_SUCCESS(status
);
131 TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE
| USPOOF_WHOLE_SCRIPT_CONFUSABLE
, checkResults
);
136 UnicodeString
s("I1l0O");
138 UnicodeString
&retStr
= uspoof_getSkeletonUnicodeString(sc
, USPOOF_ANY_CASE
, s
, dest
, &status
);
139 TEST_ASSERT_SUCCESS(status
);
140 TEST_ASSERT(UnicodeString("lllOO") == dest
);
// The function must return a reference to the caller's destination string, not a copy.
141 TEST_ASSERT(&dest
== &retStr
);
// CHECK_SKELETON(type, input, expected): forwards to checkSkeleton() using the in-scope checker
// `sc`, passing __LINE__ so that failures can be traced back to the invoking test case.
146 #define CHECK_SKELETON(type, input, expected) UPRV_BLOCK_MACRO_BEGIN { \
147 checkSkeleton(sc, type, input, expected, __LINE__); \
148 } UPRV_BLOCK_MACRO_END
151 // testSkeleton. Spot check a number of confusable skeleton substitutions from the
152 // Unicode data file confusables.txt
153 // Test cases chosen for substitutions of various lengths, and
154 // membership in different mapping tables.
155 // Note: for ICU 55, all tables collapsed to the MA table data.
156 // TODO: for ICU 56 with Unicode 8, revisit this test.
//
// Each CHECK_SKELETON(type, input, expected) line below is one test case; `input` and
// `expected` are escaped strings that checkSkeleton() unescapes before comparing.
// The local constants name the four skeleton type combinations: S/M = with/without
// USPOOF_SINGLE_SCRIPT_CONFUSABLE, L/A = without/with USPOOF_ANY_CASE.
// NOTE(review): `sc` is declared outside the lines visible here (presumably by TEST_SETUP).
158 void IntlTestSpoof::testSkeleton() {
159 const uint32_t ML
= 0;
160 const uint32_t SL
= USPOOF_SINGLE_SCRIPT_CONFUSABLE
;
161 const uint32_t MA
= USPOOF_ANY_CASE
;
162 const uint32_t SA
= USPOOF_SINGLE_SCRIPT_CONFUSABLE
| USPOOF_ANY_CASE
;
165 CHECK_SKELETON(SL
, "nochange", "nochange");
166 CHECK_SKELETON(SA
, "nochange", "nochange");
167 CHECK_SKELETON(ML
, "nochange", "nochange");
168 CHECK_SKELETON(MA
, "nochange", "nochange");
169 CHECK_SKELETON(MA
, "love", "love");
170 CHECK_SKELETON(MA
, "1ove", "love"); // Digit 1 to letter l
171 CHECK_SKELETON(ML
, "OOPS", "OOPS");
172 CHECK_SKELETON(ML
, "00PS", "OOPS");
173 CHECK_SKELETON(MA
, "OOPS", "OOPS");
174 CHECK_SKELETON(MA
, "00PS", "OOPS"); // Digit 0 to letter O in any case mode only
175 CHECK_SKELETON(SL
, "\\u059c", "\\u0301");
176 CHECK_SKELETON(SL
, "\\u2A74", "\\u003A\\u003A\\u003D");
177 CHECK_SKELETON(SL
, "\\u247E", "\\u0028\\u006C\\u006C\\u0029"); // "(ll)"
178 CHECK_SKELETON(SL
, "\\uFDFB", "\\u062C\\u0644\\u0020\\u062C\\u0644\\u006c\\u0644\\u006f");
180 // This mapping exists in the ML and MA tables, does not exist in SL, SA
185 CHECK_SKELETON(SL
, "\\u0C83", "\\u0983");
186 CHECK_SKELETON(SA
, "\\u0C83", "\\u0983");
187 CHECK_SKELETON(ML
, "\\u0C83", "\\u0983");
188 CHECK_SKELETON(MA
, "\\u0C83", "\\u0983");
190 // 0391 mappings exist only in MA and SA tables.
191 CHECK_SKELETON(MA
, "\\u0391", "A");
192 CHECK_SKELETON(SA
, "\\u0391", "A");
193 CHECK_SKELETON(ML
, "\\u0391", "A");
194 CHECK_SKELETON(SL
, "\\u0391", "A");
196 // 13CF Mappings in all four tables, different in MA.
197 CHECK_SKELETON(ML
, "\\u13CF", "b");
198 CHECK_SKELETON(MA
, "\\u13CF", "b");
199 CHECK_SKELETON(SL
, "\\u13CF", "b");
200 CHECK_SKELETON(SA
, "\\u13CF", "b");
202 // 0022 ; 0027 0027 ;
204 CHECK_SKELETON(SL
, "\\u0022", "\\u0027\\u0027");
205 CHECK_SKELETON(SA
, "\\u0022", "\\u0027\\u0027");
206 CHECK_SKELETON(ML
, "\\u0022", "\\u0027\\u0027");
207 CHECK_SKELETON(MA
, "\\u0022", "\\u0027\\u0027");
209 // 017F mappings exist only in MA and SA tables.
210 CHECK_SKELETON(MA
, "\\u017F", "f");
211 CHECK_SKELETON(SA
, "\\u017F", "f");
212 CHECK_SKELETON(ML
, "\\u017F", "f");
213 CHECK_SKELETON(SL
, "\\u017F", "f");
220 // Run a single confusable skeleton transformation test case.
//   sc       - spoof checker used to compute the skeleton.
//   type     - skeleton type flags passed through to uspoof_getSkeletonUnicodeString().
//   input    - escaped source string; unescape() is applied before use.
//   expected - escaped expected skeleton; unescape() is applied before comparing.
//   lineNum  - line of the invoking test case, echoed in failure messages.
// Reports through errln() when the API sets a failure status, and again (with both
// skeletons printed) when the computed skeleton differs from the expected one.
222 void IntlTestSpoof::checkSkeleton(const USpoofChecker
*sc
, uint32_t type
,
223 const char *input
, const char *expected
, int32_t lineNum
) {
224 UnicodeString uInput
= UnicodeString(input
).unescape();
225 UnicodeString uExpected
= UnicodeString(expected
).unescape();
227 UErrorCode status
= U_ZERO_ERROR
;
228 UnicodeString actual
;
229 uspoof_getSkeletonUnicodeString(sc
, type
, uInput
, actual
, &status
);
230 if (U_FAILURE(status
)) {
231 errln("File %s, Line %d, Test case from line %d, status is %s", __FILE__
, __LINE__
, lineNum
,
232 u_errorName(status
));
235 if (uExpected
!= actual
) {
236 errln("File %s, Line %d, Test case from line %d, Actual and Expected skeletons differ.",
237 __FILE__
, __LINE__
, lineNum
);
238 errln(UnicodeString(" Actual Skeleton: \"") + actual
+ UnicodeString("\"\n") +
239 UnicodeString(" Expected Skeleton: \"") + uExpected
+ UnicodeString("\""));
// Checks uspoof_areConfusableUnicodeString() on two long strings that differ only in
// "will" versus "wi11" (letter l replaced by digit 1). As the literals themselves say, the
// strings are deliberately long so they overflow stack buffers. The expected result is
// USPOOF_SINGLE_SCRIPT_CONFUSABLE.
// NOTE(review): `sc` and `status` are declared outside the lines visible here.
243 void IntlTestSpoof::testAreConfusable() {
245 UnicodeString
s1("A long string that will overflow stack buffers. A long string that will overflow stack buffers. "
246 "A long string that will overflow stack buffers. A long string that will overflow stack buffers. ");
247 UnicodeString
s2("A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. "
248 "A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. ");
249 int32_t result
= uspoof_areConfusableUnicodeString(sc
, s1
, s2
, &status
);
250 TEST_ASSERT_SUCCESS(status
);
251 TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE
, result
);
// Exercises the USPOOF_INVISIBLE result of uspoof_checkUnicodeString() with combining marks:
//   s  - a single U+0301 after "abcd": expected result 0.
//   s2 - U+0301 U+0302 U+0301 (the same mark repeated in one sequence): expected USPOOF_INVISIBLE.
//   s3 - precomposed U+00E1 followed by U+0301: expected USPOOF_INVISIBLE.
// In every case `position` is expected to be set to 0.
// NOTE(review): `sc` and `status` are declared outside the lines visible here.
256 void IntlTestSpoof::testInvisible() {
258 UnicodeString s
= UnicodeString("abcd\\u0301ef").unescape();
// Sentinel value; the check below verifies that the API overwrites it with 0.
259 int32_t position
= -42;
260 TEST_ASSERT_EQ(0, uspoof_checkUnicodeString(sc
, s
, &position
, &status
));
261 TEST_ASSERT_SUCCESS(status
);
262 TEST_ASSERT(0 == position
);
264 UnicodeString s2
= UnicodeString("abcd\\u0301\\u0302\\u0301ef").unescape();
265 TEST_ASSERT_EQ(USPOOF_INVISIBLE
, uspoof_checkUnicodeString(sc
, s2
, &position
, &status
));
266 TEST_ASSERT_SUCCESS(status
);
267 TEST_ASSERT_EQ(0, position
);
269 // Two acute accents, one from the composed a with acute accent, \u00e1,
272 UnicodeString s3
= UnicodeString("abcd\\u00e1\\u0301xyz").unescape();
273 TEST_ASSERT_EQ(USPOOF_INVISIBLE
, uspoof_checkUnicodeString(sc
, s3
, &position
, &status
));
274 TEST_ASSERT_SUCCESS(status
);
275 TEST_ASSERT_EQ(0, position
);
// Regression test for bug 8654: checking "B" + U+00C1 + U+0301 must set the USPOOF_INVISIBLE
// bit in the result (other bits are masked off before comparing) and set `position` to 0.
// NOTE(review): `sc` and `status` are declared outside the lines visible here.
279 void IntlTestSpoof::testBug8654() {
281 UnicodeString s
= UnicodeString("B\\u00c1\\u0301").unescape();
282 int32_t position
= -42;
283 TEST_ASSERT_EQ(USPOOF_INVISIBLE
, uspoof_checkUnicodeString(sc
, s
, &position
, &status
) & USPOOF_INVISIBLE
);
284 TEST_ASSERT_SUCCESS(status
);
285 TEST_ASSERT_EQ(0, position
);
// parseHex: scans `in` one UTF-16 code unit at a time, accumulating hexadecimal digits into
// the running value `cc` (four bits per digit) and treating U+0020 as a separator.
// NOTE(review): the declaration of `cc`, the body of the space branch and the final
// append/return are not visible in this view.
289 static UnicodeString
parseHex(const UnicodeString
&in
) {
290 // Convert a series of hex numbers in a Unicode String to a string with the
291 // corresponding characters.
292 // The conversion is _really_ annoying. There must be some function to just do it.
293 UnicodeString result
;
295 for (int32_t i
=0; i
<in
.length(); i
++) {
296 UChar c
= in
.charAt(i
);
297 if (c
== 0x20) { // Space
// '0'..'9': the digit value is the offset from '0' (0x30).
302 } else if (c
>=0x30 && c
<=0x39) {
303 cc
= (cc
<<4) + (c
- 0x30);
// 'A'..'F' or 'a'..'f': the low nibble is 1..6 in both cases, so adding 9 yields 10..15.
304 } else if ((c
>=0x41 && c
<=0x46) || (c
>=0x61 && c
<=0x66)) {
305 cc
= (cc
<<4) + (c
& 0x0f)+9;
307 // else do something with bad input.
317 // Append the hex form of a UChar32 to a UnicodeString.
318 // Used in formatting error messages.
319 // Match the formatting of numbers in confusables.txt
320 // Minimum of 4 digits, no leading zeroes for positions 5 and up.
//
// Walks the eight nibbles of `c` from most significant (bit 28) to least significant.
// A digit is emitted when it is non-zero or once `doZeroes` has been set; digits above 9
// are written as upper-case 'A'..'F'. A space (0x20) is also appended to `dest`.
// NOTE(review): the statements that set `doZeroes` are not visible in this view.
322 static void appendHexUChar(UnicodeString
&dest
, UChar32 c
) {
323 UBool doZeroes
= FALSE
;
324 for (int bitNum
=28; bitNum
>=0; bitNum
-=4) {
328 int hexDigit
= (c
>>bitNum
) & 0x0f;
329 if (hexDigit
!= 0 || doZeroes
) {
331 dest
.append((UChar
)(hexDigit
<=9? hexDigit
+ 0x30: hexDigit
-10 + 0x41));
334 dest
.append((UChar
)0x20);
// Defines LocalStdioFilePointer by pairing FILE with fclose through ICU's
// U_DEFINE_LOCAL_OPEN_POINTER macro; testConfData() uses it to hold the fopen() result.
337 U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer
, FILE, fclose
);
339 // testConfData - Check each data item from the Unicode confusables.txt file,
340 // verify that it transforms correctly in a skeleton.
//
// Outline:
//   1. Locate the unidata directory, append "confusables.txt" and read the whole file.
//   2. Convert the bytes from UTF-8 and match each data line with a regular expression.
//   3. For each line whose source is already NFD: decompose the expected mapping to NFD,
//      derive the skeleton type from the table tag (SL/SA/ML/MA), compute the skeleton of
//      the source and compare. On mismatch, log the line and the actual code points in hex.
// NOTE(review): the declaration of `buffer`, the null check on `f`, the early returns and
// the loop index `i` are not visible in this view.
342 void IntlTestSpoof::testConfData() {
344 if (getUnidataPath(buffer
) == NULL
) {
345 errln("Skipping test spoof/testConfData. Unable to find path to source/data/unidata/.");
348 uprv_strcat(buffer
, "confusables.txt");
350 LocalStdioFilePointer
f(fopen(buffer
, "rb"));
352 errln("Skipping test spoof/testConfData. File confusables.txt not accessible.");
// Determine the file size by seeking to the end, then rewind and read it in one call.
355 fseek(f
.getAlias(), 0, SEEK_END
);
356 int32_t fileSize
= ftell(f
.getAlias());
357 LocalArray
<char> fileBuf(new char[fileSize
]);
358 fseek(f
.getAlias(), 0, SEEK_SET
);
359 int32_t amt_read
= static_cast<int32_t>(fread(fileBuf
.getAlias(), 1, fileSize
, f
.getAlias()));
360 TEST_ASSERT_EQ(amt_read
, fileSize
);
361 TEST_ASSERT(fileSize
>0);
362 if (amt_read
!= fileSize
|| fileSize
<=0) {
365 UnicodeString confusablesTxt
= UnicodeString::fromUTF8(StringPiece(fileBuf
.getAlias(), fileSize
));
367 UErrorCode status
= U_ZERO_ERROR
;
368 LocalUSpoofCheckerPointer
sc(uspoof_open(&status
));
369 TEST_ASSERT_SUCCESS(status
);
371 // Parse lines from the confusables.txt file. Example Line:
372 // FF44 ; 0064 ; SL # ( d -> d ) FULLWIDTH ....
373 // Three fields. The hex fields can contain more than one character,
374 // and each character may be more than 4 digits (for supplementals)
375 // This regular expression matches lines and splits the fields into capture groups.
376 RegexMatcher
parseLine("(?m)^([0-9A-F]{4}[^#;]*?);([^#;]*?);([^#]*)", confusablesTxt
, 0, status
);
377 TEST_ASSERT_SUCCESS(status
);
378 while (parseLine
.find()) {
379 UnicodeString from
= parseHex(parseLine
.group(1, status
));
380 if (!Normalizer::isNormalized(from
, UNORM_NFD
, status
)) {
381 // The source character was not NFD.
382 // Skip this case; the first step in obtaining a skeleton is to NFD the input,
383 // so the mapping in this line of confusables.txt will never be applied.
387 UnicodeString rawExpected
= parseHex(parseLine
.group(2, status
));
388 UnicodeString expected
;
389 Normalizer::decompose(rawExpected
, FALSE
/*NFD*/, 0, expected
, status
);
390 TEST_ASSERT_SUCCESS(status
);
// Map the table tag in the third field to the matching skeleton type flags.
392 int32_t skeletonType
= 0;
393 UnicodeString tableType
= parseLine
.group(3, status
);
394 TEST_ASSERT_SUCCESS(status
);
395 if (tableType
.indexOf("SL") >= 0) {
396 skeletonType
= USPOOF_SINGLE_SCRIPT_CONFUSABLE
;
397 } else if (tableType
.indexOf("SA") >= 0) {
398 skeletonType
= USPOOF_SINGLE_SCRIPT_CONFUSABLE
| USPOOF_ANY_CASE
;
399 } else if (tableType
.indexOf("ML") >= 0) {
401 } else if (tableType
.indexOf("MA") >= 0) {
402 skeletonType
= USPOOF_ANY_CASE
;
405 UnicodeString actual
;
406 uspoof_getSkeletonUnicodeString(sc
.getAlias(), skeletonType
, from
, actual
, &status
);
407 TEST_ASSERT_SUCCESS(status
);
408 TEST_ASSERT(actual
== expected
);
409 if (actual
!= expected
) {
410 errln(parseLine
.group(0, status
));
411 UnicodeString line
= "Actual: ";
// Step through `actual` by code point so supplementary characters print as one value.
413 while (i
< actual
.length()) {
414 appendHexUChar(line
, actual
.char32At(i
));
415 i
= actual
.moveIndex32(i
, 1);
419 if (U_FAILURE(status
)) {
// Unit test for the internal ScriptSet class (from scriptset.h). In order, it exercises:
// the SCRIPT_LIMIT capacity check, set()/reset()/test(), equality, contains()/intersects(),
// countMembers(), isEmpty(), nextSetBit() iteration, setScriptExtensions() (data dependent),
// the uhash_compareScriptSet() ordering and displayScripts().
// NOTE(review): the declarations of `s1`, `s2` and `n`, and several statements between the
// assertions (set-all / reset-all / union / intersect style calls), are not visible in this
// view, so the state assumed by some assertions cannot be confirmed from here.
426 void IntlTestSpoof::testScriptSet() {
427 // ScriptSet::SCRIPT_LIMIT is hardcoded.
428 // Increase it by multiples of 32 if there are too many script codes.
429 TEST_ASSERT(USCRIPT_CODE_LIMIT
<= ScriptSet::SCRIPT_LIMIT
);
430 // USCRIPT_CODE_LIMIT should include all script codes,
431 // but theoretically the data may define more.
432 TEST_ASSERT(u_getIntPropertyMaxValue(UCHAR_SCRIPT
) < ScriptSet::SCRIPT_LIMIT
);
436 UErrorCode status
= U_ZERO_ERROR
;
438 TEST_ASSERT(s1
== s2
);
439 s1
.set(USCRIPT_ARABIC
,status
);
440 TEST_ASSERT_SUCCESS(status
);
441 TEST_ASSERT(!(s1
== s2
));
442 TEST_ASSERT(s1
.test(USCRIPT_ARABIC
, status
));
443 TEST_ASSERT(s1
.test(USCRIPT_GREEK
, status
) == FALSE
);
445 status
= U_ZERO_ERROR
;
446 s1
.reset(USCRIPT_ARABIC
, status
);
447 TEST_ASSERT(s1
== s2
);
// Highest script code known at compile time; used to probe the top of the bit set.
449 static constexpr UScriptCode LAST_SCRIPT_CODE
= (UScriptCode
)(USCRIPT_CODE_LIMIT
- 1);
450 status
= U_ZERO_ERROR
;
452 TEST_ASSERT(s1
.test(USCRIPT_COMMON
, status
));
453 TEST_ASSERT(s1
.test(USCRIPT_ETHIOPIC
, status
));
454 TEST_ASSERT(s1
.test(LAST_SCRIPT_CODE
, status
));
456 TEST_ASSERT(!s1
.test(USCRIPT_COMMON
, status
));
457 TEST_ASSERT(!s1
.test(USCRIPT_ETHIOPIC
, status
));
458 TEST_ASSERT(!s1
.test(LAST_SCRIPT_CODE
, status
));
460 status
= U_ZERO_ERROR
;
461 s1
.set(USCRIPT_TAKRI
, status
);
462 s1
.set(USCRIPT_BLISSYMBOLS
, status
);
464 TEST_ASSERT(s2
.contains(s1
));
465 TEST_ASSERT(!s1
.contains(s2
));
466 TEST_ASSERT(s2
.intersects(s1
));
467 TEST_ASSERT(s1
.intersects(s2
));
468 s2
.reset(USCRIPT_TAKRI
, status
);
469 TEST_ASSERT(!s2
.contains(s1
));
470 TEST_ASSERT(!s1
.contains(s2
));
471 TEST_ASSERT(s1
.intersects(s2
));
472 TEST_ASSERT(s2
.intersects(s1
));
473 TEST_ASSERT_SUCCESS(status
);
475 status
= U_ZERO_ERROR
;
477 s1
.set(USCRIPT_NKO
, status
);
478 s1
.set(USCRIPT_COMMON
, status
);
480 TEST_ASSERT(s2
== s1
);
481 TEST_ASSERT_EQ(2, s2
.countMembers());
483 TEST_ASSERT(s2
== s1
);
485 TEST_ASSERT(!(s2
== s1
));
486 TEST_ASSERT(s2
.countMembers() >= USCRIPT_CODE_LIMIT
);
488 TEST_ASSERT(s2
== s1
);
491 s2
.reset(USCRIPT_COMMON
, status
);
493 TEST_ASSERT(s2
.countMembers() == 1);
496 TEST_ASSERT(s1
.isEmpty());
497 s1
.set(USCRIPT_LATIN
, status
);
498 TEST_ASSERT(!s1
.isEmpty());
500 TEST_ASSERT(!s1
.isEmpty());
501 TEST_ASSERT_SUCCESS(status
);
504 s1
.set(USCRIPT_AFAKA
, status
);
505 s1
.set(USCRIPT_VAI
, status
);
506 s1
.set(USCRIPT_INHERITED
, status
);
// Iterate the set bits; the expected order below is INHERITED, VAI, AFAKA, then -1.
508 for (int32_t i
=0; i
<4; i
++) {
509 n
= s1
.nextSetBit(n
+1);
511 case 0: TEST_ASSERT_EQ(USCRIPT_INHERITED
, n
); break;
512 case 1: TEST_ASSERT_EQ(USCRIPT_VAI
, n
); break;
513 case 2: TEST_ASSERT_EQ(USCRIPT_AFAKA
, n
); break;
514 case 3: TEST_ASSERT_EQ(-1, (int32_t)n
); break;
515 default: TEST_ASSERT(FALSE
);
518 TEST_ASSERT_SUCCESS(status
);
520 // Script extensions. Depends on data.
// U+0067 is expected to yield exactly one script (Latin).
522 s1
.setScriptExtensions(0x67, status
);
523 TEST_ASSERT(s1
.countMembers() == 1);
524 TEST_ASSERT(s1
.test(USCRIPT_LATIN
, status
));
525 TEST_ASSERT_SUCCESS(status
);
// U+303C is expected to yield three scripts: Han, Hiragana and Katakana.
528 s1
.setScriptExtensions(0x303C, status
);
529 TEST_ASSERT(s1
.countMembers() == 3);
530 TEST_ASSERT(s1
.test(USCRIPT_HAN
, status
));
531 TEST_ASSERT(s1
.test(USCRIPT_HIRAGANA
, status
));
532 TEST_ASSERT(s1
.test(USCRIPT_KATAKANA
, status
));
533 TEST_ASSERT_SUCCESS(status
);
536 ScriptSet bitset12
; bitset12
.set(USCRIPT_LATIN
, status
).set(USCRIPT_HANGUL
, status
);
537 ScriptSet bitset2
; bitset2
.set(USCRIPT_HANGUL
, status
);
538 TEST_ASSERT(bitset12
.contains(bitset2
));
539 TEST_ASSERT(bitset12
.contains(bitset12
));
540 TEST_ASSERT(!bitset2
.contains(bitset12
));
// Wrap two single-script sets in UElement values to test the hash-table comparator.
542 ScriptSet arabSet
; arabSet
.set(USCRIPT_ARABIC
, status
);
543 ScriptSet latinSet
; latinSet
.set(USCRIPT_LATIN
, status
);
544 UElement arabEl
; arabEl
.pointer
= &arabSet
;
545 UElement latinEl
; latinEl
.pointer
= &latinSet
;
546 TEST_ASSERT(uhash_compareScriptSet(arabEl
, latinEl
) < 0);
547 TEST_ASSERT(uhash_compareScriptSet(latinEl
, arabEl
) > 0);
549 UnicodeString scriptString
;
550 bitset12
.displayScripts(scriptString
);
551 TEST_ASSERT(UNICODE_STRING_SIMPLE("Hang Latn") == scriptString
);
// Table-driven test of USPOOF_RESTRICTION_LEVEL. Every test string (with its expected
// restriction level) is checked against every level in `restrictionLevels`:
//   - With only USPOOF_RESTRICTION_LEVEL enabled, the check must fail exactly when the
//     string's expected level is greater than the level configured in the checker.
//   - With USPOOF_AUX_INFO added, the result masked by USPOOF_RESTRICTION_LEVEL_MASK must
//     equal the expected level and the bits in USPOOF_ALL_CHECKS must equal the expected
//     failure bits.
// NOTE(review): the struct header, the `fId` member, the `tests` array declaration and
// `msgBuffer` are not visible in this view, and no uspoof_close(sc) for the checker opened
// inside the inner loop is visible — confirm that it is released.
555 void IntlTestSpoof::testRestrictionLevel() {
558 URestrictionLevel fExpectedRestrictionLevel
;
560 {"\\u0061\\u03B3\\u2665", USPOOF_UNRESTRICTIVE
},
562 {"\\u03B3", USPOOF_SINGLE_SCRIPT_RESTRICTIVE
},
563 {"\\u0061\\u30A2\\u30FC", USPOOF_HIGHLY_RESTRICTIVE
},
564 {"\\u0061\\u0904", USPOOF_MODERATELY_RESTRICTIVE
},
565 {"\\u0061\\u03B3", USPOOF_MINIMALLY_RESTRICTIVE
},
566 {"\\u0061\\u2665", USPOOF_UNRESTRICTIVE
},
567 {"\\u0061\\u303C", USPOOF_HIGHLY_RESTRICTIVE
},
568 {"\\u0061\\u30FC\\u303C", USPOOF_HIGHLY_RESTRICTIVE
},
569 {"\\u0061\\u30FC\\u303C\\u30A2", USPOOF_HIGHLY_RESTRICTIVE
},
570 {"\\u30A2\\u0061\\u30FC\\u303C", USPOOF_HIGHLY_RESTRICTIVE
},
571 {"\\u0061\\u0031\\u0661", USPOOF_MODERATELY_RESTRICTIVE
},
572 {"\\u0061\\u0031\\u0661\\u06F1", USPOOF_MODERATELY_RESTRICTIVE
},
573 {"\\u0661\\u30FC\\u303C\\u0061\\u30A2\\u0031\\u0967\\u06F1", USPOOF_MINIMALLY_RESTRICTIVE
},
574 {"\\u0061\\u30A2\\u30FC\\u303C\\u0031\\u0967\\u0661\\u06F1", USPOOF_MINIMALLY_RESTRICTIVE
}
577 URestrictionLevel restrictionLevels
[] = { USPOOF_ASCII
, USPOOF_SINGLE_SCRIPT_RESTRICTIVE
,
578 USPOOF_HIGHLY_RESTRICTIVE
, USPOOF_MODERATELY_RESTRICTIVE
, USPOOF_MINIMALLY_RESTRICTIVE
,
579 USPOOF_UNRESTRICTIVE
};
581 UErrorCode status
= U_ZERO_ERROR
;
582 UnicodeSet allowedChars
;
583 // Allowed Identifier Characters. In addition to the Recommended Set,
584 // allow u303c, which has an interesting script extension of Hani Hira Kana.
585 allowedChars
.addAll(*uspoof_getRecommendedUnicodeSet(&status
)).add(0x303C);
587 for (int32_t testNum
=0; testNum
< UPRV_LENGTHOF(tests
); testNum
++) {
588 status
= U_ZERO_ERROR
;
589 const Test
&test
= tests
[testNum
];
590 UnicodeString testString
= UnicodeString(test
.fId
).unescape();
591 URestrictionLevel expectedLevel
= test
.fExpectedRestrictionLevel
;
592 for (int levelIndex
=0; levelIndex
<UPRV_LENGTHOF(restrictionLevels
); levelIndex
++) {
593 status
= U_ZERO_ERROR
;
594 URestrictionLevel levelSetInSpoofChecker
= restrictionLevels
[levelIndex
];
595 USpoofChecker
*sc
= uspoof_open(&status
);
596 uspoof_setAllowedChars(sc
, allowedChars
.toUSet(), &status
);
597 uspoof_setRestrictionLevel(sc
, levelSetInSpoofChecker
);
598 uspoof_setChecks(sc
, USPOOF_RESTRICTION_LEVEL
, &status
);
599 int32_t actualValue
= uspoof_checkUnicodeString(sc
, testString
, NULL
, &status
);
601 // we want to fail if the text is (say) MODERATE and the testLevel is ASCII
602 int32_t expectedValue
= 0;
603 if (expectedLevel
> levelSetInSpoofChecker
) {
604 expectedValue
|= USPOOF_RESTRICTION_LEVEL
;
606 sprintf(msgBuffer
, "testNum = %d, levelIndex = %d, expected = %#x, actual = %#x",
607 testNum
, levelIndex
, expectedValue
, actualValue
);
608 TEST_ASSERT_MSG(expectedValue
== actualValue
, msgBuffer
);
609 TEST_ASSERT_SUCCESS(status
);
611 // Run the same check again, with the Spoof Checker configured to return
612 // the actual restriction level.
613 uspoof_setAllowedChars(sc
, allowedChars
.toUSet(), &status
);
614 uspoof_setRestrictionLevel(sc
, levelSetInSpoofChecker
);
615 uspoof_setChecks(sc
, USPOOF_AUX_INFO
| USPOOF_RESTRICTION_LEVEL
, &status
);
616 int32_t result
= uspoof_checkUnicodeString(sc
, testString
, NULL
, &status
);
617 TEST_ASSERT_SUCCESS(status
);
618 if (U_SUCCESS(status
)) {
619 TEST_ASSERT_EQ(expectedLevel
, result
& USPOOF_RESTRICTION_LEVEL_MASK
);
620 TEST_ASSERT_EQ(expectedValue
, result
& USPOOF_ALL_CHECKS
);
// Table-driven test of USPOOF_MIXED_NUMBERS. Each row pairs an escaped test string with a
// UnicodeSet pattern for the numerics expected in the check result. For each row the checker
// is restricted to the mixed-numbers check, and two things are verified:
//   - the USPOOF_MIXED_NUMBERS bit is set exactly when the expected set has more than one member;
//   - the set from uspoof_getCheckResultNumerics() equals the expected set.
// NOTE(review): the struct header, the `tests` array declaration, `msgBuf`, and the
// declarations of `sc` and `checkResult` (presumably from TEST_SETUP) are not visible here.
628 void IntlTestSpoof::testMixedNumbers() {
630 const char *fTestString
;
631 const char *fExpectedSet
;
634 {"\\u0967", "[\\u0966]"},
635 {"1\\u0967", "[0\\u0966]"},
636 {"\\u0661\\u06F1", "[\\u0660\\u06F0]"},
637 {"\\u0061\\u2665", "[]"},
638 {"\\u0061\\u303C", "[]"},
639 {"\\u0061\\u30FC\\u303C", "[]"},
640 {"\\u0061\\u30FC\\u303C\\u30A2", "[]"},
641 {"\\u30A2\\u0061\\u30FC\\u303C", "[]"},
642 {"\\u0061\\u0031\\u0661", "[\\u0030\\u0660]"},
643 {"\\u0061\\u0031\\u0661\\u06F1", "[\\u0030\\u0660\\u06F0]"},
644 {"\\u0661\\u30FC\\u303C\\u0061\\u30A2\\u0031\\u0967\\u06F1", "[\\u0030\\u0660\\u06F0\\u0966]"},
645 {"\\u0061\\u30A2\\u30FC\\u303C\\u0031\\u0967\\u0661\\u06F1", "[\\u0030\\u0660\\u06F0\\u0966]"}
647 UErrorCode status
= U_ZERO_ERROR
;
648 for (int32_t testNum
=0; testNum
< UPRV_LENGTHOF(tests
); testNum
++) {
650 sprintf(msgBuf
, "testNum = %d ", testNum
);
651 Test
&test
= tests
[testNum
];
653 status
= U_ZERO_ERROR
;
654 UnicodeString testString
= UnicodeString(test
.fTestString
).unescape();
655 UnicodeSet
expectedSet(UnicodeString(test
.fExpectedSet
).unescape(), status
);
657 status
= U_ZERO_ERROR
;
659 uspoof_setChecks(sc
, USPOOF_MIXED_NUMBERS
, &status
); // only check this
660 uspoof_check2UnicodeString(sc
, testString
, checkResult
, &status
);
661 UBool mixedNumberFailure
= ((uspoof_getCheckResultChecks(checkResult
, &status
) & USPOOF_MIXED_NUMBERS
) != 0);
662 TEST_ASSERT_MSG((expectedSet
.size() > 1) == mixedNumberFailure
, msgBuf
);
663 const UnicodeSet
* actualSet
= UnicodeSet::fromUSet(uspoof_getCheckResultNumerics(checkResult
, &status
));
664 TEST_ASSERT_MSG(expectedSet
== *actualSet
, msgBuf
);
669 // Bug #12153 - uspoof_setRestrictionLevel() should enable restriction level testing.
//
// Sequence: a freshly opened checker is expected to have USPOOF_RESTRICTION_LEVEL among its
// checks; the bit is cleared with uspoof_setChecks() and verified absent; then
// uspoof_setRestrictionLevel() is called and the bit is expected to be present again.
671 void IntlTestSpoof::testBug12153() {
672 UErrorCode status
= U_ZERO_ERROR
;
673 LocalUSpoofCheckerPointer
sc(uspoof_open(&status
));
674 if (!assertSuccess("", status
, true, __FILE__
, __LINE__
)) { return; }
675 int32_t checks
= uspoof_getChecks(sc
.getAlias(), &status
);
676 TEST_ASSERT((checks
& USPOOF_RESTRICTION_LEVEL
) != 0);
677 checks
&= ~USPOOF_RESTRICTION_LEVEL
;
678 uspoof_setChecks(sc
.getAlias(), checks
, &status
);
679 checks
= uspoof_getChecks(sc
.getAlias(), &status
);
680 TEST_ASSERT((checks
& USPOOF_RESTRICTION_LEVEL
) == 0);
682 uspoof_setRestrictionLevel(sc
.getAlias(), USPOOF_MODERATELY_RESTRICTIVE
);
683 checks
= uspoof_getChecks(sc
.getAlias(), &status
);
684 TEST_ASSERT((checks
& USPOOF_RESTRICTION_LEVEL
) != 0);
685 TEST_ASSERT_SUCCESS(status
);
688 // uspoof_checkUnicodeString should NOT have an infinite loop.
// Regression test: with all checks plus USPOOF_AUX_INFO enabled, checking the single
// character U+30FB must return with a success status. The returned check bits are ignored.
689 void IntlTestSpoof::testBug12825() {
690 UErrorCode status
= U_ZERO_ERROR
;
691 LocalUSpoofCheckerPointer
sc(uspoof_open(&status
));
692 TEST_ASSERT_SUCCESS(status
);
693 uspoof_setChecks(sc
.getAlias(), USPOOF_ALL_CHECKS
| USPOOF_AUX_INFO
, &status
);
694 TEST_ASSERT_SUCCESS(status
);
695 uspoof_checkUnicodeString(sc
.getAlias(), UnicodeString("\\u30FB").unescape(), NULL
, &status
);
696 TEST_ASSERT_SUCCESS(status
);
699 // uspoof_getSkeleton should NOT set an ILLEGAL_ARGUMENT_EXCEPTION.
// Regression test: with the checker's checks set to USPOOF_RESTRICTION_LEVEL only, computing
// a skeleton (type 0) for "hello world" must leave `status` as a success code. The skeleton
// value itself is not inspected.
700 void IntlTestSpoof::testBug12815() {
701 UErrorCode status
= U_ZERO_ERROR
;
702 LocalUSpoofCheckerPointer
sc(uspoof_open(&status
));
703 TEST_ASSERT_SUCCESS(status
);
704 uspoof_setChecks(sc
.getAlias(), USPOOF_RESTRICTION_LEVEL
, &status
);
705 TEST_ASSERT_SUCCESS(status
);
706 UnicodeString result
;
707 uspoof_getSkeletonUnicodeString(sc
.getAlias(), 0, UnicodeString("hello world"), result
, &status
);
708 TEST_ASSERT_SUCCESS(status
);
// Regression test for bug 13314, run with USPOOF_ALL_CHECKS enabled:
//   - two CJK string literals must be reported as USPOOF_SINGLE_SCRIPT_CONFUSABLE;
//   - uspoof_check2UnicodeString() on a two-character string starting with the digit 3
//     must report no failed checks (0).
// NOTE(review): the two CJK literals may be visually identical but distinct code points;
// do not retype them — confirm against the upstream file.
711 void IntlTestSpoof::testBug13314_MixedNumbers() {
712 UErrorCode status
= U_ZERO_ERROR
;
713 LocalUSpoofCheckerPointer
sc(uspoof_open(&status
));
714 if (!assertSuccess("", status
, true, __FILE__
, __LINE__
)) { return; }
715 uspoof_setChecks(sc
.getAlias(), USPOOF_ALL_CHECKS
, &status
);
716 TEST_ASSERT_SUCCESS(status
);
717 int32_t failedChecks
= uspoof_areConfusableUnicodeString(sc
.getAlias(), u
"列", u
"列", &status
);
718 TEST_ASSERT_SUCCESS(status
);
719 assertEquals("The CJK strings should be confusable", USPOOF_SINGLE_SCRIPT_CONFUSABLE
, failedChecks
);
720 failedChecks
= uspoof_check2UnicodeString(sc
.getAlias(), u
"3Ȝ", nullptr, &status
);
721 TEST_ASSERT_SUCCESS(status
);
722 assertEquals("The '33' string does not fail spoof", 0, failedChecks
);
// Regression test for bug 13328: with a default-configured checker, checking U+0061
// followed by U+0F84 is compared against USPOOF_RESTRICTION_LEVEL.
// NOTE(review): the assertEquals( opening and its final argument are not visible in this
// view; presumably the actual value passed is `failedChecks` — confirm.
725 void IntlTestSpoof::testBug13328_MixedCombiningMarks() {
726 UErrorCode status
= U_ZERO_ERROR
;
727 LocalUSpoofCheckerPointer
sc(uspoof_open(&status
));
728 if (!assertSuccess("", status
, true, __FILE__
, __LINE__
)) { return; }
729 int32_t failedChecks
= uspoof_check2UnicodeString(sc
.getAlias(), u
"\u0061\u0F84", nullptr, &status
);
730 TEST_ASSERT_SUCCESS(status
);
732 "The mismatched combining marks string fails spoof",
733 USPOOF_RESTRICTION_LEVEL
,
// Table-driven test of USPOOF_HIDDEN_OVERLAY, the only check enabled here. Each case pairs
// a `shouldFail` flag with a UTF-16 input containing U+0307 (combining dot above). The
// input is checked with uspoof_check2() using length -1 (NUL-terminated), and the result
// must be USPOOF_HIDDEN_OVERLAY when `shouldFail` is true and 0 otherwise.
// NOTE(review): the `shouldFail` member declaration, the `cases` array header and some
// table rows are not visible in this view.
737 void IntlTestSpoof::testCombiningDot() {
738 UErrorCode status
= U_ZERO_ERROR
;
739 LocalUSpoofCheckerPointer
sc(uspoof_open(&status
));
740 if (!assertSuccess("", status
, true, __FILE__
, __LINE__
)) { return; }
741 uspoof_setChecks(sc
.getAlias(), USPOOF_HIDDEN_OVERLAY
, &status
);
742 TEST_ASSERT_SUCCESS(status
);
744 static const struct TestCase
{
746 const char16_t* input
;
761 {true, u
"i\u0307\u0307"},
762 {true, u
"abci\u0307def"},
763 {false, u
"i\u0301\u0307"}, // U+0301 has combining class ABOVE (230)
764 {true, u
"i\u0320\u0307"}, // U+0320 has combining class BELOW
765 {true, u
"i\u0320\u0321\u0307"}, // U+0321 also has combining class BELOW
766 {false, u
"i\u0320\u0301\u0307"},
767 {false, u
"iz\u0307"},
770 for (auto& cas
: cases
) {
771 int32_t failedChecks
= uspoof_check2(sc
.getAlias(), cas
.input
, -1, nullptr, &status
);
772 TEST_ASSERT_SUCCESS(status
);
773 int32_t expected
= cas
.shouldFail
? USPOOF_HIDDEN_OVERLAY
: 0;
774 assertEquals(cas
.input
, expected
, failedChecks
);
778 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO */