// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 ****************************************************************************
 * Copyright (c) 2005-2016, International Business Machines Corporation and *
 * others. All Rights Reserved.                                             *
 ****************************************************************************
 */
10 #include "unicode/utypes.h"
12 #include "unicode/ucsdet.h"
13 #include "unicode/ucnv.h"
14 #include "unicode/ustring.h"
/*
 * Allocates an uninitialized array of `count` elements of `type` with malloc().
 * The result may be NULL. The expansion is fully parenthesized so that it can
 * be combined with postfix operators without surprising precedence.
 */
#define NEW_ARRAY(type,count) ((type *) malloc((count) * sizeof(type)))

/* Releases an array obtained from NEW_ARRAY by handing it to free(). */
#define DELETE_ARRAY(array) free(array)
/* Forward declarations of the test cases that addUCsdetTest() registers. */
static void TestConstruction(void);
static void TestUTF8(void);
static void TestUTF16(void);
static void TestC1Bytes(void);
static void TestInputFilter(void);
static void TestChaining(void);
static void TestBufferOverflow(void);
static void TestIBM424(void);
static void TestIBM420(void);
#if U_PLATFORM_IS_DARWIN_BASED
/* Only built on Darwin-based platforms, like its definition further down. */
static void TestMailFilterCSS(void);
#endif
38 void addUCsdetTest(TestNode
** root
);
40 void addUCsdetTest(TestNode
** root
)
42 addTest(root
, &TestConstruction
, "ucsdetst/TestConstruction");
43 addTest(root
, &TestUTF8
, "ucsdetst/TestUTF8");
44 addTest(root
, &TestUTF16
, "ucsdetst/TestUTF16");
45 addTest(root
, &TestC1Bytes
, "ucsdetst/TestC1Bytes");
46 addTest(root
, &TestInputFilter
, "ucsdetst/TestInputFilter");
47 addTest(root
, &TestChaining
, "ucsdetst/TestErrorChaining");
48 addTest(root
, &TestBufferOverflow
, "ucsdetst/TestBufferOverflow");
49 #if !UCONFIG_NO_LEGACY_CONVERSION
50 addTest(root
, &TestIBM424
, "ucsdetst/TestIBM424");
51 addTest(root
, &TestIBM420
, "ucsdetst/TestIBM420");
53 #if U_PLATFORM_IS_DARWIN_BASED
54 addTest(root
, &TestMailFilterCSS
, "ucsdetst/TestMailFilterCSS");
58 static int32_t preflight(const UChar
*src
, int32_t length
, UConverter
*cnv
)
62 char *dest
, *destLimit
= buffer
+ sizeof(buffer
);
63 const UChar
*srcLimit
= src
+ length
;
68 status
= U_ZERO_ERROR
;
69 ucnv_fromUnicode(cnv
, &dest
, destLimit
, &src
, srcLimit
, 0, TRUE
, &status
);
70 result
+= (int32_t) (dest
- buffer
);
71 } while (status
== U_BUFFER_OVERFLOW_ERROR
);
76 static char *extractBytes(const UChar
*src
, int32_t length
, const char *codepage
, int32_t *byteLength
)
78 UErrorCode status
= U_ZERO_ERROR
;
79 UConverter
*cnv
= ucnv_open(codepage
, &status
);
80 int32_t byteCount
= preflight(src
, length
, cnv
);
81 const UChar
*srcLimit
= src
+ length
;
82 char *bytes
= NEW_ARRAY(char, byteCount
+ 1);
83 char *dest
= bytes
, *destLimit
= bytes
+ byteCount
+ 1;
85 ucnv_fromUnicode(cnv
, &dest
, destLimit
, &src
, srcLimit
, 0, TRUE
, &status
);
88 *byteLength
= byteCount
;
/*
 * Releases a byte buffer via DELETE_ARRAY, the counterpart of the NEW_ARRAY
 * allocation macro defined at the top of this file.
 */
static void freeBytes(char *bytes)
{
    DELETE_ARRAY(bytes);
}
97 static void TestConstruction(void)
99 UErrorCode status
= U_ZERO_ERROR
;
100 UCharsetDetector
*csd
= ucsdet_open(&status
);
101 UEnumeration
*e
= ucsdet_getAllDetectableCharsets(csd
, &status
);
103 int32_t count
= uenum_count(e
, &status
);
106 for(i
= 0; i
< count
; i
+= 1) {
107 name
= uenum_next(e
, &length
, &status
);
109 if(name
== NULL
|| length
<= 0) {
110 log_err("ucsdet_getAllDetectableCharsets() returned a null or empty name!\n");
113 /* one past the list of all names must return NULL */
114 name
= uenum_next(e
, &length
, &status
);
115 if(name
!= NULL
|| length
!= 0 || U_FAILURE(status
)) {
116 log_err("ucsdet_getAllDetectableCharsets(past the list) returned a non-null name!\n");
123 static void TestUTF8(void)
125 UErrorCode status
= U_ZERO_ERROR
;
126 static const char ss
[] = "This is a string with some non-ascii characters that will "
127 "be converted to UTF-8, then shoved through the detection process. "
128 "\\u0391\\u0392\\u0393\\u0394\\u0395"
129 "Sure would be nice if our source could contain Unicode directly!";
130 int32_t byteLength
= 0, sLength
= 0, dLength
= 0;
133 UCharsetDetector
*csd
= ucsdet_open(&status
);
134 const UCharsetMatch
*match
;
135 UChar detected
[sizeof(ss
)];
137 sLength
= u_unescape(ss
, s
, sizeof(ss
));
138 bytes
= extractBytes(s
, sLength
, "UTF-8", &byteLength
);
140 ucsdet_setText(csd
, bytes
, byteLength
, &status
);
141 if (U_FAILURE(status
)) {
142 log_err("status is %s\n", u_errorName(status
));
146 match
= ucsdet_detect(csd
, &status
);
149 log_err("Detection failure for UTF-8: got no matches.\n");
153 dLength
= ucsdet_getUChars(match
, detected
, sLength
, &status
);
155 if (u_strCompare(detected
, dLength
, s
, sLength
, FALSE
) != 0) {
156 log_err("Round-trip test failed!\n");
159 ucsdet_setDeclaredEncoding(csd
, "UTF-8", 5, &status
); /* for coverage */
166 static void TestUTF16(void)
168 UErrorCode status
= U_ZERO_ERROR
;
169 /* Notice the BOM on the start of this string */
170 static const UChar chars
[] = {
171 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
172 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
173 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
174 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
175 0x064a, 0x062a, 0x0000};
176 int32_t beLength
= 0, leLength
= 0, cLength
= UPRV_LENGTHOF(chars
);
177 char *beBytes
= extractBytes(chars
, cLength
, "UTF-16BE", &beLength
);
178 char *leBytes
= extractBytes(chars
, cLength
, "UTF-16LE", &leLength
);
179 UCharsetDetector
*csd
= ucsdet_open(&status
);
180 const UCharsetMatch
*match
;
184 ucsdet_setText(csd
, beBytes
, beLength
, &status
);
185 match
= ucsdet_detect(csd
, &status
);
188 log_err("Encoding detection failure for UTF-16BE: got no matches.\n");
192 name
= ucsdet_getName(match
, &status
);
193 conf
= ucsdet_getConfidence(match
, &status
);
195 if (strcmp(name
, "UTF-16BE") != 0) {
196 log_err("Encoding detection failure for UTF-16BE: got %s\n", name
);
200 log_err("Did not get 100%% confidence for UTF-16BE: got %d\n", conf
);
204 ucsdet_setText(csd
, leBytes
, leLength
, &status
);
205 match
= ucsdet_detect(csd
, &status
);
208 log_err("Encoding detection failure for UTF-16LE: got no matches.\n");
212 name
= ucsdet_getName(match
, &status
);
213 conf
= ucsdet_getConfidence(match
, &status
);
216 if (strcmp(name
, "UTF-16LE") != 0) {
217 log_err("Enconding detection failure for UTF-16LE: got %s\n", name
);
221 log_err("Did not get 100%% confidence for UTF-16LE: got %d\n", conf
);
230 static void TestC1Bytes(void)
232 #if !UCONFIG_NO_LEGACY_CONVERSION
233 UErrorCode status
= U_ZERO_ERROR
;
234 static const char ssISO
[] = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
235 static const char ssWindows
[] = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.";
236 int32_t sISOLength
= 0, sWindowsLength
= 0;
237 UChar sISO
[sizeof(ssISO
)];
238 UChar sWindows
[sizeof(ssWindows
)];
239 int32_t lISO
= 0, lWindows
= 0;
242 UCharsetDetector
*csd
= ucsdet_open(&status
);
243 const UCharsetMatch
*match
;
246 sISOLength
= u_unescape(ssISO
, sISO
, sizeof(ssISO
));
247 sWindowsLength
= u_unescape(ssWindows
, sWindows
, sizeof(ssWindows
));
248 bISO
= extractBytes(sISO
, sISOLength
, "ISO-8859-1", &lISO
);
249 bWindows
= extractBytes(sWindows
, sWindowsLength
, "windows-1252", &lWindows
);
251 ucsdet_setText(csd
, bWindows
, lWindows
, &status
);
252 match
= ucsdet_detect(csd
, &status
);
255 log_err("English test with C1 bytes got no matches.\n");
259 name
= ucsdet_getName(match
, &status
);
261 if (strcmp(name
, "windows-1252") != 0) {
262 log_data_err("English text with C1 bytes does not detect as windows-1252, but as %s. (Are you missing data?)\n", name
);
265 ucsdet_setText(csd
, bISO
, lISO
, &status
);
266 match
= ucsdet_detect(csd
, &status
);
269 log_err("English text without C1 bytes got no matches.\n");
273 name
= ucsdet_getName(match
, &status
);
275 if (strcmp(name
, "ISO-8859-1") != 0) {
276 log_err("English text without C1 bytes does not detect as ISO-8859-1, but as %s\n", name
);
287 static void TestInputFilter(void)
289 UErrorCode status
= U_ZERO_ERROR
;
290 static const char ss
[] = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
293 int32_t byteLength
= 0;
295 UCharsetDetector
*csd
= ucsdet_open(&status
);
296 const UCharsetMatch
*match
;
297 const char *lang
, *name
;
299 sLength
= u_unescape(ss
, s
, sizeof(ss
));
300 bytes
= extractBytes(s
, sLength
, "ISO-8859-1", &byteLength
);
302 ucsdet_enableInputFilter(csd
, TRUE
);
304 if (!ucsdet_isInputFilterEnabled(csd
)) {
305 log_err("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!\n");
309 ucsdet_setText(csd
, bytes
, byteLength
, &status
);
310 match
= ucsdet_detect(csd
, &status
);
313 log_err("Turning on the input filter resulted in no matches.\n");
317 name
= ucsdet_getName(match
, &status
);
319 if (name
== NULL
|| strcmp(name
, "ISO-8859-1") != 0) {
320 log_err("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name
);
322 lang
= ucsdet_getLanguage(match
, &status
);
324 if (lang
== NULL
|| strcmp(lang
, "fr") != 0) {
325 log_err("Input filter did not strip markup!\n");
330 ucsdet_enableInputFilter(csd
, FALSE
);
331 ucsdet_setText(csd
, bytes
, byteLength
, &status
);
332 match
= ucsdet_detect(csd
, &status
);
335 log_err("Turning off the input filter resulted in no matches.\n");
339 name
= ucsdet_getName(match
, &status
);
341 if (name
== NULL
|| strcmp(name
, "ISO-8859-1") != 0) {
342 log_err("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name
);
344 lang
= ucsdet_getLanguage(match
, &status
);
346 if (lang
== NULL
|| strcmp(lang
, "en") != 0) {
347 log_err("Unfiltered input did not detect as English!\n");
356 static void TestChaining(void) {
357 UErrorCode status
= U_USELESS_COLLATOR_ERROR
;
359 ucsdet_open(&status
);
360 ucsdet_setText(NULL
, NULL
, 0, &status
);
361 ucsdet_getName(NULL
, &status
);
362 ucsdet_getConfidence(NULL
, &status
);
363 ucsdet_getLanguage(NULL
, &status
);
364 ucsdet_detect(NULL
, &status
);
365 ucsdet_setDeclaredEncoding(NULL
, NULL
, 0, &status
);
366 ucsdet_detectAll(NULL
, NULL
, &status
);
367 ucsdet_getUChars(NULL
, NULL
, 0, &status
);
368 ucsdet_getUChars(NULL
, NULL
, 0, &status
);
371 /* All of this code should have done nothing. */
372 if (status
!= U_USELESS_COLLATOR_ERROR
) {
373 log_err("Status got changed to %s\n", u_errorName(status
));
/*
 * Feeds the detector short byte strings that end in a partial or complete
 * ISO-2022 escape sequence, or in a lone 0xA1 byte, after declaring the
 * encoding as "ISO-2022-JP". For each input the detected charset name is
 * compared with the entry of testResults at the same index; judging by the
 * log messages, a NULL entry means that no match is expected.
 *
 * NOTE(review): the entries of testResults, the declaration of idx and the
 * closing braces are not visible in this listing.
 */
377 static void TestBufferOverflow(void) {
378 UErrorCode status
= U_ZERO_ERROR
;
/* Inputs; passed to ucsdet_setText with length -1 below. */
379 static const char *testStrings
[] = {
380 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b", /* A partial ISO-2022 shift state at the end */
381 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24", /* A partial ISO-2022 shift state at the end */
382 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28", /* A partial ISO-2022 shift state at the end */
383 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end with a bad one at the start */
384 "\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end */
385 "\xa1", /* Could be a single byte shift-jis at the end */
386 "\x74\x68\xa1", /* Could be a single byte shift-jis at the end */
387 "\x74\x68\x65\xa1" /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
/* Expected charset name per input, indexed in parallel with testStrings. */
389 static const char *testResults
[] = {
400 UCharsetDetector
*csd
= ucsdet_open(&status
);
401 const UCharsetMatch
*match
;
403 ucsdet_setDeclaredEncoding(csd
, "ISO-2022-JP", -1, &status
);
/* status covers both ucsdet_open and ucsdet_setDeclaredEncoding at this point. */
405 if (U_FAILURE(status
)) {
406 log_err("Couldn't open detector. %s\n", u_errorName(status
));
410 for (idx
= 0; idx
< UPRV_LENGTHOF(testStrings
); idx
++) {
411 ucsdet_setText(csd
, testStrings
[idx
], -1, &status
);
412 match
= ucsdet_detect(csd
, &status
);
/* No-match handling: an error only when a result was expected. */
415 if (testResults
[idx
] != NULL
) {
416 log_err("Unexpectedly got no results at index %d.\n", idx
);
419 log_verbose("Got no result as expected at index %d.\n", idx
);
/* Match handling: an error when none was expected or the name differs. */
424 if (testResults
[idx
] == NULL
|| strcmp(ucsdet_getName(match
, &status
), testResults
[idx
]) != 0) {
425 log_err("Unexpectedly got %s instead of %s at index %d with confidence %d.\n",
426 ucsdet_getName(match
, &status
), testResults
[idx
], idx
, ucsdet_getConfidence(match
, &status
));
/*
 * Converts two Hebrew samples to IBM424 bytes and checks the detected
 * charset names: the bytes made from `chars` are expected to be reported as
 * "IBM424_rtl" and the bytes made from `chars_reverse` as "IBM424_ltr".
 * A wrong name is reported through log_data_err ("Are you missing data?"),
 * a missing match through log_err.
 *
 * NOTE(review): the ends of the array initializers, the declaration of
 * `name`, the no-match conditions and the cleanup are not visible in this
 * listing.
 */
435 static void TestIBM424(void)
437 UErrorCode status
= U_ZERO_ERROR
;
/* First sample. */
439 static const UChar chars
[] = {
440 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
441 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
442 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
443 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
444 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
445 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
446 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
447 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
448 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
449 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
450 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
451 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
452 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
453 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
454 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
455 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
456 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
/* Second sample; presumably the first one in reverse order, as the name suggests - verify. */
459 static const UChar chars_reverse
[] = {
460 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
461 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
462 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
463 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
464 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
465 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
466 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
467 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
468 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
469 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
470 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
471 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
472 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
473 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
474 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
475 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
476 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
/* Lengths are element counts of the arrays, so any trailing 0x0000 element is converted too. */
480 int32_t bLength
= 0, brLength
= 0, cLength
= UPRV_LENGTHOF(chars
), crLength
= UPRV_LENGTHOF(chars_reverse
);
482 char *bytes
= extractBytes(chars
, cLength
, "IBM424", &bLength
);
483 char *bytes_r
= extractBytes(chars_reverse
, crLength
, "IBM424", &brLength
);
485 UCharsetDetector
*csd
= ucsdet_open(&status
);
486 const UCharsetMatch
*match
;
/* First pass: bytes of `chars`, expected name "IBM424_rtl". */
489 ucsdet_setText(csd
, bytes
, bLength
, &status
);
490 match
= ucsdet_detect(csd
, &status
);
493 log_err("Encoding detection failure for IBM424_rtl: got no matches.\n");
497 name
= ucsdet_getName(match
, &status
);
498 if (strcmp(name
, "IBM424_rtl") != 0) {
499 log_data_err("Encoding detection failure for IBM424_rtl: got %s. (Are you missing data?)\n", name
);
/* Second pass: bytes of `chars_reverse`, expected name "IBM424_ltr". */
502 ucsdet_setText(csd
, bytes_r
, brLength
, &status
);
503 match
= ucsdet_detect(csd
, &status
);
506 log_err("Encoding detection failure for IBM424_ltr: got no matches.\n");
510 name
= ucsdet_getName(match
, &status
);
511 if (strcmp(name
, "IBM424_ltr") != 0) {
512 log_data_err("Encoding detection failure for IBM424_ltr: got %s. (Are you missing data?)\n", name
);
/*
 * Converts two Arabic samples to IBM420 bytes and checks the detected
 * charset names: the bytes made from `chars` are expected to be reported as
 * "IBM420_rtl" and the bytes made from `chars_reverse` as "IBM420_ltr".
 * A wrong name is reported through log_data_err ("Are you missing data?"),
 * a missing match through log_err.
 *
 * NOTE(review): the ends of both array initializers, the declaration of
 * `name`, the no-match conditions and the cleanup are not visible in this
 * listing.
 */
521 static void TestIBM420(void)
523 UErrorCode status
= U_ZERO_ERROR
;
/* First sample. */
525 static const UChar chars
[] = {
526 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
527 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
528 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
529 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
530 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
531 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
532 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
533 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
534 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
535 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
536 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
537 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
538 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
/* Second sample; presumably the first one in reverse order, as the name suggests - verify. */
541 static const UChar chars_reverse
[] = {
542 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
543 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
544 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
545 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
546 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
547 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
548 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
549 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
550 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
551 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
552 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
553 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
554 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
/* Lengths are element counts of the arrays, so every array element is converted. */
558 int32_t bLength
= 0, brLength
= 0, cLength
= UPRV_LENGTHOF(chars
), crLength
= UPRV_LENGTHOF(chars_reverse
);
560 char *bytes
= extractBytes(chars
, cLength
, "IBM420", &bLength
);
561 char *bytes_r
= extractBytes(chars_reverse
, crLength
, "IBM420", &brLength
);
563 UCharsetDetector
*csd
= ucsdet_open(&status
);
564 const UCharsetMatch
*match
;
/* First pass: bytes of `chars`, expected name "IBM420_rtl". */
567 ucsdet_setText(csd
, bytes
, bLength
, &status
);
568 match
= ucsdet_detect(csd
, &status
);
571 log_err("Encoding detection failure for IBM420_rtl: got no matches.\n");
575 name
= ucsdet_getName(match
, &status
);
576 if (strcmp(name
, "IBM420_rtl") != 0) {
577 log_data_err("Encoding detection failure for IBM420_rtl: got %s. (Are you missing data?)\n", name
);
/* Second pass: bytes of `chars_reverse`, expected name "IBM420_ltr". */
580 ucsdet_setText(csd
, bytes_r
, brLength
, &status
);
581 match
= ucsdet_detect(csd
, &status
);
584 log_err("Encoding detection failure for IBM420_ltr: got no matches.\n");
588 name
= ucsdet_getName(match
, &status
);
589 if (strcmp(name
, "IBM420_ltr") != 0) {
590 log_data_err("Encoding detection failure for IBM420_ltr: got %s. (Are you missing data?)\n", name
);
599 #if U_PLATFORM_IS_DARWIN_BASED
// Reads the whole file at `path` into a buffer obtained from uprv_malloc.
// Copied from cbiapts.c.
//
// path:         file to read
// dataBufSizeP: if non-NULL, receives the number of bytes read (0 on failure)
// returns:      the buffer, which the caller must release with uprv_free,
//               or NULL after logging a data error (open, size, allocation
//               or short-read failure)
static void* dataBufFromFile(const char* path, long* dataBufSizeP) {
    FILE * dataFile;
    void * dataBuf;
    long dataBufSize = 0, dataFileRead = 0;

    if (dataBufSizeP) {
        *dataBufSizeP = 0;
    }
    dataFile = fopen(path, "r");
    if (dataFile == NULL) {
        log_data_err("FAIL: for %s, fopen fails\n", path);
        return NULL;
    }
    // ftell reports failure as -1; never hand a negative size to the allocator.
    if (fseek(dataFile, 0, SEEK_END) != 0 || (dataBufSize = ftell(dataFile)) < 0) {
        log_data_err("FAIL: for %s, cannot determine file size\n", path);
        fclose(dataFile);
        return NULL;
    }
    rewind(dataFile);

    dataBuf = uprv_malloc(dataBufSize);
    if (dataBuf != NULL) {
        dataFileRead = (long)fread(dataBuf, 1, (size_t)dataBufSize, dataFile);
    }
    // The file is no longer needed, whatever the outcome of the read.
    fclose(dataFile);
    if (dataBuf == NULL) {
        log_data_err("FAIL: for %s, uprv_malloc fails for dataBuf[%ld]\n", path, dataBufSize);
        return NULL;
    }
    if (dataFileRead < dataBufSize) {
        log_data_err("FAIL: for %s, fread fails, read %ld of %ld\n", path, dataFileRead, dataBufSize);
        uprv_free(dataBuf);
        return NULL;
    }
    if (dataBufSizeP) {
        *dataBufSizeP = dataBufSize;
    }
    return dataBuf;
}
// Pairs a sample text file with the charset name expected for it.
typedef struct {
    const char* sampleTextPath; // relative to cintltst directory
    const char* encodingName;   // expected
} SampleTextAndEncoding;

// Mail samples used by TestMailFilterCSS. The list ends with a NULL path,
// which is the sentinel that test's loop stops on.
static const SampleTextAndEncoding mailSampleTests[] = {
    { "../testdata/encodingSamples/mailExample_Latin1_2.txt", "iso-8859-1" },
    { "../testdata/encodingSamples/mailExample_Latin1_3.txt", "iso-8859-1" },
    { "../testdata/encodingSamples/mailExample_Latin1_4.txt", "iso-8859-1" },
    { "../testdata/encodingSamples/mailExample_Latin1_6.txt", "iso-8859-1" },
    { "../testdata/encodingSamples/mailExample_Latin1_7.txt", "iso-8859-1" },
    { "../testdata/encodingSamples/mailExample_Latin1_8.txt", "iso-8859-1" },
    { "../testdata/encodingSamples/mailExample_Latin1_9.txt", "iso-8859-1" },
    { "../testdata/encodingSamples/mailExample_Latin1Esc_2.txt", "iso-8859-1" },
    { "../testdata/encodingSamples/mailExample_Latin1Esc_3.txt", "iso-8859-1" },
    { "../testdata/encodingSamples/mailExample_Latin1Esc_4.txt", "iso-8859-1" },
    { "../testdata/encodingSamples/mailExample_Latin1Esc_6.txt", "iso-8859-1" },
    { "../testdata/encodingSamples/mailExample_Latin1Esc_7.txt", "iso-8859-1" },
    { "../testdata/encodingSamples/mailExample_Latin1Esc_8.txt", "iso-8859-1" },
    { "../testdata/encodingSamples/mailExample_Latin1Esc_9.txt", "iso-8859-1" },
    { NULL, NULL }
};
663 static void TestMailFilterCSS(void) {
664 UErrorCode status
= U_ZERO_ERROR
;
665 UCharsetDetector
*detector
= ucsdet_open(&status
);
666 if (U_FAILURE(status
)) {
667 log_data_err("ucsdet_open fails. %s\n", u_errorName(status
));
669 const SampleTextAndEncoding
* testPtr
;
670 for (testPtr
= mailSampleTests
; testPtr
->sampleTextPath
!= NULL
; testPtr
++) {
672 char * sampleText
= (char *)dataBufFromFile(testPtr
->sampleTextPath
, &sampleTextLen
);
673 if (sampleText
!= NULL
) { // dataBufFromFile reports the errors that would produce NULL
674 status
= U_ZERO_ERROR
;
675 ucsdet_setText(detector
, sampleText
, sampleTextLen
, &status
);
676 if (U_FAILURE(status
)) {
677 log_data_err("ucsdet_setText fails for text file %s: %s\n", testPtr
->sampleTextPath
, u_errorName(status
));
679 const UCharsetMatch
*highestMatch
= NULL
;
680 ucsdet_enableInputFilter(detector
, TRUE
);
681 highestMatch
= ucsdet_detect(detector
, &status
);
682 if (U_FAILURE(status
) || highestMatch
==NULL
) {
683 log_err("ucsdet_detect fails for text file %s: %s\n", testPtr
->sampleTextPath
, u_errorName(status
));
685 const char *icuName
= ucsdet_getName(highestMatch
, &status
);
686 int32_t confidence
= ucsdet_getConfidence(highestMatch
, &status
);
687 if (U_FAILURE(status
) || icuName
==NULL
) {
688 log_err("ucsdet_getName and/or ucsdet_getConfidence fails for text file %s: %s\n", testPtr
->sampleTextPath
, u_errorName(status
));
690 log_info("For text file %s: expect %s; get %s with confidence %d, text length %ld\n",
691 testPtr
->sampleTextPath
, testPtr
->encodingName
, icuName
, confidence
, sampleTextLen
);
695 uprv_free(sampleText
);
698 ucsdet_close(detector
);
701 #endif /* U_PLATFORM_IS_DARWIN_BASED */