]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/cintltst/ucsdetst.c
ICU-66108.tar.gz
[apple/icu.git] / icuSources / test / cintltst / ucsdetst.c
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
73c04bcf
A
3/*
4 ****************************************************************************
2ca993e8 5 * Copyright (c) 2005-2016, International Business Machines Corporation and *
73c04bcf
A
6 * others. All Rights Reserved. *
7 ****************************************************************************
8 */
9
10#include "unicode/utypes.h"
11
12#include "unicode/ucsdet.h"
13#include "unicode/ucnv.h"
14#include "unicode/ustring.h"
15
16#include "cintltst.h"
2ca993e8 17#include "cmemory.h"
73c04bcf
A
18
19#include <stdlib.h>
20#include <string.h>
21
46f4442e
A
/*
 * Allocate an uninitialized array of `count` objects of `type` with malloc().
 * Evaluates to NULL when the allocation fails. The whole expansion is
 * parenthesized so that it can be used inside a larger expression
 * (for example NEW_ARRAY(char, n)[0]) without changing its meaning.
 */
#define NEW_ARRAY(type,count) ((type *) malloc((count) * sizeof(type)))
/* Release an array obtained from NEW_ARRAY. */
#define DELETE_ARRAY(array) free(array)
73c04bcf
A
24
25static void TestConstruction(void);
26static void TestUTF8(void);
27static void TestUTF16(void);
28static void TestC1Bytes(void);
29static void TestInputFilter(void);
30static void TestChaining(void);
46f4442e 31static void TestBufferOverflow(void);
729e4ab9
A
32static void TestIBM424(void);
33static void TestIBM420(void);
3d1f044b
A
34#if U_PLATFORM_IS_DARWIN_BASED
35static void TestMailFilterCSS(void);
36#endif
73c04bcf
A
37
38void addUCsdetTest(TestNode** root);
39
40void addUCsdetTest(TestNode** root)
41{
42 addTest(root, &TestConstruction, "ucsdetst/TestConstruction");
43 addTest(root, &TestUTF8, "ucsdetst/TestUTF8");
44 addTest(root, &TestUTF16, "ucsdetst/TestUTF16");
45 addTest(root, &TestC1Bytes, "ucsdetst/TestC1Bytes");
46 addTest(root, &TestInputFilter, "ucsdetst/TestInputFilter");
47 addTest(root, &TestChaining, "ucsdetst/TestErrorChaining");
46f4442e 48 addTest(root, &TestBufferOverflow, "ucsdetst/TestBufferOverflow");
729e4ab9
A
49#if !UCONFIG_NO_LEGACY_CONVERSION
50 addTest(root, &TestIBM424, "ucsdetst/TestIBM424");
51 addTest(root, &TestIBM420, "ucsdetst/TestIBM420");
52#endif
3d1f044b
A
53#if U_PLATFORM_IS_DARWIN_BASED
54 addTest(root, &TestMailFilterCSS, "ucsdetst/TestMailFilterCSS");
55#endif
73c04bcf
A
56}
57
58static int32_t preflight(const UChar *src, int32_t length, UConverter *cnv)
59{
60 UErrorCode status;
61 char buffer[1024];
62 char *dest, *destLimit = buffer + sizeof(buffer);
63 const UChar *srcLimit = src + length;
64 int32_t result = 0;
65
66 do {
67 dest = buffer;
68 status = U_ZERO_ERROR;
69 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
70 result += (int32_t) (dest - buffer);
71 } while (status == U_BUFFER_OVERFLOW_ERROR);
72
73 return result;
74}
75
73c04bcf
A
76static char *extractBytes(const UChar *src, int32_t length, const char *codepage, int32_t *byteLength)
77{
78 UErrorCode status = U_ZERO_ERROR;
79 UConverter *cnv = ucnv_open(codepage, &status);
80 int32_t byteCount = preflight(src, length, cnv);
81 const UChar *srcLimit = src + length;
82 char *bytes = NEW_ARRAY(char, byteCount + 1);
83 char *dest = bytes, *destLimit = bytes + byteCount + 1;
84
85 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
86 ucnv_close(cnv);
87
88 *byteLength = byteCount;
89 return bytes;
90}
91
/* Releases a byte buffer that was handed out by extractBytes(). */
static void freeBytes(char *bytes)
{
    DELETE_ARRAY(bytes);
}
96
97static void TestConstruction(void)
98{
99 UErrorCode status = U_ZERO_ERROR;
100 UCharsetDetector *csd = ucsdet_open(&status);
101 UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
102 const char *name;
103 int32_t count = uenum_count(e, &status);
104 int32_t i, length;
105
106 for(i = 0; i < count; i += 1) {
107 name = uenum_next(e, &length, &status);
108
109 if(name == NULL || length <= 0) {
110 log_err("ucsdet_getAllDetectableCharsets() returned a null or empty name!\n");
111 }
112 }
113 /* one past the list of all names must return NULL */
114 name = uenum_next(e, &length, &status);
115 if(name != NULL || length != 0 || U_FAILURE(status)) {
116 log_err("ucsdet_getAllDetectableCharsets(past the list) returned a non-null name!\n");
117 }
118
119 uenum_close(e);
120 ucsdet_close(csd);
121}
122
123static void TestUTF8(void)
124{
125 UErrorCode status = U_ZERO_ERROR;
46f4442e 126 static const char ss[] = "This is a string with some non-ascii characters that will "
73c04bcf
A
127 "be converted to UTF-8, then shoved through the detection process. "
128 "\\u0391\\u0392\\u0393\\u0394\\u0395"
129 "Sure would be nice if our source could contain Unicode directly!";
130 int32_t byteLength = 0, sLength = 0, dLength = 0;
46f4442e
A
131 UChar s[sizeof(ss)];
132 char *bytes;
73c04bcf
A
133 UCharsetDetector *csd = ucsdet_open(&status);
134 const UCharsetMatch *match;
46f4442e
A
135 UChar detected[sizeof(ss)];
136
137 sLength = u_unescape(ss, s, sizeof(ss));
138 bytes = extractBytes(s, sLength, "UTF-8", &byteLength);
73c04bcf
A
139
140 ucsdet_setText(csd, bytes, byteLength, &status);
46f4442e
A
141 if (U_FAILURE(status)) {
142 log_err("status is %s\n", u_errorName(status));
143 goto bail;
144 }
145
73c04bcf
A
146 match = ucsdet_detect(csd, &status);
147
148 if (match == NULL) {
149 log_err("Detection failure for UTF-8: got no matches.\n");
150 goto bail;
151 }
152
153 dLength = ucsdet_getUChars(match, detected, sLength, &status);
154
155 if (u_strCompare(detected, dLength, s, sLength, FALSE) != 0) {
156 log_err("Round-trip test failed!\n");
157 }
158
159 ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
160
161bail:
73c04bcf
A
162 freeBytes(bytes);
163 ucsdet_close(csd);
164}
165
166static void TestUTF16(void)
167{
168 UErrorCode status = U_ZERO_ERROR;
169 /* Notice the BOM on the start of this string */
46f4442e 170 static const UChar chars[] = {
73c04bcf
A
171 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
172 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
173 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
174 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
175 0x064a, 0x062a, 0x0000};
2ca993e8 176 int32_t beLength = 0, leLength = 0, cLength = UPRV_LENGTHOF(chars);
73c04bcf
A
177 char *beBytes = extractBytes(chars, cLength, "UTF-16BE", &beLength);
178 char *leBytes = extractBytes(chars, cLength, "UTF-16LE", &leLength);
179 UCharsetDetector *csd = ucsdet_open(&status);
180 const UCharsetMatch *match;
181 const char *name;
182 int32_t conf;
183
184 ucsdet_setText(csd, beBytes, beLength, &status);
185 match = ucsdet_detect(csd, &status);
186
187 if (match == NULL) {
188 log_err("Encoding detection failure for UTF-16BE: got no matches.\n");
189 goto try_le;
190 }
191
192 name = ucsdet_getName(match, &status);
193 conf = ucsdet_getConfidence(match, &status);
194
195 if (strcmp(name, "UTF-16BE") != 0) {
196 log_err("Encoding detection failure for UTF-16BE: got %s\n", name);
197 }
198
199 if (conf != 100) {
200 log_err("Did not get 100%% confidence for UTF-16BE: got %d\n", conf);
201 }
202
203try_le:
204 ucsdet_setText(csd, leBytes, leLength, &status);
205 match = ucsdet_detect(csd, &status);
206
207 if (match == NULL) {
208 log_err("Encoding detection failure for UTF-16LE: got no matches.\n");
209 goto bail;
210 }
211
212 name = ucsdet_getName(match, &status);
213 conf = ucsdet_getConfidence(match, &status);
214
215
216 if (strcmp(name, "UTF-16LE") != 0) {
217 log_err("Enconding detection failure for UTF-16LE: got %s\n", name);
218 }
219
220 if (conf != 100) {
221 log_err("Did not get 100%% confidence for UTF-16LE: got %d\n", conf);
222 }
223
224bail:
225 freeBytes(leBytes);
226 freeBytes(beBytes);
227 ucsdet_close(csd);
228}
229
230static void TestC1Bytes(void)
231{
232#if !UCONFIG_NO_LEGACY_CONVERSION
233 UErrorCode status = U_ZERO_ERROR;
46f4442e
A
234 static const char ssISO[] = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
235 static const char ssWindows[] = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.";
73c04bcf 236 int32_t sISOLength = 0, sWindowsLength = 0;
46f4442e
A
237 UChar sISO[sizeof(ssISO)];
238 UChar sWindows[sizeof(ssWindows)];
73c04bcf 239 int32_t lISO = 0, lWindows = 0;
46f4442e
A
240 char *bISO;
241 char *bWindows;
73c04bcf
A
242 UCharsetDetector *csd = ucsdet_open(&status);
243 const UCharsetMatch *match;
244 const char *name;
245
46f4442e
A
246 sISOLength = u_unescape(ssISO, sISO, sizeof(ssISO));
247 sWindowsLength = u_unescape(ssWindows, sWindows, sizeof(ssWindows));
248 bISO = extractBytes(sISO, sISOLength, "ISO-8859-1", &lISO);
249 bWindows = extractBytes(sWindows, sWindowsLength, "windows-1252", &lWindows);
250
73c04bcf
A
251 ucsdet_setText(csd, bWindows, lWindows, &status);
252 match = ucsdet_detect(csd, &status);
253
254 if (match == NULL) {
255 log_err("English test with C1 bytes got no matches.\n");
256 goto bail;
257 }
258
259 name = ucsdet_getName(match, &status);
260
261 if (strcmp(name, "windows-1252") != 0) {
729e4ab9 262 log_data_err("English text with C1 bytes does not detect as windows-1252, but as %s. (Are you missing data?)\n", name);
73c04bcf
A
263 }
264
265 ucsdet_setText(csd, bISO, lISO, &status);
266 match = ucsdet_detect(csd, &status);
267
268 if (match == NULL) {
269 log_err("English text without C1 bytes got no matches.\n");
270 goto bail;
271 }
272
273 name = ucsdet_getName(match, &status);
274
275 if (strcmp(name, "ISO-8859-1") != 0) {
276 log_err("English text without C1 bytes does not detect as ISO-8859-1, but as %s\n", name);
277 }
278
279bail:
280 freeBytes(bWindows);
281 freeBytes(bISO);
282
283 ucsdet_close(csd);
284#endif
285}
286
287static void TestInputFilter(void)
288{
289 UErrorCode status = U_ZERO_ERROR;
46f4442e 290 static const char ss[] = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
73c04bcf 291 int32_t sLength = 0;
46f4442e 292 UChar s[sizeof(ss)];
73c04bcf 293 int32_t byteLength = 0;
46f4442e 294 char *bytes;
73c04bcf
A
295 UCharsetDetector *csd = ucsdet_open(&status);
296 const UCharsetMatch *match;
297 const char *lang, *name;
298
46f4442e
A
299 sLength = u_unescape(ss, s, sizeof(ss));
300 bytes = extractBytes(s, sLength, "ISO-8859-1", &byteLength);
301
73c04bcf
A
302 ucsdet_enableInputFilter(csd, TRUE);
303
304 if (!ucsdet_isInputFilterEnabled(csd)) {
305 log_err("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!\n");
306 }
307
308
309 ucsdet_setText(csd, bytes, byteLength, &status);
310 match = ucsdet_detect(csd, &status);
311
312 if (match == NULL) {
313 log_err("Turning on the input filter resulted in no matches.\n");
314 goto turn_off;
315 }
316
317 name = ucsdet_getName(match, &status);
318
319 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
320 log_err("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name);
321 } else {
322 lang = ucsdet_getLanguage(match, &status);
323
324 if (lang == NULL || strcmp(lang, "fr") != 0) {
325 log_err("Input filter did not strip markup!\n");
326 }
327 }
328
329turn_off:
330 ucsdet_enableInputFilter(csd, FALSE);
331 ucsdet_setText(csd, bytes, byteLength, &status);
332 match = ucsdet_detect(csd, &status);
333
334 if (match == NULL) {
335 log_err("Turning off the input filter resulted in no matches.\n");
336 goto bail;
337 }
338
339 name = ucsdet_getName(match, &status);
340
341 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
342 log_err("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name);
343 } else {
344 lang = ucsdet_getLanguage(match, &status);
345
346 if (lang == NULL || strcmp(lang, "en") != 0) {
347 log_err("Unfiltered input did not detect as English!\n");
348 }
349 }
350
351bail:
352 freeBytes(bytes);
353 ucsdet_close(csd);
354}
355
356static void TestChaining(void) {
357 UErrorCode status = U_USELESS_COLLATOR_ERROR;
358
359 ucsdet_open(&status);
360 ucsdet_setText(NULL, NULL, 0, &status);
361 ucsdet_getName(NULL, &status);
362 ucsdet_getConfidence(NULL, &status);
363 ucsdet_getLanguage(NULL, &status);
364 ucsdet_detect(NULL, &status);
365 ucsdet_setDeclaredEncoding(NULL, NULL, 0, &status);
366 ucsdet_detectAll(NULL, NULL, &status);
367 ucsdet_getUChars(NULL, NULL, 0, &status);
368 ucsdet_getUChars(NULL, NULL, 0, &status);
369 ucsdet_close(NULL);
370
371 /* All of this code should have done nothing. */
372 if (status != U_USELESS_COLLATOR_ERROR) {
373 log_err("Status got changed to %s\n", u_errorName(status));
374 }
375}
46f4442e
A
376
377static void TestBufferOverflow(void) {
378 UErrorCode status = U_ZERO_ERROR;
379 static const char *testStrings[] = {
380 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b", /* A partial ISO-2022 shift state at the end */
381 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24", /* A partial ISO-2022 shift state at the end */
382 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28", /* A partial ISO-2022 shift state at the end */
383 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end with a bad one at the start */
384 "\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end */
385 "\xa1", /* Could be a single byte shift-jis at the end */
386 "\x74\x68\xa1", /* Could be a single byte shift-jis at the end */
387 "\x74\x68\x65\xa1" /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
388 };
389 static const char *testResults[] = {
390 "windows-1252",
391 "windows-1252",
392 "windows-1252",
393 "windows-1252",
394 "ISO-2022-JP",
395 NULL,
396 NULL,
397 "ISO-8859-1"
398 };
399 int32_t idx = 0;
400 UCharsetDetector *csd = ucsdet_open(&status);
401 const UCharsetMatch *match;
402
403 ucsdet_setDeclaredEncoding(csd, "ISO-2022-JP", -1, &status);
404
405 if (U_FAILURE(status)) {
406 log_err("Couldn't open detector. %s\n", u_errorName(status));
407 goto bail;
408 }
409
2ca993e8 410 for (idx = 0; idx < UPRV_LENGTHOF(testStrings); idx++) {
46f4442e
A
411 ucsdet_setText(csd, testStrings[idx], -1, &status);
412 match = ucsdet_detect(csd, &status);
413
414 if (match == NULL) {
415 if (testResults[idx] != NULL) {
416 log_err("Unexpectedly got no results at index %d.\n", idx);
417 }
418 else {
419 log_verbose("Got no result as expected at index %d.\n", idx);
420 }
421 continue;
422 }
423
424 if (testResults[idx] == NULL || strcmp(ucsdet_getName(match, &status), testResults[idx]) != 0) {
425 log_err("Unexpectedly got %s instead of %s at index %d with confidence %d.\n",
426 ucsdet_getName(match, &status), testResults[idx], idx, ucsdet_getConfidence(match, &status));
427 goto bail;
428 }
429 }
430
431bail:
432 ucsdet_close(csd);
433}
434
729e4ab9
A
435static void TestIBM424(void)
436{
437 UErrorCode status = U_ZERO_ERROR;
438
439 static const UChar chars[] = {
440 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
441 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
442 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
443 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
444 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
445 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
446 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
447 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
448 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
449 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
450 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
451 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
452 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
453 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
454 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
455 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
456 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
457 };
458
459 static const UChar chars_reverse[] = {
460 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
461 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
462 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
463 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
464 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
465 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
466 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
467 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
468 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
469 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
470 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
471 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
472 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
473 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
474 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
475 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
476 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
477 0x0000
478 };
479
2ca993e8 480 int32_t bLength = 0, brLength = 0, cLength = UPRV_LENGTHOF(chars), crLength = UPRV_LENGTHOF(chars_reverse);
729e4ab9
A
481
482 char *bytes = extractBytes(chars, cLength, "IBM424", &bLength);
483 char *bytes_r = extractBytes(chars_reverse, crLength, "IBM424", &brLength);
484
485 UCharsetDetector *csd = ucsdet_open(&status);
486 const UCharsetMatch *match;
487 const char *name;
488
489 ucsdet_setText(csd, bytes, bLength, &status);
490 match = ucsdet_detect(csd, &status);
491
492 if (match == NULL) {
493 log_err("Encoding detection failure for IBM424_rtl: got no matches.\n");
494 goto bail;
495 }
496
497 name = ucsdet_getName(match, &status);
498 if (strcmp(name, "IBM424_rtl") != 0) {
499 log_data_err("Encoding detection failure for IBM424_rtl: got %s. (Are you missing data?)\n", name);
500 }
501
502 ucsdet_setText(csd, bytes_r, brLength, &status);
503 match = ucsdet_detect(csd, &status);
504
505 if (match == NULL) {
506 log_err("Encoding detection failure for IBM424_ltr: got no matches.\n");
507 goto bail;
508 }
509
510 name = ucsdet_getName(match, &status);
511 if (strcmp(name, "IBM424_ltr") != 0) {
512 log_data_err("Encoding detection failure for IBM424_ltr: got %s. (Are you missing data?)\n", name);
513 }
514
515bail:
516 freeBytes(bytes);
517 freeBytes(bytes_r);
518 ucsdet_close(csd);
519}
520
521static void TestIBM420(void)
522{
523 UErrorCode status = U_ZERO_ERROR;
524
525 static const UChar chars[] = {
526 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
527 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
528 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
529 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
530 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
531 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
532 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
533 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
534 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
535 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
536 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
537 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
538 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
539 0x0000
540 };
541 static const UChar chars_reverse[] = {
542 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
543 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
544 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
545 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
546 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
547 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
548 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
549 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
550 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
551 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
552 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
553 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
554 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
555 0x0000,
556 };
557
2ca993e8 558 int32_t bLength = 0, brLength = 0, cLength = UPRV_LENGTHOF(chars), crLength = UPRV_LENGTHOF(chars_reverse);
729e4ab9
A
559
560 char *bytes = extractBytes(chars, cLength, "IBM420", &bLength);
561 char *bytes_r = extractBytes(chars_reverse, crLength, "IBM420", &brLength);
562
563 UCharsetDetector *csd = ucsdet_open(&status);
564 const UCharsetMatch *match;
565 const char *name;
566
567 ucsdet_setText(csd, bytes, bLength, &status);
568 match = ucsdet_detect(csd, &status);
569
570 if (match == NULL) {
571 log_err("Encoding detection failure for IBM420_rtl: got no matches.\n");
572 goto bail;
573 }
574
575 name = ucsdet_getName(match, &status);
576 if (strcmp(name, "IBM420_rtl") != 0) {
577 log_data_err("Encoding detection failure for IBM420_rtl: got %s. (Are you missing data?)\n", name);
578 }
579
580 ucsdet_setText(csd, bytes_r, brLength, &status);
581 match = ucsdet_detect(csd, &status);
582
583 if (match == NULL) {
584 log_err("Encoding detection failure for IBM420_ltr: got no matches.\n");
585 goto bail;
586 }
587
588 name = ucsdet_getName(match, &status);
589 if (strcmp(name, "IBM420_ltr") != 0) {
590 log_data_err("Encoding detection failure for IBM420_ltr: got %s. (Are you missing data?)\n", name);
591 }
592
593bail:
594 freeBytes(bytes);
595 freeBytes(bytes_r);
596 ucsdet_close(csd);
597}
3d1f044b
A
598
599#if U_PLATFORM_IS_DARWIN_BASED
600#include <stdio.h>
// Reads the whole file at `path` into a buffer obtained from uprv_malloc();
// the caller must release the buffer with uprv_free(). Copied from cbiapts.c.
//
// path:         file to read.
// dataBufSizeP: if not NULL, receives the number of bytes read, or 0 on error.
// Returns the buffer, or NULL after logging a data error when the file cannot
// be opened, sized, allocated for, or read completely.
static void* dataBufFromFile(const char* path, long* dataBufSizeP) {
    FILE * dataFile;
    void * dataBuf;
    long dataBufSize;
    size_t dataFileRead = 0;

    if (dataBufSizeP) {
        *dataBufSizeP = 0;
    }
    // Binary mode: the samples are raw bytes and must not be translated.
    dataFile = fopen(path, "rb");
    if (dataFile == NULL) {
        log_data_err("FAIL: for %s, fopen fails\n", path);
        return NULL;
    }
    // Find the file size; ftell() returns -1 on failure and that value must
    // never be used as an allocation size.
    if (fseek(dataFile, 0, SEEK_END) != 0
            || (dataBufSize = ftell(dataFile)) < 0
            || fseek(dataFile, 0, SEEK_SET) != 0) {
        log_data_err("FAIL: for %s, cannot determine file size\n", path);
        fclose(dataFile);
        return NULL;
    }

    dataBuf = uprv_malloc(dataBufSize);
    if (dataBuf != NULL) {
        dataFileRead = fread(dataBuf, 1, (size_t)dataBufSize, dataFile);
    }
    fclose(dataFile);
    if (dataBuf == NULL) {
        log_data_err("FAIL: for %s, uprv_malloc fails for dataBuf[%ld]\n", path, dataBufSize);
        return NULL;
    }
    if (dataFileRead < (size_t)dataBufSize) {
        log_data_err("FAIL: for %s, fread fails, read %ld of %ld\n", path, (long)dataFileRead, dataBufSize);
        uprv_free(dataBuf);
        return NULL;
    }
    if (dataBufSizeP) {
        *dataBufSizeP = dataBufSize;
    }
    return dataBuf;
}
639
// One charset-detection sample: a text file and the encoding expected for it.
typedef struct {
    const char* sampleTextPath; // path of the sample file, relative to cintltst directory
    const char* encodingName;   // expected encoding name
} SampleTextAndEncoding;
644
340931cb
A
// Directory prefix of the test data files: "testdata" when APPLE_XCODE_BUILD
// is defined, "../testdata" otherwise.
#if defined(APPLE_XCODE_BUILD)
#define TESTDATA_DIR "testdata"
#else
#define TESTDATA_DIR "../testdata"
#endif
650
3d1f044b 651static const SampleTextAndEncoding mailSampleTests[] = {
340931cb
A
652 { TESTDATA_DIR "/encodingSamples/mailExample_Latin1_2.txt", "iso-8859-1" },
653 { TESTDATA_DIR "/encodingSamples/mailExample_Latin1_3.txt", "iso-8859-1" },
654 { TESTDATA_DIR "/encodingSamples/mailExample_Latin1_4.txt", "iso-8859-1" },
655 { TESTDATA_DIR "/encodingSamples/mailExample_Latin1_6.txt", "iso-8859-1" },
656 { TESTDATA_DIR "/encodingSamples/mailExample_Latin1_7.txt", "iso-8859-1" },
657 { TESTDATA_DIR "/encodingSamples/mailExample_Latin1_8.txt", "iso-8859-1" },
658 { TESTDATA_DIR "/encodingSamples/mailExample_Latin1_9.txt", "iso-8859-1" },
659 { TESTDATA_DIR "/encodingSamples/mailExample_Latin1Esc_2.txt", "iso-8859-1" },
660 { TESTDATA_DIR "/encodingSamples/mailExample_Latin1Esc_3.txt", "iso-8859-1" },
661 { TESTDATA_DIR "/encodingSamples/mailExample_Latin1Esc_4.txt", "iso-8859-1" },
662 { TESTDATA_DIR "/encodingSamples/mailExample_Latin1Esc_6.txt", "iso-8859-1" },
663 { TESTDATA_DIR "/encodingSamples/mailExample_Latin1Esc_7.txt", "iso-8859-1" },
664 { TESTDATA_DIR "/encodingSamples/mailExample_Latin1Esc_8.txt", "iso-8859-1" },
665 { TESTDATA_DIR "/encodingSamples/mailExample_Latin1Esc_9.txt", "iso-8859-1" },
1a147d09 666 // additions for <rdar://problem/56373519>
340931cb 667 { TESTDATA_DIR "/encodingSamples/mailExample_Latin1_11.txt", "iso-8859-1" },
3d1f044b
A
668 { NULL, NULL }
669};
670
671static void TestMailFilterCSS(void) {
672 UErrorCode status = U_ZERO_ERROR;
673 UCharsetDetector *detector = ucsdet_open(&status);
674 if (U_FAILURE(status)) {
675 log_data_err("ucsdet_open fails. %s\n", u_errorName(status));
676 } else {
677 const SampleTextAndEncoding* testPtr;
678 for (testPtr = mailSampleTests; testPtr->sampleTextPath != NULL; testPtr++) {
679 long sampleTextLen;
680 char * sampleText = (char *)dataBufFromFile(testPtr->sampleTextPath, &sampleTextLen);
681 if (sampleText != NULL) { // dataBufFromFile reports the errors that would produce NULL
682 status = U_ZERO_ERROR;
683 ucsdet_setText(detector, sampleText, sampleTextLen, &status);
684 if (U_FAILURE(status)) {
685 log_data_err("ucsdet_setText fails for text file %s: %s\n", testPtr->sampleTextPath, u_errorName(status));
686 } else {
687 const UCharsetMatch *highestMatch = NULL;
688 ucsdet_enableInputFilter(detector, TRUE);
689 highestMatch = ucsdet_detect(detector, &status);
690 if (U_FAILURE(status) || highestMatch==NULL) {
691 log_err("ucsdet_detect fails for text file %s: %s\n", testPtr->sampleTextPath, u_errorName(status));
692 } else {
693 const char *icuName = ucsdet_getName(highestMatch, &status);
694 int32_t confidence = ucsdet_getConfidence(highestMatch, &status);
1a147d09 695 const char *langCode = ucsdet_getLanguage(highestMatch, &status);
3d1f044b
A
696 if (U_FAILURE(status) || icuName==NULL) {
697 log_err("ucsdet_getName and/or ucsdet_getConfidence fails for text file %s: %s\n", testPtr->sampleTextPath, u_errorName(status));
698 } else {
1a147d09
A
699 log_info("For text file %s: expect %s; get %s with confidence %d, langCode %s; text length %ld\n",
700 testPtr->sampleTextPath, testPtr->encodingName, icuName, confidence, langCode, sampleTextLen);
3d1f044b
A
701 }
702 }
703 }
704 uprv_free(sampleText);
705 }
706 }
707 ucsdet_close(detector);
708 }
709}
710#endif /* U_PLATFORM_IS_DARWIN_BASED */