]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/cintltst/ucsdetst.c
ICU-491.11.1.tar.gz
[apple/icu.git] / icuSources / test / cintltst / ucsdetst.c
CommitLineData
73c04bcf
A
1/*
2 ****************************************************************************
729e4ab9 3 * Copyright (c) 2005-2009, International Business Machines Corporation and *
73c04bcf
A
4 * others. All Rights Reserved. *
5 ****************************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#include "unicode/ucsdet.h"
11#include "unicode/ucnv.h"
12#include "unicode/ustring.h"
13
14#include "cintltst.h"
15
16#include <stdlib.h>
17#include <string.h>
18
/*
 * Number of elements in a true array object (not a pointer). The argument is
 * parenthesized before indexing so any array-valued expression can be passed.
 */
#define ARRAY_SIZE(array) (sizeof(array)/sizeof((array)[0]))

/*
 * Allocate an uninitialized array of `count` elements of `type` with malloc().
 * The whole expansion is parenthesized so the result can be used directly in a
 * larger expression. The result may be NULL.
 */
#define NEW_ARRAY(type,count) ((type *) malloc((count) * sizeof(type)))

/* Release an array obtained from NEW_ARRAY. */
#define DELETE_ARRAY(array) free(array)
73c04bcf
A
23
/* Forward declarations of the test cases registered by addUCsdetTest(). */

/* Detector construction and the basic Unicode encodings. */
static void TestConstruction(void);
static void TestUTF8(void);
static void TestUTF16(void);

/* Single-byte encodings and markup filtering. */
static void TestC1Bytes(void);
static void TestInputFilter(void);

/* API robustness. */
static void TestChaining(void);
static void TestBufferOverflow(void);

/* EBCDIC code pages (only registered when legacy conversion is available). */
static void TestIBM424(void);
static void TestIBM420(void);
73c04bcf
A
33
34void addUCsdetTest(TestNode** root);
35
36void addUCsdetTest(TestNode** root)
37{
38 addTest(root, &TestConstruction, "ucsdetst/TestConstruction");
39 addTest(root, &TestUTF8, "ucsdetst/TestUTF8");
40 addTest(root, &TestUTF16, "ucsdetst/TestUTF16");
41 addTest(root, &TestC1Bytes, "ucsdetst/TestC1Bytes");
42 addTest(root, &TestInputFilter, "ucsdetst/TestInputFilter");
43 addTest(root, &TestChaining, "ucsdetst/TestErrorChaining");
46f4442e 44 addTest(root, &TestBufferOverflow, "ucsdetst/TestBufferOverflow");
729e4ab9
A
45#if !UCONFIG_NO_LEGACY_CONVERSION
46 addTest(root, &TestIBM424, "ucsdetst/TestIBM424");
47 addTest(root, &TestIBM420, "ucsdetst/TestIBM420");
48#endif
73c04bcf
A
49}
50
51static int32_t preflight(const UChar *src, int32_t length, UConverter *cnv)
52{
53 UErrorCode status;
54 char buffer[1024];
55 char *dest, *destLimit = buffer + sizeof(buffer);
56 const UChar *srcLimit = src + length;
57 int32_t result = 0;
58
59 do {
60 dest = buffer;
61 status = U_ZERO_ERROR;
62 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
63 result += (int32_t) (dest - buffer);
64 } while (status == U_BUFFER_OVERFLOW_ERROR);
65
66 return result;
67}
68
73c04bcf
A
69static char *extractBytes(const UChar *src, int32_t length, const char *codepage, int32_t *byteLength)
70{
71 UErrorCode status = U_ZERO_ERROR;
72 UConverter *cnv = ucnv_open(codepage, &status);
73 int32_t byteCount = preflight(src, length, cnv);
74 const UChar *srcLimit = src + length;
75 char *bytes = NEW_ARRAY(char, byteCount + 1);
76 char *dest = bytes, *destLimit = bytes + byteCount + 1;
77
78 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
79 ucnv_close(cnv);
80
81 *byteLength = byteCount;
82 return bytes;
83}
84
/* Releases a byte buffer that was returned by extractBytes(). */
static void freeBytes(char *bytes) {
    DELETE_ARRAY(bytes);
}
89
90static void TestConstruction(void)
91{
92 UErrorCode status = U_ZERO_ERROR;
93 UCharsetDetector *csd = ucsdet_open(&status);
94 UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
95 const char *name;
96 int32_t count = uenum_count(e, &status);
97 int32_t i, length;
98
99 for(i = 0; i < count; i += 1) {
100 name = uenum_next(e, &length, &status);
101
102 if(name == NULL || length <= 0) {
103 log_err("ucsdet_getAllDetectableCharsets() returned a null or empty name!\n");
104 }
105 }
106 /* one past the list of all names must return NULL */
107 name = uenum_next(e, &length, &status);
108 if(name != NULL || length != 0 || U_FAILURE(status)) {
109 log_err("ucsdet_getAllDetectableCharsets(past the list) returned a non-null name!\n");
110 }
111
112 uenum_close(e);
113 ucsdet_close(csd);
114}
115
116static void TestUTF8(void)
117{
118 UErrorCode status = U_ZERO_ERROR;
46f4442e 119 static const char ss[] = "This is a string with some non-ascii characters that will "
73c04bcf
A
120 "be converted to UTF-8, then shoved through the detection process. "
121 "\\u0391\\u0392\\u0393\\u0394\\u0395"
122 "Sure would be nice if our source could contain Unicode directly!";
123 int32_t byteLength = 0, sLength = 0, dLength = 0;
46f4442e
A
124 UChar s[sizeof(ss)];
125 char *bytes;
73c04bcf
A
126 UCharsetDetector *csd = ucsdet_open(&status);
127 const UCharsetMatch *match;
46f4442e
A
128 UChar detected[sizeof(ss)];
129
130 sLength = u_unescape(ss, s, sizeof(ss));
131 bytes = extractBytes(s, sLength, "UTF-8", &byteLength);
73c04bcf
A
132
133 ucsdet_setText(csd, bytes, byteLength, &status);
46f4442e
A
134 if (U_FAILURE(status)) {
135 log_err("status is %s\n", u_errorName(status));
136 goto bail;
137 }
138
73c04bcf
A
139 match = ucsdet_detect(csd, &status);
140
141 if (match == NULL) {
142 log_err("Detection failure for UTF-8: got no matches.\n");
143 goto bail;
144 }
145
146 dLength = ucsdet_getUChars(match, detected, sLength, &status);
147
148 if (u_strCompare(detected, dLength, s, sLength, FALSE) != 0) {
149 log_err("Round-trip test failed!\n");
150 }
151
152 ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
153
154bail:
73c04bcf
A
155 freeBytes(bytes);
156 ucsdet_close(csd);
157}
158
159static void TestUTF16(void)
160{
161 UErrorCode status = U_ZERO_ERROR;
162 /* Notice the BOM on the start of this string */
46f4442e 163 static const UChar chars[] = {
73c04bcf
A
164 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
165 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
166 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
167 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
168 0x064a, 0x062a, 0x0000};
169 int32_t beLength = 0, leLength = 0, cLength = ARRAY_SIZE(chars);
170 char *beBytes = extractBytes(chars, cLength, "UTF-16BE", &beLength);
171 char *leBytes = extractBytes(chars, cLength, "UTF-16LE", &leLength);
172 UCharsetDetector *csd = ucsdet_open(&status);
173 const UCharsetMatch *match;
174 const char *name;
175 int32_t conf;
176
177 ucsdet_setText(csd, beBytes, beLength, &status);
178 match = ucsdet_detect(csd, &status);
179
180 if (match == NULL) {
181 log_err("Encoding detection failure for UTF-16BE: got no matches.\n");
182 goto try_le;
183 }
184
185 name = ucsdet_getName(match, &status);
186 conf = ucsdet_getConfidence(match, &status);
187
188 if (strcmp(name, "UTF-16BE") != 0) {
189 log_err("Encoding detection failure for UTF-16BE: got %s\n", name);
190 }
191
192 if (conf != 100) {
193 log_err("Did not get 100%% confidence for UTF-16BE: got %d\n", conf);
194 }
195
196try_le:
197 ucsdet_setText(csd, leBytes, leLength, &status);
198 match = ucsdet_detect(csd, &status);
199
200 if (match == NULL) {
201 log_err("Encoding detection failure for UTF-16LE: got no matches.\n");
202 goto bail;
203 }
204
205 name = ucsdet_getName(match, &status);
206 conf = ucsdet_getConfidence(match, &status);
207
208
209 if (strcmp(name, "UTF-16LE") != 0) {
210 log_err("Enconding detection failure for UTF-16LE: got %s\n", name);
211 }
212
213 if (conf != 100) {
214 log_err("Did not get 100%% confidence for UTF-16LE: got %d\n", conf);
215 }
216
217bail:
218 freeBytes(leBytes);
219 freeBytes(beBytes);
220 ucsdet_close(csd);
221}
222
223static void TestC1Bytes(void)
224{
225#if !UCONFIG_NO_LEGACY_CONVERSION
226 UErrorCode status = U_ZERO_ERROR;
46f4442e
A
227 static const char ssISO[] = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
228 static const char ssWindows[] = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.";
73c04bcf 229 int32_t sISOLength = 0, sWindowsLength = 0;
46f4442e
A
230 UChar sISO[sizeof(ssISO)];
231 UChar sWindows[sizeof(ssWindows)];
73c04bcf 232 int32_t lISO = 0, lWindows = 0;
46f4442e
A
233 char *bISO;
234 char *bWindows;
73c04bcf
A
235 UCharsetDetector *csd = ucsdet_open(&status);
236 const UCharsetMatch *match;
237 const char *name;
238
46f4442e
A
239 sISOLength = u_unescape(ssISO, sISO, sizeof(ssISO));
240 sWindowsLength = u_unescape(ssWindows, sWindows, sizeof(ssWindows));
241 bISO = extractBytes(sISO, sISOLength, "ISO-8859-1", &lISO);
242 bWindows = extractBytes(sWindows, sWindowsLength, "windows-1252", &lWindows);
243
73c04bcf
A
244 ucsdet_setText(csd, bWindows, lWindows, &status);
245 match = ucsdet_detect(csd, &status);
246
247 if (match == NULL) {
248 log_err("English test with C1 bytes got no matches.\n");
249 goto bail;
250 }
251
252 name = ucsdet_getName(match, &status);
253
254 if (strcmp(name, "windows-1252") != 0) {
729e4ab9 255 log_data_err("English text with C1 bytes does not detect as windows-1252, but as %s. (Are you missing data?)\n", name);
73c04bcf
A
256 }
257
258 ucsdet_setText(csd, bISO, lISO, &status);
259 match = ucsdet_detect(csd, &status);
260
261 if (match == NULL) {
262 log_err("English text without C1 bytes got no matches.\n");
263 goto bail;
264 }
265
266 name = ucsdet_getName(match, &status);
267
268 if (strcmp(name, "ISO-8859-1") != 0) {
269 log_err("English text without C1 bytes does not detect as ISO-8859-1, but as %s\n", name);
270 }
271
272bail:
273 freeBytes(bWindows);
274 freeBytes(bISO);
275
276 ucsdet_close(csd);
277#endif
278}
279
280static void TestInputFilter(void)
281{
282 UErrorCode status = U_ZERO_ERROR;
46f4442e 283 static const char ss[] = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
73c04bcf 284 int32_t sLength = 0;
46f4442e 285 UChar s[sizeof(ss)];
73c04bcf 286 int32_t byteLength = 0;
46f4442e 287 char *bytes;
73c04bcf
A
288 UCharsetDetector *csd = ucsdet_open(&status);
289 const UCharsetMatch *match;
290 const char *lang, *name;
291
46f4442e
A
292 sLength = u_unescape(ss, s, sizeof(ss));
293 bytes = extractBytes(s, sLength, "ISO-8859-1", &byteLength);
294
73c04bcf
A
295 ucsdet_enableInputFilter(csd, TRUE);
296
297 if (!ucsdet_isInputFilterEnabled(csd)) {
298 log_err("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!\n");
299 }
300
301
302 ucsdet_setText(csd, bytes, byteLength, &status);
303 match = ucsdet_detect(csd, &status);
304
305 if (match == NULL) {
306 log_err("Turning on the input filter resulted in no matches.\n");
307 goto turn_off;
308 }
309
310 name = ucsdet_getName(match, &status);
311
312 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
313 log_err("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name);
314 } else {
315 lang = ucsdet_getLanguage(match, &status);
316
317 if (lang == NULL || strcmp(lang, "fr") != 0) {
318 log_err("Input filter did not strip markup!\n");
319 }
320 }
321
322turn_off:
323 ucsdet_enableInputFilter(csd, FALSE);
324 ucsdet_setText(csd, bytes, byteLength, &status);
325 match = ucsdet_detect(csd, &status);
326
327 if (match == NULL) {
328 log_err("Turning off the input filter resulted in no matches.\n");
329 goto bail;
330 }
331
332 name = ucsdet_getName(match, &status);
333
334 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
335 log_err("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name);
336 } else {
337 lang = ucsdet_getLanguage(match, &status);
338
339 if (lang == NULL || strcmp(lang, "en") != 0) {
340 log_err("Unfiltered input did not detect as English!\n");
341 }
342 }
343
344bail:
345 freeBytes(bytes);
346 ucsdet_close(csd);
347}
348
349static void TestChaining(void) {
350 UErrorCode status = U_USELESS_COLLATOR_ERROR;
351
352 ucsdet_open(&status);
353 ucsdet_setText(NULL, NULL, 0, &status);
354 ucsdet_getName(NULL, &status);
355 ucsdet_getConfidence(NULL, &status);
356 ucsdet_getLanguage(NULL, &status);
357 ucsdet_detect(NULL, &status);
358 ucsdet_setDeclaredEncoding(NULL, NULL, 0, &status);
359 ucsdet_detectAll(NULL, NULL, &status);
360 ucsdet_getUChars(NULL, NULL, 0, &status);
361 ucsdet_getUChars(NULL, NULL, 0, &status);
362 ucsdet_close(NULL);
363
364 /* All of this code should have done nothing. */
365 if (status != U_USELESS_COLLATOR_ERROR) {
366 log_err("Status got changed to %s\n", u_errorName(status));
367 }
368}
46f4442e
A
369
370static void TestBufferOverflow(void) {
371 UErrorCode status = U_ZERO_ERROR;
372 static const char *testStrings[] = {
373 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b", /* A partial ISO-2022 shift state at the end */
374 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24", /* A partial ISO-2022 shift state at the end */
375 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28", /* A partial ISO-2022 shift state at the end */
376 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end with a bad one at the start */
377 "\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end */
378 "\xa1", /* Could be a single byte shift-jis at the end */
379 "\x74\x68\xa1", /* Could be a single byte shift-jis at the end */
380 "\x74\x68\x65\xa1" /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
381 };
382 static const char *testResults[] = {
383 "windows-1252",
384 "windows-1252",
385 "windows-1252",
386 "windows-1252",
387 "ISO-2022-JP",
388 NULL,
389 NULL,
390 "ISO-8859-1"
391 };
392 int32_t idx = 0;
393 UCharsetDetector *csd = ucsdet_open(&status);
394 const UCharsetMatch *match;
395
396 ucsdet_setDeclaredEncoding(csd, "ISO-2022-JP", -1, &status);
397
398 if (U_FAILURE(status)) {
399 log_err("Couldn't open detector. %s\n", u_errorName(status));
400 goto bail;
401 }
402
403 for (idx = 0; idx < ARRAY_SIZE(testStrings); idx++) {
404 ucsdet_setText(csd, testStrings[idx], -1, &status);
405 match = ucsdet_detect(csd, &status);
406
407 if (match == NULL) {
408 if (testResults[idx] != NULL) {
409 log_err("Unexpectedly got no results at index %d.\n", idx);
410 }
411 else {
412 log_verbose("Got no result as expected at index %d.\n", idx);
413 }
414 continue;
415 }
416
417 if (testResults[idx] == NULL || strcmp(ucsdet_getName(match, &status), testResults[idx]) != 0) {
418 log_err("Unexpectedly got %s instead of %s at index %d with confidence %d.\n",
419 ucsdet_getName(match, &status), testResults[idx], idx, ucsdet_getConfidence(match, &status));
420 goto bail;
421 }
422 }
423
424bail:
425 ucsdet_close(csd);
426}
427
729e4ab9
A
428static void TestIBM424(void)
429{
430 UErrorCode status = U_ZERO_ERROR;
431
432 static const UChar chars[] = {
433 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
434 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
435 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
436 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
437 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
438 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
439 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
440 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
441 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
442 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
443 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
444 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
445 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
446 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
447 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
448 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
449 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
450 };
451
452 static const UChar chars_reverse[] = {
453 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
454 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
455 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
456 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
457 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
458 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
459 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
460 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
461 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
462 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
463 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
464 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
465 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
466 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
467 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
468 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
469 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
470 0x0000
471 };
472
473 int32_t bLength = 0, brLength = 0, cLength = ARRAY_SIZE(chars), crLength = ARRAY_SIZE(chars_reverse);
474
475 char *bytes = extractBytes(chars, cLength, "IBM424", &bLength);
476 char *bytes_r = extractBytes(chars_reverse, crLength, "IBM424", &brLength);
477
478 UCharsetDetector *csd = ucsdet_open(&status);
479 const UCharsetMatch *match;
480 const char *name;
481
482 ucsdet_setText(csd, bytes, bLength, &status);
483 match = ucsdet_detect(csd, &status);
484
485 if (match == NULL) {
486 log_err("Encoding detection failure for IBM424_rtl: got no matches.\n");
487 goto bail;
488 }
489
490 name = ucsdet_getName(match, &status);
491 if (strcmp(name, "IBM424_rtl") != 0) {
492 log_data_err("Encoding detection failure for IBM424_rtl: got %s. (Are you missing data?)\n", name);
493 }
494
495 ucsdet_setText(csd, bytes_r, brLength, &status);
496 match = ucsdet_detect(csd, &status);
497
498 if (match == NULL) {
499 log_err("Encoding detection failure for IBM424_ltr: got no matches.\n");
500 goto bail;
501 }
502
503 name = ucsdet_getName(match, &status);
504 if (strcmp(name, "IBM424_ltr") != 0) {
505 log_data_err("Encoding detection failure for IBM424_ltr: got %s. (Are you missing data?)\n", name);
506 }
507
508bail:
509 freeBytes(bytes);
510 freeBytes(bytes_r);
511 ucsdet_close(csd);
512}
513
514static void TestIBM420(void)
515{
516 UErrorCode status = U_ZERO_ERROR;
517
518 static const UChar chars[] = {
519 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
520 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
521 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
522 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
523 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
524 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
525 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
526 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
527 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
528 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
529 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
530 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
531 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
532 0x0000
533 };
534 static const UChar chars_reverse[] = {
535 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
536 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
537 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
538 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
539 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
540 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
541 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
542 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
543 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
544 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
545 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
546 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
547 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
548 0x0000,
549 };
550
551 int32_t bLength = 0, brLength = 0, cLength = ARRAY_SIZE(chars), crLength = ARRAY_SIZE(chars_reverse);
552
553 char *bytes = extractBytes(chars, cLength, "IBM420", &bLength);
554 char *bytes_r = extractBytes(chars_reverse, crLength, "IBM420", &brLength);
555
556 UCharsetDetector *csd = ucsdet_open(&status);
557 const UCharsetMatch *match;
558 const char *name;
559
560 ucsdet_setText(csd, bytes, bLength, &status);
561 match = ucsdet_detect(csd, &status);
562
563 if (match == NULL) {
564 log_err("Encoding detection failure for IBM420_rtl: got no matches.\n");
565 goto bail;
566 }
567
568 name = ucsdet_getName(match, &status);
569 if (strcmp(name, "IBM420_rtl") != 0) {
570 log_data_err("Encoding detection failure for IBM420_rtl: got %s. (Are you missing data?)\n", name);
571 }
572
573 ucsdet_setText(csd, bytes_r, brLength, &status);
574 match = ucsdet_detect(csd, &status);
575
576 if (match == NULL) {
577 log_err("Encoding detection failure for IBM420_ltr: got no matches.\n");
578 goto bail;
579 }
580
581 name = ucsdet_getName(match, &status);
582 if (strcmp(name, "IBM420_ltr") != 0) {
583 log_data_err("Encoding detection failure for IBM420_ltr: got %s. (Are you missing data?)\n", name);
584 }
585
586bail:
587 freeBytes(bytes);
588 freeBytes(bytes_r);
589 ucsdet_close(csd);
590}