]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/cintltst/ucsdetst.c
ICU-551.24.tar.gz
[apple/icu.git] / icuSources / test / cintltst / ucsdetst.c
1 /*
2 ****************************************************************************
3 * Copyright (c) 2005-2009, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 ****************************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #include "unicode/ucsdet.h"
11 #include "unicode/ucnv.h"
12 #include "unicode/ustring.h"
13
14 #include "cintltst.h"
15
16 #include <stdlib.h>
17 #include <string.h>
18
/* Number of elements in a true array (must not be used on a pointer). */
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

/*
 * Allocate / release a heap array of `count` elements of `type`.
 * The storage is uninitialised and NEW_ARRAY yields NULL when malloc fails.
 * The expansion is fully parenthesised so the macro can be embedded in a
 * larger expression (for example NEW_ARRAY(int, n)[0]).
 */
#define NEW_ARRAY(type, count) ((type *) malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) free(array)
23
/*
 * Forward declarations of the test cases defined in this file, so that
 * addUCsdetTest() can refer to them before their definitions.
 */

/* Tests that are always registered. */
static void TestConstruction(void);
static void TestUTF8(void);
static void TestUTF16(void);
static void TestC1Bytes(void);
static void TestInputFilter(void);
static void TestChaining(void);
static void TestBufferOverflow(void);

/* Tests that are registered only when UCONFIG_NO_LEGACY_CONVERSION is off. */
static void TestIBM424(void);
static void TestIBM420(void);
33
34 void addUCsdetTest(TestNode** root);
35
36 void addUCsdetTest(TestNode** root)
37 {
38 addTest(root, &TestConstruction, "ucsdetst/TestConstruction");
39 addTest(root, &TestUTF8, "ucsdetst/TestUTF8");
40 addTest(root, &TestUTF16, "ucsdetst/TestUTF16");
41 addTest(root, &TestC1Bytes, "ucsdetst/TestC1Bytes");
42 addTest(root, &TestInputFilter, "ucsdetst/TestInputFilter");
43 addTest(root, &TestChaining, "ucsdetst/TestErrorChaining");
44 addTest(root, &TestBufferOverflow, "ucsdetst/TestBufferOverflow");
45 #if !UCONFIG_NO_LEGACY_CONVERSION
46 addTest(root, &TestIBM424, "ucsdetst/TestIBM424");
47 addTest(root, &TestIBM420, "ucsdetst/TestIBM420");
48 #endif
49 }
50
51 static int32_t preflight(const UChar *src, int32_t length, UConverter *cnv)
52 {
53 UErrorCode status;
54 char buffer[1024];
55 char *dest, *destLimit = buffer + sizeof(buffer);
56 const UChar *srcLimit = src + length;
57 int32_t result = 0;
58
59 do {
60 dest = buffer;
61 status = U_ZERO_ERROR;
62 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
63 result += (int32_t) (dest - buffer);
64 } while (status == U_BUFFER_OVERFLOW_ERROR);
65
66 return result;
67 }
68
69 static char *extractBytes(const UChar *src, int32_t length, const char *codepage, int32_t *byteLength)
70 {
71 UErrorCode status = U_ZERO_ERROR;
72 UConverter *cnv = ucnv_open(codepage, &status);
73 int32_t byteCount = preflight(src, length, cnv);
74 const UChar *srcLimit = src + length;
75 char *bytes = NEW_ARRAY(char, byteCount + 1);
76 char *dest = bytes, *destLimit = bytes + byteCount + 1;
77
78 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
79 ucnv_close(cnv);
80
81 *byteLength = byteCount;
82 return bytes;
83 }
84
/* Releases a buffer returned by extractBytes(); expands to free(bytes). */
static void freeBytes(char *bytes)
{
    DELETE_ARRAY(bytes);
}
89
90 static void TestConstruction(void)
91 {
92 UErrorCode status = U_ZERO_ERROR;
93 UCharsetDetector *csd = ucsdet_open(&status);
94 UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
95 const char *name;
96 int32_t count = uenum_count(e, &status);
97 int32_t i, length;
98
99 for(i = 0; i < count; i += 1) {
100 name = uenum_next(e, &length, &status);
101
102 if(name == NULL || length <= 0) {
103 log_err("ucsdet_getAllDetectableCharsets() returned a null or empty name!\n");
104 }
105 }
106 /* one past the list of all names must return NULL */
107 name = uenum_next(e, &length, &status);
108 if(name != NULL || length != 0 || U_FAILURE(status)) {
109 log_err("ucsdet_getAllDetectableCharsets(past the list) returned a non-null name!\n");
110 }
111
112 uenum_close(e);
113 ucsdet_close(csd);
114 }
115
116 static void TestUTF8(void)
117 {
118 UErrorCode status = U_ZERO_ERROR;
119 static const char ss[] = "This is a string with some non-ascii characters that will "
120 "be converted to UTF-8, then shoved through the detection process. "
121 "\\u0391\\u0392\\u0393\\u0394\\u0395"
122 "Sure would be nice if our source could contain Unicode directly!";
123 int32_t byteLength = 0, sLength = 0, dLength = 0;
124 UChar s[sizeof(ss)];
125 char *bytes;
126 UCharsetDetector *csd = ucsdet_open(&status);
127 const UCharsetMatch *match;
128 UChar detected[sizeof(ss)];
129
130 sLength = u_unescape(ss, s, sizeof(ss));
131 bytes = extractBytes(s, sLength, "UTF-8", &byteLength);
132
133 ucsdet_setText(csd, bytes, byteLength, &status);
134 if (U_FAILURE(status)) {
135 log_err("status is %s\n", u_errorName(status));
136 goto bail;
137 }
138
139 match = ucsdet_detect(csd, &status);
140
141 if (match == NULL) {
142 log_err("Detection failure for UTF-8: got no matches.\n");
143 goto bail;
144 }
145
146 dLength = ucsdet_getUChars(match, detected, sLength, &status);
147
148 if (u_strCompare(detected, dLength, s, sLength, FALSE) != 0) {
149 log_err("Round-trip test failed!\n");
150 }
151
152 ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
153
154 bail:
155 freeBytes(bytes);
156 ucsdet_close(csd);
157 }
158
159 static void TestUTF16(void)
160 {
161 UErrorCode status = U_ZERO_ERROR;
162 /* Notice the BOM on the start of this string */
163 static const UChar chars[] = {
164 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
165 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
166 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
167 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
168 0x064a, 0x062a, 0x0000};
169 int32_t beLength = 0, leLength = 0, cLength = ARRAY_SIZE(chars);
170 char *beBytes = extractBytes(chars, cLength, "UTF-16BE", &beLength);
171 char *leBytes = extractBytes(chars, cLength, "UTF-16LE", &leLength);
172 UCharsetDetector *csd = ucsdet_open(&status);
173 const UCharsetMatch *match;
174 const char *name;
175 int32_t conf;
176
177 ucsdet_setText(csd, beBytes, beLength, &status);
178 match = ucsdet_detect(csd, &status);
179
180 if (match == NULL) {
181 log_err("Encoding detection failure for UTF-16BE: got no matches.\n");
182 goto try_le;
183 }
184
185 name = ucsdet_getName(match, &status);
186 conf = ucsdet_getConfidence(match, &status);
187
188 if (strcmp(name, "UTF-16BE") != 0) {
189 log_err("Encoding detection failure for UTF-16BE: got %s\n", name);
190 }
191
192 if (conf != 100) {
193 log_err("Did not get 100%% confidence for UTF-16BE: got %d\n", conf);
194 }
195
196 try_le:
197 ucsdet_setText(csd, leBytes, leLength, &status);
198 match = ucsdet_detect(csd, &status);
199
200 if (match == NULL) {
201 log_err("Encoding detection failure for UTF-16LE: got no matches.\n");
202 goto bail;
203 }
204
205 name = ucsdet_getName(match, &status);
206 conf = ucsdet_getConfidence(match, &status);
207
208
209 if (strcmp(name, "UTF-16LE") != 0) {
210 log_err("Enconding detection failure for UTF-16LE: got %s\n", name);
211 }
212
213 if (conf != 100) {
214 log_err("Did not get 100%% confidence for UTF-16LE: got %d\n", conf);
215 }
216
217 bail:
218 freeBytes(leBytes);
219 freeBytes(beBytes);
220 ucsdet_close(csd);
221 }
222
223 static void TestC1Bytes(void)
224 {
225 #if !UCONFIG_NO_LEGACY_CONVERSION
226 UErrorCode status = U_ZERO_ERROR;
227 static const char ssISO[] = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
228 static const char ssWindows[] = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.";
229 int32_t sISOLength = 0, sWindowsLength = 0;
230 UChar sISO[sizeof(ssISO)];
231 UChar sWindows[sizeof(ssWindows)];
232 int32_t lISO = 0, lWindows = 0;
233 char *bISO;
234 char *bWindows;
235 UCharsetDetector *csd = ucsdet_open(&status);
236 const UCharsetMatch *match;
237 const char *name;
238
239 sISOLength = u_unescape(ssISO, sISO, sizeof(ssISO));
240 sWindowsLength = u_unescape(ssWindows, sWindows, sizeof(ssWindows));
241 bISO = extractBytes(sISO, sISOLength, "ISO-8859-1", &lISO);
242 bWindows = extractBytes(sWindows, sWindowsLength, "windows-1252", &lWindows);
243
244 ucsdet_setText(csd, bWindows, lWindows, &status);
245 match = ucsdet_detect(csd, &status);
246
247 if (match == NULL) {
248 log_err("English test with C1 bytes got no matches.\n");
249 goto bail;
250 }
251
252 name = ucsdet_getName(match, &status);
253
254 if (strcmp(name, "windows-1252") != 0) {
255 log_data_err("English text with C1 bytes does not detect as windows-1252, but as %s. (Are you missing data?)\n", name);
256 }
257
258 ucsdet_setText(csd, bISO, lISO, &status);
259 match = ucsdet_detect(csd, &status);
260
261 if (match == NULL) {
262 log_err("English text without C1 bytes got no matches.\n");
263 goto bail;
264 }
265
266 name = ucsdet_getName(match, &status);
267
268 if (strcmp(name, "ISO-8859-1") != 0) {
269 log_err("English text without C1 bytes does not detect as ISO-8859-1, but as %s\n", name);
270 }
271
272 bail:
273 freeBytes(bWindows);
274 freeBytes(bISO);
275
276 ucsdet_close(csd);
277 #endif
278 }
279
280 static void TestInputFilter(void)
281 {
282 UErrorCode status = U_ZERO_ERROR;
283 static const char ss[] = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
284 int32_t sLength = 0;
285 UChar s[sizeof(ss)];
286 int32_t byteLength = 0;
287 char *bytes;
288 UCharsetDetector *csd = ucsdet_open(&status);
289 const UCharsetMatch *match;
290 const char *lang, *name;
291
292 sLength = u_unescape(ss, s, sizeof(ss));
293 bytes = extractBytes(s, sLength, "ISO-8859-1", &byteLength);
294
295 ucsdet_enableInputFilter(csd, TRUE);
296
297 if (!ucsdet_isInputFilterEnabled(csd)) {
298 log_err("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!\n");
299 }
300
301
302 ucsdet_setText(csd, bytes, byteLength, &status);
303 match = ucsdet_detect(csd, &status);
304
305 if (match == NULL) {
306 log_err("Turning on the input filter resulted in no matches.\n");
307 goto turn_off;
308 }
309
310 name = ucsdet_getName(match, &status);
311
312 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
313 log_err("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name);
314 } else {
315 lang = ucsdet_getLanguage(match, &status);
316
317 if (lang == NULL || strcmp(lang, "fr") != 0) {
318 log_err("Input filter did not strip markup!\n");
319 }
320 }
321
322 turn_off:
323 ucsdet_enableInputFilter(csd, FALSE);
324 ucsdet_setText(csd, bytes, byteLength, &status);
325 match = ucsdet_detect(csd, &status);
326
327 if (match == NULL) {
328 log_err("Turning off the input filter resulted in no matches.\n");
329 goto bail;
330 }
331
332 name = ucsdet_getName(match, &status);
333
334 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
335 log_err("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name);
336 } else {
337 lang = ucsdet_getLanguage(match, &status);
338
339 if (lang == NULL || strcmp(lang, "en") != 0) {
340 log_err("Unfiltered input did not detect as English!\n");
341 }
342 }
343
344 bail:
345 freeBytes(bytes);
346 ucsdet_close(csd);
347 }
348
349 static void TestChaining(void) {
350 UErrorCode status = U_USELESS_COLLATOR_ERROR;
351
352 ucsdet_open(&status);
353 ucsdet_setText(NULL, NULL, 0, &status);
354 ucsdet_getName(NULL, &status);
355 ucsdet_getConfidence(NULL, &status);
356 ucsdet_getLanguage(NULL, &status);
357 ucsdet_detect(NULL, &status);
358 ucsdet_setDeclaredEncoding(NULL, NULL, 0, &status);
359 ucsdet_detectAll(NULL, NULL, &status);
360 ucsdet_getUChars(NULL, NULL, 0, &status);
361 ucsdet_getUChars(NULL, NULL, 0, &status);
362 ucsdet_close(NULL);
363
364 /* All of this code should have done nothing. */
365 if (status != U_USELESS_COLLATOR_ERROR) {
366 log_err("Status got changed to %s\n", u_errorName(status));
367 }
368 }
369
370 static void TestBufferOverflow(void) {
371 UErrorCode status = U_ZERO_ERROR;
372 static const char *testStrings[] = {
373 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b", /* A partial ISO-2022 shift state at the end */
374 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24", /* A partial ISO-2022 shift state at the end */
375 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28", /* A partial ISO-2022 shift state at the end */
376 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end with a bad one at the start */
377 "\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end */
378 "\xa1", /* Could be a single byte shift-jis at the end */
379 "\x74\x68\xa1", /* Could be a single byte shift-jis at the end */
380 "\x74\x68\x65\xa1" /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
381 };
382 static const char *testResults[] = {
383 "windows-1252",
384 "windows-1252",
385 "windows-1252",
386 "windows-1252",
387 "ISO-2022-JP",
388 NULL,
389 NULL,
390 "ISO-8859-1"
391 };
392 int32_t idx = 0;
393 UCharsetDetector *csd = ucsdet_open(&status);
394 const UCharsetMatch *match;
395
396 ucsdet_setDeclaredEncoding(csd, "ISO-2022-JP", -1, &status);
397
398 if (U_FAILURE(status)) {
399 log_err("Couldn't open detector. %s\n", u_errorName(status));
400 goto bail;
401 }
402
403 for (idx = 0; idx < ARRAY_SIZE(testStrings); idx++) {
404 ucsdet_setText(csd, testStrings[idx], -1, &status);
405 match = ucsdet_detect(csd, &status);
406
407 if (match == NULL) {
408 if (testResults[idx] != NULL) {
409 log_err("Unexpectedly got no results at index %d.\n", idx);
410 }
411 else {
412 log_verbose("Got no result as expected at index %d.\n", idx);
413 }
414 continue;
415 }
416
417 if (testResults[idx] == NULL || strcmp(ucsdet_getName(match, &status), testResults[idx]) != 0) {
418 log_err("Unexpectedly got %s instead of %s at index %d with confidence %d.\n",
419 ucsdet_getName(match, &status), testResults[idx], idx, ucsdet_getConfidence(match, &status));
420 goto bail;
421 }
422 }
423
424 bail:
425 ucsdet_close(csd);
426 }
427
428 static void TestIBM424(void)
429 {
430 UErrorCode status = U_ZERO_ERROR;
431
432 static const UChar chars[] = {
433 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
434 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
435 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
436 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
437 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
438 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
439 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
440 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
441 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
442 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
443 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
444 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
445 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
446 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
447 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
448 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
449 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
450 };
451
452 static const UChar chars_reverse[] = {
453 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
454 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
455 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
456 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
457 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
458 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
459 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
460 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
461 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
462 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
463 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
464 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
465 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
466 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
467 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
468 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
469 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
470 0x0000
471 };
472
473 int32_t bLength = 0, brLength = 0, cLength = ARRAY_SIZE(chars), crLength = ARRAY_SIZE(chars_reverse);
474
475 char *bytes = extractBytes(chars, cLength, "IBM424", &bLength);
476 char *bytes_r = extractBytes(chars_reverse, crLength, "IBM424", &brLength);
477
478 UCharsetDetector *csd = ucsdet_open(&status);
479 const UCharsetMatch *match;
480 const char *name;
481
482 ucsdet_setText(csd, bytes, bLength, &status);
483 match = ucsdet_detect(csd, &status);
484
485 if (match == NULL) {
486 log_err("Encoding detection failure for IBM424_rtl: got no matches.\n");
487 goto bail;
488 }
489
490 name = ucsdet_getName(match, &status);
491 if (strcmp(name, "IBM424_rtl") != 0) {
492 log_data_err("Encoding detection failure for IBM424_rtl: got %s. (Are you missing data?)\n", name);
493 }
494
495 ucsdet_setText(csd, bytes_r, brLength, &status);
496 match = ucsdet_detect(csd, &status);
497
498 if (match == NULL) {
499 log_err("Encoding detection failure for IBM424_ltr: got no matches.\n");
500 goto bail;
501 }
502
503 name = ucsdet_getName(match, &status);
504 if (strcmp(name, "IBM424_ltr") != 0) {
505 log_data_err("Encoding detection failure for IBM424_ltr: got %s. (Are you missing data?)\n", name);
506 }
507
508 bail:
509 freeBytes(bytes);
510 freeBytes(bytes_r);
511 ucsdet_close(csd);
512 }
513
514 static void TestIBM420(void)
515 {
516 UErrorCode status = U_ZERO_ERROR;
517
518 static const UChar chars[] = {
519 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
520 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
521 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
522 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
523 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
524 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
525 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
526 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
527 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
528 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
529 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
530 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
531 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
532 0x0000
533 };
534 static const UChar chars_reverse[] = {
535 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
536 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
537 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
538 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
539 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
540 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
541 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
542 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
543 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
544 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
545 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
546 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
547 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
548 0x0000,
549 };
550
551 int32_t bLength = 0, brLength = 0, cLength = ARRAY_SIZE(chars), crLength = ARRAY_SIZE(chars_reverse);
552
553 char *bytes = extractBytes(chars, cLength, "IBM420", &bLength);
554 char *bytes_r = extractBytes(chars_reverse, crLength, "IBM420", &brLength);
555
556 UCharsetDetector *csd = ucsdet_open(&status);
557 const UCharsetMatch *match;
558 const char *name;
559
560 ucsdet_setText(csd, bytes, bLength, &status);
561 match = ucsdet_detect(csd, &status);
562
563 if (match == NULL) {
564 log_err("Encoding detection failure for IBM420_rtl: got no matches.\n");
565 goto bail;
566 }
567
568 name = ucsdet_getName(match, &status);
569 if (strcmp(name, "IBM420_rtl") != 0) {
570 log_data_err("Encoding detection failure for IBM420_rtl: got %s. (Are you missing data?)\n", name);
571 }
572
573 ucsdet_setText(csd, bytes_r, brLength, &status);
574 match = ucsdet_detect(csd, &status);
575
576 if (match == NULL) {
577 log_err("Encoding detection failure for IBM420_ltr: got no matches.\n");
578 goto bail;
579 }
580
581 name = ucsdet_getName(match, &status);
582 if (strcmp(name, "IBM420_ltr") != 0) {
583 log_data_err("Encoding detection failure for IBM420_ltr: got %s. (Are you missing data?)\n", name);
584 }
585
586 bail:
587 freeBytes(bytes);
588 freeBytes(bytes_r);
589 ucsdet_close(csd);
590 }