]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/cintltst/ucsdetst.c
ICU-57166.0.1.tar.gz
[apple/icu.git] / icuSources / test / cintltst / ucsdetst.c
1 /*
2 ****************************************************************************
3 * Copyright (c) 2005-2016, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 ****************************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #include "unicode/ucsdet.h"
11 #include "unicode/ucnv.h"
12 #include "unicode/ustring.h"
13
14 #include "cintltst.h"
15 #include "cmemory.h"
16
17 #include <stdlib.h>
18 #include <string.h>
19
/*
 * Allocation helpers for the byte buffers used by these tests.
 * NEW_ARRAY yields an uninitialized array of `count` elements of `type`
 * (NULL when malloc fails); DELETE_ARRAY releases such an array.
 * Both expansions are fully parenthesized so they can be used safely inside
 * larger expressions.
 */
#define NEW_ARRAY(type,count) ((type *) malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) free((array))
22
23 static void TestConstruction(void);
24 static void TestUTF8(void);
25 static void TestUTF16(void);
26 static void TestC1Bytes(void);
27 static void TestInputFilter(void);
28 static void TestChaining(void);
29 static void TestBufferOverflow(void);
30 static void TestIBM424(void);
31 static void TestIBM420(void);
32
33 void addUCsdetTest(TestNode** root);
34
35 void addUCsdetTest(TestNode** root)
36 {
37 addTest(root, &TestConstruction, "ucsdetst/TestConstruction");
38 addTest(root, &TestUTF8, "ucsdetst/TestUTF8");
39 addTest(root, &TestUTF16, "ucsdetst/TestUTF16");
40 addTest(root, &TestC1Bytes, "ucsdetst/TestC1Bytes");
41 addTest(root, &TestInputFilter, "ucsdetst/TestInputFilter");
42 addTest(root, &TestChaining, "ucsdetst/TestErrorChaining");
43 addTest(root, &TestBufferOverflow, "ucsdetst/TestBufferOverflow");
44 #if !UCONFIG_NO_LEGACY_CONVERSION
45 addTest(root, &TestIBM424, "ucsdetst/TestIBM424");
46 addTest(root, &TestIBM420, "ucsdetst/TestIBM420");
47 #endif
48 }
49
50 static int32_t preflight(const UChar *src, int32_t length, UConverter *cnv)
51 {
52 UErrorCode status;
53 char buffer[1024];
54 char *dest, *destLimit = buffer + sizeof(buffer);
55 const UChar *srcLimit = src + length;
56 int32_t result = 0;
57
58 do {
59 dest = buffer;
60 status = U_ZERO_ERROR;
61 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
62 result += (int32_t) (dest - buffer);
63 } while (status == U_BUFFER_OVERFLOW_ERROR);
64
65 return result;
66 }
67
68 static char *extractBytes(const UChar *src, int32_t length, const char *codepage, int32_t *byteLength)
69 {
70 UErrorCode status = U_ZERO_ERROR;
71 UConverter *cnv = ucnv_open(codepage, &status);
72 int32_t byteCount = preflight(src, length, cnv);
73 const UChar *srcLimit = src + length;
74 char *bytes = NEW_ARRAY(char, byteCount + 1);
75 char *dest = bytes, *destLimit = bytes + byteCount + 1;
76
77 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
78 ucnv_close(cnv);
79
80 *byteLength = byteCount;
81 return bytes;
82 }
83
/*
 * Releases a buffer obtained from extractBytes(). A NULL pointer is accepted
 * and ignored, as with free().
 */
static void freeBytes(char *bytes)
{
    free(bytes);
}
88
89 static void TestConstruction(void)
90 {
91 UErrorCode status = U_ZERO_ERROR;
92 UCharsetDetector *csd = ucsdet_open(&status);
93 UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
94 const char *name;
95 int32_t count = uenum_count(e, &status);
96 int32_t i, length;
97
98 for(i = 0; i < count; i += 1) {
99 name = uenum_next(e, &length, &status);
100
101 if(name == NULL || length <= 0) {
102 log_err("ucsdet_getAllDetectableCharsets() returned a null or empty name!\n");
103 }
104 }
105 /* one past the list of all names must return NULL */
106 name = uenum_next(e, &length, &status);
107 if(name != NULL || length != 0 || U_FAILURE(status)) {
108 log_err("ucsdet_getAllDetectableCharsets(past the list) returned a non-null name!\n");
109 }
110
111 uenum_close(e);
112 ucsdet_close(csd);
113 }
114
115 static void TestUTF8(void)
116 {
117 UErrorCode status = U_ZERO_ERROR;
118 static const char ss[] = "This is a string with some non-ascii characters that will "
119 "be converted to UTF-8, then shoved through the detection process. "
120 "\\u0391\\u0392\\u0393\\u0394\\u0395"
121 "Sure would be nice if our source could contain Unicode directly!";
122 int32_t byteLength = 0, sLength = 0, dLength = 0;
123 UChar s[sizeof(ss)];
124 char *bytes;
125 UCharsetDetector *csd = ucsdet_open(&status);
126 const UCharsetMatch *match;
127 UChar detected[sizeof(ss)];
128
129 sLength = u_unescape(ss, s, sizeof(ss));
130 bytes = extractBytes(s, sLength, "UTF-8", &byteLength);
131
132 ucsdet_setText(csd, bytes, byteLength, &status);
133 if (U_FAILURE(status)) {
134 log_err("status is %s\n", u_errorName(status));
135 goto bail;
136 }
137
138 match = ucsdet_detect(csd, &status);
139
140 if (match == NULL) {
141 log_err("Detection failure for UTF-8: got no matches.\n");
142 goto bail;
143 }
144
145 dLength = ucsdet_getUChars(match, detected, sLength, &status);
146
147 if (u_strCompare(detected, dLength, s, sLength, FALSE) != 0) {
148 log_err("Round-trip test failed!\n");
149 }
150
151 ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
152
153 bail:
154 freeBytes(bytes);
155 ucsdet_close(csd);
156 }
157
158 static void TestUTF16(void)
159 {
160 UErrorCode status = U_ZERO_ERROR;
161 /* Notice the BOM on the start of this string */
162 static const UChar chars[] = {
163 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
164 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
165 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
166 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
167 0x064a, 0x062a, 0x0000};
168 int32_t beLength = 0, leLength = 0, cLength = UPRV_LENGTHOF(chars);
169 char *beBytes = extractBytes(chars, cLength, "UTF-16BE", &beLength);
170 char *leBytes = extractBytes(chars, cLength, "UTF-16LE", &leLength);
171 UCharsetDetector *csd = ucsdet_open(&status);
172 const UCharsetMatch *match;
173 const char *name;
174 int32_t conf;
175
176 ucsdet_setText(csd, beBytes, beLength, &status);
177 match = ucsdet_detect(csd, &status);
178
179 if (match == NULL) {
180 log_err("Encoding detection failure for UTF-16BE: got no matches.\n");
181 goto try_le;
182 }
183
184 name = ucsdet_getName(match, &status);
185 conf = ucsdet_getConfidence(match, &status);
186
187 if (strcmp(name, "UTF-16BE") != 0) {
188 log_err("Encoding detection failure for UTF-16BE: got %s\n", name);
189 }
190
191 if (conf != 100) {
192 log_err("Did not get 100%% confidence for UTF-16BE: got %d\n", conf);
193 }
194
195 try_le:
196 ucsdet_setText(csd, leBytes, leLength, &status);
197 match = ucsdet_detect(csd, &status);
198
199 if (match == NULL) {
200 log_err("Encoding detection failure for UTF-16LE: got no matches.\n");
201 goto bail;
202 }
203
204 name = ucsdet_getName(match, &status);
205 conf = ucsdet_getConfidence(match, &status);
206
207
208 if (strcmp(name, "UTF-16LE") != 0) {
209 log_err("Enconding detection failure for UTF-16LE: got %s\n", name);
210 }
211
212 if (conf != 100) {
213 log_err("Did not get 100%% confidence for UTF-16LE: got %d\n", conf);
214 }
215
216 bail:
217 freeBytes(leBytes);
218 freeBytes(beBytes);
219 ucsdet_close(csd);
220 }
221
222 static void TestC1Bytes(void)
223 {
224 #if !UCONFIG_NO_LEGACY_CONVERSION
225 UErrorCode status = U_ZERO_ERROR;
226 static const char ssISO[] = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
227 static const char ssWindows[] = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.";
228 int32_t sISOLength = 0, sWindowsLength = 0;
229 UChar sISO[sizeof(ssISO)];
230 UChar sWindows[sizeof(ssWindows)];
231 int32_t lISO = 0, lWindows = 0;
232 char *bISO;
233 char *bWindows;
234 UCharsetDetector *csd = ucsdet_open(&status);
235 const UCharsetMatch *match;
236 const char *name;
237
238 sISOLength = u_unescape(ssISO, sISO, sizeof(ssISO));
239 sWindowsLength = u_unescape(ssWindows, sWindows, sizeof(ssWindows));
240 bISO = extractBytes(sISO, sISOLength, "ISO-8859-1", &lISO);
241 bWindows = extractBytes(sWindows, sWindowsLength, "windows-1252", &lWindows);
242
243 ucsdet_setText(csd, bWindows, lWindows, &status);
244 match = ucsdet_detect(csd, &status);
245
246 if (match == NULL) {
247 log_err("English test with C1 bytes got no matches.\n");
248 goto bail;
249 }
250
251 name = ucsdet_getName(match, &status);
252
253 if (strcmp(name, "windows-1252") != 0) {
254 log_data_err("English text with C1 bytes does not detect as windows-1252, but as %s. (Are you missing data?)\n", name);
255 }
256
257 ucsdet_setText(csd, bISO, lISO, &status);
258 match = ucsdet_detect(csd, &status);
259
260 if (match == NULL) {
261 log_err("English text without C1 bytes got no matches.\n");
262 goto bail;
263 }
264
265 name = ucsdet_getName(match, &status);
266
267 if (strcmp(name, "ISO-8859-1") != 0) {
268 log_err("English text without C1 bytes does not detect as ISO-8859-1, but as %s\n", name);
269 }
270
271 bail:
272 freeBytes(bWindows);
273 freeBytes(bISO);
274
275 ucsdet_close(csd);
276 #endif
277 }
278
279 static void TestInputFilter(void)
280 {
281 UErrorCode status = U_ZERO_ERROR;
282 static const char ss[] = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
283 int32_t sLength = 0;
284 UChar s[sizeof(ss)];
285 int32_t byteLength = 0;
286 char *bytes;
287 UCharsetDetector *csd = ucsdet_open(&status);
288 const UCharsetMatch *match;
289 const char *lang, *name;
290
291 sLength = u_unescape(ss, s, sizeof(ss));
292 bytes = extractBytes(s, sLength, "ISO-8859-1", &byteLength);
293
294 ucsdet_enableInputFilter(csd, TRUE);
295
296 if (!ucsdet_isInputFilterEnabled(csd)) {
297 log_err("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!\n");
298 }
299
300
301 ucsdet_setText(csd, bytes, byteLength, &status);
302 match = ucsdet_detect(csd, &status);
303
304 if (match == NULL) {
305 log_err("Turning on the input filter resulted in no matches.\n");
306 goto turn_off;
307 }
308
309 name = ucsdet_getName(match, &status);
310
311 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
312 log_err("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name);
313 } else {
314 lang = ucsdet_getLanguage(match, &status);
315
316 if (lang == NULL || strcmp(lang, "fr") != 0) {
317 log_err("Input filter did not strip markup!\n");
318 }
319 }
320
321 turn_off:
322 ucsdet_enableInputFilter(csd, FALSE);
323 ucsdet_setText(csd, bytes, byteLength, &status);
324 match = ucsdet_detect(csd, &status);
325
326 if (match == NULL) {
327 log_err("Turning off the input filter resulted in no matches.\n");
328 goto bail;
329 }
330
331 name = ucsdet_getName(match, &status);
332
333 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
334 log_err("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name);
335 } else {
336 lang = ucsdet_getLanguage(match, &status);
337
338 if (lang == NULL || strcmp(lang, "en") != 0) {
339 log_err("Unfiltered input did not detect as English!\n");
340 }
341 }
342
343 bail:
344 freeBytes(bytes);
345 ucsdet_close(csd);
346 }
347
348 static void TestChaining(void) {
349 UErrorCode status = U_USELESS_COLLATOR_ERROR;
350
351 ucsdet_open(&status);
352 ucsdet_setText(NULL, NULL, 0, &status);
353 ucsdet_getName(NULL, &status);
354 ucsdet_getConfidence(NULL, &status);
355 ucsdet_getLanguage(NULL, &status);
356 ucsdet_detect(NULL, &status);
357 ucsdet_setDeclaredEncoding(NULL, NULL, 0, &status);
358 ucsdet_detectAll(NULL, NULL, &status);
359 ucsdet_getUChars(NULL, NULL, 0, &status);
360 ucsdet_getUChars(NULL, NULL, 0, &status);
361 ucsdet_close(NULL);
362
363 /* All of this code should have done nothing. */
364 if (status != U_USELESS_COLLATOR_ERROR) {
365 log_err("Status got changed to %s\n", u_errorName(status));
366 }
367 }
368
369 static void TestBufferOverflow(void) {
370 UErrorCode status = U_ZERO_ERROR;
371 static const char *testStrings[] = {
372 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b", /* A partial ISO-2022 shift state at the end */
373 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24", /* A partial ISO-2022 shift state at the end */
374 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28", /* A partial ISO-2022 shift state at the end */
375 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end with a bad one at the start */
376 "\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end */
377 "\xa1", /* Could be a single byte shift-jis at the end */
378 "\x74\x68\xa1", /* Could be a single byte shift-jis at the end */
379 "\x74\x68\x65\xa1" /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
380 };
381 static const char *testResults[] = {
382 "windows-1252",
383 "windows-1252",
384 "windows-1252",
385 "windows-1252",
386 "ISO-2022-JP",
387 NULL,
388 NULL,
389 "ISO-8859-1"
390 };
391 int32_t idx = 0;
392 UCharsetDetector *csd = ucsdet_open(&status);
393 const UCharsetMatch *match;
394
395 ucsdet_setDeclaredEncoding(csd, "ISO-2022-JP", -1, &status);
396
397 if (U_FAILURE(status)) {
398 log_err("Couldn't open detector. %s\n", u_errorName(status));
399 goto bail;
400 }
401
402 for (idx = 0; idx < UPRV_LENGTHOF(testStrings); idx++) {
403 ucsdet_setText(csd, testStrings[idx], -1, &status);
404 match = ucsdet_detect(csd, &status);
405
406 if (match == NULL) {
407 if (testResults[idx] != NULL) {
408 log_err("Unexpectedly got no results at index %d.\n", idx);
409 }
410 else {
411 log_verbose("Got no result as expected at index %d.\n", idx);
412 }
413 continue;
414 }
415
416 if (testResults[idx] == NULL || strcmp(ucsdet_getName(match, &status), testResults[idx]) != 0) {
417 log_err("Unexpectedly got %s instead of %s at index %d with confidence %d.\n",
418 ucsdet_getName(match, &status), testResults[idx], idx, ucsdet_getConfidence(match, &status));
419 goto bail;
420 }
421 }
422
423 bail:
424 ucsdet_close(csd);
425 }
426
427 static void TestIBM424(void)
428 {
429 UErrorCode status = U_ZERO_ERROR;
430
431 static const UChar chars[] = {
432 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
433 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
434 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
435 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
436 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
437 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
438 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
439 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
440 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
441 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
442 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
443 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
444 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
445 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
446 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
447 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
448 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
449 };
450
451 static const UChar chars_reverse[] = {
452 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
453 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
454 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
455 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
456 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
457 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
458 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
459 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
460 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
461 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
462 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
463 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
464 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
465 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
466 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
467 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
468 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
469 0x0000
470 };
471
472 int32_t bLength = 0, brLength = 0, cLength = UPRV_LENGTHOF(chars), crLength = UPRV_LENGTHOF(chars_reverse);
473
474 char *bytes = extractBytes(chars, cLength, "IBM424", &bLength);
475 char *bytes_r = extractBytes(chars_reverse, crLength, "IBM424", &brLength);
476
477 UCharsetDetector *csd = ucsdet_open(&status);
478 const UCharsetMatch *match;
479 const char *name;
480
481 ucsdet_setText(csd, bytes, bLength, &status);
482 match = ucsdet_detect(csd, &status);
483
484 if (match == NULL) {
485 log_err("Encoding detection failure for IBM424_rtl: got no matches.\n");
486 goto bail;
487 }
488
489 name = ucsdet_getName(match, &status);
490 if (strcmp(name, "IBM424_rtl") != 0) {
491 log_data_err("Encoding detection failure for IBM424_rtl: got %s. (Are you missing data?)\n", name);
492 }
493
494 ucsdet_setText(csd, bytes_r, brLength, &status);
495 match = ucsdet_detect(csd, &status);
496
497 if (match == NULL) {
498 log_err("Encoding detection failure for IBM424_ltr: got no matches.\n");
499 goto bail;
500 }
501
502 name = ucsdet_getName(match, &status);
503 if (strcmp(name, "IBM424_ltr") != 0) {
504 log_data_err("Encoding detection failure for IBM424_ltr: got %s. (Are you missing data?)\n", name);
505 }
506
507 bail:
508 freeBytes(bytes);
509 freeBytes(bytes_r);
510 ucsdet_close(csd);
511 }
512
513 static void TestIBM420(void)
514 {
515 UErrorCode status = U_ZERO_ERROR;
516
517 static const UChar chars[] = {
518 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
519 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
520 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
521 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
522 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
523 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
524 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
525 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
526 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
527 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
528 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
529 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
530 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
531 0x0000
532 };
533 static const UChar chars_reverse[] = {
534 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
535 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
536 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
537 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
538 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
539 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
540 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
541 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
542 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
543 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
544 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
545 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
546 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
547 0x0000,
548 };
549
550 int32_t bLength = 0, brLength = 0, cLength = UPRV_LENGTHOF(chars), crLength = UPRV_LENGTHOF(chars_reverse);
551
552 char *bytes = extractBytes(chars, cLength, "IBM420", &bLength);
553 char *bytes_r = extractBytes(chars_reverse, crLength, "IBM420", &brLength);
554
555 UCharsetDetector *csd = ucsdet_open(&status);
556 const UCharsetMatch *match;
557 const char *name;
558
559 ucsdet_setText(csd, bytes, bLength, &status);
560 match = ucsdet_detect(csd, &status);
561
562 if (match == NULL) {
563 log_err("Encoding detection failure for IBM420_rtl: got no matches.\n");
564 goto bail;
565 }
566
567 name = ucsdet_getName(match, &status);
568 if (strcmp(name, "IBM420_rtl") != 0) {
569 log_data_err("Encoding detection failure for IBM420_rtl: got %s. (Are you missing data?)\n", name);
570 }
571
572 ucsdet_setText(csd, bytes_r, brLength, &status);
573 match = ucsdet_detect(csd, &status);
574
575 if (match == NULL) {
576 log_err("Encoding detection failure for IBM420_ltr: got no matches.\n");
577 goto bail;
578 }
579
580 name = ucsdet_getName(match, &status);
581 if (strcmp(name, "IBM420_ltr") != 0) {
582 log_data_err("Encoding detection failure for IBM420_ltr: got %s. (Are you missing data?)\n", name);
583 }
584
585 bail:
586 freeBytes(bytes);
587 freeBytes(bytes_r);
588 ucsdet_close(csd);
589 }