]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/csdetest.cpp
ICU-491.11.3.tar.gz
[apple/icu.git] / icuSources / test / intltest / csdetest.cpp
1 /*
2 **********************************************************************
3 * Copyright (C) 2005-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8
9 #include "unicode/utypes.h"
10 #include "unicode/ucsdet.h"
11 #include "unicode/ucnv.h"
12 #include "unicode/unistr.h"
13 #include "unicode/putil.h"
14 #include "unicode/uniset.h"
15
16 #include "intltest.h"
17 #include "csdetest.h"
18
19 #include "xmlparser.h"
20
21 #include <stdlib.h>
22 #include <string.h>
23
24 #ifdef DEBUG_DETECT
25 #include <stdio.h>
26 #endif
27
// Number of elements in a true array (not a pointer).
// The argument is parenthesized so that expressions such as *p can be passed.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Raw malloc()-backed storage for `count` objects of `type`; no constructors run.
// The whole expansion is parenthesized so the result can be used directly inside
// a larger expression (for example NEW_ARRAY(T, n)[0]). Release with DELETE_ARRAY.
#define NEW_ARRAY(type,count) ((type *) /*uprv_*/malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))

// Code points used as delimiters when splitting encoding lists and
// "encoding/language" pairs.
#define CH_SPACE 0x0020
#define CH_SLASH 0x002F
35
36 //---------------------------------------------------------------------------
37 //
38 // Test class boilerplate
39 //
40 //---------------------------------------------------------------------------
41 CharsetDetectionTest::CharsetDetectionTest()
42 {
43 }
44
45
46 CharsetDetectionTest::~CharsetDetectionTest()
47 {
48 }
49
50
51
52 void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
53 {
54 if (exec) logln("TestSuite CharsetDetectionTest: ");
55 switch (index) {
56 case 0: name = "ConstructionTest";
57 if (exec) ConstructionTest();
58 break;
59
60 case 1: name = "UTF8Test";
61 if (exec) UTF8Test();
62 break;
63
64 case 2: name = "UTF16Test";
65 if (exec) UTF16Test();
66 break;
67
68 case 3: name = "C1BytesTest";
69 if (exec) C1BytesTest();
70 break;
71
72 case 4: name = "InputFilterTest";
73 if (exec) InputFilterTest();
74 break;
75
76 case 5: name = "DetectionTest";
77 if (exec) DetectionTest();
78 break;
79 #if !UCONFIG_NO_LEGACY_CONVERSION
80 case 6: name = "IBM424Test";
81 if (exec) IBM424Test();
82 break;
83
84 case 7: name = "IBM420Test";
85 if (exec) IBM420Test();
86 break;
87 #else
88 case 6:
89 case 7: name = "skip"; break;
90 #endif
91 case 8: name = "Ticket6394Test";
92 if (exec) Ticket6394Test();
93 break;
94
95 default: name = "";
96 break; //needed to end loop
97 }
98 }
99
100 static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
101 {
102 int32_t offset = -1;
103
104 splits = 1;
105 while((offset = src.indexOf(ch, offset + 1)) >= 0) {
106 splits += 1;
107 }
108
109 UnicodeString *result = new UnicodeString[splits];
110
111 int32_t start = 0;
112 int32_t split = 0;
113 int32_t end;
114
115 while((end = src.indexOf(ch, start)) >= 0) {
116 src.extractBetween(start, end, result[split++]);
117 start = end + 1;
118 }
119
120 src.extractBetween(start, src.length(), result[split]);
121
122 return result;
123 }
124
125 static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length)
126 {
127 int32_t sLength = source.length();
128 char *bytes = NULL;
129
130 length = source.extract(0, sLength, NULL, codepage);
131
132 if (length > 0) {
133 bytes = NEW_ARRAY(char, length + 1);
134 source.extract(0, sLength, bytes, codepage);
135 }
136
137 return bytes;
138 }
139
140 static void freeBytes(char *bytes)
141 {
142 DELETE_ARRAY(bytes);
143 }
144
145 void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
146 {
147 int32_t splits = 0;
148 int32_t testLength = testString.length();
149 UnicodeString *eSplit = split(encoding, CH_SLASH, splits);
150 UErrorCode status = U_ZERO_ERROR;
151 int32_t cpLength = eSplit[0].length();
152 char codepage[64];
153
154 u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
155 codepage[cpLength] = '\0';
156
157 LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
158
159 int32_t byteLength = 0;
160 char *bytes = extractBytes(testString, codepage, byteLength);
161
162 if (bytes == NULL) {
163 #if !UCONFIG_NO_LEGACY_CONVERSION
164 dataerrln("Can't open a " + encoding + " converter for " + id);
165 #endif
166 return;
167 }
168
169 ucsdet_setText(csd.getAlias(), bytes, byteLength, &status);
170
171 int32_t matchCount = 0;
172 const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status);
173
174
175 UnicodeString name(ucsdet_getName(matches[0], &status));
176 UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
177 UChar *decoded = NULL;
178 int32_t dLength = 0;
179
180 if (matchCount == 0) {
181 errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
182 goto bail;
183 }
184
185 if (name.compare(eSplit[0]) != 0) {
186 errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
187
188 #ifdef DEBUG_DETECT
189 for (int32_t m = 0; m < matchCount; m += 1) {
190 const char *name = ucsdet_getName(matches[m], &status);
191 const char *lang = ucsdet_getLanguage(matches[m], &status);
192 int32_t confidence = ucsdet_getConfidence(matches[m], &status);
193
194 printf("%s (%s) %d\n", name, lang, confidence);
195 }
196 #endif
197 goto bail;
198 }
199
200 if (splits > 1 && lang.compare(eSplit[1]) != 0) {
201 errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
202 goto bail;
203 }
204
205 decoded = NEW_ARRAY(UChar, testLength);
206 dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);
207
208 if (testString.compare(decoded, dLength) != 0) {
209 errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");
210
211 #ifdef DEBUG_DETECT
212 for(int32_t i = 0; i < testLength; i += 1) {
213 if(testString[i] != decoded[i]) {
214 printf("Strings differ at byte %d\n", i);
215 break;
216 }
217 }
218 #endif
219
220 }
221
222 DELETE_ARRAY(decoded);
223
224 bail:
225 freeBytes(bytes);
226 delete[] eSplit;
227 }
228
229 const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) {
230 UErrorCode status = U_ZERO_ERROR;
231 const char *testDataDirectory = IntlTest::getSourceTestData(status);
232
233 if (U_FAILURE(status)) {
234 errln("ERROR: getPath() failed - %s", u_errorName(status));
235 return NULL;
236 }
237
238 strcpy(buffer, testDataDirectory);
239 strcat(buffer, filename);
240 return buffer;
241 }
242
243 void CharsetDetectionTest::ConstructionTest()
244 {
245 IcuTestErrorCode status(*this, "ConstructionTest");
246 LocalUCharsetDetectorPointer csd(ucsdet_open(status));
247 LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status));
248 int32_t count = uenum_count(e.getAlias(), status);
249
250 #ifdef DEBUG_DETECT
251 printf("There are %d recognizers.\n", count);
252 #endif
253
254 for(int32_t i = 0; i < count; i += 1) {
255 int32_t length;
256 const char *name = uenum_next(e.getAlias(), &length, status);
257
258 if(name == NULL || length <= 0) {
259 errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
260 }
261
262 #ifdef DEBUG_DETECT
263 printf("%s\n", name);
264 #endif
265 }
266 }
267
268 void CharsetDetectionTest::UTF8Test()
269 {
270 UErrorCode status = U_ZERO_ERROR;
271 UnicodeString ss = "This is a string with some non-ascii characters that will "
272 "be converted to UTF-8, then shoved through the detection process. "
273 "\\u0391\\u0392\\u0393\\u0394\\u0395"
274 "Sure would be nice if our source could contain Unicode directly!";
275 UnicodeString s = ss.unescape();
276 int32_t byteLength = 0, sLength = s.length();
277 char *bytes = extractBytes(s, "UTF-8", byteLength);
278 UCharsetDetector *csd = ucsdet_open(&status);
279 const UCharsetMatch *match;
280 UChar *detected = NEW_ARRAY(UChar, sLength);
281
282 ucsdet_setText(csd, bytes, byteLength, &status);
283 match = ucsdet_detect(csd, &status);
284
285 if (match == NULL) {
286 errln("Detection failure for UTF-8: got no matches.");
287 goto bail;
288 }
289
290 ucsdet_getUChars(match, detected, sLength, &status);
291
292 if (s.compare(detected, sLength) != 0) {
293 errln("Round-trip test failed!");
294 }
295
296 ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
297
298 bail:
299 DELETE_ARRAY(detected);
300 freeBytes(bytes);
301 ucsdet_close(csd);
302 }
303
304 void CharsetDetectionTest::UTF16Test()
305 {
306 UErrorCode status = U_ZERO_ERROR;
307 /* Notice the BOM on the start of this string */
308 UChar chars[] = {
309 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
310 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
311 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
312 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
313 0x064a, 0x062a, 0x0000};
314 UnicodeString s(chars);
315 int32_t beLength = 0, leLength = 0;
316 char *beBytes = extractBytes(s, "UTF-16BE", beLength);
317 char *leBytes = extractBytes(s, "UTF-16LE", leLength);
318 UCharsetDetector *csd = ucsdet_open(&status);
319 const UCharsetMatch *match;
320 const char *name;
321 int32_t conf;
322
323 ucsdet_setText(csd, beBytes, beLength, &status);
324 match = ucsdet_detect(csd, &status);
325
326 if (match == NULL) {
327 errln("Encoding detection failure for UTF-16BE: got no matches.");
328 goto try_le;
329 }
330
331 name = ucsdet_getName(match, &status);
332 conf = ucsdet_getConfidence(match, &status);
333
334 if (strcmp(name, "UTF-16BE") != 0) {
335 errln("Encoding detection failure for UTF-16BE: got %s", name);
336 goto try_le; // no point in looking at confidence if we got the wrong character set.
337 }
338
339 if (conf != 100) {
340 errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
341 }
342
343 try_le:
344 ucsdet_setText(csd, leBytes, leLength, &status);
345 match = ucsdet_detect(csd, &status);
346
347 if (match == NULL) {
348 errln("Encoding detection failure for UTF-16LE: got no matches.");
349 goto bail;
350 }
351
352 name = ucsdet_getName(match, &status);
353 conf = ucsdet_getConfidence(match, &status);
354
355
356 if (strcmp(name, "UTF-16LE") != 0) {
357 errln("Enconding detection failure for UTF-16LE: got %s", name);
358 goto bail; // no point in looking at confidence if we got the wrong character set.
359 }
360
361 if (conf != 100) {
362 errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
363 }
364
365 bail:
366 freeBytes(leBytes);
367 freeBytes(beBytes);
368 ucsdet_close(csd);
369 }
370
371 void CharsetDetectionTest::InputFilterTest()
372 {
373 UErrorCode status = U_ZERO_ERROR;
374 UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
375 UnicodeString s = ss.unescape();
376 int32_t byteLength = 0;
377 char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
378 UCharsetDetector *csd = ucsdet_open(&status);
379 const UCharsetMatch *match;
380 const char *lang, *name;
381
382 ucsdet_enableInputFilter(csd, TRUE);
383
384 if (!ucsdet_isInputFilterEnabled(csd)) {
385 errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
386 }
387
388
389 ucsdet_setText(csd, bytes, byteLength, &status);
390 match = ucsdet_detect(csd, &status);
391
392 if (match == NULL) {
393 errln("Turning on the input filter resulted in no matches.");
394 goto turn_off;
395 }
396
397 name = ucsdet_getName(match, &status);
398
399 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
400 errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
401 } else {
402 lang = ucsdet_getLanguage(match, &status);
403
404 if (lang == NULL || strcmp(lang, "fr") != 0) {
405 errln("Input filter did not strip markup!");
406 }
407 }
408
409 turn_off:
410 ucsdet_enableInputFilter(csd, FALSE);
411 ucsdet_setText(csd, bytes, byteLength, &status);
412 match = ucsdet_detect(csd, &status);
413
414 if (match == NULL) {
415 errln("Turning off the input filter resulted in no matches.");
416 goto bail;
417 }
418
419 name = ucsdet_getName(match, &status);
420
421 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
422 errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
423 } else {
424 lang = ucsdet_getLanguage(match, &status);
425
426 if (lang == NULL || strcmp(lang, "en") != 0) {
427 errln("Unfiltered input did not detect as English!");
428 }
429 }
430
431 bail:
432 freeBytes(bytes);
433 ucsdet_close(csd);
434 }
435
436 void CharsetDetectionTest::C1BytesTest()
437 {
438 #if !UCONFIG_NO_LEGACY_CONVERSION
439 UErrorCode status = U_ZERO_ERROR;
440 UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
441 UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
442 UnicodeString sWindows = ssWindows.unescape();
443 int32_t lISO = 0, lWindows = 0;
444 char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
445 char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
446 UCharsetDetector *csd = ucsdet_open(&status);
447 const UCharsetMatch *match;
448 const char *name;
449
450 ucsdet_setText(csd, bWindows, lWindows, &status);
451 match = ucsdet_detect(csd, &status);
452
453 if (match == NULL) {
454 errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status));
455 goto bail;
456 }
457
458 name = ucsdet_getName(match, &status);
459
460 if (strcmp(name, "windows-1252") != 0) {
461 errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
462 }
463
464 ucsdet_setText(csd, bISO, lISO, &status);
465 match = ucsdet_detect(csd, &status);
466
467 if (match == NULL) {
468 errln("English text without C1 bytes got no matches.");
469 goto bail;
470 }
471
472 name = ucsdet_getName(match, &status);
473
474 if (strcmp(name, "ISO-8859-1") != 0) {
475 errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
476 }
477
478 bail:
479 freeBytes(bWindows);
480 freeBytes(bISO);
481
482 ucsdet_close(csd);
483 #endif
484 }
485
486 void CharsetDetectionTest::DetectionTest()
487 {
488 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
489 UErrorCode status = U_ZERO_ERROR;
490 char path[2048];
491 const char *testFilePath = getPath(path, "csdetest.xml");
492
493 if (testFilePath == NULL) {
494 return; /* Couldn't get path: error message already output. */
495 }
496
497 UXMLParser *parser = UXMLParser::createParser(status);
498 if (U_FAILURE(status)) {
499 dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status));
500 return;
501 }
502
503 UXMLElement *root = parser->parseFile(testFilePath, status);
504 if (!assertSuccess( "parseFile",status)) return;
505
506 UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
507 UnicodeString id_attr = UNICODE_STRING_SIMPLE("id");
508 UnicodeString enc_attr = UNICODE_STRING_SIMPLE("encodings");
509
510 const UXMLElement *testCase;
511 int32_t tc = 0;
512
513 while((testCase = root->nextChildElement(tc)) != NULL) {
514 if (testCase->getTagName().compare(test_case) == 0) {
515 const UnicodeString *id = testCase->getAttribute(id_attr);
516 const UnicodeString *encodings = testCase->getAttribute(enc_attr);
517 const UnicodeString text = testCase->getText(TRUE);
518 int32_t encodingCount;
519 UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount);
520
521 for(int32_t e = 0; e < encodingCount; e += 1) {
522 checkEncoding(text, encodingList[e], *id);
523 }
524
525 delete[] encodingList;
526 }
527 }
528
529 delete root;
530 delete parser;
531 #endif
532 }
533
534 void CharsetDetectionTest::IBM424Test()
535 {
536 UErrorCode status = U_ZERO_ERROR;
537
538 static const UChar chars[] = {
539 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
540 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
541 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
542 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
543 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
544 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
545 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
546 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
547 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
548 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
549 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
550 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
551 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
552 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
553 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
554 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
555 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
556 };
557
558 static const UChar chars_reverse[] = {
559 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
560 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
561 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
562 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
563 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
564 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
565 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
566 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
567 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
568 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
569 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
570 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
571 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
572 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
573 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
574 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
575 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
576 0x0000
577 };
578
579 int32_t bLength = 0, brLength = 0;
580
581 UnicodeString s1(chars);
582 UnicodeString s2(chars_reverse);
583
584 char *bytes = extractBytes(s1, "IBM424", bLength);
585 char *bytes_r = extractBytes(s2, "IBM424", brLength);
586
587 UCharsetDetector *csd = ucsdet_open(&status);
588 if (U_FAILURE(status)) {
589 errln("Error opening charset detector. - %s", u_errorName(status));
590 }
591 const UCharsetMatch *match;
592 const char *name;
593
594 ucsdet_setText(csd, bytes, bLength, &status);
595 match = ucsdet_detect(csd, &status);
596
597 if (match == NULL) {
598 errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status));
599 goto bail;
600 }
601
602 name = ucsdet_getName(match, &status);
603 if (strcmp(name, "IBM424_rtl") != 0) {
604 errln("Encoding detection failure for IBM424_rtl: got %s", name);
605 }
606
607 ucsdet_setText(csd, bytes_r, brLength, &status);
608 match = ucsdet_detect(csd, &status);
609
610 if (match == NULL) {
611 errln("Encoding detection failure for IBM424_ltr: got no matches.");
612 goto bail;
613 }
614
615 name = ucsdet_getName(match, &status);
616 if (strcmp(name, "IBM424_ltr") != 0) {
617 errln("Encoding detection failure for IBM424_ltr: got %s", name);
618 }
619
620 bail:
621 freeBytes(bytes);
622 freeBytes(bytes_r);
623 ucsdet_close(csd);
624 }
625
626 void CharsetDetectionTest::IBM420Test()
627 {
628 UErrorCode status = U_ZERO_ERROR;
629
630 static const UChar chars[] = {
631 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
632 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
633 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
634 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
635 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
636 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
637 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
638 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
639 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
640 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
641 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
642 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
643 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
644 0x0000
645 };
646 static const UChar chars_reverse[] = {
647 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
648 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
649 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
650 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
651 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
652 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
653 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
654 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
655 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
656 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
657 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
658 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
659 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
660 0x0000,
661 };
662
663 int32_t bLength = 0, brLength = 0;
664
665 UnicodeString s1(chars);
666 UnicodeString s2(chars_reverse);
667
668 char *bytes = extractBytes(s1, "IBM420", bLength);
669 char *bytes_r = extractBytes(s2, "IBM420", brLength);
670
671 UCharsetDetector *csd = ucsdet_open(&status);
672 if (U_FAILURE(status)) {
673 errln("Error opening charset detector. - %s", u_errorName(status));
674 }
675 const UCharsetMatch *match;
676 const char *name;
677
678 ucsdet_setText(csd, bytes, bLength, &status);
679 match = ucsdet_detect(csd, &status);
680
681 if (match == NULL) {
682 errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status));
683 goto bail;
684 }
685
686 name = ucsdet_getName(match, &status);
687 if (strcmp(name, "IBM420_rtl") != 0) {
688 errln("Encoding detection failure for IBM420_rtl: got %s\n", name);
689 }
690
691 ucsdet_setText(csd, bytes_r, brLength, &status);
692 match = ucsdet_detect(csd, &status);
693
694 if (match == NULL) {
695 errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
696 goto bail;
697 }
698
699 name = ucsdet_getName(match, &status);
700 if (strcmp(name, "IBM420_ltr") != 0) {
701 errln("Encoding detection failure for IBM420_ltr: got %s\n", name);
702 }
703
704 bail:
705 freeBytes(bytes);
706 freeBytes(bytes_r);
707 ucsdet_close(csd);
708 }
709
710
711 void CharsetDetectionTest::Ticket6394Test() {
712 #if !UCONFIG_NO_CONVERSION
713 const char charText[] = "Here is some random English text that should be detected as ISO-8859-1."
714 "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
715 "encodings more than once. The hop through UnicodeString is for platforms "
716 "where this char * string is be EBCDIC and needs conversion to Latin1.";
717 char latin1Text[sizeof(charText)];
718 UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");
719
720 UErrorCode status = U_ZERO_ERROR;
721 UCharsetDetector *csd = ucsdet_open(&status);
722 ucsdet_setText(csd, latin1Text, -1, &status);
723 if (U_FAILURE(status)) {
724 errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_errorName(status));
725 return;
726 }
727
728 int32_t matchCount = 0;
729 const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
730 if (U_FAILURE(status)) {
731 errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_errorName(status));
732 return;
733 }
734
735 UnicodeSet setOfCharsetNames; // UnicodSets can hold strings.
736 int32_t i;
737 for (i=0; i<matchCount; i++) {
738 UnicodeString charSetName(ucsdet_getName(matches[i], &status));
739 if (U_FAILURE(status)) {
740 errln("Fail at file %s, line %d. status = %s; i=%d", __FILE__, __LINE__, u_errorName(status), i);
741 status = U_ZERO_ERROR;
742 }
743 if (setOfCharsetNames.contains(charSetName)) {
744 errln("Fail at file %s, line %d ", __FILE__, __LINE__);
745 errln(UnicodeString(" Duplicate charset name = ") + charSetName);
746 }
747 setOfCharsetNames.add(charSetName);
748 }
749 ucsdet_close(csd);
750 #endif
751 }
752