]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/csdetest.cpp
ICU-57165.0.1.tar.gz
[apple/icu.git] / icuSources / test / intltest / csdetest.cpp
1 /*
2 **********************************************************************
3 * Copyright (C) 2005-2016, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8
9 #include "unicode/utypes.h"
10 #include "unicode/ucsdet.h"
11 #include "unicode/ucnv.h"
12 #include "unicode/unistr.h"
13 #include "unicode/putil.h"
14 #include "unicode/uniset.h"
15
16 #include "intltest.h"
17 #include "csdetest.h"
18
19 #include "xmlparser.h"
20
21 #include <stdlib.h>
22 #include <string.h>
23
24 #ifdef DEBUG_DETECT
25 #include <stdio.h>
26 #endif
27
// Raw array allocation through the C runtime. Memory obtained with NEW_ARRAY
// is released with DELETE_ARRAY. The whole expansion is parenthesised so the
// cast cannot bind unexpectedly inside a larger expression.
#define NEW_ARRAY(type,count) ((type *) /*uprv_*/malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))

// UTF-16 code units for ' ' and '/', used as list separators by split() callers.
#define CH_SPACE 0x0020
#define CH_SLASH 0x002F

// Reports a failure (file and line) when x is false; execution continues.
// Wrapped in do/while(0) so that "if (c) TEST_ASSERT(x); else ..." is well-formed.
#define TEST_ASSERT(x) do { if (!(x)) { \
    errln("Failure in file %s, line %d", __FILE__, __LINE__); } } while (0)

// Reports a failure and RETURNS from the enclosing void function when errcode
// is a failure code. Callers must not hold manually-freed resources across it.
#define TEST_ASSERT_SUCCESS(errcode) do { if (U_FAILURE(errcode)) { \
    errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
    return; } } while (0)
40
41
42 //---------------------------------------------------------------------------
43 //
44 // Test class boilerplate
45 //
46 //---------------------------------------------------------------------------
47 CharsetDetectionTest::CharsetDetectionTest()
48 {
49 }
50
51
52 CharsetDetectionTest::~CharsetDetectionTest()
53 {
54 }
55
56
57
58 void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
59 {
60 if (exec) logln("TestSuite CharsetDetectionTest: ");
61 switch (index) {
62 case 0: name = "ConstructionTest";
63 if (exec) ConstructionTest();
64 break;
65
66 case 1: name = "UTF8Test";
67 if (exec) UTF8Test();
68 break;
69
70 case 2: name = "UTF16Test";
71 if (exec) UTF16Test();
72 break;
73
74 case 3: name = "C1BytesTest";
75 if (exec) C1BytesTest();
76 break;
77
78 case 4: name = "InputFilterTest";
79 if (exec) InputFilterTest();
80 break;
81
82 case 5: name = "DetectionTest";
83 if (exec) DetectionTest();
84 break;
85 #if !UCONFIG_NO_LEGACY_CONVERSION
86 case 6: name = "IBM424Test";
87 if (exec) IBM424Test();
88 break;
89
90 case 7: name = "IBM420Test";
91 if (exec) IBM420Test();
92 break;
93 #else
94 case 6:
95 case 7: name = "skip"; break;
96 #endif
97 case 8: name = "Ticket6394Test";
98 if (exec) Ticket6394Test();
99 break;
100
101 case 9: name = "Ticket6954Test";
102 if (exec) Ticket6954Test();
103 break;
104
105 default: name = "";
106 break; //needed to end loop
107 }
108 }
109
110 static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
111 {
112 int32_t offset = -1;
113
114 splits = 1;
115 while((offset = src.indexOf(ch, offset + 1)) >= 0) {
116 splits += 1;
117 }
118
119 UnicodeString *result = new UnicodeString[splits];
120
121 int32_t start = 0;
122 int32_t split = 0;
123 int32_t end;
124
125 while((end = src.indexOf(ch, start)) >= 0) {
126 src.extractBetween(start, end, result[split++]);
127 start = end + 1;
128 }
129
130 src.extractBetween(start, src.length(), result[split]);
131
132 return result;
133 }
134
135 static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length)
136 {
137 int32_t sLength = source.length();
138 char *bytes = NULL;
139
140 length = source.extract(0, sLength, NULL, codepage);
141
142 if (length > 0) {
143 bytes = NEW_ARRAY(char, length + 1);
144 source.extract(0, sLength, bytes, codepage);
145 }
146
147 return bytes;
148 }
149
150 static void freeBytes(char *bytes)
151 {
152 DELETE_ARRAY(bytes);
153 }
154
155 void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
156 {
157 int32_t splits = 0;
158 int32_t testLength = testString.length();
159 UnicodeString *eSplit = split(encoding, CH_SLASH, splits);
160 UErrorCode status = U_ZERO_ERROR;
161 int32_t cpLength = eSplit[0].length();
162 char codepage[64];
163
164 u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
165 codepage[cpLength] = '\0';
166
167 LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
168
169 int32_t byteLength = 0;
170 char *bytes = extractBytes(testString, codepage, byteLength);
171
172 if (bytes == NULL) {
173 #if !UCONFIG_NO_LEGACY_CONVERSION
174 dataerrln("Can't open a " + encoding + " converter for " + id);
175 #endif
176 return;
177 }
178
179 ucsdet_setText(csd.getAlias(), bytes, byteLength, &status);
180
181 int32_t matchCount = 0;
182 const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status);
183
184
185 UnicodeString name(ucsdet_getName(matches[0], &status));
186 UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
187 UChar *decoded = NULL;
188 int32_t dLength = 0;
189
190 if (matchCount == 0) {
191 errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
192 goto bail;
193 }
194
195 if (name.compare(eSplit[0]) != 0) {
196 errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
197
198 #ifdef DEBUG_DETECT
199 for (int32_t m = 0; m < matchCount; m += 1) {
200 const char *name = ucsdet_getName(matches[m], &status);
201 const char *lang = ucsdet_getLanguage(matches[m], &status);
202 int32_t confidence = ucsdet_getConfidence(matches[m], &status);
203
204 printf("%s (%s) %d\n", name, lang, confidence);
205 }
206 #endif
207 goto bail;
208 }
209
210 if (splits > 1 && lang.compare(eSplit[1]) != 0) {
211 errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
212 goto bail;
213 }
214
215 decoded = NEW_ARRAY(UChar, testLength);
216 dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);
217
218 if (testString.compare(decoded, dLength) != 0) {
219 errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");
220
221 #ifdef DEBUG_DETECT
222 for(int32_t i = 0; i < testLength; i += 1) {
223 if(testString[i] != decoded[i]) {
224 printf("Strings differ at byte %d\n", i);
225 break;
226 }
227 }
228 #endif
229
230 }
231
232 DELETE_ARRAY(decoded);
233
234 bail:
235 freeBytes(bytes);
236 delete[] eSplit;
237 }
238
239 const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) {
240 UErrorCode status = U_ZERO_ERROR;
241 const char *testDataDirectory = IntlTest::getSourceTestData(status);
242
243 if (U_FAILURE(status)) {
244 errln("ERROR: getPath() failed - %s", u_errorName(status));
245 return NULL;
246 }
247
248 strcpy(buffer, testDataDirectory);
249 strcat(buffer, filename);
250 return buffer;
251 }
252
253 void CharsetDetectionTest::ConstructionTest()
254 {
255 IcuTestErrorCode status(*this, "ConstructionTest");
256 LocalUCharsetDetectorPointer csd(ucsdet_open(status));
257 LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status));
258 int32_t count = uenum_count(e.getAlias(), status);
259
260 #ifdef DEBUG_DETECT
261 printf("There are %d recognizers.\n", count);
262 #endif
263
264 for(int32_t i = 0; i < count; i += 1) {
265 int32_t length;
266 const char *name = uenum_next(e.getAlias(), &length, status);
267
268 if(name == NULL || length <= 0) {
269 errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
270 }
271
272 #ifdef DEBUG_DETECT
273 printf("%s\n", name);
274 #endif
275 }
276
277 const char* defDisabled[] = {
278 "IBM420_rtl", "IBM420_ltr",
279 "IBM424_rtl", "IBM424_ltr",
280 0
281 };
282
283 LocalUEnumerationPointer eActive(ucsdet_getDetectableCharsets(csd.getAlias(), status));
284 const char *activeName = NULL;
285
286 while ((activeName = uenum_next(eActive.getAlias(), NULL, status))) {
287 // the charset must be included in all list
288 UBool found = FALSE;
289
290 const char *name = NULL;
291 uenum_reset(e.getAlias(), status);
292 while ((name = uenum_next(e.getAlias(), NULL, status))) {
293 if (strcmp(activeName, name) == 0) {
294 found = TRUE;
295 break;
296 }
297 }
298
299 if (!found) {
300 errln(UnicodeString(activeName) + " is not included in the all charset list.");
301 }
302
303 // some charsets are disabled by default
304 found = FALSE;
305 for (int32_t i = 0; defDisabled[i] != 0; i++) {
306 if (strcmp(activeName, defDisabled[i]) == 0) {
307 found = TRUE;
308 break;
309 }
310 }
311 if (found) {
312 errln(UnicodeString(activeName) + " should not be included in the default charset list.");
313 }
314 }
315 }
316
317 void CharsetDetectionTest::UTF8Test()
318 {
319 UErrorCode status = U_ZERO_ERROR;
320 UnicodeString ss = "This is a string with some non-ascii characters that will "
321 "be converted to UTF-8, then shoved through the detection process. "
322 "\\u0391\\u0392\\u0393\\u0394\\u0395"
323 "Sure would be nice if our source could contain Unicode directly!";
324 UnicodeString s = ss.unescape();
325 int32_t byteLength = 0, sLength = s.length();
326 char *bytes = extractBytes(s, "UTF-8", byteLength);
327 UCharsetDetector *csd = ucsdet_open(&status);
328 const UCharsetMatch *match;
329 UChar *detected = NEW_ARRAY(UChar, sLength);
330
331 ucsdet_setText(csd, bytes, byteLength, &status);
332 match = ucsdet_detect(csd, &status);
333
334 if (match == NULL) {
335 errln("Detection failure for UTF-8: got no matches.");
336 goto bail;
337 }
338
339 ucsdet_getUChars(match, detected, sLength, &status);
340
341 if (s.compare(detected, sLength) != 0) {
342 errln("Round-trip test failed!");
343 }
344
345 ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
346
347 bail:
348 DELETE_ARRAY(detected);
349 freeBytes(bytes);
350 ucsdet_close(csd);
351 }
352
353 void CharsetDetectionTest::UTF16Test()
354 {
355 UErrorCode status = U_ZERO_ERROR;
356 /* Notice the BOM on the start of this string */
357 UChar chars[] = {
358 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
359 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
360 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
361 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
362 0x064a, 0x062a, 0x0000};
363 UnicodeString s(chars);
364 int32_t beLength = 0, leLength = 0;
365 char *beBytes = extractBytes(s, "UTF-16BE", beLength);
366 char *leBytes = extractBytes(s, "UTF-16LE", leLength);
367 UCharsetDetector *csd = ucsdet_open(&status);
368 const UCharsetMatch *match;
369 const char *name;
370 int32_t conf;
371
372 ucsdet_setText(csd, beBytes, beLength, &status);
373 match = ucsdet_detect(csd, &status);
374
375 if (match == NULL) {
376 errln("Encoding detection failure for UTF-16BE: got no matches.");
377 goto try_le;
378 }
379
380 name = ucsdet_getName(match, &status);
381 conf = ucsdet_getConfidence(match, &status);
382
383 if (strcmp(name, "UTF-16BE") != 0) {
384 errln("Encoding detection failure for UTF-16BE: got %s", name);
385 goto try_le; // no point in looking at confidence if we got the wrong character set.
386 }
387
388 if (conf != 100) {
389 errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
390 }
391
392 try_le:
393 ucsdet_setText(csd, leBytes, leLength, &status);
394 match = ucsdet_detect(csd, &status);
395
396 if (match == NULL) {
397 errln("Encoding detection failure for UTF-16LE: got no matches.");
398 goto bail;
399 }
400
401 name = ucsdet_getName(match, &status);
402 conf = ucsdet_getConfidence(match, &status);
403
404
405 if (strcmp(name, "UTF-16LE") != 0) {
406 errln("Enconding detection failure for UTF-16LE: got %s", name);
407 goto bail; // no point in looking at confidence if we got the wrong character set.
408 }
409
410 if (conf != 100) {
411 errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
412 }
413
414 bail:
415 freeBytes(leBytes);
416 freeBytes(beBytes);
417 ucsdet_close(csd);
418 }
419
420 void CharsetDetectionTest::InputFilterTest()
421 {
422 UErrorCode status = U_ZERO_ERROR;
423 UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
424 UnicodeString s = ss.unescape();
425 int32_t byteLength = 0;
426 char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
427 UCharsetDetector *csd = ucsdet_open(&status);
428 const UCharsetMatch *match;
429 const char *lang, *name;
430
431 ucsdet_enableInputFilter(csd, TRUE);
432
433 if (!ucsdet_isInputFilterEnabled(csd)) {
434 errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
435 }
436
437
438 ucsdet_setText(csd, bytes, byteLength, &status);
439 match = ucsdet_detect(csd, &status);
440
441 if (match == NULL) {
442 errln("Turning on the input filter resulted in no matches.");
443 goto turn_off;
444 }
445
446 name = ucsdet_getName(match, &status);
447
448 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
449 errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
450 } else {
451 lang = ucsdet_getLanguage(match, &status);
452
453 if (lang == NULL || strcmp(lang, "fr") != 0) {
454 errln("Input filter did not strip markup!");
455 }
456 }
457
458 turn_off:
459 ucsdet_enableInputFilter(csd, FALSE);
460 ucsdet_setText(csd, bytes, byteLength, &status);
461 match = ucsdet_detect(csd, &status);
462
463 if (match == NULL) {
464 errln("Turning off the input filter resulted in no matches.");
465 goto bail;
466 }
467
468 name = ucsdet_getName(match, &status);
469
470 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
471 errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
472 } else {
473 lang = ucsdet_getLanguage(match, &status);
474
475 if (lang == NULL || strcmp(lang, "en") != 0) {
476 errln("Unfiltered input did not detect as English!");
477 }
478 }
479
480 bail:
481 freeBytes(bytes);
482 ucsdet_close(csd);
483 }
484
485 void CharsetDetectionTest::C1BytesTest()
486 {
487 #if !UCONFIG_NO_LEGACY_CONVERSION
488 UErrorCode status = U_ZERO_ERROR;
489 UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
490 UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
491 UnicodeString sWindows = ssWindows.unescape();
492 int32_t lISO = 0, lWindows = 0;
493 char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
494 char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
495 UCharsetDetector *csd = ucsdet_open(&status);
496 const UCharsetMatch *match;
497 const char *name;
498
499 ucsdet_setText(csd, bWindows, lWindows, &status);
500 match = ucsdet_detect(csd, &status);
501
502 if (match == NULL) {
503 errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status));
504 goto bail;
505 }
506
507 name = ucsdet_getName(match, &status);
508
509 if (strcmp(name, "windows-1252") != 0) {
510 errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
511 }
512
513 ucsdet_setText(csd, bISO, lISO, &status);
514 match = ucsdet_detect(csd, &status);
515
516 if (match == NULL) {
517 errln("English text without C1 bytes got no matches.");
518 goto bail;
519 }
520
521 name = ucsdet_getName(match, &status);
522
523 if (strcmp(name, "ISO-8859-1") != 0) {
524 errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
525 }
526
527 bail:
528 freeBytes(bWindows);
529 freeBytes(bISO);
530
531 ucsdet_close(csd);
532 #endif
533 }
534
535 void CharsetDetectionTest::DetectionTest()
536 {
537 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
538 UErrorCode status = U_ZERO_ERROR;
539 char path[2048];
540 const char *testFilePath = getPath(path, "csdetest.xml");
541
542 if (testFilePath == NULL) {
543 return; /* Couldn't get path: error message already output. */
544 }
545
546 UXMLParser *parser = UXMLParser::createParser(status);
547 if (U_FAILURE(status)) {
548 dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status));
549 return;
550 }
551
552 UXMLElement *root = parser->parseFile(testFilePath, status);
553 if (!assertSuccess( "parseFile",status)) return;
554
555 UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
556 UnicodeString id_attr = UNICODE_STRING_SIMPLE("id");
557 UnicodeString enc_attr = UNICODE_STRING_SIMPLE("encodings");
558
559 const UXMLElement *testCase;
560 int32_t tc = 0;
561
562 while((testCase = root->nextChildElement(tc)) != NULL) {
563 if (testCase->getTagName().compare(test_case) == 0) {
564 const UnicodeString *id = testCase->getAttribute(id_attr);
565 const UnicodeString *encodings = testCase->getAttribute(enc_attr);
566 const UnicodeString text = testCase->getText(TRUE);
567 int32_t encodingCount;
568 UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount);
569
570 for(int32_t e = 0; e < encodingCount; e += 1) {
571 checkEncoding(text, encodingList[e], *id);
572 }
573
574 delete[] encodingList;
575 }
576 }
577
578 delete root;
579 delete parser;
580 #endif
581 }
582
583 void CharsetDetectionTest::IBM424Test()
584 {
585 #if !UCONFIG_ONLY_HTML_CONVERSION
586 UErrorCode status = U_ZERO_ERROR;
587
588 static const UChar chars[] = {
589 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
590 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
591 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
592 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
593 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
594 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
595 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
596 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
597 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
598 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
599 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
600 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
601 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
602 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
603 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
604 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
605 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
606 };
607
608 static const UChar chars_reverse[] = {
609 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
610 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
611 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
612 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
613 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
614 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
615 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
616 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
617 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
618 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
619 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
620 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
621 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
622 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
623 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
624 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
625 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
626 0x0000
627 };
628
629 int32_t bLength = 0, brLength = 0;
630
631 UnicodeString s1(chars);
632 UnicodeString s2(chars_reverse);
633
634 char *bytes = extractBytes(s1, "IBM424", bLength);
635 char *bytes_r = extractBytes(s2, "IBM424", brLength);
636
637 UCharsetDetector *csd = ucsdet_open(&status);
638 ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
639 ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
640 ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
641 ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
642 if (U_FAILURE(status)) {
643 errln("Error opening charset detector. - %s", u_errorName(status));
644 }
645 const UCharsetMatch *match;
646 const char *name;
647
648 ucsdet_setText(csd, bytes, bLength, &status);
649 match = ucsdet_detect(csd, &status);
650
651 if (match == NULL) {
652 errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status));
653 goto bail;
654 }
655
656 name = ucsdet_getName(match, &status);
657 if (strcmp(name, "IBM424_rtl") != 0) {
658 errln("Encoding detection failure for IBM424_rtl: got %s", name);
659 }
660
661 ucsdet_setText(csd, bytes_r, brLength, &status);
662 match = ucsdet_detect(csd, &status);
663
664 if (match == NULL) {
665 errln("Encoding detection failure for IBM424_ltr: got no matches.");
666 goto bail;
667 }
668
669 name = ucsdet_getName(match, &status);
670 if (strcmp(name, "IBM424_ltr") != 0) {
671 errln("Encoding detection failure for IBM424_ltr: got %s", name);
672 }
673
674 bail:
675 freeBytes(bytes);
676 freeBytes(bytes_r);
677 ucsdet_close(csd);
678 #endif
679 }
680
681 void CharsetDetectionTest::IBM420Test()
682 {
683 #if !UCONFIG_ONLY_HTML_CONVERSION
684 UErrorCode status = U_ZERO_ERROR;
685
686 static const UChar chars[] = {
687 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
688 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
689 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
690 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
691 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
692 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
693 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
694 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
695 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
696 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
697 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
698 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
699 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
700 0x0000
701 };
702 static const UChar chars_reverse[] = {
703 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
704 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
705 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
706 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
707 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
708 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
709 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
710 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
711 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
712 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
713 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
714 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
715 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
716 0x0000,
717 };
718
719 int32_t bLength = 0, brLength = 0;
720
721 UnicodeString s1(chars);
722 UnicodeString s2(chars_reverse);
723
724 char *bytes = extractBytes(s1, "IBM420", bLength);
725 char *bytes_r = extractBytes(s2, "IBM420", brLength);
726
727 UCharsetDetector *csd = ucsdet_open(&status);
728 if (U_FAILURE(status)) {
729 errln("Error opening charset detector. - %s", u_errorName(status));
730 }
731 ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
732 ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
733 ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
734 ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
735 const UCharsetMatch *match;
736 const char *name;
737
738 ucsdet_setText(csd, bytes, bLength, &status);
739 match = ucsdet_detect(csd, &status);
740
741 if (match == NULL) {
742 errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status));
743 goto bail;
744 }
745
746 name = ucsdet_getName(match, &status);
747 if (strcmp(name, "IBM420_rtl") != 0) {
748 errln("Encoding detection failure for IBM420_rtl: got %s\n", name);
749 }
750
751 ucsdet_setText(csd, bytes_r, brLength, &status);
752 match = ucsdet_detect(csd, &status);
753
754 if (match == NULL) {
755 errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
756 goto bail;
757 }
758
759 name = ucsdet_getName(match, &status);
760 if (strcmp(name, "IBM420_ltr") != 0) {
761 errln("Encoding detection failure for IBM420_ltr: got %s\n", name);
762 }
763
764 bail:
765 freeBytes(bytes);
766 freeBytes(bytes_r);
767 ucsdet_close(csd);
768 #endif
769 }
770
771
772 void CharsetDetectionTest::Ticket6394Test() {
773 #if !UCONFIG_NO_CONVERSION
774 const char charText[] = "Here is some random English text that should be detected as ISO-8859-1."
775 "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
776 "encodings more than once. The hop through UnicodeString is for platforms "
777 "where this char * string is be EBCDIC and needs conversion to Latin1.";
778 char latin1Text[sizeof(charText)];
779 UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");
780
781 UErrorCode status = U_ZERO_ERROR;
782 UCharsetDetector *csd = ucsdet_open(&status);
783 ucsdet_setText(csd, latin1Text, -1, &status);
784 if (U_FAILURE(status)) {
785 errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_errorName(status));
786 return;
787 }
788
789 int32_t matchCount = 0;
790 const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
791 if (U_FAILURE(status)) {
792 errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_errorName(status));
793 return;
794 }
795
796 UnicodeSet setOfCharsetNames; // UnicodSets can hold strings.
797 int32_t i;
798 for (i=0; i<matchCount; i++) {
799 UnicodeString charSetName(ucsdet_getName(matches[i], &status));
800 if (U_FAILURE(status)) {
801 errln("Fail at file %s, line %d. status = %s; i=%d", __FILE__, __LINE__, u_errorName(status), i);
802 status = U_ZERO_ERROR;
803 }
804 if (setOfCharsetNames.contains(charSetName)) {
805 errln("Fail at file %s, line %d ", __FILE__, __LINE__);
806 errln(UnicodeString(" Duplicate charset name = ") + charSetName);
807 }
808 setOfCharsetNames.add(charSetName);
809 }
810 ucsdet_close(csd);
811 #endif
812 }
813
814
815 // Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between
816 // similar Windows and non-Windows SBCS encodings. State was kept in the shared
817 // Charset Recognizer objects, and could be overwritten.
818 void CharsetDetectionTest::Ticket6954Test() {
819 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_FORMATTING
820 UErrorCode status = U_ZERO_ERROR;
821 UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
822 UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly."
823 "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
824 UnicodeString sWindows = ssWindows.unescape();
825 int32_t lISO = 0, lWindows = 0;
826 char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
827 char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
828
829 // First do a plain vanilla detect of 1252 text
830
831 UCharsetDetector *csd1 = ucsdet_open(&status);
832 ucsdet_setText(csd1, bWindows, lWindows, &status);
833 const UCharsetMatch *match1 = ucsdet_detect(csd1, &status);
834 const char *name1 = ucsdet_getName(match1, &status);
835 TEST_ASSERT_SUCCESS(status);
836 TEST_ASSERT(strcmp(name1, "windows-1252")==0);
837
838 // Next, using a completely separate detector, detect some 8859-1 text
839
840 UCharsetDetector *csd2 = ucsdet_open(&status);
841 ucsdet_setText(csd2, bISO, lISO, &status);
842 const UCharsetMatch *match2 = ucsdet_detect(csd2, &status);
843 const char *name2 = ucsdet_getName(match2, &status);
844 TEST_ASSERT_SUCCESS(status);
845 TEST_ASSERT(strcmp(name2, "ISO-8859-1")==0);
846
847 // Recheck the 1252 results from the first detector, which should not have been
848 // altered by the use of a different detector.
849
850 name1 = ucsdet_getName(match1, &status);
851 TEST_ASSERT_SUCCESS(status);
852 TEST_ASSERT(strcmp(name1, "windows-1252")==0);
853
854 ucsdet_close(csd1);
855 ucsdet_close(csd2);
856 freeBytes(bISO);
857 freeBytes(bWindows);
858 #endif
859 }