1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 2005-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
11 #include "unicode/utypes.h"
12 #include "unicode/ucsdet.h"
13 #include "unicode/ucnv.h"
14 #include "unicode/unistr.h"
15 #include "unicode/putil.h"
16 #include "unicode/uniset.h"
21 #include "xmlparser.h"
// Allocation helpers. NEW_ARRAY mallocs room for `count` objects of `type` and
// casts the result to `type *`; DELETE_ARRAY frees such an array. The
// commented-out uprv_ prefixes show that plain malloc()/free() are used.
30 #define NEW_ARRAY(type,count) (type *) /*uprv_*/malloc((count) * sizeof(type))
31 #define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))
// Separator code points: split() is called with CH_SLASH on an
// "encoding/language" spec and with CH_SPACE on a list of encodings.
33 #define CH_SPACE 0x0020
34 #define CH_SLASH 0x002F
// TEST_ASSERT(x): when x is false, report the current file and line via errln().
36 #define TEST_ASSERT(x) {if (!(x)) { \
37 errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
// TEST_ASSERT_SUCCESS(errcode): when errcode is a failure, report file, line and
// the error name via errcheckln().
39 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
40 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));\
44 //---------------------------------------------------------------------------
46 // Test class boilerplate
48 //---------------------------------------------------------------------------
// Constructor of the charset-detection test suite.
49 CharsetDetectionTest::CharsetDetectionTest()
// Destructor of the charset-detection test suite.
54 CharsetDetectionTest::~CharsetDetectionTest()
// Test dispatcher. For a given test index it stores the test's name in `name`
// and, when `exec` is true, runs the corresponding test method.
//   index - which test is being asked for (0..9 are handled below)
//   exec  - true to run the test, false to only report its name
//   name  - out: receives the test name
//   par   - unused
60 void CharsetDetectionTest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* /*par*/ )
62 if (exec
) logln("TestSuite CharsetDetectionTest: ");
64 case 0: name
= "ConstructionTest";
65 if (exec
) ConstructionTest();
68 case 1: name
= "UTF8Test";
72 case 2: name
= "UTF16Test";
73 if (exec
) UTF16Test();
76 case 3: name
= "C1BytesTest";
77 if (exec
) C1BytesTest();
80 case 4: name
= "InputFilterTest";
81 if (exec
) InputFilterTest();
84 case 5: name
= "DetectionTest";
85 if (exec
) DetectionTest();
// The IBM424/IBM420 tests are only dispatched when legacy conversion is
// compiled in.
87 #if !UCONFIG_NO_LEGACY_CONVERSION
88 case 6: name
= "IBM424Test";
89 if (exec
) IBM424Test();
92 case 7: name
= "IBM420Test";
93 if (exec
) IBM420Test();
// NOTE(review): presumably the alternative branch of the conditional above, so
// that the index is still named ("skip") when the IBM tests are compiled out -
// confirm against the full file.
97 case 7: name
= "skip"; break;
99 case 8: name
= "Ticket6394Test";
100 if (exec
) Ticket6394Test();
103 case 9: name
= "Ticket6954Test";
104 if (exec
) Ticket6954Test();
108 break; //needed to end loop
// Splits `src` at every occurrence of `ch`.
//   src    - string to split
//   ch     - separator code unit
//   splits - in/out count of pieces; it sizes the result array
// The pieces are extracted into a `new UnicodeString[splits]` array; the caller
// in DetectionTest releases such an array with delete[].
112 static UnicodeString
*split(const UnicodeString
&src
, UChar ch
, int32_t &splits
)
// First pass: walk over every occurrence of the separator (presumably to count
// the pieces - the loop body is not shown; confirm).
117 while((offset
= src
.indexOf(ch
, offset
+ 1)) >= 0) {
121 UnicodeString
*result
= new UnicodeString
[splits
];
// Second pass: copy each piece that is terminated by a separator...
127 while((end
= src
.indexOf(ch
, start
)) >= 0) {
128 src
.extractBetween(start
, end
, result
[split
++]);
// ...then the final piece, which runs to the end of the string.
132 src
.extractBetween(start
, src
.length(), result
[split
]);
// Converts `source` to bytes in the given codepage.
//   source   - text to convert
//   codepage - converter name passed to UnicodeString::extract()
//   length   - out: byte count reported by the sizing extract() call
// The buffer is allocated with NEW_ARRAY (malloc), so it must be released with
// free()/DELETE_ARRAY rather than delete[].
137 static char *extractBytes(const UnicodeString
&source
, const char *codepage
, int32_t &length
)
139 int32_t sLength
= source
.length();
// Sizing pass: a NULL destination makes extract() report the byte count only.
142 length
= source
.extract(0, sLength
, NULL
, codepage
);
// One extra byte is allocated beyond the reported length.
// NOTE(review): no check of the malloc result is visible here.
145 bytes
= NEW_ARRAY(char, length
+ 1);
146 source
.extract(0, sLength
, bytes
, codepage
);
// Releases a byte buffer.
// NOTE(review): presumably the counterpart of extractBytes() - confirm.
152 static void freeBytes(char *bytes
)
// Converts testString to the given encoding, runs charset detection on the
// resulting bytes and reports an error when the detected encoding, the detected
// language, or the round-tripped text does not match.
//   testString - the text to encode and detect
//   encoding   - expected result, either "charset" or "charset/language"
//   id         - test-case identifier used in the failure messages
157 void CharsetDetectionTest::checkEncoding(const UnicodeString
&testString
, const UnicodeString
&encoding
, const UnicodeString
&id
)
160 int32_t testLength
= testString
.length();
// eSplit[0] is the charset name; eSplit[1], when splits > 1, is the language.
161 UnicodeString
*eSplit
= split(encoding
, CH_SLASH
, splits
);
162 UErrorCode status
= U_ZERO_ERROR
;
163 int32_t cpLength
= eSplit
[0].length();
// Copy the charset name into a NUL-terminated char buffer.
// NOTE(review): the size of `codepage` is not visible here; confirm cpLength
// cannot exceed it.
166 u_UCharsToChars(eSplit
[0].getBuffer(), codepage
, cpLength
);
167 codepage
[cpLength
] = '\0';
169 LocalUCharsetDetectorPointer
csd(ucsdet_open(&status
));
171 int32_t byteLength
= 0;
172 char *bytes
= extractBytes(testString
, codepage
, byteLength
);
// The missing-converter report is only compiled in when legacy conversion is
// available.
175 #if !UCONFIG_NO_LEGACY_CONVERSION
176 dataerrln("Can't open a " + encoding
+ " converter for " + id
);
181 ucsdet_setText(csd
.getAlias(), bytes
, byteLength
, &status
);
183 int32_t matchCount
= 0;
184 const UCharsetMatch
**matches
= ucsdet_detectAll(csd
.getAlias(), &matchCount
, &status
);
// NOTE(review): matches[0] is read here, before matchCount is compared with 0
// below - confirm that `matches` cannot be NULL or empty at this point.
187 UnicodeString
name(ucsdet_getName(matches
[0], &status
));
188 UnicodeString
lang(ucsdet_getLanguage(matches
[0], &status
));
189 UChar
*decoded
= NULL
;
192 if (matchCount
== 0) {
193 errln("Encoding detection failure for " + id
+ ": expected " + eSplit
[0] + ", got no matches");
// The best match must be the expected charset.
197 if (name
.compare(eSplit
[0]) != 0) {
198 errln("Encoding detection failure for " + id
+ ": expected " + eSplit
[0] + ", got " + name
);
// Dump of every match with its language and confidence. The `name` and `lang`
// declared in this loop shadow the UnicodeString locals of the same names.
201 for (int32_t m
= 0; m
< matchCount
; m
+= 1) {
202 const char *name
= ucsdet_getName(matches
[m
], &status
);
203 const char *lang
= ucsdet_getLanguage(matches
[m
], &status
);
204 int32_t confidence
= ucsdet_getConfidence(matches
[m
], &status
);
206 printf("%s (%s) %d\n", name
, lang
, confidence
);
// The language is only checked when the expected spec carried a "/language"
// part.
212 if (splits
> 1 && lang
.compare(eSplit
[1]) != 0) {
213 errln("Language detection failure for " + id
+ ", " + eSplit
[0] + ": expected " + eSplit
[1] + ", got " + lang
);
// Round trip: decode the detected text into a buffer sized to the original
// string and compare it with the original.
217 decoded
= NEW_ARRAY(UChar
, testLength
);
218 dLength
= ucsdet_getUChars(matches
[0], decoded
, testLength
, &status
);
220 if (testString
.compare(decoded
, dLength
) != 0) {
221 errln("Round-trip error for " + id
+ ", " + eSplit
[0] + ": getUChars() didn't yeild the original string.");
// Locate the differing positions.
// NOTE(review): this indexes decoded up to testLength rather than dLength, and
// the message says "byte" although the index counts UChars.
224 for(int32_t i
= 0; i
< testLength
; i
+= 1) {
225 if(testString
[i
] != decoded
[i
]) {
226 printf("Strings differ at byte %d\n", i
);
234 DELETE_ARRAY(decoded
);
// Builds the path of a test data file: the source test-data directory returned
// by IntlTest::getSourceTestData(), followed by `filename`.
//   buffer   - caller-supplied storage that receives the path
//   filename - file name appended to the test-data directory
// A failure to obtain the directory is reported through errln(); DetectionTest
// treats a NULL result as "error message already output".
241 const char *CharsetDetectionTest::getPath(char buffer
[2048], const char *filename
) {
242 UErrorCode status
= U_ZERO_ERROR
;
243 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
245 if (U_FAILURE(status
)) {
246 errln("ERROR: getPath() failed - %s", u_errorName(status
));
// NOTE(review): strcpy/strcat are unbounded; nothing visible here checks that
// directory + filename fits in the 2048-byte buffer.
250 strcpy(buffer
, testDataDirectory
);
251 strcat(buffer
, filename
);
// Exercises detector construction and the two charset enumerations:
//  - every name from ucsdet_getAllDetectableCharsets() must be non-NULL and
//    non-empty;
//  - every name from ucsdet_getDetectableCharsets() must also appear in the
//    all-charsets enumeration and must not be one of the charsets listed in
//    defDisabled.
255 void CharsetDetectionTest::ConstructionTest()
257 IcuTestErrorCode
status(*this, "ConstructionTest");
258 LocalUCharsetDetectorPointer
csd(ucsdet_open(status
));
259 LocalUEnumerationPointer
e(ucsdet_getAllDetectableCharsets(csd
.getAlias(), status
));
260 int32_t count
= uenum_count(e
.getAlias(), status
);
263 printf("There are %d recognizers.\n", count
);
// Walk the complete list and validate each entry.
266 for(int32_t i
= 0; i
< count
; i
+= 1) {
268 const char *name
= uenum_next(e
.getAlias(), &length
, status
);
270 if(name
== NULL
|| length
<= 0) {
271 errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
275 printf("%s\n", name
);
// Charsets expected to be absent from the default (active) list. The scan
// further down stops at a 0 entry, so the array needs a terminating 0.
279 const char* defDisabled
[] = {
280 "IBM420_rtl", "IBM420_ltr",
281 "IBM424_rtl", "IBM424_ltr",
285 LocalUEnumerationPointer
eActive(ucsdet_getDetectableCharsets(csd
.getAlias(), status
));
286 const char *activeName
= NULL
;
288 while ((activeName
= uenum_next(eActive
.getAlias(), NULL
, status
))) {
289 // each active charset must also appear in the list of all charsets
292 const char *name
= NULL
;
// Rewind the all-charsets enumeration and search it for activeName.
293 uenum_reset(e
.getAlias(), status
);
294 while ((name
= uenum_next(e
.getAlias(), NULL
, status
))) {
295 if (strcmp(activeName
, name
) == 0) {
302 errln(UnicodeString(activeName
) + " is not included in the all charset list.");
305 // some charsets are disabled by default
307 for (int32_t i
= 0; defDisabled
[i
] != 0; i
++) {
308 if (strcmp(activeName
, defDisabled
[i
]) == 0) {
314 errln(UnicodeString(activeName
) + " should not be included in the default charset list.");
// Converts a mostly-ASCII string containing five Greek letters to UTF-8 bytes,
// runs detection on them, and checks that the text decoded from the match
// equals the original string.
319 void CharsetDetectionTest::UTF8Test()
321 UErrorCode status
= U_ZERO_ERROR
;
// The \\uXXXX sequences are kept as literal text here and are turned into
// characters by unescape() below.
322 UnicodeString ss
= "This is a string with some non-ascii characters that will "
323 "be converted to UTF-8, then shoved through the detection process. "
324 "\\u0391\\u0392\\u0393\\u0394\\u0395"
325 "Sure would be nice if our source could contain Unicode directly!";
326 UnicodeString s
= ss
.unescape();
327 int32_t byteLength
= 0, sLength
= s
.length();
328 char *bytes
= extractBytes(s
, "UTF-8", byteLength
);
329 UCharsetDetector
*csd
= ucsdet_open(&status
);
330 const UCharsetMatch
*match
;
// Destination for the decoded text, sized to the original string length.
331 UChar
*detected
= NEW_ARRAY(UChar
, sLength
);
333 ucsdet_setText(csd
, bytes
, byteLength
, &status
);
334 match
= ucsdet_detect(csd
, &status
);
337 errln("Detection failure for UTF-8: got no matches.");
341 ucsdet_getUChars(match
, detected
, sLength
, &status
);
343 if (s
.compare(detected
, sLength
) != 0) {
344 errln("Round-trip test failed!");
347 ucsdet_setDeclaredEncoding(csd
, "UTF-8", 5, &status
); /* for coverage */
350 DELETE_ARRAY(detected
);
// Encodes a BOM-prefixed Arabic string as UTF-16BE and as UTF-16LE and checks
// that each byte sequence is detected under the matching name; a separate
// message reports a confidence below 100%.
355 void CharsetDetectionTest::UTF16Test()
357 UErrorCode status
= U_ZERO_ERROR
;
358 /* Notice the BOM on the start of this string */
360 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
361 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
362 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
363 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
364 0x064a, 0x062a, 0x0000};
365 UnicodeString
s(chars
);
366 int32_t beLength
= 0, leLength
= 0;
367 char *beBytes
= extractBytes(s
, "UTF-16BE", beLength
);
368 char *leBytes
= extractBytes(s
, "UTF-16LE", leLength
);
369 UCharsetDetector
*csd
= ucsdet_open(&status
);
370 const UCharsetMatch
*match
;
// Big-endian pass.
374 ucsdet_setText(csd
, beBytes
, beLength
, &status
);
375 match
= ucsdet_detect(csd
, &status
);
378 errln("Encoding detection failure for UTF-16BE: got no matches.");
382 name
= ucsdet_getName(match
, &status
);
383 conf
= ucsdet_getConfidence(match
, &status
);
385 if (strcmp(name
, "UTF-16BE") != 0) {
386 errln("Encoding detection failure for UTF-16BE: got %s", name
);
387 goto try_le
; // no point in looking at confidence if we got the wrong character set.
391 errln("Did not get 100%% confidence for UTF-16BE: got %d", conf
);
// Little-endian pass, on the same detector.
395 ucsdet_setText(csd
, leBytes
, leLength
, &status
);
396 match
= ucsdet_detect(csd
, &status
);
399 errln("Encoding detection failure for UTF-16LE: got no matches.");
403 name
= ucsdet_getName(match
, &status
);
404 conf
= ucsdet_getConfidence(match
, &status
);
407 if (strcmp(name
, "UTF-16LE") != 0) {
408 errln("Enconding detection failure for UTF-16LE: got %s", name
);
409 goto bail
; // no point in looking at confidence if we got the wrong character set.
413 errln("Did not get 100%% confidence for UTF-16LE: got %d", conf
);
// Checks the detector's input filter on ISO-8859-1 text made of a short French
// sentence surrounded by English words in angle brackets. With the filter
// enabled the expected language is "fr"; with it disabled the expected language
// is "en". The expected charset is ISO-8859-1 in both cases.
422 void CharsetDetectionTest::InputFilterTest()
424 UErrorCode status
= U_ZERO_ERROR
;
425 UnicodeString ss
= "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
426 UnicodeString s
= ss
.unescape();
427 int32_t byteLength
= 0;
428 char *bytes
= extractBytes(s
, "ISO-8859-1", byteLength
);
429 UCharsetDetector
*csd
= ucsdet_open(&status
);
430 const UCharsetMatch
*match
;
431 const char *lang
, *name
;
// Pass 1: filter enabled.
433 ucsdet_enableInputFilter(csd
, TRUE
);
435 if (!ucsdet_isInputFilterEnabled(csd
)) {
436 errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
440 ucsdet_setText(csd
, bytes
, byteLength
, &status
);
441 match
= ucsdet_detect(csd
, &status
);
444 errln("Turning on the input filter resulted in no matches.");
448 name
= ucsdet_getName(match
, &status
);
// NOTE(review): when name is NULL it is still passed to the %s format below.
450 if (name
== NULL
|| strcmp(name
, "ISO-8859-1") != 0) {
451 errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name
);
453 lang
= ucsdet_getLanguage(match
, &status
);
455 if (lang
== NULL
|| strcmp(lang
, "fr") != 0) {
456 errln("Input filter did not strip markup!");
// Pass 2: filter disabled, same bytes.
461 ucsdet_enableInputFilter(csd
, FALSE
);
462 ucsdet_setText(csd
, bytes
, byteLength
, &status
);
463 match
= ucsdet_detect(csd
, &status
);
466 errln("Turning off the input filter resulted in no matches.");
470 name
= ucsdet_getName(match
, &status
);
472 if (name
== NULL
|| strcmp(name
, "ISO-8859-1") != 0) {
473 errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name
);
475 lang
= ucsdet_getLanguage(match
, &status
);
477 if (lang
== NULL
|| strcmp(lang
, "en") != 0) {
478 errln("Unfiltered input did not detect as English!");
// Checks that English text containing curly quotes (U+201C/U+201D), encoded as
// windows-1252, is detected as windows-1252, while plain English text encoded
// as ISO-8859-1 is detected as ISO-8859-1. Compiled only when legacy conversion
// is available.
487 void CharsetDetectionTest::C1BytesTest()
489 #if !UCONFIG_NO_LEGACY_CONVERSION
490 UErrorCode status
= U_ZERO_ERROR
;
491 UnicodeString sISO
= "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
// Built with US_INV; the \\u201C / \\u201D escapes are resolved by unescape().
492 UnicodeString
ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV
);
493 UnicodeString sWindows
= ssWindows
.unescape();
494 int32_t lISO
= 0, lWindows
= 0;
495 char *bISO
= extractBytes(sISO
, "ISO-8859-1", lISO
);
496 char *bWindows
= extractBytes(sWindows
, "windows-1252", lWindows
);
497 UCharsetDetector
*csd
= ucsdet_open(&status
);
498 const UCharsetMatch
*match
;
// windows-1252 sample first.
501 ucsdet_setText(csd
, bWindows
, lWindows
, &status
);
502 match
= ucsdet_detect(csd
, &status
);
505 errcheckln(status
, "English test with C1 bytes got no matches. - %s", u_errorName(status
));
509 name
= ucsdet_getName(match
, &status
);
511 if (strcmp(name
, "windows-1252") != 0) {
512 errln("English text with C1 bytes does not detect as windows-1252, but as %s", name
);
// Then the ISO-8859-1 sample, on the same detector.
515 ucsdet_setText(csd
, bISO
, lISO
, &status
);
516 match
= ucsdet_detect(csd
, &status
);
519 errln("English text without C1 bytes got no matches.");
523 name
= ucsdet_getName(match
, &status
);
525 if (strcmp(name
, "ISO-8859-1") != 0) {
526 errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name
);
// Data-driven test: parses csdetest.xml and, for every <test-case> element,
// runs checkEncoding() on the element text once per entry of its
// space-separated "encodings" attribute. Compiled only when regular expressions
// are available.
537 void CharsetDetectionTest::DetectionTest()
539 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
540 UErrorCode status
= U_ZERO_ERROR
;
542 const char *testFilePath
= getPath(path
, "csdetest.xml");
544 if (testFilePath
== NULL
) {
545 return; /* Couldn't get path: error message already output. */
548 UXMLParser
*parser
= UXMLParser::createParser(status
);
549 if (U_FAILURE(status
)) {
550 dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status
));
554 UXMLElement
*root
= parser
->parseFile(testFilePath
, status
);
// NOTE(review): no release of `parser` is visible before this early return.
555 if (!assertSuccess( "parseFile",status
)) return;
// Element and attribute names looked up in the XML.
557 UnicodeString test_case
= UNICODE_STRING_SIMPLE("test-case");
558 UnicodeString id_attr
= UNICODE_STRING_SIMPLE("id");
559 UnicodeString enc_attr
= UNICODE_STRING_SIMPLE("encodings");
561 const UXMLElement
*testCase
;
// Visit each child element of the root; only <test-case> elements are used.
564 while((testCase
= root
->nextChildElement(tc
)) != NULL
) {
565 if (testCase
->getTagName().compare(test_case
) == 0) {
// NOTE(review): both attribute pointers are dereferenced below without a
// visible null check - confirm every test-case has "id" and "encodings".
566 const UnicodeString
*id
= testCase
->getAttribute(id_attr
);
567 const UnicodeString
*encodings
= testCase
->getAttribute(enc_attr
);
568 const UnicodeString text
= testCase
->getText(TRUE
);
569 int32_t encodingCount
;
570 UnicodeString
*encodingList
= split(*encodings
, CH_SPACE
, encodingCount
);
572 for(int32_t e
= 0; e
< encodingCount
; e
+= 1) {
573 checkEncoding(text
, encodingList
[e
], *id
);
// split() allocates the list with new[], hence delete[].
576 delete[] encodingList
;
// Encodes a Hebrew sample (`chars`) and a second sample (`chars_reverse`) as
// IBM424, enables the IBM424/IBM420 rtl and ltr charsets on the detector, and
// expects the first to be detected as IBM424_rtl and the second as IBM424_ltr.
585 void CharsetDetectionTest::IBM424Test()
587 #if !UCONFIG_ONLY_HTML_CONVERSION
588 UErrorCode status
= U_ZERO_ERROR
;
// Sample expected to detect as IBM424_rtl.
590 static const UChar chars
[] = {
591 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
592 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
593 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
594 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
595 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
596 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
597 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
598 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
599 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
600 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
601 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
602 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
603 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
604 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
605 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
606 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
607 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
// Sample expected to detect as IBM424_ltr.
610 static const UChar chars_reverse
[] = {
611 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
612 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
613 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
614 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
615 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
616 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
617 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
618 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
619 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
620 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
621 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
622 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
623 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
624 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
625 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
626 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
627 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
631 int32_t bLength
= 0, brLength
= 0;
633 UnicodeString
s1(chars
);
634 UnicodeString
s2(chars_reverse
);
// Both samples are converted with the plain "IBM424" converter.
636 char *bytes
= extractBytes(s1
, "IBM424", bLength
);
637 char *bytes_r
= extractBytes(s2
, "IBM424", brLength
);
// These four names are the ones ConstructionTest expects to be absent from the
// default list, so they are enabled explicitly here.
639 UCharsetDetector
*csd
= ucsdet_open(&status
);
640 ucsdet_setDetectableCharset(csd
, "IBM424_rtl", TRUE
, &status
);
641 ucsdet_setDetectableCharset(csd
, "IBM424_ltr", TRUE
, &status
);
642 ucsdet_setDetectableCharset(csd
, "IBM420_rtl", TRUE
, &status
);
643 ucsdet_setDetectableCharset(csd
, "IBM420_ltr", TRUE
, &status
);
// NOTE(review): this check follows the setDetectableCharset calls, so the
// "opening" message is also what a failure in those calls would produce.
644 if (U_FAILURE(status
)) {
645 errln("Error opening charset detector. - %s", u_errorName(status
));
647 const UCharsetMatch
*match
;
650 ucsdet_setText(csd
, bytes
, bLength
, &status
);
651 match
= ucsdet_detect(csd
, &status
);
654 errcheckln(status
, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status
));
658 name
= ucsdet_getName(match
, &status
);
659 if (strcmp(name
, "IBM424_rtl") != 0) {
660 errln("Encoding detection failure for IBM424_rtl: got %s", name
);
663 ucsdet_setText(csd
, bytes_r
, brLength
, &status
);
664 match
= ucsdet_detect(csd
, &status
);
667 errln("Encoding detection failure for IBM424_ltr: got no matches.");
671 name
= ucsdet_getName(match
, &status
);
672 if (strcmp(name
, "IBM424_ltr") != 0) {
673 errln("Encoding detection failure for IBM424_ltr: got %s", name
);
// Encodes an Arabic sample (`chars`) and a second sample (`chars_reverse`) as
// IBM420, enables the IBM424/IBM420 rtl and ltr charsets on the detector, and
// expects the first to be detected as IBM420_rtl and the second as IBM420_ltr.
683 void CharsetDetectionTest::IBM420Test()
685 #if !UCONFIG_ONLY_HTML_CONVERSION
686 UErrorCode status
= U_ZERO_ERROR
;
// Sample expected to detect as IBM420_rtl.
688 static const UChar chars
[] = {
689 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
690 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
691 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
692 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
693 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
694 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
695 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
696 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
697 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
698 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
699 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
700 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
701 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
// Sample expected to detect as IBM420_ltr.
704 static const UChar chars_reverse
[] = {
705 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
706 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
707 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
708 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
709 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
710 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
711 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
712 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
713 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
714 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
715 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
716 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
717 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
721 int32_t bLength
= 0, brLength
= 0;
723 UnicodeString
s1(chars
);
724 UnicodeString
s2(chars_reverse
);
// Both samples are converted with the plain "IBM420" converter.
726 char *bytes
= extractBytes(s1
, "IBM420", bLength
);
727 char *bytes_r
= extractBytes(s2
, "IBM420", brLength
);
729 UCharsetDetector
*csd
= ucsdet_open(&status
);
730 if (U_FAILURE(status
)) {
731 errln("Error opening charset detector. - %s", u_errorName(status
));
// These four names are the ones ConstructionTest expects to be absent from the
// default list, so they are enabled explicitly here.
// NOTE(review): no status check is visible after these calls, and the same
// status is reused after the failure report above.
733 ucsdet_setDetectableCharset(csd
, "IBM424_rtl", TRUE
, &status
);
734 ucsdet_setDetectableCharset(csd
, "IBM424_ltr", TRUE
, &status
);
735 ucsdet_setDetectableCharset(csd
, "IBM420_rtl", TRUE
, &status
);
736 ucsdet_setDetectableCharset(csd
, "IBM420_ltr", TRUE
, &status
);
737 const UCharsetMatch
*match
;
740 ucsdet_setText(csd
, bytes
, bLength
, &status
);
741 match
= ucsdet_detect(csd
, &status
);
744 errcheckln(status
, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status
));
748 name
= ucsdet_getName(match
, &status
);
749 if (strcmp(name
, "IBM420_rtl") != 0) {
750 errln("Encoding detection failure for IBM420_rtl: got %s\n", name
);
753 ucsdet_setText(csd
, bytes_r
, brLength
, &status
);
754 match
= ucsdet_detect(csd
, &status
);
757 errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
761 name
= ucsdet_getName(match
, &status
);
762 if (strcmp(name
, "IBM420_ltr") != 0) {
763 errln("Encoding detection failure for IBM420_ltr: got %s\n", name
);
// Regression test for ticket 6394: runs ucsdet_detectAll() on Latin-1 English
// text and reports an error if any charset name occurs more than once among the
// returned matches. Compiled only when conversion is available.
774 void CharsetDetectionTest::Ticket6394Test() {
775 #if !UCONFIG_NO_CONVERSION
776 const char charText
[] = "Here is some random English text that should be detected as ISO-8859-1."
777 "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
778 "encodings more than once. The hop through UnicodeString is for platforms "
779 "where this char * string is be EBCDIC and needs conversion to Latin1.";
780 char latin1Text
[sizeof(charText
)];
// NOTE(review): sizeof(charText) counts the terminating NUL, so a length of
// sizeof(charText)-2 is one less than the string length - confirm that leaving
// out the final character is intended.
781 UnicodeString(charText
).extract(0, sizeof(charText
)-2, latin1Text
, sizeof(latin1Text
), "ISO-8859-1");
783 UErrorCode status
= U_ZERO_ERROR
;
784 UCharsetDetector
*csd
= ucsdet_open(&status
);
// A length of -1 is passed, so latin1Text is presumably treated as
// NUL-terminated - confirm against the ucsdet_setText documentation.
785 ucsdet_setText(csd
, latin1Text
, -1, &status
);
786 if (U_FAILURE(status
)) {
787 errln("Fail at file %s, line %d. status = %s", __FILE__
, __LINE__
, u_errorName(status
));
791 int32_t matchCount
= 0;
792 const UCharsetMatch
**matches
= ucsdet_detectAll(csd
, &matchCount
, &status
);
793 if (U_FAILURE(status
)) {
794 errln("Fail at file %s, line %d. status = %s", __FILE__
, __LINE__
, u_errorName(status
));
// Names seen so far; a name already in the set is a duplicate.
798 UnicodeSet setOfCharsetNames
; // UnicodeSets can hold strings.
800 for (i
=0; i
<matchCount
; i
++) {
801 UnicodeString
charSetName(ucsdet_getName(matches
[i
], &status
));
// A failure for one match is reported and the status reset so that the
// remaining matches start from a clean status.
802 if (U_FAILURE(status
)) {
803 errln("Fail at file %s, line %d. status = %s; i=%d", __FILE__
, __LINE__
, u_errorName(status
), i
);
804 status
= U_ZERO_ERROR
;
806 if (setOfCharsetNames
.contains(charSetName
)) {
807 errln("Fail at file %s, line %d ", __FILE__
, __LINE__
);
808 errln(UnicodeString(" Duplicate charset name = ") + charSetName
);
810 setOfCharsetNames
.add(charSetName
);
817 // Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between
818 // similar Windows and non-Windows SBCS encodings. State was kept in the shared
819 // Charset Recognizer objects, and could be overwritten.
// The test detects windows-1252 text with one detector and ISO-8859-1 text with
// a second one, then asks the first match for its name again and expects it to
// still be "windows-1252".
820 void CharsetDetectionTest::Ticket6954Test() {
821 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_FORMATTING
822 UErrorCode status
= U_ZERO_ERROR
;
823 UnicodeString sISO
= "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
// Built with US_INV; the \\u201C / \\u201D escapes are resolved by unescape().
824 UnicodeString
ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly."
825 "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV
);
826 UnicodeString sWindows
= ssWindows
.unescape();
827 int32_t lISO
= 0, lWindows
= 0;
828 char *bISO
= extractBytes(sISO
, "ISO-8859-1", lISO
);
829 char *bWindows
= extractBytes(sWindows
, "windows-1252", lWindows
);
831 // First do a plain vanilla detect of 1252 text
833 UCharsetDetector
*csd1
= ucsdet_open(&status
);
834 ucsdet_setText(csd1
, bWindows
, lWindows
, &status
);
835 const UCharsetMatch
*match1
= ucsdet_detect(csd1
, &status
);
836 const char *name1
= ucsdet_getName(match1
, &status
);
// NOTE(review): name1 is passed to strcmp without a null check even when the
// status assertion above it has just failed.
837 TEST_ASSERT_SUCCESS(status
);
838 TEST_ASSERT(strcmp(name1
, "windows-1252")==0);
840 // Next, using a completely separate detector, detect some 8859-1 text
842 UCharsetDetector
*csd2
= ucsdet_open(&status
);
843 ucsdet_setText(csd2
, bISO
, lISO
, &status
);
844 const UCharsetMatch
*match2
= ucsdet_detect(csd2
, &status
);
845 const char *name2
= ucsdet_getName(match2
, &status
);
846 TEST_ASSERT_SUCCESS(status
);
847 TEST_ASSERT(strcmp(name2
, "ISO-8859-1")==0);
849 // Recheck the 1252 results from the first detector, which should not have been
850 // altered by the use of a different detector.
852 name1
= ucsdet_getName(match1
, &status
);
853 TEST_ASSERT_SUCCESS(status
);
854 TEST_ASSERT(strcmp(name1
, "windows-1252")==0);