2 **********************************************************************
3 * Copyright (C) 2005-2016, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
9 #include "unicode/utypes.h"
10 #include "unicode/ucsdet.h"
11 #include "unicode/ucnv.h"
12 #include "unicode/unistr.h"
13 #include "unicode/putil.h"
14 #include "unicode/uniset.h"
19 #include "xmlparser.h"
// Allocates an uninitialized array of `count` elements of `type`. The uprv_
// prefix is commented out, so this expands to a plain malloc() call.
28 #define NEW_ARRAY(type,count) (type *) /*uprv_*/malloc((count) * sizeof(type))
// Releases an array obtained from NEW_ARRAY; expands to a plain free() call.
29 #define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))
// U+0020 SPACE: delimiter DetectionTest passes to split() for the encodings list.
31 #define CH_SPACE 0x0020
// U+002F SOLIDUS: delimiter checkEncoding passes to split() for its encoding argument.
32 #define CH_SLASH 0x002F
// Reports a test error naming the current file and line when `x` is false.
34 #define TEST_ASSERT(x) {if (!(x)) { \
35 errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
// Reports a test error (file, line, ICU error name) when `errcode` is a failure.
// NOTE(review): the macro continues past the last line shown here (it ends in a
// line continuation); confirm the remainder against the full file.
37 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
38 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));\
42 //---------------------------------------------------------------------------
44 // Test class boilerplate
46 //---------------------------------------------------------------------------
// Constructor of the test class.
// NOTE(review): the body is not visible in this listing — presumably empty; confirm.
47 CharsetDetectionTest::CharsetDetectionTest()
// Destructor of the test class.
// NOTE(review): the body is not visible in this listing — presumably empty; confirm.
52 CharsetDetectionTest::~CharsetDetectionTest()
58 void CharsetDetectionTest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* /*par*/ )
60 if (exec
) logln("TestSuite CharsetDetectionTest: ");
62 case 0: name
= "ConstructionTest";
63 if (exec
) ConstructionTest();
66 case 1: name
= "UTF8Test";
70 case 2: name
= "UTF16Test";
71 if (exec
) UTF16Test();
74 case 3: name
= "C1BytesTest";
75 if (exec
) C1BytesTest();
78 case 4: name
= "InputFilterTest";
79 if (exec
) InputFilterTest();
82 case 5: name
= "DetectionTest";
83 if (exec
) DetectionTest();
85 #if !UCONFIG_NO_LEGACY_CONVERSION
86 case 6: name
= "IBM424Test";
87 if (exec
) IBM424Test();
90 case 7: name
= "IBM420Test";
91 if (exec
) IBM420Test();
95 case 7: name
= "skip"; break;
97 case 8: name
= "Ticket6394Test";
98 if (exec
) Ticket6394Test();
101 case 9: name
= "Ticket6954Test";
102 if (exec
) Ticket6954Test();
106 break; //needed to end loop
110 static UnicodeString
*split(const UnicodeString
&src
, UChar ch
, int32_t &splits
)
115 while((offset
= src
.indexOf(ch
, offset
+ 1)) >= 0) {
119 UnicodeString
*result
= new UnicodeString
[splits
];
125 while((end
= src
.indexOf(ch
, start
)) >= 0) {
126 src
.extractBetween(start
, end
, result
[split
++]);
130 src
.extractBetween(start
, src
.length(), result
[split
]);
135 static char *extractBytes(const UnicodeString
&source
, const char *codepage
, int32_t &length
)
137 int32_t sLength
= source
.length();
140 length
= source
.extract(0, sLength
, NULL
, codepage
);
143 bytes
= NEW_ARRAY(char, length
+ 1);
144 source
.extract(0, sLength
, bytes
, codepage
);
// Releases a byte buffer; the tests in this file call it on buffers returned by
// extractBytes().
// NOTE(review): the body is not visible in this listing — presumably it hands
// `bytes` to DELETE_ARRAY; confirm against the full file.
150 static void freeBytes(char *bytes
)
155 void CharsetDetectionTest::checkEncoding(const UnicodeString
&testString
, const UnicodeString
&encoding
, const UnicodeString
&id
)
158 int32_t testLength
= testString
.length();
159 UnicodeString
*eSplit
= split(encoding
, CH_SLASH
, splits
);
160 UErrorCode status
= U_ZERO_ERROR
;
161 int32_t cpLength
= eSplit
[0].length();
164 u_UCharsToChars(eSplit
[0].getBuffer(), codepage
, cpLength
);
165 codepage
[cpLength
] = '\0';
167 LocalUCharsetDetectorPointer
csd(ucsdet_open(&status
));
169 int32_t byteLength
= 0;
170 char *bytes
= extractBytes(testString
, codepage
, byteLength
);
173 #if !UCONFIG_NO_LEGACY_CONVERSION
174 dataerrln("Can't open a " + encoding
+ " converter for " + id
);
179 ucsdet_setText(csd
.getAlias(), bytes
, byteLength
, &status
);
181 int32_t matchCount
= 0;
182 const UCharsetMatch
**matches
= ucsdet_detectAll(csd
.getAlias(), &matchCount
, &status
);
185 UnicodeString
name(ucsdet_getName(matches
[0], &status
));
186 UnicodeString
lang(ucsdet_getLanguage(matches
[0], &status
));
187 UChar
*decoded
= NULL
;
190 if (matchCount
== 0) {
191 errln("Encoding detection failure for " + id
+ ": expected " + eSplit
[0] + ", got no matches");
195 if (name
.compare(eSplit
[0]) != 0) {
196 errln("Encoding detection failure for " + id
+ ": expected " + eSplit
[0] + ", got " + name
);
199 for (int32_t m
= 0; m
< matchCount
; m
+= 1) {
200 const char *name
= ucsdet_getName(matches
[m
], &status
);
201 const char *lang
= ucsdet_getLanguage(matches
[m
], &status
);
202 int32_t confidence
= ucsdet_getConfidence(matches
[m
], &status
);
204 printf("%s (%s) %d\n", name
, lang
, confidence
);
210 if (splits
> 1 && lang
.compare(eSplit
[1]) != 0) {
211 errln("Language detection failure for " + id
+ ", " + eSplit
[0] + ": expected " + eSplit
[1] + ", got " + lang
);
215 decoded
= NEW_ARRAY(UChar
, testLength
);
216 dLength
= ucsdet_getUChars(matches
[0], decoded
, testLength
, &status
);
218 if (testString
.compare(decoded
, dLength
) != 0) {
219 errln("Round-trip error for " + id
+ ", " + eSplit
[0] + ": getUChars() didn't yeild the original string.");
222 for(int32_t i
= 0; i
< testLength
; i
+= 1) {
223 if(testString
[i
] != decoded
[i
]) {
224 printf("Strings differ at byte %d\n", i
);
232 DELETE_ARRAY(decoded
);
239 const char *CharsetDetectionTest::getPath(char buffer
[2048], const char *filename
) {
240 UErrorCode status
= U_ZERO_ERROR
;
241 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
243 if (U_FAILURE(status
)) {
244 errln("ERROR: getPath() failed - %s", u_errorName(status
));
248 strcpy(buffer
, testDataDirectory
);
249 strcat(buffer
, filename
);
253 void CharsetDetectionTest::ConstructionTest()
255 IcuTestErrorCode
status(*this, "ConstructionTest");
256 LocalUCharsetDetectorPointer
csd(ucsdet_open(status
));
257 LocalUEnumerationPointer
e(ucsdet_getAllDetectableCharsets(csd
.getAlias(), status
));
258 int32_t count
= uenum_count(e
.getAlias(), status
);
261 printf("There are %d recognizers.\n", count
);
264 for(int32_t i
= 0; i
< count
; i
+= 1) {
266 const char *name
= uenum_next(e
.getAlias(), &length
, status
);
268 if(name
== NULL
|| length
<= 0) {
269 errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
273 printf("%s\n", name
);
277 const char* defDisabled
[] = {
278 "IBM420_rtl", "IBM420_ltr",
279 "IBM424_rtl", "IBM424_ltr",
283 LocalUEnumerationPointer
eActive(ucsdet_getDetectableCharsets(csd
.getAlias(), status
));
284 const char *activeName
= NULL
;
286 while ((activeName
= uenum_next(eActive
.getAlias(), NULL
, status
))) {
287 // the charset must be included in all list
290 const char *name
= NULL
;
291 uenum_reset(e
.getAlias(), status
);
292 while ((name
= uenum_next(e
.getAlias(), NULL
, status
))) {
293 if (strcmp(activeName
, name
) == 0) {
300 errln(UnicodeString(activeName
) + " is not included in the all charset list.");
303 // some charsets are disabled by default
305 for (int32_t i
= 0; defDisabled
[i
] != 0; i
++) {
306 if (strcmp(activeName
, defDisabled
[i
]) == 0) {
312 errln(UnicodeString(activeName
) + " should not be included in the default charset list.");
317 void CharsetDetectionTest::UTF8Test()
319 UErrorCode status
= U_ZERO_ERROR
;
320 UnicodeString ss
= "This is a string with some non-ascii characters that will "
321 "be converted to UTF-8, then shoved through the detection process. "
322 "\\u0391\\u0392\\u0393\\u0394\\u0395"
323 "Sure would be nice if our source could contain Unicode directly!";
324 UnicodeString s
= ss
.unescape();
325 int32_t byteLength
= 0, sLength
= s
.length();
326 char *bytes
= extractBytes(s
, "UTF-8", byteLength
);
327 UCharsetDetector
*csd
= ucsdet_open(&status
);
328 const UCharsetMatch
*match
;
329 UChar
*detected
= NEW_ARRAY(UChar
, sLength
);
331 ucsdet_setText(csd
, bytes
, byteLength
, &status
);
332 match
= ucsdet_detect(csd
, &status
);
335 errln("Detection failure for UTF-8: got no matches.");
339 ucsdet_getUChars(match
, detected
, sLength
, &status
);
341 if (s
.compare(detected
, sLength
) != 0) {
342 errln("Round-trip test failed!");
345 ucsdet_setDeclaredEncoding(csd
, "UTF-8", 5, &status
); /* for coverage */
348 DELETE_ARRAY(detected
);
353 void CharsetDetectionTest::UTF16Test()
355 UErrorCode status
= U_ZERO_ERROR
;
356 /* Notice the BOM on the start of this string */
358 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
359 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
360 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
361 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
362 0x064a, 0x062a, 0x0000};
363 UnicodeString
s(chars
);
364 int32_t beLength
= 0, leLength
= 0;
365 char *beBytes
= extractBytes(s
, "UTF-16BE", beLength
);
366 char *leBytes
= extractBytes(s
, "UTF-16LE", leLength
);
367 UCharsetDetector
*csd
= ucsdet_open(&status
);
368 const UCharsetMatch
*match
;
372 ucsdet_setText(csd
, beBytes
, beLength
, &status
);
373 match
= ucsdet_detect(csd
, &status
);
376 errln("Encoding detection failure for UTF-16BE: got no matches.");
380 name
= ucsdet_getName(match
, &status
);
381 conf
= ucsdet_getConfidence(match
, &status
);
383 if (strcmp(name
, "UTF-16BE") != 0) {
384 errln("Encoding detection failure for UTF-16BE: got %s", name
);
385 goto try_le
; // no point in looking at confidence if we got the wrong character set.
389 errln("Did not get 100%% confidence for UTF-16BE: got %d", conf
);
393 ucsdet_setText(csd
, leBytes
, leLength
, &status
);
394 match
= ucsdet_detect(csd
, &status
);
397 errln("Encoding detection failure for UTF-16LE: got no matches.");
401 name
= ucsdet_getName(match
, &status
);
402 conf
= ucsdet_getConfidence(match
, &status
);
405 if (strcmp(name
, "UTF-16LE") != 0) {
406 errln("Enconding detection failure for UTF-16LE: got %s", name
);
407 goto bail
; // no point in looking at confidence if we got the wrong character set.
411 errln("Did not get 100%% confidence for UTF-16LE: got %d", conf
);
420 void CharsetDetectionTest::InputFilterTest()
422 UErrorCode status
= U_ZERO_ERROR
;
423 UnicodeString ss
= "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
424 UnicodeString s
= ss
.unescape();
425 int32_t byteLength
= 0;
426 char *bytes
= extractBytes(s
, "ISO-8859-1", byteLength
);
427 UCharsetDetector
*csd
= ucsdet_open(&status
);
428 const UCharsetMatch
*match
;
429 const char *lang
, *name
;
431 ucsdet_enableInputFilter(csd
, TRUE
);
433 if (!ucsdet_isInputFilterEnabled(csd
)) {
434 errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
438 ucsdet_setText(csd
, bytes
, byteLength
, &status
);
439 match
= ucsdet_detect(csd
, &status
);
442 errln("Turning on the input filter resulted in no matches.");
446 name
= ucsdet_getName(match
, &status
);
448 if (name
== NULL
|| strcmp(name
, "ISO-8859-1") != 0) {
449 errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name
);
451 lang
= ucsdet_getLanguage(match
, &status
);
453 if (lang
== NULL
|| strcmp(lang
, "fr") != 0) {
454 errln("Input filter did not strip markup!");
459 ucsdet_enableInputFilter(csd
, FALSE
);
460 ucsdet_setText(csd
, bytes
, byteLength
, &status
);
461 match
= ucsdet_detect(csd
, &status
);
464 errln("Turning off the input filter resulted in no matches.");
468 name
= ucsdet_getName(match
, &status
);
470 if (name
== NULL
|| strcmp(name
, "ISO-8859-1") != 0) {
471 errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name
);
473 lang
= ucsdet_getLanguage(match
, &status
);
475 if (lang
== NULL
|| strcmp(lang
, "en") != 0) {
476 errln("Unfiltered input did not detect as English!");
485 void CharsetDetectionTest::C1BytesTest()
487 #if !UCONFIG_NO_LEGACY_CONVERSION
488 UErrorCode status
= U_ZERO_ERROR
;
489 UnicodeString sISO
= "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
490 UnicodeString
ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV
);
491 UnicodeString sWindows
= ssWindows
.unescape();
492 int32_t lISO
= 0, lWindows
= 0;
493 char *bISO
= extractBytes(sISO
, "ISO-8859-1", lISO
);
494 char *bWindows
= extractBytes(sWindows
, "windows-1252", lWindows
);
495 UCharsetDetector
*csd
= ucsdet_open(&status
);
496 const UCharsetMatch
*match
;
499 ucsdet_setText(csd
, bWindows
, lWindows
, &status
);
500 match
= ucsdet_detect(csd
, &status
);
503 errcheckln(status
, "English test with C1 bytes got no matches. - %s", u_errorName(status
));
507 name
= ucsdet_getName(match
, &status
);
509 if (strcmp(name
, "windows-1252") != 0) {
510 errln("English text with C1 bytes does not detect as windows-1252, but as %s", name
);
513 ucsdet_setText(csd
, bISO
, lISO
, &status
);
514 match
= ucsdet_detect(csd
, &status
);
517 errln("English text without C1 bytes got no matches.");
521 name
= ucsdet_getName(match
, &status
);
523 if (strcmp(name
, "ISO-8859-1") != 0) {
524 errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name
);
535 void CharsetDetectionTest::DetectionTest()
537 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
538 UErrorCode status
= U_ZERO_ERROR
;
540 const char *testFilePath
= getPath(path
, "csdetest.xml");
542 if (testFilePath
== NULL
) {
543 return; /* Couldn't get path: error message already output. */
546 UXMLParser
*parser
= UXMLParser::createParser(status
);
547 if (U_FAILURE(status
)) {
548 dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status
));
552 UXMLElement
*root
= parser
->parseFile(testFilePath
, status
);
553 if (!assertSuccess( "parseFile",status
)) return;
555 UnicodeString test_case
= UNICODE_STRING_SIMPLE("test-case");
556 UnicodeString id_attr
= UNICODE_STRING_SIMPLE("id");
557 UnicodeString enc_attr
= UNICODE_STRING_SIMPLE("encodings");
559 const UXMLElement
*testCase
;
562 while((testCase
= root
->nextChildElement(tc
)) != NULL
) {
563 if (testCase
->getTagName().compare(test_case
) == 0) {
564 const UnicodeString
*id
= testCase
->getAttribute(id_attr
);
565 const UnicodeString
*encodings
= testCase
->getAttribute(enc_attr
);
566 const UnicodeString text
= testCase
->getText(TRUE
);
567 int32_t encodingCount
;
568 UnicodeString
*encodingList
= split(*encodings
, CH_SPACE
, encodingCount
);
570 for(int32_t e
= 0; e
< encodingCount
; e
+= 1) {
571 checkEncoding(text
, encodingList
[e
], *id
);
574 delete[] encodingList
;
583 void CharsetDetectionTest::IBM424Test()
585 #if !UCONFIG_ONLY_HTML_CONVERSION
586 UErrorCode status
= U_ZERO_ERROR
;
588 static const UChar chars
[] = {
589 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
590 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
591 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
592 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
593 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
594 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
595 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
596 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
597 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
598 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
599 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
600 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
601 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
602 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
603 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
604 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
605 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
608 static const UChar chars_reverse
[] = {
609 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
610 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
611 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
612 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
613 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
614 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
615 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
616 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
617 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
618 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
619 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
620 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
621 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
622 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
623 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
624 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
625 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
629 int32_t bLength
= 0, brLength
= 0;
631 UnicodeString
s1(chars
);
632 UnicodeString
s2(chars_reverse
);
634 char *bytes
= extractBytes(s1
, "IBM424", bLength
);
635 char *bytes_r
= extractBytes(s2
, "IBM424", brLength
);
637 UCharsetDetector
*csd
= ucsdet_open(&status
);
638 ucsdet_setDetectableCharset(csd
, "IBM424_rtl", TRUE
, &status
);
639 ucsdet_setDetectableCharset(csd
, "IBM424_ltr", TRUE
, &status
);
640 ucsdet_setDetectableCharset(csd
, "IBM420_rtl", TRUE
, &status
);
641 ucsdet_setDetectableCharset(csd
, "IBM420_ltr", TRUE
, &status
);
642 if (U_FAILURE(status
)) {
643 errln("Error opening charset detector. - %s", u_errorName(status
));
645 const UCharsetMatch
*match
;
648 ucsdet_setText(csd
, bytes
, bLength
, &status
);
649 match
= ucsdet_detect(csd
, &status
);
652 errcheckln(status
, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status
));
656 name
= ucsdet_getName(match
, &status
);
657 if (strcmp(name
, "IBM424_rtl") != 0) {
658 errln("Encoding detection failure for IBM424_rtl: got %s", name
);
661 ucsdet_setText(csd
, bytes_r
, brLength
, &status
);
662 match
= ucsdet_detect(csd
, &status
);
665 errln("Encoding detection failure for IBM424_ltr: got no matches.");
669 name
= ucsdet_getName(match
, &status
);
670 if (strcmp(name
, "IBM424_ltr") != 0) {
671 errln("Encoding detection failure for IBM424_ltr: got %s", name
);
681 void CharsetDetectionTest::IBM420Test()
683 #if !UCONFIG_ONLY_HTML_CONVERSION
684 UErrorCode status
= U_ZERO_ERROR
;
686 static const UChar chars
[] = {
687 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
688 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
689 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
690 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
691 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
692 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
693 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
694 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
695 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
696 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
697 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
698 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
699 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
702 static const UChar chars_reverse
[] = {
703 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
704 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
705 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
706 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
707 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
708 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
709 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
710 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
711 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
712 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
713 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
714 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
715 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
719 int32_t bLength
= 0, brLength
= 0;
721 UnicodeString
s1(chars
);
722 UnicodeString
s2(chars_reverse
);
724 char *bytes
= extractBytes(s1
, "IBM420", bLength
);
725 char *bytes_r
= extractBytes(s2
, "IBM420", brLength
);
727 UCharsetDetector
*csd
= ucsdet_open(&status
);
728 if (U_FAILURE(status
)) {
729 errln("Error opening charset detector. - %s", u_errorName(status
));
731 ucsdet_setDetectableCharset(csd
, "IBM424_rtl", TRUE
, &status
);
732 ucsdet_setDetectableCharset(csd
, "IBM424_ltr", TRUE
, &status
);
733 ucsdet_setDetectableCharset(csd
, "IBM420_rtl", TRUE
, &status
);
734 ucsdet_setDetectableCharset(csd
, "IBM420_ltr", TRUE
, &status
);
735 const UCharsetMatch
*match
;
738 ucsdet_setText(csd
, bytes
, bLength
, &status
);
739 match
= ucsdet_detect(csd
, &status
);
742 errcheckln(status
, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status
));
746 name
= ucsdet_getName(match
, &status
);
747 if (strcmp(name
, "IBM420_rtl") != 0) {
748 errln("Encoding detection failure for IBM420_rtl: got %s\n", name
);
751 ucsdet_setText(csd
, bytes_r
, brLength
, &status
);
752 match
= ucsdet_detect(csd
, &status
);
755 errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
759 name
= ucsdet_getName(match
, &status
);
760 if (strcmp(name
, "IBM420_ltr") != 0) {
761 errln("Encoding detection failure for IBM420_ltr: got %s\n", name
);
772 void CharsetDetectionTest::Ticket6394Test() {
773 #if !UCONFIG_NO_CONVERSION
774 const char charText
[] = "Here is some random English text that should be detected as ISO-8859-1."
775 "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
776 "encodings more than once. The hop through UnicodeString is for platforms "
777 "where this char * string is be EBCDIC and needs conversion to Latin1.";
778 char latin1Text
[sizeof(charText
)];
779 UnicodeString(charText
).extract(0, sizeof(charText
)-2, latin1Text
, sizeof(latin1Text
), "ISO-8859-1");
781 UErrorCode status
= U_ZERO_ERROR
;
782 UCharsetDetector
*csd
= ucsdet_open(&status
);
783 ucsdet_setText(csd
, latin1Text
, -1, &status
);
784 if (U_FAILURE(status
)) {
785 errln("Fail at file %s, line %d. status = %s", __FILE__
, __LINE__
, u_errorName(status
));
789 int32_t matchCount
= 0;
790 const UCharsetMatch
**matches
= ucsdet_detectAll(csd
, &matchCount
, &status
);
791 if (U_FAILURE(status
)) {
792 errln("Fail at file %s, line %d. status = %s", __FILE__
, __LINE__
, u_errorName(status
));
796 UnicodeSet setOfCharsetNames
; // UnicodSets can hold strings.
798 for (i
=0; i
<matchCount
; i
++) {
799 UnicodeString
charSetName(ucsdet_getName(matches
[i
], &status
));
800 if (U_FAILURE(status
)) {
801 errln("Fail at file %s, line %d. status = %s; i=%d", __FILE__
, __LINE__
, u_errorName(status
), i
);
802 status
= U_ZERO_ERROR
;
804 if (setOfCharsetNames
.contains(charSetName
)) {
805 errln("Fail at file %s, line %d ", __FILE__
, __LINE__
);
806 errln(UnicodeString(" Duplicate charset name = ") + charSetName
);
808 setOfCharsetNames
.add(charSetName
);
815 // Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between
816 // similar Windows and non-Windows SBCS encodings. State was kept in the shared
817 // Charset Recognizer objects, and could be overwritten.
// Regression test for ticket 6954: detects windows-1252 text with one detector,
// ISO-8859-1 text with a second detector, then re-reads the first detector's
// match name and checks that it is still "windows-1252".
// NOTE(review): the end of this function lies beyond the visible listing; confirm
// the detector/buffer cleanup against the full file.
818 void CharsetDetectionTest::Ticket6954Test() {
819 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_FORMATTING
820 UErrorCode status
= U_ZERO_ERROR
;
821 UnicodeString sISO
= "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
822 UnicodeString
ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly."
823 "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV
);
// unescape() is applied so the \u201C / \u201D escapes above become real characters.
824 UnicodeString sWindows
= ssWindows
.unescape();
825 int32_t lISO
= 0, lWindows
= 0;
// Byte forms of the two samples, each in its own charset.
826 char *bISO
= extractBytes(sISO
, "ISO-8859-1", lISO
);
827 char *bWindows
= extractBytes(sWindows
, "windows-1252", lWindows
);
829 // First do a plain vanilla detect of 1252 text
831 UCharsetDetector
*csd1
= ucsdet_open(&status
);
832 ucsdet_setText(csd1
, bWindows
, lWindows
, &status
);
833 const UCharsetMatch
*match1
= ucsdet_detect(csd1
, &status
);
834 const char *name1
= ucsdet_getName(match1
, &status
);
835 TEST_ASSERT_SUCCESS(status
);
836 TEST_ASSERT(strcmp(name1
, "windows-1252")==0);
838 // Next, using a completely separate detector, detect some 8859-1 text
840 UCharsetDetector
*csd2
= ucsdet_open(&status
);
841 ucsdet_setText(csd2
, bISO
, lISO
, &status
);
842 const UCharsetMatch
*match2
= ucsdet_detect(csd2
, &status
);
843 const char *name2
= ucsdet_getName(match2
, &status
);
844 TEST_ASSERT_SUCCESS(status
);
845 TEST_ASSERT(strcmp(name2
, "ISO-8859-1")==0);
847 // Recheck the 1252 results from the first detector, which should not have been
848 // altered by the use of a different detector.
// match1 is the match object obtained from csd1 before csd2 was used.
850 name1
= ucsdet_getName(match1
, &status
);
851 TEST_ASSERT_SUCCESS(status
);
852 TEST_ASSERT(strcmp(name1
, "windows-1252")==0);