2 **********************************************************************
3 * Copyright (C) 2005-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
9 #include "unicode/utypes.h"
10 #include "unicode/ucsdet.h"
11 #include "unicode/ucnv.h"
12 #include "unicode/unistr.h"
13 #include "unicode/putil.h"
14 #include "unicode/uniset.h"
19 #include "xmlparser.h"
// Helper macros used by the tests in this file.
// ARRAY_SIZE: element count of a statically sized array (sizeof-based, so it
// must not be applied to a pointer).
28 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
// NEW_ARRAY: malloc() storage for `count` objects of `type`; the memory is
// uninitialized and no constructors run.
30 #define NEW_ARRAY(type,count) (type *) /*uprv_*/malloc((count) * sizeof(type))
// DELETE_ARRAY: free() storage obtained from NEW_ARRAY.
31 #define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))
// Separator code units handed to split(): U+0020 SPACE and U+002F SOLIDUS.
33 #define CH_SPACE 0x0020
34 #define CH_SLASH 0x002F
36 //---------------------------------------------------------------------------
38 // Test class boilerplate
40 //---------------------------------------------------------------------------
// Constructor of the test class.
// NOTE(review): the body is not visible in this copy of the source.
41 CharsetDetectionTest::CharsetDetectionTest()
// Destructor of the test class.
// NOTE(review): the body is not visible in this copy of the source.
46 CharsetDetectionTest::~CharsetDetectionTest()
// Test dispatcher: stores the name of test number `index` in `name` and, when
// `exec` is true, runs that test.
//
//   index  test number; the cases visible here are 0..8.
//   exec   when true the selected test is executed (and a suite banner is
//          logged first); otherwise only `name` is set.
//   name   out-parameter receiving the test's name.
//   (4th)  unnamed and unused.
//
// Index -> name mapping visible below:
//   0 ConstructionTest, 1 UTF8Test, 2 UTF16Test, 3 C1BytesTest,
//   4 InputFilterTest, 5 DetectionTest, 6 IBM424Test, 7 IBM420Test,
//   8 Ticket6394Test.
// NOTE(review): the switch header, the per-case `break`s, the call for case 1
// and the #else/#endif lines are not visible in this copy; the second
// `case 7` ("skip") is presumably the branch used when legacy conversion is
// configured out -- confirm against the full file.
52 void CharsetDetectionTest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* /*par*/ )
54 if (exec
) logln("TestSuite CharsetDetectionTest: ");
56 case 0: name
= "ConstructionTest";
57 if (exec
) ConstructionTest();
60 case 1: name
= "UTF8Test";
64 case 2: name
= "UTF16Test";
65 if (exec
) UTF16Test();
68 case 3: name
= "C1BytesTest";
69 if (exec
) C1BytesTest();
72 case 4: name
= "InputFilterTest";
73 if (exec
) InputFilterTest();
76 case 5: name
= "DetectionTest";
77 if (exec
) DetectionTest();
// The IBM424/IBM420 tests are only compiled when UCONFIG_NO_LEGACY_CONVERSION
// is off.
79 #if !UCONFIG_NO_LEGACY_CONVERSION
80 case 6: name
= "IBM424Test";
81 if (exec
) IBM424Test();
84 case 7: name
= "IBM420Test";
85 if (exec
) IBM420Test();
89 case 7: name
= "skip"; break;
91 case 8: name
= "Ticket6394Test";
92 if (exec
) Ticket6394Test();
96 break; //needed to end loop
// Splits `src` into the pieces delimited by the code unit `ch`.
//
//   src     string to split.
//   ch      delimiter code unit (callers in this file pass CH_SLASH or
//           CH_SPACE).
//   splits  out-parameter; used below as the element count of the array
//           allocated with new[].
//
// The pieces are stored in a `new UnicodeString[splits]` array; the caller in
// DetectionTest releases its array with delete[].
// NOTE(review): the declarations/initial values of `offset`, `start`, `end`
// and `split`, the body of the counting loop, the update of `start` inside the
// extraction loop and the return statement are not visible in this copy.
100 static UnicodeString
*split(const UnicodeString
&src
, UChar ch
, int32_t &splits
)
// First pass: walk every occurrence of `ch` (presumably counting pieces into
// `splits` -- the loop body is not visible here).
105 while((offset
= src
.indexOf(ch
, offset
+ 1)) >= 0) {
109 UnicodeString
*result
= new UnicodeString
[splits
];
// Second pass: copy the text between consecutive delimiters into the array.
115 while((end
= src
.indexOf(ch
, start
)) >= 0) {
116 src
.extractBetween(start
, end
, result
[split
++]);
// Remainder after the last delimiter (or the whole string if none was found).
120 src
.extractBetween(start
, src
.length(), result
[split
]);
// Converts `source` to bytes in the charset named by `codepage`.
//
//   source    text to convert.
//   codepage  converter name passed straight to UnicodeString::extract().
//   length    out-parameter; receives the value returned by the first
//             extract() call, which is made with a NULL target.
//
// The buffer is allocated with NEW_ARRAY (malloc) with one extra byte beyond
// `length`.
// NOTE(review): the declaration of `bytes` and the return statement are not
// visible in this copy; presumably the buffer is returned and is meant to be
// released with freeBytes() -- confirm against the full file.
125 static char *extractBytes(const UnicodeString
&source
, const char *codepage
, int32_t &length
)
127 int32_t sLength
= source
.length();
// Sizing call: no target buffer, only the resulting length is kept.
130 length
= source
.extract(0, sLength
, NULL
, codepage
);
133 bytes
= NEW_ARRAY(char, length
+ 1);
// Real conversion into the freshly allocated buffer.
134 source
.extract(0, sLength
, bytes
, codepage
);
// Releases a byte buffer.
// NOTE(review): the body is not visible in this copy; presumably this is the
// counterpart of extractBytes() and frees its NEW_ARRAY allocation -- confirm.
140 static void freeBytes(char *bytes
)
// Runs one detection check: encodes `testString` in the requested charset,
// hands the bytes to a charset detector and verifies the best match.
//
//   testString  text to encode and then detect.
//   encoding    expected result, split on CH_SLASH: eSplit[0] is the expected
//               charset name and, when more than one piece exists, eSplit[1]
//               is the expected language.
//   id          test-case identifier; only used in failure messages.
//
// Checks visible below:
//   * at least one match is returned;
//   * the name of matches[0] equals eSplit[0];
//   * if a language was supplied, the language of matches[0] equals eSplit[1];
//   * ucsdet_getUChars() on matches[0] reproduces `testString`.
// NOTE(review): several lines are not visible in this copy (declarations of
// `splits`, `codepage` and `dLength`, some conditions, the closing of the
// #if block, and any release of `eSplit` / `bytes`), so cleanup cannot be
// confirmed from here.
145 void CharsetDetectionTest::checkEncoding(const UnicodeString
&testString
, const UnicodeString
&encoding
, const UnicodeString
&id
)
148 int32_t testLength
= testString
.length();
149 UnicodeString
*eSplit
= split(encoding
, CH_SLASH
, splits
);
150 UErrorCode status
= U_ZERO_ERROR
;
151 int32_t cpLength
= eSplit
[0].length();
// Narrow the charset name to a NUL-terminated char string for extractBytes().
154 u_UCharsToChars(eSplit
[0].getBuffer(), codepage
, cpLength
);
155 codepage
[cpLength
] = '\0';
157 LocalUCharsetDetectorPointer
csd(ucsdet_open(&status
));
159 int32_t byteLength
= 0;
160 char *bytes
= extractBytes(testString
, codepage
, byteLength
);
163 #if !UCONFIG_NO_LEGACY_CONVERSION
164 dataerrln("Can't open a " + encoding
+ " converter for " + id
);
169 ucsdet_setText(csd
.getAlias(), bytes
, byteLength
, &status
);
171 int32_t matchCount
= 0;
172 const UCharsetMatch
**matches
= ucsdet_detectAll(csd
.getAlias(), &matchCount
, &status
);
// NOTE(review): matches[0] is read here, before the matchCount == 0 check
// below -- confirm this is safe when detection yields no matches.
175 UnicodeString
name(ucsdet_getName(matches
[0], &status
));
176 UnicodeString
lang(ucsdet_getLanguage(matches
[0], &status
));
177 UChar
*decoded
= NULL
;
180 if (matchCount
== 0) {
181 errln("Encoding detection failure for " + id
+ ": expected " + eSplit
[0] + ", got no matches");
185 if (name
.compare(eSplit
[0]) != 0) {
186 errln("Encoding detection failure for " + id
+ ": expected " + eSplit
[0] + ", got " + name
);
// Diagnostic dump of every match; the loop-local `name` and `lang` shadow the
// UnicodeString variables declared above.
189 for (int32_t m
= 0; m
< matchCount
; m
+= 1) {
190 const char *name
= ucsdet_getName(matches
[m
], &status
);
191 const char *lang
= ucsdet_getLanguage(matches
[m
], &status
);
192 int32_t confidence
= ucsdet_getConfidence(matches
[m
], &status
);
194 printf("%s (%s) %d\n", name
, lang
, confidence
);
// The language is only verified when the encoding spec contained a second
// '/'-separated piece.
200 if (splits
> 1 && lang
.compare(eSplit
[1]) != 0) {
201 errln("Language detection failure for " + id
+ ", " + eSplit
[0] + ": expected " + eSplit
[1] + ", got " + lang
);
// Round trip: decode the detected bytes back into UChars (buffer sized to the
// original string's length) and compare with the original text.
205 decoded
= NEW_ARRAY(UChar
, testLength
);
206 dLength
= ucsdet_getUChars(matches
[0], decoded
, testLength
, &status
);
208 if (testString
.compare(decoded
, dLength
) != 0) {
209 errln("Round-trip error for " + id
+ ", " + eSplit
[0] + ": getUChars() didn't yeild the original string.");
// Locate mismatching positions; `i` indexes UChar code units.
212 for(int32_t i
= 0; i
< testLength
; i
+= 1) {
213 if(testString
[i
] != decoded
[i
]) {
214 printf("Strings differ at byte %d\n", i
);
222 DELETE_ARRAY(decoded
);
// Builds the path of a test-data file by copying the directory returned by
// IntlTest::getSourceTestData() into `buffer` and appending `filename`.
//
//   buffer    caller-supplied storage for the resulting path (declared as
//             char[2048]).
//   filename  file name appended directly to the directory string.
//
// On failure of getSourceTestData() an error is reported with errln().
// NOTE(review): the return statements are not visible in this copy;
// DetectionTest treats a NULL result as "error already reported".
// NOTE(review): strcpy/strcat below do not check the combined length against
// the 2048-byte buffer.
229 const char *CharsetDetectionTest::getPath(char buffer
[2048], const char *filename
) {
230 UErrorCode status
= U_ZERO_ERROR
;
231 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
233 if (U_FAILURE(status
)) {
234 errln("ERROR: getPath() failed - %s", u_errorName(status
));
238 strcpy(buffer
, testDataDirectory
);
239 strcat(buffer
, filename
);
// Opens a charset detector, enumerates every detectable charset and checks
// that each enumerated name is non-NULL with a positive length.
// The detector and the enumeration are held in Local*Pointer wrappers and the
// error code in an IcuTestErrorCode labelled "ConstructionTest".
// NOTE(review): the declaration of `length` and whatever surrounds the
// printf() calls (e.g. a debug-only guard) are not visible in this copy.
243 void CharsetDetectionTest::ConstructionTest()
245 IcuTestErrorCode
status(*this, "ConstructionTest");
246 LocalUCharsetDetectorPointer
csd(ucsdet_open(status
));
247 LocalUEnumerationPointer
e(ucsdet_getAllDetectableCharsets(csd
.getAlias(), status
));
248 int32_t count
= uenum_count(e
.getAlias(), status
);
251 printf("There are %d recognizers.\n", count
);
// Walk the enumeration once per reported element.
254 for(int32_t i
= 0; i
< count
; i
+= 1) {
256 const char *name
= uenum_next(e
.getAlias(), &length
, status
);
258 if(name
== NULL
|| length
<= 0) {
259 errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
263 printf("%s\n", name
);
// Round-trip test for UTF-8: a mostly-ASCII string containing five escaped
// non-ASCII characters (U+0391..U+0395) is converted to UTF-8 bytes, run
// through the detector, decoded again with ucsdet_getUChars() and compared
// with the original.
// ucsdet_setDeclaredEncoding() is also called once, purely for coverage.
// NOTE(review): the condition guarding the "got no matches" error and the
// release of `csd` and `bytes` are not visible in this copy.
268 void CharsetDetectionTest::UTF8Test()
270 UErrorCode status
= U_ZERO_ERROR
;
271 UnicodeString ss
= "This is a string with some non-ascii characters that will "
272 "be converted to UTF-8, then shoved through the detection process. "
273 "\\u0391\\u0392\\u0393\\u0394\\u0395"
274 "Sure would be nice if our source could contain Unicode directly!";
// unescape() turns the \uXXXX sequences above into real characters.
275 UnicodeString s
= ss
.unescape();
276 int32_t byteLength
= 0, sLength
= s
.length();
277 char *bytes
= extractBytes(s
, "UTF-8", byteLength
);
278 UCharsetDetector
*csd
= ucsdet_open(&status
);
279 const UCharsetMatch
*match
;
// Decode buffer sized to the original string's length in UChars.
280 UChar
*detected
= NEW_ARRAY(UChar
, sLength
);
282 ucsdet_setText(csd
, bytes
, byteLength
, &status
);
283 match
= ucsdet_detect(csd
, &status
);
286 errln("Detection failure for UTF-8: got no matches.");
290 ucsdet_getUChars(match
, detected
, sLength
, &status
);
292 if (s
.compare(detected
, sLength
) != 0) {
293 errln("Round-trip test failed!");
296 ucsdet_setDeclaredEncoding(csd
, "UTF-8", 5, &status
); /* for coverage */
299 DELETE_ARRAY(detected
);
// Detects a BOM-prefixed string encoded as UTF-16BE and then as UTF-16LE.
// For each byte order the best match must carry the matching name
// ("UTF-16BE" / "UTF-16LE"); a separate error reports a confidence that is
// not 100%.
// A wrong name for the BE case jumps to `try_le`; a wrong name for the LE
// case jumps to `bail`.
// NOTE(review): the array declaration line, the declarations of `name` and
// `conf`, the NULL-match and confidence conditions, the `try_le` / `bail`
// labels and the cleanup code are not visible in this copy.
304 void CharsetDetectionTest::UTF16Test()
306 UErrorCode status
= U_ZERO_ERROR
;
307 /* Notice the BOM on the start of this string */
309 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
310 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
311 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
312 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
313 0x064a, 0x062a, 0x0000};
314 UnicodeString
s(chars
);
315 int32_t beLength
= 0, leLength
= 0;
// The same text serialized in both byte orders.
316 char *beBytes
= extractBytes(s
, "UTF-16BE", beLength
);
317 char *leBytes
= extractBytes(s
, "UTF-16LE", leLength
);
318 UCharsetDetector
*csd
= ucsdet_open(&status
);
319 const UCharsetMatch
*match
;
// --- Big-endian pass ---
323 ucsdet_setText(csd
, beBytes
, beLength
, &status
);
324 match
= ucsdet_detect(csd
, &status
);
327 errln("Encoding detection failure for UTF-16BE: got no matches.");
331 name
= ucsdet_getName(match
, &status
);
332 conf
= ucsdet_getConfidence(match
, &status
);
334 if (strcmp(name
, "UTF-16BE") != 0) {
335 errln("Encoding detection failure for UTF-16BE: got %s", name
);
336 goto try_le
; // no point in looking at confidence if we got the wrong character set.
340 errln("Did not get 100%% confidence for UTF-16BE: got %d", conf
);
// --- Little-endian pass ---
344 ucsdet_setText(csd
, leBytes
, leLength
, &status
);
345 match
= ucsdet_detect(csd
, &status
);
348 errln("Encoding detection failure for UTF-16LE: got no matches.");
352 name
= ucsdet_getName(match
, &status
);
353 conf
= ucsdet_getConfidence(match
, &status
);
356 if (strcmp(name
, "UTF-16LE") != 0) {
357 errln("Enconding detection failure for UTF-16LE: got %s", name
);
358 goto bail
; // no point in looking at confidence if we got the wrong character set.
362 errln("Did not get 100%% confidence for UTF-16LE: got %d", conf
);
// Exercises the detector's input filter with ISO-8859-1 bytes of a string
// whose angle-bracket "tags" contain English words while the text between
// them is French.
//   * Filter enabled:  expects charset "ISO-8859-1" and language "fr"
//     (the failure message reads "Input filter did not strip markup!").
//   * Filter disabled: expects charset "ISO-8859-1" and language "en".
// Also checks that ucsdet_isInputFilterEnabled() reports the filter as on
// after enabling it.
// NOTE(review): the NULL-match conditions and the cleanup of `csd` / `bytes`
// are not visible in this copy.
371 void CharsetDetectionTest::InputFilterTest()
373 UErrorCode status
= U_ZERO_ERROR
;
374 UnicodeString ss
= "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
375 UnicodeString s
= ss
.unescape();
376 int32_t byteLength
= 0;
377 char *bytes
= extractBytes(s
, "ISO-8859-1", byteLength
);
378 UCharsetDetector
*csd
= ucsdet_open(&status
);
379 const UCharsetMatch
*match
;
380 const char *lang
, *name
;
// --- Pass 1: input filter on ---
382 ucsdet_enableInputFilter(csd
, TRUE
);
384 if (!ucsdet_isInputFilterEnabled(csd
)) {
385 errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
389 ucsdet_setText(csd
, bytes
, byteLength
, &status
);
390 match
= ucsdet_detect(csd
, &status
);
393 errln("Turning on the input filter resulted in no matches.");
397 name
= ucsdet_getName(match
, &status
);
399 if (name
== NULL
|| strcmp(name
, "ISO-8859-1") != 0) {
400 errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name
);
402 lang
= ucsdet_getLanguage(match
, &status
);
404 if (lang
== NULL
|| strcmp(lang
, "fr") != 0) {
405 errln("Input filter did not strip markup!");
// --- Pass 2: input filter off, same bytes ---
410 ucsdet_enableInputFilter(csd
, FALSE
);
411 ucsdet_setText(csd
, bytes
, byteLength
, &status
);
412 match
= ucsdet_detect(csd
, &status
);
415 errln("Turning off the input filter resulted in no matches.");
419 name
= ucsdet_getName(match
, &status
);
421 if (name
== NULL
|| strcmp(name
, "ISO-8859-1") != 0) {
422 errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name
);
424 lang
= ucsdet_getLanguage(match
, &status
);
426 if (lang
== NULL
|| strcmp(lang
, "en") != 0) {
427 errln("Unfiltered input did not detect as English!");
// Checks that the detector tells windows-1252 and ISO-8859-1 apart:
//   * English text that includes U+201C / U+201D (curly quotes), encoded as
//     windows-1252, must be detected as "windows-1252";
//   * plain English text encoded as ISO-8859-1 must be detected as
//     "ISO-8859-1".
// The test body is compiled only when UCONFIG_NO_LEGACY_CONVERSION is off.
// NOTE(review): the declaration of `name`, the NULL-match conditions, the
// cleanup code and the closing #endif are not visible in this copy.
436 void CharsetDetectionTest::C1BytesTest()
438 #if !UCONFIG_NO_LEGACY_CONVERSION
439 UErrorCode status
= U_ZERO_ERROR
;
440 UnicodeString sISO
= "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
441 UnicodeString
ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV
);
442 UnicodeString sWindows
= ssWindows
.unescape();
443 int32_t lISO
= 0, lWindows
= 0;
444 char *bISO
= extractBytes(sISO
, "ISO-8859-1", lISO
);
445 char *bWindows
= extractBytes(sWindows
, "windows-1252", lWindows
);
446 UCharsetDetector
*csd
= ucsdet_open(&status
);
447 const UCharsetMatch
*match
;
// --- windows-1252 sample ---
450 ucsdet_setText(csd
, bWindows
, lWindows
, &status
);
451 match
= ucsdet_detect(csd
, &status
);
454 errcheckln(status
, "English test with C1 bytes got no matches. - %s", u_errorName(status
));
458 name
= ucsdet_getName(match
, &status
);
460 if (strcmp(name
, "windows-1252") != 0) {
461 errln("English text with C1 bytes does not detect as windows-1252, but as %s", name
);
// --- ISO-8859-1 sample ---
464 ucsdet_setText(csd
, bISO
, lISO
, &status
);
465 match
= ucsdet_detect(csd
, &status
);
468 errln("English text without C1 bytes got no matches.");
472 name
= ucsdet_getName(match
, &status
);
474 if (strcmp(name
, "ISO-8859-1") != 0) {
475 errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name
);
// Data-driven test: parses "csdetest.xml" from the test-data directory and,
// for every child element of the root whose tag is "test-case", calls
// checkEncoding() once per entry of its space-separated "encodings"
// attribute, passing the element text and its "id" attribute.
// The test body is compiled only when UCONFIG_NO_REGULAR_EXPRESSIONS is off.
// NOTE(review): the declarations of `path` and `tc`, the early return after a
// createParser() failure, the release of `parser` / `root` and the closing
// #endif are not visible in this copy.
486 void CharsetDetectionTest::DetectionTest()
488 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
489 UErrorCode status
= U_ZERO_ERROR
;
491 const char *testFilePath
= getPath(path
, "csdetest.xml");
493 if (testFilePath
== NULL
) {
494 return; /* Couldn't get path: error message already output. */
497 UXMLParser
*parser
= UXMLParser::createParser(status
);
498 if (U_FAILURE(status
)) {
499 dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status
));
503 UXMLElement
*root
= parser
->parseFile(testFilePath
, status
);
504 if (!assertSuccess( "parseFile",status
)) return;
// Element and attribute names looked up in the XML file.
506 UnicodeString test_case
= UNICODE_STRING_SIMPLE("test-case");
507 UnicodeString id_attr
= UNICODE_STRING_SIMPLE("id");
508 UnicodeString enc_attr
= UNICODE_STRING_SIMPLE("encodings");
510 const UXMLElement
*testCase
;
513 while((testCase
= root
->nextChildElement(tc
)) != NULL
) {
514 if (testCase
->getTagName().compare(test_case
) == 0) {
// NOTE(review): `id` and `encodings` are dereferenced below with no visible
// NULL check -- confirm the attributes are always present in the data file.
515 const UnicodeString
*id
= testCase
->getAttribute(id_attr
);
516 const UnicodeString
*encodings
= testCase
->getAttribute(enc_attr
);
517 const UnicodeString text
= testCase
->getText(TRUE
);
518 int32_t encodingCount
;
519 UnicodeString
*encodingList
= split(*encodings
, CH_SPACE
, encodingCount
);
521 for(int32_t e
= 0; e
< encodingCount
; e
+= 1) {
522 checkEncoding(text
, encodingList
[e
], *id
);
// split() allocated the list with new[], so it is released with delete[].
525 delete[] encodingList
;
// Detection test for the IBM424 charset using text from the Hebrew block
// (U+05D0..U+05EA plus ASCII punctuation and spaces).
//   * `chars`, converted to IBM424, must be detected as "IBM424_rtl".
//   * `chars_reverse`, converted to IBM424, must be detected as "IBM424_ltr".
// `chars_reverse` appears to be `chars` in reverse code-unit order (its last
// values mirror the first values of `chars`, and vice versa).
// NOTE(review): the ends of both array initializers, the declaration of
// `name`, the NULL-match conditions, any early exit after a failed
// ucsdet_open() and the cleanup code are not visible in this copy.
534 void CharsetDetectionTest::IBM424Test()
536 UErrorCode status
= U_ZERO_ERROR
;
538 static const UChar chars
[] = {
539 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
540 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
541 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
542 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
543 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
544 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
545 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
546 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
547 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
548 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
549 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
550 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
551 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
552 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
553 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
554 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
555 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
558 static const UChar chars_reverse
[] = {
559 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
560 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
561 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
562 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
563 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
564 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
565 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
566 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
567 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
568 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
569 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
570 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
571 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
572 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
573 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
574 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
575 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
579 int32_t bLength
= 0, brLength
= 0;
581 UnicodeString
s1(chars
);
582 UnicodeString
s2(chars_reverse
);
// Both strings are converted with the same "IBM424" converter name.
584 char *bytes
= extractBytes(s1
, "IBM424", bLength
);
585 char *bytes_r
= extractBytes(s2
, "IBM424", brLength
);
587 UCharsetDetector
*csd
= ucsdet_open(&status
);
588 if (U_FAILURE(status
)) {
589 errln("Error opening charset detector. - %s", u_errorName(status
));
591 const UCharsetMatch
*match
;
// --- `chars`: expected "IBM424_rtl" ---
594 ucsdet_setText(csd
, bytes
, bLength
, &status
);
595 match
= ucsdet_detect(csd
, &status
);
598 errcheckln(status
, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status
));
602 name
= ucsdet_getName(match
, &status
);
603 if (strcmp(name
, "IBM424_rtl") != 0) {
604 errln("Encoding detection failure for IBM424_rtl: got %s", name
);
// --- `chars_reverse`: expected "IBM424_ltr" ---
607 ucsdet_setText(csd
, bytes_r
, brLength
, &status
);
608 match
= ucsdet_detect(csd
, &status
);
611 errln("Encoding detection failure for IBM424_ltr: got no matches.");
615 name
= ucsdet_getName(match
, &status
);
616 if (strcmp(name
, "IBM424_ltr") != 0) {
617 errln("Encoding detection failure for IBM424_ltr: got %s", name
);
// Detection test for the IBM420 charset using text from the Arabic block
// (U+06xx code points plus ASCII punctuation and spaces).
//   * `chars`, converted to IBM420, must be detected as "IBM420_rtl".
//   * `chars_reverse`, converted to IBM420, must be detected as "IBM420_ltr".
// `chars_reverse` appears to be `chars` in reverse code-unit order (its last
// values mirror the first values of `chars`).
// Same structure as IBM424Test, except that most error messages here end
// with an explicit "\n".
// NOTE(review): the ends of both array initializers, the declaration of
// `name`, the NULL-match conditions, any early exit after a failed
// ucsdet_open() and the cleanup code are not visible in this copy.
626 void CharsetDetectionTest::IBM420Test()
628 UErrorCode status
= U_ZERO_ERROR
;
630 static const UChar chars
[] = {
631 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
632 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
633 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
634 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
635 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
636 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
637 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
638 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
639 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
640 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
641 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
642 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
643 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
646 static const UChar chars_reverse
[] = {
647 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
648 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
649 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
650 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
651 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
652 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
653 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
654 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
655 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
656 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
657 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
658 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
659 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
663 int32_t bLength
= 0, brLength
= 0;
665 UnicodeString
s1(chars
);
666 UnicodeString
s2(chars_reverse
);
// Both strings are converted with the same "IBM420" converter name.
668 char *bytes
= extractBytes(s1
, "IBM420", bLength
);
669 char *bytes_r
= extractBytes(s2
, "IBM420", brLength
);
671 UCharsetDetector
*csd
= ucsdet_open(&status
);
672 if (U_FAILURE(status
)) {
673 errln("Error opening charset detector. - %s", u_errorName(status
));
675 const UCharsetMatch
*match
;
// --- `chars`: expected "IBM420_rtl" ---
678 ucsdet_setText(csd
, bytes
, bLength
, &status
);
679 match
= ucsdet_detect(csd
, &status
);
682 errcheckln(status
, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status
));
686 name
= ucsdet_getName(match
, &status
);
687 if (strcmp(name
, "IBM420_rtl") != 0) {
688 errln("Encoding detection failure for IBM420_rtl: got %s\n", name
);
// --- `chars_reverse`: expected "IBM420_ltr" ---
691 ucsdet_setText(csd
, bytes_r
, brLength
, &status
);
692 match
= ucsdet_detect(csd
, &status
);
695 errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
699 name
= ucsdet_getName(match
, &status
);
700 if (strcmp(name
, "IBM420_ltr") != 0) {
701 errln("Encoding detection failure for IBM420_ltr: got %s\n", name
);
// Regression test for ticket 6394: ucsdet_detectAll() must not report the
// same charset name more than once for plain English Latin-1 text.
// Every returned name is looked up in a UnicodeSet of strings; a name that is
// already present is reported as a duplicate, then the name is added.
// The test body is compiled only when UCONFIG_NO_CONVERSION is off.
// NOTE(review): the declaration of `i`, the early returns after the failure
// checks and the end of the function (loop close, detector cleanup, #endif)
// are not visible in this copy.
711 void CharsetDetectionTest::Ticket6394Test() {
712 #if !UCONFIG_NO_CONVERSION
713 const char charText
[] = "Here is some random English text that should be detected as ISO-8859-1."
714 "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
715 "encodings more than once. The hop through UnicodeString is for platforms "
716 "where this char * string is be EBCDIC and needs conversion to Latin1.";
// Re-encode the literal as ISO-8859-1 bytes via UnicodeString.
717 char latin1Text
[sizeof(charText
)];
718 UnicodeString(charText
).extract(0, sizeof(charText
)-2, latin1Text
, sizeof(latin1Text
), "ISO-8859-1");
720 UErrorCode status
= U_ZERO_ERROR
;
721 UCharsetDetector
*csd
= ucsdet_open(&status
);
// The text is passed with length -1 rather than an explicit byte count.
722 ucsdet_setText(csd
, latin1Text
, -1, &status
);
723 if (U_FAILURE(status
)) {
724 errln("Fail at file %s, line %d. status = %s", __FILE__
, __LINE__
, u_errorName(status
));
728 int32_t matchCount
= 0;
729 const UCharsetMatch
**matches
= ucsdet_detectAll(csd
, &matchCount
, &status
);
730 if (U_FAILURE(status
)) {
731 errln("Fail at file %s, line %d. status = %s", __FILE__
, __LINE__
, u_errorName(status
));
735 UnicodeSet setOfCharsetNames
; // UnicodeSets can hold strings.
737 for (i
=0; i
<matchCount
; i
++) {
738 UnicodeString
charSetName(ucsdet_getName(matches
[i
], &status
));
// A failure to fetch one name is reported, then status is reset so the
// remaining matches are still examined.
739 if (U_FAILURE(status
)) {
740 errln("Fail at file %s, line %d. status = %s; i=%d", __FILE__
, __LINE__
, u_errorName(status
), i
);
741 status
= U_ZERO_ERROR
;
743 if (setOfCharsetNames
.contains(charSetName
)) {
744 errln("Fail at file %s, line %d ", __FILE__
, __LINE__
);
745 errln(UnicodeString(" Duplicate charset name = ") + charSetName
);
747 setOfCharsetNames
.add(charSetName
);