// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 **********************************************************************
 *   Copyright (C) 2005-2016, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */
11 #include "unicode/utypes.h"
12 #include "unicode/ucsdet.h"
13 #include "unicode/ucnv.h"
14 #include "unicode/unistr.h"
15 #include "unicode/putil.h"
16 #include "unicode/uniset.h"
21 #include "xmlparser.h"
32 #define CH_SPACE 0x0020
33 #define CH_SLASH 0x002F
// Reports a test failure (with file and line) when the expression x is false.
// Execution continues after the report.
#define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
} UPRV_BLOCK_MACRO_END
// Reports a failure (with file, line and error name) when errcode is a failing
// UErrorCode, then returns from the enclosing void function so that results
// obtained under the failed status are not used.
#define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
        return; \
    } \
} UPRV_BLOCK_MACRO_END
//---------------------------------------------------------------------------
//
//  Test class boilerplate
//
//---------------------------------------------------------------------------
54 CharsetDetectionTest::CharsetDetectionTest()
59 CharsetDetectionTest::~CharsetDetectionTest()
65 void CharsetDetectionTest::runIndexedTest( int32_t index
, UBool exec
, const char* &name
, char* /*par*/ )
67 if (exec
) logln("TestSuite CharsetDetectionTest: ");
69 case 0: name
= "ConstructionTest";
70 if (exec
) ConstructionTest();
73 case 1: name
= "UTF8Test";
77 case 2: name
= "UTF16Test";
78 if (exec
) UTF16Test();
81 case 3: name
= "C1BytesTest";
82 if (exec
) C1BytesTest();
85 case 4: name
= "InputFilterTest";
86 if (exec
) InputFilterTest();
89 case 5: name
= "DetectionTest";
90 if (exec
) DetectionTest();
92 #if !UCONFIG_NO_LEGACY_CONVERSION
93 case 6: name
= "IBM424Test";
94 if (exec
) IBM424Test();
97 case 7: name
= "IBM420Test";
98 if (exec
) IBM420Test();
102 case 7: name
= "skip"; break;
104 case 8: name
= "Ticket6394Test";
105 if (exec
) Ticket6394Test();
108 case 9: name
= "Ticket6954Test";
109 if (exec
) Ticket6954Test();
113 break; //needed to end loop
117 static UnicodeString
*split(const UnicodeString
&src
, UChar ch
, int32_t &splits
)
122 while((offset
= src
.indexOf(ch
, offset
+ 1)) >= 0) {
126 UnicodeString
*result
= new UnicodeString
[splits
];
132 while((end
= src
.indexOf(ch
, start
)) >= 0) {
133 src
.extractBetween(start
, end
, result
[split
++]);
137 src
.extractBetween(start
, src
.length(), result
[split
]);
142 static char *extractBytes(const UnicodeString
&source
, const char *codepage
, int32_t &length
)
144 int32_t sLength
= source
.length();
147 length
= source
.extract(0, sLength
, NULL
, codepage
);
150 bytes
= new char[length
+ 1];
151 source
.extract(0, sLength
, bytes
, codepage
);
157 void CharsetDetectionTest::checkEncoding(const UnicodeString
&testString
, const UnicodeString
&encoding
, const UnicodeString
&id
)
160 int32_t testLength
= testString
.length();
161 std::unique_ptr
<UnicodeString
[]> eSplit(split(encoding
, CH_SLASH
, splits
));
162 UErrorCode status
= U_ZERO_ERROR
;
163 int32_t cpLength
= eSplit
[0].length();
166 u_UCharsToChars(eSplit
[0].getBuffer(), codepage
, cpLength
);
167 codepage
[cpLength
] = '\0';
169 LocalUCharsetDetectorPointer
csd(ucsdet_open(&status
));
171 int32_t byteLength
= 0;
172 std::unique_ptr
<char []> bytes(extractBytes(testString
, codepage
, byteLength
));
175 #if !UCONFIG_NO_LEGACY_CONVERSION
176 dataerrln("Can't open a " + encoding
+ " converter for " + id
);
181 ucsdet_setText(csd
.getAlias(), bytes
.get(), byteLength
, &status
);
183 int32_t matchCount
= 0;
184 const UCharsetMatch
**matches
= ucsdet_detectAll(csd
.getAlias(), &matchCount
, &status
);
187 UnicodeString
name(ucsdet_getName(matches
[0], &status
));
188 UnicodeString
lang(ucsdet_getLanguage(matches
[0], &status
));
189 UChar
*decoded
= NULL
;
192 if (matchCount
== 0) {
193 errln("Encoding detection failure for " + id
+ ": expected " + eSplit
[0] + ", got no matches");
197 if (name
.compare(eSplit
[0]) != 0) {
198 errln("Encoding detection failure for " + id
+ ": expected " + eSplit
[0] + ", got " + name
);
201 for (int32_t m
= 0; m
< matchCount
; m
+= 1) {
202 const char *name
= ucsdet_getName(matches
[m
], &status
);
203 const char *lang
= ucsdet_getLanguage(matches
[m
], &status
);
204 int32_t confidence
= ucsdet_getConfidence(matches
[m
], &status
);
206 printf("%s (%s) %d\n", name
, lang
, confidence
);
212 if (splits
> 1 && lang
.compare(eSplit
[1]) != 0) {
213 errln("Language detection failure for " + id
+ ", " + eSplit
[0] + ": expected " + eSplit
[1] + ", got " + lang
);
217 decoded
= new UChar
[testLength
];
218 dLength
= ucsdet_getUChars(matches
[0], decoded
, testLength
, &status
);
220 if (testString
.compare(decoded
, dLength
) != 0) {
221 errln("Round-trip error for " + id
+ ", " + eSplit
[0] + ": getUChars() didn't yeild the original string.");
224 for(int32_t i
= 0; i
< testLength
; i
+= 1) {
225 if(testString
[i
] != decoded
[i
]) {
226 printf("Strings differ at byte %d\n", i
);
237 const char *CharsetDetectionTest::getPath(char buffer
[2048], const char *filename
) {
238 UErrorCode status
= U_ZERO_ERROR
;
239 const char *testDataDirectory
= IntlTest::getSourceTestData(status
);
241 if (U_FAILURE(status
)) {
242 errln("ERROR: getPath() failed - %s", u_errorName(status
));
246 strcpy(buffer
, testDataDirectory
);
247 strcat(buffer
, filename
);
251 void CharsetDetectionTest::ConstructionTest()
253 IcuTestErrorCode
status(*this, "ConstructionTest");
254 LocalUCharsetDetectorPointer
csd(ucsdet_open(status
));
255 LocalUEnumerationPointer
e(ucsdet_getAllDetectableCharsets(csd
.getAlias(), status
));
256 int32_t count
= uenum_count(e
.getAlias(), status
);
259 printf("There are %d recognizers.\n", count
);
262 for(int32_t i
= 0; i
< count
; i
+= 1) {
264 const char *name
= uenum_next(e
.getAlias(), &length
, status
);
266 if(name
== NULL
|| length
<= 0) {
267 errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
271 printf("%s\n", name
);
275 const char* defDisabled
[] = {
276 "IBM420_rtl", "IBM420_ltr",
277 "IBM424_rtl", "IBM424_ltr",
281 LocalUEnumerationPointer
eActive(ucsdet_getDetectableCharsets(csd
.getAlias(), status
));
282 const char *activeName
= NULL
;
284 while ((activeName
= uenum_next(eActive
.getAlias(), NULL
, status
))) {
285 // the charset must be included in all list
288 const char *name
= NULL
;
289 uenum_reset(e
.getAlias(), status
);
290 while ((name
= uenum_next(e
.getAlias(), NULL
, status
))) {
291 if (strcmp(activeName
, name
) == 0) {
298 errln(UnicodeString(activeName
) + " is not included in the all charset list.");
301 // some charsets are disabled by default
303 for (int32_t i
= 0; defDisabled
[i
] != 0; i
++) {
304 if (strcmp(activeName
, defDisabled
[i
]) == 0) {
310 errln(UnicodeString(activeName
) + " should not be included in the default charset list.");
315 void CharsetDetectionTest::UTF8Test()
317 UErrorCode status
= U_ZERO_ERROR
;
318 UnicodeString ss
= "This is a string with some non-ascii characters that will "
319 "be converted to UTF-8, then shoved through the detection process. "
320 "\\u0391\\u0392\\u0393\\u0394\\u0395"
321 "Sure would be nice if our source could contain Unicode directly!";
322 UnicodeString s
= ss
.unescape();
323 int32_t byteLength
= 0, sLength
= s
.length();
324 char *bytes
= extractBytes(s
, "UTF-8", byteLength
);
325 UCharsetDetector
*csd
= ucsdet_open(&status
);
326 const UCharsetMatch
*match
;
327 UChar
*detected
= new UChar
[sLength
];
329 ucsdet_setText(csd
, bytes
, byteLength
, &status
);
330 match
= ucsdet_detect(csd
, &status
);
333 errln("Detection failure for UTF-8: got no matches.");
337 ucsdet_getUChars(match
, detected
, sLength
, &status
);
339 if (s
.compare(detected
, sLength
) != 0) {
340 errln("Round-trip test failed!");
343 ucsdet_setDeclaredEncoding(csd
, "UTF-8", 5, &status
); /* for coverage */
351 void CharsetDetectionTest::UTF16Test()
353 UErrorCode status
= U_ZERO_ERROR
;
354 /* Notice the BOM on the start of this string */
356 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
357 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
358 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
359 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
360 0x064a, 0x062a, 0x0000};
361 UnicodeString
s(chars
);
362 int32_t beLength
= 0, leLength
= 0;
363 std::unique_ptr
<char []>beBytes(extractBytes(s
, "UTF-16BE", beLength
));
364 std::unique_ptr
<char []>leBytes(extractBytes(s
, "UTF-16LE", leLength
));
365 LocalUCharsetDetectorPointer
csd(ucsdet_open(&status
));
366 const UCharsetMatch
*match
;
370 ucsdet_setText(csd
.getAlias(), beBytes
.get(), beLength
, &status
);
371 match
= ucsdet_detect(csd
.getAlias(), &status
);
374 errln("Encoding detection failure for UTF-16BE: got no matches.");
377 name
= ucsdet_getName(match
, &status
);
378 conf
= ucsdet_getConfidence(match
, &status
);
380 if (strcmp(name
, "UTF-16BE") != 0) {
381 errln("Encoding detection failure for UTF-16BE: got %s", name
);
382 } else if (conf
!= 100) {
383 errln("Did not get 100%% confidence for UTF-16BE: got %d", conf
);
387 ucsdet_setText(csd
.getAlias(), leBytes
.get(), leLength
, &status
);
388 match
= ucsdet_detect(csd
.getAlias(), &status
);
391 errln("Encoding detection failure for UTF-16LE: got no matches.");
395 name
= ucsdet_getName(match
, &status
);
396 conf
= ucsdet_getConfidence(match
, &status
);
398 if (strcmp(name
, "UTF-16LE") != 0) {
399 errln("Enconding detection failure for UTF-16LE: got %s", name
);
404 errln("Did not get 100%% confidence for UTF-16LE: got %d", conf
);
408 void CharsetDetectionTest::InputFilterTest()
410 UErrorCode status
= U_ZERO_ERROR
;
411 UnicodeString
s(u
"<a> <lot> <of> <English> <inside> <the> <markup> Un très petit peu de Français. <to> <confuse> <the> <detector>");
412 int32_t byteLength
= 0;
413 char *bytes
= extractBytes(s
, "ISO-8859-1", byteLength
);
414 UCharsetDetector
*csd
= ucsdet_open(&status
);
415 const UCharsetMatch
*match
;
416 const char *lang
, *name
;
418 ucsdet_enableInputFilter(csd
, TRUE
);
420 if (!ucsdet_isInputFilterEnabled(csd
)) {
421 errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
425 ucsdet_setText(csd
, bytes
, byteLength
, &status
);
426 match
= ucsdet_detect(csd
, &status
);
429 errln("Turning on the input filter resulted in no matches.");
433 name
= ucsdet_getName(match
, &status
);
435 if (name
== NULL
|| strcmp(name
, "ISO-8859-1") != 0) {
436 errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name
);
438 lang
= ucsdet_getLanguage(match
, &status
);
440 if (lang
== NULL
|| strcmp(lang
, "fr") != 0) {
441 errln("Input filter did not strip markup!");
446 ucsdet_enableInputFilter(csd
, FALSE
);
447 ucsdet_setText(csd
, bytes
, byteLength
, &status
);
448 match
= ucsdet_detect(csd
, &status
);
451 errln("Turning off the input filter resulted in no matches.");
455 name
= ucsdet_getName(match
, &status
);
457 if (name
== NULL
|| strcmp(name
, "ISO-8859-1") != 0) {
458 errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name
);
460 lang
= ucsdet_getLanguage(match
, &status
);
462 if (lang
== NULL
|| strcmp(lang
, "en") != 0) {
463 errln("Unfiltered input did not detect as English!");
472 void CharsetDetectionTest::C1BytesTest()
474 #if !UCONFIG_NO_LEGACY_CONVERSION
475 UErrorCode status
= U_ZERO_ERROR
;
476 UnicodeString sISO
= "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
477 UnicodeString
ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV
);
478 UnicodeString sWindows
= ssWindows
.unescape();
479 int32_t lISO
= 0, lWindows
= 0;
480 char *bISO
= extractBytes(sISO
, "ISO-8859-1", lISO
);
481 char *bWindows
= extractBytes(sWindows
, "windows-1252", lWindows
);
482 UCharsetDetector
*csd
= ucsdet_open(&status
);
483 const UCharsetMatch
*match
;
486 ucsdet_setText(csd
, bWindows
, lWindows
, &status
);
487 match
= ucsdet_detect(csd
, &status
);
490 errcheckln(status
, "English test with C1 bytes got no matches. - %s", u_errorName(status
));
494 name
= ucsdet_getName(match
, &status
);
496 if (strcmp(name
, "windows-1252") != 0) {
497 errln("English text with C1 bytes does not detect as windows-1252, but as %s", name
);
500 ucsdet_setText(csd
, bISO
, lISO
, &status
);
501 match
= ucsdet_detect(csd
, &status
);
504 errln("English text without C1 bytes got no matches.");
508 name
= ucsdet_getName(match
, &status
);
510 if (strcmp(name
, "ISO-8859-1") != 0) {
511 errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name
);
522 void CharsetDetectionTest::DetectionTest()
524 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
525 UErrorCode status
= U_ZERO_ERROR
;
527 const char *testFilePath
= getPath(path
, "csdetest.xml");
529 if (testFilePath
== NULL
) {
530 return; /* Couldn't get path: error message already output. */
533 UXMLParser
*parser
= UXMLParser::createParser(status
);
534 if (U_FAILURE(status
)) {
535 dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status
));
539 UXMLElement
*root
= parser
->parseFile(testFilePath
, status
);
540 if (!assertSuccess( "parseFile",status
)) return;
542 UnicodeString test_case
= UNICODE_STRING_SIMPLE("test-case");
543 UnicodeString id_attr
= UNICODE_STRING_SIMPLE("id");
544 UnicodeString enc_attr
= UNICODE_STRING_SIMPLE("encodings");
546 const UXMLElement
*testCase
;
549 while((testCase
= root
->nextChildElement(tc
)) != NULL
) {
550 if (testCase
->getTagName().compare(test_case
) == 0) {
551 const UnicodeString
*id
= testCase
->getAttribute(id_attr
);
552 const UnicodeString
*encodings
= testCase
->getAttribute(enc_attr
);
553 const UnicodeString text
= testCase
->getText(TRUE
);
554 int32_t encodingCount
;
555 UnicodeString
*encodingList
= split(*encodings
, CH_SPACE
, encodingCount
);
557 for(int32_t e
= 0; e
< encodingCount
; e
+= 1) {
558 checkEncoding(text
, encodingList
[e
], *id
);
561 delete[] encodingList
;
570 void CharsetDetectionTest::IBM424Test()
572 #if !UCONFIG_ONLY_HTML_CONVERSION
573 UErrorCode status
= U_ZERO_ERROR
;
575 static const UChar chars
[] = {
576 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
577 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
578 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
579 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
580 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
581 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
582 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
583 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
584 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
585 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
586 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
587 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
588 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
589 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
590 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
591 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
592 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
595 static const UChar chars_reverse
[] = {
596 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
597 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
598 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
599 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
600 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
601 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
602 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
603 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
604 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
605 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
606 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
607 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
608 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
609 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
610 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
611 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
612 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
616 int32_t bLength
= 0, brLength
= 0;
618 UnicodeString
s1(chars
);
619 UnicodeString
s2(chars_reverse
);
621 char *bytes
= extractBytes(s1
, "IBM424", bLength
);
622 char *bytes_r
= extractBytes(s2
, "IBM424", brLength
);
624 UCharsetDetector
*csd
= ucsdet_open(&status
);
625 ucsdet_setDetectableCharset(csd
, "IBM424_rtl", TRUE
, &status
);
626 ucsdet_setDetectableCharset(csd
, "IBM424_ltr", TRUE
, &status
);
627 ucsdet_setDetectableCharset(csd
, "IBM420_rtl", TRUE
, &status
);
628 ucsdet_setDetectableCharset(csd
, "IBM420_ltr", TRUE
, &status
);
629 if (U_FAILURE(status
)) {
630 errln("Error opening charset detector. - %s", u_errorName(status
));
632 const UCharsetMatch
*match
;
635 ucsdet_setText(csd
, bytes
, bLength
, &status
);
636 match
= ucsdet_detect(csd
, &status
);
639 errcheckln(status
, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status
));
643 name
= ucsdet_getName(match
, &status
);
644 if (strcmp(name
, "IBM424_rtl") != 0) {
645 errln("Encoding detection failure for IBM424_rtl: got %s", name
);
648 ucsdet_setText(csd
, bytes_r
, brLength
, &status
);
649 match
= ucsdet_detect(csd
, &status
);
652 errln("Encoding detection failure for IBM424_ltr: got no matches.");
656 name
= ucsdet_getName(match
, &status
);
657 if (strcmp(name
, "IBM424_ltr") != 0) {
658 errln("Encoding detection failure for IBM424_ltr: got %s", name
);
668 void CharsetDetectionTest::IBM420Test()
670 #if !UCONFIG_ONLY_HTML_CONVERSION
671 UErrorCode status
= U_ZERO_ERROR
;
673 static const UChar chars
[] = {
674 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
675 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
676 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
677 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
678 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
679 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
680 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
681 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
682 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
683 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
684 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
685 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
686 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
689 static const UChar chars_reverse
[] = {
690 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
691 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
692 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
693 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
694 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
695 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
696 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
697 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
698 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
699 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
700 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
701 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
702 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
706 int32_t bLength
= 0, brLength
= 0;
708 UnicodeString
s1(chars
);
709 UnicodeString
s2(chars_reverse
);
711 char *bytes
= extractBytes(s1
, "IBM420", bLength
);
712 char *bytes_r
= extractBytes(s2
, "IBM420", brLength
);
714 UCharsetDetector
*csd
= ucsdet_open(&status
);
715 if (U_FAILURE(status
)) {
716 errln("Error opening charset detector. - %s", u_errorName(status
));
718 ucsdet_setDetectableCharset(csd
, "IBM424_rtl", TRUE
, &status
);
719 ucsdet_setDetectableCharset(csd
, "IBM424_ltr", TRUE
, &status
);
720 ucsdet_setDetectableCharset(csd
, "IBM420_rtl", TRUE
, &status
);
721 ucsdet_setDetectableCharset(csd
, "IBM420_ltr", TRUE
, &status
);
722 const UCharsetMatch
*match
;
725 ucsdet_setText(csd
, bytes
, bLength
, &status
);
726 match
= ucsdet_detect(csd
, &status
);
729 errcheckln(status
, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status
));
733 name
= ucsdet_getName(match
, &status
);
734 if (strcmp(name
, "IBM420_rtl") != 0) {
735 errln("Encoding detection failure for IBM420_rtl: got %s\n", name
);
738 ucsdet_setText(csd
, bytes_r
, brLength
, &status
);
739 match
= ucsdet_detect(csd
, &status
);
742 errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
746 name
= ucsdet_getName(match
, &status
);
747 if (strcmp(name
, "IBM420_ltr") != 0) {
748 errln("Encoding detection failure for IBM420_ltr: got %s\n", name
);
759 void CharsetDetectionTest::Ticket6394Test() {
760 #if !UCONFIG_NO_CONVERSION
761 const char charText
[] = "Here is some random English text that should be detected as ISO-8859-1."
762 "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
763 "encodings more than once. The hop through UnicodeString is for platforms "
764 "where this char * string is be EBCDIC and needs conversion to Latin1.";
765 char latin1Text
[sizeof(charText
)];
766 UnicodeString(charText
).extract(0, sizeof(charText
)-2, latin1Text
, sizeof(latin1Text
), "ISO-8859-1");
768 UErrorCode status
= U_ZERO_ERROR
;
769 UCharsetDetector
*csd
= ucsdet_open(&status
);
770 ucsdet_setText(csd
, latin1Text
, -1, &status
);
771 if (U_FAILURE(status
)) {
772 errln("Fail at file %s, line %d. status = %s", __FILE__
, __LINE__
, u_errorName(status
));
776 int32_t matchCount
= 0;
777 const UCharsetMatch
**matches
= ucsdet_detectAll(csd
, &matchCount
, &status
);
778 if (U_FAILURE(status
)) {
779 errln("Fail at file %s, line %d. status = %s", __FILE__
, __LINE__
, u_errorName(status
));
783 UnicodeSet setOfCharsetNames
; // UnicodSets can hold strings.
785 for (i
=0; i
<matchCount
; i
++) {
786 UnicodeString
charSetName(ucsdet_getName(matches
[i
], &status
));
787 if (U_FAILURE(status
)) {
788 errln("Fail at file %s, line %d. status = %s; i=%d", __FILE__
, __LINE__
, u_errorName(status
), i
);
789 status
= U_ZERO_ERROR
;
791 if (setOfCharsetNames
.contains(charSetName
)) {
792 errln("Fail at file %s, line %d ", __FILE__
, __LINE__
);
793 errln(UnicodeString(" Duplicate charset name = ") + charSetName
);
795 setOfCharsetNames
.add(charSetName
);
802 // Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between
803 // similar Windows and non-Windows SBCS encodings. State was kept in the shared
804 // Charset Recognizer objects, and could be overwritten.
805 void CharsetDetectionTest::Ticket6954Test() {
806 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_FORMATTING
807 UErrorCode status
= U_ZERO_ERROR
;
808 UnicodeString sISO
= "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
809 UnicodeString
ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly."
810 "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV
);
811 UnicodeString sWindows
= ssWindows
.unescape();
812 int32_t lISO
= 0, lWindows
= 0;
813 std::unique_ptr
<char[]> bISO(extractBytes(sISO
, "ISO-8859-1", lISO
));
814 std::unique_ptr
<char[]> bWindows(extractBytes(sWindows
, "windows-1252", lWindows
));
816 // First do a plain vanilla detect of 1252 text
818 LocalUCharsetDetectorPointer
csd1(ucsdet_open(&status
));
819 ucsdet_setText(csd1
.getAlias(), bWindows
.get(), lWindows
, &status
);
820 const UCharsetMatch
*match1
= ucsdet_detect(csd1
.getAlias(), &status
);
821 const char *name1
= ucsdet_getName(match1
, &status
);
822 TEST_ASSERT_SUCCESS(status
);
823 TEST_ASSERT(strcmp(name1
, "windows-1252")==0);
825 // Next, using a completely separate detector, detect some 8859-1 text
827 LocalUCharsetDetectorPointer
csd2(ucsdet_open(&status
));
828 ucsdet_setText(csd2
.getAlias(), bISO
.get(), lISO
, &status
);
829 const UCharsetMatch
*match2
= ucsdet_detect(csd2
.getAlias(), &status
);
830 const char *name2
= ucsdet_getName(match2
, &status
);
831 TEST_ASSERT_SUCCESS(status
);
832 TEST_ASSERT(strcmp(name2
, "ISO-8859-1")==0);
834 // Recheck the 1252 results from the first detector, which should not have been
835 // altered by the use of a different detector.
837 name1
= ucsdet_getName(match1
, &status
);
838 TEST_ASSERT_SUCCESS(status
);
839 TEST_ASSERT(strcmp(name1
, "windows-1252")==0);