+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
- * Copyright (C) 2005-2008, International Business Machines
+ * Copyright (C) 2005-2016, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
#include "unicode/ucnv.h"
#include "unicode/unistr.h"
#include "unicode/putil.h"
+#include "unicode/uniset.h"
#include "intltest.h"
#include "csdetest.h"
#include <stdio.h>
#endif
-#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
-
#define NEW_ARRAY(type,count) (type *) /*uprv_*/malloc((count) * sizeof(type))
#define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))
#define CH_SPACE 0x0020
#define CH_SLASH 0x002F
+#define TEST_ASSERT(x) {if (!(x)) { \
+ errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
+
+#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
+ errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));\
+ return;}}
+
+
//---------------------------------------------------------------------------
//
// Test class boilerplate
case 5: name = "DetectionTest";
if (exec) DetectionTest();
break;
+#if !UCONFIG_NO_LEGACY_CONVERSION
+ case 6: name = "IBM424Test";
+ if (exec) IBM424Test();
+ break;
+
+ case 7: name = "IBM420Test";
+ if (exec) IBM420Test();
+ break;
+#else
+ case 6:
+ case 7: name = "skip"; break;
+#endif
+ case 8: name = "Ticket6394Test";
+ if (exec) Ticket6394Test();
+ break;
+
+ case 9: name = "Ticket6954Test";
+ if (exec) Ticket6954Test();
+ break;
default: name = "";
break; //needed to end loop
u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
codepage[cpLength] = '\0';
- UCharsetDetector *csd = ucsdet_open(&status);
+ LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
int32_t byteLength = 0;
char *bytes = extractBytes(testString, codepage, byteLength);
if (bytes == NULL) {
#if !UCONFIG_NO_LEGACY_CONVERSION
- errln("Can't open a " + encoding + " converter for " + id);
+ dataerrln("Can't open a " + encoding + " converter for " + id);
#endif
return;
}
- ucsdet_setText(csd, bytes, byteLength, &status);
+ ucsdet_setText(csd.getAlias(), bytes, byteLength, &status);
int32_t matchCount = 0;
- const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
+ const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status);
UnicodeString name(ucsdet_getName(matches[0], &status));
bail:
freeBytes(bytes);
- ucsdet_close(csd);
delete[] eSplit;
}
void CharsetDetectionTest::ConstructionTest()
{
- UErrorCode status = U_ZERO_ERROR;
- UCharsetDetector *csd = ucsdet_open(&status);
- UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
- int32_t count = uenum_count(e, &status);
+ IcuTestErrorCode status(*this, "ConstructionTest");
+ LocalUCharsetDetectorPointer csd(ucsdet_open(status));
+ LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status));
+ int32_t count = uenum_count(e.getAlias(), status);
#ifdef DEBUG_DETECT
printf("There are %d recognizers.\n", count);
for(int32_t i = 0; i < count; i += 1) {
int32_t length;
- const char *name = uenum_next(e, &length, &status);
+ const char *name = uenum_next(e.getAlias(), &length, status);
if(name == NULL || length <= 0) {
errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
#endif
}
- uenum_close(e);
- ucsdet_close(csd);
+ const char* defDisabled[] = { // names that must NOT show up in the default list; terminated by 0
+ "IBM420_rtl", "IBM420_ltr",
+ "IBM424_rtl", "IBM424_ltr",
+ 0
+ };
+ // Cross-check the list from ucsdet_getDetectableCharsets() against the full list (e) and against defDisabled.
+ LocalUEnumerationPointer eActive(ucsdet_getDetectableCharsets(csd.getAlias(), status));
+ const char *activeName = NULL;
+
+ while ((activeName = uenum_next(eActive.getAlias(), NULL, status))) {
+ // Check 1: every active charset must also appear in the list of all detectable charsets.
+ UBool found = FALSE;
+ // The full enumeration is reset before each search so that it is scanned from its start.
+ const char *name = NULL;
+ uenum_reset(e.getAlias(), status);
+ while ((name = uenum_next(e.getAlias(), NULL, status))) {
+ if (strcmp(activeName, name) == 0) {
+ found = TRUE;
+ break;
+ }
+ }
+
+ if (!found) {
+ errln(UnicodeString(activeName) + " is not included in the all charset list.");
+ }
+
+ // Check 2: a charset named in defDisabled must not be part of the active (default) list.
+ found = FALSE;
+ for (int32_t i = 0; defDisabled[i] != 0; i++) {
+ if (strcmp(activeName, defDisabled[i]) == 0) {
+ found = TRUE;
+ break;
+ }
+ }
+ if (found) {
+ errln(UnicodeString(activeName) + " should not be included in the default charset list.");
+ }
+ }
}
void CharsetDetectionTest::UTF8Test()
match = ucsdet_detect(csd, &status);
if (match == NULL) {
- errln("English test with C1 bytes got no matches.");
+ errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status));
goto bail;
}
}
UXMLParser *parser = UXMLParser::createParser(status);
- if (!assertSuccess("UXMLParser::createParser",status)) return;
+ if (U_FAILURE(status)) {
+ dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status));
+ return;
+ }
+
UXMLElement *root = parser->parseFile(testFilePath, status);
if (!assertSuccess( "parseFile",status)) return;
#endif
}
+// Verifies detection of IBM424-encoded Hebrew text once the IBM420/IBM424
+// recognizers have been explicitly enabled on the detector:
+//   - `chars` converted to IBM424 must be reported as "IBM424_rtl";
+//   - `chars_reverse` (the same text in reverse character order) must be
+//     reported as "IBM424_ltr".
+// Failures are reported through errln()/errcheckln()/dataerrln(); nothing is returned.
+void CharsetDetectionTest::IBM424Test()
+{
+#if !UCONFIG_ONLY_HTML_CONVERSION
+    UErrorCode status = U_ZERO_ERROR;
+
+    static const UChar chars[] = {
+        0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
+        0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
+        0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
+        0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
+        0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
+        0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
+        0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
+        0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
+        0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
+        0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
+        0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
+        0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
+        0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
+        0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
+        0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
+        0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
+        0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
+    };
+
+    static const UChar chars_reverse[] = {
+        0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
+        0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
+        0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
+        0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
+        0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
+        0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
+        0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
+        0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
+        0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
+        0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
+        0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
+        0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
+        0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
+        0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
+        0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
+        0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
+        0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
+        0x0000
+    };
+
+    int32_t bLength = 0, brLength = 0;
+
+    UnicodeString s1(chars);
+    UnicodeString s2(chars_reverse);
+
+    char *bytes = extractBytes(s1, "IBM424", bLength);
+    char *bytes_r = extractBytes(s2, "IBM424", brLength);
+
+    // The detector is held by a local pointer so it is closed on every exit path.
+    // All locals are declared before the first "goto bail" so that no jump
+    // bypasses an initialization.
+    LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
+    const UCharsetMatch *match = NULL;
+    const char *name = NULL;
+
+    if (bytes == NULL || bytes_r == NULL) {
+        // A NULL buffer is handled the same way as in the encoding-check helper
+        // earlier in this file: report it as a data problem and stop.
+        dataerrln("Can't open an IBM424 converter for IBM424Test");
+        goto bail;
+    }
+
+    // These recognizers have to be switched on explicitly before they can match.
+    ucsdet_setDetectableCharset(csd.getAlias(), "IBM424_rtl", TRUE, &status);
+    ucsdet_setDetectableCharset(csd.getAlias(), "IBM424_ltr", TRUE, &status);
+    ucsdet_setDetectableCharset(csd.getAlias(), "IBM420_rtl", TRUE, &status);
+    ucsdet_setDetectableCharset(csd.getAlias(), "IBM420_ltr", TRUE, &status);
+    if (U_FAILURE(status)) {
+        errln("Error opening charset detector. - %s", u_errorName(status));
+        // Nothing below can succeed with a failed status; stop here.
+        goto bail;
+    }
+
+    // Pass 1: text in original order, expected to be detected as IBM424_rtl.
+    ucsdet_setText(csd.getAlias(), bytes, bLength, &status);
+    match = ucsdet_detect(csd.getAlias(), &status);
+    if (match == NULL) {
+        errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status));
+        goto bail;
+    }
+
+    name = ucsdet_getName(match, &status);
+    // Guard against a NULL name so strcmp()/%s never receive a null pointer.
+    if (name == NULL || strcmp(name, "IBM424_rtl") != 0) {
+        errln("Encoding detection failure for IBM424_rtl: got %s", name != NULL ? name : "(null)");
+    }
+
+    // Pass 2: reversed text, expected to be detected as IBM424_ltr.
+    ucsdet_setText(csd.getAlias(), bytes_r, brLength, &status);
+    match = ucsdet_detect(csd.getAlias(), &status);
+
+    if (match == NULL) {
+        errln("Encoding detection failure for IBM424_ltr: got no matches.");
+        goto bail;
+    }
+
+    name = ucsdet_getName(match, &status);
+    if (name == NULL || strcmp(name, "IBM424_ltr") != 0) {
+        errln("Encoding detection failure for IBM424_ltr: got %s", name != NULL ? name : "(null)");
+    }
+
+bail:
+    freeBytes(bytes);
+    freeBytes(bytes_r);
+#endif
+}
+
+// Verifies detection of IBM420-encoded Arabic text once the IBM420/IBM424
+// recognizers have been explicitly enabled on the detector:
+//   - `chars` converted to IBM420 must be reported as "IBM420_rtl";
+//   - `chars_reverse` (the same text in reverse character order) must be
+//     reported as "IBM420_ltr".
+// Failures are reported through errln()/errcheckln()/dataerrln(); nothing is returned.
+void CharsetDetectionTest::IBM420Test()
+{
+#if !UCONFIG_ONLY_HTML_CONVERSION
+    UErrorCode status = U_ZERO_ERROR;
+
+    static const UChar chars[] = {
+        0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
+        0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
+        0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
+        0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
+        0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
+        0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
+        0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
+        0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
+        0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
+        0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
+        0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
+        0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
+        0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
+        0x0000
+    };
+    static const UChar chars_reverse[] = {
+        0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
+        0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
+        0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
+        0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
+        0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
+        0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
+        0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
+        0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
+        0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
+        0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
+        0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
+        0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
+        0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
+        0x0000,
+    };
+
+    int32_t bLength = 0, brLength = 0;
+
+    UnicodeString s1(chars);
+    UnicodeString s2(chars_reverse);
+
+    char *bytes = extractBytes(s1, "IBM420", bLength);
+    char *bytes_r = extractBytes(s2, "IBM420", brLength);
+
+    // The detector is held by a local pointer so it is closed on every exit path.
+    // All locals are declared before the first "goto bail" so that no jump
+    // bypasses an initialization.
+    LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
+    const UCharsetMatch *match = NULL;
+    const char *name = NULL;
+
+    if (bytes == NULL || bytes_r == NULL) {
+        // A NULL buffer is handled the same way as in the encoding-check helper
+        // earlier in this file: report it as a data problem and stop.
+        dataerrln("Can't open an IBM420 converter for IBM420Test");
+        goto bail;
+    }
+
+    // These recognizers have to be switched on explicitly before they can match.
+    ucsdet_setDetectableCharset(csd.getAlias(), "IBM424_rtl", TRUE, &status);
+    ucsdet_setDetectableCharset(csd.getAlias(), "IBM424_ltr", TRUE, &status);
+    ucsdet_setDetectableCharset(csd.getAlias(), "IBM420_rtl", TRUE, &status);
+    ucsdet_setDetectableCharset(csd.getAlias(), "IBM420_ltr", TRUE, &status);
+    // Checked after the enable calls so that a failure in any of them is caught too.
+    if (U_FAILURE(status)) {
+        errln("Error opening charset detector. - %s", u_errorName(status));
+        goto bail;
+    }
+
+    // Pass 1: text in original order, expected to be detected as IBM420_rtl.
+    ucsdet_setText(csd.getAlias(), bytes, bLength, &status);
+    match = ucsdet_detect(csd.getAlias(), &status);
+
+    if (match == NULL) {
+        errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status));
+        goto bail;
+    }
+
+    name = ucsdet_getName(match, &status);
+    // Guard against a NULL name so strcmp()/%s never receive a null pointer.
+    if (name == NULL || strcmp(name, "IBM420_rtl") != 0) {
+        errln("Encoding detection failure for IBM420_rtl: got %s\n", name != NULL ? name : "(null)");
+    }
+
+    // Pass 2: reversed text, expected to be detected as IBM420_ltr.
+    ucsdet_setText(csd.getAlias(), bytes_r, brLength, &status);
+    match = ucsdet_detect(csd.getAlias(), &status);
+
+    if (match == NULL) {
+        errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
+        goto bail;
+    }
+
+    name = ucsdet_getName(match, &status);
+    if (name == NULL || strcmp(name, "IBM420_ltr") != 0) {
+        errln("Encoding detection failure for IBM420_ltr: got %s\n", name != NULL ? name : "(null)");
+    }
+
+bail:
+    freeBytes(bytes);
+    freeBytes(bytes_r);
+#endif
+}
+
+
+// Ticket 6394 - the array returned by ucsdet_detectAll() must not list the same
+// charset name more than once. Runs detection on a short English sample and
+// reports an error for every name that appears a second time.
+void CharsetDetectionTest::Ticket6394Test() {
+#if !UCONFIG_NO_CONVERSION
+    const char charText[] =  "Here is some random English text that should be detected as ISO-8859-1."
+                             "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
+                             "encodings more than once.  The hop through UnicodeString is for platforms "
+                             "where this char * string is be EBCDIC and needs conversion to Latin1.";
+    char latin1Text[sizeof(charText)];
+    UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");
+
+    UErrorCode status = U_ZERO_ERROR;
+    // Held by a local pointer: the early returns below used to leak the detector.
+    LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
+    ucsdet_setText(csd.getAlias(), latin1Text, -1, &status);
+    if (U_FAILURE(status)) {
+        errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
+        return;
+    }
+
+    int32_t matchCount = 0;
+    const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status);
+    if (U_FAILURE(status)) {
+        errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
+        return;
+    }
+
+    UnicodeSet  setOfCharsetNames;    // UnicodeSets can hold strings.
+    for (int32_t i = 0; i < matchCount; i++) {
+        UnicodeString charSetName(ucsdet_getName(matches[i], &status));
+        if (U_FAILURE(status)) {
+            errln("Fail at file %s, line %d.  status = %s;  i=%d", __FILE__, __LINE__, u_errorName(status), i);
+            // Reset so one bad entry does not poison the remaining iterations.
+            status = U_ZERO_ERROR;
+        }
+        if (setOfCharsetNames.contains(charSetName)) {
+            errln("Fail at file %s, line %d ", __FILE__, __LINE__);
+            errln(UnicodeString("   Duplicate charset name = ") + charSetName);
+        }
+        setOfCharsetNames.add(charSetName);
+    }
+#endif
+}
+
+
+// Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between
+//               similar Windows and non-Windows SBCS encodings. State was kept in the shared
+//               Charset Recognizer objects, and could be overwritten.
+//
+// The test detects windows-1252 text with one detector, then ISO-8859-1 text with a
+// second, independent detector, and finally re-reads the first detector's result to
+// confirm that it still says windows-1252.
+// All failure paths go through the single "bail" label so that the byte buffers are
+// always freed; the detectors are closed by their local-pointer owners.
+void CharsetDetectionTest::Ticket6954Test() {
+#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_FORMATTING
+    UErrorCode status = U_ZERO_ERROR;
+    UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
+    UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly."
+                            "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
+    UnicodeString sWindows  = ssWindows.unescape();
+    int32_t lISO = 0, lWindows = 0;
+    char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
+    char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
+
+    // Everything is declared before the first "goto bail" so that no jump
+    // bypasses an initialization.
+    LocalUCharsetDetectorPointer csd1;
+    LocalUCharsetDetectorPointer csd2;
+    const UCharsetMatch *match1 = NULL;
+    const UCharsetMatch *match2 = NULL;
+    const char *name1 = NULL;
+    const char *name2 = NULL;
+
+    if (bISO == NULL || bWindows == NULL) {
+        // Without both converted samples there is nothing to detect.
+        dataerrln("Can't open an ISO-8859-1 or windows-1252 converter for Ticket6954Test");
+        goto bail;
+    }
+
+    // First do a plain vanilla detect of 1252 text
+
+    csd1.adoptInstead(ucsdet_open(&status));
+    ucsdet_setText(csd1.getAlias(), bWindows, lWindows, &status);
+    match1 = ucsdet_detect(csd1.getAlias(), &status);
+    name1 = ucsdet_getName(match1, &status);
+    if (U_FAILURE(status)) {
+        errcheckln(status, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
+        goto bail;
+    }
+    TEST_ASSERT(strcmp(name1, "windows-1252")==0);
+
+    // Next, using a completely separate detector, detect some 8859-1 text
+
+    csd2.adoptInstead(ucsdet_open(&status));
+    ucsdet_setText(csd2.getAlias(), bISO, lISO, &status);
+    match2 = ucsdet_detect(csd2.getAlias(), &status);
+    name2 = ucsdet_getName(match2, &status);
+    if (U_FAILURE(status)) {
+        errcheckln(status, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
+        goto bail;
+    }
+    TEST_ASSERT(strcmp(name2, "ISO-8859-1")==0);
+
+    // Recheck the 1252 results from the first detector, which should not have been
+    //  altered by the use of a different detector.
+
+    name1 = ucsdet_getName(match1, &status);
+    if (U_FAILURE(status)) {
+        errcheckln(status, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
+        goto bail;
+    }
+    TEST_ASSERT(strcmp(name1, "windows-1252")==0);
+
+bail:
+    freeBytes(bISO);
+    freeBytes(bWindows);
+#endif
+}