/*
**********************************************************************
- * Copyright (C) 2005-2011, International Business Machines
+ * Copyright (C) 2005-2015, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
#define CH_SPACE 0x0020
#define CH_SLASH 0x002F
+// TEST_ASSERT(x): when the condition x is false, calls errln() with the current
+// file name and line number. Control then continues with the next statement.
+// Calls errln() unqualified, so it must be used where errln() is in scope.
+// The do/while(0) wrapper makes "TEST_ASSERT(x);" a single statement, so the
+// macro is safe inside an unbraced if/else.
+#define TEST_ASSERT(x) do { if (!(x)) { \
+    errln("Failure in file %s, line %d", __FILE__, __LINE__); } } while (0)
+
+// TEST_ASSERT_SUCCESS(errcode): when errcode satisfies U_FAILURE(), calls
+// errcheckln() with the file name, line number and u_errorName(errcode), and
+// then RETURNS from the enclosing function.
+// Because of the bare "return;" it can only be used in functions returning
+// void, and any cleanup code that follows it in the caller is skipped.
+// The do/while(0) wrapper makes "TEST_ASSERT_SUCCESS(e);" a single statement,
+// so the macro is safe inside an unbraced if/else.
+#define TEST_ASSERT_SUCCESS(errcode) do { if (U_FAILURE(errcode)) { \
+    errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
+    return; } } while (0)
+
+
//---------------------------------------------------------------------------
//
// Test class boilerplate
if (exec) Ticket6394Test();
break;
+ case 9: name = "Ticket6954Test";
+ if (exec) Ticket6954Test();
+ break;
+
default: name = "";
break; //needed to end loop
}
printf("%s\n", name);
#endif
}
+
+ const char* defDisabled[] = {
+ "IBM420_rtl", "IBM420_ltr",
+ "IBM424_rtl", "IBM424_ltr",
+ 0
+ };
+
+ LocalUEnumerationPointer eActive(ucsdet_getDetectableCharsets(csd.getAlias(), status));
+ const char *activeName = NULL;
+
+ while ((activeName = uenum_next(eActive.getAlias(), NULL, status))) {
+ // the charset must also appear in the list of all charsets
+ UBool found = FALSE;
+
+ const char *name = NULL;
+ uenum_reset(e.getAlias(), status);
+ while ((name = uenum_next(e.getAlias(), NULL, status))) {
+ if (strcmp(activeName, name) == 0) {
+ found = TRUE;
+ break;
+ }
+ }
+
+ if (!found) {
+ errln(UnicodeString(activeName) + " is not included in the all charset list.");
+ }
+
+ // some charsets are disabled by default
+ found = FALSE;
+ for (int32_t i = 0; defDisabled[i] != 0; i++) {
+ if (strcmp(activeName, defDisabled[i]) == 0) {
+ found = TRUE;
+ break;
+ }
+ }
+ if (found) {
+ errln(UnicodeString(activeName) + " should not be included in the default charset list.");
+ }
+ }
}
void CharsetDetectionTest::UTF8Test()
void CharsetDetectionTest::IBM424Test()
{
+#if !UCONFIG_ONLY_HTML_CONVERSION
UErrorCode status = U_ZERO_ERROR;
static const UChar chars[] = {
char *bytes_r = extractBytes(s2, "IBM424", brLength);
UCharsetDetector *csd = ucsdet_open(&status);
+ ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
+ ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
+ ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
+ ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
if (U_FAILURE(status)) {
errln("Error opening charset detector. - %s", u_errorName(status));
}
freeBytes(bytes);
freeBytes(bytes_r);
ucsdet_close(csd);
+#endif
}
void CharsetDetectionTest::IBM420Test()
{
+#if !UCONFIG_ONLY_HTML_CONVERSION
UErrorCode status = U_ZERO_ERROR;
static const UChar chars[] = {
if (U_FAILURE(status)) {
errln("Error opening charset detector. - %s", u_errorName(status));
}
+ ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
+ ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
+ ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
+ ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
const UCharsetMatch *match;
const char *name;
freeBytes(bytes);
freeBytes(bytes_r);
ucsdet_close(csd);
+#endif
}
#endif
}
+
+// Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between
+//    similar Windows and non-Windows SBCS encodings. State was kept in the shared
+//    Charset Recognizer objects, and could be overwritten.
+//
+// The test detects windows-1252 text with one detector, then detects ISO-8859-1 text
+// with a second, separate detector, and finally re-reads the name from the first
+// detector's match to confirm that it still reports windows-1252.
+//
+// All failure paths leave through the single cleanup section at the bottom of the
+// function, so the detectors and the byte buffers are released even when an ICU
+// call reports an error.
+void CharsetDetectionTest::Ticket6954Test() {
+#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_FORMATTING
+    UErrorCode status = U_ZERO_ERROR;
+    UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
+    UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly."
+                            "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
+    UnicodeString sWindows = ssWindows.unescape();
+    int32_t lISO = 0, lWindows = 0;
+    char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
+    char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
+
+    // Declared before the checks so the cleanup section can release whichever
+    // detectors were opened before a failure.
+    UCharsetDetector *csd1 = NULL;
+    UCharsetDetector *csd2 = NULL;
+
+    // Single-pass block: "break" jumps straight to the cleanup section below.
+    do {
+        // First do a plain vanilla detect of 1252 text
+
+        csd1 = ucsdet_open(&status);
+        ucsdet_setText(csd1, bWindows, lWindows, &status);
+        const UCharsetMatch *match1 = ucsdet_detect(csd1, &status);
+        const char *name1 = ucsdet_getName(match1, &status);
+        if (U_FAILURE(status)) {
+            errcheckln(status, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
+            break;
+        }
+        TEST_ASSERT(strcmp(name1, "windows-1252")==0);
+
+        // Next, using a completely separate detector, detect some 8859-1 text
+
+        csd2 = ucsdet_open(&status);
+        ucsdet_setText(csd2, bISO, lISO, &status);
+        const UCharsetMatch *match2 = ucsdet_detect(csd2, &status);
+        const char *name2 = ucsdet_getName(match2, &status);
+        if (U_FAILURE(status)) {
+            errcheckln(status, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
+            break;
+        }
+        TEST_ASSERT(strcmp(name2, "ISO-8859-1")==0);
+
+        // Recheck the 1252 results from the first detector, which should not have been
+        // altered by the use of a different detector.
+
+        name1 = ucsdet_getName(match1, &status);
+        if (U_FAILURE(status)) {
+            errcheckln(status, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
+            break;
+        }
+        TEST_ASSERT(strcmp(name1, "windows-1252")==0);
+    } while (FALSE);
+
+    // Cleanup: reached on success and on every failure path above.
+    if (csd1 != NULL) {
+        ucsdet_close(csd1);
+    }
+    if (csd2 != NULL) {
+        ucsdet_close(csd2);
+    }
+    freeBytes(bISO);
+    freeBytes(bWindows);
+#endif
+}