X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/4388f060552cc537e71e957d32f35e9d75a61233..586446045a9ad027ace9532db9e32639f87706dd:/icuSources/test/intltest/csdetest.cpp diff --git a/icuSources/test/intltest/csdetest.cpp b/icuSources/test/intltest/csdetest.cpp index 77c3bcc8..4363380b 100644 --- a/icuSources/test/intltest/csdetest.cpp +++ b/icuSources/test/intltest/csdetest.cpp @@ -1,6 +1,6 @@ /* ********************************************************************** - * Copyright (C) 2005-2011, International Business Machines + * Copyright (C) 2005-2013, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -33,6 +33,14 @@ #define CH_SPACE 0x0020 #define CH_SLASH 0x002F +#define TEST_ASSERT(x) {if (!(x)) { \ + errln("Failure in file %s, line %d", __FILE__, __LINE__);}} + +#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \ + errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));\ + return;}} + + //--------------------------------------------------------------------------- // // Test class boilerplate @@ -92,6 +100,10 @@ void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char if (exec) Ticket6394Test(); break; + case 9: name = "Ticket6954Test"; + if (exec) Ticket6954Test(); + break; + default: name = ""; break; //needed to end loop } @@ -263,6 +275,45 @@ void CharsetDetectionTest::ConstructionTest() printf("%s\n", name); #endif } + + const char* defDisabled[] = { + "IBM420_rtl", "IBM420_ltr", + "IBM424_rtl", "IBM424_ltr", + 0 + }; + + LocalUEnumerationPointer eActive(ucsdet_getDetectableCharsets(csd.getAlias(), status)); + const char *activeName = NULL; + + while ((activeName = uenum_next(eActive.getAlias(), NULL, status))) { + // the charset must be included in all list + UBool found = FALSE; + + const char *name = NULL; + uenum_reset(e.getAlias(), 
status); + while ((name = uenum_next(e.getAlias(), NULL, status))) { + if (strcmp(activeName, name) == 0) { + found = TRUE; + break; + } + } + + if (!found) { + errln(UnicodeString(activeName) + " is not included in the all charset list."); + } + + // some charsets are disabled by default + found = FALSE; + for (int32_t i = 0; defDisabled[i] != 0; i++) { + if (strcmp(activeName, defDisabled[i]) == 0) { + found = TRUE; + break; + } + } + if (found) { + errln(UnicodeString(activeName) + " should not be included in the default charset list."); + } + } } void CharsetDetectionTest::UTF8Test() @@ -585,6 +636,10 @@ void CharsetDetectionTest::IBM424Test() char *bytes_r = extractBytes(s2, "IBM424", brLength); UCharsetDetector *csd = ucsdet_open(&status); + ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status); + ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status); + ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status); + ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status); if (U_FAILURE(status)) { errln("Error opening charset detector. - %s", u_errorName(status)); } @@ -672,6 +727,10 @@ void CharsetDetectionTest::IBM420Test() if (U_FAILURE(status)) { errln("Error opening charset detector. - %s", u_errorName(status)); } + ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status); + ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status); + ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status); + ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status); const UCharsetMatch *match; const char *name; @@ -750,3 +809,49 @@ void CharsetDetectionTest::Ticket6394Test() { #endif } + +// Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between +// similar Windows and non-Windows SBCS encodings. State was kept in the shared +// Charset Recognizer objects, and could be overwritten. 
+void CharsetDetectionTest::Ticket6954Test() {
+#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_FORMATTING
+    // Detects windows-1252 text with one detector and ISO-8859-1 text with a second,
+    // completely separate detector, then re-reads the first detector's match name to
+    // check that using the second detector did not change it.
+    UErrorCode status = U_ZERO_ERROR;
+    UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
+    UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly."
+            "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
+    UnicodeString sWindows = ssWindows.unescape();
+    int32_t lISO = 0, lWindows = 0;
+    char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
+    char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
+
+    // Both detectors are declared before the checks so that the cleanup code at the
+    // end can release whatever was opened, wherever the checks stop.
+    UCharsetDetector *csd1 = NULL;
+    UCharsetDetector *csd2 = NULL;
+
+    // Single-pass loop: a failed status check reports the error and uses "break" to
+    // reach the cleanup code. TEST_ASSERT_SUCCESS is deliberately not used in this
+    // function because it returns on failure, which would skip the ucsdet_close()
+    // and freeBytes() calls below.
+    do {
+        // First do a plain vanilla detect of 1252 text
+        csd1 = ucsdet_open(&status);
+        ucsdet_setText(csd1, bWindows, lWindows, &status);
+        const UCharsetMatch *match1 = ucsdet_detect(csd1, &status);
+        const char *name1 = ucsdet_getName(match1, &status);
+        if (U_FAILURE(status)) {
+            errcheckln(status, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
+            break;
+        }
+        // The NULL check keeps strcmp() from ever being handed a null name.
+        TEST_ASSERT(name1 != NULL && strcmp(name1, "windows-1252")==0);
+
+        // Next, using a completely separate detector, detect some 8859-1 text
+        csd2 = ucsdet_open(&status);
+        ucsdet_setText(csd2, bISO, lISO, &status);
+        const UCharsetMatch *match2 = ucsdet_detect(csd2, &status);
+        const char *name2 = ucsdet_getName(match2, &status);
+        if (U_FAILURE(status)) {
+            errcheckln(status, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
+            break;
+        }
+        TEST_ASSERT(name2 != NULL && strcmp(name2, "ISO-8859-1")==0);
+
+        // Recheck the 1252 results from the first detector, which should not have been
+        // altered by the use of a different detector.
+        name1 = ucsdet_getName(match1, &status);
+        if (U_FAILURE(status)) {
+            errcheckln(status, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
+            break;
+        }
+        TEST_ASSERT(name1 != NULL && strcmp(name1, "windows-1252")==0);
+    } while (FALSE);
+
+    // Cleanup runs on every path. A detector pointer is still NULL when the checks
+    // stopped before it was opened (or when opening it failed), so skip those.
+    if (csd1 != NULL) {
+        ucsdet_close(csd1);
+    }
+    if (csd2 != NULL) {
+        ucsdet_close(csd2);
+    }
+    freeBytes(bISO);
+    freeBytes(bWindows);
+#endif
+}