/*
**********************************************************************
- * Copyright (C) 2005-2008, International Business Machines
+ * Copyright (C) 2005-2009, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
#include "unicode/ucnv.h"
#include "unicode/unistr.h"
#include "unicode/putil.h"
+#include "unicode/uniset.h"
#include "intltest.h"
#include "csdetest.h"
case 5: name = "DetectionTest";
if (exec) DetectionTest();
break;
+#if !UCONFIG_NO_LEGACY_CONVERSION
+ case 6: name = "IBM424Test";
+ if (exec) IBM424Test();
+ break;
+
+ case 7: name = "IBM420Test";
+ if (exec) IBM420Test();
+ break;
+#else
+ case 6:
+ case 7: name = "skip"; break;
+#endif
+ case 8: name = "Ticket6394Test";
+ if (exec) Ticket6394Test();
+ break;
default: name = "";
break; //needed to end loop
u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
codepage[cpLength] = '\0';
- UCharsetDetector *csd = ucsdet_open(&status);
+ LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
int32_t byteLength = 0;
char *bytes = extractBytes(testString, codepage, byteLength);
return;
}
- ucsdet_setText(csd, bytes, byteLength, &status);
+ ucsdet_setText(csd.getAlias(), bytes, byteLength, &status);
int32_t matchCount = 0;
- const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
+ const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status);
UnicodeString name(ucsdet_getName(matches[0], &status));
bail:
freeBytes(bytes);
- ucsdet_close(csd);
delete[] eSplit;
}
void CharsetDetectionTest::ConstructionTest()
{
- UErrorCode status = U_ZERO_ERROR;
- UCharsetDetector *csd = ucsdet_open(&status);
- UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
- int32_t count = uenum_count(e, &status);
+ IcuTestErrorCode status(*this, "ConstructionTest");
+ LocalUCharsetDetectorPointer csd(ucsdet_open(status));
+ LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status));
+ int32_t count = uenum_count(e.getAlias(), status);
#ifdef DEBUG_DETECT
printf("There are %d recognizers.\n", count);
for(int32_t i = 0; i < count; i += 1) {
int32_t length;
- const char *name = uenum_next(e, &length, &status);
+ const char *name = uenum_next(e.getAlias(), &length, status);
if(name == NULL || length <= 0) {
errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
printf("%s\n", name);
#endif
}
-
- uenum_close(e);
- ucsdet_close(csd);
}
void CharsetDetectionTest::UTF8Test()
match = ucsdet_detect(csd, &status);
if (match == NULL) {
- errln("English test with C1 bytes got no matches.");
+ errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status));
goto bail;
}
}
UXMLParser *parser = UXMLParser::createParser(status);
- if (!assertSuccess("UXMLParser::createParser",status)) return;
+ if (U_FAILURE(status)) {
+ dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status));
+ return;
+ }
+
UXMLElement *root = parser->parseFile(testFilePath, status);
if (!assertSuccess( "parseFile",status)) return;
#endif
}
+/**
+ * Converts a block of Hebrew-range text to the IBM424 codepage and checks that
+ * the charset detector reports "IBM424_rtl" for it, then repeats the check with
+ * the reverse-ordered text (chars_reverse), which is expected to be reported as
+ * "IBM424_ltr".
+ *
+ * Any failure is reported through errln()/errcheckln(); the converted byte
+ * buffers and the detector are released at the bail label on every path.
+ */
+void CharsetDetectionTest::IBM424Test()
+{
+    UErrorCode status = U_ZERO_ERROR;
+
+    static const UChar chars[] = {
+            0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
+            0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
+            0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
+            0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
+            0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
+            0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
+            0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
+            0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
+            0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
+            0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
+            0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
+            0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
+            0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
+            0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
+            0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
+            0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
+            0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
+    };
+
+    static const UChar chars_reverse[] = {
+            0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
+            0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
+            0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
+            0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
+            0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
+            0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
+            0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
+            0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
+            0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
+            0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
+            0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
+            0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
+            0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
+            0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
+            0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
+            0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
+            0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
+            0x0000
+    };
+
+    int32_t bLength = 0, brLength = 0;
+
+    UnicodeString s1(chars);
+    UnicodeString s2(chars_reverse);
+
+    char *bytes = extractBytes(s1, "IBM424", bLength);
+    char *bytes_r = extractBytes(s2, "IBM424", brLength);
+
+    // Declared before the first goto so that no jump to bail crosses a declaration.
+    const UCharsetMatch *match;
+    const char *name;
+
+    UCharsetDetector *csd = ucsdet_open(&status);
+    if (U_FAILURE(status)) {
+        errln("Error opening charset detector. - %s", u_errorName(status));
+        // Without a detector nothing below can succeed; release the buffers and stop.
+        goto bail;
+    }
+
+    // Pass 1: text in the order given by chars.
+    ucsdet_setText(csd, bytes, bLength, &status);
+    match = ucsdet_detect(csd, &status);
+
+    if (match == NULL) {
+        errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status));
+        goto bail;
+    }
+
+    name = ucsdet_getName(match, &status);
+    if (strcmp(name, "IBM424_rtl") != 0) {
+        errln("Encoding detection failure for IBM424_rtl: got %s", name);
+    }
+
+    // Pass 2: reverse-ordered text.
+    ucsdet_setText(csd, bytes_r, brLength, &status);
+    match = ucsdet_detect(csd, &status);
+
+    if (match == NULL) {
+        // Reported the same way as the first pass so the status name is visible.
+        errcheckln(status, "Encoding detection failure for IBM424_ltr: got no matches. - %s", u_errorName(status));
+        goto bail;
+    }
+
+    name = ucsdet_getName(match, &status);
+    if (strcmp(name, "IBM424_ltr") != 0) {
+        errln("Encoding detection failure for IBM424_ltr: got %s", name);
+    }
+
+bail:
+    freeBytes(bytes);
+    freeBytes(bytes_r);
+    ucsdet_close(csd);
+}
+
+/**
+ * Converts a block of Arabic-range text to the IBM420 codepage and checks that
+ * the charset detector reports "IBM420_rtl" for it, then repeats the check with
+ * the reverse-ordered text (chars_reverse), which is expected to be reported as
+ * "IBM420_ltr".
+ *
+ * Any failure is reported through errln()/errcheckln(); the converted byte
+ * buffers and the detector are released at the bail label on every path.
+ */
+void CharsetDetectionTest::IBM420Test()
+{
+    UErrorCode status = U_ZERO_ERROR;
+
+    static const UChar chars[] = {
+        0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
+        0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
+        0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
+        0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
+        0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
+        0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
+        0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
+        0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
+        0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
+        0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
+        0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
+        0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
+        0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
+        0x0000
+    };
+    static const UChar chars_reverse[] = {
+        0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
+        0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
+        0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
+        0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
+        0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
+        0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
+        0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
+        0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
+        0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
+        0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
+        0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
+        0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
+        0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
+        0x0000,
+    };
+
+    int32_t bLength = 0, brLength = 0;
+
+    UnicodeString s1(chars);
+    UnicodeString s2(chars_reverse);
+
+    char *bytes = extractBytes(s1, "IBM420", bLength);
+    char *bytes_r = extractBytes(s2, "IBM420", brLength);
+
+    // Declared before the first goto so that no jump to bail crosses a declaration.
+    const UCharsetMatch *match;
+    const char *name;
+
+    UCharsetDetector *csd = ucsdet_open(&status);
+    if (U_FAILURE(status)) {
+        errln("Error opening charset detector. - %s", u_errorName(status));
+        // Without a detector nothing below can succeed; release the buffers and stop.
+        goto bail;
+    }
+
+    // Pass 1: text in the order given by chars.
+    ucsdet_setText(csd, bytes, bLength, &status);
+    match = ucsdet_detect(csd, &status);
+
+    if (match == NULL) {
+        errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status));
+        goto bail;
+    }
+
+    // The messages below carry no trailing "\n", matching the parallel messages in IBM424Test.
+    name = ucsdet_getName(match, &status);
+    if (strcmp(name, "IBM420_rtl") != 0) {
+        errln("Encoding detection failure for IBM420_rtl: got %s", name);
+    }
+
+    // Pass 2: reverse-ordered text.
+    ucsdet_setText(csd, bytes_r, brLength, &status);
+    match = ucsdet_detect(csd, &status);
+
+    if (match == NULL) {
+        // Reported the same way as the first pass so the status name is visible.
+        errcheckln(status, "Encoding detection failure for IBM420_ltr: got no matches. - %s", u_errorName(status));
+        goto bail;
+    }
+
+    name = ucsdet_getName(match, &status);
+    if (strcmp(name, "IBM420_ltr") != 0) {
+        errln("Encoding detection failure for IBM420_ltr: got %s", name);
+    }
+
+bail:
+    freeBytes(bytes);
+    freeBytes(bytes_r);
+    ucsdet_close(csd);
+}
+
+
+/**
+ * Regression test for ticket 6394: runs ucsdet_detectAll() over a short
+ * English sample and fails if any charset name appears more than once in
+ * the returned matches.
+ *
+ * Compiled to an empty body when UCONFIG_NO_CONVERSION is set.
+ */
+void CharsetDetectionTest::Ticket6394Test() {
+#if !UCONFIG_NO_CONVERSION
+    const char charText[] =  "Here is some random English text that should be detected as ISO-8859-1."
+                             "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
+                             "encodings more than once. The hop through UnicodeString is for platforms "
+                             "where this char * string is be EBCDIC and needs conversion to Latin1.";
+    char latin1Text[sizeof(charText)];
+    // NOTE(review): sizeof(charText)-2 is one less than the string length, so the
+    // final character is not extracted; presumably intentional - confirm.
+    UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");
+
+    UErrorCode status = U_ZERO_ERROR;
+    // Held in a Local pointer so the detector is closed on every exit path,
+    // including the early returns below.
+    LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
+    ucsdet_setText(csd.getAlias(), latin1Text, -1, &status);
+    if (U_FAILURE(status)) {
+        errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_errorName(status));
+        return;
+    }
+
+    int32_t matchCount = 0;
+    const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status);
+    if (U_FAILURE(status)) {
+        errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_errorName(status));
+        return;
+    }
+
+    UnicodeSet setOfCharsetNames; // UnicodeSets can hold strings.
+    int32_t i;
+    for (i=0; i<matchCount; i++) {
+        UnicodeString charSetName(ucsdet_getName(matches[i], &status));
+        if (U_FAILURE(status)) {
+            errln("Fail at file %s, line %d. status = %s; i=%d", __FILE__, __LINE__, u_errorName(status), i);
+            status = U_ZERO_ERROR;
+            // No usable name for this match, so there is nothing to compare or record.
+            continue;
+        }
+        if (setOfCharsetNames.contains(charSetName)) {
+            errln("Fail at file %s, line %d ", __FILE__, __LINE__);
+            errln(UnicodeString(" Duplicate charset name = ") + charSetName);
+        }
+        setOfCharsetNames.add(charSetName);
+    }
+#endif
+}