icuSources/test/intltest/csdetest.cpp

   1 /*
   2  **********************************************************************
   3  *   Copyright (C) 2005-2011, International Business Machines
   4  *   Corporation and others.  All Rights Reserved.
   5  **********************************************************************
   6  */
   7
   8
   9 #include "unicode/utypes.h"
  10 #include "unicode/ucsdet.h"
  11 #include "unicode/ucnv.h"
  12 #include "unicode/unistr.h"
  13 #include "unicode/putil.h"
  14 #include "unicode/uniset.h"
  15
  16 #include "intltest.h"
  17 #include "csdetest.h"
  18
  19 #include "xmlparser.h"
  20
  21 #include <stdlib.h>
  22 #include <string.h>
  23
  24 #ifdef DEBUG_DETECT
  25 #include <stdio.h>
  26 #endif
  27
  28 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
  29
  30 #define NEW_ARRAY(type,count) (type *) /*uprv_*/malloc((count) * sizeof(type))
  31 #define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))
  32
  33 #define CH_SPACE 0x0020
  34 #define CH_SLASH 0x002F
  35
  36 //---------------------------------------------------------------------------
  37 //
  38 //  Test class boilerplate
  39 //
  40 //---------------------------------------------------------------------------
// Constructor: the body is intentionally empty; nothing is set up here.
CharsetDetectionTest::CharsetDetectionTest()
{
}
  44
  45
// Destructor: the body is intentionally empty; each test releases its own resources.
CharsetDetectionTest::~CharsetDetectionTest()
{
}
  49
  50
  51
  52 void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
  53 {
  54     if (exec) logln("TestSuite CharsetDetectionTest: ");
  55     switch (index) {
  56        case 0: name = "ConstructionTest";
  57             if (exec) ConstructionTest();
  58             break;
  59
  60        case 1: name = "UTF8Test";
  61             if (exec) UTF8Test();
  62             break;
  63
  64        case 2: name = "UTF16Test";
  65             if (exec) UTF16Test();
  66             break;
  67
  68        case 3: name = "C1BytesTest";
  69             if (exec) C1BytesTest();
  70             break;
  71
  72        case 4: name = "InputFilterTest";
  73             if (exec) InputFilterTest();
  74             break;
  75
  76        case 5: name = "DetectionTest";
  77             if (exec) DetectionTest();
  78             break;
  79 #if !UCONFIG_NO_LEGACY_CONVERSION
  80        case 6: name = "IBM424Test";
  81             if (exec) IBM424Test();
  82             break;
  83
  84        case 7: name = "IBM420Test";
  85             if (exec) IBM420Test();
  86             break;
  87 #else
  88        case 6:
  89        case 7: name = "skip"; break;
  90 #endif
  91        case 8: name = "Ticket6394Test";
  92             if (exec) Ticket6394Test();
  93             break;
  94
  95         default: name = "";
  96             break; //needed to end loop
  97     }
  98 }
  99
 100 static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
 101 {
 102     int32_t offset = -1;
 103
 104     splits = 1;
 105     while((offset = src.indexOf(ch, offset + 1)) >= 0) {
 106         splits += 1;
 107     }
 108
 109     UnicodeString *result = new UnicodeString[splits];
 110
 111     int32_t start = 0;
 112     int32_t split = 0;
 113     int32_t end;
 114
 115     while((end = src.indexOf(ch, start)) >= 0) {
 116         src.extractBetween(start, end, result[split++]);
 117         start = end + 1;
 118     }
 119
 120     src.extractBetween(start, src.length(), result[split]);
 121
 122     return result;
 123 }
 124
 125 static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length)
 126 {
 127     int32_t sLength = source.length();
 128     char *bytes = NULL;
 129
 130     length = source.extract(0, sLength, NULL, codepage);
 131
 132     if (length > 0) {
 133         bytes = NEW_ARRAY(char, length + 1);
 134         source.extract(0, sLength, bytes, codepage);
 135     }
 136
 137     return bytes;
 138 }
 139
 140 static void freeBytes(char *bytes)
 141 {
 142     DELETE_ARRAY(bytes);
 143 }
 144
 145 void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
 146 {
 147     int32_t splits = 0;
 148     int32_t testLength = testString.length();
 149     UnicodeString *eSplit = split(encoding, CH_SLASH, splits);
 150     UErrorCode status = U_ZERO_ERROR;
 151     int32_t cpLength = eSplit[0].length();
 152     char codepage[64];
 153
 154     u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
 155     codepage[cpLength] = '\0';
 156
 157     LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
 158
 159     int32_t byteLength = 0;
 160     char *bytes = extractBytes(testString, codepage, byteLength);
 161
 162     if (bytes == NULL) {
 163 #if !UCONFIG_NO_LEGACY_CONVERSION
 164         dataerrln("Can't open a " + encoding + " converter for " + id);
 165 #endif
 166         return;
 167     }
 168
 169     ucsdet_setText(csd.getAlias(), bytes, byteLength, &status);
 170
 171     int32_t matchCount = 0;
 172     const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status);
 173
 174
 175     UnicodeString name(ucsdet_getName(matches[0], &status));
 176     UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
 177     UChar *decoded = NULL;
 178     int32_t dLength = 0;
 179
 180     if (matchCount == 0) {
 181         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
 182         goto bail;
 183     }
 184
 185     if (name.compare(eSplit[0]) != 0) {
 186         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
 187
 188 #ifdef DEBUG_DETECT
 189         for (int32_t m = 0; m < matchCount; m += 1) {
 190             const char *name = ucsdet_getName(matches[m], &status);
 191             const char *lang = ucsdet_getLanguage(matches[m], &status);
 192             int32_t confidence = ucsdet_getConfidence(matches[m], &status);
 193
 194             printf("%s (%s) %d\n", name, lang, confidence);
 195         }
 196 #endif
 197         goto bail;
 198     }
 199
 200     if (splits > 1 && lang.compare(eSplit[1]) != 0) {
 201         errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
 202         goto bail;
 203     }
 204
 205     decoded = NEW_ARRAY(UChar, testLength);
 206     dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);
 207
 208     if (testString.compare(decoded, dLength) != 0) {
 209         errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");
 210
 211 #ifdef DEBUG_DETECT
 212         for(int32_t i = 0; i < testLength; i += 1) {
 213             if(testString[i] != decoded[i]) {
 214                 printf("Strings differ at byte %d\n", i);
 215                 break;
 216             }
 217         }
 218 #endif
 219
 220     }
 221
 222     DELETE_ARRAY(decoded);
 223
 224 bail:
 225     freeBytes(bytes);
 226     delete[] eSplit;
 227 }
 228
 229 const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) {
 230     UErrorCode status = U_ZERO_ERROR;
 231     const char *testDataDirectory = IntlTest::getSourceTestData(status);
 232
 233     if (U_FAILURE(status)) {
 234         errln("ERROR: getPath() failed - %s", u_errorName(status));
 235         return NULL;
 236     }
 237
 238     strcpy(buffer, testDataDirectory);
 239     strcat(buffer, filename);
 240     return buffer;
 241 }
 242
 243 void CharsetDetectionTest::ConstructionTest()
 244 {
 245     IcuTestErrorCode status(*this, "ConstructionTest");
 246     LocalUCharsetDetectorPointer csd(ucsdet_open(status));
 247     LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status));
 248     int32_t count = uenum_count(e.getAlias(), status);
 249
 250 #ifdef DEBUG_DETECT
 251     printf("There are %d recognizers.\n", count);
 252 #endif
 253
 254     for(int32_t i = 0; i < count; i += 1) {
 255         int32_t length;
 256         const char *name = uenum_next(e.getAlias(), &length, status);
 257
 258         if(name == NULL || length <= 0) {
 259             errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
 260         }
 261
 262 #ifdef DEBUG_DETECT
 263         printf("%s\n", name);
 264 #endif
 265     }
 266 }
 267
 268 void CharsetDetectionTest::UTF8Test()
 269 {
 270     UErrorCode status = U_ZERO_ERROR;
 271     UnicodeString ss = "This is a string with some non-ascii characters that will "
 272                        "be converted to UTF-8, then shoved through the detection process.  "
 273                        "\\u0391\\u0392\\u0393\\u0394\\u0395"
 274                        "Sure would be nice if our source could contain Unicode directly!";
 275     UnicodeString s = ss.unescape();
 276     int32_t byteLength = 0, sLength = s.length();
 277     char *bytes = extractBytes(s, "UTF-8", byteLength);
 278     UCharsetDetector *csd = ucsdet_open(&status);
 279     const UCharsetMatch *match;
 280     UChar *detected = NEW_ARRAY(UChar, sLength);
 281
 282     ucsdet_setText(csd, bytes, byteLength, &status);
 283     match = ucsdet_detect(csd, &status);
 284
 285     if (match == NULL) {
 286         errln("Detection failure for UTF-8: got no matches.");
 287         goto bail;
 288     }
 289
 290     ucsdet_getUChars(match, detected, sLength, &status);
 291
 292     if (s.compare(detected, sLength) != 0) {
 293         errln("Round-trip test failed!");
 294     }
 295
 296     ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
 297
 298 bail:
 299     DELETE_ARRAY(detected);
 300     freeBytes(bytes);
 301     ucsdet_close(csd);
 302 }
 303
 304 void CharsetDetectionTest::UTF16Test()
 305 {
 306     UErrorCode status = U_ZERO_ERROR;
 307     /* Notice the BOM on the start of this string */
 308     UChar chars[] = {
 309         0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
 310         0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
 311         0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
 312         0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
 313         0x064a, 0x062a, 0x0000};
 314     UnicodeString s(chars);
 315     int32_t beLength = 0, leLength = 0;
 316     char *beBytes = extractBytes(s, "UTF-16BE", beLength);
 317     char *leBytes = extractBytes(s, "UTF-16LE", leLength);
 318     UCharsetDetector *csd = ucsdet_open(&status);
 319     const UCharsetMatch *match;
 320     const char *name;
 321     int32_t conf;
 322
 323     ucsdet_setText(csd, beBytes, beLength, &status);
 324     match = ucsdet_detect(csd, &status);
 325
 326     if (match == NULL) {
 327         errln("Encoding detection failure for UTF-16BE: got no matches.");
 328         goto try_le;
 329     }
 330
 331     name  = ucsdet_getName(match, &status);
 332     conf  = ucsdet_getConfidence(match, &status);
 333
 334     if (strcmp(name, "UTF-16BE") != 0) {
 335         errln("Encoding detection failure for UTF-16BE: got %s", name);
 336         goto try_le; // no point in looking at confidence if we got the wrong character set.
 337     }
 338
 339     if (conf != 100) {
 340         errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
 341     }
 342
 343 try_le:
 344     ucsdet_setText(csd, leBytes, leLength, &status);
 345     match = ucsdet_detect(csd, &status);
 346
 347     if (match == NULL) {
 348         errln("Encoding detection failure for UTF-16LE: got no matches.");
 349         goto bail;
 350     }
 351
 352     name  = ucsdet_getName(match, &status);
 353     conf = ucsdet_getConfidence(match, &status);
 354
 355
 356     if (strcmp(name, "UTF-16LE") != 0) {
 357         errln("Enconding detection failure for UTF-16LE: got %s", name);
 358         goto bail; // no point in looking at confidence if we got the wrong character set.
 359     }
 360
 361     if (conf != 100) {
 362         errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
 363     }
 364
 365 bail:
 366     freeBytes(leBytes);
 367     freeBytes(beBytes);
 368     ucsdet_close(csd);
 369 }
 370
 371 void CharsetDetectionTest::InputFilterTest()
 372 {
 373     UErrorCode status = U_ZERO_ERROR;
 374     UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
 375     UnicodeString s  = ss.unescape();
 376     int32_t byteLength = 0;
 377     char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
 378     UCharsetDetector *csd = ucsdet_open(&status);
 379     const UCharsetMatch *match;
 380     const char *lang, *name;
 381
 382     ucsdet_enableInputFilter(csd, TRUE);
 383
 384     if (!ucsdet_isInputFilterEnabled(csd)) {
 385         errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
 386     }
 387
 388
 389     ucsdet_setText(csd, bytes, byteLength, &status);
 390     match = ucsdet_detect(csd, &status);
 391
 392     if (match == NULL) {
 393         errln("Turning on the input filter resulted in no matches.");
 394         goto turn_off;
 395     }
 396
 397     name = ucsdet_getName(match, &status);
 398
 399     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
 400         errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
 401     } else {
 402         lang = ucsdet_getLanguage(match, &status);
 403
 404         if (lang == NULL || strcmp(lang, "fr") != 0) {
 405             errln("Input filter did not strip markup!");
 406         }
 407     }
 408
 409 turn_off:
 410     ucsdet_enableInputFilter(csd, FALSE);
 411     ucsdet_setText(csd, bytes, byteLength, &status);
 412     match = ucsdet_detect(csd, &status);
 413
 414     if (match == NULL) {
 415         errln("Turning off the input filter resulted in no matches.");
 416         goto bail;
 417     }
 418
 419     name = ucsdet_getName(match, &status);
 420
 421     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
 422         errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
 423     } else {
 424         lang = ucsdet_getLanguage(match, &status);
 425
 426         if (lang == NULL || strcmp(lang, "en") != 0) {
 427             errln("Unfiltered input did not detect as English!");
 428         }
 429     }
 430
 431 bail:
 432     freeBytes(bytes);
 433     ucsdet_close(csd);
 434 }
 435
 436 void CharsetDetectionTest::C1BytesTest()
 437 {
 438 #if !UCONFIG_NO_LEGACY_CONVERSION
 439     UErrorCode status = U_ZERO_ERROR;
 440     UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
 441     UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
 442     UnicodeString sWindows  = ssWindows.unescape();
 443     int32_t lISO = 0, lWindows = 0;
 444     char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
 445     char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
 446     UCharsetDetector *csd = ucsdet_open(&status);
 447     const UCharsetMatch *match;
 448     const char *name;
 449
 450     ucsdet_setText(csd, bWindows, lWindows, &status);
 451     match = ucsdet_detect(csd, &status);
 452
 453     if (match == NULL) {
 454         errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status));
 455         goto bail;
 456     }
 457
 458     name  = ucsdet_getName(match, &status);
 459
 460     if (strcmp(name, "windows-1252") != 0) {
 461         errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
 462     }
 463
 464     ucsdet_setText(csd, bISO, lISO, &status);
 465     match = ucsdet_detect(csd, &status);
 466
 467     if (match == NULL) {
 468         errln("English text without C1 bytes got no matches.");
 469         goto bail;
 470     }
 471
 472     name  = ucsdet_getName(match, &status);
 473
 474     if (strcmp(name, "ISO-8859-1") != 0) {
 475         errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
 476     }
 477
 478 bail:
 479     freeBytes(bWindows);
 480     freeBytes(bISO);
 481
 482     ucsdet_close(csd);
 483 #endif
 484 }
 485
 486 void CharsetDetectionTest::DetectionTest()
 487 {
 488 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
 489     UErrorCode status = U_ZERO_ERROR;
 490     char path[2048];
 491     const char *testFilePath = getPath(path, "csdetest.xml");
 492
 493     if (testFilePath == NULL) {
 494         return; /* Couldn't get path: error message already output. */
 495     }
 496
 497     UXMLParser  *parser = UXMLParser::createParser(status);
 498     if (U_FAILURE(status)) {
 499         dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status));
 500         return;
 501     }
 502
 503     UXMLElement *root   = parser->parseFile(testFilePath, status);
 504     if (!assertSuccess( "parseFile",status)) return;
 505
 506     UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
 507     UnicodeString id_attr   = UNICODE_STRING_SIMPLE("id");
 508     UnicodeString enc_attr  = UNICODE_STRING_SIMPLE("encodings");
 509
 510     const UXMLElement *testCase;
 511     int32_t tc = 0;
 512
 513     while((testCase = root->nextChildElement(tc)) != NULL) {
 514         if (testCase->getTagName().compare(test_case) == 0) {
 515             const UnicodeString *id = testCase->getAttribute(id_attr);
 516             const UnicodeString *encodings = testCase->getAttribute(enc_attr);
 517             const UnicodeString  text = testCase->getText(TRUE);
 518             int32_t encodingCount;
 519             UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount);
 520
 521             for(int32_t e = 0; e < encodingCount; e += 1) {
 522                 checkEncoding(text, encodingList[e], *id);
 523             }
 524
 525             delete[] encodingList;
 526         }
 527     }
 528
 529     delete root;
 530     delete parser;
 531 #endif
 532 }
 533
 534 void CharsetDetectionTest::IBM424Test()
 535 {
 536     UErrorCode status = U_ZERO_ERROR;
 537
 538     static const UChar chars[] = {
 539             0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
 540             0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
 541             0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
 542             0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
 543             0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
 544             0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
 545             0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
 546             0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
 547             0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
 548             0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
 549             0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
 550             0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
 551             0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
 552             0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
 553             0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
 554             0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
 555             0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
 556     };
 557
 558     static const UChar chars_reverse[] = {
 559             0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
 560             0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
 561             0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
 562             0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
 563             0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
 564             0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
 565             0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
 566             0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
 567             0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
 568             0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
 569             0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
 570             0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
 571             0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
 572             0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
 573             0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
 574             0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
 575             0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
 576             0x0000
 577     };
 578
 579     int32_t bLength = 0, brLength = 0;
 580
 581     UnicodeString s1(chars);
 582     UnicodeString s2(chars_reverse);
 583
 584     char *bytes = extractBytes(s1, "IBM424", bLength);
 585     char *bytes_r = extractBytes(s2, "IBM424", brLength);
 586
 587     UCharsetDetector *csd = ucsdet_open(&status);
 588     if (U_FAILURE(status)) {
 589         errln("Error opening charset detector. - %s", u_errorName(status));
 590     }
 591     const UCharsetMatch *match;
 592     const char *name;
 593
 594     ucsdet_setText(csd, bytes, bLength, &status);
 595     match = ucsdet_detect(csd, &status);
 596
 597     if (match == NULL) {
 598         errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status));
 599         goto bail;
 600     }
 601
 602     name  = ucsdet_getName(match, &status);
 603     if (strcmp(name, "IBM424_rtl") != 0) {
 604         errln("Encoding detection failure for IBM424_rtl: got %s", name);
 605     }
 606
 607     ucsdet_setText(csd, bytes_r, brLength, &status);
 608     match = ucsdet_detect(csd, &status);
 609
 610     if (match == NULL) {
 611         errln("Encoding detection failure for IBM424_ltr: got no matches.");
 612         goto bail;
 613     }
 614
 615     name  = ucsdet_getName(match, &status);
 616     if (strcmp(name, "IBM424_ltr") != 0) {
 617         errln("Encoding detection failure for IBM424_ltr: got %s", name);
 618     }
 619
 620 bail:
 621     freeBytes(bytes);
 622     freeBytes(bytes_r);
 623     ucsdet_close(csd);
 624 }
 625
 626 void CharsetDetectionTest::IBM420Test()
 627 {
 628     UErrorCode status = U_ZERO_ERROR;
 629
 630     static const UChar chars[] = {
 631         0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
 632         0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
 633         0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
 634         0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
 635         0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
 636         0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
 637         0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
 638         0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
 639         0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
 640         0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
 641         0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
 642         0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
 643         0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
 644         0x0000
 645     };
 646     static const UChar chars_reverse[] = {
 647         0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
 648         0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
 649         0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
 650         0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
 651         0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
 652         0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
 653         0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
 654         0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
 655         0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
 656         0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
 657         0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
 658         0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
 659         0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
 660         0x0000,
 661     };
 662
 663     int32_t bLength = 0, brLength = 0;
 664
 665     UnicodeString s1(chars);
 666     UnicodeString s2(chars_reverse);
 667
 668     char *bytes = extractBytes(s1, "IBM420", bLength);
 669     char *bytes_r = extractBytes(s2, "IBM420", brLength);
 670
 671     UCharsetDetector *csd = ucsdet_open(&status);
 672     if (U_FAILURE(status)) {
 673         errln("Error opening charset detector. - %s", u_errorName(status));
 674     }
 675     const UCharsetMatch *match;
 676     const char *name;
 677
 678     ucsdet_setText(csd, bytes, bLength, &status);
 679     match = ucsdet_detect(csd, &status);
 680
 681     if (match == NULL) {
 682         errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status));
 683         goto bail;
 684     }
 685
 686     name  = ucsdet_getName(match, &status);
 687     if (strcmp(name, "IBM420_rtl") != 0) {
 688         errln("Encoding detection failure for IBM420_rtl: got %s\n", name);
 689     }
 690
 691     ucsdet_setText(csd, bytes_r, brLength, &status);
 692     match = ucsdet_detect(csd, &status);
 693
 694     if (match == NULL) {
 695         errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
 696         goto bail;
 697     }
 698
 699     name  = ucsdet_getName(match, &status);
 700     if (strcmp(name, "IBM420_ltr") != 0) {
 701         errln("Encoding detection failure for IBM420_ltr: got %s\n", name);
 702     }
 703
 704 bail:
 705     freeBytes(bytes);
 706     freeBytes(bytes_r);
 707     ucsdet_close(csd);
 708 }
 709
 710
 711 void CharsetDetectionTest::Ticket6394Test() {
 712 #if !UCONFIG_NO_CONVERSION
 713     const char charText[] =  "Here is some random English text that should be detected as ISO-8859-1."
 714                              "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
 715                              "encodings more than once.  The hop through UnicodeString is for platforms "
 716                              "where this char * string is be EBCDIC and needs conversion to Latin1.";
 717     char latin1Text[sizeof(charText)];
 718     UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");
 719
 720     UErrorCode status = U_ZERO_ERROR;
 721     UCharsetDetector *csd = ucsdet_open(&status);
 722     ucsdet_setText(csd, latin1Text, -1, &status);
 723     if (U_FAILURE(status)) {
 724         errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
 725         return;
 726     }
 727
 728     int32_t matchCount = 0;
 729     const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
 730     if (U_FAILURE(status)) {
 731         errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
 732         return;
 733     }
 734
 735     UnicodeSet  setOfCharsetNames;    // UnicodSets can hold strings.
 736     int32_t i;
 737     for (i=0; i<matchCount; i++) {
 738         UnicodeString charSetName(ucsdet_getName(matches[i], &status));
 739         if (U_FAILURE(status)) {
 740             errln("Fail at file %s, line %d.  status = %s;  i=%d", __FILE__, __LINE__, u_errorName(status), i);
 741             status = U_ZERO_ERROR;
 742         }
 743         if (setOfCharsetNames.contains(charSetName)) {
 744             errln("Fail at file %s, line %d ", __FILE__, __LINE__);
 745             errln(UnicodeString("   Duplicate charset name = ") + charSetName);
 746         }
 747         setOfCharsetNames.add(charSetName);
 748     }
 749     ucsdet_close(csd);
 750 #endif
 751 }
 752