icuSources/test/intltest/csdetest.cpp

   1 /*
   2  **********************************************************************
   3  *   Copyright (C) 2005-2016, International Business Machines
   4  *   Corporation and others.  All Rights Reserved.
   5  **********************************************************************
   6  */
   7
   8
   9 #include "unicode/utypes.h"
  10 #include "unicode/ucsdet.h"
  11 #include "unicode/ucnv.h"
  12 #include "unicode/unistr.h"
  13 #include "unicode/putil.h"
  14 #include "unicode/uniset.h"
  15
  16 #include "intltest.h"
  17 #include "csdetest.h"
  18
  19 #include "xmlparser.h"
  20
  21 #include <stdlib.h>
  22 #include <string.h>
  23
  24 #ifdef DEBUG_DETECT
  25 #include <stdio.h>
  26 #endif
  27
// Raw C-heap buffer helpers. The uprv_ prefix is commented out, so these
// expand to plain malloc()/free(). NEW_ARRAY does not check for allocation
// failure and runs no constructors; in the code shown it is only used for
// char and UChar buffers.
#define NEW_ARRAY(type,count) (type *) /*uprv_*/malloc((count) * sizeof(type))
#define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))

// Separator code points passed to split(): a space separates the entries of
// an encodings list, and a slash separates an encoding name from a language.
#define CH_SPACE 0x0020
#define CH_SLASH 0x002F

// Reports a test failure (with file and line) when the condition is false.
// Execution continues after the report.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d", __FILE__, __LINE__);}}

// Reports a failure when errcode is a failing UErrorCode and then returns
// from the enclosing function, so it is only usable in functions returning void.
#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
    errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));\
    return;}}
  40
  41
  42 //---------------------------------------------------------------------------
  43 //
  44 //  Test class boilerplate
  45 //
  46 //---------------------------------------------------------------------------
// Default constructor. The body is empty: each test method creates the
// detector and buffers it needs locally.
CharsetDetectionTest::CharsetDetectionTest()
{
}
  50
  51
// Destructor. The body is empty; nothing is released here.
CharsetDetectionTest::~CharsetDetectionTest()
{
}
  55
  56
  57
  58 void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
  59 {
  60     if (exec) logln("TestSuite CharsetDetectionTest: ");
  61     switch (index) {
  62        case 0: name = "ConstructionTest";
  63             if (exec) ConstructionTest();
  64             break;
  65
  66        case 1: name = "UTF8Test";
  67             if (exec) UTF8Test();
  68             break;
  69
  70        case 2: name = "UTF16Test";
  71             if (exec) UTF16Test();
  72             break;
  73
  74        case 3: name = "C1BytesTest";
  75             if (exec) C1BytesTest();
  76             break;
  77
  78        case 4: name = "InputFilterTest";
  79             if (exec) InputFilterTest();
  80             break;
  81
  82        case 5: name = "DetectionTest";
  83             if (exec) DetectionTest();
  84             break;
  85 #if !UCONFIG_NO_LEGACY_CONVERSION
  86        case 6: name = "IBM424Test";
  87             if (exec) IBM424Test();
  88             break;
  89
  90        case 7: name = "IBM420Test";
  91             if (exec) IBM420Test();
  92             break;
  93 #else
  94        case 6:
  95        case 7: name = "skip"; break;
  96 #endif
  97        case 8: name = "Ticket6394Test";
  98             if (exec) Ticket6394Test();
  99             break;
 100
 101        case 9: name = "Ticket6954Test";
 102             if (exec) Ticket6954Test();
 103             break;
 104
 105         default: name = "";
 106             break; //needed to end loop
 107     }
 108 }
 109
 110 static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
 111 {
 112     int32_t offset = -1;
 113
 114     splits = 1;
 115     while((offset = src.indexOf(ch, offset + 1)) >= 0) {
 116         splits += 1;
 117     }
 118
 119     UnicodeString *result = new UnicodeString[splits];
 120
 121     int32_t start = 0;
 122     int32_t split = 0;
 123     int32_t end;
 124
 125     while((end = src.indexOf(ch, start)) >= 0) {
 126         src.extractBetween(start, end, result[split++]);
 127         start = end + 1;
 128     }
 129
 130     src.extractBetween(start, src.length(), result[split]);
 131
 132     return result;
 133 }
 134
 135 static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length)
 136 {
 137     int32_t sLength = source.length();
 138     char *bytes = NULL;
 139
 140     length = source.extract(0, sLength, NULL, codepage);
 141
 142     if (length > 0) {
 143         bytes = NEW_ARRAY(char, length + 1);
 144         source.extract(0, sLength, bytes, codepage);
 145     }
 146
 147     return bytes;
 148 }
 149
 150 static void freeBytes(char *bytes)
 151 {
 152     DELETE_ARRAY(bytes);
 153 }
 154
 155 void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
 156 {
 157     int32_t splits = 0;
 158     int32_t testLength = testString.length();
 159     UnicodeString *eSplit = split(encoding, CH_SLASH, splits);
 160     UErrorCode status = U_ZERO_ERROR;
 161     int32_t cpLength = eSplit[0].length();
 162     char codepage[64];
 163
 164     u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
 165     codepage[cpLength] = '\0';
 166
 167     LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
 168
 169     int32_t byteLength = 0;
 170     char *bytes = extractBytes(testString, codepage, byteLength);
 171
 172     if (bytes == NULL) {
 173 #if !UCONFIG_NO_LEGACY_CONVERSION
 174         dataerrln("Can't open a " + encoding + " converter for " + id);
 175 #endif
 176         return;
 177     }
 178
 179     ucsdet_setText(csd.getAlias(), bytes, byteLength, &status);
 180
 181     int32_t matchCount = 0;
 182     const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status);
 183
 184
 185     UnicodeString name(ucsdet_getName(matches[0], &status));
 186     UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
 187     UChar *decoded = NULL;
 188     int32_t dLength = 0;
 189
 190     if (matchCount == 0) {
 191         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
 192         goto bail;
 193     }
 194
 195     if (name.compare(eSplit[0]) != 0) {
 196         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
 197
 198 #ifdef DEBUG_DETECT
 199         for (int32_t m = 0; m < matchCount; m += 1) {
 200             const char *name = ucsdet_getName(matches[m], &status);
 201             const char *lang = ucsdet_getLanguage(matches[m], &status);
 202             int32_t confidence = ucsdet_getConfidence(matches[m], &status);
 203
 204             printf("%s (%s) %d\n", name, lang, confidence);
 205         }
 206 #endif
 207         goto bail;
 208     }
 209
 210     if (splits > 1 && lang.compare(eSplit[1]) != 0) {
 211         errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
 212         goto bail;
 213     }
 214
 215     decoded = NEW_ARRAY(UChar, testLength);
 216     dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);
 217
 218     if (testString.compare(decoded, dLength) != 0) {
 219         errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");
 220
 221 #ifdef DEBUG_DETECT
 222         for(int32_t i = 0; i < testLength; i += 1) {
 223             if(testString[i] != decoded[i]) {
 224                 printf("Strings differ at byte %d\n", i);
 225                 break;
 226             }
 227         }
 228 #endif
 229
 230     }
 231
 232     DELETE_ARRAY(decoded);
 233
 234 bail:
 235     freeBytes(bytes);
 236     delete[] eSplit;
 237 }
 238
 239 const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) {
 240     UErrorCode status = U_ZERO_ERROR;
 241     const char *testDataDirectory = IntlTest::getSourceTestData(status);
 242
 243     if (U_FAILURE(status)) {
 244         errln("ERROR: getPath() failed - %s", u_errorName(status));
 245         return NULL;
 246     }
 247
 248     strcpy(buffer, testDataDirectory);
 249     strcat(buffer, filename);
 250     return buffer;
 251 }
 252
 253 void CharsetDetectionTest::ConstructionTest()
 254 {
 255     IcuTestErrorCode status(*this, "ConstructionTest");
 256     LocalUCharsetDetectorPointer csd(ucsdet_open(status));
 257     LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status));
 258     int32_t count = uenum_count(e.getAlias(), status);
 259
 260 #ifdef DEBUG_DETECT
 261     printf("There are %d recognizers.\n", count);
 262 #endif
 263
 264     for(int32_t i = 0; i < count; i += 1) {
 265         int32_t length;
 266         const char *name = uenum_next(e.getAlias(), &length, status);
 267
 268         if(name == NULL || length <= 0) {
 269             errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
 270         }
 271
 272 #ifdef DEBUG_DETECT
 273         printf("%s\n", name);
 274 #endif
 275     }
 276
 277     const char* defDisabled[] = {
 278         "IBM420_rtl", "IBM420_ltr",
 279         "IBM424_rtl", "IBM424_ltr",
 280         0
 281     };
 282
 283     LocalUEnumerationPointer eActive(ucsdet_getDetectableCharsets(csd.getAlias(), status));
 284     const char *activeName = NULL;
 285
 286     while ((activeName = uenum_next(eActive.getAlias(), NULL, status))) {
 287         // the charset must be included in all list
 288         UBool found = FALSE;
 289
 290         const char *name = NULL;
 291         uenum_reset(e.getAlias(), status);
 292         while ((name = uenum_next(e.getAlias(), NULL, status))) {
 293             if (strcmp(activeName, name) == 0) {
 294                 found = TRUE;
 295                 break;
 296             }
 297         }
 298
 299         if (!found) {
 300             errln(UnicodeString(activeName) + " is not included in the all charset list.");
 301         }
 302
 303         // some charsets are disabled by default
 304         found = FALSE;
 305         for (int32_t i = 0; defDisabled[i] != 0; i++) {
 306             if (strcmp(activeName, defDisabled[i]) == 0) {
 307                 found = TRUE;
 308                 break;
 309             }
 310         }
 311         if (found) {
 312             errln(UnicodeString(activeName) + " should not be included in the default charset list.");
 313         }
 314     }
 315 }
 316
 317 void CharsetDetectionTest::UTF8Test()
 318 {
 319     UErrorCode status = U_ZERO_ERROR;
 320     UnicodeString ss = "This is a string with some non-ascii characters that will "
 321                        "be converted to UTF-8, then shoved through the detection process.  "
 322                        "\\u0391\\u0392\\u0393\\u0394\\u0395"
 323                        "Sure would be nice if our source could contain Unicode directly!";
 324     UnicodeString s = ss.unescape();
 325     int32_t byteLength = 0, sLength = s.length();
 326     char *bytes = extractBytes(s, "UTF-8", byteLength);
 327     UCharsetDetector *csd = ucsdet_open(&status);
 328     const UCharsetMatch *match;
 329     UChar *detected = NEW_ARRAY(UChar, sLength);
 330
 331     ucsdet_setText(csd, bytes, byteLength, &status);
 332     match = ucsdet_detect(csd, &status);
 333
 334     if (match == NULL) {
 335         errln("Detection failure for UTF-8: got no matches.");
 336         goto bail;
 337     }
 338
 339     ucsdet_getUChars(match, detected, sLength, &status);
 340
 341     if (s.compare(detected, sLength) != 0) {
 342         errln("Round-trip test failed!");
 343     }
 344
 345     ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
 346
 347 bail:
 348     DELETE_ARRAY(detected);
 349     freeBytes(bytes);
 350     ucsdet_close(csd);
 351 }
 352
 353 void CharsetDetectionTest::UTF16Test()
 354 {
 355     UErrorCode status = U_ZERO_ERROR;
 356     /* Notice the BOM on the start of this string */
 357     UChar chars[] = {
 358         0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
 359         0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
 360         0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
 361         0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
 362         0x064a, 0x062a, 0x0000};
 363     UnicodeString s(chars);
 364     int32_t beLength = 0, leLength = 0;
 365     char *beBytes = extractBytes(s, "UTF-16BE", beLength);
 366     char *leBytes = extractBytes(s, "UTF-16LE", leLength);
 367     UCharsetDetector *csd = ucsdet_open(&status);
 368     const UCharsetMatch *match;
 369     const char *name;
 370     int32_t conf;
 371
 372     ucsdet_setText(csd, beBytes, beLength, &status);
 373     match = ucsdet_detect(csd, &status);
 374
 375     if (match == NULL) {
 376         errln("Encoding detection failure for UTF-16BE: got no matches.");
 377         goto try_le;
 378     }
 379
 380     name  = ucsdet_getName(match, &status);
 381     conf  = ucsdet_getConfidence(match, &status);
 382
 383     if (strcmp(name, "UTF-16BE") != 0) {
 384         errln("Encoding detection failure for UTF-16BE: got %s", name);
 385         goto try_le; // no point in looking at confidence if we got the wrong character set.
 386     }
 387
 388     if (conf != 100) {
 389         errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
 390     }
 391
 392 try_le:
 393     ucsdet_setText(csd, leBytes, leLength, &status);
 394     match = ucsdet_detect(csd, &status);
 395
 396     if (match == NULL) {
 397         errln("Encoding detection failure for UTF-16LE: got no matches.");
 398         goto bail;
 399     }
 400
 401     name  = ucsdet_getName(match, &status);
 402     conf = ucsdet_getConfidence(match, &status);
 403
 404
 405     if (strcmp(name, "UTF-16LE") != 0) {
 406         errln("Enconding detection failure for UTF-16LE: got %s", name);
 407         goto bail; // no point in looking at confidence if we got the wrong character set.
 408     }
 409
 410     if (conf != 100) {
 411         errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
 412     }
 413
 414 bail:
 415     freeBytes(leBytes);
 416     freeBytes(beBytes);
 417     ucsdet_close(csd);
 418 }
 419
 420 void CharsetDetectionTest::InputFilterTest()
 421 {
 422     UErrorCode status = U_ZERO_ERROR;
 423     UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
 424     UnicodeString s  = ss.unescape();
 425     int32_t byteLength = 0;
 426     char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
 427     UCharsetDetector *csd = ucsdet_open(&status);
 428     const UCharsetMatch *match;
 429     const char *lang, *name;
 430
 431     ucsdet_enableInputFilter(csd, TRUE);
 432
 433     if (!ucsdet_isInputFilterEnabled(csd)) {
 434         errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
 435     }
 436
 437
 438     ucsdet_setText(csd, bytes, byteLength, &status);
 439     match = ucsdet_detect(csd, &status);
 440
 441     if (match == NULL) {
 442         errln("Turning on the input filter resulted in no matches.");
 443         goto turn_off;
 444     }
 445
 446     name = ucsdet_getName(match, &status);
 447
 448     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
 449         errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
 450     } else {
 451         lang = ucsdet_getLanguage(match, &status);
 452
 453         if (lang == NULL || strcmp(lang, "fr") != 0) {
 454             errln("Input filter did not strip markup!");
 455         }
 456     }
 457
 458 turn_off:
 459     ucsdet_enableInputFilter(csd, FALSE);
 460     ucsdet_setText(csd, bytes, byteLength, &status);
 461     match = ucsdet_detect(csd, &status);
 462
 463     if (match == NULL) {
 464         errln("Turning off the input filter resulted in no matches.");
 465         goto bail;
 466     }
 467
 468     name = ucsdet_getName(match, &status);
 469
 470     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
 471         errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
 472     } else {
 473         lang = ucsdet_getLanguage(match, &status);
 474
 475         if (lang == NULL || strcmp(lang, "en") != 0) {
 476             errln("Unfiltered input did not detect as English!");
 477         }
 478     }
 479
 480 bail:
 481     freeBytes(bytes);
 482     ucsdet_close(csd);
 483 }
 484
 485 void CharsetDetectionTest::C1BytesTest()
 486 {
 487 #if !UCONFIG_NO_LEGACY_CONVERSION
 488     UErrorCode status = U_ZERO_ERROR;
 489     UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
 490     UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
 491     UnicodeString sWindows  = ssWindows.unescape();
 492     int32_t lISO = 0, lWindows = 0;
 493     char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
 494     char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
 495     UCharsetDetector *csd = ucsdet_open(&status);
 496     const UCharsetMatch *match;
 497     const char *name;
 498
 499     ucsdet_setText(csd, bWindows, lWindows, &status);
 500     match = ucsdet_detect(csd, &status);
 501
 502     if (match == NULL) {
 503         errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status));
 504         goto bail;
 505     }
 506
 507     name  = ucsdet_getName(match, &status);
 508
 509     if (strcmp(name, "windows-1252") != 0) {
 510         errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
 511     }
 512
 513     ucsdet_setText(csd, bISO, lISO, &status);
 514     match = ucsdet_detect(csd, &status);
 515
 516     if (match == NULL) {
 517         errln("English text without C1 bytes got no matches.");
 518         goto bail;
 519     }
 520
 521     name  = ucsdet_getName(match, &status);
 522
 523     if (strcmp(name, "ISO-8859-1") != 0) {
 524         errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
 525     }
 526
 527 bail:
 528     freeBytes(bWindows);
 529     freeBytes(bISO);
 530
 531     ucsdet_close(csd);
 532 #endif
 533 }
 534
 535 void CharsetDetectionTest::DetectionTest()
 536 {
 537 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
 538     UErrorCode status = U_ZERO_ERROR;
 539     char path[2048];
 540     const char *testFilePath = getPath(path, "csdetest.xml");
 541
 542     if (testFilePath == NULL) {
 543         return; /* Couldn't get path: error message already output. */
 544     }
 545
 546     UXMLParser  *parser = UXMLParser::createParser(status);
 547     if (U_FAILURE(status)) {
 548         dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status));
 549         return;
 550     }
 551
 552     UXMLElement *root   = parser->parseFile(testFilePath, status);
 553     if (!assertSuccess( "parseFile",status)) return;
 554
 555     UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
 556     UnicodeString id_attr   = UNICODE_STRING_SIMPLE("id");
 557     UnicodeString enc_attr  = UNICODE_STRING_SIMPLE("encodings");
 558
 559     const UXMLElement *testCase;
 560     int32_t tc = 0;
 561
 562     while((testCase = root->nextChildElement(tc)) != NULL) {
 563         if (testCase->getTagName().compare(test_case) == 0) {
 564             const UnicodeString *id = testCase->getAttribute(id_attr);
 565             const UnicodeString *encodings = testCase->getAttribute(enc_attr);
 566             const UnicodeString  text = testCase->getText(TRUE);
 567             int32_t encodingCount;
 568             UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount);
 569
 570             for(int32_t e = 0; e < encodingCount; e += 1) {
 571                 checkEncoding(text, encodingList[e], *id);
 572             }
 573
 574             delete[] encodingList;
 575         }
 576     }
 577
 578     delete root;
 579     delete parser;
 580 #endif
 581 }
 582
 583 void CharsetDetectionTest::IBM424Test()
 584 {
 585 #if !UCONFIG_ONLY_HTML_CONVERSION
 586     UErrorCode status = U_ZERO_ERROR;
 587
 588     static const UChar chars[] = {
 589             0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
 590             0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
 591             0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
 592             0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
 593             0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
 594             0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
 595             0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
 596             0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
 597             0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
 598             0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
 599             0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
 600             0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
 601             0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
 602             0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
 603             0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
 604             0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
 605             0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
 606     };
 607
 608     static const UChar chars_reverse[] = {
 609             0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
 610             0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
 611             0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
 612             0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
 613             0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
 614             0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
 615             0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
 616             0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
 617             0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
 618             0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
 619             0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
 620             0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
 621             0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
 622             0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
 623             0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
 624             0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
 625             0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
 626             0x0000
 627     };
 628
 629     int32_t bLength = 0, brLength = 0;
 630
 631     UnicodeString s1(chars);
 632     UnicodeString s2(chars_reverse);
 633
 634     char *bytes = extractBytes(s1, "IBM424", bLength);
 635     char *bytes_r = extractBytes(s2, "IBM424", brLength);
 636
 637     UCharsetDetector *csd = ucsdet_open(&status);
 638         ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
 639         ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
 640         ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
 641         ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
 642     if (U_FAILURE(status)) {
 643         errln("Error opening charset detector. - %s", u_errorName(status));
 644     }
 645     const UCharsetMatch *match;
 646     const char *name;
 647
 648     ucsdet_setText(csd, bytes, bLength, &status);
 649     match = ucsdet_detect(csd, &status);
 650
 651     if (match == NULL) {
 652         errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status));
 653         goto bail;
 654     }
 655
 656     name  = ucsdet_getName(match, &status);
 657     if (strcmp(name, "IBM424_rtl") != 0) {
 658         errln("Encoding detection failure for IBM424_rtl: got %s", name);
 659     }
 660
 661     ucsdet_setText(csd, bytes_r, brLength, &status);
 662     match = ucsdet_detect(csd, &status);
 663
 664     if (match == NULL) {
 665         errln("Encoding detection failure for IBM424_ltr: got no matches.");
 666         goto bail;
 667     }
 668
 669     name  = ucsdet_getName(match, &status);
 670     if (strcmp(name, "IBM424_ltr") != 0) {
 671         errln("Encoding detection failure for IBM424_ltr: got %s", name);
 672     }
 673
 674 bail:
 675     freeBytes(bytes);
 676     freeBytes(bytes_r);
 677     ucsdet_close(csd);
 678 #endif
 679 }
 680
 681 void CharsetDetectionTest::IBM420Test()
 682 {
 683 #if !UCONFIG_ONLY_HTML_CONVERSION
 684     UErrorCode status = U_ZERO_ERROR;
 685
 686     static const UChar chars[] = {
 687         0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
 688         0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
 689         0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
 690         0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
 691         0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
 692         0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
 693         0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
 694         0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
 695         0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
 696         0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
 697         0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
 698         0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
 699         0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
 700         0x0000
 701     };
 702     static const UChar chars_reverse[] = {
 703         0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
 704         0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
 705         0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
 706         0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
 707         0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
 708         0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
 709         0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
 710         0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
 711         0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
 712         0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
 713         0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
 714         0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
 715         0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
 716         0x0000,
 717     };
 718
 719     int32_t bLength = 0, brLength = 0;
 720
 721     UnicodeString s1(chars);
 722     UnicodeString s2(chars_reverse);
 723
 724     char *bytes = extractBytes(s1, "IBM420", bLength);
 725     char *bytes_r = extractBytes(s2, "IBM420", brLength);
 726
 727     UCharsetDetector *csd = ucsdet_open(&status);
 728     if (U_FAILURE(status)) {
 729         errln("Error opening charset detector. - %s", u_errorName(status));
 730     }
 731         ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
 732         ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
 733         ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
 734         ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
 735     const UCharsetMatch *match;
 736     const char *name;
 737
 738     ucsdet_setText(csd, bytes, bLength, &status);
 739     match = ucsdet_detect(csd, &status);
 740
 741     if (match == NULL) {
 742         errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status));
 743         goto bail;
 744     }
 745
 746     name  = ucsdet_getName(match, &status);
 747     if (strcmp(name, "IBM420_rtl") != 0) {
 748         errln("Encoding detection failure for IBM420_rtl: got %s\n", name);
 749     }
 750
 751     ucsdet_setText(csd, bytes_r, brLength, &status);
 752     match = ucsdet_detect(csd, &status);
 753
 754     if (match == NULL) {
 755         errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
 756         goto bail;
 757     }
 758
 759     name  = ucsdet_getName(match, &status);
 760     if (strcmp(name, "IBM420_ltr") != 0) {
 761         errln("Encoding detection failure for IBM420_ltr: got %s\n", name);
 762     }
 763
 764 bail:
 765     freeBytes(bytes);
 766     freeBytes(bytes_r);
 767     ucsdet_close(csd);
 768 #endif
 769 }
 770
 771
 772 void CharsetDetectionTest::Ticket6394Test() {
 773 #if !UCONFIG_NO_CONVERSION
 774     const char charText[] =  "Here is some random English text that should be detected as ISO-8859-1."
 775                              "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
 776                              "encodings more than once.  The hop through UnicodeString is for platforms "
 777                              "where this char * string is be EBCDIC and needs conversion to Latin1.";
 778     char latin1Text[sizeof(charText)];
 779     UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");
 780
 781     UErrorCode status = U_ZERO_ERROR;
 782     UCharsetDetector *csd = ucsdet_open(&status);
 783     ucsdet_setText(csd, latin1Text, -1, &status);
 784     if (U_FAILURE(status)) {
 785         errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
 786         return;
 787     }
 788
 789     int32_t matchCount = 0;
 790     const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
 791     if (U_FAILURE(status)) {
 792         errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
 793         return;
 794     }
 795
 796     UnicodeSet  setOfCharsetNames;    // UnicodSets can hold strings.
 797     int32_t i;
 798     for (i=0; i<matchCount; i++) {
 799         UnicodeString charSetName(ucsdet_getName(matches[i], &status));
 800         if (U_FAILURE(status)) {
 801             errln("Fail at file %s, line %d.  status = %s;  i=%d", __FILE__, __LINE__, u_errorName(status), i);
 802             status = U_ZERO_ERROR;
 803         }
 804         if (setOfCharsetNames.contains(charSetName)) {
 805             errln("Fail at file %s, line %d ", __FILE__, __LINE__);
 806             errln(UnicodeString("   Duplicate charset name = ") + charSetName);
 807         }
 808         setOfCharsetNames.add(charSetName);
 809     }
 810     ucsdet_close(csd);
 811 #endif
 812 }
 813
 814
 815 // Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between
 816 //               similar Windows and non-Windows SBCS encodings. State was kept in the shared
 817 //               Charset Recognizer objects, and could be overwritten.
 818 void CharsetDetectionTest::Ticket6954Test() {
 819 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_FORMATTING
 820     UErrorCode status = U_ZERO_ERROR;
 821     UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
 822     UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly."
 823                             "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
 824     UnicodeString sWindows  = ssWindows.unescape();
 825     int32_t lISO = 0, lWindows = 0;
 826     char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
 827     char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
 828
 829     // First do a plain vanilla detect of 1252 text
 830
 831     UCharsetDetector *csd1 = ucsdet_open(&status);
 832     ucsdet_setText(csd1, bWindows, lWindows, &status);
 833     const UCharsetMatch *match1 = ucsdet_detect(csd1, &status);
 834     const char *name1 = ucsdet_getName(match1, &status);
 835     TEST_ASSERT_SUCCESS(status);
 836     TEST_ASSERT(strcmp(name1, "windows-1252")==0);
 837
 838     // Next, using a completely separate detector, detect some 8859-1 text
 839
 840     UCharsetDetector *csd2 = ucsdet_open(&status);
 841     ucsdet_setText(csd2, bISO, lISO, &status);
 842     const UCharsetMatch *match2 = ucsdet_detect(csd2, &status);
 843     const char *name2 = ucsdet_getName(match2, &status);
 844     TEST_ASSERT_SUCCESS(status);
 845     TEST_ASSERT(strcmp(name2, "ISO-8859-1")==0);
 846
 847     // Recheck the 1252 results from the first detector, which should not have been
 848     //  altered by the use of a different detector.
 849
 850     name1 = ucsdet_getName(match1, &status);
 851     TEST_ASSERT_SUCCESS(status);
 852     TEST_ASSERT(strcmp(name1, "windows-1252")==0);
 853
 854     ucsdet_close(csd1);
 855     ucsdet_close(csd2);
 856     freeBytes(bISO);
 857     freeBytes(bWindows);
 858 #endif
 859 }