icuSources/test/intltest/csdetest.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4  **********************************************************************
   5  *   Copyright (C) 2005-2016, International Business Machines
   6  *   Corporation and others.  All Rights Reserved.
   7  **********************************************************************
   8  */
   9
  10
  11 #include "unicode/utypes.h"
  12 #include "unicode/ucsdet.h"
  13 #include "unicode/ucnv.h"
  14 #include "unicode/unistr.h"
  15 #include "unicode/putil.h"
  16 #include "unicode/uniset.h"
  17
  18 #include "intltest.h"
  19 #include "csdetest.h"
  20
  21 #include "xmlparser.h"
  22
  23 #include <memory>
  24 #include <stdlib.h>
  25 #include <string.h>
  26
  27 #ifdef DEBUG_DETECT
  28 #include <stdio.h>
  29 #endif
  30
  31
// UTF-16 code units used as separators when splitting encoding specifications:
// CH_SPACE separates the entries of a test case's "encodings" attribute, and
// CH_SLASH separates an encoding name from the language expected with it.
#define CH_SPACE 0x0020
#define CH_SLASH 0x002F
  34
// Reports a test failure, tagged with the current source file and line, when
// the condition x evaluates to false.  Execution continues after the report.
#define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
} UPRV_BLOCK_MACRO_END
  40
// Reports a failure (file, line and error name) when errcode holds a failing
// UErrorCode, then returns from the enclosing function.  Because the expansion
// contains a bare "return;", it is only usable in functions returning void.
#define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
        return; \
    } \
} UPRV_BLOCK_MACRO_END
  47
  48
  49 //---------------------------------------------------------------------------
  50 //
  51 //  Test class boilerplate
  52 //
  53 //---------------------------------------------------------------------------
  54 CharsetDetectionTest::CharsetDetectionTest()
  55 {
  56 }
  57
  58
  59 CharsetDetectionTest::~CharsetDetectionTest()
  60 {
  61 }
  62
  63
  64
  65 void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
  66 {
  67     if (exec) logln("TestSuite CharsetDetectionTest: ");
  68     switch (index) {
  69        case 0: name = "ConstructionTest";
  70             if (exec) ConstructionTest();
  71             break;
  72
  73        case 1: name = "UTF8Test";
  74             if (exec) UTF8Test();
  75             break;
  76
  77        case 2: name = "UTF16Test";
  78             if (exec) UTF16Test();
  79             break;
  80
  81        case 3: name = "C1BytesTest";
  82             if (exec) C1BytesTest();
  83             break;
  84
  85        case 4: name = "InputFilterTest";
  86             if (exec) InputFilterTest();
  87             break;
  88
  89        case 5: name = "DetectionTest";
  90             if (exec) DetectionTest();
  91             break;
  92 #if !UCONFIG_NO_LEGACY_CONVERSION
  93        case 6: name = "IBM424Test";
  94             if (exec) IBM424Test();
  95             break;
  96
  97        case 7: name = "IBM420Test";
  98             if (exec) IBM420Test();
  99             break;
 100 #else
 101        case 6:
 102        case 7: name = "skip"; break;
 103 #endif
 104        case 8: name = "Ticket6394Test";
 105             if (exec) Ticket6394Test();
 106             break;
 107
 108        case 9: name = "Ticket6954Test";
 109             if (exec) Ticket6954Test();
 110             break;
 111
 112         default: name = "";
 113             break; //needed to end loop
 114     }
 115 }
 116
 117 static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
 118 {
 119     int32_t offset = -1;
 120
 121     splits = 1;
 122     while((offset = src.indexOf(ch, offset + 1)) >= 0) {
 123         splits += 1;
 124     }
 125
 126     UnicodeString *result = new UnicodeString[splits];
 127
 128     int32_t start = 0;
 129     int32_t split = 0;
 130     int32_t end;
 131
 132     while((end = src.indexOf(ch, start)) >= 0) {
 133         src.extractBetween(start, end, result[split++]);
 134         start = end + 1;
 135     }
 136
 137     src.extractBetween(start, src.length(), result[split]);
 138
 139     return result;
 140 }
 141
 142 static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length)
 143 {
 144     int32_t sLength = source.length();
 145     char *bytes = NULL;
 146
 147     length = source.extract(0, sLength, NULL, codepage);
 148
 149     if (length > 0) {
 150         bytes = new char[length + 1];
 151         source.extract(0, sLength, bytes, codepage);
 152     }
 153
 154     return bytes;
 155 }
 156
 157 void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
 158 {
 159     int32_t splits = 0;
 160     int32_t testLength = testString.length();
 161     std::unique_ptr<UnicodeString []> eSplit(split(encoding, CH_SLASH, splits));
 162     UErrorCode status = U_ZERO_ERROR;
 163     int32_t cpLength = eSplit[0].length();
 164     char codepage[64];
 165
 166     u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
 167     codepage[cpLength] = '\0';
 168
 169     LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
 170
 171     int32_t byteLength = 0;
 172     std::unique_ptr<char []> bytes(extractBytes(testString, codepage, byteLength));
 173
 174     if (! bytes) {
 175 #if !UCONFIG_NO_LEGACY_CONVERSION
 176         dataerrln("Can't open a " + encoding + " converter for " + id);
 177 #endif
 178         return;
 179     }
 180
 181     ucsdet_setText(csd.getAlias(), bytes.get(), byteLength, &status);
 182
 183     int32_t matchCount = 0;
 184     const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status);
 185
 186
 187     UnicodeString name(ucsdet_getName(matches[0], &status));
 188     UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
 189     UChar *decoded = NULL;
 190     int32_t dLength = 0;
 191
 192     if (matchCount == 0) {
 193         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
 194         return;
 195     }
 196
 197     if (name.compare(eSplit[0]) != 0) {
 198         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
 199
 200 #ifdef DEBUG_DETECT
 201         for (int32_t m = 0; m < matchCount; m += 1) {
 202             const char *name = ucsdet_getName(matches[m], &status);
 203             const char *lang = ucsdet_getLanguage(matches[m], &status);
 204             int32_t confidence = ucsdet_getConfidence(matches[m], &status);
 205
 206             printf("%s (%s) %d\n", name, lang, confidence);
 207         }
 208 #endif
 209         return;
 210     }
 211
 212     if (splits > 1 && lang.compare(eSplit[1]) != 0) {
 213         errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
 214         return;
 215     }
 216
 217     decoded = new UChar[testLength];
 218     dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);
 219
 220     if (testString.compare(decoded, dLength) != 0) {
 221         errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");
 222
 223 #ifdef DEBUG_DETECT
 224         for(int32_t i = 0; i < testLength; i += 1) {
 225             if(testString[i] != decoded[i]) {
 226                 printf("Strings differ at byte %d\n", i);
 227                 break;
 228             }
 229         }
 230 #endif
 231
 232     }
 233
 234     delete[] decoded;
 235 }
 236
 237 const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) {
 238     UErrorCode status = U_ZERO_ERROR;
 239     const char *testDataDirectory = IntlTest::getSourceTestData(status);
 240
 241     if (U_FAILURE(status)) {
 242         errln("ERROR: getPath() failed - %s", u_errorName(status));
 243         return NULL;
 244     }
 245
 246     strcpy(buffer, testDataDirectory);
 247     strcat(buffer, filename);
 248     return buffer;
 249 }
 250
 251 void CharsetDetectionTest::ConstructionTest()
 252 {
 253     IcuTestErrorCode status(*this, "ConstructionTest");
 254     LocalUCharsetDetectorPointer csd(ucsdet_open(status));
 255     LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status));
 256     int32_t count = uenum_count(e.getAlias(), status);
 257
 258 #ifdef DEBUG_DETECT
 259     printf("There are %d recognizers.\n", count);
 260 #endif
 261
 262     for(int32_t i = 0; i < count; i += 1) {
 263         int32_t length;
 264         const char *name = uenum_next(e.getAlias(), &length, status);
 265
 266         if(name == NULL || length <= 0) {
 267             errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
 268         }
 269
 270 #ifdef DEBUG_DETECT
 271         printf("%s\n", name);
 272 #endif
 273     }
 274
 275     const char* defDisabled[] = {
 276         "IBM420_rtl", "IBM420_ltr",
 277         "IBM424_rtl", "IBM424_ltr",
 278         0
 279     };
 280
 281     LocalUEnumerationPointer eActive(ucsdet_getDetectableCharsets(csd.getAlias(), status));
 282     const char *activeName = NULL;
 283
 284     while ((activeName = uenum_next(eActive.getAlias(), NULL, status))) {
 285         // the charset must be included in all list
 286         UBool found = FALSE;
 287
 288         const char *name = NULL;
 289         uenum_reset(e.getAlias(), status);
 290         while ((name = uenum_next(e.getAlias(), NULL, status))) {
 291             if (strcmp(activeName, name) == 0) {
 292                 found = TRUE;
 293                 break;
 294             }
 295         }
 296
 297         if (!found) {
 298             errln(UnicodeString(activeName) + " is not included in the all charset list.");
 299         }
 300
 301         // some charsets are disabled by default
 302         found = FALSE;
 303         for (int32_t i = 0; defDisabled[i] != 0; i++) {
 304             if (strcmp(activeName, defDisabled[i]) == 0) {
 305                 found = TRUE;
 306                 break;
 307             }
 308         }
 309         if (found) {
 310             errln(UnicodeString(activeName) + " should not be included in the default charset list.");
 311         }
 312     }
 313 }
 314
 315 void CharsetDetectionTest::UTF8Test()
 316 {
 317     UErrorCode status = U_ZERO_ERROR;
 318     UnicodeString ss = "This is a string with some non-ascii characters that will "
 319                        "be converted to UTF-8, then shoved through the detection process.  "
 320                        "\\u0391\\u0392\\u0393\\u0394\\u0395"
 321                        "Sure would be nice if our source could contain Unicode directly!";
 322     UnicodeString s = ss.unescape();
 323     int32_t byteLength = 0, sLength = s.length();
 324     char *bytes = extractBytes(s, "UTF-8", byteLength);
 325     UCharsetDetector *csd = ucsdet_open(&status);
 326     const UCharsetMatch *match;
 327     UChar *detected = new UChar[sLength];
 328
 329     ucsdet_setText(csd, bytes, byteLength, &status);
 330     match = ucsdet_detect(csd, &status);
 331
 332     if (match == NULL) {
 333         errln("Detection failure for UTF-8: got no matches.");
 334         goto bail;
 335     }
 336
 337     ucsdet_getUChars(match, detected, sLength, &status);
 338
 339     if (s.compare(detected, sLength) != 0) {
 340         errln("Round-trip test failed!");
 341     }
 342
 343     ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
 344
 345 bail:
 346     delete[] detected;
 347     delete[] bytes;
 348     ucsdet_close(csd);
 349 }
 350
 351 void CharsetDetectionTest::UTF16Test()
 352 {
 353     UErrorCode status = U_ZERO_ERROR;
 354     /* Notice the BOM on the start of this string */
 355     UChar chars[] = {
 356         0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
 357         0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
 358         0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
 359         0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
 360         0x064a, 0x062a, 0x0000};
 361     UnicodeString s(chars);
 362     int32_t beLength = 0, leLength = 0;
 363     std::unique_ptr<char []>beBytes(extractBytes(s, "UTF-16BE", beLength));
 364     std::unique_ptr<char []>leBytes(extractBytes(s, "UTF-16LE", leLength));
 365     LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
 366     const UCharsetMatch *match;
 367     const char *name;
 368     int32_t conf;
 369
 370     ucsdet_setText(csd.getAlias(), beBytes.get(), beLength, &status);
 371     match = ucsdet_detect(csd.getAlias(), &status);
 372
 373     if (match == NULL) {
 374         errln("Encoding detection failure for UTF-16BE: got no matches.");
 375     } else {
 376
 377         name  = ucsdet_getName(match, &status);
 378         conf  = ucsdet_getConfidence(match, &status);
 379
 380         if (strcmp(name, "UTF-16BE") != 0) {
 381             errln("Encoding detection failure for UTF-16BE: got %s", name);
 382         } else if (conf != 100) {
 383             errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
 384         }
 385     }
 386
 387     ucsdet_setText(csd.getAlias(), leBytes.get(), leLength, &status);
 388     match = ucsdet_detect(csd.getAlias(), &status);
 389
 390     if (match == NULL) {
 391         errln("Encoding detection failure for UTF-16LE: got no matches.");
 392         return;
 393     }
 394
 395     name  = ucsdet_getName(match, &status);
 396     conf = ucsdet_getConfidence(match, &status);
 397
 398     if (strcmp(name, "UTF-16LE") != 0) {
 399         errln("Enconding detection failure for UTF-16LE: got %s", name);
 400         return;
 401     }
 402
 403     if (conf != 100) {
 404         errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
 405     }
 406 }
 407
 408 void CharsetDetectionTest::InputFilterTest()
 409 {
 410     UErrorCode status = U_ZERO_ERROR;
 411     UnicodeString s(u"<a> <lot> <of> <English> <inside> <the> <markup> Un très petit peu de Français. <to> <confuse> <the> <detector>");
 412     int32_t byteLength = 0;
 413     char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
 414     UCharsetDetector *csd = ucsdet_open(&status);
 415     const UCharsetMatch *match;
 416     const char *lang, *name;
 417
 418     ucsdet_enableInputFilter(csd, TRUE);
 419
 420     if (!ucsdet_isInputFilterEnabled(csd)) {
 421         errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
 422     }
 423
 424
 425     ucsdet_setText(csd, bytes, byteLength, &status);
 426     match = ucsdet_detect(csd, &status);
 427
 428     if (match == NULL) {
 429         errln("Turning on the input filter resulted in no matches.");
 430         goto turn_off;
 431     }
 432
 433     name = ucsdet_getName(match, &status);
 434
 435     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
 436         errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
 437     } else {
 438         lang = ucsdet_getLanguage(match, &status);
 439
 440         if (lang == NULL || strcmp(lang, "fr") != 0) {
 441             errln("Input filter did not strip markup!");
 442         }
 443     }
 444
 445 turn_off:
 446     ucsdet_enableInputFilter(csd, FALSE);
 447     ucsdet_setText(csd, bytes, byteLength, &status);
 448     match = ucsdet_detect(csd, &status);
 449
 450     if (match == NULL) {
 451         errln("Turning off the input filter resulted in no matches.");
 452         goto bail;
 453     }
 454
 455     name = ucsdet_getName(match, &status);
 456
 457     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
 458         errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
 459     } else {
 460         lang = ucsdet_getLanguage(match, &status);
 461
 462         if (lang == NULL || strcmp(lang, "en") != 0) {
 463             errln("Unfiltered input did not detect as English!");
 464         }
 465     }
 466
 467 bail:
 468     delete[] bytes;
 469     ucsdet_close(csd);
 470 }
 471
 472 void CharsetDetectionTest::C1BytesTest()
 473 {
 474 #if !UCONFIG_NO_LEGACY_CONVERSION
 475     UErrorCode status = U_ZERO_ERROR;
 476     UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
 477     UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
 478     UnicodeString sWindows  = ssWindows.unescape();
 479     int32_t lISO = 0, lWindows = 0;
 480     char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
 481     char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
 482     UCharsetDetector *csd = ucsdet_open(&status);
 483     const UCharsetMatch *match;
 484     const char *name;
 485
 486     ucsdet_setText(csd, bWindows, lWindows, &status);
 487     match = ucsdet_detect(csd, &status);
 488
 489     if (match == NULL) {
 490         errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status));
 491         goto bail;
 492     }
 493
 494     name  = ucsdet_getName(match, &status);
 495
 496     if (strcmp(name, "windows-1252") != 0) {
 497         errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
 498     }
 499
 500     ucsdet_setText(csd, bISO, lISO, &status);
 501     match = ucsdet_detect(csd, &status);
 502
 503     if (match == NULL) {
 504         errln("English text without C1 bytes got no matches.");
 505         goto bail;
 506     }
 507
 508     name  = ucsdet_getName(match, &status);
 509
 510     if (strcmp(name, "ISO-8859-1") != 0) {
 511         errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
 512     }
 513
 514 bail:
 515     delete[] bWindows;
 516     delete[] bISO;
 517
 518     ucsdet_close(csd);
 519 #endif
 520 }
 521
 522 void CharsetDetectionTest::DetectionTest()
 523 {
 524 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
 525     UErrorCode status = U_ZERO_ERROR;
 526     char path[2048];
 527     const char *testFilePath = getPath(path, "csdetest.xml");
 528
 529     if (testFilePath == NULL) {
 530         return; /* Couldn't get path: error message already output. */
 531     }
 532
 533     UXMLParser  *parser = UXMLParser::createParser(status);
 534     if (U_FAILURE(status)) {
 535         dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status));
 536         return;
 537     }
 538
 539     UXMLElement *root   = parser->parseFile(testFilePath, status);
 540     if (!assertSuccess( "parseFile",status)) return;
 541
 542     UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
 543     UnicodeString id_attr   = UNICODE_STRING_SIMPLE("id");
 544     UnicodeString enc_attr  = UNICODE_STRING_SIMPLE("encodings");
 545
 546     const UXMLElement *testCase;
 547     int32_t tc = 0;
 548
 549     while((testCase = root->nextChildElement(tc)) != NULL) {
 550         if (testCase->getTagName().compare(test_case) == 0) {
 551             const UnicodeString *id = testCase->getAttribute(id_attr);
 552             const UnicodeString *encodings = testCase->getAttribute(enc_attr);
 553             const UnicodeString  text = testCase->getText(TRUE);
 554             int32_t encodingCount;
 555             UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount);
 556
 557             for(int32_t e = 0; e < encodingCount; e += 1) {
 558                 checkEncoding(text, encodingList[e], *id);
 559             }
 560
 561             delete[] encodingList;
 562         }
 563     }
 564
 565     delete root;
 566     delete parser;
 567 #endif
 568 }
 569
 570 void CharsetDetectionTest::IBM424Test()
 571 {
 572 #if !UCONFIG_ONLY_HTML_CONVERSION
 573     UErrorCode status = U_ZERO_ERROR;
 574
 575     static const UChar chars[] = {
 576             0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
 577             0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
 578             0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
 579             0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
 580             0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
 581             0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
 582             0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
 583             0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
 584             0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
 585             0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
 586             0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
 587             0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
 588             0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
 589             0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
 590             0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
 591             0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
 592             0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
 593     };
 594
 595     static const UChar chars_reverse[] = {
 596             0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
 597             0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
 598             0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
 599             0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
 600             0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
 601             0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
 602             0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
 603             0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
 604             0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
 605             0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
 606             0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
 607             0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
 608             0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
 609             0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
 610             0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
 611             0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
 612             0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
 613             0x0000
 614     };
 615
 616     int32_t bLength = 0, brLength = 0;
 617
 618     UnicodeString s1(chars);
 619     UnicodeString s2(chars_reverse);
 620
 621     char *bytes = extractBytes(s1, "IBM424", bLength);
 622     char *bytes_r = extractBytes(s2, "IBM424", brLength);
 623
 624     UCharsetDetector *csd = ucsdet_open(&status);
 625         ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
 626         ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
 627         ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
 628         ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
 629     if (U_FAILURE(status)) {
 630         errln("Error opening charset detector. - %s", u_errorName(status));
 631     }
 632     const UCharsetMatch *match;
 633     const char *name;
 634
 635     ucsdet_setText(csd, bytes, bLength, &status);
 636     match = ucsdet_detect(csd, &status);
 637
 638     if (match == NULL) {
 639         errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status));
 640         goto bail;
 641     }
 642
 643     name  = ucsdet_getName(match, &status);
 644     if (strcmp(name, "IBM424_rtl") != 0) {
 645         errln("Encoding detection failure for IBM424_rtl: got %s", name);
 646     }
 647
 648     ucsdet_setText(csd, bytes_r, brLength, &status);
 649     match = ucsdet_detect(csd, &status);
 650
 651     if (match == NULL) {
 652         errln("Encoding detection failure for IBM424_ltr: got no matches.");
 653         goto bail;
 654     }
 655
 656     name  = ucsdet_getName(match, &status);
 657     if (strcmp(name, "IBM424_ltr") != 0) {
 658         errln("Encoding detection failure for IBM424_ltr: got %s", name);
 659     }
 660
 661 bail:
 662     delete[] bytes;
 663     delete[] bytes_r;
 664     ucsdet_close(csd);
 665 #endif
 666 }
 667
 668 void CharsetDetectionTest::IBM420Test()
 669 {
 670 #if !UCONFIG_ONLY_HTML_CONVERSION
 671     UErrorCode status = U_ZERO_ERROR;
 672
 673     static const UChar chars[] = {
 674         0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
 675         0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
 676         0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
 677         0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
 678         0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
 679         0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
 680         0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
 681         0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
 682         0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
 683         0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
 684         0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
 685         0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
 686         0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
 687         0x0000
 688     };
 689     static const UChar chars_reverse[] = {
 690         0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
 691         0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
 692         0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
 693         0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
 694         0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
 695         0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
 696         0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
 697         0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
 698         0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
 699         0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
 700         0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
 701         0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
 702         0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
 703         0x0000,
 704     };
 705
 706     int32_t bLength = 0, brLength = 0;
 707
 708     UnicodeString s1(chars);
 709     UnicodeString s2(chars_reverse);
 710
 711     char *bytes = extractBytes(s1, "IBM420", bLength);
 712     char *bytes_r = extractBytes(s2, "IBM420", brLength);
 713
 714     UCharsetDetector *csd = ucsdet_open(&status);
 715     if (U_FAILURE(status)) {
 716         errln("Error opening charset detector. - %s", u_errorName(status));
 717     }
 718         ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
 719         ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
 720         ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
 721         ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
 722     const UCharsetMatch *match;
 723     const char *name;
 724
 725     ucsdet_setText(csd, bytes, bLength, &status);
 726     match = ucsdet_detect(csd, &status);
 727
 728     if (match == NULL) {
 729         errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status));
 730         goto bail;
 731     }
 732
 733     name  = ucsdet_getName(match, &status);
 734     if (strcmp(name, "IBM420_rtl") != 0) {
 735         errln("Encoding detection failure for IBM420_rtl: got %s\n", name);
 736     }
 737
 738     ucsdet_setText(csd, bytes_r, brLength, &status);
 739     match = ucsdet_detect(csd, &status);
 740
 741     if (match == NULL) {
 742         errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
 743         goto bail;
 744     }
 745
 746     name  = ucsdet_getName(match, &status);
 747     if (strcmp(name, "IBM420_ltr") != 0) {
 748         errln("Encoding detection failure for IBM420_ltr: got %s\n", name);
 749     }
 750
 751 bail:
 752     delete[] bytes;
 753     delete[] bytes_r;
 754     ucsdet_close(csd);
 755 #endif
 756 }
 757
 758
 759 void CharsetDetectionTest::Ticket6394Test() {
 760 #if !UCONFIG_NO_CONVERSION
 761     const char charText[] =  "Here is some random English text that should be detected as ISO-8859-1."
 762                              "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
 763                              "encodings more than once.  The hop through UnicodeString is for platforms "
 764                              "where this char * string is be EBCDIC and needs conversion to Latin1.";
 765     char latin1Text[sizeof(charText)];
 766     UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");
 767
 768     UErrorCode status = U_ZERO_ERROR;
 769     UCharsetDetector *csd = ucsdet_open(&status);
 770     ucsdet_setText(csd, latin1Text, -1, &status);
 771     if (U_FAILURE(status)) {
 772         errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
 773         return;
 774     }
 775
 776     int32_t matchCount = 0;
 777     const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
 778     if (U_FAILURE(status)) {
 779         errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
 780         return;
 781     }
 782
 783     UnicodeSet  setOfCharsetNames;    // UnicodSets can hold strings.
 784     int32_t i;
 785     for (i=0; i<matchCount; i++) {
 786         UnicodeString charSetName(ucsdet_getName(matches[i], &status));
 787         if (U_FAILURE(status)) {
 788             errln("Fail at file %s, line %d.  status = %s;  i=%d", __FILE__, __LINE__, u_errorName(status), i);
 789             status = U_ZERO_ERROR;
 790         }
 791         if (setOfCharsetNames.contains(charSetName)) {
 792             errln("Fail at file %s, line %d ", __FILE__, __LINE__);
 793             errln(UnicodeString("   Duplicate charset name = ") + charSetName);
 794         }
 795         setOfCharsetNames.add(charSetName);
 796     }
 797     ucsdet_close(csd);
 798 #endif
 799 }
 800
 801
 802 // Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between
 803 //               similar Windows and non-Windows SBCS encodings. State was kept in the shared
 804 //               Charset Recognizer objects, and could be overwritten.
 805 void CharsetDetectionTest::Ticket6954Test() {
 806 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_FORMATTING
 807     UErrorCode status = U_ZERO_ERROR;
 808     UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
 809     UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly."
 810                             "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
 811     UnicodeString sWindows  = ssWindows.unescape();
 812     int32_t lISO = 0, lWindows = 0;
 813     std::unique_ptr<char[]> bISO(extractBytes(sISO, "ISO-8859-1", lISO));
 814     std::unique_ptr<char[]> bWindows(extractBytes(sWindows, "windows-1252", lWindows));
 815
 816     // First do a plain vanilla detect of 1252 text
 817
 818     LocalUCharsetDetectorPointer csd1(ucsdet_open(&status));
 819     ucsdet_setText(csd1.getAlias(), bWindows.get(), lWindows, &status);
 820     const UCharsetMatch *match1 = ucsdet_detect(csd1.getAlias(), &status);
 821     const char *name1 = ucsdet_getName(match1, &status);
 822     TEST_ASSERT_SUCCESS(status);
 823     TEST_ASSERT(strcmp(name1, "windows-1252")==0);
 824
 825     // Next, using a completely separate detector, detect some 8859-1 text
 826
 827     LocalUCharsetDetectorPointer csd2(ucsdet_open(&status));
 828     ucsdet_setText(csd2.getAlias(), bISO.get(), lISO, &status);
 829     const UCharsetMatch *match2 = ucsdet_detect(csd2.getAlias(), &status);
 830     const char *name2 = ucsdet_getName(match2, &status);
 831     TEST_ASSERT_SUCCESS(status);
 832     TEST_ASSERT(strcmp(name2, "ISO-8859-1")==0);
 833
 834     // Recheck the 1252 results from the first detector, which should not have been
 835     //  altered by the use of a different detector.
 836
 837     name1 = ucsdet_getName(match1, &status);
 838     TEST_ASSERT_SUCCESS(status);
 839     TEST_ASSERT(strcmp(name1, "windows-1252")==0);
 840 #endif
 841 }