icuSources/test/intltest/csdetest.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4  **********************************************************************
   5  *   Copyright (C) 2005-2016, International Business Machines
   6  *   Corporation and others.  All Rights Reserved.
   7  **********************************************************************
   8  */
   9
  10
  11 #include "unicode/utypes.h"
  12 #include "unicode/ucsdet.h"
  13 #include "unicode/ucnv.h"
  14 #include "unicode/unistr.h"
  15 #include "unicode/putil.h"
  16 #include "unicode/uniset.h"
  17
  18 #include "intltest.h"
  19 #include "csdetest.h"
  20
  21 #include "xmlparser.h"
  22
  23 #include <stdlib.h>
  24 #include <string.h>
  25
  26 #ifdef DEBUG_DETECT
  27 #include <stdio.h>
  28 #endif
  29
  30 #define NEW_ARRAY(type,count) (type *) /*uprv_*/malloc((count) * sizeof(type))
  31 #define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))
  32
// UTF-16 code units used as delimiters with split():
// U+0020 SPACE separates the entries of a test case's encodings list, and
// U+002F SOLIDUS separates the charset name from the optional language.
#define CH_SPACE 0x0020
#define CH_SLASH 0x002F
  35
// Report a test failure (with file and line) when condition x is false.
// Execution continues after the report.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d", __FILE__, __LINE__);}}

// Report a failure and return from the enclosing function when errcode is a
// failing UErrorCode. Because of the bare `return;` this is only usable in
// functions that return void.
#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
    errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));\
    return;}}
  42
  43
  44 //---------------------------------------------------------------------------
  45 //
  46 //  Test class boilerplate
  47 //
  48 //---------------------------------------------------------------------------
// Default constructor: this test class performs no setup of its own here.
CharsetDetectionTest::CharsetDetectionTest()
{
}
  52
  53
// Destructor: nothing to release; each test cleans up what it allocates.
CharsetDetectionTest::~CharsetDetectionTest()
{
}
  57
  58
  59
  60 void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
  61 {
  62     if (exec) logln("TestSuite CharsetDetectionTest: ");
  63     switch (index) {
  64        case 0: name = "ConstructionTest";
  65             if (exec) ConstructionTest();
  66             break;
  67
  68        case 1: name = "UTF8Test";
  69             if (exec) UTF8Test();
  70             break;
  71
  72        case 2: name = "UTF16Test";
  73             if (exec) UTF16Test();
  74             break;
  75
  76        case 3: name = "C1BytesTest";
  77             if (exec) C1BytesTest();
  78             break;
  79
  80        case 4: name = "InputFilterTest";
  81             if (exec) InputFilterTest();
  82             break;
  83
  84        case 5: name = "DetectionTest";
  85             if (exec) DetectionTest();
  86             break;
  87 #if !UCONFIG_NO_LEGACY_CONVERSION
  88        case 6: name = "IBM424Test";
  89             if (exec) IBM424Test();
  90             break;
  91
  92        case 7: name = "IBM420Test";
  93             if (exec) IBM420Test();
  94             break;
  95 #else
  96        case 6:
  97        case 7: name = "skip"; break;
  98 #endif
  99        case 8: name = "Ticket6394Test";
 100             if (exec) Ticket6394Test();
 101             break;
 102
 103        case 9: name = "Ticket6954Test";
 104             if (exec) Ticket6954Test();
 105             break;
 106
 107         default: name = "";
 108             break; //needed to end loop
 109     }
 110 }
 111
 112 static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
 113 {
 114     int32_t offset = -1;
 115
 116     splits = 1;
 117     while((offset = src.indexOf(ch, offset + 1)) >= 0) {
 118         splits += 1;
 119     }
 120
 121     UnicodeString *result = new UnicodeString[splits];
 122
 123     int32_t start = 0;
 124     int32_t split = 0;
 125     int32_t end;
 126
 127     while((end = src.indexOf(ch, start)) >= 0) {
 128         src.extractBetween(start, end, result[split++]);
 129         start = end + 1;
 130     }
 131
 132     src.extractBetween(start, src.length(), result[split]);
 133
 134     return result;
 135 }
 136
 137 static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length)
 138 {
 139     int32_t sLength = source.length();
 140     char *bytes = NULL;
 141
 142     length = source.extract(0, sLength, NULL, codepage);
 143
 144     if (length > 0) {
 145         bytes = NEW_ARRAY(char, length + 1);
 146         source.extract(0, sLength, bytes, codepage);
 147     }
 148
 149     return bytes;
 150 }
 151
// Releases a byte buffer returned by extractBytes().
static void freeBytes(char *bytes)
{
    DELETE_ARRAY(bytes);
}
 156
 157 void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
 158 {
 159     int32_t splits = 0;
 160     int32_t testLength = testString.length();
 161     UnicodeString *eSplit = split(encoding, CH_SLASH, splits);
 162     UErrorCode status = U_ZERO_ERROR;
 163     int32_t cpLength = eSplit[0].length();
 164     char codepage[64];
 165
 166     u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
 167     codepage[cpLength] = '\0';
 168
 169     LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
 170
 171     int32_t byteLength = 0;
 172     char *bytes = extractBytes(testString, codepage, byteLength);
 173
 174     if (bytes == NULL) {
 175 #if !UCONFIG_NO_LEGACY_CONVERSION
 176         dataerrln("Can't open a " + encoding + " converter for " + id);
 177 #endif
 178         return;
 179     }
 180
 181     ucsdet_setText(csd.getAlias(), bytes, byteLength, &status);
 182
 183     int32_t matchCount = 0;
 184     const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status);
 185
 186
 187     UnicodeString name(ucsdet_getName(matches[0], &status));
 188     UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
 189     UChar *decoded = NULL;
 190     int32_t dLength = 0;
 191
 192     if (matchCount == 0) {
 193         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
 194         goto bail;
 195     }
 196
 197     if (name.compare(eSplit[0]) != 0) {
 198         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
 199
 200 #ifdef DEBUG_DETECT
 201         for (int32_t m = 0; m < matchCount; m += 1) {
 202             const char *name = ucsdet_getName(matches[m], &status);
 203             const char *lang = ucsdet_getLanguage(matches[m], &status);
 204             int32_t confidence = ucsdet_getConfidence(matches[m], &status);
 205
 206             printf("%s (%s) %d\n", name, lang, confidence);
 207         }
 208 #endif
 209         goto bail;
 210     }
 211
 212     if (splits > 1 && lang.compare(eSplit[1]) != 0) {
 213         errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
 214         goto bail;
 215     }
 216
 217     decoded = NEW_ARRAY(UChar, testLength);
 218     dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);
 219
 220     if (testString.compare(decoded, dLength) != 0) {
 221         errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");
 222
 223 #ifdef DEBUG_DETECT
 224         for(int32_t i = 0; i < testLength; i += 1) {
 225             if(testString[i] != decoded[i]) {
 226                 printf("Strings differ at byte %d\n", i);
 227                 break;
 228             }
 229         }
 230 #endif
 231
 232     }
 233
 234     DELETE_ARRAY(decoded);
 235
 236 bail:
 237     freeBytes(bytes);
 238     delete[] eSplit;
 239 }
 240
 241 const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) {
 242     UErrorCode status = U_ZERO_ERROR;
 243     const char *testDataDirectory = IntlTest::getSourceTestData(status);
 244
 245     if (U_FAILURE(status)) {
 246         errln("ERROR: getPath() failed - %s", u_errorName(status));
 247         return NULL;
 248     }
 249
 250     strcpy(buffer, testDataDirectory);
 251     strcat(buffer, filename);
 252     return buffer;
 253 }
 254
 255 void CharsetDetectionTest::ConstructionTest()
 256 {
 257     IcuTestErrorCode status(*this, "ConstructionTest");
 258     LocalUCharsetDetectorPointer csd(ucsdet_open(status));
 259     LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status));
 260     int32_t count = uenum_count(e.getAlias(), status);
 261
 262 #ifdef DEBUG_DETECT
 263     printf("There are %d recognizers.\n", count);
 264 #endif
 265
 266     for(int32_t i = 0; i < count; i += 1) {
 267         int32_t length;
 268         const char *name = uenum_next(e.getAlias(), &length, status);
 269
 270         if(name == NULL || length <= 0) {
 271             errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
 272         }
 273
 274 #ifdef DEBUG_DETECT
 275         printf("%s\n", name);
 276 #endif
 277     }
 278
 279     const char* defDisabled[] = {
 280         "IBM420_rtl", "IBM420_ltr",
 281         "IBM424_rtl", "IBM424_ltr",
 282         0
 283     };
 284
 285     LocalUEnumerationPointer eActive(ucsdet_getDetectableCharsets(csd.getAlias(), status));
 286     const char *activeName = NULL;
 287
 288     while ((activeName = uenum_next(eActive.getAlias(), NULL, status))) {
 289         // the charset must be included in all list
 290         UBool found = FALSE;
 291
 292         const char *name = NULL;
 293         uenum_reset(e.getAlias(), status);
 294         while ((name = uenum_next(e.getAlias(), NULL, status))) {
 295             if (strcmp(activeName, name) == 0) {
 296                 found = TRUE;
 297                 break;
 298             }
 299         }
 300
 301         if (!found) {
 302             errln(UnicodeString(activeName) + " is not included in the all charset list.");
 303         }
 304
 305         // some charsets are disabled by default
 306         found = FALSE;
 307         for (int32_t i = 0; defDisabled[i] != 0; i++) {
 308             if (strcmp(activeName, defDisabled[i]) == 0) {
 309                 found = TRUE;
 310                 break;
 311             }
 312         }
 313         if (found) {
 314             errln(UnicodeString(activeName) + " should not be included in the default charset list.");
 315         }
 316     }
 317 }
 318
 319 void CharsetDetectionTest::UTF8Test()
 320 {
 321     UErrorCode status = U_ZERO_ERROR;
 322     UnicodeString ss = "This is a string with some non-ascii characters that will "
 323                        "be converted to UTF-8, then shoved through the detection process.  "
 324                        "\\u0391\\u0392\\u0393\\u0394\\u0395"
 325                        "Sure would be nice if our source could contain Unicode directly!";
 326     UnicodeString s = ss.unescape();
 327     int32_t byteLength = 0, sLength = s.length();
 328     char *bytes = extractBytes(s, "UTF-8", byteLength);
 329     UCharsetDetector *csd = ucsdet_open(&status);
 330     const UCharsetMatch *match;
 331     UChar *detected = NEW_ARRAY(UChar, sLength);
 332
 333     ucsdet_setText(csd, bytes, byteLength, &status);
 334     match = ucsdet_detect(csd, &status);
 335
 336     if (match == NULL) {
 337         errln("Detection failure for UTF-8: got no matches.");
 338         goto bail;
 339     }
 340
 341     ucsdet_getUChars(match, detected, sLength, &status);
 342
 343     if (s.compare(detected, sLength) != 0) {
 344         errln("Round-trip test failed!");
 345     }
 346
 347     ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
 348
 349 bail:
 350     DELETE_ARRAY(detected);
 351     freeBytes(bytes);
 352     ucsdet_close(csd);
 353 }
 354
 355 void CharsetDetectionTest::UTF16Test()
 356 {
 357     UErrorCode status = U_ZERO_ERROR;
 358     /* Notice the BOM on the start of this string */
 359     UChar chars[] = {
 360         0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
 361         0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
 362         0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
 363         0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
 364         0x064a, 0x062a, 0x0000};
 365     UnicodeString s(chars);
 366     int32_t beLength = 0, leLength = 0;
 367     char *beBytes = extractBytes(s, "UTF-16BE", beLength);
 368     char *leBytes = extractBytes(s, "UTF-16LE", leLength);
 369     UCharsetDetector *csd = ucsdet_open(&status);
 370     const UCharsetMatch *match;
 371     const char *name;
 372     int32_t conf;
 373
 374     ucsdet_setText(csd, beBytes, beLength, &status);
 375     match = ucsdet_detect(csd, &status);
 376
 377     if (match == NULL) {
 378         errln("Encoding detection failure for UTF-16BE: got no matches.");
 379         goto try_le;
 380     }
 381
 382     name  = ucsdet_getName(match, &status);
 383     conf  = ucsdet_getConfidence(match, &status);
 384
 385     if (strcmp(name, "UTF-16BE") != 0) {
 386         errln("Encoding detection failure for UTF-16BE: got %s", name);
 387         goto try_le; // no point in looking at confidence if we got the wrong character set.
 388     }
 389
 390     if (conf != 100) {
 391         errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
 392     }
 393
 394 try_le:
 395     ucsdet_setText(csd, leBytes, leLength, &status);
 396     match = ucsdet_detect(csd, &status);
 397
 398     if (match == NULL) {
 399         errln("Encoding detection failure for UTF-16LE: got no matches.");
 400         goto bail;
 401     }
 402
 403     name  = ucsdet_getName(match, &status);
 404     conf = ucsdet_getConfidence(match, &status);
 405
 406
 407     if (strcmp(name, "UTF-16LE") != 0) {
 408         errln("Enconding detection failure for UTF-16LE: got %s", name);
 409         goto bail; // no point in looking at confidence if we got the wrong character set.
 410     }
 411
 412     if (conf != 100) {
 413         errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
 414     }
 415
 416 bail:
 417     freeBytes(leBytes);
 418     freeBytes(beBytes);
 419     ucsdet_close(csd);
 420 }
 421
 422 void CharsetDetectionTest::InputFilterTest()
 423 {
 424     UErrorCode status = U_ZERO_ERROR;
 425     UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
 426     UnicodeString s  = ss.unescape();
 427     int32_t byteLength = 0;
 428     char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
 429     UCharsetDetector *csd = ucsdet_open(&status);
 430     const UCharsetMatch *match;
 431     const char *lang, *name;
 432
 433     ucsdet_enableInputFilter(csd, TRUE);
 434
 435     if (!ucsdet_isInputFilterEnabled(csd)) {
 436         errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
 437     }
 438
 439
 440     ucsdet_setText(csd, bytes, byteLength, &status);
 441     match = ucsdet_detect(csd, &status);
 442
 443     if (match == NULL) {
 444         errln("Turning on the input filter resulted in no matches.");
 445         goto turn_off;
 446     }
 447
 448     name = ucsdet_getName(match, &status);
 449
 450     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
 451         errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
 452     } else {
 453         lang = ucsdet_getLanguage(match, &status);
 454
 455         if (lang == NULL || strcmp(lang, "fr") != 0) {
 456             errln("Input filter did not strip markup!");
 457         }
 458     }
 459
 460 turn_off:
 461     ucsdet_enableInputFilter(csd, FALSE);
 462     ucsdet_setText(csd, bytes, byteLength, &status);
 463     match = ucsdet_detect(csd, &status);
 464
 465     if (match == NULL) {
 466         errln("Turning off the input filter resulted in no matches.");
 467         goto bail;
 468     }
 469
 470     name = ucsdet_getName(match, &status);
 471
 472     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
 473         errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
 474     } else {
 475         lang = ucsdet_getLanguage(match, &status);
 476
 477         if (lang == NULL || strcmp(lang, "en") != 0) {
 478             errln("Unfiltered input did not detect as English!");
 479         }
 480     }
 481
 482 bail:
 483     freeBytes(bytes);
 484     ucsdet_close(csd);
 485 }
 486
 487 void CharsetDetectionTest::C1BytesTest()
 488 {
 489 #if !UCONFIG_NO_LEGACY_CONVERSION
 490     UErrorCode status = U_ZERO_ERROR;
 491     UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
 492     UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
 493     UnicodeString sWindows  = ssWindows.unescape();
 494     int32_t lISO = 0, lWindows = 0;
 495     char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
 496     char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
 497     UCharsetDetector *csd = ucsdet_open(&status);
 498     const UCharsetMatch *match;
 499     const char *name;
 500
 501     ucsdet_setText(csd, bWindows, lWindows, &status);
 502     match = ucsdet_detect(csd, &status);
 503
 504     if (match == NULL) {
 505         errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status));
 506         goto bail;
 507     }
 508
 509     name  = ucsdet_getName(match, &status);
 510
 511     if (strcmp(name, "windows-1252") != 0) {
 512         errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
 513     }
 514
 515     ucsdet_setText(csd, bISO, lISO, &status);
 516     match = ucsdet_detect(csd, &status);
 517
 518     if (match == NULL) {
 519         errln("English text without C1 bytes got no matches.");
 520         goto bail;
 521     }
 522
 523     name  = ucsdet_getName(match, &status);
 524
 525     if (strcmp(name, "ISO-8859-1") != 0) {
 526         errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
 527     }
 528
 529 bail:
 530     freeBytes(bWindows);
 531     freeBytes(bISO);
 532
 533     ucsdet_close(csd);
 534 #endif
 535 }
 536
 537 void CharsetDetectionTest::DetectionTest()
 538 {
 539 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
 540     UErrorCode status = U_ZERO_ERROR;
 541     char path[2048];
 542     const char *testFilePath = getPath(path, "csdetest.xml");
 543
 544     if (testFilePath == NULL) {
 545         return; /* Couldn't get path: error message already output. */
 546     }
 547
 548     UXMLParser  *parser = UXMLParser::createParser(status);
 549     if (U_FAILURE(status)) {
 550         dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status));
 551         return;
 552     }
 553
 554     UXMLElement *root   = parser->parseFile(testFilePath, status);
 555     if (!assertSuccess( "parseFile",status)) return;
 556
 557     UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
 558     UnicodeString id_attr   = UNICODE_STRING_SIMPLE("id");
 559     UnicodeString enc_attr  = UNICODE_STRING_SIMPLE("encodings");
 560
 561     const UXMLElement *testCase;
 562     int32_t tc = 0;
 563
 564     while((testCase = root->nextChildElement(tc)) != NULL) {
 565         if (testCase->getTagName().compare(test_case) == 0) {
 566             const UnicodeString *id = testCase->getAttribute(id_attr);
 567             const UnicodeString *encodings = testCase->getAttribute(enc_attr);
 568             const UnicodeString  text = testCase->getText(TRUE);
 569             int32_t encodingCount;
 570             UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount);
 571
 572             for(int32_t e = 0; e < encodingCount; e += 1) {
 573                 checkEncoding(text, encodingList[e], *id);
 574             }
 575
 576             delete[] encodingList;
 577         }
 578     }
 579
 580     delete root;
 581     delete parser;
 582 #endif
 583 }
 584
 585 void CharsetDetectionTest::IBM424Test()
 586 {
 587 #if !UCONFIG_ONLY_HTML_CONVERSION
 588     UErrorCode status = U_ZERO_ERROR;
 589
 590     static const UChar chars[] = {
 591             0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
 592             0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
 593             0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
 594             0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
 595             0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
 596             0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
 597             0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
 598             0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
 599             0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
 600             0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
 601             0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
 602             0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
 603             0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
 604             0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
 605             0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
 606             0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
 607             0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
 608     };
 609
 610     static const UChar chars_reverse[] = {
 611             0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
 612             0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
 613             0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
 614             0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
 615             0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
 616             0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
 617             0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
 618             0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
 619             0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
 620             0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
 621             0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
 622             0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
 623             0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
 624             0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
 625             0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
 626             0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
 627             0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
 628             0x0000
 629     };
 630
 631     int32_t bLength = 0, brLength = 0;
 632
 633     UnicodeString s1(chars);
 634     UnicodeString s2(chars_reverse);
 635
 636     char *bytes = extractBytes(s1, "IBM424", bLength);
 637     char *bytes_r = extractBytes(s2, "IBM424", brLength);
 638
 639     UCharsetDetector *csd = ucsdet_open(&status);
 640         ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
 641         ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
 642         ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
 643         ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
 644     if (U_FAILURE(status)) {
 645         errln("Error opening charset detector. - %s", u_errorName(status));
 646     }
 647     const UCharsetMatch *match;
 648     const char *name;
 649
 650     ucsdet_setText(csd, bytes, bLength, &status);
 651     match = ucsdet_detect(csd, &status);
 652
 653     if (match == NULL) {
 654         errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status));
 655         goto bail;
 656     }
 657
 658     name  = ucsdet_getName(match, &status);
 659     if (strcmp(name, "IBM424_rtl") != 0) {
 660         errln("Encoding detection failure for IBM424_rtl: got %s", name);
 661     }
 662
 663     ucsdet_setText(csd, bytes_r, brLength, &status);
 664     match = ucsdet_detect(csd, &status);
 665
 666     if (match == NULL) {
 667         errln("Encoding detection failure for IBM424_ltr: got no matches.");
 668         goto bail;
 669     }
 670
 671     name  = ucsdet_getName(match, &status);
 672     if (strcmp(name, "IBM424_ltr") != 0) {
 673         errln("Encoding detection failure for IBM424_ltr: got %s", name);
 674     }
 675
 676 bail:
 677     freeBytes(bytes);
 678     freeBytes(bytes_r);
 679     ucsdet_close(csd);
 680 #endif
 681 }
 682
 683 void CharsetDetectionTest::IBM420Test()
 684 {
 685 #if !UCONFIG_ONLY_HTML_CONVERSION
 686     UErrorCode status = U_ZERO_ERROR;
 687
 688     static const UChar chars[] = {
 689         0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
 690         0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
 691         0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
 692         0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
 693         0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
 694         0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
 695         0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
 696         0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
 697         0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
 698         0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
 699         0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
 700         0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
 701         0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
 702         0x0000
 703     };
 704     static const UChar chars_reverse[] = {
 705         0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
 706         0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
 707         0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
 708         0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
 709         0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
 710         0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
 711         0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
 712         0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
 713         0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
 714         0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
 715         0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
 716         0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
 717         0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
 718         0x0000,
 719     };
 720
 721     int32_t bLength = 0, brLength = 0;
 722
 723     UnicodeString s1(chars);
 724     UnicodeString s2(chars_reverse);
 725
 726     char *bytes = extractBytes(s1, "IBM420", bLength);
 727     char *bytes_r = extractBytes(s2, "IBM420", brLength);
 728
 729     UCharsetDetector *csd = ucsdet_open(&status);
 730     if (U_FAILURE(status)) {
 731         errln("Error opening charset detector. - %s", u_errorName(status));
 732     }
 733         ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
 734         ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
 735         ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
 736         ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
 737     const UCharsetMatch *match;
 738     const char *name;
 739
 740     ucsdet_setText(csd, bytes, bLength, &status);
 741     match = ucsdet_detect(csd, &status);
 742
 743     if (match == NULL) {
 744         errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status));
 745         goto bail;
 746     }
 747
 748     name  = ucsdet_getName(match, &status);
 749     if (strcmp(name, "IBM420_rtl") != 0) {
 750         errln("Encoding detection failure for IBM420_rtl: got %s\n", name);
 751     }
 752
 753     ucsdet_setText(csd, bytes_r, brLength, &status);
 754     match = ucsdet_detect(csd, &status);
 755
 756     if (match == NULL) {
 757         errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
 758         goto bail;
 759     }
 760
 761     name  = ucsdet_getName(match, &status);
 762     if (strcmp(name, "IBM420_ltr") != 0) {
 763         errln("Encoding detection failure for IBM420_ltr: got %s\n", name);
 764     }
 765
 766 bail:
 767     freeBytes(bytes);
 768     freeBytes(bytes_r);
 769     ucsdet_close(csd);
 770 #endif
 771 }
 772
 773
 774 void CharsetDetectionTest::Ticket6394Test() {
 775 #if !UCONFIG_NO_CONVERSION
 776     const char charText[] =  "Here is some random English text that should be detected as ISO-8859-1."
 777                              "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
 778                              "encodings more than once.  The hop through UnicodeString is for platforms "
 779                              "where this char * string is be EBCDIC and needs conversion to Latin1.";
 780     char latin1Text[sizeof(charText)];
 781     UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");
 782
 783     UErrorCode status = U_ZERO_ERROR;
 784     UCharsetDetector *csd = ucsdet_open(&status);
 785     ucsdet_setText(csd, latin1Text, -1, &status);
 786     if (U_FAILURE(status)) {
 787         errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
 788         return;
 789     }
 790
 791     int32_t matchCount = 0;
 792     const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
 793     if (U_FAILURE(status)) {
 794         errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
 795         return;
 796     }
 797
 798     UnicodeSet  setOfCharsetNames;    // UnicodSets can hold strings.
 799     int32_t i;
 800     for (i=0; i<matchCount; i++) {
 801         UnicodeString charSetName(ucsdet_getName(matches[i], &status));
 802         if (U_FAILURE(status)) {
 803             errln("Fail at file %s, line %d.  status = %s;  i=%d", __FILE__, __LINE__, u_errorName(status), i);
 804             status = U_ZERO_ERROR;
 805         }
 806         if (setOfCharsetNames.contains(charSetName)) {
 807             errln("Fail at file %s, line %d ", __FILE__, __LINE__);
 808             errln(UnicodeString("   Duplicate charset name = ") + charSetName);
 809         }
 810         setOfCharsetNames.add(charSetName);
 811     }
 812     ucsdet_close(csd);
 813 #endif
 814 }
 815
 816
 817 // Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between
 818 //               similar Windows and non-Windows SBCS encodings. State was kept in the shared
 819 //               Charset Recognizer objects, and could be overwritten.
 820 void CharsetDetectionTest::Ticket6954Test() {
 821 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_FORMATTING
 822     UErrorCode status = U_ZERO_ERROR;
 823     UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
 824     UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly."
 825                             "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
 826     UnicodeString sWindows  = ssWindows.unescape();
 827     int32_t lISO = 0, lWindows = 0;
 828     char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
 829     char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
 830
 831     // First do a plain vanilla detect of 1252 text
 832
 833     UCharsetDetector *csd1 = ucsdet_open(&status);
 834     ucsdet_setText(csd1, bWindows, lWindows, &status);
 835     const UCharsetMatch *match1 = ucsdet_detect(csd1, &status);
 836     const char *name1 = ucsdet_getName(match1, &status);
 837     TEST_ASSERT_SUCCESS(status);
 838     TEST_ASSERT(strcmp(name1, "windows-1252")==0);
 839
 840     // Next, using a completely separate detector, detect some 8859-1 text
 841
 842     UCharsetDetector *csd2 = ucsdet_open(&status);
 843     ucsdet_setText(csd2, bISO, lISO, &status);
 844     const UCharsetMatch *match2 = ucsdet_detect(csd2, &status);
 845     const char *name2 = ucsdet_getName(match2, &status);
 846     TEST_ASSERT_SUCCESS(status);
 847     TEST_ASSERT(strcmp(name2, "ISO-8859-1")==0);
 848
 849     // Recheck the 1252 results from the first detector, which should not have been
 850     //  altered by the use of a different detector.
 851
 852     name1 = ucsdet_getName(match1, &status);
 853     TEST_ASSERT_SUCCESS(status);
 854     TEST_ASSERT(strcmp(name1, "windows-1252")==0);
 855
 856     ucsdet_close(csd1);
 857     ucsdet_close(csd2);
 858     freeBytes(bISO);
 859     freeBytes(bWindows);
 860 #endif
 861 }