icuSources/test/intltest/csdetest.cpp

   1 /*
   2  **********************************************************************
   3  *   Copyright (C) 2005-2012, International Business Machines
   4  *   Corporation and others.  All Rights Reserved.
   5  **********************************************************************
   6  */
   7
   8
   9 #include "unicode/utypes.h"
  10 #include "unicode/ucsdet.h"
  11 #include "unicode/ucnv.h"
  12 #include "unicode/unistr.h"
  13 #include "unicode/putil.h"
  14 #include "unicode/uniset.h"
  15
  16 #include "intltest.h"
  17 #include "csdetest.h"
  18
  19 #include "xmlparser.h"
  20
  21 #include <stdlib.h>
  22 #include <string.h>
  23
  24 #ifdef DEBUG_DETECT
  25 #include <stdio.h>
  26 #endif
  27
  28 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
  29
  30 #define NEW_ARRAY(type,count) (type *) /*uprv_*/malloc((count) * sizeof(type))
  31 #define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))
  32
  33 #define CH_SPACE 0x0020
  34 #define CH_SLASH 0x002F
  35
  36 #define TEST_ASSERT(x) {if (!(x)) { \
  37     errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
  38
  39 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
  40     errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));\
  41     return;}}
  42
  43
  44 //---------------------------------------------------------------------------
  45 //
  46 //  Test class boilerplate
  47 //
  48 //---------------------------------------------------------------------------
  49 CharsetDetectionTest::CharsetDetectionTest()
  50 {
  51 }
  52
  53
  54 CharsetDetectionTest::~CharsetDetectionTest()
  55 {
  56 }
  57
  58
  59
  60 void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
  61 {
  62     if (exec) logln("TestSuite CharsetDetectionTest: ");
  63     switch (index) {
  64        case 0: name = "ConstructionTest";
  65             if (exec) ConstructionTest();
  66             break;
  67
  68        case 1: name = "UTF8Test";
  69             if (exec) UTF8Test();
  70             break;
  71
  72        case 2: name = "UTF16Test";
  73             if (exec) UTF16Test();
  74             break;
  75
  76        case 3: name = "C1BytesTest";
  77             if (exec) C1BytesTest();
  78             break;
  79
  80        case 4: name = "InputFilterTest";
  81             if (exec) InputFilterTest();
  82             break;
  83
  84        case 5: name = "DetectionTest";
  85             if (exec) DetectionTest();
  86             break;
  87 #if !UCONFIG_NO_LEGACY_CONVERSION
  88        case 6: name = "IBM424Test";
  89             if (exec) IBM424Test();
  90             break;
  91
  92        case 7: name = "IBM420Test";
  93             if (exec) IBM420Test();
  94             break;
  95 #else
  96        case 6:
  97        case 7: name = "skip"; break;
  98 #endif
  99        case 8: name = "Ticket6394Test";
 100             if (exec) Ticket6394Test();
 101             break;
 102
 103        case 9: name = "Ticket6954Test";
 104             if (exec) Ticket6954Test();
 105             break;
 106
 107         default: name = "";
 108             break; //needed to end loop
 109     }
 110 }
 111
 112 static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
 113 {
 114     int32_t offset = -1;
 115
 116     splits = 1;
 117     while((offset = src.indexOf(ch, offset + 1)) >= 0) {
 118         splits += 1;
 119     }
 120
 121     UnicodeString *result = new UnicodeString[splits];
 122
 123     int32_t start = 0;
 124     int32_t split = 0;
 125     int32_t end;
 126
 127     while((end = src.indexOf(ch, start)) >= 0) {
 128         src.extractBetween(start, end, result[split++]);
 129         start = end + 1;
 130     }
 131
 132     src.extractBetween(start, src.length(), result[split]);
 133
 134     return result;
 135 }
 136
 137 static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length)
 138 {
 139     int32_t sLength = source.length();
 140     char *bytes = NULL;
 141
 142     length = source.extract(0, sLength, NULL, codepage);
 143
 144     if (length > 0) {
 145         bytes = NEW_ARRAY(char, length + 1);
 146         source.extract(0, sLength, bytes, codepage);
 147     }
 148
 149     return bytes;
 150 }
 151
 152 static void freeBytes(char *bytes)
 153 {
 154     DELETE_ARRAY(bytes);
 155 }
 156
 157 void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
 158 {
 159     int32_t splits = 0;
 160     int32_t testLength = testString.length();
 161     UnicodeString *eSplit = split(encoding, CH_SLASH, splits);
 162     UErrorCode status = U_ZERO_ERROR;
 163     int32_t cpLength = eSplit[0].length();
 164     char codepage[64];
 165
 166     u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
 167     codepage[cpLength] = '\0';
 168
 169     LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
 170
 171     int32_t byteLength = 0;
 172     char *bytes = extractBytes(testString, codepage, byteLength);
 173
 174     if (bytes == NULL) {
 175 #if !UCONFIG_NO_LEGACY_CONVERSION
 176         dataerrln("Can't open a " + encoding + " converter for " + id);
 177 #endif
 178         return;
 179     }
 180
 181     ucsdet_setText(csd.getAlias(), bytes, byteLength, &status);
 182
 183     int32_t matchCount = 0;
 184     const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status);
 185
 186
 187     UnicodeString name(ucsdet_getName(matches[0], &status));
 188     UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
 189     UChar *decoded = NULL;
 190     int32_t dLength = 0;
 191
 192     if (matchCount == 0) {
 193         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
 194         goto bail;
 195     }
 196
 197     if (name.compare(eSplit[0]) != 0) {
 198         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
 199
 200 #ifdef DEBUG_DETECT
 201         for (int32_t m = 0; m < matchCount; m += 1) {
 202             const char *name = ucsdet_getName(matches[m], &status);
 203             const char *lang = ucsdet_getLanguage(matches[m], &status);
 204             int32_t confidence = ucsdet_getConfidence(matches[m], &status);
 205
 206             printf("%s (%s) %d\n", name, lang, confidence);
 207         }
 208 #endif
 209         goto bail;
 210     }
 211
 212     if (splits > 1 && lang.compare(eSplit[1]) != 0) {
 213         errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
 214         goto bail;
 215     }
 216
 217     decoded = NEW_ARRAY(UChar, testLength);
 218     dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);
 219
 220     if (testString.compare(decoded, dLength) != 0) {
 221         errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");
 222
 223 #ifdef DEBUG_DETECT
 224         for(int32_t i = 0; i < testLength; i += 1) {
 225             if(testString[i] != decoded[i]) {
 226                 printf("Strings differ at byte %d\n", i);
 227                 break;
 228             }
 229         }
 230 #endif
 231
 232     }
 233
 234     DELETE_ARRAY(decoded);
 235
 236 bail:
 237     freeBytes(bytes);
 238     delete[] eSplit;
 239 }
 240
 241 const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) {
 242     UErrorCode status = U_ZERO_ERROR;
 243     const char *testDataDirectory = IntlTest::getSourceTestData(status);
 244
 245     if (U_FAILURE(status)) {
 246         errln("ERROR: getPath() failed - %s", u_errorName(status));
 247         return NULL;
 248     }
 249
 250     strcpy(buffer, testDataDirectory);
 251     strcat(buffer, filename);
 252     return buffer;
 253 }
 254
 255 void CharsetDetectionTest::ConstructionTest()
 256 {
 257     IcuTestErrorCode status(*this, "ConstructionTest");
 258     LocalUCharsetDetectorPointer csd(ucsdet_open(status));
 259     LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status));
 260     int32_t count = uenum_count(e.getAlias(), status);
 261
 262 #ifdef DEBUG_DETECT
 263     printf("There are %d recognizers.\n", count);
 264 #endif
 265
 266     for(int32_t i = 0; i < count; i += 1) {
 267         int32_t length;
 268         const char *name = uenum_next(e.getAlias(), &length, status);
 269
 270         if(name == NULL || length <= 0) {
 271             errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
 272         }
 273
 274 #ifdef DEBUG_DETECT
 275         printf("%s\n", name);
 276 #endif
 277     }
 278 }
 279
 280 void CharsetDetectionTest::UTF8Test()
 281 {
 282     UErrorCode status = U_ZERO_ERROR;
 283     UnicodeString ss = "This is a string with some non-ascii characters that will "
 284                        "be converted to UTF-8, then shoved through the detection process.  "
 285                        "\\u0391\\u0392\\u0393\\u0394\\u0395"
 286                        "Sure would be nice if our source could contain Unicode directly!";
 287     UnicodeString s = ss.unescape();
 288     int32_t byteLength = 0, sLength = s.length();
 289     char *bytes = extractBytes(s, "UTF-8", byteLength);
 290     UCharsetDetector *csd = ucsdet_open(&status);
 291     const UCharsetMatch *match;
 292     UChar *detected = NEW_ARRAY(UChar, sLength);
 293
 294     ucsdet_setText(csd, bytes, byteLength, &status);
 295     match = ucsdet_detect(csd, &status);
 296
 297     if (match == NULL) {
 298         errln("Detection failure for UTF-8: got no matches.");
 299         goto bail;
 300     }
 301
 302     ucsdet_getUChars(match, detected, sLength, &status);
 303
 304     if (s.compare(detected, sLength) != 0) {
 305         errln("Round-trip test failed!");
 306     }
 307
 308     ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
 309
 310 bail:
 311     DELETE_ARRAY(detected);
 312     freeBytes(bytes);
 313     ucsdet_close(csd);
 314 }
 315
 316 void CharsetDetectionTest::UTF16Test()
 317 {
 318     UErrorCode status = U_ZERO_ERROR;
 319     /* Notice the BOM on the start of this string */
 320     UChar chars[] = {
 321         0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
 322         0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
 323         0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
 324         0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
 325         0x064a, 0x062a, 0x0000};
 326     UnicodeString s(chars);
 327     int32_t beLength = 0, leLength = 0;
 328     char *beBytes = extractBytes(s, "UTF-16BE", beLength);
 329     char *leBytes = extractBytes(s, "UTF-16LE", leLength);
 330     UCharsetDetector *csd = ucsdet_open(&status);
 331     const UCharsetMatch *match;
 332     const char *name;
 333     int32_t conf;
 334
 335     ucsdet_setText(csd, beBytes, beLength, &status);
 336     match = ucsdet_detect(csd, &status);
 337
 338     if (match == NULL) {
 339         errln("Encoding detection failure for UTF-16BE: got no matches.");
 340         goto try_le;
 341     }
 342
 343     name  = ucsdet_getName(match, &status);
 344     conf  = ucsdet_getConfidence(match, &status);
 345
 346     if (strcmp(name, "UTF-16BE") != 0) {
 347         errln("Encoding detection failure for UTF-16BE: got %s", name);
 348         goto try_le; // no point in looking at confidence if we got the wrong character set.
 349     }
 350
 351     if (conf != 100) {
 352         errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
 353     }
 354
 355 try_le:
 356     ucsdet_setText(csd, leBytes, leLength, &status);
 357     match = ucsdet_detect(csd, &status);
 358
 359     if (match == NULL) {
 360         errln("Encoding detection failure for UTF-16LE: got no matches.");
 361         goto bail;
 362     }
 363
 364     name  = ucsdet_getName(match, &status);
 365     conf = ucsdet_getConfidence(match, &status);
 366
 367
 368     if (strcmp(name, "UTF-16LE") != 0) {
 369         errln("Enconding detection failure for UTF-16LE: got %s", name);
 370         goto bail; // no point in looking at confidence if we got the wrong character set.
 371     }
 372
 373     if (conf != 100) {
 374         errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
 375     }
 376
 377 bail:
 378     freeBytes(leBytes);
 379     freeBytes(beBytes);
 380     ucsdet_close(csd);
 381 }
 382
 383 void CharsetDetectionTest::InputFilterTest()
 384 {
 385     UErrorCode status = U_ZERO_ERROR;
 386     UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
 387     UnicodeString s  = ss.unescape();
 388     int32_t byteLength = 0;
 389     char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
 390     UCharsetDetector *csd = ucsdet_open(&status);
 391     const UCharsetMatch *match;
 392     const char *lang, *name;
 393
 394     ucsdet_enableInputFilter(csd, TRUE);
 395
 396     if (!ucsdet_isInputFilterEnabled(csd)) {
 397         errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
 398     }
 399
 400
 401     ucsdet_setText(csd, bytes, byteLength, &status);
 402     match = ucsdet_detect(csd, &status);
 403
 404     if (match == NULL) {
 405         errln("Turning on the input filter resulted in no matches.");
 406         goto turn_off;
 407     }
 408
 409     name = ucsdet_getName(match, &status);
 410
 411     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
 412         errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
 413     } else {
 414         lang = ucsdet_getLanguage(match, &status);
 415
 416         if (lang == NULL || strcmp(lang, "fr") != 0) {
 417             errln("Input filter did not strip markup!");
 418         }
 419     }
 420
 421 turn_off:
 422     ucsdet_enableInputFilter(csd, FALSE);
 423     ucsdet_setText(csd, bytes, byteLength, &status);
 424     match = ucsdet_detect(csd, &status);
 425
 426     if (match == NULL) {
 427         errln("Turning off the input filter resulted in no matches.");
 428         goto bail;
 429     }
 430
 431     name = ucsdet_getName(match, &status);
 432
 433     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
 434         errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
 435     } else {
 436         lang = ucsdet_getLanguage(match, &status);
 437
 438         if (lang == NULL || strcmp(lang, "en") != 0) {
 439             errln("Unfiltered input did not detect as English!");
 440         }
 441     }
 442
 443 bail:
 444     freeBytes(bytes);
 445     ucsdet_close(csd);
 446 }
 447
 448 void CharsetDetectionTest::C1BytesTest()
 449 {
 450 #if !UCONFIG_NO_LEGACY_CONVERSION
 451     UErrorCode status = U_ZERO_ERROR;
 452     UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
 453     UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
 454     UnicodeString sWindows  = ssWindows.unescape();
 455     int32_t lISO = 0, lWindows = 0;
 456     char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
 457     char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
 458     UCharsetDetector *csd = ucsdet_open(&status);
 459     const UCharsetMatch *match;
 460     const char *name;
 461
 462     ucsdet_setText(csd, bWindows, lWindows, &status);
 463     match = ucsdet_detect(csd, &status);
 464
 465     if (match == NULL) {
 466         errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status));
 467         goto bail;
 468     }
 469
 470     name  = ucsdet_getName(match, &status);
 471
 472     if (strcmp(name, "windows-1252") != 0) {
 473         errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
 474     }
 475
 476     ucsdet_setText(csd, bISO, lISO, &status);
 477     match = ucsdet_detect(csd, &status);
 478
 479     if (match == NULL) {
 480         errln("English text without C1 bytes got no matches.");
 481         goto bail;
 482     }
 483
 484     name  = ucsdet_getName(match, &status);
 485
 486     if (strcmp(name, "ISO-8859-1") != 0) {
 487         errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
 488     }
 489
 490 bail:
 491     freeBytes(bWindows);
 492     freeBytes(bISO);
 493
 494     ucsdet_close(csd);
 495 #endif
 496 }
 497
 498 void CharsetDetectionTest::DetectionTest()
 499 {
 500 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
 501     UErrorCode status = U_ZERO_ERROR;
 502     char path[2048];
 503     const char *testFilePath = getPath(path, "csdetest.xml");
 504
 505     if (testFilePath == NULL) {
 506         return; /* Couldn't get path: error message already output. */
 507     }
 508
 509     UXMLParser  *parser = UXMLParser::createParser(status);
 510     if (U_FAILURE(status)) {
 511         dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status));
 512         return;
 513     }
 514
 515     UXMLElement *root   = parser->parseFile(testFilePath, status);
 516     if (!assertSuccess( "parseFile",status)) return;
 517
 518     UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
 519     UnicodeString id_attr   = UNICODE_STRING_SIMPLE("id");
 520     UnicodeString enc_attr  = UNICODE_STRING_SIMPLE("encodings");
 521
 522     const UXMLElement *testCase;
 523     int32_t tc = 0;
 524
 525     while((testCase = root->nextChildElement(tc)) != NULL) {
 526         if (testCase->getTagName().compare(test_case) == 0) {
 527             const UnicodeString *id = testCase->getAttribute(id_attr);
 528             const UnicodeString *encodings = testCase->getAttribute(enc_attr);
 529             const UnicodeString  text = testCase->getText(TRUE);
 530             int32_t encodingCount;
 531             UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount);
 532
 533             for(int32_t e = 0; e < encodingCount; e += 1) {
 534                 checkEncoding(text, encodingList[e], *id);
 535             }
 536
 537             delete[] encodingList;
 538         }
 539     }
 540
 541     delete root;
 542     delete parser;
 543 #endif
 544 }
 545
 546 void CharsetDetectionTest::IBM424Test()
 547 {
 548     UErrorCode status = U_ZERO_ERROR;
 549
 550     static const UChar chars[] = {
 551             0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
 552             0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
 553             0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
 554             0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
 555             0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
 556             0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
 557             0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
 558             0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
 559             0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
 560             0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
 561             0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
 562             0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
 563             0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
 564             0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
 565             0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
 566             0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
 567             0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
 568     };
 569
 570     static const UChar chars_reverse[] = {
 571             0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
 572             0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
 573             0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
 574             0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
 575             0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
 576             0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
 577             0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
 578             0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
 579             0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
 580             0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
 581             0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
 582             0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
 583             0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
 584             0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
 585             0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
 586             0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
 587             0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
 588             0x0000
 589     };
 590
 591     int32_t bLength = 0, brLength = 0;
 592
 593     UnicodeString s1(chars);
 594     UnicodeString s2(chars_reverse);
 595
 596     char *bytes = extractBytes(s1, "IBM424", bLength);
 597     char *bytes_r = extractBytes(s2, "IBM424", brLength);
 598
 599     UCharsetDetector *csd = ucsdet_open(&status);
 600     if (U_FAILURE(status)) {
 601         errln("Error opening charset detector. - %s", u_errorName(status));
 602     }
 603     const UCharsetMatch *match;
 604     const char *name;
 605
 606     ucsdet_setText(csd, bytes, bLength, &status);
 607     match = ucsdet_detect(csd, &status);
 608
 609     if (match == NULL) {
 610         errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status));
 611         goto bail;
 612     }
 613
 614     name  = ucsdet_getName(match, &status);
 615     if (strcmp(name, "IBM424_rtl") != 0) {
 616         errln("Encoding detection failure for IBM424_rtl: got %s", name);
 617     }
 618
 619     ucsdet_setText(csd, bytes_r, brLength, &status);
 620     match = ucsdet_detect(csd, &status);
 621
 622     if (match == NULL) {
 623         errln("Encoding detection failure for IBM424_ltr: got no matches.");
 624         goto bail;
 625     }
 626
 627     name  = ucsdet_getName(match, &status);
 628     if (strcmp(name, "IBM424_ltr") != 0) {
 629         errln("Encoding detection failure for IBM424_ltr: got %s", name);
 630     }
 631
 632 bail:
 633     freeBytes(bytes);
 634     freeBytes(bytes_r);
 635     ucsdet_close(csd);
 636 }
 637
 638 void CharsetDetectionTest::IBM420Test()
 639 {
 640     UErrorCode status = U_ZERO_ERROR;
 641
 642     static const UChar chars[] = {
 643         0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
 644         0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
 645         0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
 646         0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
 647         0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
 648         0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
 649         0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
 650         0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
 651         0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
 652         0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
 653         0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
 654         0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
 655         0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
 656         0x0000
 657     };
 658     static const UChar chars_reverse[] = {
 659         0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
 660         0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
 661         0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
 662         0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
 663         0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
 664         0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
 665         0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
 666         0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
 667         0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
 668         0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
 669         0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
 670         0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
 671         0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
 672         0x0000,
 673     };
 674
 675     int32_t bLength = 0, brLength = 0;
 676
 677     UnicodeString s1(chars);
 678     UnicodeString s2(chars_reverse);
 679
 680     char *bytes = extractBytes(s1, "IBM420", bLength);
 681     char *bytes_r = extractBytes(s2, "IBM420", brLength);
 682
 683     UCharsetDetector *csd = ucsdet_open(&status);
 684     if (U_FAILURE(status)) {
 685         errln("Error opening charset detector. - %s", u_errorName(status));
 686     }
 687     const UCharsetMatch *match;
 688     const char *name;
 689
 690     ucsdet_setText(csd, bytes, bLength, &status);
 691     match = ucsdet_detect(csd, &status);
 692
 693     if (match == NULL) {
 694         errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status));
 695         goto bail;
 696     }
 697
 698     name  = ucsdet_getName(match, &status);
 699     if (strcmp(name, "IBM420_rtl") != 0) {
 700         errln("Encoding detection failure for IBM420_rtl: got %s\n", name);
 701     }
 702
 703     ucsdet_setText(csd, bytes_r, brLength, &status);
 704     match = ucsdet_detect(csd, &status);
 705
 706     if (match == NULL) {
 707         errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
 708         goto bail;
 709     }
 710
 711     name  = ucsdet_getName(match, &status);
 712     if (strcmp(name, "IBM420_ltr") != 0) {
 713         errln("Encoding detection failure for IBM420_ltr: got %s\n", name);
 714     }
 715
 716 bail:
 717     freeBytes(bytes);
 718     freeBytes(bytes_r);
 719     ucsdet_close(csd);
 720 }
 721
 722
 723 void CharsetDetectionTest::Ticket6394Test() {
 724 #if !UCONFIG_NO_CONVERSION
 725     const char charText[] =  "Here is some random English text that should be detected as ISO-8859-1."
 726                              "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
 727                              "encodings more than once.  The hop through UnicodeString is for platforms "
 728                              "where this char * string is be EBCDIC and needs conversion to Latin1.";
 729     char latin1Text[sizeof(charText)];
 730     UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");
 731
 732     UErrorCode status = U_ZERO_ERROR;
 733     UCharsetDetector *csd = ucsdet_open(&status);
 734     ucsdet_setText(csd, latin1Text, -1, &status);
 735     if (U_FAILURE(status)) {
 736         errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
 737         return;
 738     }
 739
 740     int32_t matchCount = 0;
 741     const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
 742     if (U_FAILURE(status)) {
 743         errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
 744         return;
 745     }
 746
 747     UnicodeSet  setOfCharsetNames;    // UnicodSets can hold strings.
 748     int32_t i;
 749     for (i=0; i<matchCount; i++) {
 750         UnicodeString charSetName(ucsdet_getName(matches[i], &status));
 751         if (U_FAILURE(status)) {
 752             errln("Fail at file %s, line %d.  status = %s;  i=%d", __FILE__, __LINE__, u_errorName(status), i);
 753             status = U_ZERO_ERROR;
 754         }
 755         if (setOfCharsetNames.contains(charSetName)) {
 756             errln("Fail at file %s, line %d ", __FILE__, __LINE__);
 757             errln(UnicodeString("   Duplicate charset name = ") + charSetName);
 758         }
 759         setOfCharsetNames.add(charSetName);
 760     }
 761     ucsdet_close(csd);
 762 #endif
 763 }
 764
 765
 766 // Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between
 767 //               similar Windows and non-Windows SBCS encodings. State was kept in the shared
 768 //               Charset Recognizer objects, and could be overwritten.
 769 void CharsetDetectionTest::Ticket6954Test() {
 770 #if !UCONFIG_NO_CONVERSION
 771     UErrorCode status = U_ZERO_ERROR;
 772     UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
 773     UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly."
 774                             "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
 775     UnicodeString sWindows  = ssWindows.unescape();
 776     int32_t lISO = 0, lWindows = 0;
 777     char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
 778     char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
 779
 780     // First do a plain vanilla detect of 1252 text
 781
 782     UCharsetDetector *csd1 = ucsdet_open(&status);
 783     ucsdet_setText(csd1, bWindows, lWindows, &status);
 784     const UCharsetMatch *match1 = ucsdet_detect(csd1, &status);
 785     const char *name1 = ucsdet_getName(match1, &status);
 786     TEST_ASSERT_SUCCESS(status);
 787     TEST_ASSERT(strcmp(name1, "windows-1252")==0);
 788
 789     // Next, using a completely separate detector, detect some 8859-1 text
 790
 791     UCharsetDetector *csd2 = ucsdet_open(&status);
 792     ucsdet_setText(csd2, bISO, lISO, &status);
 793     const UCharsetMatch *match2 = ucsdet_detect(csd2, &status);
 794     const char *name2 = ucsdet_getName(match2, &status);
 795     TEST_ASSERT_SUCCESS(status);
 796     TEST_ASSERT(strcmp(name2, "ISO-8859-1")==0);
 797
 798     // Recheck the 1252 results from the first detector, which should not have been
 799     //  altered by the use of a different detector.
 800
 801     name1 = ucsdet_getName(match1, &status);
 802     TEST_ASSERT_SUCCESS(status);
 803     TEST_ASSERT(strcmp(name1, "windows-1252")==0);
 804
 805     ucsdet_close(csd1);
 806     ucsdet_close(csd2);
 807     freeBytes(bISO);
 808     freeBytes(bWindows);
 809 #endif
 810 }