icuSources/test/cintltst/ucsdetst.c

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4  ****************************************************************************
   5  * Copyright (c) 2005-2016, International Business Machines Corporation and *
   6  * others. All Rights Reserved.                                             *
   7  ****************************************************************************
   8  */
   9
  10 #include "unicode/utypes.h"
  11
  12 #include "unicode/ucsdet.h"
  13 #include "unicode/ucnv.h"
  14 #include "unicode/ustring.h"
  15
  16 #include "cintltst.h"
  17 #include "cmemory.h"
  18
  19 #include <stdlib.h>
  20 #include <string.h>
  21
  22 #define NEW_ARRAY(type,count) (type *) malloc((count) * sizeof(type))
  23 #define DELETE_ARRAY(array) free(array)
  24
  25 static void TestConstruction(void);
  26 static void TestUTF8(void);
  27 static void TestUTF16(void);
  28 static void TestC1Bytes(void);
  29 static void TestInputFilter(void);
  30 static void TestChaining(void);
  31 static void TestBufferOverflow(void);
  32 static void TestIBM424(void);
  33 static void TestIBM420(void);
  34 #if U_PLATFORM_IS_DARWIN_BASED
  35 static void TestMailFilterCSS(void);
  36 #endif
  37
  38 void addUCsdetTest(TestNode** root);
  39
  40 void addUCsdetTest(TestNode** root)
  41 {
  42     addTest(root, &TestConstruction, "ucsdetst/TestConstruction");
  43     addTest(root, &TestUTF8, "ucsdetst/TestUTF8");
  44     addTest(root, &TestUTF16, "ucsdetst/TestUTF16");
  45     addTest(root, &TestC1Bytes, "ucsdetst/TestC1Bytes");
  46     addTest(root, &TestInputFilter, "ucsdetst/TestInputFilter");
  47     addTest(root, &TestChaining, "ucsdetst/TestErrorChaining");
  48     addTest(root, &TestBufferOverflow, "ucsdetst/TestBufferOverflow");
  49 #if !UCONFIG_NO_LEGACY_CONVERSION
  50     addTest(root, &TestIBM424, "ucsdetst/TestIBM424");
  51     addTest(root, &TestIBM420, "ucsdetst/TestIBM420");
  52 #endif
  53 #if U_PLATFORM_IS_DARWIN_BASED
  54     addTest(root, &TestMailFilterCSS, "ucsdetst/TestMailFilterCSS");
  55 #endif
  56 }
  57
  58 static int32_t preflight(const UChar *src, int32_t length, UConverter *cnv)
  59 {
  60     UErrorCode status;
  61     char buffer[1024];
  62     char *dest, *destLimit = buffer + sizeof(buffer);
  63     const UChar *srcLimit = src + length;
  64     int32_t result = 0;
  65
  66     do {
  67         dest = buffer;
  68         status = U_ZERO_ERROR;
  69         ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
  70         result += (int32_t) (dest - buffer);
  71     } while (status == U_BUFFER_OVERFLOW_ERROR);
  72
  73     return result;
  74 }
  75
  76 static char *extractBytes(const UChar *src, int32_t length, const char *codepage, int32_t *byteLength)
  77 {
  78     UErrorCode status = U_ZERO_ERROR;
  79     UConverter *cnv = ucnv_open(codepage, &status);
  80     int32_t byteCount = preflight(src, length, cnv);
  81     const UChar *srcLimit = src + length;
  82     char *bytes = NEW_ARRAY(char, byteCount + 1);
  83     char *dest = bytes, *destLimit = bytes + byteCount + 1;
  84
  85     ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
  86     ucnv_close(cnv);
  87
  88     *byteLength = byteCount;
  89     return bytes;
  90 }
  91
  92 static void freeBytes(char *bytes)
  93 {
  94     DELETE_ARRAY(bytes);
  95 }
  96
  97 static void TestConstruction(void)
  98 {
  99     UErrorCode status = U_ZERO_ERROR;
 100     UCharsetDetector *csd = ucsdet_open(&status);
 101     UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
 102     const char *name;
 103     int32_t count = uenum_count(e, &status);
 104     int32_t i, length;
 105
 106     for(i = 0; i < count; i += 1) {
 107         name = uenum_next(e, &length, &status);
 108
 109         if(name == NULL || length <= 0) {
 110             log_err("ucsdet_getAllDetectableCharsets() returned a null or empty name!\n");
 111         }
 112     }
 113     /* one past the list of all names must return NULL */
 114     name = uenum_next(e, &length, &status);
 115     if(name != NULL || length != 0 || U_FAILURE(status)) {
 116         log_err("ucsdet_getAllDetectableCharsets(past the list) returned a non-null name!\n");
 117     }
 118
 119     uenum_close(e);
 120     ucsdet_close(csd);
 121 }
 122
 123 static void TestUTF8(void)
 124 {
 125     UErrorCode status = U_ZERO_ERROR;
 126     static const char ss[] = "This is a string with some non-ascii characters that will "
 127                "be converted to UTF-8, then shoved through the detection process.  "
 128                "\\u0391\\u0392\\u0393\\u0394\\u0395"
 129                "Sure would be nice if our source could contain Unicode directly!";
 130     int32_t byteLength = 0, sLength = 0, dLength = 0;
 131     UChar s[sizeof(ss)];
 132     char *bytes;
 133     UCharsetDetector *csd = ucsdet_open(&status);
 134     const UCharsetMatch *match;
 135     UChar detected[sizeof(ss)];
 136
 137     sLength = u_unescape(ss, s, sizeof(ss));
 138     bytes = extractBytes(s, sLength, "UTF-8", &byteLength);
 139
 140     ucsdet_setText(csd, bytes, byteLength, &status);
 141     if (U_FAILURE(status)) {
 142         log_err("status is %s\n", u_errorName(status));
 143         goto bail;
 144     }
 145
 146     match = ucsdet_detect(csd, &status);
 147
 148     if (match == NULL) {
 149         log_err("Detection failure for UTF-8: got no matches.\n");
 150         goto bail;
 151     }
 152
 153     dLength = ucsdet_getUChars(match, detected, sLength, &status);
 154
 155     if (u_strCompare(detected, dLength, s, sLength, FALSE) != 0) {
 156         log_err("Round-trip test failed!\n");
 157     }
 158
 159     ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
 160
 161 bail:
 162     freeBytes(bytes);
 163     ucsdet_close(csd);
 164 }
 165
 166 static void TestUTF16(void)
 167 {
 168     UErrorCode status = U_ZERO_ERROR;
 169     /* Notice the BOM on the start of this string */
 170     static const UChar chars[] = {
 171         0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
 172         0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
 173         0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
 174         0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
 175         0x064a, 0x062a, 0x0000};
 176     int32_t beLength = 0, leLength = 0, cLength = UPRV_LENGTHOF(chars);
 177     char *beBytes = extractBytes(chars, cLength, "UTF-16BE", &beLength);
 178     char *leBytes = extractBytes(chars, cLength, "UTF-16LE", &leLength);
 179     UCharsetDetector *csd = ucsdet_open(&status);
 180     const UCharsetMatch *match;
 181     const char *name;
 182     int32_t conf;
 183
 184     ucsdet_setText(csd, beBytes, beLength, &status);
 185     match = ucsdet_detect(csd, &status);
 186
 187     if (match == NULL) {
 188         log_err("Encoding detection failure for UTF-16BE: got no matches.\n");
 189         goto try_le;
 190     }
 191
 192     name  = ucsdet_getName(match, &status);
 193     conf  = ucsdet_getConfidence(match, &status);
 194
 195     if (strcmp(name, "UTF-16BE") != 0) {
 196         log_err("Encoding detection failure for UTF-16BE: got %s\n", name);
 197     }
 198
 199     if (conf != 100) {
 200         log_err("Did not get 100%% confidence for UTF-16BE: got %d\n", conf);
 201     }
 202
 203 try_le:
 204     ucsdet_setText(csd, leBytes, leLength, &status);
 205     match = ucsdet_detect(csd, &status);
 206
 207     if (match == NULL) {
 208         log_err("Encoding detection failure for UTF-16LE: got no matches.\n");
 209         goto bail;
 210     }
 211
 212     name  = ucsdet_getName(match, &status);
 213     conf = ucsdet_getConfidence(match, &status);
 214
 215
 216     if (strcmp(name, "UTF-16LE") != 0) {
 217         log_err("Enconding detection failure for UTF-16LE: got %s\n", name);
 218     }
 219
 220     if (conf != 100) {
 221         log_err("Did not get 100%% confidence for UTF-16LE: got %d\n", conf);
 222     }
 223
 224 bail:
 225     freeBytes(leBytes);
 226     freeBytes(beBytes);
 227     ucsdet_close(csd);
 228 }
 229
 230 static void TestC1Bytes(void)
 231 {
 232 #if !UCONFIG_NO_LEGACY_CONVERSION
 233     UErrorCode status = U_ZERO_ERROR;
 234     static const char ssISO[] = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
 235     static const char ssWindows[] = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.";
 236     int32_t sISOLength = 0, sWindowsLength = 0;
 237     UChar sISO[sizeof(ssISO)];
 238     UChar sWindows[sizeof(ssWindows)];
 239     int32_t lISO = 0, lWindows = 0;
 240     char *bISO;
 241     char *bWindows;
 242     UCharsetDetector *csd = ucsdet_open(&status);
 243     const UCharsetMatch *match;
 244     const char *name;
 245
 246     sISOLength = u_unescape(ssISO, sISO, sizeof(ssISO));
 247     sWindowsLength = u_unescape(ssWindows, sWindows, sizeof(ssWindows));
 248     bISO = extractBytes(sISO, sISOLength, "ISO-8859-1", &lISO);
 249     bWindows = extractBytes(sWindows, sWindowsLength, "windows-1252", &lWindows);
 250
 251     ucsdet_setText(csd, bWindows, lWindows, &status);
 252     match = ucsdet_detect(csd, &status);
 253
 254     if (match == NULL) {
 255         log_err("English test with C1 bytes got no matches.\n");
 256         goto bail;
 257     }
 258
 259     name  = ucsdet_getName(match, &status);
 260
 261     if (strcmp(name, "windows-1252") != 0) {
 262         log_data_err("English text with C1 bytes does not detect as windows-1252, but as %s. (Are you missing data?)\n", name);
 263     }
 264
 265     ucsdet_setText(csd, bISO, lISO, &status);
 266     match = ucsdet_detect(csd, &status);
 267
 268     if (match == NULL) {
 269         log_err("English text without C1 bytes got no matches.\n");
 270         goto bail;
 271     }
 272
 273     name  = ucsdet_getName(match, &status);
 274
 275     if (strcmp(name, "ISO-8859-1") != 0) {
 276         log_err("English text without C1 bytes does not detect as ISO-8859-1, but as %s\n", name);
 277     }
 278
 279 bail:
 280     freeBytes(bWindows);
 281     freeBytes(bISO);
 282
 283     ucsdet_close(csd);
 284 #endif
 285 }
 286
 287 static void TestInputFilter(void)
 288 {
 289     UErrorCode status = U_ZERO_ERROR;
 290     static const char ss[] = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
 291     int32_t sLength = 0;
 292     UChar s[sizeof(ss)];
 293     int32_t byteLength = 0;
 294     char *bytes;
 295     UCharsetDetector *csd = ucsdet_open(&status);
 296     const UCharsetMatch *match;
 297     const char *lang, *name;
 298
 299     sLength = u_unescape(ss, s, sizeof(ss));
 300     bytes = extractBytes(s, sLength, "ISO-8859-1", &byteLength);
 301
 302     ucsdet_enableInputFilter(csd, TRUE);
 303
 304     if (!ucsdet_isInputFilterEnabled(csd)) {
 305         log_err("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!\n");
 306     }
 307
 308
 309     ucsdet_setText(csd, bytes, byteLength, &status);
 310     match = ucsdet_detect(csd, &status);
 311
 312     if (match == NULL) {
 313         log_err("Turning on the input filter resulted in no matches.\n");
 314         goto turn_off;
 315     }
 316
 317     name = ucsdet_getName(match, &status);
 318
 319     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
 320         log_err("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name);
 321     } else {
 322         lang = ucsdet_getLanguage(match, &status);
 323
 324         if (lang == NULL || strcmp(lang, "fr") != 0) {
 325             log_err("Input filter did not strip markup!\n");
 326         }
 327     }
 328
 329 turn_off:
 330     ucsdet_enableInputFilter(csd, FALSE);
 331     ucsdet_setText(csd, bytes, byteLength, &status);
 332     match = ucsdet_detect(csd, &status);
 333
 334     if (match == NULL) {
 335         log_err("Turning off the input filter resulted in no matches.\n");
 336         goto bail;
 337     }
 338
 339     name = ucsdet_getName(match, &status);
 340
 341     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
 342         log_err("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name);
 343     } else {
 344         lang = ucsdet_getLanguage(match, &status);
 345
 346         if (lang == NULL || strcmp(lang, "en") != 0) {
 347             log_err("Unfiltered input did not detect as English!\n");
 348         }
 349     }
 350
 351 bail:
 352     freeBytes(bytes);
 353     ucsdet_close(csd);
 354 }
 355
 356 static void TestChaining(void) {
 357     UErrorCode status = U_USELESS_COLLATOR_ERROR;
 358
 359     ucsdet_open(&status);
 360     ucsdet_setText(NULL, NULL, 0, &status);
 361     ucsdet_getName(NULL, &status);
 362     ucsdet_getConfidence(NULL, &status);
 363     ucsdet_getLanguage(NULL, &status);
 364     ucsdet_detect(NULL, &status);
 365     ucsdet_setDeclaredEncoding(NULL, NULL, 0, &status);
 366     ucsdet_detectAll(NULL, NULL, &status);
 367     ucsdet_getUChars(NULL, NULL, 0, &status);
 368     ucsdet_getUChars(NULL, NULL, 0, &status);
 369     ucsdet_close(NULL);
 370
 371     /* All of this code should have done nothing. */
 372     if (status != U_USELESS_COLLATOR_ERROR) {
 373         log_err("Status got changed to %s\n", u_errorName(status));
 374     }
 375 }
 376
 377 static void TestBufferOverflow(void) {
 378     UErrorCode status = U_ZERO_ERROR;
 379     static const char *testStrings[] = {
 380         "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b", /* A partial ISO-2022 shift state at the end */
 381         "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24", /* A partial ISO-2022 shift state at the end */
 382         "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28", /* A partial ISO-2022 shift state at the end */
 383         "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end with a bad one at the start */
 384         "\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end */
 385         "\xa1", /* Could be a single byte shift-jis at the end */
 386         "\x74\x68\xa1", /* Could be a single byte shift-jis at the end */
 387         "\x74\x68\x65\xa1" /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
 388     };
 389     static const char *testResults[] = {
 390         "windows-1252",
 391         "windows-1252",
 392         "windows-1252",
 393         "windows-1252",
 394         "ISO-2022-JP",
 395         NULL,
 396         NULL,
 397         "ISO-8859-1"
 398     };
 399     int32_t idx = 0;
 400     UCharsetDetector *csd = ucsdet_open(&status);
 401     const UCharsetMatch *match;
 402
 403     ucsdet_setDeclaredEncoding(csd, "ISO-2022-JP", -1, &status);
 404
 405     if (U_FAILURE(status)) {
 406         log_err("Couldn't open detector. %s\n", u_errorName(status));
 407         goto bail;
 408     }
 409
 410     for (idx = 0; idx < UPRV_LENGTHOF(testStrings); idx++) {
 411         ucsdet_setText(csd, testStrings[idx], -1, &status);
 412         match = ucsdet_detect(csd, &status);
 413
 414         if (match == NULL) {
 415             if (testResults[idx] != NULL) {
 416                 log_err("Unexpectedly got no results at index %d.\n", idx);
 417             }
 418             else {
 419                 log_verbose("Got no result as expected at index %d.\n", idx);
 420             }
 421             continue;
 422         }
 423
 424         if (testResults[idx] == NULL || strcmp(ucsdet_getName(match, &status), testResults[idx]) != 0) {
 425             log_err("Unexpectedly got %s instead of %s at index %d with confidence %d.\n",
 426                 ucsdet_getName(match, &status), testResults[idx], idx, ucsdet_getConfidence(match, &status));
 427             goto bail;
 428         }
 429     }
 430
 431 bail:
 432     ucsdet_close(csd);
 433 }
 434
 435 static void TestIBM424(void)
 436 {
 437     UErrorCode status = U_ZERO_ERROR;
 438
 439     static const UChar chars[] = {
 440             0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
 441             0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
 442             0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
 443             0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
 444             0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
 445             0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
 446             0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
 447             0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
 448             0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
 449             0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
 450             0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
 451             0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
 452             0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
 453             0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
 454             0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
 455             0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
 456             0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
 457     };
 458
 459     static const UChar chars_reverse[] = {
 460             0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
 461             0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
 462             0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
 463             0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
 464             0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
 465             0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
 466             0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
 467             0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
 468             0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
 469             0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
 470             0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
 471             0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
 472             0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
 473             0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
 474             0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
 475             0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
 476             0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
 477             0x0000
 478     };
 479
 480     int32_t bLength = 0, brLength = 0, cLength = UPRV_LENGTHOF(chars), crLength = UPRV_LENGTHOF(chars_reverse);
 481
 482     char *bytes = extractBytes(chars, cLength, "IBM424", &bLength);
 483     char *bytes_r = extractBytes(chars_reverse, crLength, "IBM424", &brLength);
 484
 485     UCharsetDetector *csd = ucsdet_open(&status);
 486     const UCharsetMatch *match;
 487     const char *name;
 488
 489     ucsdet_setText(csd, bytes, bLength, &status);
 490     match = ucsdet_detect(csd, &status);
 491
 492     if (match == NULL) {
 493         log_err("Encoding detection failure for IBM424_rtl: got no matches.\n");
 494         goto bail;
 495     }
 496
 497     name  = ucsdet_getName(match, &status);
 498     if (strcmp(name, "IBM424_rtl") != 0) {
 499         log_data_err("Encoding detection failure for IBM424_rtl: got %s. (Are you missing data?)\n", name);
 500     }
 501
 502     ucsdet_setText(csd, bytes_r, brLength, &status);
 503     match = ucsdet_detect(csd, &status);
 504
 505     if (match == NULL) {
 506         log_err("Encoding detection failure for IBM424_ltr: got no matches.\n");
 507         goto bail;
 508     }
 509
 510     name  = ucsdet_getName(match, &status);
 511     if (strcmp(name, "IBM424_ltr") != 0) {
 512         log_data_err("Encoding detection failure for IBM424_ltr: got %s. (Are you missing data?)\n", name);
 513     }
 514
 515 bail:
 516     freeBytes(bytes);
 517     freeBytes(bytes_r);
 518     ucsdet_close(csd);
 519 }
 520
 521 static void TestIBM420(void)
 522 {
 523     UErrorCode status = U_ZERO_ERROR;
 524
 525     static const UChar chars[] = {
 526         0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
 527         0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
 528         0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
 529         0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
 530         0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
 531         0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
 532         0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
 533         0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
 534         0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
 535         0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
 536         0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
 537         0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
 538         0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
 539         0x0000
 540     };
 541     static const UChar chars_reverse[] = {
 542         0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
 543         0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
 544         0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
 545         0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
 546         0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
 547         0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
 548         0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
 549         0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
 550         0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
 551         0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
 552         0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
 553         0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
 554         0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
 555         0x0000,
 556     };
 557
 558     int32_t bLength = 0, brLength = 0, cLength = UPRV_LENGTHOF(chars), crLength = UPRV_LENGTHOF(chars_reverse);
 559
 560     char *bytes = extractBytes(chars, cLength, "IBM420", &bLength);
 561     char *bytes_r = extractBytes(chars_reverse, crLength, "IBM420", &brLength);
 562
 563     UCharsetDetector *csd = ucsdet_open(&status);
 564     const UCharsetMatch *match;
 565     const char *name;
 566
 567     ucsdet_setText(csd, bytes, bLength, &status);
 568     match = ucsdet_detect(csd, &status);
 569
 570     if (match == NULL) {
 571         log_err("Encoding detection failure for IBM420_rtl: got no matches.\n");
 572         goto bail;
 573     }
 574
 575     name  = ucsdet_getName(match, &status);
 576     if (strcmp(name, "IBM420_rtl") != 0) {
 577         log_data_err("Encoding detection failure for IBM420_rtl: got %s. (Are you missing data?)\n", name);
 578     }
 579
 580     ucsdet_setText(csd, bytes_r, brLength, &status);
 581     match = ucsdet_detect(csd, &status);
 582
 583     if (match == NULL) {
 584         log_err("Encoding detection failure for IBM420_ltr: got no matches.\n");
 585         goto bail;
 586     }
 587
 588     name  = ucsdet_getName(match, &status);
 589     if (strcmp(name, "IBM420_ltr") != 0) {
 590         log_data_err("Encoding detection failure for IBM420_ltr: got %s. (Are you missing data?)\n", name);
 591     }
 592
 593 bail:
 594     freeBytes(bytes);
 595     freeBytes(bytes_r);
 596     ucsdet_close(csd);
 597 }
 598
 599 #if U_PLATFORM_IS_DARWIN_BASED
 600 #include <stdio.h>
 601 // read data from file into a malloc'ed buf, which must be freed by caller.
 602 // returns NULL if error. Copied from cbiapts.c
 603 static void* dataBufFromFile(const char* path, long* dataBufSizeP) {
 604     FILE * dataFile;
 605     void * dataBuf;
 606     long dataBufSize, dataFileRead = 0;
 607
 608     if (dataBufSizeP) {
 609         *dataBufSizeP = 0;
 610     }
 611     dataFile = fopen(path, "r");
 612     if (dataFile == NULL) {
 613         log_data_err("FAIL: for %s, fopen fails\n", path);
 614         return NULL;
 615     }
 616     fseek(dataFile, 0, SEEK_END);
 617     dataBufSize = ftell(dataFile);
 618     rewind(dataFile);
 619
 620     dataBuf = uprv_malloc(dataBufSize);
 621     if (dataBuf != NULL) {
 622         dataFileRead = fread(dataBuf, 1, dataBufSize, dataFile);
 623     }
 624     fclose(dataFile);
 625     if (dataBuf == NULL) {
 626         log_data_err("FAIL: for %s, uprv_malloc fails for dataBuf[%ld]\n", path, dataBufSize);
 627         return NULL;
 628     }
 629     if (dataFileRead < dataBufSize) {
 630         log_data_err("FAIL: for %s, fread fails, read %ld of %ld\n", path, dataFileRead, dataBufSize);
 631         uprv_free(dataBuf);
 632         return NULL;
 633     }
 634     if (dataBufSizeP) {
 635         *dataBufSizeP = dataBufSize;
 636     }
 637     return dataBuf;
 638 }
 639
 640 typedef struct {
 641     const char* sampleTextPath; // relative to cintltst directory
 642     const char* encodingName;   // expected
 643 } SampleTextAndEncoding;
 644
 645 static const SampleTextAndEncoding mailSampleTests[] = {
 646     { "../testdata/encodingSamples/mailExample_Latin1_2.txt", "iso-8859-1" },
 647     { "../testdata/encodingSamples/mailExample_Latin1_3.txt", "iso-8859-1" },
 648     { "../testdata/encodingSamples/mailExample_Latin1_4.txt", "iso-8859-1" },
 649     { "../testdata/encodingSamples/mailExample_Latin1_6.txt", "iso-8859-1" },
 650     { "../testdata/encodingSamples/mailExample_Latin1_7.txt", "iso-8859-1" },
 651     { "../testdata/encodingSamples/mailExample_Latin1_8.txt", "iso-8859-1" },
 652     { "../testdata/encodingSamples/mailExample_Latin1_9.txt", "iso-8859-1" },
 653     { "../testdata/encodingSamples/mailExample_Latin1Esc_2.txt", "iso-8859-1" },
 654     { "../testdata/encodingSamples/mailExample_Latin1Esc_3.txt", "iso-8859-1" },
 655     { "../testdata/encodingSamples/mailExample_Latin1Esc_4.txt", "iso-8859-1" },
 656     { "../testdata/encodingSamples/mailExample_Latin1Esc_6.txt", "iso-8859-1" },
 657     { "../testdata/encodingSamples/mailExample_Latin1Esc_7.txt", "iso-8859-1" },
 658     { "../testdata/encodingSamples/mailExample_Latin1Esc_8.txt", "iso-8859-1" },
 659     { "../testdata/encodingSamples/mailExample_Latin1Esc_9.txt", "iso-8859-1" },
 660     { NULL, NULL }
 661 };
 662
 663 static void TestMailFilterCSS(void) {
 664     UErrorCode status = U_ZERO_ERROR;
 665     UCharsetDetector *detector = ucsdet_open(&status);
 666     if (U_FAILURE(status)) {
 667         log_data_err("ucsdet_open fails. %s\n", u_errorName(status));
 668     } else {
 669         const SampleTextAndEncoding* testPtr;
 670         for (testPtr = mailSampleTests; testPtr->sampleTextPath != NULL; testPtr++) {
 671             long sampleTextLen;
 672             char * sampleText = (char *)dataBufFromFile(testPtr->sampleTextPath, &sampleTextLen);
 673             if (sampleText != NULL) { // dataBufFromFile reports the errors that would produce NULL
 674                 status = U_ZERO_ERROR;
 675                 ucsdet_setText(detector, sampleText, sampleTextLen, &status);
 676                 if (U_FAILURE(status)) {
 677                     log_data_err("ucsdet_setText fails for text file %s: %s\n", testPtr->sampleTextPath, u_errorName(status));
 678                 } else {
 679                     const UCharsetMatch *highestMatch = NULL;
 680                     ucsdet_enableInputFilter(detector, TRUE);
 681                     highestMatch = ucsdet_detect(detector, &status);
 682                     if (U_FAILURE(status) || highestMatch==NULL) {
 683                         log_err("ucsdet_detect fails for text file %s: %s\n", testPtr->sampleTextPath, u_errorName(status));
 684                     } else {
 685                         const char *icuName = ucsdet_getName(highestMatch, &status);
 686                         int32_t confidence = ucsdet_getConfidence(highestMatch, &status);
 687                         if (U_FAILURE(status) || icuName==NULL) {
 688                             log_err("ucsdet_getName and/or ucsdet_getConfidence fails for text file %s: %s\n", testPtr->sampleTextPath, u_errorName(status));
 689                         } else {
 690                             log_info("For text file %s: expect %s; get %s with confidence %d, text length %ld\n",
 691                                     testPtr->sampleTextPath, testPtr->encodingName, icuName, confidence, sampleTextLen);
 692                         }
 693                     }
 694                 }
 695                 uprv_free(sampleText);
 696             }
 697         }
 698         ucsdet_close(detector);
 699     }
 700 }
 701 #endif /* U_PLATFORM_IS_DARWIN_BASED */