icuSources/test/cintltst/ucsdetst.c

   1 /*
   2  ****************************************************************************
   3  * Copyright (c) 2005-2016, International Business Machines Corporation and *
   4  * others. All Rights Reserved.                                             *
   5  ****************************************************************************
   6  */
   7
   8 #include "unicode/utypes.h"
   9
  10 #include "unicode/ucsdet.h"
  11 #include "unicode/ucnv.h"
  12 #include "unicode/ustring.h"
  13
  14 #include "cintltst.h"
  15 #include "cmemory.h"
  16
  17 #include <stdlib.h>
  18 #include <string.h>
  19
  20 #define NEW_ARRAY(type,count) (type *) malloc((count) * sizeof(type))
  21 #define DELETE_ARRAY(array) free(array)
  22
  23 static void TestConstruction(void);
  24 static void TestUTF8(void);
  25 static void TestUTF16(void);
  26 static void TestC1Bytes(void);
  27 static void TestInputFilter(void);
  28 static void TestChaining(void);
  29 static void TestBufferOverflow(void);
  30 static void TestIBM424(void);
  31 static void TestIBM420(void);
  32
  33 void addUCsdetTest(TestNode** root);
  34
  35 void addUCsdetTest(TestNode** root)
  36 {
  37     addTest(root, &TestConstruction, "ucsdetst/TestConstruction");
  38     addTest(root, &TestUTF8, "ucsdetst/TestUTF8");
  39     addTest(root, &TestUTF16, "ucsdetst/TestUTF16");
  40     addTest(root, &TestC1Bytes, "ucsdetst/TestC1Bytes");
  41     addTest(root, &TestInputFilter, "ucsdetst/TestInputFilter");
  42     addTest(root, &TestChaining, "ucsdetst/TestErrorChaining");
  43     addTest(root, &TestBufferOverflow, "ucsdetst/TestBufferOverflow");
  44 #if !UCONFIG_NO_LEGACY_CONVERSION
  45     addTest(root, &TestIBM424, "ucsdetst/TestIBM424");
  46     addTest(root, &TestIBM420, "ucsdetst/TestIBM420");
  47 #endif
  48 }
  49
  50 static int32_t preflight(const UChar *src, int32_t length, UConverter *cnv)
  51 {
  52     UErrorCode status;
  53     char buffer[1024];
  54     char *dest, *destLimit = buffer + sizeof(buffer);
  55     const UChar *srcLimit = src + length;
  56     int32_t result = 0;
  57
  58     do {
  59         dest = buffer;
  60         status = U_ZERO_ERROR;
  61         ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
  62         result += (int32_t) (dest - buffer);
  63     } while (status == U_BUFFER_OVERFLOW_ERROR);
  64
  65     return result;
  66 }
  67
  68 static char *extractBytes(const UChar *src, int32_t length, const char *codepage, int32_t *byteLength)
  69 {
  70     UErrorCode status = U_ZERO_ERROR;
  71     UConverter *cnv = ucnv_open(codepage, &status);
  72     int32_t byteCount = preflight(src, length, cnv);
  73     const UChar *srcLimit = src + length;
  74     char *bytes = NEW_ARRAY(char, byteCount + 1);
  75     char *dest = bytes, *destLimit = bytes + byteCount + 1;
  76
  77     ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
  78     ucnv_close(cnv);
  79
  80     *byteLength = byteCount;
  81     return bytes;
  82 }
  83
  84 static void freeBytes(char *bytes)
  85 {
  86     DELETE_ARRAY(bytes);
  87 }
  88
  89 static void TestConstruction(void)
  90 {
  91     UErrorCode status = U_ZERO_ERROR;
  92     UCharsetDetector *csd = ucsdet_open(&status);
  93     UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
  94     const char *name;
  95     int32_t count = uenum_count(e, &status);
  96     int32_t i, length;
  97
  98     for(i = 0; i < count; i += 1) {
  99         name = uenum_next(e, &length, &status);
 100
 101         if(name == NULL || length <= 0) {
 102             log_err("ucsdet_getAllDetectableCharsets() returned a null or empty name!\n");
 103         }
 104     }
 105     /* one past the list of all names must return NULL */
 106     name = uenum_next(e, &length, &status);
 107     if(name != NULL || length != 0 || U_FAILURE(status)) {
 108         log_err("ucsdet_getAllDetectableCharsets(past the list) returned a non-null name!\n");
 109     }
 110
 111     uenum_close(e);
 112     ucsdet_close(csd);
 113 }
 114
 115 static void TestUTF8(void)
 116 {
 117     UErrorCode status = U_ZERO_ERROR;
 118     static const char ss[] = "This is a string with some non-ascii characters that will "
 119                "be converted to UTF-8, then shoved through the detection process.  "
 120                "\\u0391\\u0392\\u0393\\u0394\\u0395"
 121                "Sure would be nice if our source could contain Unicode directly!";
 122     int32_t byteLength = 0, sLength = 0, dLength = 0;
 123     UChar s[sizeof(ss)];
 124     char *bytes;
 125     UCharsetDetector *csd = ucsdet_open(&status);
 126     const UCharsetMatch *match;
 127     UChar detected[sizeof(ss)];
 128
 129     sLength = u_unescape(ss, s, sizeof(ss));
 130     bytes = extractBytes(s, sLength, "UTF-8", &byteLength);
 131
 132     ucsdet_setText(csd, bytes, byteLength, &status);
 133     if (U_FAILURE(status)) {
 134         log_err("status is %s\n", u_errorName(status));
 135         goto bail;
 136     }
 137
 138     match = ucsdet_detect(csd, &status);
 139
 140     if (match == NULL) {
 141         log_err("Detection failure for UTF-8: got no matches.\n");
 142         goto bail;
 143     }
 144
 145     dLength = ucsdet_getUChars(match, detected, sLength, &status);
 146
 147     if (u_strCompare(detected, dLength, s, sLength, FALSE) != 0) {
 148         log_err("Round-trip test failed!\n");
 149     }
 150
 151     ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
 152
 153 bail:
 154     freeBytes(bytes);
 155     ucsdet_close(csd);
 156 }
 157
 158 static void TestUTF16(void)
 159 {
 160     UErrorCode status = U_ZERO_ERROR;
 161     /* Notice the BOM on the start of this string */
 162     static const UChar chars[] = {
 163         0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
 164         0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
 165         0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
 166         0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
 167         0x064a, 0x062a, 0x0000};
 168     int32_t beLength = 0, leLength = 0, cLength = UPRV_LENGTHOF(chars);
 169     char *beBytes = extractBytes(chars, cLength, "UTF-16BE", &beLength);
 170     char *leBytes = extractBytes(chars, cLength, "UTF-16LE", &leLength);
 171     UCharsetDetector *csd = ucsdet_open(&status);
 172     const UCharsetMatch *match;
 173     const char *name;
 174     int32_t conf;
 175
 176     ucsdet_setText(csd, beBytes, beLength, &status);
 177     match = ucsdet_detect(csd, &status);
 178
 179     if (match == NULL) {
 180         log_err("Encoding detection failure for UTF-16BE: got no matches.\n");
 181         goto try_le;
 182     }
 183
 184     name  = ucsdet_getName(match, &status);
 185     conf  = ucsdet_getConfidence(match, &status);
 186
 187     if (strcmp(name, "UTF-16BE") != 0) {
 188         log_err("Encoding detection failure for UTF-16BE: got %s\n", name);
 189     }
 190
 191     if (conf != 100) {
 192         log_err("Did not get 100%% confidence for UTF-16BE: got %d\n", conf);
 193     }
 194
 195 try_le:
 196     ucsdet_setText(csd, leBytes, leLength, &status);
 197     match = ucsdet_detect(csd, &status);
 198
 199     if (match == NULL) {
 200         log_err("Encoding detection failure for UTF-16LE: got no matches.\n");
 201         goto bail;
 202     }
 203
 204     name  = ucsdet_getName(match, &status);
 205     conf = ucsdet_getConfidence(match, &status);
 206
 207
 208     if (strcmp(name, "UTF-16LE") != 0) {
 209         log_err("Enconding detection failure for UTF-16LE: got %s\n", name);
 210     }
 211
 212     if (conf != 100) {
 213         log_err("Did not get 100%% confidence for UTF-16LE: got %d\n", conf);
 214     }
 215
 216 bail:
 217     freeBytes(leBytes);
 218     freeBytes(beBytes);
 219     ucsdet_close(csd);
 220 }
 221
 222 static void TestC1Bytes(void)
 223 {
 224 #if !UCONFIG_NO_LEGACY_CONVERSION
 225     UErrorCode status = U_ZERO_ERROR;
 226     static const char ssISO[] = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
 227     static const char ssWindows[] = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.";
 228     int32_t sISOLength = 0, sWindowsLength = 0;
 229     UChar sISO[sizeof(ssISO)];
 230     UChar sWindows[sizeof(ssWindows)];
 231     int32_t lISO = 0, lWindows = 0;
 232     char *bISO;
 233     char *bWindows;
 234     UCharsetDetector *csd = ucsdet_open(&status);
 235     const UCharsetMatch *match;
 236     const char *name;
 237
 238     sISOLength = u_unescape(ssISO, sISO, sizeof(ssISO));
 239     sWindowsLength = u_unescape(ssWindows, sWindows, sizeof(ssWindows));
 240     bISO = extractBytes(sISO, sISOLength, "ISO-8859-1", &lISO);
 241     bWindows = extractBytes(sWindows, sWindowsLength, "windows-1252", &lWindows);
 242
 243     ucsdet_setText(csd, bWindows, lWindows, &status);
 244     match = ucsdet_detect(csd, &status);
 245
 246     if (match == NULL) {
 247         log_err("English test with C1 bytes got no matches.\n");
 248         goto bail;
 249     }
 250
 251     name  = ucsdet_getName(match, &status);
 252
 253     if (strcmp(name, "windows-1252") != 0) {
 254         log_data_err("English text with C1 bytes does not detect as windows-1252, but as %s. (Are you missing data?)\n", name);
 255     }
 256
 257     ucsdet_setText(csd, bISO, lISO, &status);
 258     match = ucsdet_detect(csd, &status);
 259
 260     if (match == NULL) {
 261         log_err("English text without C1 bytes got no matches.\n");
 262         goto bail;
 263     }
 264
 265     name  = ucsdet_getName(match, &status);
 266
 267     if (strcmp(name, "ISO-8859-1") != 0) {
 268         log_err("English text without C1 bytes does not detect as ISO-8859-1, but as %s\n", name);
 269     }
 270
 271 bail:
 272     freeBytes(bWindows);
 273     freeBytes(bISO);
 274
 275     ucsdet_close(csd);
 276 #endif
 277 }
 278
 279 static void TestInputFilter(void)
 280 {
 281     UErrorCode status = U_ZERO_ERROR;
 282     static const char ss[] = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
 283     int32_t sLength = 0;
 284     UChar s[sizeof(ss)];
 285     int32_t byteLength = 0;
 286     char *bytes;
 287     UCharsetDetector *csd = ucsdet_open(&status);
 288     const UCharsetMatch *match;
 289     const char *lang, *name;
 290
 291     sLength = u_unescape(ss, s, sizeof(ss));
 292     bytes = extractBytes(s, sLength, "ISO-8859-1", &byteLength);
 293
 294     ucsdet_enableInputFilter(csd, TRUE);
 295
 296     if (!ucsdet_isInputFilterEnabled(csd)) {
 297         log_err("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!\n");
 298     }
 299
 300
 301     ucsdet_setText(csd, bytes, byteLength, &status);
 302     match = ucsdet_detect(csd, &status);
 303
 304     if (match == NULL) {
 305         log_err("Turning on the input filter resulted in no matches.\n");
 306         goto turn_off;
 307     }
 308
 309     name = ucsdet_getName(match, &status);
 310
 311     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
 312         log_err("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name);
 313     } else {
 314         lang = ucsdet_getLanguage(match, &status);
 315
 316         if (lang == NULL || strcmp(lang, "fr") != 0) {
 317             log_err("Input filter did not strip markup!\n");
 318         }
 319     }
 320
 321 turn_off:
 322     ucsdet_enableInputFilter(csd, FALSE);
 323     ucsdet_setText(csd, bytes, byteLength, &status);
 324     match = ucsdet_detect(csd, &status);
 325
 326     if (match == NULL) {
 327         log_err("Turning off the input filter resulted in no matches.\n");
 328         goto bail;
 329     }
 330
 331     name = ucsdet_getName(match, &status);
 332
 333     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
 334         log_err("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name);
 335     } else {
 336         lang = ucsdet_getLanguage(match, &status);
 337
 338         if (lang == NULL || strcmp(lang, "en") != 0) {
 339             log_err("Unfiltered input did not detect as English!\n");
 340         }
 341     }
 342
 343 bail:
 344     freeBytes(bytes);
 345     ucsdet_close(csd);
 346 }
 347
 348 static void TestChaining(void) {
 349     UErrorCode status = U_USELESS_COLLATOR_ERROR;
 350
 351     ucsdet_open(&status);
 352     ucsdet_setText(NULL, NULL, 0, &status);
 353     ucsdet_getName(NULL, &status);
 354     ucsdet_getConfidence(NULL, &status);
 355     ucsdet_getLanguage(NULL, &status);
 356     ucsdet_detect(NULL, &status);
 357     ucsdet_setDeclaredEncoding(NULL, NULL, 0, &status);
 358     ucsdet_detectAll(NULL, NULL, &status);
 359     ucsdet_getUChars(NULL, NULL, 0, &status);
 360     ucsdet_getUChars(NULL, NULL, 0, &status);
 361     ucsdet_close(NULL);
 362
 363     /* All of this code should have done nothing. */
 364     if (status != U_USELESS_COLLATOR_ERROR) {
 365         log_err("Status got changed to %s\n", u_errorName(status));
 366     }
 367 }
 368
 369 static void TestBufferOverflow(void) {
 370     UErrorCode status = U_ZERO_ERROR;
 371     static const char *testStrings[] = {
 372         "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b", /* A partial ISO-2022 shift state at the end */
 373         "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24", /* A partial ISO-2022 shift state at the end */
 374         "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28", /* A partial ISO-2022 shift state at the end */
 375         "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end with a bad one at the start */
 376         "\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end */
 377         "\xa1", /* Could be a single byte shift-jis at the end */
 378         "\x74\x68\xa1", /* Could be a single byte shift-jis at the end */
 379         "\x74\x68\x65\xa1" /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
 380     };
 381     static const char *testResults[] = {
 382         "windows-1252",
 383         "windows-1252",
 384         "windows-1252",
 385         "windows-1252",
 386         "ISO-2022-JP",
 387         NULL,
 388         NULL,
 389         "ISO-8859-1"
 390     };
 391     int32_t idx = 0;
 392     UCharsetDetector *csd = ucsdet_open(&status);
 393     const UCharsetMatch *match;
 394
 395     ucsdet_setDeclaredEncoding(csd, "ISO-2022-JP", -1, &status);
 396
 397     if (U_FAILURE(status)) {
 398         log_err("Couldn't open detector. %s\n", u_errorName(status));
 399         goto bail;
 400     }
 401
 402     for (idx = 0; idx < UPRV_LENGTHOF(testStrings); idx++) {
 403         ucsdet_setText(csd, testStrings[idx], -1, &status);
 404         match = ucsdet_detect(csd, &status);
 405
 406         if (match == NULL) {
 407             if (testResults[idx] != NULL) {
 408                 log_err("Unexpectedly got no results at index %d.\n", idx);
 409             }
 410             else {
 411                 log_verbose("Got no result as expected at index %d.\n", idx);
 412             }
 413             continue;
 414         }
 415
 416         if (testResults[idx] == NULL || strcmp(ucsdet_getName(match, &status), testResults[idx]) != 0) {
 417             log_err("Unexpectedly got %s instead of %s at index %d with confidence %d.\n",
 418                 ucsdet_getName(match, &status), testResults[idx], idx, ucsdet_getConfidence(match, &status));
 419             goto bail;
 420         }
 421     }
 422
 423 bail:
 424     ucsdet_close(csd);
 425 }
 426
 427 static void TestIBM424(void)
 428 {
 429     UErrorCode status = U_ZERO_ERROR;
 430
 431     static const UChar chars[] = {
 432             0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
 433             0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
 434             0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
 435             0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
 436             0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
 437             0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
 438             0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
 439             0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
 440             0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
 441             0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
 442             0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
 443             0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
 444             0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
 445             0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
 446             0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
 447             0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
 448             0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
 449     };
 450
 451     static const UChar chars_reverse[] = {
 452             0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
 453             0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
 454             0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
 455             0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
 456             0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
 457             0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
 458             0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
 459             0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
 460             0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
 461             0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
 462             0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
 463             0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
 464             0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
 465             0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
 466             0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
 467             0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
 468             0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
 469             0x0000
 470     };
 471
 472     int32_t bLength = 0, brLength = 0, cLength = UPRV_LENGTHOF(chars), crLength = UPRV_LENGTHOF(chars_reverse);
 473
 474     char *bytes = extractBytes(chars, cLength, "IBM424", &bLength);
 475     char *bytes_r = extractBytes(chars_reverse, crLength, "IBM424", &brLength);
 476
 477     UCharsetDetector *csd = ucsdet_open(&status);
 478     const UCharsetMatch *match;
 479     const char *name;
 480
 481     ucsdet_setText(csd, bytes, bLength, &status);
 482     match = ucsdet_detect(csd, &status);
 483
 484     if (match == NULL) {
 485         log_err("Encoding detection failure for IBM424_rtl: got no matches.\n");
 486         goto bail;
 487     }
 488
 489     name  = ucsdet_getName(match, &status);
 490     if (strcmp(name, "IBM424_rtl") != 0) {
 491         log_data_err("Encoding detection failure for IBM424_rtl: got %s. (Are you missing data?)\n", name);
 492     }
 493
 494     ucsdet_setText(csd, bytes_r, brLength, &status);
 495     match = ucsdet_detect(csd, &status);
 496
 497     if (match == NULL) {
 498         log_err("Encoding detection failure for IBM424_ltr: got no matches.\n");
 499         goto bail;
 500     }
 501
 502     name  = ucsdet_getName(match, &status);
 503     if (strcmp(name, "IBM424_ltr") != 0) {
 504         log_data_err("Encoding detection failure for IBM424_ltr: got %s. (Are you missing data?)\n", name);
 505     }
 506
 507 bail:
 508     freeBytes(bytes);
 509     freeBytes(bytes_r);
 510     ucsdet_close(csd);
 511 }
 512
 513 static void TestIBM420(void)
 514 {
 515     UErrorCode status = U_ZERO_ERROR;
 516
 517     static const UChar chars[] = {
 518         0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
 519         0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
 520         0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
 521         0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
 522         0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
 523         0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
 524         0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
 525         0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
 526         0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
 527         0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
 528         0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
 529         0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
 530         0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
 531         0x0000
 532     };
 533     static const UChar chars_reverse[] = {
 534         0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
 535         0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
 536         0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
 537         0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
 538         0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
 539         0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
 540         0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
 541         0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
 542         0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
 543         0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
 544         0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
 545         0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
 546         0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
 547         0x0000,
 548     };
 549
 550     int32_t bLength = 0, brLength = 0, cLength = UPRV_LENGTHOF(chars), crLength = UPRV_LENGTHOF(chars_reverse);
 551
 552     char *bytes = extractBytes(chars, cLength, "IBM420", &bLength);
 553     char *bytes_r = extractBytes(chars_reverse, crLength, "IBM420", &brLength);
 554
 555     UCharsetDetector *csd = ucsdet_open(&status);
 556     const UCharsetMatch *match;
 557     const char *name;
 558
 559     ucsdet_setText(csd, bytes, bLength, &status);
 560     match = ucsdet_detect(csd, &status);
 561
 562     if (match == NULL) {
 563         log_err("Encoding detection failure for IBM420_rtl: got no matches.\n");
 564         goto bail;
 565     }
 566
 567     name  = ucsdet_getName(match, &status);
 568     if (strcmp(name, "IBM420_rtl") != 0) {
 569         log_data_err("Encoding detection failure for IBM420_rtl: got %s. (Are you missing data?)\n", name);
 570     }
 571
 572     ucsdet_setText(csd, bytes_r, brLength, &status);
 573     match = ucsdet_detect(csd, &status);
 574
 575     if (match == NULL) {
 576         log_err("Encoding detection failure for IBM420_ltr: got no matches.\n");
 577         goto bail;
 578     }
 579
 580     name  = ucsdet_getName(match, &status);
 581     if (strcmp(name, "IBM420_ltr") != 0) {
 582         log_data_err("Encoding detection failure for IBM420_ltr: got %s. (Are you missing data?)\n", name);
 583     }
 584
 585 bail:
 586     freeBytes(bytes);
 587     freeBytes(bytes_r);
 588     ucsdet_close(csd);
 589 }