icuSources/test/intltest/csdetest.cpp

   1 /*
   2  **********************************************************************
   3  *   Copyright (C) 2005-2008, International Business Machines
   4  *   Corporation and others.  All Rights Reserved.
   5  **********************************************************************
   6  */
   7
   8
   9 #include "unicode/utypes.h"
  10 #include "unicode/ucsdet.h"
  11 #include "unicode/ucnv.h"
  12 #include "unicode/unistr.h"
  13 #include "unicode/putil.h"
  14
  15 #include "intltest.h"
  16 #include "csdetest.h"
  17
  18 #include "xmlparser.h"
  19
  20 #include <stdlib.h>
  21 #include <string.h>
  22
  23 #ifdef DEBUG_DETECT
  24 #include <stdio.h>
  25 #endif
  26
  27 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
  28
  29 #define NEW_ARRAY(type,count) (type *) /*uprv_*/malloc((count) * sizeof(type))
  30 #define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))
  31
  32 #define CH_SPACE 0x0020
  33 #define CH_SLASH 0x002F
  34
  35 //---------------------------------------------------------------------------
  36 //
  37 //  Test class boilerplate
  38 //
  39 //---------------------------------------------------------------------------
// Constructor: the body is empty, no set-up is performed here.
CharsetDetectionTest::CharsetDetectionTest()
{
}


// Destructor: the body is empty, no tear-down is performed here.
CharsetDetectionTest::~CharsetDetectionTest()
{
}
  48
  49
  50
  51 void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
  52 {
  53     if (exec) logln("TestSuite CharsetDetectionTest: ");
  54     switch (index) {
  55        case 0: name = "ConstructionTest";
  56             if (exec) ConstructionTest();
  57             break;
  58
  59        case 1: name = "UTF8Test";
  60             if (exec) UTF8Test();
  61             break;
  62
  63        case 2: name = "UTF16Test";
  64             if (exec) UTF16Test();
  65             break;
  66
  67        case 3: name = "C1BytesTest";
  68             if (exec) C1BytesTest();
  69             break;
  70
  71        case 4: name = "InputFilterTest";
  72             if (exec) InputFilterTest();
  73             break;
  74
  75        case 5: name = "DetectionTest";
  76             if (exec) DetectionTest();
  77             break;
  78
  79         default: name = "";
  80             break; //needed to end loop
  81     }
  82 }
  83
  84 static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
  85 {
  86     int32_t offset = -1;
  87
  88     splits = 1;
  89     while((offset = src.indexOf(ch, offset + 1)) >= 0) {
  90         splits += 1;
  91     }
  92
  93     UnicodeString *result = new UnicodeString[splits];
  94
  95     int32_t start = 0;
  96     int32_t split = 0;
  97     int32_t end;
  98
  99     while((end = src.indexOf(ch, start)) >= 0) {
 100         src.extractBetween(start, end, result[split++]);
 101         start = end + 1;
 102     }
 103
 104     src.extractBetween(start, src.length(), result[split]);
 105
 106     return result;
 107 }
 108
 109 static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length)
 110 {
 111     int32_t sLength = source.length();
 112     char *bytes = NULL;
 113
 114     length = source.extract(0, sLength, NULL, codepage);
 115
 116     if (length > 0) {
 117         bytes = NEW_ARRAY(char, length + 1);
 118         source.extract(0, sLength, bytes, codepage);
 119     }
 120
 121     return bytes;
 122 }
 123
// Releases a buffer returned by extractBytes() by passing it to DELETE_ARRAY.
static void freeBytes(char *bytes)
{
    DELETE_ARRAY(bytes);
}
 128
 129 void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
 130 {
 131     int32_t splits = 0;
 132     int32_t testLength = testString.length();
 133     UnicodeString *eSplit = split(encoding, CH_SLASH, splits);
 134     UErrorCode status = U_ZERO_ERROR;
 135     int32_t cpLength = eSplit[0].length();
 136     char codepage[64];
 137
 138     u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
 139     codepage[cpLength] = '\0';
 140
 141     UCharsetDetector *csd = ucsdet_open(&status);
 142
 143     int32_t byteLength = 0;
 144     char *bytes = extractBytes(testString, codepage, byteLength);
 145
 146     if (bytes == NULL) {
 147 #if !UCONFIG_NO_LEGACY_CONVERSION
 148         errln("Can't open a " + encoding + " converter for " + id);
 149 #endif
 150         return;
 151     }
 152
 153     ucsdet_setText(csd, bytes, byteLength, &status);
 154
 155     int32_t matchCount = 0;
 156     const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
 157
 158
 159     UnicodeString name(ucsdet_getName(matches[0], &status));
 160     UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
 161     UChar *decoded = NULL;
 162     int32_t dLength = 0;
 163
 164     if (matchCount == 0) {
 165         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
 166         goto bail;
 167     }
 168
 169     if (name.compare(eSplit[0]) != 0) {
 170         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
 171
 172 #ifdef DEBUG_DETECT
 173         for (int32_t m = 0; m < matchCount; m += 1) {
 174             const char *name = ucsdet_getName(matches[m], &status);
 175             const char *lang = ucsdet_getLanguage(matches[m], &status);
 176             int32_t confidence = ucsdet_getConfidence(matches[m], &status);
 177
 178             printf("%s (%s) %d\n", name, lang, confidence);
 179         }
 180 #endif
 181         goto bail;
 182     }
 183
 184     if (splits > 1 && lang.compare(eSplit[1]) != 0) {
 185         errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
 186         goto bail;
 187     }
 188
 189     decoded = NEW_ARRAY(UChar, testLength);
 190     dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);
 191
 192     if (testString.compare(decoded, dLength) != 0) {
 193         errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");
 194
 195 #ifdef DEBUG_DETECT
 196         for(int32_t i = 0; i < testLength; i += 1) {
 197             if(testString[i] != decoded[i]) {
 198                 printf("Strings differ at byte %d\n", i);
 199                 break;
 200             }
 201         }
 202 #endif
 203
 204     }
 205
 206     DELETE_ARRAY(decoded);
 207
 208 bail:
 209     freeBytes(bytes);
 210     ucsdet_close(csd);
 211     delete[] eSplit;
 212 }
 213
 214 const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) {
 215     UErrorCode status = U_ZERO_ERROR;
 216     const char *testDataDirectory = IntlTest::getSourceTestData(status);
 217
 218     if (U_FAILURE(status)) {
 219         errln("ERROR: getPath() failed - %s", u_errorName(status));
 220         return NULL;
 221     }
 222
 223     strcpy(buffer, testDataDirectory);
 224     strcat(buffer, filename);
 225     return buffer;
 226 }
 227
 228 void CharsetDetectionTest::ConstructionTest()
 229 {
 230     UErrorCode status = U_ZERO_ERROR;
 231     UCharsetDetector *csd = ucsdet_open(&status);
 232     UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
 233     int32_t count = uenum_count(e, &status);
 234
 235 #ifdef DEBUG_DETECT
 236     printf("There are %d recognizers.\n", count);
 237 #endif
 238
 239     for(int32_t i = 0; i < count; i += 1) {
 240         int32_t length;
 241         const char *name = uenum_next(e, &length, &status);
 242
 243         if(name == NULL || length <= 0) {
 244             errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
 245         }
 246
 247 #ifdef DEBUG_DETECT
 248         printf("%s\n", name);
 249 #endif
 250     }
 251
 252     uenum_close(e);
 253     ucsdet_close(csd);
 254 }
 255
 256 void CharsetDetectionTest::UTF8Test()
 257 {
 258     UErrorCode status = U_ZERO_ERROR;
 259     UnicodeString ss = "This is a string with some non-ascii characters that will "
 260                        "be converted to UTF-8, then shoved through the detection process.  "
 261                        "\\u0391\\u0392\\u0393\\u0394\\u0395"
 262                        "Sure would be nice if our source could contain Unicode directly!";
 263     UnicodeString s = ss.unescape();
 264     int32_t byteLength = 0, sLength = s.length();
 265     char *bytes = extractBytes(s, "UTF-8", byteLength);
 266     UCharsetDetector *csd = ucsdet_open(&status);
 267     const UCharsetMatch *match;
 268     UChar *detected = NEW_ARRAY(UChar, sLength);
 269
 270     ucsdet_setText(csd, bytes, byteLength, &status);
 271     match = ucsdet_detect(csd, &status);
 272
 273     if (match == NULL) {
 274         errln("Detection failure for UTF-8: got no matches.");
 275         goto bail;
 276     }
 277
 278     ucsdet_getUChars(match, detected, sLength, &status);
 279
 280     if (s.compare(detected, sLength) != 0) {
 281         errln("Round-trip test failed!");
 282     }
 283
 284     ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
 285
 286 bail:
 287     DELETE_ARRAY(detected);
 288     freeBytes(bytes);
 289     ucsdet_close(csd);
 290 }
 291
 292 void CharsetDetectionTest::UTF16Test()
 293 {
 294     UErrorCode status = U_ZERO_ERROR;
 295     /* Notice the BOM on the start of this string */
 296     UChar chars[] = {
 297         0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
 298         0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
 299         0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
 300         0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
 301         0x064a, 0x062a, 0x0000};
 302     UnicodeString s(chars);
 303     int32_t beLength = 0, leLength = 0;
 304     char *beBytes = extractBytes(s, "UTF-16BE", beLength);
 305     char *leBytes = extractBytes(s, "UTF-16LE", leLength);
 306     UCharsetDetector *csd = ucsdet_open(&status);
 307     const UCharsetMatch *match;
 308     const char *name;
 309     int32_t conf;
 310
 311     ucsdet_setText(csd, beBytes, beLength, &status);
 312     match = ucsdet_detect(csd, &status);
 313
 314     if (match == NULL) {
 315         errln("Encoding detection failure for UTF-16BE: got no matches.");
 316         goto try_le;
 317     }
 318
 319     name  = ucsdet_getName(match, &status);
 320     conf  = ucsdet_getConfidence(match, &status);
 321
 322     if (strcmp(name, "UTF-16BE") != 0) {
 323         errln("Encoding detection failure for UTF-16BE: got %s", name);
 324         goto try_le; // no point in looking at confidence if we got the wrong character set.
 325     }
 326
 327     if (conf != 100) {
 328         errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
 329     }
 330
 331 try_le:
 332     ucsdet_setText(csd, leBytes, leLength, &status);
 333     match = ucsdet_detect(csd, &status);
 334
 335     if (match == NULL) {
 336         errln("Encoding detection failure for UTF-16LE: got no matches.");
 337         goto bail;
 338     }
 339
 340     name  = ucsdet_getName(match, &status);
 341     conf = ucsdet_getConfidence(match, &status);
 342
 343
 344     if (strcmp(name, "UTF-16LE") != 0) {
 345         errln("Enconding detection failure for UTF-16LE: got %s", name);
 346         goto bail; // no point in looking at confidence if we got the wrong character set.
 347     }
 348
 349     if (conf != 100) {
 350         errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
 351     }
 352
 353 bail:
 354     freeBytes(leBytes);
 355     freeBytes(beBytes);
 356     ucsdet_close(csd);
 357 }
 358
 359 void CharsetDetectionTest::InputFilterTest()
 360 {
 361     UErrorCode status = U_ZERO_ERROR;
 362     UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
 363     UnicodeString s  = ss.unescape();
 364     int32_t byteLength = 0;
 365     char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
 366     UCharsetDetector *csd = ucsdet_open(&status);
 367     const UCharsetMatch *match;
 368     const char *lang, *name;
 369
 370     ucsdet_enableInputFilter(csd, TRUE);
 371
 372     if (!ucsdet_isInputFilterEnabled(csd)) {
 373         errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
 374     }
 375
 376
 377     ucsdet_setText(csd, bytes, byteLength, &status);
 378     match = ucsdet_detect(csd, &status);
 379
 380     if (match == NULL) {
 381         errln("Turning on the input filter resulted in no matches.");
 382         goto turn_off;
 383     }
 384
 385     name = ucsdet_getName(match, &status);
 386
 387     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
 388         errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
 389     } else {
 390         lang = ucsdet_getLanguage(match, &status);
 391
 392         if (lang == NULL || strcmp(lang, "fr") != 0) {
 393             errln("Input filter did not strip markup!");
 394         }
 395     }
 396
 397 turn_off:
 398     ucsdet_enableInputFilter(csd, FALSE);
 399     ucsdet_setText(csd, bytes, byteLength, &status);
 400     match = ucsdet_detect(csd, &status);
 401
 402     if (match == NULL) {
 403         errln("Turning off the input filter resulted in no matches.");
 404         goto bail;
 405     }
 406
 407     name = ucsdet_getName(match, &status);
 408
 409     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
 410         errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
 411     } else {
 412         lang = ucsdet_getLanguage(match, &status);
 413
 414         if (lang == NULL || strcmp(lang, "en") != 0) {
 415             errln("Unfiltered input did not detect as English!");
 416         }
 417     }
 418
 419 bail:
 420     freeBytes(bytes);
 421     ucsdet_close(csd);
 422 }
 423
 424 void CharsetDetectionTest::C1BytesTest()
 425 {
 426 #if !UCONFIG_NO_LEGACY_CONVERSION
 427     UErrorCode status = U_ZERO_ERROR;
 428     UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
 429     UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
 430     UnicodeString sWindows  = ssWindows.unescape();
 431     int32_t lISO = 0, lWindows = 0;
 432     char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
 433     char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
 434     UCharsetDetector *csd = ucsdet_open(&status);
 435     const UCharsetMatch *match;
 436     const char *name;
 437
 438     ucsdet_setText(csd, bWindows, lWindows, &status);
 439     match = ucsdet_detect(csd, &status);
 440
 441     if (match == NULL) {
 442         errln("English test with C1 bytes got no matches.");
 443         goto bail;
 444     }
 445
 446     name  = ucsdet_getName(match, &status);
 447
 448     if (strcmp(name, "windows-1252") != 0) {
 449         errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
 450     }
 451
 452     ucsdet_setText(csd, bISO, lISO, &status);
 453     match = ucsdet_detect(csd, &status);
 454
 455     if (match == NULL) {
 456         errln("English text without C1 bytes got no matches.");
 457         goto bail;
 458     }
 459
 460     name  = ucsdet_getName(match, &status);
 461
 462     if (strcmp(name, "ISO-8859-1") != 0) {
 463         errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
 464     }
 465
 466 bail:
 467     freeBytes(bWindows);
 468     freeBytes(bISO);
 469
 470     ucsdet_close(csd);
 471 #endif
 472 }
 473
 474 void CharsetDetectionTest::DetectionTest()
 475 {
 476 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
 477     UErrorCode status = U_ZERO_ERROR;
 478     char path[2048];
 479     const char *testFilePath = getPath(path, "csdetest.xml");
 480
 481     if (testFilePath == NULL) {
 482         return; /* Couldn't get path: error message already output. */
 483     }
 484
 485     UXMLParser  *parser = UXMLParser::createParser(status);
 486     if (!assertSuccess("UXMLParser::createParser",status)) return;
 487     UXMLElement *root   = parser->parseFile(testFilePath, status);
 488     if (!assertSuccess( "parseFile",status)) return;
 489
 490     UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
 491     UnicodeString id_attr   = UNICODE_STRING_SIMPLE("id");
 492     UnicodeString enc_attr  = UNICODE_STRING_SIMPLE("encodings");
 493
 494     const UXMLElement *testCase;
 495     int32_t tc = 0;
 496
 497     while((testCase = root->nextChildElement(tc)) != NULL) {
 498         if (testCase->getTagName().compare(test_case) == 0) {
 499             const UnicodeString *id = testCase->getAttribute(id_attr);
 500             const UnicodeString *encodings = testCase->getAttribute(enc_attr);
 501             const UnicodeString  text = testCase->getText(TRUE);
 502             int32_t encodingCount;
 503             UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount);
 504
 505             for(int32_t e = 0; e < encodingCount; e += 1) {
 506                 checkEncoding(text, encodingList[e], *id);
 507             }
 508
 509             delete[] encodingList;
 510         }
 511     }
 512
 513     delete root;
 514     delete parser;
 515 #endif
 516 }
 517
 518