icuSources/i18n/csrmbcs.cpp

   1 /*
   2  **********************************************************************
   3  *   Copyright (C) 2005-2006, International Business Machines
   4  *   Corporation and others.  All Rights Reserved.
   5  **********************************************************************
   6  */
   7
   8 #include "unicode/utypes.h"
   9
  10 #if !UCONFIG_NO_CONVERSION
  11
  12 #include "csrmbcs.h"
  13
  14 #include <math.h>
  15
  16 U_NAMESPACE_BEGIN
  17
  18 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
  19
  20 #define min(x,y) (((x)<(y))?(x):(y))
  21
// Character codes counted as "common" by CharsetRecog_sjis::match().
// Each entry is compared against (first byte << 8) | second byte as built
// by CharsetRecog_sjis::nextChar().  The table is looked up with
// binarySearch(), so entries must be kept in ascending order.
const int32_t commonChars_sjis [] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
  32
// Character codes counted as "common" by CharsetRecog_euc_jp::match().
// Each entry is compared against the multi-byte value assembled by
// CharsetRecog_euc::nextChar().  The table is looked up with
// binarySearch(), so entries must be kept in ascending order.
const int32_t commonChars_euc_jp[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
  47
// Character codes counted as "common" by CharsetRecog_euc_kr::match().
// Each entry is compared against the multi-byte value assembled by
// CharsetRecog_euc::nextChar().  The table is looked up with
// binarySearch(), so entries must be kept in ascending order.
const int32_t commonChars_euc_kr[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
  62
// Character codes counted as "common" by CharsetRecog_big5::match().
// Each entry is compared against (first byte << 8) | second byte as built
// by CharsetRecog_big5::nextChar().  The table is looked up with
// binarySearch(), so entries must be kept in ascending order.
const int32_t commonChars_big5[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
  77
// Character codes counted as "common" by CharsetRecog_gb_18030::match().
// Each entry is compared against the multi-byte value assembled by
// CharsetRecog_gb_18030::nextChar().  The table is looked up with
// binarySearch(), so entries must be kept in ascending order.
const int32_t commonChars_gb_18030[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
  92
  93 static int32_t binarySearch(const int32_t *array, int32_t len, int32_t value)
  94 {
  95     int32_t start = 0, end = len-1;
  96     int32_t mid = (start+end)/2;
  97
  98     while(start <= end) {
  99         if(array[mid] == value) {
 100             return mid;
 101         }
 102
 103         if(array[mid] < value){
 104             start = mid+1;
 105         } else {
 106             end = mid-1;
 107         }
 108
 109         mid = (start+end)/2;
 110     }
 111
 112     return -1;
 113 }
 114
 115 IteratedChar::IteratedChar():charValue(0), index(0), nextIndex(0), error(FALSE), done(FALSE)
 116 {
 117     // nothing else to do.
 118 }
 119
 120 void IteratedChar::reset()
 121 {
 122     charValue = 0;
 123     index     = -1;
 124     nextIndex = 0;
 125     error     = FALSE;
 126     done      = FALSE;
 127 }
 128
 129 int32_t IteratedChar::nextByte(InputText *det)
 130 {
 131     if (nextIndex >= det->fRawLength) {
 132         done = TRUE;
 133
 134         return -1;
 135     }
 136
 137     return det->fRawInput[nextIndex++];
 138 }
 139
 140 CharsetRecog_mbcs::~CharsetRecog_mbcs()
 141 {
 142     // nothing to do.
 143 }
 144
 145 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const int32_t commonChars[], int32_t commonCharsLen) {
 146     int   singleByteCharCount = 0;
 147     int   doubleByteCharCount = 0;
 148     int   commonCharCount     = 0;
 149     int   badCharCount        = 0;
 150     int   totalCharCount      = 0;
 151     int   confidence          = 0;
 152     IteratedChar *iter        = new IteratedChar();
 153
 154     // {
 155     for (iter->reset(); nextChar(iter, det);) {
 156         totalCharCount += 1;
 157
 158         if (iter->error) {
 159             badCharCount += 1;
 160         } else {
 161             if (iter->charValue <= 0xFF) {
 162                 singleByteCharCount += 1;
 163             } else {
 164                 doubleByteCharCount += 1;
 165
 166                 if (commonChars != 0) {
 167                     if (binarySearch(commonChars, commonCharsLen, iter->charValue) >= 0){
 168                         commonCharCount += 1;
 169                     }
 170                 }
 171             }
 172         }
 173
 174
 175         if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
 176             // Bail out early if the byte data is not matching the encoding scheme.
 177             // break detectBlock;
 178             delete iter;
 179             return confidence;
 180         }
 181     }
 182
 183     delete iter;
 184
 185     if (doubleByteCharCount <= 10 && badCharCount == 0) {
 186         // Not many multi-byte chars.
 187         //   ASCII or ISO file?  It's probably not our encoding,
 188         //   but is not incompatible with our encoding, so don't give it a zero.
 189         confidence = 10;
 190
 191         return confidence;
 192     }
 193
 194     //
 195     //  No match if there are too many characters that don't fit the encoding scheme.
 196     //    (should we have zero tolerance for these?)
 197     //
 198     if (doubleByteCharCount < 20*badCharCount) {
 199         confidence = 0;
 200
 201         return confidence;
 202     }
 203
 204     if (commonChars == 0) {
 205         // We have no statistics on frequently occuring characters.
 206         //  Assess confidence purely on having a reasonable number of
 207         //  multi-byte characters (the more the better)
 208         confidence = 30 + doubleByteCharCount - 20*badCharCount;
 209
 210         if (confidence > 100) {
 211             confidence = 100;
 212         }
 213     } else {
 214         //
 215         // Frequency of occurence statistics exist.
 216         //
 217
 218         double maxVal = log10((double)doubleByteCharCount / 4); /*(float)?*/
 219         double scaleFactor = 90.0 / maxVal;
 220         confidence = (int32_t)(log10((double)commonCharCount+1) * scaleFactor + 10.0);
 221
 222         confidence = min(confidence, 100);
 223     }
 224
 225     if (confidence < 0) {
 226         confidence = 0;
 227     }
 228
 229     return confidence;
 230 }
 231
 232 CharsetRecog_sjis::~CharsetRecog_sjis()
 233 {
 234     // nothing to do
 235 }
 236
 237 UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) {
 238     it->index = it->nextIndex;
 239     it->error = FALSE;
 240
 241     int32_t firstByte = it->charValue = it->nextByte(det);
 242
 243     if (firstByte < 0) {
 244         return FALSE;
 245     }
 246
 247     if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
 248         return TRUE;
 249     }
 250
 251     int32_t secondByte = it->nextByte(det);
 252
 253     if (secondByte < 0)  {
 254         return FALSE;
 255     }
 256     it->charValue = (firstByte << 8) | secondByte;
 257     if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
 258         // Illegal second byte value.
 259         it->error = TRUE;
 260     }
 261
 262     return TRUE;
 263 }
 264
 265 int32_t CharsetRecog_sjis::match(InputText* det)
 266 {
 267     return match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis));
 268 }
 269
 270 const char *CharsetRecog_sjis::getName() const
 271 {
 272     return "Shift_JIS";
 273 }
 274
 275 const char *CharsetRecog_sjis::getLanguage() const
 276 {
 277     return "ja";
 278 }
 279
 280 CharsetRecog_euc::~CharsetRecog_euc()
 281 {
 282     // nothing to do
 283 }
 284
 285 UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) {
 286     int32_t firstByte  = 0;
 287     int32_t secondByte = 0;
 288     int32_t thirdByte  = 0;
 289     // int32_t fourthByte = 0;
 290
 291     it->index = it->nextIndex;
 292     it->error = FALSE;
 293     firstByte = it->charValue = it->nextByte(det);
 294
 295     if (firstByte < 0) {
 296         // Ran off the end of the input data
 297         it->done = TRUE;
 298
 299         return (! it->done);
 300     }
 301
 302     if (firstByte <= 0x8D) {
 303         // single byte char
 304         return (! it->done);
 305     }
 306
 307     secondByte = it->nextByte(det);
 308     it->charValue = (it->charValue << 8) | secondByte;
 309
 310     if (firstByte >= 0xA1 && firstByte <= 0xFE) {
 311         // Two byte Char
 312         if (secondByte < 0xA1) {
 313             it->error = TRUE;
 314         }
 315
 316         return (! it->done);
 317     }
 318
 319     if (firstByte == 0x8E) {
 320         // Code Set 2.
 321         //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
 322         //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
 323         // We don't know which we've got.
 324         // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
 325         //   bytes will look like a well formed 2 byte char.
 326         if (secondByte < 0xA1) {
 327             it->error = TRUE;
 328         }
 329
 330         return (! it->done);
 331     }
 332
 333     if (firstByte == 0x8F) {
 334         // Code set 3.
 335         // Three byte total char size, two bytes of actual char value.
 336         thirdByte    = it->nextByte(det);
 337         it->charValue = (it->charValue << 8) | thirdByte;
 338
 339         if (thirdByte < 0xa1) {
 340             it->error = TRUE;
 341         }
 342     }
 343
 344     return (! it->done);
 345
 346 }
 347
 348 CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
 349 {
 350     // nothing to do
 351 }
 352
 353 const char *CharsetRecog_euc_jp::getName() const
 354 {
 355     return "EUC-JP";
 356 }
 357
 358 const char *CharsetRecog_euc_jp::getLanguage() const
 359 {
 360     return "ja";
 361 }
 362
 363 int32_t CharsetRecog_euc_jp::match(InputText *det)
 364 {
 365     return match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp));
 366 }
 367
 368 CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
 369 {
 370     // nothing to do
 371 }
 372
 373 const char *CharsetRecog_euc_kr::getName() const
 374 {
 375     return "EUC-KR";
 376 }
 377
 378 const char *CharsetRecog_euc_kr::getLanguage() const
 379 {
 380     return "ko";
 381 }
 382
 383 int32_t CharsetRecog_euc_kr::match(InputText *det)
 384 {
 385     return match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr));
 386 }
 387
 388 CharsetRecog_big5::~CharsetRecog_big5()
 389 {
 390     // nothing to do
 391 }
 392
 393 UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det)
 394 {
 395     int32_t firstByte;
 396
 397     it->index = it->nextIndex;
 398     it->error = FALSE;
 399     firstByte = it->charValue = it->nextByte(det);
 400
 401     if (firstByte < 0) {
 402         return FALSE;
 403     }
 404
 405     if (firstByte <= 0x7F || firstByte == 0xFF) {
 406         // single byte character.
 407         return TRUE;
 408     }
 409
 410     int32_t secondByte = it->nextByte(det);
 411
 412     if (secondByte < 0)  {
 413         return FALSE;
 414     }
 415
 416     it->charValue = (it->charValue << 8) | secondByte;
 417
 418     if (secondByte < 0x40 ||
 419         secondByte == 0x7F ||
 420         secondByte == 0xFF) {
 421             it->error = TRUE;
 422     }
 423
 424     return TRUE;
 425 }
 426
 427 const char *CharsetRecog_big5::getName() const
 428 {
 429     return "Big5";
 430 }
 431
 432 const char *CharsetRecog_big5::getLanguage() const
 433 {
 434     return "zh";
 435 }
 436
 437 int32_t CharsetRecog_big5::match(InputText *det)
 438 {
 439     return match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5));
 440 }
 441
 442 CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
 443 {
 444     // nothing to do
 445 }
 446
 447 UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) {
 448     int32_t firstByte  = 0;
 449     int32_t secondByte = 0;
 450     int32_t thirdByte  = 0;
 451     int32_t fourthByte = 0;
 452
 453     it->index = it->nextIndex;
 454     it->error = FALSE;
 455     firstByte = it->charValue = it->nextByte(det);
 456
 457     if (firstByte < 0) {
 458         // Ran off the end of the input data
 459         it->done = TRUE;
 460
 461         return (! it->done);
 462     }
 463
 464     if (firstByte <= 0x80) {
 465         // single byte char
 466         return (! it->done);
 467     }
 468
 469     secondByte = it->nextByte(det);
 470     it->charValue = (it->charValue << 8) | secondByte;
 471
 472     if (firstByte >= 0x81 && firstByte <= 0xFE) {
 473         // Two byte Char
 474         if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
 475             return (! it->done);
 476         }
 477
 478         // Four byte char
 479         if (secondByte >= 0x30 && secondByte <= 0x39) {
 480             thirdByte = it->nextByte(det);
 481
 482             if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
 483                 fourthByte = it->nextByte(det);
 484
 485                 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
 486                     it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
 487
 488                     return (! it->done);
 489                 }
 490             }
 491         }
 492
 493         it->error = TRUE;
 494
 495         return (! it->done);
 496     }
 497
 498     return (! it->done);
 499 }
 500
 501 const char *CharsetRecog_gb_18030::getName() const
 502 {
 503     return "GB18030";
 504 }
 505
 506 const char *CharsetRecog_gb_18030::getLanguage() const
 507 {
 508     return "zh";
 509 }
 510
 511 int32_t CharsetRecog_gb_18030::match(InputText *det)
 512 {
 513     return match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030));
 514 }
 515
 516 U_NAMESPACE_END
 517 #endif