icuSources/i18n/csrmbcs.cpp

   1 /*
   2  **********************************************************************
   3  *   Copyright (C) 2005-2008, International Business Machines
   4  *   Corporation and others.  All Rights Reserved.
   5  **********************************************************************
   6  */
   7
   8 #include "unicode/utypes.h"
   9
  10 #if !UCONFIG_NO_CONVERSION
  11
  12 #include "csrmbcs.h"
  13
  14 #include <math.h>
  15
  16 U_NAMESPACE_BEGIN
  17
  18 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
  19
  20 #define min(x,y) (((x)<(y))?(x):(y))
  21
  22 static const uint16_t commonChars_sjis [] = {
  23 // TODO:  This set of data comes from the character frequency-
  24 //        of-occurence analysis tool.  The data needs to be moved
  25 //        into a resource and loaded from there.
  26 0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
  27 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
  28 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
  29 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
  30 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
  31 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
  32
  33 static const uint16_t commonChars_euc_jp[] = {
  34 // TODO:  This set of data comes from the character frequency-
  35 //        of-occurence analysis tool.  The data needs to be moved
  36 //        into a resource and loaded from there.
  37 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
  38 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
  39 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
  40 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
  41 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
  42 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
  43 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
  44 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
  45 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
  46 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
  47
  48 static const uint16_t commonChars_euc_kr[] = {
  49 // TODO:  This set of data comes from the character frequency-
  50 //        of-occurence analysis tool.  The data needs to be moved
  51 //        into a resource and loaded from there.
  52 0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
  53 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
  54 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
  55 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
  56 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
  57 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
  58 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
  59 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
  60 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
  61 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
  62
  63 static const uint16_t commonChars_big5[] = {
  64 // TODO:  This set of data comes from the character frequency-
  65 //        of-occurence analysis tool.  The data needs to be moved
  66 //        into a resource and loaded from there.
  67 0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
  68 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
  69 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
  70 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
  71 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
  72 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
  73 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
  74 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
  75 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
  76 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
  77
  78 static const uint16_t commonChars_gb_18030[] = {
  79 // TODO:  This set of data comes from the character frequency-
  80 //        of-occurence analysis tool.  The data needs to be moved
  81 //        into a resource and loaded from there.
  82 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
  83 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
  84 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
  85 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
  86 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
  87 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
  88 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
  89 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
  90 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
  91 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
  92
  93 static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
  94 {
  95     int32_t start = 0, end = len-1;
  96     int32_t mid = (start+end)/2;
  97
  98     while(start <= end) {
  99         if(array[mid] == value) {
 100             return mid;
 101         }
 102
 103         if(array[mid] < value){
 104             start = mid+1;
 105         } else {
 106             end = mid-1;
 107         }
 108
 109         mid = (start+end)/2;
 110     }
 111
 112     return -1;
 113 }
 114
 115 IteratedChar::IteratedChar() :
 116 charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
 117 {
 118     // nothing else to do.
 119 }
 120
 121 /*void IteratedChar::reset()
 122 {
 123     charValue = 0;
 124     index     = -1;
 125     nextIndex = 0;
 126     error     = FALSE;
 127     done      = FALSE;
 128 }*/
 129
 130 int32_t IteratedChar::nextByte(InputText *det)
 131 {
 132     if (nextIndex >= det->fRawLength) {
 133         done = TRUE;
 134
 135         return -1;
 136     }
 137
 138     return det->fRawInput[nextIndex++];
 139 }
 140
 141 CharsetRecog_mbcs::~CharsetRecog_mbcs()
 142 {
 143     // nothing to do.
 144 }
 145
 146 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) {
 147     int32_t singleByteCharCount = 0;
 148     int32_t doubleByteCharCount = 0;
 149     int32_t commonCharCount     = 0;
 150     int32_t badCharCount        = 0;
 151     int32_t totalCharCount      = 0;
 152     int32_t confidence          = 0;
 153     IteratedChar iter;
 154
 155     while (nextChar(&iter, det)) {
 156         totalCharCount++;
 157
 158         if (iter.error) {
 159             badCharCount++;
 160         } else {
 161             if (iter.charValue <= 0xFF) {
 162                 singleByteCharCount++;
 163             } else {
 164                 doubleByteCharCount++;
 165
 166                 if (commonChars != 0) {
 167                     if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){
 168                         commonCharCount += 1;
 169                     }
 170                 }
 171             }
 172         }
 173
 174
 175         if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
 176             // Bail out early if the byte data is not matching the encoding scheme.
 177             // break detectBlock;
 178             return confidence;
 179         }
 180     }
 181
 182     if (doubleByteCharCount <= 10 && badCharCount == 0) {
 183         // Not many multi-byte chars.
 184         if (doubleByteCharCount == 0 && totalCharCount < 10) {
 185             // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
 186             // We don't have enough data to have any confidence.
 187             // Statistical analysis of single byte non-ASCII charcters would probably help here.
 188             confidence = 0;
 189         }
 190         else {
 191             //   ASCII or ISO file?  It's probably not our encoding,
 192             //   but is not incompatible with our encoding, so don't give it a zero.
 193             confidence = 10;
 194         }
 195
 196         return confidence;
 197     }
 198
 199     //
 200     //  No match if there are too many characters that don't fit the encoding scheme.
 201     //    (should we have zero tolerance for these?)
 202     //
 203     if (doubleByteCharCount < 20*badCharCount) {
 204         confidence = 0;
 205
 206         return confidence;
 207     }
 208
 209     if (commonChars == 0) {
 210         // We have no statistics on frequently occuring characters.
 211         //  Assess confidence purely on having a reasonable number of
 212         //  multi-byte characters (the more the better)
 213         confidence = 30 + doubleByteCharCount - 20*badCharCount;
 214
 215         if (confidence > 100) {
 216             confidence = 100;
 217         }
 218     } else {
 219         //
 220         // Frequency of occurence statistics exist.
 221         //
 222
 223         double maxVal = log10((double)doubleByteCharCount / 4); /*(float)?*/
 224         double scaleFactor = 90.0 / maxVal;
 225         confidence = (int32_t)(log10((double)commonCharCount+1) * scaleFactor + 10.0);
 226
 227         confidence = min(confidence, 100);
 228     }
 229
 230     if (confidence < 0) {
 231         confidence = 0;
 232     }
 233
 234     return confidence;
 235 }
 236
 237 CharsetRecog_sjis::~CharsetRecog_sjis()
 238 {
 239     // nothing to do
 240 }
 241
 242 UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) {
 243     it->index = it->nextIndex;
 244     it->error = FALSE;
 245
 246     int32_t firstByte = it->charValue = it->nextByte(det);
 247
 248     if (firstByte < 0) {
 249         return FALSE;
 250     }
 251
 252     if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
 253         return TRUE;
 254     }
 255
 256     int32_t secondByte = it->nextByte(det);
 257     if (secondByte >= 0) {
 258         it->charValue = (firstByte << 8) | secondByte;
 259     }
 260     // else we'll handle the error later.
 261
 262     if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
 263         // Illegal second byte value.
 264         it->error = TRUE;
 265     }
 266
 267     return TRUE;
 268 }
 269
 270 int32_t CharsetRecog_sjis::match(InputText* det)
 271 {
 272     return match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis));
 273 }
 274
 275 const char *CharsetRecog_sjis::getName() const
 276 {
 277     return "Shift_JIS";
 278 }
 279
 280 const char *CharsetRecog_sjis::getLanguage() const
 281 {
 282     return "ja";
 283 }
 284
 285 CharsetRecog_euc::~CharsetRecog_euc()
 286 {
 287     // nothing to do
 288 }
 289
 290 UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) {
 291     int32_t firstByte  = 0;
 292     int32_t secondByte = 0;
 293     int32_t thirdByte  = 0;
 294
 295     it->index = it->nextIndex;
 296     it->error = FALSE;
 297     firstByte = it->charValue = it->nextByte(det);
 298
 299     if (firstByte < 0) {
 300         // Ran off the end of the input data
 301         return FALSE;
 302     }
 303
 304     if (firstByte <= 0x8D) {
 305         // single byte char
 306         return TRUE;
 307     }
 308
 309     secondByte = it->nextByte(det);
 310     if (secondByte >= 0) {
 311         it->charValue = (it->charValue << 8) | secondByte;
 312     }
 313     // else we'll handle the error later.
 314
 315     if (firstByte >= 0xA1 && firstByte <= 0xFE) {
 316         // Two byte Char
 317         if (secondByte < 0xA1) {
 318             it->error = TRUE;
 319         }
 320
 321         return TRUE;
 322     }
 323
 324     if (firstByte == 0x8E) {
 325         // Code Set 2.
 326         //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
 327         //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
 328         // We don't know which we've got.
 329         // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
 330         //   bytes will look like a well formed 2 byte char.
 331         if (secondByte < 0xA1) {
 332             it->error = TRUE;
 333         }
 334
 335         return TRUE;
 336     }
 337
 338     if (firstByte == 0x8F) {
 339         // Code set 3.
 340         // Three byte total char size, two bytes of actual char value.
 341         thirdByte    = it->nextByte(det);
 342         it->charValue = (it->charValue << 8) | thirdByte;
 343
 344         if (thirdByte < 0xa1) {
 345             // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
 346             it->error = TRUE;
 347         }
 348     }
 349
 350     return TRUE;
 351
 352 }
 353
 354 CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
 355 {
 356     // nothing to do
 357 }
 358
 359 const char *CharsetRecog_euc_jp::getName() const
 360 {
 361     return "EUC-JP";
 362 }
 363
 364 const char *CharsetRecog_euc_jp::getLanguage() const
 365 {
 366     return "ja";
 367 }
 368
 369 int32_t CharsetRecog_euc_jp::match(InputText *det)
 370 {
 371     return match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp));
 372 }
 373
 374 CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
 375 {
 376     // nothing to do
 377 }
 378
 379 const char *CharsetRecog_euc_kr::getName() const
 380 {
 381     return "EUC-KR";
 382 }
 383
 384 const char *CharsetRecog_euc_kr::getLanguage() const
 385 {
 386     return "ko";
 387 }
 388
 389 int32_t CharsetRecog_euc_kr::match(InputText *det)
 390 {
 391     return match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr));
 392 }
 393
 394 CharsetRecog_big5::~CharsetRecog_big5()
 395 {
 396     // nothing to do
 397 }
 398
 399 UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det)
 400 {
 401     int32_t firstByte;
 402
 403     it->index = it->nextIndex;
 404     it->error = FALSE;
 405     firstByte = it->charValue = it->nextByte(det);
 406
 407     if (firstByte < 0) {
 408         return FALSE;
 409     }
 410
 411     if (firstByte <= 0x7F || firstByte == 0xFF) {
 412         // single byte character.
 413         return TRUE;
 414     }
 415
 416     int32_t secondByte = it->nextByte(det);
 417     if (secondByte >= 0)  {
 418         it->charValue = (it->charValue << 8) | secondByte;
 419     }
 420     // else we'll handle the error later.
 421
 422     if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
 423         it->error = TRUE;
 424     }
 425
 426     return TRUE;
 427 }
 428
 429 const char *CharsetRecog_big5::getName() const
 430 {
 431     return "Big5";
 432 }
 433
 434 const char *CharsetRecog_big5::getLanguage() const
 435 {
 436     return "zh";
 437 }
 438
 439 int32_t CharsetRecog_big5::match(InputText *det)
 440 {
 441     return match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5));
 442 }
 443
 444 CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
 445 {
 446     // nothing to do
 447 }
 448
 449 UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) {
 450     int32_t firstByte  = 0;
 451     int32_t secondByte = 0;
 452     int32_t thirdByte  = 0;
 453     int32_t fourthByte = 0;
 454
 455     it->index = it->nextIndex;
 456     it->error = FALSE;
 457     firstByte = it->charValue = it->nextByte(det);
 458
 459     if (firstByte < 0) {
 460         // Ran off the end of the input data
 461         return FALSE;
 462     }
 463
 464     if (firstByte <= 0x80) {
 465         // single byte char
 466         return TRUE;
 467     }
 468
 469     secondByte = it->nextByte(det);
 470     if (secondByte >= 0) {
 471         it->charValue = (it->charValue << 8) | secondByte;
 472     }
 473     // else we'll handle the error later.
 474
 475     if (firstByte >= 0x81 && firstByte <= 0xFE) {
 476         // Two byte Char
 477         if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
 478             return TRUE;
 479         }
 480
 481         // Four byte char
 482         if (secondByte >= 0x30 && secondByte <= 0x39) {
 483             thirdByte = it->nextByte(det);
 484
 485             if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
 486                 fourthByte = it->nextByte(det);
 487
 488                 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
 489                     it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
 490
 491                     return TRUE;
 492                 }
 493             }
 494         }
 495
 496         // Something wasn't valid, or we ran out of data (-1).
 497         it->error = TRUE;
 498     }
 499
 500     return TRUE;
 501 }
 502
 503 const char *CharsetRecog_gb_18030::getName() const
 504 {
 505     return "GB18030";
 506 }
 507
 508 const char *CharsetRecog_gb_18030::getLanguage() const
 509 {
 510     return "zh";
 511 }
 512
 513 int32_t CharsetRecog_gb_18030::match(InputText *det)
 514 {
 515     return match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030));
 516 }
 517
 518 U_NAMESPACE_END
 519 #endif