icuSources/i18n/csrmbcs.cpp

   1 /*
   2  **********************************************************************
   3  *   Copyright (C) 2005-2012, International Business Machines
   4  *   Corporation and others.  All Rights Reserved.
   5  **********************************************************************
   6  */
   7
   8 #include "unicode/utypes.h"
   9
  10 #if !UCONFIG_NO_CONVERSION
  11
  12 #include "csmatch.h"
  13 #include "csrmbcs.h"
  14
  15 #include <math.h>
  16
  17 U_NAMESPACE_BEGIN
  18
  19 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
  20
  21 #define min(x,y) (((x)<(y))?(x):(y))
  22
  23 static const uint16_t commonChars_sjis [] = {
  24 // TODO:  This set of data comes from the character frequency-
  25 //        of-occurrence analysis tool.  The data needs to be moved
  26 //        into a resource and loaded from there.
     // Values are listed in ascending order; match_mbcs() looks entries up
     // with binarySearch(), so that ordering must be kept when editing.
  27 0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
  28 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
  29 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
  30 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
  31 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
  32 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
  33
  34 static const uint16_t commonChars_euc_jp[] = {
  35 // TODO:  This set of data comes from the character frequency-
  36 //        of-occurrence analysis tool.  The data needs to be moved
  37 //        into a resource and loaded from there.
     // Values are listed in ascending order; match_mbcs() looks entries up
     // with binarySearch(), so that ordering must be kept when editing.
  38 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
  39 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
  40 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
  41 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
  42 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
  43 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
  44 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
  45 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
  46 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
  47 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
  48
  49 static const uint16_t commonChars_euc_kr[] = {
  50 // TODO:  This set of data comes from the character frequency-
  51 //        of-occurrence analysis tool.  The data needs to be moved
  52 //        into a resource and loaded from there.
     // Values are listed in ascending order; match_mbcs() looks entries up
     // with binarySearch(), so that ordering must be kept when editing.
  53 0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
  54 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
  55 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
  56 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
  57 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
  58 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
  59 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
  60 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
  61 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
  62 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
  63
  64 static const uint16_t commonChars_big5[] = {
  65 // TODO:  This set of data comes from the character frequency-
  66 //        of-occurrence analysis tool.  The data needs to be moved
  67 //        into a resource and loaded from there.
     // Values are listed in ascending order; match_mbcs() looks entries up
     // with binarySearch(), so that ordering must be kept when editing.
  68 0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
  69 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
  70 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
  71 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
  72 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
  73 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
  74 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
  75 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
  76 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
  77 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
  78
  79 static const uint16_t commonChars_gb_18030[] = {
  80 // TODO:  This set of data comes from the character frequency-
  81 //        of-occurrence analysis tool.  The data needs to be moved
  82 //        into a resource and loaded from there.
     // Values are listed in ascending order; match_mbcs() looks entries up
     // with binarySearch(), so that ordering must be kept when editing.
  83 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
  84 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
  85 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
  86 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
  87 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
  88 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
  89 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
  90 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
  91 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
  92 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
  93
  94 #if U_PLATFORM_IS_DARWIN_BASED
     // Per-encoding key byte strings, compiled only for Darwin-based platforms.
     // match_mbcs() tests each string with isPrefix() at the position of each of
     // the first 20 double-byte characters it sees; every matched byte adds 5 to
     // the key-string confidence.  Each string is 0-terminated, and each table
     // ends with a row whose first byte is 0, which stops the scan over the rows.
  95 static const uint8_t keyStrings_sjis[][MAX_KEY_STRING_WITH_NULL] = {
  96     {0x82,0xa9,0x82,0xe7,0x91,0x97,0x90,0x4d,0}, // Signatures - Sent from my ...
  97     {0x93,0x5d,0x91,0x97,0x83,0x81,0x83,0x62,0x83,0x5a,0x81,0x5b,0x83,0x57,0}, // forward
  98     {0}
  99 };
 100 static const uint8_t keyStrings_euc_jp[][MAX_KEY_STRING_WITH_NULL] = {
 101     {0xa4,0xab,0xa4,0xe9,0xc1,0xf7,0xbf,0xae,0}, // Signatures - Sent from my ...
 102     {0xc5,0xbe,0xc1,0xf7,0xa5,0xe1,0xa5,0xc3,0xa5,0xbb,0xa1,0xbc,0xa5,0xb8,0}, // forward
 103     {0}
 104 };
 105 static const uint8_t keyStrings_euc_kr[][MAX_KEY_STRING_WITH_NULL] = {
 106     {0xb3,0xaa,0xc0,0xc7,0}, // Signatures - Sent from my ... #1
 107     {0xbf,0xa1,0xbc,0xad,0x20,0xba,0xb8,0xb3,0xbf,0}, // Signatures - Sent from my ... #2
 108     {0xc0,0xfc,0xb4,0xde,0xb5,0xc8,0x20,0xb8,0xde,0xbd,0xc3,0xc1,0xf6,0}, // forward
 109     {0}
 110 };
 111 static const uint8_t keyStrings_big5[][MAX_KEY_STRING_WITH_NULL] = {
 112     {0xb1,0x71,0xa7,0xda,0xaa,0xba,0}, // Signatures - Sent from my ... #1
 113     {0xb6,0xc7,0xb0,0x65,0}, // Signatures - Sent from my ... #2
 114     {0xb6,0x7d,0xa9,0x6c,0xc2,0xe0,0xb1,0x48,0xb6,0x6c,0xa5,0xf3,0}, // forward
 115     {0}
 116 };
 117 static const uint8_t keyStrings_gb_18030[][MAX_KEY_STRING_WITH_NULL] = {
 118     {0xb7,0xa2,0xd7,0xd4,0xce,0xd2,0xb5,0xc4,0}, // Signatures - Sent from my iP...
 119     {0xd7,0xaa,0xb7,0xa2,0xb5,0xc4,0xd3,0xca,0xbc,0xfe,0}, // forward
 120     {0}
 121 };
 122 #endif
 123
 // Looks for value in array[0..len-1], which must be sorted in ascending order.
 // Returns the index of a matching element, or -1 when there is none
 // (including when len is 0).
 static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
 {
     int32_t lo = 0;
     int32_t hi = len - 1;

     // Closed interval [lo, hi]; it shrinks around the probe on every pass.
     while (lo <= hi) {
         const int32_t probe = (lo + hi) / 2;
         const uint16_t candidate = array[probe];

         if (candidate == value) {
             return probe;
         }

         if (candidate < value) {
             lo = probe + 1;
         } else {
             hi = probe - 1;
         }
     }

     return -1;
 }
 145
 146 #if U_PLATFORM_IS_DARWIN_BASED
 // Returns the length of the 0-terminated byte string testPrefix when the bytes
 // starting at base (and ending before baseLimit) begin with it; returns 0
 // otherwise.  An empty testPrefix also yields 0.
 static int32_t isPrefix(const uint8_t *testPrefix, const uint8_t *base, const uint8_t *baseLimit) {
     int32_t matched = 0;

     while (testPrefix[matched] != 0) {
         // Fail as soon as the input runs out or a byte differs.
         if (base + matched >= baseLimit || testPrefix[matched] != base[matched]) {
             return 0;
         }
         ++matched;
     }

     return matched;
 }
 156 #endif
 157
 158 IteratedChar::IteratedChar() :
 159 charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
 160 {
 161     // nothing else to do.
 162 }
 163
 164 /*void IteratedChar::reset()
 165 {
 166     charValue = 0;
 167     index     = -1;
 168     nextIndex = 0;
 169     error     = FALSE;
 170     done      = FALSE;
 171 }*/
 172
 173 int32_t IteratedChar::nextByte(InputText *det)
 174 {
 175     if (nextIndex >= det->fRawLength) {
 176         done = TRUE;
 177
 178         return -1;
 179     }
 180
 181     return det->fRawInput[nextIndex++];
 182 }
 183
 184 CharsetRecog_mbcs::~CharsetRecog_mbcs()
 185 {
 186     // nothing to do.
 187 }
 188
 189 #if U_PLATFORM_IS_DARWIN_BASED
 190 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen, const uint8_t (*keyStrings)[MAX_KEY_STRING_WITH_NULL] ) const {
 191 #else
 192 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
 193 #endif
 194     int32_t singleByteCharCount = 0;
 195     int32_t doubleByteCharCount = 0;
 196     int32_t commonCharCount     = 0;
 197     int32_t badCharCount        = 0;
 198     int32_t totalCharCount      = 0;
 199     int32_t confidence          = 0;
 200 #if U_PLATFORM_IS_DARWIN_BASED
 201     int32_t confidenceFromKeys  = 0;
 202 #endif
 203     IteratedChar iter;
 204
 205     while (nextChar(&iter, det)) {
 206         totalCharCount++;
 207
 208         if (iter.error) {
 209             badCharCount++;
 210         } else {
 211             if (iter.charValue <= 0xFF) {
 212                 singleByteCharCount++;
 213             } else {
 214                 doubleByteCharCount++;
 215
 216                 if (commonChars != 0) {
 217                     if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){
 218                         commonCharCount += 1;
 219                     }
 220                 }
 221 #if U_PLATFORM_IS_DARWIN_BASED
 222                 if (doubleByteCharCount <= 20) {
 223                     int32_t keyIndex;
 224                     for ( keyIndex = 0; keyStrings[keyIndex][0] != 0; keyIndex++ ) {
 225                         int32_t prefixLen = isPrefix(keyStrings[keyIndex], &det->fRawInput[iter.index], &det->fRawInput[det->fRawLength]);
 226                         confidenceFromKeys += prefixLen*5;
 227                     }
 228                 }
 229 #endif
 230             }
 231         }
 232
 233
 234         if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
 235             // Bail out early if the byte data is not matching the encoding scheme.
 236             // break detectBlock;
 237             return confidence;
 238         }
 239     }
 240
 241     if (doubleByteCharCount <= 10 && badCharCount == 0) {
 242         // Not many multi-byte chars.
 243         if (doubleByteCharCount == 0 && totalCharCount < 10) {
 244             // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
 245             // We don't have enough data to have any confidence.
 246             // Statistical analysis of single byte non-ASCII charcters would probably help here.
 247             confidence = 0;
 248         }
 249         else {
 250             //   ASCII or ISO file?  It's probably not our encoding,
 251             //   but is not incompatible with our encoding, so don't give it a zero.
 252 #if U_PLATFORM_IS_DARWIN_BASED
 253             if (confidenceFromKeys > 90) {
 254                 confidenceFromKeys = 90;
 255             } else if (confidenceFromKeys > 0 && confidenceFromKeys < 70) {
 256                 confidenceFromKeys += 20;
 257             }
 258             confidence = 10 + confidenceFromKeys;
 259 #else
 260             confidence = 10;
 261 #endif
 262         }
 263
 264         return confidence;
 265     }
 266
 267     //
 268     //  No match if there are too many characters that don't fit the encoding scheme.
 269     //    (should we have zero tolerance for these?)
 270     //
 271     if (doubleByteCharCount < 20*badCharCount) {
 272         confidence = 0;
 273
 274         return confidence;
 275     }
 276
 277     if (commonChars == 0) {
 278         // We have no statistics on frequently occuring characters.
 279         //  Assess confidence purely on having a reasonable number of
 280         //  multi-byte characters (the more the better)
 281         confidence = 30 + doubleByteCharCount - 20*badCharCount;
 282 #if U_PLATFORM_IS_DARWIN_BASED
 283         confidence += confidenceFromKeys;
 284 #endif
 285
 286         if (confidence > 100) {
 287             confidence = 100;
 288         }
 289     } else {
 290         //
 291         // Frequency of occurence statistics exist.
 292         //
 293
 294         double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/
 295         double scaleFactor = 90.0 / maxVal;
 296         confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);
 297 #if U_PLATFORM_IS_DARWIN_BASED
 298         confidence += confidenceFromKeys;
 299 #endif
 300
 301         confidence = min(confidence, 100);
 302     }
 303
 304     if (confidence < 0) {
 305         confidence = 0;
 306     }
 307
 308     return confidence;
 309 }
 310
 311 CharsetRecog_sjis::~CharsetRecog_sjis()
 312 {
 313     // nothing to do
 314 }
 315
 316 UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
 317     it->index = it->nextIndex;
 318     it->error = FALSE;
 319
 320     int32_t firstByte = it->charValue = it->nextByte(det);
 321
 322     if (firstByte < 0) {
 323         return FALSE;
 324     }
 325
 326     if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
 327         return TRUE;
 328     }
 329
 330     int32_t secondByte = it->nextByte(det);
 331     if (secondByte >= 0) {
 332         it->charValue = (firstByte << 8) | secondByte;
 333     }
 334     // else we'll handle the error later.
 335
 336     if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
 337         // Illegal second byte value.
 338         it->error = TRUE;
 339     }
 340
 341     return TRUE;
 342 }
 343
 344 UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
 345 #if U_PLATFORM_IS_DARWIN_BASED
 346     int32_t confidence = match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis), keyStrings_sjis);
 347 #else
 348     int32_t confidence = match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis));
 349 #endif
 350     results->set(det, this, confidence);
 351     return (confidence > 0);
 352 }
 353
 354 const char *CharsetRecog_sjis::getName() const
 355 {
 356     return "Shift_JIS";
 357 }
 358
 359 const char *CharsetRecog_sjis::getLanguage() const
 360 {
 361     return "ja";
 362 }
 363
 364 CharsetRecog_euc::~CharsetRecog_euc()
 365 {
 366     // nothing to do
 367 }
 368
 369 UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
 370     int32_t firstByte  = 0;
 371     int32_t secondByte = 0;
 372     int32_t thirdByte  = 0;
 373
 374     it->index = it->nextIndex;
 375     it->error = FALSE;
 376     firstByte = it->charValue = it->nextByte(det);
 377
 378     if (firstByte < 0) {
 379         // Ran off the end of the input data
 380         return FALSE;
 381     }
 382
 383     if (firstByte <= 0x8D) {
 384         // single byte char
 385         return TRUE;
 386     }
 387
 388     secondByte = it->nextByte(det);
 389     if (secondByte >= 0) {
 390         it->charValue = (it->charValue << 8) | secondByte;
 391     }
 392     // else we'll handle the error later.
 393
 394     if (firstByte >= 0xA1 && firstByte <= 0xFE) {
 395         // Two byte Char
 396         if (secondByte < 0xA1) {
 397             it->error = TRUE;
 398         }
 399
 400         return TRUE;
 401     }
 402
 403     if (firstByte == 0x8E) {
 404         // Code Set 2.
 405         //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
 406         //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
 407         // We don't know which we've got.
 408         // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
 409         //   bytes will look like a well formed 2 byte char.
 410         if (secondByte < 0xA1) {
 411             it->error = TRUE;
 412         }
 413
 414         return TRUE;
 415     }
 416
 417     if (firstByte == 0x8F) {
 418         // Code set 3.
 419         // Three byte total char size, two bytes of actual char value.
 420         thirdByte    = it->nextByte(det);
 421         it->charValue = (it->charValue << 8) | thirdByte;
 422
 423         if (thirdByte < 0xa1) {
 424             // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
 425             it->error = TRUE;
 426         }
 427     }
 428
 429     return TRUE;
 430
 431 }
 432
 433 CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
 434 {
 435     // nothing to do
 436 }
 437
 438 const char *CharsetRecog_euc_jp::getName() const
 439 {
 440     return "EUC-JP";
 441 }
 442
 443 const char *CharsetRecog_euc_jp::getLanguage() const
 444 {
 445     return "ja";
 446 }
 447
 448 UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const
 449 {
 450 #if U_PLATFORM_IS_DARWIN_BASED
 451     int32_t confidence = match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp), keyStrings_euc_jp);
 452 #else
 453     int32_t confidence = match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp));
 454 #endif
 455     results->set(det, this, confidence);
 456     return (confidence > 0);
 457 }
 458
 459 CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
 460 {
 461     // nothing to do
 462 }
 463
 464 const char *CharsetRecog_euc_kr::getName() const
 465 {
 466     return "EUC-KR";
 467 }
 468
 469 const char *CharsetRecog_euc_kr::getLanguage() const
 470 {
 471     return "ko";
 472 }
 473
 474 UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const
 475 {
 476 #if U_PLATFORM_IS_DARWIN_BASED
 477     int32_t confidence =  match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr), keyStrings_euc_kr);
 478 #else
 479     int32_t confidence =  match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr));
 480 #endif
 481     results->set(det, this, confidence);
 482     return (confidence > 0);
 483 }
 484
 485 CharsetRecog_big5::~CharsetRecog_big5()
 486 {
 487     // nothing to do
 488 }
 489
 490 UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
 491 {
 492     int32_t firstByte;
 493
 494     it->index = it->nextIndex;
 495     it->error = FALSE;
 496     firstByte = it->charValue = it->nextByte(det);
 497
 498     if (firstByte < 0) {
 499         return FALSE;
 500     }
 501
 502     if (firstByte <= 0x7F || firstByte == 0xFF) {
 503         // single byte character.
 504         return TRUE;
 505     }
 506
 507     int32_t secondByte = it->nextByte(det);
 508     if (secondByte >= 0)  {
 509         it->charValue = (it->charValue << 8) | secondByte;
 510     }
 511     // else we'll handle the error later.
 512
 513     if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
 514         it->error = TRUE;
 515     }
 516
 517     return TRUE;
 518 }
 519
 520 const char *CharsetRecog_big5::getName() const
 521 {
 522     return "Big5";
 523 }
 524
 525 const char *CharsetRecog_big5::getLanguage() const
 526 {
 527     return "zh";
 528 }
 529
 530 UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const
 531 {
 532 #if U_PLATFORM_IS_DARWIN_BASED
 533     int32_t confidence = match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5), keyStrings_big5);
 534 #else
 535     int32_t confidence = match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5));
 536 #endif
 537     results->set(det, this, confidence);
 538     return (confidence > 0);
 539 }
 540
 541 CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
 542 {
 543     // nothing to do
 544 }
 545
 546 UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
 547     int32_t firstByte  = 0;
 548     int32_t secondByte = 0;
 549     int32_t thirdByte  = 0;
 550     int32_t fourthByte = 0;
 551
 552     it->index = it->nextIndex;
 553     it->error = FALSE;
 554     firstByte = it->charValue = it->nextByte(det);
 555
 556     if (firstByte < 0) {
 557         // Ran off the end of the input data
 558         return FALSE;
 559     }
 560
 561     if (firstByte <= 0x80) {
 562         // single byte char
 563         return TRUE;
 564     }
 565
 566     secondByte = it->nextByte(det);
 567     if (secondByte >= 0) {
 568         it->charValue = (it->charValue << 8) | secondByte;
 569     }
 570     // else we'll handle the error later.
 571
 572     if (firstByte >= 0x81 && firstByte <= 0xFE) {
 573         // Two byte Char
 574         if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
 575             return TRUE;
 576         }
 577
 578         // Four byte char
 579         if (secondByte >= 0x30 && secondByte <= 0x39) {
 580             thirdByte = it->nextByte(det);
 581
 582             if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
 583                 fourthByte = it->nextByte(det);
 584
 585                 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
 586                     it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
 587
 588                     return TRUE;
 589                 }
 590             }
 591         }
 592
 593         // Something wasn't valid, or we ran out of data (-1).
 594         it->error = TRUE;
 595     }
 596
 597     return TRUE;
 598 }
 599
 600 const char *CharsetRecog_gb_18030::getName() const
 601 {
 602     return "GB18030";
 603 }
 604
 605 const char *CharsetRecog_gb_18030::getLanguage() const
 606 {
 607     return "zh";
 608 }
 609
 610 UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const
 611 {
 612 #if U_PLATFORM_IS_DARWIN_BASED
 613     int32_t confidence = match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030), keyStrings_gb_18030);
 614 #else
 615     int32_t confidence = match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030));
 616 #endif
 617     results->set(det, this, confidence);
 618     return (confidence > 0);
 619 }
 620
 621 U_NAMESPACE_END
 622 #endif