icuSources/i18n/csrmbcs.cpp

   1 /*
   2  **********************************************************************
   3  *   Copyright (C) 2005-2016, International Business Machines
   4  *   Corporation and others.  All Rights Reserved.
   5  **********************************************************************
   6  */
   7
   8 #include "unicode/utypes.h"
   9
  10 #if !UCONFIG_NO_CONVERSION
  11
  12 #include "cmemory.h"
  13 #include "csmatch.h"
  14 #include "csrmbcs.h"
  15
  16 #include <math.h>
  17
  18 U_NAMESPACE_BEGIN
  19
  20 #define min(x,y) (((x)<(y))?(x):(y))
  21
  22 static const uint16_t commonChars_sjis [] = {
  23 // TODO:  This set of data comes from the character frequency-
  24 //        of-occurrence analysis tool.  The data needs to be moved
  25 //        into a resource and loaded from there.
  26 0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
  27 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
  28 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
  29 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
  30 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
  31 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
  32
  33 static const uint16_t commonChars_euc_jp[] = {
  34 // TODO:  This set of data comes from the character frequency-
  35 //        of-occurrence analysis tool.  The data needs to be moved
  36 //        into a resource and loaded from there.
  37 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
  38 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
  39 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
  40 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
  41 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
  42 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
  43 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
  44 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
  45 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
  46 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
  47
  48 static const uint16_t commonChars_euc_kr[] = {
  49 // TODO:  This set of data comes from the character frequency-
  50 //        of-occurrence analysis tool.  The data needs to be moved
  51 //        into a resource and loaded from there.
  52 0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
  53 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
  54 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
  55 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
  56 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
  57 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
  58 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
  59 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
  60 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
  61 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
  62
  63 static const uint16_t commonChars_big5[] = {
  64 // TODO:  This set of data comes from the character frequency-
  65 //        of-occurrence analysis tool.  The data needs to be moved
  66 //        into a resource and loaded from there.
  67 0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
  68 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
  69 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
  70 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
  71 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
  72 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
  73 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
  74 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
  75 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
  76 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
  77
  78 static const uint16_t commonChars_gb_18030[] = {
  79 // TODO:  This set of data comes from the character frequency-
  80 //        of-occurrence analysis tool.  The data needs to be moved
  81 //        into a resource and loaded from there.
  82 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
  83 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
  84 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
  85 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
  86 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
  87 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
  88 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
  89 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
  90 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
  91 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
  92
  93 #if U_PLATFORM_IS_DARWIN_BASED
  94 static const uint8_t keyStrings_sjis[][MAX_KEY_STRING_WITH_NULL] = {
  95     {0x82,0xa9,0x82,0xe7,0x91,0x97,0x90,0x4d,0}, // Signatures - Sent from my ...
  96     {0x93,0x5d,0x91,0x97,0x83,0x81,0x83,0x62,0x83,0x5a,0x81,0x5b,0x83,0x57,0}, // forward
  97     {0}
  98 };
  99 static const uint8_t keyStrings_euc_jp[][MAX_KEY_STRING_WITH_NULL] = {
 100     {0xa4,0xab,0xa4,0xe9,0xc1,0xf7,0xbf,0xae,0}, // Signatures - Sent from my ...
 101     {0xc5,0xbe,0xc1,0xf7,0xa5,0xe1,0xa5,0xc3,0xa5,0xbb,0xa1,0xbc,0xa5,0xb8,0}, // forward
 102     {0}
 103 };
 104 static const uint8_t keyStrings_euc_kr[][MAX_KEY_STRING_WITH_NULL] = {
 105     {0xb3,0xaa,0xc0,0xc7,0}, // Signatures - Sent from my ... #1
 106     {0xbf,0xa1,0xbc,0xad,0x20,0xba,0xb8,0xb3,0xbf,0}, // Signatures - Sent from my ... #2
 107     {0xc0,0xfc,0xb4,0xde,0xb5,0xc8,0x20,0xb8,0xde,0xbd,0xc3,0xc1,0xf6,0}, // forward
 108     {0}
 109 };
 110 static const uint8_t keyStrings_big5[][MAX_KEY_STRING_WITH_NULL] = {
 111     {0xb1,0x71,0xa7,0xda,0xaa,0xba,0}, // Signatures - Sent from my ... #1
 112     {0xb6,0xc7,0xb0,0x65,0}, // Signatures - Sent from my ... #2
 113     {0xb6,0x7d,0xa9,0x6c,0xc2,0xe0,0xb1,0x48,0xb6,0x6c,0xa5,0xf3,0}, // forward
 114     {0}
 115 };
 116 static const uint8_t keyStrings_gb_18030[][MAX_KEY_STRING_WITH_NULL] = {
 117     {0xb7,0xa2,0xd7,0xd4,0xce,0xd2,0xb5,0xc4,0}, // Signatures - Sent from my iP...
 118     {0xd7,0xaa,0xb7,0xa2,0xb5,0xc4,0xd3,0xca,0xbc,0xfe,0}, // forward
 119     {0}
 120 };
 121 #endif
 122
 123 static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
 124 {
 125     int32_t start = 0, end = len-1;
 126     int32_t mid = (start+end)/2;
 127
 128     while(start <= end) {
 129         if(array[mid] == value) {
 130             return mid;
 131         }
 132
 133         if(array[mid] < value){
 134             start = mid+1;
 135         } else {
 136             end = mid-1;
 137         }
 138
 139         mid = (start+end)/2;
 140     }
 141
 142     return -1;
 143 }
 144
 145 #if U_PLATFORM_IS_DARWIN_BASED
 146 // If testPrefix is a prefix of base, return its length, else return 0
 147 static int32_t isPrefix(const uint8_t *testPrefix, const uint8_t *base, const uint8_t *baseLimit) {
 148     const uint8_t *testPrefixStart = testPrefix;
 149     while (*testPrefix != 0 && base < baseLimit && *testPrefix == *base) {
 150         testPrefix++;
 151         base++;
 152     }
 153     return (*testPrefix == 0)? (int32_t)(testPrefix-testPrefixStart): 0;
 154 }
 155 #endif
 156
 157 IteratedChar::IteratedChar() :
 158 charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
 159 {
 160     // nothing else to do.
 161 }
 162
 163 /*void IteratedChar::reset()
 164 {
 165     charValue = 0;
 166     index     = -1;
 167     nextIndex = 0;
 168     error     = FALSE;
 169     done      = FALSE;
 170 }*/
 171
 172 int32_t IteratedChar::nextByte(InputText *det)
 173 {
 174     if (nextIndex >= det->fRawLength) {
 175         done = TRUE;
 176
 177         return -1;
 178     }
 179
 180     return det->fRawInput[nextIndex++];
 181 }
 182
 183 CharsetRecog_mbcs::~CharsetRecog_mbcs()
 184 {
 185     // nothing to do.
 186 }
 187
 188 #if U_PLATFORM_IS_DARWIN_BASED
 189 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen, const uint8_t (*keyStrings)[MAX_KEY_STRING_WITH_NULL] ) const {
 190 #else
 191 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
 192 #endif
 193     int32_t singleByteCharCount = 0;
 194     int32_t doubleByteCharCount = 0;
 195     int32_t commonCharCount     = 0;
 196     int32_t badCharCount        = 0;
 197     int32_t totalCharCount      = 0;
 198     int32_t confidence          = 0;
 199 #if U_PLATFORM_IS_DARWIN_BASED
 200     int32_t confidenceFromKeys  = 0;
 201 #endif
 202     IteratedChar iter;
 203
 204     while (nextChar(&iter, det)) {
 205         totalCharCount++;
 206
 207         if (iter.error) {
 208             badCharCount++;
 209         } else {
 210             if (iter.charValue <= 0xFF) {
 211                 singleByteCharCount++;
 212             } else {
 213                 doubleByteCharCount++;
 214
 215                 if (commonChars != 0) {
 216                     if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){
 217                         commonCharCount += 1;
 218                     }
 219                 }
 220 #if U_PLATFORM_IS_DARWIN_BASED
 221                 if (doubleByteCharCount <= 20) {
 222                     int32_t keyIndex;
 223                     for ( keyIndex = 0; keyStrings[keyIndex][0] != 0; keyIndex++ ) {
 224                         int32_t prefixLen = isPrefix(keyStrings[keyIndex], &det->fRawInput[iter.index], &det->fRawInput[det->fRawLength]);
 225                         confidenceFromKeys += prefixLen*5;
 226                     }
 227                 }
 228 #endif
 229             }
 230         }
 231
 232
 233         if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
 234             // Bail out early if the byte data is not matching the encoding scheme.
 235             // break detectBlock;
 236             return confidence;
 237         }
 238     }
 239
 240     if (doubleByteCharCount <= 10 && badCharCount == 0) {
 241         // Not many multi-byte chars.
 242         if (doubleByteCharCount == 0 && totalCharCount < 10) {
 243             // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
 244             // We don't have enough data to have any confidence.
 245             // Statistical analysis of single byte non-ASCII charcters would probably help here.
 246             confidence = 0;
 247         }
 248         else {
 249             //   ASCII or ISO file?  It's probably not our encoding,
 250             //   but is not incompatible with our encoding, so don't give it a zero.
 251 #if U_PLATFORM_IS_DARWIN_BASED
 252             if (confidenceFromKeys > 90) {
 253                 confidenceFromKeys = 90;
 254             } else if (confidenceFromKeys > 0 && confidenceFromKeys < 70) {
 255                 confidenceFromKeys += 20;
 256             }
 257             confidence = 10 + confidenceFromKeys;
 258 #else
 259             confidence = 10;
 260 #endif
 261         }
 262
 263         return confidence;
 264     }
 265
 266     //
 267     //  No match if there are too many characters that don't fit the encoding scheme.
 268     //    (should we have zero tolerance for these?)
 269     //
 270     if (doubleByteCharCount < 20*badCharCount) {
 271         confidence = 0;
 272
 273         return confidence;
 274     }
 275
 276     if (commonChars == 0) {
 277         // We have no statistics on frequently occuring characters.
 278         //  Assess confidence purely on having a reasonable number of
 279         //  multi-byte characters (the more the better)
 280         confidence = 30 + doubleByteCharCount - 20*badCharCount;
 281 #if U_PLATFORM_IS_DARWIN_BASED
 282         confidence += confidenceFromKeys;
 283 #endif
 284
 285         if (confidence > 100) {
 286             confidence = 100;
 287         }
 288     } else {
 289         //
 290         // Frequency of occurence statistics exist.
 291         //
 292
 293         double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/
 294         double scaleFactor = 90.0 / maxVal;
 295         confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);
 296 #if U_PLATFORM_IS_DARWIN_BASED
 297         confidence += confidenceFromKeys;
 298 #endif
 299
 300         confidence = min(confidence, 100);
 301     }
 302
 303     if (confidence < 0) {
 304         confidence = 0;
 305     }
 306
 307     return confidence;
 308 }
 309
 310 CharsetRecog_sjis::~CharsetRecog_sjis()
 311 {
 312     // nothing to do
 313 }
 314
 315 UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
 316     it->index = it->nextIndex;
 317     it->error = FALSE;
 318
 319     int32_t firstByte = it->charValue = it->nextByte(det);
 320
 321     if (firstByte < 0) {
 322         return FALSE;
 323     }
 324
 325     if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
 326         return TRUE;
 327     }
 328
 329     int32_t secondByte = it->nextByte(det);
 330     if (secondByte >= 0) {
 331         it->charValue = (firstByte << 8) | secondByte;
 332     }
 333     // else we'll handle the error later.
 334
 335     if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
 336         // Illegal second byte value.
 337         it->error = TRUE;
 338     }
 339
 340     return TRUE;
 341 }
 342
 343 UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
 344 #if U_PLATFORM_IS_DARWIN_BASED
 345     int32_t confidence = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis), keyStrings_sjis);
 346 #else
 347     int32_t confidence = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis));
 348 #endif
 349     results->set(det, this, confidence);
 350     return (confidence > 0);
 351 }
 352
 353 const char *CharsetRecog_sjis::getName() const
 354 {
 355     return "Shift_JIS";
 356 }
 357
 358 const char *CharsetRecog_sjis::getLanguage() const
 359 {
 360     return "ja";
 361 }
 362
 363 CharsetRecog_euc::~CharsetRecog_euc()
 364 {
 365     // nothing to do
 366 }
 367
 368 UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
 369     int32_t firstByte  = 0;
 370     int32_t secondByte = 0;
 371     int32_t thirdByte  = 0;
 372
 373     it->index = it->nextIndex;
 374     it->error = FALSE;
 375     firstByte = it->charValue = it->nextByte(det);
 376
 377     if (firstByte < 0) {
 378         // Ran off the end of the input data
 379         return FALSE;
 380     }
 381
 382     if (firstByte <= 0x8D) {
 383         // single byte char
 384         return TRUE;
 385     }
 386
 387     secondByte = it->nextByte(det);
 388     if (secondByte >= 0) {
 389         it->charValue = (it->charValue << 8) | secondByte;
 390     }
 391     // else we'll handle the error later.
 392
 393     if (firstByte >= 0xA1 && firstByte <= 0xFE) {
 394         // Two byte Char
 395         if (secondByte < 0xA1) {
 396             it->error = TRUE;
 397         }
 398
 399         return TRUE;
 400     }
 401
 402     if (firstByte == 0x8E) {
 403         // Code Set 2.
 404         //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
 405         //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
 406         // We don't know which we've got.
 407         // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
 408         //   bytes will look like a well formed 2 byte char.
 409         if (secondByte < 0xA1) {
 410             it->error = TRUE;
 411         }
 412
 413         return TRUE;
 414     }
 415
 416     if (firstByte == 0x8F) {
 417         // Code set 3.
 418         // Three byte total char size, two bytes of actual char value.
 419         thirdByte    = it->nextByte(det);
 420         it->charValue = (it->charValue << 8) | thirdByte;
 421
 422         if (thirdByte < 0xa1) {
 423             // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
 424             it->error = TRUE;
 425         }
 426     }
 427
 428     return TRUE;
 429
 430 }
 431
 432 CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
 433 {
 434     // nothing to do
 435 }
 436
 437 const char *CharsetRecog_euc_jp::getName() const
 438 {
 439     return "EUC-JP";
 440 }
 441
 442 const char *CharsetRecog_euc_jp::getLanguage() const
 443 {
 444     return "ja";
 445 }
 446
 447 UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const
 448 {
 449 #if U_PLATFORM_IS_DARWIN_BASED
 450     int32_t confidence = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp), keyStrings_euc_jp);
 451 #else
 452     int32_t confidence = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp));
 453 #endif
 454     results->set(det, this, confidence);
 455     return (confidence > 0);
 456 }
 457
 458 CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
 459 {
 460     // nothing to do
 461 }
 462
 463 const char *CharsetRecog_euc_kr::getName() const
 464 {
 465     return "EUC-KR";
 466 }
 467
 468 const char *CharsetRecog_euc_kr::getLanguage() const
 469 {
 470     return "ko";
 471 }
 472
 473 UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const
 474 {
 475 #if U_PLATFORM_IS_DARWIN_BASED
 476     int32_t confidence =  match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr), keyStrings_euc_kr);
 477 #else
 478     int32_t confidence =  match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr));
 479 #endif
 480     results->set(det, this, confidence);
 481     return (confidence > 0);
 482 }
 483
 484 CharsetRecog_big5::~CharsetRecog_big5()
 485 {
 486     // nothing to do
 487 }
 488
 489 UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
 490 {
 491     int32_t firstByte;
 492
 493     it->index = it->nextIndex;
 494     it->error = FALSE;
 495     firstByte = it->charValue = it->nextByte(det);
 496
 497     if (firstByte < 0) {
 498         return FALSE;
 499     }
 500
 501     if (firstByte <= 0x7F || firstByte == 0xFF) {
 502         // single byte character.
 503         return TRUE;
 504     }
 505
 506     int32_t secondByte = it->nextByte(det);
 507     if (secondByte >= 0)  {
 508         it->charValue = (it->charValue << 8) | secondByte;
 509     }
 510     // else we'll handle the error later.
 511
 512     if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
 513         it->error = TRUE;
 514     }
 515
 516     return TRUE;
 517 }
 518
 519 const char *CharsetRecog_big5::getName() const
 520 {
 521     return "Big5";
 522 }
 523
 524 const char *CharsetRecog_big5::getLanguage() const
 525 {
 526     return "zh";
 527 }
 528
 529 UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const
 530 {
 531 #if U_PLATFORM_IS_DARWIN_BASED
 532     int32_t confidence = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5), keyStrings_big5);
 533 #else
 534     int32_t confidence = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5));
 535 #endif
 536     results->set(det, this, confidence);
 537     return (confidence > 0);
 538 }
 539
 540 CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
 541 {
 542     // nothing to do
 543 }
 544
 545 UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
 546     int32_t firstByte  = 0;
 547     int32_t secondByte = 0;
 548     int32_t thirdByte  = 0;
 549     int32_t fourthByte = 0;
 550
 551     it->index = it->nextIndex;
 552     it->error = FALSE;
 553     firstByte = it->charValue = it->nextByte(det);
 554
 555     if (firstByte < 0) {
 556         // Ran off the end of the input data
 557         return FALSE;
 558     }
 559
 560     if (firstByte <= 0x80) {
 561         // single byte char
 562         return TRUE;
 563     }
 564
 565     secondByte = it->nextByte(det);
 566     if (secondByte >= 0) {
 567         it->charValue = (it->charValue << 8) | secondByte;
 568     }
 569     // else we'll handle the error later.
 570
 571     if (firstByte >= 0x81 && firstByte <= 0xFE) {
 572         // Two byte Char
 573         if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
 574             return TRUE;
 575         }
 576
 577         // Four byte char
 578         if (secondByte >= 0x30 && secondByte <= 0x39) {
 579             thirdByte = it->nextByte(det);
 580
 581             if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
 582                 fourthByte = it->nextByte(det);
 583
 584                 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
 585                     it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
 586
 587                     return TRUE;
 588                 }
 589             }
 590         }
 591
 592         // Something wasn't valid, or we ran out of data (-1).
 593         it->error = TRUE;
 594     }
 595
 596     return TRUE;
 597 }
 598
 599 const char *CharsetRecog_gb_18030::getName() const
 600 {
 601     return "GB18030";
 602 }
 603
 604 const char *CharsetRecog_gb_18030::getLanguage() const
 605 {
 606     return "zh";
 607 }
 608
 609 UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const
 610 {
 611 #if U_PLATFORM_IS_DARWIN_BASED
 612     int32_t confidence = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030), keyStrings_gb_18030);
 613 #else
 614     int32_t confidence = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030));
 615 #endif
 616     results->set(det, this, confidence);
 617     return (confidence > 0);
 618 }
 619
 620 U_NAMESPACE_END
 621 #endif