icuSources/i18n/csrmbcs.cpp

   1 /*
   2  **********************************************************************
   3  *   Copyright (C) 2005-2012, International Business Machines
   4  *   Corporation and others.  All Rights Reserved.
   5  **********************************************************************
   6  */
   7
   8 #include "unicode/utypes.h"
   9
  10 #if !UCONFIG_NO_CONVERSION
  11
  12 #include "csrmbcs.h"
  13
  14 #include <math.h>
  15
  16 U_NAMESPACE_BEGIN
  17
  18 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
  19
  20 #define min(x,y) (((x)<(y))?(x):(y))
  21
// Two-byte code values counted as "common" characters when scoring input
// against Shift_JIS (see CharsetRecog_sjis::match).
// Entries are listed in ascending order; match_mbcs() looks values up with
// binarySearch(), which relies on that ordering.
static const uint16_t commonChars_sjis [] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
  32
// Two-byte code values counted as "common" characters when scoring input
// against EUC-JP (see CharsetRecog_euc_jp::match).
// Entries are listed in ascending order; match_mbcs() looks values up with
// binarySearch(), which relies on that ordering.
static const uint16_t commonChars_euc_jp[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
  47
// Two-byte code values counted as "common" characters when scoring input
// against EUC-KR (see CharsetRecog_euc_kr::match).
// Entries are listed in ascending order; match_mbcs() looks values up with
// binarySearch(), which relies on that ordering.
static const uint16_t commonChars_euc_kr[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
  62
// Two-byte code values counted as "common" characters when scoring input
// against Big5 (see CharsetRecog_big5::match).
// Entries are listed in ascending order; match_mbcs() looks values up with
// binarySearch(), which relies on that ordering.
static const uint16_t commonChars_big5[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
  77
// Two-byte code values counted as "common" characters when scoring input
// against GB18030 (see CharsetRecog_gb_18030::match).
// Entries are listed in ascending order; match_mbcs() looks values up with
// binarySearch(), which relies on that ordering.
static const uint16_t commonChars_gb_18030[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
  92
  93 #if U_PLATFORM_IS_DARWIN_BASED
// Key byte strings passed to match_mbcs() by CharsetRecog_sjis::match; a
// match at the start of a multi-byte character raises the confidence.
// Each row is a zero-terminated byte string and a row whose first byte is 0
// ends the list (that is where match_mbcs() stops scanning).
// NOTE(review): presumably Shift_JIS-encoded mail phrases, per the row
// comments - confirm against the data's origin.
static const uint8_t keyStrings_sjis[][MAX_KEY_STRING_WITH_NULL] = {
    {0x82,0xa9,0x82,0xe7,0x91,0x97,0x90,0x4d,0}, // Signatures - Sent from my ...
    {0x93,0x5d,0x91,0x97,0x83,0x81,0x83,0x62,0x83,0x5a,0x81,0x5b,0x83,0x57,0}, // forward
    {0} // end-of-list sentinel
};
// Key byte strings passed to match_mbcs() by CharsetRecog_euc_jp::match; a
// match at the start of a multi-byte character raises the confidence.
// Each row is a zero-terminated byte string and a row whose first byte is 0
// ends the list (that is where match_mbcs() stops scanning).
// NOTE(review): presumably EUC-JP-encoded mail phrases, per the row
// comments - confirm against the data's origin.
static const uint8_t keyStrings_euc_jp[][MAX_KEY_STRING_WITH_NULL] = {
    {0xa4,0xab,0xa4,0xe9,0xc1,0xf7,0xbf,0xae,0}, // Signatures - Sent from my ...
    {0xc5,0xbe,0xc1,0xf7,0xa5,0xe1,0xa5,0xc3,0xa5,0xbb,0xa1,0xbc,0xa5,0xb8,0}, // forward
    {0} // end-of-list sentinel
};
// Key byte strings passed to match_mbcs() by CharsetRecog_euc_kr::match; a
// match at the start of a multi-byte character raises the confidence.
// Each row is a zero-terminated byte string and a row whose first byte is 0
// ends the list (that is where match_mbcs() stops scanning).
// NOTE(review): presumably EUC-KR-encoded mail phrases, per the row
// comments - confirm against the data's origin.
static const uint8_t keyStrings_euc_kr[][MAX_KEY_STRING_WITH_NULL] = {
    {0xb3,0xaa,0xc0,0xc7,0}, // Signatures - Sent from my ... #1
    {0xbf,0xa1,0xbc,0xad,0x20,0xba,0xb8,0xb3,0xbf,0}, // Signatures - Sent from my ... #2
    {0xc0,0xfc,0xb4,0xde,0xb5,0xc8,0x20,0xb8,0xde,0xbd,0xc3,0xc1,0xf6,0}, // forward
    {0} // end-of-list sentinel
};
// Key byte strings passed to match_mbcs() by CharsetRecog_big5::match; a
// match at the start of a multi-byte character raises the confidence.
// Each row is a zero-terminated byte string and a row whose first byte is 0
// ends the list (that is where match_mbcs() stops scanning).
// NOTE(review): presumably Big5-encoded mail phrases, per the row
// comments - confirm against the data's origin.
static const uint8_t keyStrings_big5[][MAX_KEY_STRING_WITH_NULL] = {
    {0xb1,0x71,0xa7,0xda,0xaa,0xba,0}, // Signatures - Sent from my ... #1
    {0xb6,0xc7,0xb0,0x65,0}, // Signatures - Sent from my ... #2
    {0xb6,0x7d,0xa9,0x6c,0xc2,0xe0,0xb1,0x48,0xb6,0x6c,0xa5,0xf3,0}, // forward
    {0} // end-of-list sentinel
};
// Key byte strings passed to match_mbcs() by CharsetRecog_gb_18030::match; a
// match at the start of a multi-byte character raises the confidence.
// Each row is a zero-terminated byte string and a row whose first byte is 0
// ends the list (that is where match_mbcs() stops scanning).
// NOTE(review): presumably GB18030-encoded mail phrases, per the row
// comments - confirm against the data's origin.
static const uint8_t keyStrings_gb_18030[][MAX_KEY_STRING_WITH_NULL] = {
    {0xb7,0xa2,0xd7,0xd4,0xce,0xd2,0xb5,0xc4,0}, // Signatures - Sent from my iP...
    {0xd7,0xaa,0xb7,0xa2,0xb5,0xc4,0xd3,0xca,0xbc,0xfe,0}, // forward
    {0} // end-of-list sentinel
};
 121 #endif
 122
 123 static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
 124 {
 125     int32_t start = 0, end = len-1;
 126     int32_t mid = (start+end)/2;
 127
 128     while(start <= end) {
 129         if(array[mid] == value) {
 130             return mid;
 131         }
 132
 133         if(array[mid] < value){
 134             start = mid+1;
 135         } else {
 136             end = mid-1;
 137         }
 138
 139         mid = (start+end)/2;
 140     }
 141
 142     return -1;
 143 }
 144
 145 #if U_PLATFORM_IS_DARWIN_BASED
 146 // If testPrefix is a prefix of base, return its length, else return 0
 147 static int32_t isPrefix(const uint8_t *testPrefix, const uint8_t *base, const uint8_t *baseLimit) {
 148     const uint8_t *testPrefixStart = testPrefix;
 149     while (*testPrefix != 0 && base < baseLimit && *testPrefix == *base) {
 150         testPrefix++;
 151         base++;
 152     }
 153     return (*testPrefix == 0)? (int32_t)(testPrefix-testPrefixStart): 0;
 154 }
 155 #endif
 156
 157 IteratedChar::IteratedChar() :
 158 charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
 159 {
 160     // nothing else to do.
 161 }
 162
 163 /*void IteratedChar::reset()
 164 {
 165     charValue = 0;
 166     index     = -1;
 167     nextIndex = 0;
 168     error     = FALSE;
 169     done      = FALSE;
 170 }*/
 171
 172 int32_t IteratedChar::nextByte(InputText *det)
 173 {
 174     if (nextIndex >= det->fRawLength) {
 175         done = TRUE;
 176
 177         return -1;
 178     }
 179
 180     return det->fRawInput[nextIndex++];
 181 }
 182
// Destructor. This file gives it an empty body: there is nothing to
// release here.
CharsetRecog_mbcs::~CharsetRecog_mbcs()
{
    // nothing to do.
}
 187
 188 #if U_PLATFORM_IS_DARWIN_BASED
 189 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen, const uint8_t (*keyStrings)[MAX_KEY_STRING_WITH_NULL] ) {
 190 #else
 191 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) {
 192 #endif
 193     int32_t singleByteCharCount = 0;
 194     int32_t doubleByteCharCount = 0;
 195     int32_t commonCharCount     = 0;
 196     int32_t badCharCount        = 0;
 197     int32_t totalCharCount      = 0;
 198     int32_t confidence          = 0;
 199 #if U_PLATFORM_IS_DARWIN_BASED
 200     int32_t confidenceFromKeys  = 0;
 201 #endif
 202     IteratedChar iter;
 203
 204     while (nextChar(&iter, det)) {
 205         totalCharCount++;
 206
 207         if (iter.error) {
 208             badCharCount++;
 209         } else {
 210             if (iter.charValue <= 0xFF) {
 211                 singleByteCharCount++;
 212             } else {
 213                 doubleByteCharCount++;
 214
 215                 if (commonChars != 0) {
 216                     if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){
 217                         commonCharCount += 1;
 218                     }
 219                 }
 220 #if U_PLATFORM_IS_DARWIN_BASED
 221                 if (doubleByteCharCount <= 20) {
 222                     int32_t keyIndex;
 223                     for ( keyIndex = 0; keyStrings[keyIndex][0] != 0; keyIndex++ ) {
 224                         int32_t prefixLen = isPrefix(keyStrings[keyIndex], &det->fRawInput[iter.index], &det->fRawInput[det->fRawLength]);
 225                         confidenceFromKeys += prefixLen*5;
 226                     }
 227                 }
 228 #endif
 229             }
 230         }
 231
 232
 233         if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
 234             // Bail out early if the byte data is not matching the encoding scheme.
 235             // break detectBlock;
 236             return confidence;
 237         }
 238     }
 239
 240     if (doubleByteCharCount <= 10 && badCharCount == 0) {
 241         // Not many multi-byte chars.
 242         if (doubleByteCharCount == 0 && totalCharCount < 10) {
 243             // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
 244             // We don't have enough data to have any confidence.
 245             // Statistical analysis of single byte non-ASCII charcters would probably help here.
 246             confidence = 0;
 247         }
 248         else {
 249             //   ASCII or ISO file?  It's probably not our encoding,
 250             //   but is not incompatible with our encoding, so don't give it a zero.
 251 #if U_PLATFORM_IS_DARWIN_BASED
 252             if (confidenceFromKeys > 90) {
 253                 confidenceFromKeys = 90;
 254             } else if (confidenceFromKeys > 0 && confidenceFromKeys < 70) {
 255                 confidenceFromKeys += 20;
 256             }
 257             confidence = 10 + confidenceFromKeys;
 258 #else
 259             confidence = 10;
 260 #endif
 261         }
 262
 263         return confidence;
 264     }
 265
 266     //
 267     //  No match if there are too many characters that don't fit the encoding scheme.
 268     //    (should we have zero tolerance for these?)
 269     //
 270     if (doubleByteCharCount < 20*badCharCount) {
 271         confidence = 0;
 272
 273         return confidence;
 274     }
 275
 276     if (commonChars == 0) {
 277         // We have no statistics on frequently occuring characters.
 278         //  Assess confidence purely on having a reasonable number of
 279         //  multi-byte characters (the more the better)
 280         confidence = 30 + doubleByteCharCount - 20*badCharCount;
 281 #if U_PLATFORM_IS_DARWIN_BASED
 282         confidence += confidenceFromKeys;
 283 #endif
 284
 285         if (confidence > 100) {
 286             confidence = 100;
 287         }
 288     } else {
 289         //
 290         // Frequency of occurence statistics exist.
 291         //
 292
 293         double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/
 294         double scaleFactor = 90.0 / maxVal;
 295         confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);
 296 #if U_PLATFORM_IS_DARWIN_BASED
 297         confidence += confidenceFromKeys;
 298 #endif
 299
 300         confidence = min(confidence, 100);
 301     }
 302
 303     if (confidence < 0) {
 304         confidence = 0;
 305     }
 306
 307     return confidence;
 308 }
 309
// Destructor. This file gives it an empty body: there is nothing to
// release here.
CharsetRecog_sjis::~CharsetRecog_sjis()
{
    // nothing to do
}
 314
 315 UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) {
 316     it->index = it->nextIndex;
 317     it->error = FALSE;
 318
 319     int32_t firstByte = it->charValue = it->nextByte(det);
 320
 321     if (firstByte < 0) {
 322         return FALSE;
 323     }
 324
 325     if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
 326         return TRUE;
 327     }
 328
 329     int32_t secondByte = it->nextByte(det);
 330     if (secondByte >= 0) {
 331         it->charValue = (firstByte << 8) | secondByte;
 332     }
 333     // else we'll handle the error later.
 334
 335     if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
 336         // Illegal second byte value.
 337         it->error = TRUE;
 338     }
 339
 340     return TRUE;
 341 }
 342
 343 int32_t CharsetRecog_sjis::match(InputText* det)
 344 {
 345 #if U_PLATFORM_IS_DARWIN_BASED
 346     return match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis), keyStrings_sjis);
 347 #else
 348     return match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis));
 349 #endif
 350 }
 351
// Returns the constant string "Shift_JIS", the charset name this
// recognizer reports.
const char *CharsetRecog_sjis::getName() const
{
    return "Shift_JIS";
}
 356
// Returns the constant language code "ja" associated with this recognizer.
const char *CharsetRecog_sjis::getLanguage() const
{
    return "ja";
}
 361
// Destructor. This file gives it an empty body: there is nothing to
// release here.
CharsetRecog_euc::~CharsetRecog_euc()
{
    // nothing to do
}
 366
 367 UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) {
 368     int32_t firstByte  = 0;
 369     int32_t secondByte = 0;
 370     int32_t thirdByte  = 0;
 371
 372     it->index = it->nextIndex;
 373     it->error = FALSE;
 374     firstByte = it->charValue = it->nextByte(det);
 375
 376     if (firstByte < 0) {
 377         // Ran off the end of the input data
 378         return FALSE;
 379     }
 380
 381     if (firstByte <= 0x8D) {
 382         // single byte char
 383         return TRUE;
 384     }
 385
 386     secondByte = it->nextByte(det);
 387     if (secondByte >= 0) {
 388         it->charValue = (it->charValue << 8) | secondByte;
 389     }
 390     // else we'll handle the error later.
 391
 392     if (firstByte >= 0xA1 && firstByte <= 0xFE) {
 393         // Two byte Char
 394         if (secondByte < 0xA1) {
 395             it->error = TRUE;
 396         }
 397
 398         return TRUE;
 399     }
 400
 401     if (firstByte == 0x8E) {
 402         // Code Set 2.
 403         //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
 404         //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
 405         // We don't know which we've got.
 406         // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
 407         //   bytes will look like a well formed 2 byte char.
 408         if (secondByte < 0xA1) {
 409             it->error = TRUE;
 410         }
 411
 412         return TRUE;
 413     }
 414
 415     if (firstByte == 0x8F) {
 416         // Code set 3.
 417         // Three byte total char size, two bytes of actual char value.
 418         thirdByte    = it->nextByte(det);
 419         it->charValue = (it->charValue << 8) | thirdByte;
 420
 421         if (thirdByte < 0xa1) {
 422             // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
 423             it->error = TRUE;
 424         }
 425     }
 426
 427     return TRUE;
 428
 429 }
 430
// Destructor. This file gives it an empty body: there is nothing to
// release here.
CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
{
    // nothing to do
}
 435
// Returns the constant string "EUC-JP", the charset name this recognizer
// reports.
const char *CharsetRecog_euc_jp::getName() const
{
    return "EUC-JP";
}
 440
// Returns the constant language code "ja" associated with this recognizer.
const char *CharsetRecog_euc_jp::getLanguage() const
{
    return "ja";
}
 445
 446 int32_t CharsetRecog_euc_jp::match(InputText *det)
 447 {
 448 #if U_PLATFORM_IS_DARWIN_BASED
 449     return match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp), keyStrings_euc_jp);
 450 #else
 451     return match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp));
 452 #endif
 453 }
 454
// Destructor. This file gives it an empty body: there is nothing to
// release here.
CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
{
    // nothing to do
}
 459
// Returns the constant string "EUC-KR", the charset name this recognizer
// reports.
const char *CharsetRecog_euc_kr::getName() const
{
    return "EUC-KR";
}
 464
// Returns the constant language code "ko" associated with this recognizer.
const char *CharsetRecog_euc_kr::getLanguage() const
{
    return "ko";
}
 469
 470 int32_t CharsetRecog_euc_kr::match(InputText *det)
 471 {
 472 #if U_PLATFORM_IS_DARWIN_BASED
 473     return match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr), keyStrings_euc_kr);
 474 #else
 475     return match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr));
 476 #endif
 477 }
 478
// Destructor. This file gives it an empty body: there is nothing to
// release here.
CharsetRecog_big5::~CharsetRecog_big5()
{
    // nothing to do
}
 483
 484 UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det)
 485 {
 486     int32_t firstByte;
 487
 488     it->index = it->nextIndex;
 489     it->error = FALSE;
 490     firstByte = it->charValue = it->nextByte(det);
 491
 492     if (firstByte < 0) {
 493         return FALSE;
 494     }
 495
 496     if (firstByte <= 0x7F || firstByte == 0xFF) {
 497         // single byte character.
 498         return TRUE;
 499     }
 500
 501     int32_t secondByte = it->nextByte(det);
 502     if (secondByte >= 0)  {
 503         it->charValue = (it->charValue << 8) | secondByte;
 504     }
 505     // else we'll handle the error later.
 506
 507     if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
 508         it->error = TRUE;
 509     }
 510
 511     return TRUE;
 512 }
 513
// Returns the constant string "Big5", the charset name this recognizer
// reports.
const char *CharsetRecog_big5::getName() const
{
    return "Big5";
}
 518
// Returns the constant language code "zh" associated with this recognizer.
const char *CharsetRecog_big5::getLanguage() const
{
    return "zh";
}
 523
 524 int32_t CharsetRecog_big5::match(InputText *det)
 525 {
 526 #if U_PLATFORM_IS_DARWIN_BASED
 527     return match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5), keyStrings_big5);
 528 #else
 529     return match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5));
 530 #endif
 531 }
 532
// Destructor. This file gives it an empty body: there is nothing to
// release here.
CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
{
    // nothing to do
}
 537
 538 UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) {
 539     int32_t firstByte  = 0;
 540     int32_t secondByte = 0;
 541     int32_t thirdByte  = 0;
 542     int32_t fourthByte = 0;
 543
 544     it->index = it->nextIndex;
 545     it->error = FALSE;
 546     firstByte = it->charValue = it->nextByte(det);
 547
 548     if (firstByte < 0) {
 549         // Ran off the end of the input data
 550         return FALSE;
 551     }
 552
 553     if (firstByte <= 0x80) {
 554         // single byte char
 555         return TRUE;
 556     }
 557
 558     secondByte = it->nextByte(det);
 559     if (secondByte >= 0) {
 560         it->charValue = (it->charValue << 8) | secondByte;
 561     }
 562     // else we'll handle the error later.
 563
 564     if (firstByte >= 0x81 && firstByte <= 0xFE) {
 565         // Two byte Char
 566         if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
 567             return TRUE;
 568         }
 569
 570         // Four byte char
 571         if (secondByte >= 0x30 && secondByte <= 0x39) {
 572             thirdByte = it->nextByte(det);
 573
 574             if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
 575                 fourthByte = it->nextByte(det);
 576
 577                 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
 578                     it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
 579
 580                     return TRUE;
 581                 }
 582             }
 583         }
 584
 585         // Something wasn't valid, or we ran out of data (-1).
 586         it->error = TRUE;
 587     }
 588
 589     return TRUE;
 590 }
 591
// Returns the constant string "GB18030", the charset name this recognizer
// reports.
const char *CharsetRecog_gb_18030::getName() const
{
    return "GB18030";
}
 596
// Returns the constant language code "zh" associated with this recognizer.
const char *CharsetRecog_gb_18030::getLanguage() const
{
    return "zh";
}
 601
 602 int32_t CharsetRecog_gb_18030::match(InputText *det)
 603 {
 604 #if U_PLATFORM_IS_DARWIN_BASED
 605     return match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030), keyStrings_gb_18030);
 606 #else
 607     return match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030));
 608 #endif
 609 }
 610
 611 U_NAMESPACE_END
 612 #endif