]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/csrmbcs.cpp
ICU-491.11.1.tar.gz
[apple/icu.git] / icuSources / i18n / csrmbcs.cpp
1 /*
2 **********************************************************************
3 * Copyright (C) 2005-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_CONVERSION
11
12 #include "csrmbcs.h"
13
14 #include <math.h>
15
16 U_NAMESPACE_BEGIN
17
18 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
19
20 #define min(x,y) (((x)<(y))?(x):(y))
21
22 static const uint16_t commonChars_sjis [] = {
23 // TODO: This set of data comes from the character frequency-
24 // of-occurrence analysis tool. The data needs to be moved
25 // into a resource and loaded from there.
26 0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
27 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
28 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
29 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
30 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
31 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
32
33 static const uint16_t commonChars_euc_jp[] = {
34 // TODO: This set of data comes from the character frequency-
35 // of-occurrence analysis tool. The data needs to be moved
36 // into a resource and loaded from there.
37 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
38 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
39 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
40 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
41 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
42 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
43 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
44 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
45 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
46 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
47
48 static const uint16_t commonChars_euc_kr[] = {
49 // TODO: This set of data comes from the character frequency-
50 // of-occurrence analysis tool. The data needs to be moved
51 // into a resource and loaded from there.
52 0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
53 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
54 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
55 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
56 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
57 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
58 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
59 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
60 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
61 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
62
63 static const uint16_t commonChars_big5[] = {
64 // TODO: This set of data comes from the character frequency-
65 // of-occurrence analysis tool. The data needs to be moved
66 // into a resource and loaded from there.
67 0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
68 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
69 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
70 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
71 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
72 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
73 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
74 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
75 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
76 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
77
78 static const uint16_t commonChars_gb_18030[] = {
79 // TODO: This set of data comes from the character frequency-
80 // of-occurrence analysis tool. The data needs to be moved
81 // into a resource and loaded from there.
82 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
83 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
84 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
85 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
86 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
87 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
88 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
89 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
90 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
91 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
92
93 #if U_PLATFORM_IS_DARWIN_BASED
94 static const uint8_t keyStrings_sjis[][MAX_KEY_STRING_WITH_NULL] = {
95 {0x82,0xa9,0x82,0xe7,0x91,0x97,0x90,0x4d,0}, // Signatures - Sent from my ...
96 {0x93,0x5d,0x91,0x97,0x83,0x81,0x83,0x62,0x83,0x5a,0x81,0x5b,0x83,0x57,0}, // forward
97 {0}
98 };
99 static const uint8_t keyStrings_euc_jp[][MAX_KEY_STRING_WITH_NULL] = {
100 {0xa4,0xab,0xa4,0xe9,0xc1,0xf7,0xbf,0xae,0}, // Signatures - Sent from my ...
101 {0xc5,0xbe,0xc1,0xf7,0xa5,0xe1,0xa5,0xc3,0xa5,0xbb,0xa1,0xbc,0xa5,0xb8,0}, // forward
102 {0}
103 };
104 static const uint8_t keyStrings_euc_kr[][MAX_KEY_STRING_WITH_NULL] = {
105 {0xb3,0xaa,0xc0,0xc7,0}, // Signatures - Sent from my ... #1
106 {0xbf,0xa1,0xbc,0xad,0x20,0xba,0xb8,0xb3,0xbf,0}, // Signatures - Sent from my ... #2
107 {0xc0,0xfc,0xb4,0xde,0xb5,0xc8,0x20,0xb8,0xde,0xbd,0xc3,0xc1,0xf6,0}, // forward
108 {0}
109 };
110 static const uint8_t keyStrings_big5[][MAX_KEY_STRING_WITH_NULL] = {
111 {0xb1,0x71,0xa7,0xda,0xaa,0xba,0}, // Signatures - Sent from my ... #1
112 {0xb6,0xc7,0xb0,0x65,0}, // Signatures - Sent from my ... #2
113 {0xb6,0x7d,0xa9,0x6c,0xc2,0xe0,0xb1,0x48,0xb6,0x6c,0xa5,0xf3,0}, // forward
114 {0}
115 };
116 static const uint8_t keyStrings_gb_18030[][MAX_KEY_STRING_WITH_NULL] = {
117 {0xb7,0xa2,0xd7,0xd4,0xce,0xd2,0xb5,0xc4,0}, // Signatures - Sent from my iP...
118 {0xd7,0xaa,0xb7,0xa2,0xb5,0xc4,0xd3,0xca,0xbc,0xfe,0}, // forward
119 {0}
120 };
121 #endif
122
/*
 * Binary search over a table of 16-bit values.
 *
 * array  - table to search; the comparison logic requires ascending order.
 *          It is not dereferenced when len <= 0.
 * len    - number of entries in array.
 * value  - value to look for.
 *
 * Returns the index of an entry equal to value, or -1 if there is none.
 */
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t start = 0, end = len - 1;

    while (start <= end) {
        // start + (end - start)/2 cannot overflow int32_t, unlike (start + end)/2.
        int32_t mid = start + (end - start) / 2;

        if (array[mid] == value) {
            return mid;
        }

        if (array[mid] < value) {
            start = mid + 1;
        } else {
            end = mid - 1;
        }
    }

    return -1;
}
144
145 #if U_PLATFORM_IS_DARWIN_BASED
// Checks whether the NUL-terminated byte string testPrefix matches the bytes
// starting at base, never reading at or beyond baseLimit.
// Returns the length of testPrefix when every one of its bytes matched, else 0
// (an empty testPrefix therefore also yields 0).
static int32_t isPrefix(const uint8_t *testPrefix, const uint8_t *base, const uint8_t *baseLimit) {
    const uint8_t *p = testPrefix;
    const uint8_t *b = base;

    for (; *p != 0; ++p, ++b) {
        // Input exhausted or bytes differ before the key string ended: no match.
        if (b >= baseLimit || *p != *b) {
            return 0;
        }
    }
    return (int32_t)(p - testPrefix);
}
155 #endif
156
157 IteratedChar::IteratedChar() :
158 charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
159 {
160 // nothing else to do.
161 }
162
163 /*void IteratedChar::reset()
164 {
165 charValue = 0;
166 index = -1;
167 nextIndex = 0;
168 error = FALSE;
169 done = FALSE;
170 }*/
171
172 int32_t IteratedChar::nextByte(InputText *det)
173 {
174 if (nextIndex >= det->fRawLength) {
175 done = TRUE;
176
177 return -1;
178 }
179
180 return det->fRawInput[nextIndex++];
181 }
182
183 CharsetRecog_mbcs::~CharsetRecog_mbcs()
184 {
185 // nothing to do.
186 }
187
188 #if U_PLATFORM_IS_DARWIN_BASED
189 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen, const uint8_t (*keyStrings)[MAX_KEY_STRING_WITH_NULL] ) {
190 #else
191 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) {
192 #endif
193 int32_t singleByteCharCount = 0;
194 int32_t doubleByteCharCount = 0;
195 int32_t commonCharCount = 0;
196 int32_t badCharCount = 0;
197 int32_t totalCharCount = 0;
198 int32_t confidence = 0;
199 #if U_PLATFORM_IS_DARWIN_BASED
200 int32_t confidenceFromKeys = 0;
201 #endif
202 IteratedChar iter;
203
204 while (nextChar(&iter, det)) {
205 totalCharCount++;
206
207 if (iter.error) {
208 badCharCount++;
209 } else {
210 if (iter.charValue <= 0xFF) {
211 singleByteCharCount++;
212 } else {
213 doubleByteCharCount++;
214
215 if (commonChars != 0) {
216 if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){
217 commonCharCount += 1;
218 }
219 }
220 #if U_PLATFORM_IS_DARWIN_BASED
221 if (doubleByteCharCount <= 20) {
222 int32_t keyIndex;
223 for ( keyIndex = 0; keyStrings[keyIndex][0] != 0; keyIndex++ ) {
224 int32_t prefixLen = isPrefix(keyStrings[keyIndex], &det->fRawInput[iter.index], &det->fRawInput[det->fRawLength]);
225 confidenceFromKeys += prefixLen*5;
226 }
227 }
228 #endif
229 }
230 }
231
232
233 if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
234 // Bail out early if the byte data is not matching the encoding scheme.
235 // break detectBlock;
236 return confidence;
237 }
238 }
239
240 if (doubleByteCharCount <= 10 && badCharCount == 0) {
241 // Not many multi-byte chars.
242 if (doubleByteCharCount == 0 && totalCharCount < 10) {
243 // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
244 // We don't have enough data to have any confidence.
245 // Statistical analysis of single byte non-ASCII charcters would probably help here.
246 confidence = 0;
247 }
248 else {
249 // ASCII or ISO file? It's probably not our encoding,
250 // but is not incompatible with our encoding, so don't give it a zero.
251 #if U_PLATFORM_IS_DARWIN_BASED
252 if (confidenceFromKeys > 90) {
253 confidenceFromKeys = 90;
254 } else if (confidenceFromKeys > 0 && confidenceFromKeys < 70) {
255 confidenceFromKeys += 20;
256 }
257 confidence = 10 + confidenceFromKeys;
258 #else
259 confidence = 10;
260 #endif
261 }
262
263 return confidence;
264 }
265
266 //
267 // No match if there are too many characters that don't fit the encoding scheme.
268 // (should we have zero tolerance for these?)
269 //
270 if (doubleByteCharCount < 20*badCharCount) {
271 confidence = 0;
272
273 return confidence;
274 }
275
276 if (commonChars == 0) {
277 // We have no statistics on frequently occuring characters.
278 // Assess confidence purely on having a reasonable number of
279 // multi-byte characters (the more the better)
280 confidence = 30 + doubleByteCharCount - 20*badCharCount;
281 #if U_PLATFORM_IS_DARWIN_BASED
282 confidence += confidenceFromKeys;
283 #endif
284
285 if (confidence > 100) {
286 confidence = 100;
287 }
288 } else {
289 //
290 // Frequency of occurence statistics exist.
291 //
292
293 double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/
294 double scaleFactor = 90.0 / maxVal;
295 confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);
296 #if U_PLATFORM_IS_DARWIN_BASED
297 confidence += confidenceFromKeys;
298 #endif
299
300 confidence = min(confidence, 100);
301 }
302
303 if (confidence < 0) {
304 confidence = 0;
305 }
306
307 return confidence;
308 }
309
310 CharsetRecog_sjis::~CharsetRecog_sjis()
311 {
312 // nothing to do
313 }
314
315 UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) {
316 it->index = it->nextIndex;
317 it->error = FALSE;
318
319 int32_t firstByte = it->charValue = it->nextByte(det);
320
321 if (firstByte < 0) {
322 return FALSE;
323 }
324
325 if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
326 return TRUE;
327 }
328
329 int32_t secondByte = it->nextByte(det);
330 if (secondByte >= 0) {
331 it->charValue = (firstByte << 8) | secondByte;
332 }
333 // else we'll handle the error later.
334
335 if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
336 // Illegal second byte value.
337 it->error = TRUE;
338 }
339
340 return TRUE;
341 }
342
343 int32_t CharsetRecog_sjis::match(InputText* det)
344 {
345 #if U_PLATFORM_IS_DARWIN_BASED
346 return match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis), keyStrings_sjis);
347 #else
348 return match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis));
349 #endif
350 }
351
352 const char *CharsetRecog_sjis::getName() const
353 {
354 return "Shift_JIS";
355 }
356
357 const char *CharsetRecog_sjis::getLanguage() const
358 {
359 return "ja";
360 }
361
362 CharsetRecog_euc::~CharsetRecog_euc()
363 {
364 // nothing to do
365 }
366
367 UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) {
368 int32_t firstByte = 0;
369 int32_t secondByte = 0;
370 int32_t thirdByte = 0;
371
372 it->index = it->nextIndex;
373 it->error = FALSE;
374 firstByte = it->charValue = it->nextByte(det);
375
376 if (firstByte < 0) {
377 // Ran off the end of the input data
378 return FALSE;
379 }
380
381 if (firstByte <= 0x8D) {
382 // single byte char
383 return TRUE;
384 }
385
386 secondByte = it->nextByte(det);
387 if (secondByte >= 0) {
388 it->charValue = (it->charValue << 8) | secondByte;
389 }
390 // else we'll handle the error later.
391
392 if (firstByte >= 0xA1 && firstByte <= 0xFE) {
393 // Two byte Char
394 if (secondByte < 0xA1) {
395 it->error = TRUE;
396 }
397
398 return TRUE;
399 }
400
401 if (firstByte == 0x8E) {
402 // Code Set 2.
403 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
404 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
405 // We don't know which we've got.
406 // Treat it like EUC-JP. If the data really was EUC-TW, the following two
407 // bytes will look like a well formed 2 byte char.
408 if (secondByte < 0xA1) {
409 it->error = TRUE;
410 }
411
412 return TRUE;
413 }
414
415 if (firstByte == 0x8F) {
416 // Code set 3.
417 // Three byte total char size, two bytes of actual char value.
418 thirdByte = it->nextByte(det);
419 it->charValue = (it->charValue << 8) | thirdByte;
420
421 if (thirdByte < 0xa1) {
422 // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
423 it->error = TRUE;
424 }
425 }
426
427 return TRUE;
428
429 }
430
431 CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
432 {
433 // nothing to do
434 }
435
436 const char *CharsetRecog_euc_jp::getName() const
437 {
438 return "EUC-JP";
439 }
440
441 const char *CharsetRecog_euc_jp::getLanguage() const
442 {
443 return "ja";
444 }
445
446 int32_t CharsetRecog_euc_jp::match(InputText *det)
447 {
448 #if U_PLATFORM_IS_DARWIN_BASED
449 return match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp), keyStrings_euc_jp);
450 #else
451 return match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp));
452 #endif
453 }
454
455 CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
456 {
457 // nothing to do
458 }
459
460 const char *CharsetRecog_euc_kr::getName() const
461 {
462 return "EUC-KR";
463 }
464
465 const char *CharsetRecog_euc_kr::getLanguage() const
466 {
467 return "ko";
468 }
469
470 int32_t CharsetRecog_euc_kr::match(InputText *det)
471 {
472 #if U_PLATFORM_IS_DARWIN_BASED
473 return match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr), keyStrings_euc_kr);
474 #else
475 return match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr));
476 #endif
477 }
478
479 CharsetRecog_big5::~CharsetRecog_big5()
480 {
481 // nothing to do
482 }
483
484 UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det)
485 {
486 int32_t firstByte;
487
488 it->index = it->nextIndex;
489 it->error = FALSE;
490 firstByte = it->charValue = it->nextByte(det);
491
492 if (firstByte < 0) {
493 return FALSE;
494 }
495
496 if (firstByte <= 0x7F || firstByte == 0xFF) {
497 // single byte character.
498 return TRUE;
499 }
500
501 int32_t secondByte = it->nextByte(det);
502 if (secondByte >= 0) {
503 it->charValue = (it->charValue << 8) | secondByte;
504 }
505 // else we'll handle the error later.
506
507 if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
508 it->error = TRUE;
509 }
510
511 return TRUE;
512 }
513
514 const char *CharsetRecog_big5::getName() const
515 {
516 return "Big5";
517 }
518
519 const char *CharsetRecog_big5::getLanguage() const
520 {
521 return "zh";
522 }
523
524 int32_t CharsetRecog_big5::match(InputText *det)
525 {
526 #if U_PLATFORM_IS_DARWIN_BASED
527 return match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5), keyStrings_big5);
528 #else
529 return match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5));
530 #endif
531 }
532
533 CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
534 {
535 // nothing to do
536 }
537
538 UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) {
539 int32_t firstByte = 0;
540 int32_t secondByte = 0;
541 int32_t thirdByte = 0;
542 int32_t fourthByte = 0;
543
544 it->index = it->nextIndex;
545 it->error = FALSE;
546 firstByte = it->charValue = it->nextByte(det);
547
548 if (firstByte < 0) {
549 // Ran off the end of the input data
550 return FALSE;
551 }
552
553 if (firstByte <= 0x80) {
554 // single byte char
555 return TRUE;
556 }
557
558 secondByte = it->nextByte(det);
559 if (secondByte >= 0) {
560 it->charValue = (it->charValue << 8) | secondByte;
561 }
562 // else we'll handle the error later.
563
564 if (firstByte >= 0x81 && firstByte <= 0xFE) {
565 // Two byte Char
566 if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
567 return TRUE;
568 }
569
570 // Four byte char
571 if (secondByte >= 0x30 && secondByte <= 0x39) {
572 thirdByte = it->nextByte(det);
573
574 if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
575 fourthByte = it->nextByte(det);
576
577 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
578 it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
579
580 return TRUE;
581 }
582 }
583 }
584
585 // Something wasn't valid, or we ran out of data (-1).
586 it->error = TRUE;
587 }
588
589 return TRUE;
590 }
591
592 const char *CharsetRecog_gb_18030::getName() const
593 {
594 return "GB18030";
595 }
596
597 const char *CharsetRecog_gb_18030::getLanguage() const
598 {
599 return "zh";
600 }
601
602 int32_t CharsetRecog_gb_18030::match(InputText *det)
603 {
604 #if U_PLATFORM_IS_DARWIN_BASED
605 return match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030), keyStrings_gb_18030);
606 #else
607 return match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030));
608 #endif
609 }
610
611 U_NAMESPACE_END
612 #endif