]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/csrmbcs.cpp
ICU-531.48.tar.gz
[apple/icu.git] / icuSources / i18n / csrmbcs.cpp
1 /*
2 **********************************************************************
3 * Copyright (C) 2005-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_CONVERSION
11
12 #include "csmatch.h"
13 #include "csrmbcs.h"
14
15 #include <math.h>
16
17 U_NAMESPACE_BEGIN
18
19 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
20
21 #define min(x,y) (((x)<(y))?(x):(y))
22
// Frequently occurring Shift_JIS double-byte characters, kept in ascending
// order because the table is probed with binarySearch().
// TODO: This set of data comes from the character frequency-of-occurrence
//       analysis tool. The data needs to be moved into a resource and
//       loaded from there.
static const uint16_t commonChars_sjis [] = {
    0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
    0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
    0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
    0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
    0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
    0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa
};
33
// Frequently occurring EUC-JP double-byte characters, kept in ascending
// order because the table is probed with binarySearch().
// TODO: This set of data comes from the character frequency-of-occurrence
//       analysis tool. The data needs to be moved into a resource and
//       loaded from there.
static const uint16_t commonChars_euc_jp[] = {
    0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
    0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
    0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
    0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
    0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
    0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
    0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
    0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
    0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
    0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1
};
48
// Frequently occurring EUC-KR double-byte characters, kept in ascending
// order because the table is probed with binarySearch().
// TODO: This set of data comes from the character frequency-of-occurrence
//       analysis tool. The data needs to be moved into a resource and
//       loaded from there.
static const uint16_t commonChars_euc_kr[] = {
    0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
    0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
    0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
    0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
    0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
    0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
    0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
    0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
    0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
    0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad
};
63
// Frequently occurring Big5 double-byte characters, kept in ascending
// order because the table is probed with binarySearch().
// TODO: This set of data comes from the character frequency-of-occurrence
//       analysis tool. The data needs to be moved into a resource and
//       loaded from there.
static const uint16_t commonChars_big5[] = {
    0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
    0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
    0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
    0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
    0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
    0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
    0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
    0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
    0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
    0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f
};
78
// Frequently occurring GB18030 double-byte characters, kept in ascending
// order because the table is probed with binarySearch().
// TODO: This set of data comes from the character frequency-of-occurrence
//       analysis tool. The data needs to be moved into a resource and
//       loaded from there.
static const uint16_t commonChars_gb_18030[] = {
    0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
    0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
    0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
    0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
    0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
    0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
    0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
    0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
    0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
    0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0
};
93
94 #if U_PLATFORM_IS_DARWIN_BASED
95 static const uint8_t keyStrings_sjis[][MAX_KEY_STRING_WITH_NULL] = {
96 {0x82,0xa9,0x82,0xe7,0x91,0x97,0x90,0x4d,0}, // Signatures - Sent from my ...
97 {0x93,0x5d,0x91,0x97,0x83,0x81,0x83,0x62,0x83,0x5a,0x81,0x5b,0x83,0x57,0}, // forward
98 {0}
99 };
100 static const uint8_t keyStrings_euc_jp[][MAX_KEY_STRING_WITH_NULL] = {
101 {0xa4,0xab,0xa4,0xe9,0xc1,0xf7,0xbf,0xae,0}, // Signatures - Sent from my ...
102 {0xc5,0xbe,0xc1,0xf7,0xa5,0xe1,0xa5,0xc3,0xa5,0xbb,0xa1,0xbc,0xa5,0xb8,0}, // forward
103 {0}
104 };
105 static const uint8_t keyStrings_euc_kr[][MAX_KEY_STRING_WITH_NULL] = {
106 {0xb3,0xaa,0xc0,0xc7,0}, // Signatures - Sent from my ... #1
107 {0xbf,0xa1,0xbc,0xad,0x20,0xba,0xb8,0xb3,0xbf,0}, // Signatures - Sent from my ... #2
108 {0xc0,0xfc,0xb4,0xde,0xb5,0xc8,0x20,0xb8,0xde,0xbd,0xc3,0xc1,0xf6,0}, // forward
109 {0}
110 };
111 static const uint8_t keyStrings_big5[][MAX_KEY_STRING_WITH_NULL] = {
112 {0xb1,0x71,0xa7,0xda,0xaa,0xba,0}, // Signatures - Sent from my ... #1
113 {0xb6,0xc7,0xb0,0x65,0}, // Signatures - Sent from my ... #2
114 {0xb6,0x7d,0xa9,0x6c,0xc2,0xe0,0xb1,0x48,0xb6,0x6c,0xa5,0xf3,0}, // forward
115 {0}
116 };
117 static const uint8_t keyStrings_gb_18030[][MAX_KEY_STRING_WITH_NULL] = {
118 {0xb7,0xa2,0xd7,0xd4,0xce,0xd2,0xb5,0xc4,0}, // Signatures - Sent from my iP...
119 {0xd7,0xaa,0xb7,0xa2,0xb5,0xc4,0xd3,0xca,0xbc,0xfe,0}, // forward
120 {0}
121 };
122 #endif
123
// Look up `value` in the ascending-sorted table `array` of `len` entries.
// Returns the index of a matching entry, or -1 when the value is absent
// (including when len is 0).
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t lo = 0;
    int32_t hi = len - 1;

    // Narrow the closed interval [lo, hi] until it is empty or the probe hits.
    for (int32_t probe = (lo + hi) / 2; lo <= hi; probe = (lo + hi) / 2) {
        const uint16_t candidate = array[probe];

        if (candidate == value) {
            return probe;
        }

        if (candidate < value) {
            lo = probe + 1;
        } else {
            hi = probe - 1;
        }
    }

    return -1;
}
145
146 #if U_PLATFORM_IS_DARWIN_BASED
// Checks whether the NUL-terminated byte string testPrefix occurs at the start
// of the buffer [base, baseLimit). Returns the length of testPrefix when it
// does, otherwise 0 (an empty testPrefix therefore also yields 0).
static int32_t isPrefix(const uint8_t *testPrefix, const uint8_t *base, const uint8_t *baseLimit) {
    int32_t matched = 0;

    for (; testPrefix[matched] != 0; ++matched) {
        // Running out of input or hitting a differing byte means no match.
        if (base + matched >= baseLimit || testPrefix[matched] != base[matched]) {
            return 0;
        }
    }

    return matched;
}
156 #endif
157
158 IteratedChar::IteratedChar() :
159 charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
160 {
161 // nothing else to do.
162 }
163
164 /*void IteratedChar::reset()
165 {
166 charValue = 0;
167 index = -1;
168 nextIndex = 0;
169 error = FALSE;
170 done = FALSE;
171 }*/
172
173 int32_t IteratedChar::nextByte(InputText *det)
174 {
175 if (nextIndex >= det->fRawLength) {
176 done = TRUE;
177
178 return -1;
179 }
180
181 return det->fRawInput[nextIndex++];
182 }
183
184 CharsetRecog_mbcs::~CharsetRecog_mbcs()
185 {
186 // nothing to do.
187 }
188
189 #if U_PLATFORM_IS_DARWIN_BASED
190 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen, const uint8_t (*keyStrings)[MAX_KEY_STRING_WITH_NULL] ) const {
191 #else
192 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
193 #endif
194 int32_t singleByteCharCount = 0;
195 int32_t doubleByteCharCount = 0;
196 int32_t commonCharCount = 0;
197 int32_t badCharCount = 0;
198 int32_t totalCharCount = 0;
199 int32_t confidence = 0;
200 #if U_PLATFORM_IS_DARWIN_BASED
201 int32_t confidenceFromKeys = 0;
202 #endif
203 IteratedChar iter;
204
205 while (nextChar(&iter, det)) {
206 totalCharCount++;
207
208 if (iter.error) {
209 badCharCount++;
210 } else {
211 if (iter.charValue <= 0xFF) {
212 singleByteCharCount++;
213 } else {
214 doubleByteCharCount++;
215
216 if (commonChars != 0) {
217 if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){
218 commonCharCount += 1;
219 }
220 }
221 #if U_PLATFORM_IS_DARWIN_BASED
222 if (doubleByteCharCount <= 20) {
223 int32_t keyIndex;
224 for ( keyIndex = 0; keyStrings[keyIndex][0] != 0; keyIndex++ ) {
225 int32_t prefixLen = isPrefix(keyStrings[keyIndex], &det->fRawInput[iter.index], &det->fRawInput[det->fRawLength]);
226 confidenceFromKeys += prefixLen*5;
227 }
228 }
229 #endif
230 }
231 }
232
233
234 if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
235 // Bail out early if the byte data is not matching the encoding scheme.
236 // break detectBlock;
237 return confidence;
238 }
239 }
240
241 if (doubleByteCharCount <= 10 && badCharCount == 0) {
242 // Not many multi-byte chars.
243 if (doubleByteCharCount == 0 && totalCharCount < 10) {
244 // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
245 // We don't have enough data to have any confidence.
246 // Statistical analysis of single byte non-ASCII charcters would probably help here.
247 confidence = 0;
248 }
249 else {
250 // ASCII or ISO file? It's probably not our encoding,
251 // but is not incompatible with our encoding, so don't give it a zero.
252 #if U_PLATFORM_IS_DARWIN_BASED
253 if (confidenceFromKeys > 90) {
254 confidenceFromKeys = 90;
255 } else if (confidenceFromKeys > 0 && confidenceFromKeys < 70) {
256 confidenceFromKeys += 20;
257 }
258 confidence = 10 + confidenceFromKeys;
259 #else
260 confidence = 10;
261 #endif
262 }
263
264 return confidence;
265 }
266
267 //
268 // No match if there are too many characters that don't fit the encoding scheme.
269 // (should we have zero tolerance for these?)
270 //
271 if (doubleByteCharCount < 20*badCharCount) {
272 confidence = 0;
273
274 return confidence;
275 }
276
277 if (commonChars == 0) {
278 // We have no statistics on frequently occuring characters.
279 // Assess confidence purely on having a reasonable number of
280 // multi-byte characters (the more the better)
281 confidence = 30 + doubleByteCharCount - 20*badCharCount;
282 #if U_PLATFORM_IS_DARWIN_BASED
283 confidence += confidenceFromKeys;
284 #endif
285
286 if (confidence > 100) {
287 confidence = 100;
288 }
289 } else {
290 //
291 // Frequency of occurence statistics exist.
292 //
293
294 double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/
295 double scaleFactor = 90.0 / maxVal;
296 confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);
297 #if U_PLATFORM_IS_DARWIN_BASED
298 confidence += confidenceFromKeys;
299 #endif
300
301 confidence = min(confidence, 100);
302 }
303
304 if (confidence < 0) {
305 confidence = 0;
306 }
307
308 return confidence;
309 }
310
311 CharsetRecog_sjis::~CharsetRecog_sjis()
312 {
313 // nothing to do
314 }
315
316 UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
317 it->index = it->nextIndex;
318 it->error = FALSE;
319
320 int32_t firstByte = it->charValue = it->nextByte(det);
321
322 if (firstByte < 0) {
323 return FALSE;
324 }
325
326 if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
327 return TRUE;
328 }
329
330 int32_t secondByte = it->nextByte(det);
331 if (secondByte >= 0) {
332 it->charValue = (firstByte << 8) | secondByte;
333 }
334 // else we'll handle the error later.
335
336 if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
337 // Illegal second byte value.
338 it->error = TRUE;
339 }
340
341 return TRUE;
342 }
343
344 UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
345 #if U_PLATFORM_IS_DARWIN_BASED
346 int32_t confidence = match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis), keyStrings_sjis);
347 #else
348 int32_t confidence = match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis));
349 #endif
350 results->set(det, this, confidence);
351 return (confidence > 0);
352 }
353
354 const char *CharsetRecog_sjis::getName() const
355 {
356 return "Shift_JIS";
357 }
358
359 const char *CharsetRecog_sjis::getLanguage() const
360 {
361 return "ja";
362 }
363
364 CharsetRecog_euc::~CharsetRecog_euc()
365 {
366 // nothing to do
367 }
368
369 UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
370 int32_t firstByte = 0;
371 int32_t secondByte = 0;
372 int32_t thirdByte = 0;
373
374 it->index = it->nextIndex;
375 it->error = FALSE;
376 firstByte = it->charValue = it->nextByte(det);
377
378 if (firstByte < 0) {
379 // Ran off the end of the input data
380 return FALSE;
381 }
382
383 if (firstByte <= 0x8D) {
384 // single byte char
385 return TRUE;
386 }
387
388 secondByte = it->nextByte(det);
389 if (secondByte >= 0) {
390 it->charValue = (it->charValue << 8) | secondByte;
391 }
392 // else we'll handle the error later.
393
394 if (firstByte >= 0xA1 && firstByte <= 0xFE) {
395 // Two byte Char
396 if (secondByte < 0xA1) {
397 it->error = TRUE;
398 }
399
400 return TRUE;
401 }
402
403 if (firstByte == 0x8E) {
404 // Code Set 2.
405 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
406 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
407 // We don't know which we've got.
408 // Treat it like EUC-JP. If the data really was EUC-TW, the following two
409 // bytes will look like a well formed 2 byte char.
410 if (secondByte < 0xA1) {
411 it->error = TRUE;
412 }
413
414 return TRUE;
415 }
416
417 if (firstByte == 0x8F) {
418 // Code set 3.
419 // Three byte total char size, two bytes of actual char value.
420 thirdByte = it->nextByte(det);
421 it->charValue = (it->charValue << 8) | thirdByte;
422
423 if (thirdByte < 0xa1) {
424 // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
425 it->error = TRUE;
426 }
427 }
428
429 return TRUE;
430
431 }
432
433 CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
434 {
435 // nothing to do
436 }
437
438 const char *CharsetRecog_euc_jp::getName() const
439 {
440 return "EUC-JP";
441 }
442
443 const char *CharsetRecog_euc_jp::getLanguage() const
444 {
445 return "ja";
446 }
447
448 UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const
449 {
450 #if U_PLATFORM_IS_DARWIN_BASED
451 int32_t confidence = match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp), keyStrings_euc_jp);
452 #else
453 int32_t confidence = match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp));
454 #endif
455 results->set(det, this, confidence);
456 return (confidence > 0);
457 }
458
459 CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
460 {
461 // nothing to do
462 }
463
464 const char *CharsetRecog_euc_kr::getName() const
465 {
466 return "EUC-KR";
467 }
468
469 const char *CharsetRecog_euc_kr::getLanguage() const
470 {
471 return "ko";
472 }
473
474 UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const
475 {
476 #if U_PLATFORM_IS_DARWIN_BASED
477 int32_t confidence = match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr), keyStrings_euc_kr);
478 #else
479 int32_t confidence = match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr));
480 #endif
481 results->set(det, this, confidence);
482 return (confidence > 0);
483 }
484
485 CharsetRecog_big5::~CharsetRecog_big5()
486 {
487 // nothing to do
488 }
489
490 UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
491 {
492 int32_t firstByte;
493
494 it->index = it->nextIndex;
495 it->error = FALSE;
496 firstByte = it->charValue = it->nextByte(det);
497
498 if (firstByte < 0) {
499 return FALSE;
500 }
501
502 if (firstByte <= 0x7F || firstByte == 0xFF) {
503 // single byte character.
504 return TRUE;
505 }
506
507 int32_t secondByte = it->nextByte(det);
508 if (secondByte >= 0) {
509 it->charValue = (it->charValue << 8) | secondByte;
510 }
511 // else we'll handle the error later.
512
513 if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
514 it->error = TRUE;
515 }
516
517 return TRUE;
518 }
519
520 const char *CharsetRecog_big5::getName() const
521 {
522 return "Big5";
523 }
524
525 const char *CharsetRecog_big5::getLanguage() const
526 {
527 return "zh";
528 }
529
530 UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const
531 {
532 #if U_PLATFORM_IS_DARWIN_BASED
533 int32_t confidence = match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5), keyStrings_big5);
534 #else
535 int32_t confidence = match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5));
536 #endif
537 results->set(det, this, confidence);
538 return (confidence > 0);
539 }
540
541 CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
542 {
543 // nothing to do
544 }
545
546 UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
547 int32_t firstByte = 0;
548 int32_t secondByte = 0;
549 int32_t thirdByte = 0;
550 int32_t fourthByte = 0;
551
552 it->index = it->nextIndex;
553 it->error = FALSE;
554 firstByte = it->charValue = it->nextByte(det);
555
556 if (firstByte < 0) {
557 // Ran off the end of the input data
558 return FALSE;
559 }
560
561 if (firstByte <= 0x80) {
562 // single byte char
563 return TRUE;
564 }
565
566 secondByte = it->nextByte(det);
567 if (secondByte >= 0) {
568 it->charValue = (it->charValue << 8) | secondByte;
569 }
570 // else we'll handle the error later.
571
572 if (firstByte >= 0x81 && firstByte <= 0xFE) {
573 // Two byte Char
574 if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
575 return TRUE;
576 }
577
578 // Four byte char
579 if (secondByte >= 0x30 && secondByte <= 0x39) {
580 thirdByte = it->nextByte(det);
581
582 if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
583 fourthByte = it->nextByte(det);
584
585 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
586 it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
587
588 return TRUE;
589 }
590 }
591 }
592
593 // Something wasn't valid, or we ran out of data (-1).
594 it->error = TRUE;
595 }
596
597 return TRUE;
598 }
599
600 const char *CharsetRecog_gb_18030::getName() const
601 {
602 return "GB18030";
603 }
604
605 const char *CharsetRecog_gb_18030::getLanguage() const
606 {
607 return "zh";
608 }
609
610 UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const
611 {
612 #if U_PLATFORM_IS_DARWIN_BASED
613 int32_t confidence = match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030), keyStrings_gb_18030);
614 #else
615 int32_t confidence = match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030));
616 #endif
617 results->set(det, this, confidence);
618 return (confidence > 0);
619 }
620
621 U_NAMESPACE_END
622 #endif