]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/csrmbcs.cpp
ICU-57149.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / csrmbcs.cpp
1 /*
2 **********************************************************************
3 * Copyright (C) 2005-2016, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_CONVERSION
11
12 #include "cmemory.h"
13 #include "csmatch.h"
14 #include "csrmbcs.h"
15
16 #include <math.h>
17
18 U_NAMESPACE_BEGIN
19
20 #define min(x,y) (((x)<(y))?(x):(y))
21
// Two-byte Shift_JIS character values, each encoded as (lead byte << 8) | trail byte,
// that count as "common" characters when scoring input in match_mbcs().
// The entries must stay in ascending order: match_mbcs() looks values up
// with binarySearch().
static const uint16_t commonChars_sjis [] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
32
// Two-byte EUC-JP character values, each encoded as (lead byte << 8) | trail byte,
// that count as "common" characters when scoring input in match_mbcs().
// The entries must stay in ascending order: match_mbcs() looks values up
// with binarySearch().
static const uint16_t commonChars_euc_jp[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
47
// Two-byte EUC-KR character values, each encoded as (lead byte << 8) | trail byte,
// that count as "common" characters when scoring input in match_mbcs().
// The entries must stay in ascending order: match_mbcs() looks values up
// with binarySearch().
static const uint16_t commonChars_euc_kr[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
62
// Two-byte Big5 character values, each encoded as (lead byte << 8) | trail byte,
// that count as "common" characters when scoring input in match_mbcs().
// The entries must stay in ascending order: match_mbcs() looks values up
// with binarySearch().
static const uint16_t commonChars_big5[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
77
// Two-byte GB18030 character values, each encoded as (lead byte << 8) | trail byte,
// that count as "common" characters when scoring input in match_mbcs().
// The entries must stay in ascending order: match_mbcs() looks values up
// with binarySearch().
static const uint16_t commonChars_gb_18030[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
92
93 #if U_PLATFORM_IS_DARWIN_BASED
// Key byte strings for Shift_JIS input. match_mbcs() tests each row as a prefix of
// the raw input at the start of the first 20 double-byte characters it sees and
// raises the confidence by 5 per byte of every row that matches.
// Each row is NUL-terminated; the final all-zero row marks the end of the list
// (match_mbcs() stops at the first row whose first byte is 0).
static const uint8_t keyStrings_sjis[][MAX_KEY_STRING_WITH_NULL] = {
    {0x82,0xa9,0x82,0xe7,0x91,0x97,0x90,0x4d,0}, // Signatures - Sent from my ...
    {0x93,0x5d,0x91,0x97,0x83,0x81,0x83,0x62,0x83,0x5a,0x81,0x5b,0x83,0x57,0}, // forward
    {0}
};
// Key byte strings for EUC-JP input. match_mbcs() tests each row as a prefix of
// the raw input at the start of the first 20 double-byte characters it sees and
// raises the confidence by 5 per byte of every row that matches.
// Each row is NUL-terminated; the final all-zero row marks the end of the list
// (match_mbcs() stops at the first row whose first byte is 0).
static const uint8_t keyStrings_euc_jp[][MAX_KEY_STRING_WITH_NULL] = {
    {0xa4,0xab,0xa4,0xe9,0xc1,0xf7,0xbf,0xae,0}, // Signatures - Sent from my ...
    {0xc5,0xbe,0xc1,0xf7,0xa5,0xe1,0xa5,0xc3,0xa5,0xbb,0xa1,0xbc,0xa5,0xb8,0}, // forward
    {0}
};
// Key byte strings for EUC-KR input. match_mbcs() tests each row as a prefix of
// the raw input at the start of the first 20 double-byte characters it sees and
// raises the confidence by 5 per byte of every row that matches.
// Each row is NUL-terminated; the final all-zero row marks the end of the list
// (match_mbcs() stops at the first row whose first byte is 0).
static const uint8_t keyStrings_euc_kr[][MAX_KEY_STRING_WITH_NULL] = {
    {0xb3,0xaa,0xc0,0xc7,0}, // Signatures - Sent from my ... #1
    {0xbf,0xa1,0xbc,0xad,0x20,0xba,0xb8,0xb3,0xbf,0}, // Signatures - Sent from my ... #2
    {0xc0,0xfc,0xb4,0xde,0xb5,0xc8,0x20,0xb8,0xde,0xbd,0xc3,0xc1,0xf6,0}, // forward
    {0}
};
// Key byte strings for Big5 input. match_mbcs() tests each row as a prefix of
// the raw input at the start of the first 20 double-byte characters it sees and
// raises the confidence by 5 per byte of every row that matches.
// Each row is NUL-terminated; the final all-zero row marks the end of the list
// (match_mbcs() stops at the first row whose first byte is 0).
static const uint8_t keyStrings_big5[][MAX_KEY_STRING_WITH_NULL] = {
    {0xb1,0x71,0xa7,0xda,0xaa,0xba,0}, // Signatures - Sent from my ... #1
    {0xb6,0xc7,0xb0,0x65,0}, // Signatures - Sent from my ... #2
    {0xb6,0x7d,0xa9,0x6c,0xc2,0xe0,0xb1,0x48,0xb6,0x6c,0xa5,0xf3,0}, // forward
    {0}
};
// Key byte strings for GB18030 input. match_mbcs() tests each row as a prefix of
// the raw input at the start of the first 20 double-byte characters it sees and
// raises the confidence by 5 per byte of every row that matches.
// Each row is NUL-terminated; the final all-zero row marks the end of the list
// (match_mbcs() stops at the first row whose first byte is 0).
static const uint8_t keyStrings_gb_18030[][MAX_KEY_STRING_WITH_NULL] = {
    {0xb7,0xa2,0xd7,0xd4,0xce,0xd2,0xb5,0xc4,0}, // Signatures - Sent from my iP...
    {0xd7,0xaa,0xb7,0xa2,0xb5,0xc4,0xd3,0xca,0xbc,0xfe,0}, // forward
    {0}
};
121 #endif
122
// Binary search for value in array[0..len-1].
//
// array  table of 16-bit values sorted in ascending order
// len    number of entries in array; zero or negative means "empty"
// value  the value to look for
//
// Returns the index of an entry equal to value, or -1 if there is none.
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t start = 0;
    int32_t end = len - 1;

    while (start <= end) {
        // start + (end - start) / 2 yields the same midpoint as (start + end) / 2
        // but cannot overflow int32_t when len is very large.
        int32_t mid = start + (end - start) / 2;

        if (array[mid] == value) {
            return mid;
        }

        if (array[mid] < value) {
            start = mid + 1;
        } else {
            end = mid - 1;
        }
    }

    return -1;
}
144
145 #if U_PLATFORM_IS_DARWIN_BASED
// Compare the NUL-terminated byte string testPrefix against the bytes starting at
// base, never reading at or beyond baseLimit. When every byte of testPrefix matches,
// return the length of testPrefix (excluding the NUL); otherwise return 0.
// An empty testPrefix therefore also yields 0.
static int32_t isPrefix(const uint8_t *testPrefix, const uint8_t *base, const uint8_t *baseLimit) {
    int32_t matched = 0;

    for (; testPrefix[matched] != 0; ++matched) {
        if (base + matched >= baseLimit || testPrefix[matched] != base[matched]) {
            // Ran out of input or hit a mismatch before the end of testPrefix.
            return 0;
        }
    }

    return matched;
}
155 #endif
156
157 IteratedChar::IteratedChar() :
158 charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
159 {
160 // nothing else to do.
161 }
162
163 /*void IteratedChar::reset()
164 {
165 charValue = 0;
166 index = -1;
167 nextIndex = 0;
168 error = FALSE;
169 done = FALSE;
170 }*/
171
172 int32_t IteratedChar::nextByte(InputText *det)
173 {
174 if (nextIndex >= det->fRawLength) {
175 done = TRUE;
176
177 return -1;
178 }
179
180 return det->fRawInput[nextIndex++];
181 }
182
183 CharsetRecog_mbcs::~CharsetRecog_mbcs()
184 {
185 // nothing to do.
186 }
187
// Shared scoring routine for the multi-byte recognizers.
//
// Walks the input with the subclass's nextChar(), counting characters that are
// malformed, characters whose value is above 0xFF (multi-byte), and multi-byte
// characters found in commonChars, then turns those counts into a confidence.
//
// det             input being examined
// commonChars     ascending table of frequent multi-byte character values, or 0
//                 when no frequency data is available
// commonCharsLen  number of entries in commonChars
// keyStrings      (Darwin-based builds only) list of NUL-terminated byte strings,
//                 ended by a row whose first byte is 0; matches found at the first
//                 20 multi-byte characters add to the confidence
//
// Returns a confidence in the range 0..100.
#if U_PLATFORM_IS_DARWIN_BASED
int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen, const uint8_t (*keyStrings)[MAX_KEY_STRING_WITH_NULL] ) const {
#else
int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
#endif
    int32_t singleByteCharCount = 0;   // tallied below but not used in the score
    int32_t doubleByteCharCount = 0;
    int32_t commonCharCount = 0;
    int32_t badCharCount = 0;
    int32_t totalCharCount = 0;
    int32_t confidence = 0;
#if U_PLATFORM_IS_DARWIN_BASED
    int32_t confidenceFromKeys = 0;
#endif
    IteratedChar iter;

    while (nextChar(&iter, det)) {
        totalCharCount++;

        if (iter.error) {
            badCharCount++;
        } else {
            if (iter.charValue <= 0xFF) {
                singleByteCharCount++;
            } else {
                doubleByteCharCount++;

                if (commonChars != 0) {
                    if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){
                        commonCharCount += 1;
                    }
                }
#if U_PLATFORM_IS_DARWIN_BASED
                // Only the first 20 multi-byte characters are checked against the key strings.
                if (doubleByteCharCount <= 20) {
                    int32_t keyIndex;
                    for ( keyIndex = 0; keyStrings[keyIndex][0] != 0; keyIndex++ ) {
                        // isPrefix() yields the key's length on a full match at this character, else 0.
                        int32_t prefixLen = isPrefix(keyStrings[keyIndex], &det->fRawInput[iter.index], &det->fRawInput[det->fRawLength]);
                        confidenceFromKeys += prefixLen*5;
                    }
                }
#endif
            }
        }


        if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
            // Bail out early if the byte data is not matching the encoding scheme.
            // break detectBlock;
            // confidence is still 0 at this point.
            return confidence;
        }
    }

    if (doubleByteCharCount <= 10 && badCharCount == 0) {
        // Not many multi-byte chars.
        if (doubleByteCharCount == 0 && totalCharCount < 10) {
            // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
            // We don't have enough data to have any confidence.
            // Statistical analysis of single byte non-ASCII characters would probably help here.
            confidence = 0;
        }
        else {
            //   ASCII or ISO file?  It's probably not our encoding,
            //   but is not incompatible with our encoding, so don't give it a zero.
#if U_PLATFORM_IS_DARWIN_BASED
            // Cap the key-string bonus at 90 so the total stays within 100;
            // a small non-zero bonus (below 70) is boosted by 20.
            if (confidenceFromKeys > 90) {
                confidenceFromKeys = 90;
            } else if (confidenceFromKeys > 0 && confidenceFromKeys < 70) {
                confidenceFromKeys += 20;
            }
            confidence = 10 + confidenceFromKeys;
#else
            confidence = 10;
#endif
        }

        return confidence;
    }

    //
    //  No match if there are too many characters that don't fit the encoding scheme.
    //    (should we have zero tolerance for these?)
    //
    if (doubleByteCharCount < 20*badCharCount) {
        confidence = 0;

        return confidence;
    }

    if (commonChars == 0) {
        // We have no statistics on frequently occurring characters.
        //  Assess confidence purely on having a reasonable number of
        //  multi-byte characters (the more the better)
        confidence = 30 + doubleByteCharCount - 20*badCharCount;
#if U_PLATFORM_IS_DARWIN_BASED
        confidence += confidenceFromKeys;
#endif

        if (confidence > 100) {
            confidence = 100;
        }
    } else {
        //
        // Frequency of occurrence statistics exist.
        //

        // The earlier returns guarantee doubleByteCharCount >= 11 here,
        // so maxVal is positive and the division below is safe.
        double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/
        double scaleFactor = 90.0 / maxVal;
        confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);
#if U_PLATFORM_IS_DARWIN_BASED
        confidence += confidenceFromKeys;
#endif

        confidence = min(confidence, 100);
    }

    if (confidence < 0) {
        confidence = 0;
    }

    return confidence;
}
309
310 CharsetRecog_sjis::~CharsetRecog_sjis()
311 {
312 // nothing to do
313 }
314
315 UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
316 it->index = it->nextIndex;
317 it->error = FALSE;
318
319 int32_t firstByte = it->charValue = it->nextByte(det);
320
321 if (firstByte < 0) {
322 return FALSE;
323 }
324
325 if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
326 return TRUE;
327 }
328
329 int32_t secondByte = it->nextByte(det);
330 if (secondByte >= 0) {
331 it->charValue = (firstByte << 8) | secondByte;
332 }
333 // else we'll handle the error later.
334
335 if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
336 // Illegal second byte value.
337 it->error = TRUE;
338 }
339
340 return TRUE;
341 }
342
343 UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
344 #if U_PLATFORM_IS_DARWIN_BASED
345 int32_t confidence = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis), keyStrings_sjis);
346 #else
347 int32_t confidence = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis));
348 #endif
349 results->set(det, this, confidence);
350 return (confidence > 0);
351 }
352
353 const char *CharsetRecog_sjis::getName() const
354 {
355 return "Shift_JIS";
356 }
357
358 const char *CharsetRecog_sjis::getLanguage() const
359 {
360 return "ja";
361 }
362
363 CharsetRecog_euc::~CharsetRecog_euc()
364 {
365 // nothing to do
366 }
367
368 UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
369 int32_t firstByte = 0;
370 int32_t secondByte = 0;
371 int32_t thirdByte = 0;
372
373 it->index = it->nextIndex;
374 it->error = FALSE;
375 firstByte = it->charValue = it->nextByte(det);
376
377 if (firstByte < 0) {
378 // Ran off the end of the input data
379 return FALSE;
380 }
381
382 if (firstByte <= 0x8D) {
383 // single byte char
384 return TRUE;
385 }
386
387 secondByte = it->nextByte(det);
388 if (secondByte >= 0) {
389 it->charValue = (it->charValue << 8) | secondByte;
390 }
391 // else we'll handle the error later.
392
393 if (firstByte >= 0xA1 && firstByte <= 0xFE) {
394 // Two byte Char
395 if (secondByte < 0xA1) {
396 it->error = TRUE;
397 }
398
399 return TRUE;
400 }
401
402 if (firstByte == 0x8E) {
403 // Code Set 2.
404 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
405 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
406 // We don't know which we've got.
407 // Treat it like EUC-JP. If the data really was EUC-TW, the following two
408 // bytes will look like a well formed 2 byte char.
409 if (secondByte < 0xA1) {
410 it->error = TRUE;
411 }
412
413 return TRUE;
414 }
415
416 if (firstByte == 0x8F) {
417 // Code set 3.
418 // Three byte total char size, two bytes of actual char value.
419 thirdByte = it->nextByte(det);
420 it->charValue = (it->charValue << 8) | thirdByte;
421
422 if (thirdByte < 0xa1) {
423 // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
424 it->error = TRUE;
425 }
426 }
427
428 return TRUE;
429
430 }
431
432 CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
433 {
434 // nothing to do
435 }
436
437 const char *CharsetRecog_euc_jp::getName() const
438 {
439 return "EUC-JP";
440 }
441
442 const char *CharsetRecog_euc_jp::getLanguage() const
443 {
444 return "ja";
445 }
446
447 UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const
448 {
449 #if U_PLATFORM_IS_DARWIN_BASED
450 int32_t confidence = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp), keyStrings_euc_jp);
451 #else
452 int32_t confidence = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp));
453 #endif
454 results->set(det, this, confidence);
455 return (confidence > 0);
456 }
457
458 CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
459 {
460 // nothing to do
461 }
462
463 const char *CharsetRecog_euc_kr::getName() const
464 {
465 return "EUC-KR";
466 }
467
468 const char *CharsetRecog_euc_kr::getLanguage() const
469 {
470 return "ko";
471 }
472
473 UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const
474 {
475 #if U_PLATFORM_IS_DARWIN_BASED
476 int32_t confidence = match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr), keyStrings_euc_kr);
477 #else
478 int32_t confidence = match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr));
479 #endif
480 results->set(det, this, confidence);
481 return (confidence > 0);
482 }
483
484 CharsetRecog_big5::~CharsetRecog_big5()
485 {
486 // nothing to do
487 }
488
489 UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
490 {
491 int32_t firstByte;
492
493 it->index = it->nextIndex;
494 it->error = FALSE;
495 firstByte = it->charValue = it->nextByte(det);
496
497 if (firstByte < 0) {
498 return FALSE;
499 }
500
501 if (firstByte <= 0x7F || firstByte == 0xFF) {
502 // single byte character.
503 return TRUE;
504 }
505
506 int32_t secondByte = it->nextByte(det);
507 if (secondByte >= 0) {
508 it->charValue = (it->charValue << 8) | secondByte;
509 }
510 // else we'll handle the error later.
511
512 if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
513 it->error = TRUE;
514 }
515
516 return TRUE;
517 }
518
519 const char *CharsetRecog_big5::getName() const
520 {
521 return "Big5";
522 }
523
524 const char *CharsetRecog_big5::getLanguage() const
525 {
526 return "zh";
527 }
528
529 UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const
530 {
531 #if U_PLATFORM_IS_DARWIN_BASED
532 int32_t confidence = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5), keyStrings_big5);
533 #else
534 int32_t confidence = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5));
535 #endif
536 results->set(det, this, confidence);
537 return (confidence > 0);
538 }
539
540 CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
541 {
542 // nothing to do
543 }
544
545 UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
546 int32_t firstByte = 0;
547 int32_t secondByte = 0;
548 int32_t thirdByte = 0;
549 int32_t fourthByte = 0;
550
551 it->index = it->nextIndex;
552 it->error = FALSE;
553 firstByte = it->charValue = it->nextByte(det);
554
555 if (firstByte < 0) {
556 // Ran off the end of the input data
557 return FALSE;
558 }
559
560 if (firstByte <= 0x80) {
561 // single byte char
562 return TRUE;
563 }
564
565 secondByte = it->nextByte(det);
566 if (secondByte >= 0) {
567 it->charValue = (it->charValue << 8) | secondByte;
568 }
569 // else we'll handle the error later.
570
571 if (firstByte >= 0x81 && firstByte <= 0xFE) {
572 // Two byte Char
573 if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
574 return TRUE;
575 }
576
577 // Four byte char
578 if (secondByte >= 0x30 && secondByte <= 0x39) {
579 thirdByte = it->nextByte(det);
580
581 if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
582 fourthByte = it->nextByte(det);
583
584 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
585 it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
586
587 return TRUE;
588 }
589 }
590 }
591
592 // Something wasn't valid, or we ran out of data (-1).
593 it->error = TRUE;
594 }
595
596 return TRUE;
597 }
598
599 const char *CharsetRecog_gb_18030::getName() const
600 {
601 return "GB18030";
602 }
603
604 const char *CharsetRecog_gb_18030::getLanguage() const
605 {
606 return "zh";
607 }
608
609 UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const
610 {
611 #if U_PLATFORM_IS_DARWIN_BASED
612 int32_t confidence = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030), keyStrings_gb_18030);
613 #else
614 int32_t confidence = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030));
615 #endif
616 results->set(det, this, confidence);
617 return (confidence > 0);
618 }
619
620 U_NAMESPACE_END
621 #endif