]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/csrmbcs.cpp
ICU-66108.tar.gz
[apple/icu.git] / icuSources / i18n / csrmbcs.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 2005-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 */
9
10 #include "unicode/utypes.h"
11
12 #if !UCONFIG_NO_CONVERSION
13
14 #include "cmemory.h"
15 #include "csmatch.h"
16 #include "csrmbcs.h"
17
18 #include <math.h>
19
20 U_NAMESPACE_BEGIN
21
// File-local helper macro: yields the smaller of x and y.
// Note that the selected argument is evaluated twice, so arguments must be free of side effects.
#define min(x,y) (((x)<(y))?(x):(y))
23
// Frequently occurring Shift_JIS double-byte characters, stored as
// (lead byte << 8) | trail byte. The table is passed to match_mbcs(), which
// looks values up with binarySearch(), so it must stay sorted in ascending order.
static const uint16_t commonChars_sjis [] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
34
// Frequently occurring EUC-JP double-byte characters, stored as
// (lead byte << 8) | trail byte. The table is passed to match_mbcs(), which
// looks values up with binarySearch(), so it must stay sorted in ascending order.
static const uint16_t commonChars_euc_jp[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
49
// Frequently occurring EUC-KR double-byte characters, stored as
// (lead byte << 8) | trail byte. The table is passed to match_mbcs(), which
// looks values up with binarySearch(), so it must stay sorted in ascending order.
static const uint16_t commonChars_euc_kr[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
64
// Frequently occurring Big5 double-byte characters, stored as
// (lead byte << 8) | trail byte. The table is passed to match_mbcs(), which
// looks values up with binarySearch(), so it must stay sorted in ascending order.
static const uint16_t commonChars_big5[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
79
// Frequently occurring GB18030 double-byte characters, stored as
// (lead byte << 8) | trail byte. The table is passed to match_mbcs(), which
// looks values up with binarySearch(), so it must stay sorted in ascending order.
static const uint16_t commonChars_gb_18030[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
94
#if U_PLATFORM_IS_DARWIN_BASED
// Per-encoding key byte strings (Darwin builds only). match_mbcs() tests each
// row as a prefix of the raw input at the first few double-byte characters and
// raises its confidence when one matches.
// Each row is a zero-terminated byte string; each table ends with a row whose
// first byte is 0, which is the sentinel the scanning loop stops on.
static const uint8_t keyStrings_sjis[][MAX_KEY_STRING_WITH_NULL] = {
    {0x82,0xa9,0x82,0xe7,0x91,0x97,0x90,0x4d,0}, // Signatures - Sent from my ...
    {0x93,0x5d,0x91,0x97,0x83,0x81,0x83,0x62,0x83,0x5a,0x81,0x5b,0x83,0x57,0}, // forward
    {0}
};
static const uint8_t keyStrings_euc_jp[][MAX_KEY_STRING_WITH_NULL] = {
    {0xa4,0xab,0xa4,0xe9,0xc1,0xf7,0xbf,0xae,0}, // Signatures - Sent from my ...
    {0xc5,0xbe,0xc1,0xf7,0xa5,0xe1,0xa5,0xc3,0xa5,0xbb,0xa1,0xbc,0xa5,0xb8,0}, // forward
    {0}
};
static const uint8_t keyStrings_euc_kr[][MAX_KEY_STRING_WITH_NULL] = {
    {0xb3,0xaa,0xc0,0xc7,0}, // Signatures - Sent from my ... #1
    {0xbf,0xa1,0xbc,0xad,0x20,0xba,0xb8,0xb3,0xbf,0}, // Signatures - Sent from my ... #2
    {0xc0,0xfc,0xb4,0xde,0xb5,0xc8,0x20,0xb8,0xde,0xbd,0xc3,0xc1,0xf6,0}, // forward
    {0}
};
static const uint8_t keyStrings_big5[][MAX_KEY_STRING_WITH_NULL] = {
    {0xb1,0x71,0xa7,0xda,0xaa,0xba,0}, // Signatures - Sent from my ... #1
    {0xb6,0xc7,0xb0,0x65,0}, // Signatures - Sent from my ... #2
    {0xb6,0x7d,0xa9,0x6c,0xc2,0xe0,0xb1,0x48,0xb6,0x6c,0xa5,0xf3,0}, // forward
    {0}
};
static const uint8_t keyStrings_gb_18030[][MAX_KEY_STRING_WITH_NULL] = {
    {0xb7,0xa2,0xd7,0xd4,0xce,0xd2,0xb5,0xc4,0}, // Signatures - Sent from my iP...
    {0xd7,0xaa,0xb7,0xa2,0xb5,0xc4,0xd3,0xca,0xbc,0xfe,0}, // forward
    {0}
};
#endif
124
// Binary search over an ascending array of 16-bit values.
//
// array - sorted table to search
// len   - number of elements in array (may be 0)
// value - value to look for
//
// Returns the index of an element equal to value, or -1 if none is found.
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t lo = 0;
    int32_t hi = len - 1;

    while (lo <= hi) {
        const int32_t probe = (lo + hi) / 2;
        const uint16_t candidate = array[probe];

        if (candidate == value) {
            return probe;
        }

        // Discard the half that cannot contain the value.
        if (candidate < value) {
            lo = probe + 1;
        } else {
            hi = probe - 1;
        }
    }

    return -1;
}
146
147 #if U_PLATFORM_IS_DARWIN_BASED
// Tests whether the zero-terminated byte string testPrefix occurs at the start
// of the byte range [base, baseLimit).
//
// Returns the length of testPrefix (excluding its terminator) when every byte
// of it matches, and 0 otherwise -- including when the range ends before the
// whole prefix has been compared, or when testPrefix is empty.
static int32_t isPrefix(const uint8_t *testPrefix, const uint8_t *base, const uint8_t *baseLimit) {
    int32_t matched = 0;

    for (; testPrefix[matched] != 0; ++matched) {
        // Fail on running out of input or on the first differing byte.
        if (base + matched >= baseLimit || testPrefix[matched] != base[matched]) {
            return 0;
        }
    }

    return matched;
}
157 #endif
158
159 IteratedChar::IteratedChar() :
160 charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
161 {
162 // nothing else to do.
163 }
164
165 /*void IteratedChar::reset()
166 {
167 charValue = 0;
168 index = -1;
169 nextIndex = 0;
170 error = FALSE;
171 done = FALSE;
172 }*/
173
174 int32_t IteratedChar::nextByte(InputText *det)
175 {
176 if (nextIndex >= det->fRawLength) {
177 done = TRUE;
178
179 return -1;
180 }
181
182 return det->fRawInput[nextIndex++];
183 }
184
185 CharsetRecog_mbcs::~CharsetRecog_mbcs()
186 {
187 // nothing to do.
188 }
189
190 #if U_PLATFORM_IS_DARWIN_BASED
191 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen, const uint8_t (*keyStrings)[MAX_KEY_STRING_WITH_NULL] ) const {
192 #else
193 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
194 #endif
195 int32_t singleByteCharCount = 0;
196 int32_t doubleByteCharCount = 0;
197 int32_t commonCharCount = 0;
198 int32_t badCharCount = 0;
199 int32_t totalCharCount = 0;
200 int32_t confidence = 0;
201 #if U_PLATFORM_IS_DARWIN_BASED
202 int32_t confidenceFromKeys = 0;
203 #endif
204 IteratedChar iter;
205
206 while (nextChar(&iter, det)) {
207 totalCharCount++;
208
209 if (iter.error) {
210 badCharCount++;
211 } else {
212 if (iter.charValue <= 0xFF) {
213 singleByteCharCount++;
214 } else {
215 doubleByteCharCount++;
216
217 if (commonChars != 0) {
218 if (binarySearch(commonChars, commonCharsLen, static_cast<uint16_t>(iter.charValue)) >= 0){
219 commonCharCount += 1;
220 }
221 }
222 #if U_PLATFORM_IS_DARWIN_BASED
223 if (doubleByteCharCount <= 20) {
224 int32_t keyIndex;
225 for ( keyIndex = 0; keyStrings[keyIndex][0] != 0; keyIndex++ ) {
226 int32_t prefixLen = isPrefix(keyStrings[keyIndex], &det->fRawInput[iter.index], &det->fRawInput[det->fRawLength]);
227 confidenceFromKeys += prefixLen*5;
228 }
229 }
230 #endif
231 }
232 }
233
234
235 if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
236 // Bail out early if the byte data is not matching the encoding scheme.
237 // break detectBlock;
238 return confidence;
239 }
240 }
241
242 if (doubleByteCharCount <= 10 && badCharCount == 0) {
243 // Not many multi-byte chars.
244 if (doubleByteCharCount == 0 && totalCharCount < 10) {
245 // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
246 // We don't have enough data to have any confidence.
247 // Statistical analysis of single byte non-ASCII charcters would probably help here.
248 confidence = 0;
249 }
250 else {
251 // ASCII or ISO file? It's probably not our encoding,
252 // but is not incompatible with our encoding, so don't give it a zero.
253 #if U_PLATFORM_IS_DARWIN_BASED
254 if (confidenceFromKeys > 90) {
255 confidenceFromKeys = 90;
256 } else if (confidenceFromKeys > 0 && confidenceFromKeys < 70) {
257 confidenceFromKeys += 20;
258 }
259 confidence = 10 + confidenceFromKeys;
260 #else
261 confidence = 10;
262 #endif
263 }
264
265 return confidence;
266 }
267
268 //
269 // No match if there are too many characters that don't fit the encoding scheme.
270 // (should we have zero tolerance for these?)
271 //
272 if (doubleByteCharCount < 20*badCharCount) {
273 confidence = 0;
274
275 return confidence;
276 }
277
278 if (commonChars == 0) {
279 // We have no statistics on frequently occuring characters.
280 // Assess confidence purely on having a reasonable number of
281 // multi-byte characters (the more the better)
282 confidence = 30 + doubleByteCharCount - 20*badCharCount;
283 #if U_PLATFORM_IS_DARWIN_BASED
284 confidence += confidenceFromKeys;
285 #endif
286
287 if (confidence > 100) {
288 confidence = 100;
289 }
290 } else {
291 //
292 // Frequency of occurence statistics exist.
293 //
294
295 double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/
296 double scaleFactor = 90.0 / maxVal;
297 confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);
298 #if U_PLATFORM_IS_DARWIN_BASED
299 confidence += confidenceFromKeys;
300 #endif
301
302 confidence = min(confidence, 100);
303 }
304
305 if (confidence < 0) {
306 confidence = 0;
307 }
308
309 return confidence;
310 }
311
312 CharsetRecog_sjis::~CharsetRecog_sjis()
313 {
314 // nothing to do
315 }
316
317 UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
318 it->index = it->nextIndex;
319 it->error = FALSE;
320
321 int32_t firstByte = it->charValue = it->nextByte(det);
322
323 if (firstByte < 0) {
324 return FALSE;
325 }
326
327 if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
328 return TRUE;
329 }
330
331 int32_t secondByte = it->nextByte(det);
332 if (secondByte >= 0) {
333 it->charValue = (firstByte << 8) | secondByte;
334 }
335 // else we'll handle the error later.
336
337 if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
338 // Illegal second byte value.
339 it->error = TRUE;
340 }
341
342 return TRUE;
343 }
344
345 UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
346 #if U_PLATFORM_IS_DARWIN_BASED
347 int32_t confidence = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis), keyStrings_sjis);
348 #else
349 int32_t confidence = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis));
350 #endif
351 results->set(det, this, confidence);
352 return (confidence > 0);
353 }
354
355 const char *CharsetRecog_sjis::getName() const
356 {
357 return "Shift_JIS";
358 }
359
360 const char *CharsetRecog_sjis::getLanguage() const
361 {
362 return "ja";
363 }
364
365 CharsetRecog_euc::~CharsetRecog_euc()
366 {
367 // nothing to do
368 }
369
370 UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
371 int32_t firstByte = 0;
372 int32_t secondByte = 0;
373 int32_t thirdByte = 0;
374
375 it->index = it->nextIndex;
376 it->error = FALSE;
377 firstByte = it->charValue = it->nextByte(det);
378
379 if (firstByte < 0) {
380 // Ran off the end of the input data
381 return FALSE;
382 }
383
384 if (firstByte <= 0x8D) {
385 // single byte char
386 return TRUE;
387 }
388
389 secondByte = it->nextByte(det);
390 if (secondByte >= 0) {
391 it->charValue = (it->charValue << 8) | secondByte;
392 }
393 // else we'll handle the error later.
394
395 if (firstByte >= 0xA1 && firstByte <= 0xFE) {
396 // Two byte Char
397 if (secondByte < 0xA1) {
398 it->error = TRUE;
399 }
400
401 return TRUE;
402 }
403
404 if (firstByte == 0x8E) {
405 // Code Set 2.
406 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
407 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
408 // We don't know which we've got.
409 // Treat it like EUC-JP. If the data really was EUC-TW, the following two
410 // bytes will look like a well formed 2 byte char.
411 if (secondByte < 0xA1) {
412 it->error = TRUE;
413 }
414
415 return TRUE;
416 }
417
418 if (firstByte == 0x8F) {
419 // Code set 3.
420 // Three byte total char size, two bytes of actual char value.
421 thirdByte = it->nextByte(det);
422 it->charValue = (it->charValue << 8) | thirdByte;
423
424 if (thirdByte < 0xa1) {
425 // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
426 it->error = TRUE;
427 }
428 }
429
430 return TRUE;
431
432 }
433
434 CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
435 {
436 // nothing to do
437 }
438
439 const char *CharsetRecog_euc_jp::getName() const
440 {
441 return "EUC-JP";
442 }
443
444 const char *CharsetRecog_euc_jp::getLanguage() const
445 {
446 return "ja";
447 }
448
449 UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const
450 {
451 #if U_PLATFORM_IS_DARWIN_BASED
452 int32_t confidence = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp), keyStrings_euc_jp);
453 #else
454 int32_t confidence = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp));
455 #endif
456 results->set(det, this, confidence);
457 return (confidence > 0);
458 }
459
460 CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
461 {
462 // nothing to do
463 }
464
465 const char *CharsetRecog_euc_kr::getName() const
466 {
467 return "EUC-KR";
468 }
469
470 const char *CharsetRecog_euc_kr::getLanguage() const
471 {
472 return "ko";
473 }
474
475 UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const
476 {
477 #if U_PLATFORM_IS_DARWIN_BASED
478 int32_t confidence = match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr), keyStrings_euc_kr);
479 #else
480 int32_t confidence = match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr));
481 #endif
482 results->set(det, this, confidence);
483 return (confidence > 0);
484 }
485
486 CharsetRecog_big5::~CharsetRecog_big5()
487 {
488 // nothing to do
489 }
490
491 UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
492 {
493 int32_t firstByte;
494
495 it->index = it->nextIndex;
496 it->error = FALSE;
497 firstByte = it->charValue = it->nextByte(det);
498
499 if (firstByte < 0) {
500 return FALSE;
501 }
502
503 if (firstByte <= 0x7F || firstByte == 0xFF) {
504 // single byte character.
505 return TRUE;
506 }
507
508 int32_t secondByte = it->nextByte(det);
509 if (secondByte >= 0) {
510 it->charValue = (it->charValue << 8) | secondByte;
511 }
512 // else we'll handle the error later.
513
514 if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
515 it->error = TRUE;
516 }
517
518 return TRUE;
519 }
520
521 const char *CharsetRecog_big5::getName() const
522 {
523 return "Big5";
524 }
525
526 const char *CharsetRecog_big5::getLanguage() const
527 {
528 return "zh";
529 }
530
531 UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const
532 {
533 #if U_PLATFORM_IS_DARWIN_BASED
534 int32_t confidence = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5), keyStrings_big5);
535 #else
536 int32_t confidence = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5));
537 #endif
538 results->set(det, this, confidence);
539 return (confidence > 0);
540 }
541
542 CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
543 {
544 // nothing to do
545 }
546
547 UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
548 int32_t firstByte = 0;
549 int32_t secondByte = 0;
550 int32_t thirdByte = 0;
551 int32_t fourthByte = 0;
552
553 it->index = it->nextIndex;
554 it->error = FALSE;
555 firstByte = it->charValue = it->nextByte(det);
556
557 if (firstByte < 0) {
558 // Ran off the end of the input data
559 return FALSE;
560 }
561
562 if (firstByte <= 0x80) {
563 // single byte char
564 return TRUE;
565 }
566
567 secondByte = it->nextByte(det);
568 if (secondByte >= 0) {
569 it->charValue = (it->charValue << 8) | secondByte;
570 }
571 // else we'll handle the error later.
572
573 if (firstByte >= 0x81 && firstByte <= 0xFE) {
574 // Two byte Char
575 if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
576 return TRUE;
577 }
578
579 // Four byte char
580 if (secondByte >= 0x30 && secondByte <= 0x39) {
581 thirdByte = it->nextByte(det);
582
583 if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
584 fourthByte = it->nextByte(det);
585
586 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
587 it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
588
589 return TRUE;
590 }
591 }
592 }
593
594 // Something wasn't valid, or we ran out of data (-1).
595 it->error = TRUE;
596 }
597
598 return TRUE;
599 }
600
601 const char *CharsetRecog_gb_18030::getName() const
602 {
603 return "GB18030";
604 }
605
606 const char *CharsetRecog_gb_18030::getLanguage() const
607 {
608 return "zh";
609 }
610
611 UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const
612 {
613 #if U_PLATFORM_IS_DARWIN_BASED
614 int32_t confidence = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030), keyStrings_gb_18030);
615 #else
616 int32_t confidence = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030));
617 #endif
618 results->set(det, this, confidence);
619 return (confidence > 0);
620 }
621
622 U_NAMESPACE_END
623 #endif