// Provenance: apple/icu.git (ICU-531.30), icuSources/i18n/csrmbcs.cpp,
// recovered from the git.saurik.com blame view.
/*
 **********************************************************************
 *   Copyright (C) 2005-2012, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_CONVERSION
11
51004dcb 12#include "csmatch.h"
73c04bcf
A
13#include "csrmbcs.h"
14
15#include <math.h>
16
17U_NAMESPACE_BEGIN
18
19#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
20
21#define min(x,y) (((x)<(y))?(x):(y))
22
// Frequently occurring double-byte Shift_JIS code values.
// The table is looked up with binarySearch(), so it must stay sorted in
// ascending order.
static const uint16_t commonChars_sjis [] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
33
// Frequently occurring double-byte EUC-JP code values.
// The table is looked up with binarySearch(), so it must stay sorted in
// ascending order.
static const uint16_t commonChars_euc_jp[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
48
// Frequently occurring double-byte EUC-KR code values.
// The table is looked up with binarySearch(), so it must stay sorted in
// ascending order.
static const uint16_t commonChars_euc_kr[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
63
// Frequently occurring double-byte Big5 code values.
// The table is looked up with binarySearch(), so it must stay sorted in
// ascending order.
static const uint16_t commonChars_big5[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
78
// Frequently occurring double-byte GB18030 code values.
// The table is looked up with binarySearch(), so it must stay sorted in
// ascending order.
static const uint16_t commonChars_gb_18030[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
93
4388f060
A
#if U_PLATFORM_IS_DARWIN_BASED
// Per-encoding key byte strings (mail signature and "forward" phrases).
// Each table is a list of NUL-terminated byte strings and ends with an empty
// entry ({0}); match_mbcs() walks a table until it reaches that entry.
static const uint8_t keyStrings_sjis[][MAX_KEY_STRING_WITH_NULL] = {
    {0x82,0xa9,0x82,0xe7,0x91,0x97,0x90,0x4d,0}, // Signatures - Sent from my ...
    {0x93,0x5d,0x91,0x97,0x83,0x81,0x83,0x62,0x83,0x5a,0x81,0x5b,0x83,0x57,0}, // forward
    {0}
};
static const uint8_t keyStrings_euc_jp[][MAX_KEY_STRING_WITH_NULL] = {
    {0xa4,0xab,0xa4,0xe9,0xc1,0xf7,0xbf,0xae,0}, // Signatures - Sent from my ...
    {0xc5,0xbe,0xc1,0xf7,0xa5,0xe1,0xa5,0xc3,0xa5,0xbb,0xa1,0xbc,0xa5,0xb8,0}, // forward
    {0}
};
static const uint8_t keyStrings_euc_kr[][MAX_KEY_STRING_WITH_NULL] = {
    {0xb3,0xaa,0xc0,0xc7,0}, // Signatures - Sent from my ... #1
    {0xbf,0xa1,0xbc,0xad,0x20,0xba,0xb8,0xb3,0xbf,0}, // Signatures - Sent from my ... #2
    {0xc0,0xfc,0xb4,0xde,0xb5,0xc8,0x20,0xb8,0xde,0xbd,0xc3,0xc1,0xf6,0}, // forward
    {0}
};
static const uint8_t keyStrings_big5[][MAX_KEY_STRING_WITH_NULL] = {
    {0xb1,0x71,0xa7,0xda,0xaa,0xba,0}, // Signatures - Sent from my ... #1
    {0xb6,0xc7,0xb0,0x65,0}, // Signatures - Sent from my ... #2
    {0xb6,0x7d,0xa9,0x6c,0xc2,0xe0,0xb1,0x48,0xb6,0x6c,0xa5,0xf3,0}, // forward
    {0}
};
static const uint8_t keyStrings_gb_18030[][MAX_KEY_STRING_WITH_NULL] = {
    {0xb7,0xa2,0xd7,0xd4,0xce,0xd2,0xb5,0xc4,0}, // Signatures - Sent from my iP...
    {0xd7,0xaa,0xb7,0xa2,0xb5,0xc4,0xd3,0xca,0xbc,0xfe,0}, // forward
    {0}
};
#endif
123
/**
 * Binary search of a table of 16-bit values sorted in ascending order.
 *
 * @param array  table to search; must be sorted ascending
 * @param len    number of entries in array; a value <= 0 yields "not found"
 * @param value  value to look for
 * @return the index of value in array, or -1 if it is not present
 */
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t low = 0;
    int32_t high = len - 1;

    while (low <= high) {
        // low + (high - low) / 2 cannot overflow, unlike (low + high) / 2.
        int32_t mid = low + (high - low) / 2;
        uint16_t probe = array[mid];

        if (probe == value) {
            return mid;
        }

        if (probe < value) {
            low = mid + 1;
        } else {
            high = mid - 1;
        }
    }

    return -1;
}
145
4388f060
A
146#if U_PLATFORM_IS_DARWIN_BASED
// If the NUL-terminated byte string testPrefix is a prefix of the bytes in
// [base, baseLimit), return its length; otherwise return 0.
// An empty testPrefix therefore also yields 0.
static int32_t isPrefix(const uint8_t *testPrefix, const uint8_t *base, const uint8_t *baseLimit) {
    const uint8_t *cursor = testPrefix;

    // Advance while the key has bytes left, the input has bytes left,
    // and the two agree.
    while (*cursor != 0 && base < baseLimit && *cursor == *base) {
        ++cursor;
        ++base;
    }

    // Only a fully consumed key counts as a match.
    if (*cursor != 0) {
        return 0;
    }
    return (int32_t)(cursor - testPrefix);
}
156#endif
157
46f4442e
A
158IteratedChar::IteratedChar() :
159charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
73c04bcf
A
160{
161 // nothing else to do.
162}
163
/*void IteratedChar::reset()
{
    charValue = 0;
    index     = -1;
    nextIndex = 0;
    error     = FALSE;
    done      = FALSE;
}*/
73c04bcf
A
172
173int32_t IteratedChar::nextByte(InputText *det)
174{
175 if (nextIndex >= det->fRawLength) {
176 done = TRUE;
177
178 return -1;
179 }
180
181 return det->fRawInput[nextIndex++];
182}
183
184CharsetRecog_mbcs::~CharsetRecog_mbcs()
185{
186 // nothing to do.
187}
188
4388f060 189#if U_PLATFORM_IS_DARWIN_BASED
51004dcb 190int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen, const uint8_t (*keyStrings)[MAX_KEY_STRING_WITH_NULL] ) const {
4388f060 191#else
51004dcb 192int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
4388f060 193#endif
46f4442e
A
194 int32_t singleByteCharCount = 0;
195 int32_t doubleByteCharCount = 0;
196 int32_t commonCharCount = 0;
197 int32_t badCharCount = 0;
198 int32_t totalCharCount = 0;
199 int32_t confidence = 0;
4388f060
A
200#if U_PLATFORM_IS_DARWIN_BASED
201 int32_t confidenceFromKeys = 0;
202#endif
46f4442e
A
203 IteratedChar iter;
204
205 while (nextChar(&iter, det)) {
206 totalCharCount++;
207
208 if (iter.error) {
209 badCharCount++;
73c04bcf 210 } else {
46f4442e
A
211 if (iter.charValue <= 0xFF) {
212 singleByteCharCount++;
73c04bcf 213 } else {
46f4442e 214 doubleByteCharCount++;
73c04bcf
A
215
216 if (commonChars != 0) {
46f4442e 217 if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){
73c04bcf
A
218 commonCharCount += 1;
219 }
220 }
4388f060
A
221#if U_PLATFORM_IS_DARWIN_BASED
222 if (doubleByteCharCount <= 20) {
223 int32_t keyIndex;
224 for ( keyIndex = 0; keyStrings[keyIndex][0] != 0; keyIndex++ ) {
225 int32_t prefixLen = isPrefix(keyStrings[keyIndex], &det->fRawInput[iter.index], &det->fRawInput[det->fRawLength]);
226 confidenceFromKeys += prefixLen*5;
227 }
228 }
229#endif
73c04bcf
A
230 }
231 }
232
233
234 if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
235 // Bail out early if the byte data is not matching the encoding scheme.
236 // break detectBlock;
73c04bcf
A
237 return confidence;
238 }
239 }
240
73c04bcf
A
241 if (doubleByteCharCount <= 10 && badCharCount == 0) {
242 // Not many multi-byte chars.
46f4442e
A
243 if (doubleByteCharCount == 0 && totalCharCount < 10) {
244 // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
245 // We don't have enough data to have any confidence.
246 // Statistical analysis of single byte non-ASCII charcters would probably help here.
247 confidence = 0;
248 }
249 else {
250 // ASCII or ISO file? It's probably not our encoding,
251 // but is not incompatible with our encoding, so don't give it a zero.
4388f060
A
252#if U_PLATFORM_IS_DARWIN_BASED
253 if (confidenceFromKeys > 90) {
254 confidenceFromKeys = 90;
255 } else if (confidenceFromKeys > 0 && confidenceFromKeys < 70) {
256 confidenceFromKeys += 20;
257 }
258 confidence = 10 + confidenceFromKeys;
259#else
46f4442e 260 confidence = 10;
4388f060 261#endif
46f4442e 262 }
73c04bcf
A
263
264 return confidence;
265 }
266
267 //
268 // No match if there are too many characters that don't fit the encoding scheme.
269 // (should we have zero tolerance for these?)
270 //
271 if (doubleByteCharCount < 20*badCharCount) {
272 confidence = 0;
273
274 return confidence;
275 }
276
277 if (commonChars == 0) {
278 // We have no statistics on frequently occuring characters.
279 // Assess confidence purely on having a reasonable number of
280 // multi-byte characters (the more the better)
281 confidence = 30 + doubleByteCharCount - 20*badCharCount;
4388f060
A
282#if U_PLATFORM_IS_DARWIN_BASED
283 confidence += confidenceFromKeys;
284#endif
73c04bcf
A
285
286 if (confidence > 100) {
287 confidence = 100;
288 }
289 } else {
290 //
291 // Frequency of occurence statistics exist.
292 //
293
4388f060 294 double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/
73c04bcf 295 double scaleFactor = 90.0 / maxVal;
4388f060
A
296 confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);
297#if U_PLATFORM_IS_DARWIN_BASED
298 confidence += confidenceFromKeys;
299#endif
73c04bcf
A
300
301 confidence = min(confidence, 100);
302 }
303
304 if (confidence < 0) {
305 confidence = 0;
306 }
307
308 return confidence;
309}
310
311CharsetRecog_sjis::~CharsetRecog_sjis()
312{
313 // nothing to do
314}
315
51004dcb 316UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
73c04bcf
A
317 it->index = it->nextIndex;
318 it->error = FALSE;
319
320 int32_t firstByte = it->charValue = it->nextByte(det);
321
322 if (firstByte < 0) {
323 return FALSE;
324 }
325
326 if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
327 return TRUE;
328 }
329
330 int32_t secondByte = it->nextByte(det);
46f4442e
A
331 if (secondByte >= 0) {
332 it->charValue = (firstByte << 8) | secondByte;
73c04bcf 333 }
46f4442e
A
334 // else we'll handle the error later.
335
73c04bcf
A
336 if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
337 // Illegal second byte value.
338 it->error = TRUE;
339 }
340
341 return TRUE;
342}
343
51004dcb 344UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
4388f060 345#if U_PLATFORM_IS_DARWIN_BASED
51004dcb 346 int32_t confidence = match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis), keyStrings_sjis);
4388f060 347#else
51004dcb 348 int32_t confidence = match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis));
4388f060 349#endif
51004dcb
A
350 results->set(det, this, confidence);
351 return (confidence > 0);
73c04bcf
A
352}
353
354const char *CharsetRecog_sjis::getName() const
355{
356 return "Shift_JIS";
357}
358
359const char *CharsetRecog_sjis::getLanguage() const
360{
361 return "ja";
362}
363
364CharsetRecog_euc::~CharsetRecog_euc()
365{
366 // nothing to do
367}
368
51004dcb 369UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
73c04bcf
A
370 int32_t firstByte = 0;
371 int32_t secondByte = 0;
372 int32_t thirdByte = 0;
73c04bcf
A
373
374 it->index = it->nextIndex;
375 it->error = FALSE;
376 firstByte = it->charValue = it->nextByte(det);
377
378 if (firstByte < 0) {
379 // Ran off the end of the input data
46f4442e 380 return FALSE;
73c04bcf
A
381 }
382
383 if (firstByte <= 0x8D) {
384 // single byte char
46f4442e 385 return TRUE;
73c04bcf
A
386 }
387
388 secondByte = it->nextByte(det);
46f4442e
A
389 if (secondByte >= 0) {
390 it->charValue = (it->charValue << 8) | secondByte;
391 }
392 // else we'll handle the error later.
73c04bcf
A
393
394 if (firstByte >= 0xA1 && firstByte <= 0xFE) {
395 // Two byte Char
396 if (secondByte < 0xA1) {
397 it->error = TRUE;
398 }
399
46f4442e 400 return TRUE;
73c04bcf
A
401 }
402
403 if (firstByte == 0x8E) {
404 // Code Set 2.
405 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
406 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
407 // We don't know which we've got.
408 // Treat it like EUC-JP. If the data really was EUC-TW, the following two
409 // bytes will look like a well formed 2 byte char.
410 if (secondByte < 0xA1) {
411 it->error = TRUE;
412 }
413
46f4442e 414 return TRUE;
73c04bcf
A
415 }
416
417 if (firstByte == 0x8F) {
418 // Code set 3.
419 // Three byte total char size, two bytes of actual char value.
420 thirdByte = it->nextByte(det);
421 it->charValue = (it->charValue << 8) | thirdByte;
422
423 if (thirdByte < 0xa1) {
46f4442e 424 // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
73c04bcf
A
425 it->error = TRUE;
426 }
427 }
428
46f4442e 429 return TRUE;
73c04bcf
A
430
431}
432
433CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
434{
435 // nothing to do
436}
437
438const char *CharsetRecog_euc_jp::getName() const
439{
440 return "EUC-JP";
441}
442
443const char *CharsetRecog_euc_jp::getLanguage() const
444{
445 return "ja";
446}
447
51004dcb 448UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const
73c04bcf 449{
4388f060 450#if U_PLATFORM_IS_DARWIN_BASED
51004dcb 451 int32_t confidence = match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp), keyStrings_euc_jp);
4388f060 452#else
51004dcb 453 int32_t confidence = match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp));
4388f060 454#endif
51004dcb
A
455 results->set(det, this, confidence);
456 return (confidence > 0);
73c04bcf
A
457}
458
459CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
460{
461 // nothing to do
462}
463
464const char *CharsetRecog_euc_kr::getName() const
465{
466 return "EUC-KR";
467}
468
469const char *CharsetRecog_euc_kr::getLanguage() const
470{
471 return "ko";
472}
473
51004dcb 474UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const
73c04bcf 475{
4388f060 476#if U_PLATFORM_IS_DARWIN_BASED
51004dcb 477 int32_t confidence = match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr), keyStrings_euc_kr);
4388f060 478#else
51004dcb 479 int32_t confidence = match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr));
4388f060 480#endif
51004dcb
A
481 results->set(det, this, confidence);
482 return (confidence > 0);
73c04bcf
A
483}
484
485CharsetRecog_big5::~CharsetRecog_big5()
486{
487 // nothing to do
488}
489
51004dcb 490UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
73c04bcf
A
491{
492 int32_t firstByte;
493
494 it->index = it->nextIndex;
495 it->error = FALSE;
496 firstByte = it->charValue = it->nextByte(det);
497
498 if (firstByte < 0) {
499 return FALSE;
500 }
501
502 if (firstByte <= 0x7F || firstByte == 0xFF) {
503 // single byte character.
504 return TRUE;
505 }
506
507 int32_t secondByte = it->nextByte(det);
46f4442e
A
508 if (secondByte >= 0) {
509 it->charValue = (it->charValue << 8) | secondByte;
73c04bcf 510 }
46f4442e 511 // else we'll handle the error later.
73c04bcf 512
46f4442e
A
513 if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
514 it->error = TRUE;
73c04bcf
A
515 }
516
517 return TRUE;
518}
519
520const char *CharsetRecog_big5::getName() const
521{
522 return "Big5";
523}
524
525const char *CharsetRecog_big5::getLanguage() const
526{
527 return "zh";
528}
529
51004dcb 530UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const
73c04bcf 531{
4388f060 532#if U_PLATFORM_IS_DARWIN_BASED
51004dcb 533 int32_t confidence = match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5), keyStrings_big5);
4388f060 534#else
51004dcb 535 int32_t confidence = match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5));
4388f060 536#endif
51004dcb
A
537 results->set(det, this, confidence);
538 return (confidence > 0);
73c04bcf
A
539}
540
541CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
542{
543 // nothing to do
544}
545
51004dcb 546UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
73c04bcf
A
547 int32_t firstByte = 0;
548 int32_t secondByte = 0;
549 int32_t thirdByte = 0;
550 int32_t fourthByte = 0;
551
552 it->index = it->nextIndex;
553 it->error = FALSE;
554 firstByte = it->charValue = it->nextByte(det);
555
556 if (firstByte < 0) {
557 // Ran off the end of the input data
46f4442e 558 return FALSE;
73c04bcf
A
559 }
560
561 if (firstByte <= 0x80) {
562 // single byte char
46f4442e 563 return TRUE;
73c04bcf
A
564 }
565
566 secondByte = it->nextByte(det);
46f4442e
A
567 if (secondByte >= 0) {
568 it->charValue = (it->charValue << 8) | secondByte;
569 }
570 // else we'll handle the error later.
73c04bcf
A
571
572 if (firstByte >= 0x81 && firstByte <= 0xFE) {
573 // Two byte Char
574 if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
46f4442e 575 return TRUE;
73c04bcf
A
576 }
577
578 // Four byte char
579 if (secondByte >= 0x30 && secondByte <= 0x39) {
580 thirdByte = it->nextByte(det);
581
582 if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
583 fourthByte = it->nextByte(det);
584
585 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
586 it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
587
46f4442e 588 return TRUE;
73c04bcf
A
589 }
590 }
591 }
592
46f4442e 593 // Something wasn't valid, or we ran out of data (-1).
73c04bcf 594 it->error = TRUE;
73c04bcf
A
595 }
596
46f4442e 597 return TRUE;
73c04bcf
A
598}
599
600const char *CharsetRecog_gb_18030::getName() const
601{
602 return "GB18030";
603}
604
605const char *CharsetRecog_gb_18030::getLanguage() const
606{
607 return "zh";
608}
609
51004dcb 610UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const
73c04bcf 611{
4388f060 612#if U_PLATFORM_IS_DARWIN_BASED
51004dcb 613 int32_t confidence = match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030), keyStrings_gb_18030);
4388f060 614#else
51004dcb 615 int32_t confidence = match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030));
4388f060 616#endif
51004dcb
A
617 results->set(det, this, confidence);
618 return (confidence > 0);
73c04bcf
A
619}
620
621U_NAMESPACE_END
622#endif