]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/csrmbcs.cpp
ICU-461.12.tar.gz
[apple/icu.git] / icuSources / i18n / csrmbcs.cpp
CommitLineData
73c04bcf
A
/*
 **********************************************************************
 *   Copyright (C) 2005-2008, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_CONVERSION
11
12#include "csrmbcs.h"
13
14#include <math.h>
15
16U_NAMESPACE_BEGIN
17
18#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
19
20#define min(x,y) (((x)<(y))?(x):(y))
21
// Frequently occurring double-byte Shift_JIS code values, in ascending order
// (the table is looked up with binarySearch()).
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
static const uint16_t commonChars_sjis [] = {
    0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
    0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
    0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
    0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
    0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
    0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
32
// Frequently occurring double-byte EUC-JP code values, in ascending order
// (the table is looked up with binarySearch()).
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
static const uint16_t commonChars_euc_jp[] = {
    0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
    0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
    0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
    0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
    0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
    0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
    0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
    0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
    0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
    0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
47
// Frequently occurring double-byte EUC-KR code values, in ascending order
// (the table is looked up with binarySearch()).
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
static const uint16_t commonChars_euc_kr[] = {
    0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
    0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
    0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
    0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
    0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
    0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
    0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
    0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
    0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
    0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
62
// Frequently occurring double-byte Big5 code values, in ascending order
// (the table is looked up with binarySearch()).
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
static const uint16_t commonChars_big5[] = {
    0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
    0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
    0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
    0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
    0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
    0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
    0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
    0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
    0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
    0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
77
// Frequently occurring double-byte GB18030 code values, in ascending order
// (the table is looked up with binarySearch()).
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
static const uint16_t commonChars_gb_18030[] = {
    0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
    0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
    0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
    0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
    0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
    0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
    0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
    0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
    0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
    0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
92
// Binary search for `value` in `array[0..len-1]`, which must be sorted in
// ascending order.
//
// Returns the index of a matching element, or -1 if `value` is not present
// (including when len <= 0).
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t start = 0;
    int32_t end = len - 1;

    while (start <= end) {
        // Written as start + half-span so the sum cannot overflow int32_t.
        int32_t mid = start + (end - start) / 2;

        if (array[mid] == value) {
            return mid;
        }

        if (array[mid] < value) {
            start = mid + 1;
        } else {
            end = mid - 1;
        }
    }

    return -1;
}
114
46f4442e
A
115IteratedChar::IteratedChar() :
116charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
73c04bcf
A
117{
118 // nothing else to do.
119}
120
/*void IteratedChar::reset()
{
    charValue = 0;
    index     = -1;
    nextIndex = 0;
    error     = FALSE;
    done      = FALSE;
}*/
73c04bcf
A
129
130int32_t IteratedChar::nextByte(InputText *det)
131{
132 if (nextIndex >= det->fRawLength) {
133 done = TRUE;
134
135 return -1;
136 }
137
138 return det->fRawInput[nextIndex++];
139}
140
141CharsetRecog_mbcs::~CharsetRecog_mbcs()
142{
143 // nothing to do.
144}
145
46f4442e
A
146int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) {
147 int32_t singleByteCharCount = 0;
148 int32_t doubleByteCharCount = 0;
149 int32_t commonCharCount = 0;
150 int32_t badCharCount = 0;
151 int32_t totalCharCount = 0;
152 int32_t confidence = 0;
153 IteratedChar iter;
154
155 while (nextChar(&iter, det)) {
156 totalCharCount++;
157
158 if (iter.error) {
159 badCharCount++;
73c04bcf 160 } else {
46f4442e
A
161 if (iter.charValue <= 0xFF) {
162 singleByteCharCount++;
73c04bcf 163 } else {
46f4442e 164 doubleByteCharCount++;
73c04bcf
A
165
166 if (commonChars != 0) {
46f4442e 167 if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){
73c04bcf
A
168 commonCharCount += 1;
169 }
170 }
171 }
172 }
173
174
175 if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
176 // Bail out early if the byte data is not matching the encoding scheme.
177 // break detectBlock;
73c04bcf
A
178 return confidence;
179 }
180 }
181
73c04bcf
A
182 if (doubleByteCharCount <= 10 && badCharCount == 0) {
183 // Not many multi-byte chars.
46f4442e
A
184 if (doubleByteCharCount == 0 && totalCharCount < 10) {
185 // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
186 // We don't have enough data to have any confidence.
187 // Statistical analysis of single byte non-ASCII charcters would probably help here.
188 confidence = 0;
189 }
190 else {
191 // ASCII or ISO file? It's probably not our encoding,
192 // but is not incompatible with our encoding, so don't give it a zero.
193 confidence = 10;
194 }
73c04bcf
A
195
196 return confidence;
197 }
198
199 //
200 // No match if there are too many characters that don't fit the encoding scheme.
201 // (should we have zero tolerance for these?)
202 //
203 if (doubleByteCharCount < 20*badCharCount) {
204 confidence = 0;
205
206 return confidence;
207 }
208
209 if (commonChars == 0) {
210 // We have no statistics on frequently occuring characters.
211 // Assess confidence purely on having a reasonable number of
212 // multi-byte characters (the more the better)
213 confidence = 30 + doubleByteCharCount - 20*badCharCount;
214
215 if (confidence > 100) {
216 confidence = 100;
217 }
218 } else {
219 //
220 // Frequency of occurence statistics exist.
221 //
222
223 double maxVal = log10((double)doubleByteCharCount / 4); /*(float)?*/
224 double scaleFactor = 90.0 / maxVal;
225 confidence = (int32_t)(log10((double)commonCharCount+1) * scaleFactor + 10.0);
226
227 confidence = min(confidence, 100);
228 }
229
230 if (confidence < 0) {
231 confidence = 0;
232 }
233
234 return confidence;
235}
236
237CharsetRecog_sjis::~CharsetRecog_sjis()
238{
239 // nothing to do
240}
241
242UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) {
243 it->index = it->nextIndex;
244 it->error = FALSE;
245
246 int32_t firstByte = it->charValue = it->nextByte(det);
247
248 if (firstByte < 0) {
249 return FALSE;
250 }
251
252 if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
253 return TRUE;
254 }
255
256 int32_t secondByte = it->nextByte(det);
46f4442e
A
257 if (secondByte >= 0) {
258 it->charValue = (firstByte << 8) | secondByte;
73c04bcf 259 }
46f4442e
A
260 // else we'll handle the error later.
261
73c04bcf
A
262 if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
263 // Illegal second byte value.
264 it->error = TRUE;
265 }
266
267 return TRUE;
268}
269
270int32_t CharsetRecog_sjis::match(InputText* det)
271{
272 return match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis));
273}
274
275const char *CharsetRecog_sjis::getName() const
276{
277 return "Shift_JIS";
278}
279
280const char *CharsetRecog_sjis::getLanguage() const
281{
282 return "ja";
283}
284
285CharsetRecog_euc::~CharsetRecog_euc()
286{
287 // nothing to do
288}
289
290UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) {
291 int32_t firstByte = 0;
292 int32_t secondByte = 0;
293 int32_t thirdByte = 0;
73c04bcf
A
294
295 it->index = it->nextIndex;
296 it->error = FALSE;
297 firstByte = it->charValue = it->nextByte(det);
298
299 if (firstByte < 0) {
300 // Ran off the end of the input data
46f4442e 301 return FALSE;
73c04bcf
A
302 }
303
304 if (firstByte <= 0x8D) {
305 // single byte char
46f4442e 306 return TRUE;
73c04bcf
A
307 }
308
309 secondByte = it->nextByte(det);
46f4442e
A
310 if (secondByte >= 0) {
311 it->charValue = (it->charValue << 8) | secondByte;
312 }
313 // else we'll handle the error later.
73c04bcf
A
314
315 if (firstByte >= 0xA1 && firstByte <= 0xFE) {
316 // Two byte Char
317 if (secondByte < 0xA1) {
318 it->error = TRUE;
319 }
320
46f4442e 321 return TRUE;
73c04bcf
A
322 }
323
324 if (firstByte == 0x8E) {
325 // Code Set 2.
326 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
327 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
328 // We don't know which we've got.
329 // Treat it like EUC-JP. If the data really was EUC-TW, the following two
330 // bytes will look like a well formed 2 byte char.
331 if (secondByte < 0xA1) {
332 it->error = TRUE;
333 }
334
46f4442e 335 return TRUE;
73c04bcf
A
336 }
337
338 if (firstByte == 0x8F) {
339 // Code set 3.
340 // Three byte total char size, two bytes of actual char value.
341 thirdByte = it->nextByte(det);
342 it->charValue = (it->charValue << 8) | thirdByte;
343
344 if (thirdByte < 0xa1) {
46f4442e 345 // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
73c04bcf
A
346 it->error = TRUE;
347 }
348 }
349
46f4442e 350 return TRUE;
73c04bcf
A
351
352}
353
354CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
355{
356 // nothing to do
357}
358
359const char *CharsetRecog_euc_jp::getName() const
360{
361 return "EUC-JP";
362}
363
364const char *CharsetRecog_euc_jp::getLanguage() const
365{
366 return "ja";
367}
368
369int32_t CharsetRecog_euc_jp::match(InputText *det)
370{
371 return match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp));
372}
373
374CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
375{
376 // nothing to do
377}
378
379const char *CharsetRecog_euc_kr::getName() const
380{
381 return "EUC-KR";
382}
383
384const char *CharsetRecog_euc_kr::getLanguage() const
385{
386 return "ko";
387}
388
389int32_t CharsetRecog_euc_kr::match(InputText *det)
390{
391 return match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr));
392}
393
394CharsetRecog_big5::~CharsetRecog_big5()
395{
396 // nothing to do
397}
398
399UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det)
400{
401 int32_t firstByte;
402
403 it->index = it->nextIndex;
404 it->error = FALSE;
405 firstByte = it->charValue = it->nextByte(det);
406
407 if (firstByte < 0) {
408 return FALSE;
409 }
410
411 if (firstByte <= 0x7F || firstByte == 0xFF) {
412 // single byte character.
413 return TRUE;
414 }
415
416 int32_t secondByte = it->nextByte(det);
46f4442e
A
417 if (secondByte >= 0) {
418 it->charValue = (it->charValue << 8) | secondByte;
73c04bcf 419 }
46f4442e 420 // else we'll handle the error later.
73c04bcf 421
46f4442e
A
422 if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
423 it->error = TRUE;
73c04bcf
A
424 }
425
426 return TRUE;
427}
428
429const char *CharsetRecog_big5::getName() const
430{
431 return "Big5";
432}
433
434const char *CharsetRecog_big5::getLanguage() const
435{
436 return "zh";
437}
438
439int32_t CharsetRecog_big5::match(InputText *det)
440{
441 return match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5));
442}
443
444CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
445{
446 // nothing to do
447}
448
449UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) {
450 int32_t firstByte = 0;
451 int32_t secondByte = 0;
452 int32_t thirdByte = 0;
453 int32_t fourthByte = 0;
454
455 it->index = it->nextIndex;
456 it->error = FALSE;
457 firstByte = it->charValue = it->nextByte(det);
458
459 if (firstByte < 0) {
460 // Ran off the end of the input data
46f4442e 461 return FALSE;
73c04bcf
A
462 }
463
464 if (firstByte <= 0x80) {
465 // single byte char
46f4442e 466 return TRUE;
73c04bcf
A
467 }
468
469 secondByte = it->nextByte(det);
46f4442e
A
470 if (secondByte >= 0) {
471 it->charValue = (it->charValue << 8) | secondByte;
472 }
473 // else we'll handle the error later.
73c04bcf
A
474
475 if (firstByte >= 0x81 && firstByte <= 0xFE) {
476 // Two byte Char
477 if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
46f4442e 478 return TRUE;
73c04bcf
A
479 }
480
481 // Four byte char
482 if (secondByte >= 0x30 && secondByte <= 0x39) {
483 thirdByte = it->nextByte(det);
484
485 if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
486 fourthByte = it->nextByte(det);
487
488 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
489 it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
490
46f4442e 491 return TRUE;
73c04bcf
A
492 }
493 }
494 }
495
46f4442e 496 // Something wasn't valid, or we ran out of data (-1).
73c04bcf 497 it->error = TRUE;
73c04bcf
A
498 }
499
46f4442e 500 return TRUE;
73c04bcf
A
501}
502
503const char *CharsetRecog_gb_18030::getName() const
504{
505 return "GB18030";
506}
507
508const char *CharsetRecog_gb_18030::getLanguage() const
509{
510 return "zh";
511}
512
513int32_t CharsetRecog_gb_18030::match(InputText *det)
514{
515 return match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030));
516}
517
518U_NAMESPACE_END
519#endif