]>
git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/csrmbcs.cpp
2 **********************************************************************
3 * Copyright (C) 2005-2008, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_CONVERSION
18 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
20 #define min(x,y) (((x)<(y))?(x):(y))
// Frequently occurring two-byte Shift-JIS code values (lead byte << 8 | trail
// byte). The table is kept in ascending order; it is handed to match_mbcs(),
// which looks values up with a binary search.
static const uint16_t commonChars_sjis[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
    0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
    0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
    0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
    0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
    0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
    0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
// Frequently occurring two-byte EUC-JP code values (first byte << 8 | second
// byte). The table is kept in ascending order; it is handed to match_mbcs(),
// which looks values up with a binary search.
static const uint16_t commonChars_euc_jp[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
    0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
    0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
    0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
    0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
    0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
    0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
    0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
    0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
    0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
    0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
// Frequently occurring two-byte EUC-KR code values (first byte << 8 | second
// byte). The table is kept in ascending order; it is handed to match_mbcs(),
// which looks values up with a binary search.
static const uint16_t commonChars_euc_kr[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
    0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
    0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
    0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
    0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
    0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
    0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
    0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
    0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
    0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
    0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
// Frequently occurring two-byte Big5 code values (first byte << 8 | second
// byte). The table is kept in ascending order; it is handed to match_mbcs(),
// which looks values up with a binary search.
static const uint16_t commonChars_big5[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
    0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
    0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
    0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
    0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
    0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
    0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
    0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
    0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
    0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
    0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
// Frequently occurring two-byte GB18030 code values (first byte << 8 | second
// byte). The table is kept in ascending order; it is handed to match_mbcs(),
// which looks values up with a binary search.
static const uint16_t commonChars_gb_18030[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
    0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
    0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
    0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
    0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
    0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
    0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
    0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
    0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
    0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
    0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
/**
 * Binary search for a 16-bit value in a table sorted in ascending order.
 *
 * @param array  the sorted table; may be 0, in which case nothing is found.
 * @param len    number of entries in array; values <= 0 find nothing.
 * @param value  the value to look for.
 * @return the index of an entry equal to value, or -1 if there is none.
 */
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    if (array == 0 || len <= 0) {
        return -1;
    }

    int32_t start = 0, end = len-1;

    while (start <= end) {
        // Written as start + half the distance so the sum cannot overflow
        // for very large tables; equal to (start+end)/2 otherwise.
        int32_t mid = start + (end - start)/2;

        if (array[mid] == value) {
            return mid;
        }

        if (array[mid] < value) {
            start = mid+1;
        } else {
            end = mid-1;
        }
    }

    return -1;
}
115 IteratedChar::IteratedChar() :
116 charValue(0), index(-1), nextIndex(0), error(FALSE
), done(FALSE
)
118 // nothing else to do.
121 /*void IteratedChar::reset()
130 int32_t IteratedChar::nextByte(InputText
*det
)
132 if (nextIndex
>= det
->fRawLength
) {
138 return det
->fRawInput
[nextIndex
++];
141 CharsetRecog_mbcs::~CharsetRecog_mbcs()
146 int32_t CharsetRecog_mbcs::match_mbcs(InputText
*det
, const uint16_t commonChars
[], int32_t commonCharsLen
) {
147 int32_t singleByteCharCount
= 0;
148 int32_t doubleByteCharCount
= 0;
149 int32_t commonCharCount
= 0;
150 int32_t badCharCount
= 0;
151 int32_t totalCharCount
= 0;
152 int32_t confidence
= 0;
155 while (nextChar(&iter
, det
)) {
161 if (iter
.charValue
<= 0xFF) {
162 singleByteCharCount
++;
164 doubleByteCharCount
++;
166 if (commonChars
!= 0) {
167 if (binarySearch(commonChars
, commonCharsLen
, iter
.charValue
) >= 0){
168 commonCharCount
+= 1;
175 if (badCharCount
>= 2 && badCharCount
*5 >= doubleByteCharCount
) {
176 // Bail out early if the byte data is not matching the encoding scheme.
177 // break detectBlock;
182 if (doubleByteCharCount
<= 10 && badCharCount
== 0) {
183 // Not many multi-byte chars.
184 if (doubleByteCharCount
== 0 && totalCharCount
< 10) {
185 // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
186 // We don't have enough data to have any confidence.
187 // Statistical analysis of single byte non-ASCII charcters would probably help here.
191 // ASCII or ISO file? It's probably not our encoding,
192 // but is not incompatible with our encoding, so don't give it a zero.
200 // No match if there are too many characters that don't fit the encoding scheme.
201 // (should we have zero tolerance for these?)
203 if (doubleByteCharCount
< 20*badCharCount
) {
209 if (commonChars
== 0) {
210 // We have no statistics on frequently occuring characters.
211 // Assess confidence purely on having a reasonable number of
212 // multi-byte characters (the more the better)
213 confidence
= 30 + doubleByteCharCount
- 20*badCharCount
;
215 if (confidence
> 100) {
220 // Frequency of occurence statistics exist.
223 double maxVal
= log10((double)doubleByteCharCount
/ 4); /*(float)?*/
224 double scaleFactor
= 90.0 / maxVal
;
225 confidence
= (int32_t)(log10((double)commonCharCount
+1) * scaleFactor
+ 10.0);
227 confidence
= min(confidence
, 100);
230 if (confidence
< 0) {
237 CharsetRecog_sjis::~CharsetRecog_sjis()
242 UBool
CharsetRecog_sjis::nextChar(IteratedChar
* it
, InputText
* det
) {
243 it
->index
= it
->nextIndex
;
246 int32_t firstByte
= it
->charValue
= it
->nextByte(det
);
252 if (firstByte
<= 0x7F || (firstByte
> 0xA0 && firstByte
<= 0xDF)) {
256 int32_t secondByte
= it
->nextByte(det
);
257 if (secondByte
>= 0) {
258 it
->charValue
= (firstByte
<< 8) | secondByte
;
260 // else we'll handle the error later.
262 if (! ((secondByte
>= 0x40 && secondByte
<= 0x7F) || (secondByte
>= 0x80 && secondByte
<= 0xFE))) {
263 // Illegal second byte value.
270 int32_t CharsetRecog_sjis::match(InputText
* det
)
272 return match_mbcs(det
, commonChars_sjis
, ARRAY_SIZE(commonChars_sjis
));
275 const char *CharsetRecog_sjis::getName() const
280 const char *CharsetRecog_sjis::getLanguage() const
285 CharsetRecog_euc::~CharsetRecog_euc()
290 UBool
CharsetRecog_euc::nextChar(IteratedChar
* it
, InputText
* det
) {
291 int32_t firstByte
= 0;
292 int32_t secondByte
= 0;
293 int32_t thirdByte
= 0;
295 it
->index
= it
->nextIndex
;
297 firstByte
= it
->charValue
= it
->nextByte(det
);
300 // Ran off the end of the input data
304 if (firstByte
<= 0x8D) {
309 secondByte
= it
->nextByte(det
);
310 if (secondByte
>= 0) {
311 it
->charValue
= (it
->charValue
<< 8) | secondByte
;
313 // else we'll handle the error later.
315 if (firstByte
>= 0xA1 && firstByte
<= 0xFE) {
317 if (secondByte
< 0xA1) {
324 if (firstByte
== 0x8E) {
326 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
327 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
328 // We don't know which we've got.
329 // Treat it like EUC-JP. If the data really was EUC-TW, the following two
330 // bytes will look like a well formed 2 byte char.
331 if (secondByte
< 0xA1) {
338 if (firstByte
== 0x8F) {
340 // Three byte total char size, two bytes of actual char value.
341 thirdByte
= it
->nextByte(det
);
342 it
->charValue
= (it
->charValue
<< 8) | thirdByte
;
344 if (thirdByte
< 0xa1) {
345 // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
354 CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
359 const char *CharsetRecog_euc_jp::getName() const
364 const char *CharsetRecog_euc_jp::getLanguage() const
369 int32_t CharsetRecog_euc_jp::match(InputText
*det
)
371 return match_mbcs(det
, commonChars_euc_jp
, ARRAY_SIZE(commonChars_euc_jp
));
374 CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
379 const char *CharsetRecog_euc_kr::getName() const
384 const char *CharsetRecog_euc_kr::getLanguage() const
389 int32_t CharsetRecog_euc_kr::match(InputText
*det
)
391 return match_mbcs(det
, commonChars_euc_kr
, ARRAY_SIZE(commonChars_euc_kr
));
394 CharsetRecog_big5::~CharsetRecog_big5()
399 UBool
CharsetRecog_big5::nextChar(IteratedChar
* it
, InputText
* det
)
403 it
->index
= it
->nextIndex
;
405 firstByte
= it
->charValue
= it
->nextByte(det
);
411 if (firstByte
<= 0x7F || firstByte
== 0xFF) {
412 // single byte character.
416 int32_t secondByte
= it
->nextByte(det
);
417 if (secondByte
>= 0) {
418 it
->charValue
= (it
->charValue
<< 8) | secondByte
;
420 // else we'll handle the error later.
422 if (secondByte
< 0x40 || secondByte
== 0x7F || secondByte
== 0xFF) {
429 const char *CharsetRecog_big5::getName() const
434 const char *CharsetRecog_big5::getLanguage() const
439 int32_t CharsetRecog_big5::match(InputText
*det
)
441 return match_mbcs(det
, commonChars_big5
, ARRAY_SIZE(commonChars_big5
));
444 CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
449 UBool
CharsetRecog_gb_18030::nextChar(IteratedChar
* it
, InputText
* det
) {
450 int32_t firstByte
= 0;
451 int32_t secondByte
= 0;
452 int32_t thirdByte
= 0;
453 int32_t fourthByte
= 0;
455 it
->index
= it
->nextIndex
;
457 firstByte
= it
->charValue
= it
->nextByte(det
);
460 // Ran off the end of the input data
464 if (firstByte
<= 0x80) {
469 secondByte
= it
->nextByte(det
);
470 if (secondByte
>= 0) {
471 it
->charValue
= (it
->charValue
<< 8) | secondByte
;
473 // else we'll handle the error later.
475 if (firstByte
>= 0x81 && firstByte
<= 0xFE) {
477 if ((secondByte
>= 0x40 && secondByte
<= 0x7E) || (secondByte
>=80 && secondByte
<= 0xFE)) {
482 if (secondByte
>= 0x30 && secondByte
<= 0x39) {
483 thirdByte
= it
->nextByte(det
);
485 if (thirdByte
>= 0x81 && thirdByte
<= 0xFE) {
486 fourthByte
= it
->nextByte(det
);
488 if (fourthByte
>= 0x30 && fourthByte
<= 0x39) {
489 it
->charValue
= (it
->charValue
<< 16) | (thirdByte
<< 8) | fourthByte
;
496 // Something wasn't valid, or we ran out of data (-1).
503 const char *CharsetRecog_gb_18030::getName() const
508 const char *CharsetRecog_gb_18030::getLanguage() const
513 int32_t CharsetRecog_gb_18030::match(InputText
*det
)
515 return match_mbcs(det
, commonChars_gb_18030
, ARRAY_SIZE(commonChars_gb_18030
));