]>
git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/csrmbcs.cpp
2 **********************************************************************
3 * Copyright (C) 2005-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_CONVERSION
18 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
20 #define min(x,y) (((x)<(y))?(x):(y))
// Two-byte code values that match_mbcs() counts as "common" characters when
// CharsetRecog_sjis::match() passes this table in.
// Entries are listed in ascending order; match_mbcs() looks values up with
// binarySearch(), so keep the table sorted when editing it.
22 static const uint16_t commonChars_sjis
[] = {
23 // TODO: This set of data comes from the character frequency-
24 // of-occurrence analysis tool. The data needs to be moved
25 // into a resource and loaded from there.
26 0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
27 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
28 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
29 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
30 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
31 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
// Two-byte code values that match_mbcs() counts as "common" characters when
// CharsetRecog_euc_jp::match() passes this table in.
// Entries are listed in ascending order; match_mbcs() looks values up with
// binarySearch(), so keep the table sorted when editing it.
33 static const uint16_t commonChars_euc_jp
[] = {
34 // TODO: This set of data comes from the character frequency-
35 // of-occurrence analysis tool. The data needs to be moved
36 // into a resource and loaded from there.
37 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
38 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
39 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
40 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
41 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
42 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
43 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
44 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
45 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
46 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
// Two-byte code values that match_mbcs() counts as "common" characters when
// CharsetRecog_euc_kr::match() passes this table in.
// Entries are listed in ascending order; match_mbcs() looks values up with
// binarySearch(), so keep the table sorted when editing it.
48 static const uint16_t commonChars_euc_kr
[] = {
49 // TODO: This set of data comes from the character frequency-
50 // of-occurrence analysis tool. The data needs to be moved
51 // into a resource and loaded from there.
52 0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
53 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
54 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
55 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
56 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
57 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
58 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
59 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
60 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
61 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
// Two-byte code values that match_mbcs() counts as "common" characters when
// CharsetRecog_big5::match() passes this table in.
// Entries are listed in ascending order; match_mbcs() looks values up with
// binarySearch(), so keep the table sorted when editing it.
63 static const uint16_t commonChars_big5
[] = {
64 // TODO: This set of data comes from the character frequency-
65 // of-occurrence analysis tool. The data needs to be moved
66 // into a resource and loaded from there.
67 0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
68 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
69 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
70 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
71 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
72 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
73 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
74 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
75 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
76 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
// Two-byte code values that match_mbcs() counts as "common" characters when
// CharsetRecog_gb_18030::match() passes this table in.
// Entries are listed in ascending order; match_mbcs() looks values up with
// binarySearch(), so keep the table sorted when editing it.
78 static const uint16_t commonChars_gb_18030
[] = {
79 // TODO: This set of data comes from the character frequency-
80 // of-occurrence analysis tool. The data needs to be moved
81 // into a resource and loaded from there.
82 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
83 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
84 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
85 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
86 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
87 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
88 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
89 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
90 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
91 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
93 #if U_PLATFORM_IS_DARWIN_BASED
94 static const uint8_t keyStrings_sjis
[][MAX_KEY_STRING_WITH_NULL
] = {
95 {0x82,0xa9,0x82,0xe7,0x91,0x97,0x90,0x4d,0}, // Signatures - Sent from my ...
96 {0x93,0x5d,0x91,0x97,0x83,0x81,0x83,0x62,0x83,0x5a,0x81,0x5b,0x83,0x57,0}, // forward
99 static const uint8_t keyStrings_euc_jp
[][MAX_KEY_STRING_WITH_NULL
] = {
100 {0xa4,0xab,0xa4,0xe9,0xc1,0xf7,0xbf,0xae,0}, // Signatures - Sent from my ...
101 {0xc5,0xbe,0xc1,0xf7,0xa5,0xe1,0xa5,0xc3,0xa5,0xbb,0xa1,0xbc,0xa5,0xb8,0}, // forward
104 static const uint8_t keyStrings_euc_kr
[][MAX_KEY_STRING_WITH_NULL
] = {
105 {0xb3,0xaa,0xc0,0xc7,0}, // Signatures - Sent from my ... #1
106 {0xbf,0xa1,0xbc,0xad,0x20,0xba,0xb8,0xb3,0xbf,0}, // Signatures - Sent from my ... #2
107 {0xc0,0xfc,0xb4,0xde,0xb5,0xc8,0x20,0xb8,0xde,0xbd,0xc3,0xc1,0xf6,0}, // forward
110 static const uint8_t keyStrings_big5
[][MAX_KEY_STRING_WITH_NULL
] = {
111 {0xb1,0x71,0xa7,0xda,0xaa,0xba,0}, // Signatures - Sent from my ... #1
112 {0xb6,0xc7,0xb0,0x65,0}, // Signatures - Sent from my ... #2
113 {0xb6,0x7d,0xa9,0x6c,0xc2,0xe0,0xb1,0x48,0xb6,0x6c,0xa5,0xf3,0}, // forward
116 static const uint8_t keyStrings_gb_18030
[][MAX_KEY_STRING_WITH_NULL
] = {
117 {0xb7,0xa2,0xd7,0xd4,0xce,0xd2,0xb5,0xc4,0}, // Signatures - Sent from my iP...
118 {0xd7,0xaa,0xb7,0xa2,0xb5,0xc4,0xd3,0xca,0xbc,0xfe,0}, // forward
123 static int32_t binarySearch(const uint16_t *array
, int32_t len
, uint16_t value
)
125 int32_t start
= 0, end
= len
-1;
126 int32_t mid
= (start
+end
)/2;
128 while(start
<= end
) {
129 if(array
[mid
] == value
) {
133 if(array
[mid
] < value
){
145 #if U_PLATFORM_IS_DARWIN_BASED
146 // If testPrefix is a prefix of base, return its length, else return 0
147 static int32_t isPrefix(const uint8_t *testPrefix
, const uint8_t *base
, const uint8_t *baseLimit
) {
148 const uint8_t *testPrefixStart
= testPrefix
;
149 while (*testPrefix
!= 0 && base
< baseLimit
&& *testPrefix
== *base
) {
153 return (*testPrefix
== 0)? (int32_t)(testPrefix
-testPrefixStart
): 0;
157 IteratedChar::IteratedChar() :
158 charValue(0), index(-1), nextIndex(0), error(FALSE
), done(FALSE
)
160 // nothing else to do.
163 /*void IteratedChar::reset()
172 int32_t IteratedChar::nextByte(InputText
*det
)
174 if (nextIndex
>= det
->fRawLength
) {
180 return det
->fRawInput
[nextIndex
++];
183 CharsetRecog_mbcs::~CharsetRecog_mbcs()
188 #if U_PLATFORM_IS_DARWIN_BASED
189 int32_t CharsetRecog_mbcs::match_mbcs(InputText
*det
, const uint16_t commonChars
[], int32_t commonCharsLen
, const uint8_t (*keyStrings
)[MAX_KEY_STRING_WITH_NULL
] ) {
191 int32_t CharsetRecog_mbcs::match_mbcs(InputText
*det
, const uint16_t commonChars
[], int32_t commonCharsLen
) {
193 int32_t singleByteCharCount
= 0;
194 int32_t doubleByteCharCount
= 0;
195 int32_t commonCharCount
= 0;
196 int32_t badCharCount
= 0;
197 int32_t totalCharCount
= 0;
198 int32_t confidence
= 0;
199 #if U_PLATFORM_IS_DARWIN_BASED
200 int32_t confidenceFromKeys
= 0;
204 while (nextChar(&iter
, det
)) {
210 if (iter
.charValue
<= 0xFF) {
211 singleByteCharCount
++;
213 doubleByteCharCount
++;
215 if (commonChars
!= 0) {
216 if (binarySearch(commonChars
, commonCharsLen
, iter
.charValue
) >= 0){
217 commonCharCount
+= 1;
220 #if U_PLATFORM_IS_DARWIN_BASED
221 if (doubleByteCharCount
<= 20) {
223 for ( keyIndex
= 0; keyStrings
[keyIndex
][0] != 0; keyIndex
++ ) {
224 int32_t prefixLen
= isPrefix(keyStrings
[keyIndex
], &det
->fRawInput
[iter
.index
], &det
->fRawInput
[det
->fRawLength
]);
225 confidenceFromKeys
+= prefixLen
*5;
233 if (badCharCount
>= 2 && badCharCount
*5 >= doubleByteCharCount
) {
234 // Bail out early if the byte data is not matching the encoding scheme.
235 // break detectBlock;
240 if (doubleByteCharCount
<= 10 && badCharCount
== 0) {
241 // Not many multi-byte chars.
242 if (doubleByteCharCount
== 0 && totalCharCount
< 10) {
243 // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
244 // We don't have enough data to have any confidence.
245 // Statistical analysis of single byte non-ASCII characters would probably help here.
249 // ASCII or ISO file? It's probably not our encoding,
250 // but is not incompatible with our encoding, so don't give it a zero.
251 #if U_PLATFORM_IS_DARWIN_BASED
252 if (confidenceFromKeys
> 90) {
253 confidenceFromKeys
= 90;
254 } else if (confidenceFromKeys
> 0 && confidenceFromKeys
< 70) {
255 confidenceFromKeys
+= 20;
257 confidence
= 10 + confidenceFromKeys
;
267 // No match if there are too many characters that don't fit the encoding scheme.
268 // (should we have zero tolerance for these?)
270 if (doubleByteCharCount
< 20*badCharCount
) {
276 if (commonChars
== 0) {
277 // We have no statistics on frequently occurring characters.
278 // Assess confidence purely on having a reasonable number of
279 // multi-byte characters (the more the better)
280 confidence
= 30 + doubleByteCharCount
- 20*badCharCount
;
281 #if U_PLATFORM_IS_DARWIN_BASED
282 confidence
+= confidenceFromKeys
;
285 if (confidence
> 100) {
290 // Frequency of occurrence statistics exist.
293 double maxVal
= log((double)doubleByteCharCount
/ 4); /*(float)?*/
294 double scaleFactor
= 90.0 / maxVal
;
295 confidence
= (int32_t)(log((double)commonCharCount
+1) * scaleFactor
+ 10.0);
296 #if U_PLATFORM_IS_DARWIN_BASED
297 confidence
+= confidenceFromKeys
;
300 confidence
= min(confidence
, 100);
303 if (confidence
< 0) {
310 CharsetRecog_sjis::~CharsetRecog_sjis()
315 UBool
CharsetRecog_sjis::nextChar(IteratedChar
* it
, InputText
* det
) {
316 it
->index
= it
->nextIndex
;
319 int32_t firstByte
= it
->charValue
= it
->nextByte(det
);
325 if (firstByte
<= 0x7F || (firstByte
> 0xA0 && firstByte
<= 0xDF)) {
329 int32_t secondByte
= it
->nextByte(det
);
330 if (secondByte
>= 0) {
331 it
->charValue
= (firstByte
<< 8) | secondByte
;
333 // else we'll handle the error later.
335 if (! ((secondByte
>= 0x40 && secondByte
<= 0x7F) || (secondByte
>= 0x80 && secondByte
<= 0xFE))) {
336 // Illegal second byte value.
343 int32_t CharsetRecog_sjis::match(InputText
* det
)
345 #if U_PLATFORM_IS_DARWIN_BASED
346 return match_mbcs(det
, commonChars_sjis
, ARRAY_SIZE(commonChars_sjis
), keyStrings_sjis
);
348 return match_mbcs(det
, commonChars_sjis
, ARRAY_SIZE(commonChars_sjis
));
352 const char *CharsetRecog_sjis::getName() const
357 const char *CharsetRecog_sjis::getLanguage() const
362 CharsetRecog_euc::~CharsetRecog_euc()
367 UBool
CharsetRecog_euc::nextChar(IteratedChar
* it
, InputText
* det
) {
368 int32_t firstByte
= 0;
369 int32_t secondByte
= 0;
370 int32_t thirdByte
= 0;
372 it
->index
= it
->nextIndex
;
374 firstByte
= it
->charValue
= it
->nextByte(det
);
377 // Ran off the end of the input data
381 if (firstByte
<= 0x8D) {
386 secondByte
= it
->nextByte(det
);
387 if (secondByte
>= 0) {
388 it
->charValue
= (it
->charValue
<< 8) | secondByte
;
390 // else we'll handle the error later.
392 if (firstByte
>= 0xA1 && firstByte
<= 0xFE) {
394 if (secondByte
< 0xA1) {
401 if (firstByte
== 0x8E) {
403 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
404 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
405 // We don't know which we've got.
406 // Treat it like EUC-JP. If the data really was EUC-TW, the following two
407 // bytes will look like a well formed 2 byte char.
408 if (secondByte
< 0xA1) {
415 if (firstByte
== 0x8F) {
417 // Three byte total char size, two bytes of actual char value.
418 thirdByte
= it
->nextByte(det
);
419 it
->charValue
= (it
->charValue
<< 8) | thirdByte
;
421 if (thirdByte
< 0xa1) {
422 // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
431 CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
436 const char *CharsetRecog_euc_jp::getName() const
441 const char *CharsetRecog_euc_jp::getLanguage() const
446 int32_t CharsetRecog_euc_jp::match(InputText
*det
)
448 #if U_PLATFORM_IS_DARWIN_BASED
449 return match_mbcs(det
, commonChars_euc_jp
, ARRAY_SIZE(commonChars_euc_jp
), keyStrings_euc_jp
);
451 return match_mbcs(det
, commonChars_euc_jp
, ARRAY_SIZE(commonChars_euc_jp
));
455 CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
460 const char *CharsetRecog_euc_kr::getName() const
465 const char *CharsetRecog_euc_kr::getLanguage() const
470 int32_t CharsetRecog_euc_kr::match(InputText
*det
)
472 #if U_PLATFORM_IS_DARWIN_BASED
473 return match_mbcs(det
, commonChars_euc_kr
, ARRAY_SIZE(commonChars_euc_kr
), keyStrings_euc_kr
);
475 return match_mbcs(det
, commonChars_euc_kr
, ARRAY_SIZE(commonChars_euc_kr
));
479 CharsetRecog_big5::~CharsetRecog_big5()
484 UBool
CharsetRecog_big5::nextChar(IteratedChar
* it
, InputText
* det
)
488 it
->index
= it
->nextIndex
;
490 firstByte
= it
->charValue
= it
->nextByte(det
);
496 if (firstByte
<= 0x7F || firstByte
== 0xFF) {
497 // single byte character.
501 int32_t secondByte
= it
->nextByte(det
);
502 if (secondByte
>= 0) {
503 it
->charValue
= (it
->charValue
<< 8) | secondByte
;
505 // else we'll handle the error later.
507 if (secondByte
< 0x40 || secondByte
== 0x7F || secondByte
== 0xFF) {
514 const char *CharsetRecog_big5::getName() const
519 const char *CharsetRecog_big5::getLanguage() const
524 int32_t CharsetRecog_big5::match(InputText
*det
)
526 #if U_PLATFORM_IS_DARWIN_BASED
527 return match_mbcs(det
, commonChars_big5
, ARRAY_SIZE(commonChars_big5
), keyStrings_big5
);
529 return match_mbcs(det
, commonChars_big5
, ARRAY_SIZE(commonChars_big5
));
533 CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
// Advances `it` to the next character of the input held by `det`.
// Bytes are pulled one at a time with it->nextByte(det); the visible branches
// accumulate either one, two or four bytes into it->charValue.
538 UBool
CharsetRecog_gb_18030::nextChar(IteratedChar
* it
, InputText
* det
) {
539 int32_t firstByte
= 0;
540 int32_t secondByte
= 0;
541 int32_t thirdByte
= 0;
542 int32_t fourthByte
= 0;
// Remember where this character starts before consuming any bytes.
544 it
->index
= it
->nextIndex
;
546 firstByte
= it
->charValue
= it
->nextByte(det
);
549 // Ran off the end of the input data
553 if (firstByte
<= 0x80) {
558 secondByte
= it
->nextByte(det
);
559 if (secondByte
>= 0) {
560 it
->charValue
= (it
->charValue
<< 8) | secondByte
;
562 // else we'll handle the error later.
564 if (firstByte
>= 0x81 && firstByte
<= 0xFE) {
// NOTE(review): the lower bound `80` below is a decimal literal (== 0x50),
// while every other bound in this function is hexadecimal. As written the
// second range is 0x50..0xFE, which overlaps 0x40..0x7E and also admits 0x7F.
// `0x80` was presumably intended -- confirm against the GB 18030 trail-byte
// ranges before changing.
566 if ((secondByte
>= 0x40 && secondByte
<= 0x7E) || (secondByte
>=80 && secondByte
<= 0xFE)) {
// A second byte in the digit range 0x30..0x39 leads into the four-byte form.
571 if (secondByte
>= 0x30 && secondByte
<= 0x39) {
572 thirdByte
= it
->nextByte(det
);
574 if (thirdByte
>= 0x81 && thirdByte
<= 0xFE) {
575 fourthByte
= it
->nextByte(det
);
577 if (fourthByte
>= 0x30 && fourthByte
<= 0x39) {
// charValue already holds (firstByte << 8) | secondByte; shift it up and
// append the third and fourth bytes to form the full four-byte value.
578 it
->charValue
= (it
->charValue
<< 16) | (thirdByte
<< 8) | fourthByte
;
585 // Something wasn't valid, or we ran out of data (-1).
592 const char *CharsetRecog_gb_18030::getName() const
597 const char *CharsetRecog_gb_18030::getLanguage() const
602 int32_t CharsetRecog_gb_18030::match(InputText
*det
)
604 #if U_PLATFORM_IS_DARWIN_BASED
605 return match_mbcs(det
, commonChars_gb_18030
, ARRAY_SIZE(commonChars_gb_18030
), keyStrings_gb_18030
);
607 return match_mbcs(det
, commonChars_gb_18030
, ARRAY_SIZE(commonChars_gb_18030
));