git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/csrmbcs.cpp
2 **********************************************************************
3 * Copyright (C) 2005-2016, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
8 #include "unicode/utypes.h"
20 #define min(x,y) (((x)<(y))?(x):(y))
// 16-bit character values counted as "common" when scoring input; this table
// is passed to match_mbcs() together with UPRV_LENGTHOF(commonChars_sjis).
// Entries are listed in ascending order, and match_mbcs() looks candidates up
// with binarySearch().
22 static const uint16_t commonChars_sjis
[] = {
23 // TODO: This set of data comes from the character frequency-
24 // of-occurrence analysis tool. The data needs to be moved
25 // into a resource and loaded from there.
26 0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
27 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
28 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
29 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
30 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
31 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
// 16-bit character values counted as "common" when scoring input; this table
// is passed to match_mbcs() together with UPRV_LENGTHOF(commonChars_euc_jp).
// Entries are listed in ascending order, and match_mbcs() looks candidates up
// with binarySearch().
33 static const uint16_t commonChars_euc_jp
[] = {
34 // TODO: This set of data comes from the character frequency-
35 // of-occurrence analysis tool. The data needs to be moved
36 // into a resource and loaded from there.
37 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
38 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
39 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
40 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
41 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
42 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
43 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
44 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
45 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
46 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
// 16-bit character values counted as "common" when scoring input; this table
// is passed to match_mbcs() together with UPRV_LENGTHOF(commonChars_euc_kr).
// Entries are listed in ascending order, and match_mbcs() looks candidates up
// with binarySearch().
48 static const uint16_t commonChars_euc_kr
[] = {
49 // TODO: This set of data comes from the character frequency-
50 // of-occurrence analysis tool. The data needs to be moved
51 // into a resource and loaded from there.
52 0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
53 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
54 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
55 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
56 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
57 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
58 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
59 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
60 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
61 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
// 16-bit character values counted as "common" when scoring input; this table
// is passed to match_mbcs() together with UPRV_LENGTHOF(commonChars_big5).
// Entries are listed in ascending order, and match_mbcs() looks candidates up
// with binarySearch().
63 static const uint16_t commonChars_big5
[] = {
64 // TODO: This set of data comes from the character frequency-
65 // of-occurrence analysis tool. The data needs to be moved
66 // into a resource and loaded from there.
67 0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
68 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
69 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
70 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
71 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
72 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
73 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
74 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
75 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
76 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
// 16-bit character values counted as "common" when scoring input; this table
// is passed to match_mbcs() together with UPRV_LENGTHOF(commonChars_gb_18030).
// Entries are listed in ascending order, and match_mbcs() looks candidates up
// with binarySearch().
78 static const uint16_t commonChars_gb_18030
[] = {
79 // TODO: This set of data comes from the character frequency-
80 // of-occurrence analysis tool. The data needs to be moved
81 // into a resource and loaded from there.
82 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
83 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
84 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
85 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
86 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
87 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
88 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
89 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
90 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
91 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
94 static const uint8_t keyStrings_sjis
] = {
95 {0x82,0xa9,0x82,0xe7,0x91,0x97,0x90,0x4d,0}, // Signatures - Sent from my ...
96 {0x93,0x5d,0x91,0x97,0x83,0x81,0x83,0x62,0x83,0x5a,0x81,0x5b,0x83,0x57,0}, // forward
99 static const uint8_t keyStrings_euc_jp
] = {
100 {0xa4,0xab,0xa4,0xe9,0xc1,0xf7,0xbf,0xae,0}, // Signatures - Sent from my ...
101 {0xc5,0xbe,0xc1,0xf7,0xa5,0xe1,0xa5,0xc3,0xa5,0xbb,0xa1,0xbc,0xa5,0xb8,0}, // forward
104 static const uint8_t keyStrings_euc_kr
] = {
105 {0xb3,0xaa,0xc0,0xc7,0}, // Signatures - Sent from my ... #1
106 {0xbf,0xa1,0xbc,0xad,0x20,0xba,0xb8,0xb3,0xbf,0}, // Signatures - Sent from my ... #2
107 {0xc0,0xfc,0xb4,0xde,0xb5,0xc8,0x20,0xb8,0xde,0xbd,0xc3,0xc1,0xf6,0}, // forward
110 static const uint8_t keyStrings_big5
] = {
111 {0xb1,0x71,0xa7,0xda,0xaa,0xba,0}, // Signatures - Sent from my ... #1
112 {0xb6,0xc7,0xb0,0x65,0}, // Signatures - Sent from my ... #2
113 {0xb6,0x7d,0xa9,0x6c,0xc2,0xe0,0xb1,0x48,0xb6,0x6c,0xa5,0xf3,0}, // forward
116 static const uint8_t keyStrings_gb_18030
] = {
117 {0xb7,0xa2,0xd7,0xd4,0xce,0xd2,0xb5,0xc4,0}, // Signatures - Sent from my iP...
118 {0xd7,0xaa,0xb7,0xa2,0xb5,0xc4,0xd3,0xca,0xbc,0xfe,0}, // forward
123 static int32_t binarySearch(const uint16_t *array
, int32_t len
, uint16_t value
125 int32_t start
= 0, end
= len
126 int32_t mid
= (start
128 while(start
<= end
) {
129 if(array
] == value
) {
133 if(array
] < value
146 // If testPrefix is a prefix of base, return its length, else return 0
147 static int32_t isPrefix(const uint8_t *testPrefix
, const uint8_t *base
, const uint8_t *baseLimit
) {
148 const uint8_t *testPrefixStart
= testPrefix
149 while (*testPrefix
!= 0 && base
< baseLimit
&& *testPrefix
== *base
) {
153 return (*testPrefix
== 0)? (int32_t)(testPrefix
): 0;
157 IteratedChar::IteratedChar() :
158 charValue(0), index(-1), nextIndex(0), error(FALSE
), done(FALSE
160 // nothing else to do.
163 /*void IteratedChar::reset()
172 int32_t IteratedChar::nextByte(InputText
174 if (nextIndex
>= det
) {
180 return det
183 CharsetRecog_mbcs::~CharsetRecog_mbcs()
189 int32_t CharsetRecog_mbcs::match_mbcs(InputText
, const uint16_t commonChars
[], int32_t commonCharsLen
, const uint8_t (*keyStrings
] ) const {
191 int32_t CharsetRecog_mbcs::match_mbcs(InputText
, const uint16_t commonChars
[], int32_t commonCharsLen
) const {
193 int32_t singleByteCharCount
= 0;
194 int32_t doubleByteCharCount
= 0;
195 int32_t commonCharCount
= 0;
196 int32_t badCharCount
= 0;
197 int32_t totalCharCount
= 0;
198 int32_t confidence
= 0;
200 int32_t confidenceFromKeys
= 0;
204 while (nextChar(&iter
, det
)) {
210 if (iter
<= 0xFF) {
211 singleByteCharCount
213 doubleByteCharCount
215 if (commonChars
!= 0) {
216 if (binarySearch(commonChars
, commonCharsLen
, iter
) >= 0){
217 commonCharCount
+= 1;
221 if (doubleByteCharCount
<= 20) {
223 for ( keyIndex
= 0; keyStrings
][0] != 0; keyIndex
++ ) {
224 int32_t prefixLen
= isPrefix(keyStrings
], &det
], &det
225 confidenceFromKeys
+= prefixLen
233 if (badCharCount
>= 2 && badCharCount
*5 >= doubleByteCharCount
) {
234 // Bail out early if the byte data is not matching the encoding scheme.
235 // break detectBlock;
240 if (doubleByteCharCount
<= 10 && badCharCount
== 0) {
241 // Not many multi-byte chars.
242 if (doubleByteCharCount
== 0 && totalCharCount
< 10) {
243 // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
244 // We don't have enough data to have any confidence.
245 // Statistical analysis of single byte non-ASCII characters would probably help here.
249 // ASCII or ISO file? It's probably not our encoding,
250 // but is not incompatible with our encoding, so don't give it a zero.
252 if (confidenceFromKeys
> 90) {
253 confidenceFromKeys
= 90;
254 } else if (confidenceFromKeys
> 0 && confidenceFromKeys
< 70) {
255 confidenceFromKeys
+= 20;
257 confidence
= 10 + confidenceFromKeys
267 // No match if there are too many characters that don't fit the encoding scheme.
268 // (should we have zero tolerance for these?)
270 if (doubleByteCharCount
< 20*badCharCount
) {
276 if (commonChars
== 0) {
277 // We have no statistics on frequently occurring characters.
278 // Assess confidence purely on having a reasonable number of
279 // multi-byte characters (the more the better)
280 confidence
= 30 + doubleByteCharCount
- 20*badCharCount
282 confidence
+= confidenceFromKeys
285 if (confidence
> 100) {
290 // Frequency of occurrence statistics exist.
293 double maxVal
= log((double)doubleByteCharCount
/ 4); /*(float)?*/
294 double scaleFactor
= 90.0 / maxVal
295 confidence
= (int32_t)(log((double)commonCharCount
+1) * scaleFactor
+ 10.0);
297 confidence
+= confidenceFromKeys
300 confidence
= min(confidence
, 100);
303 if (confidence
< 0) {
310 CharsetRecog_sjis::~CharsetRecog_sjis()
315 UBool
* it
, InputText
* det
) const {
316 it
= it
319 int32_t firstByte
= it
= it
325 if (firstByte
<= 0x7F || (firstByte
> 0xA0 && firstByte
<= 0xDF)) {
329 int32_t secondByte
= it
330 if (secondByte
>= 0) {
331 it
= (firstByte
<< 8) | secondByte
333 // else we'll handle the error later.
335 if (! ((secondByte
>= 0x40 && secondByte
<= 0x7F) || (secondByte
>= 0x80 && secondByte
<= 0xFE))) {
336 // Illegal second byte value.
343 UBool
* det
, CharsetMatch
) const {
345 int32_t confidence
= match_mbcs(det
, commonChars_sjis
, UPRV_LENGTHOF(commonChars_sjis
), keyStrings_sjis
347 int32_t confidence
= match_mbcs(det
, commonChars_sjis
, UPRV_LENGTHOF(commonChars_sjis
349 results
, this, confidence
350 return (confidence
> 0);
353 const char *CharsetRecog_sjis::getName() const
358 const char *CharsetRecog_sjis::getLanguage() const
363 CharsetRecog_euc::~CharsetRecog_euc()
368 UBool
* it
, InputText
* det
) const {
369 int32_t firstByte
= 0;
370 int32_t secondByte
= 0;
371 int32_t thirdByte
= 0;
373 it
= it
375 firstByte
= it
= it
378 // Ran off the end of the input data
382 if (firstByte
<= 0x8D) {
387 secondByte
= it
388 if (secondByte
>= 0) {
389 it
= (it
<< 8) | secondByte
391 // else we'll handle the error later.
393 if (firstByte
>= 0xA1 && firstByte
<= 0xFE) {
395 if (secondByte
< 0xA1) {
402 if (firstByte
== 0x8E) {
404 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
405 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
406 // We don't know which we've got.
407 // Treat it like EUC-JP. If the data really was EUC-TW, the following two
408 // bytes will look like a well formed 2 byte char.
409 if (secondByte
< 0xA1) {
416 if (firstByte
== 0x8F) {
418 // Three byte total char size, two bytes of actual char value.
419 thirdByte
= it
420 it
= (it
<< 8) | thirdByte
422 if (thirdByte
< 0xa1) {
423 // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
432 CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
437 const char *CharsetRecog_euc_jp::getName() const
442 const char *CharsetRecog_euc_jp::getLanguage() const
447 UBool
, CharsetMatch
) const
450 int32_t confidence
= match_mbcs(det
, commonChars_euc_jp
, UPRV_LENGTHOF(commonChars_euc_jp
), keyStrings_euc_jp
452 int32_t confidence
= match_mbcs(det
, commonChars_euc_jp
, UPRV_LENGTHOF(commonChars_euc_jp
454 results
, this, confidence
455 return (confidence
> 0);
458 CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
463 const char *CharsetRecog_euc_kr::getName() const
468 const char *CharsetRecog_euc_kr::getLanguage() const
473 UBool
, CharsetMatch
) const
476 int32_t confidence
= match_mbcs(det
, commonChars_euc_kr
, UPRV_LENGTHOF(commonChars_euc_kr
), keyStrings_euc_kr
478 int32_t confidence
= match_mbcs(det
, commonChars_euc_kr
, UPRV_LENGTHOF(commonChars_euc_kr
480 results
, this, confidence
481 return (confidence
> 0);
484 CharsetRecog_big5::~CharsetRecog_big5()
489 UBool
* it
, InputText
* det
) const
493 it
= it
495 firstByte
= it
= it
501 if (firstByte
<= 0x7F || firstByte
== 0xFF) {
502 // single byte character.
506 int32_t secondByte
= it
507 if (secondByte
>= 0) {
508 it
= (it
<< 8) | secondByte
510 // else we'll handle the error later.
512 if (secondByte
< 0x40 || secondByte
== 0x7F || secondByte
== 0xFF) {
519 const char *CharsetRecog_big5::getName() const
524 const char *CharsetRecog_big5::getLanguage() const
529 UBool
, CharsetMatch
) const
532 int32_t confidence
= match_mbcs(det
, commonChars_big5
, UPRV_LENGTHOF(commonChars_big5
), keyStrings_big5
534 int32_t confidence
= match_mbcs(det
, commonChars_big5
, UPRV_LENGTHOF(commonChars_big5
536 results
, this, confidence
537 return (confidence
> 0);
540 CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
// NOTE(review): this listing is an incomplete extraction -- the function name
// and a number of statements are missing from it. Presumably this is the
// gb_18030 recognizer's character iterator (it follows that class's
// destructor); confirm against the full file. The visible logic reads up to
// four bytes and classifies the sequence by the value ranges of each byte.
545 UBool
* it
, InputText
* det
) const {
546 int32_t firstByte
= 0;
547 int32_t secondByte
= 0;
548 int32_t thirdByte
= 0;
549 int32_t fourthByte
= 0;
551 it
= it
553 firstByte
= it
= it
556 // Ran off the end of the input data
560 if (firstByte
<= 0x80) {
565 secondByte
= it
566 if (secondByte
>= 0) {
567 it
= (it
<< 8) | secondByte
569 // else we'll handle the error later.
571 if (firstByte
>= 0x81 && firstByte
<= 0xFE) {
// NOTE(review): the lower bound of the second range below is written as
// decimal 80 (= 0x50), while every other bound in this function is
// hexadecimal. As written, 80..0xFE also admits 0x7F, which the first range
// (0x40..0x7E) deliberately stops short of. Presumably 0x80 was intended --
// confirm against the GB 18030 two-byte trail-byte ranges before changing.
573 if ((secondByte
>= 0x40 && secondByte
<= 0x7E) || (secondByte
>=80 && secondByte
<= 0xFE)) {
578 if (secondByte
>= 0x30 && secondByte
<= 0x39) {
579 thirdByte
= it
581 if (thirdByte
>= 0x81 && thirdByte
<= 0xFE) {
582 fourthByte
= it
584 if (fourthByte
>= 0x30 && fourthByte
<= 0x39) {
585 it
= (it
<< 16) | (thirdByte
<< 8) | fourthByte
592 // Something wasn't valid, or we ran out of data (-1).
599 const char *CharsetRecog_gb_18030::getName() const
604 const char *CharsetRecog_gb_18030::getLanguage() const
609 UBool
, CharsetMatch
) const
612 int32_t confidence
= match_mbcs(det
, commonChars_gb_18030
, UPRV_LENGTHOF(commonChars_gb_18030
), keyStrings_gb_18030
614 int32_t confidence
= match_mbcs(det
, commonChars_gb_18030
, UPRV_LENGTHOF(commonChars_gb_18030
616 results
, this, confidence
617 return (confidence
> 0);