]>
git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/csrmbcs.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 2005-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
10 #include "unicode/utypes.h"
12 #if !UCONFIG_NO_CONVERSION
22 #define min(x,y) (((x)<(y))?(x):(y))
// Double-byte code units treated as "common" by CharsetRecog_sjis::match().
// The table is in ascending order because match_mbcs() looks values up with
// binarySearch().
//
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
static const uint16_t commonChars_sjis[] = {
    0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
    0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
    0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
    0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
    0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
    0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
// Double-byte code units treated as "common" by CharsetRecog_euc_jp::match().
// The table is in ascending order because match_mbcs() looks values up with
// binarySearch().
//
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
static const uint16_t commonChars_euc_jp[] = {
    0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
    0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
    0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
    0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
    0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
    0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
    0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
    0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
    0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
    0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
// Double-byte code units treated as "common" by CharsetRecog_euc_kr::match().
// The table is in ascending order because match_mbcs() looks values up with
// binarySearch().
//
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
static const uint16_t commonChars_euc_kr[] = {
    0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
    0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
    0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
    0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
    0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
    0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
    0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
    0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
    0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
    0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
// Double-byte code units treated as "common" by CharsetRecog_big5::match().
// The table is in ascending order because match_mbcs() looks values up with
// binarySearch().
//
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
static const uint16_t commonChars_big5[] = {
    0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
    0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
    0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
    0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
    0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
    0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
    0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
    0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
    0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
    0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
// Double-byte code units treated as "common" by CharsetRecog_gb_18030::match().
// The table is in ascending order because match_mbcs() looks values up with
// binarySearch().
//
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
static const uint16_t commonChars_gb_18030[] = {
    0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
    0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
    0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
    0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
    0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
    0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
    0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
    0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
    0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
    0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
95 #if U_PLATFORM_IS_DARWIN_BASED
96 static const uint8_t keyStrings_sjis
[][MAX_KEY_STRING_WITH_NULL
] = {
97 {0x82,0xa9,0x82,0xe7,0x91,0x97,0x90,0x4d,0}, // Signatures - Sent from my ...
98 {0x93,0x5d,0x91,0x97,0x83,0x81,0x83,0x62,0x83,0x5a,0x81,0x5b,0x83,0x57,0}, // forward
101 static const uint8_t keyStrings_euc_jp
[][MAX_KEY_STRING_WITH_NULL
] = {
102 {0xa4,0xab,0xa4,0xe9,0xc1,0xf7,0xbf,0xae,0}, // Signatures - Sent from my ...
103 {0xc5,0xbe,0xc1,0xf7,0xa5,0xe1,0xa5,0xc3,0xa5,0xbb,0xa1,0xbc,0xa5,0xb8,0}, // forward
106 static const uint8_t keyStrings_euc_kr
[][MAX_KEY_STRING_WITH_NULL
] = {
107 {0xb3,0xaa,0xc0,0xc7,0}, // Signatures - Sent from my ... #1
108 {0xbf,0xa1,0xbc,0xad,0x20,0xba,0xb8,0xb3,0xbf,0}, // Signatures - Sent from my ... #2
109 {0xc0,0xfc,0xb4,0xde,0xb5,0xc8,0x20,0xb8,0xde,0xbd,0xc3,0xc1,0xf6,0}, // forward
112 static const uint8_t keyStrings_big5
[][MAX_KEY_STRING_WITH_NULL
] = {
113 {0xb1,0x71,0xa7,0xda,0xaa,0xba,0}, // Signatures - Sent from my ... #1
114 {0xb6,0xc7,0xb0,0x65,0}, // Signatures - Sent from my ... #2
115 {0xb6,0x7d,0xa9,0x6c,0xc2,0xe0,0xb1,0x48,0xb6,0x6c,0xa5,0xf3,0}, // forward
118 static const uint8_t keyStrings_gb_18030
[][MAX_KEY_STRING_WITH_NULL
] = {
119 {0xb7,0xa2,0xd7,0xd4,0xce,0xd2,0xb5,0xc4,0}, // Signatures - Sent from my iP...
120 {0xd7,0xaa,0xb7,0xa2,0xb5,0xc4,0xd3,0xca,0xbc,0xfe,0}, // forward
/**
 * Binary search for a 16-bit value in a table sorted in ascending order.
 *
 * @param array  table to search; must be sorted ascending
 * @param len    number of entries in array (0 is allowed)
 * @param value  value to look for
 * @return the index of value in array, or a negative number if it is absent
 *         (callers test the result with ">= 0")
 *
 * NOTE(review): the tail of this function (range narrowing and the not-found
 * return) was missing from the extracted text and is restored here as a
 * standard binary search returning -1 - confirm against upstream.
 */
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t start = 0, end = len-1;

    while(start <= end) {
        // start + (end-start)/2 cannot overflow, unlike (start+end)/2.
        int32_t mid = start + (end-start)/2;

        if(array[mid] == value) {
            return mid;
        }

        if(array[mid] < value){
            start = mid+1;
        } else {
            end = mid-1;
        }
    }

    return -1;
}
147 #if U_PLATFORM_IS_DARWIN_BASED
// If testPrefix is a prefix of base, return its length, else return 0.
//
//   testPrefix  NUL-terminated byte string to look for
//   base        first byte of the text to compare against
//   baseLimit   one past the last readable byte of the text; base is never
//               read at or beyond this address
//
// An empty testPrefix yields 0, the same result as "no match".
// NOTE(review): the loop body (advancing both pointers) was missing from the
// extracted text and is restored from the loop condition - confirm against
// upstream.
static int32_t isPrefix(const uint8_t *testPrefix, const uint8_t *base, const uint8_t *baseLimit) {
    const uint8_t *testPrefixStart = testPrefix;
    while (*testPrefix != 0 && base < baseLimit && *testPrefix == *base) {
        testPrefix++;
        base++;
    }
    // Reaching the terminating NUL means every byte of testPrefix matched.
    return (*testPrefix == 0)? (int32_t)(testPrefix-testPrefixStart): 0;
}
159 IteratedChar::IteratedChar() :
160 charValue(0), index(-1), nextIndex(0), error(FALSE
), done(FALSE
)
162 // nothing else to do.
165 /*void IteratedChar::reset()
174 int32_t IteratedChar::nextByte(InputText
*det
)
176 if (nextIndex
>= det
->fRawLength
) {
182 return det
->fRawInput
[nextIndex
++];
185 CharsetRecog_mbcs::~CharsetRecog_mbcs()
190 #if U_PLATFORM_IS_DARWIN_BASED
191 int32_t CharsetRecog_mbcs::match_mbcs(InputText
*det
, const uint16_t commonChars
[], int32_t commonCharsLen
, const uint8_t (*keyStrings
)[MAX_KEY_STRING_WITH_NULL
] ) const {
193 int32_t CharsetRecog_mbcs::match_mbcs(InputText
*det
, const uint16_t commonChars
[], int32_t commonCharsLen
) const {
195 int32_t singleByteCharCount
= 0;
196 int32_t doubleByteCharCount
= 0;
197 int32_t commonCharCount
= 0;
198 int32_t badCharCount
= 0;
199 int32_t totalCharCount
= 0;
200 int32_t confidence
= 0;
201 #if U_PLATFORM_IS_DARWIN_BASED
202 int32_t confidenceFromKeys
= 0;
206 while (nextChar(&iter
, det
)) {
212 if (iter
.charValue
<= 0xFF) {
213 singleByteCharCount
++;
215 doubleByteCharCount
++;
217 if (commonChars
!= 0) {
218 if (binarySearch(commonChars
, commonCharsLen
, iter
.charValue
) >= 0){
219 commonCharCount
+= 1;
222 #if U_PLATFORM_IS_DARWIN_BASED
223 if (doubleByteCharCount
<= 20) {
225 for ( keyIndex
= 0; keyStrings
[keyIndex
][0] != 0; keyIndex
++ ) {
226 int32_t prefixLen
= isPrefix(keyStrings
[keyIndex
], &det
->fRawInput
[iter
.index
], &det
->fRawInput
[det
->fRawLength
]);
227 confidenceFromKeys
+= prefixLen
*5;
235 if (badCharCount
>= 2 && badCharCount
*5 >= doubleByteCharCount
) {
236 // Bail out early if the byte data is not matching the encoding scheme.
237 // break detectBlock;
242 if (doubleByteCharCount
<= 10 && badCharCount
== 0) {
243 // Not many multi-byte chars.
244 if (doubleByteCharCount
== 0 && totalCharCount
< 10) {
245 // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
246 // We don't have enough data to have any confidence.
247 // Statistical analysis of single byte non-ASCII charcters would probably help here.
251 // ASCII or ISO file? It's probably not our encoding,
252 // but is not incompatible with our encoding, so don't give it a zero.
253 #if U_PLATFORM_IS_DARWIN_BASED
254 if (confidenceFromKeys
> 90) {
255 confidenceFromKeys
= 90;
256 } else if (confidenceFromKeys
> 0 && confidenceFromKeys
< 70) {
257 confidenceFromKeys
+= 20;
259 confidence
= 10 + confidenceFromKeys
;
269 // No match if there are too many characters that don't fit the encoding scheme.
270 // (should we have zero tolerance for these?)
272 if (doubleByteCharCount
< 20*badCharCount
) {
278 if (commonChars
== 0) {
279 // We have no statistics on frequently occuring characters.
280 // Assess confidence purely on having a reasonable number of
281 // multi-byte characters (the more the better)
282 confidence
= 30 + doubleByteCharCount
- 20*badCharCount
;
283 #if U_PLATFORM_IS_DARWIN_BASED
284 confidence
+= confidenceFromKeys
;
287 if (confidence
> 100) {
292 // Frequency of occurence statistics exist.
295 double maxVal
= log((double)doubleByteCharCount
/ 4); /*(float)?*/
296 double scaleFactor
= 90.0 / maxVal
;
297 confidence
= (int32_t)(log((double)commonCharCount
+1) * scaleFactor
+ 10.0);
298 #if U_PLATFORM_IS_DARWIN_BASED
299 confidence
+= confidenceFromKeys
;
302 confidence
= min(confidence
, 100);
305 if (confidence
< 0) {
312 CharsetRecog_sjis::~CharsetRecog_sjis()
317 UBool
CharsetRecog_sjis::nextChar(IteratedChar
* it
, InputText
* det
) const {
318 it
->index
= it
->nextIndex
;
321 int32_t firstByte
= it
->charValue
= it
->nextByte(det
);
327 if (firstByte
<= 0x7F || (firstByte
> 0xA0 && firstByte
<= 0xDF)) {
331 int32_t secondByte
= it
->nextByte(det
);
332 if (secondByte
>= 0) {
333 it
->charValue
= (firstByte
<< 8) | secondByte
;
335 // else we'll handle the error later.
337 if (! ((secondByte
>= 0x40 && secondByte
<= 0x7F) || (secondByte
>= 0x80 && secondByte
<= 0xFE))) {
338 // Illegal second byte value.
345 UBool
CharsetRecog_sjis::match(InputText
* det
, CharsetMatch
*results
) const {
346 #if U_PLATFORM_IS_DARWIN_BASED
347 int32_t confidence
= match_mbcs(det
, commonChars_sjis
, UPRV_LENGTHOF(commonChars_sjis
), keyStrings_sjis
);
349 int32_t confidence
= match_mbcs(det
, commonChars_sjis
, UPRV_LENGTHOF(commonChars_sjis
));
351 results
->set(det
, this, confidence
);
352 return (confidence
> 0);
355 const char *CharsetRecog_sjis::getName() const
360 const char *CharsetRecog_sjis::getLanguage() const
365 CharsetRecog_euc::~CharsetRecog_euc()
370 UBool
CharsetRecog_euc::nextChar(IteratedChar
* it
, InputText
* det
) const {
371 int32_t firstByte
= 0;
372 int32_t secondByte
= 0;
373 int32_t thirdByte
= 0;
375 it
->index
= it
->nextIndex
;
377 firstByte
= it
->charValue
= it
->nextByte(det
);
380 // Ran off the end of the input data
384 if (firstByte
<= 0x8D) {
389 secondByte
= it
->nextByte(det
);
390 if (secondByte
>= 0) {
391 it
->charValue
= (it
->charValue
<< 8) | secondByte
;
393 // else we'll handle the error later.
395 if (firstByte
>= 0xA1 && firstByte
<= 0xFE) {
397 if (secondByte
< 0xA1) {
404 if (firstByte
== 0x8E) {
406 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
407 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
408 // We don't know which we've got.
409 // Treat it like EUC-JP. If the data really was EUC-TW, the following two
410 // bytes will look like a well formed 2 byte char.
411 if (secondByte
< 0xA1) {
418 if (firstByte
== 0x8F) {
420 // Three byte total char size, two bytes of actual char value.
421 thirdByte
= it
->nextByte(det
);
422 it
->charValue
= (it
->charValue
<< 8) | thirdByte
;
424 if (thirdByte
< 0xa1) {
425 // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
434 CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
439 const char *CharsetRecog_euc_jp::getName() const
444 const char *CharsetRecog_euc_jp::getLanguage() const
449 UBool
CharsetRecog_euc_jp::match(InputText
*det
, CharsetMatch
*results
) const
451 #if U_PLATFORM_IS_DARWIN_BASED
452 int32_t confidence
= match_mbcs(det
, commonChars_euc_jp
, UPRV_LENGTHOF(commonChars_euc_jp
), keyStrings_euc_jp
);
454 int32_t confidence
= match_mbcs(det
, commonChars_euc_jp
, UPRV_LENGTHOF(commonChars_euc_jp
));
456 results
->set(det
, this, confidence
);
457 return (confidence
> 0);
460 CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
465 const char *CharsetRecog_euc_kr::getName() const
470 const char *CharsetRecog_euc_kr::getLanguage() const
475 UBool
CharsetRecog_euc_kr::match(InputText
*det
, CharsetMatch
*results
) const
477 #if U_PLATFORM_IS_DARWIN_BASED
478 int32_t confidence
= match_mbcs(det
, commonChars_euc_kr
, UPRV_LENGTHOF(commonChars_euc_kr
), keyStrings_euc_kr
);
480 int32_t confidence
= match_mbcs(det
, commonChars_euc_kr
, UPRV_LENGTHOF(commonChars_euc_kr
));
482 results
->set(det
, this, confidence
);
483 return (confidence
> 0);
486 CharsetRecog_big5::~CharsetRecog_big5()
491 UBool
CharsetRecog_big5::nextChar(IteratedChar
* it
, InputText
* det
) const
495 it
->index
= it
->nextIndex
;
497 firstByte
= it
->charValue
= it
->nextByte(det
);
503 if (firstByte
<= 0x7F || firstByte
== 0xFF) {
504 // single byte character.
508 int32_t secondByte
= it
->nextByte(det
);
509 if (secondByte
>= 0) {
510 it
->charValue
= (it
->charValue
<< 8) | secondByte
;
512 // else we'll handle the error later.
514 if (secondByte
< 0x40 || secondByte
== 0x7F || secondByte
== 0xFF) {
521 const char *CharsetRecog_big5::getName() const
526 const char *CharsetRecog_big5::getLanguage() const
531 UBool
CharsetRecog_big5::match(InputText
*det
, CharsetMatch
*results
) const
533 #if U_PLATFORM_IS_DARWIN_BASED
534 int32_t confidence
= match_mbcs(det
, commonChars_big5
, UPRV_LENGTHOF(commonChars_big5
), keyStrings_big5
);
536 int32_t confidence
= match_mbcs(det
, commonChars_big5
, UPRV_LENGTHOF(commonChars_big5
));
538 results
->set(det
, this, confidence
);
539 return (confidence
> 0);
542 CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
547 UBool
CharsetRecog_gb_18030::nextChar(IteratedChar
* it
, InputText
* det
) const {
548 int32_t firstByte
= 0;
549 int32_t secondByte
= 0;
550 int32_t thirdByte
= 0;
551 int32_t fourthByte
= 0;
553 it
->index
= it
->nextIndex
;
555 firstByte
= it
->charValue
= it
->nextByte(det
);
558 // Ran off the end of the input data
562 if (firstByte
<= 0x80) {
567 secondByte
= it
->nextByte(det
);
568 if (secondByte
>= 0) {
569 it
->charValue
= (it
->charValue
<< 8) | secondByte
;
571 // else we'll handle the error later.
573 if (firstByte
>= 0x81 && firstByte
<= 0xFE) {
575 if ((secondByte
>= 0x40 && secondByte
<= 0x7E) || (secondByte
>=80 && secondByte
<= 0xFE)) {
580 if (secondByte
>= 0x30 && secondByte
<= 0x39) {
581 thirdByte
= it
->nextByte(det
);
583 if (thirdByte
>= 0x81 && thirdByte
<= 0xFE) {
584 fourthByte
= it
->nextByte(det
);
586 if (fourthByte
>= 0x30 && fourthByte
<= 0x39) {
587 it
->charValue
= (it
->charValue
<< 16) | (thirdByte
<< 8) | fourthByte
;
594 // Something wasn't valid, or we ran out of data (-1).
601 const char *CharsetRecog_gb_18030::getName() const
606 const char *CharsetRecog_gb_18030::getLanguage() const
611 UBool
CharsetRecog_gb_18030::match(InputText
*det
, CharsetMatch
*results
) const
613 #if U_PLATFORM_IS_DARWIN_BASED
614 int32_t confidence
= match_mbcs(det
, commonChars_gb_18030
, UPRV_LENGTHOF(commonChars_gb_18030
), keyStrings_gb_18030
);
616 int32_t confidence
= match_mbcs(det
, commonChars_gb_18030
, UPRV_LENGTHOF(commonChars_gb_18030
));
618 results
->set(det
, this, confidence
);
619 return (confidence
> 0);