]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/csrmbcs.cpp
ICU-62141.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / csrmbcs.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
73c04bcf
A
3/*
4 **********************************************************************
2ca993e8 5 * Copyright (C) 2005-2016, International Business Machines
73c04bcf
A
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 */
9
10#include "unicode/utypes.h"
11
12#if !UCONFIG_NO_CONVERSION
13
2ca993e8 14#include "cmemory.h"
51004dcb 15#include "csmatch.h"
73c04bcf
A
16#include "csrmbcs.h"
17
18#include <math.h>
19
20U_NAMESPACE_BEGIN
21
73c04bcf
A
// Yields the smaller of x and y. This is a function-like macro, so the
// selected argument is evaluated twice; do not pass expressions with side effects.
22#define min(x,y) (((x)<(y))?(x):(y))
23
// Two-byte character values passed to match_mbcs() by CharsetRecog_sjis::match().
// The entries are listed in ascending order, which binarySearch() relies on.
46f4442e 24static const uint16_t commonChars_sjis [] = {
73c04bcf
A
25// TODO: This set of data comes from the character frequency-
26// of-occurrence analysis tool. The data needs to be moved
27// into a resource and loaded from there.
280x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
290x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
300x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
310x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
320x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
330x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
34
// Two-byte character values passed to match_mbcs() by CharsetRecog_euc_jp::match().
// The entries are listed in ascending order, which binarySearch() relies on.
46f4442e 35static const uint16_t commonChars_euc_jp[] = {
73c04bcf
A
36// TODO: This set of data comes from the character frequency-
37// of-occurrence analysis tool. The data needs to be moved
38// into a resource and loaded from there.
390xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
400xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
410xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
420xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
430xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
440xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
450xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
460xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
470xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
480xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
49
// Two-byte character values passed to match_mbcs() by CharsetRecog_euc_kr::match().
// The entries are listed in ascending order, which binarySearch() relies on.
46f4442e 50static const uint16_t commonChars_euc_kr[] = {
73c04bcf
A
51// TODO: This set of data comes from the character frequency-
52// of-occurrence analysis tool. The data needs to be moved
53// into a resource and loaded from there.
540xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
550xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
560xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
570xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
580xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
590xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
600xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
610xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
620xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
630xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
64
// Two-byte character values passed to match_mbcs() by CharsetRecog_big5::match().
// The entries are listed in ascending order, which binarySearch() relies on.
46f4442e 65static const uint16_t commonChars_big5[] = {
73c04bcf
A
66// TODO: This set of data comes from the character frequency-
67// of-occurrence analysis tool. The data needs to be moved
68// into a resource and loaded from there.
690xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
700xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
710xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
720xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
730xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
740xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
750xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
760xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
770xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
780xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
79
// Two-byte character values passed to match_mbcs() by CharsetRecog_gb_18030::match().
// The entries are listed in ascending order, which binarySearch() relies on.
46f4442e 80static const uint16_t commonChars_gb_18030[] = {
73c04bcf
A
81// TODO: This set of data comes from the character frequency-
82// of-occurrence analysis tool. The data needs to be moved
83// into a resource and loaded from there.
840xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
850xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
860xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
870xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
880xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
890xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
900xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
910xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
920xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
930xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
94
4388f060
A
95#if U_PLATFORM_IS_DARWIN_BASED
// Key-string tables handed to match_mbcs() on Darwin-based platforms.
// Each row is a NUL-terminated byte string; match_mbcs() stops at the first
// row whose leading byte is 0, so every table ends with a {0} sentinel row.
// NOTE(review): the rows are presumably the phrases named in the trailing
// comments, encoded in the table's charset — confirm against the data source.
96static const uint8_t keyStrings_sjis[][MAX_KEY_STRING_WITH_NULL] = {
97 {0x82,0xa9,0x82,0xe7,0x91,0x97,0x90,0x4d,0}, // Signatures - Sent from my ...
98 {0x93,0x5d,0x91,0x97,0x83,0x81,0x83,0x62,0x83,0x5a,0x81,0x5b,0x83,0x57,0}, // forward
99 {0}
100};
101static const uint8_t keyStrings_euc_jp[][MAX_KEY_STRING_WITH_NULL] = {
102 {0xa4,0xab,0xa4,0xe9,0xc1,0xf7,0xbf,0xae,0}, // Signatures - Sent from my ...
103 {0xc5,0xbe,0xc1,0xf7,0xa5,0xe1,0xa5,0xc3,0xa5,0xbb,0xa1,0xbc,0xa5,0xb8,0}, // forward
104 {0}
105};
106static const uint8_t keyStrings_euc_kr[][MAX_KEY_STRING_WITH_NULL] = {
107 {0xb3,0xaa,0xc0,0xc7,0}, // Signatures - Sent from my ... #1
108 {0xbf,0xa1,0xbc,0xad,0x20,0xba,0xb8,0xb3,0xbf,0}, // Signatures - Sent from my ... #2
109 {0xc0,0xfc,0xb4,0xde,0xb5,0xc8,0x20,0xb8,0xde,0xbd,0xc3,0xc1,0xf6,0}, // forward
110 {0}
111};
112static const uint8_t keyStrings_big5[][MAX_KEY_STRING_WITH_NULL] = {
113 {0xb1,0x71,0xa7,0xda,0xaa,0xba,0}, // Signatures - Sent from my ... #1
114 {0xb6,0xc7,0xb0,0x65,0}, // Signatures - Sent from my ... #2
115 {0xb6,0x7d,0xa9,0x6c,0xc2,0xe0,0xb1,0x48,0xb6,0x6c,0xa5,0xf3,0}, // forward
116 {0}
117};
118static const uint8_t keyStrings_gb_18030[][MAX_KEY_STRING_WITH_NULL] = {
119 {0xb7,0xa2,0xd7,0xd4,0xce,0xd2,0xb5,0xc4,0}, // Signatures - Sent from my iP...
120 {0xd7,0xaa,0xb7,0xa2,0xb5,0xc4,0xd3,0xca,0xbc,0xfe,0}, // forward
121 {0}
122};
123#endif
124
// Binary search over an ascending array of 16-bit values.
// Returns the index of an element equal to `value`, or -1 when none exists
// (including when len is 0).
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t lo = 0;
    int32_t hi = len - 1;

    while (lo <= hi) {
        const int32_t probe = (lo + hi) / 2;
        const uint16_t candidate = array[probe];

        if (candidate == value) {
            return probe;
        }

        // Discard the half that cannot contain the value.
        if (candidate < value) {
            lo = probe + 1;
        } else {
            hi = probe - 1;
        }
    }

    return -1;
}
146
4388f060
A
147#if U_PLATFORM_IS_DARWIN_BASED
// Checks whether the NUL-terminated byte string testPrefix occurs at `base`,
// never reading at or beyond baseLimit.
// Returns the length of testPrefix when it matches completely, otherwise 0
// (an empty testPrefix therefore also yields 0).
static int32_t isPrefix(const uint8_t *testPrefix, const uint8_t *base, const uint8_t *baseLimit) {
    int32_t matched = 0;

    for (; testPrefix[matched] != 0; matched++) {
        // Out of input, or a mismatching byte: not a prefix.
        if (base + matched >= baseLimit || testPrefix[matched] != base[matched]) {
            return 0;
        }
    }

    return matched;
}
157#endif
158
46f4442e
A
159IteratedChar::IteratedChar() :
160charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
73c04bcf
A
161{
162 // nothing else to do.
163}
164
46f4442e 165/*void IteratedChar::reset()
73c04bcf
A
166{
167 charValue = 0;
168 index = -1;
169 nextIndex = 0;
170 error = FALSE;
171 done = FALSE;
46f4442e 172}*/
73c04bcf
A
173
174int32_t IteratedChar::nextByte(InputText *det)
175{
176 if (nextIndex >= det->fRawLength) {
177 done = TRUE;
178
179 return -1;
180 }
181
182 return det->fRawInput[nextIndex++];
183}
184
185CharsetRecog_mbcs::~CharsetRecog_mbcs()
186{
187 // nothing to do.
188}
189
4388f060 190#if U_PLATFORM_IS_DARWIN_BASED
51004dcb 191int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen, const uint8_t (*keyStrings)[MAX_KEY_STRING_WITH_NULL] ) const {
4388f060 192#else
51004dcb 193int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
4388f060 194#endif
46f4442e
A
195 int32_t singleByteCharCount = 0;
196 int32_t doubleByteCharCount = 0;
197 int32_t commonCharCount = 0;
198 int32_t badCharCount = 0;
199 int32_t totalCharCount = 0;
200 int32_t confidence = 0;
4388f060
A
201#if U_PLATFORM_IS_DARWIN_BASED
202 int32_t confidenceFromKeys = 0;
203#endif
46f4442e
A
204 IteratedChar iter;
205
206 while (nextChar(&iter, det)) {
207 totalCharCount++;
208
209 if (iter.error) {
210 badCharCount++;
73c04bcf 211 } else {
46f4442e
A
212 if (iter.charValue <= 0xFF) {
213 singleByteCharCount++;
73c04bcf 214 } else {
46f4442e 215 doubleByteCharCount++;
73c04bcf
A
216
217 if (commonChars != 0) {
46f4442e 218 if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){
73c04bcf
A
219 commonCharCount += 1;
220 }
221 }
4388f060
A
222#if U_PLATFORM_IS_DARWIN_BASED
223 if (doubleByteCharCount <= 20) {
224 int32_t keyIndex;
225 for ( keyIndex = 0; keyStrings[keyIndex][0] != 0; keyIndex++ ) {
226 int32_t prefixLen = isPrefix(keyStrings[keyIndex], &det->fRawInput[iter.index], &det->fRawInput[det->fRawLength]);
227 confidenceFromKeys += prefixLen*5;
228 }
229 }
230#endif
73c04bcf
A
231 }
232 }
233
234
235 if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
236 // Bail out early if the byte data is not matching the encoding scheme.
237 // break detectBlock;
73c04bcf
A
238 return confidence;
239 }
240 }
241
73c04bcf
A
242 if (doubleByteCharCount <= 10 && badCharCount == 0) {
243 // Not many multi-byte chars.
46f4442e
A
244 if (doubleByteCharCount == 0 && totalCharCount < 10) {
245 // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
246 // We don't have enough data to have any confidence.
247 // Statistical analysis of single byte non-ASCII charcters would probably help here.
248 confidence = 0;
249 }
250 else {
251 // ASCII or ISO file? It's probably not our encoding,
252 // but is not incompatible with our encoding, so don't give it a zero.
4388f060
A
253#if U_PLATFORM_IS_DARWIN_BASED
254 if (confidenceFromKeys > 90) {
255 confidenceFromKeys = 90;
256 } else if (confidenceFromKeys > 0 && confidenceFromKeys < 70) {
257 confidenceFromKeys += 20;
258 }
259 confidence = 10 + confidenceFromKeys;
260#else
46f4442e 261 confidence = 10;
4388f060 262#endif
46f4442e 263 }
73c04bcf
A
264
265 return confidence;
266 }
267
268 //
269 // No match if there are too many characters that don't fit the encoding scheme.
270 // (should we have zero tolerance for these?)
271 //
272 if (doubleByteCharCount < 20*badCharCount) {
273 confidence = 0;
274
275 return confidence;
276 }
277
278 if (commonChars == 0) {
279 // We have no statistics on frequently occuring characters.
280 // Assess confidence purely on having a reasonable number of
281 // multi-byte characters (the more the better)
282 confidence = 30 + doubleByteCharCount - 20*badCharCount;
4388f060
A
283#if U_PLATFORM_IS_DARWIN_BASED
284 confidence += confidenceFromKeys;
285#endif
73c04bcf
A
286
287 if (confidence > 100) {
288 confidence = 100;
289 }
290 } else {
291 //
292 // Frequency of occurence statistics exist.
293 //
294
4388f060 295 double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/
73c04bcf 296 double scaleFactor = 90.0 / maxVal;
4388f060
A
297 confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);
298#if U_PLATFORM_IS_DARWIN_BASED
299 confidence += confidenceFromKeys;
300#endif
73c04bcf
A
301
302 confidence = min(confidence, 100);
303 }
304
305 if (confidence < 0) {
306 confidence = 0;
307 }
308
309 return confidence;
310}
311
312CharsetRecog_sjis::~CharsetRecog_sjis()
313{
314 // nothing to do
315}
316
51004dcb 317UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
73c04bcf
A
318 it->index = it->nextIndex;
319 it->error = FALSE;
320
321 int32_t firstByte = it->charValue = it->nextByte(det);
322
323 if (firstByte < 0) {
324 return FALSE;
325 }
326
327 if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
328 return TRUE;
329 }
330
331 int32_t secondByte = it->nextByte(det);
46f4442e
A
332 if (secondByte >= 0) {
333 it->charValue = (firstByte << 8) | secondByte;
73c04bcf 334 }
46f4442e
A
335 // else we'll handle the error later.
336
73c04bcf
A
337 if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
338 // Illegal second byte value.
339 it->error = TRUE;
340 }
341
342 return TRUE;
343}
344
51004dcb 345UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
4388f060 346#if U_PLATFORM_IS_DARWIN_BASED
2ca993e8 347 int32_t confidence = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis), keyStrings_sjis);
4388f060 348#else
2ca993e8 349 int32_t confidence = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis));
4388f060 350#endif
51004dcb
A
351 results->set(det, this, confidence);
352 return (confidence > 0);
73c04bcf
A
353}
354
355const char *CharsetRecog_sjis::getName() const
356{
357 return "Shift_JIS";
358}
359
360const char *CharsetRecog_sjis::getLanguage() const
361{
362 return "ja";
363}
364
365CharsetRecog_euc::~CharsetRecog_euc()
366{
367 // nothing to do
368}
369
51004dcb 370UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
73c04bcf
A
371 int32_t firstByte = 0;
372 int32_t secondByte = 0;
373 int32_t thirdByte = 0;
73c04bcf
A
374
375 it->index = it->nextIndex;
376 it->error = FALSE;
377 firstByte = it->charValue = it->nextByte(det);
378
379 if (firstByte < 0) {
380 // Ran off the end of the input data
46f4442e 381 return FALSE;
73c04bcf
A
382 }
383
384 if (firstByte <= 0x8D) {
385 // single byte char
46f4442e 386 return TRUE;
73c04bcf
A
387 }
388
389 secondByte = it->nextByte(det);
46f4442e
A
390 if (secondByte >= 0) {
391 it->charValue = (it->charValue << 8) | secondByte;
392 }
393 // else we'll handle the error later.
73c04bcf
A
394
395 if (firstByte >= 0xA1 && firstByte <= 0xFE) {
396 // Two byte Char
397 if (secondByte < 0xA1) {
398 it->error = TRUE;
399 }
400
46f4442e 401 return TRUE;
73c04bcf
A
402 }
403
404 if (firstByte == 0x8E) {
405 // Code Set 2.
406 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
407 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
408 // We don't know which we've got.
409 // Treat it like EUC-JP. If the data really was EUC-TW, the following two
410 // bytes will look like a well formed 2 byte char.
411 if (secondByte < 0xA1) {
412 it->error = TRUE;
413 }
414
46f4442e 415 return TRUE;
73c04bcf
A
416 }
417
418 if (firstByte == 0x8F) {
419 // Code set 3.
420 // Three byte total char size, two bytes of actual char value.
421 thirdByte = it->nextByte(det);
422 it->charValue = (it->charValue << 8) | thirdByte;
423
424 if (thirdByte < 0xa1) {
46f4442e 425 // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
73c04bcf
A
426 it->error = TRUE;
427 }
428 }
429
46f4442e 430 return TRUE;
73c04bcf
A
431
432}
433
434CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
435{
436 // nothing to do
437}
438
439const char *CharsetRecog_euc_jp::getName() const
440{
441 return "EUC-JP";
442}
443
444const char *CharsetRecog_euc_jp::getLanguage() const
445{
446 return "ja";
447}
448
51004dcb 449UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const
73c04bcf 450{
4388f060 451#if U_PLATFORM_IS_DARWIN_BASED
2ca993e8 452 int32_t confidence = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp), keyStrings_euc_jp);
4388f060 453#else
2ca993e8 454 int32_t confidence = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp));
4388f060 455#endif
51004dcb
A
456 results->set(det, this, confidence);
457 return (confidence > 0);
73c04bcf
A
458}
459
460CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
461{
462 // nothing to do
463}
464
465const char *CharsetRecog_euc_kr::getName() const
466{
467 return "EUC-KR";
468}
469
470const char *CharsetRecog_euc_kr::getLanguage() const
471{
472 return "ko";
473}
474
51004dcb 475UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const
73c04bcf 476{
4388f060 477#if U_PLATFORM_IS_DARWIN_BASED
2ca993e8 478 int32_t confidence = match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr), keyStrings_euc_kr);
4388f060 479#else
2ca993e8 480 int32_t confidence = match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr));
4388f060 481#endif
51004dcb
A
482 results->set(det, this, confidence);
483 return (confidence > 0);
73c04bcf
A
484}
485
486CharsetRecog_big5::~CharsetRecog_big5()
487{
488 // nothing to do
489}
490
51004dcb 491UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
73c04bcf
A
492{
493 int32_t firstByte;
494
495 it->index = it->nextIndex;
496 it->error = FALSE;
497 firstByte = it->charValue = it->nextByte(det);
498
499 if (firstByte < 0) {
500 return FALSE;
501 }
502
503 if (firstByte <= 0x7F || firstByte == 0xFF) {
504 // single byte character.
505 return TRUE;
506 }
507
508 int32_t secondByte = it->nextByte(det);
46f4442e
A
509 if (secondByte >= 0) {
510 it->charValue = (it->charValue << 8) | secondByte;
73c04bcf 511 }
46f4442e 512 // else we'll handle the error later.
73c04bcf 513
46f4442e
A
514 if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
515 it->error = TRUE;
73c04bcf
A
516 }
517
518 return TRUE;
519}
520
521const char *CharsetRecog_big5::getName() const
522{
523 return "Big5";
524}
525
526const char *CharsetRecog_big5::getLanguage() const
527{
528 return "zh";
529}
530
51004dcb 531UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const
73c04bcf 532{
4388f060 533#if U_PLATFORM_IS_DARWIN_BASED
2ca993e8 534 int32_t confidence = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5), keyStrings_big5);
4388f060 535#else
2ca993e8 536 int32_t confidence = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5));
4388f060 537#endif
51004dcb
A
538 results->set(det, this, confidence);
539 return (confidence > 0);
73c04bcf
A
540}
541
542CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
543{
544 // nothing to do
545}
546
51004dcb 547UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
73c04bcf
A
548 int32_t firstByte = 0;
549 int32_t secondByte = 0;
550 int32_t thirdByte = 0;
551 int32_t fourthByte = 0;
552
553 it->index = it->nextIndex;
554 it->error = FALSE;
555 firstByte = it->charValue = it->nextByte(det);
556
557 if (firstByte < 0) {
558 // Ran off the end of the input data
46f4442e 559 return FALSE;
73c04bcf
A
560 }
561
562 if (firstByte <= 0x80) {
563 // single byte char
46f4442e 564 return TRUE;
73c04bcf
A
565 }
566
567 secondByte = it->nextByte(det);
46f4442e
A
568 if (secondByte >= 0) {
569 it->charValue = (it->charValue << 8) | secondByte;
570 }
571 // else we'll handle the error later.
73c04bcf
A
572
573 if (firstByte >= 0x81 && firstByte <= 0xFE) {
574 // Two byte Char
575 if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
46f4442e 576 return TRUE;
73c04bcf
A
577 }
578
579 // Four byte char
580 if (secondByte >= 0x30 && secondByte <= 0x39) {
581 thirdByte = it->nextByte(det);
582
583 if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
584 fourthByte = it->nextByte(det);
585
586 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
587 it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
588
46f4442e 589 return TRUE;
73c04bcf
A
590 }
591 }
592 }
593
46f4442e 594 // Something wasn't valid, or we ran out of data (-1).
73c04bcf 595 it->error = TRUE;
73c04bcf
A
596 }
597
46f4442e 598 return TRUE;
73c04bcf
A
599}
600
601const char *CharsetRecog_gb_18030::getName() const
602{
603 return "GB18030";
604}
605
606const char *CharsetRecog_gb_18030::getLanguage() const
607{
608 return "zh";
609}
610
51004dcb 611UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const
73c04bcf 612{
4388f060 613#if U_PLATFORM_IS_DARWIN_BASED
2ca993e8 614 int32_t confidence = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030), keyStrings_gb_18030);
4388f060 615#else
2ca993e8 616 int32_t confidence = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030));
4388f060 617#endif
51004dcb
A
618 results->set(det, this, confidence);
619 return (confidence > 0);
73c04bcf
A
620}
621
622U_NAMESPACE_END
623#endif