]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/csrmbcs.cpp
ICU-8.11.1.tar.gz
[apple/icu.git] / icuSources / i18n / csrmbcs.cpp
1 /*
2 **********************************************************************
3 * Copyright (C) 2005-2006, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_CONVERSION
11
12 #include "csrmbcs.h"
13
14 #include <math.h>
15
16 U_NAMESPACE_BEGIN
17
18 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
19
20 #define min(x,y) (((x)<(y))?(x):(y))
21
22 const int32_t commonChars_sjis [] = {
23 // TODO: This data comes from the character frequency-of-occurrence analysis
24 // tool; it should be moved into a resource and loaded from there. Keep the
25 // entries sorted ascending: match_mbcs() looks them up with binarySearch().
26 0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
27 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
28 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
29 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
30 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
31 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
32
33 const int32_t commonChars_euc_jp[] = {
34 // TODO: This data comes from the character frequency-of-occurrence analysis
35 // tool; it should be moved into a resource and loaded from there. Keep the
36 // entries sorted ascending: match_mbcs() looks them up with binarySearch().
37 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
38 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
39 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
40 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
41 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
42 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
43 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
44 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
45 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
46 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
47
48 const int32_t commonChars_euc_kr[] = {
49 // TODO: This data comes from the character frequency-of-occurrence analysis
50 // tool; it should be moved into a resource and loaded from there. Keep the
51 // entries sorted ascending: match_mbcs() looks them up with binarySearch().
52 0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
53 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
54 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
55 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
56 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
57 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
58 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
59 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
60 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
61 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
62
63 const int32_t commonChars_big5[] = {
64 // TODO: This data comes from the character frequency-of-occurrence analysis
65 // tool; it should be moved into a resource and loaded from there. Keep the
66 // entries sorted ascending: match_mbcs() looks them up with binarySearch().
67 0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
68 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
69 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
70 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
71 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
72 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
73 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
74 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
75 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
76 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
77
78 const int32_t commonChars_gb_18030[] = {
79 // TODO: This data comes from the character frequency-of-occurrence analysis
80 // tool; it should be moved into a resource and loaded from there. Keep the
81 // entries sorted ascending: match_mbcs() looks them up with binarySearch().
82 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
83 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
84 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
85 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
86 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
87 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
88 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
89 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
90 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
91 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
92
/**
 * Binary search of a table of values.
 *
 * @param array  values sorted in ascending order
 * @param len    number of entries in array; nothing is read when len <= 0
 * @param value  the value to look for
 * @return the index of an entry equal to value, or -1 if there is none
 */
static int32_t binarySearch(const int32_t *array, int32_t len, int32_t value)
{
    int32_t start = 0;
    int32_t end = len - 1;

    while (start <= end) {
        // start + (end - start) / 2 cannot overflow, unlike (start + end) / 2.
        const int32_t mid = start + (end - start) / 2;

        if (array[mid] == value) {
            return mid;
        }

        if (array[mid] < value) {
            start = mid + 1;
        } else {
            end = mid - 1;
        }
    }

    return -1;
}
114
115 IteratedChar::IteratedChar():charValue(0), index(0), nextIndex(0), error(FALSE), done(FALSE)
116 {
117 // nothing else to do.
118 }
119
120 void IteratedChar::reset()
121 {
122 charValue = 0;
123 index = -1;
124 nextIndex = 0;
125 error = FALSE;
126 done = FALSE;
127 }
128
129 int32_t IteratedChar::nextByte(InputText *det)
130 {
131 if (nextIndex >= det->fRawLength) {
132 done = TRUE;
133
134 return -1;
135 }
136
137 return det->fRawInput[nextIndex++];
138 }
139
140 CharsetRecog_mbcs::~CharsetRecog_mbcs()
141 {
142 // nothing to do.
143 }
144
145 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const int32_t commonChars[], int32_t commonCharsLen) {
146 int singleByteCharCount = 0;
147 int doubleByteCharCount = 0;
148 int commonCharCount = 0;
149 int badCharCount = 0;
150 int totalCharCount = 0;
151 int confidence = 0;
152 IteratedChar *iter = new IteratedChar();
153
154 // {
155 for (iter->reset(); nextChar(iter, det);) {
156 totalCharCount += 1;
157
158 if (iter->error) {
159 badCharCount += 1;
160 } else {
161 if (iter->charValue <= 0xFF) {
162 singleByteCharCount += 1;
163 } else {
164 doubleByteCharCount += 1;
165
166 if (commonChars != 0) {
167 if (binarySearch(commonChars, commonCharsLen, iter->charValue) >= 0){
168 commonCharCount += 1;
169 }
170 }
171 }
172 }
173
174
175 if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
176 // Bail out early if the byte data is not matching the encoding scheme.
177 // break detectBlock;
178 delete iter;
179 return confidence;
180 }
181 }
182
183 delete iter;
184
185 if (doubleByteCharCount <= 10 && badCharCount == 0) {
186 // Not many multi-byte chars.
187 // ASCII or ISO file? It's probably not our encoding,
188 // but is not incompatible with our encoding, so don't give it a zero.
189 confidence = 10;
190
191 return confidence;
192 }
193
194 //
195 // No match if there are too many characters that don't fit the encoding scheme.
196 // (should we have zero tolerance for these?)
197 //
198 if (doubleByteCharCount < 20*badCharCount) {
199 confidence = 0;
200
201 return confidence;
202 }
203
204 if (commonChars == 0) {
205 // We have no statistics on frequently occuring characters.
206 // Assess confidence purely on having a reasonable number of
207 // multi-byte characters (the more the better)
208 confidence = 30 + doubleByteCharCount - 20*badCharCount;
209
210 if (confidence > 100) {
211 confidence = 100;
212 }
213 } else {
214 //
215 // Frequency of occurence statistics exist.
216 //
217
218 double maxVal = log10((double)doubleByteCharCount / 4); /*(float)?*/
219 double scaleFactor = 90.0 / maxVal;
220 confidence = (int32_t)(log10((double)commonCharCount+1) * scaleFactor + 10.0);
221
222 confidence = min(confidence, 100);
223 }
224
225 if (confidence < 0) {
226 confidence = 0;
227 }
228
229 return confidence;
230 }
231
232 CharsetRecog_sjis::~CharsetRecog_sjis()
233 {
234 // nothing to do
235 }
236
237 UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) {
238 it->index = it->nextIndex;
239 it->error = FALSE;
240
241 int32_t firstByte = it->charValue = it->nextByte(det);
242
243 if (firstByte < 0) {
244 return FALSE;
245 }
246
247 if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
248 return TRUE;
249 }
250
251 int32_t secondByte = it->nextByte(det);
252
253 if (secondByte < 0) {
254 return FALSE;
255 }
256 it->charValue = (firstByte << 8) | secondByte;
257 if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
258 // Illegal second byte value.
259 it->error = TRUE;
260 }
261
262 return TRUE;
263 }
264
265 int32_t CharsetRecog_sjis::match(InputText* det)
266 {
267 return match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis));
268 }
269
270 const char *CharsetRecog_sjis::getName() const
271 {
272 return "Shift_JIS";
273 }
274
275 const char *CharsetRecog_sjis::getLanguage() const
276 {
277 return "ja";
278 }
279
280 CharsetRecog_euc::~CharsetRecog_euc()
281 {
282 // nothing to do
283 }
284
285 UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) {
286 int32_t firstByte = 0;
287 int32_t secondByte = 0;
288 int32_t thirdByte = 0;
289 // int32_t fourthByte = 0;
290
291 it->index = it->nextIndex;
292 it->error = FALSE;
293 firstByte = it->charValue = it->nextByte(det);
294
295 if (firstByte < 0) {
296 // Ran off the end of the input data
297 it->done = TRUE;
298
299 return (! it->done);
300 }
301
302 if (firstByte <= 0x8D) {
303 // single byte char
304 return (! it->done);
305 }
306
307 secondByte = it->nextByte(det);
308 it->charValue = (it->charValue << 8) | secondByte;
309
310 if (firstByte >= 0xA1 && firstByte <= 0xFE) {
311 // Two byte Char
312 if (secondByte < 0xA1) {
313 it->error = TRUE;
314 }
315
316 return (! it->done);
317 }
318
319 if (firstByte == 0x8E) {
320 // Code Set 2.
321 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
322 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
323 // We don't know which we've got.
324 // Treat it like EUC-JP. If the data really was EUC-TW, the following two
325 // bytes will look like a well formed 2 byte char.
326 if (secondByte < 0xA1) {
327 it->error = TRUE;
328 }
329
330 return (! it->done);
331 }
332
333 if (firstByte == 0x8F) {
334 // Code set 3.
335 // Three byte total char size, two bytes of actual char value.
336 thirdByte = it->nextByte(det);
337 it->charValue = (it->charValue << 8) | thirdByte;
338
339 if (thirdByte < 0xa1) {
340 it->error = TRUE;
341 }
342 }
343
344 return (! it->done);
345
346 }
347
348 CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
349 {
350 // nothing to do
351 }
352
353 const char *CharsetRecog_euc_jp::getName() const
354 {
355 return "EUC-JP";
356 }
357
358 const char *CharsetRecog_euc_jp::getLanguage() const
359 {
360 return "ja";
361 }
362
363 int32_t CharsetRecog_euc_jp::match(InputText *det)
364 {
365 return match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp));
366 }
367
368 CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
369 {
370 // nothing to do
371 }
372
373 const char *CharsetRecog_euc_kr::getName() const
374 {
375 return "EUC-KR";
376 }
377
378 const char *CharsetRecog_euc_kr::getLanguage() const
379 {
380 return "ko";
381 }
382
383 int32_t CharsetRecog_euc_kr::match(InputText *det)
384 {
385 return match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr));
386 }
387
388 CharsetRecog_big5::~CharsetRecog_big5()
389 {
390 // nothing to do
391 }
392
393 UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det)
394 {
395 int32_t firstByte;
396
397 it->index = it->nextIndex;
398 it->error = FALSE;
399 firstByte = it->charValue = it->nextByte(det);
400
401 if (firstByte < 0) {
402 return FALSE;
403 }
404
405 if (firstByte <= 0x7F || firstByte == 0xFF) {
406 // single byte character.
407 return TRUE;
408 }
409
410 int32_t secondByte = it->nextByte(det);
411
412 if (secondByte < 0) {
413 return FALSE;
414 }
415
416 it->charValue = (it->charValue << 8) | secondByte;
417
418 if (secondByte < 0x40 ||
419 secondByte == 0x7F ||
420 secondByte == 0xFF) {
421 it->error = TRUE;
422 }
423
424 return TRUE;
425 }
426
427 const char *CharsetRecog_big5::getName() const
428 {
429 return "Big5";
430 }
431
432 const char *CharsetRecog_big5::getLanguage() const
433 {
434 return "zh";
435 }
436
437 int32_t CharsetRecog_big5::match(InputText *det)
438 {
439 return match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5));
440 }
441
442 CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
443 {
444 // nothing to do
445 }
446
447 UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) {
448 int32_t firstByte = 0;
449 int32_t secondByte = 0;
450 int32_t thirdByte = 0;
451 int32_t fourthByte = 0;
452
453 it->index = it->nextIndex;
454 it->error = FALSE;
455 firstByte = it->charValue = it->nextByte(det);
456
457 if (firstByte < 0) {
458 // Ran off the end of the input data
459 it->done = TRUE;
460
461 return (! it->done);
462 }
463
464 if (firstByte <= 0x80) {
465 // single byte char
466 return (! it->done);
467 }
468
469 secondByte = it->nextByte(det);
470 it->charValue = (it->charValue << 8) | secondByte;
471
472 if (firstByte >= 0x81 && firstByte <= 0xFE) {
473 // Two byte Char
474 if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
475 return (! it->done);
476 }
477
478 // Four byte char
479 if (secondByte >= 0x30 && secondByte <= 0x39) {
480 thirdByte = it->nextByte(det);
481
482 if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
483 fourthByte = it->nextByte(det);
484
485 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
486 it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
487
488 return (! it->done);
489 }
490 }
491 }
492
493 it->error = TRUE;
494
495 return (! it->done);
496 }
497
498 return (! it->done);
499 }
500
501 const char *CharsetRecog_gb_18030::getName() const
502 {
503 return "GB18030";
504 }
505
506 const char *CharsetRecog_gb_18030::getLanguage() const
507 {
508 return "zh";
509 }
510
511 int32_t CharsetRecog_gb_18030::match(InputText *det)
512 {
513 return match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030));
514 }
515
516 U_NAMESPACE_END
517 #endif