]>
Commit | Line | Data |
---|---|---|
73c04bcf A |
1 | /* |
2 | ********************************************************************** | |
4388f060 | 3 | * Copyright (C) 2005-2012, International Business Machines |
73c04bcf A |
4 | * Corporation and others. All Rights Reserved. |
5 | ********************************************************************** | |
6 | */ | |
7 | ||
8 | #include "unicode/utypes.h" | |
9 | ||
10 | #if !UCONFIG_NO_CONVERSION | |
11 | ||
51004dcb | 12 | #include "csmatch.h" |
73c04bcf A |
13 | #include "csrmbcs.h" |
14 | ||
15 | #include <math.h> | |
16 | ||
17 | U_NAMESPACE_BEGIN | |
18 | ||
19 | #define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) | |
20 | ||
21 | #define min(x,y) (((x)<(y))?(x):(y)) | |
22 | ||
46f4442e | 23 | static const uint16_t commonChars_sjis [] = { |
73c04bcf A |
24 | // TODO: This set of data comes from the character frequency- |
25 | // of-occurrence analysis tool. The data needs to be moved | |
26 | // into a resource and loaded from there. | |
27 | 0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0, | |
28 | 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5, | |
29 | 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc, | |
30 | 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341, | |
31 | 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389, | |
32 | 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa}; | |
33 | ||
46f4442e | 34 | static const uint16_t commonChars_euc_jp[] = { |
73c04bcf A |
35 | // TODO: This set of data comes from the character frequency- |
36 | // of-occurrence analysis tool. The data needs to be moved | |
37 | // into a resource and loaded from there. | |
38 | 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2, | |
39 | 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3, | |
40 | 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4, | |
41 | 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de, | |
42 | 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef, | |
43 | 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af, | |
44 | 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7, | |
45 | 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1, | |
46 | 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee, | |
47 | 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1}; | |
48 | ||
46f4442e | 49 | static const uint16_t commonChars_euc_kr[] = { |
73c04bcf A |
50 | // TODO: This set of data comes from the character frequency- |
51 | // of-occurrence analysis tool. The data needs to be moved | |
52 | // into a resource and loaded from there. | |
53 | 0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc, | |
54 | 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9, | |
55 | 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce, | |
56 | 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce, | |
57 | 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba, | |
58 | 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee, | |
59 | 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7, | |
60 | 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6, | |
61 | 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6, | |
62 | 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad}; | |
63 | ||
46f4442e | 64 | static const uint16_t commonChars_big5[] = { |
73c04bcf A |
65 | // TODO: This set of data comes from the character frequency- |
66 | // of-occurrence analysis tool. The data needs to be moved | |
67 | // into a resource and loaded from there. | |
68 | 0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446, | |
69 | 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3, | |
70 | 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548, | |
71 | 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8, | |
72 | 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da, | |
73 | 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3, | |
74 | 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59, | |
75 | 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c, | |
76 | 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44, | |
77 | 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f}; | |
78 | ||
46f4442e | 79 | static const uint16_t commonChars_gb_18030[] = { |
73c04bcf A |
80 | // TODO: This set of data comes from the character frequency- |
81 | // of-occurrence analysis tool. The data needs to be moved | |
82 | // into a resource and loaded from there. | |
83 | 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac, | |
84 | 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4, | |
85 | 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4, | |
86 | 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6, | |
87 | 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6, | |
88 | 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7, | |
89 | 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7, | |
90 | 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5, | |
91 | 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2, | |
92 | 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0}; | |
93 | ||
4388f060 A |
94 | #if U_PLATFORM_IS_DARWIN_BASED |
95 | static const uint8_t keyStrings_sjis[][MAX_KEY_STRING_WITH_NULL] = { | |
96 | {0x82,0xa9,0x82,0xe7,0x91,0x97,0x90,0x4d,0}, // Signatures - Sent from my ... | |
97 | {0x93,0x5d,0x91,0x97,0x83,0x81,0x83,0x62,0x83,0x5a,0x81,0x5b,0x83,0x57,0}, // forward | |
98 | {0} | |
99 | }; | |
100 | static const uint8_t keyStrings_euc_jp[][MAX_KEY_STRING_WITH_NULL] = { | |
101 | {0xa4,0xab,0xa4,0xe9,0xc1,0xf7,0xbf,0xae,0}, // Signatures - Sent from my ... | |
102 | {0xc5,0xbe,0xc1,0xf7,0xa5,0xe1,0xa5,0xc3,0xa5,0xbb,0xa1,0xbc,0xa5,0xb8,0}, // forward | |
103 | {0} | |
104 | }; | |
105 | static const uint8_t keyStrings_euc_kr[][MAX_KEY_STRING_WITH_NULL] = { | |
106 | {0xb3,0xaa,0xc0,0xc7,0}, // Signatures - Sent from my ... #1 | |
107 | {0xbf,0xa1,0xbc,0xad,0x20,0xba,0xb8,0xb3,0xbf,0}, // Signatures - Sent from my ... #2 | |
108 | {0xc0,0xfc,0xb4,0xde,0xb5,0xc8,0x20,0xb8,0xde,0xbd,0xc3,0xc1,0xf6,0}, // forward | |
109 | {0} | |
110 | }; | |
111 | static const uint8_t keyStrings_big5[][MAX_KEY_STRING_WITH_NULL] = { | |
112 | {0xb1,0x71,0xa7,0xda,0xaa,0xba,0}, // Signatures - Sent from my ... #1 | |
113 | {0xb6,0xc7,0xb0,0x65,0}, // Signatures - Sent from my ... #2 | |
114 | {0xb6,0x7d,0xa9,0x6c,0xc2,0xe0,0xb1,0x48,0xb6,0x6c,0xa5,0xf3,0}, // forward | |
115 | {0} | |
116 | }; | |
117 | static const uint8_t keyStrings_gb_18030[][MAX_KEY_STRING_WITH_NULL] = { | |
118 | {0xb7,0xa2,0xd7,0xd4,0xce,0xd2,0xb5,0xc4,0}, // Signatures - Sent from my iP... | |
119 | {0xd7,0xaa,0xb7,0xa2,0xb5,0xc4,0xd3,0xca,0xbc,0xfe,0}, // forward | |
120 | {0} | |
121 | }; | |
122 | #endif | |
123 | ||
// Binary search over a table of 16-bit values sorted in ascending order.
//
//   array  table to search
//   len    number of entries in the table
//   value  value to look for
//
// Returns the index of an entry equal to value, or -1 when there is none
// (including when len is 0).
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t low  = 0;
    int32_t high = len - 1;

    while (low <= high) {
        int32_t  probeIndex = (low + high) / 2;
        uint16_t probe      = array[probeIndex];

        if (probe == value) {
            return probeIndex;
        }

        // Discard the half of the range that cannot contain value.
        if (probe < value) {
            low = probeIndex + 1;
        } else {
            high = probeIndex - 1;
        }
    }

    return -1;
}
145 | ||
4388f060 A |
146 | #if U_PLATFORM_IS_DARWIN_BASED |
// Tests whether the NUL-terminated byte string testPrefix occurs at the
// start of the byte range [base, baseLimit).
//
// Returns the length of testPrefix (excluding its terminating 0) when every
// byte matched before baseLimit was reached, otherwise 0. An empty
// testPrefix therefore also yields 0.
static int32_t isPrefix(const uint8_t *testPrefix, const uint8_t *base, const uint8_t *baseLimit) {
    int32_t matched = 0;

    for (;;) {
        if (testPrefix[matched] == 0) {
            // Reached the terminator: the whole prefix matched.
            return matched;
        }
        if (base + matched >= baseLimit || testPrefix[matched] != base[matched]) {
            // Ran out of input, or found a mismatching byte.
            return 0;
        }
        matched++;
    }
}
156 | #endif | |
157 | ||
46f4442e A |
158 | IteratedChar::IteratedChar() : |
159 | charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE) | |
73c04bcf A |
160 | { |
161 | // nothing else to do. | |
162 | } | |
163 | ||
46f4442e | 164 | /*void IteratedChar::reset() |
73c04bcf A |
165 | { |
166 | charValue = 0; | |
167 | index = -1; | |
168 | nextIndex = 0; | |
169 | error = FALSE; | |
170 | done = FALSE; | |
46f4442e | 171 | }*/ |
73c04bcf A |
172 | |
173 | int32_t IteratedChar::nextByte(InputText *det) | |
174 | { | |
175 | if (nextIndex >= det->fRawLength) { | |
176 | done = TRUE; | |
177 | ||
178 | return -1; | |
179 | } | |
180 | ||
181 | return det->fRawInput[nextIndex++]; | |
182 | } | |
183 | ||
184 | CharsetRecog_mbcs::~CharsetRecog_mbcs() | |
185 | { | |
186 | // nothing to do. | |
187 | } | |
188 | ||
4388f060 | 189 | #if U_PLATFORM_IS_DARWIN_BASED |
51004dcb | 190 | int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen, const uint8_t (*keyStrings)[MAX_KEY_STRING_WITH_NULL] ) const { |
4388f060 | 191 | #else |
51004dcb | 192 | int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const { |
4388f060 | 193 | #endif |
46f4442e A |
194 | int32_t singleByteCharCount = 0; |
195 | int32_t doubleByteCharCount = 0; | |
196 | int32_t commonCharCount = 0; | |
197 | int32_t badCharCount = 0; | |
198 | int32_t totalCharCount = 0; | |
199 | int32_t confidence = 0; | |
4388f060 A |
200 | #if U_PLATFORM_IS_DARWIN_BASED |
201 | int32_t confidenceFromKeys = 0; | |
202 | #endif | |
46f4442e A |
203 | IteratedChar iter; |
204 | ||
205 | while (nextChar(&iter, det)) { | |
206 | totalCharCount++; | |
207 | ||
208 | if (iter.error) { | |
209 | badCharCount++; | |
73c04bcf | 210 | } else { |
46f4442e A |
211 | if (iter.charValue <= 0xFF) { |
212 | singleByteCharCount++; | |
73c04bcf | 213 | } else { |
46f4442e | 214 | doubleByteCharCount++; |
73c04bcf A |
215 | |
216 | if (commonChars != 0) { | |
46f4442e | 217 | if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){ |
73c04bcf A |
218 | commonCharCount += 1; |
219 | } | |
220 | } | |
4388f060 A |
221 | #if U_PLATFORM_IS_DARWIN_BASED |
222 | if (doubleByteCharCount <= 20) { | |
223 | int32_t keyIndex; | |
224 | for ( keyIndex = 0; keyStrings[keyIndex][0] != 0; keyIndex++ ) { | |
225 | int32_t prefixLen = isPrefix(keyStrings[keyIndex], &det->fRawInput[iter.index], &det->fRawInput[det->fRawLength]); | |
226 | confidenceFromKeys += prefixLen*5; | |
227 | } | |
228 | } | |
229 | #endif | |
73c04bcf A |
230 | } |
231 | } | |
232 | ||
233 | ||
234 | if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) { | |
235 | // Bail out early if the byte data is not matching the encoding scheme. | |
236 | // break detectBlock; | |
73c04bcf A |
237 | return confidence; |
238 | } | |
239 | } | |
240 | ||
73c04bcf A |
241 | if (doubleByteCharCount <= 10 && badCharCount == 0) { |
242 | // Not many multi-byte chars. | |
46f4442e A |
243 | if (doubleByteCharCount == 0 && totalCharCount < 10) { |
244 | // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes. | |
245 | // We don't have enough data to have any confidence. | |
246 | // Statistical analysis of single byte non-ASCII charcters would probably help here. | |
247 | confidence = 0; | |
248 | } | |
249 | else { | |
250 | // ASCII or ISO file? It's probably not our encoding, | |
251 | // but is not incompatible with our encoding, so don't give it a zero. | |
4388f060 A |
252 | #if U_PLATFORM_IS_DARWIN_BASED |
253 | if (confidenceFromKeys > 90) { | |
254 | confidenceFromKeys = 90; | |
255 | } else if (confidenceFromKeys > 0 && confidenceFromKeys < 70) { | |
256 | confidenceFromKeys += 20; | |
257 | } | |
258 | confidence = 10 + confidenceFromKeys; | |
259 | #else | |
46f4442e | 260 | confidence = 10; |
4388f060 | 261 | #endif |
46f4442e | 262 | } |
73c04bcf A |
263 | |
264 | return confidence; | |
265 | } | |
266 | ||
267 | // | |
268 | // No match if there are too many characters that don't fit the encoding scheme. | |
269 | // (should we have zero tolerance for these?) | |
270 | // | |
271 | if (doubleByteCharCount < 20*badCharCount) { | |
272 | confidence = 0; | |
273 | ||
274 | return confidence; | |
275 | } | |
276 | ||
277 | if (commonChars == 0) { | |
278 | // We have no statistics on frequently occuring characters. | |
279 | // Assess confidence purely on having a reasonable number of | |
280 | // multi-byte characters (the more the better) | |
281 | confidence = 30 + doubleByteCharCount - 20*badCharCount; | |
4388f060 A |
282 | #if U_PLATFORM_IS_DARWIN_BASED |
283 | confidence += confidenceFromKeys; | |
284 | #endif | |
73c04bcf A |
285 | |
286 | if (confidence > 100) { | |
287 | confidence = 100; | |
288 | } | |
289 | } else { | |
290 | // | |
291 | // Frequency of occurence statistics exist. | |
292 | // | |
293 | ||
4388f060 | 294 | double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/ |
73c04bcf | 295 | double scaleFactor = 90.0 / maxVal; |
4388f060 A |
296 | confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0); |
297 | #if U_PLATFORM_IS_DARWIN_BASED | |
298 | confidence += confidenceFromKeys; | |
299 | #endif | |
73c04bcf A |
300 | |
301 | confidence = min(confidence, 100); | |
302 | } | |
303 | ||
304 | if (confidence < 0) { | |
305 | confidence = 0; | |
306 | } | |
307 | ||
308 | return confidence; | |
309 | } | |
310 | ||
311 | CharsetRecog_sjis::~CharsetRecog_sjis() | |
312 | { | |
313 | // nothing to do | |
314 | } | |
315 | ||
51004dcb | 316 | UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const { |
73c04bcf A |
317 | it->index = it->nextIndex; |
318 | it->error = FALSE; | |
319 | ||
320 | int32_t firstByte = it->charValue = it->nextByte(det); | |
321 | ||
322 | if (firstByte < 0) { | |
323 | return FALSE; | |
324 | } | |
325 | ||
326 | if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) { | |
327 | return TRUE; | |
328 | } | |
329 | ||
330 | int32_t secondByte = it->nextByte(det); | |
46f4442e A |
331 | if (secondByte >= 0) { |
332 | it->charValue = (firstByte << 8) | secondByte; | |
73c04bcf | 333 | } |
46f4442e A |
334 | // else we'll handle the error later. |
335 | ||
73c04bcf A |
336 | if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) { |
337 | // Illegal second byte value. | |
338 | it->error = TRUE; | |
339 | } | |
340 | ||
341 | return TRUE; | |
342 | } | |
343 | ||
51004dcb | 344 | UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const { |
4388f060 | 345 | #if U_PLATFORM_IS_DARWIN_BASED |
51004dcb | 346 | int32_t confidence = match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis), keyStrings_sjis); |
4388f060 | 347 | #else |
51004dcb | 348 | int32_t confidence = match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis)); |
4388f060 | 349 | #endif |
51004dcb A |
350 | results->set(det, this, confidence); |
351 | return (confidence > 0); | |
73c04bcf A |
352 | } |
353 | ||
354 | const char *CharsetRecog_sjis::getName() const | |
355 | { | |
356 | return "Shift_JIS"; | |
357 | } | |
358 | ||
359 | const char *CharsetRecog_sjis::getLanguage() const | |
360 | { | |
361 | return "ja"; | |
362 | } | |
363 | ||
364 | CharsetRecog_euc::~CharsetRecog_euc() | |
365 | { | |
366 | // nothing to do | |
367 | } | |
368 | ||
51004dcb | 369 | UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const { |
73c04bcf A |
370 | int32_t firstByte = 0; |
371 | int32_t secondByte = 0; | |
372 | int32_t thirdByte = 0; | |
73c04bcf A |
373 | |
374 | it->index = it->nextIndex; | |
375 | it->error = FALSE; | |
376 | firstByte = it->charValue = it->nextByte(det); | |
377 | ||
378 | if (firstByte < 0) { | |
379 | // Ran off the end of the input data | |
46f4442e | 380 | return FALSE; |
73c04bcf A |
381 | } |
382 | ||
383 | if (firstByte <= 0x8D) { | |
384 | // single byte char | |
46f4442e | 385 | return TRUE; |
73c04bcf A |
386 | } |
387 | ||
388 | secondByte = it->nextByte(det); | |
46f4442e A |
389 | if (secondByte >= 0) { |
390 | it->charValue = (it->charValue << 8) | secondByte; | |
391 | } | |
392 | // else we'll handle the error later. | |
73c04bcf A |
393 | |
394 | if (firstByte >= 0xA1 && firstByte <= 0xFE) { | |
395 | // Two byte Char | |
396 | if (secondByte < 0xA1) { | |
397 | it->error = TRUE; | |
398 | } | |
399 | ||
46f4442e | 400 | return TRUE; |
73c04bcf A |
401 | } |
402 | ||
403 | if (firstByte == 0x8E) { | |
404 | // Code Set 2. | |
405 | // In EUC-JP, total char size is 2 bytes, only one byte of actual char value. | |
406 | // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value. | |
407 | // We don't know which we've got. | |
408 | // Treat it like EUC-JP. If the data really was EUC-TW, the following two | |
409 | // bytes will look like a well formed 2 byte char. | |
410 | if (secondByte < 0xA1) { | |
411 | it->error = TRUE; | |
412 | } | |
413 | ||
46f4442e | 414 | return TRUE; |
73c04bcf A |
415 | } |
416 | ||
417 | if (firstByte == 0x8F) { | |
418 | // Code set 3. | |
419 | // Three byte total char size, two bytes of actual char value. | |
420 | thirdByte = it->nextByte(det); | |
421 | it->charValue = (it->charValue << 8) | thirdByte; | |
422 | ||
423 | if (thirdByte < 0xa1) { | |
46f4442e | 424 | // Bad second byte or ran off the end of the input data with a non-ASCII first byte. |
73c04bcf A |
425 | it->error = TRUE; |
426 | } | |
427 | } | |
428 | ||
46f4442e | 429 | return TRUE; |
73c04bcf A |
430 | |
431 | } | |
432 | ||
433 | CharsetRecog_euc_jp::~CharsetRecog_euc_jp() | |
434 | { | |
435 | // nothing to do | |
436 | } | |
437 | ||
438 | const char *CharsetRecog_euc_jp::getName() const | |
439 | { | |
440 | return "EUC-JP"; | |
441 | } | |
442 | ||
443 | const char *CharsetRecog_euc_jp::getLanguage() const | |
444 | { | |
445 | return "ja"; | |
446 | } | |
447 | ||
51004dcb | 448 | UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const |
73c04bcf | 449 | { |
4388f060 | 450 | #if U_PLATFORM_IS_DARWIN_BASED |
51004dcb | 451 | int32_t confidence = match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp), keyStrings_euc_jp); |
4388f060 | 452 | #else |
51004dcb | 453 | int32_t confidence = match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp)); |
4388f060 | 454 | #endif |
51004dcb A |
455 | results->set(det, this, confidence); |
456 | return (confidence > 0); | |
73c04bcf A |
457 | } |
458 | ||
459 | CharsetRecog_euc_kr::~CharsetRecog_euc_kr() | |
460 | { | |
461 | // nothing to do | |
462 | } | |
463 | ||
464 | const char *CharsetRecog_euc_kr::getName() const | |
465 | { | |
466 | return "EUC-KR"; | |
467 | } | |
468 | ||
469 | const char *CharsetRecog_euc_kr::getLanguage() const | |
470 | { | |
471 | return "ko"; | |
472 | } | |
473 | ||
51004dcb | 474 | UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const |
73c04bcf | 475 | { |
4388f060 | 476 | #if U_PLATFORM_IS_DARWIN_BASED |
51004dcb | 477 | int32_t confidence = match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr), keyStrings_euc_kr); |
4388f060 | 478 | #else |
51004dcb | 479 | int32_t confidence = match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr)); |
4388f060 | 480 | #endif |
51004dcb A |
481 | results->set(det, this, confidence); |
482 | return (confidence > 0); | |
73c04bcf A |
483 | } |
484 | ||
485 | CharsetRecog_big5::~CharsetRecog_big5() | |
486 | { | |
487 | // nothing to do | |
488 | } | |
489 | ||
51004dcb | 490 | UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const |
73c04bcf A |
491 | { |
492 | int32_t firstByte; | |
493 | ||
494 | it->index = it->nextIndex; | |
495 | it->error = FALSE; | |
496 | firstByte = it->charValue = it->nextByte(det); | |
497 | ||
498 | if (firstByte < 0) { | |
499 | return FALSE; | |
500 | } | |
501 | ||
502 | if (firstByte <= 0x7F || firstByte == 0xFF) { | |
503 | // single byte character. | |
504 | return TRUE; | |
505 | } | |
506 | ||
507 | int32_t secondByte = it->nextByte(det); | |
46f4442e A |
508 | if (secondByte >= 0) { |
509 | it->charValue = (it->charValue << 8) | secondByte; | |
73c04bcf | 510 | } |
46f4442e | 511 | // else we'll handle the error later. |
73c04bcf | 512 | |
46f4442e A |
513 | if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) { |
514 | it->error = TRUE; | |
73c04bcf A |
515 | } |
516 | ||
517 | return TRUE; | |
518 | } | |
519 | ||
520 | const char *CharsetRecog_big5::getName() const | |
521 | { | |
522 | return "Big5"; | |
523 | } | |
524 | ||
525 | const char *CharsetRecog_big5::getLanguage() const | |
526 | { | |
527 | return "zh"; | |
528 | } | |
529 | ||
51004dcb | 530 | UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const |
73c04bcf | 531 | { |
4388f060 | 532 | #if U_PLATFORM_IS_DARWIN_BASED |
51004dcb | 533 | int32_t confidence = match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5), keyStrings_big5); |
4388f060 | 534 | #else |
51004dcb | 535 | int32_t confidence = match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5)); |
4388f060 | 536 | #endif |
51004dcb A |
537 | results->set(det, this, confidence); |
538 | return (confidence > 0); | |
73c04bcf A |
539 | } |
540 | ||
541 | CharsetRecog_gb_18030::~CharsetRecog_gb_18030() | |
542 | { | |
543 | // nothing to do | |
544 | } | |
545 | ||
51004dcb | 546 | UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const { |
73c04bcf A |
547 | int32_t firstByte = 0; |
548 | int32_t secondByte = 0; | |
549 | int32_t thirdByte = 0; | |
550 | int32_t fourthByte = 0; | |
551 | ||
552 | it->index = it->nextIndex; | |
553 | it->error = FALSE; | |
554 | firstByte = it->charValue = it->nextByte(det); | |
555 | ||
556 | if (firstByte < 0) { | |
557 | // Ran off the end of the input data | |
46f4442e | 558 | return FALSE; |
73c04bcf A |
559 | } |
560 | ||
561 | if (firstByte <= 0x80) { | |
562 | // single byte char | |
46f4442e | 563 | return TRUE; |
73c04bcf A |
564 | } |
565 | ||
566 | secondByte = it->nextByte(det); | |
46f4442e A |
567 | if (secondByte >= 0) { |
568 | it->charValue = (it->charValue << 8) | secondByte; | |
569 | } | |
570 | // else we'll handle the error later. | |
73c04bcf A |
571 | |
572 | if (firstByte >= 0x81 && firstByte <= 0xFE) { | |
573 | // Two byte Char | |
574 | if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) { | |
46f4442e | 575 | return TRUE; |
73c04bcf A |
576 | } |
577 | ||
578 | // Four byte char | |
579 | if (secondByte >= 0x30 && secondByte <= 0x39) { | |
580 | thirdByte = it->nextByte(det); | |
581 | ||
582 | if (thirdByte >= 0x81 && thirdByte <= 0xFE) { | |
583 | fourthByte = it->nextByte(det); | |
584 | ||
585 | if (fourthByte >= 0x30 && fourthByte <= 0x39) { | |
586 | it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte; | |
587 | ||
46f4442e | 588 | return TRUE; |
73c04bcf A |
589 | } |
590 | } | |
591 | } | |
592 | ||
46f4442e | 593 | // Something wasn't valid, or we ran out of data (-1). |
73c04bcf | 594 | it->error = TRUE; |
73c04bcf A |
595 | } |
596 | ||
46f4442e | 597 | return TRUE; |
73c04bcf A |
598 | } |
599 | ||
600 | const char *CharsetRecog_gb_18030::getName() const | |
601 | { | |
602 | return "GB18030"; | |
603 | } | |
604 | ||
605 | const char *CharsetRecog_gb_18030::getLanguage() const | |
606 | { | |
607 | return "zh"; | |
608 | } | |
609 | ||
51004dcb | 610 | UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const |
73c04bcf | 611 | { |
4388f060 | 612 | #if U_PLATFORM_IS_DARWIN_BASED |
51004dcb | 613 | int32_t confidence = match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030), keyStrings_gb_18030); |
4388f060 | 614 | #else |
51004dcb | 615 | int32_t confidence = match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030)); |
4388f060 | 616 | #endif |
51004dcb A |
617 | results->set(det, this, confidence); |
618 | return (confidence > 0); | |
73c04bcf A |
619 | } |
620 | ||
621 | U_NAMESPACE_END | |
622 | #endif |