]>
Commit | Line | Data |
---|---|---|
1 | // © 2016 and later: Unicode, Inc. and others. | |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
3 | /* | |
4 | ********************************************************************** | |
5 | * Copyright (C) 2005-2016, International Business Machines | |
6 | * Corporation and others. All Rights Reserved. | |
7 | ********************************************************************** | |
8 | */ | |
9 | ||
10 | #include "unicode/utypes.h" | |
11 | ||
12 | #if !UCONFIG_NO_CONVERSION | |
13 | ||
14 | #include "cmemory.h" | |
15 | #include "csmatch.h" | |
16 | #include "csrmbcs.h" | |
17 | ||
18 | #include <math.h> | |
19 | ||
20 | U_NAMESPACE_BEGIN | |
21 | ||
22 | #define min(x,y) (((x)<(y))?(x):(y)) /* smaller of x and y; function-like macro, so the selected argument is evaluated twice - pass side-effect-free expressions */ | |
23 | ||
24 | static const uint16_t commonChars_sjis [] = { | |
25 | // Frequently occurring Shift_JIS double-byte values; keep in ascending order (looked up with binarySearch()). | |
26 | // TODO: This set of data comes from the character frequency-of-occurrence analysis tool. | |
27 | // The data needs to be moved into a resource and loaded from there. | |
28 | 0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0, | |
29 | 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5, | |
30 | 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc, | |
31 | 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341, | |
32 | 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389, | |
33 | 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa}; | |
34 | ||
35 | static const uint16_t commonChars_euc_jp[] = { | |
36 | // Frequently occurring EUC-JP double-byte values; keep in ascending order (looked up with binarySearch()). | |
37 | // TODO: This set of data comes from the character frequency-of-occurrence analysis tool. | |
38 | // The data needs to be moved into a resource and loaded from there. | |
39 | 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2, | |
40 | 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3, | |
41 | 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4, | |
42 | 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de, | |
43 | 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef, | |
44 | 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af, | |
45 | 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7, | |
46 | 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1, | |
47 | 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee, | |
48 | 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1}; | |
49 | ||
50 | static const uint16_t commonChars_euc_kr[] = { | |
51 | // Frequently occurring EUC-KR double-byte values; keep in ascending order (looked up with binarySearch()). | |
52 | // TODO: This set of data comes from the character frequency-of-occurrence analysis tool. | |
53 | // The data needs to be moved into a resource and loaded from there. | |
54 | 0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc, | |
55 | 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9, | |
56 | 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce, | |
57 | 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce, | |
58 | 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba, | |
59 | 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee, | |
60 | 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7, | |
61 | 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6, | |
62 | 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6, | |
63 | 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad}; | |
64 | ||
65 | static const uint16_t commonChars_big5[] = { | |
66 | // Frequently occurring Big5 double-byte values; keep in ascending order (looked up with binarySearch()). | |
67 | // TODO: This set of data comes from the character frequency-of-occurrence analysis tool. | |
68 | // The data needs to be moved into a resource and loaded from there. | |
69 | 0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446, | |
70 | 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3, | |
71 | 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548, | |
72 | 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8, | |
73 | 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da, | |
74 | 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3, | |
75 | 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59, | |
76 | 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c, | |
77 | 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44, | |
78 | 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f}; | |
79 | ||
80 | static const uint16_t commonChars_gb_18030[] = { | |
81 | // Frequently occurring GB18030 double-byte values; keep in ascending order (looked up with binarySearch()). | |
82 | // TODO: This set of data comes from the character frequency-of-occurrence analysis tool. | |
83 | // The data needs to be moved into a resource and loaded from there. | |
84 | 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac, | |
85 | 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4, | |
86 | 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4, | |
87 | 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6, | |
88 | 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6, | |
89 | 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7, | |
90 | 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7, | |
91 | 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5, | |
92 | 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2, | |
93 | 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0}; | |
94 | ||
95 | #if U_PLATFORM_IS_DARWIN_BASED | |
96 | static const uint8_t keyStrings_sjis[][MAX_KEY_STRING_WITH_NULL] = { /* 0-terminated byte strings that match_mbcs() tests as prefixes of the raw input to add confidence */ | |
97 | {0x82,0xa9,0x82,0xe7,0x91,0x97,0x90,0x4d,0}, // Signatures - Sent from my ... | |
98 | {0x93,0x5d,0x91,0x97,0x83,0x81,0x83,0x62,0x83,0x5a,0x81,0x5b,0x83,0x57,0}, // forward | |
99 | {0} /* list terminator: match_mbcs() stops at the first entry whose first byte is 0 */ | |
100 | }; | |
101 | static const uint8_t keyStrings_euc_jp[][MAX_KEY_STRING_WITH_NULL] = { /* same layout and use as keyStrings_sjis */ | |
102 | {0xa4,0xab,0xa4,0xe9,0xc1,0xf7,0xbf,0xae,0}, // Signatures - Sent from my ... | |
103 | {0xc5,0xbe,0xc1,0xf7,0xa5,0xe1,0xa5,0xc3,0xa5,0xbb,0xa1,0xbc,0xa5,0xb8,0}, // forward | |
104 | {0} /* list terminator */ | |
105 | }; | |
106 | static const uint8_t keyStrings_euc_kr[][MAX_KEY_STRING_WITH_NULL] = { /* same layout and use as keyStrings_sjis */ | |
107 | {0xb3,0xaa,0xc0,0xc7,0}, // Signatures - Sent from my ... #1 | |
108 | {0xbf,0xa1,0xbc,0xad,0x20,0xba,0xb8,0xb3,0xbf,0}, // Signatures - Sent from my ... #2 | |
109 | {0xc0,0xfc,0xb4,0xde,0xb5,0xc8,0x20,0xb8,0xde,0xbd,0xc3,0xc1,0xf6,0}, // forward | |
110 | {0} /* list terminator */ | |
111 | }; | |
112 | static const uint8_t keyStrings_big5[][MAX_KEY_STRING_WITH_NULL] = { /* same layout and use as keyStrings_sjis */ | |
113 | {0xb1,0x71,0xa7,0xda,0xaa,0xba,0}, // Signatures - Sent from my ... #1 | |
114 | {0xb6,0xc7,0xb0,0x65,0}, // Signatures - Sent from my ... #2 | |
115 | {0xb6,0x7d,0xa9,0x6c,0xc2,0xe0,0xb1,0x48,0xb6,0x6c,0xa5,0xf3,0}, // forward | |
116 | {0} /* list terminator */ | |
117 | }; | |
118 | static const uint8_t keyStrings_gb_18030[][MAX_KEY_STRING_WITH_NULL] = { /* same layout and use as keyStrings_sjis */ | |
119 | {0xb7,0xa2,0xd7,0xd4,0xce,0xd2,0xb5,0xc4,0}, // Signatures - Sent from my iP... | |
120 | {0xd7,0xaa,0xb7,0xa2,0xb5,0xc4,0xd3,0xca,0xbc,0xfe,0}, // forward | |
121 | {0} /* list terminator */ | |
122 | }; | |
123 | #endif | |
124 | ||
/**
 * Looks up a 16-bit value in an array sorted in ascending order.
 *
 * @param array  Elements to search, in ascending order. Not read when len <= 0.
 * @param len    Number of elements in array.
 * @param value  Value to look for.
 * @return The index of an element equal to value, or -1 if there is none.
 */
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t low = 0;
    int32_t high = len - 1;

    while (low <= high) {
        // low + (high - low) / 2 cannot overflow, unlike (low + high) / 2.
        const int32_t mid = low + (high - low) / 2;
        const uint16_t probe = array[mid];

        if (probe == value) {
            return mid;
        }

        if (probe < value) {
            low = mid + 1;
        } else {
            high = mid - 1;
        }
    }

    return -1;
}
146 | ||
147 | #if U_PLATFORM_IS_DARWIN_BASED | |
// Returns the length of the 0-terminated byte string testPrefix when every one
// of its bytes matches the input starting at base; returns 0 when a byte
// differs or the input (which ends at baseLimit, exclusive) is too short.
// An empty testPrefix also yields 0.
static int32_t isPrefix(const uint8_t *testPrefix, const uint8_t *base, const uint8_t *baseLimit) {
    const uint8_t *cursor = testPrefix;

    for (; *cursor != 0; ++cursor, ++base) {
        // Check the limit first so base is never read at or past baseLimit.
        if (base >= baseLimit || *cursor != *base) {
            return 0;
        }
    }

    // Reached the terminator: the whole key matched.
    return (int32_t)(cursor - testPrefix);
}
157 | #endif | |
158 | ||
159 | IteratedChar::IteratedChar() : | |
160 | charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE) | |
161 | { | |
162 | // nothing else to do. | |
163 | } | |
164 | ||
165 | /*void IteratedChar::reset() | |
166 | { | |
167 | charValue = 0; | |
168 | index = -1; | |
169 | nextIndex = 0; | |
170 | error = FALSE; | |
171 | done = FALSE; | |
172 | }*/ | |
173 | ||
174 | int32_t IteratedChar::nextByte(InputText *det) | |
175 | { | |
176 | if (nextIndex >= det->fRawLength) { | |
177 | done = TRUE; | |
178 | ||
179 | return -1; | |
180 | } | |
181 | ||
182 | return det->fRawInput[nextIndex++]; | |
183 | } | |
184 | ||
185 | CharsetRecog_mbcs::~CharsetRecog_mbcs() | |
186 | { | |
187 | // nothing to do. | |
188 | } | |
189 | ||
190 | #if U_PLATFORM_IS_DARWIN_BASED | |
191 | int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen, const uint8_t (*keyStrings)[MAX_KEY_STRING_WITH_NULL] ) const { | |
192 | #else | |
193 | int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const { | |
194 | #endif | |
195 | int32_t singleByteCharCount = 0; | |
196 | int32_t doubleByteCharCount = 0; | |
197 | int32_t commonCharCount = 0; | |
198 | int32_t badCharCount = 0; | |
199 | int32_t totalCharCount = 0; | |
200 | int32_t confidence = 0; | |
201 | #if U_PLATFORM_IS_DARWIN_BASED | |
202 | int32_t confidenceFromKeys = 0; | |
203 | #endif | |
204 | IteratedChar iter; | |
205 | ||
206 | while (nextChar(&iter, det)) { | |
207 | totalCharCount++; | |
208 | ||
209 | if (iter.error) { | |
210 | badCharCount++; | |
211 | } else { | |
212 | if (iter.charValue <= 0xFF) { | |
213 | singleByteCharCount++; | |
214 | } else { | |
215 | doubleByteCharCount++; | |
216 | ||
217 | if (commonChars != 0) { | |
218 | if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){ | |
219 | commonCharCount += 1; | |
220 | } | |
221 | } | |
222 | #if U_PLATFORM_IS_DARWIN_BASED | |
223 | if (doubleByteCharCount <= 20) { | |
224 | int32_t keyIndex; | |
225 | for ( keyIndex = 0; keyStrings[keyIndex][0] != 0; keyIndex++ ) { | |
226 | int32_t prefixLen = isPrefix(keyStrings[keyIndex], &det->fRawInput[iter.index], &det->fRawInput[det->fRawLength]); | |
227 | confidenceFromKeys += prefixLen*5; | |
228 | } | |
229 | } | |
230 | #endif | |
231 | } | |
232 | } | |
233 | ||
234 | ||
235 | if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) { | |
236 | // Bail out early if the byte data is not matching the encoding scheme. | |
237 | // break detectBlock; | |
238 | return confidence; | |
239 | } | |
240 | } | |
241 | ||
242 | if (doubleByteCharCount <= 10 && badCharCount == 0) { | |
243 | // Not many multi-byte chars. | |
244 | if (doubleByteCharCount == 0 && totalCharCount < 10) { | |
245 | // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes. | |
246 | // We don't have enough data to have any confidence. | |
247 | // Statistical analysis of single byte non-ASCII charcters would probably help here. | |
248 | confidence = 0; | |
249 | } | |
250 | else { | |
251 | // ASCII or ISO file? It's probably not our encoding, | |
252 | // but is not incompatible with our encoding, so don't give it a zero. | |
253 | #if U_PLATFORM_IS_DARWIN_BASED | |
254 | if (confidenceFromKeys > 90) { | |
255 | confidenceFromKeys = 90; | |
256 | } else if (confidenceFromKeys > 0 && confidenceFromKeys < 70) { | |
257 | confidenceFromKeys += 20; | |
258 | } | |
259 | confidence = 10 + confidenceFromKeys; | |
260 | #else | |
261 | confidence = 10; | |
262 | #endif | |
263 | } | |
264 | ||
265 | return confidence; | |
266 | } | |
267 | ||
268 | // | |
269 | // No match if there are too many characters that don't fit the encoding scheme. | |
270 | // (should we have zero tolerance for these?) | |
271 | // | |
272 | if (doubleByteCharCount < 20*badCharCount) { | |
273 | confidence = 0; | |
274 | ||
275 | return confidence; | |
276 | } | |
277 | ||
278 | if (commonChars == 0) { | |
279 | // We have no statistics on frequently occuring characters. | |
280 | // Assess confidence purely on having a reasonable number of | |
281 | // multi-byte characters (the more the better) | |
282 | confidence = 30 + doubleByteCharCount - 20*badCharCount; | |
283 | #if U_PLATFORM_IS_DARWIN_BASED | |
284 | confidence += confidenceFromKeys; | |
285 | #endif | |
286 | ||
287 | if (confidence > 100) { | |
288 | confidence = 100; | |
289 | } | |
290 | } else { | |
291 | // | |
292 | // Frequency of occurence statistics exist. | |
293 | // | |
294 | ||
295 | double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/ | |
296 | double scaleFactor = 90.0 / maxVal; | |
297 | confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0); | |
298 | #if U_PLATFORM_IS_DARWIN_BASED | |
299 | confidence += confidenceFromKeys; | |
300 | #endif | |
301 | ||
302 | confidence = min(confidence, 100); | |
303 | } | |
304 | ||
305 | if (confidence < 0) { | |
306 | confidence = 0; | |
307 | } | |
308 | ||
309 | return confidence; | |
310 | } | |
311 | ||
312 | CharsetRecog_sjis::~CharsetRecog_sjis() | |
313 | { | |
314 | // nothing to do | |
315 | } | |
316 | ||
317 | UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const { | |
318 | it->index = it->nextIndex; | |
319 | it->error = FALSE; | |
320 | ||
321 | int32_t firstByte = it->charValue = it->nextByte(det); | |
322 | ||
323 | if (firstByte < 0) { | |
324 | return FALSE; | |
325 | } | |
326 | ||
327 | if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) { | |
328 | return TRUE; | |
329 | } | |
330 | ||
331 | int32_t secondByte = it->nextByte(det); | |
332 | if (secondByte >= 0) { | |
333 | it->charValue = (firstByte << 8) | secondByte; | |
334 | } | |
335 | // else we'll handle the error later. | |
336 | ||
337 | if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) { | |
338 | // Illegal second byte value. | |
339 | it->error = TRUE; | |
340 | } | |
341 | ||
342 | return TRUE; | |
343 | } | |
344 | ||
345 | UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const { | |
346 | #if U_PLATFORM_IS_DARWIN_BASED | |
347 | int32_t confidence = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis), keyStrings_sjis); | |
348 | #else | |
349 | int32_t confidence = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis)); | |
350 | #endif | |
351 | results->set(det, this, confidence); | |
352 | return (confidence > 0); | |
353 | } | |
354 | ||
355 | const char *CharsetRecog_sjis::getName() const | |
356 | { | |
357 | return "Shift_JIS"; | |
358 | } | |
359 | ||
360 | const char *CharsetRecog_sjis::getLanguage() const | |
361 | { | |
362 | return "ja"; | |
363 | } | |
364 | ||
365 | CharsetRecog_euc::~CharsetRecog_euc() | |
366 | { | |
367 | // nothing to do | |
368 | } | |
369 | ||
370 | UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const { | |
371 | int32_t firstByte = 0; | |
372 | int32_t secondByte = 0; | |
373 | int32_t thirdByte = 0; | |
374 | ||
375 | it->index = it->nextIndex; | |
376 | it->error = FALSE; | |
377 | firstByte = it->charValue = it->nextByte(det); | |
378 | ||
379 | if (firstByte < 0) { | |
380 | // Ran off the end of the input data | |
381 | return FALSE; | |
382 | } | |
383 | ||
384 | if (firstByte <= 0x8D) { | |
385 | // single byte char | |
386 | return TRUE; | |
387 | } | |
388 | ||
389 | secondByte = it->nextByte(det); | |
390 | if (secondByte >= 0) { | |
391 | it->charValue = (it->charValue << 8) | secondByte; | |
392 | } | |
393 | // else we'll handle the error later. | |
394 | ||
395 | if (firstByte >= 0xA1 && firstByte <= 0xFE) { | |
396 | // Two byte Char | |
397 | if (secondByte < 0xA1) { | |
398 | it->error = TRUE; | |
399 | } | |
400 | ||
401 | return TRUE; | |
402 | } | |
403 | ||
404 | if (firstByte == 0x8E) { | |
405 | // Code Set 2. | |
406 | // In EUC-JP, total char size is 2 bytes, only one byte of actual char value. | |
407 | // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value. | |
408 | // We don't know which we've got. | |
409 | // Treat it like EUC-JP. If the data really was EUC-TW, the following two | |
410 | // bytes will look like a well formed 2 byte char. | |
411 | if (secondByte < 0xA1) { | |
412 | it->error = TRUE; | |
413 | } | |
414 | ||
415 | return TRUE; | |
416 | } | |
417 | ||
418 | if (firstByte == 0x8F) { | |
419 | // Code set 3. | |
420 | // Three byte total char size, two bytes of actual char value. | |
421 | thirdByte = it->nextByte(det); | |
422 | it->charValue = (it->charValue << 8) | thirdByte; | |
423 | ||
424 | if (thirdByte < 0xa1) { | |
425 | // Bad second byte or ran off the end of the input data with a non-ASCII first byte. | |
426 | it->error = TRUE; | |
427 | } | |
428 | } | |
429 | ||
430 | return TRUE; | |
431 | ||
432 | } | |
433 | ||
434 | CharsetRecog_euc_jp::~CharsetRecog_euc_jp() | |
435 | { | |
436 | // nothing to do | |
437 | } | |
438 | ||
439 | const char *CharsetRecog_euc_jp::getName() const | |
440 | { | |
441 | return "EUC-JP"; | |
442 | } | |
443 | ||
444 | const char *CharsetRecog_euc_jp::getLanguage() const | |
445 | { | |
446 | return "ja"; | |
447 | } | |
448 | ||
449 | UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const | |
450 | { | |
451 | #if U_PLATFORM_IS_DARWIN_BASED | |
452 | int32_t confidence = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp), keyStrings_euc_jp); | |
453 | #else | |
454 | int32_t confidence = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp)); | |
455 | #endif | |
456 | results->set(det, this, confidence); | |
457 | return (confidence > 0); | |
458 | } | |
459 | ||
460 | CharsetRecog_euc_kr::~CharsetRecog_euc_kr() | |
461 | { | |
462 | // nothing to do | |
463 | } | |
464 | ||
465 | const char *CharsetRecog_euc_kr::getName() const | |
466 | { | |
467 | return "EUC-KR"; | |
468 | } | |
469 | ||
470 | const char *CharsetRecog_euc_kr::getLanguage() const | |
471 | { | |
472 | return "ko"; | |
473 | } | |
474 | ||
475 | UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const | |
476 | { | |
477 | #if U_PLATFORM_IS_DARWIN_BASED | |
478 | int32_t confidence = match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr), keyStrings_euc_kr); | |
479 | #else | |
480 | int32_t confidence = match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr)); | |
481 | #endif | |
482 | results->set(det, this, confidence); | |
483 | return (confidence > 0); | |
484 | } | |
485 | ||
486 | CharsetRecog_big5::~CharsetRecog_big5() | |
487 | { | |
488 | // nothing to do | |
489 | } | |
490 | ||
491 | UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const | |
492 | { | |
493 | int32_t firstByte; | |
494 | ||
495 | it->index = it->nextIndex; | |
496 | it->error = FALSE; | |
497 | firstByte = it->charValue = it->nextByte(det); | |
498 | ||
499 | if (firstByte < 0) { | |
500 | return FALSE; | |
501 | } | |
502 | ||
503 | if (firstByte <= 0x7F || firstByte == 0xFF) { | |
504 | // single byte character. | |
505 | return TRUE; | |
506 | } | |
507 | ||
508 | int32_t secondByte = it->nextByte(det); | |
509 | if (secondByte >= 0) { | |
510 | it->charValue = (it->charValue << 8) | secondByte; | |
511 | } | |
512 | // else we'll handle the error later. | |
513 | ||
514 | if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) { | |
515 | it->error = TRUE; | |
516 | } | |
517 | ||
518 | return TRUE; | |
519 | } | |
520 | ||
521 | const char *CharsetRecog_big5::getName() const | |
522 | { | |
523 | return "Big5"; | |
524 | } | |
525 | ||
526 | const char *CharsetRecog_big5::getLanguage() const | |
527 | { | |
528 | return "zh"; | |
529 | } | |
530 | ||
531 | UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const | |
532 | { | |
533 | #if U_PLATFORM_IS_DARWIN_BASED | |
534 | int32_t confidence = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5), keyStrings_big5); | |
535 | #else | |
536 | int32_t confidence = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5)); | |
537 | #endif | |
538 | results->set(det, this, confidence); | |
539 | return (confidence > 0); | |
540 | } | |
541 | ||
542 | CharsetRecog_gb_18030::~CharsetRecog_gb_18030() | |
543 | { | |
544 | // nothing to do | |
545 | } | |
546 | ||
547 | UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const { | |
548 | int32_t firstByte = 0; | |
549 | int32_t secondByte = 0; | |
550 | int32_t thirdByte = 0; | |
551 | int32_t fourthByte = 0; | |
552 | ||
553 | it->index = it->nextIndex; | |
554 | it->error = FALSE; | |
555 | firstByte = it->charValue = it->nextByte(det); | |
556 | ||
557 | if (firstByte < 0) { | |
558 | // Ran off the end of the input data | |
559 | return FALSE; | |
560 | } | |
561 | ||
562 | if (firstByte <= 0x80) { | |
563 | // single byte char | |
564 | return TRUE; | |
565 | } | |
566 | ||
567 | secondByte = it->nextByte(det); | |
568 | if (secondByte >= 0) { | |
569 | it->charValue = (it->charValue << 8) | secondByte; | |
570 | } | |
571 | // else we'll handle the error later. | |
572 | ||
573 | if (firstByte >= 0x81 && firstByte <= 0xFE) { | |
574 | // Two byte Char | |
575 | if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) { | |
576 | return TRUE; | |
577 | } | |
578 | ||
579 | // Four byte char | |
580 | if (secondByte >= 0x30 && secondByte <= 0x39) { | |
581 | thirdByte = it->nextByte(det); | |
582 | ||
583 | if (thirdByte >= 0x81 && thirdByte <= 0xFE) { | |
584 | fourthByte = it->nextByte(det); | |
585 | ||
586 | if (fourthByte >= 0x30 && fourthByte <= 0x39) { | |
587 | it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte; | |
588 | ||
589 | return TRUE; | |
590 | } | |
591 | } | |
592 | } | |
593 | ||
594 | // Something wasn't valid, or we ran out of data (-1). | |
595 | it->error = TRUE; | |
596 | } | |
597 | ||
598 | return TRUE; | |
599 | } | |
600 | ||
601 | const char *CharsetRecog_gb_18030::getName() const | |
602 | { | |
603 | return "GB18030"; | |
604 | } | |
605 | ||
606 | const char *CharsetRecog_gb_18030::getLanguage() const | |
607 | { | |
608 | return "zh"; | |
609 | } | |
610 | ||
611 | UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const | |
612 | { | |
613 | #if U_PLATFORM_IS_DARWIN_BASED | |
614 | int32_t confidence = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030), keyStrings_gb_18030); | |
615 | #else | |
616 | int32_t confidence = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030)); | |
617 | #endif | |
618 | results->set(det, this, confidence); | |
619 | return (confidence > 0); | |
620 | } | |
621 | ||
622 | U_NAMESPACE_END | |
623 | #endif |