]>
Commit | Line | Data |
---|---|---|
73c04bcf A |
1 | /* |
2 | **************************************************************************** | |
3 | * Copyright (c) 2005-2006, International Business Machines Corporation and * | |
4 | * others. All Rights Reserved. * | |
5 | **************************************************************************** | |
6 | */ | |
7 | ||
8 | #include "unicode/utypes.h" | |
9 | ||
10 | #include "unicode/ucsdet.h" | |
11 | #include "unicode/ucnv.h" | |
12 | #include "unicode/ustring.h" | |
13 | ||
14 | #include "cintltst.h" | |
15 | ||
16 | #include <stdlib.h> | |
17 | #include <string.h> | |
18 | ||
/*
 * Number of elements in `array`. Only meaningful for a true array object;
 * a pointer argument would silently give the wrong answer.
 * The argument and the whole expansion are parenthesized so the macro can
 * be used safely inside larger expressions.
 */
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

/*
 * Allocates room for `count` objects of `type` through ctst_malloc() and
 * yields the result as a `type *`.
 */
#define NEW_ARRAY(type,count) ((type *) ctst_malloc((count) * sizeof(type)))

/*
 * Expands to nothing: no memory is released at the call site.
 * NOTE(review): presumably buffers obtained from ctst_malloc() are reclaimed
 * by the test framework itself - confirm before changing this.
 */
#define DELETE_ARRAY(array)
23 | ||
24 | static void TestConstruction(void); | |
25 | static void TestUTF8(void); | |
26 | static void TestUTF16(void); | |
27 | static void TestC1Bytes(void); | |
28 | static void TestInputFilter(void); | |
29 | static void TestChaining(void); | |
30 | ||
31 | void addUCsdetTest(TestNode** root); | |
32 | ||
33 | void addUCsdetTest(TestNode** root) | |
34 | { | |
35 | addTest(root, &TestConstruction, "ucsdetst/TestConstruction"); | |
36 | addTest(root, &TestUTF8, "ucsdetst/TestUTF8"); | |
37 | addTest(root, &TestUTF16, "ucsdetst/TestUTF16"); | |
38 | addTest(root, &TestC1Bytes, "ucsdetst/TestC1Bytes"); | |
39 | addTest(root, &TestInputFilter, "ucsdetst/TestInputFilter"); | |
40 | addTest(root, &TestChaining, "ucsdetst/TestErrorChaining"); | |
41 | } | |
42 | ||
43 | static int32_t preflight(const UChar *src, int32_t length, UConverter *cnv) | |
44 | { | |
45 | UErrorCode status; | |
46 | char buffer[1024]; | |
47 | char *dest, *destLimit = buffer + sizeof(buffer); | |
48 | const UChar *srcLimit = src + length; | |
49 | int32_t result = 0; | |
50 | ||
51 | do { | |
52 | dest = buffer; | |
53 | status = U_ZERO_ERROR; | |
54 | ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status); | |
55 | result += (int32_t) (dest - buffer); | |
56 | } while (status == U_BUFFER_OVERFLOW_ERROR); | |
57 | ||
58 | return result; | |
59 | } | |
60 | ||
61 | static UChar *unescape(const char *src, int32_t *length) | |
62 | { | |
63 | int32_t charCount = u_unescape(src, NULL, 0); | |
64 | UChar *chars = NEW_ARRAY(UChar, charCount + 1); | |
65 | ||
66 | u_unescape(src, chars, charCount); | |
67 | ||
68 | *length = charCount; | |
69 | return chars; | |
70 | } | |
71 | ||
72 | static char *extractBytes(const UChar *src, int32_t length, const char *codepage, int32_t *byteLength) | |
73 | { | |
74 | UErrorCode status = U_ZERO_ERROR; | |
75 | UConverter *cnv = ucnv_open(codepage, &status); | |
76 | int32_t byteCount = preflight(src, length, cnv); | |
77 | const UChar *srcLimit = src + length; | |
78 | char *bytes = NEW_ARRAY(char, byteCount + 1); | |
79 | char *dest = bytes, *destLimit = bytes + byteCount + 1; | |
80 | ||
81 | ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status); | |
82 | ucnv_close(cnv); | |
83 | ||
84 | *byteLength = byteCount; | |
85 | return bytes; | |
86 | } | |
87 | ||
88 | static void freeBytes(char *bytes) | |
89 | { | |
90 | DELETE_ARRAY(bytes); | |
91 | } | |
92 | ||
93 | static void TestConstruction(void) | |
94 | { | |
95 | UErrorCode status = U_ZERO_ERROR; | |
96 | UCharsetDetector *csd = ucsdet_open(&status); | |
97 | UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status); | |
98 | const char *name; | |
99 | int32_t count = uenum_count(e, &status); | |
100 | int32_t i, length; | |
101 | ||
102 | for(i = 0; i < count; i += 1) { | |
103 | name = uenum_next(e, &length, &status); | |
104 | ||
105 | if(name == NULL || length <= 0) { | |
106 | log_err("ucsdet_getAllDetectableCharsets() returned a null or empty name!\n"); | |
107 | } | |
108 | } | |
109 | /* one past the list of all names must return NULL */ | |
110 | name = uenum_next(e, &length, &status); | |
111 | if(name != NULL || length != 0 || U_FAILURE(status)) { | |
112 | log_err("ucsdet_getAllDetectableCharsets(past the list) returned a non-null name!\n"); | |
113 | } | |
114 | ||
115 | uenum_close(e); | |
116 | ucsdet_close(csd); | |
117 | } | |
118 | ||
119 | static void TestUTF8(void) | |
120 | { | |
121 | UErrorCode status = U_ZERO_ERROR; | |
122 | const char *ss = "This is a string with some non-ascii characters that will " | |
123 | "be converted to UTF-8, then shoved through the detection process. " | |
124 | "\\u0391\\u0392\\u0393\\u0394\\u0395" | |
125 | "Sure would be nice if our source could contain Unicode directly!"; | |
126 | int32_t byteLength = 0, sLength = 0, dLength = 0; | |
127 | UChar *s = unescape(ss, &sLength); | |
128 | char *bytes = extractBytes(s, sLength, "UTF-8", &byteLength); | |
129 | UCharsetDetector *csd = ucsdet_open(&status); | |
130 | const UCharsetMatch *match; | |
131 | UChar *detected = NEW_ARRAY(UChar, sLength); | |
132 | ||
133 | ucsdet_setText(csd, bytes, byteLength, &status); | |
134 | match = ucsdet_detect(csd, &status); | |
135 | ||
136 | if (match == NULL) { | |
137 | log_err("Detection failure for UTF-8: got no matches.\n"); | |
138 | goto bail; | |
139 | } | |
140 | ||
141 | dLength = ucsdet_getUChars(match, detected, sLength, &status); | |
142 | ||
143 | if (u_strCompare(detected, dLength, s, sLength, FALSE) != 0) { | |
144 | log_err("Round-trip test failed!\n"); | |
145 | } | |
146 | ||
147 | ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */ | |
148 | ||
149 | bail: | |
150 | DELETE_ARRAY(detected); | |
151 | freeBytes(bytes); | |
152 | ucsdet_close(csd); | |
153 | } | |
154 | ||
155 | static void TestUTF16(void) | |
156 | { | |
157 | UErrorCode status = U_ZERO_ERROR; | |
158 | /* Notice the BOM on the start of this string */ | |
159 | UChar chars[] = { | |
160 | 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C, | |
161 | 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a, | |
162 | 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628, | |
163 | 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646, | |
164 | 0x064a, 0x062a, 0x0000}; | |
165 | int32_t beLength = 0, leLength = 0, cLength = ARRAY_SIZE(chars); | |
166 | char *beBytes = extractBytes(chars, cLength, "UTF-16BE", &beLength); | |
167 | char *leBytes = extractBytes(chars, cLength, "UTF-16LE", &leLength); | |
168 | UCharsetDetector *csd = ucsdet_open(&status); | |
169 | const UCharsetMatch *match; | |
170 | const char *name; | |
171 | int32_t conf; | |
172 | ||
173 | ucsdet_setText(csd, beBytes, beLength, &status); | |
174 | match = ucsdet_detect(csd, &status); | |
175 | ||
176 | if (match == NULL) { | |
177 | log_err("Encoding detection failure for UTF-16BE: got no matches.\n"); | |
178 | goto try_le; | |
179 | } | |
180 | ||
181 | name = ucsdet_getName(match, &status); | |
182 | conf = ucsdet_getConfidence(match, &status); | |
183 | ||
184 | if (strcmp(name, "UTF-16BE") != 0) { | |
185 | log_err("Encoding detection failure for UTF-16BE: got %s\n", name); | |
186 | } | |
187 | ||
188 | if (conf != 100) { | |
189 | log_err("Did not get 100%% confidence for UTF-16BE: got %d\n", conf); | |
190 | } | |
191 | ||
192 | try_le: | |
193 | ucsdet_setText(csd, leBytes, leLength, &status); | |
194 | match = ucsdet_detect(csd, &status); | |
195 | ||
196 | if (match == NULL) { | |
197 | log_err("Encoding detection failure for UTF-16LE: got no matches.\n"); | |
198 | goto bail; | |
199 | } | |
200 | ||
201 | name = ucsdet_getName(match, &status); | |
202 | conf = ucsdet_getConfidence(match, &status); | |
203 | ||
204 | ||
205 | if (strcmp(name, "UTF-16LE") != 0) { | |
206 | log_err("Enconding detection failure for UTF-16LE: got %s\n", name); | |
207 | } | |
208 | ||
209 | if (conf != 100) { | |
210 | log_err("Did not get 100%% confidence for UTF-16LE: got %d\n", conf); | |
211 | } | |
212 | ||
213 | bail: | |
214 | freeBytes(leBytes); | |
215 | freeBytes(beBytes); | |
216 | ucsdet_close(csd); | |
217 | } | |
218 | ||
219 | static void TestC1Bytes(void) | |
220 | { | |
221 | #if !UCONFIG_NO_LEGACY_CONVERSION | |
222 | UErrorCode status = U_ZERO_ERROR; | |
223 | const char *ssISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly."; | |
224 | const char *ssWindows = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes."; | |
225 | int32_t sISOLength = 0, sWindowsLength = 0; | |
226 | UChar *sISO = unescape(ssISO, &sISOLength); | |
227 | UChar *sWindows = unescape(ssWindows, &sWindowsLength); | |
228 | int32_t lISO = 0, lWindows = 0; | |
229 | char *bISO = extractBytes(sISO, sISOLength, "ISO-8859-1", &lISO); | |
230 | char *bWindows = extractBytes(sWindows, sWindowsLength, "windows-1252", &lWindows); | |
231 | UCharsetDetector *csd = ucsdet_open(&status); | |
232 | const UCharsetMatch *match; | |
233 | const char *name; | |
234 | ||
235 | ucsdet_setText(csd, bWindows, lWindows, &status); | |
236 | match = ucsdet_detect(csd, &status); | |
237 | ||
238 | if (match == NULL) { | |
239 | log_err("English test with C1 bytes got no matches.\n"); | |
240 | goto bail; | |
241 | } | |
242 | ||
243 | name = ucsdet_getName(match, &status); | |
244 | ||
245 | if (strcmp(name, "windows-1252") != 0) { | |
246 | log_err("English text with C1 bytes does not detect as windows-1252, but as %s\n", name); | |
247 | } | |
248 | ||
249 | ucsdet_setText(csd, bISO, lISO, &status); | |
250 | match = ucsdet_detect(csd, &status); | |
251 | ||
252 | if (match == NULL) { | |
253 | log_err("English text without C1 bytes got no matches.\n"); | |
254 | goto bail; | |
255 | } | |
256 | ||
257 | name = ucsdet_getName(match, &status); | |
258 | ||
259 | if (strcmp(name, "ISO-8859-1") != 0) { | |
260 | log_err("English text without C1 bytes does not detect as ISO-8859-1, but as %s\n", name); | |
261 | } | |
262 | ||
263 | bail: | |
264 | freeBytes(bWindows); | |
265 | freeBytes(bISO); | |
266 | ||
267 | ucsdet_close(csd); | |
268 | #endif | |
269 | } | |
270 | ||
271 | static void TestInputFilter(void) | |
272 | { | |
273 | UErrorCode status = U_ZERO_ERROR; | |
274 | const char *ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>"; | |
275 | int32_t sLength = 0; | |
276 | UChar *s = unescape(ss, &sLength); | |
277 | int32_t byteLength = 0; | |
278 | char *bytes = extractBytes(s, sLength, "ISO-8859-1", &byteLength); | |
279 | UCharsetDetector *csd = ucsdet_open(&status); | |
280 | const UCharsetMatch *match; | |
281 | const char *lang, *name; | |
282 | ||
283 | ucsdet_enableInputFilter(csd, TRUE); | |
284 | ||
285 | if (!ucsdet_isInputFilterEnabled(csd)) { | |
286 | log_err("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!\n"); | |
287 | } | |
288 | ||
289 | ||
290 | ucsdet_setText(csd, bytes, byteLength, &status); | |
291 | match = ucsdet_detect(csd, &status); | |
292 | ||
293 | if (match == NULL) { | |
294 | log_err("Turning on the input filter resulted in no matches.\n"); | |
295 | goto turn_off; | |
296 | } | |
297 | ||
298 | name = ucsdet_getName(match, &status); | |
299 | ||
300 | if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { | |
301 | log_err("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name); | |
302 | } else { | |
303 | lang = ucsdet_getLanguage(match, &status); | |
304 | ||
305 | if (lang == NULL || strcmp(lang, "fr") != 0) { | |
306 | log_err("Input filter did not strip markup!\n"); | |
307 | } | |
308 | } | |
309 | ||
310 | turn_off: | |
311 | ucsdet_enableInputFilter(csd, FALSE); | |
312 | ucsdet_setText(csd, bytes, byteLength, &status); | |
313 | match = ucsdet_detect(csd, &status); | |
314 | ||
315 | if (match == NULL) { | |
316 | log_err("Turning off the input filter resulted in no matches.\n"); | |
317 | goto bail; | |
318 | } | |
319 | ||
320 | name = ucsdet_getName(match, &status); | |
321 | ||
322 | if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { | |
323 | log_err("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name); | |
324 | } else { | |
325 | lang = ucsdet_getLanguage(match, &status); | |
326 | ||
327 | if (lang == NULL || strcmp(lang, "en") != 0) { | |
328 | log_err("Unfiltered input did not detect as English!\n"); | |
329 | } | |
330 | } | |
331 | ||
332 | bail: | |
333 | freeBytes(bytes); | |
334 | ucsdet_close(csd); | |
335 | } | |
336 | ||
337 | static void TestChaining(void) { | |
338 | UErrorCode status = U_USELESS_COLLATOR_ERROR; | |
339 | ||
340 | ucsdet_open(&status); | |
341 | ucsdet_setText(NULL, NULL, 0, &status); | |
342 | ucsdet_getName(NULL, &status); | |
343 | ucsdet_getConfidence(NULL, &status); | |
344 | ucsdet_getLanguage(NULL, &status); | |
345 | ucsdet_detect(NULL, &status); | |
346 | ucsdet_setDeclaredEncoding(NULL, NULL, 0, &status); | |
347 | ucsdet_detectAll(NULL, NULL, &status); | |
348 | ucsdet_getUChars(NULL, NULL, 0, &status); | |
349 | ucsdet_getUChars(NULL, NULL, 0, &status); | |
350 | ucsdet_close(NULL); | |
351 | ||
352 | /* All of this code should have done nothing. */ | |
353 | if (status != U_USELESS_COLLATOR_ERROR) { | |
354 | log_err("Status got changed to %s\n", u_errorName(status)); | |
355 | } | |
356 | } |