]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/cintltst/ucsdetst.c
ICU-400.39.tar.gz
[apple/icu.git] / icuSources / test / cintltst / ucsdetst.c
CommitLineData
73c04bcf
A
/*
 ****************************************************************************
 * Copyright (c) 2005-2008, International Business Machines Corporation and *
 * others. All Rights Reserved.                                             *
 ****************************************************************************
 */
7
8#include "unicode/utypes.h"
9
10#include "unicode/ucsdet.h"
11#include "unicode/ucnv.h"
12#include "unicode/ustring.h"
13
14#include "cintltst.h"
15
16#include <stdlib.h>
17#include <string.h>
18
/*
 * Number of elements in a true array (not a pointer). The result is cast to
 * int32_t because the callers in this file store it in, and compare it with,
 * int32_t values; an unsigned size_t result would force signed/unsigned
 * comparisons.
 */
#define ARRAY_SIZE(array) ((int32_t) (sizeof(array) / sizeof((array)[0])))

/* Allocate an uninitialized array of count elements of type; NULL on failure. */
#define NEW_ARRAY(type,count) ((type *) malloc((count) * sizeof(type)))

/* Release an array obtained from NEW_ARRAY. Passing NULL is harmless. */
#define DELETE_ARRAY(array) free(array)
73c04bcf
A
23
/* Forward declarations of the test cases registered by addUCsdetTest(). */
static void TestConstruction(void);
static void TestUTF8(void);
static void TestUTF16(void);
static void TestC1Bytes(void);
static void TestInputFilter(void);
static void TestChaining(void);
static void TestBufferOverflow(void);
73c04bcf
A
31
32void addUCsdetTest(TestNode** root);
33
34void addUCsdetTest(TestNode** root)
35{
36 addTest(root, &TestConstruction, "ucsdetst/TestConstruction");
37 addTest(root, &TestUTF8, "ucsdetst/TestUTF8");
38 addTest(root, &TestUTF16, "ucsdetst/TestUTF16");
39 addTest(root, &TestC1Bytes, "ucsdetst/TestC1Bytes");
40 addTest(root, &TestInputFilter, "ucsdetst/TestInputFilter");
41 addTest(root, &TestChaining, "ucsdetst/TestErrorChaining");
46f4442e 42 addTest(root, &TestBufferOverflow, "ucsdetst/TestBufferOverflow");
73c04bcf
A
43}
44
45static int32_t preflight(const UChar *src, int32_t length, UConverter *cnv)
46{
47 UErrorCode status;
48 char buffer[1024];
49 char *dest, *destLimit = buffer + sizeof(buffer);
50 const UChar *srcLimit = src + length;
51 int32_t result = 0;
52
53 do {
54 dest = buffer;
55 status = U_ZERO_ERROR;
56 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
57 result += (int32_t) (dest - buffer);
58 } while (status == U_BUFFER_OVERFLOW_ERROR);
59
60 return result;
61}
62
73c04bcf
A
63static char *extractBytes(const UChar *src, int32_t length, const char *codepage, int32_t *byteLength)
64{
65 UErrorCode status = U_ZERO_ERROR;
66 UConverter *cnv = ucnv_open(codepage, &status);
67 int32_t byteCount = preflight(src, length, cnv);
68 const UChar *srcLimit = src + length;
69 char *bytes = NEW_ARRAY(char, byteCount + 1);
70 char *dest = bytes, *destLimit = bytes + byteCount + 1;
71
72 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
73 ucnv_close(cnv);
74
75 *byteLength = byteCount;
76 return bytes;
77}
78
/* Releases a byte buffer returned by extractBytes(). */
static void freeBytes(char *bytes)
{
    DELETE_ARRAY(bytes);
}
83
84static void TestConstruction(void)
85{
86 UErrorCode status = U_ZERO_ERROR;
87 UCharsetDetector *csd = ucsdet_open(&status);
88 UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
89 const char *name;
90 int32_t count = uenum_count(e, &status);
91 int32_t i, length;
92
93 for(i = 0; i < count; i += 1) {
94 name = uenum_next(e, &length, &status);
95
96 if(name == NULL || length <= 0) {
97 log_err("ucsdet_getAllDetectableCharsets() returned a null or empty name!\n");
98 }
99 }
100 /* one past the list of all names must return NULL */
101 name = uenum_next(e, &length, &status);
102 if(name != NULL || length != 0 || U_FAILURE(status)) {
103 log_err("ucsdet_getAllDetectableCharsets(past the list) returned a non-null name!\n");
104 }
105
106 uenum_close(e);
107 ucsdet_close(csd);
108}
109
110static void TestUTF8(void)
111{
112 UErrorCode status = U_ZERO_ERROR;
46f4442e 113 static const char ss[] = "This is a string with some non-ascii characters that will "
73c04bcf
A
114 "be converted to UTF-8, then shoved through the detection process. "
115 "\\u0391\\u0392\\u0393\\u0394\\u0395"
116 "Sure would be nice if our source could contain Unicode directly!";
117 int32_t byteLength = 0, sLength = 0, dLength = 0;
46f4442e
A
118 UChar s[sizeof(ss)];
119 char *bytes;
73c04bcf
A
120 UCharsetDetector *csd = ucsdet_open(&status);
121 const UCharsetMatch *match;
46f4442e
A
122 UChar detected[sizeof(ss)];
123
124 sLength = u_unescape(ss, s, sizeof(ss));
125 bytes = extractBytes(s, sLength, "UTF-8", &byteLength);
73c04bcf
A
126
127 ucsdet_setText(csd, bytes, byteLength, &status);
46f4442e
A
128 if (U_FAILURE(status)) {
129 log_err("status is %s\n", u_errorName(status));
130 goto bail;
131 }
132
73c04bcf
A
133 match = ucsdet_detect(csd, &status);
134
135 if (match == NULL) {
136 log_err("Detection failure for UTF-8: got no matches.\n");
137 goto bail;
138 }
139
140 dLength = ucsdet_getUChars(match, detected, sLength, &status);
141
142 if (u_strCompare(detected, dLength, s, sLength, FALSE) != 0) {
143 log_err("Round-trip test failed!\n");
144 }
145
146 ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
147
148bail:
73c04bcf
A
149 freeBytes(bytes);
150 ucsdet_close(csd);
151}
152
153static void TestUTF16(void)
154{
155 UErrorCode status = U_ZERO_ERROR;
156 /* Notice the BOM on the start of this string */
46f4442e 157 static const UChar chars[] = {
73c04bcf
A
158 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
159 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
160 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
161 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
162 0x064a, 0x062a, 0x0000};
163 int32_t beLength = 0, leLength = 0, cLength = ARRAY_SIZE(chars);
164 char *beBytes = extractBytes(chars, cLength, "UTF-16BE", &beLength);
165 char *leBytes = extractBytes(chars, cLength, "UTF-16LE", &leLength);
166 UCharsetDetector *csd = ucsdet_open(&status);
167 const UCharsetMatch *match;
168 const char *name;
169 int32_t conf;
170
171 ucsdet_setText(csd, beBytes, beLength, &status);
172 match = ucsdet_detect(csd, &status);
173
174 if (match == NULL) {
175 log_err("Encoding detection failure for UTF-16BE: got no matches.\n");
176 goto try_le;
177 }
178
179 name = ucsdet_getName(match, &status);
180 conf = ucsdet_getConfidence(match, &status);
181
182 if (strcmp(name, "UTF-16BE") != 0) {
183 log_err("Encoding detection failure for UTF-16BE: got %s\n", name);
184 }
185
186 if (conf != 100) {
187 log_err("Did not get 100%% confidence for UTF-16BE: got %d\n", conf);
188 }
189
190try_le:
191 ucsdet_setText(csd, leBytes, leLength, &status);
192 match = ucsdet_detect(csd, &status);
193
194 if (match == NULL) {
195 log_err("Encoding detection failure for UTF-16LE: got no matches.\n");
196 goto bail;
197 }
198
199 name = ucsdet_getName(match, &status);
200 conf = ucsdet_getConfidence(match, &status);
201
202
203 if (strcmp(name, "UTF-16LE") != 0) {
204 log_err("Enconding detection failure for UTF-16LE: got %s\n", name);
205 }
206
207 if (conf != 100) {
208 log_err("Did not get 100%% confidence for UTF-16LE: got %d\n", conf);
209 }
210
211bail:
212 freeBytes(leBytes);
213 freeBytes(beBytes);
214 ucsdet_close(csd);
215}
216
217static void TestC1Bytes(void)
218{
219#if !UCONFIG_NO_LEGACY_CONVERSION
220 UErrorCode status = U_ZERO_ERROR;
46f4442e
A
221 static const char ssISO[] = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
222 static const char ssWindows[] = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.";
73c04bcf 223 int32_t sISOLength = 0, sWindowsLength = 0;
46f4442e
A
224 UChar sISO[sizeof(ssISO)];
225 UChar sWindows[sizeof(ssWindows)];
73c04bcf 226 int32_t lISO = 0, lWindows = 0;
46f4442e
A
227 char *bISO;
228 char *bWindows;
73c04bcf
A
229 UCharsetDetector *csd = ucsdet_open(&status);
230 const UCharsetMatch *match;
231 const char *name;
232
46f4442e
A
233 sISOLength = u_unescape(ssISO, sISO, sizeof(ssISO));
234 sWindowsLength = u_unescape(ssWindows, sWindows, sizeof(ssWindows));
235 bISO = extractBytes(sISO, sISOLength, "ISO-8859-1", &lISO);
236 bWindows = extractBytes(sWindows, sWindowsLength, "windows-1252", &lWindows);
237
73c04bcf
A
238 ucsdet_setText(csd, bWindows, lWindows, &status);
239 match = ucsdet_detect(csd, &status);
240
241 if (match == NULL) {
242 log_err("English test with C1 bytes got no matches.\n");
243 goto bail;
244 }
245
246 name = ucsdet_getName(match, &status);
247
248 if (strcmp(name, "windows-1252") != 0) {
249 log_err("English text with C1 bytes does not detect as windows-1252, but as %s\n", name);
250 }
251
252 ucsdet_setText(csd, bISO, lISO, &status);
253 match = ucsdet_detect(csd, &status);
254
255 if (match == NULL) {
256 log_err("English text without C1 bytes got no matches.\n");
257 goto bail;
258 }
259
260 name = ucsdet_getName(match, &status);
261
262 if (strcmp(name, "ISO-8859-1") != 0) {
263 log_err("English text without C1 bytes does not detect as ISO-8859-1, but as %s\n", name);
264 }
265
266bail:
267 freeBytes(bWindows);
268 freeBytes(bISO);
269
270 ucsdet_close(csd);
271#endif
272}
273
274static void TestInputFilter(void)
275{
276 UErrorCode status = U_ZERO_ERROR;
46f4442e 277 static const char ss[] = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
73c04bcf 278 int32_t sLength = 0;
46f4442e 279 UChar s[sizeof(ss)];
73c04bcf 280 int32_t byteLength = 0;
46f4442e 281 char *bytes;
73c04bcf
A
282 UCharsetDetector *csd = ucsdet_open(&status);
283 const UCharsetMatch *match;
284 const char *lang, *name;
285
46f4442e
A
286 sLength = u_unescape(ss, s, sizeof(ss));
287 bytes = extractBytes(s, sLength, "ISO-8859-1", &byteLength);
288
73c04bcf
A
289 ucsdet_enableInputFilter(csd, TRUE);
290
291 if (!ucsdet_isInputFilterEnabled(csd)) {
292 log_err("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!\n");
293 }
294
295
296 ucsdet_setText(csd, bytes, byteLength, &status);
297 match = ucsdet_detect(csd, &status);
298
299 if (match == NULL) {
300 log_err("Turning on the input filter resulted in no matches.\n");
301 goto turn_off;
302 }
303
304 name = ucsdet_getName(match, &status);
305
306 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
307 log_err("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name);
308 } else {
309 lang = ucsdet_getLanguage(match, &status);
310
311 if (lang == NULL || strcmp(lang, "fr") != 0) {
312 log_err("Input filter did not strip markup!\n");
313 }
314 }
315
316turn_off:
317 ucsdet_enableInputFilter(csd, FALSE);
318 ucsdet_setText(csd, bytes, byteLength, &status);
319 match = ucsdet_detect(csd, &status);
320
321 if (match == NULL) {
322 log_err("Turning off the input filter resulted in no matches.\n");
323 goto bail;
324 }
325
326 name = ucsdet_getName(match, &status);
327
328 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
329 log_err("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name);
330 } else {
331 lang = ucsdet_getLanguage(match, &status);
332
333 if (lang == NULL || strcmp(lang, "en") != 0) {
334 log_err("Unfiltered input did not detect as English!\n");
335 }
336 }
337
338bail:
339 freeBytes(bytes);
340 ucsdet_close(csd);
341}
342
343static void TestChaining(void) {
344 UErrorCode status = U_USELESS_COLLATOR_ERROR;
345
346 ucsdet_open(&status);
347 ucsdet_setText(NULL, NULL, 0, &status);
348 ucsdet_getName(NULL, &status);
349 ucsdet_getConfidence(NULL, &status);
350 ucsdet_getLanguage(NULL, &status);
351 ucsdet_detect(NULL, &status);
352 ucsdet_setDeclaredEncoding(NULL, NULL, 0, &status);
353 ucsdet_detectAll(NULL, NULL, &status);
354 ucsdet_getUChars(NULL, NULL, 0, &status);
355 ucsdet_getUChars(NULL, NULL, 0, &status);
356 ucsdet_close(NULL);
357
358 /* All of this code should have done nothing. */
359 if (status != U_USELESS_COLLATOR_ERROR) {
360 log_err("Status got changed to %s\n", u_errorName(status));
361 }
362}
46f4442e
A
363
364static void TestBufferOverflow(void) {
365 UErrorCode status = U_ZERO_ERROR;
366 static const char *testStrings[] = {
367 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b", /* A partial ISO-2022 shift state at the end */
368 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24", /* A partial ISO-2022 shift state at the end */
369 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28", /* A partial ISO-2022 shift state at the end */
370 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end with a bad one at the start */
371 "\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end */
372 "\xa1", /* Could be a single byte shift-jis at the end */
373 "\x74\x68\xa1", /* Could be a single byte shift-jis at the end */
374 "\x74\x68\x65\xa1" /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
375 };
376 static const char *testResults[] = {
377 "windows-1252",
378 "windows-1252",
379 "windows-1252",
380 "windows-1252",
381 "ISO-2022-JP",
382 NULL,
383 NULL,
384 "ISO-8859-1"
385 };
386 int32_t idx = 0;
387 UCharsetDetector *csd = ucsdet_open(&status);
388 const UCharsetMatch *match;
389
390 ucsdet_setDeclaredEncoding(csd, "ISO-2022-JP", -1, &status);
391
392 if (U_FAILURE(status)) {
393 log_err("Couldn't open detector. %s\n", u_errorName(status));
394 goto bail;
395 }
396
397 for (idx = 0; idx < ARRAY_SIZE(testStrings); idx++) {
398 ucsdet_setText(csd, testStrings[idx], -1, &status);
399 match = ucsdet_detect(csd, &status);
400
401 if (match == NULL) {
402 if (testResults[idx] != NULL) {
403 log_err("Unexpectedly got no results at index %d.\n", idx);
404 }
405 else {
406 log_verbose("Got no result as expected at index %d.\n", idx);
407 }
408 continue;
409 }
410
411 if (testResults[idx] == NULL || strcmp(ucsdet_getName(match, &status), testResults[idx]) != 0) {
412 log_err("Unexpectedly got %s instead of %s at index %d with confidence %d.\n",
413 ucsdet_getName(match, &status), testResults[idx], idx, ucsdet_getConfidence(match, &status));
414 goto bail;
415 }
416 }
417
418bail:
419 ucsdet_close(csd);
420}
421