]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/cintltst/ucsdetst.c
ICU-8.11.4.tar.gz
[apple/icu.git] / icuSources / test / cintltst / ucsdetst.c
CommitLineData
73c04bcf
A
1/*
2 ****************************************************************************
3 * Copyright (c) 2005-2006, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 ****************************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#include "unicode/ucsdet.h"
11#include "unicode/ucnv.h"
12#include "unicode/ustring.h"
13
14#include "cintltst.h"
15
16#include <stdlib.h>
17#include <string.h>
18
/* Number of elements in a real array object (do not use on a pointer). */
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

/* Allocates room for `count` objects of `type` through ctst_malloc(). */
#define NEW_ARRAY(type,count) ((type *) ctst_malloc((count) * sizeof(type)))

/*
 * Deliberately releases nothing; it only references its argument so that the
 * variable is not reported as unused.
 * NOTE(review): presumably memory obtained from ctst_malloc() is reclaimed by
 * the test framework -- confirm before turning this into a real free.
 */
#define DELETE_ARRAY(array) ((void) (array))
23
24static void TestConstruction(void);
25static void TestUTF8(void);
26static void TestUTF16(void);
27static void TestC1Bytes(void);
28static void TestInputFilter(void);
29static void TestChaining(void);
30
31void addUCsdetTest(TestNode** root);
32
33void addUCsdetTest(TestNode** root)
34{
35 addTest(root, &TestConstruction, "ucsdetst/TestConstruction");
36 addTest(root, &TestUTF8, "ucsdetst/TestUTF8");
37 addTest(root, &TestUTF16, "ucsdetst/TestUTF16");
38 addTest(root, &TestC1Bytes, "ucsdetst/TestC1Bytes");
39 addTest(root, &TestInputFilter, "ucsdetst/TestInputFilter");
40 addTest(root, &TestChaining, "ucsdetst/TestErrorChaining");
41}
42
43static int32_t preflight(const UChar *src, int32_t length, UConverter *cnv)
44{
45 UErrorCode status;
46 char buffer[1024];
47 char *dest, *destLimit = buffer + sizeof(buffer);
48 const UChar *srcLimit = src + length;
49 int32_t result = 0;
50
51 do {
52 dest = buffer;
53 status = U_ZERO_ERROR;
54 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
55 result += (int32_t) (dest - buffer);
56 } while (status == U_BUFFER_OVERFLOW_ERROR);
57
58 return result;
59}
60
61static UChar *unescape(const char *src, int32_t *length)
62{
63 int32_t charCount = u_unescape(src, NULL, 0);
64 UChar *chars = NEW_ARRAY(UChar, charCount + 1);
65
66 u_unescape(src, chars, charCount);
67
68 *length = charCount;
69 return chars;
70}
71
72static char *extractBytes(const UChar *src, int32_t length, const char *codepage, int32_t *byteLength)
73{
74 UErrorCode status = U_ZERO_ERROR;
75 UConverter *cnv = ucnv_open(codepage, &status);
76 int32_t byteCount = preflight(src, length, cnv);
77 const UChar *srcLimit = src + length;
78 char *bytes = NEW_ARRAY(char, byteCount + 1);
79 char *dest = bytes, *destLimit = bytes + byteCount + 1;
80
81 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
82 ucnv_close(cnv);
83
84 *byteLength = byteCount;
85 return bytes;
86}
87
/*
 * Counterpart of extractBytes(): disposes of a buffer it returned.
 * It simply forwards to DELETE_ARRAY(), which does not release anything in
 * this file, so calling it is harmless for any pointer value.
 */
static void freeBytes(char *bytes)
{
    DELETE_ARRAY(bytes);
}
92
93static void TestConstruction(void)
94{
95 UErrorCode status = U_ZERO_ERROR;
96 UCharsetDetector *csd = ucsdet_open(&status);
97 UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
98 const char *name;
99 int32_t count = uenum_count(e, &status);
100 int32_t i, length;
101
102 for(i = 0; i < count; i += 1) {
103 name = uenum_next(e, &length, &status);
104
105 if(name == NULL || length <= 0) {
106 log_err("ucsdet_getAllDetectableCharsets() returned a null or empty name!\n");
107 }
108 }
109 /* one past the list of all names must return NULL */
110 name = uenum_next(e, &length, &status);
111 if(name != NULL || length != 0 || U_FAILURE(status)) {
112 log_err("ucsdet_getAllDetectableCharsets(past the list) returned a non-null name!\n");
113 }
114
115 uenum_close(e);
116 ucsdet_close(csd);
117}
118
119static void TestUTF8(void)
120{
121 UErrorCode status = U_ZERO_ERROR;
122 const char *ss = "This is a string with some non-ascii characters that will "
123 "be converted to UTF-8, then shoved through the detection process. "
124 "\\u0391\\u0392\\u0393\\u0394\\u0395"
125 "Sure would be nice if our source could contain Unicode directly!";
126 int32_t byteLength = 0, sLength = 0, dLength = 0;
127 UChar *s = unescape(ss, &sLength);
128 char *bytes = extractBytes(s, sLength, "UTF-8", &byteLength);
129 UCharsetDetector *csd = ucsdet_open(&status);
130 const UCharsetMatch *match;
131 UChar *detected = NEW_ARRAY(UChar, sLength);
132
133 ucsdet_setText(csd, bytes, byteLength, &status);
134 match = ucsdet_detect(csd, &status);
135
136 if (match == NULL) {
137 log_err("Detection failure for UTF-8: got no matches.\n");
138 goto bail;
139 }
140
141 dLength = ucsdet_getUChars(match, detected, sLength, &status);
142
143 if (u_strCompare(detected, dLength, s, sLength, FALSE) != 0) {
144 log_err("Round-trip test failed!\n");
145 }
146
147 ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
148
149bail:
150 DELETE_ARRAY(detected);
151 freeBytes(bytes);
152 ucsdet_close(csd);
153}
154
155static void TestUTF16(void)
156{
157 UErrorCode status = U_ZERO_ERROR;
158 /* Notice the BOM on the start of this string */
159 UChar chars[] = {
160 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
161 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
162 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
163 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
164 0x064a, 0x062a, 0x0000};
165 int32_t beLength = 0, leLength = 0, cLength = ARRAY_SIZE(chars);
166 char *beBytes = extractBytes(chars, cLength, "UTF-16BE", &beLength);
167 char *leBytes = extractBytes(chars, cLength, "UTF-16LE", &leLength);
168 UCharsetDetector *csd = ucsdet_open(&status);
169 const UCharsetMatch *match;
170 const char *name;
171 int32_t conf;
172
173 ucsdet_setText(csd, beBytes, beLength, &status);
174 match = ucsdet_detect(csd, &status);
175
176 if (match == NULL) {
177 log_err("Encoding detection failure for UTF-16BE: got no matches.\n");
178 goto try_le;
179 }
180
181 name = ucsdet_getName(match, &status);
182 conf = ucsdet_getConfidence(match, &status);
183
184 if (strcmp(name, "UTF-16BE") != 0) {
185 log_err("Encoding detection failure for UTF-16BE: got %s\n", name);
186 }
187
188 if (conf != 100) {
189 log_err("Did not get 100%% confidence for UTF-16BE: got %d\n", conf);
190 }
191
192try_le:
193 ucsdet_setText(csd, leBytes, leLength, &status);
194 match = ucsdet_detect(csd, &status);
195
196 if (match == NULL) {
197 log_err("Encoding detection failure for UTF-16LE: got no matches.\n");
198 goto bail;
199 }
200
201 name = ucsdet_getName(match, &status);
202 conf = ucsdet_getConfidence(match, &status);
203
204
205 if (strcmp(name, "UTF-16LE") != 0) {
206 log_err("Enconding detection failure for UTF-16LE: got %s\n", name);
207 }
208
209 if (conf != 100) {
210 log_err("Did not get 100%% confidence for UTF-16LE: got %d\n", conf);
211 }
212
213bail:
214 freeBytes(leBytes);
215 freeBytes(beBytes);
216 ucsdet_close(csd);
217}
218
219static void TestC1Bytes(void)
220{
221#if !UCONFIG_NO_LEGACY_CONVERSION
222 UErrorCode status = U_ZERO_ERROR;
223 const char *ssISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
224 const char *ssWindows = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.";
225 int32_t sISOLength = 0, sWindowsLength = 0;
226 UChar *sISO = unescape(ssISO, &sISOLength);
227 UChar *sWindows = unescape(ssWindows, &sWindowsLength);
228 int32_t lISO = 0, lWindows = 0;
229 char *bISO = extractBytes(sISO, sISOLength, "ISO-8859-1", &lISO);
230 char *bWindows = extractBytes(sWindows, sWindowsLength, "windows-1252", &lWindows);
231 UCharsetDetector *csd = ucsdet_open(&status);
232 const UCharsetMatch *match;
233 const char *name;
234
235 ucsdet_setText(csd, bWindows, lWindows, &status);
236 match = ucsdet_detect(csd, &status);
237
238 if (match == NULL) {
239 log_err("English test with C1 bytes got no matches.\n");
240 goto bail;
241 }
242
243 name = ucsdet_getName(match, &status);
244
245 if (strcmp(name, "windows-1252") != 0) {
246 log_err("English text with C1 bytes does not detect as windows-1252, but as %s\n", name);
247 }
248
249 ucsdet_setText(csd, bISO, lISO, &status);
250 match = ucsdet_detect(csd, &status);
251
252 if (match == NULL) {
253 log_err("English text without C1 bytes got no matches.\n");
254 goto bail;
255 }
256
257 name = ucsdet_getName(match, &status);
258
259 if (strcmp(name, "ISO-8859-1") != 0) {
260 log_err("English text without C1 bytes does not detect as ISO-8859-1, but as %s\n", name);
261 }
262
263bail:
264 freeBytes(bWindows);
265 freeBytes(bISO);
266
267 ucsdet_close(csd);
268#endif
269}
270
271static void TestInputFilter(void)
272{
273 UErrorCode status = U_ZERO_ERROR;
274 const char *ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
275 int32_t sLength = 0;
276 UChar *s = unescape(ss, &sLength);
277 int32_t byteLength = 0;
278 char *bytes = extractBytes(s, sLength, "ISO-8859-1", &byteLength);
279 UCharsetDetector *csd = ucsdet_open(&status);
280 const UCharsetMatch *match;
281 const char *lang, *name;
282
283 ucsdet_enableInputFilter(csd, TRUE);
284
285 if (!ucsdet_isInputFilterEnabled(csd)) {
286 log_err("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!\n");
287 }
288
289
290 ucsdet_setText(csd, bytes, byteLength, &status);
291 match = ucsdet_detect(csd, &status);
292
293 if (match == NULL) {
294 log_err("Turning on the input filter resulted in no matches.\n");
295 goto turn_off;
296 }
297
298 name = ucsdet_getName(match, &status);
299
300 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
301 log_err("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name);
302 } else {
303 lang = ucsdet_getLanguage(match, &status);
304
305 if (lang == NULL || strcmp(lang, "fr") != 0) {
306 log_err("Input filter did not strip markup!\n");
307 }
308 }
309
310turn_off:
311 ucsdet_enableInputFilter(csd, FALSE);
312 ucsdet_setText(csd, bytes, byteLength, &status);
313 match = ucsdet_detect(csd, &status);
314
315 if (match == NULL) {
316 log_err("Turning off the input filter resulted in no matches.\n");
317 goto bail;
318 }
319
320 name = ucsdet_getName(match, &status);
321
322 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
323 log_err("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name);
324 } else {
325 lang = ucsdet_getLanguage(match, &status);
326
327 if (lang == NULL || strcmp(lang, "en") != 0) {
328 log_err("Unfiltered input did not detect as English!\n");
329 }
330 }
331
332bail:
333 freeBytes(bytes);
334 ucsdet_close(csd);
335}
336
337static void TestChaining(void) {
338 UErrorCode status = U_USELESS_COLLATOR_ERROR;
339
340 ucsdet_open(&status);
341 ucsdet_setText(NULL, NULL, 0, &status);
342 ucsdet_getName(NULL, &status);
343 ucsdet_getConfidence(NULL, &status);
344 ucsdet_getLanguage(NULL, &status);
345 ucsdet_detect(NULL, &status);
346 ucsdet_setDeclaredEncoding(NULL, NULL, 0, &status);
347 ucsdet_detectAll(NULL, NULL, &status);
348 ucsdet_getUChars(NULL, NULL, 0, &status);
349 ucsdet_getUChars(NULL, NULL, 0, &status);
350 ucsdet_close(NULL);
351
352 /* All of this code should have done nothing. */
353 if (status != U_USELESS_COLLATOR_ERROR) {
354 log_err("Status got changed to %s\n", u_errorName(status));
355 }
356}