]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/intltest/csdetest.cpp
ICU-400.38.tar.gz
[apple/icu.git] / icuSources / test / intltest / csdetest.cpp
CommitLineData
73c04bcf
A
1/*
2 **********************************************************************
46f4442e 3 * Copyright (C) 2005-2008, International Business Machines
73c04bcf
A
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8
9#include "unicode/utypes.h"
10#include "unicode/ucsdet.h"
11#include "unicode/ucnv.h"
12#include "unicode/unistr.h"
13#include "unicode/putil.h"
14
15#include "intltest.h"
16#include "csdetest.h"
17
18#include "xmlparser.h"
19
20#include <stdlib.h>
21#include <string.h>
22
23#ifdef DEBUG_DETECT
24#include <stdio.h>
25#endif
26
// Element count of a true array (not a pointer). The argument is fully
// parenthesized so that expression arguments expand as intended.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// malloc()/free() based array helpers. NEW_ARRAY is wrapped in an outer pair
// of parentheses so the cast binds to the whole call even when the macro is
// used inside a larger expression (e.g. NEW_ARRAY(T, n)[0]).
#define NEW_ARRAY(type,count) ((type *) /*uprv_*/malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))

// Delimiter code units used when splitting encoding lists and
// "charset/language" pairs.
#define CH_SPACE 0x0020
#define CH_SLASH 0x002F
34
35//---------------------------------------------------------------------------
36//
37// Test class boilerplate
38//
39//---------------------------------------------------------------------------
40CharsetDetectionTest::CharsetDetectionTest()
41{
42}
43
44
45CharsetDetectionTest::~CharsetDetectionTest()
46{
47}
48
49
50
51void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
52{
53 if (exec) logln("TestSuite CharsetDetectionTest: ");
54 switch (index) {
55 case 0: name = "ConstructionTest";
56 if (exec) ConstructionTest();
57 break;
58
59 case 1: name = "UTF8Test";
60 if (exec) UTF8Test();
61 break;
62
63 case 2: name = "UTF16Test";
64 if (exec) UTF16Test();
65 break;
66
67 case 3: name = "C1BytesTest";
68 if (exec) C1BytesTest();
69 break;
70
71 case 4: name = "InputFilterTest";
72 if (exec) InputFilterTest();
73 break;
74
75 case 5: name = "DetectionTest";
76 if (exec) DetectionTest();
77 break;
78
79 default: name = "";
80 break; //needed to end loop
81 }
82}
83
84static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
85{
86 int32_t offset = -1;
87
88 splits = 1;
89 while((offset = src.indexOf(ch, offset + 1)) >= 0) {
90 splits += 1;
91 }
92
93 UnicodeString *result = new UnicodeString[splits];
94
95 int32_t start = 0;
96 int32_t split = 0;
97 int32_t end;
98
99 while((end = src.indexOf(ch, start)) >= 0) {
100 src.extractBetween(start, end, result[split++]);
101 start = end + 1;
102 }
103
104 src.extractBetween(start, src.length(), result[split]);
105
106 return result;
107}
108
109static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length)
110{
111 int32_t sLength = source.length();
112 char *bytes = NULL;
113
114 length = source.extract(0, sLength, NULL, codepage);
115
116 if (length > 0) {
117 bytes = NEW_ARRAY(char, length + 1);
118 source.extract(0, sLength, bytes, codepage);
119 }
120
121 return bytes;
122}
123
124static void freeBytes(char *bytes)
125{
126 DELETE_ARRAY(bytes);
127}
128
129void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
130{
131 int32_t splits = 0;
132 int32_t testLength = testString.length();
133 UnicodeString *eSplit = split(encoding, CH_SLASH, splits);
134 UErrorCode status = U_ZERO_ERROR;
135 int32_t cpLength = eSplit[0].length();
136 char codepage[64];
137
138 u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
139 codepage[cpLength] = '\0';
140
141 UCharsetDetector *csd = ucsdet_open(&status);
142
143 int32_t byteLength = 0;
144 char *bytes = extractBytes(testString, codepage, byteLength);
145
146 if (bytes == NULL) {
147#if !UCONFIG_NO_LEGACY_CONVERSION
148 errln("Can't open a " + encoding + " converter for " + id);
149#endif
150 return;
151 }
152
153 ucsdet_setText(csd, bytes, byteLength, &status);
154
155 int32_t matchCount = 0;
156 const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
157
158
159 UnicodeString name(ucsdet_getName(matches[0], &status));
160 UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
161 UChar *decoded = NULL;
162 int32_t dLength = 0;
163
164 if (matchCount == 0) {
165 errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
166 goto bail;
167 }
168
169 if (name.compare(eSplit[0]) != 0) {
170 errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
171
172#ifdef DEBUG_DETECT
173 for (int32_t m = 0; m < matchCount; m += 1) {
174 const char *name = ucsdet_getName(matches[m], &status);
175 const char *lang = ucsdet_getLanguage(matches[m], &status);
176 int32_t confidence = ucsdet_getConfidence(matches[m], &status);
177
178 printf("%s (%s) %d\n", name, lang, confidence);
179 }
180#endif
181 goto bail;
182 }
183
184 if (splits > 1 && lang.compare(eSplit[1]) != 0) {
185 errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
186 goto bail;
187 }
188
189 decoded = NEW_ARRAY(UChar, testLength);
190 dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);
191
192 if (testString.compare(decoded, dLength) != 0) {
193 errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");
194
195#ifdef DEBUG_DETECT
196 for(int32_t i = 0; i < testLength; i += 1) {
197 if(testString[i] != decoded[i]) {
198 printf("Strings differ at byte %d\n", i);
199 break;
200 }
201 }
202#endif
203
204 }
205
206 DELETE_ARRAY(decoded);
207
208bail:
209 freeBytes(bytes);
210 ucsdet_close(csd);
211 delete[] eSplit;
212}
213
214const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) {
215 UErrorCode status = U_ZERO_ERROR;
216 const char *testDataDirectory = IntlTest::getSourceTestData(status);
217
218 if (U_FAILURE(status)) {
219 errln("ERROR: getPath() failed - %s", u_errorName(status));
220 return NULL;
221 }
222
223 strcpy(buffer, testDataDirectory);
224 strcat(buffer, filename);
225 return buffer;
226}
227
228void CharsetDetectionTest::ConstructionTest()
229{
230 UErrorCode status = U_ZERO_ERROR;
231 UCharsetDetector *csd = ucsdet_open(&status);
232 UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
233 int32_t count = uenum_count(e, &status);
234
235#ifdef DEBUG_DETECT
236 printf("There are %d recognizers.\n", count);
237#endif
238
239 for(int32_t i = 0; i < count; i += 1) {
240 int32_t length;
241 const char *name = uenum_next(e, &length, &status);
242
243 if(name == NULL || length <= 0) {
244 errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
245 }
246
247#ifdef DEBUG_DETECT
248 printf("%s\n", name);
249#endif
250 }
251
252 uenum_close(e);
253 ucsdet_close(csd);
254}
255
256void CharsetDetectionTest::UTF8Test()
257{
258 UErrorCode status = U_ZERO_ERROR;
259 UnicodeString ss = "This is a string with some non-ascii characters that will "
260 "be converted to UTF-8, then shoved through the detection process. "
261 "\\u0391\\u0392\\u0393\\u0394\\u0395"
262 "Sure would be nice if our source could contain Unicode directly!";
263 UnicodeString s = ss.unescape();
264 int32_t byteLength = 0, sLength = s.length();
265 char *bytes = extractBytes(s, "UTF-8", byteLength);
266 UCharsetDetector *csd = ucsdet_open(&status);
267 const UCharsetMatch *match;
268 UChar *detected = NEW_ARRAY(UChar, sLength);
269
270 ucsdet_setText(csd, bytes, byteLength, &status);
271 match = ucsdet_detect(csd, &status);
272
273 if (match == NULL) {
274 errln("Detection failure for UTF-8: got no matches.");
275 goto bail;
276 }
277
278 ucsdet_getUChars(match, detected, sLength, &status);
279
280 if (s.compare(detected, sLength) != 0) {
281 errln("Round-trip test failed!");
282 }
283
284 ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
285
286bail:
287 DELETE_ARRAY(detected);
288 freeBytes(bytes);
289 ucsdet_close(csd);
290}
291
292void CharsetDetectionTest::UTF16Test()
293{
294 UErrorCode status = U_ZERO_ERROR;
295 /* Notice the BOM on the start of this string */
296 UChar chars[] = {
297 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
298 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
299 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
300 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
301 0x064a, 0x062a, 0x0000};
302 UnicodeString s(chars);
303 int32_t beLength = 0, leLength = 0;
304 char *beBytes = extractBytes(s, "UTF-16BE", beLength);
305 char *leBytes = extractBytes(s, "UTF-16LE", leLength);
306 UCharsetDetector *csd = ucsdet_open(&status);
307 const UCharsetMatch *match;
308 const char *name;
309 int32_t conf;
310
311 ucsdet_setText(csd, beBytes, beLength, &status);
312 match = ucsdet_detect(csd, &status);
313
314 if (match == NULL) {
315 errln("Encoding detection failure for UTF-16BE: got no matches.");
316 goto try_le;
317 }
318
319 name = ucsdet_getName(match, &status);
320 conf = ucsdet_getConfidence(match, &status);
321
322 if (strcmp(name, "UTF-16BE") != 0) {
323 errln("Encoding detection failure for UTF-16BE: got %s", name);
324 goto try_le; // no point in looking at confidence if we got the wrong character set.
325 }
326
327 if (conf != 100) {
328 errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
329 }
330
331try_le:
332 ucsdet_setText(csd, leBytes, leLength, &status);
333 match = ucsdet_detect(csd, &status);
334
335 if (match == NULL) {
336 errln("Encoding detection failure for UTF-16LE: got no matches.");
337 goto bail;
338 }
339
340 name = ucsdet_getName(match, &status);
341 conf = ucsdet_getConfidence(match, &status);
342
343
344 if (strcmp(name, "UTF-16LE") != 0) {
345 errln("Enconding detection failure for UTF-16LE: got %s", name);
346 goto bail; // no point in looking at confidence if we got the wrong character set.
347 }
348
349 if (conf != 100) {
350 errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
351 }
352
353bail:
354 freeBytes(leBytes);
355 freeBytes(beBytes);
356 ucsdet_close(csd);
357}
358
359void CharsetDetectionTest::InputFilterTest()
360{
361 UErrorCode status = U_ZERO_ERROR;
362 UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
363 UnicodeString s = ss.unescape();
364 int32_t byteLength = 0;
365 char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
366 UCharsetDetector *csd = ucsdet_open(&status);
367 const UCharsetMatch *match;
368 const char *lang, *name;
369
370 ucsdet_enableInputFilter(csd, TRUE);
371
372 if (!ucsdet_isInputFilterEnabled(csd)) {
373 errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
374 }
375
376
377 ucsdet_setText(csd, bytes, byteLength, &status);
378 match = ucsdet_detect(csd, &status);
379
380 if (match == NULL) {
381 errln("Turning on the input filter resulted in no matches.");
382 goto turn_off;
383 }
384
385 name = ucsdet_getName(match, &status);
386
387 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
388 errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
389 } else {
390 lang = ucsdet_getLanguage(match, &status);
391
392 if (lang == NULL || strcmp(lang, "fr") != 0) {
393 errln("Input filter did not strip markup!");
394 }
395 }
396
397turn_off:
398 ucsdet_enableInputFilter(csd, FALSE);
399 ucsdet_setText(csd, bytes, byteLength, &status);
400 match = ucsdet_detect(csd, &status);
401
402 if (match == NULL) {
403 errln("Turning off the input filter resulted in no matches.");
404 goto bail;
405 }
406
407 name = ucsdet_getName(match, &status);
408
409 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
410 errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
411 } else {
412 lang = ucsdet_getLanguage(match, &status);
413
414 if (lang == NULL || strcmp(lang, "en") != 0) {
415 errln("Unfiltered input did not detect as English!");
416 }
417 }
418
419bail:
420 freeBytes(bytes);
421 ucsdet_close(csd);
422}
423
424void CharsetDetectionTest::C1BytesTest()
425{
426#if !UCONFIG_NO_LEGACY_CONVERSION
427 UErrorCode status = U_ZERO_ERROR;
428 UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
46f4442e 429 UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
73c04bcf
A
430 UnicodeString sWindows = ssWindows.unescape();
431 int32_t lISO = 0, lWindows = 0;
432 char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
433 char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
434 UCharsetDetector *csd = ucsdet_open(&status);
435 const UCharsetMatch *match;
436 const char *name;
437
438 ucsdet_setText(csd, bWindows, lWindows, &status);
439 match = ucsdet_detect(csd, &status);
440
441 if (match == NULL) {
442 errln("English test with C1 bytes got no matches.");
443 goto bail;
444 }
445
446 name = ucsdet_getName(match, &status);
447
448 if (strcmp(name, "windows-1252") != 0) {
449 errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
450 }
451
452 ucsdet_setText(csd, bISO, lISO, &status);
453 match = ucsdet_detect(csd, &status);
454
455 if (match == NULL) {
456 errln("English text without C1 bytes got no matches.");
457 goto bail;
458 }
459
460 name = ucsdet_getName(match, &status);
461
462 if (strcmp(name, "ISO-8859-1") != 0) {
463 errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
464 }
465
466bail:
467 freeBytes(bWindows);
468 freeBytes(bISO);
469
470 ucsdet_close(csd);
471#endif
472}
473
474void CharsetDetectionTest::DetectionTest()
475{
476#if !UCONFIG_NO_REGULAR_EXPRESSIONS
477 UErrorCode status = U_ZERO_ERROR;
478 char path[2048];
479 const char *testFilePath = getPath(path, "csdetest.xml");
480
481 if (testFilePath == NULL) {
482 return; /* Couldn't get path: error message already output. */
483 }
484
485 UXMLParser *parser = UXMLParser::createParser(status);
486 if (!assertSuccess("UXMLParser::createParser",status)) return;
487 UXMLElement *root = parser->parseFile(testFilePath, status);
488 if (!assertSuccess( "parseFile",status)) return;
489
490 UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
491 UnicodeString id_attr = UNICODE_STRING_SIMPLE("id");
492 UnicodeString enc_attr = UNICODE_STRING_SIMPLE("encodings");
493
494 const UXMLElement *testCase;
495 int32_t tc = 0;
496
497 while((testCase = root->nextChildElement(tc)) != NULL) {
498 if (testCase->getTagName().compare(test_case) == 0) {
499 const UnicodeString *id = testCase->getAttribute(id_attr);
500 const UnicodeString *encodings = testCase->getAttribute(enc_attr);
501 const UnicodeString text = testCase->getText(TRUE);
502 int32_t encodingCount;
503 UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount);
504
505 for(int32_t e = 0; e < encodingCount; e += 1) {
506 checkEncoding(text, encodingList[e], *id);
507 }
508
509 delete[] encodingList;
510 }
511 }
512
513 delete root;
514 delete parser;
515#endif
516}
517
518