]>
Commit | Line | Data |
---|---|---|
/*
 **********************************************************************
 *   Copyright (C) 2005-2008, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */
7 | ||
8 | ||
9 | #include "unicode/utypes.h" | |
10 | #include "unicode/ucsdet.h" | |
11 | #include "unicode/ucnv.h" | |
12 | #include "unicode/unistr.h" | |
13 | #include "unicode/putil.h" | |
14 | ||
15 | #include "intltest.h" | |
16 | #include "csdetest.h" | |
17 | ||
18 | #include "xmlparser.h" | |
19 | ||
20 | #include <stdlib.h> | |
21 | #include <string.h> | |
22 | ||
23 | #ifdef DEBUG_DETECT | |
24 | #include <stdio.h> | |
25 | #endif | |
26 | ||
// Number of elements in a true array (not a pointer). The argument is fully
// parenthesized so that expressions such as ARRAY_SIZE(*ptrToArray) expand safely.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Raw, untyped heap helpers built on malloc()/free(). Memory obtained with
// NEW_ARRAY must be released with DELETE_ARRAY (never with delete[]), and no
// constructors or destructors are run for the elements.
// The whole NEW_ARRAY expansion is parenthesized so the result can be used
// inside a larger expression without precedence surprises.
#define NEW_ARRAY(type,count) ((type *) /*uprv_*/malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))

// Separator characters used when splitting the "encodings" attribute:
// entries are separated by spaces, and each entry is "charset[/language]".
#define CH_SPACE 0x0020
#define CH_SLASH 0x002F
34 | ||
35 | //--------------------------------------------------------------------------- | |
36 | // | |
37 | // Test class boilerplate | |
38 | // | |
39 | //--------------------------------------------------------------------------- | |
40 | CharsetDetectionTest::CharsetDetectionTest() | |
41 | { | |
42 | } | |
43 | ||
44 | ||
45 | CharsetDetectionTest::~CharsetDetectionTest() | |
46 | { | |
47 | } | |
48 | ||
49 | ||
50 | ||
51 | void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) | |
52 | { | |
53 | if (exec) logln("TestSuite CharsetDetectionTest: "); | |
54 | switch (index) { | |
55 | case 0: name = "ConstructionTest"; | |
56 | if (exec) ConstructionTest(); | |
57 | break; | |
58 | ||
59 | case 1: name = "UTF8Test"; | |
60 | if (exec) UTF8Test(); | |
61 | break; | |
62 | ||
63 | case 2: name = "UTF16Test"; | |
64 | if (exec) UTF16Test(); | |
65 | break; | |
66 | ||
67 | case 3: name = "C1BytesTest"; | |
68 | if (exec) C1BytesTest(); | |
69 | break; | |
70 | ||
71 | case 4: name = "InputFilterTest"; | |
72 | if (exec) InputFilterTest(); | |
73 | break; | |
74 | ||
75 | case 5: name = "DetectionTest"; | |
76 | if (exec) DetectionTest(); | |
77 | break; | |
78 | ||
79 | default: name = ""; | |
80 | break; //needed to end loop | |
81 | } | |
82 | } | |
83 | ||
84 | static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits) | |
85 | { | |
86 | int32_t offset = -1; | |
87 | ||
88 | splits = 1; | |
89 | while((offset = src.indexOf(ch, offset + 1)) >= 0) { | |
90 | splits += 1; | |
91 | } | |
92 | ||
93 | UnicodeString *result = new UnicodeString[splits]; | |
94 | ||
95 | int32_t start = 0; | |
96 | int32_t split = 0; | |
97 | int32_t end; | |
98 | ||
99 | while((end = src.indexOf(ch, start)) >= 0) { | |
100 | src.extractBetween(start, end, result[split++]); | |
101 | start = end + 1; | |
102 | } | |
103 | ||
104 | src.extractBetween(start, src.length(), result[split]); | |
105 | ||
106 | return result; | |
107 | } | |
108 | ||
109 | static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length) | |
110 | { | |
111 | int32_t sLength = source.length(); | |
112 | char *bytes = NULL; | |
113 | ||
114 | length = source.extract(0, sLength, NULL, codepage); | |
115 | ||
116 | if (length > 0) { | |
117 | bytes = NEW_ARRAY(char, length + 1); | |
118 | source.extract(0, sLength, bytes, codepage); | |
119 | } | |
120 | ||
121 | return bytes; | |
122 | } | |
123 | ||
124 | static void freeBytes(char *bytes) | |
125 | { | |
126 | DELETE_ARRAY(bytes); | |
127 | } | |
128 | ||
129 | void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id) | |
130 | { | |
131 | int32_t splits = 0; | |
132 | int32_t testLength = testString.length(); | |
133 | UnicodeString *eSplit = split(encoding, CH_SLASH, splits); | |
134 | UErrorCode status = U_ZERO_ERROR; | |
135 | int32_t cpLength = eSplit[0].length(); | |
136 | char codepage[64]; | |
137 | ||
138 | u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength); | |
139 | codepage[cpLength] = '\0'; | |
140 | ||
141 | UCharsetDetector *csd = ucsdet_open(&status); | |
142 | ||
143 | int32_t byteLength = 0; | |
144 | char *bytes = extractBytes(testString, codepage, byteLength); | |
145 | ||
146 | if (bytes == NULL) { | |
147 | #if !UCONFIG_NO_LEGACY_CONVERSION | |
148 | errln("Can't open a " + encoding + " converter for " + id); | |
149 | #endif | |
150 | return; | |
151 | } | |
152 | ||
153 | ucsdet_setText(csd, bytes, byteLength, &status); | |
154 | ||
155 | int32_t matchCount = 0; | |
156 | const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status); | |
157 | ||
158 | ||
159 | UnicodeString name(ucsdet_getName(matches[0], &status)); | |
160 | UnicodeString lang(ucsdet_getLanguage(matches[0], &status)); | |
161 | UChar *decoded = NULL; | |
162 | int32_t dLength = 0; | |
163 | ||
164 | if (matchCount == 0) { | |
165 | errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches"); | |
166 | goto bail; | |
167 | } | |
168 | ||
169 | if (name.compare(eSplit[0]) != 0) { | |
170 | errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name); | |
171 | ||
172 | #ifdef DEBUG_DETECT | |
173 | for (int32_t m = 0; m < matchCount; m += 1) { | |
174 | const char *name = ucsdet_getName(matches[m], &status); | |
175 | const char *lang = ucsdet_getLanguage(matches[m], &status); | |
176 | int32_t confidence = ucsdet_getConfidence(matches[m], &status); | |
177 | ||
178 | printf("%s (%s) %d\n", name, lang, confidence); | |
179 | } | |
180 | #endif | |
181 | goto bail; | |
182 | } | |
183 | ||
184 | if (splits > 1 && lang.compare(eSplit[1]) != 0) { | |
185 | errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang); | |
186 | goto bail; | |
187 | } | |
188 | ||
189 | decoded = NEW_ARRAY(UChar, testLength); | |
190 | dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status); | |
191 | ||
192 | if (testString.compare(decoded, dLength) != 0) { | |
193 | errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string."); | |
194 | ||
195 | #ifdef DEBUG_DETECT | |
196 | for(int32_t i = 0; i < testLength; i += 1) { | |
197 | if(testString[i] != decoded[i]) { | |
198 | printf("Strings differ at byte %d\n", i); | |
199 | break; | |
200 | } | |
201 | } | |
202 | #endif | |
203 | ||
204 | } | |
205 | ||
206 | DELETE_ARRAY(decoded); | |
207 | ||
208 | bail: | |
209 | freeBytes(bytes); | |
210 | ucsdet_close(csd); | |
211 | delete[] eSplit; | |
212 | } | |
213 | ||
214 | const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) { | |
215 | UErrorCode status = U_ZERO_ERROR; | |
216 | const char *testDataDirectory = IntlTest::getSourceTestData(status); | |
217 | ||
218 | if (U_FAILURE(status)) { | |
219 | errln("ERROR: getPath() failed - %s", u_errorName(status)); | |
220 | return NULL; | |
221 | } | |
222 | ||
223 | strcpy(buffer, testDataDirectory); | |
224 | strcat(buffer, filename); | |
225 | return buffer; | |
226 | } | |
227 | ||
228 | void CharsetDetectionTest::ConstructionTest() | |
229 | { | |
230 | UErrorCode status = U_ZERO_ERROR; | |
231 | UCharsetDetector *csd = ucsdet_open(&status); | |
232 | UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status); | |
233 | int32_t count = uenum_count(e, &status); | |
234 | ||
235 | #ifdef DEBUG_DETECT | |
236 | printf("There are %d recognizers.\n", count); | |
237 | #endif | |
238 | ||
239 | for(int32_t i = 0; i < count; i += 1) { | |
240 | int32_t length; | |
241 | const char *name = uenum_next(e, &length, &status); | |
242 | ||
243 | if(name == NULL || length <= 0) { | |
244 | errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!"); | |
245 | } | |
246 | ||
247 | #ifdef DEBUG_DETECT | |
248 | printf("%s\n", name); | |
249 | #endif | |
250 | } | |
251 | ||
252 | uenum_close(e); | |
253 | ucsdet_close(csd); | |
254 | } | |
255 | ||
256 | void CharsetDetectionTest::UTF8Test() | |
257 | { | |
258 | UErrorCode status = U_ZERO_ERROR; | |
259 | UnicodeString ss = "This is a string with some non-ascii characters that will " | |
260 | "be converted to UTF-8, then shoved through the detection process. " | |
261 | "\\u0391\\u0392\\u0393\\u0394\\u0395" | |
262 | "Sure would be nice if our source could contain Unicode directly!"; | |
263 | UnicodeString s = ss.unescape(); | |
264 | int32_t byteLength = 0, sLength = s.length(); | |
265 | char *bytes = extractBytes(s, "UTF-8", byteLength); | |
266 | UCharsetDetector *csd = ucsdet_open(&status); | |
267 | const UCharsetMatch *match; | |
268 | UChar *detected = NEW_ARRAY(UChar, sLength); | |
269 | ||
270 | ucsdet_setText(csd, bytes, byteLength, &status); | |
271 | match = ucsdet_detect(csd, &status); | |
272 | ||
273 | if (match == NULL) { | |
274 | errln("Detection failure for UTF-8: got no matches."); | |
275 | goto bail; | |
276 | } | |
277 | ||
278 | ucsdet_getUChars(match, detected, sLength, &status); | |
279 | ||
280 | if (s.compare(detected, sLength) != 0) { | |
281 | errln("Round-trip test failed!"); | |
282 | } | |
283 | ||
284 | ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */ | |
285 | ||
286 | bail: | |
287 | DELETE_ARRAY(detected); | |
288 | freeBytes(bytes); | |
289 | ucsdet_close(csd); | |
290 | } | |
291 | ||
292 | void CharsetDetectionTest::UTF16Test() | |
293 | { | |
294 | UErrorCode status = U_ZERO_ERROR; | |
295 | /* Notice the BOM on the start of this string */ | |
296 | UChar chars[] = { | |
297 | 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C, | |
298 | 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a, | |
299 | 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628, | |
300 | 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646, | |
301 | 0x064a, 0x062a, 0x0000}; | |
302 | UnicodeString s(chars); | |
303 | int32_t beLength = 0, leLength = 0; | |
304 | char *beBytes = extractBytes(s, "UTF-16BE", beLength); | |
305 | char *leBytes = extractBytes(s, "UTF-16LE", leLength); | |
306 | UCharsetDetector *csd = ucsdet_open(&status); | |
307 | const UCharsetMatch *match; | |
308 | const char *name; | |
309 | int32_t conf; | |
310 | ||
311 | ucsdet_setText(csd, beBytes, beLength, &status); | |
312 | match = ucsdet_detect(csd, &status); | |
313 | ||
314 | if (match == NULL) { | |
315 | errln("Encoding detection failure for UTF-16BE: got no matches."); | |
316 | goto try_le; | |
317 | } | |
318 | ||
319 | name = ucsdet_getName(match, &status); | |
320 | conf = ucsdet_getConfidence(match, &status); | |
321 | ||
322 | if (strcmp(name, "UTF-16BE") != 0) { | |
323 | errln("Encoding detection failure for UTF-16BE: got %s", name); | |
324 | goto try_le; // no point in looking at confidence if we got the wrong character set. | |
325 | } | |
326 | ||
327 | if (conf != 100) { | |
328 | errln("Did not get 100%% confidence for UTF-16BE: got %d", conf); | |
329 | } | |
330 | ||
331 | try_le: | |
332 | ucsdet_setText(csd, leBytes, leLength, &status); | |
333 | match = ucsdet_detect(csd, &status); | |
334 | ||
335 | if (match == NULL) { | |
336 | errln("Encoding detection failure for UTF-16LE: got no matches."); | |
337 | goto bail; | |
338 | } | |
339 | ||
340 | name = ucsdet_getName(match, &status); | |
341 | conf = ucsdet_getConfidence(match, &status); | |
342 | ||
343 | ||
344 | if (strcmp(name, "UTF-16LE") != 0) { | |
345 | errln("Enconding detection failure for UTF-16LE: got %s", name); | |
346 | goto bail; // no point in looking at confidence if we got the wrong character set. | |
347 | } | |
348 | ||
349 | if (conf != 100) { | |
350 | errln("Did not get 100%% confidence for UTF-16LE: got %d", conf); | |
351 | } | |
352 | ||
353 | bail: | |
354 | freeBytes(leBytes); | |
355 | freeBytes(beBytes); | |
356 | ucsdet_close(csd); | |
357 | } | |
358 | ||
359 | void CharsetDetectionTest::InputFilterTest() | |
360 | { | |
361 | UErrorCode status = U_ZERO_ERROR; | |
362 | UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>"; | |
363 | UnicodeString s = ss.unescape(); | |
364 | int32_t byteLength = 0; | |
365 | char *bytes = extractBytes(s, "ISO-8859-1", byteLength); | |
366 | UCharsetDetector *csd = ucsdet_open(&status); | |
367 | const UCharsetMatch *match; | |
368 | const char *lang, *name; | |
369 | ||
370 | ucsdet_enableInputFilter(csd, TRUE); | |
371 | ||
372 | if (!ucsdet_isInputFilterEnabled(csd)) { | |
373 | errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!"); | |
374 | } | |
375 | ||
376 | ||
377 | ucsdet_setText(csd, bytes, byteLength, &status); | |
378 | match = ucsdet_detect(csd, &status); | |
379 | ||
380 | if (match == NULL) { | |
381 | errln("Turning on the input filter resulted in no matches."); | |
382 | goto turn_off; | |
383 | } | |
384 | ||
385 | name = ucsdet_getName(match, &status); | |
386 | ||
387 | if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { | |
388 | errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name); | |
389 | } else { | |
390 | lang = ucsdet_getLanguage(match, &status); | |
391 | ||
392 | if (lang == NULL || strcmp(lang, "fr") != 0) { | |
393 | errln("Input filter did not strip markup!"); | |
394 | } | |
395 | } | |
396 | ||
397 | turn_off: | |
398 | ucsdet_enableInputFilter(csd, FALSE); | |
399 | ucsdet_setText(csd, bytes, byteLength, &status); | |
400 | match = ucsdet_detect(csd, &status); | |
401 | ||
402 | if (match == NULL) { | |
403 | errln("Turning off the input filter resulted in no matches."); | |
404 | goto bail; | |
405 | } | |
406 | ||
407 | name = ucsdet_getName(match, &status); | |
408 | ||
409 | if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { | |
410 | errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name); | |
411 | } else { | |
412 | lang = ucsdet_getLanguage(match, &status); | |
413 | ||
414 | if (lang == NULL || strcmp(lang, "en") != 0) { | |
415 | errln("Unfiltered input did not detect as English!"); | |
416 | } | |
417 | } | |
418 | ||
419 | bail: | |
420 | freeBytes(bytes); | |
421 | ucsdet_close(csd); | |
422 | } | |
423 | ||
424 | void CharsetDetectionTest::C1BytesTest() | |
425 | { | |
426 | #if !UCONFIG_NO_LEGACY_CONVERSION | |
427 | UErrorCode status = U_ZERO_ERROR; | |
428 | UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly."; | |
46f4442e | 429 | UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV); |
73c04bcf A |
430 | UnicodeString sWindows = ssWindows.unescape(); |
431 | int32_t lISO = 0, lWindows = 0; | |
432 | char *bISO = extractBytes(sISO, "ISO-8859-1", lISO); | |
433 | char *bWindows = extractBytes(sWindows, "windows-1252", lWindows); | |
434 | UCharsetDetector *csd = ucsdet_open(&status); | |
435 | const UCharsetMatch *match; | |
436 | const char *name; | |
437 | ||
438 | ucsdet_setText(csd, bWindows, lWindows, &status); | |
439 | match = ucsdet_detect(csd, &status); | |
440 | ||
441 | if (match == NULL) { | |
442 | errln("English test with C1 bytes got no matches."); | |
443 | goto bail; | |
444 | } | |
445 | ||
446 | name = ucsdet_getName(match, &status); | |
447 | ||
448 | if (strcmp(name, "windows-1252") != 0) { | |
449 | errln("English text with C1 bytes does not detect as windows-1252, but as %s", name); | |
450 | } | |
451 | ||
452 | ucsdet_setText(csd, bISO, lISO, &status); | |
453 | match = ucsdet_detect(csd, &status); | |
454 | ||
455 | if (match == NULL) { | |
456 | errln("English text without C1 bytes got no matches."); | |
457 | goto bail; | |
458 | } | |
459 | ||
460 | name = ucsdet_getName(match, &status); | |
461 | ||
462 | if (strcmp(name, "ISO-8859-1") != 0) { | |
463 | errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name); | |
464 | } | |
465 | ||
466 | bail: | |
467 | freeBytes(bWindows); | |
468 | freeBytes(bISO); | |
469 | ||
470 | ucsdet_close(csd); | |
471 | #endif | |
472 | } | |
473 | ||
474 | void CharsetDetectionTest::DetectionTest() | |
475 | { | |
476 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS | |
477 | UErrorCode status = U_ZERO_ERROR; | |
478 | char path[2048]; | |
479 | const char *testFilePath = getPath(path, "csdetest.xml"); | |
480 | ||
481 | if (testFilePath == NULL) { | |
482 | return; /* Couldn't get path: error message already output. */ | |
483 | } | |
484 | ||
485 | UXMLParser *parser = UXMLParser::createParser(status); | |
486 | if (!assertSuccess("UXMLParser::createParser",status)) return; | |
487 | UXMLElement *root = parser->parseFile(testFilePath, status); | |
488 | if (!assertSuccess( "parseFile",status)) return; | |
489 | ||
490 | UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case"); | |
491 | UnicodeString id_attr = UNICODE_STRING_SIMPLE("id"); | |
492 | UnicodeString enc_attr = UNICODE_STRING_SIMPLE("encodings"); | |
493 | ||
494 | const UXMLElement *testCase; | |
495 | int32_t tc = 0; | |
496 | ||
497 | while((testCase = root->nextChildElement(tc)) != NULL) { | |
498 | if (testCase->getTagName().compare(test_case) == 0) { | |
499 | const UnicodeString *id = testCase->getAttribute(id_attr); | |
500 | const UnicodeString *encodings = testCase->getAttribute(enc_attr); | |
501 | const UnicodeString text = testCase->getText(TRUE); | |
502 | int32_t encodingCount; | |
503 | UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount); | |
504 | ||
505 | for(int32_t e = 0; e < encodingCount; e += 1) { | |
506 | checkEncoding(text, encodingList[e], *id); | |
507 | } | |
508 | ||
509 | delete[] encodingList; | |
510 | } | |
511 | } | |
512 | ||
513 | delete root; | |
514 | delete parser; | |
515 | #endif | |
516 | } | |
517 | ||
518 |