]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/intltest/csdetest.cpp
ICU-511.34.tar.gz
[apple/icu.git] / icuSources / test / intltest / csdetest.cpp
CommitLineData
73c04bcf
A
1/*
2 **********************************************************************
51004dcb 3 * Copyright (C) 2005-2012, International Business Machines
73c04bcf
A
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8
9#include "unicode/utypes.h"
10#include "unicode/ucsdet.h"
11#include "unicode/ucnv.h"
12#include "unicode/unistr.h"
13#include "unicode/putil.h"
729e4ab9 14#include "unicode/uniset.h"
73c04bcf
A
15
16#include "intltest.h"
17#include "csdetest.h"
18
19#include "xmlparser.h"
20
21#include <stdlib.h>
22#include <string.h>
23
24#ifdef DEBUG_DETECT
25#include <stdio.h>
26#endif
27
// Number of elements in a true array (must not be applied to a pointer).
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Raw buffer helpers built on malloc()/free(). A buffer obtained from
// NEW_ARRAY must be released with DELETE_ARRAY, never with delete[].
#define NEW_ARRAY(type,count) ((type *) /*uprv_*/malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))

#define CH_SPACE 0x0020
#define CH_SLASH 0x002F

// Reports a failure (with file and line) when x is false; execution continues.
// The do/while(0) wrapper makes the macro a single statement, so it is safe
// inside an unbraced if/else.
#define TEST_ASSERT(x) do { if (!(x)) { \
    errln("Failure in file %s, line %d", __FILE__, __LINE__); } } while (0)

// Reports a failure and RETURNS from the enclosing void function when errcode
// holds a failure status. Callers must not rely on cleanup code placed after it.
#define TEST_ASSERT_SUCCESS(errcode) do { if (U_FAILURE(errcode)) { \
    errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
    return; } } while (0)
42
43
73c04bcf
A
44//---------------------------------------------------------------------------
45//
46// Test class boilerplate
47//
48//---------------------------------------------------------------------------
49CharsetDetectionTest::CharsetDetectionTest()
50{
51}
52
53
54CharsetDetectionTest::~CharsetDetectionTest()
55{
56}
57
58
59
60void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
61{
62 if (exec) logln("TestSuite CharsetDetectionTest: ");
63 switch (index) {
64 case 0: name = "ConstructionTest";
65 if (exec) ConstructionTest();
66 break;
67
68 case 1: name = "UTF8Test";
69 if (exec) UTF8Test();
70 break;
71
72 case 2: name = "UTF16Test";
73 if (exec) UTF16Test();
74 break;
75
76 case 3: name = "C1BytesTest";
77 if (exec) C1BytesTest();
78 break;
79
80 case 4: name = "InputFilterTest";
81 if (exec) InputFilterTest();
82 break;
83
84 case 5: name = "DetectionTest";
85 if (exec) DetectionTest();
86 break;
729e4ab9
A
87#if !UCONFIG_NO_LEGACY_CONVERSION
88 case 6: name = "IBM424Test";
89 if (exec) IBM424Test();
90 break;
91
92 case 7: name = "IBM420Test";
93 if (exec) IBM420Test();
94 break;
95#else
96 case 6:
97 case 7: name = "skip"; break;
98#endif
99 case 8: name = "Ticket6394Test";
100 if (exec) Ticket6394Test();
101 break;
73c04bcf 102
51004dcb
A
103 case 9: name = "Ticket6954Test";
104 if (exec) Ticket6954Test();
105 break;
106
73c04bcf
A
107 default: name = "";
108 break; //needed to end loop
109 }
110}
111
112static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
113{
114 int32_t offset = -1;
115
116 splits = 1;
117 while((offset = src.indexOf(ch, offset + 1)) >= 0) {
118 splits += 1;
119 }
120
121 UnicodeString *result = new UnicodeString[splits];
122
123 int32_t start = 0;
124 int32_t split = 0;
125 int32_t end;
126
127 while((end = src.indexOf(ch, start)) >= 0) {
128 src.extractBetween(start, end, result[split++]);
129 start = end + 1;
130 }
131
132 src.extractBetween(start, src.length(), result[split]);
133
134 return result;
135}
136
137static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length)
138{
139 int32_t sLength = source.length();
140 char *bytes = NULL;
141
142 length = source.extract(0, sLength, NULL, codepage);
143
144 if (length > 0) {
145 bytes = NEW_ARRAY(char, length + 1);
146 source.extract(0, sLength, bytes, codepage);
147 }
148
149 return bytes;
150}
151
152static void freeBytes(char *bytes)
153{
154 DELETE_ARRAY(bytes);
155}
156
157void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
158{
159 int32_t splits = 0;
160 int32_t testLength = testString.length();
161 UnicodeString *eSplit = split(encoding, CH_SLASH, splits);
162 UErrorCode status = U_ZERO_ERROR;
163 int32_t cpLength = eSplit[0].length();
164 char codepage[64];
165
166 u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
167 codepage[cpLength] = '\0';
168
729e4ab9 169 LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
73c04bcf
A
170
171 int32_t byteLength = 0;
172 char *bytes = extractBytes(testString, codepage, byteLength);
173
174 if (bytes == NULL) {
175#if !UCONFIG_NO_LEGACY_CONVERSION
4388f060 176 dataerrln("Can't open a " + encoding + " converter for " + id);
73c04bcf
A
177#endif
178 return;
179 }
180
729e4ab9 181 ucsdet_setText(csd.getAlias(), bytes, byteLength, &status);
73c04bcf
A
182
183 int32_t matchCount = 0;
729e4ab9 184 const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status);
73c04bcf
A
185
186
187 UnicodeString name(ucsdet_getName(matches[0], &status));
188 UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
189 UChar *decoded = NULL;
190 int32_t dLength = 0;
191
192 if (matchCount == 0) {
193 errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
194 goto bail;
195 }
196
197 if (name.compare(eSplit[0]) != 0) {
198 errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
199
200#ifdef DEBUG_DETECT
201 for (int32_t m = 0; m < matchCount; m += 1) {
202 const char *name = ucsdet_getName(matches[m], &status);
203 const char *lang = ucsdet_getLanguage(matches[m], &status);
204 int32_t confidence = ucsdet_getConfidence(matches[m], &status);
205
206 printf("%s (%s) %d\n", name, lang, confidence);
207 }
208#endif
209 goto bail;
210 }
211
212 if (splits > 1 && lang.compare(eSplit[1]) != 0) {
213 errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
214 goto bail;
215 }
216
217 decoded = NEW_ARRAY(UChar, testLength);
218 dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);
219
220 if (testString.compare(decoded, dLength) != 0) {
221 errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");
222
223#ifdef DEBUG_DETECT
224 for(int32_t i = 0; i < testLength; i += 1) {
225 if(testString[i] != decoded[i]) {
226 printf("Strings differ at byte %d\n", i);
227 break;
228 }
229 }
230#endif
231
232 }
233
234 DELETE_ARRAY(decoded);
235
236bail:
237 freeBytes(bytes);
73c04bcf
A
238 delete[] eSplit;
239}
240
241const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) {
242 UErrorCode status = U_ZERO_ERROR;
243 const char *testDataDirectory = IntlTest::getSourceTestData(status);
244
245 if (U_FAILURE(status)) {
246 errln("ERROR: getPath() failed - %s", u_errorName(status));
247 return NULL;
248 }
249
250 strcpy(buffer, testDataDirectory);
251 strcat(buffer, filename);
252 return buffer;
253}
254
255void CharsetDetectionTest::ConstructionTest()
256{
729e4ab9
A
257 IcuTestErrorCode status(*this, "ConstructionTest");
258 LocalUCharsetDetectorPointer csd(ucsdet_open(status));
259 LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status));
260 int32_t count = uenum_count(e.getAlias(), status);
73c04bcf
A
261
262#ifdef DEBUG_DETECT
263 printf("There are %d recognizers.\n", count);
264#endif
265
266 for(int32_t i = 0; i < count; i += 1) {
267 int32_t length;
729e4ab9 268 const char *name = uenum_next(e.getAlias(), &length, status);
73c04bcf
A
269
270 if(name == NULL || length <= 0) {
271 errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
272 }
273
274#ifdef DEBUG_DETECT
275 printf("%s\n", name);
276#endif
277 }
73c04bcf
A
278}
279
280void CharsetDetectionTest::UTF8Test()
281{
282 UErrorCode status = U_ZERO_ERROR;
283 UnicodeString ss = "This is a string with some non-ascii characters that will "
284 "be converted to UTF-8, then shoved through the detection process. "
285 "\\u0391\\u0392\\u0393\\u0394\\u0395"
286 "Sure would be nice if our source could contain Unicode directly!";
287 UnicodeString s = ss.unescape();
288 int32_t byteLength = 0, sLength = s.length();
289 char *bytes = extractBytes(s, "UTF-8", byteLength);
290 UCharsetDetector *csd = ucsdet_open(&status);
291 const UCharsetMatch *match;
292 UChar *detected = NEW_ARRAY(UChar, sLength);
293
294 ucsdet_setText(csd, bytes, byteLength, &status);
295 match = ucsdet_detect(csd, &status);
296
297 if (match == NULL) {
298 errln("Detection failure for UTF-8: got no matches.");
299 goto bail;
300 }
301
302 ucsdet_getUChars(match, detected, sLength, &status);
303
304 if (s.compare(detected, sLength) != 0) {
305 errln("Round-trip test failed!");
306 }
307
308 ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
309
310bail:
311 DELETE_ARRAY(detected);
312 freeBytes(bytes);
313 ucsdet_close(csd);
314}
315
316void CharsetDetectionTest::UTF16Test()
317{
318 UErrorCode status = U_ZERO_ERROR;
319 /* Notice the BOM on the start of this string */
320 UChar chars[] = {
321 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
322 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
323 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
324 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
325 0x064a, 0x062a, 0x0000};
326 UnicodeString s(chars);
327 int32_t beLength = 0, leLength = 0;
328 char *beBytes = extractBytes(s, "UTF-16BE", beLength);
329 char *leBytes = extractBytes(s, "UTF-16LE", leLength);
330 UCharsetDetector *csd = ucsdet_open(&status);
331 const UCharsetMatch *match;
332 const char *name;
333 int32_t conf;
334
335 ucsdet_setText(csd, beBytes, beLength, &status);
336 match = ucsdet_detect(csd, &status);
337
338 if (match == NULL) {
339 errln("Encoding detection failure for UTF-16BE: got no matches.");
340 goto try_le;
341 }
342
343 name = ucsdet_getName(match, &status);
344 conf = ucsdet_getConfidence(match, &status);
345
346 if (strcmp(name, "UTF-16BE") != 0) {
347 errln("Encoding detection failure for UTF-16BE: got %s", name);
348 goto try_le; // no point in looking at confidence if we got the wrong character set.
349 }
350
351 if (conf != 100) {
352 errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
353 }
354
355try_le:
356 ucsdet_setText(csd, leBytes, leLength, &status);
357 match = ucsdet_detect(csd, &status);
358
359 if (match == NULL) {
360 errln("Encoding detection failure for UTF-16LE: got no matches.");
361 goto bail;
362 }
363
364 name = ucsdet_getName(match, &status);
365 conf = ucsdet_getConfidence(match, &status);
366
367
368 if (strcmp(name, "UTF-16LE") != 0) {
369 errln("Enconding detection failure for UTF-16LE: got %s", name);
370 goto bail; // no point in looking at confidence if we got the wrong character set.
371 }
372
373 if (conf != 100) {
374 errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
375 }
376
377bail:
378 freeBytes(leBytes);
379 freeBytes(beBytes);
380 ucsdet_close(csd);
381}
382
383void CharsetDetectionTest::InputFilterTest()
384{
385 UErrorCode status = U_ZERO_ERROR;
386 UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
387 UnicodeString s = ss.unescape();
388 int32_t byteLength = 0;
389 char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
390 UCharsetDetector *csd = ucsdet_open(&status);
391 const UCharsetMatch *match;
392 const char *lang, *name;
393
394 ucsdet_enableInputFilter(csd, TRUE);
395
396 if (!ucsdet_isInputFilterEnabled(csd)) {
397 errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
398 }
399
400
401 ucsdet_setText(csd, bytes, byteLength, &status);
402 match = ucsdet_detect(csd, &status);
403
404 if (match == NULL) {
405 errln("Turning on the input filter resulted in no matches.");
406 goto turn_off;
407 }
408
409 name = ucsdet_getName(match, &status);
410
411 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
412 errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
413 } else {
414 lang = ucsdet_getLanguage(match, &status);
415
416 if (lang == NULL || strcmp(lang, "fr") != 0) {
417 errln("Input filter did not strip markup!");
418 }
419 }
420
421turn_off:
422 ucsdet_enableInputFilter(csd, FALSE);
423 ucsdet_setText(csd, bytes, byteLength, &status);
424 match = ucsdet_detect(csd, &status);
425
426 if (match == NULL) {
427 errln("Turning off the input filter resulted in no matches.");
428 goto bail;
429 }
430
431 name = ucsdet_getName(match, &status);
432
433 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
434 errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
435 } else {
436 lang = ucsdet_getLanguage(match, &status);
437
438 if (lang == NULL || strcmp(lang, "en") != 0) {
439 errln("Unfiltered input did not detect as English!");
440 }
441 }
442
443bail:
444 freeBytes(bytes);
445 ucsdet_close(csd);
446}
447
448void CharsetDetectionTest::C1BytesTest()
449{
450#if !UCONFIG_NO_LEGACY_CONVERSION
451 UErrorCode status = U_ZERO_ERROR;
452 UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
46f4442e 453 UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
73c04bcf
A
454 UnicodeString sWindows = ssWindows.unescape();
455 int32_t lISO = 0, lWindows = 0;
456 char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
457 char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
458 UCharsetDetector *csd = ucsdet_open(&status);
459 const UCharsetMatch *match;
460 const char *name;
461
462 ucsdet_setText(csd, bWindows, lWindows, &status);
463 match = ucsdet_detect(csd, &status);
464
465 if (match == NULL) {
729e4ab9 466 errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status));
73c04bcf
A
467 goto bail;
468 }
469
470 name = ucsdet_getName(match, &status);
471
472 if (strcmp(name, "windows-1252") != 0) {
473 errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
474 }
475
476 ucsdet_setText(csd, bISO, lISO, &status);
477 match = ucsdet_detect(csd, &status);
478
479 if (match == NULL) {
480 errln("English text without C1 bytes got no matches.");
481 goto bail;
482 }
483
484 name = ucsdet_getName(match, &status);
485
486 if (strcmp(name, "ISO-8859-1") != 0) {
487 errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
488 }
489
490bail:
491 freeBytes(bWindows);
492 freeBytes(bISO);
493
494 ucsdet_close(csd);
495#endif
496}
497
498void CharsetDetectionTest::DetectionTest()
499{
500#if !UCONFIG_NO_REGULAR_EXPRESSIONS
501 UErrorCode status = U_ZERO_ERROR;
502 char path[2048];
503 const char *testFilePath = getPath(path, "csdetest.xml");
504
505 if (testFilePath == NULL) {
506 return; /* Couldn't get path: error message already output. */
507 }
508
509 UXMLParser *parser = UXMLParser::createParser(status);
729e4ab9
A
510 if (U_FAILURE(status)) {
511 dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status));
512 return;
513 }
514
73c04bcf
A
515 UXMLElement *root = parser->parseFile(testFilePath, status);
516 if (!assertSuccess( "parseFile",status)) return;
517
518 UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
519 UnicodeString id_attr = UNICODE_STRING_SIMPLE("id");
520 UnicodeString enc_attr = UNICODE_STRING_SIMPLE("encodings");
521
522 const UXMLElement *testCase;
523 int32_t tc = 0;
524
525 while((testCase = root->nextChildElement(tc)) != NULL) {
526 if (testCase->getTagName().compare(test_case) == 0) {
527 const UnicodeString *id = testCase->getAttribute(id_attr);
528 const UnicodeString *encodings = testCase->getAttribute(enc_attr);
529 const UnicodeString text = testCase->getText(TRUE);
530 int32_t encodingCount;
531 UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount);
532
533 for(int32_t e = 0; e < encodingCount; e += 1) {
534 checkEncoding(text, encodingList[e], *id);
535 }
536
537 delete[] encodingList;
538 }
539 }
540
541 delete root;
542 delete parser;
543#endif
544}
545
729e4ab9
A
546void CharsetDetectionTest::IBM424Test()
547{
548 UErrorCode status = U_ZERO_ERROR;
549
550 static const UChar chars[] = {
551 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
552 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
553 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
554 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
555 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
556 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
557 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
558 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
559 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
560 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
561 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
562 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
563 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
564 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
565 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
566 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
567 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
568 };
569
570 static const UChar chars_reverse[] = {
571 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
572 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
573 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
574 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
575 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
576 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
577 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
578 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
579 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
580 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
581 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
582 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
583 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
584 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
585 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
586 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
587 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
588 0x0000
589 };
590
591 int32_t bLength = 0, brLength = 0;
592
593 UnicodeString s1(chars);
594 UnicodeString s2(chars_reverse);
595
596 char *bytes = extractBytes(s1, "IBM424", bLength);
597 char *bytes_r = extractBytes(s2, "IBM424", brLength);
598
599 UCharsetDetector *csd = ucsdet_open(&status);
600 if (U_FAILURE(status)) {
601 errln("Error opening charset detector. - %s", u_errorName(status));
602 }
603 const UCharsetMatch *match;
604 const char *name;
605
606 ucsdet_setText(csd, bytes, bLength, &status);
607 match = ucsdet_detect(csd, &status);
608
609 if (match == NULL) {
610 errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status));
611 goto bail;
612 }
613
614 name = ucsdet_getName(match, &status);
615 if (strcmp(name, "IBM424_rtl") != 0) {
616 errln("Encoding detection failure for IBM424_rtl: got %s", name);
617 }
618
619 ucsdet_setText(csd, bytes_r, brLength, &status);
620 match = ucsdet_detect(csd, &status);
621
622 if (match == NULL) {
623 errln("Encoding detection failure for IBM424_ltr: got no matches.");
624 goto bail;
625 }
626
627 name = ucsdet_getName(match, &status);
628 if (strcmp(name, "IBM424_ltr") != 0) {
629 errln("Encoding detection failure for IBM424_ltr: got %s", name);
630 }
631
632bail:
633 freeBytes(bytes);
634 freeBytes(bytes_r);
635 ucsdet_close(csd);
636}
637
638void CharsetDetectionTest::IBM420Test()
639{
640 UErrorCode status = U_ZERO_ERROR;
641
642 static const UChar chars[] = {
643 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
644 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
645 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
646 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
647 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
648 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
649 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
650 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
651 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
652 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
653 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
654 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
655 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
656 0x0000
657 };
658 static const UChar chars_reverse[] = {
659 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
660 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
661 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
662 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
663 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
664 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
665 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
666 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
667 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
668 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
669 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
670 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
671 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
672 0x0000,
673 };
674
675 int32_t bLength = 0, brLength = 0;
676
677 UnicodeString s1(chars);
678 UnicodeString s2(chars_reverse);
679
680 char *bytes = extractBytes(s1, "IBM420", bLength);
681 char *bytes_r = extractBytes(s2, "IBM420", brLength);
682
683 UCharsetDetector *csd = ucsdet_open(&status);
684 if (U_FAILURE(status)) {
685 errln("Error opening charset detector. - %s", u_errorName(status));
686 }
687 const UCharsetMatch *match;
688 const char *name;
689
690 ucsdet_setText(csd, bytes, bLength, &status);
691 match = ucsdet_detect(csd, &status);
692
693 if (match == NULL) {
694 errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status));
695 goto bail;
696 }
697
698 name = ucsdet_getName(match, &status);
699 if (strcmp(name, "IBM420_rtl") != 0) {
700 errln("Encoding detection failure for IBM420_rtl: got %s\n", name);
701 }
702
703 ucsdet_setText(csd, bytes_r, brLength, &status);
704 match = ucsdet_detect(csd, &status);
705
706 if (match == NULL) {
707 errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
708 goto bail;
709 }
710
711 name = ucsdet_getName(match, &status);
712 if (strcmp(name, "IBM420_ltr") != 0) {
713 errln("Encoding detection failure for IBM420_ltr: got %s\n", name);
714 }
715
716bail:
717 freeBytes(bytes);
718 freeBytes(bytes_r);
719 ucsdet_close(csd);
720}
721
722
723void CharsetDetectionTest::Ticket6394Test() {
724#if !UCONFIG_NO_CONVERSION
725 const char charText[] = "Here is some random English text that should be detected as ISO-8859-1."
726 "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
727 "encodings more than once. The hop through UnicodeString is for platforms "
728 "where this char * string is be EBCDIC and needs conversion to Latin1.";
729 char latin1Text[sizeof(charText)];
730 UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");
731
732 UErrorCode status = U_ZERO_ERROR;
733 UCharsetDetector *csd = ucsdet_open(&status);
734 ucsdet_setText(csd, latin1Text, -1, &status);
735 if (U_FAILURE(status)) {
736 errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_errorName(status));
737 return;
738 }
739
740 int32_t matchCount = 0;
741 const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
742 if (U_FAILURE(status)) {
743 errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_errorName(status));
744 return;
745 }
746
747 UnicodeSet setOfCharsetNames; // UnicodSets can hold strings.
748 int32_t i;
749 for (i=0; i<matchCount; i++) {
750 UnicodeString charSetName(ucsdet_getName(matches[i], &status));
751 if (U_FAILURE(status)) {
752 errln("Fail at file %s, line %d. status = %s; i=%d", __FILE__, __LINE__, u_errorName(status), i);
753 status = U_ZERO_ERROR;
754 }
755 if (setOfCharsetNames.contains(charSetName)) {
756 errln("Fail at file %s, line %d ", __FILE__, __LINE__);
757 errln(UnicodeString(" Duplicate charset name = ") + charSetName);
758 }
759 setOfCharsetNames.add(charSetName);
760 }
761 ucsdet_close(csd);
762#endif
763}
73c04bcf 764
51004dcb
A
765
766// Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between
767// similar Windows and non-Windows SBCS encodings. State was kept in the shared
768// Charset Recognizer objects, and could be overwritten.
769void CharsetDetectionTest::Ticket6954Test() {
770#if !UCONFIG_NO_CONVERSION
771 UErrorCode status = U_ZERO_ERROR;
772 UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
773 UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly."
774 "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
775 UnicodeString sWindows = ssWindows.unescape();
776 int32_t lISO = 0, lWindows = 0;
777 char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
778 char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
779
780 // First do a plain vanilla detect of 1252 text
781
782 UCharsetDetector *csd1 = ucsdet_open(&status);
783 ucsdet_setText(csd1, bWindows, lWindows, &status);
784 const UCharsetMatch *match1 = ucsdet_detect(csd1, &status);
785 const char *name1 = ucsdet_getName(match1, &status);
786 TEST_ASSERT_SUCCESS(status);
787 TEST_ASSERT(strcmp(name1, "windows-1252")==0);
788
789 // Next, using a completely separate detector, detect some 8859-1 text
790
791 UCharsetDetector *csd2 = ucsdet_open(&status);
792 ucsdet_setText(csd2, bISO, lISO, &status);
793 const UCharsetMatch *match2 = ucsdet_detect(csd2, &status);
794 const char *name2 = ucsdet_getName(match2, &status);
795 TEST_ASSERT_SUCCESS(status);
796 TEST_ASSERT(strcmp(name2, "ISO-8859-1")==0);
797
798 // Recheck the 1252 results from the first detector, which should not have been
799 // altered by the use of a different detector.
800
801 name1 = ucsdet_getName(match1, &status);
802 TEST_ASSERT_SUCCESS(status);
803 TEST_ASSERT(strcmp(name1, "windows-1252")==0);
804
805 ucsdet_close(csd1);
806 ucsdet_close(csd2);
807 freeBytes(bISO);
808 freeBytes(bWindows);
809#endif
810}