]>
git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/csdetest.cpp
/*
 **********************************************************************
 * Copyright (C) 2005-2008, International Business Machines
 * Corporation and others. All Rights Reserved.
 **********************************************************************
 */
9 #include "unicode/utypes.h"
10 #include "unicode/ucsdet.h"
11 #include "unicode/ucnv.h"
12 #include "unicode/unistr.h"
13 #include "unicode/putil.h"
18 #include "xmlparser.h"
// Number of elements in an array: total size divided by the size of one element.
27 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
// Allocates room for `count` objects of `type` with malloc(); release the result with DELETE_ARRAY.
29 #define NEW_ARRAY(type,count) (type *) /*uprv_*/ malloc((count) * sizeof(type))
// Releases memory with free().
30 #define DELETE_ARRAY(array) /*uprv_*/ free((void *) (array))
// Separator code units handed to split() further down: CH_SPACE between the
// entries of an encoding list, CH_SLASH inside a single entry.
32 #define CH_SPACE 0x0020
33 #define CH_SLASH 0x002F
35 //---------------------------------------------------------------------------
37 // Test class boilerplate
39 //---------------------------------------------------------------------------
// Constructor of the test class.
// NOTE(review): only the signature is present in this extract; the body is not visible.
40 CharsetDetectionTest :: CharsetDetectionTest ()
// Destructor of the test class.
// NOTE(review): only the signature is present in this extract; the body is not visible.
45 CharsetDetectionTest ::~ CharsetDetectionTest ()
// Test dispatcher. For the given `index` it stores the sub-test's name in
// `name` and, when `exec` is true, runs that sub-test. The last parameter is
// unnamed and unused.
// NOTE(review): lines are missing from this extract (the statement that selects
// on `index`, the per-case breaks, the label in front of the final break).
// In particular no call to UTF8Test() is visible under case 1 - confirm
// against the complete file that the sub-test is actually run.
51 void CharsetDetectionTest :: runIndexedTest ( int32_t index
, UBool exec
, const char * & name
, char * /*par*/ )
// Announce the suite only when tests are really being executed.
53 if ( exec
) logln ( "TestSuite CharsetDetectionTest: " );
55 case 0 : name
= "ConstructionTest" ;
56 if ( exec
) ConstructionTest ();
59 case 1 : name
= "UTF8Test" ;
63 case 2 : name
= "UTF16Test" ;
64 if ( exec
) UTF16Test ();
67 case 3 : name
= "C1BytesTest" ;
68 if ( exec
) C1BytesTest ();
71 case 4 : name
= "InputFilterTest" ;
72 if ( exec
) InputFilterTest ();
75 case 5 : name
= "DetectionTest" ;
76 if ( exec
) DetectionTest ();
80 break ; //needed to end loop
// Splits `src` at each occurrence of the code unit `ch`.
// The pieces are stored in an array of `splits` UnicodeStrings allocated with
// new[]; `splits` is a reference parameter, so the caller sees the count.
// NOTE(review): the declarations of offset/start/end/split, both loop bodies'
// bookkeeping and the return statement are missing from this extract.
// Presumably the first loop counts separators into `splits`, `start` advances
// past each separator, and `result` is returned for the caller to delete[] -
// confirm against the complete file.
84 static UnicodeString
* split ( const UnicodeString
& src
, UChar ch
, int32_t & splits
)
// First pass: visit every occurrence of `ch`, resuming one past the previous hit.
89 while (( offset
= src
. indexOf ( ch
, offset
+ 1 )) >= 0 ) {
93 UnicodeString
* result
= new UnicodeString
[ splits
];
// Second pass: copy the text between `start` and each separator into the next slot.
99 while (( end
= src
. indexOf ( ch
, start
)) >= 0 ) {
100 src
. extractBetween ( start
, end
, result
[ split
++]);
// Whatever follows the last separator (or the whole string if none was found
// by the loop above) goes into the current slot.
104 src
. extractBetween ( start
, src
. length (), result
[ split
]);
// Converts `source` to bytes in the `codepage` encoding.
// extract() is called twice: once with a NULL destination, storing its return
// value in `length`, and once into a buffer of length + 1 chars obtained from
// NEW_ARRAY (i.e. malloc).
// NOTE(review): the declaration of `bytes`, any guard around the allocation,
// and the return statement are not visible in this extract; the malloc result
// is not checked in the lines that are visible.
109 static char * extractBytes ( const UnicodeString
& source
, const char * codepage
, int32_t & length
)
111 int32_t sLength
= source
. length ();
// Sizing call: no destination buffer is supplied.
114 length
= source
. extract ( 0 , sLength
, NULL
, codepage
);
117 bytes
= NEW_ARRAY ( char , length
+ 1 );
118 source
. extract ( 0 , sLength
, bytes
, codepage
);
// Takes a byte buffer to dispose of.
// NOTE(review): only the signature is present in this extract; presumably this
// is the counterpart of extractBytes() - confirm against the complete file.
124 static void freeBytes ( char * bytes
)
// Checks that the detector recognises `testString`, once converted to bytes,
// as the expected charset.
//   testString - the text to encode and feed to the detector
//   encoding   - expected result; split on CH_SLASH, element 0 is compared with
//                the detected charset name and element 1 (if present) with the
//                detected language
//   id         - only used to label error messages
// Visible steps: convert the text with extractBytes(), run ucsdet_detectAll(),
// compare name and language of the first match with the expectation, then
// decode the first match with ucsdet_getUChars() and compare with the original.
// NOTE(review): many lines are missing from this extract (declarations of
// splits, codepage and dLength, the conditions guarding some errln/printf
// calls, closing braces, and whatever cleanup ends the function).
129 void CharsetDetectionTest :: checkEncoding ( const UnicodeString
& testString
, const UnicodeString
& encoding
, const UnicodeString
& id
)
132 int32_t testLength
= testString
. length ();
133 UnicodeString
* eSplit
= split ( encoding
, CH_SLASH
, splits
);
134 UErrorCode status
= U_ZERO_ERROR
;
135 int32_t cpLength
= eSplit
[ 0 ]. length ();
// Copy the charset name into `codepage` as a NUL-terminated char string.
// NOTE(review): the declaration of `codepage` is not visible; confirm it has
// room for cpLength + 1 chars.
138 u_UCharsToChars ( eSplit
[ 0 ]. getBuffer (), codepage
, cpLength
);
139 codepage
[ cpLength
] = '\0' ;
141 UCharsetDetector
* csd
= ucsdet_open (& status
);
143 int32_t byteLength
= 0 ;
144 char * bytes
= extractBytes ( testString
, codepage
, byteLength
);
// The converter failure is only reported when legacy conversion is compiled in.
// NOTE(review): the condition guarding this message is not visible here.
147 #if !UCONFIG_NO_LEGACY_CONVERSION
148 errln ( "Can't open a " + encoding
+ " converter for " + id
);
153 ucsdet_setText ( csd
, bytes
, byteLength
, & status
);
155 int32_t matchCount
= 0 ;
156 const UCharsetMatch
** matches
= ucsdet_detectAll ( csd
, & matchCount
, & status
);
// NOTE(review): matches[0] is read on the next two statements, before
// matchCount is compared with 0 further down and without looking at `status` -
// confirm this is safe when detection fails or yields no matches.
159 UnicodeString
name ( ucsdet_getName ( matches
[ 0 ], & status
));
160 UnicodeString
lang ( ucsdet_getLanguage ( matches
[ 0 ], & status
));
161 UChar
* decoded
= NULL
;
164 if ( matchCount
== 0 ) {
165 errln ( "Encoding detection failure for " + id
+ ": expected " + eSplit
[ 0 ] + ", got no matches" );
// The name of the first match must equal the expected charset name.
169 if ( name
. compare ( eSplit
[ 0 ]) != 0 ) {
170 errln ( "Encoding detection failure for " + id
+ ": expected " + eSplit
[ 0 ] + ", got " + name
);
// Print name, language and confidence of every match to stdout.
// The loop-local `name` and `lang` appear to shadow the UnicodeStrings above.
// NOTE(review): presumably debug-only output; no guard is visible in this extract.
173 for ( int32_t m
= 0 ; m
< matchCount
; m
+= 1 ) {
174 const char * name
= ucsdet_getName ( matches
[ m
], & status
);
175 const char * lang
= ucsdet_getLanguage ( matches
[ m
], & status
);
176 int32_t confidence
= ucsdet_getConfidence ( matches
[ m
], & status
);
178 printf ( " %s ( %s ) %d \n " , name
, lang
, confidence
);
// The language is only checked when the expectation contained a second element.
184 if ( splits
> 1 && lang
. compare ( eSplit
[ 1 ]) != 0 ) {
185 errln ( "Language detection failure for " + id
+ ", " + eSplit
[ 0 ] + ": expected " + eSplit
[ 1 ] + ", got " + lang
);
// Round trip: decode the detected bytes into a buffer of testLength UChars
// (the length of the original text) and compare with the original.
189 decoded
= NEW_ARRAY ( UChar
, testLength
);
190 dLength
= ucsdet_getUChars ( matches
[ 0 ], decoded
, testLength
, & status
);
192 if ( testString
. compare ( decoded
, dLength
) != 0 ) {
193 errln ( "Round-trip error for " + id
+ ", " + eSplit
[ 0 ] + ": getUChars() didn't yeild the original string." );
// Print where the two strings differ; `i` indexes UChars although the message
// says "byte".
196 for ( int32_t i
= 0 ; i
< testLength
; i
+= 1 ) {
197 if ( testString
[ i
] != decoded
[ i
]) {
198 printf ( "Strings differ at byte %d \n " , i
);
206 DELETE_ARRAY ( decoded
);
// Builds the path of a test data file in `buffer`: the directory reported by
// IntlTest::getSourceTestData() followed by `filename`.
// An error is logged when getSourceTestData() sets a failure status.
// NOTE(review): strcpy/strcat do not check the combined length against the
// 2048 chars named in the signature. The return statements are not visible in
// this extract.
214 const char * CharsetDetectionTest :: getPath ( char buffer
[ 2048 ], const char * filename
) {
215 UErrorCode status
= U_ZERO_ERROR
;
216 const char * testDataDirectory
= IntlTest :: getSourceTestData ( status
);
218 if ( U_FAILURE ( status
)) {
219 errln ( "ERROR: getPath() failed - %s " , u_errorName ( status
));
// Directory first, then the file name appended directly after it.
223 strcpy ( buffer
, testDataDirectory
);
224 strcat ( buffer
, filename
);
// Opens a detector, asks it for the enumeration of all detectable charsets and
// walks that enumeration, reporting an error for any entry whose name is NULL
// or whose length is not positive.
// NOTE(review): `status` is not inspected in the visible lines, the declaration
// of `length` and any closing of `e` / `csd` are not visible, and the printf
// calls are presumably debug-only (no guard is visible in this extract).
228 void CharsetDetectionTest :: ConstructionTest ()
230 UErrorCode status
= U_ZERO_ERROR
;
231 UCharsetDetector
* csd
= ucsdet_open (& status
);
232 UEnumeration
* e
= ucsdet_getAllDetectableCharsets ( csd
, & status
);
233 int32_t count
= uenum_count ( e
, & status
);
236 printf ( "There are %d recognizers. \n " , count
);
// Fetch exactly `count` names from the enumeration.
239 for ( int32_t i
= 0 ; i
< count
; i
+= 1 ) {
241 const char * name
= uenum_next ( e
, & length
, & status
);
243 if ( name
== NULL
|| length
<= 0 ) {
244 errln ( "ucsdet_getAllDetectableCharsets() returned a null or empty name!" );
248 printf ( " %s \n " , name
);
// Round-trip test for UTF-8: a string containing \u escapes is unescaped,
// converted to UTF-8 bytes with extractBytes(), passed to the detector, and
// the detected match is decoded again with ucsdet_getUChars() and compared
// with the unescaped string.
// NOTE(review): the condition guarding the "got no matches" message and the
// release of `bytes` / `csd` are not visible in this extract.
256 void CharsetDetectionTest :: UTF8Test ()
258 UErrorCode status
= U_ZERO_ERROR
;
259 UnicodeString ss
= "This is a string with some non-ascii characters that will "
260 "be converted to UTF-8, then shoved through the detection process. "
261 " \\ u0391 \\ u0392 \\ u0393 \\ u0394 \\ u0395"
262 "Sure would be nice if our source could contain Unicode directly!" ;
263 UnicodeString s
= ss
. unescape ();
264 int32_t byteLength
= 0 , sLength
= s
. length ();
265 char * bytes
= extractBytes ( s
, "UTF-8" , byteLength
);
266 UCharsetDetector
* csd
= ucsdet_open (& status
);
267 const UCharsetMatch
* match
;
// Decode buffer with as many UChars as the unescaped string.
268 UChar
* detected
= NEW_ARRAY ( UChar
, sLength
);
270 ucsdet_setText ( csd
, bytes
, byteLength
, & status
);
271 match
= ucsdet_detect ( csd
, & status
);
274 errln ( "Detection failure for UTF-8: got no matches." );
278 ucsdet_getUChars ( match
, detected
, sLength
, & status
);
280 if ( s
. compare ( detected
, sLength
) != 0 ) {
281 errln ( "Round-trip test failed!" );
284 ucsdet_setDeclaredEncoding ( csd
, "UTF-8" , 5 , & status
); /* for coverage */
287 DELETE_ARRAY ( detected
);
// Converts one BOM-prefixed string to UTF-16BE and to UTF-16LE bytes and checks
// that the detector names the matching charset for each; messages about the
// confidence not being 100 are also emitted.
// NOTE(review): the declaration line of the `chars` array, the declarations of
// name/conf, the conditions guarding the "no matches" and confidence messages,
// the try_le / bail labels and any cleanup are missing from this extract.
292 void CharsetDetectionTest :: UTF16Test ()
294 UErrorCode status
= U_ZERO_ERROR
;
295 /* Notice the BOM on the start of this string */
297 0xFEFF , 0x0623 , 0x0648 , 0x0631 , 0x0648 , 0x0628 , 0x0627 , 0x002C ,
298 0x0020 , 0x0628 , 0x0631 , 0x0645 , 0x062c , 0x064a , 0x0627 , 0x062a ,
299 0x0020 , 0x0627 , 0x0644 , 0x062d , 0x0627 , 0x0633 , 0x0648 , 0x0628 ,
300 0x0020 , 0x002b , 0x0020 , 0x0627 , 0x0646 , 0x062a , 0x0631 , 0x0646 ,
301 0x064a , 0x062a , 0x0000 };
302 UnicodeString
s ( chars
);
303 int32_t beLength
= 0 , leLength
= 0 ;
304 char * beBytes
= extractBytes ( s
, "UTF-16BE" , beLength
);
305 char * leBytes
= extractBytes ( s
, "UTF-16LE" , leLength
);
306 UCharsetDetector
* csd
= ucsdet_open (& status
);
307 const UCharsetMatch
* match
;
// Big-endian half of the test.
311 ucsdet_setText ( csd
, beBytes
, beLength
, & status
);
312 match
= ucsdet_detect ( csd
, & status
);
315 errln ( "Encoding detection failure for UTF-16BE: got no matches." );
319 name
= ucsdet_getName ( match
, & status
);
320 conf
= ucsdet_getConfidence ( match
, & status
);
322 if ( strcmp ( name
, "UTF-16BE" ) != 0 ) {
323 errln ( "Encoding detection failure for UTF-16BE: got %s " , name
);
324 goto try_le
; // no point in looking at confidence if we got the wrong character set.
328 errln ( "Did not get 100%% confidence for UTF-16BE: got %d " , conf
);
// Little-endian half of the test, same checks.
332 ucsdet_setText ( csd
, leBytes
, leLength
, & status
);
333 match
= ucsdet_detect ( csd
, & status
);
336 errln ( "Encoding detection failure for UTF-16LE: got no matches." );
340 name
= ucsdet_getName ( match
, & status
);
341 conf
= ucsdet_getConfidence ( match
, & status
);
// NOTE(review): the message below spells "Enconding"; the BE counterpart says "Encoding".
344 if ( strcmp ( name
, "UTF-16LE" ) != 0 ) {
345 errln ( "Enconding detection failure for UTF-16LE: got %s " , name
);
346 goto bail
; // no point in looking at confidence if we got the wrong character set.
350 errln ( "Did not get 100%% confidence for UTF-16LE: got %d " , conf
);
// Exercises ucsdet_enableInputFilter(). The sample is a French sentence
// surrounded by English words written as <tags>, converted to ISO-8859-1.
//   filter on  -> expects charset "ISO-8859-1" and language "fr"
//   filter off -> expects charset "ISO-8859-1" and language "en"
// NOTE(review): the conditions guarding the "no matches" messages, any gotos
// after failures, and the cleanup are missing from this extract.
359 void CharsetDetectionTest :: InputFilterTest ()
361 UErrorCode status
= U_ZERO_ERROR
;
362 UnicodeString ss
= "<a> <lot> <of> <English> <inside> <the> <markup> Un tr \\ u00E8s petit peu de Fran \\ u00E7ais. <to> <confuse> <the> <detector>" ;
363 UnicodeString s
= ss
. unescape ();
364 int32_t byteLength
= 0 ;
365 char * bytes
= extractBytes ( s
, "ISO-8859-1" , byteLength
);
366 UCharsetDetector
* csd
= ucsdet_open (& status
);
367 const UCharsetMatch
* match
;
368 const char * lang
, * name
;
// Pass 1: filter enabled; also verify that the setting reads back as enabled.
370 ucsdet_enableInputFilter ( csd
, TRUE
);
372 if (! ucsdet_isInputFilterEnabled ( csd
)) {
373 errln ( "ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!" );
377 ucsdet_setText ( csd
, bytes
, byteLength
, & status
);
378 match
= ucsdet_detect ( csd
, & status
);
381 errln ( "Turning on the input filter resulted in no matches." );
385 name
= ucsdet_getName ( match
, & status
);
// NOTE(review): when `name` is NULL this branch is taken and NULL is passed
// for the %s in the message - confirm errln() tolerates that.
387 if ( name
== NULL
|| strcmp ( name
, "ISO-8859-1" ) != 0 ) {
388 errln ( "Turning on the input filter resulted in %s rather than ISO-8859-1." , name
);
390 lang
= ucsdet_getLanguage ( match
, & status
);
392 if ( lang
== NULL
|| strcmp ( lang
, "fr" ) != 0 ) {
393 errln ( "Input filter did not strip markup!" );
// Pass 2: same bytes with the filter disabled.
398 ucsdet_enableInputFilter ( csd
, FALSE
);
399 ucsdet_setText ( csd
, bytes
, byteLength
, & status
);
400 match
= ucsdet_detect ( csd
, & status
);
403 errln ( "Turning off the input filter resulted in no matches." );
407 name
= ucsdet_getName ( match
, & status
);
// NOTE(review): same NULL-to-%s concern as in pass 1.
409 if ( name
== NULL
|| strcmp ( name
, "ISO-8859-1" ) != 0 ) {
410 errln ( "Turning off the input filter resulted in %s rather than ISO-8859-1." , name
);
412 lang
= ucsdet_getLanguage ( match
, & status
);
414 if ( lang
== NULL
|| strcmp ( lang
, "en" ) != 0 ) {
415 errln ( "Unfiltered input did not detect as English!" );
// Checks two English samples: one that includes \u201C / \u201D and is
// converted to windows-1252 must be detected as "windows-1252"; one without
// them, converted to ISO-8859-1, must be detected as "ISO-8859-1".
// The visible body sits after an #if !UCONFIG_NO_LEGACY_CONVERSION line.
// NOTE(review): the declaration of `name`, the conditions guarding the
// "no matches" messages, the matching #endif and the cleanup are missing from
// this extract; `name` is passed to strcmp() without a visible NULL check.
424 void CharsetDetectionTest :: C1BytesTest ()
426 #if !UCONFIG_NO_LEGACY_CONVERSION
427 UErrorCode status
= U_ZERO_ERROR
;
428 UnicodeString sISO
= "This is a small sample of some English text. Just enough to be sure that it detects correctly." ;
429 UnicodeString
ssWindows ( "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\ u201CC1 \\ u201D bytes." , - 1 , US_INV
);
430 UnicodeString sWindows
= ssWindows
. unescape ();
431 int32_t lISO
= 0 , lWindows
= 0 ;
432 char * bISO
= extractBytes ( sISO
, "ISO-8859-1" , lISO
);
433 char * bWindows
= extractBytes ( sWindows
, "windows-1252" , lWindows
);
434 UCharsetDetector
* csd
= ucsdet_open (& status
);
435 const UCharsetMatch
* match
;
// Sample with the curly quotes, encoded as windows-1252.
438 ucsdet_setText ( csd
, bWindows
, lWindows
, & status
);
439 match
= ucsdet_detect ( csd
, & status
);
442 errln ( "English test with C1 bytes got no matches." );
446 name
= ucsdet_getName ( match
, & status
);
448 if ( strcmp ( name
, "windows-1252" ) != 0 ) {
449 errln ( "English text with C1 bytes does not detect as windows-1252, but as %s " , name
);
// Sample without them, encoded as ISO-8859-1.
452 ucsdet_setText ( csd
, bISO
, lISO
, & status
);
453 match
= ucsdet_detect ( csd
, & status
);
456 errln ( "English text without C1 bytes got no matches." );
460 name
= ucsdet_getName ( match
, & status
);
462 if ( strcmp ( name
, "ISO-8859-1" ) != 0 ) {
463 errln ( "English text without C1 bytes does not detect as ISO-8859-1, but as %s " , name
);
// Data-driven test. Parses "csdetest.xml" (located through getPath()) with
// UXMLParser, and for every child element of the root whose tag is "test-case"
// reads its "id" and "encodings" attributes and its text, splits the encodings
// attribute on CH_SPACE and calls checkEncoding() once per entry.
// The visible body sits after an #if !UCONFIG_NO_REGULAR_EXPRESSIONS line.
// NOTE(review): this extract ends before the function does, and the
// declarations of `path` and `tc` are not visible.
474 void CharsetDetectionTest :: DetectionTest ()
476 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
477 UErrorCode status
= U_ZERO_ERROR
;
479 const char * testFilePath
= getPath ( path
, "csdetest.xml" );
481 if ( testFilePath
== NULL
) {
482 return ; /* Couldn't get path: error message already output. */
485 UXMLParser
* parser
= UXMLParser :: createParser ( status
);
486 if (! assertSuccess ( "UXMLParser::createParser" , status
)) return ;
487 UXMLElement
* root
= parser
-> parseFile ( testFilePath
, status
);
// NOTE(review): this early return does not release `parser` - it appears to
// leak on a parse failure; confirm.
488 if (! assertSuccess ( "parseFile" , status
)) return ;
490 UnicodeString test_case
= UNICODE_STRING_SIMPLE ( "test-case" );
491 UnicodeString id_attr
= UNICODE_STRING_SIMPLE ( "id" );
492 UnicodeString enc_attr
= UNICODE_STRING_SIMPLE ( "encodings" );
494 const UXMLElement
* testCase
;
// Iterate over the root's child elements; other tags are skipped.
497 while (( testCase
= root
-> nextChildElement ( tc
)) != NULL
) {
498 if ( testCase
-> getTagName (). compare ( test_case
) == 0 ) {
// NOTE(review): `id` and `encodings` are dereferenced below without a NULL
// check - confirm getAttribute() cannot return NULL for a test case that
// lacks one of these attributes.
499 const UnicodeString
* id
= testCase
-> getAttribute ( id_attr
);
500 const UnicodeString
* encodings
= testCase
-> getAttribute ( enc_attr
);
501 const UnicodeString text
= testCase
-> getText ( TRUE
);
502 int32_t encodingCount
;
503 UnicodeString
* encodingList
= split (* encodings
, CH_SPACE
, encodingCount
);
// One check per expected encoding listed for this test case.
505 for ( int32_t e
= 0 ; e
< encodingCount
; e
+= 1 ) {
506 checkEncoding ( text
, encodingList
[ e
], * id
);
// split() allocated the list with new[].
509 delete [] encodingList
;