/*
 * Source: git.saurik.com Git - apple/icu.git/blob - icuSources/test/cintltst/ucsdetst.c
 ****************************************************************************
 * Copyright (c) 2005-2009, International Business Machines Corporation and *
 * others. All Rights Reserved.                                             *
 ****************************************************************************
 */
8 #include "unicode/utypes.h"
10 #include "unicode/ucsdet.h"
11 #include "unicode/ucnv.h"
12 #include "unicode/ustring.h"
/* Number of elements in a true array: total size divided by the size of element 0. */
19 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
/*
 * Thin wrappers over malloc/free. NEW_ARRAY allocates room for count objects
 * of type and casts the result; the macro itself does not check for NULL.
 */
21 #define NEW_ARRAY(type,count) (type *) malloc((count) * sizeof(type))
22 #define DELETE_ARRAY(array) free(array)
/* Forward declarations of the file-local tests that addUCsdetTest registers. */
24 static void TestConstruction ( void );
25 static void TestUTF8 ( void );
26 static void TestUTF16 ( void );
27 static void TestC1Bytes ( void );
28 static void TestInputFilter ( void );
29 static void TestChaining ( void );
30 static void TestBufferOverflow ( void );
31 static void TestIBM424 ( void );
32 static void TestIBM420 ( void );
/* Prototype of the (non-static) registration entry point defined right below. */
34 void addUCsdetTest ( TestNode
** root
);
/*
 * Registers the charset-detector tests of this file with the test tree passed
 * in root, calling addTest once per test with a "ucsdetst/..." path.
 *
 * NOTE(review): the function's braces and the #endif that closes the #if
 * below are not present in this listing.
 */
36 void addUCsdetTest ( TestNode
** root
)
38 addTest ( root
, & TestConstruction
, "ucsdetst/TestConstruction" );
39 addTest ( root
, & TestUTF8
, "ucsdetst/TestUTF8" );
40 addTest ( root
, & TestUTF16
, "ucsdetst/TestUTF16" );
41 addTest ( root
, & TestC1Bytes
, "ucsdetst/TestC1Bytes" );
42 addTest ( root
, & TestInputFilter
, "ucsdetst/TestInputFilter" );
/* Registered under the path "TestErrorChaining", which differs from the function name TestChaining. */
43 addTest ( root
, & TestChaining
, "ucsdetst/TestErrorChaining" );
44 addTest ( root
, & TestBufferOverflow
, "ucsdetst/TestBufferOverflow" );
/* The two IBM (EBCDIC) tests are only registered when legacy conversion is compiled in. */
45 #if !UCONFIG_NO_LEGACY_CONVERSION
46 addTest ( root
, & TestIBM424
, "ucsdetst/TestIBM424" );
47 addTest ( root
, & TestIBM420
, "ucsdetst/TestIBM420" );
/*
 * Runs the UChar range [src, src + length) through cnv with
 * ucnv_fromUnicode and accumulates, in result, how many bytes each pass
 * wrote into a local scratch buffer.
 *
 * src    - start of the UTF-16 text to measure
 * length - number of UChars at src
 * cnv    - converter to measure with
 *
 * NOTE(review): the declarations of status, buffer and result, the head of
 * the do-loop (including where dest is reset) and the return statement are
 * not present in this listing; extractBytes treats the return value as the
 * total byte count.
 */
51 static int32_t preflight ( const UChar
* src
, int32_t length
, UConverter
* cnv
)
/* destLimit marks one past the end of the scratch buffer. */
55 char * dest
, * destLimit
= buffer
+ sizeof ( buffer
);
56 const UChar
* srcLimit
= src
+ length
;
/* Clear the previous pass's overflow status before converting again. */
61 status
= U_ZERO_ERROR
;
/* No offsets array is requested (0) and the flush argument is TRUE. */
62 ucnv_fromUnicode ( cnv
, & dest
, destLimit
, & src
, srcLimit
, 0 , TRUE
, & status
);
/* dest has advanced by the number of bytes produced in this pass. */
63 result
+= ( int32_t ) ( dest
- buffer
);
/* Keep going for as long as the scratch buffer was too small for the remaining input. */
64 } while ( status
== U_BUFFER_OVERFLOW_ERROR
);
/*
 * Converts the UChar range [src, src + length) to bytes in the named
 * codepage, using a buffer allocated with NEW_ARRAY.
 *
 * src        - start of the UTF-16 text
 * length     - number of UChars at src
 * codepage   - converter name handed to ucnv_open
 * byteLength - out: receives the byte count computed by preflight()
 *
 * NOTE(review): in the visible lines neither the status from ucnv_open nor
 * the NEW_ARRAY result is checked before use. The converter close and the
 * return statement are not present in this listing; callers use the return
 * value as the byte buffer.
 */
69 static char * extractBytes ( const UChar
* src
, int32_t length
, const char * codepage
, int32_t * byteLength
)
71 UErrorCode status
= U_ZERO_ERROR
;
72 UConverter
* cnv
= ucnv_open ( codepage
, & status
);
/* First pass: measure how many bytes the conversion will need. */
73 int32_t byteCount
= preflight ( src
, length
, cnv
);
74 const UChar
* srcLimit
= src
+ length
;
/* One byte more than the measured size is allocated. */
75 char * bytes
= NEW_ARRAY ( char , byteCount
+ 1 );
76 char * dest
= bytes
, * destLimit
= bytes
+ byteCount
+ 1 ;
/* Second pass: the real conversion into the allocated buffer. */
78 ucnv_fromUnicode ( cnv
, & dest
, destLimit
, & src
, srcLimit
, 0 , TRUE
, & status
);
/* Reports the preflighted count, not dest - bytes from the second pass. */
81 * byteLength
= byteCount
;
/*
 * Takes a byte buffer.
 * NOTE(review): the body is not present in this listing; presumably it
 * releases a buffer returned by extractBytes (e.g. via DELETE_ARRAY) -
 * confirm against the full file.
 */
85 static void freeBytes ( char * bytes
)
/*
 * Opens a detector, enumerates every detectable charset name and checks that
 * each name is non-NULL with a positive length, then checks that reading one
 * entry past the end yields NULL with length 0 and no failure status.
 *
 * NOTE(review): the declarations of name, i and length, the closing braces
 * and any cleanup of e/csd are not present in this listing.
 */
90 static void TestConstruction ( void )
92 UErrorCode status
= U_ZERO_ERROR
;
93 UCharsetDetector
* csd
= ucsdet_open (& status
);
94 UEnumeration
* e
= ucsdet_getAllDetectableCharsets ( csd
, & status
);
96 int32_t count
= uenum_count ( e
, & status
);
/* Walk exactly count entries; each must carry a usable name. */
99 for ( i
= 0 ; i
< count
; i
+= 1 ) {
100 name
= uenum_next ( e
, & length
, & status
);
102 if ( name
== NULL
|| length
<= 0 ) {
103 log_err ( "ucsdet_getAllDetectableCharsets() returned a null or empty name! \n " );
106 /* one past the list of all names must return NULL */
107 name
= uenum_next ( e
, & length
, & status
);
/* The message only mentions a non-null name, but a non-zero length or a failure status triggers it too. */
108 if ( name
!= NULL
|| length
!= 0 || U_FAILURE ( status
)) {
109 log_err ( "ucsdet_getAllDetectableCharsets(past the list) returned a non-null name! \n " );
/*
 * Round-trip test for UTF-8 detection: unescapes ss into UTF-16, converts it
 * to UTF-8 bytes with extractBytes, feeds the bytes to the detector, and
 * compares the UChars recovered from the match with the original text.
 *
 * NOTE(review): the declarations of s and bytes, the condition guarding the
 * "got no matches" message, the closing braces and the cleanup code are not
 * present in this listing.
 */
116 static void TestUTF8 ( void )
118 UErrorCode status
= U_ZERO_ERROR
;
/* Sample text; the backslash-u sequences are expanded at run time by u_unescape. */
119 static const char ss
[] = "This is a string with some non-ascii characters that will "
120 "be converted to UTF-8, then shoved through the detection process. "
121 " \\ u0391 \\ u0392 \\ u0393 \\ u0394 \\ u0395"
122 "Sure would be nice if our source could contain Unicode directly!" ;
123 int32_t byteLength
= 0 , sLength
= 0 , dLength
= 0 ;
126 UCharsetDetector
* csd
= ucsdet_open (& status
);
127 const UCharsetMatch
* match
;
/* Sized by the escaped source string's size. */
128 UChar detected
[ sizeof ( ss
)];
/* sizeof(ss) is passed as the destination capacity for s. */
130 sLength
= u_unescape ( ss
, s
, sizeof ( ss
));
131 bytes
= extractBytes ( s
, sLength
, "UTF-8" , & byteLength
);
133 ucsdet_setText ( csd
, bytes
, byteLength
, & status
);
134 if ( U_FAILURE ( status
)) {
135 log_err ( "status is %s \n " , u_errorName ( status
));
139 match
= ucsdet_detect ( csd
, & status
);
142 log_err ( "Detection failure for UTF-8: got no matches. \n " );
/* The capacity given here is sLength (the original text's length), not the size of detected. */
146 dLength
= ucsdet_getUChars ( match
, detected
, sLength
, & status
);
/* FALSE: compare in code unit order rather than code point order. */
148 if ( u_strCompare ( detected
, dLength
, s
, sLength
, FALSE
) != 0 ) {
149 log_err ( "Round-trip test failed! \n " );
152 ucsdet_setDeclaredEncoding ( csd
, "UTF-8" , 5 , & status
); /* for coverage */
/*
 * Converts a BOM-prefixed UTF-16 sample to UTF-16BE and UTF-16LE bytes and
 * checks that the detector names each encoding correctly; the messages also
 * expect a confidence of 100 for both.
 *
 * NOTE(review): the declarations of name and conf, the conditions guarding
 * the "got no matches" and "100% confidence" messages, the closing braces and
 * the cleanup code are not present in this listing.
 */
159 static void TestUTF16 ( void )
161 UErrorCode status
= U_ZERO_ERROR
;
162 /* Notice the BOM on the start of this string */
163 static const UChar chars
[] = {
164 0xFEFF , 0x0623 , 0x0648 , 0x0631 , 0x0648 , 0x0628 , 0x0627 , 0x002C ,
165 0x0020 , 0x0628 , 0x0631 , 0x0645 , 0x062c , 0x064a , 0x0627 , 0x062a ,
166 0x0020 , 0x0627 , 0x0644 , 0x062d , 0x0627 , 0x0633 , 0x0648 , 0x0628 ,
167 0x0020 , 0x002b , 0x0020 , 0x0627 , 0x0646 , 0x062a , 0x0631 , 0x0646 ,
168 0x064a , 0x062a , 0x0000 };
/* cLength counts every array element, so it includes the trailing 0x0000. */
169 int32_t beLength
= 0 , leLength
= 0 , cLength
= ARRAY_SIZE ( chars
);
170 char * beBytes
= extractBytes ( chars
, cLength
, "UTF-16BE" , & beLength
);
171 char * leBytes
= extractBytes ( chars
, cLength
, "UTF-16LE" , & leLength
);
172 UCharsetDetector
* csd
= ucsdet_open (& status
);
173 const UCharsetMatch
* match
;
/* Big-endian pass. */
177 ucsdet_setText ( csd
, beBytes
, beLength
, & status
);
178 match
= ucsdet_detect ( csd
, & status
);
181 log_err ( "Encoding detection failure for UTF-16BE: got no matches. \n " );
185 name
= ucsdet_getName ( match
, & status
);
186 conf
= ucsdet_getConfidence ( match
, & status
);
188 if ( strcmp ( name
, "UTF-16BE" ) != 0 ) {
189 log_err ( "Encoding detection failure for UTF-16BE: got %s \n " , name
);
193 log_err ( "Did not get 100%% confidence for UTF-16BE: got %d \n " , conf
);
/* Little-endian pass, reusing the same detector. */
197 ucsdet_setText ( csd
, leBytes
, leLength
, & status
);
198 match
= ucsdet_detect ( csd
, & status
);
201 log_err ( "Encoding detection failure for UTF-16LE: got no matches. \n " );
205 name
= ucsdet_getName ( match
, & status
);
206 conf
= ucsdet_getConfidence ( match
, & status
);
209 if ( strcmp ( name
, "UTF-16LE" ) != 0 ) {
/* NOTE(review): "Enconding" in the message below is a typo for "Encoding" (the UTF-16BE message spells it correctly). */
210 log_err ( "Enconding detection failure for UTF-16LE: got %s \n " , name
);
214 log_err ( "Did not get 100%% confidence for UTF-16LE: got %d \n " , conf
);
/*
 * Checks that English text whose only non-ASCII characters are U+201C/U+201D
 * (the "C1 bytes" of the messages) is detected as windows-1252 once encoded
 * in that codepage, and that plain English text encoded as ISO-8859-1 is
 * detected as ISO-8859-1. The body is compiled only when legacy conversion
 * is available.
 *
 * NOTE(review): the declarations of bISO, bWindows and name, the conditions
 * guarding the "got no matches" messages, the closing braces, the cleanup
 * code and the matching #endif are not present in this listing.
 */
223 static void TestC1Bytes ( void )
225 #if !UCONFIG_NO_LEGACY_CONVERSION
226 UErrorCode status
= U_ZERO_ERROR
;
227 static const char ssISO
[] = "This is a small sample of some English text. Just enough to be sure that it detects correctly." ;
228 static const char ssWindows
[] = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\ u201CC1 \\ u201D bytes." ;
229 int32_t sISOLength
= 0 , sWindowsLength
= 0 ;
230 UChar sISO
[ sizeof ( ssISO
)];
231 UChar sWindows
[ sizeof ( ssWindows
)];
232 int32_t lISO
= 0 , lWindows
= 0 ;
235 UCharsetDetector
* csd
= ucsdet_open (& status
);
236 const UCharsetMatch
* match
;
/* Expand the escapes to UTF-16, then encode each sample in its target codepage. */
239 sISOLength
= u_unescape ( ssISO
, sISO
, sizeof ( ssISO
));
240 sWindowsLength
= u_unescape ( ssWindows
, sWindows
, sizeof ( ssWindows
));
241 bISO
= extractBytes ( sISO
, sISOLength
, "ISO-8859-1" , & lISO
);
242 bWindows
= extractBytes ( sWindows
, sWindowsLength
, "windows-1252" , & lWindows
);
/* windows-1252 sample first. */
244 ucsdet_setText ( csd
, bWindows
, lWindows
, & status
);
245 match
= ucsdet_detect ( csd
, & status
);
248 log_err ( "English test with C1 bytes got no matches. \n " );
252 name
= ucsdet_getName ( match
, & status
);
/* A mismatch here is reported through log_data_err, unlike the ISO-8859-1 case below which uses log_err. */
254 if ( strcmp ( name
, "windows-1252" ) != 0 ) {
255 log_data_err ( "English text with C1 bytes does not detect as windows-1252, but as %s . (Are you missing data?) \n " , name
);
/* ISO-8859-1 sample, same detector. */
258 ucsdet_setText ( csd
, bISO
, lISO
, & status
);
259 match
= ucsdet_detect ( csd
, & status
);
262 log_err ( "English text without C1 bytes got no matches. \n " );
266 name
= ucsdet_getName ( match
, & status
);
268 if ( strcmp ( name
, "ISO-8859-1" ) != 0 ) {
269 log_err ( "English text without C1 bytes does not detect as ISO-8859-1, but as %s \n " , name
);
/*
 * Exercises the detector's input filter on a sample made of English-looking
 * markup tags around a short French sentence, encoded as ISO-8859-1. With the
 * filter enabled the test expects ISO-8859-1 / language "fr"; with the filter
 * disabled it expects ISO-8859-1 / language "en".
 *
 * NOTE(review): the declarations of s, sLength and bytes, the conditions
 * guarding the "no matches" messages, the closing braces and the cleanup code
 * are not present in this listing.
 */
280 static void TestInputFilter ( void )
282 UErrorCode status
= U_ZERO_ERROR
;
283 static const char ss
[] = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr \\ u00E8s petit peu de Fran \\ u00E7ais. <to> <confuse> <the> <detector>" ;
286 int32_t byteLength
= 0 ;
288 UCharsetDetector
* csd
= ucsdet_open (& status
);
289 const UCharsetMatch
* match
;
290 const char * lang
, * name
;
292 sLength
= u_unescape ( ss
, s
, sizeof ( ss
));
293 bytes
= extractBytes ( s
, sLength
, "ISO-8859-1" , & byteLength
);
/* Pass 1: filter on. Verify the setter took effect before detecting. */
295 ucsdet_enableInputFilter ( csd
, TRUE
);
297 if (! ucsdet_isInputFilterEnabled ( csd
)) {
298 log_err ( "ucsdet_enableInputFilter(csd, TRUE) did not enable input filter! \n " );
302 ucsdet_setText ( csd
, bytes
, byteLength
, & status
);
303 match
= ucsdet_detect ( csd
, & status
);
306 log_err ( "Turning on the input filter resulted in no matches. \n " );
310 name
= ucsdet_getName ( match
, & status
);
/* NOTE(review): when name is NULL this branch is taken and NULL is passed for the %s argument below. */
312 if ( name
== NULL
|| strcmp ( name
, "ISO-8859-1" ) != 0 ) {
313 log_err ( "Turning on the input filter resulted in %s rather than ISO-8859-1 \n " , name
);
315 lang
= ucsdet_getLanguage ( match
, & status
);
/* With the tags filtered out, the language is expected to be "fr". */
317 if ( lang
== NULL
|| strcmp ( lang
, "fr" ) != 0 ) {
318 log_err ( "Input filter did not strip markup! \n " );
/* Pass 2: filter off, same bytes. */
323 ucsdet_enableInputFilter ( csd
, FALSE
);
324 ucsdet_setText ( csd
, bytes
, byteLength
, & status
);
325 match
= ucsdet_detect ( csd
, & status
);
328 log_err ( "Turning off the input filter resulted in no matches. \n " );
332 name
= ucsdet_getName ( match
, & status
);
/* NOTE(review): same NULL-to-%s situation as in the filtered pass. */
334 if ( name
== NULL
|| strcmp ( name
, "ISO-8859-1" ) != 0 ) {
335 log_err ( "Turning off the input filter resulted in %s rather than ISO-8859-1 \n " , name
);
337 lang
= ucsdet_getLanguage ( match
, & status
);
/* With the tags left in, the language is expected to be "en". */
339 if ( lang
== NULL
|| strcmp ( lang
, "en" ) != 0 ) {
340 log_err ( "Unfiltered input did not detect as English! \n " );
/*
 * Calls the ucsdet_* entry points with status already set to
 * U_USELESS_COLLATOR_ERROR (and NULL for the other pointer arguments), then
 * verifies that status still holds that same value afterwards.
 *
 * NOTE(review): the closing braces and any lines between the last call and
 * the final check are not present in this listing.
 */
349 static void TestChaining ( void ) {
350 UErrorCode status
= U_USELESS_COLLATOR_ERROR
;
/* The return value of ucsdet_open is discarded here. */
352 ucsdet_open (& status
);
353 ucsdet_setText ( NULL
, NULL
, 0 , & status
);
354 ucsdet_getName ( NULL
, & status
);
355 ucsdet_getConfidence ( NULL
, & status
);
356 ucsdet_getLanguage ( NULL
, & status
);
357 ucsdet_detect ( NULL
, & status
);
358 ucsdet_setDeclaredEncoding ( NULL
, NULL
, 0 , & status
);
359 ucsdet_detectAll ( NULL
, NULL
, & status
);
/* NOTE(review): ucsdet_getUChars is called twice with identical arguments. */
360 ucsdet_getUChars ( NULL
, NULL
, 0 , & status
);
361 ucsdet_getUChars ( NULL
, NULL
, 0 , & status
);
364 /* All of this code should have done nothing. */
365 if ( status
!= U_USELESS_COLLATOR_ERROR
) {
366 log_err ( "Status got changed to %s \n " , u_errorName ( status
));
/*
 * Feeds the detector a series of short byte strings that end in partial or
 * complete ISO-2022 escape sequences or in a lone high byte, with
 * "ISO-2022-JP" set as the declared encoding, and compares the detected name
 * for each string with the entry at the same index in testResults (a NULL
 * entry means no match is expected).
 *
 * NOTE(review): the end of testStrings, the whole contents of testResults,
 * the declaration of idx, the condition that separates the "no match" branch
 * from the "match" branch, the closing braces and the cleanup code are not
 * present in this listing.
 */
370 static void TestBufferOverflow ( void ) {
371 UErrorCode status
= U_ZERO_ERROR
;
372 static const char * testStrings
[] = {
373 " \x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b " , /* A partial ISO-2022 shift state at the end */
374 " \x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24 " , /* A partial ISO-2022 shift state at the end */
375 " \x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28 " , /* A partial ISO-2022 shift state at the end */
376 " \x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28\x44 " , /* A complete ISO-2022 shift state at the end with a bad one at the start */
377 " \x1b\x24\x28\x44 " , /* A complete ISO-2022 shift state at the end */
378 " \xa1 " , /* Could be a single byte shift-jis at the end */
379 " \x74\x68\xa1 " , /* Could be a single byte shift-jis at the end */
380 " \x74\x68\x65\xa1 " /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
/* Expected detector name per test string, indexed in parallel with testStrings. */
382 static const char * testResults
[] = {
393 UCharsetDetector
* csd
= ucsdet_open (& status
);
394 const UCharsetMatch
* match
;
/* A length of -1 is passed for the declared encoding name. */
396 ucsdet_setDeclaredEncoding ( csd
, "ISO-2022-JP" , - 1 , & status
);
/* This check covers both ucsdet_open and ucsdet_setDeclaredEncoding, although the message only mentions opening. */
398 if ( U_FAILURE ( status
)) {
399 log_err ( "Couldn't open detector. %s \n " , u_errorName ( status
));
403 for ( idx
= 0 ; idx
< ARRAY_SIZE ( testStrings
); idx
++) {
/* A length of -1 is passed for the text as well. */
404 ucsdet_setText ( csd
, testStrings
[ idx
], - 1 , & status
);
405 match
= ucsdet_detect ( csd
, & status
);
/* No-match branch: only an error when a result was expected for this index. */
408 if ( testResults
[ idx
] != NULL
) {
409 log_err ( "Unexpectedly got no results at index %d . \n " , idx
);
412 log_verbose ( "Got no result as expected at index %d . \n " , idx
);
/* Match branch: an error when no result was expected or when the name differs. */
417 if ( testResults
[ idx
] == NULL
|| strcmp ( ucsdet_getName ( match
, & status
), testResults
[ idx
]) != 0 ) {
418 log_err ( "Unexpectedly got %s instead of %s at index %d with confidence %d . \n " ,
419 ucsdet_getName ( match
, & status
), testResults
[ idx
], idx
, ucsdet_getConfidence ( match
, & status
));
/*
 * Encodes two Hebrew samples as IBM424 and checks the detector's answer:
 * chars is expected to be reported as "IBM424_rtl" and chars_reverse as
 * "IBM424_ltr". Name mismatches are reported through log_data_err.
 *
 * NOTE(review): the closing of both array initialisers, the declaration of
 * name, the conditions guarding the "got no matches" messages, the closing
 * braces and the cleanup code are not present in this listing.
 */
428 static void TestIBM424 ( void )
430 UErrorCode status
= U_ZERO_ERROR
;
/* Sample text in U+05xx code units; the last visible element is 0x0000. */
432 static const UChar chars
[] = {
433 0x05D4 , 0x05E4 , 0x05E8 , 0x05E7 , 0x05DC , 0x05D9 , 0x05D8 , 0x0020 , 0x05D4 , 0x05E6 , 0x05D1 , 0x05D0 , 0x05D9 , 0x0020 , 0x05D4 , 0x05E8 ,
434 0x05D0 , 0x05E9 , 0x05D9 , 0x002C , 0x0020 , 0x05EA , 0x05EA , 0x0020 , 0x05D0 , 0x05DC , 0x05D5 , 0x05E3 , 0x0020 , 0x05D0 , 0x05D1 , 0x05D9 ,
435 0x05D7 , 0x05D9 , 0x0020 , 0x05DE , 0x05E0 , 0x05D3 , 0x05DC , 0x05D1 , 0x05DC , 0x05D9 , 0x05D8 , 0x002C , 0x0020 , 0x05D4 , 0x05D5 , 0x05E8 ,
436 0x05D4 , 0x0020 , 0x05E2 , 0x05DC , 0x0020 , 0x05E4 , 0x05EA , 0x05D9 , 0x05D7 , 0x05EA , 0x0020 , 0x05D7 , 0x05E7 , 0x05D9 , 0x05E8 , 0x05EA ,
437 0x0020 , 0x05DE , 0x05E6 , 0x0022 , 0x05D7 , 0x0020 , 0x05D1 , 0x05E2 , 0x05E7 , 0x05D1 , 0x05D5 , 0x05EA , 0x0020 , 0x05E2 , 0x05D3 , 0x05D5 ,
438 0x05D9 , 0x05D5 , 0x05EA , 0x0020 , 0x05D7 , 0x05D9 , 0x05D9 , 0x05DC , 0x05D9 , 0x0020 , 0x05E6 , 0x05D4 , 0x0022 , 0x05DC , 0x0020 , 0x05DE ,
439 0x05DE , 0x05D1 , 0x05E6 , 0x05E2 , 0x0020 , 0x05E2 , 0x05D5 , 0x05E4 , 0x05E8 , 0x05EA , 0x0020 , 0x05D9 , 0x05E6 , 0x05D5 , 0x05E7 , 0x05D4 ,
440 0x0020 , 0x05D1 , 0x002B , 0x0020 , 0x05E8 , 0x05E6 , 0x05D5 , 0x05E2 , 0x05EA , 0x0020 , 0x05E2 , 0x05D6 , 0x05D4 , 0x002E , 0x0020 , 0x05DC ,
441 0x05D3 , 0x05D1 , 0x05E8 , 0x05D9 , 0x0020 , 0x05D4 , 0x05E4 , 0x05E6 , 0x0022 , 0x05E8 , 0x002C , 0x0020 , 0x05DE , 0x05D4 , 0x05E2 , 0x05D3 ,
442 0x05D5 , 0x05D9 , 0x05D5 , 0x05EA , 0x0020 , 0x05E2 , 0x05D5 , 0x05DC , 0x05D4 , 0x0020 , 0x05EA , 0x05DE , 0x05D5 , 0x05E0 , 0x05D4 , 0x0020 ,
443 0x05E9 , 0x05DC , 0x0020 , 0x0022 , 0x05D4 , 0x05EA , 0x05E0 , 0x05D4 , 0x05D2 , 0x05D5 , 0x05EA , 0x0020 , 0x05E4 , 0x05E1 , 0x05D5 , 0x05DC ,
444 0x05D4 , 0x0020 , 0x05DC , 0x05DB , 0x05D0 , 0x05D5 , 0x05E8 , 0x05D4 , 0x0020 , 0x05E9 , 0x05DC , 0x0020 , 0x05D7 , 0x05D9 , 0x05D9 , 0x05DC ,
445 0x05D9 , 0x05DD , 0x0020 , 0x05D1 , 0x05DE , 0x05D4 , 0x05DC , 0x05DA , 0x0020 , 0x05DE , 0x05D1 , 0x05E6 , 0x05E2 , 0x0020 , 0x05E2 , 0x05D5 ,
446 0x05E4 , 0x05E8 , 0x05EA , 0x0020 , 0x05D9 , 0x05E6 , 0x05D5 , 0x05E7 , 0x05D4 , 0x0022 , 0x002E , 0x0020 , 0x05DE , 0x05E0 , 0x05D3 , 0x05DC ,
447 0x05D1 , 0x05DC , 0x05D9 , 0x05D8 , 0x0020 , 0x05E7 , 0x05D9 , 0x05D1 , 0x05DC , 0x0020 , 0x05D0 , 0x05EA , 0x0020 , 0x05D4 , 0x05D7 , 0x05DC ,
448 0x05D8 , 0x05EA , 0x05D5 , 0x0020 , 0x05DC , 0x05D0 , 0x05D7 , 0x05E8 , 0x0020 , 0x05E9 , 0x05E2 , 0x05D9 , 0x05D9 , 0x05DF , 0x0020 , 0x05D1 ,
449 0x05EA , 0x05DE , 0x05DC , 0x05D9 , 0x05DC , 0x0020 , 0x05D4 , 0x05E2 , 0x05D3 , 0x05D5 , 0x05D9 , 0x05D5 , 0x05EA , 0x0000
/* Begins with the final code units of chars in reverse order; used for the "_ltr" expectation below. */
452 static const UChar chars_reverse
[] = {
453 0x05EA , 0x05D5 , 0x05D9 , 0x05D5 , 0x05D3 , 0x05E2 , 0x05D4 , 0x0020 , 0x05DC , 0x05D9 , 0x05DC , 0x05DE , 0x05EA ,
454 0x05D1 , 0x0020 , 0x05DF , 0x05D9 , 0x05D9 , 0x05E2 , 0x05E9 , 0x0020 , 0x05E8 , 0x05D7 , 0x05D0 , 0x05DC , 0x0020 , 0x05D5 , 0x05EA , 0x05D8 ,
455 0x05DC , 0x05D7 , 0x05D4 , 0x0020 , 0x05EA , 0x05D0 , 0x0020 , 0x05DC , 0x05D1 , 0x05D9 , 0x05E7 , 0x0020 , 0x05D8 , 0x05D9 , 0x05DC , 0x05D1 ,
456 0x05DC , 0x05D3 , 0x05E0 , 0x05DE , 0x0020 , 0x002E , 0x0022 , 0x05D4 , 0x05E7 , 0x05D5 , 0x05E6 , 0x05D9 , 0x0020 , 0x05EA , 0x05E8 , 0x05E4 ,
457 0x05D5 , 0x05E2 , 0x0020 , 0x05E2 , 0x05E6 , 0x05D1 , 0x05DE , 0x0020 , 0x05DA , 0x05DC , 0x05D4 , 0x05DE , 0x05D1 , 0x0020 , 0x05DD , 0x05D9 ,
458 0x05DC , 0x05D9 , 0x05D9 , 0x05D7 , 0x0020 , 0x05DC , 0x05E9 , 0x0020 , 0x05D4 , 0x05E8 , 0x05D5 , 0x05D0 , 0x05DB , 0x05DC , 0x0020 , 0x05D4 ,
459 0x05DC , 0x05D5 , 0x05E1 , 0x05E4 , 0x0020 , 0x05EA , 0x05D5 , 0x05D2 , 0x05D4 , 0x05E0 , 0x05EA , 0x05D4 , 0x0022 , 0x0020 , 0x05DC , 0x05E9 ,
460 0x0020 , 0x05D4 , 0x05E0 , 0x05D5 , 0x05DE , 0x05EA , 0x0020 , 0x05D4 , 0x05DC , 0x05D5 , 0x05E2 , 0x0020 , 0x05EA , 0x05D5 , 0x05D9 , 0x05D5 ,
461 0x05D3 , 0x05E2 , 0x05D4 , 0x05DE , 0x0020 , 0x002C , 0x05E8 , 0x0022 , 0x05E6 , 0x05E4 , 0x05D4 , 0x0020 , 0x05D9 , 0x05E8 , 0x05D1 , 0x05D3 ,
462 0x05DC , 0x0020 , 0x002E , 0x05D4 , 0x05D6 , 0x05E2 , 0x0020 , 0x05EA , 0x05E2 , 0x05D5 , 0x05E6 , 0x05E8 , 0x0020 , 0x002B , 0x05D1 , 0x0020 ,
463 0x05D4 , 0x05E7 , 0x05D5 , 0x05E6 , 0x05D9 , 0x0020 , 0x05EA , 0x05E8 , 0x05E4 , 0x05D5 , 0x05E2 , 0x0020 , 0x05E2 , 0x05E6 , 0x05D1 , 0x05DE ,
464 0x05DE , 0x0020 , 0x05DC , 0x0022 , 0x05D4 , 0x05E6 , 0x0020 , 0x05D9 , 0x05DC , 0x05D9 , 0x05D9 , 0x05D7 , 0x0020 , 0x05EA , 0x05D5 , 0x05D9 ,
465 0x05D5 , 0x05D3 , 0x05E2 , 0x0020 , 0x05EA , 0x05D5 , 0x05D1 , 0x05E7 , 0x05E2 , 0x05D1 , 0x0020 , 0x05D7 , 0x0022 , 0x05E6 , 0x05DE , 0x0020 ,
466 0x05EA , 0x05E8 , 0x05D9 , 0x05E7 , 0x05D7 , 0x0020 , 0x05EA , 0x05D7 , 0x05D9 , 0x05EA , 0x05E4 , 0x0020 , 0x05DC , 0x05E2 , 0x0020 , 0x05D4 ,
467 0x05E8 , 0x05D5 , 0x05D4 , 0x0020 , 0x002C , 0x05D8 , 0x05D9 , 0x05DC , 0x05D1 , 0x05DC , 0x05D3 , 0x05E0 , 0x05DE , 0x0020 , 0x05D9 , 0x05D7 ,
468 0x05D9 , 0x05D1 , 0x05D0 , 0x0020 , 0x05E3 , 0x05D5 , 0x05DC , 0x05D0 , 0x0020 , 0x05EA , 0x05EA , 0x0020 , 0x002C , 0x05D9 , 0x05E9 , 0x05D0 ,
469 0x05E8 , 0x05D4 , 0x0020 , 0x05D9 , 0x05D0 , 0x05D1 , 0x05E6 , 0x05D4 , 0x0020 , 0x05D8 , 0x05D9 , 0x05DC , 0x05E7 , 0x05E8 , 0x05E4 , 0x05D4 ,
/* Both lengths are element counts of the full arrays, so any terminating element is included. */
473 int32_t bLength
= 0 , brLength
= 0 , cLength
= ARRAY_SIZE ( chars
), crLength
= ARRAY_SIZE ( chars_reverse
);
475 char * bytes
= extractBytes ( chars
, cLength
, "IBM424" , & bLength
);
476 char * bytes_r
= extractBytes ( chars_reverse
, crLength
, "IBM424" , & brLength
);
478 UCharsetDetector
* csd
= ucsdet_open (& status
);
479 const UCharsetMatch
* match
;
/* Pass 1: the sample in its original order, expected to be named "IBM424_rtl". */
482 ucsdet_setText ( csd
, bytes
, bLength
, & status
);
483 match
= ucsdet_detect ( csd
, & status
);
486 log_err ( "Encoding detection failure for IBM424_rtl: got no matches. \n " );
490 name
= ucsdet_getName ( match
, & status
);
491 if ( strcmp ( name
, "IBM424_rtl" ) != 0 ) {
492 log_data_err ( "Encoding detection failure for IBM424_rtl: got %s . (Are you missing data?) \n " , name
);
/* Pass 2: the reversed sample, expected to be named "IBM424_ltr". */
495 ucsdet_setText ( csd
, bytes_r
, brLength
, & status
);
496 match
= ucsdet_detect ( csd
, & status
);
499 log_err ( "Encoding detection failure for IBM424_ltr: got no matches. \n " );
503 name
= ucsdet_getName ( match
, & status
);
504 if ( strcmp ( name
, "IBM424_ltr" ) != 0 ) {
505 log_data_err ( "Encoding detection failure for IBM424_ltr: got %s . (Are you missing data?) \n " , name
);
514 static void TestIBM420 ( void )
516 UErrorCode status
= U_ZERO_ERROR
;
518 static const UChar chars
[] = {
519 0x0648 , 0x064F , 0x0636 , 0x0639 , 0x062A , 0x0020 , 0x0648 , 0x0646 , 0x064F , 0x0641 , 0x0630 , 0x062A , 0x0020 , 0x0628 , 0x0631 , 0x0627 ,
520 0x0645 , 0x062C , 0x0020 , 0x062A , 0x0623 , 0x0645 , 0x064A , 0x0646 , 0x0020 , 0x0639 , 0x062F , 0x064A , 0x062F , 0x0629 , 0x0020 , 0x0641 ,
521 0x064A , 0x0020 , 0x0645 , 0x0624 , 0x0633 , 0x0633 , 0x0629 , 0x0020 , 0x0627 , 0x0644 , 0x062A , 0x0623 , 0x0645 , 0x064A , 0x0646 , 0x0020 ,
522 0x0627 , 0x0644 , 0x0648 , 0x0637 , 0x0646 , 0x064A , 0x002C , 0x0020 , 0x0645 , 0x0639 , 0x0020 , 0x0645 , 0x0644 , 0x0627 , 0x0626 , 0x0645 ,
523 0x062A , 0x0647 , 0x0627 , 0x0020 , 0x062F , 0x0627 , 0x0626 , 0x0645 , 0x0627 , 0x064B , 0x0020 , 0x0644 , 0x0644 , 0x0627 , 0x062D , 0x062A ,
524 0x064A , 0x0627 , 0x062C , 0x0627 , 0x062A , 0x0020 , 0x0627 , 0x0644 , 0x0645 , 0x062A , 0x063A , 0x064A , 0x0631 , 0x0629 , 0x0020 , 0x0644 ,
525 0x0644 , 0x0645 , 0x062C , 0x062A , 0x0645 , 0x0639 , 0x0020 , 0x0648 , 0x0644 , 0x0644 , 0x062F , 0x0648 , 0x0644 , 0x0629 , 0x002E , 0x0020 ,
526 0x062A , 0x0648 , 0x0633 , 0x0639 , 0x062A , 0x0020 , 0x0648 , 0x062A , 0x0637 , 0x0648 , 0x0631 , 0x062A , 0x0020 , 0x0627 , 0x0644 , 0x0645 ,
527 0x0624 , 0x0633 , 0x0633 , 0x0629 , 0x0020 , 0x0628 , 0x0647 , 0x062F , 0x0641 , 0x0020 , 0x0636 , 0x0645 , 0x0627 , 0x0646 , 0x0020 , 0x0634 ,
528 0x0628 , 0x0643 , 0x0629 , 0x0020 , 0x0623 , 0x0645 , 0x0627 , 0x0646 , 0x0020 , 0x0644 , 0x0633 , 0x0643 , 0x0627 , 0x0646 , 0x0020 , 0x062F ,
529 0x0648 , 0x0644 , 0x0629 , 0x0020 , 0x0627 , 0x0633 , 0x0631 , 0x0627 , 0x0626 , 0x064A , 0x0644 , 0x0020 , 0x0628 , 0x0648 , 0x062C , 0x0647 ,
530 0x0020 , 0x0627 , 0x0644 , 0x0645 , 0x062E , 0x0627 , 0x0637 , 0x0631 , 0x0020 , 0x0627 , 0x0644 , 0x0627 , 0x0642 , 0x062A , 0x0635 , 0x0627 ,
531 0x062F , 0x064A , 0x0629 , 0x0020 , 0x0648 , 0x0627 , 0x0644 , 0x0627 , 0x062C , 0x062A , 0x0645 , 0x0627 , 0x0639 , 0x064A , 0x0629 , 0x002E ,
534 static const UChar chars_reverse
[] = {
535 0x002E , 0x0629 , 0x064A , 0x0639 , 0x0627 , 0x0645 , 0x062A , 0x062C , 0x0627 , 0x0644 , 0x0627 , 0x0648 , 0x0020 , 0x0629 , 0x064A , 0x062F ,
536 0x0627 , 0x0635 , 0x062A , 0x0642 , 0x0627 , 0x0644 , 0x0627 , 0x0020 , 0x0631 , 0x0637 , 0x0627 , 0x062E , 0x0645 , 0x0644 , 0x0627 , 0x0020 ,
537 0x0647 , 0x062C , 0x0648 , 0x0628 , 0x0020 , 0x0644 , 0x064A , 0x0626 , 0x0627 , 0x0631 , 0x0633 , 0x0627 , 0x0020 , 0x0629 , 0x0644 , 0x0648 ,
538 0x062F , 0x0020 , 0x0646 , 0x0627 , 0x0643 , 0x0633 , 0x0644 , 0x0020 , 0x0646 , 0x0627 , 0x0645 , 0x0623 , 0x0020 , 0x0629 , 0x0643 , 0x0628 ,
539 0x0634 , 0x0020 , 0x0646 , 0x0627 , 0x0645 , 0x0636 , 0x0020 , 0x0641 , 0x062F , 0x0647 , 0x0628 , 0x0020 , 0x0629 , 0x0633 , 0x0633 , 0x0624 ,
540 0x0645 , 0x0644 , 0x0627 , 0x0020 , 0x062A , 0x0631 , 0x0648 , 0x0637 , 0x062A , 0x0648 , 0x0020 , 0x062A , 0x0639 , 0x0633 , 0x0648 , 0x062A ,
541 0x0020 , 0x002E , 0x0629 , 0x0644 , 0x0648 , 0x062F , 0x0644 , 0x0644 , 0x0648 , 0x0020 , 0x0639 , 0x0645 , 0x062A , 0x062C , 0x0645 , 0x0644 ,
542 0x0644 , 0x0020 , 0x0629 , 0x0631 , 0x064A , 0x063A , 0x062A , 0x0645 , 0x0644 , 0x0627 , 0x0020 , 0x062A , 0x0627 , 0x062C , 0x0627 , 0x064A ,
543 0x062A , 0x062D , 0x0627 , 0x0644 , 0x0644 , 0x0020 , 0x064B , 0x0627 , 0x0645 , 0x0626 , 0x0627 , 0x062F , 0x0020 , 0x0627 , 0x0647 , 0x062A ,
544 0x0645 , 0x0626 , 0x0627 , 0x0644 , 0x0645 , 0x0020 , 0x0639 , 0x0645 , 0x0020 , 0x002C , 0x064A , 0x0646 , 0x0637 , 0x0648 , 0x0644 , 0x0627 ,
545 0x0020 , 0x0646 , 0x064A , 0x0645 , 0x0623 , 0x062A , 0x0644 , 0x0627 , 0x0020 , 0x0629 , 0x0633 , 0x0633 , 0x0624 , 0x0645 , 0x0020 , 0x064A ,
546 0x0641 , 0x0020 , 0x0629 , 0x062F , 0x064A , 0x062F , 0x0639 , 0x0020 , 0x0646 , 0x064A , 0x0645 , 0x0623 , 0x062A , 0x0020 , 0x062C , 0x0645 ,
547 0x0627 , 0x0631 , 0x0628 , 0x0020 , 0x062A , 0x0630 , 0x0641 , 0x064F , 0x0646 , 0x0648 , 0x0020 , 0x062A , 0x0639 , 0x0636 , 0x064F , 0x0648 ,
551 int32_t bLength
= 0 , brLength
= 0 , cLength
= ARRAY_SIZE ( chars
), crLength
= ARRAY_SIZE ( chars_reverse
);
553 char * bytes
= extractBytes ( chars
, cLength
, "IBM420" , & bLength
);
554 char * bytes_r
= extractBytes ( chars_reverse
, crLength
, "IBM420" , & brLength
);
556 UCharsetDetector
* csd
= ucsdet_open (& status
);
557 const UCharsetMatch
* match
;
560 ucsdet_setText ( csd
, bytes
, bLength
, & status
);
561 match
= ucsdet_detect ( csd
, & status
);
564 log_err ( "Encoding detection failure for IBM420_rtl: got no matches. \n " );
568 name
= ucsdet_getName ( match
, & status
);
569 if ( strcmp ( name
, "IBM420_rtl" ) != 0 ) {
570 log_data_err ( "Encoding detection failure for IBM420_rtl: got %s . (Are you missing data?) \n " , name
);
573 ucsdet_setText ( csd
, bytes_r
, brLength
, & status
);
574 match
= ucsdet_detect ( csd
, & status
);
577 log_err ( "Encoding detection failure for IBM420_ltr: got no matches. \n " );
581 name
= ucsdet_getName ( match
, & status
);
582 if ( strcmp ( name
, "IBM420_ltr" ) != 0 ) {
583 log_data_err ( "Encoding detection failure for IBM420_ltr: got %s . (Are you missing data?) \n " , name
);