]>
git.saurik.com Git - apple/icu.git/blob - icuSources/test/cintltst/ucsdetst.c
2 ****************************************************************************
3 * Copyright (c) 2005-2016, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 ****************************************************************************
8 #include "unicode/utypes.h"
10 #include "unicode/ucsdet.h"
11 #include "unicode/ucnv.h"
12 #include "unicode/ustring.h"
20 #define NEW_ARRAY(type,count) (type *) malloc((count) * sizeof(type))
21 #define DELETE_ARRAY(array) free(array)
23 static void TestConstruction ( void );
24 static void TestUTF8 ( void );
25 static void TestUTF16 ( void );
26 static void TestC1Bytes ( void );
27 static void TestInputFilter ( void );
28 static void TestChaining ( void );
29 static void TestBufferOverflow ( void );
30 static void TestIBM424 ( void );
31 static void TestIBM420 ( void );
33 void addUCsdetTest ( TestNode
** root
);
35 void addUCsdetTest ( TestNode
** root
)
37 addTest ( root
, & TestConstruction
, "ucsdetst/TestConstruction" );
38 addTest ( root
, & TestUTF8
, "ucsdetst/TestUTF8" );
39 addTest ( root
, & TestUTF16
, "ucsdetst/TestUTF16" );
40 addTest ( root
, & TestC1Bytes
, "ucsdetst/TestC1Bytes" );
41 addTest ( root
, & TestInputFilter
, "ucsdetst/TestInputFilter" );
42 addTest ( root
, & TestChaining
, "ucsdetst/TestErrorChaining" );
43 addTest ( root
, & TestBufferOverflow
, "ucsdetst/TestBufferOverflow" );
44 #if !UCONFIG_NO_LEGACY_CONVERSION
45 addTest ( root
, & TestIBM424
, "ucsdetst/TestIBM424" );
46 addTest ( root
, & TestIBM420
, "ucsdetst/TestIBM420" );
50 static int32_t preflight ( const UChar
* src
, int32_t length
, UConverter
* cnv
)
54 char * dest
, * destLimit
= buffer
+ sizeof ( buffer
);
55 const UChar
* srcLimit
= src
+ length
;
60 status
= U_ZERO_ERROR
;
61 ucnv_fromUnicode ( cnv
, & dest
, destLimit
, & src
, srcLimit
, 0 , TRUE
, & status
);
62 result
+= ( int32_t ) ( dest
- buffer
);
63 } while ( status
== U_BUFFER_OVERFLOW_ERROR
);
68 static char * extractBytes ( const UChar
* src
, int32_t length
, const char * codepage
, int32_t * byteLength
)
70 UErrorCode status
= U_ZERO_ERROR
;
71 UConverter
* cnv
= ucnv_open ( codepage
, & status
);
72 int32_t byteCount
= preflight ( src
, length
, cnv
);
73 const UChar
* srcLimit
= src
+ length
;
74 char * bytes
= NEW_ARRAY ( char , byteCount
+ 1 );
75 char * dest
= bytes
, * destLimit
= bytes
+ byteCount
+ 1 ;
77 ucnv_fromUnicode ( cnv
, & dest
, destLimit
, & src
, srcLimit
, 0 , TRUE
, & status
);
80 * byteLength
= byteCount
;
84 static void freeBytes ( char * bytes
)
89 static void TestConstruction ( void )
91 UErrorCode status
= U_ZERO_ERROR
;
92 UCharsetDetector
* csd
= ucsdet_open (& status
);
93 UEnumeration
* e
= ucsdet_getAllDetectableCharsets ( csd
, & status
);
95 int32_t count
= uenum_count ( e
, & status
);
98 for ( i
= 0 ; i
< count
; i
+= 1 ) {
99 name
= uenum_next ( e
, & length
, & status
);
101 if ( name
== NULL
|| length
<= 0 ) {
102 log_err ( "ucsdet_getAllDetectableCharsets() returned a null or empty name! \n " );
105 /* one past the list of all names must return NULL */
106 name
= uenum_next ( e
, & length
, & status
);
107 if ( name
!= NULL
|| length
!= 0 || U_FAILURE ( status
)) {
108 log_err ( "ucsdet_getAllDetectableCharsets(past the list) returned a non-null name! \n " );
115 static void TestUTF8 ( void )
117 UErrorCode status
= U_ZERO_ERROR
;
118 static const char ss
[] = "This is a string with some non-ascii characters that will "
119 "be converted to UTF-8, then shoved through the detection process. "
120 " \\ u0391 \\ u0392 \\ u0393 \\ u0394 \\ u0395"
121 "Sure would be nice if our source could contain Unicode directly!" ;
122 int32_t byteLength
= 0 , sLength
= 0 , dLength
= 0 ;
125 UCharsetDetector
* csd
= ucsdet_open (& status
);
126 const UCharsetMatch
* match
;
127 UChar detected
[ sizeof ( ss
)];
129 sLength
= u_unescape ( ss
, s
, sizeof ( ss
));
130 bytes
= extractBytes ( s
, sLength
, "UTF-8" , & byteLength
);
132 ucsdet_setText ( csd
, bytes
, byteLength
, & status
);
133 if ( U_FAILURE ( status
)) {
134 log_err ( "status is %s \n " , u_errorName ( status
));
138 match
= ucsdet_detect ( csd
, & status
);
141 log_err ( "Detection failure for UTF-8: got no matches. \n " );
145 dLength
= ucsdet_getUChars ( match
, detected
, sLength
, & status
);
147 if ( u_strCompare ( detected
, dLength
, s
, sLength
, FALSE
) != 0 ) {
148 log_err ( "Round-trip test failed! \n " );
151 ucsdet_setDeclaredEncoding ( csd
, "UTF-8" , 5 , & status
); /* for coverage */
158 static void TestUTF16 ( void )
160 UErrorCode status
= U_ZERO_ERROR
;
161 /* Notice the BOM on the start of this string */
162 static const UChar chars
[] = {
163 0xFEFF , 0x0623 , 0x0648 , 0x0631 , 0x0648 , 0x0628 , 0x0627 , 0x002C ,
164 0x0020 , 0x0628 , 0x0631 , 0x0645 , 0x062c , 0x064a , 0x0627 , 0x062a ,
165 0x0020 , 0x0627 , 0x0644 , 0x062d , 0x0627 , 0x0633 , 0x0648 , 0x0628 ,
166 0x0020 , 0x002b , 0x0020 , 0x0627 , 0x0646 , 0x062a , 0x0631 , 0x0646 ,
167 0x064a , 0x062a , 0x0000 };
168 int32_t beLength
= 0 , leLength
= 0 , cLength
= UPRV_LENGTHOF ( chars
);
169 char * beBytes
= extractBytes ( chars
, cLength
, "UTF-16BE" , & beLength
);
170 char * leBytes
= extractBytes ( chars
, cLength
, "UTF-16LE" , & leLength
);
171 UCharsetDetector
* csd
= ucsdet_open (& status
);
172 const UCharsetMatch
* match
;
176 ucsdet_setText ( csd
, beBytes
, beLength
, & status
);
177 match
= ucsdet_detect ( csd
, & status
);
180 log_err ( "Encoding detection failure for UTF-16BE: got no matches. \n " );
184 name
= ucsdet_getName ( match
, & status
);
185 conf
= ucsdet_getConfidence ( match
, & status
);
187 if ( strcmp ( name
, "UTF-16BE" ) != 0 ) {
188 log_err ( "Encoding detection failure for UTF-16BE: got %s \n " , name
);
192 log_err ( "Did not get 100%% confidence for UTF-16BE: got %d \n " , conf
);
196 ucsdet_setText ( csd
, leBytes
, leLength
, & status
);
197 match
= ucsdet_detect ( csd
, & status
);
200 log_err ( "Encoding detection failure for UTF-16LE: got no matches. \n " );
204 name
= ucsdet_getName ( match
, & status
);
205 conf
= ucsdet_getConfidence ( match
, & status
);
208 if ( strcmp ( name
, "UTF-16LE" ) != 0 ) {
209 log_err ( "Enconding detection failure for UTF-16LE: got %s \n " , name
);
213 log_err ( "Did not get 100%% confidence for UTF-16LE: got %d \n " , conf
);
222 static void TestC1Bytes ( void )
224 #if !UCONFIG_NO_LEGACY_CONVERSION
225 UErrorCode status
= U_ZERO_ERROR
;
226 static const char ssISO
[] = "This is a small sample of some English text. Just enough to be sure that it detects correctly." ;
227 static const char ssWindows
[] = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\ u201CC1 \\ u201D bytes." ;
228 int32_t sISOLength
= 0 , sWindowsLength
= 0 ;
229 UChar sISO
[ sizeof ( ssISO
)];
230 UChar sWindows
[ sizeof ( ssWindows
)];
231 int32_t lISO
= 0 , lWindows
= 0 ;
234 UCharsetDetector
* csd
= ucsdet_open (& status
);
235 const UCharsetMatch
* match
;
238 sISOLength
= u_unescape ( ssISO
, sISO
, sizeof ( ssISO
));
239 sWindowsLength
= u_unescape ( ssWindows
, sWindows
, sizeof ( ssWindows
));
240 bISO
= extractBytes ( sISO
, sISOLength
, "ISO-8859-1" , & lISO
);
241 bWindows
= extractBytes ( sWindows
, sWindowsLength
, "windows-1252" , & lWindows
);
243 ucsdet_setText ( csd
, bWindows
, lWindows
, & status
);
244 match
= ucsdet_detect ( csd
, & status
);
247 log_err ( "English test with C1 bytes got no matches. \n " );
251 name
= ucsdet_getName ( match
, & status
);
253 if ( strcmp ( name
, "windows-1252" ) != 0 ) {
254 log_data_err ( "English text with C1 bytes does not detect as windows-1252, but as %s . (Are you missing data?) \n " , name
);
257 ucsdet_setText ( csd
, bISO
, lISO
, & status
);
258 match
= ucsdet_detect ( csd
, & status
);
261 log_err ( "English text without C1 bytes got no matches. \n " );
265 name
= ucsdet_getName ( match
, & status
);
267 if ( strcmp ( name
, "ISO-8859-1" ) != 0 ) {
268 log_err ( "English text without C1 bytes does not detect as ISO-8859-1, but as %s \n " , name
);
279 static void TestInputFilter ( void )
281 UErrorCode status
= U_ZERO_ERROR
;
282 static const char ss
[] = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr \\ u00E8s petit peu de Fran \\ u00E7ais. <to> <confuse> <the> <detector>" ;
285 int32_t byteLength
= 0 ;
287 UCharsetDetector
* csd
= ucsdet_open (& status
);
288 const UCharsetMatch
* match
;
289 const char * lang
, * name
;
291 sLength
= u_unescape ( ss
, s
, sizeof ( ss
));
292 bytes
= extractBytes ( s
, sLength
, "ISO-8859-1" , & byteLength
);
294 ucsdet_enableInputFilter ( csd
, TRUE
);
296 if (! ucsdet_isInputFilterEnabled ( csd
)) {
297 log_err ( "ucsdet_enableInputFilter(csd, TRUE) did not enable input filter! \n " );
301 ucsdet_setText ( csd
, bytes
, byteLength
, & status
);
302 match
= ucsdet_detect ( csd
, & status
);
305 log_err ( "Turning on the input filter resulted in no matches. \n " );
309 name
= ucsdet_getName ( match
, & status
);
311 if ( name
== NULL
|| strcmp ( name
, "ISO-8859-1" ) != 0 ) {
312 log_err ( "Turning on the input filter resulted in %s rather than ISO-8859-1 \n " , name
);
314 lang
= ucsdet_getLanguage ( match
, & status
);
316 if ( lang
== NULL
|| strcmp ( lang
, "fr" ) != 0 ) {
317 log_err ( "Input filter did not strip markup! \n " );
322 ucsdet_enableInputFilter ( csd
, FALSE
);
323 ucsdet_setText ( csd
, bytes
, byteLength
, & status
);
324 match
= ucsdet_detect ( csd
, & status
);
327 log_err ( "Turning off the input filter resulted in no matches. \n " );
331 name
= ucsdet_getName ( match
, & status
);
333 if ( name
== NULL
|| strcmp ( name
, "ISO-8859-1" ) != 0 ) {
334 log_err ( "Turning off the input filter resulted in %s rather than ISO-8859-1 \n " , name
);
336 lang
= ucsdet_getLanguage ( match
, & status
);
338 if ( lang
== NULL
|| strcmp ( lang
, "en" ) != 0 ) {
339 log_err ( "Unfiltered input did not detect as English! \n " );
348 static void TestChaining ( void ) {
349 UErrorCode status
= U_USELESS_COLLATOR_ERROR
;
351 ucsdet_open (& status
);
352 ucsdet_setText ( NULL
, NULL
, 0 , & status
);
353 ucsdet_getName ( NULL
, & status
);
354 ucsdet_getConfidence ( NULL
, & status
);
355 ucsdet_getLanguage ( NULL
, & status
);
356 ucsdet_detect ( NULL
, & status
);
357 ucsdet_setDeclaredEncoding ( NULL
, NULL
, 0 , & status
);
358 ucsdet_detectAll ( NULL
, NULL
, & status
);
359 ucsdet_getUChars ( NULL
, NULL
, 0 , & status
);
360 ucsdet_getUChars ( NULL
, NULL
, 0 , & status
);
363 /* All of this code should have done nothing. */
364 if ( status
!= U_USELESS_COLLATOR_ERROR
) {
365 log_err ( "Status got changed to %s \n " , u_errorName ( status
));
369 static void TestBufferOverflow ( void ) {
370 UErrorCode status
= U_ZERO_ERROR
;
371 static const char * testStrings
[] = {
372 " \x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b " , /* A partial ISO-2022 shift state at the end */
373 " \x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24 " , /* A partial ISO-2022 shift state at the end */
374 " \x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28 " , /* A partial ISO-2022 shift state at the end */
375 " \x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28\x44 " , /* A complete ISO-2022 shift state at the end with a bad one at the start */
376 " \x1b\x24\x28\x44 " , /* A complete ISO-2022 shift state at the end */
377 " \xa1 " , /* Could be a single byte shift-jis at the end */
378 " \x74\x68\xa1 " , /* Could be a single byte shift-jis at the end */
379 " \x74\x68\x65\xa1 " /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
381 static const char * testResults
[] = {
392 UCharsetDetector
* csd
= ucsdet_open (& status
);
393 const UCharsetMatch
* match
;
395 ucsdet_setDeclaredEncoding ( csd
, "ISO-2022-JP" , - 1 , & status
);
397 if ( U_FAILURE ( status
)) {
398 log_err ( "Couldn't open detector. %s \n " , u_errorName ( status
));
402 for ( idx
= 0 ; idx
< UPRV_LENGTHOF ( testStrings
); idx
++) {
403 ucsdet_setText ( csd
, testStrings
[ idx
], - 1 , & status
);
404 match
= ucsdet_detect ( csd
, & status
);
407 if ( testResults
[ idx
] != NULL
) {
408 log_err ( "Unexpectedly got no results at index %d . \n " , idx
);
411 log_verbose ( "Got no result as expected at index %d . \n " , idx
);
416 if ( testResults
[ idx
] == NULL
|| strcmp ( ucsdet_getName ( match
, & status
), testResults
[ idx
]) != 0 ) {
417 log_err ( "Unexpectedly got %s instead of %s at index %d with confidence %d . \n " ,
418 ucsdet_getName ( match
, & status
), testResults
[ idx
], idx
, ucsdet_getConfidence ( match
, & status
));
427 static void TestIBM424 ( void )
429 UErrorCode status
= U_ZERO_ERROR
;
431 static const UChar chars
[] = {
432 0x05D4 , 0x05E4 , 0x05E8 , 0x05E7 , 0x05DC , 0x05D9 , 0x05D8 , 0x0020 , 0x05D4 , 0x05E6 , 0x05D1 , 0x05D0 , 0x05D9 , 0x0020 , 0x05D4 , 0x05E8 ,
433 0x05D0 , 0x05E9 , 0x05D9 , 0x002C , 0x0020 , 0x05EA , 0x05EA , 0x0020 , 0x05D0 , 0x05DC , 0x05D5 , 0x05E3 , 0x0020 , 0x05D0 , 0x05D1 , 0x05D9 ,
434 0x05D7 , 0x05D9 , 0x0020 , 0x05DE , 0x05E0 , 0x05D3 , 0x05DC , 0x05D1 , 0x05DC , 0x05D9 , 0x05D8 , 0x002C , 0x0020 , 0x05D4 , 0x05D5 , 0x05E8 ,
435 0x05D4 , 0x0020 , 0x05E2 , 0x05DC , 0x0020 , 0x05E4 , 0x05EA , 0x05D9 , 0x05D7 , 0x05EA , 0x0020 , 0x05D7 , 0x05E7 , 0x05D9 , 0x05E8 , 0x05EA ,
436 0x0020 , 0x05DE , 0x05E6 , 0x0022 , 0x05D7 , 0x0020 , 0x05D1 , 0x05E2 , 0x05E7 , 0x05D1 , 0x05D5 , 0x05EA , 0x0020 , 0x05E2 , 0x05D3 , 0x05D5 ,
437 0x05D9 , 0x05D5 , 0x05EA , 0x0020 , 0x05D7 , 0x05D9 , 0x05D9 , 0x05DC , 0x05D9 , 0x0020 , 0x05E6 , 0x05D4 , 0x0022 , 0x05DC , 0x0020 , 0x05DE ,
438 0x05DE , 0x05D1 , 0x05E6 , 0x05E2 , 0x0020 , 0x05E2 , 0x05D5 , 0x05E4 , 0x05E8 , 0x05EA , 0x0020 , 0x05D9 , 0x05E6 , 0x05D5 , 0x05E7 , 0x05D4 ,
439 0x0020 , 0x05D1 , 0x002B , 0x0020 , 0x05E8 , 0x05E6 , 0x05D5 , 0x05E2 , 0x05EA , 0x0020 , 0x05E2 , 0x05D6 , 0x05D4 , 0x002E , 0x0020 , 0x05DC ,
440 0x05D3 , 0x05D1 , 0x05E8 , 0x05D9 , 0x0020 , 0x05D4 , 0x05E4 , 0x05E6 , 0x0022 , 0x05E8 , 0x002C , 0x0020 , 0x05DE , 0x05D4 , 0x05E2 , 0x05D3 ,
441 0x05D5 , 0x05D9 , 0x05D5 , 0x05EA , 0x0020 , 0x05E2 , 0x05D5 , 0x05DC , 0x05D4 , 0x0020 , 0x05EA , 0x05DE , 0x05D5 , 0x05E0 , 0x05D4 , 0x0020 ,
442 0x05E9 , 0x05DC , 0x0020 , 0x0022 , 0x05D4 , 0x05EA , 0x05E0 , 0x05D4 , 0x05D2 , 0x05D5 , 0x05EA , 0x0020 , 0x05E4 , 0x05E1 , 0x05D5 , 0x05DC ,
443 0x05D4 , 0x0020 , 0x05DC , 0x05DB , 0x05D0 , 0x05D5 , 0x05E8 , 0x05D4 , 0x0020 , 0x05E9 , 0x05DC , 0x0020 , 0x05D7 , 0x05D9 , 0x05D9 , 0x05DC ,
444 0x05D9 , 0x05DD , 0x0020 , 0x05D1 , 0x05DE , 0x05D4 , 0x05DC , 0x05DA , 0x0020 , 0x05DE , 0x05D1 , 0x05E6 , 0x05E2 , 0x0020 , 0x05E2 , 0x05D5 ,
445 0x05E4 , 0x05E8 , 0x05EA , 0x0020 , 0x05D9 , 0x05E6 , 0x05D5 , 0x05E7 , 0x05D4 , 0x0022 , 0x002E , 0x0020 , 0x05DE , 0x05E0 , 0x05D3 , 0x05DC ,
446 0x05D1 , 0x05DC , 0x05D9 , 0x05D8 , 0x0020 , 0x05E7 , 0x05D9 , 0x05D1 , 0x05DC , 0x0020 , 0x05D0 , 0x05EA , 0x0020 , 0x05D4 , 0x05D7 , 0x05DC ,
447 0x05D8 , 0x05EA , 0x05D5 , 0x0020 , 0x05DC , 0x05D0 , 0x05D7 , 0x05E8 , 0x0020 , 0x05E9 , 0x05E2 , 0x05D9 , 0x05D9 , 0x05DF , 0x0020 , 0x05D1 ,
448 0x05EA , 0x05DE , 0x05DC , 0x05D9 , 0x05DC , 0x0020 , 0x05D4 , 0x05E2 , 0x05D3 , 0x05D5 , 0x05D9 , 0x05D5 , 0x05EA , 0x0000
451 static const UChar chars_reverse
[] = {
452 0x05EA , 0x05D5 , 0x05D9 , 0x05D5 , 0x05D3 , 0x05E2 , 0x05D4 , 0x0020 , 0x05DC , 0x05D9 , 0x05DC , 0x05DE , 0x05EA ,
453 0x05D1 , 0x0020 , 0x05DF , 0x05D9 , 0x05D9 , 0x05E2 , 0x05E9 , 0x0020 , 0x05E8 , 0x05D7 , 0x05D0 , 0x05DC , 0x0020 , 0x05D5 , 0x05EA , 0x05D8 ,
454 0x05DC , 0x05D7 , 0x05D4 , 0x0020 , 0x05EA , 0x05D0 , 0x0020 , 0x05DC , 0x05D1 , 0x05D9 , 0x05E7 , 0x0020 , 0x05D8 , 0x05D9 , 0x05DC , 0x05D1 ,
455 0x05DC , 0x05D3 , 0x05E0 , 0x05DE , 0x0020 , 0x002E , 0x0022 , 0x05D4 , 0x05E7 , 0x05D5 , 0x05E6 , 0x05D9 , 0x0020 , 0x05EA , 0x05E8 , 0x05E4 ,
456 0x05D5 , 0x05E2 , 0x0020 , 0x05E2 , 0x05E6 , 0x05D1 , 0x05DE , 0x0020 , 0x05DA , 0x05DC , 0x05D4 , 0x05DE , 0x05D1 , 0x0020 , 0x05DD , 0x05D9 ,
457 0x05DC , 0x05D9 , 0x05D9 , 0x05D7 , 0x0020 , 0x05DC , 0x05E9 , 0x0020 , 0x05D4 , 0x05E8 , 0x05D5 , 0x05D0 , 0x05DB , 0x05DC , 0x0020 , 0x05D4 ,
458 0x05DC , 0x05D5 , 0x05E1 , 0x05E4 , 0x0020 , 0x05EA , 0x05D5 , 0x05D2 , 0x05D4 , 0x05E0 , 0x05EA , 0x05D4 , 0x0022 , 0x0020 , 0x05DC , 0x05E9 ,
459 0x0020 , 0x05D4 , 0x05E0 , 0x05D5 , 0x05DE , 0x05EA , 0x0020 , 0x05D4 , 0x05DC , 0x05D5 , 0x05E2 , 0x0020 , 0x05EA , 0x05D5 , 0x05D9 , 0x05D5 ,
460 0x05D3 , 0x05E2 , 0x05D4 , 0x05DE , 0x0020 , 0x002C , 0x05E8 , 0x0022 , 0x05E6 , 0x05E4 , 0x05D4 , 0x0020 , 0x05D9 , 0x05E8 , 0x05D1 , 0x05D3 ,
461 0x05DC , 0x0020 , 0x002E , 0x05D4 , 0x05D6 , 0x05E2 , 0x0020 , 0x05EA , 0x05E2 , 0x05D5 , 0x05E6 , 0x05E8 , 0x0020 , 0x002B , 0x05D1 , 0x0020 ,
462 0x05D4 , 0x05E7 , 0x05D5 , 0x05E6 , 0x05D9 , 0x0020 , 0x05EA , 0x05E8 , 0x05E4 , 0x05D5 , 0x05E2 , 0x0020 , 0x05E2 , 0x05E6 , 0x05D1 , 0x05DE ,
463 0x05DE , 0x0020 , 0x05DC , 0x0022 , 0x05D4 , 0x05E6 , 0x0020 , 0x05D9 , 0x05DC , 0x05D9 , 0x05D9 , 0x05D7 , 0x0020 , 0x05EA , 0x05D5 , 0x05D9 ,
464 0x05D5 , 0x05D3 , 0x05E2 , 0x0020 , 0x05EA , 0x05D5 , 0x05D1 , 0x05E7 , 0x05E2 , 0x05D1 , 0x0020 , 0x05D7 , 0x0022 , 0x05E6 , 0x05DE , 0x0020 ,
465 0x05EA , 0x05E8 , 0x05D9 , 0x05E7 , 0x05D7 , 0x0020 , 0x05EA , 0x05D7 , 0x05D9 , 0x05EA , 0x05E4 , 0x0020 , 0x05DC , 0x05E2 , 0x0020 , 0x05D4 ,
466 0x05E8 , 0x05D5 , 0x05D4 , 0x0020 , 0x002C , 0x05D8 , 0x05D9 , 0x05DC , 0x05D1 , 0x05DC , 0x05D3 , 0x05E0 , 0x05DE , 0x0020 , 0x05D9 , 0x05D7 ,
467 0x05D9 , 0x05D1 , 0x05D0 , 0x0020 , 0x05E3 , 0x05D5 , 0x05DC , 0x05D0 , 0x0020 , 0x05EA , 0x05EA , 0x0020 , 0x002C , 0x05D9 , 0x05E9 , 0x05D0 ,
468 0x05E8 , 0x05D4 , 0x0020 , 0x05D9 , 0x05D0 , 0x05D1 , 0x05E6 , 0x05D4 , 0x0020 , 0x05D8 , 0x05D9 , 0x05DC , 0x05E7 , 0x05E8 , 0x05E4 , 0x05D4 ,
472 int32_t bLength
= 0 , brLength
= 0 , cLength
= UPRV_LENGTHOF ( chars
), crLength
= UPRV_LENGTHOF ( chars_reverse
);
474 char * bytes
= extractBytes ( chars
, cLength
, "IBM424" , & bLength
);
475 char * bytes_r
= extractBytes ( chars_reverse
, crLength
, "IBM424" , & brLength
);
477 UCharsetDetector
* csd
= ucsdet_open (& status
);
478 const UCharsetMatch
* match
;
481 ucsdet_setText ( csd
, bytes
, bLength
, & status
);
482 match
= ucsdet_detect ( csd
, & status
);
485 log_err ( "Encoding detection failure for IBM424_rtl: got no matches. \n " );
489 name
= ucsdet_getName ( match
, & status
);
490 if ( strcmp ( name
, "IBM424_rtl" ) != 0 ) {
491 log_data_err ( "Encoding detection failure for IBM424_rtl: got %s . (Are you missing data?) \n " , name
);
494 ucsdet_setText ( csd
, bytes_r
, brLength
, & status
);
495 match
= ucsdet_detect ( csd
, & status
);
498 log_err ( "Encoding detection failure for IBM424_ltr: got no matches. \n " );
502 name
= ucsdet_getName ( match
, & status
);
503 if ( strcmp ( name
, "IBM424_ltr" ) != 0 ) {
504 log_data_err ( "Encoding detection failure for IBM424_ltr: got %s . (Are you missing data?) \n " , name
);
513 static void TestIBM420 ( void )
515 UErrorCode status
= U_ZERO_ERROR
;
517 static const UChar chars
[] = {
518 0x0648 , 0x064F , 0x0636 , 0x0639 , 0x062A , 0x0020 , 0x0648 , 0x0646 , 0x064F , 0x0641 , 0x0630 , 0x062A , 0x0020 , 0x0628 , 0x0631 , 0x0627 ,
519 0x0645 , 0x062C , 0x0020 , 0x062A , 0x0623 , 0x0645 , 0x064A , 0x0646 , 0x0020 , 0x0639 , 0x062F , 0x064A , 0x062F , 0x0629 , 0x0020 , 0x0641 ,
520 0x064A , 0x0020 , 0x0645 , 0x0624 , 0x0633 , 0x0633 , 0x0629 , 0x0020 , 0x0627 , 0x0644 , 0x062A , 0x0623 , 0x0645 , 0x064A , 0x0646 , 0x0020 ,
521 0x0627 , 0x0644 , 0x0648 , 0x0637 , 0x0646 , 0x064A , 0x002C , 0x0020 , 0x0645 , 0x0639 , 0x0020 , 0x0645 , 0x0644 , 0x0627 , 0x0626 , 0x0645 ,
522 0x062A , 0x0647 , 0x0627 , 0x0020 , 0x062F , 0x0627 , 0x0626 , 0x0645 , 0x0627 , 0x064B , 0x0020 , 0x0644 , 0x0644 , 0x0627 , 0x062D , 0x062A ,
523 0x064A , 0x0627 , 0x062C , 0x0627 , 0x062A , 0x0020 , 0x0627 , 0x0644 , 0x0645 , 0x062A , 0x063A , 0x064A , 0x0631 , 0x0629 , 0x0020 , 0x0644 ,
524 0x0644 , 0x0645 , 0x062C , 0x062A , 0x0645 , 0x0639 , 0x0020 , 0x0648 , 0x0644 , 0x0644 , 0x062F , 0x0648 , 0x0644 , 0x0629 , 0x002E , 0x0020 ,
525 0x062A , 0x0648 , 0x0633 , 0x0639 , 0x062A , 0x0020 , 0x0648 , 0x062A , 0x0637 , 0x0648 , 0x0631 , 0x062A , 0x0020 , 0x0627 , 0x0644 , 0x0645 ,
526 0x0624 , 0x0633 , 0x0633 , 0x0629 , 0x0020 , 0x0628 , 0x0647 , 0x062F , 0x0641 , 0x0020 , 0x0636 , 0x0645 , 0x0627 , 0x0646 , 0x0020 , 0x0634 ,
527 0x0628 , 0x0643 , 0x0629 , 0x0020 , 0x0623 , 0x0645 , 0x0627 , 0x0646 , 0x0020 , 0x0644 , 0x0633 , 0x0643 , 0x0627 , 0x0646 , 0x0020 , 0x062F ,
528 0x0648 , 0x0644 , 0x0629 , 0x0020 , 0x0627 , 0x0633 , 0x0631 , 0x0627 , 0x0626 , 0x064A , 0x0644 , 0x0020 , 0x0628 , 0x0648 , 0x062C , 0x0647 ,
529 0x0020 , 0x0627 , 0x0644 , 0x0645 , 0x062E , 0x0627 , 0x0637 , 0x0631 , 0x0020 , 0x0627 , 0x0644 , 0x0627 , 0x0642 , 0x062A , 0x0635 , 0x0627 ,
530 0x062F , 0x064A , 0x0629 , 0x0020 , 0x0648 , 0x0627 , 0x0644 , 0x0627 , 0x062C , 0x062A , 0x0645 , 0x0627 , 0x0639 , 0x064A , 0x0629 , 0x002E ,
533 static const UChar chars_reverse
[] = {
534 0x002E , 0x0629 , 0x064A , 0x0639 , 0x0627 , 0x0645 , 0x062A , 0x062C , 0x0627 , 0x0644 , 0x0627 , 0x0648 , 0x0020 , 0x0629 , 0x064A , 0x062F ,
535 0x0627 , 0x0635 , 0x062A , 0x0642 , 0x0627 , 0x0644 , 0x0627 , 0x0020 , 0x0631 , 0x0637 , 0x0627 , 0x062E , 0x0645 , 0x0644 , 0x0627 , 0x0020 ,
536 0x0647 , 0x062C , 0x0648 , 0x0628 , 0x0020 , 0x0644 , 0x064A , 0x0626 , 0x0627 , 0x0631 , 0x0633 , 0x0627 , 0x0020 , 0x0629 , 0x0644 , 0x0648 ,
537 0x062F , 0x0020 , 0x0646 , 0x0627 , 0x0643 , 0x0633 , 0x0644 , 0x0020 , 0x0646 , 0x0627 , 0x0645 , 0x0623 , 0x0020 , 0x0629 , 0x0643 , 0x0628 ,
538 0x0634 , 0x0020 , 0x0646 , 0x0627 , 0x0645 , 0x0636 , 0x0020 , 0x0641 , 0x062F , 0x0647 , 0x0628 , 0x0020 , 0x0629 , 0x0633 , 0x0633 , 0x0624 ,
539 0x0645 , 0x0644 , 0x0627 , 0x0020 , 0x062A , 0x0631 , 0x0648 , 0x0637 , 0x062A , 0x0648 , 0x0020 , 0x062A , 0x0639 , 0x0633 , 0x0648 , 0x062A ,
540 0x0020 , 0x002E , 0x0629 , 0x0644 , 0x0648 , 0x062F , 0x0644 , 0x0644 , 0x0648 , 0x0020 , 0x0639 , 0x0645 , 0x062A , 0x062C , 0x0645 , 0x0644 ,
541 0x0644 , 0x0020 , 0x0629 , 0x0631 , 0x064A , 0x063A , 0x062A , 0x0645 , 0x0644 , 0x0627 , 0x0020 , 0x062A , 0x0627 , 0x062C , 0x0627 , 0x064A ,
542 0x062A , 0x062D , 0x0627 , 0x0644 , 0x0644 , 0x0020 , 0x064B , 0x0627 , 0x0645 , 0x0626 , 0x0627 , 0x062F , 0x0020 , 0x0627 , 0x0647 , 0x062A ,
543 0x0645 , 0x0626 , 0x0627 , 0x0644 , 0x0645 , 0x0020 , 0x0639 , 0x0645 , 0x0020 , 0x002C , 0x064A , 0x0646 , 0x0637 , 0x0648 , 0x0644 , 0x0627 ,
544 0x0020 , 0x0646 , 0x064A , 0x0645 , 0x0623 , 0x062A , 0x0644 , 0x0627 , 0x0020 , 0x0629 , 0x0633 , 0x0633 , 0x0624 , 0x0645 , 0x0020 , 0x064A ,
545 0x0641 , 0x0020 , 0x0629 , 0x062F , 0x064A , 0x062F , 0x0639 , 0x0020 , 0x0646 , 0x064A , 0x0645 , 0x0623 , 0x062A , 0x0020 , 0x062C , 0x0645 ,
546 0x0627 , 0x0631 , 0x0628 , 0x0020 , 0x062A , 0x0630 , 0x0641 , 0x064F , 0x0646 , 0x0648 , 0x0020 , 0x062A , 0x0639 , 0x0636 , 0x064F , 0x0648 ,
550 int32_t bLength
= 0 , brLength
= 0 , cLength
= UPRV_LENGTHOF ( chars
), crLength
= UPRV_LENGTHOF ( chars_reverse
);
552 char * bytes
= extractBytes ( chars
, cLength
, "IBM420" , & bLength
);
553 char * bytes_r
= extractBytes ( chars_reverse
, crLength
, "IBM420" , & brLength
);
555 UCharsetDetector
* csd
= ucsdet_open (& status
);
556 const UCharsetMatch
* match
;
559 ucsdet_setText ( csd
, bytes
, bLength
, & status
);
560 match
= ucsdet_detect ( csd
, & status
);
563 log_err ( "Encoding detection failure for IBM420_rtl: got no matches. \n " );
567 name
= ucsdet_getName ( match
, & status
);
568 if ( strcmp ( name
, "IBM420_rtl" ) != 0 ) {
569 log_data_err ( "Encoding detection failure for IBM420_rtl: got %s . (Are you missing data?) \n " , name
);
572 ucsdet_setText ( csd
, bytes_r
, brLength
, & status
);
573 match
= ucsdet_detect ( csd
, & status
);
576 log_err ( "Encoding detection failure for IBM420_ltr: got no matches. \n " );
580 name
= ucsdet_getName ( match
, & status
);
581 if ( strcmp ( name
, "IBM420_ltr" ) != 0 ) {
582 log_data_err ( "Encoding detection failure for IBM420_ltr: got %s . (Are you missing data?) \n " , name
);