// Origin: git.saurik.com Git - apple/icu.git/blob - icuSources/test/cintltst/ucsdetst.c
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 ****************************************************************************
 * Copyright (c) 2005-2016, International Business Machines Corporation and *
 * others. All Rights Reserved.                                             *
 ****************************************************************************
 */
#include "unicode/utypes.h"

#include "unicode/ucsdet.h"
#include "unicode/ucnv.h"
#include "unicode/ustring.h"

/*
 * NOTE(review): log_err/log_data_err/log_verbose, addTest/TestNode and
 * UPRV_LENGTHOF are used in this file but no header declaring them is visible
 * here. Upstream ICU presumably pulls them from "cintltst.h" and "cmemory.h";
 * confirm and restore those includes.
 */

#include <stdlib.h>  /* malloc, free (NEW_ARRAY / DELETE_ARRAY) */
#include <string.h>  /* strcmp */
/*
 * Allocate an uninitialised array of `count` elements of `type` with malloc().
 * The whole expansion is parenthesised so the result can be used inside a
 * larger expression (indexing, dereference) without precedence surprises.
 * Yields NULL when the allocation fails.
 */
#define NEW_ARRAY(type,count) ((type *) malloc((count) * sizeof(type)))

/* Release an array obtained from NEW_ARRAY; a NULL argument is a no-op. */
#define DELETE_ARRAY(array) free(array)
25 static void TestConstruction ( void );
26 static void TestUTF8 ( void );
27 static void TestUTF16 ( void );
28 static void TestC1Bytes ( void );
29 static void TestInputFilter ( void );
30 static void TestChaining ( void );
31 static void TestBufferOverflow ( void );
32 static void TestIBM424 ( void );
33 static void TestIBM420 ( void );
35 void addUCsdetTest ( TestNode
** root
);
37 void addUCsdetTest ( TestNode
** root
)
39 addTest ( root
, & TestConstruction
, "ucsdetst/TestConstruction" );
40 addTest ( root
, & TestUTF8
, "ucsdetst/TestUTF8" );
41 addTest ( root
, & TestUTF16
, "ucsdetst/TestUTF16" );
42 addTest ( root
, & TestC1Bytes
, "ucsdetst/TestC1Bytes" );
43 addTest ( root
, & TestInputFilter
, "ucsdetst/TestInputFilter" );
44 addTest ( root
, & TestChaining
, "ucsdetst/TestErrorChaining" );
45 addTest ( root
, & TestBufferOverflow
, "ucsdetst/TestBufferOverflow" );
46 #if !UCONFIG_NO_LEGACY_CONVERSION
47 addTest ( root
, & TestIBM424
, "ucsdetst/TestIBM424" );
48 addTest ( root
, & TestIBM420
, "ucsdetst/TestIBM420" );
52 static int32_t preflight ( const UChar
* src
, int32_t length
, UConverter
* cnv
)
56 char * dest
, * destLimit
= buffer
+ sizeof ( buffer
);
57 const UChar
* srcLimit
= src
+ length
;
62 status
= U_ZERO_ERROR
;
63 ucnv_fromUnicode ( cnv
, & dest
, destLimit
, & src
, srcLimit
, 0 , TRUE
, & status
);
64 result
+= ( int32_t ) ( dest
- buffer
);
65 } while ( status
== U_BUFFER_OVERFLOW_ERROR
);
70 static char * extractBytes ( const UChar
* src
, int32_t length
, const char * codepage
, int32_t * byteLength
)
72 UErrorCode status
= U_ZERO_ERROR
;
73 UConverter
* cnv
= ucnv_open ( codepage
, & status
);
74 int32_t byteCount
= preflight ( src
, length
, cnv
);
75 const UChar
* srcLimit
= src
+ length
;
76 char * bytes
= NEW_ARRAY ( char , byteCount
+ 1 );
77 char * dest
= bytes
, * destLimit
= bytes
+ byteCount
+ 1 ;
79 ucnv_fromUnicode ( cnv
, & dest
, destLimit
, & src
, srcLimit
, 0 , TRUE
, & status
);
82 * byteLength
= byteCount
;
/**
 * Releases a byte buffer returned by extractBytes(). Passing NULL is allowed.
 */
static void freeBytes(char *bytes)
{
    DELETE_ARRAY(bytes);
}
91 static void TestConstruction ( void )
93 UErrorCode status
= U_ZERO_ERROR
;
94 UCharsetDetector
* csd
= ucsdet_open (& status
);
95 UEnumeration
* e
= ucsdet_getAllDetectableCharsets ( csd
, & status
);
97 int32_t count
= uenum_count ( e
, & status
);
100 for ( i
= 0 ; i
< count
; i
+= 1 ) {
101 name
= uenum_next ( e
, & length
, & status
);
103 if ( name
== NULL
|| length
<= 0 ) {
104 log_err ( "ucsdet_getAllDetectableCharsets() returned a null or empty name! \n " );
107 /* one past the list of all names must return NULL */
108 name
= uenum_next ( e
, & length
, & status
);
109 if ( name
!= NULL
|| length
!= 0 || U_FAILURE ( status
)) {
110 log_err ( "ucsdet_getAllDetectableCharsets(past the list) returned a non-null name! \n " );
117 static void TestUTF8 ( void )
119 UErrorCode status
= U_ZERO_ERROR
;
120 static const char ss
[] = "This is a string with some non-ascii characters that will "
121 "be converted to UTF-8, then shoved through the detection process. "
122 " \\ u0391 \\ u0392 \\ u0393 \\ u0394 \\ u0395"
123 "Sure would be nice if our source could contain Unicode directly!" ;
124 int32_t byteLength
= 0 , sLength
= 0 , dLength
= 0 ;
127 UCharsetDetector
* csd
= ucsdet_open (& status
);
128 const UCharsetMatch
* match
;
129 UChar detected
[ sizeof ( ss
)];
131 sLength
= u_unescape ( ss
, s
, sizeof ( ss
));
132 bytes
= extractBytes ( s
, sLength
, "UTF-8" , & byteLength
);
134 ucsdet_setText ( csd
, bytes
, byteLength
, & status
);
135 if ( U_FAILURE ( status
)) {
136 log_err ( "status is %s \n " , u_errorName ( status
));
140 match
= ucsdet_detect ( csd
, & status
);
143 log_err ( "Detection failure for UTF-8: got no matches. \n " );
147 dLength
= ucsdet_getUChars ( match
, detected
, sLength
, & status
);
149 if ( u_strCompare ( detected
, dLength
, s
, sLength
, FALSE
) != 0 ) {
150 log_err ( "Round-trip test failed! \n " );
153 ucsdet_setDeclaredEncoding ( csd
, "UTF-8" , 5 , & status
); /* for coverage */
160 static void TestUTF16 ( void )
162 UErrorCode status
= U_ZERO_ERROR
;
163 /* Notice the BOM on the start of this string */
164 static const UChar chars
[] = {
165 0xFEFF , 0x0623 , 0x0648 , 0x0631 , 0x0648 , 0x0628 , 0x0627 , 0x002C ,
166 0x0020 , 0x0628 , 0x0631 , 0x0645 , 0x062c , 0x064a , 0x0627 , 0x062a ,
167 0x0020 , 0x0627 , 0x0644 , 0x062d , 0x0627 , 0x0633 , 0x0648 , 0x0628 ,
168 0x0020 , 0x002b , 0x0020 , 0x0627 , 0x0646 , 0x062a , 0x0631 , 0x0646 ,
169 0x064a , 0x062a , 0x0000 };
170 int32_t beLength
= 0 , leLength
= 0 , cLength
= UPRV_LENGTHOF ( chars
);
171 char * beBytes
= extractBytes ( chars
, cLength
, "UTF-16BE" , & beLength
);
172 char * leBytes
= extractBytes ( chars
, cLength
, "UTF-16LE" , & leLength
);
173 UCharsetDetector
* csd
= ucsdet_open (& status
);
174 const UCharsetMatch
* match
;
178 ucsdet_setText ( csd
, beBytes
, beLength
, & status
);
179 match
= ucsdet_detect ( csd
, & status
);
182 log_err ( "Encoding detection failure for UTF-16BE: got no matches. \n " );
186 name
= ucsdet_getName ( match
, & status
);
187 conf
= ucsdet_getConfidence ( match
, & status
);
189 if ( strcmp ( name
, "UTF-16BE" ) != 0 ) {
190 log_err ( "Encoding detection failure for UTF-16BE: got %s \n " , name
);
194 log_err ( "Did not get 100%% confidence for UTF-16BE: got %d \n " , conf
);
198 ucsdet_setText ( csd
, leBytes
, leLength
, & status
);
199 match
= ucsdet_detect ( csd
, & status
);
202 log_err ( "Encoding detection failure for UTF-16LE: got no matches. \n " );
206 name
= ucsdet_getName ( match
, & status
);
207 conf
= ucsdet_getConfidence ( match
, & status
);
210 if ( strcmp ( name
, "UTF-16LE" ) != 0 ) {
211 log_err ( "Enconding detection failure for UTF-16LE: got %s \n " , name
);
215 log_err ( "Did not get 100%% confidence for UTF-16LE: got %d \n " , conf
);
224 static void TestC1Bytes ( void )
226 #if !UCONFIG_NO_LEGACY_CONVERSION
227 UErrorCode status
= U_ZERO_ERROR
;
228 static const char ssISO
[] = "This is a small sample of some English text. Just enough to be sure that it detects correctly." ;
229 static const char ssWindows
[] = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\ u201CC1 \\ u201D bytes." ;
230 int32_t sISOLength
= 0 , sWindowsLength
= 0 ;
231 UChar sISO
[ sizeof ( ssISO
)];
232 UChar sWindows
[ sizeof ( ssWindows
)];
233 int32_t lISO
= 0 , lWindows
= 0 ;
236 UCharsetDetector
* csd
= ucsdet_open (& status
);
237 const UCharsetMatch
* match
;
240 sISOLength
= u_unescape ( ssISO
, sISO
, sizeof ( ssISO
));
241 sWindowsLength
= u_unescape ( ssWindows
, sWindows
, sizeof ( ssWindows
));
242 bISO
= extractBytes ( sISO
, sISOLength
, "ISO-8859-1" , & lISO
);
243 bWindows
= extractBytes ( sWindows
, sWindowsLength
, "windows-1252" , & lWindows
);
245 ucsdet_setText ( csd
, bWindows
, lWindows
, & status
);
246 match
= ucsdet_detect ( csd
, & status
);
249 log_err ( "English test with C1 bytes got no matches. \n " );
253 name
= ucsdet_getName ( match
, & status
);
255 if ( strcmp ( name
, "windows-1252" ) != 0 ) {
256 log_data_err ( "English text with C1 bytes does not detect as windows-1252, but as %s . (Are you missing data?) \n " , name
);
259 ucsdet_setText ( csd
, bISO
, lISO
, & status
);
260 match
= ucsdet_detect ( csd
, & status
);
263 log_err ( "English text without C1 bytes got no matches. \n " );
267 name
= ucsdet_getName ( match
, & status
);
269 if ( strcmp ( name
, "ISO-8859-1" ) != 0 ) {
270 log_err ( "English text without C1 bytes does not detect as ISO-8859-1, but as %s \n " , name
);
281 static void TestInputFilter ( void )
283 UErrorCode status
= U_ZERO_ERROR
;
284 static const char ss
[] = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr \\ u00E8s petit peu de Fran \\ u00E7ais. <to> <confuse> <the> <detector>" ;
287 int32_t byteLength
= 0 ;
289 UCharsetDetector
* csd
= ucsdet_open (& status
);
290 const UCharsetMatch
* match
;
291 const char * lang
, * name
;
293 sLength
= u_unescape ( ss
, s
, sizeof ( ss
));
294 bytes
= extractBytes ( s
, sLength
, "ISO-8859-1" , & byteLength
);
296 ucsdet_enableInputFilter ( csd
, TRUE
);
298 if (! ucsdet_isInputFilterEnabled ( csd
)) {
299 log_err ( "ucsdet_enableInputFilter(csd, TRUE) did not enable input filter! \n " );
303 ucsdet_setText ( csd
, bytes
, byteLength
, & status
);
304 match
= ucsdet_detect ( csd
, & status
);
307 log_err ( "Turning on the input filter resulted in no matches. \n " );
311 name
= ucsdet_getName ( match
, & status
);
313 if ( name
== NULL
|| strcmp ( name
, "ISO-8859-1" ) != 0 ) {
314 log_err ( "Turning on the input filter resulted in %s rather than ISO-8859-1 \n " , name
);
316 lang
= ucsdet_getLanguage ( match
, & status
);
318 if ( lang
== NULL
|| strcmp ( lang
, "fr" ) != 0 ) {
319 log_err ( "Input filter did not strip markup! \n " );
324 ucsdet_enableInputFilter ( csd
, FALSE
);
325 ucsdet_setText ( csd
, bytes
, byteLength
, & status
);
326 match
= ucsdet_detect ( csd
, & status
);
329 log_err ( "Turning off the input filter resulted in no matches. \n " );
333 name
= ucsdet_getName ( match
, & status
);
335 if ( name
== NULL
|| strcmp ( name
, "ISO-8859-1" ) != 0 ) {
336 log_err ( "Turning off the input filter resulted in %s rather than ISO-8859-1 \n " , name
);
338 lang
= ucsdet_getLanguage ( match
, & status
);
340 if ( lang
== NULL
|| strcmp ( lang
, "en" ) != 0 ) {
341 log_err ( "Unfiltered input did not detect as English! \n " );
350 static void TestChaining ( void ) {
351 UErrorCode status
= U_USELESS_COLLATOR_ERROR
;
353 ucsdet_open (& status
);
354 ucsdet_setText ( NULL
, NULL
, 0 , & status
);
355 ucsdet_getName ( NULL
, & status
);
356 ucsdet_getConfidence ( NULL
, & status
);
357 ucsdet_getLanguage ( NULL
, & status
);
358 ucsdet_detect ( NULL
, & status
);
359 ucsdet_setDeclaredEncoding ( NULL
, NULL
, 0 , & status
);
360 ucsdet_detectAll ( NULL
, NULL
, & status
);
361 ucsdet_getUChars ( NULL
, NULL
, 0 , & status
);
362 ucsdet_getUChars ( NULL
, NULL
, 0 , & status
);
365 /* All of this code should have done nothing. */
366 if ( status
!= U_USELESS_COLLATOR_ERROR
) {
367 log_err ( "Status got changed to %s \n " , u_errorName ( status
));
371 static void TestBufferOverflow ( void ) {
372 UErrorCode status
= U_ZERO_ERROR
;
373 static const char * testStrings
[] = {
374 " \x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b " , /* A partial ISO-2022 shift state at the end */
375 " \x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24 " , /* A partial ISO-2022 shift state at the end */
376 " \x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28 " , /* A partial ISO-2022 shift state at the end */
377 " \x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28\x44 " , /* A complete ISO-2022 shift state at the end with a bad one at the start */
378 " \x1b\x24\x28\x44 " , /* A complete ISO-2022 shift state at the end */
379 " \xa1 " , /* Could be a single byte shift-jis at the end */
380 " \x74\x68\xa1 " , /* Could be a single byte shift-jis at the end */
381 " \x74\x68\x65\xa1 " /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
383 static const char * testResults
[] = {
394 UCharsetDetector
* csd
= ucsdet_open (& status
);
395 const UCharsetMatch
* match
;
397 ucsdet_setDeclaredEncoding ( csd
, "ISO-2022-JP" , - 1 , & status
);
399 if ( U_FAILURE ( status
)) {
400 log_err ( "Couldn't open detector. %s \n " , u_errorName ( status
));
404 for ( idx
= 0 ; idx
< UPRV_LENGTHOF ( testStrings
); idx
++) {
405 ucsdet_setText ( csd
, testStrings
[ idx
], - 1 , & status
);
406 match
= ucsdet_detect ( csd
, & status
);
409 if ( testResults
[ idx
] != NULL
) {
410 log_err ( "Unexpectedly got no results at index %d . \n " , idx
);
413 log_verbose ( "Got no result as expected at index %d . \n " , idx
);
418 if ( testResults
[ idx
] == NULL
|| strcmp ( ucsdet_getName ( match
, & status
), testResults
[ idx
]) != 0 ) {
419 log_err ( "Unexpectedly got %s instead of %s at index %d with confidence %d . \n " ,
420 ucsdet_getName ( match
, & status
), testResults
[ idx
], idx
, ucsdet_getConfidence ( match
, & status
));
429 static void TestIBM424 ( void )
431 UErrorCode status
= U_ZERO_ERROR
;
433 static const UChar chars
[] = {
434 0x05D4 , 0x05E4 , 0x05E8 , 0x05E7 , 0x05DC , 0x05D9 , 0x05D8 , 0x0020 , 0x05D4 , 0x05E6 , 0x05D1 , 0x05D0 , 0x05D9 , 0x0020 , 0x05D4 , 0x05E8 ,
435 0x05D0 , 0x05E9 , 0x05D9 , 0x002C , 0x0020 , 0x05EA , 0x05EA , 0x0020 , 0x05D0 , 0x05DC , 0x05D5 , 0x05E3 , 0x0020 , 0x05D0 , 0x05D1 , 0x05D9 ,
436 0x05D7 , 0x05D9 , 0x0020 , 0x05DE , 0x05E0 , 0x05D3 , 0x05DC , 0x05D1 , 0x05DC , 0x05D9 , 0x05D8 , 0x002C , 0x0020 , 0x05D4 , 0x05D5 , 0x05E8 ,
437 0x05D4 , 0x0020 , 0x05E2 , 0x05DC , 0x0020 , 0x05E4 , 0x05EA , 0x05D9 , 0x05D7 , 0x05EA , 0x0020 , 0x05D7 , 0x05E7 , 0x05D9 , 0x05E8 , 0x05EA ,
438 0x0020 , 0x05DE , 0x05E6 , 0x0022 , 0x05D7 , 0x0020 , 0x05D1 , 0x05E2 , 0x05E7 , 0x05D1 , 0x05D5 , 0x05EA , 0x0020 , 0x05E2 , 0x05D3 , 0x05D5 ,
439 0x05D9 , 0x05D5 , 0x05EA , 0x0020 , 0x05D7 , 0x05D9 , 0x05D9 , 0x05DC , 0x05D9 , 0x0020 , 0x05E6 , 0x05D4 , 0x0022 , 0x05DC , 0x0020 , 0x05DE ,
440 0x05DE , 0x05D1 , 0x05E6 , 0x05E2 , 0x0020 , 0x05E2 , 0x05D5 , 0x05E4 , 0x05E8 , 0x05EA , 0x0020 , 0x05D9 , 0x05E6 , 0x05D5 , 0x05E7 , 0x05D4 ,
441 0x0020 , 0x05D1 , 0x002B , 0x0020 , 0x05E8 , 0x05E6 , 0x05D5 , 0x05E2 , 0x05EA , 0x0020 , 0x05E2 , 0x05D6 , 0x05D4 , 0x002E , 0x0020 , 0x05DC ,
442 0x05D3 , 0x05D1 , 0x05E8 , 0x05D9 , 0x0020 , 0x05D4 , 0x05E4 , 0x05E6 , 0x0022 , 0x05E8 , 0x002C , 0x0020 , 0x05DE , 0x05D4 , 0x05E2 , 0x05D3 ,
443 0x05D5 , 0x05D9 , 0x05D5 , 0x05EA , 0x0020 , 0x05E2 , 0x05D5 , 0x05DC , 0x05D4 , 0x0020 , 0x05EA , 0x05DE , 0x05D5 , 0x05E0 , 0x05D4 , 0x0020 ,
444 0x05E9 , 0x05DC , 0x0020 , 0x0022 , 0x05D4 , 0x05EA , 0x05E0 , 0x05D4 , 0x05D2 , 0x05D5 , 0x05EA , 0x0020 , 0x05E4 , 0x05E1 , 0x05D5 , 0x05DC ,
445 0x05D4 , 0x0020 , 0x05DC , 0x05DB , 0x05D0 , 0x05D5 , 0x05E8 , 0x05D4 , 0x0020 , 0x05E9 , 0x05DC , 0x0020 , 0x05D7 , 0x05D9 , 0x05D9 , 0x05DC ,
446 0x05D9 , 0x05DD , 0x0020 , 0x05D1 , 0x05DE , 0x05D4 , 0x05DC , 0x05DA , 0x0020 , 0x05DE , 0x05D1 , 0x05E6 , 0x05E2 , 0x0020 , 0x05E2 , 0x05D5 ,
447 0x05E4 , 0x05E8 , 0x05EA , 0x0020 , 0x05D9 , 0x05E6 , 0x05D5 , 0x05E7 , 0x05D4 , 0x0022 , 0x002E , 0x0020 , 0x05DE , 0x05E0 , 0x05D3 , 0x05DC ,
448 0x05D1 , 0x05DC , 0x05D9 , 0x05D8 , 0x0020 , 0x05E7 , 0x05D9 , 0x05D1 , 0x05DC , 0x0020 , 0x05D0 , 0x05EA , 0x0020 , 0x05D4 , 0x05D7 , 0x05DC ,
449 0x05D8 , 0x05EA , 0x05D5 , 0x0020 , 0x05DC , 0x05D0 , 0x05D7 , 0x05E8 , 0x0020 , 0x05E9 , 0x05E2 , 0x05D9 , 0x05D9 , 0x05DF , 0x0020 , 0x05D1 ,
450 0x05EA , 0x05DE , 0x05DC , 0x05D9 , 0x05DC , 0x0020 , 0x05D4 , 0x05E2 , 0x05D3 , 0x05D5 , 0x05D9 , 0x05D5 , 0x05EA , 0x0000
453 static const UChar chars_reverse
[] = {
454 0x05EA , 0x05D5 , 0x05D9 , 0x05D5 , 0x05D3 , 0x05E2 , 0x05D4 , 0x0020 , 0x05DC , 0x05D9 , 0x05DC , 0x05DE , 0x05EA ,
455 0x05D1 , 0x0020 , 0x05DF , 0x05D9 , 0x05D9 , 0x05E2 , 0x05E9 , 0x0020 , 0x05E8 , 0x05D7 , 0x05D0 , 0x05DC , 0x0020 , 0x05D5 , 0x05EA , 0x05D8 ,
456 0x05DC , 0x05D7 , 0x05D4 , 0x0020 , 0x05EA , 0x05D0 , 0x0020 , 0x05DC , 0x05D1 , 0x05D9 , 0x05E7 , 0x0020 , 0x05D8 , 0x05D9 , 0x05DC , 0x05D1 ,
457 0x05DC , 0x05D3 , 0x05E0 , 0x05DE , 0x0020 , 0x002E , 0x0022 , 0x05D4 , 0x05E7 , 0x05D5 , 0x05E6 , 0x05D9 , 0x0020 , 0x05EA , 0x05E8 , 0x05E4 ,
458 0x05D5 , 0x05E2 , 0x0020 , 0x05E2 , 0x05E6 , 0x05D1 , 0x05DE , 0x0020 , 0x05DA , 0x05DC , 0x05D4 , 0x05DE , 0x05D1 , 0x0020 , 0x05DD , 0x05D9 ,
459 0x05DC , 0x05D9 , 0x05D9 , 0x05D7 , 0x0020 , 0x05DC , 0x05E9 , 0x0020 , 0x05D4 , 0x05E8 , 0x05D5 , 0x05D0 , 0x05DB , 0x05DC , 0x0020 , 0x05D4 ,
460 0x05DC , 0x05D5 , 0x05E1 , 0x05E4 , 0x0020 , 0x05EA , 0x05D5 , 0x05D2 , 0x05D4 , 0x05E0 , 0x05EA , 0x05D4 , 0x0022 , 0x0020 , 0x05DC , 0x05E9 ,
461 0x0020 , 0x05D4 , 0x05E0 , 0x05D5 , 0x05DE , 0x05EA , 0x0020 , 0x05D4 , 0x05DC , 0x05D5 , 0x05E2 , 0x0020 , 0x05EA , 0x05D5 , 0x05D9 , 0x05D5 ,
462 0x05D3 , 0x05E2 , 0x05D4 , 0x05DE , 0x0020 , 0x002C , 0x05E8 , 0x0022 , 0x05E6 , 0x05E4 , 0x05D4 , 0x0020 , 0x05D9 , 0x05E8 , 0x05D1 , 0x05D3 ,
463 0x05DC , 0x0020 , 0x002E , 0x05D4 , 0x05D6 , 0x05E2 , 0x0020 , 0x05EA , 0x05E2 , 0x05D5 , 0x05E6 , 0x05E8 , 0x0020 , 0x002B , 0x05D1 , 0x0020 ,
464 0x05D4 , 0x05E7 , 0x05D5 , 0x05E6 , 0x05D9 , 0x0020 , 0x05EA , 0x05E8 , 0x05E4 , 0x05D5 , 0x05E2 , 0x0020 , 0x05E2 , 0x05E6 , 0x05D1 , 0x05DE ,
465 0x05DE , 0x0020 , 0x05DC , 0x0022 , 0x05D4 , 0x05E6 , 0x0020 , 0x05D9 , 0x05DC , 0x05D9 , 0x05D9 , 0x05D7 , 0x0020 , 0x05EA , 0x05D5 , 0x05D9 ,
466 0x05D5 , 0x05D3 , 0x05E2 , 0x0020 , 0x05EA , 0x05D5 , 0x05D1 , 0x05E7 , 0x05E2 , 0x05D1 , 0x0020 , 0x05D7 , 0x0022 , 0x05E6 , 0x05DE , 0x0020 ,
467 0x05EA , 0x05E8 , 0x05D9 , 0x05E7 , 0x05D7 , 0x0020 , 0x05EA , 0x05D7 , 0x05D9 , 0x05EA , 0x05E4 , 0x0020 , 0x05DC , 0x05E2 , 0x0020 , 0x05D4 ,
468 0x05E8 , 0x05D5 , 0x05D4 , 0x0020 , 0x002C , 0x05D8 , 0x05D9 , 0x05DC , 0x05D1 , 0x05DC , 0x05D3 , 0x05E0 , 0x05DE , 0x0020 , 0x05D9 , 0x05D7 ,
469 0x05D9 , 0x05D1 , 0x05D0 , 0x0020 , 0x05E3 , 0x05D5 , 0x05DC , 0x05D0 , 0x0020 , 0x05EA , 0x05EA , 0x0020 , 0x002C , 0x05D9 , 0x05E9 , 0x05D0 ,
470 0x05E8 , 0x05D4 , 0x0020 , 0x05D9 , 0x05D0 , 0x05D1 , 0x05E6 , 0x05D4 , 0x0020 , 0x05D8 , 0x05D9 , 0x05DC , 0x05E7 , 0x05E8 , 0x05E4 , 0x05D4 ,
474 int32_t bLength
= 0 , brLength
= 0 , cLength
= UPRV_LENGTHOF ( chars
), crLength
= UPRV_LENGTHOF ( chars_reverse
);
476 char * bytes
= extractBytes ( chars
, cLength
, "IBM424" , & bLength
);
477 char * bytes_r
= extractBytes ( chars_reverse
, crLength
, "IBM424" , & brLength
);
479 UCharsetDetector
* csd
= ucsdet_open (& status
);
480 const UCharsetMatch
* match
;
483 ucsdet_setText ( csd
, bytes
, bLength
, & status
);
484 match
= ucsdet_detect ( csd
, & status
);
487 log_err ( "Encoding detection failure for IBM424_rtl: got no matches. \n " );
491 name
= ucsdet_getName ( match
, & status
);
492 if ( strcmp ( name
, "IBM424_rtl" ) != 0 ) {
493 log_data_err ( "Encoding detection failure for IBM424_rtl: got %s . (Are you missing data?) \n " , name
);
496 ucsdet_setText ( csd
, bytes_r
, brLength
, & status
);
497 match
= ucsdet_detect ( csd
, & status
);
500 log_err ( "Encoding detection failure for IBM424_ltr: got no matches. \n " );
504 name
= ucsdet_getName ( match
, & status
);
505 if ( strcmp ( name
, "IBM424_ltr" ) != 0 ) {
506 log_data_err ( "Encoding detection failure for IBM424_ltr: got %s . (Are you missing data?) \n " , name
);
515 static void TestIBM420 ( void )
517 UErrorCode status
= U_ZERO_ERROR
;
519 static const UChar chars
[] = {
520 0x0648 , 0x064F , 0x0636 , 0x0639 , 0x062A , 0x0020 , 0x0648 , 0x0646 , 0x064F , 0x0641 , 0x0630 , 0x062A , 0x0020 , 0x0628 , 0x0631 , 0x0627 ,
521 0x0645 , 0x062C , 0x0020 , 0x062A , 0x0623 , 0x0645 , 0x064A , 0x0646 , 0x0020 , 0x0639 , 0x062F , 0x064A , 0x062F , 0x0629 , 0x0020 , 0x0641 ,
522 0x064A , 0x0020 , 0x0645 , 0x0624 , 0x0633 , 0x0633 , 0x0629 , 0x0020 , 0x0627 , 0x0644 , 0x062A , 0x0623 , 0x0645 , 0x064A , 0x0646 , 0x0020 ,
523 0x0627 , 0x0644 , 0x0648 , 0x0637 , 0x0646 , 0x064A , 0x002C , 0x0020 , 0x0645 , 0x0639 , 0x0020 , 0x0645 , 0x0644 , 0x0627 , 0x0626 , 0x0645 ,
524 0x062A , 0x0647 , 0x0627 , 0x0020 , 0x062F , 0x0627 , 0x0626 , 0x0645 , 0x0627 , 0x064B , 0x0020 , 0x0644 , 0x0644 , 0x0627 , 0x062D , 0x062A ,
525 0x064A , 0x0627 , 0x062C , 0x0627 , 0x062A , 0x0020 , 0x0627 , 0x0644 , 0x0645 , 0x062A , 0x063A , 0x064A , 0x0631 , 0x0629 , 0x0020 , 0x0644 ,
526 0x0644 , 0x0645 , 0x062C , 0x062A , 0x0645 , 0x0639 , 0x0020 , 0x0648 , 0x0644 , 0x0644 , 0x062F , 0x0648 , 0x0644 , 0x0629 , 0x002E , 0x0020 ,
527 0x062A , 0x0648 , 0x0633 , 0x0639 , 0x062A , 0x0020 , 0x0648 , 0x062A , 0x0637 , 0x0648 , 0x0631 , 0x062A , 0x0020 , 0x0627 , 0x0644 , 0x0645 ,
528 0x0624 , 0x0633 , 0x0633 , 0x0629 , 0x0020 , 0x0628 , 0x0647 , 0x062F , 0x0641 , 0x0020 , 0x0636 , 0x0645 , 0x0627 , 0x0646 , 0x0020 , 0x0634 ,
529 0x0628 , 0x0643 , 0x0629 , 0x0020 , 0x0623 , 0x0645 , 0x0627 , 0x0646 , 0x0020 , 0x0644 , 0x0633 , 0x0643 , 0x0627 , 0x0646 , 0x0020 , 0x062F ,
530 0x0648 , 0x0644 , 0x0629 , 0x0020 , 0x0627 , 0x0633 , 0x0631 , 0x0627 , 0x0626 , 0x064A , 0x0644 , 0x0020 , 0x0628 , 0x0648 , 0x062C , 0x0647 ,
531 0x0020 , 0x0627 , 0x0644 , 0x0645 , 0x062E , 0x0627 , 0x0637 , 0x0631 , 0x0020 , 0x0627 , 0x0644 , 0x0627 , 0x0642 , 0x062A , 0x0635 , 0x0627 ,
532 0x062F , 0x064A , 0x0629 , 0x0020 , 0x0648 , 0x0627 , 0x0644 , 0x0627 , 0x062C , 0x062A , 0x0645 , 0x0627 , 0x0639 , 0x064A , 0x0629 , 0x002E ,
535 static const UChar chars_reverse
[] = {
536 0x002E , 0x0629 , 0x064A , 0x0639 , 0x0627 , 0x0645 , 0x062A , 0x062C , 0x0627 , 0x0644 , 0x0627 , 0x0648 , 0x0020 , 0x0629 , 0x064A , 0x062F ,
537 0x0627 , 0x0635 , 0x062A , 0x0642 , 0x0627 , 0x0644 , 0x0627 , 0x0020 , 0x0631 , 0x0637 , 0x0627 , 0x062E , 0x0645 , 0x0644 , 0x0627 , 0x0020 ,
538 0x0647 , 0x062C , 0x0648 , 0x0628 , 0x0020 , 0x0644 , 0x064A , 0x0626 , 0x0627 , 0x0631 , 0x0633 , 0x0627 , 0x0020 , 0x0629 , 0x0644 , 0x0648 ,
539 0x062F , 0x0020 , 0x0646 , 0x0627 , 0x0643 , 0x0633 , 0x0644 , 0x0020 , 0x0646 , 0x0627 , 0x0645 , 0x0623 , 0x0020 , 0x0629 , 0x0643 , 0x0628 ,
540 0x0634 , 0x0020 , 0x0646 , 0x0627 , 0x0645 , 0x0636 , 0x0020 , 0x0641 , 0x062F , 0x0647 , 0x0628 , 0x0020 , 0x0629 , 0x0633 , 0x0633 , 0x0624 ,
541 0x0645 , 0x0644 , 0x0627 , 0x0020 , 0x062A , 0x0631 , 0x0648 , 0x0637 , 0x062A , 0x0648 , 0x0020 , 0x062A , 0x0639 , 0x0633 , 0x0648 , 0x062A ,
542 0x0020 , 0x002E , 0x0629 , 0x0644 , 0x0648 , 0x062F , 0x0644 , 0x0644 , 0x0648 , 0x0020 , 0x0639 , 0x0645 , 0x062A , 0x062C , 0x0645 , 0x0644 ,
543 0x0644 , 0x0020 , 0x0629 , 0x0631 , 0x064A , 0x063A , 0x062A , 0x0645 , 0x0644 , 0x0627 , 0x0020 , 0x062A , 0x0627 , 0x062C , 0x0627 , 0x064A ,
544 0x062A , 0x062D , 0x0627 , 0x0644 , 0x0644 , 0x0020 , 0x064B , 0x0627 , 0x0645 , 0x0626 , 0x0627 , 0x062F , 0x0020 , 0x0627 , 0x0647 , 0x062A ,
545 0x0645 , 0x0626 , 0x0627 , 0x0644 , 0x0645 , 0x0020 , 0x0639 , 0x0645 , 0x0020 , 0x002C , 0x064A , 0x0646 , 0x0637 , 0x0648 , 0x0644 , 0x0627 ,
546 0x0020 , 0x0646 , 0x064A , 0x0645 , 0x0623 , 0x062A , 0x0644 , 0x0627 , 0x0020 , 0x0629 , 0x0633 , 0x0633 , 0x0624 , 0x0645 , 0x0020 , 0x064A ,
547 0x0641 , 0x0020 , 0x0629 , 0x062F , 0x064A , 0x062F , 0x0639 , 0x0020 , 0x0646 , 0x064A , 0x0645 , 0x0623 , 0x062A , 0x0020 , 0x062C , 0x0645 ,
548 0x0627 , 0x0631 , 0x0628 , 0x0020 , 0x062A , 0x0630 , 0x0641 , 0x064F , 0x0646 , 0x0648 , 0x0020 , 0x062A , 0x0639 , 0x0636 , 0x064F , 0x0648 ,
552 int32_t bLength
= 0 , brLength
= 0 , cLength
= UPRV_LENGTHOF ( chars
), crLength
= UPRV_LENGTHOF ( chars_reverse
);
554 char * bytes
= extractBytes ( chars
, cLength
, "IBM420" , & bLength
);
555 char * bytes_r
= extractBytes ( chars_reverse
, crLength
, "IBM420" , & brLength
);
557 UCharsetDetector
* csd
= ucsdet_open (& status
);
558 const UCharsetMatch
* match
;
561 ucsdet_setText ( csd
, bytes
, bLength
, & status
);
562 match
= ucsdet_detect ( csd
, & status
);
565 log_err ( "Encoding detection failure for IBM420_rtl: got no matches. \n " );
569 name
= ucsdet_getName ( match
, & status
);
570 if ( strcmp ( name
, "IBM420_rtl" ) != 0 ) {
571 log_data_err ( "Encoding detection failure for IBM420_rtl: got %s . (Are you missing data?) \n " , name
);
574 ucsdet_setText ( csd
, bytes_r
, brLength
, & status
);
575 match
= ucsdet_detect ( csd
, & status
);
578 log_err ( "Encoding detection failure for IBM420_ltr: got no matches. \n " );
582 name
= ucsdet_getName ( match
, & status
);
583 if ( strcmp ( name
, "IBM420_ltr" ) != 0 ) {
584 log_data_err ( "Encoding detection failure for IBM420_ltr: got %s . (Are you missing data?) \n " , name
);