1 /*************************************************************************
3 * © 2016 and later: Unicode, Inc. and others.
4 * License & terms of use: http://www.unicode.org/copyright.html#License
6 **************************************************************************
7 **************************************************************************
9 * Copyright (C) 2000-2016, International Business Machines
10 * Corporation and others. All Rights Reserved.
12 ***************************************************************************
13 * file name: convsamp.c
14 * encoding: ASCII (7-bit)
16 * created on: 2000may30
17 * created by: Steven R. Loomis
19 * Sample code for the ICU conversion routines.
21 * Note: Nothing special is needed to build this sample. Link with
22 * the icu UC and icu I18N libraries.
24 * I use 'assert' for error checking, you probably will want
25 * something more flexible. '***BEGIN SAMPLE***' and
26 * '***END SAMPLE***' mark pieces suitable for stand alone
30 * Each test can define its own BUFFERSIZE
34 #define DEBUG_TMI 0 /* define to 1 to enable Too Much Information */
37 #include <ctype.h> /* for isspace, etc. */
40 #include <stdlib.h> /* malloc */
42 #include "unicode/utypes.h" /* Basic ICU data types */
43 #include "unicode/ucnv.h" /* C Converter API */
44 #include "unicode/ustring.h" /* some more string fcns*/
45 #include "unicode/uchar.h" /* char names */
46 #include "unicode/uloc.h"
47 #include "unicode/unistr.h"
51 /* Some utility functions */
53 #define UPRV_LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
56 static const UChar kNone
[] = { 0x0000 };
58 #define U_ASSERT(x) { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }}
60 /* Print a UChar if possible, in seven characters. */
61 void prettyPrintUChar(UChar c
)
65 printf(" '%c' ", (char)(0x00FF&c
));
66 } else if ( c
> 0x007F ) {
68 UErrorCode status
= U_ZERO_ERROR
;
71 o
= u_charName(c
, U_EXTENDED_CHAR_NAME
, buf
, 1000, &status
);
72 if(U_SUCCESS(status
) && (o
>0) ) {
79 switch((char)(c
& 0x007F)) {
97 void printUChars(const char *name
= "?",
98 const UChar
*uch
= kNone
,
103 if( (len
== -1) && (uch
) ) {
107 printf("%5s: ", name
);
108 for( i
= 0; i
<len
; i
++) {
113 printf("%5s: ", "uni");
114 for( i
= 0; i
<len
; i
++) {
115 printf("\\u%04X ", (int)uch
[i
]);
119 printf("%5s:", "ch");
120 for( i
= 0; i
<len
; i
++) {
121 prettyPrintUChar(uch
[i
]);
126 void printBytes(const char *name
= "?",
127 const char *uch
= "",
132 if( (len
== -1) && (uch
) ) {
136 printf("%5s: ", name
);
137 for( i
= 0; i
<len
; i
++) {
142 printf("%5s: ", "uni");
143 for( i
= 0; i
<len
; i
++) {
144 printf("\\x%02X ", 0x00FF & (int)uch
[i
]);
148 printf("%5s:", "ch");
149 for( i
= 0; i
<len
; i
++) {
150 if(isgraph(0x00FF & (int)uch
[i
])) {
151 printf(" '%c' ", (char)uch
[i
]);
159 void printUChar(UChar32 ch32
)
162 printf("ch: U+%06X\n", ch32
);
165 UChar ch
= (UChar
)ch32
;
166 printUChars("C", &ch
, 1);
170 /*******************************************************************
171 Very simple C sample to convert the word 'Moscow' in Russian in Unicode,
172 followed by an exclamation mark (!) into the KOI8-R Russian code page.
174 This example first creates a UChar String out of the Unicode chars.
176 targetSize must be set to the amount of space available in the target
177 buffer. After fromUChars is called,
178 len will contain the number of bytes in target[] which were
179 used in the resulting codepage. In this case, there is a 1:1 mapping
180 between the input and output characters. The exclamation mark has the
181 same value in both KOI8-R and Unicode.
184 uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021
185 ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL '!'
188 uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21
192 Converting FROM unicode
194 You must call ucnv_close to clean up the memory used by the
197 'len' returns the number of OUTPUT bytes resulting from the
201 UErrorCode
convsample_02()
203 printf("\n\n==============================================\n"
204 "Sample 02: C: simple Unicode -> koi8-r conversion\n");
207 // **************************** START SAMPLE *******************
209 UChar source
[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432,
210 0x0430, 0x0021, 0x0000 };
212 UErrorCode status
= U_ZERO_ERROR
;
216 // set up the converter
218 conv
= ucnv_open("koi8-r", &status
);
220 assert(U_SUCCESS(status
));
223 len
= ucnv_fromUChars(conv
, target
, 100, source
, -1, &status
);
224 assert(U_SUCCESS(status
));
226 // close the converter
229 // ***************************** END SAMPLE ********************
232 printUChars("src", source
);
234 printBytes("targ", target
, len
);
240 UErrorCode
convsample_03()
242 printf("\n\n==============================================\n"
243 "Sample 03: C: print out all converters\n");
248 // **************************** START SAMPLE *******************
249 count
= ucnv_countAvailable();
250 printf("Available converters: %d\n", count
);
254 printf("%s ", ucnv_getAvailableName(i
));
257 // ***************************** END SAMPLE ********************
266 #define BUFFERSIZE 17 /* make it interesting :) */
269 Converting from a codepage to Unicode in bulk..
270 What is the best way to determine the buffer size?
272 The 'buffersize' is in bytes of input.
273 For a given converter, dividing this by the minimum char size
274 gives you the maximum number of Unicode characters that could be
275 expected for a given number of input bytes.
276 see: ucnv_getMinCharSize()
278 For example, a single byte codepage like 'Latin-3' has a
279 minimum char size of 1. (It takes at least 1 byte to represent
280 each Unicode char.) So the unicode buffer has the same number of
281 UChars as the input buffer has bytes.
283 In a strictly double byte codepage such as cp1362 (Windows
284 Korean), the minimum char size is 2. So, only half as many Unicode
285 chars as bytes are needed.
287 This work to calculate the buffer size is an optimization. Any
288 size of input and output buffer can be used, as long as the
289 program handles the following cases: If the input buffer is empty,
290 the source pointer will be equal to sourceLimit. If the output
291 buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned.
294 UErrorCode
convsample_05()
296 printf("\n\n==============================================\n"
297 "Sample 05: C: count the number of letters in a UTF-8 document\n");
301 char inBuf
[BUFFERSIZE
];
303 const char *sourceLimit
;
308 int32_t uBufSize
= 0;
310 UErrorCode status
= U_ZERO_ERROR
;
311 uint32_t letters
=0, total
=0;
313 f
= fopen("data01.txt", "r");
316 fprintf(stderr
, "Couldn't open file 'data01.txt' (UTF-8 data file).\n");
317 return U_FILE_ACCESS_ERROR
;
320 // **************************** START SAMPLE *******************
321 conv
= ucnv_open("utf-8", &status
);
322 assert(U_SUCCESS(status
));
324 uBufSize
= (BUFFERSIZE
/ucnv_getMinCharSize(conv
));
325 printf("input bytes %d / min chars %d = %d UChars\n",
326 BUFFERSIZE
, ucnv_getMinCharSize(conv
), uBufSize
);
327 uBuf
= (UChar
*)malloc(uBufSize
* sizeof(UChar
));
330 // grab another buffer's worth
332 ((count
=fread(inBuf
, 1, BUFFERSIZE
, f
)) > 0) )
334 // Convert bytes to unicode
336 sourceLimit
= inBuf
+ count
;
341 targetLimit
= uBuf
+ uBufSize
;
343 ucnv_toUnicode(conv
, &target
, targetLimit
,
344 &source
, sourceLimit
, NULL
,
345 feof(f
)?TRUE
:FALSE
, /* pass 'flush' when eof */
346 /* is true (when no more data will come) */
349 if(status
== U_BUFFER_OVERFLOW_ERROR
)
351 // simply ran out of space - we'll reset the target ptr the next
352 // time through the loop.
353 status
= U_ZERO_ERROR
;
357 // Check other errors here.
358 assert(U_SUCCESS(status
));
359 // Break out of the loop (by force)
362 // Process the Unicode
363 // Todo: handle UTF-16/surrogates
365 for(p
= uBuf
; p
<target
; p
++)
371 } while (source
< sourceLimit
); // while simply out of space
374 printf("%d letters out of %d total UChars.\n", letters
, total
);
376 // ***************************** END SAMPLE ********************
387 #define BUFFERSIZE 1024
394 UErrorCode
convsample_06()
396 printf("\n\n==============================================\n"
397 "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");
401 char inBuf
[BUFFERSIZE
];
403 const char *sourceLimit
;
404 int32_t uBufSize
= 0;
406 UErrorCode status
= U_ZERO_ERROR
;
407 uint32_t letters
=0, total
=0;
410 UChar32 charCount
= 0x10000; /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
417 f
= fopen("data06.txt", "r");
420 fprintf(stderr
, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
421 return U_FILE_ACCESS_ERROR
;
424 info
= (CharFreqInfo
*)malloc(sizeof(CharFreqInfo
) * charCount
);
427 fprintf(stderr
, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo
)*charCount
);
430 /* reset frequencies */
431 for(p
=0;p
<charCount
;p
++)
433 info
[p
].codepoint
= p
;
434 info
[p
].frequency
= 0;
437 // **************************** START SAMPLE *******************
438 conv
= ucnv_open("utf-8", &status
);
439 assert(U_SUCCESS(status
));
441 uBufSize
= (BUFFERSIZE
/ucnv_getMinCharSize(conv
));
442 printf("input bytes %d / min chars %d = %d UChars\n",
443 BUFFERSIZE
, ucnv_getMinCharSize(conv
), uBufSize
);
445 // grab another buffer's worth
447 ((count
=fread(inBuf
, 1, BUFFERSIZE
, f
)) > 0) )
449 // Convert bytes to unicode
451 sourceLimit
= inBuf
+ count
;
453 while(source
< sourceLimit
)
455 p
= ucnv_getNextUChar(conv
, &source
, sourceLimit
, &status
);
456 if(U_FAILURE(status
))
458 fprintf(stderr
, "%s @ %d\n", u_errorName(status
), total
);
459 status
= U_ZERO_ERROR
;
468 if((u_tolower(l
) == 'i') && (u_tolower(p
) == 'e'))
471 if((u_tolower(l
) == 'g') && (u_tolower(p
) == 0x0127))
476 fprintf(stderr
, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p
);
480 return U_UNSUPPORTED_ERROR
;
490 printf("%d letters out of %d total UChars.\n", letters
, total
);
491 printf("%d ie digraphs, %d gh digraphs.\n", ie
, gh
);
493 // now, we could sort it..
495 // qsort(info, charCount, sizeof(info[0]), charfreq_compare);
497 for(p
=0;p
<charCount
;p
++)
499 if(info
[p
].frequency
)
501 printf("% 5d U+%06X ", info
[p
].frequency
, p
);
504 prettyPrintUChar((UChar
)p
);
510 // ***************************** END SAMPLE ********************
519 /******************************************************
520 You must call ucnv_close to clean up the memory used by the
523 'len' returns the number of OUTPUT bytes resulting from the
527 UErrorCode
convsample_12()
529 printf("\n\n==============================================\n"
530 "Sample 12: C: simple sjis -> unicode conversion\n");
533 // **************************** START SAMPLE *******************
535 char source
[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 };
537 UErrorCode status
= U_ZERO_ERROR
;
541 // set up the converter
542 conv
= ucnv_open("shift_jis", &status
);
543 assert(U_SUCCESS(status
));
545 // convert to Unicode
546 // Note: we can use strlen, we know it's an 8 bit null terminated codepage
548 len
= ucnv_toUChars(conv
, target
, 100, source
, strlen(source
), &status
);
550 // close the converter
553 // ***************************** END SAMPLE ********************
556 printBytes("src", source
, strlen(source
) );
558 printUChars("targ", target
, len
);
563 /******************************************************************
564 C: Convert from codepage to Unicode one at a time.
567 UErrorCode
convsample_13()
569 printf("\n\n==============================================\n"
570 "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n");
573 const char sourceChars
[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e };
574 // const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e };
575 const char *source
, *sourceLimit
;
577 UErrorCode status
= U_ZERO_ERROR
;
578 UConverter
*conv
= NULL
;
582 srcCount
= sizeof(sourceChars
);
584 conv
= ucnv_open("Big5", &status
);
587 source
= sourceChars
;
588 sourceLimit
= sourceChars
+ sizeof(sourceChars
);
590 // **************************** START SAMPLE *******************
593 printBytes("src",source
,sourceLimit
-source
);
595 while(source
< sourceLimit
)
598 target
= ucnv_getNextUChar (conv
,
603 // printBytes("src",source,sourceLimit-source);
610 // ************************** END SAMPLE *************************
612 printf("src=%d bytes, dst=%d uchars\n", srcCount
, dstCount
);
621 UBool
convsample_20_didSubstitute(const char *source
)
625 UConverter
*conv
= NULL
;
626 UErrorCode status
= U_ZERO_ERROR
;
630 FromUFLAGContext
* context
= NULL
;
632 printf("\n\n==============================================\n"
633 "Sample 20: C: Test for substitution using callbacks\n");
635 /* print out the original source */
636 printBytes("src", source
);
639 /* First, convert from UTF8 to unicode */
640 conv
= ucnv_open("utf-8", &status
);
643 len
= ucnv_toUChars(conv
, uchars
, 100, source
, strlen(source
), &status
);
646 printUChars("uch", uchars
, len
);
649 /* Now, close the converter */
652 /* Now, convert to windows-1252 */
653 conv
= ucnv_open("windows-1252", &status
);
656 /* Converter starts out with the SUBSTITUTE callback set. */
658 /* initialize our callback */
659 context
= flagCB_fromU_openContext();
661 /* Set our special callback */
662 ucnv_setFromUCallBack(conv
,
665 &(context
->subCallback
),
666 &(context
->subContext
),
671 len2
= ucnv_fromUChars(conv
, bytes
, 100, uchars
, len
, &status
);
674 flagVal
= context
->flag
; /* it's about to go away when we close the cnv */
678 /* print out the converted bytes */
679 printBytes("bytes", bytes
, len2
);
681 return flagVal
; /* true if callback was called */
684 UErrorCode
convsample_20()
686 const char *sample1
= "abc\xdf\xbf";
687 const char *sample2
= "abc_def";
690 if(convsample_20_didSubstitute(sample1
))
692 printf("DID substitute.\n******\n");
696 printf("Did NOT substitute.\n*****\n");
699 if(convsample_20_didSubstitute(sample2
))
701 printf("DID substitute.\n******\n");
705 printf("Did NOT substitute.\n*****\n");
711 // 21 - C, callback, with clone and debug
715 UBool
convsample_21_didSubstitute(const char *source
)
719 UConverter
*conv
= NULL
, *cloneCnv
= NULL
;
720 UErrorCode status
= U_ZERO_ERROR
;
723 UBool flagVal
= FALSE
;
724 UConverterFromUCallback junkCB
;
726 FromUFLAGContext
*flagCtx
= NULL
,
727 *cloneFlagCtx
= NULL
;
729 debugCBContext
*debugCtx1
= NULL
,
731 *cloneDebugCtx
= NULL
;
733 printf("\n\n==============================================\n"
734 "Sample 21: C: Test for substitution w/ callbacks & clones \n");
736 /* print out the original source */
737 printBytes("src", source
);
740 /* First, convert from UTF8 to unicode */
741 conv
= ucnv_open("utf-8", &status
);
744 len
= ucnv_toUChars(conv
, uchars
, 100, source
, strlen(source
), &status
);
747 printUChars("uch", uchars
, len
);
750 /* Now, close the converter */
753 /* Now, convert to windows-1252 */
754 conv
= ucnv_open("windows-1252", &status
);
757 /* Converter starts out with the SUBSTITUTE callback set. */
759 /* initialize our callback */
760 /* from the 'bottom' innermost, out
761 * CNV -> debugCtx1[debug] -> flagCtx[flag] -> debugCtx2[debug] */
764 printf("flagCB_fromU = %p\n", &flagCB_fromU
);
765 printf("debugCB_fromU = %p\n", &debugCB_fromU
);
768 debugCtx1
= debugCB_openContext();
769 flagCtx
= flagCB_fromU_openContext();
770 debugCtx2
= debugCB_openContext();
772 debugCtx1
->subCallback
= flagCB_fromU
; /* debug1 -> flag */
773 debugCtx1
->subContext
= flagCtx
;
775 flagCtx
->subCallback
= debugCB_fromU
; /* flag -> debug2 */
776 flagCtx
->subContext
= debugCtx2
;
778 debugCtx2
->subCallback
= UCNV_FROM_U_CALLBACK_SUBSTITUTE
;
779 debugCtx2
->subContext
= NULL
;
781 /* Set our special callback */
783 ucnv_setFromUCallBack(conv
,
786 &(debugCtx2
->subCallback
),
787 &(debugCtx2
->subContext
),
793 printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n",
794 conv
, debugCtx1
, debugCtx1
->subCallback
,
795 debugCtx1
->subContext
, flagCtx
, debugCtx2
, debugCtx2
->subCallback
);
798 cloneCnv
= ucnv_safeClone(conv
, NULL
, NULL
, &status
);
803 printf("Cloned converter from %p -> %p. Closing %p.\n", conv
, cloneCnv
, conv
);
809 printf("%p closed.\n", conv
);
813 /* Now, we have to extract the context */
814 cloneDebugCtx
= NULL
;
817 ucnv_getFromUCallBack(cloneCnv
, &junkCB
, (const void **)&cloneDebugCtx
);
818 if(cloneDebugCtx
!= NULL
) {
819 cloneFlagCtx
= (FromUFLAGContext
*) cloneDebugCtx
-> subContext
;
822 printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n",
823 cloneCnv
, cloneDebugCtx
, cloneFlagCtx
, cloneFlagCtx
?cloneFlagCtx
->subContext
:NULL
);
825 len2
= ucnv_fromUChars(cloneCnv
, bytes
, 100, uchars
, len
, &status
);
828 if(cloneFlagCtx
!= NULL
) {
829 flagVal
= cloneFlagCtx
->flag
; /* it's about to go away when we close the cnv */
831 printf("** Warning, couldn't get the subcallback \n");
834 ucnv_close(cloneCnv
);
836 /* print out the converted bytes */
837 printBytes("bytes", bytes
, len2
);
839 return flagVal
; /* true if callback was called */
842 UErrorCode
convsample_21()
844 const char *sample1
= "abc\xdf\xbf";
845 const char *sample2
= "abc_def";
847 if(convsample_21_didSubstitute(sample1
))
849 printf("DID substitute.\n******\n");
853 printf("Did NOT substitute.\n*****\n");
856 if(convsample_21_didSubstitute(sample2
))
858 printf("DID substitute.\n******\n");
862 printf("Did NOT substitute.\n*****\n");
869 // 40- C, cp37 -> UTF16 [data02.bin -> data40.utf16]
871 #define BUFFERSIZE 17 /* make it interesting :) */
873 UErrorCode
convsample_40()
875 printf("\n\n==============================================\n"
876 "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n");
881 char inBuf
[BUFFERSIZE
];
883 const char *sourceLimit
;
887 int32_t uBufSize
= 0;
888 UConverter
*conv
= NULL
;
889 UErrorCode status
= U_ZERO_ERROR
;
890 uint32_t inbytes
=0, total
=0;
892 f
= fopen("data02.bin", "rb");
895 fprintf(stderr
, "Couldn't open file 'data02.bin' (cp37 data file).\n");
896 return U_FILE_ACCESS_ERROR
;
899 out
= fopen("data40.utf16", "wb");
902 fprintf(stderr
, "Couldn't create file 'data40.utf16'.\n");
904 return U_FILE_ACCESS_ERROR
;
907 // **************************** START SAMPLE *******************
908 conv
= ucnv_openCCSID(37, UCNV_IBM
, &status
);
909 assert(U_SUCCESS(status
));
911 uBufSize
= (BUFFERSIZE
/ucnv_getMinCharSize(conv
));
912 printf("input bytes %d / min chars %d = %d UChars\n",
913 BUFFERSIZE
, ucnv_getMinCharSize(conv
), uBufSize
);
914 uBuf
= (UChar
*)malloc(uBufSize
* sizeof(UChar
));
917 // grab another buffer's worth
919 ((count
=fread(inBuf
, 1, BUFFERSIZE
, f
)) > 0) )
923 // Convert bytes to unicode
925 sourceLimit
= inBuf
+ count
;
930 targetLimit
= uBuf
+ uBufSize
;
932 ucnv_toUnicode( conv
, &target
, targetLimit
,
933 &source
, sourceLimit
, NULL
,
934 feof(f
)?TRUE
:FALSE
, /* pass 'flush' when eof */
935 /* is true (when no more data will come) */
938 if(status
== U_BUFFER_OVERFLOW_ERROR
)
940 // simply ran out of space - we'll reset the target ptr the next
941 // time through the loop.
942 status
= U_ZERO_ERROR
;
946 // Check other errors here.
947 assert(U_SUCCESS(status
));
948 // Break out of the loop (by force)
951 // Process the Unicode
952 // Todo: handle UTF-16/surrogates
953 assert(fwrite(uBuf
, sizeof(uBuf
[0]), (target
-uBuf
), out
) ==
954 (size_t)(target
-uBuf
));
955 total
+= (target
-uBuf
);
956 } while (source
< sourceLimit
); // while simply out of space
959 printf("%d bytes in, %d UChars out.\n", inbytes
, total
);
961 // ***************************** END SAMPLE ********************
974 // 46- C, UTF16 -> latin2 [data40.utf16 -> data46.out]
976 #define BUFFERSIZE 24 /* make it interesting :) */
978 UErrorCode
convsample_46()
980 printf("\n\n==============================================\n"
981 "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n");
986 UChar inBuf
[BUFFERSIZE
];
988 const UChar
*sourceLimit
;
994 UConverter
*conv
= NULL
;
995 UErrorCode status
= U_ZERO_ERROR
;
996 uint32_t inchars
=0, total
=0;
998 f
= fopen("data40.utf16", "rb");
1001 fprintf(stderr
, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n");
1002 return U_FILE_ACCESS_ERROR
;
1005 out
= fopen("data46.out", "wb");
1008 fprintf(stderr
, "Couldn't create file 'data46.out'.\n");
1010 return U_FILE_ACCESS_ERROR
;
1013 // **************************** START SAMPLE *******************
1014 conv
= ucnv_open( "iso-8859-2", &status
);
1015 assert(U_SUCCESS(status
));
1017 bufSize
= (BUFFERSIZE
*ucnv_getMaxCharSize(conv
));
1018 printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n",
1019 BUFFERSIZE
, ucnv_getMaxCharSize(conv
), bufSize
);
1020 buf
= (char*)malloc(bufSize
* sizeof(char));
1023 // grab another buffer's worth
1025 ((count
=fread(inBuf
, sizeof(UChar
), BUFFERSIZE
, f
)) > 0) )
1029 // Convert Unicode to bytes
1031 sourceLimit
= inBuf
+ count
;
1036 targetLimit
= buf
+ bufSize
;
1038 ucnv_fromUnicode( conv
, &target
, targetLimit
,
1039 &source
, sourceLimit
, NULL
,
1040 feof(f
)?TRUE
:FALSE
, /* pass 'flush' when eof */
1041 /* is true (when no more data will come) */
1044 if(status
== U_BUFFER_OVERFLOW_ERROR
)
1046 // simply ran out of space - we'll reset the target ptr the next
1047 // time through the loop.
1048 status
= U_ZERO_ERROR
;
1052 // Check other errors here.
1053 assert(U_SUCCESS(status
));
1054 // Break out of the loop (by force)
1057 // Write out the converted bytes
1058 assert(fwrite(buf
, sizeof(buf
[0]), (target
-buf
), out
) ==
1059 (size_t)(target
-buf
));
1060 total
+= (target
-buf
);
1061 } while (source
< sourceLimit
); // while simply out of space
1064 printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars
, inchars
* sizeof(UChar
), total
);
1066 // ***************************** END SAMPLE ********************
1073 return U_ZERO_ERROR
;
1077 #define BUFFERSIZE 219
1079 void convsample_50() {
1080 printf("\n\n==============================================\n"
1081 "Sample 50: C: ucnv_detectUnicodeSignature\n");
1083 //! [ucnv_detectUnicodeSignature]
1084 UErrorCode err
= U_ZERO_ERROR
;
1085 UBool discardSignature
= TRUE
; /* set to TRUE to throw away the initial U+FEFF */
1086 char input
[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' };
1087 int32_t signatureLength
= 0;
1088 const char *encoding
= ucnv_detectUnicodeSignature(input
,sizeof(input
),&signatureLength
,&err
);
1089 UConverter
*conv
= NULL
;
1091 UChar
*target
= output
, *out
;
1092 const char *source
= input
;
1093 if(encoding
!=NULL
&& U_SUCCESS(err
)){
1094 // should signature be discarded ?
1095 conv
= ucnv_open(encoding
, &err
);
1096 // do the conversion
1097 ucnv_toUnicode(conv
,
1098 &target
, output
+ UPRV_LENGTHOF(output
),
1099 &source
, input
+ sizeof(input
),
1102 if (discardSignature
){
1103 ++out
; // ignore initial U+FEFF
1105 while(out
!= target
) {
1106 printf("%04x ", *out
++);
1110 //! [ucnv_detectUnicodeSignature]
1121 printf("Default Converter=%s\n", ucnv_getDefaultName() );
1123 convsample_02(); // C , u->koi8r, conv
1124 convsample_03(); // C, iterate
1126 convsample_05(); // C, utf8->u, toUnicode
1127 convsample_06(); // C freq counter thingy
1129 convsample_12(); // C, sjis->u, conv
1130 convsample_13(); // C, big5->u, getNextU
1132 convsample_20(); // C, callback
1133 convsample_21(); // C, callback debug
1135 convsample_40(); // C, cp37 -> UTF16 [data02.bin -> data40.utf16]
1137 convsample_46(); // C, UTF16 -> latin2 [data40.utf16 -> data46.out]
1139 convsample_50(); // C, detect unicode signature
1141 printf("End of converter samples.\n");