1 /*************************************************************************
3 * © 2016 and later: Unicode, Inc. and others.
4 * License & terms of use: http://www.unicode.org/copyright.html#License
6 **************************************************************************
7 **************************************************************************
9 * Copyright (C) 2000-2016, International Business Machines
10 * Corporation and others. All Rights Reserved.
12 ***************************************************************************
13 * file name: convsamp.c
14 * encoding: ASCII (7-bit)
16 * created on: 2000may30
17 * created by: Steven R. Loomis
19 * Sample code for the ICU conversion routines.
21 * Note: Nothing special is needed to build this sample. Link with
22 * the icu UC and icu I18N libraries.
24 * I use 'assert' for error checking; you will probably want
25 * something more flexible. '***START SAMPLE***' and
26 * '***END SAMPLE***' mark pieces suitable for stand alone
30 * Each test can define its own BUFFERSIZE
34 #define DEBUG_TMI 0 /* define to 1 to enable Too Much Information */
37 #include <ctype.h> /* for isspace, etc. */
40 #include <stdlib.h> /* malloc */
42 #include "unicode/utypes.h" /* Basic ICU data types */
43 #include "unicode/ucnv.h" /* C Converter API */
44 #include "unicode/ustring.h" /* some more string fcns*/
45 #include "unicode/uchar.h" /* char names */
46 #include "unicode/uloc.h"
47 #include "unicode/unistr.h"
51 /* Some utility functions */
53 #define UPRV_LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
56 static const UChar kNone
[] = { 0x0000 };
58 #define U_ASSERT(x) { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }}
60 /* Print a UChar if possible, in seven characters. */
61 void prettyPrintUChar(UChar c
)
65 printf(" '%c' ", (char)(0x00FF&c
));
66 } else if ( c
> 0x007F ) {
68 UErrorCode status
= U_ZERO_ERROR
;
71 o
= u_charName(c
, U_EXTENDED_CHAR_NAME
, buf
, 1000, &status
);
72 if(U_SUCCESS(status
) && (o
>0) ) {
79 switch((char)(c
& 0x007F)) {
97 void printUChars(const char *name
= "?",
98 const UChar
*uch
= kNone
,
103 if( (len
== -1) && (uch
) ) {
107 printf("%5s: ", name
);
108 for( i
= 0; i
<len
; i
++) {
113 printf("%5s: ", "uni");
114 for( i
= 0; i
<len
; i
++) {
115 printf("\\u%04X ", (int)uch
[i
]);
119 printf("%5s:", "ch");
120 for( i
= 0; i
<len
; i
++) {
121 prettyPrintUChar(uch
[i
]);
126 void printBytes(const char *name
= "?",
127 const char *uch
= "",
132 if( (len
== -1) && (uch
) ) {
133 len
= static_cast<int32_t>(strlen(uch
));
136 printf("%5s: ", name
);
137 for( i
= 0; i
<len
; i
++) {
142 printf("%5s: ", "uni");
143 for( i
= 0; i
<len
; i
++) {
144 printf("\\x%02X ", 0x00FF & (int)uch
[i
]);
148 printf("%5s:", "ch");
149 for( i
= 0; i
<len
; i
++) {
150 if(isgraph(0x00FF & (int)uch
[i
])) {
151 printf(" '%c' ", (char)uch
[i
]);
159 void printUChar(UChar32 ch32
)
162 printf("ch: U+%06X\n", ch32
);
165 UChar ch
= (UChar
)ch32
;
166 printUChars("C", &ch
, 1);
170 /*******************************************************************
171 Very simple C sample to convert the word 'Moscow' in Russian in Unicode,
172 followed by an exclamation mark (!) into the KOI8-R Russian code page.
174 This example first creates a UChar String out of the Unicode chars.
176 targetSize must be set to the amount of space available in the target
177 buffer. After fromUChars is called,
178 len will contain the number of bytes in target[] which were
179 used in the resulting codepage. In this case, there is a 1:1 mapping
180 between the input and output characters. The exclamation mark has the
181 same value in both KOI8-R and Unicode.
184 uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021
185 ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL '!'
188 uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21
192 Converting FROM unicode
194 You must call ucnv_close to clean up the memory used by the
197 'len' returns the number of OUTPUT bytes resulting from the
201 UErrorCode
convsample_02()
203 printf("\n\n==============================================\n"
204 "Sample 02: C: simple Unicode -> koi8-r conversion\n");
207 // **************************** START SAMPLE *******************
209 UChar source
[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432,
210 0x0430, 0x0021, 0x0000 };
212 UErrorCode status
= U_ZERO_ERROR
;
216 // set up the converter
218 conv
= ucnv_open("koi8-r", &status
);
220 assert(U_SUCCESS(status
));
223 len
= ucnv_fromUChars(conv
, target
, 100, source
, -1, &status
);
224 assert(U_SUCCESS(status
));
226 // close the converter
229 // ***************************** END SAMPLE ********************
232 printUChars("src", source
);
234 printBytes("targ", target
, len
);
240 UErrorCode
convsample_03()
242 printf("\n\n==============================================\n"
243 "Sample 03: C: print out all converters\n");
248 // **************************** START SAMPLE *******************
249 count
= ucnv_countAvailable();
250 printf("Available converters: %d\n", count
);
254 printf("%s ", ucnv_getAvailableName(i
));
257 // ***************************** END SAMPLE ********************
266 #define BUFFERSIZE 17 /* make it interesting :) */
269 Converting from a codepage to Unicode in bulk..
270 What is the best way to determine the buffer size?
272 The 'buffersize' is in bytes of input.
273 For a given converter, dividing this by the minimum char size
274 gives you the maximum number of Unicode characters that could be
275 expected for a given number of input bytes.
276 see: ucnv_getMinCharSize()
278 For example, a single byte codepage like 'Latin-3' has a
279 minimum char size of 1. (It takes at least 1 byte to represent
280 each Unicode char.) So the unicode buffer has the same number of
281 UChars as the input buffer has bytes.
283 In a strictly double byte codepage such as cp1362 (Windows
284 Korean), the minimum char size is 2. So, only half as many Unicode
285 chars as bytes are needed.
287 This work to calculate the buffer size is an optimization. Any
288 size of input and output buffer can be used, as long as the
289 program handles the following cases: If the input buffer is empty,
290 the source pointer will be equal to sourceLimit. If the output
291 buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned.
294 UErrorCode
convsample_05()
296 printf("\n\n==============================================\n"
297 "Sample 05: C: count the number of letters in a UTF-8 document\n");
301 char inBuf
[BUFFERSIZE
];
303 const char *sourceLimit
;
308 int32_t uBufSize
= 0;
310 UErrorCode status
= U_ZERO_ERROR
;
311 uint32_t letters
=0, total
=0;
313 f
= fopen("data01.txt", "r");
316 fprintf(stderr
, "Couldn't open file 'data01.txt' (UTF-8 data file).\n");
317 return U_FILE_ACCESS_ERROR
;
320 // **************************** START SAMPLE *******************
321 conv
= ucnv_open("utf-8", &status
);
322 assert(U_SUCCESS(status
));
324 uBufSize
= (BUFFERSIZE
/ucnv_getMinCharSize(conv
));
325 printf("input bytes %d / min chars %d = %d UChars\n",
326 BUFFERSIZE
, ucnv_getMinCharSize(conv
), uBufSize
);
327 uBuf
= (UChar
*)malloc(uBufSize
* sizeof(UChar
));
330 // grab another buffer's worth
332 ((count
=static_cast<int32_t>(fread(inBuf
, 1, BUFFERSIZE
, f
))) > 0) )
334 // Convert bytes to unicode
336 sourceLimit
= inBuf
+ count
;
341 targetLimit
= uBuf
+ uBufSize
;
343 ucnv_toUnicode(conv
, &target
, targetLimit
,
344 &source
, sourceLimit
, NULL
,
345 feof(f
)?TRUE
:FALSE
, /* pass 'flush' when eof */
346 /* is true (when no more data will come) */
349 if(status
== U_BUFFER_OVERFLOW_ERROR
)
351 // simply ran out of space - we'll reset the target ptr the next
352 // time through the loop.
353 status
= U_ZERO_ERROR
;
357 // Check other errors here.
358 assert(U_SUCCESS(status
));
359 // Break out of the loop (by force)
362 // Process the Unicode
363 // Todo: handle UTF-16/surrogates
365 for(p
= uBuf
; p
<target
; p
++)
371 } while (source
< sourceLimit
); // while simply out of space
374 printf("%d letters out of %d total UChars.\n", letters
, total
);
376 // ***************************** END SAMPLE ********************
387 #define BUFFERSIZE 1024
394 UErrorCode
convsample_06()
396 printf("\n\n==============================================\n"
397 "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");
401 char inBuf
[BUFFERSIZE
];
403 const char *sourceLimit
;
404 int32_t uBufSize
= 0;
406 UErrorCode status
= U_ZERO_ERROR
;
407 uint32_t letters
=0, total
=0;
410 UChar32 charCount
= 0x10000; /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
417 f
= fopen("data06.txt", "r");
420 fprintf(stderr
, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
421 return U_FILE_ACCESS_ERROR
;
424 info
= (CharFreqInfo
*)malloc(sizeof(CharFreqInfo
) * charCount
);
427 fprintf(stderr
, " Couldn't allocate %d bytes for freq counter\n", static_cast<int>(sizeof(CharFreqInfo
)*charCount
));
430 /* reset frequencies */
431 for(p
=0;p
<charCount
;p
++)
433 info
[p
].codepoint
= p
;
434 info
[p
].frequency
= 0;
437 // **************************** START SAMPLE *******************
438 conv
= ucnv_open("utf-8", &status
);
439 assert(U_SUCCESS(status
));
441 uBufSize
= (BUFFERSIZE
/ucnv_getMinCharSize(conv
));
442 printf("input bytes %d / min chars %d = %d UChars\n",
443 BUFFERSIZE
, ucnv_getMinCharSize(conv
), uBufSize
);
445 // grab another buffer's worth
447 ((count
=static_cast<int32_t>(fread(inBuf
, 1, BUFFERSIZE
, f
))) > 0) )
449 // Convert bytes to unicode
451 sourceLimit
= inBuf
+ count
;
453 while(source
< sourceLimit
)
455 p
= ucnv_getNextUChar(conv
, &source
, sourceLimit
, &status
);
456 if(U_FAILURE(status
))
458 fprintf(stderr
, "%s @ %d\n", u_errorName(status
), total
);
459 status
= U_ZERO_ERROR
;
468 if((u_tolower(l
) == 'i') && (u_tolower(p
) == 'e'))
471 if((u_tolower(l
) == 'g') && (u_tolower(p
) == 0x0127))
476 fprintf(stderr
, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p
);
480 return U_UNSUPPORTED_ERROR
;
490 printf("%d letters out of %d total UChars.\n", letters
, total
);
491 printf("%d ie digraphs, %d gh digraphs.\n", ie
, gh
);
493 // now, we could sort it..
495 // qsort(info, charCount, sizeof(info[0]), charfreq_compare);
497 for(p
=0;p
<charCount
;p
++)
499 if(info
[p
].frequency
)
501 printf("% 5d U+%06X ", info
[p
].frequency
, p
);
504 prettyPrintUChar((UChar
)p
);
510 // ***************************** END SAMPLE ********************
519 /******************************************************
520 You must call ucnv_close to clean up the memory used by the
523 'len' returns the number of OUTPUT bytes resulting from the
527 UErrorCode
convsample_12()
529 printf("\n\n==============================================\n"
530 "Sample 12: C: simple sjis -> unicode conversion\n");
533 // **************************** START SAMPLE *******************
535 char source
[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 };
537 UErrorCode status
= U_ZERO_ERROR
;
541 // set up the converter
542 conv
= ucnv_open("shift_jis", &status
);
543 assert(U_SUCCESS(status
));
545 // convert to Unicode
546 // Note: we can use strlen, we know it's an 8 bit null terminated codepage
548 len
= ucnv_toUChars(conv
, target
, 100, source
, static_cast<int32_t>(strlen(source
)), &status
);
550 // close the converter
553 // ***************************** END SAMPLE ********************
556 printBytes("src", source
, static_cast<int32_t>(strlen(source
)) );
558 printUChars("targ", target
, len
);
563 /******************************************************************
564 C: Convert from codepage to Unicode one at a time.
567 UErrorCode
convsample_13()
569 printf("\n\n==============================================\n"
570 "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n");
573 const char sourceChars
[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e };
574 // const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e };
575 const char *source
, *sourceLimit
;
577 UErrorCode status
= U_ZERO_ERROR
;
578 UConverter
*conv
= NULL
;
582 srcCount
= sizeof(sourceChars
);
584 conv
= ucnv_open("Big5", &status
);
587 source
= sourceChars
;
588 sourceLimit
= sourceChars
+ sizeof(sourceChars
);
590 // **************************** START SAMPLE *******************
593 printBytes("src", source
, static_cast<int32_t>(sourceLimit
- source
));
595 while(source
< sourceLimit
)
598 target
= ucnv_getNextUChar (conv
,
603 // printBytes("src",source,sourceLimit-source);
610 // ************************** END SAMPLE *************************
612 printf("src=%d bytes, dst=%d uchars\n", srcCount
, dstCount
);
621 UBool
convsample_20_didSubstitute(const char *source
)
625 UConverter
*conv
= NULL
;
626 UErrorCode status
= U_ZERO_ERROR
;
630 FromUFLAGContext
* context
= NULL
;
632 printf("\n\n==============================================\n"
633 "Sample 20: C: Test for substitution using callbacks\n");
635 /* print out the original source */
636 printBytes("src", source
);
639 /* First, convert from UTF8 to unicode */
640 conv
= ucnv_open("utf-8", &status
);
643 len
= ucnv_toUChars(conv
, uchars
, 100, source
, static_cast<int32_t>(strlen(source
)), &status
);
646 printUChars("uch", uchars
, len
);
649 /* Now, close the converter */
652 /* Now, convert to windows-1252 */
653 conv
= ucnv_open("windows-1252", &status
);
656 /* Converter starts out with the SUBSTITUTE callback set. */
658 /* initialize our callback */
659 context
= flagCB_fromU_openContext();
661 /* Set our special callback */
662 ucnv_setFromUCallBack(conv
,
665 &(context
->subCallback
),
666 &(context
->subContext
),
671 len2
= ucnv_fromUChars(conv
, bytes
, 100, uchars
, len
, &status
);
674 flagVal
= context
->flag
; /* it's about to go away when we close the cnv */
678 /* print out the converted bytes */
679 printBytes("bytes", bytes
, len2
);
681 return flagVal
; /* true if callback was called */
684 UErrorCode
convsample_20()
686 const char *sample1
= "abc\xdf\xbf";
687 const char *sample2
= "abc_def";
690 if(convsample_20_didSubstitute(sample1
))
692 printf("DID substitute.\n******\n");
696 printf("Did NOT substitute.\n*****\n");
699 if(convsample_20_didSubstitute(sample2
))
701 printf("DID substitute.\n******\n");
705 printf("Did NOT substitute.\n*****\n");
711 // 21 - C, callback, with clone and debug
715 UBool
convsample_21_didSubstitute(const char *source
)
719 UConverter
*conv
= NULL
, *cloneCnv
= NULL
;
720 UErrorCode status
= U_ZERO_ERROR
;
722 UBool flagVal
= FALSE
;
723 UConverterFromUCallback junkCB
;
725 FromUFLAGContext
*flagCtx
= NULL
,
726 *cloneFlagCtx
= NULL
;
728 debugCBContext
*debugCtx1
= NULL
,
730 *cloneDebugCtx
= NULL
;
732 printf("\n\n==============================================\n"
733 "Sample 21: C: Test for substitution w/ callbacks & clones \n");
735 /* print out the original source */
736 printBytes("src", source
);
739 /* First, convert from UTF8 to unicode */
740 conv
= ucnv_open("utf-8", &status
);
743 len
= ucnv_toUChars(conv
, uchars
, 100, source
, static_cast<int32_t>(strlen(source
)), &status
);
746 printUChars("uch", uchars
, len
);
749 /* Now, close the converter */
752 /* Now, convert to windows-1252 */
753 conv
= ucnv_open("windows-1252", &status
);
756 /* Converter starts out with the SUBSTITUTE callback set. */
758 /* initialize our callback */
759 /* from the 'bottom' innermost, out
760 * CNV -> debugCtx1[debug] -> flagCtx[flag] -> debugCtx2[debug] */
763 printf("flagCB_fromU = %p\n", &flagCB_fromU
);
764 printf("debugCB_fromU = %p\n", &debugCB_fromU
);
767 debugCtx1
= debugCB_openContext();
768 flagCtx
= flagCB_fromU_openContext();
769 debugCtx2
= debugCB_openContext();
771 debugCtx1
->subCallback
= flagCB_fromU
; /* debug1 -> flag */
772 debugCtx1
->subContext
= flagCtx
;
774 flagCtx
->subCallback
= debugCB_fromU
; /* flag -> debug2 */
775 flagCtx
->subContext
= debugCtx2
;
777 debugCtx2
->subCallback
= UCNV_FROM_U_CALLBACK_SUBSTITUTE
;
778 debugCtx2
->subContext
= NULL
;
780 /* Set our special callback */
782 ucnv_setFromUCallBack(conv
,
785 &(debugCtx2
->subCallback
),
786 &(debugCtx2
->subContext
),
792 printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n",
793 conv
, debugCtx1
, debugCtx1
->subCallback
,
794 debugCtx1
->subContext
, flagCtx
, debugCtx2
, debugCtx2
->subCallback
);
797 cloneCnv
= ucnv_safeClone(conv
, NULL
, NULL
, &status
);
802 printf("Cloned converter from %p -> %p. Closing %p.\n", conv
, cloneCnv
, conv
);
808 printf("%p closed.\n", conv
);
812 /* Now, we have to extract the context */
813 cloneDebugCtx
= NULL
;
816 ucnv_getFromUCallBack(cloneCnv
, &junkCB
, (const void **)&cloneDebugCtx
);
817 if(cloneDebugCtx
!= NULL
) {
818 cloneFlagCtx
= (FromUFLAGContext
*) cloneDebugCtx
-> subContext
;
821 printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n",
822 cloneCnv
, cloneDebugCtx
, cloneFlagCtx
, cloneFlagCtx
?cloneFlagCtx
->subContext
:NULL
);
824 len2
= ucnv_fromUChars(cloneCnv
, bytes
, 100, uchars
, len
, &status
);
827 if(cloneFlagCtx
!= NULL
) {
828 flagVal
= cloneFlagCtx
->flag
; /* it's about to go away when we close the cnv */
830 printf("** Warning, couldn't get the subcallback \n");
833 ucnv_close(cloneCnv
);
835 /* print out the converted bytes */
836 printBytes("bytes", bytes
, len2
);
838 return flagVal
; /* true if callback was called */
841 UErrorCode
convsample_21()
843 const char *sample1
= "abc\xdf\xbf";
844 const char *sample2
= "abc_def";
846 if(convsample_21_didSubstitute(sample1
))
848 printf("DID substitute.\n******\n");
852 printf("Did NOT substitute.\n*****\n");
855 if(convsample_21_didSubstitute(sample2
))
857 printf("DID substitute.\n******\n");
861 printf("Did NOT substitute.\n*****\n");
868 // 40- C, cp37 -> UTF16 [data02.bin -> data40.utf16]
870 #define BUFFERSIZE 17 /* make it interesting :) */
872 UErrorCode
convsample_40()
874 printf("\n\n==============================================\n"
875 "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n");
880 char inBuf
[BUFFERSIZE
];
882 const char *sourceLimit
;
886 int32_t uBufSize
= 0;
887 UConverter
*conv
= NULL
;
888 UErrorCode status
= U_ZERO_ERROR
;
889 uint32_t inbytes
=0, total
=0;
891 f
= fopen("data02.bin", "rb");
894 fprintf(stderr
, "Couldn't open file 'data02.bin' (cp37 data file).\n");
895 return U_FILE_ACCESS_ERROR
;
898 out
= fopen("data40.utf16", "wb");
901 fprintf(stderr
, "Couldn't create file 'data40.utf16'.\n");
903 return U_FILE_ACCESS_ERROR
;
906 // **************************** START SAMPLE *******************
907 conv
= ucnv_openCCSID(37, UCNV_IBM
, &status
);
908 assert(U_SUCCESS(status
));
910 uBufSize
= (BUFFERSIZE
/ucnv_getMinCharSize(conv
));
911 printf("input bytes %d / min chars %d = %d UChars\n",
912 BUFFERSIZE
, ucnv_getMinCharSize(conv
), uBufSize
);
913 uBuf
= (UChar
*)malloc(uBufSize
* sizeof(UChar
));
916 // grab another buffer's worth
918 ((count
=static_cast<int32_t>(fread(inBuf
, 1, BUFFERSIZE
, f
))) > 0) )
922 // Convert bytes to unicode
924 sourceLimit
= inBuf
+ count
;
929 targetLimit
= uBuf
+ uBufSize
;
931 ucnv_toUnicode( conv
, &target
, targetLimit
,
932 &source
, sourceLimit
, NULL
,
933 feof(f
)?TRUE
:FALSE
, /* pass 'flush' when eof */
934 /* is true (when no more data will come) */
937 if(status
== U_BUFFER_OVERFLOW_ERROR
)
939 // simply ran out of space - we'll reset the target ptr the next
940 // time through the loop.
941 status
= U_ZERO_ERROR
;
945 // Check other errors here.
946 assert(U_SUCCESS(status
));
947 // Break out of the loop (by force)
950 // Process the Unicode
951 // Todo: handle UTF-16/surrogates
952 assert(fwrite(uBuf
, sizeof(uBuf
[0]), (target
-uBuf
), out
) == (size_t)(target
-uBuf
));
953 total
+= static_cast<uint32_t>((target
-uBuf
));
954 } while (source
< sourceLimit
); // while simply out of space
957 printf("%d bytes in, %d UChars out.\n", inbytes
, total
);
959 // ***************************** END SAMPLE ********************
972 // 46- C, UTF16 -> latin2 [data40.utf16 -> data46.out]
974 #define BUFFERSIZE 24 /* make it interesting :) */
976 UErrorCode
convsample_46()
978 printf("\n\n==============================================\n"
979 "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n");
984 UChar inBuf
[BUFFERSIZE
];
986 const UChar
*sourceLimit
;
992 UConverter
*conv
= NULL
;
993 UErrorCode status
= U_ZERO_ERROR
;
994 uint32_t inchars
=0, total
=0;
996 f
= fopen("data40.utf16", "rb");
999 fprintf(stderr
, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n");
1000 return U_FILE_ACCESS_ERROR
;
1003 out
= fopen("data46.out", "wb");
1006 fprintf(stderr
, "Couldn't create file 'data46.out'.\n");
1008 return U_FILE_ACCESS_ERROR
;
1011 // **************************** START SAMPLE *******************
1012 conv
= ucnv_open( "iso-8859-2", &status
);
1013 assert(U_SUCCESS(status
));
1015 bufSize
= (BUFFERSIZE
*ucnv_getMaxCharSize(conv
));
1016 printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n",
1017 BUFFERSIZE
, ucnv_getMaxCharSize(conv
), bufSize
);
1018 buf
= (char*)malloc(bufSize
* sizeof(char));
1021 // grab another buffer's worth
1023 ((count
=static_cast<int32_t>(fread(inBuf
, sizeof(UChar
), BUFFERSIZE
, f
))) > 0) )
1027 // Convert unicode to bytes
1029 sourceLimit
= inBuf
+ count
;
1034 targetLimit
= buf
+ bufSize
;
1036 ucnv_fromUnicode( conv
, &target
, targetLimit
,
1037 &source
, sourceLimit
, NULL
,
1038 feof(f
)?TRUE
:FALSE
, /* pass 'flush' when eof */
1039 /* is true (when no more data will come) */
1042 if(status
== U_BUFFER_OVERFLOW_ERROR
)
1044 // simply ran out of space - we'll reset the target ptr the next
1045 // time through the loop.
1046 status
= U_ZERO_ERROR
;
1050 // Check other errors here.
1051 assert(U_SUCCESS(status
));
1052 // Break out of the loop (by force)
1055 // Write out the converted bytes
1056 assert(fwrite(buf
, sizeof(buf
[0]), (target
-buf
), out
) == (size_t)(target
-buf
));
1057 total
+= static_cast<uint32_t>((target
-buf
));
1058 } while (source
< sourceLimit
); // while simply out of space
1061 printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars
, static_cast<int>(inchars
* sizeof(UChar
)), total
);
1063 // ***************************** END SAMPLE ********************
1070 return U_ZERO_ERROR
;
1074 #define BUFFERSIZE 219
1076 void convsample_50() {
1077 printf("\n\n==============================================\n"
1078 "Sample 50: C: ucnv_detectUnicodeSignature\n");
1080 //! [ucnv_detectUnicodeSignature]
1081 UErrorCode err
= U_ZERO_ERROR
;
1082 UBool discardSignature
= TRUE
; /* set to TRUE to throw away the initial U+FEFF */
1083 char input
[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' };
1084 int32_t signatureLength
= 0;
1085 const char *encoding
= ucnv_detectUnicodeSignature(input
,sizeof(input
),&signatureLength
,&err
);
1086 UConverter
*conv
= NULL
;
1088 UChar
*target
= output
, *out
;
1089 const char *source
= input
;
1090 if(encoding
!=NULL
&& U_SUCCESS(err
)){
1091 // should signature be discarded ?
1092 conv
= ucnv_open(encoding
, &err
);
1093 // do the conversion
1094 ucnv_toUnicode(conv
,
1095 &target
, output
+ UPRV_LENGTHOF(output
),
1096 &source
, input
+ sizeof(input
),
1099 if (discardSignature
){
1100 ++out
; // ignore initial U+FEFF
1102 while(out
!= target
) {
1103 printf("%04x ", *out
++);
1107 //! [ucnv_detectUnicodeSignature]
1118 printf("Default Converter=%s\n", ucnv_getDefaultName() );
1120 convsample_02(); // C , u->koi8r, conv
1121 convsample_03(); // C, iterate
1123 convsample_05(); // C, utf8->u, getNextUChar
1124 convsample_06(); // C freq counter thingy
1126 convsample_12(); // C, sjis->u, conv
1127 convsample_13(); // C, big5->u, getNextU
1129 convsample_20(); // C, callback
1130 convsample_21(); // C, callback debug
1132 convsample_40(); // C, cp37 -> UTF16 [data02.bin -> data40.utf16]
1134 convsample_46(); // C, UTF16 -> latin2 [data40.utf16 -> data46.out]
1136 convsample_50(); // C, detect unicode signature
1138 printf("End of converter samples.\n");