]> git.saurik.com Git - apple/icu.git/blob - icuSources/samples/ucnv/convsamp.cpp
ICU-57131.0.1.tar.gz
[apple/icu.git] / icuSources / samples / ucnv / convsamp.cpp
1 /**************************************************************************
2 *
3 * Copyright (C) 2000-2016, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *
6 ***************************************************************************
7 * file name: convsamp.c
8 * encoding: ASCII (7-bit)
9 *
10 * created on: 2000may30
11 * created by: Steven R. Loomis
12 *
13 * Sample code for the ICU conversion routines.
14 *
15 * Note: Nothing special is needed to build this sample. Link with
16 * the icu UC and icu I18N libraries.
17 *
18 * I use 'assert' for error checking, you probably will want
19 * something more flexible. '***BEGIN SAMPLE***' and
20 * '***END SAMPLE***' mark pieces suitable for stand alone
21 * code snippets.
22 *
23 *
24 * Each test can define its own BUFFERSIZE
25 *
26 */
27
28 #define DEBUG_TMI 0 /* define to 1 to enable Too Much Information */
29
30 #include <stdio.h>
31 #include <ctype.h> /* for isspace, etc. */
32 #include <assert.h>
33 #include <string.h>
34 #include <stdlib.h> /* malloc */
35
36 #include "cmemory.h"
37 #include "unicode/utypes.h" /* Basic ICU data types */
38 #include "unicode/ucnv.h" /* C Converter API */
39 #include "unicode/ustring.h" /* some more string fcns*/
40 #include "unicode/uchar.h" /* char names */
41 #include "unicode/uloc.h"
42 #include "unicode/unistr.h"
43
44 #include "flagcb.h"
45
46 /* Some utility functions */
47
48 static const UChar kNone[] = { 0x0000 };
49
50 #define U_ASSERT(x) { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }}
51
52 /* Print a UChar if possible, in seven characters. */
53 void prettyPrintUChar(UChar c)
54 {
55 if( (c <= 0x007F) &&
56 (isgraph(c)) ) {
57 printf(" '%c' ", (char)(0x00FF&c));
58 } else if ( c > 0x007F ) {
59 char buf[1000];
60 UErrorCode status = U_ZERO_ERROR;
61 int32_t o;
62
63 o = u_charName(c, U_EXTENDED_CHAR_NAME, buf, 1000, &status);
64 if(U_SUCCESS(status) && (o>0) ) {
65 buf[6] = 0;
66 printf("%7s", buf);
67 } else {
68 printf(" ??????");
69 }
70 } else {
71 switch((char)(c & 0x007F)) {
72 case ' ':
73 printf(" ' ' ");
74 break;
75 case '\t':
76 printf(" \\t ");
77 break;
78 case '\n':
79 printf(" \\n ");
80 break;
81 default:
82 printf(" _ ");
83 break;
84 }
85 }
86 }
87
88
89 void printUChars(const char *name = "?",
90 const UChar *uch = kNone,
91 int32_t len = -1 )
92 {
93 int32_t i;
94
95 if( (len == -1) && (uch) ) {
96 len = u_strlen(uch);
97 }
98
99 printf("%5s: ", name);
100 for( i = 0; i <len; i++) {
101 printf("%-6d ", i);
102 }
103 printf("\n");
104
105 printf("%5s: ", "uni");
106 for( i = 0; i <len; i++) {
107 printf("\\u%04X ", (int)uch[i]);
108 }
109 printf("\n");
110
111 printf("%5s:", "ch");
112 for( i = 0; i <len; i++) {
113 prettyPrintUChar(uch[i]);
114 }
115 printf("\n");
116 }
117
118 void printBytes(const char *name = "?",
119 const char *uch = "",
120 int32_t len = -1 )
121 {
122 int32_t i;
123
124 if( (len == -1) && (uch) ) {
125 len = strlen(uch);
126 }
127
128 printf("%5s: ", name);
129 for( i = 0; i <len; i++) {
130 printf("%-4d ", i);
131 }
132 printf("\n");
133
134 printf("%5s: ", "uni");
135 for( i = 0; i <len; i++) {
136 printf("\\x%02X ", 0x00FF & (int)uch[i]);
137 }
138 printf("\n");
139
140 printf("%5s:", "ch");
141 for( i = 0; i <len; i++) {
142 if(isgraph(0x00FF & (int)uch[i])) {
143 printf(" '%c' ", (char)uch[i]);
144 } else {
145 printf(" ");
146 }
147 }
148 printf("\n");
149 }
150
151 void printUChar(UChar32 ch32)
152 {
153 if(ch32 > 0xFFFF) {
154 printf("ch: U+%06X\n", ch32);
155 }
156 else {
157 UChar ch = (UChar)ch32;
158 printUChars("C", &ch, 1);
159 }
160 }
161
162 /*******************************************************************
163 Very simple C sample to convert the word 'Moscow' in Russian in Unicode,
164 followed by an exclamation mark (!) into the KOI8-R Russian code page.
165
166 This example first creates a UChar String out of the Unicode chars.
167
168 targetSize must be set to the amount of space available in the target
169 buffer. After fromUChars is called,
170 len will contain the number of bytes in target[] which were
171 used in the resulting codepage. In this case, there is a 1:1 mapping
172 between the input and output characters. The exclamation mark has the
173 same value in both KOI8-R and Unicode.
174
175 src: 0 1 2 3 4 5 6
176 uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021
177 ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL '!'
178
179 targ: 0 1 2 3 4 5 6
180 uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21
181 ch: '!'
182
183
184 Converting FROM unicode
185 to koi8-r.
186 You must call ucnv_close to clean up the memory used by the
187 converter.
188
189 'len' returns the number of OUTPUT bytes resulting from the
190 conversion.
191 */
192
193 UErrorCode convsample_02()
194 {
195 printf("\n\n==============================================\n"
196 "Sample 02: C: simple Unicode -> koi8-r conversion\n");
197
198
199 // **************************** START SAMPLE *******************
200 // "cat<cat>OK"
201 UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432,
202 0x0430, 0x0021, 0x0000 };
203 char target[100];
204 UErrorCode status = U_ZERO_ERROR;
205 UConverter *conv;
206 int32_t len;
207
208 // set up the converter
209 //! [ucnv_open]
210 conv = ucnv_open("koi8-r", &status);
211 //! [ucnv_open]
212 assert(U_SUCCESS(status));
213
214 // convert to koi8-r
215 len = ucnv_fromUChars(conv, target, 100, source, -1, &status);
216 assert(U_SUCCESS(status));
217
218 // close the converter
219 ucnv_close(conv);
220
221 // ***************************** END SAMPLE ********************
222
223 // Print it out
224 printUChars("src", source);
225 printf("\n");
226 printBytes("targ", target, len);
227
228 return U_ZERO_ERROR;
229 }
230
231
232 UErrorCode convsample_03()
233 {
234 printf("\n\n==============================================\n"
235 "Sample 03: C: print out all converters\n");
236
237 int32_t count;
238 int32_t i;
239
240 // **************************** START SAMPLE *******************
241 count = ucnv_countAvailable();
242 printf("Available converters: %d\n", count);
243
244 for(i=0;i<count;i++)
245 {
246 printf("%s ", ucnv_getAvailableName(i));
247 }
248
249 // ***************************** END SAMPLE ********************
250
251 printf("\n");
252
253 return U_ZERO_ERROR;
254 }
255
256
257
258 #define BUFFERSIZE 17 /* make it interesting :) */
259
260 /*
261 Converting from a codepage to Unicode in bulk..
262 What is the best way to determine the buffer size?
263
264 The 'buffersize' is in bytes of input.
265 For a given converter, dividing this by the minimum char size
266 gives you the maximum number of Unicode characters that could be
267 expected for a given number of input bytes.
268 see: ucnv_getMinCharSize()
269
270 For example, a single byte codepage like 'Latin-3' has a
271 minimum char size of 1. (It takes at least 1 byte to represent
272 each Unicode char.) So the unicode buffer has the same number of
273 UChars as the input buffer has bytes.
274
275 In a strictly double byte codepage such as cp1362 (Windows
276 Korean), the minimum char size is 2. So, only half as many Unicode
277 chars as bytes are needed.
278
279 This work to calculate the buffer size is an optimization. Any
280 size of input and output buffer can be used, as long as the
281 program handles the following cases: If the input buffer is empty,
282 the source pointer will be equal to sourceLimit. If the output
283 buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned.
284 */
285
286 UErrorCode convsample_05()
287 {
288 printf("\n\n==============================================\n"
289 "Sample 05: C: count the number of letters in a UTF-8 document\n");
290
291 FILE *f;
292 int32_t count;
293 char inBuf[BUFFERSIZE];
294 const char *source;
295 const char *sourceLimit;
296 UChar *uBuf;
297 UChar *target;
298 UChar *targetLimit;
299 UChar *p;
300 int32_t uBufSize = 0;
301 UConverter *conv;
302 UErrorCode status = U_ZERO_ERROR;
303 uint32_t letters=0, total=0;
304
305 f = fopen("data01.txt", "r");
306 if(!f)
307 {
308 fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n");
309 return U_FILE_ACCESS_ERROR;
310 }
311
312 // **************************** START SAMPLE *******************
313 conv = ucnv_open("utf-8", &status);
314 assert(U_SUCCESS(status));
315
316 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
317 printf("input bytes %d / min chars %d = %d UChars\n",
318 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
319 uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
320 assert(uBuf!=NULL);
321
322 // grab another buffer's worth
323 while((!feof(f)) &&
324 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
325 {
326 // Convert bytes to unicode
327 source = inBuf;
328 sourceLimit = inBuf + count;
329
330 do
331 {
332 target = uBuf;
333 targetLimit = uBuf + uBufSize;
334
335 ucnv_toUnicode(conv, &target, targetLimit,
336 &source, sourceLimit, NULL,
337 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */
338 /* is true (when no more data will come) */
339 &status);
340
341 if(status == U_BUFFER_OVERFLOW_ERROR)
342 {
343 // simply ran out of space - we'll reset the target ptr the next
344 // time through the loop.
345 status = U_ZERO_ERROR;
346 }
347 else
348 {
349 // Check other errors here.
350 assert(U_SUCCESS(status));
351 // Break out of the loop (by force)
352 }
353
354 // Process the Unicode
355 // Todo: handle UTF-16/surrogates
356
357 for(p = uBuf; p<target; p++)
358 {
359 if(u_isalpha(*p))
360 letters++;
361 total++;
362 }
363 } while (source < sourceLimit); // while simply out of space
364 }
365
366 printf("%d letters out of %d total UChars.\n", letters, total);
367
368 // ***************************** END SAMPLE ********************
369 ucnv_close(conv);
370
371 printf("\n");
372
373 fclose(f);
374
375 return U_ZERO_ERROR;
376 }
377 #undef BUFFERSIZE
378
379 #define BUFFERSIZE 1024
380 typedef struct
381 {
382 UChar32 codepoint;
383 uint32_t frequency;
384 } CharFreqInfo;
385
386 UErrorCode convsample_06()
387 {
388 printf("\n\n==============================================\n"
389 "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");
390
391 FILE *f;
392 int32_t count;
393 char inBuf[BUFFERSIZE];
394 const char *source;
395 const char *sourceLimit;
396 int32_t uBufSize = 0;
397 UConverter *conv;
398 UErrorCode status = U_ZERO_ERROR;
399 uint32_t letters=0, total=0;
400
401 CharFreqInfo *info;
402 UChar32 charCount = 0x10000; /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
403 UChar32 p;
404
405 uint32_t ie = 0;
406 uint32_t gh = 0;
407 UChar32 l = 0;
408
409 f = fopen("data06.txt", "r");
410 if(!f)
411 {
412 fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
413 return U_FILE_ACCESS_ERROR;
414 }
415
416 info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount);
417 if(!info)
418 {
419 fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount);
420 }
421
422 /* reset frequencies */
423 for(p=0;p<charCount;p++)
424 {
425 info[p].codepoint = p;
426 info[p].frequency = 0;
427 }
428
429 // **************************** START SAMPLE *******************
430 conv = ucnv_open("utf-8", &status);
431 assert(U_SUCCESS(status));
432
433 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
434 printf("input bytes %d / min chars %d = %d UChars\n",
435 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
436
437 // grab another buffer's worth
438 while((!feof(f)) &&
439 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
440 {
441 // Convert bytes to unicode
442 source = inBuf;
443 sourceLimit = inBuf + count;
444
445 while(source < sourceLimit)
446 {
447 p = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
448 if(U_FAILURE(status))
449 {
450 fprintf(stderr, "%s @ %d\n", u_errorName(status), total);
451 status = U_ZERO_ERROR;
452 continue;
453 }
454 U_ASSERT(status);
455 total++;
456
457 if(u_isalpha(p))
458 letters++;
459
460 if((u_tolower(l) == 'i') && (u_tolower(p) == 'e'))
461 ie++;
462
463 if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127))
464 gh++;
465
466 if(p>charCount)
467 {
468 fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p);
469 free(info);
470 fclose(f);
471 ucnv_close(conv);
472 return U_UNSUPPORTED_ERROR;
473 }
474 info[p].frequency++;
475 l = p;
476 }
477 }
478
479 fclose(f);
480 ucnv_close(conv);
481
482 printf("%d letters out of %d total UChars.\n", letters, total);
483 printf("%d ie digraphs, %d gh digraphs.\n", ie, gh);
484
485 // now, we could sort it..
486
487 // qsort(info, charCount, sizeof(info[0]), charfreq_compare);
488
489 for(p=0;p<charCount;p++)
490 {
491 if(info[p].frequency)
492 {
493 printf("% 5d U+%06X ", info[p].frequency, p);
494 if(p <= 0xFFFF)
495 {
496 prettyPrintUChar((UChar)p);
497 }
498 printf("\n");
499 }
500 }
501 free(info);
502 // ***************************** END SAMPLE ********************
503
504 printf("\n");
505
506 return U_ZERO_ERROR;
507 }
508 #undef BUFFERSIZE
509
510
511 /******************************************************
512 You must call ucnv_close to clean up the memory used by the
513 converter.
514
515 'len' returns the number of OUTPUT UChars resulting from the
516 conversion.
517 */
518
519 UErrorCode convsample_12()
520 {
521 printf("\n\n==============================================\n"
522 "Sample 12: C: simple sjis -> unicode conversion\n");
523
524
525 // **************************** START SAMPLE *******************
526
527 char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 };
528 UChar target[100];
529 UErrorCode status = U_ZERO_ERROR;
530 UConverter *conv;
531 int32_t len;
532
533 // set up the converter
534 conv = ucnv_open("shift_jis", &status);
535 assert(U_SUCCESS(status));
536
537 // convert to Unicode
538 // Note: we can use strlen, we know it's an 8 bit null terminated codepage
539 target[6] = 0xFDCA;
540 len = ucnv_toUChars(conv, target, 100, source, strlen(source), &status);
541 U_ASSERT(status);
542 // close the converter
543 ucnv_close(conv);
544
545 // ***************************** END SAMPLE ********************
546
547 // Print it out
548 printBytes("src", source, strlen(source) );
549 printf("\n");
550 printUChars("targ", target, len);
551
552 return U_ZERO_ERROR;
553 }
554
555 /******************************************************************
556 C: Convert from codepage to Unicode one at a time.
557 */
558
559 UErrorCode convsample_13()
560 {
561 printf("\n\n==============================================\n"
562 "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n");
563
564
565 const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e };
566 // const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e };
567 const char *source, *sourceLimit;
568 UChar32 target;
569 UErrorCode status = U_ZERO_ERROR;
570 UConverter *conv = NULL;
571 int32_t srcCount=0;
572 int32_t dstCount=0;
573
574 srcCount = sizeof(sourceChars);
575
576 conv = ucnv_open("Big5", &status);
577 U_ASSERT(status);
578
579 source = sourceChars;
580 sourceLimit = sourceChars + sizeof(sourceChars);
581
582 // **************************** START SAMPLE *******************
583
584
585 printBytes("src",source,sourceLimit-source);
586
587 while(source < sourceLimit)
588 {
589 puts("");
590 target = ucnv_getNextUChar (conv,
591 &source,
592 sourceLimit,
593 &status);
594
595 // printBytes("src",source,sourceLimit-source);
596 U_ASSERT(status);
597 printUChar(target);
598 dstCount++;
599 }
600
601
602 // ************************** END SAMPLE *************************
603
604 printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount);
605 ucnv_close(conv);
606
607 return U_ZERO_ERROR;
608 }
609
610
611
612
613 UBool convsample_20_didSubstitute(const char *source)
614 {
615 UChar uchars[100];
616 char bytes[100];
617 UConverter *conv = NULL;
618 UErrorCode status = U_ZERO_ERROR;
619 uint32_t len, len2;
620 UBool flagVal;
621
622 FromUFLAGContext * context = NULL;
623
624 printf("\n\n==============================================\n"
625 "Sample 20: C: Test for substitution using callbacks\n");
626
627 /* print out the original source */
628 printBytes("src", source);
629 printf("\n");
630
631 /* First, convert from UTF8 to unicode */
632 conv = ucnv_open("utf-8", &status);
633 U_ASSERT(status);
634
635 len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
636 U_ASSERT(status);
637
638 printUChars("uch", uchars, len);
639 printf("\n");
640
641 /* Now, close the converter */
642 ucnv_close(conv);
643
644 /* Now, convert to windows-1252 */
645 conv = ucnv_open("windows-1252", &status);
646 U_ASSERT(status);
647
648 /* Converter starts out with the SUBSTITUTE callback set. */
649
650 /* initialize our callback */
651 context = flagCB_fromU_openContext();
652
653 /* Set our special callback */
654 ucnv_setFromUCallBack(conv,
655 flagCB_fromU,
656 context,
657 &(context->subCallback),
658 &(context->subContext),
659 &status);
660
661 U_ASSERT(status);
662
663 len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status);
664 U_ASSERT(status);
665
666 flagVal = context->flag; /* it's about to go away when we close the cnv */
667
668 ucnv_close(conv);
669
670 /* print out the original source */
671 printBytes("bytes", bytes, len2);
672
673 return flagVal; /* true if callback was called */
674 }
675
676 UErrorCode convsample_20()
677 {
678 const char *sample1 = "abc\xdf\xbf";
679 const char *sample2 = "abc_def";
680
681
682 if(convsample_20_didSubstitute(sample1))
683 {
684 printf("DID substitute.\n******\n");
685 }
686 else
687 {
688 printf("Did NOT substitute.\n*****\n");
689 }
690
691 if(convsample_20_didSubstitute(sample2))
692 {
693 printf("DID substitute.\n******\n");
694 }
695 else
696 {
697 printf("Did NOT substitute.\n*****\n");
698 }
699
700 return U_ZERO_ERROR;
701 }
702
703 // 21 - C, callback, with clone and debug
704
705
706
707 UBool convsample_21_didSubstitute(const char *source)
708 {
709 UChar uchars[100];
710 char bytes[100];
711 UConverter *conv = NULL, *cloneCnv = NULL;
712 UErrorCode status = U_ZERO_ERROR;
713 uint32_t len, len2;
714 int32_t cloneLen;
715 UBool flagVal = FALSE;
716 UConverterFromUCallback junkCB;
717
718 FromUFLAGContext *flagCtx = NULL,
719 *cloneFlagCtx = NULL;
720
721 debugCBContext *debugCtx1 = NULL,
722 *debugCtx2 = NULL,
723 *cloneDebugCtx = NULL;
724
725 printf("\n\n==============================================\n"
726 "Sample 21: C: Test for substitution w/ callbacks & clones \n");
727
728 /* print out the original source */
729 printBytes("src", source);
730 printf("\n");
731
732 /* First, convert from UTF8 to unicode */
733 conv = ucnv_open("utf-8", &status);
734 U_ASSERT(status);
735
736 len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
737 U_ASSERT(status);
738
739 printUChars("uch", uchars, len);
740 printf("\n");
741
742 /* Now, close the converter */
743 ucnv_close(conv);
744
745 /* Now, convert to windows-1252 */
746 conv = ucnv_open("windows-1252", &status);
747 U_ASSERT(status);
748
749 /* Converter starts out with the SUBSTITUTE callback set. */
750
751 /* initialize our callback */
752 /* from the 'bottom' innermost, out
753 * CNV -> debugCtx1[debug] -> flagCtx[flag] -> debugCtx2[debug] */
754
755 #if DEBUG_TMI
756 printf("flagCB_fromU = %p\n", &flagCB_fromU);
757 printf("debugCB_fromU = %p\n", &debugCB_fromU);
758 #endif
759
760 debugCtx1 = debugCB_openContext();
761 flagCtx = flagCB_fromU_openContext();
762 debugCtx2 = debugCB_openContext();
763
764 debugCtx1->subCallback = flagCB_fromU; /* debug1 -> flag */
765 debugCtx1->subContext = flagCtx;
766
767 flagCtx->subCallback = debugCB_fromU; /* flag -> debug2 */
768 flagCtx->subContext = debugCtx2;
769
770 debugCtx2->subCallback = UCNV_FROM_U_CALLBACK_SUBSTITUTE;
771 debugCtx2->subContext = NULL;
772
773 /* Set our special callback */
774
775 ucnv_setFromUCallBack(conv,
776 debugCB_fromU,
777 debugCtx1,
778 &(debugCtx2->subCallback),
779 &(debugCtx2->subContext),
780 &status);
781
782 U_ASSERT(status);
783
784 #if DEBUG_TMI
785 printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n",
786 conv, debugCtx1, debugCtx1->subCallback,
787 debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback);
788 #endif
789
790 cloneCnv = ucnv_safeClone(conv, NULL, NULL, &status);
791
792 U_ASSERT(status);
793
794 #if DEBUG_TMI
795 printf("Cloned converter from %p -> %p. Closing %p.\n", conv, cloneCnv, conv);
796 #endif
797
798 ucnv_close(conv);
799
800 #if DEBUG_TMI
801 printf("%p closed.\n", conv);
802 #endif
803
804 U_ASSERT(status);
805 /* Now, we have to extract the context */
806 cloneDebugCtx = NULL;
807 cloneFlagCtx = NULL;
808
809 ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx);
810 if(cloneDebugCtx != NULL) {
811 cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext;
812 }
813
814 printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n",
815 cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL );
816
817 len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status);
818 U_ASSERT(status);
819
820 if(cloneFlagCtx != NULL) {
821 flagVal = cloneFlagCtx->flag; /* it's about to go away when we close the cnv */
822 } else {
823 printf("** Warning, couldn't get the subcallback \n");
824 }
825
826 ucnv_close(cloneCnv);
827
828 /* print out the original source */
829 printBytes("bytes", bytes, len2);
830
831 return flagVal; /* true if callback was called */
832 }
833
834 UErrorCode convsample_21()
835 {
836 const char *sample1 = "abc\xdf\xbf";
837 const char *sample2 = "abc_def";
838
839 if(convsample_21_didSubstitute(sample1))
840 {
841 printf("DID substitute.\n******\n");
842 }
843 else
844 {
845 printf("Did NOT substitute.\n*****\n");
846 }
847
848 if(convsample_21_didSubstitute(sample2))
849 {
850 printf("DID substitute.\n******\n");
851 }
852 else
853 {
854 printf("Did NOT substitute.\n*****\n");
855 }
856
857 return U_ZERO_ERROR;
858 }
859
860
861 // 40- C, cp37 -> UTF16 [data02.bin -> data40.utf16]
862
863 #define BUFFERSIZE 17 /* make it interesting :) */
864
865 UErrorCode convsample_40()
866 {
867 printf("\n\n==============================================\n"
868 "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n");
869
870 FILE *f;
871 FILE *out;
872 int32_t count;
873 char inBuf[BUFFERSIZE];
874 const char *source;
875 const char *sourceLimit;
876 UChar *uBuf;
877 UChar *target;
878 UChar *targetLimit;
879 int32_t uBufSize = 0;
880 UConverter *conv = NULL;
881 UErrorCode status = U_ZERO_ERROR;
882 uint32_t inbytes=0, total=0;
883
884 f = fopen("data02.bin", "rb");
885 if(!f)
886 {
887 fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n");
888 return U_FILE_ACCESS_ERROR;
889 }
890
891 out = fopen("data40.utf16", "wb");
892 if(!out)
893 {
894 fprintf(stderr, "Couldn't create file 'data40.utf16'.\n");
895 fclose(f);
896 return U_FILE_ACCESS_ERROR;
897 }
898
899 // **************************** START SAMPLE *******************
900 conv = ucnv_openCCSID(37, UCNV_IBM, &status);
901 assert(U_SUCCESS(status));
902
903 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
904 printf("input bytes %d / min chars %d = %d UChars\n",
905 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
906 uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
907 assert(uBuf!=NULL);
908
909 // grab another buffer's worth
910 while((!feof(f)) &&
911 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
912 {
913 inbytes += count;
914
915 // Convert bytes to unicode
916 source = inBuf;
917 sourceLimit = inBuf + count;
918
919 do
920 {
921 target = uBuf;
922 targetLimit = uBuf + uBufSize;
923
924 ucnv_toUnicode( conv, &target, targetLimit,
925 &source, sourceLimit, NULL,
926 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */
927 /* is true (when no more data will come) */
928 &status);
929
930 if(status == U_BUFFER_OVERFLOW_ERROR)
931 {
932 // simply ran out of space - we'll reset the target ptr the next
933 // time through the loop.
934 status = U_ZERO_ERROR;
935 }
936 else
937 {
938 // Check other errors here.
939 assert(U_SUCCESS(status));
940 // Break out of the loop (by force)
941 }
942
943 // Process the Unicode
944 // Todo: handle UTF-16/surrogates
945 assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) ==
946 (size_t)(target-uBuf));
947 total += (target-uBuf);
948 } while (source < sourceLimit); // while simply out of space
949 }
950
951 printf("%d bytes in, %d UChars out.\n", inbytes, total);
952
953 // ***************************** END SAMPLE ********************
954 ucnv_close(conv);
955
956 fclose(f);
957 fclose(out);
958 printf("\n");
959
960 return U_ZERO_ERROR;
961 }
962 #undef BUFFERSIZE
963
964
965
966 // 46- C, UTF16 -> latin2 [data40.utf16 -> data46.out]
967
968 #define BUFFERSIZE 24 /* make it interesting :) */
969
970 UErrorCode convsample_46()
971 {
972 printf("\n\n==============================================\n"
973 "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n");
974
975 FILE *f;
976 FILE *out;
977 int32_t count;
978 UChar inBuf[BUFFERSIZE];
979 const UChar *source;
980 const UChar *sourceLimit;
981 char *buf;
982 char *target;
983 char *targetLimit;
984
985 int32_t bufSize = 0;
986 UConverter *conv = NULL;
987 UErrorCode status = U_ZERO_ERROR;
988 uint32_t inchars=0, total=0;
989
990 f = fopen("data40.utf16", "rb");
991 if(!f)
992 {
993 fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n");
994 return U_FILE_ACCESS_ERROR;
995 }
996
997 out = fopen("data46.out", "wb");
998 if(!out)
999 {
1000 fprintf(stderr, "Couldn't create file 'data46.out'.\n");
1001 fclose(f);
1002 return U_FILE_ACCESS_ERROR;
1003 }
1004
1005 // **************************** START SAMPLE *******************
1006 conv = ucnv_open( "iso-8859-2", &status);
1007 assert(U_SUCCESS(status));
1008
1009 bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv));
1010 printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n",
1011 BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize);
1012 buf = (char*)malloc(bufSize * sizeof(char));
1013 assert(buf!=NULL);
1014
1015 // grab another buffer's worth
1016 while((!feof(f)) &&
1017 ((count=fread(inBuf, sizeof(UChar), BUFFERSIZE , f)) > 0) )
1018 {
1019 inchars += count;
1020
1021 // Convert bytes to unicode
1022 source = inBuf;
1023 sourceLimit = inBuf + count;
1024
1025 do
1026 {
1027 target = buf;
1028 targetLimit = buf + bufSize;
1029
1030 ucnv_fromUnicode( conv, &target, targetLimit,
1031 &source, sourceLimit, NULL,
1032 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */
1033 /* is true (when no more data will come) */
1034 &status);
1035
1036 if(status == U_BUFFER_OVERFLOW_ERROR)
1037 {
1038 // simply ran out of space - we'll reset the target ptr the next
1039 // time through the loop.
1040 status = U_ZERO_ERROR;
1041 }
1042 else
1043 {
1044 // Check other errors here.
1045 assert(U_SUCCESS(status));
1046 // Break out of the loop (by force)
1047 }
1048
1049 // Process the Unicode
1050 assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) ==
1051 (size_t)(target-buf));
1052 total += (target-buf);
1053 } while (source < sourceLimit); // while simply out of space
1054 }
1055
1056 printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, inchars * sizeof(UChar), total);
1057
1058 // ***************************** END SAMPLE ********************
1059 ucnv_close(conv);
1060
1061 fclose(f);
1062 fclose(out);
1063 printf("\n");
1064
1065 return U_ZERO_ERROR;
1066 }
1067 #undef BUFFERSIZE
1068
1069 #define BUFFERSIZE 219
1070
1071 void convsample_50() {
1072 printf("\n\n==============================================\n"
1073 "Sample 50: C: ucnv_detectUnicodeSignature\n");
1074
1075 //! [ucnv_detectUnicodeSignature]
1076 UErrorCode err = U_ZERO_ERROR;
1077 UBool discardSignature = TRUE; /* set to TRUE to throw away the initial U+FEFF */
1078 char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' };
1079 int32_t signatureLength = 0;
1080 const char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err);
1081 UConverter *conv = NULL;
1082 UChar output[100];
1083 UChar *target = output, *out;
1084 const char *source = input;
1085 if(encoding!=NULL && U_SUCCESS(err)){
1086 // should signature be discarded ?
1087 conv = ucnv_open(encoding, &err);
1088 // do the conversion
1089 ucnv_toUnicode(conv,
1090 &target, output + UPRV_LENGTHOF(output),
1091 &source, input + sizeof(input),
1092 NULL, TRUE, &err);
1093 out = output;
1094 if (discardSignature){
1095 ++out; // ignore initial U+FEFF
1096 }
1097 while(out != target) {
1098 printf("%04x ", *out++);
1099 }
1100 puts("");
1101 }
1102 //! [ucnv_detectUnicodeSignature]
1103 puts("");
1104 }
1105
1106
1107
1108 /* main */
1109
1110 int main()
1111 {
1112
1113 printf("Default Converter=%s\n", ucnv_getDefaultName() );
1114
1115 convsample_02(); // C , u->koi8r, conv
1116 convsample_03(); // C, iterate
1117
1118 convsample_05(); // C, utf8->u, getNextUChar
1119 convsample_06(); // C freq counter thingy
1120
1121 convsample_12(); // C, sjis->u, conv
1122 convsample_13(); // C, big5->u, getNextU
1123
1124 convsample_20(); // C, callback
1125 convsample_21(); // C, callback debug
1126
1127 convsample_40(); // C, cp37 -> UTF16 [data02.bin -> data40.utf16]
1128
1129 convsample_46(); // C, UTF16 -> latin3 [data41.utf16 -> data46.out]
1130
1131 convsample_50(); // C, detect unicode signature
1132
1133 printf("End of converter samples.\n");
1134
1135 fflush(stdout);
1136 fflush(stderr);
1137
1138 return 0;
1139 }