]> git.saurik.com Git - apple/icu.git/blob - icuSources/samples/ucnv/convsamp.cpp
ICU-491.11.1.tar.gz
[apple/icu.git] / icuSources / samples / ucnv / convsamp.cpp
1 /**************************************************************************
2 *
3 * Copyright (C) 2000-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *
6 ***************************************************************************
7 * file name: convsamp.c
8 * encoding: ASCII (7-bit)
9 *
10 * created on: 2000may30
11 * created by: Steven R. Loomis
12 *
13 * Sample code for the ICU conversion routines.
14 *
15 * Note: Nothing special is needed to build this sample. Link with
16 * the icu UC and icu I18N libraries.
17 *
18 * I use 'assert' for error checking, you probably will want
19 * something more flexible. '***BEGIN SAMPLE***' and
20 * '***END SAMPLE***' mark pieces suitable for stand alone
21 * code snippets.
22 *
23 *
24 * Each test can define its own BUFFERSIZE
25 *
26 */
27
28 #define DEBUG_TMI 0 /* define to 1 to enable Too Much Information */
29
30 #include <stdio.h>
31 #include <ctype.h> /* for isspace, etc. */
32 #include <assert.h>
33 #include <string.h>
34 #include <stdlib.h> /* malloc */
35
36 #include "unicode/utypes.h" /* Basic ICU data types */
37 #include "unicode/ucnv.h" /* C Converter API */
38 #include "unicode/ustring.h" /* some more string fcns*/
39 #include "unicode/uchar.h" /* char names */
40 #include "unicode/uloc.h"
41 #include "unicode/unistr.h"
42
43 #include "flagcb.h"
44
45 /* Some utility functions */
46
47 static const UChar kNone[] = { 0x0000 };
48
/*
 * Abort-on-error helper for the samples.
 *
 * If the UErrorCode 'x' indicates a failure, flush stdout and stderr, print
 * the expression text together with the ICU error name to stderr, and then
 * trip assert().  Note that 'x' is evaluated more than once, so pass a plain
 * variable.
 *
 * The body is wrapped in do { } while(0) so that "U_ASSERT(status);" is a
 * single statement and stays correct inside an unbraced if/else.
 */
#define U_ASSERT(x) \
    do { \
        if(U_FAILURE(x)) { \
            fflush(stdout); \
            fflush(stderr); \
            fprintf(stderr, #x " == %s\n", u_errorName(x)); \
            assert(U_SUCCESS(x)); \
        } \
    } while(0)
50
51 /* Print a UChar if possible, in seven characters. */
52 void prettyPrintUChar(UChar c)
53 {
54 if( (c <= 0x007F) &&
55 (isgraph(c)) ) {
56 printf(" '%c' ", (char)(0x00FF&c));
57 } else if ( c > 0x007F ) {
58 char buf[1000];
59 UErrorCode status = U_ZERO_ERROR;
60 int32_t o;
61
62 o = u_charName(c, U_EXTENDED_CHAR_NAME, buf, 1000, &status);
63 if(U_SUCCESS(status) && (o>0) ) {
64 buf[6] = 0;
65 printf("%7s", buf);
66 } else {
67 printf(" ??????");
68 }
69 } else {
70 switch((char)(c & 0x007F)) {
71 case ' ':
72 printf(" ' ' ");
73 break;
74 case '\t':
75 printf(" \\t ");
76 break;
77 case '\n':
78 printf(" \\n ");
79 break;
80 default:
81 printf(" _ ");
82 break;
83 }
84 }
85 }
86
87
88 void printUChars(const char *name = "?",
89 const UChar *uch = kNone,
90 int32_t len = -1 )
91 {
92 int32_t i;
93
94 if( (len == -1) && (uch) ) {
95 len = u_strlen(uch);
96 }
97
98 printf("%5s: ", name);
99 for( i = 0; i <len; i++) {
100 printf("%-6d ", i);
101 }
102 printf("\n");
103
104 printf("%5s: ", "uni");
105 for( i = 0; i <len; i++) {
106 printf("\\u%04X ", (int)uch[i]);
107 }
108 printf("\n");
109
110 printf("%5s:", "ch");
111 for( i = 0; i <len; i++) {
112 prettyPrintUChar(uch[i]);
113 }
114 printf("\n");
115 }
116
/*
 * Dump a byte string to stdout as three aligned rows:
 *   <name>: the index of every byte
 *   uni:    every byte as a \xXX escape
 *   ch:     every byte that isgraph() accepts, quoted; blank otherwise
 *
 * name - label printed in front of the index row
 * uch  - the bytes to print
 * len  - number of bytes; -1 means "measure with strlen()" (only done
 *        when uch is non-NULL)
 */
void printBytes(const char *name = "?",
                const char *uch = "",
                int32_t len = -1 )
{
    if( uch && (len == -1) ) {
        len = strlen(uch);
    }

    /* index row */
    printf("%5s: ", name);
    for(int32_t pos = 0; pos < len; ++pos) {
        printf("%-4d ", pos);
    }
    printf("\n");

    /* hex row */
    printf("%5s: ", "uni");
    for(int32_t pos = 0; pos < len; ++pos) {
        printf("\\x%02X ", 0x00FF & (int)uch[pos]);
    }
    printf("\n");

    /* character row */
    printf("%5s:", "ch");
    for(int32_t pos = 0; pos < len; ++pos) {
        /* mask to 0..255 so a negative char never reaches isgraph() */
        const int byteValue = 0x00FF & (int)uch[pos];
        if(!isgraph(byteValue)) {
            printf(" ");
            continue;
        }
        printf(" '%c' ", (char)uch[pos]);
    }
    printf("\n");
}
149
150 void printUChar(UChar32 ch32)
151 {
152 if(ch32 > 0xFFFF) {
153 printf("ch: U+%06X\n", ch32);
154 }
155 else {
156 UChar ch = (UChar)ch32;
157 printUChars("C", &ch, 1);
158 }
159 }
160
161 /*******************************************************************
162 Very simple C sample to convert the word 'Moscow' in Russian in Unicode,
163 followed by an exclamation mark (!) into the KOI8-R Russian code page.
164
165 This example first creates a UChar String out of the Unicode chars.
166
167 targetSize must be set to the amount of space available in the target
168 buffer. After fromUChars is called,
169 len will contain the number of bytes in target[] which were
170 used in the resulting codepage. In this case, there is a 1:1 mapping
171 between the input and output characters. The exclamation mark has the
172 same value in both KOI8-R and Unicode.
173
174 src: 0 1 2 3 4 5 6
175 uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021
176 ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL '!'
177
178 targ: 0 1 2 3 4 5 6
179 uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21
180 ch: '!'
181
182
183 Converting FROM unicode
184 to koi8-r.
185 You must call ucnv_close to clean up the memory used by the
186 converter.
187
188 'len' returns the number of OUTPUT bytes resulting from the
189 conversion.
190 */
191
192 UErrorCode convsample_02()
193 {
194 printf("\n\n==============================================\n"
195 "Sample 02: C: simple Unicode -> koi8-r conversion\n");
196
197
198 // **************************** START SAMPLE *******************
199 // "cat<cat>OK"
200 UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432,
201 0x0430, 0x0021, 0x0000 };
202 char target[100];
203 UErrorCode status = U_ZERO_ERROR;
204 UConverter *conv;
205 int32_t len;
206
207 // set up the converter
208 //! [ucnv_open]
209 conv = ucnv_open("koi8-r", &status);
210 //! [ucnv_open]
211 assert(U_SUCCESS(status));
212
213 // convert to koi8-r
214 len = ucnv_fromUChars(conv, target, 100, source, -1, &status);
215 assert(U_SUCCESS(status));
216
217 // close the converter
218 ucnv_close(conv);
219
220 // ***************************** END SAMPLE ********************
221
222 // Print it out
223 printUChars("src", source);
224 printf("\n");
225 printBytes("targ", target, len);
226
227 return U_ZERO_ERROR;
228 }
229
230
231 UErrorCode convsample_03()
232 {
233 printf("\n\n==============================================\n"
234 "Sample 03: C: print out all converters\n");
235
236 int32_t count;
237 int32_t i;
238
239 // **************************** START SAMPLE *******************
240 count = ucnv_countAvailable();
241 printf("Available converters: %d\n", count);
242
243 for(i=0;i<count;i++)
244 {
245 printf("%s ", ucnv_getAvailableName(i));
246 }
247
248 // ***************************** END SAMPLE ********************
249
250 printf("\n");
251
252 return U_ZERO_ERROR;
253 }
254
255
256
257 #define BUFFERSIZE 17 /* make it interesting :) */
258
259 /*
260 Converting from a codepage to Unicode in bulk..
261 What is the best way to determine the buffer size?
262
263 The 'buffersize' is in bytes of input.
264 For a given converter, dividing this by the minimum char size
265 gives you the maximum number of Unicode characters that could be
266 expected for a given number of input bytes.
267 see: ucnv_getMinCharSize()
268
269 For example, a single byte codepage like 'Latin-3' has a
270 minimum char size of 1. (It takes at least 1 byte to represent
271 each Unicode char.) So the unicode buffer has the same number of
272 UChars as the input buffer has bytes.
273
274 In a strictly double byte codepage such as cp1362 (Windows
275 Korean), the minimum char size is 2. So, only half as many Unicode
276 chars as bytes are needed.
277
278 This work to calculate the buffer size is an optimization. Any
279 size of input and output buffer can be used, as long as the
280 program handles the following cases: If the input buffer is empty,
281 the source pointer will be equal to sourceLimit. If the output
282 buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned.
283 */
284
285 UErrorCode convsample_05()
286 {
287 printf("\n\n==============================================\n"
288 "Sample 05: C: count the number of letters in a UTF-8 document\n");
289
290 FILE *f;
291 int32_t count;
292 char inBuf[BUFFERSIZE];
293 const char *source;
294 const char *sourceLimit;
295 UChar *uBuf;
296 UChar *target;
297 UChar *targetLimit;
298 UChar *p;
299 int32_t uBufSize = 0;
300 UConverter *conv;
301 UErrorCode status = U_ZERO_ERROR;
302 uint32_t letters=0, total=0;
303
304 f = fopen("data01.txt", "r");
305 if(!f)
306 {
307 fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n");
308 return U_FILE_ACCESS_ERROR;
309 }
310
311 // **************************** START SAMPLE *******************
312 conv = ucnv_open("utf-8", &status);
313 assert(U_SUCCESS(status));
314
315 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
316 printf("input bytes %d / min chars %d = %d UChars\n",
317 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
318 uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
319 assert(uBuf!=NULL);
320
321 // grab another buffer's worth
322 while((!feof(f)) &&
323 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
324 {
325 // Convert bytes to unicode
326 source = inBuf;
327 sourceLimit = inBuf + count;
328
329 do
330 {
331 target = uBuf;
332 targetLimit = uBuf + uBufSize;
333
334 ucnv_toUnicode(conv, &target, targetLimit,
335 &source, sourceLimit, NULL,
336 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */
337 /* is true (when no more data will come) */
338 &status);
339
340 if(status == U_BUFFER_OVERFLOW_ERROR)
341 {
342 // simply ran out of space - we'll reset the target ptr the next
343 // time through the loop.
344 status = U_ZERO_ERROR;
345 }
346 else
347 {
348 // Check other errors here.
349 assert(U_SUCCESS(status));
350 // Break out of the loop (by force)
351 }
352
353 // Process the Unicode
354 // Todo: handle UTF-16/surrogates
355
356 for(p = uBuf; p<target; p++)
357 {
358 if(u_isalpha(*p))
359 letters++;
360 total++;
361 }
362 } while (source < sourceLimit); // while simply out of space
363 }
364
365 printf("%d letters out of %d total UChars.\n", letters, total);
366
367 // ***************************** END SAMPLE ********************
368 ucnv_close(conv);
369
370 printf("\n");
371
372 fclose(f);
373
374 return U_ZERO_ERROR;
375 }
376 #undef BUFFERSIZE
377
378 #define BUFFERSIZE 1024
379 typedef struct
380 {
381 UChar32 codepoint;
382 uint32_t frequency;
383 } CharFreqInfo;
384
385 UErrorCode convsample_06()
386 {
387 printf("\n\n==============================================\n"
388 "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");
389
390 FILE *f;
391 int32_t count;
392 char inBuf[BUFFERSIZE];
393 const char *source;
394 const char *sourceLimit;
395 int32_t uBufSize = 0;
396 UConverter *conv;
397 UErrorCode status = U_ZERO_ERROR;
398 uint32_t letters=0, total=0;
399
400 CharFreqInfo *info;
401 UChar32 charCount = 0x10000; /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
402 UChar32 p;
403
404 uint32_t ie = 0;
405 uint32_t gh = 0;
406 UChar32 l = 0;
407
408 f = fopen("data06.txt", "r");
409 if(!f)
410 {
411 fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
412 return U_FILE_ACCESS_ERROR;
413 }
414
415 info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount);
416 if(!info)
417 {
418 fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount);
419 }
420
421 /* reset frequencies */
422 for(p=0;p<charCount;p++)
423 {
424 info[p].codepoint = p;
425 info[p].frequency = 0;
426 }
427
428 // **************************** START SAMPLE *******************
429 conv = ucnv_open("utf-8", &status);
430 assert(U_SUCCESS(status));
431
432 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
433 printf("input bytes %d / min chars %d = %d UChars\n",
434 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
435
436 // grab another buffer's worth
437 while((!feof(f)) &&
438 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
439 {
440 // Convert bytes to unicode
441 source = inBuf;
442 sourceLimit = inBuf + count;
443
444 while(source < sourceLimit)
445 {
446 p = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
447 if(U_FAILURE(status))
448 {
449 fprintf(stderr, "%s @ %d\n", u_errorName(status), total);
450 status = U_ZERO_ERROR;
451 continue;
452 }
453 U_ASSERT(status);
454 total++;
455
456 if(u_isalpha(p))
457 letters++;
458
459 if((u_tolower(l) == 'i') && (u_tolower(p) == 'e'))
460 ie++;
461
462 if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127))
463 gh++;
464
465 if(p>charCount)
466 {
467 fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p);
468 free(info);
469 fclose(f);
470 ucnv_close(conv);
471 return U_UNSUPPORTED_ERROR;
472 }
473 info[p].frequency++;
474 l = p;
475 }
476 }
477
478 fclose(f);
479 ucnv_close(conv);
480
481 printf("%d letters out of %d total UChars.\n", letters, total);
482 printf("%d ie digraphs, %d gh digraphs.\n", ie, gh);
483
484 // now, we could sort it..
485
486 // qsort(info, charCount, sizeof(info[0]), charfreq_compare);
487
488 for(p=0;p<charCount;p++)
489 {
490 if(info[p].frequency)
491 {
492 printf("% 5d U+%06X ", info[p].frequency, p);
493 if(p <= 0xFFFF)
494 {
495 prettyPrintUChar((UChar)p);
496 }
497 printf("\n");
498 }
499 }
500 free(info);
501 // ***************************** END SAMPLE ********************
502
503 printf("\n");
504
505 return U_ZERO_ERROR;
506 }
507 #undef BUFFERSIZE
508
509
510 /******************************************************
511 You must call ucnv_close to clean up the memory used by the
512 converter.
513
514 'len' returns the number of OUTPUT UChars resulting from the
515 conversion.
516 */
517
518 UErrorCode convsample_12()
519 {
520 printf("\n\n==============================================\n"
521 "Sample 12: C: simple sjis -> unicode conversion\n");
522
523
524 // **************************** START SAMPLE *******************
525
526 char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 };
527 UChar target[100];
528 UErrorCode status = U_ZERO_ERROR;
529 UConverter *conv;
530 int32_t len;
531
532 // set up the converter
533 conv = ucnv_open("shift_jis", &status);
534 assert(U_SUCCESS(status));
535
536 // convert to Unicode
537 // Note: we can use strlen, we know it's an 8 bit null terminated codepage
538 target[6] = 0xFDCA;
539 len = ucnv_toUChars(conv, target, 100, source, strlen(source), &status);
540 U_ASSERT(status);
541 // close the converter
542 ucnv_close(conv);
543
544 // ***************************** END SAMPLE ********************
545
546 // Print it out
547 printBytes("src", source, strlen(source) );
548 printf("\n");
549 printUChars("targ", target, len);
550
551 return U_ZERO_ERROR;
552 }
553
554 /******************************************************************
555 C: Convert from codepage to Unicode one at a time.
556 */
557
558 UErrorCode convsample_13()
559 {
560 printf("\n\n==============================================\n"
561 "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n");
562
563
564 const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e };
565 // const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e };
566 const char *source, *sourceLimit;
567 UChar32 target;
568 UErrorCode status = U_ZERO_ERROR;
569 UConverter *conv = NULL;
570 int32_t srcCount=0;
571 int32_t dstCount=0;
572
573 srcCount = sizeof(sourceChars);
574
575 conv = ucnv_open("Big5", &status);
576 U_ASSERT(status);
577
578 source = sourceChars;
579 sourceLimit = sourceChars + sizeof(sourceChars);
580
581 // **************************** START SAMPLE *******************
582
583
584 printBytes("src",source,sourceLimit-source);
585
586 while(source < sourceLimit)
587 {
588 puts("");
589 target = ucnv_getNextUChar (conv,
590 &source,
591 sourceLimit,
592 &status);
593
594 // printBytes("src",source,sourceLimit-source);
595 U_ASSERT(status);
596 printUChar(target);
597 dstCount++;
598 }
599
600
601 // ************************** END SAMPLE *************************
602
603 printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount);
604 ucnv_close(conv);
605
606 return U_ZERO_ERROR;
607 }
608
609
610
611
612 UBool convsample_20_didSubstitute(const char *source)
613 {
614 UChar uchars[100];
615 char bytes[100];
616 UConverter *conv = NULL;
617 UErrorCode status = U_ZERO_ERROR;
618 uint32_t len, len2;
619 UBool flagVal;
620
621 FromUFLAGContext * context = NULL;
622
623 printf("\n\n==============================================\n"
624 "Sample 20: C: Test for substitution using callbacks\n");
625
626 /* print out the original source */
627 printBytes("src", source);
628 printf("\n");
629
630 /* First, convert from UTF8 to unicode */
631 conv = ucnv_open("utf-8", &status);
632 U_ASSERT(status);
633
634 len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
635 U_ASSERT(status);
636
637 printUChars("uch", uchars, len);
638 printf("\n");
639
640 /* Now, close the converter */
641 ucnv_close(conv);
642
643 /* Now, convert to windows-1252 */
644 conv = ucnv_open("windows-1252", &status);
645 U_ASSERT(status);
646
647 /* Converter starts out with the SUBSTITUTE callback set. */
648
649 /* initialize our callback */
650 context = flagCB_fromU_openContext();
651
652 /* Set our special callback */
653 ucnv_setFromUCallBack(conv,
654 flagCB_fromU,
655 context,
656 &(context->subCallback),
657 &(context->subContext),
658 &status);
659
660 U_ASSERT(status);
661
662 len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status);
663 U_ASSERT(status);
664
665 flagVal = context->flag; /* it's about to go away when we close the cnv */
666
667 ucnv_close(conv);
668
669 /* print out the original source */
670 printBytes("bytes", bytes, len2);
671
672 return flagVal; /* true if callback was called */
673 }
674
675 UErrorCode convsample_20()
676 {
677 const char *sample1 = "abc\xdf\xbf";
678 const char *sample2 = "abc_def";
679
680
681 if(convsample_20_didSubstitute(sample1))
682 {
683 printf("DID substitute.\n******\n");
684 }
685 else
686 {
687 printf("Did NOT substitute.\n*****\n");
688 }
689
690 if(convsample_20_didSubstitute(sample2))
691 {
692 printf("DID substitute.\n******\n");
693 }
694 else
695 {
696 printf("Did NOT substitute.\n*****\n");
697 }
698
699 return U_ZERO_ERROR;
700 }
701
702 // 21 - C, callback, with clone and debug
703
704
705
706 UBool convsample_21_didSubstitute(const char *source)
707 {
708 UChar uchars[100];
709 char bytes[100];
710 UConverter *conv = NULL, *cloneCnv = NULL;
711 UErrorCode status = U_ZERO_ERROR;
712 uint32_t len, len2;
713 int32_t cloneLen;
714 UBool flagVal = FALSE;
715 UConverterFromUCallback junkCB;
716
717 FromUFLAGContext *flagCtx = NULL,
718 *cloneFlagCtx = NULL;
719
720 debugCBContext *debugCtx1 = NULL,
721 *debugCtx2 = NULL,
722 *cloneDebugCtx = NULL;
723
724 printf("\n\n==============================================\n"
725 "Sample 21: C: Test for substitution w/ callbacks & clones \n");
726
727 /* print out the original source */
728 printBytes("src", source);
729 printf("\n");
730
731 /* First, convert from UTF8 to unicode */
732 conv = ucnv_open("utf-8", &status);
733 U_ASSERT(status);
734
735 len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
736 U_ASSERT(status);
737
738 printUChars("uch", uchars, len);
739 printf("\n");
740
741 /* Now, close the converter */
742 ucnv_close(conv);
743
744 /* Now, convert to windows-1252 */
745 conv = ucnv_open("windows-1252", &status);
746 U_ASSERT(status);
747
748 /* Converter starts out with the SUBSTITUTE callback set. */
749
750 /* initialize our callback */
751 /* from the 'bottom' innermost, out
752 * CNV -> debugCtx1[debug] -> flagCtx[flag] -> debugCtx2[debug] */
753
754 #if DEBUG_TMI
755 printf("flagCB_fromU = %p\n", &flagCB_fromU);
756 printf("debugCB_fromU = %p\n", &debugCB_fromU);
757 #endif
758
759 debugCtx1 = debugCB_openContext();
760 flagCtx = flagCB_fromU_openContext();
761 debugCtx2 = debugCB_openContext();
762
763 debugCtx1->subCallback = flagCB_fromU; /* debug1 -> flag */
764 debugCtx1->subContext = flagCtx;
765
766 flagCtx->subCallback = debugCB_fromU; /* flag -> debug2 */
767 flagCtx->subContext = debugCtx2;
768
769 debugCtx2->subCallback = UCNV_FROM_U_CALLBACK_SUBSTITUTE;
770 debugCtx2->subContext = NULL;
771
772 /* Set our special callback */
773
774 ucnv_setFromUCallBack(conv,
775 debugCB_fromU,
776 debugCtx1,
777 &(debugCtx2->subCallback),
778 &(debugCtx2->subContext),
779 &status);
780
781 U_ASSERT(status);
782
783 #if DEBUG_TMI
784 printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n",
785 conv, debugCtx1, debugCtx1->subCallback,
786 debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback);
787 #endif
788
789 cloneLen = 1; /* but passing in null so it will clone */
790 cloneCnv = ucnv_safeClone(conv, NULL, &cloneLen, &status);
791
792 U_ASSERT(status);
793
794 #if DEBUG_TMI
795 printf("Cloned converter from %p -> %p. Closing %p.\n", conv, cloneCnv, conv);
796 #endif
797
798 ucnv_close(conv);
799
800 #if DEBUG_TMI
801 printf("%p closed.\n", conv);
802 #endif
803
804 U_ASSERT(status);
805 /* Now, we have to extract the context */
806 cloneDebugCtx = NULL;
807 cloneFlagCtx = NULL;
808
809 ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx);
810 if(cloneDebugCtx != NULL) {
811 cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext;
812 }
813
814 printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n",
815 cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL );
816
817 len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status);
818 U_ASSERT(status);
819
820 if(cloneFlagCtx != NULL) {
821 flagVal = cloneFlagCtx->flag; /* it's about to go away when we close the cnv */
822 } else {
823 printf("** Warning, couldn't get the subcallback \n");
824 }
825
826 ucnv_close(cloneCnv);
827
828 /* print out the original source */
829 printBytes("bytes", bytes, len2);
830
831 return flagVal; /* true if callback was called */
832 }
833
834 UErrorCode convsample_21()
835 {
836 const char *sample1 = "abc\xdf\xbf";
837 const char *sample2 = "abc_def";
838
839 if(convsample_21_didSubstitute(sample1))
840 {
841 printf("DID substitute.\n******\n");
842 }
843 else
844 {
845 printf("Did NOT substitute.\n*****\n");
846 }
847
848 if(convsample_21_didSubstitute(sample2))
849 {
850 printf("DID substitute.\n******\n");
851 }
852 else
853 {
854 printf("Did NOT substitute.\n*****\n");
855 }
856
857 return U_ZERO_ERROR;
858 }
859
860
861 // 40- C, cp37 -> UTF16 [data02.bin -> data40.utf16]
862
863 #define BUFFERSIZE 17 /* make it interesting :) */
864
865 UErrorCode convsample_40()
866 {
867 printf("\n\n==============================================\n"
868 "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n");
869
870 FILE *f;
871 FILE *out;
872 int32_t count;
873 char inBuf[BUFFERSIZE];
874 const char *source;
875 const char *sourceLimit;
876 UChar *uBuf;
877 UChar *target;
878 UChar *targetLimit;
879 int32_t uBufSize = 0;
880 UConverter *conv = NULL;
881 UErrorCode status = U_ZERO_ERROR;
882 uint32_t inbytes=0, total=0;
883
884 f = fopen("data02.bin", "rb");
885 if(!f)
886 {
887 fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n");
888 return U_FILE_ACCESS_ERROR;
889 }
890
891 out = fopen("data40.utf16", "wb");
892 if(!out)
893 {
894 fprintf(stderr, "Couldn't create file 'data40.utf16'.\n");
895 fclose(f);
896 return U_FILE_ACCESS_ERROR;
897 }
898
899 // **************************** START SAMPLE *******************
900 conv = ucnv_openCCSID(37, UCNV_IBM, &status);
901 assert(U_SUCCESS(status));
902
903 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
904 printf("input bytes %d / min chars %d = %d UChars\n",
905 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
906 uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
907 assert(uBuf!=NULL);
908
909 // grab another buffer's worth
910 while((!feof(f)) &&
911 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
912 {
913 inbytes += count;
914
915 // Convert bytes to unicode
916 source = inBuf;
917 sourceLimit = inBuf + count;
918
919 do
920 {
921 target = uBuf;
922 targetLimit = uBuf + uBufSize;
923
924 ucnv_toUnicode( conv, &target, targetLimit,
925 &source, sourceLimit, NULL,
926 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */
927 /* is true (when no more data will come) */
928 &status);
929
930 if(status == U_BUFFER_OVERFLOW_ERROR)
931 {
932 // simply ran out of space - we'll reset the target ptr the next
933 // time through the loop.
934 status = U_ZERO_ERROR;
935 }
936 else
937 {
938 // Check other errors here.
939 assert(U_SUCCESS(status));
940 // Break out of the loop (by force)
941 }
942
943 // Process the Unicode
944 // Todo: handle UTF-16/surrogates
945 assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) ==
946 (size_t)(target-uBuf));
947 total += (target-uBuf);
948 } while (source < sourceLimit); // while simply out of space
949 }
950
951 printf("%d bytes in, %d UChars out.\n", inbytes, total);
952
953 // ***************************** END SAMPLE ********************
954 ucnv_close(conv);
955
956 fclose(f);
957 fclose(out);
958 printf("\n");
959
960 return U_ZERO_ERROR;
961 }
962 #undef BUFFERSIZE
963
964
965
966 // 46- C, UTF16 -> latin2 [data40.utf16 -> data46.out]
967
968 #define BUFFERSIZE 24 /* make it interesting :) */
969
970 UErrorCode convsample_46()
971 {
972 printf("\n\n==============================================\n"
973 "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n");
974
975 FILE *f;
976 FILE *out;
977 int32_t count;
978 UChar inBuf[BUFFERSIZE];
979 const UChar *source;
980 const UChar *sourceLimit;
981 char *buf;
982 char *target;
983 char *targetLimit;
984
985 int32_t bufSize = 0;
986 UConverter *conv = NULL;
987 UErrorCode status = U_ZERO_ERROR;
988 uint32_t inchars=0, total=0;
989
990 f = fopen("data40.utf16", "rb");
991 if(!f)
992 {
993 fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n");
994 return U_FILE_ACCESS_ERROR;
995 }
996
997 out = fopen("data46.out", "wb");
998 if(!out)
999 {
1000 fprintf(stderr, "Couldn't create file 'data46.out'.\n");
1001 fclose(f);
1002 return U_FILE_ACCESS_ERROR;
1003 }
1004
1005 // **************************** START SAMPLE *******************
1006 conv = ucnv_open( "iso-8859-2", &status);
1007 assert(U_SUCCESS(status));
1008
1009 bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv));
1010 printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n",
1011 BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize);
1012 buf = (char*)malloc(bufSize * sizeof(char));
1013 assert(buf!=NULL);
1014
1015 // grab another buffer's worth
1016 while((!feof(f)) &&
1017 ((count=fread(inBuf, sizeof(UChar), BUFFERSIZE , f)) > 0) )
1018 {
1019 inchars += count;
1020
1021 // Convert bytes to unicode
1022 source = inBuf;
1023 sourceLimit = inBuf + count;
1024
1025 do
1026 {
1027 target = buf;
1028 targetLimit = buf + bufSize;
1029
1030 ucnv_fromUnicode( conv, &target, targetLimit,
1031 &source, sourceLimit, NULL,
1032 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */
1033 /* is true (when no more data will come) */
1034 &status);
1035
1036 if(status == U_BUFFER_OVERFLOW_ERROR)
1037 {
1038 // simply ran out of space - we'll reset the target ptr the next
1039 // time through the loop.
1040 status = U_ZERO_ERROR;
1041 }
1042 else
1043 {
1044 // Check other errors here.
1045 assert(U_SUCCESS(status));
1046 // Break out of the loop (by force)
1047 }
1048
1049 // Process the Unicode
1050 assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) ==
1051 (size_t)(target-buf));
1052 total += (target-buf);
1053 } while (source < sourceLimit); // while simply out of space
1054 }
1055
1056 printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, inchars * sizeof(UChar), total);
1057
1058 // ***************************** END SAMPLE ********************
1059 ucnv_close(conv);
1060
1061 fclose(f);
1062 fclose(out);
1063 printf("\n");
1064
1065 return U_ZERO_ERROR;
1066 }
1067 #undef BUFFERSIZE
1068
1069 #define BUFFERSIZE 219
1070
1071 void convsample_50() {
1072 printf("\n\n==============================================\n"
1073 "Sample 50: C: ucnv_detectUnicodeSignature\n");
1074
1075 //! [ucnv_detectUnicodeSignature]
1076 UErrorCode err = U_ZERO_ERROR;
1077 UBool discardSignature = TRUE; /* set to TRUE to throw away the initial U+FEFF */
1078 char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' };
1079 int32_t signatureLength = 0;
1080 const char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err);
1081 UConverter *conv = NULL;
1082 UChar output[100];
1083 UChar *target = output, *out;
1084 const char *source = input;
1085 if(encoding!=NULL && U_SUCCESS(err)){
1086 // should signature be discarded ?
1087 conv = ucnv_open(encoding, &err);
1088 // do the conversion
1089 ucnv_toUnicode(conv,
1090 &target, output + sizeof(output)/U_SIZEOF_UCHAR,
1091 &source, input + sizeof(input),
1092 NULL, TRUE, &err);
1093 out = output;
1094 if (discardSignature){
1095 ++out; // ignore initial U+FEFF
1096 }
1097 while(out != target) {
1098 printf("%04x ", *out++);
1099 }
1100 puts("");
1101 }
1102 //! [ucnv_detectUnicodeSignature]
1103 puts("");
1104 }
1105
1106
1107
1108 /* main */
1109
1110 int main()
1111 {
1112
1113 printf("Default Converter=%s\n", ucnv_getDefaultName() );
1114
1115 convsample_02(); // C , u->koi8r, conv
1116 convsample_03(); // C, iterate
1117
1118 convsample_05(); // C, utf8->u, getNextUChar
1119 convsample_06(); // C freq counter thingy
1120
1121 convsample_12(); // C, sjis->u, conv
1122 convsample_13(); // C, big5->u, getNextU
1123
1124 convsample_20(); // C, callback
1125 convsample_21(); // C, callback debug
1126
1127 convsample_40(); // C, cp37 -> UTF16 [data02.bin -> data40.utf16]
1128
1129 convsample_46(); // C, UTF16 -> latin3 [data41.utf16 -> data46.out]
1130
1131 convsample_50(); // C, detect unicode signature
1132
1133 printf("End of converter samples.\n");
1134
1135 fflush(stdout);
1136 fflush(stderr);
1137
1138 return 0;
1139 }