]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /************************************************************************** |
2 | * | |
3 | * Copyright (C) 2000-2003, International Business Machines | |
4 | * Corporation and others. All Rights Reserved. | |
5 | * | |
6 | *************************************************************************** | |
7 | * file name: convsamp.c | |
8 | * encoding: ASCII (7-bit) | |
9 | * | |
10 | * created on: 2000may30 | |
11 | * created by: Steven R. Loomis | |
12 | * | |
13 | * Sample code for the ICU conversion routines. | |
14 | * | |
15 | * Note: Nothing special is needed to build this sample. Link with | |
16 | * the icu UC and icu I18N libraries. | |
17 | * | |
18 | * I use 'assert' for error checking, you probably will want | |
19 | * something more flexible. '*** START SAMPLE ***' and | |
20 | * '***END SAMPLE***' mark pieces suitable for stand alone | |
21 | * code snippets. | |
22 | * | |
23 | * | |
24 | * Each test can define its own BUFFERSIZE | |
25 | * | |
26 | */ | |
27 | ||
28 | #define DEBUG_TMI 0 /* define to 1 to enable Too Much Information */ | |
29 | ||
30 | #include <stdio.h> | |
31 | #include <ctype.h> /* for isspace, etc. */ | |
32 | #include <assert.h> | |
33 | #include <string.h> | |
34 | #include <stdlib.h> /* malloc */ | |
35 | ||
36 | #include "unicode/utypes.h" /* Basic ICU data types */ | |
37 | #include "unicode/ucnv.h" /* C Converter API */ | |
38 | #include "unicode/ustring.h" /* some more string fcns*/ | |
39 | #include "unicode/uchar.h" /* char names */ | |
40 | #include "unicode/uloc.h" | |
41 | #include "unicode/unistr.h" | |
42 | ||
43 | #include "flagcb.h" | |
44 | ||
45 | /* Some utility functions */ | |
46 | /* Empty (terminator-only) UChar string; used as the default 'uch' argument of printUChars(). */ ||
47 | static const UChar kNone[] = { 0x0000 }; | |
48 | /* U_ASSERT(x): when x is a failing UErrorCode, flush stdout and stderr, print "<expression> == <error name>" to stderr, then trip assert(). Expands to a bare brace block rather than do/while(0), so take care when using it as the body of an un-braced if/else. */ ||
49 | #define U_ASSERT(x) { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }} | |
50 | ||
51 | /* Print one UChar in a short fixed-width form (seven characters, per the original note): graphic ASCII is shown quoted, values above 0x7F show a truncated Unicode character name, and the remaining ASCII values show an escape or a placeholder. */ | |
52 | void prettyPrintUChar(UChar c) | |
53 | { | |
54 | if( (c <= 0x007F) && | |
55 | (isgraph(c)) ) { | |
56 | printf(" '%c' ", (char)(0x00FF&c)); | |
57 | } else if ( c > 0x007F ) { | |
58 | char buf[1000]; | |
59 | UErrorCode status = U_ZERO_ERROR; | |
60 | int32_t o; | |
61 | /* first choice: look the character up under U_UNICODE_CHAR_NAME */ ||
62 | o = u_charName(c, U_UNICODE_CHAR_NAME, buf, 1000, &status); | |
63 | if(U_SUCCESS(status) && (o>0) ) { | |
64 | buf[6] = 0; /* keep only the first six characters of the name */ | |
65 | printf("%7s", buf); | |
66 | } else { | |
67 | o = u_charName(c, U_UNICODE_10_CHAR_NAME, buf, 1000, &status); /* fallback name set; note that status is not reset before this second call */ | |
68 | if(U_SUCCESS(status) && (o>0)) { | |
69 | buf[5] = 0; /* five characters of the name, printed after a '~' marker */ | |
70 | printf("~%6s", buf); | |
71 | } | |
72 | else { | |
73 | printf(" ??????"); | |
74 | } | |
75 | } | |
76 | } else { | |
77 | switch((char)(c & 0x007F)) { /* only non-graphic values <= 0x7F reach this branch */ | |
78 | case ' ': | |
79 | printf(" ' ' "); | |
80 | break; | |
81 | case '\t': | |
82 | printf(" \\t "); | |
83 | break; | |
84 | case '\n': | |
85 | printf(" \\n "); | |
86 | break; | |
87 | default: | |
88 | printf(" _ "); | |
89 | break; | |
90 | } | |
91 | } | |
92 | } | |
93 | ||
94 | /* Dump 'len' UChars from 'uch' as three rows: one labelled 'name' with the index of every unit, one labelled "uni" with each unit as \uXXXX hex, and one labelled "ch" rendered through prettyPrintUChar(). A len of -1 together with a non-NULL uch means the length is measured with u_strlen(). */ | |
95 | void printUChars(const char *name = "?", | |
96 | const UChar *uch = kNone, | |
97 | int32_t len = -1 ) | |
98 | { | |
99 | int32_t pos = 0; | |
100 | if( uch && (len == -1) ) | |
101 | len = u_strlen(uch); | |
102 | ||
103 | /* row 1: the index of every code unit */ | |
104 | printf("%5s: ", name); | |
105 | while( pos < len ) { | |
106 | printf("%-6d ", pos); | |
107 | pos++; | |
108 | } | |
109 | putchar('\n'); | |
110 | ||
111 | /* row 2: every code unit in hex */ | |
112 | printf("%5s: ", "uni"); | |
113 | for( pos = 0; pos < len; ++pos ) | |
114 | printf("\\u%04X ", (int)uch[pos]); | |
115 | putchar('\n'); | |
116 | ||
117 | /* row 3: a readable rendering of every code unit */ | |
118 | printf("%5s:", "ch"); | |
119 | for( pos = 0; pos < len; ++pos ) | |
120 | prettyPrintUChar(uch[pos]); | |
121 | putchar('\n'); | |
122 | } | |
123 | /* Dump 'len' bytes from 'uch' as three rows: one labelled 'name' with the index of every byte, one labelled "uni" with each byte as \xXX hex, and one labelled "ch" showing graphic bytes quoted and everything else as filler. A len of -1 together with a non-NULL uch means the length is measured with strlen(). */ | |
124 | void printBytes(const char *name = "?", | |
125 | const char *uch = "", | |
126 | int32_t len = -1 ) | |
127 | { | |
128 | int32_t pos = 0; | |
129 | int byteVal; | |
130 | if( uch && (len == -1) ) | |
131 | len = strlen(uch); | |
132 | ||
133 | /* row 1: the index of every byte */ | |
134 | printf("%5s: ", name); | |
135 | while( pos < len ) { | |
136 | printf("%-4d ", pos); | |
137 | pos++; | |
138 | } | |
139 | putchar('\n'); | |
140 | ||
141 | /* row 2: every byte in hex, masked to 8 bits so that a signed char never sign-extends */ | |
142 | printf("%5s: ", "uni"); | |
143 | for( pos = 0; pos < len; ++pos ) | |
144 | printf("\\x%02X ", 0x00FF & (int)uch[pos]); | |
145 | putchar('\n'); | |
146 | ||
147 | /* row 3: graphic bytes quoted, anything else as filler */ | |
148 | printf("%5s:", "ch"); | |
149 | for( pos = 0; pos < len; ++pos ) { | |
150 | byteVal = 0x00FF & (int)uch[pos]; | |
151 | if(isgraph(byteVal)) printf(" '%c' ", (char)uch[pos]); | |
152 | else printf(" "); | |
153 | } | |
154 | putchar('\n'); | |
155 | } | |
156 | /* Print a single code point: values up to 0xFFFF go through printUChars() as one UChar labelled "C"; anything larger is printed as a "ch: U+XXXXXX" line. */ | |
157 | void printUChar(UChar32 ch32) | |
158 | { | |
159 | UChar bmpUnit; | |
160 | if(ch32 <= 0xFFFF) { | |
161 | bmpUnit = (UChar)ch32; | |
162 | printUChars("C", &bmpUnit, 1); | |
163 | return; | |
164 | } | |
165 | printf("ch: U+%06X\n", ch32); | |
166 | } | |
167 | ||
168 | /******************************************************************* | |
169 | Very simple C sample to convert the word 'Moscow' in Russian in Unicode, | |
170 | followed by an exclamation mark (!) into the KOI8-R Russian code page. | |
171 | ||
172 | This example first creates a UChar String out of the Unicode chars. | |
173 | ||
174 | targetSize must be set to the amount of space available in the target | |
175 | buffer. After fromUChars is called, | |
176 | len will contain the number of bytes in target[] which were | |
177 | used in the resulting codepage. In this case, there is a 1:1 mapping | |
178 | between the input and output characters. The exclamation mark has the | |
179 | same value in both KOI8-R and Unicode. | |
180 | ||
181 | src: 0 1 2 3 4 5 6 | |
182 | uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021 | |
183 | ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL '!' | |
184 | ||
185 | targ: 0 1 2 3 4 5 6 | |
186 | uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21 | |
187 | ch: '!' | |
188 | ||
189 | ||
190 | Converting FROM unicode | |
191 | to koi8-r. | |
192 | You must call ucnv_close to clean up the memory used by the | |
193 | converter. | |
194 | ||
195 | 'len' returns the number of OUTPUT bytes resulting from the | |
196 | conversion. | |
197 | */ | |
198 | ||
199 | UErrorCode convsample_02() | |
200 | { | |
201 | printf("\n\n==============================================\n" | |
202 | "Sample 02: C: simple Unicode -> koi8-r conversion\n"); | |
203 | ||
204 | ||
205 | // **************************** START SAMPLE ******************* | |
206 | // Six Cyrillic letters (U+041C U+043E U+0441 U+043A U+0432 U+0430) followed by '!' and a terminating 0x0000 | |
207 | UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432, | |
208 | 0x0430, 0x0021, 0x0000 }; | |
209 | char target[100]; | |
210 | UErrorCode status = U_ZERO_ERROR; | |
211 | UConverter *conv; | |
212 | int32_t len; | |
213 | ||
214 | // set up the converter | |
215 | conv = ucnv_open("koi8-r", &status); | |
216 | assert(U_SUCCESS(status)); | |
217 | ||
218 | // convert to koi8-r | |
219 | len = ucnv_fromUChars(conv, target, 100, source, -1, &status); /* 100 = capacity of target[]; -1 = source length not given (source ends with 0x0000) */ | |
220 | assert(U_SUCCESS(status)); | |
221 | ||
222 | // close the converter | |
223 | ucnv_close(conv); | |
224 | ||
225 | // ***************************** END SAMPLE ******************** | |
226 | ||
227 | // Print it out | |
228 | printUChars("src", source); | |
229 | printf("\n"); | |
230 | printBytes("targ", target, len); | |
231 | /* always reports success: failures above are caught by assert() only */ ||
232 | return U_ZERO_ERROR; | |
233 | } | |
234 | ||
235 | /* Sample 03: print how many converters are available, then the name of each one on a single space-separated line. Always returns U_ZERO_ERROR. */ | |
236 | UErrorCode convsample_03() | |
237 | { | |
238 | printf("\n\n==============================================\nSample 03: C: print out all converters\n"); | |
239 | ||
240 | // **************************** START SAMPLE ******************* | |
241 | const int32_t available = ucnv_countAvailable(); | |
242 | int32_t idx = 0; | |
243 | ||
244 | printf("Available converters: %d\n", available); | |
245 | ||
246 | /* names are fetched one at a time by index */ | |
247 | while(idx < available) | |
248 | { | |
249 | printf("%s ", ucnv_getAvailableName(idx)); | |
250 | ++idx; | |
251 | } | |
252 | ||
253 | // ***************************** END SAMPLE ******************** | |
254 | ||
255 | printf("\n"); | |
256 | ||
257 | return U_ZERO_ERROR; | |
258 | } | |
259 | ||
260 | ||
261 | ||
262 | #define BUFFERSIZE 17 /* make it interesting :) */ | |
263 | ||
264 | /* | |
265 | Converting from a codepage to Unicode in bulk.. | |
266 | What is the best way to determine the buffer size? | |
267 | ||
268 | The 'buffersize' is in bytes of input. | |
269 | For a given converter, dividing this by the minimum char size | |
270 | gives you the maximum number of Unicode characters that could be | |
271 | expected for a given number of input bytes. | |
272 | see: ucnv_getMinCharSize() | |
273 | ||
274 | For example, a single byte codepage like 'Latin-3' has a | |
275 | minimum char size of 1. (It takes at least 1 byte to represent | |
276 | each Unicode char.) So the unicode buffer has the same number of | |
277 | UChars as the input buffer has bytes. | |
278 | ||
279 | In a strictly double byte codepage such as cp1362 (Windows | |
280 | Korean), the minimum char size is 2. So, only half as many Unicode | |
281 | chars as bytes are needed. | |
282 | ||
283 | This work to calculate the buffer size is an optimization. Any | |
284 | size of input and output buffer can be used, as long as the | |
285 | program handles the following cases: If the input buffer is empty, | |
286 | the source pointer will be equal to sourceLimit. If the output | |
287 | buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned. | |
288 | */ | |
289 | /* Sample 05: read 'data01.txt' in BUFFERSIZE-byte chunks, convert each chunk from UTF-8 to UChars with ucnv_toUnicode(), and count how many of the resulting UChars satisfy u_isalpha(). Returns U_FILE_ACCESS_ERROR when the input cannot be opened, otherwise U_ZERO_ERROR; conversion errors are caught by assert() only. The input file and the UChar buffer are released before returning. */ | |
290 | UErrorCode convsample_05() | |
291 | { | |
292 | printf("\n\n==============================================\n" | |
293 | "Sample 05: C: count the number of letters in a UTF-8 document\n"); | |
294 | ||
295 | FILE *f; | |
296 | int32_t count; | |
297 | char inBuf[BUFFERSIZE]; | |
298 | const char *source; | |
299 | const char *sourceLimit; | |
300 | UChar *uBuf; | |
301 | UChar *target; | |
302 | UChar *targetLimit; | |
303 | UChar *p; | |
304 | int32_t uBufSize = 0; | |
305 | UConverter *conv; | |
306 | UErrorCode status = U_ZERO_ERROR; | |
307 | uint32_t letters=0, total=0; | |
308 | UBool overflowed = FALSE; /* TRUE while the converter still holds output that did not fit in uBuf */ | |
309 | f = fopen("data01.txt", "r"); | |
310 | if(!f) | |
311 | { | |
312 | fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n"); | |
313 | return U_FILE_ACCESS_ERROR; | |
314 | } | |
315 | ||
316 | // **************************** START SAMPLE ******************* | |
317 | conv = ucnv_open("utf-8", &status); | |
318 | assert(U_SUCCESS(status)); | |
319 | ||
320 | uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); | |
321 | printf("input bytes %d / min chars %d = %d UChars\n", | |
322 | BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); | |
323 | uBuf = (UChar*)malloc(uBufSize * sizeof(UChar)); | |
324 | assert(uBuf!=NULL); | |
325 | ||
326 | // grab another buffer's worth | |
327 | while((!feof(f)) && | |
328 | ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) ) | |
329 | { | |
330 | // Convert bytes to unicode | |
331 | source = inBuf; | |
332 | sourceLimit = inBuf + count; | |
333 | ||
334 | do | |
335 | { | |
336 | target = uBuf; | |
337 | targetLimit = uBuf + uBufSize; | |
338 | ||
339 | ucnv_toUnicode(conv, &target, targetLimit, | |
340 | &source, sourceLimit, NULL, | |
341 | feof(f)?TRUE:FALSE, /* pass 'flush' when eof */ | |
342 | /* is true (when no more data will come) */ | |
343 | &status); | |
344 | ||
345 | overflowed = (status == U_BUFFER_OVERFLOW_ERROR); | |
346 | if(overflowed) | |
347 | { | |
348 | // uBuf filled up: clear the error and go around again so the rest is drained | |
349 | status = U_ZERO_ERROR; | |
350 | } | |
351 | else | |
352 | { | |
353 | // Check other errors here. | |
354 | assert(U_SUCCESS(status)); | |
355 | // no overflow: this chunk has been fully converted | |
356 | } | |
357 | ||
358 | // Process the Unicode | |
359 | // Todo: handle UTF-16/surrogates | |
360 | ||
361 | for(p = uBuf; p<target; p++) | |
362 | { | |
363 | if(u_isalpha(*p)) | |
364 | letters++; | |
365 | total++; | |
366 | } | |
367 | } while (overflowed || source < sourceLimit); // repeat while output is pending or input remains | |
368 | } | |
369 | ||
370 | printf("%d letters out of %d total UChars.\n", letters, total); | |
371 | ||
372 | // ***************************** END SAMPLE ******************** | |
373 | ucnv_close(conv); | |
374 | free(uBuf); | |
375 | fclose(f); | |
376 | printf("\n"); | |
377 | return U_ZERO_ERROR; | |
378 | } | |
379 | #undef BUFFERSIZE | |
380 | /* BUFFERSIZE for sample 06 (bytes read per fread call), followed by the per-code-point counter record that sample 06 keeps in a table indexed by code point. */ ||
381 | #define BUFFERSIZE 1024 | |
382 | typedef struct | |
383 | { | |
384 | UChar32 codepoint; /* the code point this entry counts */ | |
385 | uint32_t frequency; /* number of times that code point has been seen */ | |
386 | } CharFreqInfo; | |
387 | ||
388 | UErrorCode convsample_06() | |
389 | { | |
390 | printf("\n\n==============================================\n" | |
391 | "Sample 06: C: frequency distribution of letters in a UTF-8 document\n"); | |
392 | ||
393 | FILE *f; | |
394 | int32_t count; | |
395 | char inBuf[BUFFERSIZE]; | |
396 | const char *source; | |
397 | const char *sourceLimit; | |
398 | UChar *uBuf; | |
399 | int32_t uBufSize = 0; | |
400 | UConverter *conv; | |
401 | UErrorCode status = U_ZERO_ERROR; | |
402 | uint32_t letters=0, total=0; | |
403 | ||
404 | CharFreqInfo *info; | |
405 | UChar32 charCount = 0x10000; /* increase this if you want to handle non bmp.. todo: automatically bump it.. */ | |
406 | UChar32 p; | |
407 | ||
408 | uint32_t ie = 0; | |
409 | uint32_t gh = 0; | |
410 | UChar32 l = 0; | |
411 | ||
412 | f = fopen("data06.txt", "r"); | |
413 | if(!f) | |
414 | { | |
415 | fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n"); | |
416 | return U_FILE_ACCESS_ERROR; | |
417 | } | |
418 | ||
419 | info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount); | |
420 | if(!info) | |
421 | { | |
422 | fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount); | |
423 | } | |
424 | ||
425 | /* reset frequencies */ | |
426 | for(p=0;p<charCount;p++) | |
427 | { | |
428 | info[p].codepoint = p; | |
429 | info[p].frequency = 0; | |
430 | } | |
431 | ||
432 | // **************************** START SAMPLE ******************* | |
433 | conv = ucnv_open("utf-8", &status); | |
434 | assert(U_SUCCESS(status)); | |
435 | ||
436 | uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); | |
437 | printf("input bytes %d / min chars %d = %d UChars\n", | |
438 | BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); | |
439 | uBuf = (UChar*)malloc(uBufSize * sizeof(UChar)); | |
440 | assert(uBuf!=NULL); | |
441 | ||
442 | // grab another buffer's worth | |
443 | while((!feof(f)) && | |
444 | ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) ) | |
445 | { | |
446 | // Convert bytes to unicode | |
447 | source = inBuf; | |
448 | sourceLimit = inBuf + count; | |
449 | ||
450 | while(source < sourceLimit) | |
451 | { | |
452 | p = ucnv_getNextUChar(conv, &source, sourceLimit, &status); | |
453 | if(U_FAILURE(status)) | |
454 | { | |
455 | fprintf(stderr, "%s @ %d\n", u_errorName(status), total); | |
456 | status = U_ZERO_ERROR; | |
457 | continue; | |
458 | } | |
459 | U_ASSERT(status); | |
460 | total++; | |
461 | ||
462 | if(u_isalpha(p)) | |
463 | letters++; | |
464 | ||
465 | if((u_tolower(l) == 'i') && (u_tolower(p) == 'e')) | |
466 | ie++; | |
467 | ||
468 | if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127)) | |
469 | gh++; | |
470 | ||
471 | if(p>charCount) | |
472 | { | |
473 | fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p); | |
474 | return U_UNSUPPORTED_ERROR; | |
475 | } | |
476 | info[p].frequency++; | |
477 | l = p; | |
478 | } | |
479 | } | |
480 | ||
481 | fclose(f); | |
482 | ucnv_close(conv); | |
483 | ||
484 | printf("%d letters out of %d total UChars.\n", letters, total); | |
485 | printf("%d ie digraphs, %d gh digraphs.\n", ie, gh); | |
486 | ||
487 | // now, we could sort it.. | |
488 | ||
489 | // qsort(info, charCount, sizeof(info[0]), charfreq_compare); | |
490 | ||
491 | for(p=0;p<charCount;p++) | |
492 | { | |
493 | if(info[p].frequency) | |
494 | { | |
495 | printf("% 5d U+%06X ", info[p].frequency, p); | |
496 | if(p <= 0xFFFF) | |
497 | { | |
498 | prettyPrintUChar((UChar)p); | |
499 | } | |
500 | printf("\n"); | |
501 | } | |
502 | } | |
503 | free(info); | |
504 | // ***************************** END SAMPLE ******************** | |
505 | ||
506 | printf("\n"); | |
507 | ||
508 | return U_ZERO_ERROR; | |
509 | } | |
510 | #undef BUFFERSIZE | |
511 | ||
512 | ||
513 | /****************************************************** | |
514 | You must call ucnv_close to clean up the memory used by the | |
515 | converter. | |
516 | ||
517 | 'len' returns the number of OUTPUT bytes resulting from the | |
518 | conversion. | |
519 | */ | |
520 | ||
521 | UErrorCode convsample_12() | |
522 | { | |
523 | printf("\n\n==============================================\n" | |
524 | "Sample 12: C: simple sjis -> unicode conversion\n"); | |
525 | ||
526 | ||
527 | // **************************** START SAMPLE ******************* | |
528 | /* input: the ASCII bytes "cat" followed by six non-ASCII/lead-trail bytes and a terminating 0x00 */ ||
529 | char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 }; | |
530 | UChar target[100]; | |
531 | UErrorCode status = U_ZERO_ERROR; | |
532 | UConverter *conv; | |
533 | int32_t len; | |
534 | ||
535 | // set up the converter | |
536 | conv = ucnv_open("shift_jis", &status); | |
537 | assert(U_SUCCESS(status)); | |
538 | ||
539 | // convert to Unicode | |
540 | // Note: we can use strlen, we know it's an 8 bit null terminated codepage | |
541 | target[6] = 0xFDCA; /* NOTE(review): pre-sets one output slot to a marker value before converting; the purpose is not evident from this file - presumably a debugging aid */ | |
542 | len = ucnv_toUChars(conv, target, 100, source, strlen(source), &status); | |
543 | U_ASSERT(status); | |
544 | // close the converter | |
545 | ucnv_close(conv); | |
546 | ||
547 | // ***************************** END SAMPLE ******************** | |
548 | ||
549 | // Print it out | |
550 | printBytes("src", source, strlen(source) ); | |
551 | printf("\n"); | |
552 | printUChars("targ", target, len); | |
553 | /* always reports success: failures above are caught by assert()/U_ASSERT only */ ||
554 | return U_ZERO_ERROR; | |
555 | } | |
556 | ||
557 | /****************************************************************** | |
558 | C: Convert from codepage to Unicode one at a time. | |
559 | */ | |
560 | ||
561 | UErrorCode convsample_13() | |
562 | { | |
563 | printf("\n\n==============================================\n" | |
564 | "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n"); | |
565 | ||
566 | /* input: the ASCII bytes "zh=" , four high-bit bytes, then '.'; note the array has no terminating 0 */ ||
567 | const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e }; | |
568 | // const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e }; | |
569 | const char *source, *sourceLimit; | |
570 | UChar32 target; | |
571 | UErrorCode status = U_ZERO_ERROR; | |
572 | UConverter *conv = NULL; | |
573 | int32_t srcCount=0; | |
574 | int32_t dstCount=0; | |
575 | ||
576 | srcCount = sizeof(sourceChars); | |
577 | ||
578 | conv = ucnv_open("Big5", &status); | |
579 | U_ASSERT(status); | |
580 | ||
581 | source = sourceChars; | |
582 | sourceLimit = sourceChars + sizeof(sourceChars); /* one past the last input byte */ | |
583 | ||
584 | // **************************** START SAMPLE ******************* | |
585 | ||
586 | ||
587 | printBytes("src",source,sourceLimit-source); | |
588 | /* each pass converts exactly one character and advances 'source' past the bytes it used */ ||
589 | while(source < sourceLimit) | |
590 | { | |
591 | puts(""); | |
592 | target = ucnv_getNextUChar (conv, | |
593 | &source, | |
594 | sourceLimit, | |
595 | &status); | |
596 | ||
597 | // printBytes("src",source,sourceLimit-source); | |
598 | U_ASSERT(status); | |
599 | printUChar(target); | |
600 | dstCount++; /* counts converted characters (one per ucnv_getNextUChar call) */ | |
601 | } | |
602 | ||
603 | ||
604 | // ************************** END SAMPLE ************************* | |
605 | ||
606 | printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount); | |
607 | ucnv_close(conv); | |
608 | ||
609 | return U_ZERO_ERROR; | |
610 | } | |
611 | ||
612 | ||
613 | ||
614 | /* Helper for sample 20: converts 'source' (read with a "utf-8" converter) to UChars, then to windows-1252 with flagCB_fromU installed as the from-Unicode callback, printing each stage. Returns the 'flag' field of the callback context as it stands after the conversion. Both intermediate buffers hold at most 100 units. */ ||
615 | UBool convsample_20_didSubstitute(const char *source) | |
616 | { | |
617 | UChar uchars[100]; | |
618 | char bytes[100]; | |
619 | UConverter *conv = NULL; | |
620 | UErrorCode status = U_ZERO_ERROR; | |
621 | uint32_t len, len2; | |
622 | UBool flagVal; | |
623 | ||
624 | FromUFLAGContext * context = NULL; | |
625 | ||
626 | printf("\n\n==============================================\n" | |
627 | "Sample 20: C: Test for substitution using callbacks\n"); | |
628 | ||
629 | /* print out the original source */ | |
630 | printBytes("src", source); | |
631 | printf("\n"); | |
632 | ||
633 | /* First, convert from UTF8 to unicode */ | |
634 | conv = ucnv_open("utf-8", &status); | |
635 | U_ASSERT(status); | |
636 | ||
637 | len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status); | |
638 | U_ASSERT(status); | |
639 | ||
640 | printUChars("uch", uchars, len); | |
641 | printf("\n"); | |
642 | ||
643 | /* Now, close the converter */ | |
644 | ucnv_close(conv); | |
645 | ||
646 | /* Now, convert to windows-1252 */ | |
647 | conv = ucnv_open("windows-1252", &status); | |
648 | U_ASSERT(status); | |
649 | ||
650 | /* Converter starts out with the SUBSTITUTE callback set. */ | |
651 | ||
652 | /* initialize our callback */ | |
653 | context = flagCB_fromU_openContext(); | |
654 | ||
655 | /* Set our special callback */ | |
656 | ucnv_setFromUCallBack(conv, | |
657 | flagCB_fromU, | |
658 | context, | |
659 | &(context->subCallback), /* receives the converter's previous callback */ | |
660 | &(context->subContext), /* receives the converter's previous callback context */ | |
661 | &status); | |
662 | ||
663 | U_ASSERT(status); | |
664 | ||
665 | len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status); | |
666 | U_ASSERT(status); | |
667 | ||
668 | flagVal = context->flag; /* it's about to go away when we close the cnv */ | |
669 | ||
670 | ucnv_close(conv); | |
671 | ||
672 | /* print out the converted windows-1252 bytes */ | |
673 | printBytes("bytes", bytes, len2); | |
674 | ||
675 | return flagVal; /* true if callback was called */ | |
676 | } | |
677 | /* Sample 20 driver: runs convsample_20_didSubstitute() on two fixed inputs, in order, and prints after each whether the helper reported a substitution. Always returns U_ZERO_ERROR. */ | |
678 | UErrorCode convsample_20() | |
679 | { | |
680 | /* first input: "abc" followed by the two bytes DF BF; second input: plain ASCII */ | |
681 | const char *const inputs[] = { | |
682 | "abc\xdf\xbf", | |
683 | "abc_def" | |
684 | }; | |
685 | const int32_t inputCount = (int32_t)(sizeof(inputs)/sizeof(inputs[0])); | |
686 | int32_t idx; | |
687 | ||
688 | for(idx = 0; idx < inputCount; idx++) | |
689 | { | |
690 | /* the helper prints its own banner and dumps before returning */ | |
691 | const UBool substituted = convsample_20_didSubstitute(inputs[idx]); | |
692 | ||
693 | if(!substituted) | |
694 | { | |
695 | printf("Did NOT substitute.\n*****\n"); | |
696 | continue; | |
697 | } | |
698 | ||
699 | printf("DID substitute.\n******\n"); | |
700 | } | |
701 | ||
702 | return U_ZERO_ERROR; | |
703 | } | |
704 | ||
705 | // 21 - C, callback, with clone and debug | |
706 | ||
707 | ||
708 | /* Helper for sample 21: same conversion as sample 20 (input read with "utf-8", output written as windows-1252), but the from-Unicode callback is a chain debug1 -> flag -> debug2, and the conversion runs on a safe clone of the converter after the original is closed. Returns the 'flag' field found in the clone's flag context, or FALSE if that context could not be retrieved. */ ||
709 | UBool convsample_21_didSubstitute(const char *source) | |
710 | { | |
711 | UChar uchars[100]; | |
712 | char bytes[100]; | |
713 | UConverter *conv = NULL, *cloneCnv = NULL; | |
714 | UErrorCode status = U_ZERO_ERROR; | |
715 | uint32_t len, len2; | |
716 | int32_t cloneLen; | |
717 | UBool flagVal = FALSE; | |
718 | UConverterFromUCallback junkCB; | |
719 | ||
720 | FromUFLAGContext *flagCtx = NULL, | |
721 | *cloneFlagCtx = NULL; | |
722 | ||
723 | debugCBContext *debugCtx1 = NULL, | |
724 | *debugCtx2 = NULL, | |
725 | *cloneDebugCtx = NULL; | |
726 | ||
727 | printf("\n\n==============================================\n" | |
728 | "Sample 21: C: Test for substitution w/ callbacks & clones \n"); | |
729 | ||
730 | /* print out the original source */ | |
731 | printBytes("src", source); | |
732 | printf("\n"); | |
733 | ||
734 | /* First, convert from UTF8 to unicode */ | |
735 | conv = ucnv_open("utf-8", &status); | |
736 | U_ASSERT(status); | |
737 | ||
738 | len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status); | |
739 | U_ASSERT(status); | |
740 | ||
741 | printUChars("uch", uchars, len); | |
742 | printf("\n"); | |
743 | ||
744 | /* Now, close the converter */ | |
745 | ucnv_close(conv); | |
746 | ||
747 | /* Now, convert to windows-1252 */ | |
748 | conv = ucnv_open("windows-1252", &status); | |
749 | U_ASSERT(status); | |
750 | ||
751 | /* Converter starts out with the SUBSTITUTE callback set. */ | |
752 | ||
753 | /* initialize our callback */ | |
754 | /* from the 'bottom' innermost, out | |
755 | * CNV -> debugCtx1[debug] -> flagCtx[flag] -> debugCtx2[debug] */ | |
756 | ||
757 | #if DEBUG_TMI | |
758 | printf("flagCB_fromU = %p\n", &flagCB_fromU); | |
759 | printf("debugCB_fromU = %p\n", &debugCB_fromU); | |
760 | #endif | |
761 | ||
762 | debugCtx1 = debugCB_openContext(); | |
763 | flagCtx = flagCB_fromU_openContext(); | |
764 | debugCtx2 = debugCB_openContext(); | |
765 | ||
766 | debugCtx1->subCallback = flagCB_fromU; /* debug1 -> flag */ | |
767 | debugCtx1->subContext = flagCtx; | |
768 | ||
769 | flagCtx->subCallback = debugCB_fromU; /* flag -> debug2 */ | |
770 | flagCtx->subContext = debugCtx2; | |
771 | ||
772 | debugCtx2->subCallback = UCNV_FROM_U_CALLBACK_SUBSTITUTE; | |
773 | debugCtx2->subContext = NULL; | |
774 | ||
775 | /* Set our special callback */ | |
776 | /* debugCtx1 heads the chain; the two out-parameters point at debugCtx2's sub fields, so whatever the call reports as the previous callback/context lands at the tail of the chain */ ||
777 | ucnv_setFromUCallBack(conv, | |
778 | debugCB_fromU, | |
779 | debugCtx1, | |
780 | &(debugCtx2->subCallback), | |
781 | &(debugCtx2->subContext), | |
782 | &status); | |
783 | ||
784 | U_ASSERT(status); | |
785 | ||
786 | #if DEBUG_TMI | |
787 | printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n", | |
788 | conv, debugCtx1, debugCtx1->subCallback, | |
789 | debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback); | |
790 | #endif | |
791 | ||
792 | cloneLen = 1; /* but passing in null so it will clone */ | |
793 | cloneCnv = ucnv_safeClone(conv, NULL, &cloneLen, &status); | |
794 | ||
795 | U_ASSERT(status); | |
796 | ||
797 | #if DEBUG_TMI | |
798 | printf("Cloned converter from %p -> %p. Closing %p.\n", conv, cloneCnv, conv); | |
799 | #endif | |
800 | /* from here on only the clone is used */ ||
801 | ucnv_close(conv); | |
802 | ||
803 | #if DEBUG_TMI | |
804 | printf("%p closed.\n", conv); | |
805 | #endif | |
806 | ||
807 | U_ASSERT(status); | |
808 | /* Now, we have to extract the context */ | |
809 | cloneDebugCtx = NULL; | |
810 | cloneFlagCtx = NULL; | |
811 | /* junkCB receives the callback function pointer, which is not needed here; only the context is used */ ||
812 | ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx); | |
813 | if(cloneDebugCtx != NULL) { | |
814 | cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext; | |
815 | } | |
816 | ||
817 | printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n", | |
818 | cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL ); | |
819 | ||
820 | len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status); | |
821 | U_ASSERT(status); | |
822 | ||
823 | if(cloneFlagCtx != NULL) { | |
824 | flagVal = cloneFlagCtx->flag; /* it's about to go away when we close the cnv */ | |
825 | } else { | |
826 | printf("** Warning, couldn't get the subcallback \n"); | |
827 | } | |
828 | ||
829 | ucnv_close(cloneCnv); | |
830 | ||
831 | /* print out the converted windows-1252 bytes */ | |
832 | printBytes("bytes", bytes, len2); | |
833 | ||
834 | return flagVal; /* true if callback was called */ | |
835 | } | |
836 | /* Sample 21 driver: runs convsample_21_didSubstitute() on two fixed inputs, in order, and prints after each whether the helper reported a substitution. Always returns U_ZERO_ERROR. */ | |
837 | UErrorCode convsample_21() | |
838 | { | |
839 | /* first input: "abc" followed by the two bytes DF BF; second input: plain ASCII */ | |
840 | const char *const inputs[] = { | |
841 | "abc\xdf\xbf", | |
842 | "abc_def" | |
843 | }; | |
844 | const int32_t inputCount = (int32_t)(sizeof(inputs)/sizeof(inputs[0])); | |
845 | int32_t idx; | |
846 | ||
847 | for(idx = 0; idx < inputCount; idx++) | |
848 | { | |
849 | /* the helper prints its own banner and dumps before returning */ | |
850 | const UBool substituted = convsample_21_didSubstitute(inputs[idx]); | |
851 | ||
852 | if(!substituted) | |
853 | { | |
854 | printf("Did NOT substitute.\n*****\n"); | |
855 | continue; | |
856 | } | |
857 | printf("DID substitute.\n******\n"); | |
858 | } | |
859 | ||
860 | return U_ZERO_ERROR; | |
861 | } | |
862 | ||
863 | ||
864 | // 40- C, cp37 -> UTF16 [data02.bin -> data40.utf16] | |
865 | ||
866 | #define BUFFERSIZE 17 /* make it interesting :) */ | |
867 | /* Sample 40: read 'data02.bin' in BUFFERSIZE-byte chunks, convert each chunk to UChars with the converter opened by ucnv_openCCSID(37, UCNV_IBM), and write the UChars verbatim to 'data40.utf16'. Returns U_FILE_ACCESS_ERROR if either file cannot be opened, otherwise U_ZERO_ERROR; conversion and write errors are caught by assert() only. The write itself is performed outside assert() so it still happens when NDEBUG is defined. NOTE(review): the UChars are written as they sit in memory, with no byte-order mark - presumably platform byte order; confirm for consumers of the file. */ | |
868 | UErrorCode convsample_40() | |
869 | { | |
870 | printf("\n\n==============================================\n" | |
871 | "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n"); | |
872 | ||
873 | FILE *f; | |
874 | FILE *out; | |
875 | int32_t count; | |
876 | char inBuf[BUFFERSIZE]; | |
877 | const char *source; | |
878 | const char *sourceLimit; | |
879 | UChar *uBuf; | |
880 | UChar *target; | |
881 | UChar *targetLimit; | |
882 | int32_t uBufSize = 0; | |
883 | UConverter *conv = NULL; | |
884 | UErrorCode status = U_ZERO_ERROR; | |
885 | uint32_t inbytes=0, total=0; | |
886 | size_t written; /* number of UChars fwrite() reported for the current pass */ | |
887 | ||
888 | f = fopen("data02.bin", "rb"); | |
889 | if(!f) | |
890 | { | |
891 | fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n"); | |
892 | return U_FILE_ACCESS_ERROR; | |
893 | } | |
894 | ||
895 | out = fopen("data40.utf16", "wb"); | |
896 | if(!out) | |
897 | { | |
898 | fprintf(stderr, "Couldn't create file 'data40.utf16'.\n"); | |
899 | fclose(f); | |
900 | return U_FILE_ACCESS_ERROR; | |
901 | } | |
902 | ||
903 | // **************************** START SAMPLE ******************* | |
904 | conv = ucnv_openCCSID(37, UCNV_IBM, &status); | |
905 | assert(U_SUCCESS(status)); | |
906 | ||
907 | uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); | |
908 | printf("input bytes %d / min chars %d = %d UChars\n", | |
909 | BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); | |
910 | uBuf = (UChar*)malloc(uBufSize * sizeof(UChar)); | |
911 | assert(uBuf!=NULL); | |
912 | ||
913 | // grab another buffer's worth | |
914 | while((!feof(f)) && | |
915 | ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) ) | |
916 | { | |
917 | inbytes += count; | |
918 | // Convert bytes to unicode | |
919 | source = inBuf; | |
920 | sourceLimit = inBuf + count; | |
921 | ||
922 | do | |
923 | { | |
924 | target = uBuf; | |
925 | targetLimit = uBuf + uBufSize; | |
926 | ||
927 | ucnv_toUnicode( conv, &target, targetLimit, | |
928 | &source, sourceLimit, NULL, | |
929 | feof(f)?TRUE:FALSE, /* pass 'flush' when eof */ | |
930 | /* is true (when no more data will come) */ | |
931 | &status); | |
932 | ||
933 | if(status == U_BUFFER_OVERFLOW_ERROR) | |
934 | { | |
935 | // simply ran out of space - we'll reset the target ptr the next | |
936 | // time through the loop. | |
937 | status = U_ZERO_ERROR; | |
938 | } | |
939 | else | |
940 | { | |
941 | // Check other errors here. | |
942 | assert(U_SUCCESS(status)); | |
943 | } | |
944 | ||
945 | // Write out the UChars produced by this pass (kept outside assert so NDEBUG builds still write) | |
946 | // Todo: handle UTF-16/surrogates | |
947 | written = fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out); | |
948 | assert(written == (size_t)(target-uBuf)); | |
949 | total += (target-uBuf); | |
950 | } while (source < sourceLimit); // while simply out of space | |
951 | } | |
952 | ||
953 | printf("%d bytes in, %d UChars out.\n", inbytes, total); | |
954 | ||
955 | // ***************************** END SAMPLE ******************** | |
956 | ucnv_close(conv); | |
957 | free(uBuf); | |
958 | fclose(f); | |
959 | fclose(out); | |
960 | printf("\n"); | |
961 | ||
962 | return U_ZERO_ERROR; | |
963 | } | |
964 | #undef BUFFERSIZE | |
965 | ||
966 | ||
967 | ||
968 | // 46- C, UTF16 -> latin2 [data40.utf16 -> data46.out] | |
969 | ||
970 | #define BUFFERSIZE 24 /* make it interesting :) */ | |
971 | /* Sample 46: read 'data40.utf16' in chunks of BUFFERSIZE UChars, convert each chunk to iso-8859-2 bytes with ucnv_fromUnicode(), and write the bytes to 'data46.out'. Returns U_FILE_ACCESS_ERROR if either file cannot be opened, otherwise U_ZERO_ERROR; conversion and write errors are caught by assert() only. The write itself is performed outside assert() so it still happens when NDEBUG is defined. NOTE(review): the input is read straight into UChars with no byte-order handling - presumably it must have been written on a machine with the same byte order (see sample 40); confirm. */ | |
972 | UErrorCode convsample_46() | |
973 | { | |
974 | printf("\n\n==============================================\n" | |
975 | "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n"); | |
976 | ||
977 | FILE *f; | |
978 | FILE *out; | |
979 | int32_t count; | |
980 | UChar inBuf[BUFFERSIZE]; | |
981 | const UChar *source; | |
982 | const UChar *sourceLimit; | |
983 | char *buf; | |
984 | char *target; | |
985 | char *targetLimit; | |
986 | size_t written; /* number of bytes fwrite() reported for the current pass */ | |
987 | int32_t bufSize = 0; | |
988 | UConverter *conv = NULL; | |
989 | UErrorCode status = U_ZERO_ERROR; | |
990 | uint32_t inchars=0, total=0; | |
991 | ||
992 | f = fopen("data40.utf16", "rb"); | |
993 | if(!f) | |
994 | { | |
995 | fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n"); | |
996 | return U_FILE_ACCESS_ERROR; | |
997 | } | |
998 | ||
999 | out = fopen("data46.out", "wb"); | |
1000 | if(!out) | |
1001 | { | |
1002 | fprintf(stderr, "Couldn't create file 'data46.out'.\n"); | |
1003 | fclose(f); | |
1004 | return U_FILE_ACCESS_ERROR; | |
1005 | } | |
1006 | ||
1007 | // **************************** START SAMPLE ******************* | |
1008 | conv = ucnv_open( "iso-8859-2", &status); | |
1009 | assert(U_SUCCESS(status)); | |
1010 | ||
1011 | bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv)); | |
1012 | printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n", | |
1013 | BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize); | |
1014 | buf = (char*)malloc(bufSize * sizeof(char)); | |
1015 | assert(buf!=NULL); | |
1016 | ||
1017 | // grab another buffer's worth | |
1018 | while((!feof(f)) && | |
1019 | ((count=fread(inBuf, sizeof(UChar), BUFFERSIZE , f)) > 0) ) | |
1020 | { | |
1021 | inchars += count; | |
1022 | ||
1023 | // Convert Unicode to bytes | |
1024 | source = inBuf; | |
1025 | sourceLimit = inBuf + count; | |
1026 | ||
1027 | do | |
1028 | { | |
1029 | target = buf; | |
1030 | targetLimit = buf + bufSize; | |
1031 | ||
1032 | ucnv_fromUnicode( conv, &target, targetLimit, | |
1033 | &source, sourceLimit, NULL, | |
1034 | feof(f)?TRUE:FALSE, /* pass 'flush' when eof */ | |
1035 | /* is true (when no more data will come) */ | |
1036 | &status); | |
1037 | ||
1038 | if(status == U_BUFFER_OVERFLOW_ERROR) | |
1039 | { | |
1040 | // simply ran out of space - we'll reset the target ptr the next | |
1041 | // time through the loop. | |
1042 | status = U_ZERO_ERROR; | |
1043 | } | |
1044 | else | |
1045 | { | |
1046 | // Check other errors here. | |
1047 | assert(U_SUCCESS(status)); | |
1048 | } | |
1049 | ||
1050 | // Write out the bytes produced by this pass (kept outside assert so NDEBUG builds still write) | |
1051 | written = fwrite(buf, sizeof(buf[0]), (target-buf), out); | |
1052 | assert(written == (size_t)(target-buf)); | |
1053 | total += (target-buf); | |
1054 | } while (source < sourceLimit); // while simply out of space | |
1055 | } | |
1056 | ||
1057 | printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, (int)(inchars * sizeof(UChar)), total); | |
1058 | ||
1059 | // ***************************** END SAMPLE ******************** | |
1060 | ucnv_close(conv); | |
1061 | free(buf); | |
1062 | fclose(f); | |
1063 | fclose(out); | |
1064 | printf("\n"); | |
1065 | ||
1066 | return U_ZERO_ERROR; | |
1067 | } | |
1068 | #undef BUFFERSIZE | |
1069 | ||
1070 | #define BUFFERSIZE 219 | |
1071 | ||
1072 | ||
1073 | /* main */ | |
1074 | /* Entry point: prints the default converter name, then runs every sample in a fixed order. The UErrorCode each sample returns is ignored. Sample 46 reads the file that sample 40 writes, so their relative order matters. */ ||
1075 | int main() | |
1076 | { | |
1077 | ||
1078 | printf("Default Converter=%s\n", ucnv_getDefaultName() ); | |
1079 | ||
1080 | convsample_02(); // C , u->koi8r, conv | |
1081 | convsample_03(); // C, iterate | |
1082 | ||
1083 | convsample_05(); // C, utf8->u, toUnicode in buffered chunks | |
1084 | convsample_06(); // C freq counter thingy (utf8->u, getNextUChar) | |
1085 | ||
1086 | convsample_12(); // C, sjis->u, conv | |
1087 | convsample_13(); // C, big5->u, getNextU | |
1088 | ||
1089 | convsample_20(); // C, callback | |
1090 | convsample_21(); // C, callback debug | |
1091 | ||
1092 | convsample_40(); // C, cp37 -> UTF16 [data02.bin -> data40.utf16] | |
1093 | ||
1094 | convsample_46(); // C, UTF16 -> latin2 [data40.utf16 -> data46.out] | |
1095 | ||
1096 | printf("End of converter samples.\n"); | |
1097 | ||
1098 | fflush(stdout); | |
1099 | fflush(stderr); | |
1100 | ||
1101 | return 0; | |
1102 | } | |