]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | /************************************************************************* |
2 | * | |
3 | * © 2016 and later: Unicode, Inc. and others. | |
4 | * License & terms of use: http://www.unicode.org/copyright.html#License | |
5 | * | |
6 | ************************************************************************** | |
7 | ************************************************************************** | |
b75a7d8f | 8 | * |
2ca993e8 | 9 | * Copyright (C) 2000-2016, International Business Machines |
b75a7d8f A |
10 | * Corporation and others. All Rights Reserved. |
11 | * | |
12 | *************************************************************************** | |
13 | * file name: convsamp.c | |
14 | * encoding: ASCII (7-bit) | |
15 | * | |
16 | * created on: 2000may30 | |
17 | * created by: Steven R. Loomis | |
18 | * | |
19 | * Sample code for the ICU conversion routines. | |
20 | * | |
21 | * Note: Nothing special is needed to build this sample. Link with | |
22 | * the icu UC and icu I18N libraries. | |
23 | * | |
24 | * I use 'assert' for error checking, you probably will want | |
25 | * something more flexible. '***BEGIN SAMPLE***' and | |
26 | * '***END SAMPLE***' mark pieces suitable for stand alone | |
27 | * code snippets. | |
28 | * | |
29 | * | |
30 | * Each test can define its own BUFFERSIZE | |
31 | * | |
32 | */ | |
33 | ||
34 | #define DEBUG_TMI 0 /* define to 1 to enable Too Much Information */ | |
35 | ||
36 | #include <stdio.h> | |
37 | #include <ctype.h> /* for isspace, etc. */ | |
38 | #include <assert.h> | |
39 | #include <string.h> | |
40 | #include <stdlib.h> /* malloc */ | |
41 | ||
42 | #include "unicode/utypes.h" /* Basic ICU data types */ | |
43 | #include "unicode/ucnv.h" /* C Converter API */ | |
44 | #include "unicode/ustring.h" /* some more string fcns*/ | |
45 | #include "unicode/uchar.h" /* char names */ | |
46 | #include "unicode/uloc.h" | |
47 | #include "unicode/unistr.h" | |
48 | ||
49 | #include "flagcb.h" | |
50 | ||
51 | /* Some utility functions */ | |
f3c0d7a5 A |
52 | #ifndef UPRV_LENGTHOF |
53 | #define UPRV_LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) | |
54 | #endif | |
b75a7d8f A |
55 | |
56 | static const UChar kNone[] = { 0x0000 }; | |
57 | ||
/*
 * Check a UErrorCode variable: if it holds a failure code, flush stdout and
 * stderr, print "<expression> == <ICU error name>" to stderr, and then trip
 * assert(). The do { } while(0) wrapper makes the expansion a single
 * statement, so "U_ASSERT(status);" is safe inside unbraced if/else bodies.
 * Note that x is evaluated more than once; pass a plain variable.
 */
#define U_ASSERT(x) do { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }} while(0)
59 | ||
60 | /* Print a UChar if possible, in seven characters. */ | |
61 | void prettyPrintUChar(UChar c) | |
62 | { | |
63 | if( (c <= 0x007F) && | |
64 | (isgraph(c)) ) { | |
65 | printf(" '%c' ", (char)(0x00FF&c)); | |
66 | } else if ( c > 0x007F ) { | |
67 | char buf[1000]; | |
68 | UErrorCode status = U_ZERO_ERROR; | |
69 | int32_t o; | |
4388f060 A |
70 | |
71 | o = u_charName(c, U_EXTENDED_CHAR_NAME, buf, 1000, &status); | |
b75a7d8f A |
72 | if(U_SUCCESS(status) && (o>0) ) { |
73 | buf[6] = 0; | |
74 | printf("%7s", buf); | |
75 | } else { | |
4388f060 | 76 | printf(" ??????"); |
b75a7d8f A |
77 | } |
78 | } else { | |
79 | switch((char)(c & 0x007F)) { | |
80 | case ' ': | |
81 | printf(" ' ' "); | |
82 | break; | |
83 | case '\t': | |
84 | printf(" \\t "); | |
85 | break; | |
86 | case '\n': | |
87 | printf(" \\n "); | |
88 | break; | |
89 | default: | |
90 | printf(" _ "); | |
91 | break; | |
92 | } | |
93 | } | |
94 | } | |
95 | ||
96 | ||
97 | void printUChars(const char *name = "?", | |
98 | const UChar *uch = kNone, | |
99 | int32_t len = -1 ) | |
100 | { | |
101 | int32_t i; | |
102 | ||
103 | if( (len == -1) && (uch) ) { | |
104 | len = u_strlen(uch); | |
105 | } | |
106 | ||
107 | printf("%5s: ", name); | |
108 | for( i = 0; i <len; i++) { | |
109 | printf("%-6d ", i); | |
110 | } | |
111 | printf("\n"); | |
112 | ||
113 | printf("%5s: ", "uni"); | |
114 | for( i = 0; i <len; i++) { | |
115 | printf("\\u%04X ", (int)uch[i]); | |
116 | } | |
117 | printf("\n"); | |
118 | ||
119 | printf("%5s:", "ch"); | |
120 | for( i = 0; i <len; i++) { | |
121 | prettyPrintUChar(uch[i]); | |
122 | } | |
123 | printf("\n"); | |
124 | } | |
125 | ||
/*
 * Dump a byte string as three aligned rows: the index of each byte, its
 * \xXX value, and the character itself when it is a graphic character
 * (blank otherwise).
 *
 * name  label printed in front of the index row
 * uch   the bytes to print
 * len   number of bytes; -1 means "measure with strlen()" when uch is
 *       non-NULL
 */
void printBytes(const char *name = "?",
                const char *uch = "",
                int32_t len = -1 )
{
    int32_t pos;

    if (uch && (len == -1)) {
        len = strlen(uch);
    }

    /* Row 1: positions. */
    printf("%5s: ", name);
    for (pos = 0; pos < len; ++pos) {
        printf("%-4d ", pos);
    }
    printf("\n");

    /* Row 2: numeric value of each byte (masked so it prints as 00..FF). */
    printf("%5s: ", "uni");
    for (pos = 0; pos < len; ++pos) {
        printf("\\x%02X ", 0x00FF & (int)uch[pos]);
    }
    printf("\n");

    /* Row 3: the character itself, but only when it is graphic. */
    printf("%5s:", "ch");
    for (pos = 0; pos < len; ++pos) {
        if (!isgraph(0x00FF & (int)uch[pos])) {
            printf(" ");
            continue;
        }
        printf(" '%c' ", (char)uch[pos]);
    }
    printf("\n");
}
158 | ||
159 | void printUChar(UChar32 ch32) | |
160 | { | |
161 | if(ch32 > 0xFFFF) { | |
162 | printf("ch: U+%06X\n", ch32); | |
163 | } | |
164 | else { | |
165 | UChar ch = (UChar)ch32; | |
166 | printUChars("C", &ch, 1); | |
167 | } | |
168 | } | |
169 | ||
170 | /******************************************************************* | |
171 | Very simple C sample to convert the word 'Moscow' in Russian in Unicode, | |
172 | followed by an exclamation mark (!) into the KOI8-R Russian code page. | |
173 | ||
174 | This example first creates a UChar String out of the Unicode chars. | |
175 | ||
176 | targetSize must be set to the amount of space available in the target | |
177 | buffer. After fromUChars is called, | |
178 | len will contain the number of bytes in target[] which were | |
179 | used in the resulting codepage. In this case, there is a 1:1 mapping | |
180 | between the input and output characters. The exclamation mark has the | |
181 | same value in both KOI8-R and Unicode. | |
182 | ||
183 | src: 0 1 2 3 4 5 6 | |
184 | uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021 | |
185 | ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL '!' | |
186 | ||
187 | targ: 0 1 2 3 4 5 6 | |
188 | uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21 | |
189 | ch: '!' | |
190 | ||
191 | ||
192 | Converting FROM unicode | |
193 | to koi8-r. | |
194 | You must call ucnv_close to clean up the memory used by the | |
195 | converter. | |
196 | ||
197 | 'len' returns the number of OUTPUT bytes resulting from the | |
198 | conversion. | |
199 | */ | |
200 | ||
201 | UErrorCode convsample_02() | |
202 | { | |
203 | printf("\n\n==============================================\n" | |
204 | "Sample 02: C: simple Unicode -> koi8-r conversion\n"); | |
205 | ||
206 | ||
207 | // **************************** START SAMPLE ******************* | |
208 | // "cat<cat>OK" | |
209 | UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432, | |
210 | 0x0430, 0x0021, 0x0000 }; | |
211 | char target[100]; | |
212 | UErrorCode status = U_ZERO_ERROR; | |
213 | UConverter *conv; | |
214 | int32_t len; | |
215 | ||
216 | // set up the converter | |
4388f060 | 217 | //! [ucnv_open] |
b75a7d8f | 218 | conv = ucnv_open("koi8-r", &status); |
4388f060 | 219 | //! [ucnv_open] |
b75a7d8f A |
220 | assert(U_SUCCESS(status)); |
221 | ||
222 | // convert to koi8-r | |
223 | len = ucnv_fromUChars(conv, target, 100, source, -1, &status); | |
224 | assert(U_SUCCESS(status)); | |
225 | ||
226 | // close the converter | |
227 | ucnv_close(conv); | |
228 | ||
229 | // ***************************** END SAMPLE ******************** | |
230 | ||
231 | // Print it out | |
232 | printUChars("src", source); | |
233 | printf("\n"); | |
234 | printBytes("targ", target, len); | |
235 | ||
236 | return U_ZERO_ERROR; | |
237 | } | |
238 | ||
239 | ||
240 | UErrorCode convsample_03() | |
241 | { | |
242 | printf("\n\n==============================================\n" | |
243 | "Sample 03: C: print out all converters\n"); | |
244 | ||
245 | int32_t count; | |
246 | int32_t i; | |
247 | ||
248 | // **************************** START SAMPLE ******************* | |
249 | count = ucnv_countAvailable(); | |
250 | printf("Available converters: %d\n", count); | |
251 | ||
252 | for(i=0;i<count;i++) | |
253 | { | |
254 | printf("%s ", ucnv_getAvailableName(i)); | |
255 | } | |
256 | ||
257 | // ***************************** END SAMPLE ******************** | |
258 | ||
259 | printf("\n"); | |
260 | ||
261 | return U_ZERO_ERROR; | |
262 | } | |
263 | ||
264 | ||
265 | ||
266 | #define BUFFERSIZE 17 /* make it interesting :) */ | |
267 | ||
268 | /* | |
269 | Converting from a codepage to Unicode in bulk.. | |
270 | What is the best way to determine the buffer size? | |
271 | ||
272 | The 'buffersize' is in bytes of input. | |
273 | For a given converter, dividing this by the minimum char size | |
274 | gives you the maximum number of Unicode characters that could be | |
275 | expected for a given number of input bytes. | |
276 | see: ucnv_getMinCharSize() | |
277 | ||
278 | For example, a single byte codepage like 'Latin-3' has a | |
279 | minimum char size of 1. (It takes at least 1 byte to represent | |
280 | each Unicode char.) So the unicode buffer has the same number of | |
281 | UChars as the input buffer has bytes. | |
282 | ||
283 | In a strictly double byte codepage such as cp1362 (Windows | |
284 | Korean), the minimum char size is 2. So, only half as many Unicode | |
285 | chars as bytes are needed. | |
286 | ||
287 | This work to calculate the buffer size is an optimization. Any | |
288 | size of input and output buffer can be used, as long as the | |
289 | program handles the following cases: If the input buffer is empty, | |
290 | the source pointer will be equal to sourceLimit. If the output | |
291 | buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned. | |
292 | */ | |
293 | ||
294 | UErrorCode convsample_05() | |
295 | { | |
296 | printf("\n\n==============================================\n" | |
297 | "Sample 05: C: count the number of letters in a UTF-8 document\n"); | |
298 | ||
299 | FILE *f; | |
300 | int32_t count; | |
301 | char inBuf[BUFFERSIZE]; | |
302 | const char *source; | |
303 | const char *sourceLimit; | |
304 | UChar *uBuf; | |
305 | UChar *target; | |
306 | UChar *targetLimit; | |
307 | UChar *p; | |
308 | int32_t uBufSize = 0; | |
309 | UConverter *conv; | |
310 | UErrorCode status = U_ZERO_ERROR; | |
311 | uint32_t letters=0, total=0; | |
312 | ||
313 | f = fopen("data01.txt", "r"); | |
314 | if(!f) | |
315 | { | |
316 | fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n"); | |
317 | return U_FILE_ACCESS_ERROR; | |
318 | } | |
319 | ||
320 | // **************************** START SAMPLE ******************* | |
321 | conv = ucnv_open("utf-8", &status); | |
322 | assert(U_SUCCESS(status)); | |
323 | ||
324 | uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); | |
325 | printf("input bytes %d / min chars %d = %d UChars\n", | |
326 | BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); | |
327 | uBuf = (UChar*)malloc(uBufSize * sizeof(UChar)); | |
328 | assert(uBuf!=NULL); | |
329 | ||
330 | // grab another buffer's worth | |
331 | while((!feof(f)) && | |
332 | ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) ) | |
333 | { | |
334 | // Convert bytes to unicode | |
335 | source = inBuf; | |
336 | sourceLimit = inBuf + count; | |
337 | ||
338 | do | |
339 | { | |
340 | target = uBuf; | |
341 | targetLimit = uBuf + uBufSize; | |
342 | ||
343 | ucnv_toUnicode(conv, &target, targetLimit, | |
344 | &source, sourceLimit, NULL, | |
345 | feof(f)?TRUE:FALSE, /* pass 'flush' when eof */ | |
346 | /* is true (when no more data will come) */ | |
347 | &status); | |
348 | ||
349 | if(status == U_BUFFER_OVERFLOW_ERROR) | |
350 | { | |
351 | // simply ran out of space - we'll reset the target ptr the next | |
352 | // time through the loop. | |
353 | status = U_ZERO_ERROR; | |
354 | } | |
355 | else | |
356 | { | |
357 | // Check other errors here. | |
358 | assert(U_SUCCESS(status)); | |
359 | // Break out of the loop (by force) | |
360 | } | |
361 | ||
362 | // Process the Unicode | |
363 | // Todo: handle UTF-16/surrogates | |
364 | ||
365 | for(p = uBuf; p<target; p++) | |
366 | { | |
367 | if(u_isalpha(*p)) | |
368 | letters++; | |
369 | total++; | |
370 | } | |
371 | } while (source < sourceLimit); // while simply out of space | |
372 | } | |
373 | ||
374 | printf("%d letters out of %d total UChars.\n", letters, total); | |
375 | ||
376 | // ***************************** END SAMPLE ******************** | |
377 | ucnv_close(conv); | |
378 | ||
379 | printf("\n"); | |
380 | ||
729e4ab9 A |
381 | fclose(f); |
382 | ||
b75a7d8f A |
383 | return U_ZERO_ERROR; |
384 | } | |
385 | #undef BUFFERSIZE | |
386 | ||
387 | #define BUFFERSIZE 1024 | |
388 | typedef struct | |
389 | { | |
390 | UChar32 codepoint; | |
391 | uint32_t frequency; | |
392 | } CharFreqInfo; | |
393 | ||
394 | UErrorCode convsample_06() | |
395 | { | |
396 | printf("\n\n==============================================\n" | |
397 | "Sample 06: C: frequency distribution of letters in a UTF-8 document\n"); | |
398 | ||
399 | FILE *f; | |
400 | int32_t count; | |
401 | char inBuf[BUFFERSIZE]; | |
402 | const char *source; | |
403 | const char *sourceLimit; | |
b75a7d8f A |
404 | int32_t uBufSize = 0; |
405 | UConverter *conv; | |
406 | UErrorCode status = U_ZERO_ERROR; | |
407 | uint32_t letters=0, total=0; | |
408 | ||
409 | CharFreqInfo *info; | |
410 | UChar32 charCount = 0x10000; /* increase this if you want to handle non bmp.. todo: automatically bump it.. */ | |
411 | UChar32 p; | |
412 | ||
413 | uint32_t ie = 0; | |
414 | uint32_t gh = 0; | |
415 | UChar32 l = 0; | |
416 | ||
417 | f = fopen("data06.txt", "r"); | |
418 | if(!f) | |
419 | { | |
420 | fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n"); | |
421 | return U_FILE_ACCESS_ERROR; | |
422 | } | |
423 | ||
424 | info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount); | |
425 | if(!info) | |
426 | { | |
427 | fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount); | |
428 | } | |
429 | ||
430 | /* reset frequencies */ | |
431 | for(p=0;p<charCount;p++) | |
432 | { | |
433 | info[p].codepoint = p; | |
434 | info[p].frequency = 0; | |
435 | } | |
436 | ||
437 | // **************************** START SAMPLE ******************* | |
438 | conv = ucnv_open("utf-8", &status); | |
439 | assert(U_SUCCESS(status)); | |
440 | ||
441 | uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); | |
442 | printf("input bytes %d / min chars %d = %d UChars\n", | |
443 | BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); | |
b75a7d8f A |
444 | |
445 | // grab another buffer's worth | |
446 | while((!feof(f)) && | |
447 | ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) ) | |
448 | { | |
449 | // Convert bytes to unicode | |
450 | source = inBuf; | |
451 | sourceLimit = inBuf + count; | |
452 | ||
453 | while(source < sourceLimit) | |
454 | { | |
455 | p = ucnv_getNextUChar(conv, &source, sourceLimit, &status); | |
456 | if(U_FAILURE(status)) | |
457 | { | |
458 | fprintf(stderr, "%s @ %d\n", u_errorName(status), total); | |
459 | status = U_ZERO_ERROR; | |
460 | continue; | |
461 | } | |
462 | U_ASSERT(status); | |
463 | total++; | |
464 | ||
465 | if(u_isalpha(p)) | |
466 | letters++; | |
467 | ||
468 | if((u_tolower(l) == 'i') && (u_tolower(p) == 'e')) | |
469 | ie++; | |
470 | ||
471 | if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127)) | |
472 | gh++; | |
473 | ||
474 | if(p>charCount) | |
475 | { | |
476 | fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p); | |
4388f060 A |
477 | free(info); |
478 | fclose(f); | |
479 | ucnv_close(conv); | |
b75a7d8f A |
480 | return U_UNSUPPORTED_ERROR; |
481 | } | |
482 | info[p].frequency++; | |
483 | l = p; | |
484 | } | |
485 | } | |
486 | ||
487 | fclose(f); | |
488 | ucnv_close(conv); | |
489 | ||
490 | printf("%d letters out of %d total UChars.\n", letters, total); | |
491 | printf("%d ie digraphs, %d gh digraphs.\n", ie, gh); | |
492 | ||
493 | // now, we could sort it.. | |
494 | ||
495 | // qsort(info, charCount, sizeof(info[0]), charfreq_compare); | |
496 | ||
497 | for(p=0;p<charCount;p++) | |
498 | { | |
499 | if(info[p].frequency) | |
500 | { | |
501 | printf("% 5d U+%06X ", info[p].frequency, p); | |
502 | if(p <= 0xFFFF) | |
503 | { | |
504 | prettyPrintUChar((UChar)p); | |
505 | } | |
506 | printf("\n"); | |
507 | } | |
508 | } | |
509 | free(info); | |
510 | // ***************************** END SAMPLE ******************** | |
511 | ||
512 | printf("\n"); | |
513 | ||
514 | return U_ZERO_ERROR; | |
515 | } | |
516 | #undef BUFFERSIZE | |
517 | ||
518 | ||
519 | /****************************************************** | |
520 | You must call ucnv_close to clean up the memory used by the | |
521 | converter. | |
522 | ||
523 | 'len' returns the number of OUTPUT bytes resulting from the | |
524 | conversion. | |
525 | */ | |
526 | ||
527 | UErrorCode convsample_12() | |
528 | { | |
529 | printf("\n\n==============================================\n" | |
530 | "Sample 12: C: simple sjis -> unicode conversion\n"); | |
531 | ||
532 | ||
533 | // **************************** START SAMPLE ******************* | |
534 | ||
535 | char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 }; | |
536 | UChar target[100]; | |
537 | UErrorCode status = U_ZERO_ERROR; | |
538 | UConverter *conv; | |
539 | int32_t len; | |
540 | ||
541 | // set up the converter | |
542 | conv = ucnv_open("shift_jis", &status); | |
543 | assert(U_SUCCESS(status)); | |
544 | ||
545 | // convert to Unicode | |
546 | // Note: we can use strlen, we know it's an 8 bit null terminated codepage | |
547 | target[6] = 0xFDCA; | |
548 | len = ucnv_toUChars(conv, target, 100, source, strlen(source), &status); | |
549 | U_ASSERT(status); | |
550 | // close the converter | |
551 | ucnv_close(conv); | |
552 | ||
553 | // ***************************** END SAMPLE ******************** | |
554 | ||
555 | // Print it out | |
556 | printBytes("src", source, strlen(source) ); | |
557 | printf("\n"); | |
558 | printUChars("targ", target, len); | |
559 | ||
560 | return U_ZERO_ERROR; | |
561 | } | |
562 | ||
563 | /****************************************************************** | |
564 | C: Convert from codepage to Unicode one at a time. | |
565 | */ | |
566 | ||
567 | UErrorCode convsample_13() | |
568 | { | |
569 | printf("\n\n==============================================\n" | |
570 | "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n"); | |
571 | ||
572 | ||
573 | const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e }; | |
574 | // const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e }; | |
575 | const char *source, *sourceLimit; | |
576 | UChar32 target; | |
577 | UErrorCode status = U_ZERO_ERROR; | |
578 | UConverter *conv = NULL; | |
579 | int32_t srcCount=0; | |
580 | int32_t dstCount=0; | |
581 | ||
582 | srcCount = sizeof(sourceChars); | |
583 | ||
584 | conv = ucnv_open("Big5", &status); | |
585 | U_ASSERT(status); | |
586 | ||
587 | source = sourceChars; | |
588 | sourceLimit = sourceChars + sizeof(sourceChars); | |
589 | ||
590 | // **************************** START SAMPLE ******************* | |
591 | ||
592 | ||
593 | printBytes("src",source,sourceLimit-source); | |
594 | ||
595 | while(source < sourceLimit) | |
596 | { | |
597 | puts(""); | |
598 | target = ucnv_getNextUChar (conv, | |
599 | &source, | |
600 | sourceLimit, | |
601 | &status); | |
602 | ||
603 | // printBytes("src",source,sourceLimit-source); | |
604 | U_ASSERT(status); | |
605 | printUChar(target); | |
606 | dstCount++; | |
607 | } | |
608 | ||
609 | ||
610 | // ************************** END SAMPLE ************************* | |
611 | ||
612 | printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount); | |
613 | ucnv_close(conv); | |
614 | ||
615 | return U_ZERO_ERROR; | |
616 | } | |
617 | ||
618 | ||
619 | ||
620 | ||
621 | UBool convsample_20_didSubstitute(const char *source) | |
622 | { | |
623 | UChar uchars[100]; | |
624 | char bytes[100]; | |
625 | UConverter *conv = NULL; | |
626 | UErrorCode status = U_ZERO_ERROR; | |
627 | uint32_t len, len2; | |
628 | UBool flagVal; | |
629 | ||
630 | FromUFLAGContext * context = NULL; | |
631 | ||
632 | printf("\n\n==============================================\n" | |
633 | "Sample 20: C: Test for substitution using callbacks\n"); | |
634 | ||
635 | /* print out the original source */ | |
636 | printBytes("src", source); | |
637 | printf("\n"); | |
638 | ||
639 | /* First, convert from UTF8 to unicode */ | |
640 | conv = ucnv_open("utf-8", &status); | |
641 | U_ASSERT(status); | |
642 | ||
643 | len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status); | |
644 | U_ASSERT(status); | |
645 | ||
646 | printUChars("uch", uchars, len); | |
647 | printf("\n"); | |
648 | ||
649 | /* Now, close the converter */ | |
650 | ucnv_close(conv); | |
651 | ||
652 | /* Now, convert to windows-1252 */ | |
653 | conv = ucnv_open("windows-1252", &status); | |
654 | U_ASSERT(status); | |
655 | ||
656 | /* Converter starts out with the SUBSTITUTE callback set. */ | |
657 | ||
658 | /* initialize our callback */ | |
659 | context = flagCB_fromU_openContext(); | |
660 | ||
661 | /* Set our special callback */ | |
662 | ucnv_setFromUCallBack(conv, | |
663 | flagCB_fromU, | |
664 | context, | |
665 | &(context->subCallback), | |
666 | &(context->subContext), | |
667 | &status); | |
668 | ||
669 | U_ASSERT(status); | |
670 | ||
671 | len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status); | |
672 | U_ASSERT(status); | |
673 | ||
674 | flagVal = context->flag; /* it's about to go away when we close the cnv */ | |
675 | ||
676 | ucnv_close(conv); | |
677 | ||
678 | /* print out the original source */ | |
679 | printBytes("bytes", bytes, len2); | |
680 | ||
681 | return flagVal; /* true if callback was called */ | |
682 | } | |
683 | ||
684 | UErrorCode convsample_20() | |
685 | { | |
686 | const char *sample1 = "abc\xdf\xbf"; | |
687 | const char *sample2 = "abc_def"; | |
688 | ||
689 | ||
690 | if(convsample_20_didSubstitute(sample1)) | |
691 | { | |
692 | printf("DID substitute.\n******\n"); | |
693 | } | |
694 | else | |
695 | { | |
696 | printf("Did NOT substitute.\n*****\n"); | |
697 | } | |
698 | ||
699 | if(convsample_20_didSubstitute(sample2)) | |
700 | { | |
701 | printf("DID substitute.\n******\n"); | |
702 | } | |
703 | else | |
704 | { | |
705 | printf("Did NOT substitute.\n*****\n"); | |
706 | } | |
707 | ||
708 | return U_ZERO_ERROR; | |
709 | } | |
710 | ||
711 | // 21 - C, callback, with clone and debug | |
712 | ||
713 | ||
714 | ||
/*
 * Helper for sample 21: like the sample 20 helper, but with a chain of three
 * callbacks (debug -> flag -> debug) installed on the windows-1252 converter,
 * and with the actual conversion performed by a clone of that converter
 * after the original has been closed.
 *
 * Prints the source bytes, the intermediate UChars, the addresses of the
 * cloned callback chain and the resulting bytes.
 * Returns the 'flag' member of the clone's flag context, or FALSE when that
 * context could not be retrieved from the clone.
 */
UBool convsample_21_didSubstitute(const char *source)
{
    UChar uchars[100];
    char bytes[100];
    UConverter *conv = NULL, *cloneCnv = NULL;
    UErrorCode status = U_ZERO_ERROR;
    uint32_t len, len2;
    int32_t cloneLen;           /* not used in this function */
    UBool flagVal = FALSE;
    UConverterFromUCallback junkCB;   /* receives the clone's callback pointer; value is ignored */

    FromUFLAGContext *flagCtx = NULL,
                     *cloneFlagCtx = NULL;

    debugCBContext *debugCtx1 = NULL,
                   *debugCtx2 = NULL,
                   *cloneDebugCtx = NULL;

    printf("\n\n==============================================\n"
           "Sample 21: C: Test for substitution w/ callbacks & clones \n");

    /* print out the original source */
    printBytes("src", source);
    printf("\n");

    /* First, convert from UTF8 to unicode */
    conv = ucnv_open("utf-8", &status);
    U_ASSERT(status);

    len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
    U_ASSERT(status);

    printUChars("uch", uchars, len);
    printf("\n");

    /* Now, close the converter */
    ucnv_close(conv);

    /* Now, convert to windows-1252 */
    conv = ucnv_open("windows-1252", &status);
    U_ASSERT(status);

    /* Converter starts out with the SUBSTITUTE callback set. */

    /* initialize our callback */
    /* from the 'bottom' innermost, out
     * CNV -> debugCtx1[debug] -> flagCtx[flag] -> debugCtx2[debug] */

#if DEBUG_TMI
    printf("flagCB_fromU = %p\n", &flagCB_fromU);
    printf("debugCB_fromU = %p\n", &debugCB_fromU);
#endif

    debugCtx1 = debugCB_openContext();
    flagCtx = flagCB_fromU_openContext();
    debugCtx2 = debugCB_openContext();

    /* Link the three contexts into one chain by hand. */
    debugCtx1->subCallback = flagCB_fromU; /* debug1 -> flag */
    debugCtx1->subContext = flagCtx;

    flagCtx->subCallback = debugCB_fromU; /* flag -> debug2 */
    flagCtx->subContext = debugCtx2;

    debugCtx2->subCallback = UCNV_FROM_U_CALLBACK_SUBSTITUTE;
    debugCtx2->subContext = NULL;

    /* Set our special callback */

    /* The head of the chain (debugCtx1) becomes the converter's callback.
     * The last two pointer arguments are where the converter's previous
     * callback and context get stored, i.e. into the tail of the chain
     * (debugCtx2), overwriting the values assigned just above. */
    ucnv_setFromUCallBack(conv,
                          debugCB_fromU,
                          debugCtx1,
                          &(debugCtx2->subCallback),
                          &(debugCtx2->subContext),
                          &status);

    U_ASSERT(status);

#if DEBUG_TMI
    printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n",
           conv, debugCtx1, debugCtx1->subCallback,
           debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback);
#endif

    /* Clone the fully configured converter; only the clone is used from here on. */
    cloneCnv = ucnv_safeClone(conv, NULL, NULL, &status);

    U_ASSERT(status);

#if DEBUG_TMI
    printf("Cloned converter from %p -> %p. Closing %p.\n", conv, cloneCnv, conv);
#endif

    ucnv_close(conv);

#if DEBUG_TMI
    printf("%p closed.\n", conv);
#endif

    U_ASSERT(status);
    /* Now, we have to extract the context */
    cloneDebugCtx = NULL;
    cloneFlagCtx = NULL;

    /* Ask the clone for its callback context (head of its chain), then step
     * one link down to reach its flag context. */
    ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx);
    if(cloneDebugCtx != NULL) {
        cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext;
    }

    printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n",
           cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL );

    len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status);
    U_ASSERT(status);

    if(cloneFlagCtx != NULL) {
        flagVal = cloneFlagCtx->flag; /* it's about to go away when we close the cnv */
    } else {
        printf("** Warning, couldn't get the subcallback \n");
    }

    ucnv_close(cloneCnv);

    /* print out the converted bytes */
    printBytes("bytes", bytes, len2);

    return flagVal; /* true if callback was called */
}
841 | ||
842 | UErrorCode convsample_21() | |
843 | { | |
844 | const char *sample1 = "abc\xdf\xbf"; | |
845 | const char *sample2 = "abc_def"; | |
846 | ||
847 | if(convsample_21_didSubstitute(sample1)) | |
848 | { | |
849 | printf("DID substitute.\n******\n"); | |
850 | } | |
851 | else | |
852 | { | |
853 | printf("Did NOT substitute.\n*****\n"); | |
854 | } | |
855 | ||
856 | if(convsample_21_didSubstitute(sample2)) | |
857 | { | |
858 | printf("DID substitute.\n******\n"); | |
859 | } | |
860 | else | |
861 | { | |
862 | printf("Did NOT substitute.\n*****\n"); | |
863 | } | |
864 | ||
865 | return U_ZERO_ERROR; | |
866 | } | |
867 | ||
868 | ||
869 | // 40- C, cp37 -> UTF16 [data02.bin -> data40.utf16] | |
870 | ||
871 | #define BUFFERSIZE 17 /* make it interesting :) */ | |
872 | ||
873 | UErrorCode convsample_40() | |
874 | { | |
875 | printf("\n\n==============================================\n" | |
876 | "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n"); | |
877 | ||
878 | FILE *f; | |
879 | FILE *out; | |
880 | int32_t count; | |
881 | char inBuf[BUFFERSIZE]; | |
882 | const char *source; | |
883 | const char *sourceLimit; | |
884 | UChar *uBuf; | |
885 | UChar *target; | |
886 | UChar *targetLimit; | |
887 | int32_t uBufSize = 0; | |
888 | UConverter *conv = NULL; | |
889 | UErrorCode status = U_ZERO_ERROR; | |
890 | uint32_t inbytes=0, total=0; | |
891 | ||
892 | f = fopen("data02.bin", "rb"); | |
893 | if(!f) | |
894 | { | |
895 | fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n"); | |
896 | return U_FILE_ACCESS_ERROR; | |
897 | } | |
898 | ||
899 | out = fopen("data40.utf16", "wb"); | |
900 | if(!out) | |
901 | { | |
902 | fprintf(stderr, "Couldn't create file 'data40.utf16'.\n"); | |
729e4ab9 | 903 | fclose(f); |
b75a7d8f A |
904 | return U_FILE_ACCESS_ERROR; |
905 | } | |
906 | ||
907 | // **************************** START SAMPLE ******************* | |
908 | conv = ucnv_openCCSID(37, UCNV_IBM, &status); | |
909 | assert(U_SUCCESS(status)); | |
910 | ||
911 | uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); | |
912 | printf("input bytes %d / min chars %d = %d UChars\n", | |
913 | BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); | |
914 | uBuf = (UChar*)malloc(uBufSize * sizeof(UChar)); | |
915 | assert(uBuf!=NULL); | |
916 | ||
917 | // grab another buffer's worth | |
918 | while((!feof(f)) && | |
919 | ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) ) | |
920 | { | |
921 | inbytes += count; | |
922 | ||
923 | // Convert bytes to unicode | |
924 | source = inBuf; | |
925 | sourceLimit = inBuf + count; | |
926 | ||
927 | do | |
928 | { | |
929 | target = uBuf; | |
930 | targetLimit = uBuf + uBufSize; | |
931 | ||
932 | ucnv_toUnicode( conv, &target, targetLimit, | |
933 | &source, sourceLimit, NULL, | |
934 | feof(f)?TRUE:FALSE, /* pass 'flush' when eof */ | |
935 | /* is true (when no more data will come) */ | |
936 | &status); | |
937 | ||
938 | if(status == U_BUFFER_OVERFLOW_ERROR) | |
939 | { | |
940 | // simply ran out of space - we'll reset the target ptr the next | |
941 | // time through the loop. | |
942 | status = U_ZERO_ERROR; | |
943 | } | |
944 | else | |
945 | { | |
946 | // Check other errors here. | |
947 | assert(U_SUCCESS(status)); | |
948 | // Break out of the loop (by force) | |
949 | } | |
950 | ||
951 | // Process the Unicode | |
952 | // Todo: handle UTF-16/surrogates | |
953 | assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) == | |
954 | (size_t)(target-uBuf)); | |
955 | total += (target-uBuf); | |
956 | } while (source < sourceLimit); // while simply out of space | |
957 | } | |
958 | ||
959 | printf("%d bytes in, %d UChars out.\n", inbytes, total); | |
960 | ||
961 | // ***************************** END SAMPLE ******************** | |
962 | ucnv_close(conv); | |
963 | ||
964 | fclose(f); | |
965 | fclose(out); | |
966 | printf("\n"); | |
967 | ||
968 | return U_ZERO_ERROR; | |
969 | } | |
970 | #undef BUFFERSIZE | |
971 | ||
972 | ||
973 | ||
974 | // 46- C, UTF16 -> latin2 [data40.utf16 -> data46.out] | |
975 | ||
976 | #define BUFFERSIZE 24 /* make it interesting :) */ | |
977 | ||
978 | UErrorCode convsample_46() | |
979 | { | |
980 | printf("\n\n==============================================\n" | |
981 | "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n"); | |
982 | ||
983 | FILE *f; | |
984 | FILE *out; | |
985 | int32_t count; | |
986 | UChar inBuf[BUFFERSIZE]; | |
987 | const UChar *source; | |
988 | const UChar *sourceLimit; | |
989 | char *buf; | |
990 | char *target; | |
991 | char *targetLimit; | |
992 | ||
993 | int32_t bufSize = 0; | |
994 | UConverter *conv = NULL; | |
995 | UErrorCode status = U_ZERO_ERROR; | |
996 | uint32_t inchars=0, total=0; | |
997 | ||
998 | f = fopen("data40.utf16", "rb"); | |
999 | if(!f) | |
1000 | { | |
1001 | fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n"); | |
1002 | return U_FILE_ACCESS_ERROR; | |
1003 | } | |
1004 | ||
1005 | out = fopen("data46.out", "wb"); | |
1006 | if(!out) | |
1007 | { | |
1008 | fprintf(stderr, "Couldn't create file 'data46.out'.\n"); | |
729e4ab9 | 1009 | fclose(f); |
b75a7d8f A |
1010 | return U_FILE_ACCESS_ERROR; |
1011 | } | |
1012 | ||
1013 | // **************************** START SAMPLE ******************* | |
1014 | conv = ucnv_open( "iso-8859-2", &status); | |
1015 | assert(U_SUCCESS(status)); | |
1016 | ||
1017 | bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv)); | |
1018 | printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n", | |
1019 | BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize); | |
1020 | buf = (char*)malloc(bufSize * sizeof(char)); | |
1021 | assert(buf!=NULL); | |
1022 | ||
1023 | // grab another buffer's worth | |
1024 | while((!feof(f)) && | |
1025 | ((count=fread(inBuf, sizeof(UChar), BUFFERSIZE , f)) > 0) ) | |
1026 | { | |
1027 | inchars += count; | |
1028 | ||
1029 | // Convert bytes to unicode | |
1030 | source = inBuf; | |
1031 | sourceLimit = inBuf + count; | |
1032 | ||
1033 | do | |
1034 | { | |
1035 | target = buf; | |
1036 | targetLimit = buf + bufSize; | |
1037 | ||
1038 | ucnv_fromUnicode( conv, &target, targetLimit, | |
1039 | &source, sourceLimit, NULL, | |
1040 | feof(f)?TRUE:FALSE, /* pass 'flush' when eof */ | |
1041 | /* is true (when no more data will come) */ | |
1042 | &status); | |
1043 | ||
1044 | if(status == U_BUFFER_OVERFLOW_ERROR) | |
1045 | { | |
1046 | // simply ran out of space - we'll reset the target ptr the next | |
1047 | // time through the loop. | |
1048 | status = U_ZERO_ERROR; | |
1049 | } | |
1050 | else | |
1051 | { | |
1052 | // Check other errors here. | |
1053 | assert(U_SUCCESS(status)); | |
1054 | // Break out of the loop (by force) | |
1055 | } | |
1056 | ||
1057 | // Process the Unicode | |
1058 | assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) == | |
1059 | (size_t)(target-buf)); | |
1060 | total += (target-buf); | |
1061 | } while (source < sourceLimit); // while simply out of space | |
1062 | } | |
1063 | ||
1064 | printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, inchars * sizeof(UChar), total); | |
1065 | ||
1066 | // ***************************** END SAMPLE ******************** | |
1067 | ucnv_close(conv); | |
1068 | ||
1069 | fclose(f); | |
1070 | fclose(out); | |
1071 | printf("\n"); | |
1072 | ||
1073 | return U_ZERO_ERROR; | |
1074 | } | |
1075 | #undef BUFFERSIZE | |
1076 | ||
1077 | #define BUFFERSIZE 219 | |
1078 | ||
4388f060 A |
1079 | void convsample_50() { |
1080 | printf("\n\n==============================================\n" | |
1081 | "Sample 50: C: ucnv_detectUnicodeSignature\n"); | |
1082 | ||
1083 | //! [ucnv_detectUnicodeSignature] | |
1084 | UErrorCode err = U_ZERO_ERROR; | |
1085 | UBool discardSignature = TRUE; /* set to TRUE to throw away the initial U+FEFF */ | |
1086 | char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' }; | |
1087 | int32_t signatureLength = 0; | |
1088 | const char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err); | |
1089 | UConverter *conv = NULL; | |
1090 | UChar output[100]; | |
1091 | UChar *target = output, *out; | |
1092 | const char *source = input; | |
1093 | if(encoding!=NULL && U_SUCCESS(err)){ | |
1094 | // should signature be discarded ? | |
1095 | conv = ucnv_open(encoding, &err); | |
1096 | // do the conversion | |
1097 | ucnv_toUnicode(conv, | |
2ca993e8 | 1098 | &target, output + UPRV_LENGTHOF(output), |
4388f060 A |
1099 | &source, input + sizeof(input), |
1100 | NULL, TRUE, &err); | |
1101 | out = output; | |
1102 | if (discardSignature){ | |
1103 | ++out; // ignore initial U+FEFF | |
1104 | } | |
1105 | while(out != target) { | |
1106 | printf("%04x ", *out++); | |
1107 | } | |
1108 | puts(""); | |
1109 | } | |
1110 | //! [ucnv_detectUnicodeSignature] | |
1111 | puts(""); | |
1112 | } | |
1113 | ||
1114 | ||
b75a7d8f A |
1115 | |
1116 | /* main */ | |
1117 | ||
1118 | int main() | |
1119 | { | |
1120 | ||
1121 | printf("Default Converter=%s\n", ucnv_getDefaultName() ); | |
1122 | ||
1123 | convsample_02(); // C , u->koi8r, conv | |
1124 | convsample_03(); // C, iterate | |
1125 | ||
1126 | convsample_05(); // C, utf8->u, getNextUChar | |
1127 | convsample_06(); // C freq counter thingy | |
1128 | ||
1129 | convsample_12(); // C, sjis->u, conv | |
1130 | convsample_13(); // C, big5->u, getNextU | |
1131 | ||
1132 | convsample_20(); // C, callback | |
1133 | convsample_21(); // C, callback debug | |
1134 | ||
1135 | convsample_40(); // C, cp37 -> UTF16 [data02.bin -> data40.utf16] | |
1136 | ||
1137 | convsample_46(); // C, UTF16 -> latin3 [data41.utf16 -> data46.out] | |
4388f060 A |
1138 | |
1139 | convsample_50(); // C, detect unicode signature | |
b75a7d8f A |
1140 | |
1141 | printf("End of converter samples.\n"); | |
1142 | ||
1143 | fflush(stdout); | |
1144 | fflush(stderr); | |
1145 | ||
1146 | return 0; | |
1147 | } |