icuSources/samples/ucnv/convsamp.cpp

   1 /**************************************************************************
   2 *
   3 *   Copyright (C) 2000-2011, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 *
   6 ***************************************************************************
   7 *   file name:  convsamp.c
   8 *   encoding:   ASCII (7-bit)
   9 *
  10 *   created on: 2000may30
  11 *   created by: Steven R. Loomis
  12 *
  13 *   Sample code for the ICU conversion routines.
  14 *
  15 * Note: Nothing special is needed to build this sample. Link with
  16 *       the icu UC and icu I18N libraries.
  17 *
  18 *       I use 'assert' for error checking, you probably will want
  19 *       something more flexible.  '***BEGIN SAMPLE***' and
  20 *       '***END SAMPLE***' mark pieces suitable for stand alone
  21 *       code snippets.
  22 *
  23 *
   24 *  Each test can define its own BUFFERSIZE
  25 *
  26 */
  27
  28 #define DEBUG_TMI 0  /* define to 1 to enable Too Much Information */
  29
  30 #include <stdio.h>
  31 #include <ctype.h>            /* for isspace, etc.    */
  32 #include <assert.h>
  33 #include <string.h>
  34 #include <stdlib.h>  /* malloc */
  35
  36 #include "unicode/utypes.h"   /* Basic ICU data types */
  37 #include "unicode/ucnv.h"     /* C   Converter API    */
  38 #include "unicode/ustring.h"  /* some more string fcns*/
  39 #include "unicode/uchar.h"    /* char names           */
  40 #include "unicode/uloc.h"
  41 #include "unicode/unistr.h"
  42
  43 #include "flagcb.h"
  44
  45 /* Some utility functions */
  46
  47 static const UChar kNone[] = { 0x0000 };
  48
  49 #define U_ASSERT(x)  { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }}
  50
  51 /* Print a UChar if possible, in seven characters. */
  52 void prettyPrintUChar(UChar c)
  53 {
  54   if(  (c <= 0x007F) &&
  55        (isgraph(c))  ) {
  56     printf(" '%c'   ", (char)(0x00FF&c));
  57   } else if ( c > 0x007F ) {
  58     char buf[1000];
  59     UErrorCode status = U_ZERO_ERROR;
  60     int32_t o;
  61
  62     o = u_charName(c, U_EXTENDED_CHAR_NAME, buf, 1000, &status);
  63     if(U_SUCCESS(status) && (o>0) ) {
  64       buf[6] = 0;
  65       printf("%7s", buf);
  66     } else {
  67       printf(" ??????");
  68     }
  69   } else {
  70     switch((char)(c & 0x007F)) {
  71     case ' ':
  72       printf(" ' '   ");
  73       break;
  74     case '\t':
  75       printf(" \\t    ");
  76       break;
  77     case '\n':
  78       printf(" \\n    ");
  79       break;
  80     default:
  81       printf("  _    ");
  82       break;
  83     }
  84   }
  85 }
  86
  87
  88 void printUChars(const char  *name = "?",
  89                  const UChar *uch  = kNone,
  90                  int32_t     len   = -1 )
  91 {
  92   int32_t i;
  93
  94   if( (len == -1) && (uch) ) {
  95     len = u_strlen(uch);
  96   }
  97
  98   printf("%5s: ", name);
  99   for( i = 0; i <len; i++) {
 100     printf("%-6d ", i);
 101   }
 102   printf("\n");
 103
 104   printf("%5s: ", "uni");
 105   for( i = 0; i <len; i++) {
 106     printf("\\u%04X ", (int)uch[i]);
 107   }
 108   printf("\n");
 109
 110   printf("%5s:", "ch");
 111   for( i = 0; i <len; i++) {
 112     prettyPrintUChar(uch[i]);
 113   }
 114   printf("\n");
 115 }
 116
 117 void printBytes(const char  *name = "?",
 118                  const char *uch  = "",
 119                  int32_t     len   = -1 )
 120 {
 121   int32_t i;
 122
 123   if( (len == -1) && (uch) ) {
 124     len = strlen(uch);
 125   }
 126
 127   printf("%5s: ", name);
 128   for( i = 0; i <len; i++) {
 129     printf("%-4d ", i);
 130   }
 131   printf("\n");
 132
 133   printf("%5s: ", "uni");
 134   for( i = 0; i <len; i++) {
 135     printf("\\x%02X ", 0x00FF & (int)uch[i]);
 136   }
 137   printf("\n");
 138
 139   printf("%5s:", "ch");
 140   for( i = 0; i <len; i++) {
 141     if(isgraph(0x00FF & (int)uch[i])) {
 142       printf(" '%c' ", (char)uch[i]);
 143     } else {
 144       printf("     ");
 145     }
 146   }
 147   printf("\n");
 148 }
 149
 150 void printUChar(UChar32 ch32)
 151 {
 152     if(ch32 > 0xFFFF) {
 153       printf("ch: U+%06X\n", ch32);
 154     }
 155     else {
 156       UChar ch = (UChar)ch32;
 157       printUChars("C", &ch, 1);
 158     }
 159 }
 160
 161 /*******************************************************************
 162   Very simple C sample to convert the word 'Moscow' in Russian in Unicode,
 163   followed by an exclamation mark (!) into the KOI8-R Russian code page.
 164
 165   This example first creates a UChar String out of the Unicode chars.
 166
 167   targetSize must be set to the amount of space available in the target
 168   buffer. After fromUChars is called,
 169   len will contain the number of bytes in target[] which were
 170   used in the resulting codepage.  In this case, there is a 1:1 mapping
 171   between the input and output characters. The exclamation mark has the
 172   same value in both KOI8-R and Unicode.
 173
 174   src: 0      1      2      3      4      5      6
 175   uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021
 176    ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL   '!'
 177
 178  targ:  0    1    2    3    4    5    6
 179   uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21
 180    ch:                                '!'
 181
 182
 183 Converting FROM unicode
 184   to koi8-r.
 185   You must call ucnv_close to clean up the memory used by the
 186   converter.
 187
 188   'len' returns the number of OUTPUT bytes resulting from the
 189   conversion.
 190  */
 191
 192 UErrorCode convsample_02()
 193 {
 194   printf("\n\n==============================================\n"
 195          "Sample 02: C: simple Unicode -> koi8-r conversion\n");
 196
 197
 198   // **************************** START SAMPLE *******************
 199   // "cat<cat>OK"
 200   UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432,
 201                      0x0430, 0x0021, 0x0000 };
 202   char target[100];
 203   UErrorCode status = U_ZERO_ERROR;
 204   UConverter *conv;
 205   int32_t     len;
 206
 207   // set up the converter
 208   //! [ucnv_open]
 209   conv = ucnv_open("koi8-r", &status);
 210   //! [ucnv_open]
 211   assert(U_SUCCESS(status));
 212
 213   // convert to koi8-r
 214   len = ucnv_fromUChars(conv, target, 100, source, -1, &status);
 215   assert(U_SUCCESS(status));
 216
 217   // close the converter
 218   ucnv_close(conv);
 219
 220   // ***************************** END SAMPLE ********************
 221
 222   // Print it out
 223   printUChars("src", source);
 224   printf("\n");
 225   printBytes("targ", target, len);
 226
 227   return U_ZERO_ERROR;
 228 }
 229
 230
 231 UErrorCode convsample_03()
 232 {
 233   printf("\n\n==============================================\n"
 234          "Sample 03: C: print out all converters\n");
 235
 236   int32_t count;
 237   int32_t i;
 238
 239   // **************************** START SAMPLE *******************
 240   count = ucnv_countAvailable();
 241   printf("Available converters: %d\n", count);
 242
 243   for(i=0;i<count;i++)
 244   {
 245     printf("%s ", ucnv_getAvailableName(i));
 246   }
 247
 248   // ***************************** END SAMPLE ********************
 249
 250   printf("\n");
 251
 252   return U_ZERO_ERROR;
 253 }
 254
 255
 256
 257 #define BUFFERSIZE 17 /* make it interesting :) */
 258
 259 /*
 260   Converting from a codepage to Unicode in bulk..
 261   What is the best way to determine the buffer size?
 262
 263      The 'buffersize' is in bytes of input.
  264     For a given converter, dividing this by the minimum char size
  265     gives you the maximum number of Unicode characters that could be
 266     expected for a given number of input bytes.
 267      see: ucnv_getMinCharSize()
 268
 269      For example, a single byte codepage like 'Latin-3' has a
 270     minimum char size of 1. (It takes at least 1 byte to represent
 271     each Unicode char.) So the unicode buffer has the same number of
 272     UChars as the input buffer has bytes.
 273
 274      In a strictly double byte codepage such as cp1362 (Windows
 275     Korean), the minimum char size is 2. So, only half as many Unicode
 276     chars as bytes are needed.
 277
 278      This work to calculate the buffer size is an optimization. Any
 279     size of input and output buffer can be used, as long as the
 280     program handles the following cases: If the input buffer is empty,
 281     the source pointer will be equal to sourceLimit.  If the output
 282     buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned.
 283  */
 284
 285 UErrorCode convsample_05()
 286 {
 287   printf("\n\n==============================================\n"
 288          "Sample 05: C: count the number of letters in a UTF-8 document\n");
 289
 290   FILE *f;
 291   int32_t count;
 292   char inBuf[BUFFERSIZE];
 293   const char *source;
 294   const char *sourceLimit;
 295   UChar *uBuf;
 296   UChar *target;
 297   UChar *targetLimit;
 298   UChar *p;
 299   int32_t uBufSize = 0;
 300   UConverter *conv;
 301   UErrorCode status = U_ZERO_ERROR;
 302   uint32_t letters=0, total=0;
 303
 304   f = fopen("data01.txt", "r");
 305   if(!f)
 306   {
 307     fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n");
 308     return U_FILE_ACCESS_ERROR;
 309   }
 310
 311   // **************************** START SAMPLE *******************
 312   conv = ucnv_open("utf-8", &status);
 313   assert(U_SUCCESS(status));
 314
 315   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
 316   printf("input bytes %d / min chars %d = %d UChars\n",
 317          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
 318   uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
 319   assert(uBuf!=NULL);
 320
 321   // grab another buffer's worth
 322   while((!feof(f)) &&
 323         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
 324   {
 325     // Convert bytes to unicode
 326     source = inBuf;
 327     sourceLimit = inBuf + count;
 328
 329     do
 330     {
 331         target = uBuf;
 332         targetLimit = uBuf + uBufSize;
 333
 334         ucnv_toUnicode(conv, &target, targetLimit,
 335                        &source, sourceLimit, NULL,
 336                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
 337                                    /* is true (when no more data will come) */
 338                        &status);
 339
 340         if(status == U_BUFFER_OVERFLOW_ERROR)
 341         {
 342           // simply ran out of space - we'll reset the target ptr the next
 343           // time through the loop.
 344           status = U_ZERO_ERROR;
 345         }
 346         else
 347         {
 348           //  Check other errors here.
 349           assert(U_SUCCESS(status));
 350           // Break out of the loop (by force)
 351         }
 352
 353         // Process the Unicode
 354         // Todo: handle UTF-16/surrogates
 355
 356         for(p = uBuf; p<target; p++)
 357         {
 358           if(u_isalpha(*p))
 359             letters++;
 360           total++;
 361         }
 362     } while (source < sourceLimit); // while simply out of space
 363   }
 364
 365   printf("%d letters out of %d total UChars.\n", letters, total);
 366
 367   // ***************************** END SAMPLE ********************
 368   ucnv_close(conv);
 369
 370   printf("\n");
 371
 372   fclose(f);
 373
 374   return U_ZERO_ERROR;
 375 }
 376 #undef BUFFERSIZE
 377
 378 #define BUFFERSIZE 1024
 379 typedef struct
 380 {
 381   UChar32  codepoint;
 382   uint32_t frequency;
 383 } CharFreqInfo;
 384
 385 UErrorCode convsample_06()
 386 {
 387   printf("\n\n==============================================\n"
 388          "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");
 389
 390   FILE *f;
 391   int32_t count;
 392   char inBuf[BUFFERSIZE];
 393   const char *source;
 394   const char *sourceLimit;
 395   int32_t uBufSize = 0;
 396   UConverter *conv;
 397   UErrorCode status = U_ZERO_ERROR;
 398   uint32_t letters=0, total=0;
 399
 400   CharFreqInfo   *info;
 401   UChar32   charCount = 0x10000;  /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
 402   UChar32   p;
 403
 404   uint32_t ie = 0;
 405   uint32_t gh = 0;
 406   UChar32 l = 0;
 407
 408   f = fopen("data06.txt", "r");
 409   if(!f)
 410   {
 411     fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
 412     return U_FILE_ACCESS_ERROR;
 413   }
 414
 415   info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount);
 416   if(!info)
 417   {
 418     fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount);
 419   }
 420
 421   /* reset frequencies */
 422   for(p=0;p<charCount;p++)
 423   {
 424     info[p].codepoint = p;
 425     info[p].frequency = 0;
 426   }
 427
 428   // **************************** START SAMPLE *******************
 429   conv = ucnv_open("utf-8", &status);
 430   assert(U_SUCCESS(status));
 431
 432   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
 433   printf("input bytes %d / min chars %d = %d UChars\n",
 434          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
 435
 436   // grab another buffer's worth
 437   while((!feof(f)) &&
 438         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
 439   {
 440     // Convert bytes to unicode
 441     source = inBuf;
 442     sourceLimit = inBuf + count;
 443
 444     while(source < sourceLimit)
 445     {
 446       p = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
 447       if(U_FAILURE(status))
 448       {
 449         fprintf(stderr, "%s @ %d\n", u_errorName(status), total);
 450         status = U_ZERO_ERROR;
 451         continue;
 452       }
 453       U_ASSERT(status);
 454       total++;
 455
 456       if(u_isalpha(p))
 457         letters++;
 458
 459       if((u_tolower(l) == 'i') && (u_tolower(p) == 'e'))
 460         ie++;
 461
 462       if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127))
 463         gh++;
 464
 465       if(p>charCount)
 466       {
 467         fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p);
 468         free(info);
 469         fclose(f);
 470         ucnv_close(conv);
 471         return U_UNSUPPORTED_ERROR;
 472       }
 473       info[p].frequency++;
 474       l = p;
 475     }
 476   }
 477
 478   fclose(f);
 479   ucnv_close(conv);
 480
 481   printf("%d letters out of %d total UChars.\n", letters, total);
 482   printf("%d ie digraphs, %d gh digraphs.\n", ie, gh);
 483
 484   // now, we could sort it..
 485
 486   //  qsort(info, charCount, sizeof(info[0]), charfreq_compare);
 487
 488   for(p=0;p<charCount;p++)
 489   {
 490     if(info[p].frequency)
 491     {
 492       printf("% 5d U+%06X ", info[p].frequency, p);
 493       if(p <= 0xFFFF)
 494       {
 495         prettyPrintUChar((UChar)p);
 496       }
 497       printf("\n");
 498     }
 499   }
 500   free(info);
 501   // ***************************** END SAMPLE ********************
 502
 503   printf("\n");
 504
 505   return U_ZERO_ERROR;
 506 }
 507 #undef BUFFERSIZE
 508
 509
 510 /******************************************************
 511   You must call ucnv_close to clean up the memory used by the
 512   converter.
 513
  514   'len' returns the number of OUTPUT UChars resulting from the
  515   conversion.
 516  */
 517
 518 UErrorCode convsample_12()
 519 {
 520   printf("\n\n==============================================\n"
 521          "Sample 12: C: simple sjis -> unicode conversion\n");
 522
 523
 524   // **************************** START SAMPLE *******************
 525
 526   char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 };
 527   UChar target[100];
 528   UErrorCode status = U_ZERO_ERROR;
 529   UConverter *conv;
 530   int32_t     len;
 531
 532   // set up the converter
 533   conv = ucnv_open("shift_jis", &status);
 534   assert(U_SUCCESS(status));
 535
 536   // convert to Unicode
 537   // Note: we can use strlen, we know it's an 8 bit null terminated codepage
 538   target[6] = 0xFDCA;
 539   len = ucnv_toUChars(conv, target, 100, source, strlen(source), &status);
 540   U_ASSERT(status);
 541   // close the converter
 542   ucnv_close(conv);
 543
 544   // ***************************** END SAMPLE ********************
 545
 546   // Print it out
 547   printBytes("src", source, strlen(source) );
 548   printf("\n");
 549   printUChars("targ", target, len);
 550
 551   return U_ZERO_ERROR;
 552 }
 553
 554 /******************************************************************
 555    C: Convert from codepage to Unicode one at a time.
 556 */
 557
 558 UErrorCode convsample_13()
 559 {
 560   printf("\n\n==============================================\n"
 561          "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n");
 562
 563
 564   const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e };
 565   //  const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e };
 566   const char *source, *sourceLimit;
 567   UChar32 target;
 568   UErrorCode status = U_ZERO_ERROR;
 569   UConverter *conv = NULL;
 570   int32_t srcCount=0;
 571   int32_t dstCount=0;
 572
 573   srcCount = sizeof(sourceChars);
 574
 575   conv = ucnv_open("Big5", &status);
 576   U_ASSERT(status);
 577
 578   source = sourceChars;
 579   sourceLimit = sourceChars + sizeof(sourceChars);
 580
 581   // **************************** START SAMPLE *******************
 582
 583
 584   printBytes("src",source,sourceLimit-source);
 585
 586   while(source < sourceLimit)
 587   {
 588     puts("");
 589     target = ucnv_getNextUChar (conv,
 590                                 &source,
 591                                 sourceLimit,
 592                                 &status);
 593
 594     //    printBytes("src",source,sourceLimit-source);
 595     U_ASSERT(status);
 596     printUChar(target);
 597     dstCount++;
 598   }
 599
 600
 601   // ************************** END SAMPLE *************************
 602
 603   printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount);
 604   ucnv_close(conv);
 605
 606   return U_ZERO_ERROR;
 607 }
 608
 609
 610
 611
 612 UBool convsample_20_didSubstitute(const char *source)
 613 {
 614   UChar uchars[100];
 615   char bytes[100];
 616   UConverter *conv = NULL;
 617   UErrorCode status = U_ZERO_ERROR;
 618   uint32_t len, len2;
 619   UBool  flagVal;
 620
 621   FromUFLAGContext * context = NULL;
 622
 623   printf("\n\n==============================================\n"
 624          "Sample 20: C: Test for substitution using callbacks\n");
 625
 626   /* print out the original source */
 627   printBytes("src", source);
 628   printf("\n");
 629
 630   /* First, convert from UTF8 to unicode */
 631   conv = ucnv_open("utf-8", &status);
 632   U_ASSERT(status);
 633
 634   len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
 635   U_ASSERT(status);
 636
 637   printUChars("uch", uchars, len);
 638   printf("\n");
 639
 640   /* Now, close the converter */
 641   ucnv_close(conv);
 642
 643   /* Now, convert to windows-1252 */
 644   conv = ucnv_open("windows-1252", &status);
 645   U_ASSERT(status);
 646
 647   /* Converter starts out with the SUBSTITUTE callback set. */
 648
 649   /* initialize our callback */
 650   context = flagCB_fromU_openContext();
 651
 652   /* Set our special callback */
 653   ucnv_setFromUCallBack(conv,
 654                         flagCB_fromU,
 655                         context,
 656                         &(context->subCallback),
 657                         &(context->subContext),
 658                         &status);
 659
 660   U_ASSERT(status);
 661
 662   len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status);
 663   U_ASSERT(status);
 664
 665   flagVal = context->flag;  /* it's about to go away when we close the cnv */
 666
 667   ucnv_close(conv);
 668
 669   /* print out the original source */
 670   printBytes("bytes", bytes, len2);
 671
 672   return flagVal; /* true if callback was called */
 673 }
 674
 675 UErrorCode convsample_20()
 676 {
 677   const char *sample1 = "abc\xdf\xbf";
 678   const char *sample2 = "abc_def";
 679
 680
 681   if(convsample_20_didSubstitute(sample1))
 682   {
 683     printf("DID substitute.\n******\n");
 684   }
 685   else
 686   {
 687     printf("Did NOT substitute.\n*****\n");
 688   }
 689
 690   if(convsample_20_didSubstitute(sample2))
 691   {
 692     printf("DID substitute.\n******\n");
 693   }
 694   else
 695   {
 696     printf("Did NOT substitute.\n*****\n");
 697   }
 698
 699   return U_ZERO_ERROR;
 700 }
 701
 702 // 21  - C, callback, with clone and debug
 703
 704
 705
 706 UBool convsample_21_didSubstitute(const char *source)
 707 {
 708   UChar uchars[100];
 709   char bytes[100];
 710   UConverter *conv = NULL, *cloneCnv = NULL;
 711   UErrorCode status = U_ZERO_ERROR;
 712   uint32_t len, len2;
 713   int32_t  cloneLen;
 714   UBool  flagVal = FALSE;
 715   UConverterFromUCallback junkCB;
 716
 717   FromUFLAGContext *flagCtx = NULL,
 718                    *cloneFlagCtx = NULL;
 719
 720   debugCBContext   *debugCtx1 = NULL,
 721                    *debugCtx2 = NULL,
 722                    *cloneDebugCtx = NULL;
 723
 724   printf("\n\n==============================================\n"
 725          "Sample 21: C: Test for substitution w/ callbacks & clones \n");
 726
 727   /* print out the original source */
 728   printBytes("src", source);
 729   printf("\n");
 730
 731   /* First, convert from UTF8 to unicode */
 732   conv = ucnv_open("utf-8", &status);
 733   U_ASSERT(status);
 734
 735   len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
 736   U_ASSERT(status);
 737
 738   printUChars("uch", uchars, len);
 739   printf("\n");
 740
 741   /* Now, close the converter */
 742   ucnv_close(conv);
 743
 744   /* Now, convert to windows-1252 */
 745   conv = ucnv_open("windows-1252", &status);
 746   U_ASSERT(status);
 747
 748   /* Converter starts out with the SUBSTITUTE callback set. */
 749
 750   /* initialize our callback */
 751   /* from the 'bottom' innermost, out
 752    *   CNV ->  debugCtx1[debug]  ->  flagCtx[flag] -> debugCtx2[debug]  */
 753
 754 #if DEBUG_TMI
 755   printf("flagCB_fromU = %p\n", &flagCB_fromU);
 756   printf("debugCB_fromU = %p\n", &debugCB_fromU);
 757 #endif
 758
 759   debugCtx1 = debugCB_openContext();
 760    flagCtx  = flagCB_fromU_openContext();
 761   debugCtx2 = debugCB_openContext();
 762
 763   debugCtx1->subCallback =  flagCB_fromU;  /* debug1 -> flag */
 764   debugCtx1->subContext  =  flagCtx;
 765
 766   flagCtx->subCallback   =  debugCB_fromU; /*  flag -> debug2 */
 767   flagCtx->subContext    =  debugCtx2;
 768
 769   debugCtx2->subCallback =  UCNV_FROM_U_CALLBACK_SUBSTITUTE;
 770   debugCtx2->subContext  = NULL;
 771
 772   /* Set our special callback */
 773
 774   ucnv_setFromUCallBack(conv,
 775                         debugCB_fromU,
 776                         debugCtx1,
 777                         &(debugCtx2->subCallback),
 778                         &(debugCtx2->subContext),
 779                         &status);
 780
 781   U_ASSERT(status);
 782
 783 #if DEBUG_TMI
 784   printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n",
 785          conv, debugCtx1, debugCtx1->subCallback,
 786          debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback);
 787 #endif
 788
 789   cloneLen = 1; /* but passing in null so it will clone */
 790   cloneCnv = ucnv_safeClone(conv,  NULL,  &cloneLen, &status);
 791
 792   U_ASSERT(status);
 793
 794 #if DEBUG_TMI
 795   printf("Cloned converter from %p -> %p.  Closing %p.\n", conv, cloneCnv, conv);
 796 #endif
 797
 798   ucnv_close(conv);
 799
 800 #if DEBUG_TMI
 801   printf("%p closed.\n", conv);
 802 #endif
 803
 804   U_ASSERT(status);
 805   /* Now, we have to extract the context */
 806   cloneDebugCtx = NULL;
 807   cloneFlagCtx  = NULL;
 808
 809   ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx);
 810   if(cloneDebugCtx != NULL) {
 811       cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext;
 812   }
 813
 814   printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n",
 815          cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL );
 816
 817   len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status);
 818   U_ASSERT(status);
 819
 820   if(cloneFlagCtx != NULL) {
 821       flagVal = cloneFlagCtx->flag;  /* it's about to go away when we close the cnv */
 822   } else {
 823       printf("** Warning, couldn't get the subcallback \n");
 824   }
 825
 826   ucnv_close(cloneCnv);
 827
 828   /* print out the original source */
 829   printBytes("bytes", bytes, len2);
 830
 831   return flagVal; /* true if callback was called */
 832 }
 833
 834 UErrorCode convsample_21()
 835 {
 836   const char *sample1 = "abc\xdf\xbf";
 837   const char *sample2 = "abc_def";
 838
 839   if(convsample_21_didSubstitute(sample1))
 840   {
 841     printf("DID substitute.\n******\n");
 842   }
 843   else
 844   {
 845     printf("Did NOT substitute.\n*****\n");
 846   }
 847
 848   if(convsample_21_didSubstitute(sample2))
 849   {
 850     printf("DID substitute.\n******\n");
 851   }
 852   else
 853   {
 854     printf("Did NOT substitute.\n*****\n");
 855   }
 856
 857   return U_ZERO_ERROR;
 858 }
 859
 860
 861 //  40-  C, cp37 -> UTF16 [data02.bin -> data40.utf16]
 862
 863 #define BUFFERSIZE 17 /* make it interesting :) */
 864
 865 UErrorCode convsample_40()
 866 {
 867   printf("\n\n==============================================\n"
 868     "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n");
 869
 870   FILE *f;
 871   FILE *out;
 872   int32_t count;
 873   char inBuf[BUFFERSIZE];
 874   const char *source;
 875   const char *sourceLimit;
 876   UChar *uBuf;
 877   UChar *target;
 878   UChar *targetLimit;
 879   int32_t uBufSize = 0;
 880   UConverter *conv = NULL;
 881   UErrorCode status = U_ZERO_ERROR;
 882   uint32_t inbytes=0, total=0;
 883
 884   f = fopen("data02.bin", "rb");
 885   if(!f)
 886   {
 887     fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n");
 888     return U_FILE_ACCESS_ERROR;
 889   }
 890
 891   out = fopen("data40.utf16", "wb");
 892   if(!out)
 893   {
 894     fprintf(stderr, "Couldn't create file 'data40.utf16'.\n");
 895     fclose(f);
 896     return U_FILE_ACCESS_ERROR;
 897   }
 898
 899   // **************************** START SAMPLE *******************
 900   conv = ucnv_openCCSID(37, UCNV_IBM, &status);
 901   assert(U_SUCCESS(status));
 902
 903   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
 904   printf("input bytes %d / min chars %d = %d UChars\n",
 905          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
 906   uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
 907   assert(uBuf!=NULL);
 908
 909   // grab another buffer's worth
 910   while((!feof(f)) &&
 911         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
 912   {
 913     inbytes += count;
 914
 915     // Convert bytes to unicode
 916     source = inBuf;
 917     sourceLimit = inBuf + count;
 918
 919     do
 920     {
 921         target = uBuf;
 922         targetLimit = uBuf + uBufSize;
 923
 924         ucnv_toUnicode( conv, &target, targetLimit,
 925                        &source, sourceLimit, NULL,
 926                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
 927                                    /* is true (when no more data will come) */
 928                          &status);
 929
 930         if(status == U_BUFFER_OVERFLOW_ERROR)
 931         {
 932           // simply ran out of space - we'll reset the target ptr the next
 933           // time through the loop.
 934           status = U_ZERO_ERROR;
 935         }
 936         else
 937         {
 938           //  Check other errors here.
 939           assert(U_SUCCESS(status));
 940           // Break out of the loop (by force)
 941         }
 942
 943         // Process the Unicode
 944         // Todo: handle UTF-16/surrogates
 945         assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) ==
 946                (size_t)(target-uBuf));
 947         total += (target-uBuf);
 948     } while (source < sourceLimit); // while simply out of space
 949   }
 950
 951   printf("%d bytes in,  %d UChars out.\n", inbytes, total);
 952
 953   // ***************************** END SAMPLE ********************
 954   ucnv_close(conv);
 955
 956   fclose(f);
 957   fclose(out);
 958   printf("\n");
 959
 960   return U_ZERO_ERROR;
 961 }
 962 #undef BUFFERSIZE
 963
 964
 965
 966 //  46-  C, UTF16 -> latin2 [data40.utf16 -> data46.out]
 967
 968 #define BUFFERSIZE 24 /* make it interesting :) */
 969
 970 UErrorCode convsample_46()
 971 {
 972   printf("\n\n==============================================\n"
 973     "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n");
 974
 975   FILE *f;
 976   FILE *out;
 977   int32_t count;
 978   UChar inBuf[BUFFERSIZE];
 979   const UChar *source;
 980   const UChar *sourceLimit;
 981   char *buf;
 982   char *target;
 983   char *targetLimit;
 984
 985   int32_t bufSize = 0;
 986   UConverter *conv = NULL;
 987   UErrorCode status = U_ZERO_ERROR;
 988   uint32_t inchars=0, total=0;
 989
 990   f = fopen("data40.utf16", "rb");
 991   if(!f)
 992   {
 993     fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n");
 994     return U_FILE_ACCESS_ERROR;
 995   }
 996
 997   out = fopen("data46.out", "wb");
 998   if(!out)
 999   {
1000     fprintf(stderr, "Couldn't create file 'data46.out'.\n");
1001     fclose(f);
1002     return U_FILE_ACCESS_ERROR;
1003   }
1004
1005   // **************************** START SAMPLE *******************
1006   conv = ucnv_open( "iso-8859-2", &status);
1007   assert(U_SUCCESS(status));
1008
1009   bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv));
1010   printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n",
1011          BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize);
1012   buf = (char*)malloc(bufSize * sizeof(char));
1013   assert(buf!=NULL);
1014
1015   // grab another buffer's worth
1016   while((!feof(f)) &&
1017         ((count=fread(inBuf, sizeof(UChar), BUFFERSIZE , f)) > 0) )
1018   {
1019     inchars += count;
1020
1021     // Convert bytes to unicode
1022     source = inBuf;
1023     sourceLimit = inBuf + count;
1024
1025     do
1026     {
1027         target = buf;
1028         targetLimit = buf + bufSize;
1029
1030         ucnv_fromUnicode( conv, &target, targetLimit,
1031                        &source, sourceLimit, NULL,
1032                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
1033                                    /* is true (when no more data will come) */
1034                          &status);
1035
1036         if(status == U_BUFFER_OVERFLOW_ERROR)
1037         {
1038           // simply ran out of space - we'll reset the target ptr the next
1039           // time through the loop.
1040           status = U_ZERO_ERROR;
1041         }
1042         else
1043         {
1044           //  Check other errors here.
1045           assert(U_SUCCESS(status));
1046           // Break out of the loop (by force)
1047         }
1048
1049         // Process the Unicode
1050         assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) ==
1051                (size_t)(target-buf));
1052         total += (target-buf);
1053     } while (source < sourceLimit); // while simply out of space
1054   }
1055
1056   printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, inchars * sizeof(UChar), total);
1057
1058   // ***************************** END SAMPLE ********************
1059   ucnv_close(conv);
1060
1061   fclose(f);
1062   fclose(out);
1063   printf("\n");
1064
1065   return U_ZERO_ERROR;
1066 }
1067 #undef BUFFERSIZE
1068
1069 #define BUFFERSIZE 219
1070
1071 void convsample_50() {
1072   printf("\n\n==============================================\n"
1073          "Sample 50: C: ucnv_detectUnicodeSignature\n");
1074
1075   //! [ucnv_detectUnicodeSignature]
1076   UErrorCode err = U_ZERO_ERROR;
1077   UBool discardSignature = TRUE; /* set to TRUE to throw away the initial U+FEFF */
1078   char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' };
1079   int32_t signatureLength = 0;
1080   const char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err);
1081   UConverter *conv = NULL;
1082   UChar output[100];
1083   UChar *target = output, *out;
1084   const char *source = input;
1085   if(encoding!=NULL && U_SUCCESS(err)){
1086     // should signature be discarded ?
1087     conv = ucnv_open(encoding, &err);
1088     // do the conversion
1089     ucnv_toUnicode(conv,
1090                    &target, output + sizeof(output)/U_SIZEOF_UCHAR,
1091                    &source, input + sizeof(input),
1092                    NULL, TRUE, &err);
1093     out = output;
1094     if (discardSignature){
1095       ++out; // ignore initial U+FEFF
1096     }
1097     while(out != target) {
1098       printf("%04x ", *out++);
1099     }
1100     puts("");
1101   }
1102   //! [ucnv_detectUnicodeSignature]
1103   puts("");
1104 }
1105
1106
1107
1108 /* main */
1109
1110 int main()
1111 {
1112
1113   printf("Default Converter=%s\n", ucnv_getDefaultName() );
1114
1115   convsample_02();  // C  , u->koi8r, conv
1116   convsample_03();  // C,   iterate
1117
1118   convsample_05();  // C,  utf8->u, getNextUChar
1119   convsample_06(); // C freq counter thingy
1120
1121   convsample_12();  // C,  sjis->u, conv
1122   convsample_13();  // C,  big5->u, getNextU
1123
1124   convsample_20();  // C, callback
1125   convsample_21();  // C, callback debug
1126
1127   convsample_40();  // C,   cp37 -> UTF16 [data02.bin -> data40.utf16]
1128
1129   convsample_46();  // C,  UTF16 -> latin3 [data41.utf16 -> data46.out]
1130
1131   convsample_50();  // C, detect unicode signature
1132
1133   printf("End of converter samples.\n");
1134
1135   fflush(stdout);
1136   fflush(stderr);
1137
1138   return 0;
1139 }