icuSources/samples/ucnv/convsamp.cpp

   1 /**************************************************************************
   2 *
   3 *   Copyright (C) 2000-2013, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 *
   6 ***************************************************************************
   7 *   file name:  convsamp.c
   8 *   encoding:   ASCII (7-bit)
   9 *
  10 *   created on: 2000may30
  11 *   created by: Steven R. Loomis
  12 *
  13 *   Sample code for the ICU conversion routines.
  14 *
  15 * Note: Nothing special is needed to build this sample. Link with
  16 *       the icu UC and icu I18N libraries.
  17 *
  18 *       I use 'assert' for error checking, you probably will want
  19 *       something more flexible.  '***BEGIN SAMPLE***' and
  20 *       '***END SAMPLE***' mark pieces suitable for stand alone
  21 *       code snippets.
  22 *
  23 *
   24 *  Each test can define its own BUFFERSIZE
  25 *
  26 */
  27
  28 #define DEBUG_TMI 0  /* define to 1 to enable Too Much Information */
  29
  30 #include <stdio.h>
  31 #include <ctype.h>            /* for isspace, etc.    */
  32 #include <assert.h>
  33 #include <string.h>
  34 #include <stdlib.h>  /* malloc */
  35
  36 #include "unicode/utypes.h"   /* Basic ICU data types */
  37 #include "unicode/ucnv.h"     /* C   Converter API    */
  38 #include "unicode/ustring.h"  /* some more string fcns*/
  39 #include "unicode/uchar.h"    /* char names           */
  40 #include "unicode/uloc.h"
  41 #include "unicode/unistr.h"
  42
  43 #include "flagcb.h"
  44
  45 /* Some utility functions */
  46
  47 static const UChar kNone[] = { 0x0000 };
  48
  49 #define U_ASSERT(x)  { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }}
  50
  51 /* Print a UChar if possible, in seven characters. */
  52 void prettyPrintUChar(UChar c)
  53 {
  54   if(  (c <= 0x007F) &&
  55        (isgraph(c))  ) {
  56     printf(" '%c'   ", (char)(0x00FF&c));
  57   } else if ( c > 0x007F ) {
  58     char buf[1000];
  59     UErrorCode status = U_ZERO_ERROR;
  60     int32_t o;
  61
  62     o = u_charName(c, U_EXTENDED_CHAR_NAME, buf, 1000, &status);
  63     if(U_SUCCESS(status) && (o>0) ) {
  64       buf[6] = 0;
  65       printf("%7s", buf);
  66     } else {
  67       printf(" ??????");
  68     }
  69   } else {
  70     switch((char)(c & 0x007F)) {
  71     case ' ':
  72       printf(" ' '   ");
  73       break;
  74     case '\t':
  75       printf(" \\t    ");
  76       break;
  77     case '\n':
  78       printf(" \\n    ");
  79       break;
  80     default:
  81       printf("  _    ");
  82       break;
  83     }
  84   }
  85 }
  86
  87
  88 void printUChars(const char  *name = "?",
  89                  const UChar *uch  = kNone,
  90                  int32_t     len   = -1 )
  91 {
  92   int32_t i;
  93
  94   if( (len == -1) && (uch) ) {
  95     len = u_strlen(uch);
  96   }
  97
  98   printf("%5s: ", name);
  99   for( i = 0; i <len; i++) {
 100     printf("%-6d ", i);
 101   }
 102   printf("\n");
 103
 104   printf("%5s: ", "uni");
 105   for( i = 0; i <len; i++) {
 106     printf("\\u%04X ", (int)uch[i]);
 107   }
 108   printf("\n");
 109
 110   printf("%5s:", "ch");
 111   for( i = 0; i <len; i++) {
 112     prettyPrintUChar(uch[i]);
 113   }
 114   printf("\n");
 115 }
 116
 117 void printBytes(const char  *name = "?",
 118                  const char *uch  = "",
 119                  int32_t     len   = -1 )
 120 {
 121   int32_t i;
 122
 123   if( (len == -1) && (uch) ) {
 124     len = strlen(uch);
 125   }
 126
 127   printf("%5s: ", name);
 128   for( i = 0; i <len; i++) {
 129     printf("%-4d ", i);
 130   }
 131   printf("\n");
 132
 133   printf("%5s: ", "uni");
 134   for( i = 0; i <len; i++) {
 135     printf("\\x%02X ", 0x00FF & (int)uch[i]);
 136   }
 137   printf("\n");
 138
 139   printf("%5s:", "ch");
 140   for( i = 0; i <len; i++) {
 141     if(isgraph(0x00FF & (int)uch[i])) {
 142       printf(" '%c' ", (char)uch[i]);
 143     } else {
 144       printf("     ");
 145     }
 146   }
 147   printf("\n");
 148 }
 149
 150 void printUChar(UChar32 ch32)
 151 {
 152     if(ch32 > 0xFFFF) {
 153       printf("ch: U+%06X\n", ch32);
 154     }
 155     else {
 156       UChar ch = (UChar)ch32;
 157       printUChars("C", &ch, 1);
 158     }
 159 }
 160
 161 /*******************************************************************
 162   Very simple C sample to convert the word 'Moscow' in Russian in Unicode,
 163   followed by an exclamation mark (!) into the KOI8-R Russian code page.
 164
 165   This example first creates a UChar String out of the Unicode chars.
 166
 167   targetSize must be set to the amount of space available in the target
 168   buffer. After fromUChars is called,
 169   len will contain the number of bytes in target[] which were
 170   used in the resulting codepage.  In this case, there is a 1:1 mapping
 171   between the input and output characters. The exclamation mark has the
 172   same value in both KOI8-R and Unicode.
 173
 174   src: 0      1      2      3      4      5      6
 175   uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021
 176    ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL   '!'
 177
 178  targ:  0    1    2    3    4    5    6
 179   uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21
 180    ch:                                '!'
 181
 182
 183 Converting FROM unicode
 184   to koi8-r.
 185   You must call ucnv_close to clean up the memory used by the
 186   converter.
 187
 188   'len' returns the number of OUTPUT bytes resulting from the
 189   conversion.
 190  */
 191
 192 UErrorCode convsample_02()
 193 {
 194   printf("\n\n==============================================\n"
 195          "Sample 02: C: simple Unicode -> koi8-r conversion\n");
 196
 197
 198   // **************************** START SAMPLE *******************
 199   // "cat<cat>OK"
 200   UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432,
 201                      0x0430, 0x0021, 0x0000 };
 202   char target[100];
 203   UErrorCode status = U_ZERO_ERROR;
 204   UConverter *conv;
 205   int32_t     len;
 206
 207   // set up the converter
 208   //! [ucnv_open]
 209   conv = ucnv_open("koi8-r", &status);
 210   //! [ucnv_open]
 211   assert(U_SUCCESS(status));
 212
 213   // convert to koi8-r
 214   len = ucnv_fromUChars(conv, target, 100, source, -1, &status);
 215   assert(U_SUCCESS(status));
 216
 217   // close the converter
 218   ucnv_close(conv);
 219
 220   // ***************************** END SAMPLE ********************
 221
 222   // Print it out
 223   printUChars("src", source);
 224   printf("\n");
 225   printBytes("targ", target, len);
 226
 227   return U_ZERO_ERROR;
 228 }
 229
 230
 231 UErrorCode convsample_03()
 232 {
 233   printf("\n\n==============================================\n"
 234          "Sample 03: C: print out all converters\n");
 235
 236   int32_t count;
 237   int32_t i;
 238
 239   // **************************** START SAMPLE *******************
 240   count = ucnv_countAvailable();
 241   printf("Available converters: %d\n", count);
 242
 243   for(i=0;i<count;i++)
 244   {
 245     printf("%s ", ucnv_getAvailableName(i));
 246   }
 247
 248   // ***************************** END SAMPLE ********************
 249
 250   printf("\n");
 251
 252   return U_ZERO_ERROR;
 253 }
 254
 255
 256
 257 #define BUFFERSIZE 17 /* make it interesting :) */
 258
 259 /*
 260   Converting from a codepage to Unicode in bulk..
 261   What is the best way to determine the buffer size?
 262
 263      The 'buffersize' is in bytes of input.
  264     For a given converter, dividing this by the minimum char size
  265     gives you the maximum number of Unicode characters that could be
 266     expected for a given number of input bytes.
 267      see: ucnv_getMinCharSize()
 268
 269      For example, a single byte codepage like 'Latin-3' has a
 270     minimum char size of 1. (It takes at least 1 byte to represent
 271     each Unicode char.) So the unicode buffer has the same number of
 272     UChars as the input buffer has bytes.
 273
 274      In a strictly double byte codepage such as cp1362 (Windows
 275     Korean), the minimum char size is 2. So, only half as many Unicode
 276     chars as bytes are needed.
 277
 278      This work to calculate the buffer size is an optimization. Any
 279     size of input and output buffer can be used, as long as the
 280     program handles the following cases: If the input buffer is empty,
 281     the source pointer will be equal to sourceLimit.  If the output
 282     buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned.
 283  */
 284
 285 UErrorCode convsample_05()
 286 {
 287   printf("\n\n==============================================\n"
 288          "Sample 05: C: count the number of letters in a UTF-8 document\n");
 289
 290   FILE *f;
 291   int32_t count;
 292   char inBuf[BUFFERSIZE];
 293   const char *source;
 294   const char *sourceLimit;
 295   UChar *uBuf;
 296   UChar *target;
 297   UChar *targetLimit;
 298   UChar *p;
 299   int32_t uBufSize = 0;
 300   UConverter *conv;
 301   UErrorCode status = U_ZERO_ERROR;
 302   uint32_t letters=0, total=0;
 303
 304   f = fopen("data01.txt", "r");
 305   if(!f)
 306   {
 307     fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n");
 308     return U_FILE_ACCESS_ERROR;
 309   }
 310
 311   // **************************** START SAMPLE *******************
 312   conv = ucnv_open("utf-8", &status);
 313   assert(U_SUCCESS(status));
 314
 315   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
 316   printf("input bytes %d / min chars %d = %d UChars\n",
 317          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
 318   uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
 319   assert(uBuf!=NULL);
 320
 321   // grab another buffer's worth
 322   while((!feof(f)) &&
 323         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
 324   {
 325     // Convert bytes to unicode
 326     source = inBuf;
 327     sourceLimit = inBuf + count;
 328
 329     do
 330     {
 331         target = uBuf;
 332         targetLimit = uBuf + uBufSize;
 333
 334         ucnv_toUnicode(conv, &target, targetLimit,
 335                        &source, sourceLimit, NULL,
 336                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
 337                                    /* is true (when no more data will come) */
 338                        &status);
 339
 340         if(status == U_BUFFER_OVERFLOW_ERROR)
 341         {
 342           // simply ran out of space - we'll reset the target ptr the next
 343           // time through the loop.
 344           status = U_ZERO_ERROR;
 345         }
 346         else
 347         {
 348           //  Check other errors here.
 349           assert(U_SUCCESS(status));
 350           // Break out of the loop (by force)
 351         }
 352
 353         // Process the Unicode
 354         // Todo: handle UTF-16/surrogates
 355
 356         for(p = uBuf; p<target; p++)
 357         {
 358           if(u_isalpha(*p))
 359             letters++;
 360           total++;
 361         }
 362     } while (source < sourceLimit); // while simply out of space
 363   }
 364
 365   printf("%d letters out of %d total UChars.\n", letters, total);
 366
 367   // ***************************** END SAMPLE ********************
 368   ucnv_close(conv);
 369
 370   printf("\n");
 371
 372   fclose(f);
 373
 374   return U_ZERO_ERROR;
 375 }
 376 #undef BUFFERSIZE
 377
 378 #define BUFFERSIZE 1024
 379 typedef struct
 380 {
 381   UChar32  codepoint;
 382   uint32_t frequency;
 383 } CharFreqInfo;
 384
 385 UErrorCode convsample_06()
 386 {
 387   printf("\n\n==============================================\n"
 388          "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");
 389
 390   FILE *f;
 391   int32_t count;
 392   char inBuf[BUFFERSIZE];
 393   const char *source;
 394   const char *sourceLimit;
 395   int32_t uBufSize = 0;
 396   UConverter *conv;
 397   UErrorCode status = U_ZERO_ERROR;
 398   uint32_t letters=0, total=0;
 399
 400   CharFreqInfo   *info;
 401   UChar32   charCount = 0x10000;  /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
 402   UChar32   p;
 403
 404   uint32_t ie = 0;
 405   uint32_t gh = 0;
 406   UChar32 l = 0;
 407
 408   f = fopen("data06.txt", "r");
 409   if(!f)
 410   {
 411     fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
 412     return U_FILE_ACCESS_ERROR;
 413   }
 414
 415   info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount);
 416   if(!info)
 417   {
 418     fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount);
 419   }
 420
 421   /* reset frequencies */
 422   for(p=0;p<charCount;p++)
 423   {
 424     info[p].codepoint = p;
 425     info[p].frequency = 0;
 426   }
 427
 428   // **************************** START SAMPLE *******************
 429   conv = ucnv_open("utf-8", &status);
 430   assert(U_SUCCESS(status));
 431
 432   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
 433   printf("input bytes %d / min chars %d = %d UChars\n",
 434          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
 435
 436   // grab another buffer's worth
 437   while((!feof(f)) &&
 438         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
 439   {
 440     // Convert bytes to unicode
 441     source = inBuf;
 442     sourceLimit = inBuf + count;
 443
 444     while(source < sourceLimit)
 445     {
 446       p = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
 447       if(U_FAILURE(status))
 448       {
 449         fprintf(stderr, "%s @ %d\n", u_errorName(status), total);
 450         status = U_ZERO_ERROR;
 451         continue;
 452       }
 453       U_ASSERT(status);
 454       total++;
 455
 456       if(u_isalpha(p))
 457         letters++;
 458
 459       if((u_tolower(l) == 'i') && (u_tolower(p) == 'e'))
 460         ie++;
 461
 462       if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127))
 463         gh++;
 464
 465       if(p>charCount)
 466       {
 467         fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p);
 468         free(info);
 469         fclose(f);
 470         ucnv_close(conv);
 471         return U_UNSUPPORTED_ERROR;
 472       }
 473       info[p].frequency++;
 474       l = p;
 475     }
 476   }
 477
 478   fclose(f);
 479   ucnv_close(conv);
 480
 481   printf("%d letters out of %d total UChars.\n", letters, total);
 482   printf("%d ie digraphs, %d gh digraphs.\n", ie, gh);
 483
 484   // now, we could sort it..
 485
 486   //  qsort(info, charCount, sizeof(info[0]), charfreq_compare);
 487
 488   for(p=0;p<charCount;p++)
 489   {
 490     if(info[p].frequency)
 491     {
 492       printf("% 5d U+%06X ", info[p].frequency, p);
 493       if(p <= 0xFFFF)
 494       {
 495         prettyPrintUChar((UChar)p);
 496       }
 497       printf("\n");
 498     }
 499   }
 500   free(info);
 501   // ***************************** END SAMPLE ********************
 502
 503   printf("\n");
 504
 505   return U_ZERO_ERROR;
 506 }
 507 #undef BUFFERSIZE
 508
 509
 510 /******************************************************
 511   You must call ucnv_close to clean up the memory used by the
 512   converter.
 513
 514   'len' returns the number of OUTPUT bytes resulting from the
 515   conversion.
 516  */
 517
 518 UErrorCode convsample_12()
 519 {
 520   printf("\n\n==============================================\n"
 521          "Sample 12: C: simple sjis -> unicode conversion\n");
 522
 523
 524   // **************************** START SAMPLE *******************
 525
 526   char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 };
 527   UChar target[100];
 528   UErrorCode status = U_ZERO_ERROR;
 529   UConverter *conv;
 530   int32_t     len;
 531
 532   // set up the converter
 533   conv = ucnv_open("shift_jis", &status);
 534   assert(U_SUCCESS(status));
 535
 536   // convert to Unicode
 537   // Note: we can use strlen, we know it's an 8 bit null terminated codepage
 538   target[6] = 0xFDCA;
 539   len = ucnv_toUChars(conv, target, 100, source, strlen(source), &status);
 540   U_ASSERT(status);
 541   // close the converter
 542   ucnv_close(conv);
 543
 544   // ***************************** END SAMPLE ********************
 545
 546   // Print it out
 547   printBytes("src", source, strlen(source) );
 548   printf("\n");
 549   printUChars("targ", target, len);
 550
 551   return U_ZERO_ERROR;
 552 }
 553
 554 /******************************************************************
 555    C: Convert from codepage to Unicode one at a time.
 556 */
 557
 558 UErrorCode convsample_13()
 559 {
 560   printf("\n\n==============================================\n"
 561          "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n");
 562
 563
 564   const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e };
 565   //  const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e };
 566   const char *source, *sourceLimit;
 567   UChar32 target;
 568   UErrorCode status = U_ZERO_ERROR;
 569   UConverter *conv = NULL;
 570   int32_t srcCount=0;
 571   int32_t dstCount=0;
 572
 573   srcCount = sizeof(sourceChars);
 574
 575   conv = ucnv_open("Big5", &status);
 576   U_ASSERT(status);
 577
 578   source = sourceChars;
 579   sourceLimit = sourceChars + sizeof(sourceChars);
 580
 581   // **************************** START SAMPLE *******************
 582
 583
 584   printBytes("src",source,sourceLimit-source);
 585
 586   while(source < sourceLimit)
 587   {
 588     puts("");
 589     target = ucnv_getNextUChar (conv,
 590                                 &source,
 591                                 sourceLimit,
 592                                 &status);
 593
 594     //    printBytes("src",source,sourceLimit-source);
 595     U_ASSERT(status);
 596     printUChar(target);
 597     dstCount++;
 598   }
 599
 600
 601   // ************************** END SAMPLE *************************
 602
 603   printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount);
 604   ucnv_close(conv);
 605
 606   return U_ZERO_ERROR;
 607 }
 608
 609
 610
 611
 612 UBool convsample_20_didSubstitute(const char *source)
 613 {
 614   UChar uchars[100];
 615   char bytes[100];
 616   UConverter *conv = NULL;
 617   UErrorCode status = U_ZERO_ERROR;
 618   uint32_t len, len2;
 619   UBool  flagVal;
 620
 621   FromUFLAGContext * context = NULL;
 622
 623   printf("\n\n==============================================\n"
 624          "Sample 20: C: Test for substitution using callbacks\n");
 625
 626   /* print out the original source */
 627   printBytes("src", source);
 628   printf("\n");
 629
 630   /* First, convert from UTF8 to unicode */
 631   conv = ucnv_open("utf-8", &status);
 632   U_ASSERT(status);
 633
 634   len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
 635   U_ASSERT(status);
 636
 637   printUChars("uch", uchars, len);
 638   printf("\n");
 639
 640   /* Now, close the converter */
 641   ucnv_close(conv);
 642
 643   /* Now, convert to windows-1252 */
 644   conv = ucnv_open("windows-1252", &status);
 645   U_ASSERT(status);
 646
 647   /* Converter starts out with the SUBSTITUTE callback set. */
 648
 649   /* initialize our callback */
 650   context = flagCB_fromU_openContext();
 651
 652   /* Set our special callback */
 653   ucnv_setFromUCallBack(conv,
 654                         flagCB_fromU,
 655                         context,
 656                         &(context->subCallback),
 657                         &(context->subContext),
 658                         &status);
 659
 660   U_ASSERT(status);
 661
 662   len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status);
 663   U_ASSERT(status);
 664
 665   flagVal = context->flag;  /* it's about to go away when we close the cnv */
 666
 667   ucnv_close(conv);
 668
 669   /* print out the original source */
 670   printBytes("bytes", bytes, len2);
 671
 672   return flagVal; /* true if callback was called */
 673 }
 674
 675 UErrorCode convsample_20()
 676 {
 677   const char *sample1 = "abc\xdf\xbf";
 678   const char *sample2 = "abc_def";
 679
 680
 681   if(convsample_20_didSubstitute(sample1))
 682   {
 683     printf("DID substitute.\n******\n");
 684   }
 685   else
 686   {
 687     printf("Did NOT substitute.\n*****\n");
 688   }
 689
 690   if(convsample_20_didSubstitute(sample2))
 691   {
 692     printf("DID substitute.\n******\n");
 693   }
 694   else
 695   {
 696     printf("Did NOT substitute.\n*****\n");
 697   }
 698
 699   return U_ZERO_ERROR;
 700 }
 701
 702 // 21  - C, callback, with clone and debug
 703
 704
 705
 706 UBool convsample_21_didSubstitute(const char *source)
 707 {
 708   UChar uchars[100];
 709   char bytes[100];
 710   UConverter *conv = NULL, *cloneCnv = NULL;
 711   UErrorCode status = U_ZERO_ERROR;
 712   uint32_t len, len2;
 713   int32_t  cloneLen;
 714   UBool  flagVal = FALSE;
 715   UConverterFromUCallback junkCB;
 716
 717   FromUFLAGContext *flagCtx = NULL,
 718                    *cloneFlagCtx = NULL;
 719
 720   debugCBContext   *debugCtx1 = NULL,
 721                    *debugCtx2 = NULL,
 722                    *cloneDebugCtx = NULL;
 723
 724   printf("\n\n==============================================\n"
 725          "Sample 21: C: Test for substitution w/ callbacks & clones \n");
 726
 727   /* print out the original source */
 728   printBytes("src", source);
 729   printf("\n");
 730
 731   /* First, convert from UTF8 to unicode */
 732   conv = ucnv_open("utf-8", &status);
 733   U_ASSERT(status);
 734
 735   len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
 736   U_ASSERT(status);
 737
 738   printUChars("uch", uchars, len);
 739   printf("\n");
 740
 741   /* Now, close the converter */
 742   ucnv_close(conv);
 743
 744   /* Now, convert to windows-1252 */
 745   conv = ucnv_open("windows-1252", &status);
 746   U_ASSERT(status);
 747
 748   /* Converter starts out with the SUBSTITUTE callback set. */
 749
 750   /* initialize our callback */
 751   /* from the 'bottom' innermost, out
 752    *   CNV ->  debugCtx1[debug]  ->  flagCtx[flag] -> debugCtx2[debug]  */
 753
 754 #if DEBUG_TMI
 755   printf("flagCB_fromU = %p\n", &flagCB_fromU);
 756   printf("debugCB_fromU = %p\n", &debugCB_fromU);
 757 #endif
 758
 759   debugCtx1 = debugCB_openContext();
 760    flagCtx  = flagCB_fromU_openContext();
 761   debugCtx2 = debugCB_openContext();
 762
 763   debugCtx1->subCallback =  flagCB_fromU;  /* debug1 -> flag */
 764   debugCtx1->subContext  =  flagCtx;
 765
 766   flagCtx->subCallback   =  debugCB_fromU; /*  flag -> debug2 */
 767   flagCtx->subContext    =  debugCtx2;
 768
 769   debugCtx2->subCallback =  UCNV_FROM_U_CALLBACK_SUBSTITUTE;
 770   debugCtx2->subContext  = NULL;
 771
 772   /* Set our special callback */
 773
 774   ucnv_setFromUCallBack(conv,
 775                         debugCB_fromU,
 776                         debugCtx1,
 777                         &(debugCtx2->subCallback),
 778                         &(debugCtx2->subContext),
 779                         &status);
 780
 781   U_ASSERT(status);
 782
 783 #if DEBUG_TMI
 784   printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n",
 785          conv, debugCtx1, debugCtx1->subCallback,
 786          debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback);
 787 #endif
 788
 789   cloneCnv = ucnv_safeClone(conv, NULL, NULL, &status);
 790
 791   U_ASSERT(status);
 792
 793 #if DEBUG_TMI
 794   printf("Cloned converter from %p -> %p.  Closing %p.\n", conv, cloneCnv, conv);
 795 #endif
 796
 797   ucnv_close(conv);
 798
 799 #if DEBUG_TMI
 800   printf("%p closed.\n", conv);
 801 #endif
 802
 803   U_ASSERT(status);
 804   /* Now, we have to extract the context */
 805   cloneDebugCtx = NULL;
 806   cloneFlagCtx  = NULL;
 807
 808   ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx);
 809   if(cloneDebugCtx != NULL) {
 810       cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext;
 811   }
 812
 813   printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n",
 814          cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL );
 815
 816   len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status);
 817   U_ASSERT(status);
 818
 819   if(cloneFlagCtx != NULL) {
 820       flagVal = cloneFlagCtx->flag;  /* it's about to go away when we close the cnv */
 821   } else {
 822       printf("** Warning, couldn't get the subcallback \n");
 823   }
 824
 825   ucnv_close(cloneCnv);
 826
 827   /* print out the original source */
 828   printBytes("bytes", bytes, len2);
 829
 830   return flagVal; /* true if callback was called */
 831 }
 832
 833 UErrorCode convsample_21()
 834 {
 835   const char *sample1 = "abc\xdf\xbf";
 836   const char *sample2 = "abc_def";
 837
 838   if(convsample_21_didSubstitute(sample1))
 839   {
 840     printf("DID substitute.\n******\n");
 841   }
 842   else
 843   {
 844     printf("Did NOT substitute.\n*****\n");
 845   }
 846
 847   if(convsample_21_didSubstitute(sample2))
 848   {
 849     printf("DID substitute.\n******\n");
 850   }
 851   else
 852   {
 853     printf("Did NOT substitute.\n*****\n");
 854   }
 855
 856   return U_ZERO_ERROR;
 857 }
 858
 859
 860 //  40-  C, cp37 -> UTF16 [data02.bin -> data40.utf16]
 861
 862 #define BUFFERSIZE 17 /* make it interesting :) */
 863
 864 UErrorCode convsample_40()
 865 {
 866   printf("\n\n==============================================\n"
 867     "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n");
 868
 869   FILE *f;
 870   FILE *out;
 871   int32_t count;
 872   char inBuf[BUFFERSIZE];
 873   const char *source;
 874   const char *sourceLimit;
 875   UChar *uBuf;
 876   UChar *target;
 877   UChar *targetLimit;
 878   int32_t uBufSize = 0;
 879   UConverter *conv = NULL;
 880   UErrorCode status = U_ZERO_ERROR;
 881   uint32_t inbytes=0, total=0;
 882
 883   f = fopen("data02.bin", "rb");
 884   if(!f)
 885   {
 886     fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n");
 887     return U_FILE_ACCESS_ERROR;
 888   }
 889
 890   out = fopen("data40.utf16", "wb");
 891   if(!out)
 892   {
 893     fprintf(stderr, "Couldn't create file 'data40.utf16'.\n");
 894     fclose(f);
 895     return U_FILE_ACCESS_ERROR;
 896   }
 897
 898   // **************************** START SAMPLE *******************
 899   conv = ucnv_openCCSID(37, UCNV_IBM, &status);
 900   assert(U_SUCCESS(status));
 901
 902   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
 903   printf("input bytes %d / min chars %d = %d UChars\n",
 904          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
 905   uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
 906   assert(uBuf!=NULL);
 907
 908   // grab another buffer's worth
 909   while((!feof(f)) &&
 910         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
 911   {
 912     inbytes += count;
 913
 914     // Convert bytes to unicode
 915     source = inBuf;
 916     sourceLimit = inBuf + count;
 917
 918     do
 919     {
 920         target = uBuf;
 921         targetLimit = uBuf + uBufSize;
 922
 923         ucnv_toUnicode( conv, &target, targetLimit,
 924                        &source, sourceLimit, NULL,
 925                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
 926                                    /* is true (when no more data will come) */
 927                          &status);
 928
 929         if(status == U_BUFFER_OVERFLOW_ERROR)
 930         {
 931           // simply ran out of space - we'll reset the target ptr the next
 932           // time through the loop.
 933           status = U_ZERO_ERROR;
 934         }
 935         else
 936         {
 937           //  Check other errors here.
 938           assert(U_SUCCESS(status));
 939           // Break out of the loop (by force)
 940         }
 941
 942         // Process the Unicode
 943         // Todo: handle UTF-16/surrogates
 944         assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) ==
 945                (size_t)(target-uBuf));
 946         total += (target-uBuf);
 947     } while (source < sourceLimit); // while simply out of space
 948   }
 949
 950   printf("%d bytes in,  %d UChars out.\n", inbytes, total);
 951
 952   // ***************************** END SAMPLE ********************
 953   ucnv_close(conv);
 954
 955   fclose(f);
 956   fclose(out);
 957   printf("\n");
 958
 959   return U_ZERO_ERROR;
 960 }
 961 #undef BUFFERSIZE
 962
 963
 964
 965 //  46-  C, UTF16 -> latin2 [data40.utf16 -> data46.out]
 966
 967 #define BUFFERSIZE 24 /* make it interesting :) */
 968
 969 UErrorCode convsample_46()
 970 {
 971   printf("\n\n==============================================\n"
 972     "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n");
 973
 974   FILE *f;
 975   FILE *out;
 976   int32_t count;
 977   UChar inBuf[BUFFERSIZE];
 978   const UChar *source;
 979   const UChar *sourceLimit;
 980   char *buf;
 981   char *target;
 982   char *targetLimit;
 983
 984   int32_t bufSize = 0;
 985   UConverter *conv = NULL;
 986   UErrorCode status = U_ZERO_ERROR;
 987   uint32_t inchars=0, total=0;
 988
 989   f = fopen("data40.utf16", "rb");
 990   if(!f)
 991   {
 992     fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n");
 993     return U_FILE_ACCESS_ERROR;
 994   }
 995
 996   out = fopen("data46.out", "wb");
 997   if(!out)
 998   {
 999     fprintf(stderr, "Couldn't create file 'data46.out'.\n");
1000     fclose(f);
1001     return U_FILE_ACCESS_ERROR;
1002   }
1003
1004   // **************************** START SAMPLE *******************
1005   conv = ucnv_open( "iso-8859-2", &status);
1006   assert(U_SUCCESS(status));
1007
1008   bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv));
1009   printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n",
1010          BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize);
1011   buf = (char*)malloc(bufSize * sizeof(char));
1012   assert(buf!=NULL);
1013
1014   // grab another buffer's worth
1015   while((!feof(f)) &&
1016         ((count=fread(inBuf, sizeof(UChar), BUFFERSIZE , f)) > 0) )
1017   {
1018     inchars += count;
1019
1020     // Convert bytes to unicode
1021     source = inBuf;
1022     sourceLimit = inBuf + count;
1023
1024     do
1025     {
1026         target = buf;
1027         targetLimit = buf + bufSize;
1028
1029         ucnv_fromUnicode( conv, &target, targetLimit,
1030                        &source, sourceLimit, NULL,
1031                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
1032                                    /* is true (when no more data will come) */
1033                          &status);
1034
1035         if(status == U_BUFFER_OVERFLOW_ERROR)
1036         {
1037           // simply ran out of space - we'll reset the target ptr the next
1038           // time through the loop.
1039           status = U_ZERO_ERROR;
1040         }
1041         else
1042         {
1043           //  Check other errors here.
1044           assert(U_SUCCESS(status));
1045           // Break out of the loop (by force)
1046         }
1047
1048         // Process the Unicode
1049         assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) ==
1050                (size_t)(target-buf));
1051         total += (target-buf);
1052     } while (source < sourceLimit); // while simply out of space
1053   }
1054
1055   printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, inchars * sizeof(UChar), total);
1056
1057   // ***************************** END SAMPLE ********************
1058   ucnv_close(conv);
1059
1060   fclose(f);
1061   fclose(out);
1062   printf("\n");
1063
1064   return U_ZERO_ERROR;
1065 }
1066 #undef BUFFERSIZE
1067
1068 #define BUFFERSIZE 219
1069
1070 void convsample_50() {
1071   printf("\n\n==============================================\n"
1072          "Sample 50: C: ucnv_detectUnicodeSignature\n");
1073
1074   //! [ucnv_detectUnicodeSignature]
1075   UErrorCode err = U_ZERO_ERROR;
1076   UBool discardSignature = TRUE; /* set to TRUE to throw away the initial U+FEFF */
1077   char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' };
1078   int32_t signatureLength = 0;
1079   const char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err);
1080   UConverter *conv = NULL;
1081   UChar output[100];
1082   UChar *target = output, *out;
1083   const char *source = input;
1084   if(encoding!=NULL && U_SUCCESS(err)){
1085     // should signature be discarded ?
1086     conv = ucnv_open(encoding, &err);
1087     // do the conversion
1088     ucnv_toUnicode(conv,
1089                    &target, output + sizeof(output)/U_SIZEOF_UCHAR,
1090                    &source, input + sizeof(input),
1091                    NULL, TRUE, &err);
1092     out = output;
1093     if (discardSignature){
1094       ++out; // ignore initial U+FEFF
1095     }
1096     while(out != target) {
1097       printf("%04x ", *out++);
1098     }
1099     puts("");
1100   }
1101   //! [ucnv_detectUnicodeSignature]
1102   puts("");
1103 }
1104
1105
1106
1107 /* main */
1108
1109 int main()
1110 {
1111
1112   printf("Default Converter=%s\n", ucnv_getDefaultName() );
1113
1114   convsample_02();  // C  , u->koi8r, conv
1115   convsample_03();  // C,   iterate
1116
1117   convsample_05();  // C,  utf8->u, getNextUChar
1118   convsample_06(); // C freq counter thingy
1119
1120   convsample_12();  // C,  sjis->u, conv
1121   convsample_13();  // C,  big5->u, getNextU
1122
1123   convsample_20();  // C, callback
1124   convsample_21();  // C, callback debug
1125
1126   convsample_40();  // C,   cp37 -> UTF16 [data02.bin -> data40.utf16]
1127
1128   convsample_46();  // C,  UTF16 -> latin3 [data41.utf16 -> data46.out]
1129
1130   convsample_50();  // C, detect unicode signature
1131
1132   printf("End of converter samples.\n");
1133
1134   fflush(stdout);
1135   fflush(stderr);
1136
1137   return 0;
1138 }