icuSources/samples/ucnv/convsamp.cpp

   1 /*************************************************************************
   2 *
   3 *   © 2016 and later: Unicode, Inc. and others.
   4 *   License & terms of use: http://www.unicode.org/copyright.html#License
   5 *
   6 **************************************************************************
   7 **************************************************************************
   8 *
   9 *   Copyright (C) 2000-2016, International Business Machines
  10 *   Corporation and others.  All Rights Reserved.
  11 *
  12 ***************************************************************************
  13 *   file name:  convsamp.c
  14 *   encoding:   ASCII (7-bit)
  15 *
  16 *   created on: 2000may30
  17 *   created by: Steven R. Loomis
  18 *
  19 *   Sample code for the ICU conversion routines.
  20 *
  21 * Note: Nothing special is needed to build this sample. Link with
  22 *       the icu UC and icu I18N libraries.
  23 *
  24 *       I use 'assert' for error checking, you probably will want
  25 *       something more flexible.  '***BEGIN SAMPLE***' and
  26 *       '***END SAMPLE***' mark pieces suitable for stand alone
  27 *       code snippets.
  28 *
  29 *
   30 *  Each test can define its own BUFFERSIZE
  31 *
  32 */
  33
  34 #define DEBUG_TMI 0  /* define to 1 to enable Too Much Information */
  35
  36 #include <stdio.h>
  37 #include <ctype.h>            /* for isspace, etc.    */
  38 #include <assert.h>
  39 #include <string.h>
  40 #include <stdlib.h>  /* malloc */
  41
  42 #include "unicode/utypes.h"   /* Basic ICU data types */
  43 #include "unicode/ucnv.h"     /* C   Converter API    */
  44 #include "unicode/ustring.h"  /* some more string fcns*/
  45 #include "unicode/uchar.h"    /* char names           */
  46 #include "unicode/uloc.h"
  47 #include "unicode/unistr.h"
  48
  49 #include "flagcb.h"
  50
  51 /* Some utility functions */
  52 #ifndef UPRV_LENGTHOF
  53 #define UPRV_LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
  54 #endif
  55
  56 static const UChar kNone[] = { 0x0000 };
  57
  58 #define U_ASSERT(x)  { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }}
  59
  60 /* Print a UChar if possible, in seven characters. */
  61 void prettyPrintUChar(UChar c)
  62 {
  63   if(  (c <= 0x007F) &&
  64        (isgraph(c))  ) {
  65     printf(" '%c'   ", (char)(0x00FF&c));
  66   } else if ( c > 0x007F ) {
  67     char buf[1000];
  68     UErrorCode status = U_ZERO_ERROR;
  69     int32_t o;
  70
  71     o = u_charName(c, U_EXTENDED_CHAR_NAME, buf, 1000, &status);
  72     if(U_SUCCESS(status) && (o>0) ) {
  73       buf[6] = 0;
  74       printf("%7s", buf);
  75     } else {
  76       printf(" ??????");
  77     }
  78   } else {
  79     switch((char)(c & 0x007F)) {
  80     case ' ':
  81       printf(" ' '   ");
  82       break;
  83     case '\t':
  84       printf(" \\t    ");
  85       break;
  86     case '\n':
  87       printf(" \\n    ");
  88       break;
  89     default:
  90       printf("  _    ");
  91       break;
  92     }
  93   }
  94 }
  95
  96
  97 void printUChars(const char  *name = "?",
  98                  const UChar *uch  = kNone,
  99                  int32_t     len   = -1 )
 100 {
 101   int32_t i;
 102
 103   if( (len == -1) && (uch) ) {
 104     len = u_strlen(uch);
 105   }
 106
 107   printf("%5s: ", name);
 108   for( i = 0; i <len; i++) {
 109     printf("%-6d ", i);
 110   }
 111   printf("\n");
 112
 113   printf("%5s: ", "uni");
 114   for( i = 0; i <len; i++) {
 115     printf("\\u%04X ", (int)uch[i]);
 116   }
 117   printf("\n");
 118
 119   printf("%5s:", "ch");
 120   for( i = 0; i <len; i++) {
 121     prettyPrintUChar(uch[i]);
 122   }
 123   printf("\n");
 124 }
 125
 126 void printBytes(const char  *name = "?",
 127                  const char *uch  = "",
 128                  int32_t     len   = -1 )
 129 {
 130   int32_t i;
 131
 132   if( (len == -1) && (uch) ) {
 133     len = strlen(uch);
 134   }
 135
 136   printf("%5s: ", name);
 137   for( i = 0; i <len; i++) {
 138     printf("%-4d ", i);
 139   }
 140   printf("\n");
 141
 142   printf("%5s: ", "uni");
 143   for( i = 0; i <len; i++) {
 144     printf("\\x%02X ", 0x00FF & (int)uch[i]);
 145   }
 146   printf("\n");
 147
 148   printf("%5s:", "ch");
 149   for( i = 0; i <len; i++) {
 150     if(isgraph(0x00FF & (int)uch[i])) {
 151       printf(" '%c' ", (char)uch[i]);
 152     } else {
 153       printf("     ");
 154     }
 155   }
 156   printf("\n");
 157 }
 158
 159 void printUChar(UChar32 ch32)
 160 {
 161     if(ch32 > 0xFFFF) {
 162       printf("ch: U+%06X\n", ch32);
 163     }
 164     else {
 165       UChar ch = (UChar)ch32;
 166       printUChars("C", &ch, 1);
 167     }
 168 }
 169
 170 /*******************************************************************
 171   Very simple C sample to convert the word 'Moscow' in Russian in Unicode,
 172   followed by an exclamation mark (!) into the KOI8-R Russian code page.
 173
 174   This example first creates a UChar String out of the Unicode chars.
 175
 176   targetSize must be set to the amount of space available in the target
 177   buffer. After fromUChars is called,
 178   len will contain the number of bytes in target[] which were
 179   used in the resulting codepage.  In this case, there is a 1:1 mapping
 180   between the input and output characters. The exclamation mark has the
 181   same value in both KOI8-R and Unicode.
 182
 183   src: 0      1      2      3      4      5      6
 184   uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021
 185    ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL   '!'
 186
 187  targ:  0    1    2    3    4    5    6
 188   uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21
 189    ch:                                '!'
 190
 191
 192 Converting FROM unicode
 193   to koi8-r.
 194   You must call ucnv_close to clean up the memory used by the
 195   converter.
 196
 197   'len' returns the number of OUTPUT bytes resulting from the
 198   conversion.
 199  */
 200
 201 UErrorCode convsample_02()
 202 {
 203   printf("\n\n==============================================\n"
 204          "Sample 02: C: simple Unicode -> koi8-r conversion\n");
 205
 206
 207   // **************************** START SAMPLE *******************
 208   // "cat<cat>OK"
 209   UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432,
 210                      0x0430, 0x0021, 0x0000 };
 211   char target[100];
 212   UErrorCode status = U_ZERO_ERROR;
 213   UConverter *conv;
 214   int32_t     len;
 215
 216   // set up the converter
 217   //! [ucnv_open]
 218   conv = ucnv_open("koi8-r", &status);
 219   //! [ucnv_open]
 220   assert(U_SUCCESS(status));
 221
 222   // convert to koi8-r
 223   len = ucnv_fromUChars(conv, target, 100, source, -1, &status);
 224   assert(U_SUCCESS(status));
 225
 226   // close the converter
 227   ucnv_close(conv);
 228
 229   // ***************************** END SAMPLE ********************
 230
 231   // Print it out
 232   printUChars("src", source);
 233   printf("\n");
 234   printBytes("targ", target, len);
 235
 236   return U_ZERO_ERROR;
 237 }
 238
 239
 240 UErrorCode convsample_03()
 241 {
 242   printf("\n\n==============================================\n"
 243          "Sample 03: C: print out all converters\n");
 244
 245   int32_t count;
 246   int32_t i;
 247
 248   // **************************** START SAMPLE *******************
 249   count = ucnv_countAvailable();
 250   printf("Available converters: %d\n", count);
 251
 252   for(i=0;i<count;i++)
 253   {
 254     printf("%s ", ucnv_getAvailableName(i));
 255   }
 256
 257   // ***************************** END SAMPLE ********************
 258
 259   printf("\n");
 260
 261   return U_ZERO_ERROR;
 262 }
 263
 264
 265
 266 #define BUFFERSIZE 17 /* make it interesting :) */
 267
 268 /*
 269   Converting from a codepage to Unicode in bulk..
 270   What is the best way to determine the buffer size?
 271
 272      The 'buffersize' is in bytes of input.
  273     For a given converter, dividing this by the minimum char size
  274     gives you the maximum number of Unicode characters that could be
 275     expected for a given number of input bytes.
 276      see: ucnv_getMinCharSize()
 277
 278      For example, a single byte codepage like 'Latin-3' has a
 279     minimum char size of 1. (It takes at least 1 byte to represent
 280     each Unicode char.) So the unicode buffer has the same number of
 281     UChars as the input buffer has bytes.
 282
 283      In a strictly double byte codepage such as cp1362 (Windows
 284     Korean), the minimum char size is 2. So, only half as many Unicode
 285     chars as bytes are needed.
 286
 287      This work to calculate the buffer size is an optimization. Any
 288     size of input and output buffer can be used, as long as the
 289     program handles the following cases: If the input buffer is empty,
 290     the source pointer will be equal to sourceLimit.  If the output
 291     buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned.
 292  */
 293
 294 UErrorCode convsample_05()
 295 {
 296   printf("\n\n==============================================\n"
 297          "Sample 05: C: count the number of letters in a UTF-8 document\n");
 298
 299   FILE *f;
 300   int32_t count;
 301   char inBuf[BUFFERSIZE];
 302   const char *source;
 303   const char *sourceLimit;
 304   UChar *uBuf;
 305   UChar *target;
 306   UChar *targetLimit;
 307   UChar *p;
 308   int32_t uBufSize = 0;
 309   UConverter *conv;
 310   UErrorCode status = U_ZERO_ERROR;
 311   uint32_t letters=0, total=0;
 312
 313   f = fopen("data01.txt", "r");
 314   if(!f)
 315   {
 316     fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n");
 317     return U_FILE_ACCESS_ERROR;
 318   }
 319
 320   // **************************** START SAMPLE *******************
 321   conv = ucnv_open("utf-8", &status);
 322   assert(U_SUCCESS(status));
 323
 324   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
 325   printf("input bytes %d / min chars %d = %d UChars\n",
 326          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
 327   uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
 328   assert(uBuf!=NULL);
 329
 330   // grab another buffer's worth
 331   while((!feof(f)) &&
 332         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
 333   {
 334     // Convert bytes to unicode
 335     source = inBuf;
 336     sourceLimit = inBuf + count;
 337
 338     do
 339     {
 340         target = uBuf;
 341         targetLimit = uBuf + uBufSize;
 342
 343         ucnv_toUnicode(conv, &target, targetLimit,
 344                        &source, sourceLimit, NULL,
 345                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
 346                                    /* is true (when no more data will come) */
 347                        &status);
 348
 349         if(status == U_BUFFER_OVERFLOW_ERROR)
 350         {
 351           // simply ran out of space - we'll reset the target ptr the next
 352           // time through the loop.
 353           status = U_ZERO_ERROR;
 354         }
 355         else
 356         {
 357           //  Check other errors here.
 358           assert(U_SUCCESS(status));
 359           // Break out of the loop (by force)
 360         }
 361
 362         // Process the Unicode
 363         // Todo: handle UTF-16/surrogates
 364
 365         for(p = uBuf; p<target; p++)
 366         {
 367           if(u_isalpha(*p))
 368             letters++;
 369           total++;
 370         }
 371     } while (source < sourceLimit); // while simply out of space
 372   }
 373
 374   printf("%d letters out of %d total UChars.\n", letters, total);
 375
 376   // ***************************** END SAMPLE ********************
 377   ucnv_close(conv);
 378
 379   printf("\n");
 380
 381   fclose(f);
 382
 383   return U_ZERO_ERROR;
 384 }
 385 #undef BUFFERSIZE
 386
 387 #define BUFFERSIZE 1024
 388 typedef struct
 389 {
 390   UChar32  codepoint;
 391   uint32_t frequency;
 392 } CharFreqInfo;
 393
 394 UErrorCode convsample_06()
 395 {
 396   printf("\n\n==============================================\n"
 397          "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");
 398
 399   FILE *f;
 400   int32_t count;
 401   char inBuf[BUFFERSIZE];
 402   const char *source;
 403   const char *sourceLimit;
 404   int32_t uBufSize = 0;
 405   UConverter *conv;
 406   UErrorCode status = U_ZERO_ERROR;
 407   uint32_t letters=0, total=0;
 408
 409   CharFreqInfo   *info;
 410   UChar32   charCount = 0x10000;  /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
 411   UChar32   p;
 412
 413   uint32_t ie = 0;
 414   uint32_t gh = 0;
 415   UChar32 l = 0;
 416
 417   f = fopen("data06.txt", "r");
 418   if(!f)
 419   {
 420     fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
 421     return U_FILE_ACCESS_ERROR;
 422   }
 423
 424   info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount);
 425   if(!info)
 426   {
 427     fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount);
 428   }
 429
 430   /* reset frequencies */
 431   for(p=0;p<charCount;p++)
 432   {
 433     info[p].codepoint = p;
 434     info[p].frequency = 0;
 435   }
 436
 437   // **************************** START SAMPLE *******************
 438   conv = ucnv_open("utf-8", &status);
 439   assert(U_SUCCESS(status));
 440
 441   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
 442   printf("input bytes %d / min chars %d = %d UChars\n",
 443          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
 444
 445   // grab another buffer's worth
 446   while((!feof(f)) &&
 447         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
 448   {
 449     // Convert bytes to unicode
 450     source = inBuf;
 451     sourceLimit = inBuf + count;
 452
 453     while(source < sourceLimit)
 454     {
 455       p = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
 456       if(U_FAILURE(status))
 457       {
 458         fprintf(stderr, "%s @ %d\n", u_errorName(status), total);
 459         status = U_ZERO_ERROR;
 460         continue;
 461       }
 462       U_ASSERT(status);
 463       total++;
 464
 465       if(u_isalpha(p))
 466         letters++;
 467
 468       if((u_tolower(l) == 'i') && (u_tolower(p) == 'e'))
 469         ie++;
 470
 471       if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127))
 472         gh++;
 473
 474       if(p>charCount)
 475       {
 476         fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p);
 477         free(info);
 478         fclose(f);
 479         ucnv_close(conv);
 480         return U_UNSUPPORTED_ERROR;
 481       }
 482       info[p].frequency++;
 483       l = p;
 484     }
 485   }
 486
 487   fclose(f);
 488   ucnv_close(conv);
 489
 490   printf("%d letters out of %d total UChars.\n", letters, total);
 491   printf("%d ie digraphs, %d gh digraphs.\n", ie, gh);
 492
 493   // now, we could sort it..
 494
 495   //  qsort(info, charCount, sizeof(info[0]), charfreq_compare);
 496
 497   for(p=0;p<charCount;p++)
 498   {
 499     if(info[p].frequency)
 500     {
 501       printf("% 5d U+%06X ", info[p].frequency, p);
 502       if(p <= 0xFFFF)
 503       {
 504         prettyPrintUChar((UChar)p);
 505       }
 506       printf("\n");
 507     }
 508   }
 509   free(info);
 510   // ***************************** END SAMPLE ********************
 511
 512   printf("\n");
 513
 514   return U_ZERO_ERROR;
 515 }
 516 #undef BUFFERSIZE
 517
 518
 519 /******************************************************
 520   You must call ucnv_close to clean up the memory used by the
 521   converter.
 522
  523   'len' returns the number of OUTPUT UChars resulting from the
  524   conversion.
 525  */
 526
 527 UErrorCode convsample_12()
 528 {
 529   printf("\n\n==============================================\n"
 530          "Sample 12: C: simple sjis -> unicode conversion\n");
 531
 532
 533   // **************************** START SAMPLE *******************
 534
 535   char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 };
 536   UChar target[100];
 537   UErrorCode status = U_ZERO_ERROR;
 538   UConverter *conv;
 539   int32_t     len;
 540
 541   // set up the converter
 542   conv = ucnv_open("shift_jis", &status);
 543   assert(U_SUCCESS(status));
 544
 545   // convert to Unicode
 546   // Note: we can use strlen, we know it's an 8 bit null terminated codepage
 547   target[6] = 0xFDCA;
 548   len = ucnv_toUChars(conv, target, 100, source, strlen(source), &status);
 549   U_ASSERT(status);
 550   // close the converter
 551   ucnv_close(conv);
 552
 553   // ***************************** END SAMPLE ********************
 554
 555   // Print it out
 556   printBytes("src", source, strlen(source) );
 557   printf("\n");
 558   printUChars("targ", target, len);
 559
 560   return U_ZERO_ERROR;
 561 }
 562
 563 /******************************************************************
 564    C: Convert from codepage to Unicode one at a time.
 565 */
 566
 567 UErrorCode convsample_13()
 568 {
 569   printf("\n\n==============================================\n"
 570          "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n");
 571
 572
 573   const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e };
 574   //  const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e };
 575   const char *source, *sourceLimit;
 576   UChar32 target;
 577   UErrorCode status = U_ZERO_ERROR;
 578   UConverter *conv = NULL;
 579   int32_t srcCount=0;
 580   int32_t dstCount=0;
 581
 582   srcCount = sizeof(sourceChars);
 583
 584   conv = ucnv_open("Big5", &status);
 585   U_ASSERT(status);
 586
 587   source = sourceChars;
 588   sourceLimit = sourceChars + sizeof(sourceChars);
 589
 590   // **************************** START SAMPLE *******************
 591
 592
 593   printBytes("src",source,sourceLimit-source);
 594
 595   while(source < sourceLimit)
 596   {
 597     puts("");
 598     target = ucnv_getNextUChar (conv,
 599                                 &source,
 600                                 sourceLimit,
 601                                 &status);
 602
 603     //    printBytes("src",source,sourceLimit-source);
 604     U_ASSERT(status);
 605     printUChar(target);
 606     dstCount++;
 607   }
 608
 609
 610   // ************************** END SAMPLE *************************
 611
 612   printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount);
 613   ucnv_close(conv);
 614
 615   return U_ZERO_ERROR;
 616 }
 617
 618
 619
 620
 621 UBool convsample_20_didSubstitute(const char *source)
 622 {
 623   UChar uchars[100];
 624   char bytes[100];
 625   UConverter *conv = NULL;
 626   UErrorCode status = U_ZERO_ERROR;
 627   uint32_t len, len2;
 628   UBool  flagVal;
 629
 630   FromUFLAGContext * context = NULL;
 631
 632   printf("\n\n==============================================\n"
 633          "Sample 20: C: Test for substitution using callbacks\n");
 634
 635   /* print out the original source */
 636   printBytes("src", source);
 637   printf("\n");
 638
 639   /* First, convert from UTF8 to unicode */
 640   conv = ucnv_open("utf-8", &status);
 641   U_ASSERT(status);
 642
 643   len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
 644   U_ASSERT(status);
 645
 646   printUChars("uch", uchars, len);
 647   printf("\n");
 648
 649   /* Now, close the converter */
 650   ucnv_close(conv);
 651
 652   /* Now, convert to windows-1252 */
 653   conv = ucnv_open("windows-1252", &status);
 654   U_ASSERT(status);
 655
 656   /* Converter starts out with the SUBSTITUTE callback set. */
 657
 658   /* initialize our callback */
 659   context = flagCB_fromU_openContext();
 660
 661   /* Set our special callback */
 662   ucnv_setFromUCallBack(conv,
 663                         flagCB_fromU,
 664                         context,
 665                         &(context->subCallback),
 666                         &(context->subContext),
 667                         &status);
 668
 669   U_ASSERT(status);
 670
 671   len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status);
 672   U_ASSERT(status);
 673
 674   flagVal = context->flag;  /* it's about to go away when we close the cnv */
 675
 676   ucnv_close(conv);
 677
 678   /* print out the original source */
 679   printBytes("bytes", bytes, len2);
 680
 681   return flagVal; /* true if callback was called */
 682 }
 683
 684 UErrorCode convsample_20()
 685 {
 686   const char *sample1 = "abc\xdf\xbf";
 687   const char *sample2 = "abc_def";
 688
 689
 690   if(convsample_20_didSubstitute(sample1))
 691   {
 692     printf("DID substitute.\n******\n");
 693   }
 694   else
 695   {
 696     printf("Did NOT substitute.\n*****\n");
 697   }
 698
 699   if(convsample_20_didSubstitute(sample2))
 700   {
 701     printf("DID substitute.\n******\n");
 702   }
 703   else
 704   {
 705     printf("Did NOT substitute.\n*****\n");
 706   }
 707
 708   return U_ZERO_ERROR;
 709 }
 710
 711 // 21  - C, callback, with clone and debug
 712
 713
 714
 715 UBool convsample_21_didSubstitute(const char *source)
 716 {
 717   UChar uchars[100];
 718   char bytes[100];
 719   UConverter *conv = NULL, *cloneCnv = NULL;
 720   UErrorCode status = U_ZERO_ERROR;
 721   uint32_t len, len2;
 722   int32_t  cloneLen;
 723   UBool  flagVal = FALSE;
 724   UConverterFromUCallback junkCB;
 725
 726   FromUFLAGContext *flagCtx = NULL,
 727                    *cloneFlagCtx = NULL;
 728
 729   debugCBContext   *debugCtx1 = NULL,
 730                    *debugCtx2 = NULL,
 731                    *cloneDebugCtx = NULL;
 732
 733   printf("\n\n==============================================\n"
 734          "Sample 21: C: Test for substitution w/ callbacks & clones \n");
 735
 736   /* print out the original source */
 737   printBytes("src", source);
 738   printf("\n");
 739
 740   /* First, convert from UTF8 to unicode */
 741   conv = ucnv_open("utf-8", &status);
 742   U_ASSERT(status);
 743
 744   len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
 745   U_ASSERT(status);
 746
 747   printUChars("uch", uchars, len);
 748   printf("\n");
 749
 750   /* Now, close the converter */
 751   ucnv_close(conv);
 752
 753   /* Now, convert to windows-1252 */
 754   conv = ucnv_open("windows-1252", &status);
 755   U_ASSERT(status);
 756
 757   /* Converter starts out with the SUBSTITUTE callback set. */
 758
 759   /* initialize our callback */
 760   /* from the 'bottom' innermost, out
 761    *   CNV ->  debugCtx1[debug]  ->  flagCtx[flag] -> debugCtx2[debug]  */
 762
 763 #if DEBUG_TMI
 764   printf("flagCB_fromU = %p\n", &flagCB_fromU);
 765   printf("debugCB_fromU = %p\n", &debugCB_fromU);
 766 #endif
 767
 768   debugCtx1 = debugCB_openContext();
 769    flagCtx  = flagCB_fromU_openContext();
 770   debugCtx2 = debugCB_openContext();
 771
 772   debugCtx1->subCallback =  flagCB_fromU;  /* debug1 -> flag */
 773   debugCtx1->subContext  =  flagCtx;
 774
 775   flagCtx->subCallback   =  debugCB_fromU; /*  flag -> debug2 */
 776   flagCtx->subContext    =  debugCtx2;
 777
 778   debugCtx2->subCallback =  UCNV_FROM_U_CALLBACK_SUBSTITUTE;
 779   debugCtx2->subContext  = NULL;
 780
 781   /* Set our special callback */
 782
 783   ucnv_setFromUCallBack(conv,
 784                         debugCB_fromU,
 785                         debugCtx1,
 786                         &(debugCtx2->subCallback),
 787                         &(debugCtx2->subContext),
 788                         &status);
 789
 790   U_ASSERT(status);
 791
 792 #if DEBUG_TMI
 793   printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n",
 794          conv, debugCtx1, debugCtx1->subCallback,
 795          debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback);
 796 #endif
 797
 798   cloneCnv = ucnv_safeClone(conv, NULL, NULL, &status);
 799
 800   U_ASSERT(status);
 801
 802 #if DEBUG_TMI
 803   printf("Cloned converter from %p -> %p.  Closing %p.\n", conv, cloneCnv, conv);
 804 #endif
 805
 806   ucnv_close(conv);
 807
 808 #if DEBUG_TMI
 809   printf("%p closed.\n", conv);
 810 #endif
 811
 812   U_ASSERT(status);
 813   /* Now, we have to extract the context */
 814   cloneDebugCtx = NULL;
 815   cloneFlagCtx  = NULL;
 816
 817   ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx);
 818   if(cloneDebugCtx != NULL) {
 819       cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext;
 820   }
 821
 822   printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n",
 823          cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL );
 824
 825   len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status);
 826   U_ASSERT(status);
 827
 828   if(cloneFlagCtx != NULL) {
 829       flagVal = cloneFlagCtx->flag;  /* it's about to go away when we close the cnv */
 830   } else {
 831       printf("** Warning, couldn't get the subcallback \n");
 832   }
 833
 834   ucnv_close(cloneCnv);
 835
 836   /* print out the original source */
 837   printBytes("bytes", bytes, len2);
 838
 839   return flagVal; /* true if callback was called */
 840 }
 841
 842 UErrorCode convsample_21()
 843 {
 844   const char *sample1 = "abc\xdf\xbf";
 845   const char *sample2 = "abc_def";
 846
 847   if(convsample_21_didSubstitute(sample1))
 848   {
 849     printf("DID substitute.\n******\n");
 850   }
 851   else
 852   {
 853     printf("Did NOT substitute.\n*****\n");
 854   }
 855
 856   if(convsample_21_didSubstitute(sample2))
 857   {
 858     printf("DID substitute.\n******\n");
 859   }
 860   else
 861   {
 862     printf("Did NOT substitute.\n*****\n");
 863   }
 864
 865   return U_ZERO_ERROR;
 866 }
 867
 868
 869 //  40-  C, cp37 -> UTF16 [data02.bin -> data40.utf16]
 870
 871 #define BUFFERSIZE 17 /* make it interesting :) */
 872
 873 UErrorCode convsample_40()
 874 {
 875   printf("\n\n==============================================\n"
 876     "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n");
 877
 878   FILE *f;
 879   FILE *out;
 880   int32_t count;
 881   char inBuf[BUFFERSIZE];
 882   const char *source;
 883   const char *sourceLimit;
 884   UChar *uBuf;
 885   UChar *target;
 886   UChar *targetLimit;
 887   int32_t uBufSize = 0;
 888   UConverter *conv = NULL;
 889   UErrorCode status = U_ZERO_ERROR;
 890   uint32_t inbytes=0, total=0;
 891
 892   f = fopen("data02.bin", "rb");
 893   if(!f)
 894   {
 895     fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n");
 896     return U_FILE_ACCESS_ERROR;
 897   }
 898
 899   out = fopen("data40.utf16", "wb");
 900   if(!out)
 901   {
 902     fprintf(stderr, "Couldn't create file 'data40.utf16'.\n");
 903     fclose(f);
 904     return U_FILE_ACCESS_ERROR;
 905   }
 906
 907   // **************************** START SAMPLE *******************
 908   conv = ucnv_openCCSID(37, UCNV_IBM, &status);
 909   assert(U_SUCCESS(status));
 910
 911   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
 912   printf("input bytes %d / min chars %d = %d UChars\n",
 913          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
 914   uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
 915   assert(uBuf!=NULL);
 916
 917   // grab another buffer's worth
 918   while((!feof(f)) &&
 919         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
 920   {
 921     inbytes += count;
 922
 923     // Convert bytes to unicode
 924     source = inBuf;
 925     sourceLimit = inBuf + count;
 926
 927     do
 928     {
 929         target = uBuf;
 930         targetLimit = uBuf + uBufSize;
 931
 932         ucnv_toUnicode( conv, &target, targetLimit,
 933                        &source, sourceLimit, NULL,
 934                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
 935                                    /* is true (when no more data will come) */
 936                          &status);
 937
 938         if(status == U_BUFFER_OVERFLOW_ERROR)
 939         {
 940           // simply ran out of space - we'll reset the target ptr the next
 941           // time through the loop.
 942           status = U_ZERO_ERROR;
 943         }
 944         else
 945         {
 946           //  Check other errors here.
 947           assert(U_SUCCESS(status));
 948           // Break out of the loop (by force)
 949         }
 950
 951         // Process the Unicode
 952         // Todo: handle UTF-16/surrogates
 953         assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) ==
 954                (size_t)(target-uBuf));
 955         total += (target-uBuf);
 956     } while (source < sourceLimit); // while simply out of space
 957   }
 958
 959   printf("%d bytes in,  %d UChars out.\n", inbytes, total);
 960
 961   // ***************************** END SAMPLE ********************
 962   ucnv_close(conv);
 963
 964   fclose(f);
 965   fclose(out);
 966   printf("\n");
 967
 968   return U_ZERO_ERROR;
 969 }
 970 #undef BUFFERSIZE
 971
 972
 973
 974 //  46-  C, UTF16 -> latin2 [data40.utf16 -> data46.out]
 975
 976 #define BUFFERSIZE 24 /* make it interesting :) */
 977
 978 UErrorCode convsample_46()
 979 {
 980   printf("\n\n==============================================\n"
 981     "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n");
 982
 983   FILE *f;
 984   FILE *out;
 985   int32_t count;
 986   UChar inBuf[BUFFERSIZE];
 987   const UChar *source;
 988   const UChar *sourceLimit;
 989   char *buf;
 990   char *target;
 991   char *targetLimit;
 992
 993   int32_t bufSize = 0;
 994   UConverter *conv = NULL;
 995   UErrorCode status = U_ZERO_ERROR;
 996   uint32_t inchars=0, total=0;
 997
 998   f = fopen("data40.utf16", "rb");
 999   if(!f)
1000   {
1001     fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n");
1002     return U_FILE_ACCESS_ERROR;
1003   }
1004
1005   out = fopen("data46.out", "wb");
1006   if(!out)
1007   {
1008     fprintf(stderr, "Couldn't create file 'data46.out'.\n");
1009     fclose(f);
1010     return U_FILE_ACCESS_ERROR;
1011   }
1012
1013   // **************************** START SAMPLE *******************
1014   conv = ucnv_open( "iso-8859-2", &status);
1015   assert(U_SUCCESS(status));
1016
1017   bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv));
1018   printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n",
1019          BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize);
1020   buf = (char*)malloc(bufSize * sizeof(char));
1021   assert(buf!=NULL);
1022
1023   // grab another buffer's worth
1024   while((!feof(f)) &&
1025         ((count=fread(inBuf, sizeof(UChar), BUFFERSIZE , f)) > 0) )
1026   {
1027     inchars += count;
1028
1029     // Convert bytes to unicode
1030     source = inBuf;
1031     sourceLimit = inBuf + count;
1032
1033     do
1034     {
1035         target = buf;
1036         targetLimit = buf + bufSize;
1037
1038         ucnv_fromUnicode( conv, &target, targetLimit,
1039                        &source, sourceLimit, NULL,
1040                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
1041                                    /* is true (when no more data will come) */
1042                          &status);
1043
1044         if(status == U_BUFFER_OVERFLOW_ERROR)
1045         {
1046           // simply ran out of space - we'll reset the target ptr the next
1047           // time through the loop.
1048           status = U_ZERO_ERROR;
1049         }
1050         else
1051         {
1052           //  Check other errors here.
1053           assert(U_SUCCESS(status));
1054           // Break out of the loop (by force)
1055         }
1056
1057         // Process the Unicode
1058         assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) ==
1059                (size_t)(target-buf));
1060         total += (target-buf);
1061     } while (source < sourceLimit); // while simply out of space
1062   }
1063
1064   printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, inchars * sizeof(UChar), total);
1065
1066   // ***************************** END SAMPLE ********************
1067   ucnv_close(conv);
1068
1069   fclose(f);
1070   fclose(out);
1071   printf("\n");
1072
1073   return U_ZERO_ERROR;
1074 }
1075 #undef BUFFERSIZE
1076
1077 #define BUFFERSIZE 219
1078
1079 void convsample_50() {
1080   printf("\n\n==============================================\n"
1081          "Sample 50: C: ucnv_detectUnicodeSignature\n");
1082
1083   //! [ucnv_detectUnicodeSignature]
1084   UErrorCode err = U_ZERO_ERROR;
1085   UBool discardSignature = TRUE; /* set to TRUE to throw away the initial U+FEFF */
1086   char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' };
1087   int32_t signatureLength = 0;
1088   const char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err);
1089   UConverter *conv = NULL;
1090   UChar output[100];
1091   UChar *target = output, *out;
1092   const char *source = input;
1093   if(encoding!=NULL && U_SUCCESS(err)){
1094     // should signature be discarded ?
1095     conv = ucnv_open(encoding, &err);
1096     // do the conversion
1097     ucnv_toUnicode(conv,
1098                    &target, output + UPRV_LENGTHOF(output),
1099                    &source, input + sizeof(input),
1100                    NULL, TRUE, &err);
1101     out = output;
1102     if (discardSignature){
1103       ++out; // ignore initial U+FEFF
1104     }
1105     while(out != target) {
1106       printf("%04x ", *out++);
1107     }
1108     puts("");
1109   }
1110   //! [ucnv_detectUnicodeSignature]
1111   puts("");
1112 }
1113
1114
1115
1116 /* main */
1117
1118 int main()
1119 {
1120
1121   printf("Default Converter=%s\n", ucnv_getDefaultName() );
1122
1123   convsample_02();  // C  , u->koi8r, conv
1124   convsample_03();  // C,   iterate
1125
1126   convsample_05();  // C,  utf8->u, getNextUChar
1127   convsample_06(); // C freq counter thingy
1128
1129   convsample_12();  // C,  sjis->u, conv
1130   convsample_13();  // C,  big5->u, getNextU
1131
1132   convsample_20();  // C, callback
1133   convsample_21();  // C, callback debug
1134
1135   convsample_40();  // C,   cp37 -> UTF16 [data02.bin -> data40.utf16]
1136
1137   convsample_46();  // C,  UTF16 -> latin3 [data41.utf16 -> data46.out]
1138
1139   convsample_50();  // C, detect unicode signature
1140
1141   printf("End of converter samples.\n");
1142
1143   fflush(stdout);
1144   fflush(stderr);
1145
1146   return 0;
1147 }