icuSources/samples/ucnv/convsamp.cpp

   1 /**************************************************************************
   2 *
   3 *   Copyright (C) 2000-2003, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 *
   6 ***************************************************************************
   7 *   file name:  convsamp.c
   8 *   encoding:   ASCII (7-bit)
   9 *
  10 *   created on: 2000may30
  11 *   created by: Steven R. Loomis
  12 *
  13 *   Sample code for the ICU conversion routines.
  14 *
  15 * Note: Nothing special is needed to build this sample. Link with
  16 *       the icu UC and icu I18N libraries.
  17 *
  18 *       I use 'assert' for error checking, you probably will want
  19 *       something more flexible.  '***BEGIN SAMPLE***' and
  20 *       '***END SAMPLE***' mark pieces suitable for stand alone
  21 *       code snippets.
  22 *
  23 *
   24 *  Each test can define its own BUFFERSIZE
  25 *
  26 */
  27
  28 #define DEBUG_TMI 0  /* define to 1 to enable Too Much Information */
  29
  30 #include <stdio.h>
  31 #include <ctype.h>            /* for isspace, etc.    */
  32 #include <assert.h>
  33 #include <string.h>
  34 #include <stdlib.h>  /* malloc */
  35
  36 #include "unicode/utypes.h"   /* Basic ICU data types */
  37 #include "unicode/ucnv.h"     /* C   Converter API    */
  38 #include "unicode/ustring.h"  /* some more string fcns*/
  39 #include "unicode/uchar.h"    /* char names           */
  40 #include "unicode/uloc.h"
  41 #include "unicode/unistr.h"
  42
  43 #include "flagcb.h"
  44
  45 /* Some utility functions */
  46
  47 static const UChar kNone[] = { 0x0000 };
  48
  49 #define U_ASSERT(x)  { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }}
  50
  51 /* Print a UChar if possible, in seven characters. */
  52 void prettyPrintUChar(UChar c)
  53 {
  54   if(  (c <= 0x007F) &&
  55        (isgraph(c))  ) {
  56     printf(" '%c'   ", (char)(0x00FF&c));
  57   } else if ( c > 0x007F ) {
  58     char buf[1000];
  59     UErrorCode status = U_ZERO_ERROR;
  60     int32_t o;
  61
  62     o = u_charName(c, U_UNICODE_CHAR_NAME, buf, 1000, &status);
  63     if(U_SUCCESS(status) && (o>0) ) {
  64       buf[6] = 0;
  65       printf("%7s", buf);
  66     } else {
  67       o = u_charName(c, U_UNICODE_10_CHAR_NAME, buf, 1000, &status);
  68       if(U_SUCCESS(status) && (o>0)) {
  69         buf[5] = 0;
  70         printf("~%6s", buf);
  71       }
  72       else {
  73         printf(" ??????");
  74       }
  75     }
  76   } else {
  77     switch((char)(c & 0x007F)) {
  78     case ' ':
  79       printf(" ' '   ");
  80       break;
  81     case '\t':
  82       printf(" \\t    ");
  83       break;
  84     case '\n':
  85       printf(" \\n    ");
  86       break;
  87     default:
  88       printf("  _    ");
  89       break;
  90     }
  91   }
  92 }
  93
  94
  95 void printUChars(const char  *name = "?",
  96                  const UChar *uch  = kNone,
  97                  int32_t     len   = -1 )
  98 {
  99   int32_t i;
 100
 101   if( (len == -1) && (uch) ) {
 102     len = u_strlen(uch);
 103   }
 104
 105   printf("%5s: ", name);
 106   for( i = 0; i <len; i++) {
 107     printf("%-6d ", i);
 108   }
 109   printf("\n");
 110
 111   printf("%5s: ", "uni");
 112   for( i = 0; i <len; i++) {
 113     printf("\\u%04X ", (int)uch[i]);
 114   }
 115   printf("\n");
 116
 117   printf("%5s:", "ch");
 118   for( i = 0; i <len; i++) {
 119     prettyPrintUChar(uch[i]);
 120   }
 121   printf("\n");
 122 }
 123
 124 void printBytes(const char  *name = "?",
 125                  const char *uch  = "",
 126                  int32_t     len   = -1 )
 127 {
 128   int32_t i;
 129
 130   if( (len == -1) && (uch) ) {
 131     len = strlen(uch);
 132   }
 133
 134   printf("%5s: ", name);
 135   for( i = 0; i <len; i++) {
 136     printf("%-4d ", i);
 137   }
 138   printf("\n");
 139
 140   printf("%5s: ", "uni");
 141   for( i = 0; i <len; i++) {
 142     printf("\\x%02X ", 0x00FF & (int)uch[i]);
 143   }
 144   printf("\n");
 145
 146   printf("%5s:", "ch");
 147   for( i = 0; i <len; i++) {
 148     if(isgraph(0x00FF & (int)uch[i])) {
 149       printf(" '%c' ", (char)uch[i]);
 150     } else {
 151       printf("     ");
 152     }
 153   }
 154   printf("\n");
 155 }
 156
 157 void printUChar(UChar32 ch32)
 158 {
 159     if(ch32 > 0xFFFF) {
 160       printf("ch: U+%06X\n", ch32);
 161     }
 162     else {
 163       UChar ch = (UChar)ch32;
 164       printUChars("C", &ch, 1);
 165     }
 166 }
 167
 168 /*******************************************************************
 169   Very simple C sample to convert the word 'Moscow' in Russian in Unicode,
 170   followed by an exclamation mark (!) into the KOI8-R Russian code page.
 171
 172   This example first creates a UChar String out of the Unicode chars.
 173
 174   targetSize must be set to the amount of space available in the target
 175   buffer. After fromUChars is called,
 176   len will contain the number of bytes in target[] which were
 177   used in the resulting codepage.  In this case, there is a 1:1 mapping
 178   between the input and output characters. The exclamation mark has the
 179   same value in both KOI8-R and Unicode.
 180
 181   src: 0      1      2      3      4      5      6
 182   uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021
 183    ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL   '!'
 184
 185  targ:  0    1    2    3    4    5    6
 186   uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21
 187    ch:                                '!'
 188
 189
 190 Converting FROM unicode
 191   to koi8-r.
 192   You must call ucnv_close to clean up the memory used by the
 193   converter.
 194
 195   'len' returns the number of OUTPUT bytes resulting from the
 196   conversion.
 197  */
 198
 199 UErrorCode convsample_02()
 200 {
 201   printf("\n\n==============================================\n"
 202          "Sample 02: C: simple Unicode -> koi8-r conversion\n");
 203
 204
 205   // **************************** START SAMPLE *******************
 206   // "cat<cat>OK"
 207   UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432,
 208                      0x0430, 0x0021, 0x0000 };
 209   char target[100];
 210   UErrorCode status = U_ZERO_ERROR;
 211   UConverter *conv;
 212   int32_t     len;
 213
 214   // set up the converter
 215   conv = ucnv_open("koi8-r", &status);
 216   assert(U_SUCCESS(status));
 217
 218   // convert to koi8-r
 219   len = ucnv_fromUChars(conv, target, 100, source, -1, &status);
 220   assert(U_SUCCESS(status));
 221
 222   // close the converter
 223   ucnv_close(conv);
 224
 225   // ***************************** END SAMPLE ********************
 226
 227   // Print it out
 228   printUChars("src", source);
 229   printf("\n");
 230   printBytes("targ", target, len);
 231
 232   return U_ZERO_ERROR;
 233 }
 234
 235
 236 UErrorCode convsample_03()
 237 {
 238   printf("\n\n==============================================\n"
 239          "Sample 03: C: print out all converters\n");
 240
 241   int32_t count;
 242   int32_t i;
 243
 244   // **************************** START SAMPLE *******************
 245   count = ucnv_countAvailable();
 246   printf("Available converters: %d\n", count);
 247
 248   for(i=0;i<count;i++)
 249   {
 250     printf("%s ", ucnv_getAvailableName(i));
 251   }
 252
 253   // ***************************** END SAMPLE ********************
 254
 255   printf("\n");
 256
 257   return U_ZERO_ERROR;
 258 }
 259
 260
 261
 262 #define BUFFERSIZE 17 /* make it interesting :) */
 263
 264 /*
 265   Converting from a codepage to Unicode in bulk..
 266   What is the best way to determine the buffer size?
 267
 268      The 'buffersize' is in bytes of input.
  269     For a given converter, dividing this by the minimum char size
  270     gives you the maximum number of Unicode characters that could be
 271     expected for a given number of input bytes.
 272      see: ucnv_getMinCharSize()
 273
 274      For example, a single byte codepage like 'Latin-3' has a
 275     minimum char size of 1. (It takes at least 1 byte to represent
 276     each Unicode char.) So the unicode buffer has the same number of
 277     UChars as the input buffer has bytes.
 278
 279      In a strictly double byte codepage such as cp1362 (Windows
 280     Korean), the minimum char size is 2. So, only half as many Unicode
 281     chars as bytes are needed.
 282
 283      This work to calculate the buffer size is an optimization. Any
 284     size of input and output buffer can be used, as long as the
 285     program handles the following cases: If the input buffer is empty,
 286     the source pointer will be equal to sourceLimit.  If the output
 287     buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned.
 288  */
 289
 290 UErrorCode convsample_05()
 291 {
 292   printf("\n\n==============================================\n"
 293          "Sample 05: C: count the number of letters in a UTF-8 document\n");
 294
 295   FILE *f;
 296   int32_t count;
 297   char inBuf[BUFFERSIZE];
 298   const char *source;
 299   const char *sourceLimit;
 300   UChar *uBuf;
 301   UChar *target;
 302   UChar *targetLimit;
 303   UChar *p;
 304   int32_t uBufSize = 0;
 305   UConverter *conv;
 306   UErrorCode status = U_ZERO_ERROR;
 307   uint32_t letters=0, total=0;
 308
 309   f = fopen("data01.txt", "r");
 310   if(!f)
 311   {
 312     fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n");
 313     return U_FILE_ACCESS_ERROR;
 314   }
 315
 316   // **************************** START SAMPLE *******************
 317   conv = ucnv_open("utf-8", &status);
 318   assert(U_SUCCESS(status));
 319
 320   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
 321   printf("input bytes %d / min chars %d = %d UChars\n",
 322          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
 323   uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
 324   assert(uBuf!=NULL);
 325
 326   // grab another buffer's worth
 327   while((!feof(f)) &&
 328         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
 329   {
 330     // Convert bytes to unicode
 331     source = inBuf;
 332     sourceLimit = inBuf + count;
 333
 334     do
 335     {
 336         target = uBuf;
 337         targetLimit = uBuf + uBufSize;
 338
 339         ucnv_toUnicode(conv, &target, targetLimit,
 340                        &source, sourceLimit, NULL,
 341                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
 342                                    /* is true (when no more data will come) */
 343                        &status);
 344
 345         if(status == U_BUFFER_OVERFLOW_ERROR)
 346         {
 347           // simply ran out of space - we'll reset the target ptr the next
 348           // time through the loop.
 349           status = U_ZERO_ERROR;
 350         }
 351         else
 352         {
 353           //  Check other errors here.
 354           assert(U_SUCCESS(status));
 355           // Break out of the loop (by force)
 356         }
 357
 358         // Process the Unicode
 359         // Todo: handle UTF-16/surrogates
 360
 361         for(p = uBuf; p<target; p++)
 362         {
 363           if(u_isalpha(*p))
 364             letters++;
 365           total++;
 366         }
 367     } while (source < sourceLimit); // while simply out of space
 368   }
 369
 370   printf("%d letters out of %d total UChars.\n", letters, total);
 371
 372   // ***************************** END SAMPLE ********************
 373   ucnv_close(conv);
 374
 375   printf("\n");
 376
 377   return U_ZERO_ERROR;
 378 }
 379 #undef BUFFERSIZE
 380
 381 #define BUFFERSIZE 1024
 382 typedef struct
 383 {
 384   UChar32  codepoint;
 385   uint32_t frequency;
 386 } CharFreqInfo;
 387
 388 UErrorCode convsample_06()
 389 {
 390   printf("\n\n==============================================\n"
 391          "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");
 392
 393   FILE *f;
 394   int32_t count;
 395   char inBuf[BUFFERSIZE];
 396   const char *source;
 397   const char *sourceLimit;
 398   UChar *uBuf;
 399   int32_t uBufSize = 0;
 400   UConverter *conv;
 401   UErrorCode status = U_ZERO_ERROR;
 402   uint32_t letters=0, total=0;
 403
 404   CharFreqInfo   *info;
 405   UChar32   charCount = 0x10000;  /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
 406   UChar32   p;
 407
 408   uint32_t ie = 0;
 409   uint32_t gh = 0;
 410   UChar32 l = 0;
 411
 412   f = fopen("data06.txt", "r");
 413   if(!f)
 414   {
 415     fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
 416     return U_FILE_ACCESS_ERROR;
 417   }
 418
 419   info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount);
 420   if(!info)
 421   {
 422     fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount);
 423   }
 424
 425   /* reset frequencies */
 426   for(p=0;p<charCount;p++)
 427   {
 428     info[p].codepoint = p;
 429     info[p].frequency = 0;
 430   }
 431
 432   // **************************** START SAMPLE *******************
 433   conv = ucnv_open("utf-8", &status);
 434   assert(U_SUCCESS(status));
 435
 436   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
 437   printf("input bytes %d / min chars %d = %d UChars\n",
 438          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
 439   uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
 440   assert(uBuf!=NULL);
 441
 442   // grab another buffer's worth
 443   while((!feof(f)) &&
 444         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
 445   {
 446     // Convert bytes to unicode
 447     source = inBuf;
 448     sourceLimit = inBuf + count;
 449
 450     while(source < sourceLimit)
 451     {
 452       p = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
 453       if(U_FAILURE(status))
 454       {
 455         fprintf(stderr, "%s @ %d\n", u_errorName(status), total);
 456         status = U_ZERO_ERROR;
 457         continue;
 458       }
 459       U_ASSERT(status);
 460       total++;
 461
 462       if(u_isalpha(p))
 463         letters++;
 464
 465       if((u_tolower(l) == 'i') && (u_tolower(p) == 'e'))
 466         ie++;
 467
 468       if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127))
 469         gh++;
 470
 471       if(p>charCount)
 472       {
 473         fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p);
 474         return U_UNSUPPORTED_ERROR;
 475       }
 476       info[p].frequency++;
 477       l = p;
 478     }
 479   }
 480
 481   fclose(f);
 482   ucnv_close(conv);
 483
 484   printf("%d letters out of %d total UChars.\n", letters, total);
 485   printf("%d ie digraphs, %d gh digraphs.\n", ie, gh);
 486
 487   // now, we could sort it..
 488
 489   //  qsort(info, charCount, sizeof(info[0]), charfreq_compare);
 490
 491   for(p=0;p<charCount;p++)
 492   {
 493     if(info[p].frequency)
 494     {
 495       printf("% 5d U+%06X ", info[p].frequency, p);
 496       if(p <= 0xFFFF)
 497       {
 498         prettyPrintUChar((UChar)p);
 499       }
 500       printf("\n");
 501     }
 502   }
 503   free(info);
 504   // ***************************** END SAMPLE ********************
 505
 506   printf("\n");
 507
 508   return U_ZERO_ERROR;
 509 }
 510 #undef BUFFERSIZE
 511
 512
 513 /******************************************************
 514   You must call ucnv_close to clean up the memory used by the
 515   converter.
 516
  517   'len' returns the number of OUTPUT UChars resulting from the
  518   conversion.
 519  */
 520
 521 UErrorCode convsample_12()
 522 {
 523   printf("\n\n==============================================\n"
 524          "Sample 12: C: simple sjis -> unicode conversion\n");
 525
 526
 527   // **************************** START SAMPLE *******************
 528
 529   char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 };
 530   UChar target[100];
 531   UErrorCode status = U_ZERO_ERROR;
 532   UConverter *conv;
 533   int32_t     len;
 534
 535   // set up the converter
 536   conv = ucnv_open("shift_jis", &status);
 537   assert(U_SUCCESS(status));
 538
 539   // convert to Unicode
 540   // Note: we can use strlen, we know it's an 8 bit null terminated codepage
 541   target[6] = 0xFDCA;
 542   len = ucnv_toUChars(conv, target, 100, source, strlen(source), &status);
 543   U_ASSERT(status);
 544   // close the converter
 545   ucnv_close(conv);
 546
 547   // ***************************** END SAMPLE ********************
 548
 549   // Print it out
 550   printBytes("src", source, strlen(source) );
 551   printf("\n");
 552   printUChars("targ", target, len);
 553
 554   return U_ZERO_ERROR;
 555 }
 556
 557 /******************************************************************
 558    C: Convert from codepage to Unicode one at a time.
 559 */
 560
 561 UErrorCode convsample_13()
 562 {
 563   printf("\n\n==============================================\n"
 564          "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n");
 565
 566
 567   const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e };
 568   //  const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e };
 569   const char *source, *sourceLimit;
 570   UChar32 target;
 571   UErrorCode status = U_ZERO_ERROR;
 572   UConverter *conv = NULL;
 573   int32_t srcCount=0;
 574   int32_t dstCount=0;
 575
 576   srcCount = sizeof(sourceChars);
 577
 578   conv = ucnv_open("Big5", &status);
 579   U_ASSERT(status);
 580
 581   source = sourceChars;
 582   sourceLimit = sourceChars + sizeof(sourceChars);
 583
 584   // **************************** START SAMPLE *******************
 585
 586
 587   printBytes("src",source,sourceLimit-source);
 588
 589   while(source < sourceLimit)
 590   {
 591     puts("");
 592     target = ucnv_getNextUChar (conv,
 593                                 &source,
 594                                 sourceLimit,
 595                                 &status);
 596
 597     //    printBytes("src",source,sourceLimit-source);
 598     U_ASSERT(status);
 599     printUChar(target);
 600     dstCount++;
 601   }
 602
 603
 604   // ************************** END SAMPLE *************************
 605
 606   printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount);
 607   ucnv_close(conv);
 608
 609   return U_ZERO_ERROR;
 610 }
 611
 612
 613
 614
 615 UBool convsample_20_didSubstitute(const char *source)
 616 {
 617   UChar uchars[100];
 618   char bytes[100];
 619   UConverter *conv = NULL;
 620   UErrorCode status = U_ZERO_ERROR;
 621   uint32_t len, len2;
 622   UBool  flagVal;
 623
 624   FromUFLAGContext * context = NULL;
 625
 626   printf("\n\n==============================================\n"
 627          "Sample 20: C: Test for substitution using callbacks\n");
 628
 629   /* print out the original source */
 630   printBytes("src", source);
 631   printf("\n");
 632
 633   /* First, convert from UTF8 to unicode */
 634   conv = ucnv_open("utf-8", &status);
 635   U_ASSERT(status);
 636
 637   len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
 638   U_ASSERT(status);
 639
 640   printUChars("uch", uchars, len);
 641   printf("\n");
 642
 643   /* Now, close the converter */
 644   ucnv_close(conv);
 645
 646   /* Now, convert to windows-1252 */
 647   conv = ucnv_open("windows-1252", &status);
 648   U_ASSERT(status);
 649
 650   /* Converter starts out with the SUBSTITUTE callback set. */
 651
 652   /* initialize our callback */
 653   context = flagCB_fromU_openContext();
 654
 655   /* Set our special callback */
 656   ucnv_setFromUCallBack(conv,
 657                         flagCB_fromU,
 658                         context,
 659                         &(context->subCallback),
 660                         &(context->subContext),
 661                         &status);
 662
 663   U_ASSERT(status);
 664
 665   len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status);
 666   U_ASSERT(status);
 667
 668   flagVal = context->flag;  /* it's about to go away when we close the cnv */
 669
 670   ucnv_close(conv);
 671
 672   /* print out the original source */
 673   printBytes("bytes", bytes, len2);
 674
 675   return flagVal; /* true if callback was called */
 676 }
 677
 678 UErrorCode convsample_20()
 679 {
 680   const char *sample1 = "abc\xdf\xbf";
 681   const char *sample2 = "abc_def";
 682
 683
 684   if(convsample_20_didSubstitute(sample1))
 685   {
 686     printf("DID substitute.\n******\n");
 687   }
 688   else
 689   {
 690     printf("Did NOT substitute.\n*****\n");
 691   }
 692
 693   if(convsample_20_didSubstitute(sample2))
 694   {
 695     printf("DID substitute.\n******\n");
 696   }
 697   else
 698   {
 699     printf("Did NOT substitute.\n*****\n");
 700   }
 701
 702   return U_ZERO_ERROR;
 703 }
 704
 705 // 21  - C, callback, with clone and debug
 706
 707
 708
 709 UBool convsample_21_didSubstitute(const char *source)
 710 {
 711   UChar uchars[100];
 712   char bytes[100];
 713   UConverter *conv = NULL, *cloneCnv = NULL;
 714   UErrorCode status = U_ZERO_ERROR;
 715   uint32_t len, len2;
 716   int32_t  cloneLen;
 717   UBool  flagVal = FALSE;
 718   UConverterFromUCallback junkCB;
 719
 720   FromUFLAGContext *flagCtx = NULL,
 721                    *cloneFlagCtx = NULL;
 722
 723   debugCBContext   *debugCtx1 = NULL,
 724                    *debugCtx2 = NULL,
 725                    *cloneDebugCtx = NULL;
 726
 727   printf("\n\n==============================================\n"
 728          "Sample 21: C: Test for substitution w/ callbacks & clones \n");
 729
 730   /* print out the original source */
 731   printBytes("src", source);
 732   printf("\n");
 733
 734   /* First, convert from UTF8 to unicode */
 735   conv = ucnv_open("utf-8", &status);
 736   U_ASSERT(status);
 737
 738   len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
 739   U_ASSERT(status);
 740
 741   printUChars("uch", uchars, len);
 742   printf("\n");
 743
 744   /* Now, close the converter */
 745   ucnv_close(conv);
 746
 747   /* Now, convert to windows-1252 */
 748   conv = ucnv_open("windows-1252", &status);
 749   U_ASSERT(status);
 750
 751   /* Converter starts out with the SUBSTITUTE callback set. */
 752
 753   /* initialize our callback */
 754   /* from the 'bottom' innermost, out
 755    *   CNV ->  debugCtx1[debug]  ->  flagCtx[flag] -> debugCtx2[debug]  */
 756
 757 #if DEBUG_TMI
 758   printf("flagCB_fromU = %p\n", &flagCB_fromU);
 759   printf("debugCB_fromU = %p\n", &debugCB_fromU);
 760 #endif
 761
 762   debugCtx1 = debugCB_openContext();
 763    flagCtx  = flagCB_fromU_openContext();
 764   debugCtx2 = debugCB_openContext();
 765
 766   debugCtx1->subCallback =  flagCB_fromU;  /* debug1 -> flag */
 767   debugCtx1->subContext  =  flagCtx;
 768
 769   flagCtx->subCallback   =  debugCB_fromU; /*  flag -> debug2 */
 770   flagCtx->subContext    =  debugCtx2;
 771
 772   debugCtx2->subCallback =  UCNV_FROM_U_CALLBACK_SUBSTITUTE;
 773   debugCtx2->subContext  = NULL;
 774
 775   /* Set our special callback */
 776
 777   ucnv_setFromUCallBack(conv,
 778                         debugCB_fromU,
 779                         debugCtx1,
 780                         &(debugCtx2->subCallback),
 781                         &(debugCtx2->subContext),
 782                         &status);
 783
 784   U_ASSERT(status);
 785
 786 #if DEBUG_TMI
 787   printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n",
 788          conv, debugCtx1, debugCtx1->subCallback,
 789          debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback);
 790 #endif
 791
 792   cloneLen = 1; /* but passing in null so it will clone */
 793   cloneCnv = ucnv_safeClone(conv,  NULL,  &cloneLen, &status);
 794
 795   U_ASSERT(status);
 796
 797 #if DEBUG_TMI
 798   printf("Cloned converter from %p -> %p.  Closing %p.\n", conv, cloneCnv, conv);
 799 #endif
 800
 801   ucnv_close(conv);
 802
 803 #if DEBUG_TMI
 804   printf("%p closed.\n", conv);
 805 #endif
 806
 807   U_ASSERT(status);
 808   /* Now, we have to extract the context */
 809   cloneDebugCtx = NULL;
 810   cloneFlagCtx  = NULL;
 811
 812   ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx);
 813   if(cloneDebugCtx != NULL) {
 814       cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext;
 815   }
 816
 817   printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n",
 818          cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL );
 819
 820   len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status);
 821   U_ASSERT(status);
 822
 823   if(cloneFlagCtx != NULL) {
 824       flagVal = cloneFlagCtx->flag;  /* it's about to go away when we close the cnv */
 825   } else {
 826       printf("** Warning, couldn't get the subcallback \n");
 827   }
 828
 829   ucnv_close(cloneCnv);
 830
 831   /* print out the original source */
 832   printBytes("bytes", bytes, len2);
 833
 834   return flagVal; /* true if callback was called */
 835 }
 836
 837 UErrorCode convsample_21()
 838 {
 839   const char *sample1 = "abc\xdf\xbf";
 840   const char *sample2 = "abc_def";
 841
 842   if(convsample_21_didSubstitute(sample1))
 843   {
 844     printf("DID substitute.\n******\n");
 845   }
 846   else
 847   {
 848     printf("Did NOT substitute.\n*****\n");
 849   }
 850
 851   if(convsample_21_didSubstitute(sample2))
 852   {
 853     printf("DID substitute.\n******\n");
 854   }
 855   else
 856   {
 857     printf("Did NOT substitute.\n*****\n");
 858   }
 859
 860   return U_ZERO_ERROR;
 861 }
 862
 863
 864 //  40-  C, cp37 -> UTF16 [data02.bin -> data40.utf16]
 865
 866 #define BUFFERSIZE 17 /* make it interesting :) */
 867
 868 UErrorCode convsample_40()
 869 {
 870   printf("\n\n==============================================\n"
 871     "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n");
 872
 873   FILE *f;
 874   FILE *out;
 875   int32_t count;
 876   char inBuf[BUFFERSIZE];
 877   const char *source;
 878   const char *sourceLimit;
 879   UChar *uBuf;
 880   UChar *target;
 881   UChar *targetLimit;
 882   int32_t uBufSize = 0;
 883   UConverter *conv = NULL;
 884   UErrorCode status = U_ZERO_ERROR;
 885   uint32_t inbytes=0, total=0;
 886
 887   f = fopen("data02.bin", "rb");
 888   if(!f)
 889   {
 890     fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n");
 891     return U_FILE_ACCESS_ERROR;
 892   }
 893
 894   out = fopen("data40.utf16", "wb");
 895   if(!out)
 896   {
 897     fprintf(stderr, "Couldn't create file 'data40.utf16'.\n");
 898     return U_FILE_ACCESS_ERROR;
 899   }
 900
 901   // **************************** START SAMPLE *******************
 902   conv = ucnv_openCCSID(37, UCNV_IBM, &status);
 903   assert(U_SUCCESS(status));
 904
 905   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
 906   printf("input bytes %d / min chars %d = %d UChars\n",
 907          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
 908   uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
 909   assert(uBuf!=NULL);
 910
 911   // grab another buffer's worth
 912   while((!feof(f)) &&
 913         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
 914   {
 915     inbytes += count;
 916
 917     // Convert bytes to unicode
 918     source = inBuf;
 919     sourceLimit = inBuf + count;
 920
 921     do
 922     {
 923         target = uBuf;
 924         targetLimit = uBuf + uBufSize;
 925
 926         ucnv_toUnicode( conv, &target, targetLimit,
 927                        &source, sourceLimit, NULL,
 928                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
 929                                    /* is true (when no more data will come) */
 930                          &status);
 931
 932         if(status == U_BUFFER_OVERFLOW_ERROR)
 933         {
 934           // simply ran out of space - we'll reset the target ptr the next
 935           // time through the loop.
 936           status = U_ZERO_ERROR;
 937         }
 938         else
 939         {
 940           //  Check other errors here.
 941           assert(U_SUCCESS(status));
 942           // Break out of the loop (by force)
 943         }
 944
 945         // Process the Unicode
 946         // Todo: handle UTF-16/surrogates
 947         assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) ==
 948                (size_t)(target-uBuf));
 949         total += (target-uBuf);
 950     } while (source < sourceLimit); // while simply out of space
 951   }
 952
 953   printf("%d bytes in,  %d UChars out.\n", inbytes, total);
 954
 955   // ***************************** END SAMPLE ********************
 956   ucnv_close(conv);
 957
 958   fclose(f);
 959   fclose(out);
 960   printf("\n");
 961
 962   return U_ZERO_ERROR;
 963 }
 964 #undef BUFFERSIZE
 965
 966
 967
 968 //  46-  C, UTF16 -> latin2 [data40.utf16 -> data46.out]
 969
 970 #define BUFFERSIZE 24 /* make it interesting :) */
 971
 972 UErrorCode convsample_46()
 973 {
 974   printf("\n\n==============================================\n"
 975     "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n");
 976
 977   FILE *f;
 978   FILE *out;
 979   int32_t count;
 980   UChar inBuf[BUFFERSIZE];
 981   const UChar *source;
 982   const UChar *sourceLimit;
 983   char *buf;
 984   char *target;
 985   char *targetLimit;
 986
 987   int32_t bufSize = 0;
 988   UConverter *conv = NULL;
 989   UErrorCode status = U_ZERO_ERROR;
 990   uint32_t inchars=0, total=0;
 991
 992   f = fopen("data40.utf16", "rb");
 993   if(!f)
 994   {
 995     fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n");
 996     return U_FILE_ACCESS_ERROR;
 997   }
 998
 999   out = fopen("data46.out", "wb");
1000   if(!out)
1001   {
1002     fprintf(stderr, "Couldn't create file 'data46.out'.\n");
1003     return U_FILE_ACCESS_ERROR;
1004   }
1005
1006   // **************************** START SAMPLE *******************
1007   conv = ucnv_open( "iso-8859-2", &status);
1008   assert(U_SUCCESS(status));
1009
1010   bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv));
1011   printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n",
1012          BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize);
1013   buf = (char*)malloc(bufSize * sizeof(char));
1014   assert(buf!=NULL);
1015
1016   // grab another buffer's worth
1017   while((!feof(f)) &&
1018         ((count=fread(inBuf, sizeof(UChar), BUFFERSIZE , f)) > 0) )
1019   {
1020     inchars += count;
1021
1022     // Convert bytes to unicode
1023     source = inBuf;
1024     sourceLimit = inBuf + count;
1025
1026     do
1027     {
1028         target = buf;
1029         targetLimit = buf + bufSize;
1030
1031         ucnv_fromUnicode( conv, &target, targetLimit,
1032                        &source, sourceLimit, NULL,
1033                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
1034                                    /* is true (when no more data will come) */
1035                          &status);
1036
1037         if(status == U_BUFFER_OVERFLOW_ERROR)
1038         {
1039           // simply ran out of space - we'll reset the target ptr the next
1040           // time through the loop.
1041           status = U_ZERO_ERROR;
1042         }
1043         else
1044         {
1045           //  Check other errors here.
1046           assert(U_SUCCESS(status));
1047           // Break out of the loop (by force)
1048         }
1049
1050         // Process the Unicode
1051         assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) ==
1052                (size_t)(target-buf));
1053         total += (target-buf);
1054     } while (source < sourceLimit); // while simply out of space
1055   }
1056
1057   printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, inchars * sizeof(UChar), total);
1058
1059   // ***************************** END SAMPLE ********************
1060   ucnv_close(conv);
1061
1062   fclose(f);
1063   fclose(out);
1064   printf("\n");
1065
1066   return U_ZERO_ERROR;
1067 }
1068 #undef BUFFERSIZE
1069
1070 #define BUFFERSIZE 219
1071
1072
1073 /* main */
1074
1075 int main()
1076 {
1077
1078   printf("Default Converter=%s\n", ucnv_getDefaultName() );
1079
1080   convsample_02();  // C  , u->koi8r, conv
1081   convsample_03();  // C,   iterate
1082
1083   convsample_05();  // C,  utf8->u, getNextUChar
1084   convsample_06(); // C freq counter thingy
1085
1086   convsample_12();  // C,  sjis->u, conv
1087   convsample_13();  // C,  big5->u, getNextU
1088
1089   convsample_20();  // C, callback
1090   convsample_21();  // C, callback debug
1091
1092   convsample_40();  // C,   cp37 -> UTF16 [data02.bin -> data40.utf16]
1093
1094   convsample_46();  // C,  UTF16 -> latin3 [data41.utf16 -> data46.out]
1095
1096   printf("End of converter samples.\n");
1097
1098   fflush(stdout);
1099   fflush(stderr);
1100
1101   return 0;
1102 }