icuSources/samples/ucnv/convsamp.cpp

   1 /**************************************************************************
   2 *
   3 *   Copyright (C) 2000-2010, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 *
   6 ***************************************************************************
   7 *   file name:  convsamp.c
   8 *   encoding:   ASCII (7-bit)
   9 *
  10 *   created on: 2000may30
  11 *   created by: Steven R. Loomis
  12 *
  13 *   Sample code for the ICU conversion routines.
  14 *
  15 * Note: Nothing special is needed to build this sample. Link with
  16 *       the icu UC and icu I18N libraries.
  17 *
  18 *       I use 'assert' for error checking, you probably will want
  19 *       something more flexible.  '***BEGIN SAMPLE***' and
  20 *       '***END SAMPLE***' mark pieces suitable for stand alone
  21 *       code snippets.
  22 *
  23 *
   24 *  Each test can define its own BUFFERSIZE
  25 *
  26 */
  27
  28 #define DEBUG_TMI 0  /* define to 1 to enable Too Much Information */
  29
  30 #include <stdio.h>
  31 #include <ctype.h>            /* for isspace, etc.    */
  32 #include <assert.h>
  33 #include <string.h>
  34 #include <stdlib.h>  /* malloc */
  35
  36 #include "unicode/utypes.h"   /* Basic ICU data types */
  37 #include "unicode/ucnv.h"     /* C   Converter API    */
  38 #include "unicode/ustring.h"  /* some more string fcns*/
  39 #include "unicode/uchar.h"    /* char names           */
  40 #include "unicode/uloc.h"
  41 #include "unicode/unistr.h"
  42
  43 #include "flagcb.h"
  44
  45 /* Some utility functions */
  46
  47 static const UChar kNone[] = { 0x0000 };
  48
  49 #define U_ASSERT(x)  { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }}
  50
  51 /* Print a UChar if possible, in seven characters. */
  52 void prettyPrintUChar(UChar c)
  53 {
  54   if(  (c <= 0x007F) &&
  55        (isgraph(c))  ) {
  56     printf(" '%c'   ", (char)(0x00FF&c));
  57   } else if ( c > 0x007F ) {
  58     char buf[1000];
  59     UErrorCode status = U_ZERO_ERROR;
  60     int32_t o;
  61
  62     o = u_charName(c, U_UNICODE_CHAR_NAME, buf, 1000, &status);
  63     if(U_SUCCESS(status) && (o>0) ) {
  64       buf[6] = 0;
  65       printf("%7s", buf);
  66     } else {
  67       o = u_charName(c, U_UNICODE_10_CHAR_NAME, buf, 1000, &status);
  68       if(U_SUCCESS(status) && (o>0)) {
  69         buf[5] = 0;
  70         printf("~%6s", buf);
  71       }
  72       else {
  73         printf(" ??????");
  74       }
  75     }
  76   } else {
  77     switch((char)(c & 0x007F)) {
  78     case ' ':
  79       printf(" ' '   ");
  80       break;
  81     case '\t':
  82       printf(" \\t    ");
  83       break;
  84     case '\n':
  85       printf(" \\n    ");
  86       break;
  87     default:
  88       printf("  _    ");
  89       break;
  90     }
  91   }
  92 }
  93
  94
  95 void printUChars(const char  *name = "?",
  96                  const UChar *uch  = kNone,
  97                  int32_t     len   = -1 )
  98 {
  99   int32_t i;
 100
 101   if( (len == -1) && (uch) ) {
 102     len = u_strlen(uch);
 103   }
 104
 105   printf("%5s: ", name);
 106   for( i = 0; i <len; i++) {
 107     printf("%-6d ", i);
 108   }
 109   printf("\n");
 110
 111   printf("%5s: ", "uni");
 112   for( i = 0; i <len; i++) {
 113     printf("\\u%04X ", (int)uch[i]);
 114   }
 115   printf("\n");
 116
 117   printf("%5s:", "ch");
 118   for( i = 0; i <len; i++) {
 119     prettyPrintUChar(uch[i]);
 120   }
 121   printf("\n");
 122 }
 123
 124 void printBytes(const char  *name = "?",
 125                  const char *uch  = "",
 126                  int32_t     len   = -1 )
 127 {
 128   int32_t i;
 129
 130   if( (len == -1) && (uch) ) {
 131     len = strlen(uch);
 132   }
 133
 134   printf("%5s: ", name);
 135   for( i = 0; i <len; i++) {
 136     printf("%-4d ", i);
 137   }
 138   printf("\n");
 139
 140   printf("%5s: ", "uni");
 141   for( i = 0; i <len; i++) {
 142     printf("\\x%02X ", 0x00FF & (int)uch[i]);
 143   }
 144   printf("\n");
 145
 146   printf("%5s:", "ch");
 147   for( i = 0; i <len; i++) {
 148     if(isgraph(0x00FF & (int)uch[i])) {
 149       printf(" '%c' ", (char)uch[i]);
 150     } else {
 151       printf("     ");
 152     }
 153   }
 154   printf("\n");
 155 }
 156
 157 void printUChar(UChar32 ch32)
 158 {
 159     if(ch32 > 0xFFFF) {
 160       printf("ch: U+%06X\n", ch32);
 161     }
 162     else {
 163       UChar ch = (UChar)ch32;
 164       printUChars("C", &ch, 1);
 165     }
 166 }
 167
 168 /*******************************************************************
 169   Very simple C sample to convert the word 'Moscow' in Russian in Unicode,
 170   followed by an exclamation mark (!) into the KOI8-R Russian code page.
 171
 172   This example first creates a UChar String out of the Unicode chars.
 173
 174   targetSize must be set to the amount of space available in the target
 175   buffer. After fromUChars is called,
 176   len will contain the number of bytes in target[] which were
 177   used in the resulting codepage.  In this case, there is a 1:1 mapping
 178   between the input and output characters. The exclamation mark has the
 179   same value in both KOI8-R and Unicode.
 180
 181   src: 0      1      2      3      4      5      6
 182   uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021
 183    ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL   '!'
 184
 185  targ:  0    1    2    3    4    5    6
 186   uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21
 187    ch:                                '!'
 188
 189
 190 Converting FROM unicode
 191   to koi8-r.
 192   You must call ucnv_close to clean up the memory used by the
 193   converter.
 194
 195   'len' returns the number of OUTPUT bytes resulting from the
 196   conversion.
 197  */
 198
 199 UErrorCode convsample_02()
 200 {
 201   printf("\n\n==============================================\n"
 202          "Sample 02: C: simple Unicode -> koi8-r conversion\n");
 203
 204
 205   // **************************** START SAMPLE *******************
 206   // "cat<cat>OK"
 207   UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432,
 208                      0x0430, 0x0021, 0x0000 };
 209   char target[100];
 210   UErrorCode status = U_ZERO_ERROR;
 211   UConverter *conv;
 212   int32_t     len;
 213
 214   // set up the converter
 215   conv = ucnv_open("koi8-r", &status);
 216   assert(U_SUCCESS(status));
 217
 218   // convert to koi8-r
 219   len = ucnv_fromUChars(conv, target, 100, source, -1, &status);
 220   assert(U_SUCCESS(status));
 221
 222   // close the converter
 223   ucnv_close(conv);
 224
 225   // ***************************** END SAMPLE ********************
 226
 227   // Print it out
 228   printUChars("src", source);
 229   printf("\n");
 230   printBytes("targ", target, len);
 231
 232   return U_ZERO_ERROR;
 233 }
 234
 235
 236 UErrorCode convsample_03()
 237 {
 238   printf("\n\n==============================================\n"
 239          "Sample 03: C: print out all converters\n");
 240
 241   int32_t count;
 242   int32_t i;
 243
 244   // **************************** START SAMPLE *******************
 245   count = ucnv_countAvailable();
 246   printf("Available converters: %d\n", count);
 247
 248   for(i=0;i<count;i++)
 249   {
 250     printf("%s ", ucnv_getAvailableName(i));
 251   }
 252
 253   // ***************************** END SAMPLE ********************
 254
 255   printf("\n");
 256
 257   return U_ZERO_ERROR;
 258 }
 259
 260
 261
 262 #define BUFFERSIZE 17 /* make it interesting :) */
 263
 264 /*
 265   Converting from a codepage to Unicode in bulk..
 266   What is the best way to determine the buffer size?
 267
 268      The 'buffersize' is in bytes of input.
  269     For a given converter, dividing this by the minimum char size
  270     gives you the maximum number of Unicode characters that could be
  271     expected for a given number of input bytes.
 272      see: ucnv_getMinCharSize()
 273
 274      For example, a single byte codepage like 'Latin-3' has a
 275     minimum char size of 1. (It takes at least 1 byte to represent
 276     each Unicode char.) So the unicode buffer has the same number of
 277     UChars as the input buffer has bytes.
 278
 279      In a strictly double byte codepage such as cp1362 (Windows
 280     Korean), the minimum char size is 2. So, only half as many Unicode
 281     chars as bytes are needed.
 282
 283      This work to calculate the buffer size is an optimization. Any
 284     size of input and output buffer can be used, as long as the
 285     program handles the following cases: If the input buffer is empty,
 286     the source pointer will be equal to sourceLimit.  If the output
 287     buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned.
 288  */
 289
 290 UErrorCode convsample_05()
 291 {
 292   printf("\n\n==============================================\n"
 293          "Sample 05: C: count the number of letters in a UTF-8 document\n");
 294
 295   FILE *f;
 296   int32_t count;
 297   char inBuf[BUFFERSIZE];
 298   const char *source;
 299   const char *sourceLimit;
 300   UChar *uBuf;
 301   UChar *target;
 302   UChar *targetLimit;
 303   UChar *p;
 304   int32_t uBufSize = 0;
 305   UConverter *conv;
 306   UErrorCode status = U_ZERO_ERROR;
 307   uint32_t letters=0, total=0;
 308
 309   f = fopen("data01.txt", "r");
 310   if(!f)
 311   {
 312     fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n");
 313     return U_FILE_ACCESS_ERROR;
 314   }
 315
 316   // **************************** START SAMPLE *******************
 317   conv = ucnv_open("utf-8", &status);
 318   assert(U_SUCCESS(status));
 319
 320   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
 321   printf("input bytes %d / min chars %d = %d UChars\n",
 322          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
 323   uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
 324   assert(uBuf!=NULL);
 325
 326   // grab another buffer's worth
 327   while((!feof(f)) &&
 328         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
 329   {
 330     // Convert bytes to unicode
 331     source = inBuf;
 332     sourceLimit = inBuf + count;
 333
 334     do
 335     {
 336         target = uBuf;
 337         targetLimit = uBuf + uBufSize;
 338
 339         ucnv_toUnicode(conv, &target, targetLimit,
 340                        &source, sourceLimit, NULL,
 341                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
 342                                    /* is true (when no more data will come) */
 343                        &status);
 344
 345         if(status == U_BUFFER_OVERFLOW_ERROR)
 346         {
 347           // simply ran out of space - we'll reset the target ptr the next
 348           // time through the loop.
 349           status = U_ZERO_ERROR;
 350         }
 351         else
 352         {
 353           //  Check other errors here.
 354           assert(U_SUCCESS(status));
 355           // Break out of the loop (by force)
 356         }
 357
 358         // Process the Unicode
 359         // Todo: handle UTF-16/surrogates
 360
 361         for(p = uBuf; p<target; p++)
 362         {
 363           if(u_isalpha(*p))
 364             letters++;
 365           total++;
 366         }
 367     } while (source < sourceLimit); // while simply out of space
 368   }
 369
 370   printf("%d letters out of %d total UChars.\n", letters, total);
 371
 372   // ***************************** END SAMPLE ********************
 373   ucnv_close(conv);
 374
 375   printf("\n");
 376
 377   fclose(f);
 378
 379   return U_ZERO_ERROR;
 380 }
 381 #undef BUFFERSIZE
 382
 383 #define BUFFERSIZE 1024
 384 typedef struct
 385 {
 386   UChar32  codepoint;
 387   uint32_t frequency;
 388 } CharFreqInfo;
 389
 390 UErrorCode convsample_06()
 391 {
 392   printf("\n\n==============================================\n"
 393          "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");
 394
 395   FILE *f;
 396   int32_t count;
 397   char inBuf[BUFFERSIZE];
 398   const char *source;
 399   const char *sourceLimit;
 400   UChar *uBuf;
 401   int32_t uBufSize = 0;
 402   UConverter *conv;
 403   UErrorCode status = U_ZERO_ERROR;
 404   uint32_t letters=0, total=0;
 405
 406   CharFreqInfo   *info;
 407   UChar32   charCount = 0x10000;  /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
 408   UChar32   p;
 409
 410   uint32_t ie = 0;
 411   uint32_t gh = 0;
 412   UChar32 l = 0;
 413
 414   f = fopen("data06.txt", "r");
 415   if(!f)
 416   {
 417     fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
 418     return U_FILE_ACCESS_ERROR;
 419   }
 420
 421   info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount);
 422   if(!info)
 423   {
 424     fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount);
 425   }
 426
 427   /* reset frequencies */
 428   for(p=0;p<charCount;p++)
 429   {
 430     info[p].codepoint = p;
 431     info[p].frequency = 0;
 432   }
 433
 434   // **************************** START SAMPLE *******************
 435   conv = ucnv_open("utf-8", &status);
 436   assert(U_SUCCESS(status));
 437
 438   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
 439   printf("input bytes %d / min chars %d = %d UChars\n",
 440          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
 441   uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
 442   assert(uBuf!=NULL);
 443
 444   // grab another buffer's worth
 445   while((!feof(f)) &&
 446         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
 447   {
 448     // Convert bytes to unicode
 449     source = inBuf;
 450     sourceLimit = inBuf + count;
 451
 452     while(source < sourceLimit)
 453     {
 454       p = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
 455       if(U_FAILURE(status))
 456       {
 457         fprintf(stderr, "%s @ %d\n", u_errorName(status), total);
 458         status = U_ZERO_ERROR;
 459         continue;
 460       }
 461       U_ASSERT(status);
 462       total++;
 463
 464       if(u_isalpha(p))
 465         letters++;
 466
 467       if((u_tolower(l) == 'i') && (u_tolower(p) == 'e'))
 468         ie++;
 469
 470       if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127))
 471         gh++;
 472
 473       if(p>charCount)
 474       {
 475         fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p);
 476         return U_UNSUPPORTED_ERROR;
 477       }
 478       info[p].frequency++;
 479       l = p;
 480     }
 481   }
 482
 483   fclose(f);
 484   ucnv_close(conv);
 485
 486   printf("%d letters out of %d total UChars.\n", letters, total);
 487   printf("%d ie digraphs, %d gh digraphs.\n", ie, gh);
 488
 489   // now, we could sort it..
 490
 491   //  qsort(info, charCount, sizeof(info[0]), charfreq_compare);
 492
 493   for(p=0;p<charCount;p++)
 494   {
 495     if(info[p].frequency)
 496     {
 497       printf("% 5d U+%06X ", info[p].frequency, p);
 498       if(p <= 0xFFFF)
 499       {
 500         prettyPrintUChar((UChar)p);
 501       }
 502       printf("\n");
 503     }
 504   }
 505   free(info);
 506   // ***************************** END SAMPLE ********************
 507
 508   printf("\n");
 509
 510   return U_ZERO_ERROR;
 511 }
 512 #undef BUFFERSIZE
 513
 514
 515 /******************************************************
 516   You must call ucnv_close to clean up the memory used by the
 517   converter.
 518
  519   'len' returns the number of OUTPUT UChars resulting from the
  520   conversion.
 521  */
 522
 523 UErrorCode convsample_12()
 524 {
 525   printf("\n\n==============================================\n"
 526          "Sample 12: C: simple sjis -> unicode conversion\n");
 527
 528
 529   // **************************** START SAMPLE *******************
 530
 531   char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 };
 532   UChar target[100];
 533   UErrorCode status = U_ZERO_ERROR;
 534   UConverter *conv;
 535   int32_t     len;
 536
 537   // set up the converter
 538   conv = ucnv_open("shift_jis", &status);
 539   assert(U_SUCCESS(status));
 540
 541   // convert to Unicode
 542   // Note: we can use strlen, we know it's an 8 bit null terminated codepage
 543   target[6] = 0xFDCA;
 544   len = ucnv_toUChars(conv, target, 100, source, strlen(source), &status);
 545   U_ASSERT(status);
 546   // close the converter
 547   ucnv_close(conv);
 548
 549   // ***************************** END SAMPLE ********************
 550
 551   // Print it out
 552   printBytes("src", source, strlen(source) );
 553   printf("\n");
 554   printUChars("targ", target, len);
 555
 556   return U_ZERO_ERROR;
 557 }
 558
 559 /******************************************************************
 560    C: Convert from codepage to Unicode one at a time.
 561 */
 562
 563 UErrorCode convsample_13()
 564 {
 565   printf("\n\n==============================================\n"
 566          "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n");
 567
 568
 569   const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e };
 570   //  const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e };
 571   const char *source, *sourceLimit;
 572   UChar32 target;
 573   UErrorCode status = U_ZERO_ERROR;
 574   UConverter *conv = NULL;
 575   int32_t srcCount=0;
 576   int32_t dstCount=0;
 577
 578   srcCount = sizeof(sourceChars);
 579
 580   conv = ucnv_open("Big5", &status);
 581   U_ASSERT(status);
 582
 583   source = sourceChars;
 584   sourceLimit = sourceChars + sizeof(sourceChars);
 585
 586   // **************************** START SAMPLE *******************
 587
 588
 589   printBytes("src",source,sourceLimit-source);
 590
 591   while(source < sourceLimit)
 592   {
 593     puts("");
 594     target = ucnv_getNextUChar (conv,
 595                                 &source,
 596                                 sourceLimit,
 597                                 &status);
 598
 599     //    printBytes("src",source,sourceLimit-source);
 600     U_ASSERT(status);
 601     printUChar(target);
 602     dstCount++;
 603   }
 604
 605
 606   // ************************** END SAMPLE *************************
 607
 608   printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount);
 609   ucnv_close(conv);
 610
 611   return U_ZERO_ERROR;
 612 }
 613
 614
 615
 616
 617 UBool convsample_20_didSubstitute(const char *source)
 618 {
 619   UChar uchars[100];
 620   char bytes[100];
 621   UConverter *conv = NULL;
 622   UErrorCode status = U_ZERO_ERROR;
 623   uint32_t len, len2;
 624   UBool  flagVal;
 625
 626   FromUFLAGContext * context = NULL;
 627
 628   printf("\n\n==============================================\n"
 629          "Sample 20: C: Test for substitution using callbacks\n");
 630
 631   /* print out the original source */
 632   printBytes("src", source);
 633   printf("\n");
 634
 635   /* First, convert from UTF8 to unicode */
 636   conv = ucnv_open("utf-8", &status);
 637   U_ASSERT(status);
 638
 639   len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
 640   U_ASSERT(status);
 641
 642   printUChars("uch", uchars, len);
 643   printf("\n");
 644
 645   /* Now, close the converter */
 646   ucnv_close(conv);
 647
 648   /* Now, convert to windows-1252 */
 649   conv = ucnv_open("windows-1252", &status);
 650   U_ASSERT(status);
 651
 652   /* Converter starts out with the SUBSTITUTE callback set. */
 653
 654   /* initialize our callback */
 655   context = flagCB_fromU_openContext();
 656
 657   /* Set our special callback */
 658   ucnv_setFromUCallBack(conv,
 659                         flagCB_fromU,
 660                         context,
 661                         &(context->subCallback),
 662                         &(context->subContext),
 663                         &status);
 664
 665   U_ASSERT(status);
 666
 667   len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status);
 668   U_ASSERT(status);
 669
 670   flagVal = context->flag;  /* it's about to go away when we close the cnv */
 671
 672   ucnv_close(conv);
 673
 674   /* print out the original source */
 675   printBytes("bytes", bytes, len2);
 676
 677   return flagVal; /* true if callback was called */
 678 }
 679
 680 UErrorCode convsample_20()
 681 {
 682   const char *sample1 = "abc\xdf\xbf";
 683   const char *sample2 = "abc_def";
 684
 685
 686   if(convsample_20_didSubstitute(sample1))
 687   {
 688     printf("DID substitute.\n******\n");
 689   }
 690   else
 691   {
 692     printf("Did NOT substitute.\n*****\n");
 693   }
 694
 695   if(convsample_20_didSubstitute(sample2))
 696   {
 697     printf("DID substitute.\n******\n");
 698   }
 699   else
 700   {
 701     printf("Did NOT substitute.\n*****\n");
 702   }
 703
 704   return U_ZERO_ERROR;
 705 }
 706
 707 // 21  - C, callback, with clone and debug
 708
 709
 710
 711 UBool convsample_21_didSubstitute(const char *source)
 712 {
 713   UChar uchars[100];
 714   char bytes[100];
 715   UConverter *conv = NULL, *cloneCnv = NULL;
 716   UErrorCode status = U_ZERO_ERROR;
 717   uint32_t len, len2;
 718   int32_t  cloneLen;
 719   UBool  flagVal = FALSE;
 720   UConverterFromUCallback junkCB;
 721
 722   FromUFLAGContext *flagCtx = NULL,
 723                    *cloneFlagCtx = NULL;
 724
 725   debugCBContext   *debugCtx1 = NULL,
 726                    *debugCtx2 = NULL,
 727                    *cloneDebugCtx = NULL;
 728
 729   printf("\n\n==============================================\n"
 730          "Sample 21: C: Test for substitution w/ callbacks & clones \n");
 731
 732   /* print out the original source */
 733   printBytes("src", source);
 734   printf("\n");
 735
 736   /* First, convert from UTF8 to unicode */
 737   conv = ucnv_open("utf-8", &status);
 738   U_ASSERT(status);
 739
 740   len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
 741   U_ASSERT(status);
 742
 743   printUChars("uch", uchars, len);
 744   printf("\n");
 745
 746   /* Now, close the converter */
 747   ucnv_close(conv);
 748
 749   /* Now, convert to windows-1252 */
 750   conv = ucnv_open("windows-1252", &status);
 751   U_ASSERT(status);
 752
 753   /* Converter starts out with the SUBSTITUTE callback set. */
 754
 755   /* initialize our callback */
 756   /* from the 'bottom' innermost, out
 757    *   CNV ->  debugCtx1[debug]  ->  flagCtx[flag] -> debugCtx2[debug]  */
 758
 759 #if DEBUG_TMI
 760   printf("flagCB_fromU = %p\n", &flagCB_fromU);
 761   printf("debugCB_fromU = %p\n", &debugCB_fromU);
 762 #endif
 763
 764   debugCtx1 = debugCB_openContext();
 765    flagCtx  = flagCB_fromU_openContext();
 766   debugCtx2 = debugCB_openContext();
 767
 768   debugCtx1->subCallback =  flagCB_fromU;  /* debug1 -> flag */
 769   debugCtx1->subContext  =  flagCtx;
 770
 771   flagCtx->subCallback   =  debugCB_fromU; /*  flag -> debug2 */
 772   flagCtx->subContext    =  debugCtx2;
 773
 774   debugCtx2->subCallback =  UCNV_FROM_U_CALLBACK_SUBSTITUTE;
 775   debugCtx2->subContext  = NULL;
 776
 777   /* Set our special callback */
 778
 779   ucnv_setFromUCallBack(conv,
 780                         debugCB_fromU,
 781                         debugCtx1,
 782                         &(debugCtx2->subCallback),
 783                         &(debugCtx2->subContext),
 784                         &status);
 785
 786   U_ASSERT(status);
 787
 788 #if DEBUG_TMI
 789   printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n",
 790          conv, debugCtx1, debugCtx1->subCallback,
 791          debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback);
 792 #endif
 793
 794   cloneLen = 1; /* but passing in null so it will clone */
 795   cloneCnv = ucnv_safeClone(conv,  NULL,  &cloneLen, &status);
 796
 797   U_ASSERT(status);
 798
 799 #if DEBUG_TMI
 800   printf("Cloned converter from %p -> %p.  Closing %p.\n", conv, cloneCnv, conv);
 801 #endif
 802
 803   ucnv_close(conv);
 804
 805 #if DEBUG_TMI
 806   printf("%p closed.\n", conv);
 807 #endif
 808
 809   U_ASSERT(status);
 810   /* Now, we have to extract the context */
 811   cloneDebugCtx = NULL;
 812   cloneFlagCtx  = NULL;
 813
 814   ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx);
 815   if(cloneDebugCtx != NULL) {
 816       cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext;
 817   }
 818
 819   printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n",
 820          cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL );
 821
 822   len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status);
 823   U_ASSERT(status);
 824
 825   if(cloneFlagCtx != NULL) {
 826       flagVal = cloneFlagCtx->flag;  /* it's about to go away when we close the cnv */
 827   } else {
 828       printf("** Warning, couldn't get the subcallback \n");
 829   }
 830
 831   ucnv_close(cloneCnv);
 832
 833   /* print out the original source */
 834   printBytes("bytes", bytes, len2);
 835
 836   return flagVal; /* true if callback was called */
 837 }
 838
 839 UErrorCode convsample_21()
 840 {
 841   const char *sample1 = "abc\xdf\xbf";
 842   const char *sample2 = "abc_def";
 843
 844   if(convsample_21_didSubstitute(sample1))
 845   {
 846     printf("DID substitute.\n******\n");
 847   }
 848   else
 849   {
 850     printf("Did NOT substitute.\n*****\n");
 851   }
 852
 853   if(convsample_21_didSubstitute(sample2))
 854   {
 855     printf("DID substitute.\n******\n");
 856   }
 857   else
 858   {
 859     printf("Did NOT substitute.\n*****\n");
 860   }
 861
 862   return U_ZERO_ERROR;
 863 }
 864
 865
 866 //  40-  C, cp37 -> UTF16 [data02.bin -> data40.utf16]
 867
 868 #define BUFFERSIZE 17 /* make it interesting :) */
 869
 870 UErrorCode convsample_40()
 871 {
 872   printf("\n\n==============================================\n"
 873     "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n");
 874
 875   FILE *f;
 876   FILE *out;
 877   int32_t count;
 878   char inBuf[BUFFERSIZE];
 879   const char *source;
 880   const char *sourceLimit;
 881   UChar *uBuf;
 882   UChar *target;
 883   UChar *targetLimit;
 884   int32_t uBufSize = 0;
 885   UConverter *conv = NULL;
 886   UErrorCode status = U_ZERO_ERROR;
 887   uint32_t inbytes=0, total=0;
 888
 889   f = fopen("data02.bin", "rb");
 890   if(!f)
 891   {
 892     fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n");
 893     return U_FILE_ACCESS_ERROR;
 894   }
 895
 896   out = fopen("data40.utf16", "wb");
 897   if(!out)
 898   {
 899     fprintf(stderr, "Couldn't create file 'data40.utf16'.\n");
 900     fclose(f);
 901     return U_FILE_ACCESS_ERROR;
 902   }
 903
 904   // **************************** START SAMPLE *******************
 905   conv = ucnv_openCCSID(37, UCNV_IBM, &status);
 906   assert(U_SUCCESS(status));
 907
 908   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
 909   printf("input bytes %d / min chars %d = %d UChars\n",
 910          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
 911   uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
 912   assert(uBuf!=NULL);
 913
 914   // grab another buffer's worth
 915   while((!feof(f)) &&
 916         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
 917   {
 918     inbytes += count;
 919
 920     // Convert bytes to unicode
 921     source = inBuf;
 922     sourceLimit = inBuf + count;
 923
 924     do
 925     {
 926         target = uBuf;
 927         targetLimit = uBuf + uBufSize;
 928
 929         ucnv_toUnicode( conv, &target, targetLimit,
 930                        &source, sourceLimit, NULL,
 931                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
 932                                    /* is true (when no more data will come) */
 933                          &status);
 934
 935         if(status == U_BUFFER_OVERFLOW_ERROR)
 936         {
 937           // simply ran out of space - we'll reset the target ptr the next
 938           // time through the loop.
 939           status = U_ZERO_ERROR;
 940         }
 941         else
 942         {
 943           //  Check other errors here.
 944           assert(U_SUCCESS(status));
 945           // Break out of the loop (by force)
 946         }
 947
 948         // Process the Unicode
 949         // Todo: handle UTF-16/surrogates
 950         assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) ==
 951                (size_t)(target-uBuf));
 952         total += (target-uBuf);
 953     } while (source < sourceLimit); // while simply out of space
 954   }
 955
 956   printf("%d bytes in,  %d UChars out.\n", inbytes, total);
 957
 958   // ***************************** END SAMPLE ********************
 959   ucnv_close(conv);
 960
 961   fclose(f);
 962   fclose(out);
 963   printf("\n");
 964
 965   return U_ZERO_ERROR;
 966 }
 967 #undef BUFFERSIZE
 968
 969
 970
 971 //  46-  C, UTF16 -> latin2 [data40.utf16 -> data46.out]
 972
 973 #define BUFFERSIZE 24 /* make it interesting :) */
 974
 975 UErrorCode convsample_46()
 976 {
 977   printf("\n\n==============================================\n"
 978     "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n");
 979
 980   FILE *f;
 981   FILE *out;
 982   int32_t count;
 983   UChar inBuf[BUFFERSIZE];
 984   const UChar *source;
 985   const UChar *sourceLimit;
 986   char *buf;
 987   char *target;
 988   char *targetLimit;
 989
 990   int32_t bufSize = 0;
 991   UConverter *conv = NULL;
 992   UErrorCode status = U_ZERO_ERROR;
 993   uint32_t inchars=0, total=0;
 994
 995   f = fopen("data40.utf16", "rb");
 996   if(!f)
 997   {
 998     fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n");
 999     return U_FILE_ACCESS_ERROR;
1000   }
1001
1002   out = fopen("data46.out", "wb");
1003   if(!out)
1004   {
1005     fprintf(stderr, "Couldn't create file 'data46.out'.\n");
1006     fclose(f);
1007     return U_FILE_ACCESS_ERROR;
1008   }
1009
1010   // **************************** START SAMPLE *******************
1011   conv = ucnv_open( "iso-8859-2", &status);
1012   assert(U_SUCCESS(status));
1013
1014   bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv));
1015   printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n",
1016          BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize);
1017   buf = (char*)malloc(bufSize * sizeof(char));
1018   assert(buf!=NULL);
1019
1020   // grab another buffer's worth
1021   while((!feof(f)) &&
1022         ((count=fread(inBuf, sizeof(UChar), BUFFERSIZE , f)) > 0) )
1023   {
1024     inchars += count;
1025
1026     // Convert bytes to unicode
1027     source = inBuf;
1028     sourceLimit = inBuf + count;
1029
1030     do
1031     {
1032         target = buf;
1033         targetLimit = buf + bufSize;
1034
1035         ucnv_fromUnicode( conv, &target, targetLimit,
1036                        &source, sourceLimit, NULL,
1037                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
1038                                    /* is true (when no more data will come) */
1039                          &status);
1040
1041         if(status == U_BUFFER_OVERFLOW_ERROR)
1042         {
1043           // simply ran out of space - we'll reset the target ptr the next
1044           // time through the loop.
1045           status = U_ZERO_ERROR;
1046         }
1047         else
1048         {
1049           //  Check other errors here.
1050           assert(U_SUCCESS(status));
1051           // Break out of the loop (by force)
1052         }
1053
1054         // Process the Unicode
1055         assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) ==
1056                (size_t)(target-buf));
1057         total += (target-buf);
1058     } while (source < sourceLimit); // while simply out of space
1059   }
1060
1061   printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, inchars * sizeof(UChar), total);
1062
1063   // ***************************** END SAMPLE ********************
1064   ucnv_close(conv);
1065
1066   fclose(f);
1067   fclose(out);
1068   printf("\n");
1069
1070   return U_ZERO_ERROR;
1071 }
1072 #undef BUFFERSIZE
1073
1074 #define BUFFERSIZE 219
1075
1076
1077 /* main */
1078
1079 int main()
1080 {
1081
1082   printf("Default Converter=%s\n", ucnv_getDefaultName() );
1083
1084   convsample_02();  // C  , u->koi8r, conv
1085   convsample_03();  // C,   iterate
1086
1087   convsample_05();  // C,  utf8->u, getNextUChar
1088   convsample_06(); // C freq counter thingy
1089
1090   convsample_12();  // C,  sjis->u, conv
1091   convsample_13();  // C,  big5->u, getNextU
1092
1093   convsample_20();  // C, callback
1094   convsample_21();  // C, callback debug
1095
1096   convsample_40();  // C,   cp37 -> UTF16 [data02.bin -> data40.utf16]
1097
1098   convsample_46();  // C,  UTF16 -> latin3 [data41.utf16 -> data46.out]
1099
1100   printf("End of converter samples.\n");
1101
1102   fflush(stdout);
1103   fflush(stderr);
1104
1105   return 0;
1106 }