icuSources/samples/ucnv/convsamp.cpp

   1 /**************************************************************************
   2 *
   3 *   Copyright (C) 2000-2016, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 *
   6 ***************************************************************************
   7 *   file name:  convsamp.c
   8 *   encoding:   ASCII (7-bit)
   9 *
  10 *   created on: 2000may30
  11 *   created by: Steven R. Loomis
  12 *
  13 *   Sample code for the ICU conversion routines.
  14 *
  15 * Note: Nothing special is needed to build this sample. Link with
  16 *       the icu UC and icu I18N libraries.
  17 *
  18 *       I use 'assert' for error checking, you probably will want
  19 *       something more flexible.  '***BEGIN SAMPLE***' and
  20 *       '***END SAMPLE***' mark pieces suitable for stand alone
  21 *       code snippets.
  22 *
  23 *
   24 *  Each test can define its own BUFFERSIZE
  25 *
  26 */
  27
  28 #define DEBUG_TMI 0  /* define to 1 to enable Too Much Information */
  29
  30 #include <stdio.h>
  31 #include <ctype.h>            /* for isspace, etc.    */
  32 #include <assert.h>
  33 #include <string.h>
  34 #include <stdlib.h>  /* malloc */
  35
  36 #include "cmemory.h"
  37 #include "unicode/utypes.h"   /* Basic ICU data types */
  38 #include "unicode/ucnv.h"     /* C   Converter API    */
  39 #include "unicode/ustring.h"  /* some more string fcns*/
  40 #include "unicode/uchar.h"    /* char names           */
  41 #include "unicode/uloc.h"
  42 #include "unicode/unistr.h"
  43
  44 #include "flagcb.h"
  45
  46 /* Some utility functions */
  47
  48 static const UChar kNone[] = { 0x0000 };
  49
  50 #define U_ASSERT(x)  { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }}
  51
  52 /* Print a UChar if possible, in seven characters. */
  53 void prettyPrintUChar(UChar c)
  54 {
  55   if(  (c <= 0x007F) &&
  56        (isgraph(c))  ) {
  57     printf(" '%c'   ", (char)(0x00FF&c));
  58   } else if ( c > 0x007F ) {
  59     char buf[1000];
  60     UErrorCode status = U_ZERO_ERROR;
  61     int32_t o;
  62
  63     o = u_charName(c, U_EXTENDED_CHAR_NAME, buf, 1000, &status);
  64     if(U_SUCCESS(status) && (o>0) ) {
  65       buf[6] = 0;
  66       printf("%7s", buf);
  67     } else {
  68       printf(" ??????");
  69     }
  70   } else {
  71     switch((char)(c & 0x007F)) {
  72     case ' ':
  73       printf(" ' '   ");
  74       break;
  75     case '\t':
  76       printf(" \\t    ");
  77       break;
  78     case '\n':
  79       printf(" \\n    ");
  80       break;
  81     default:
  82       printf("  _    ");
  83       break;
  84     }
  85   }
  86 }
  87
  88
  89 void printUChars(const char  *name = "?",
  90                  const UChar *uch  = kNone,
  91                  int32_t     len   = -1 )
  92 {
  93   int32_t i;
  94
  95   if( (len == -1) && (uch) ) {
  96     len = u_strlen(uch);
  97   }
  98
  99   printf("%5s: ", name);
 100   for( i = 0; i <len; i++) {
 101     printf("%-6d ", i);
 102   }
 103   printf("\n");
 104
 105   printf("%5s: ", "uni");
 106   for( i = 0; i <len; i++) {
 107     printf("\\u%04X ", (int)uch[i]);
 108   }
 109   printf("\n");
 110
 111   printf("%5s:", "ch");
 112   for( i = 0; i <len; i++) {
 113     prettyPrintUChar(uch[i]);
 114   }
 115   printf("\n");
 116 }
 117
 118 void printBytes(const char  *name = "?",
 119                  const char *uch  = "",
 120                  int32_t     len   = -1 )
 121 {
 122   int32_t i;
 123
 124   if( (len == -1) && (uch) ) {
 125     len = strlen(uch);
 126   }
 127
 128   printf("%5s: ", name);
 129   for( i = 0; i <len; i++) {
 130     printf("%-4d ", i);
 131   }
 132   printf("\n");
 133
 134   printf("%5s: ", "uni");
 135   for( i = 0; i <len; i++) {
 136     printf("\\x%02X ", 0x00FF & (int)uch[i]);
 137   }
 138   printf("\n");
 139
 140   printf("%5s:", "ch");
 141   for( i = 0; i <len; i++) {
 142     if(isgraph(0x00FF & (int)uch[i])) {
 143       printf(" '%c' ", (char)uch[i]);
 144     } else {
 145       printf("     ");
 146     }
 147   }
 148   printf("\n");
 149 }
 150
 151 void printUChar(UChar32 ch32)
 152 {
 153     if(ch32 > 0xFFFF) {
 154       printf("ch: U+%06X\n", ch32);
 155     }
 156     else {
 157       UChar ch = (UChar)ch32;
 158       printUChars("C", &ch, 1);
 159     }
 160 }
 161
 162 /*******************************************************************
 163   Very simple C sample to convert the word 'Moscow' in Russian in Unicode,
 164   followed by an exclamation mark (!) into the KOI8-R Russian code page.
 165
 166   This example first creates a UChar String out of the Unicode chars.
 167
 168   targetSize must be set to the amount of space available in the target
 169   buffer. After fromUChars is called,
 170   len will contain the number of bytes in target[] which were
 171   used in the resulting codepage.  In this case, there is a 1:1 mapping
 172   between the input and output characters. The exclamation mark has the
 173   same value in both KOI8-R and Unicode.
 174
 175   src: 0      1      2      3      4      5      6
 176   uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021
 177    ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL   '!'
 178
 179  targ:  0    1    2    3    4    5    6
 180   uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21
 181    ch:                                '!'
 182
 183
 184 Converting FROM unicode
 185   to koi8-r.
 186   You must call ucnv_close to clean up the memory used by the
 187   converter.
 188
 189   'len' returns the number of OUTPUT bytes resulting from the
 190   conversion.
 191  */
 192
 193 UErrorCode convsample_02()
 194 {
 195   printf("\n\n==============================================\n"
 196          "Sample 02: C: simple Unicode -> koi8-r conversion\n");
 197
 198
 199   // **************************** START SAMPLE *******************
 200   // "cat<cat>OK"
 201   UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432,
 202                      0x0430, 0x0021, 0x0000 };
 203   char target[100];
 204   UErrorCode status = U_ZERO_ERROR;
 205   UConverter *conv;
 206   int32_t     len;
 207
 208   // set up the converter
 209   //! [ucnv_open]
 210   conv = ucnv_open("koi8-r", &status);
 211   //! [ucnv_open]
 212   assert(U_SUCCESS(status));
 213
 214   // convert to koi8-r
 215   len = ucnv_fromUChars(conv, target, 100, source, -1, &status);
 216   assert(U_SUCCESS(status));
 217
 218   // close the converter
 219   ucnv_close(conv);
 220
 221   // ***************************** END SAMPLE ********************
 222
 223   // Print it out
 224   printUChars("src", source);
 225   printf("\n");
 226   printBytes("targ", target, len);
 227
 228   return U_ZERO_ERROR;
 229 }
 230
 231
 232 UErrorCode convsample_03()
 233 {
 234   printf("\n\n==============================================\n"
 235          "Sample 03: C: print out all converters\n");
 236
 237   int32_t count;
 238   int32_t i;
 239
 240   // **************************** START SAMPLE *******************
 241   count = ucnv_countAvailable();
 242   printf("Available converters: %d\n", count);
 243
 244   for(i=0;i<count;i++)
 245   {
 246     printf("%s ", ucnv_getAvailableName(i));
 247   }
 248
 249   // ***************************** END SAMPLE ********************
 250
 251   printf("\n");
 252
 253   return U_ZERO_ERROR;
 254 }
 255
 256
 257
 258 #define BUFFERSIZE 17 /* make it interesting :) */
 259
 260 /*
 261   Converting from a codepage to Unicode in bulk..
 262   What is the best way to determine the buffer size?
 263
 264      The 'buffersize' is in bytes of input.
 265     For a given converter, divinding this by the minimum char size
 266     give you the maximum number of Unicode characters that could be
 267     expected for a given number of input bytes.
 268      see: ucnv_getMinCharSize()
 269
 270      For example, a single byte codepage like 'Latin-3' has a
 271     minimum char size of 1. (It takes at least 1 byte to represent
 272     each Unicode char.) So the unicode buffer has the same number of
 273     UChars as the input buffer has bytes.
 274
 275      In a strictly double byte codepage such as cp1362 (Windows
 276     Korean), the minimum char size is 2. So, only half as many Unicode
 277     chars as bytes are needed.
 278
 279      This work to calculate the buffer size is an optimization. Any
 280     size of input and output buffer can be used, as long as the
 281     program handles the following cases: If the input buffer is empty,
 282     the source pointer will be equal to sourceLimit.  If the output
 283     buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned.
 284  */
 285
 286 UErrorCode convsample_05()
 287 {
 288   printf("\n\n==============================================\n"
 289          "Sample 05: C: count the number of letters in a UTF-8 document\n");
 290
 291   FILE *f;
 292   int32_t count;
 293   char inBuf[BUFFERSIZE];
 294   const char *source;
 295   const char *sourceLimit;
 296   UChar *uBuf;
 297   UChar *target;
 298   UChar *targetLimit;
 299   UChar *p;
 300   int32_t uBufSize = 0;
 301   UConverter *conv;
 302   UErrorCode status = U_ZERO_ERROR;
 303   uint32_t letters=0, total=0;
 304
 305   f = fopen("data01.txt", "r");
 306   if(!f)
 307   {
 308     fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n");
 309     return U_FILE_ACCESS_ERROR;
 310   }
 311
 312   // **************************** START SAMPLE *******************
 313   conv = ucnv_open("utf-8", &status);
 314   assert(U_SUCCESS(status));
 315
 316   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
 317   printf("input bytes %d / min chars %d = %d UChars\n",
 318          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
 319   uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
 320   assert(uBuf!=NULL);
 321
 322   // grab another buffer's worth
 323   while((!feof(f)) &&
 324         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
 325   {
 326     // Convert bytes to unicode
 327     source = inBuf;
 328     sourceLimit = inBuf + count;
 329
 330     do
 331     {
 332         target = uBuf;
 333         targetLimit = uBuf + uBufSize;
 334
 335         ucnv_toUnicode(conv, &target, targetLimit,
 336                        &source, sourceLimit, NULL,
 337                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
 338                                    /* is true (when no more data will come) */
 339                        &status);
 340
 341         if(status == U_BUFFER_OVERFLOW_ERROR)
 342         {
 343           // simply ran out of space - we'll reset the target ptr the next
 344           // time through the loop.
 345           status = U_ZERO_ERROR;
 346         }
 347         else
 348         {
 349           //  Check other errors here.
 350           assert(U_SUCCESS(status));
 351           // Break out of the loop (by force)
 352         }
 353
 354         // Process the Unicode
 355         // Todo: handle UTF-16/surrogates
 356
 357         for(p = uBuf; p<target; p++)
 358         {
 359           if(u_isalpha(*p))
 360             letters++;
 361           total++;
 362         }
 363     } while (source < sourceLimit); // while simply out of space
 364   }
 365
 366   printf("%d letters out of %d total UChars.\n", letters, total);
 367
 368   // ***************************** END SAMPLE ********************
 369   ucnv_close(conv);
 370
 371   printf("\n");
 372
 373   fclose(f);
 374
 375   return U_ZERO_ERROR;
 376 }
 377 #undef BUFFERSIZE
 378
 379 #define BUFFERSIZE 1024
 380 typedef struct
 381 {
 382   UChar32  codepoint;
 383   uint32_t frequency;
 384 } CharFreqInfo;
 385
 386 UErrorCode convsample_06()
 387 {
 388   printf("\n\n==============================================\n"
 389          "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");
 390
 391   FILE *f;
 392   int32_t count;
 393   char inBuf[BUFFERSIZE];
 394   const char *source;
 395   const char *sourceLimit;
 396   int32_t uBufSize = 0;
 397   UConverter *conv;
 398   UErrorCode status = U_ZERO_ERROR;
 399   uint32_t letters=0, total=0;
 400
 401   CharFreqInfo   *info;
 402   UChar32   charCount = 0x10000;  /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
 403   UChar32   p;
 404
 405   uint32_t ie = 0;
 406   uint32_t gh = 0;
 407   UChar32 l = 0;
 408
 409   f = fopen("data06.txt", "r");
 410   if(!f)
 411   {
 412     fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
 413     return U_FILE_ACCESS_ERROR;
 414   }
 415
 416   info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount);
 417   if(!info)
 418   {
 419     fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount);
 420   }
 421
 422   /* reset frequencies */
 423   for(p=0;p<charCount;p++)
 424   {
 425     info[p].codepoint = p;
 426     info[p].frequency = 0;
 427   }
 428
 429   // **************************** START SAMPLE *******************
 430   conv = ucnv_open("utf-8", &status);
 431   assert(U_SUCCESS(status));
 432
 433   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
 434   printf("input bytes %d / min chars %d = %d UChars\n",
 435          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
 436
 437   // grab another buffer's worth
 438   while((!feof(f)) &&
 439         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
 440   {
 441     // Convert bytes to unicode
 442     source = inBuf;
 443     sourceLimit = inBuf + count;
 444
 445     while(source < sourceLimit)
 446     {
 447       p = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
 448       if(U_FAILURE(status))
 449       {
 450         fprintf(stderr, "%s @ %d\n", u_errorName(status), total);
 451         status = U_ZERO_ERROR;
 452         continue;
 453       }
 454       U_ASSERT(status);
 455       total++;
 456
 457       if(u_isalpha(p))
 458         letters++;
 459
 460       if((u_tolower(l) == 'i') && (u_tolower(p) == 'e'))
 461         ie++;
 462
 463       if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127))
 464         gh++;
 465
 466       if(p>charCount)
 467       {
 468         fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p);
 469         free(info);
 470         fclose(f);
 471         ucnv_close(conv);
 472         return U_UNSUPPORTED_ERROR;
 473       }
 474       info[p].frequency++;
 475       l = p;
 476     }
 477   }
 478
 479   fclose(f);
 480   ucnv_close(conv);
 481
 482   printf("%d letters out of %d total UChars.\n", letters, total);
 483   printf("%d ie digraphs, %d gh digraphs.\n", ie, gh);
 484
 485   // now, we could sort it..
 486
 487   //  qsort(info, charCount, sizeof(info[0]), charfreq_compare);
 488
 489   for(p=0;p<charCount;p++)
 490   {
 491     if(info[p].frequency)
 492     {
 493       printf("% 5d U+%06X ", info[p].frequency, p);
 494       if(p <= 0xFFFF)
 495       {
 496         prettyPrintUChar((UChar)p);
 497       }
 498       printf("\n");
 499     }
 500   }
 501   free(info);
 502   // ***************************** END SAMPLE ********************
 503
 504   printf("\n");
 505
 506   return U_ZERO_ERROR;
 507 }
 508 #undef BUFFERSIZE
 509
 510
 511 /******************************************************
 512   You must call ucnv_close to clean up the memory used by the
 513   converter.
 514
  515   'len' returns the number of OUTPUT UChars resulting from the
  516   conversion.
 517  */
 518
 519 UErrorCode convsample_12()
 520 {
 521   printf("\n\n==============================================\n"
 522          "Sample 12: C: simple sjis -> unicode conversion\n");
 523
 524
 525   // **************************** START SAMPLE *******************
 526
 527   char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 };
 528   UChar target[100];
 529   UErrorCode status = U_ZERO_ERROR;
 530   UConverter *conv;
 531   int32_t     len;
 532
 533   // set up the converter
 534   conv = ucnv_open("shift_jis", &status);
 535   assert(U_SUCCESS(status));
 536
 537   // convert to Unicode
 538   // Note: we can use strlen, we know it's an 8 bit null terminated codepage
 539   target[6] = 0xFDCA;
 540   len = ucnv_toUChars(conv, target, 100, source, strlen(source), &status);
 541   U_ASSERT(status);
 542   // close the converter
 543   ucnv_close(conv);
 544
 545   // ***************************** END SAMPLE ********************
 546
 547   // Print it out
 548   printBytes("src", source, strlen(source) );
 549   printf("\n");
 550   printUChars("targ", target, len);
 551
 552   return U_ZERO_ERROR;
 553 }
 554
 555 /******************************************************************
 556    C: Convert from codepage to Unicode one at a time.
 557 */
 558
 559 UErrorCode convsample_13()
 560 {
 561   printf("\n\n==============================================\n"
 562          "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n");
 563
 564
 565   const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e };
 566   //  const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e };
 567   const char *source, *sourceLimit;
 568   UChar32 target;
 569   UErrorCode status = U_ZERO_ERROR;
 570   UConverter *conv = NULL;
 571   int32_t srcCount=0;
 572   int32_t dstCount=0;
 573
 574   srcCount = sizeof(sourceChars);
 575
 576   conv = ucnv_open("Big5", &status);
 577   U_ASSERT(status);
 578
 579   source = sourceChars;
 580   sourceLimit = sourceChars + sizeof(sourceChars);
 581
 582   // **************************** START SAMPLE *******************
 583
 584
 585   printBytes("src",source,sourceLimit-source);
 586
 587   while(source < sourceLimit)
 588   {
 589     puts("");
 590     target = ucnv_getNextUChar (conv,
 591                                 &source,
 592                                 sourceLimit,
 593                                 &status);
 594
 595     //    printBytes("src",source,sourceLimit-source);
 596     U_ASSERT(status);
 597     printUChar(target);
 598     dstCount++;
 599   }
 600
 601
 602   // ************************** END SAMPLE *************************
 603
 604   printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount);
 605   ucnv_close(conv);
 606
 607   return U_ZERO_ERROR;
 608 }
 609
 610
 611
 612
 613 UBool convsample_20_didSubstitute(const char *source)
 614 {
 615   UChar uchars[100];
 616   char bytes[100];
 617   UConverter *conv = NULL;
 618   UErrorCode status = U_ZERO_ERROR;
 619   uint32_t len, len2;
 620   UBool  flagVal;
 621
 622   FromUFLAGContext * context = NULL;
 623
 624   printf("\n\n==============================================\n"
 625          "Sample 20: C: Test for substitution using callbacks\n");
 626
 627   /* print out the original source */
 628   printBytes("src", source);
 629   printf("\n");
 630
 631   /* First, convert from UTF8 to unicode */
 632   conv = ucnv_open("utf-8", &status);
 633   U_ASSERT(status);
 634
 635   len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
 636   U_ASSERT(status);
 637
 638   printUChars("uch", uchars, len);
 639   printf("\n");
 640
 641   /* Now, close the converter */
 642   ucnv_close(conv);
 643
 644   /* Now, convert to windows-1252 */
 645   conv = ucnv_open("windows-1252", &status);
 646   U_ASSERT(status);
 647
 648   /* Converter starts out with the SUBSTITUTE callback set. */
 649
 650   /* initialize our callback */
 651   context = flagCB_fromU_openContext();
 652
 653   /* Set our special callback */
 654   ucnv_setFromUCallBack(conv,
 655                         flagCB_fromU,
 656                         context,
 657                         &(context->subCallback),
 658                         &(context->subContext),
 659                         &status);
 660
 661   U_ASSERT(status);
 662
 663   len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status);
 664   U_ASSERT(status);
 665
 666   flagVal = context->flag;  /* it's about to go away when we close the cnv */
 667
 668   ucnv_close(conv);
 669
 670   /* print out the original source */
 671   printBytes("bytes", bytes, len2);
 672
 673   return flagVal; /* true if callback was called */
 674 }
 675
 676 UErrorCode convsample_20()
 677 {
 678   const char *sample1 = "abc\xdf\xbf";
 679   const char *sample2 = "abc_def";
 680
 681
 682   if(convsample_20_didSubstitute(sample1))
 683   {
 684     printf("DID substitute.\n******\n");
 685   }
 686   else
 687   {
 688     printf("Did NOT substitute.\n*****\n");
 689   }
 690
 691   if(convsample_20_didSubstitute(sample2))
 692   {
 693     printf("DID substitute.\n******\n");
 694   }
 695   else
 696   {
 697     printf("Did NOT substitute.\n*****\n");
 698   }
 699
 700   return U_ZERO_ERROR;
 701 }
 702
 703 // 21  - C, callback, with clone and debug
 704
 705
 706
 707 UBool convsample_21_didSubstitute(const char *source)
 708 {
 709   UChar uchars[100];
 710   char bytes[100];
 711   UConverter *conv = NULL, *cloneCnv = NULL;
 712   UErrorCode status = U_ZERO_ERROR;
 713   uint32_t len, len2;
 714   int32_t  cloneLen;
 715   UBool  flagVal = FALSE;
 716   UConverterFromUCallback junkCB;
 717
 718   FromUFLAGContext *flagCtx = NULL,
 719                    *cloneFlagCtx = NULL;
 720
 721   debugCBContext   *debugCtx1 = NULL,
 722                    *debugCtx2 = NULL,
 723                    *cloneDebugCtx = NULL;
 724
 725   printf("\n\n==============================================\n"
 726          "Sample 21: C: Test for substitution w/ callbacks & clones \n");
 727
 728   /* print out the original source */
 729   printBytes("src", source);
 730   printf("\n");
 731
 732   /* First, convert from UTF8 to unicode */
 733   conv = ucnv_open("utf-8", &status);
 734   U_ASSERT(status);
 735
 736   len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
 737   U_ASSERT(status);
 738
 739   printUChars("uch", uchars, len);
 740   printf("\n");
 741
 742   /* Now, close the converter */
 743   ucnv_close(conv);
 744
 745   /* Now, convert to windows-1252 */
 746   conv = ucnv_open("windows-1252", &status);
 747   U_ASSERT(status);
 748
 749   /* Converter starts out with the SUBSTITUTE callback set. */
 750
 751   /* initialize our callback */
 752   /* from the 'bottom' innermost, out
 753    *   CNV ->  debugCtx1[debug]  ->  flagCtx[flag] -> debugCtx2[debug]  */
 754
 755 #if DEBUG_TMI
 756   printf("flagCB_fromU = %p\n", &flagCB_fromU);
 757   printf("debugCB_fromU = %p\n", &debugCB_fromU);
 758 #endif
 759
 760   debugCtx1 = debugCB_openContext();
 761    flagCtx  = flagCB_fromU_openContext();
 762   debugCtx2 = debugCB_openContext();
 763
 764   debugCtx1->subCallback =  flagCB_fromU;  /* debug1 -> flag */
 765   debugCtx1->subContext  =  flagCtx;
 766
 767   flagCtx->subCallback   =  debugCB_fromU; /*  flag -> debug2 */
 768   flagCtx->subContext    =  debugCtx2;
 769
 770   debugCtx2->subCallback =  UCNV_FROM_U_CALLBACK_SUBSTITUTE;
 771   debugCtx2->subContext  = NULL;
 772
 773   /* Set our special callback */
 774
 775   ucnv_setFromUCallBack(conv,
 776                         debugCB_fromU,
 777                         debugCtx1,
 778                         &(debugCtx2->subCallback),
 779                         &(debugCtx2->subContext),
 780                         &status);
 781
 782   U_ASSERT(status);
 783
 784 #if DEBUG_TMI
 785   printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n",
 786          conv, debugCtx1, debugCtx1->subCallback,
 787          debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback);
 788 #endif
 789
 790   cloneCnv = ucnv_safeClone(conv, NULL, NULL, &status);
 791
 792   U_ASSERT(status);
 793
 794 #if DEBUG_TMI
 795   printf("Cloned converter from %p -> %p.  Closing %p.\n", conv, cloneCnv, conv);
 796 #endif
 797
 798   ucnv_close(conv);
 799
 800 #if DEBUG_TMI
 801   printf("%p closed.\n", conv);
 802 #endif
 803
 804   U_ASSERT(status);
 805   /* Now, we have to extract the context */
 806   cloneDebugCtx = NULL;
 807   cloneFlagCtx  = NULL;
 808
 809   ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx);
 810   if(cloneDebugCtx != NULL) {
 811       cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext;
 812   }
 813
 814   printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n",
 815          cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL );
 816
 817   len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status);
 818   U_ASSERT(status);
 819
 820   if(cloneFlagCtx != NULL) {
 821       flagVal = cloneFlagCtx->flag;  /* it's about to go away when we close the cnv */
 822   } else {
 823       printf("** Warning, couldn't get the subcallback \n");
 824   }
 825
 826   ucnv_close(cloneCnv);
 827
 828   /* print out the original source */
 829   printBytes("bytes", bytes, len2);
 830
 831   return flagVal; /* true if callback was called */
 832 }
 833
 834 UErrorCode convsample_21()
 835 {
 836   const char *sample1 = "abc\xdf\xbf";
 837   const char *sample2 = "abc_def";
 838
 839   if(convsample_21_didSubstitute(sample1))
 840   {
 841     printf("DID substitute.\n******\n");
 842   }
 843   else
 844   {
 845     printf("Did NOT substitute.\n*****\n");
 846   }
 847
 848   if(convsample_21_didSubstitute(sample2))
 849   {
 850     printf("DID substitute.\n******\n");
 851   }
 852   else
 853   {
 854     printf("Did NOT substitute.\n*****\n");
 855   }
 856
 857   return U_ZERO_ERROR;
 858 }
 859
 860
 861 //  40-  C, cp37 -> UTF16 [data02.bin -> data40.utf16]
 862
 863 #define BUFFERSIZE 17 /* make it interesting :) */
 864
 865 UErrorCode convsample_40()
 866 {
 867   printf("\n\n==============================================\n"
 868     "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n");
 869
 870   FILE *f;
 871   FILE *out;
 872   int32_t count;
 873   char inBuf[BUFFERSIZE];
 874   const char *source;
 875   const char *sourceLimit;
 876   UChar *uBuf;
 877   UChar *target;
 878   UChar *targetLimit;
 879   int32_t uBufSize = 0;
 880   UConverter *conv = NULL;
 881   UErrorCode status = U_ZERO_ERROR;
 882   uint32_t inbytes=0, total=0;
 883
 884   f = fopen("data02.bin", "rb");
 885   if(!f)
 886   {
 887     fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n");
 888     return U_FILE_ACCESS_ERROR;
 889   }
 890
 891   out = fopen("data40.utf16", "wb");
 892   if(!out)
 893   {
 894     fprintf(stderr, "Couldn't create file 'data40.utf16'.\n");
 895     fclose(f);
 896     return U_FILE_ACCESS_ERROR;
 897   }
 898
 899   // **************************** START SAMPLE *******************
 900   conv = ucnv_openCCSID(37, UCNV_IBM, &status);
 901   assert(U_SUCCESS(status));
 902
 903   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
 904   printf("input bytes %d / min chars %d = %d UChars\n",
 905          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
 906   uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
 907   assert(uBuf!=NULL);
 908
 909   // grab another buffer's worth
 910   while((!feof(f)) &&
 911         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
 912   {
 913     inbytes += count;
 914
 915     // Convert bytes to unicode
 916     source = inBuf;
 917     sourceLimit = inBuf + count;
 918
 919     do
 920     {
 921         target = uBuf;
 922         targetLimit = uBuf + uBufSize;
 923
 924         ucnv_toUnicode( conv, &target, targetLimit,
 925                        &source, sourceLimit, NULL,
 926                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
 927                                    /* is true (when no more data will come) */
 928                          &status);
 929
 930         if(status == U_BUFFER_OVERFLOW_ERROR)
 931         {
 932           // simply ran out of space - we'll reset the target ptr the next
 933           // time through the loop.
 934           status = U_ZERO_ERROR;
 935         }
 936         else
 937         {
 938           //  Check other errors here.
 939           assert(U_SUCCESS(status));
 940           // Break out of the loop (by force)
 941         }
 942
 943         // Process the Unicode
 944         // Todo: handle UTF-16/surrogates
 945         assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) ==
 946                (size_t)(target-uBuf));
 947         total += (target-uBuf);
 948     } while (source < sourceLimit); // while simply out of space
 949   }
 950
 951   printf("%d bytes in,  %d UChars out.\n", inbytes, total);
 952
 953   // ***************************** END SAMPLE ********************
 954   ucnv_close(conv);
 955
 956   fclose(f);
 957   fclose(out);
 958   printf("\n");
 959
 960   return U_ZERO_ERROR;
 961 }
 962 #undef BUFFERSIZE
 963
 964
 965
 966 //  46-  C, UTF16 -> latin2 [data40.utf16 -> data46.out]
 967
 968 #define BUFFERSIZE 24 /* make it interesting :) */
 969
 970 UErrorCode convsample_46()
 971 {
 972   printf("\n\n==============================================\n"
 973     "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n");
 974
 975   FILE *f;
 976   FILE *out;
 977   int32_t count;
 978   UChar inBuf[BUFFERSIZE];
 979   const UChar *source;
 980   const UChar *sourceLimit;
 981   char *buf;
 982   char *target;
 983   char *targetLimit;
 984
 985   int32_t bufSize = 0;
 986   UConverter *conv = NULL;
 987   UErrorCode status = U_ZERO_ERROR;
 988   uint32_t inchars=0, total=0;
 989
 990   f = fopen("data40.utf16", "rb");
 991   if(!f)
 992   {
 993     fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n");
 994     return U_FILE_ACCESS_ERROR;
 995   }
 996
 997   out = fopen("data46.out", "wb");
 998   if(!out)
 999   {
1000     fprintf(stderr, "Couldn't create file 'data46.out'.\n");
1001     fclose(f);
1002     return U_FILE_ACCESS_ERROR;
1003   }
1004
1005   // **************************** START SAMPLE *******************
1006   conv = ucnv_open( "iso-8859-2", &status);
1007   assert(U_SUCCESS(status));
1008
1009   bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv));
1010   printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n",
1011          BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize);
1012   buf = (char*)malloc(bufSize * sizeof(char));
1013   assert(buf!=NULL);
1014
1015   // grab another buffer's worth
1016   while((!feof(f)) &&
1017         ((count=fread(inBuf, sizeof(UChar), BUFFERSIZE , f)) > 0) )
1018   {
1019     inchars += count;
1020
1021     // Convert bytes to unicode
1022     source = inBuf;
1023     sourceLimit = inBuf + count;
1024
1025     do
1026     {
1027         target = buf;
1028         targetLimit = buf + bufSize;
1029
1030         ucnv_fromUnicode( conv, &target, targetLimit,
1031                        &source, sourceLimit, NULL,
1032                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
1033                                    /* is true (when no more data will come) */
1034                          &status);
1035
1036         if(status == U_BUFFER_OVERFLOW_ERROR)
1037         {
1038           // simply ran out of space - we'll reset the target ptr the next
1039           // time through the loop.
1040           status = U_ZERO_ERROR;
1041         }
1042         else
1043         {
1044           //  Check other errors here.
1045           assert(U_SUCCESS(status));
1046           // Break out of the loop (by force)
1047         }
1048
1049         // Process the Unicode
1050         assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) ==
1051                (size_t)(target-buf));
1052         total += (target-buf);
1053     } while (source < sourceLimit); // while simply out of space
1054   }
1055
1056   printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, inchars * sizeof(UChar), total);
1057
1058   // ***************************** END SAMPLE ********************
1059   ucnv_close(conv);
1060
1061   fclose(f);
1062   fclose(out);
1063   printf("\n");
1064
1065   return U_ZERO_ERROR;
1066 }
1067 #undef BUFFERSIZE
1068
1069 #define BUFFERSIZE 219
1070
1071 void convsample_50() {
1072   printf("\n\n==============================================\n"
1073          "Sample 50: C: ucnv_detectUnicodeSignature\n");
1074
1075   //! [ucnv_detectUnicodeSignature]
1076   UErrorCode err = U_ZERO_ERROR;
1077   UBool discardSignature = TRUE; /* set to TRUE to throw away the initial U+FEFF */
1078   char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' };
1079   int32_t signatureLength = 0;
1080   const char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err);
1081   UConverter *conv = NULL;
1082   UChar output[100];
1083   UChar *target = output, *out;
1084   const char *source = input;
1085   if(encoding!=NULL && U_SUCCESS(err)){
1086     // should signature be discarded ?
1087     conv = ucnv_open(encoding, &err);
1088     // do the conversion
1089     ucnv_toUnicode(conv,
1090                    &target, output + UPRV_LENGTHOF(output),
1091                    &source, input + sizeof(input),
1092                    NULL, TRUE, &err);
1093     out = output;
1094     if (discardSignature){
1095       ++out; // ignore initial U+FEFF
1096     }
1097     while(out != target) {
1098       printf("%04x ", *out++);
1099     }
1100     puts("");
1101   }
1102   //! [ucnv_detectUnicodeSignature]
1103   puts("");
1104 }
1105
1106
1107
1108 /* main */
1109
1110 int main()
1111 {
1112
1113   printf("Default Converter=%s\n", ucnv_getDefaultName() );
1114
1115   convsample_02();  // C  , u->koi8r, conv
1116   convsample_03();  // C,   iterate
1117
1118   convsample_05();  // C,  utf8->u, getNextUChar
1119   convsample_06(); // C freq counter thingy
1120
1121   convsample_12();  // C,  sjis->u, conv
1122   convsample_13();  // C,  big5->u, getNextU
1123
1124   convsample_20();  // C, callback
1125   convsample_21();  // C, callback debug
1126
1127   convsample_40();  // C,   cp37 -> UTF16 [data02.bin -> data40.utf16]
1128
1129   convsample_46();  // C,  UTF16 -> latin3 [data41.utf16 -> data46.out]
1130
1131   convsample_50();  // C, detect unicode signature
1132
1133   printf("End of converter samples.\n");
1134
1135   fflush(stdout);
1136   fflush(stderr);
1137
1138   return 0;
1139 }