icuSources/common/ucnvhz.c

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 2000-2004, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   file name:  ucnvhz.c
   7 *   encoding:   US-ASCII
   8 *   tab size:   8 (not used)
   9 *   indentation:4
  10 *
  11 *   created on: 2000oct16
  12 *   created by: Ram Viswanadha
  13 *   10/31/2000  Ram     Implemented offsets logic function
  14 *
  15 */
  16
  17 #include "unicode/utypes.h"
  18
  19 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
  20
  21 #include "cmemory.h"
  22 #include "unicode/ucnv.h"
  23 #include "unicode/ucnv_cb.h"
  24 #include "unicode/uset.h"
  25 #include "ucnv_bld.h"
  26 #include "ucnv_cnv.h"
  27
  28 #define UCNV_TILDE 0x7E          /* ~ */
  29 #define UCNV_OPEN_BRACE 0x7B     /* { */
  30 #define UCNV_CLOSE_BRACE 0x7D   /* } */
  31 #define SB_ESCAPE    "\x7E\x7D"
  32 #define DB_ESCAPE    "\x7E\x7B"
  33 #define TILDE_ESCAPE "\x7E\x7E"
  34 #define ESC_LEN       2
  35
  36
  37 #define CONCAT_ESCAPE_MACRO( args, targetIndex,targetLength,strToAppend, err, len,sourceIndex){                             \
  38     while(len-->0){                                                                                                         \
  39         if(targetIndex < targetLength){                                                                                     \
  40             args->target[targetIndex] = (unsigned char) *strToAppend;                                                       \
  41             if(args->offsets!=NULL){                                                                                        \
  42                 *(offsets++) = sourceIndex-1;                                                                               \
  43             }                                                                                                               \
  44             targetIndex++;                                                                                                  \
  45         }                                                                                                                   \
  46         else{                                                                                                               \
  47             args->converter->charErrorBuffer[(int)args->converter->charErrorBufferLength++] = (unsigned char) *strToAppend; \
  48             *err =U_BUFFER_OVERFLOW_ERROR;                                                                                  \
  49         }                                                                                                                   \
  50         strToAppend++;                                                                                                      \
  51     }                                                                                                                       \
  52 }
  53
  54
/*
 * Per-converter HZ state; an instance is stored in UConverter.extraInfo
 * by _HZOpen (heap) or _HZ_SafeClone (inside the clone buffer).
 */
typedef struct{
    int32_t targetIndex;        /* only ever reset to 0 in this file; never read here */
    int32_t sourceIndex;        /* only ever reset to 0 in this file; never read here */
    UBool isEscapeAppended;     /* fromUnicode: TRUE once a "~{" or "~}" escape has been written */
    UConverter* gbConverter;    /* sub-converter opened as "ibm-1386"; its tables do the actual mapping */
    UBool isStateDBCS;          /* toUnicode: TRUE after "~{" until "~}" is seen */
    UBool isTargetUCharDBCS;    /* fromUnicode: TRUE if the last converted character mapped above 0xFF */
}UConverterDataHZ;
  63
  64
  65
  66 static void
  67 _HZOpen(UConverter *cnv, const char *name,const char *locale,uint32_t options, UErrorCode *errorCode){
  68     cnv->toUnicodeStatus = 0;
  69     cnv->fromUnicodeStatus= 0;
  70     cnv->mode=0;
  71     cnv->fromUChar32=0x0000;
  72     cnv->extraInfo = uprv_malloc (sizeof (UConverterDataHZ));
  73     if(cnv->extraInfo != NULL){
  74         ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("ibm-1386",errorCode);
  75         ((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = FALSE;
  76         ((UConverterDataHZ*)cnv->extraInfo)->isEscapeAppended = FALSE;
  77         ((UConverterDataHZ*)cnv->extraInfo)->targetIndex = 0;
  78         ((UConverterDataHZ*)cnv->extraInfo)->sourceIndex = 0;
  79         ((UConverterDataHZ*)cnv->extraInfo)->isTargetUCharDBCS = FALSE;
  80     }
  81     /* test for NULL */
  82     else {
  83         *errorCode = U_MEMORY_ALLOCATION_ERROR;
  84         return;
  85     }
  86 }
  87
  88 static void
  89 _HZClose(UConverter *cnv){
  90     if(cnv->extraInfo != NULL) {
  91         ucnv_close (((UConverterDataHZ *) (cnv->extraInfo))->gbConverter);
  92         if(!cnv->isExtraLocal) {
  93             uprv_free(cnv->extraInfo);
  94         }
  95         cnv->extraInfo = NULL;
  96     }
  97 }
  98
  99 static void
 100 _HZReset(UConverter *cnv, UConverterResetChoice choice){
 101     if(choice<=UCNV_RESET_TO_UNICODE) {
 102         cnv->toUnicodeStatus = 0;
 103         cnv->mode=0;
 104         if(cnv->extraInfo != NULL){
 105             ((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = FALSE;
 106         }
 107     }
 108     if(choice!=UCNV_RESET_TO_UNICODE) {
 109         cnv->fromUnicodeStatus= 0;
 110         cnv->fromUChar32=0x0000;
 111         if(cnv->extraInfo != NULL){
 112             ((UConverterDataHZ*)cnv->extraInfo)->isEscapeAppended = FALSE;
 113             ((UConverterDataHZ*)cnv->extraInfo)->targetIndex = 0;
 114             ((UConverterDataHZ*)cnv->extraInfo)->sourceIndex = 0;
 115             ((UConverterDataHZ*)cnv->extraInfo)->isTargetUCharDBCS = FALSE;
 116         }
 117     }
 118 }
 119
 120 /**************************************HZ Encoding*************************************************
 121 * Rules for HZ encoding
 122 *
 123 *   In ASCII mode, a byte is interpreted as an ASCII character, unless a
 124 *   '~' is encountered. The character '~' is an escape character. By
 125 *   convention, it must be immediately followed ONLY by '~', '{' or '\n'
 126 *   (<LF>), with the following special meaning.
 127
 128 *   1. The escape sequence '~~' is interpreted as a '~'.
 129 *   2. The escape-to-GB sequence '~{' switches the mode from ASCII to GB.
 130 *   3. The escape sequence '~\n' is a line-continuation marker to be
 131 *     consumed with no output produced.
 132 *   In GB mode, characters are interpreted two bytes at a time as (pure)
 133 *   GB codes until the escape-from-GB code '~}' is read. This code
 134 *   switches the mode from GB back to ASCII.  (Note that the escape-
 135 *   from-GB code '~}' ($7E7D) is outside the defined GB range.)
 136 *
 137 *   Source: RFC 1842
 138 */
 139
 140
 141 static void
 142 UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
 143                                                             UErrorCode* err){
 144     char tempBuf[2];
 145     const char *mySource = ( char *) args->source;
 146     UChar *myTarget = args->target;
 147     const char *mySourceLimit = args->sourceLimit;
 148     UChar32 targetUniChar = 0x0000;
 149     UChar mySourceChar = 0x0000;
 150     UConverterDataHZ* myData=(UConverterDataHZ*)(args->converter->extraInfo);
 151
 152     if ((args->converter == NULL) || (args->targetLimit < args->target) || (mySourceLimit < args->source)){
 153         *err = U_ILLEGAL_ARGUMENT_ERROR;
 154         return;
 155     }
 156
 157     while(mySource< mySourceLimit){
 158
 159         if(myTarget < args->targetLimit){
 160
 161             mySourceChar= (unsigned char) *mySource++;
 162
 163             switch(mySourceChar){
 164                 case 0x0A:
 165                     if(args->converter->mode ==UCNV_TILDE){
 166                         args->converter->mode=0;
 167
 168                     }
 169                     *(myTarget++)=(UChar)mySourceChar;
 170                     continue;
 171
 172                 case UCNV_TILDE:
 173                     if(args->converter->mode ==UCNV_TILDE){
 174                         *(myTarget++)=(UChar)mySourceChar;
 175                         args->converter->mode=0;
 176                         continue;
 177
 178                     }
 179                     else if(args->converter->toUnicodeStatus !=0){
 180                         args->converter->mode=0;
 181                         break;
 182                     }
 183                     else{
 184                         args->converter->mode = UCNV_TILDE;
 185                         continue;
 186                     }
 187
 188
 189                 case UCNV_OPEN_BRACE:
 190                     if(args->converter->mode == UCNV_TILDE){
 191                         args->converter->mode=0;
 192                         myData->isStateDBCS = TRUE;
 193                         continue;
 194                     }
 195                     else{
 196                         break;
 197                     }
 198
 199
 200                 case UCNV_CLOSE_BRACE:
 201                     if(args->converter->mode == UCNV_TILDE){
 202                         args->converter->mode=0;
 203                          myData->isStateDBCS = FALSE;
 204                         continue;
 205                     }
 206                     else{
 207                         break;
 208                     }
 209
 210                 default:
 211                      /* if the first byte is equal to TILDE and the trail byte
 212                      * is not a valid byte then it is an error condition
 213                      */
 214                     if(args->converter->mode == UCNV_TILDE){
 215                         args->converter->mode=0;
 216                         mySourceChar= (UChar)(((UCNV_TILDE+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80));
 217                         goto SAVE_STATE;
 218                     }
 219
 220                     break;
 221
 222             }
 223
 224             if(myData->isStateDBCS){
 225                 if(args->converter->toUnicodeStatus == 0x00){
 226                     args->converter->toUnicodeStatus = (UChar) mySourceChar;
 227                     continue;
 228                 }
 229                 else{
 230                     tempBuf[0] = (char) (args->converter->toUnicodeStatus+0x80) ;
 231                     tempBuf[1] = (char) (mySourceChar+0x80);
 232                     mySourceChar= (UChar)(((args->converter->toUnicodeStatus+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80));
 233                     args->converter->toUnicodeStatus =0x00;
 234                     targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
 235                         tempBuf, 2, args->converter->useFallback);
 236                 }
 237             }
 238             else{
 239                 if(args->converter->fromUnicodeStatus == 0x00){
 240                     targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
 241                         mySource - 1, 1, args->converter->useFallback);
 242                 }
 243                 else{
 244                     goto SAVE_STATE;
 245                 }
 246
 247             }
 248             if(targetUniChar < 0xfffe){
 249                 if(args->offsets) {
 250                     args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 1-(myData->isStateDBCS));
 251                 }
 252
 253                 *(myTarget++)=(UChar)targetUniChar;
 254             }
 255             else if(targetUniChar>=0xfffe){
 256 SAVE_STATE:
 257                 if(targetUniChar == 0xfffe){
 258                     *err = U_INVALID_CHAR_FOUND;
 259                 }
 260                 else{
 261                     *err = U_ILLEGAL_CHAR_FOUND;
 262                 }
 263                 if(myData->isStateDBCS){
 264                     args->converter->toUBytes[0] = (uint8_t)(tempBuf[0]-0x80);
 265                     args->converter->toUBytes[1] = (uint8_t)(tempBuf[1]-0x80);
 266                     args->converter->toULength=2;
 267                 }
 268                 else{
 269                     args->converter->toUBytes[0] = (uint8_t)mySourceChar;
 270                     args->converter->toULength=1;
 271                 }
 272                 break;
 273             }
 274         }
 275         else{
 276             *err =U_BUFFER_OVERFLOW_ERROR;
 277             break;
 278         }
 279     }
 280
 281     args->target = myTarget;
 282     args->source = mySource;
 283 }
 284
 285
 286 static void
 287 UConverter_fromUnicode_HZ_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
 288                                                       UErrorCode * err){
 289     const UChar *mySource = args->source;
 290     char *myTarget = args->target;
 291     int32_t* offsets = args->offsets;
 292     int32_t mySourceIndex = 0;
 293     int32_t myTargetIndex = 0;
 294     int32_t targetLength = (int32_t)(args->targetLimit - myTarget);
 295     int32_t mySourceLength = (int32_t)(args->sourceLimit - args->source);
 296     int32_t length=0;
 297     uint32_t targetUniChar = 0x0000;
 298     UChar32 mySourceChar = 0x0000,c=0x0000;
 299     UConverterDataHZ *myConverterData=(UConverterDataHZ*)args->converter->extraInfo;
 300     UBool isTargetUCharDBCS = (UBool) myConverterData->isTargetUCharDBCS;
 301     UBool oldIsTargetUCharDBCS = isTargetUCharDBCS;
 302     UBool isEscapeAppended =FALSE;
 303     int len =0;
 304     const char* escSeq=NULL;
 305
 306     if ((args->converter == NULL) || (args->targetLimit < myTarget) || (args->sourceLimit < args->source)){
 307         *err = U_ILLEGAL_ARGUMENT_ERROR;
 308         return;
 309     }
 310     if(args->converter->fromUChar32!=0 && myTargetIndex < targetLength) {
 311         goto getTrail;
 312     }
 313     /*writing the char to the output stream */
 314     while (mySourceIndex < mySourceLength){
 315         targetUniChar = missingCharMarker;
 316         if (myTargetIndex < targetLength){
 317
 318             c=mySourceChar = (UChar) mySource[mySourceIndex++];
 319
 320
 321             oldIsTargetUCharDBCS = isTargetUCharDBCS;
 322             if(mySourceChar ==UCNV_TILDE){
 323                 /*concatEscape(args, &myTargetIndex, &targetLength,"\x7E\x7E",err,2,&mySourceIndex);*/
 324                 len = ESC_LEN;
 325                 escSeq = TILDE_ESCAPE;
 326                 CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
 327                 continue;
 328             }
 329             else{
 330                 length= ucnv_MBCSFromUChar32(myConverterData->gbConverter->sharedData,
 331                     mySourceChar,&targetUniChar,args->converter->useFallback);
 332
 333             }
 334             /* only DBCS or SBCS characters are expected*/
 335             /* DB haracters with high bit set to 1 are expected */
 336             if(length > 2 || length==0 ||(((targetUniChar & 0x8080) != 0x8080)&& length==2)){
 337                 targetUniChar= missingCharMarker;
 338             }
 339             if (targetUniChar != missingCharMarker){
 340                myConverterData->isTargetUCharDBCS = isTargetUCharDBCS = (UBool)(targetUniChar>0x00FF);
 341                  if(oldIsTargetUCharDBCS != isTargetUCharDBCS || !myConverterData->isEscapeAppended ){
 342                     /*Shifting from a double byte to single byte mode*/
 343                     if(!isTargetUCharDBCS){
 344                         len =ESC_LEN;
 345                         escSeq = SB_ESCAPE;
 346                         CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
 347                         myConverterData->isEscapeAppended =isEscapeAppended =TRUE;
 348                     }
 349                     else{ /* Shifting from a single byte to double byte mode*/
 350                         len =ESC_LEN;
 351                         escSeq = DB_ESCAPE;
 352                         CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
 353                         myConverterData->isEscapeAppended =isEscapeAppended =TRUE;
 354
 355                     }
 356                 }
 357
 358                 if(isTargetUCharDBCS){
 359                     if( myTargetIndex <targetLength){
 360                         myTarget[myTargetIndex++] =(char) ((targetUniChar >> 8) -0x80);
 361                         if(offsets){
 362                             *(offsets++) = mySourceIndex-1;
 363                         }
 364                         if(myTargetIndex < targetLength){
 365                             myTarget[myTargetIndex++] =(char) ((targetUniChar & 0x00FF) -0x80);
 366                             if(offsets){
 367                                 *(offsets++) = mySourceIndex-1;
 368                             }
 369                         }else{
 370                             args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80);
 371                             *err = U_BUFFER_OVERFLOW_ERROR;
 372                         }
 373                     }else{
 374                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) ((targetUniChar >> 8) -0x80);
 375                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80);
 376                         *err = U_BUFFER_OVERFLOW_ERROR;
 377                     }
 378
 379                 }else{
 380                     if( myTargetIndex <targetLength){
 381                         myTarget[myTargetIndex++] = (char) (targetUniChar );
 382                         if(offsets){
 383                             *(offsets++) = mySourceIndex-1;
 384                         }
 385
 386                     }else{
 387                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar;
 388                         *err = U_BUFFER_OVERFLOW_ERROR;
 389                     }
 390                 }
 391
 392             }
 393             else{
 394                 /* oops.. the code point is unassigned */
 395                 /*Handle surrogates */
 396                 /*check if the char is a First surrogate*/
 397                 if(UTF_IS_SURROGATE(mySourceChar)) {
 398                     if(UTF_IS_SURROGATE_FIRST(mySourceChar)) {
 399                         args->converter->fromUChar32=mySourceChar;
 400 getTrail:
 401                         /*look ahead to find the trail surrogate*/
 402                         if(mySourceIndex <  mySourceLength) {
 403                             /* test the following code unit */
 404                             UChar trail=(UChar) args->source[mySourceIndex];
 405                             if(UTF_IS_SECOND_SURROGATE(trail)) {
 406                                 ++mySourceIndex;
 407                                 mySourceChar=UTF16_GET_PAIR_VALUE(args->converter->fromUChar32, trail);
 408                                 args->converter->fromUChar32=0x00;
 409                                 /* there are no surrogates in GB2312*/
 410                                 *err = U_INVALID_CHAR_FOUND;
 411                                 /* exit this condition tree */
 412                             } else {
 413                                 /* this is an unmatched lead code unit (1st surrogate) */
 414                                 /* callback(illegal) */
 415                                 *err=U_ILLEGAL_CHAR_FOUND;
 416                             }
 417                         } else {
 418                             /* no more input */
 419                             *err = U_ZERO_ERROR;
 420                         }
 421                     } else {
 422                         /* this is an unmatched trail code unit (2nd surrogate) */
 423                         /* callback(illegal) */
 424                         *err=U_ILLEGAL_CHAR_FOUND;
 425                     }
 426                 } else {
 427                     /* callback(unassigned) for a BMP code point */
 428                     *err = U_INVALID_CHAR_FOUND;
 429                 }
 430
 431                 args->converter->fromUChar32=mySourceChar;
 432                 break;
 433             }
 434         }
 435         else{
 436             *err = U_BUFFER_OVERFLOW_ERROR;
 437             break;
 438         }
 439         targetUniChar=missingCharMarker;
 440     }
 441
 442     args->target += myTargetIndex;
 443     args->source += mySourceIndex;
 444     myConverterData->isTargetUCharDBCS = isTargetUCharDBCS;
 445 }
 446
 447 static void
 448 _HZ_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
 449     UConverter *cnv = args->converter;
 450     UConverterDataHZ *convData=(UConverterDataHZ *) cnv->extraInfo;
 451     char *p;
 452     char buffer[4];
 453     p = buffer;
 454
 455     if( convData->isTargetUCharDBCS){
 456         *p++= UCNV_TILDE;
 457         *p++= UCNV_CLOSE_BRACE;
 458         convData->isTargetUCharDBCS=FALSE;
 459     }
 460     *p++= cnv->subChar[0];
 461
 462     ucnv_cbFromUWriteBytes(args,
 463                            buffer, (int32_t)(p - buffer),
 464                            offsetIndex, err);
 465 }
 466
/*
 * Layout of the buffer that _HZ_SafeClone fills: the cloned converter, the
 * cloned sub-converter and the cloned HZ state all live in one block whose
 * size is reported to the caller as sizeof(struct cloneHZStruct).
 */
struct cloneHZStruct
{
    UConverter cnv;             /* copy of the HZ converter; its address is the clone's handle */
    UAlignedMemory deadSpace1;  /* not referenced in this file; presumably alignment slack -- TODO confirm */
    UConverter subCnv;          /* storage passed to ucnv_safeClone() for the sub-converter */
    UAlignedMemory deadSpace2;  /* not referenced in this file; presumably alignment slack -- TODO confirm */
    UConverterDataHZ mydata;    /* copy of the HZ state; becomes cnv.extraInfo */
};
 476
 477
 478 static UConverter *
 479 _HZ_SafeClone(const UConverter *cnv,
 480               void *stackBuffer,
 481               int32_t *pBufferSize,
 482               UErrorCode *status)
 483 {
 484     struct cloneHZStruct * localClone;
 485     int32_t size, bufferSizeNeeded = sizeof(struct cloneHZStruct);
 486
 487     if (U_FAILURE(*status)){
 488         return 0;
 489     }
 490
 491     if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */
 492         *pBufferSize = bufferSizeNeeded;
 493         return 0;
 494     }
 495
 496     localClone = (struct cloneHZStruct *)stackBuffer;
 497     uprv_memcpy(&localClone->cnv, cnv, sizeof(UConverter));
 498
 499     uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(UConverterDataHZ));
 500     localClone->cnv.extraInfo = &localClone->mydata;
 501     localClone->cnv.isExtraLocal = TRUE;
 502
 503     /* deep-clone the sub-converter */
 504     size = (int32_t)sizeof(UConverter);
 505     ((UConverterDataHZ*)localClone->cnv.extraInfo)->gbConverter =
 506         ucnv_safeClone(((UConverterDataHZ*)cnv->extraInfo)->gbConverter, &localClone->subCnv, &size, status);
 507
 508     return &localClone->cnv;
 509 }
 510
 511 static void
 512 _HZ_GetUnicodeSet(const UConverter *cnv,
 513                   USetAdder *sa,
 514                   UConverterUnicodeSet which,
 515                   UErrorCode *pErrorCode) {
 516     /* the tilde '~' is hardcoded in the converter */
 517     sa->add(sa->set, 0x7e);
 518
 519     /* add all of the code points that the sub-converter handles */
 520     ((UConverterDataHZ*)cnv->extraInfo)->
 521         gbConverter->sharedData->impl->
 522             getUnicodeSet(((UConverterDataHZ*)cnv->extraInfo)->gbConverter,
 523                           sa, which, pErrorCode);
 524 }
 525
/*
 * Function table for the HZ converter.
 * The initializer is positional; the slot labels below are inferred from the
 * names of the functions stored in them -- TODO confirm against the
 * UConverterImpl declaration. NULL slots have no HZ-specific implementation.
 */
static const UConverterImpl _HZImpl={

    UCNV_HZ,                                    /* converter type */

    NULL,
    NULL,

    _HZOpen,                                    /* open */
    _HZClose,                                   /* close */
    _HZReset,                                   /* reset */

    /* the same function is installed twice per direction; it fills
     * args->offsets only when that pointer is non-NULL */
    UConverter_toUnicode_HZ_OFFSETS_LOGIC,
    UConverter_toUnicode_HZ_OFFSETS_LOGIC,
    UConverter_fromUnicode_HZ_OFFSETS_LOGIC,
    UConverter_fromUnicode_HZ_OFFSETS_LOGIC,
    NULL,

    NULL,
    NULL,
    _HZ_WriteSub,                               /* write substitution character */
    _HZ_SafeClone,                              /* safe clone */
    _HZ_GetUnicodeSet                           /* get Unicode set */
};
 549
/*
 * Static description of the HZ converter.
 * The initializer is positional; the field labels below are presumed from
 * the values -- TODO confirm against the UConverterStaticData declaration.
 */
static const UConverterStaticData _HZStaticData={
    sizeof(UConverterStaticData),
        "HZ",                   /* converter name */
         0,
         UCNV_IBM,              /* presumably the platform */
         UCNV_HZ,               /* converter type, same constant as in _HZImpl */
         1,                     /* presumably min bytes per character */
         4,                     /* presumably max bytes per character */
        { 0x1a, 0, 0, 0 },      /* substitution bytes; _HZ_WriteSub emits the first one */
        1,                      /* presumably the substitution length */
        FALSE,
        FALSE,
        0,
        0,
        { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }, /* reserved */

};
 567
 568
/*
 * Shared data record for the HZ converter; the only symbol this file exports.
 * It ties together the static description and the function table defined
 * in this file. The initializer is positional; the meaning of the remaining
 * fields is not visible here -- TODO confirm against the
 * UConverterSharedData declaration.
 */
const UConverterSharedData _HZData={
    sizeof(UConverterSharedData),
        ~((uint32_t) 0),        /* all bits set; presumably a reference count that never reaches zero */
        NULL,
        NULL,
        &_HZStaticData,         /* static description defined in this file */
        FALSE,
        &_HZImpl,               /* function table defined in this file */
        0
};
 579
 580 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */