icuSources/common/ucnv_u32.c

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 2002-2015, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   file name:  ucnv_u32.c
   7 *   encoding:   US-ASCII
   8 *   tab size:   8 (not used)
   9 *   indentation:4
  10 *
  11 *   created on: 2002jul01
  12 *   created by: Markus W. Scherer
  13 *
  14 *   UTF-32 converter implementation. Used to be in ucnv_utf.c.
  15 */
  16
  17 #include "unicode/utypes.h"
  18
  19 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
  20
  21 #include "unicode/ucnv.h"
  22 #include "unicode/utf.h"
  23 #include "ucnv_bld.h"
  24 #include "ucnv_cnv.h"
  25 #include "cmemory.h"
  26
  27 #define MAXIMUM_UCS2            0x0000FFFF
  28 #define MAXIMUM_UTF             0x0010FFFF
  29 #define HALF_SHIFT              10
  30 #define HALF_BASE               0x0010000
  31 #define HALF_MASK               0x3FF
  32 #define SURROGATE_HIGH_START    0xD800
  33 #define SURROGATE_LOW_START     0xDC00
  34
  35 /* -SURROGATE_LOW_START + HALF_BASE */
  36 #define SURROGATE_LOW_BASE      9216
  37
  38 enum {
  39     UCNV_NEED_TO_WRITE_BOM=1
  40 };
  41
  42 /* UTF-32BE ----------------------------------------------------------------- */
  43
  44 static void
  45 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
  46                                 UErrorCode * err)
  47 {
  48     const unsigned char *mySource = (unsigned char *) args->source;
  49     UChar *myTarget = args->target;
  50     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
  51     const UChar *targetLimit = args->targetLimit;
  52     unsigned char *toUBytes = args->converter->toUBytes;
  53     uint32_t ch, i;
  54
  55     /* Restore state of current sequence */
  56     if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
  57         i = args->converter->toULength;       /* restore # of bytes consumed */
  58         args->converter->toULength = 0;
  59
  60         ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
  61         args->converter->toUnicodeStatus = 0;
  62         goto morebytes;
  63     }
  64
  65     while (mySource < sourceLimit && myTarget < targetLimit) {
  66         i = 0;
  67         ch = 0;
  68 morebytes:
  69         while (i < sizeof(uint32_t)) {
  70             if (mySource < sourceLimit) {
  71                 ch = (ch << 8) | (uint8_t)(*mySource);
  72                 toUBytes[i++] = (char) *(mySource++);
  73             }
  74             else {
  75                 /* stores a partially calculated target*/
  76                 /* + 1 to make 0 a valid character */
  77                 args->converter->toUnicodeStatus = ch + 1;
  78                 args->converter->toULength = (int8_t) i;
  79                 goto donefornow;
  80             }
  81         }
  82
  83         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
  84             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
  85             if (ch <= MAXIMUM_UCS2)
  86             {
  87                 /* fits in 16 bits */
  88                 *(myTarget++) = (UChar) ch;
  89             }
  90             else {
  91                 /* write out the surrogates */
  92                 *(myTarget++) = U16_LEAD(ch);
  93                 ch = U16_TRAIL(ch);
  94                 if (myTarget < targetLimit) {
  95                     *(myTarget++) = (UChar)ch;
  96                 }
  97                 else {
  98                     /* Put in overflow buffer (not handled here) */
  99                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
 100                     args->converter->UCharErrorBufferLength = 1;
 101                     *err = U_BUFFER_OVERFLOW_ERROR;
 102                     break;
 103                 }
 104             }
 105         }
 106         else {
 107             args->converter->toULength = (int8_t)i;
 108             *err = U_ILLEGAL_CHAR_FOUND;
 109             break;
 110         }
 111     }
 112
 113 donefornow:
 114     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
 115         /* End of target buffer */
 116         *err = U_BUFFER_OVERFLOW_ERROR;
 117     }
 118
 119     args->target = myTarget;
 120     args->source = (const char *) mySource;
 121 }
 122
 123 static void
 124 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
 125                                              UErrorCode * err)
 126 {
 127     const unsigned char *mySource = (unsigned char *) args->source;
 128     UChar *myTarget = args->target;
 129     int32_t *myOffsets = args->offsets;
 130     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
 131     const UChar *targetLimit = args->targetLimit;
 132     unsigned char *toUBytes = args->converter->toUBytes;
 133     uint32_t ch, i;
 134     int32_t offsetNum = 0;
 135
 136     /* Restore state of current sequence */
 137     if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
 138         i = args->converter->toULength;       /* restore # of bytes consumed */
 139         args->converter->toULength = 0;
 140
 141         ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
 142         args->converter->toUnicodeStatus = 0;
 143         goto morebytes;
 144     }
 145
 146     while (mySource < sourceLimit && myTarget < targetLimit) {
 147         i = 0;
 148         ch = 0;
 149 morebytes:
 150         while (i < sizeof(uint32_t)) {
 151             if (mySource < sourceLimit) {
 152                 ch = (ch << 8) | (uint8_t)(*mySource);
 153                 toUBytes[i++] = (char) *(mySource++);
 154             }
 155             else {
 156                 /* stores a partially calculated target*/
 157                 /* + 1 to make 0 a valid character */
 158                 args->converter->toUnicodeStatus = ch + 1;
 159                 args->converter->toULength = (int8_t) i;
 160                 goto donefornow;
 161             }
 162         }
 163
 164         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
 165             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
 166             if (ch <= MAXIMUM_UCS2) {
 167                 /* fits in 16 bits */
 168                 *(myTarget++) = (UChar) ch;
 169                 *(myOffsets++) = offsetNum;
 170             }
 171             else {
 172                 /* write out the surrogates */
 173                 *(myTarget++) = U16_LEAD(ch);
 174                 *myOffsets++ = offsetNum;
 175                 ch = U16_TRAIL(ch);
 176                 if (myTarget < targetLimit)
 177                 {
 178                     *(myTarget++) = (UChar)ch;
 179                     *(myOffsets++) = offsetNum;
 180                 }
 181                 else {
 182                     /* Put in overflow buffer (not handled here) */
 183                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
 184                     args->converter->UCharErrorBufferLength = 1;
 185                     *err = U_BUFFER_OVERFLOW_ERROR;
 186                     break;
 187                 }
 188             }
 189         }
 190         else {
 191             args->converter->toULength = (int8_t)i;
 192             *err = U_ILLEGAL_CHAR_FOUND;
 193             break;
 194         }
 195         offsetNum += i;
 196     }
 197
 198 donefornow:
 199     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 200     {
 201         /* End of target buffer */
 202         *err = U_BUFFER_OVERFLOW_ERROR;
 203     }
 204
 205     args->target = myTarget;
 206     args->source = (const char *) mySource;
 207     args->offsets = myOffsets;
 208 }
 209
 210 static void
 211 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
 212                                   UErrorCode * err)
 213 {
 214     const UChar *mySource = args->source;
 215     unsigned char *myTarget;
 216     const UChar *sourceLimit = args->sourceLimit;
 217     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
 218     UChar32 ch, ch2;
 219     unsigned int indexToWrite;
 220     unsigned char temp[sizeof(uint32_t)];
 221
 222     if(mySource >= sourceLimit) {
 223         /* no input, nothing to do */
 224         return;
 225     }
 226
 227     /* write the BOM if necessary */
 228     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
 229         static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
 230         ucnv_fromUWriteBytes(args->converter,
 231                              bom, 4,
 232                              &args->target, args->targetLimit,
 233                              &args->offsets, -1,
 234                              err);
 235         args->converter->fromUnicodeStatus=0;
 236     }
 237
 238     myTarget = (unsigned char *) args->target;
 239     temp[0] = 0;
 240
 241     if (args->converter->fromUChar32) {
 242         ch = args->converter->fromUChar32;
 243         args->converter->fromUChar32 = 0;
 244         goto lowsurogate;
 245     }
 246
 247     while (mySource < sourceLimit && myTarget < targetLimit) {
 248         ch = *(mySource++);
 249
 250         if (U_IS_SURROGATE(ch)) {
 251             if (U_IS_LEAD(ch)) {
 252 lowsurogate:
 253                 if (mySource < sourceLimit) {
 254                     ch2 = *mySource;
 255                     if (U_IS_TRAIL(ch2)) {
 256                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
 257                         mySource++;
 258                     }
 259                     else {
 260                         /* this is an unmatched trail code unit (2nd surrogate) */
 261                         /* callback(illegal) */
 262                         args->converter->fromUChar32 = ch;
 263                         *err = U_ILLEGAL_CHAR_FOUND;
 264                         break;
 265                     }
 266                 }
 267                 else {
 268                     /* ran out of source */
 269                     args->converter->fromUChar32 = ch;
 270                     if (args->flush) {
 271                         /* this is an unmatched trail code unit (2nd surrogate) */
 272                         /* callback(illegal) */
 273                         *err = U_ILLEGAL_CHAR_FOUND;
 274                     }
 275                     break;
 276                 }
 277             }
 278             else {
 279                 /* this is an unmatched trail code unit (2nd surrogate) */
 280                 /* callback(illegal) */
 281                 args->converter->fromUChar32 = ch;
 282                 *err = U_ILLEGAL_CHAR_FOUND;
 283                 break;
 284             }
 285         }
 286
 287         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
 288         temp[1] = (uint8_t) (ch >> 16 & 0x1F);
 289         temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
 290         temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
 291
 292         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
 293             if (myTarget < targetLimit) {
 294                 *(myTarget++) = temp[indexToWrite];
 295             }
 296             else {
 297                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
 298                 *err = U_BUFFER_OVERFLOW_ERROR;
 299             }
 300         }
 301     }
 302
 303     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
 304         *err = U_BUFFER_OVERFLOW_ERROR;
 305     }
 306
 307     args->target = (char *) myTarget;
 308     args->source = mySource;
 309 }
 310
 311 static void
 312 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
 313                                                UErrorCode * err)
 314 {
 315     const UChar *mySource = args->source;
 316     unsigned char *myTarget;
 317     int32_t *myOffsets;
 318     const UChar *sourceLimit = args->sourceLimit;
 319     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
 320     UChar32 ch, ch2;
 321     int32_t offsetNum = 0;
 322     unsigned int indexToWrite;
 323     unsigned char temp[sizeof(uint32_t)];
 324
 325     if(mySource >= sourceLimit) {
 326         /* no input, nothing to do */
 327         return;
 328     }
 329
 330     /* write the BOM if necessary */
 331     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
 332         static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
 333         ucnv_fromUWriteBytes(args->converter,
 334                              bom, 4,
 335                              &args->target, args->targetLimit,
 336                              &args->offsets, -1,
 337                              err);
 338         args->converter->fromUnicodeStatus=0;
 339     }
 340
 341     myTarget = (unsigned char *) args->target;
 342     myOffsets = args->offsets;
 343     temp[0] = 0;
 344
 345     if (args->converter->fromUChar32) {
 346         ch = args->converter->fromUChar32;
 347         args->converter->fromUChar32 = 0;
 348         goto lowsurogate;
 349     }
 350
 351     while (mySource < sourceLimit && myTarget < targetLimit) {
 352         ch = *(mySource++);
 353
 354         if (U_IS_SURROGATE(ch)) {
 355             if (U_IS_LEAD(ch)) {
 356 lowsurogate:
 357                 if (mySource < sourceLimit) {
 358                     ch2 = *mySource;
 359                     if (U_IS_TRAIL(ch2)) {
 360                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
 361                         mySource++;
 362                     }
 363                     else {
 364                         /* this is an unmatched trail code unit (2nd surrogate) */
 365                         /* callback(illegal) */
 366                         args->converter->fromUChar32 = ch;
 367                         *err = U_ILLEGAL_CHAR_FOUND;
 368                         break;
 369                     }
 370                 }
 371                 else {
 372                     /* ran out of source */
 373                     args->converter->fromUChar32 = ch;
 374                     if (args->flush) {
 375                         /* this is an unmatched trail code unit (2nd surrogate) */
 376                         /* callback(illegal) */
 377                         *err = U_ILLEGAL_CHAR_FOUND;
 378                     }
 379                     break;
 380                 }
 381             }
 382             else {
 383                 /* this is an unmatched trail code unit (2nd surrogate) */
 384                 /* callback(illegal) */
 385                 args->converter->fromUChar32 = ch;
 386                 *err = U_ILLEGAL_CHAR_FOUND;
 387                 break;
 388             }
 389         }
 390
 391         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
 392         temp[1] = (uint8_t) (ch >> 16 & 0x1F);
 393         temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
 394         temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
 395
 396         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
 397             if (myTarget < targetLimit) {
 398                 *(myTarget++) = temp[indexToWrite];
 399                 *(myOffsets++) = offsetNum;
 400             }
 401             else {
 402                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
 403                 *err = U_BUFFER_OVERFLOW_ERROR;
 404             }
 405         }
 406         offsetNum = offsetNum + 1 + (temp[1] != 0);
 407     }
 408
 409     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
 410         *err = U_BUFFER_OVERFLOW_ERROR;
 411     }
 412
 413     args->target = (char *) myTarget;
 414     args->source = mySource;
 415     args->offsets = myOffsets;
 416 }
 417
 418 static UChar32
 419 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
 420                                    UErrorCode* err)
 421 {
 422     const uint8_t *mySource;
 423     UChar32 myUChar;
 424     int32_t length;
 425
 426     mySource = (const uint8_t *)args->source;
 427     if (mySource >= (const uint8_t *)args->sourceLimit)
 428     {
 429         /* no input */
 430         *err = U_INDEX_OUTOFBOUNDS_ERROR;
 431         return 0xffff;
 432     }
 433
 434     length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
 435     if (length < 4)
 436     {
 437         /* got a partial character */
 438         uprv_memcpy(args->converter->toUBytes, mySource, length);
 439         args->converter->toULength = (int8_t)length;
 440         args->source = (const char *)(mySource + length);
 441         *err = U_TRUNCATED_CHAR_FOUND;
 442         return 0xffff;
 443     }
 444
 445     /* Don't even try to do a direct cast because the value may be on an odd address. */
 446     myUChar = ((UChar32)mySource[0] << 24)
 447             | ((UChar32)mySource[1] << 16)
 448             | ((UChar32)mySource[2] << 8)
 449             | ((UChar32)mySource[3]);
 450
 451     args->source = (const char *)(mySource + 4);
 452     if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
 453         return myUChar;
 454     }
 455
 456     uprv_memcpy(args->converter->toUBytes, mySource, 4);
 457     args->converter->toULength = 4;
 458
 459     *err = U_ILLEGAL_CHAR_FOUND;
 460     return 0xffff;
 461 }
 462
 463 static const UConverterImpl _UTF32BEImpl = {
 464     UCNV_UTF32_BigEndian,
 465
 466     NULL,
 467     NULL,
 468
 469     NULL,
 470     NULL,
 471     NULL,
 472
 473     T_UConverter_toUnicode_UTF32_BE,
 474     T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC,
 475     T_UConverter_fromUnicode_UTF32_BE,
 476     T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
 477     T_UConverter_getNextUChar_UTF32_BE,
 478
 479     NULL,
 480     NULL,
 481     NULL,
 482     NULL,
 483     ucnv_getNonSurrogateUnicodeSet
 484 };
 485
 486 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
 487 static const UConverterStaticData _UTF32BEStaticData = {
 488     sizeof(UConverterStaticData),
 489     "UTF-32BE",
 490     1232,
 491     UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,
 492     { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE,
 493     0,
 494     0,
 495     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 496 };
 497
 498 const UConverterSharedData _UTF32BEData =
 499         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32BEStaticData, &_UTF32BEImpl);
 500
 501 /* UTF-32LE ---------------------------------------------------------- */
 502
 503 static void
 504 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
 505                                 UErrorCode * err)
 506 {
 507     const unsigned char *mySource = (unsigned char *) args->source;
 508     UChar *myTarget = args->target;
 509     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
 510     const UChar *targetLimit = args->targetLimit;
 511     unsigned char *toUBytes = args->converter->toUBytes;
 512     uint32_t ch, i;
 513
 514     /* Restore state of current sequence */
 515     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
 516     {
 517         i = args->converter->toULength;       /* restore # of bytes consumed */
 518         args->converter->toULength = 0;
 519
 520         /* Stores the previously calculated ch from a previous call*/
 521         ch = args->converter->toUnicodeStatus - 1;
 522         args->converter->toUnicodeStatus = 0;
 523         goto morebytes;
 524     }
 525
 526     while (mySource < sourceLimit && myTarget < targetLimit)
 527     {
 528         i = 0;
 529         ch = 0;
 530 morebytes:
 531         while (i < sizeof(uint32_t))
 532         {
 533             if (mySource < sourceLimit)
 534             {
 535                 ch |= ((uint8_t)(*mySource)) << (i * 8);
 536                 toUBytes[i++] = (char) *(mySource++);
 537             }
 538             else
 539             {
 540                 /* stores a partially calculated target*/
 541                 /* + 1 to make 0 a valid character */
 542                 args->converter->toUnicodeStatus = ch + 1;
 543                 args->converter->toULength = (int8_t) i;
 544                 goto donefornow;
 545             }
 546         }
 547
 548         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
 549             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
 550             if (ch <= MAXIMUM_UCS2) {
 551                 /* fits in 16 bits */
 552                 *(myTarget++) = (UChar) ch;
 553             }
 554             else {
 555                 /* write out the surrogates */
 556                 *(myTarget++) = U16_LEAD(ch);
 557                 ch = U16_TRAIL(ch);
 558                 if (myTarget < targetLimit) {
 559                     *(myTarget++) = (UChar)ch;
 560                 }
 561                 else {
 562                     /* Put in overflow buffer (not handled here) */
 563                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
 564                     args->converter->UCharErrorBufferLength = 1;
 565                     *err = U_BUFFER_OVERFLOW_ERROR;
 566                     break;
 567                 }
 568             }
 569         }
 570         else {
 571             args->converter->toULength = (int8_t)i;
 572             *err = U_ILLEGAL_CHAR_FOUND;
 573             break;
 574         }
 575     }
 576
 577 donefornow:
 578     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 579     {
 580         /* End of target buffer */
 581         *err = U_BUFFER_OVERFLOW_ERROR;
 582     }
 583
 584     args->target = myTarget;
 585     args->source = (const char *) mySource;
 586 }
 587
 588 static void
 589 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
 590                                              UErrorCode * err)
 591 {
 592     const unsigned char *mySource = (unsigned char *) args->source;
 593     UChar *myTarget = args->target;
 594     int32_t *myOffsets = args->offsets;
 595     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
 596     const UChar *targetLimit = args->targetLimit;
 597     unsigned char *toUBytes = args->converter->toUBytes;
 598     uint32_t ch, i;
 599     int32_t offsetNum = 0;
 600
 601     /* Restore state of current sequence */
 602     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
 603     {
 604         i = args->converter->toULength;       /* restore # of bytes consumed */
 605         args->converter->toULength = 0;
 606
 607         /* Stores the previously calculated ch from a previous call*/
 608         ch = args->converter->toUnicodeStatus - 1;
 609         args->converter->toUnicodeStatus = 0;
 610         goto morebytes;
 611     }
 612
 613     while (mySource < sourceLimit && myTarget < targetLimit)
 614     {
 615         i = 0;
 616         ch = 0;
 617 morebytes:
 618         while (i < sizeof(uint32_t))
 619         {
 620             if (mySource < sourceLimit)
 621             {
 622                 ch |= ((uint8_t)(*mySource)) << (i * 8);
 623                 toUBytes[i++] = (char) *(mySource++);
 624             }
 625             else
 626             {
 627                 /* stores a partially calculated target*/
 628                 /* + 1 to make 0 a valid character */
 629                 args->converter->toUnicodeStatus = ch + 1;
 630                 args->converter->toULength = (int8_t) i;
 631                 goto donefornow;
 632             }
 633         }
 634
 635         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch))
 636         {
 637             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
 638             if (ch <= MAXIMUM_UCS2)
 639             {
 640                 /* fits in 16 bits */
 641                 *(myTarget++) = (UChar) ch;
 642                 *(myOffsets++) = offsetNum;
 643             }
 644             else {
 645                 /* write out the surrogates */
 646                 *(myTarget++) = U16_LEAD(ch);
 647                 *(myOffsets++) = offsetNum;
 648                 ch = U16_TRAIL(ch);
 649                 if (myTarget < targetLimit)
 650                 {
 651                     *(myTarget++) = (UChar)ch;
 652                     *(myOffsets++) = offsetNum;
 653                 }
 654                 else
 655                 {
 656                     /* Put in overflow buffer (not handled here) */
 657                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
 658                     args->converter->UCharErrorBufferLength = 1;
 659                     *err = U_BUFFER_OVERFLOW_ERROR;
 660                     break;
 661                 }
 662             }
 663         }
 664         else
 665         {
 666             args->converter->toULength = (int8_t)i;
 667             *err = U_ILLEGAL_CHAR_FOUND;
 668             break;
 669         }
 670         offsetNum += i;
 671     }
 672
 673 donefornow:
 674     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 675     {
 676         /* End of target buffer */
 677         *err = U_BUFFER_OVERFLOW_ERROR;
 678     }
 679
 680     args->target = myTarget;
 681     args->source = (const char *) mySource;
 682     args->offsets = myOffsets;
 683 }
 684
 685 static void
 686 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
 687                                   UErrorCode * err)
 688 {
 689     const UChar *mySource = args->source;
 690     unsigned char *myTarget;
 691     const UChar *sourceLimit = args->sourceLimit;
 692     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
 693     UChar32 ch, ch2;
 694     unsigned int indexToWrite;
 695     unsigned char temp[sizeof(uint32_t)];
 696
 697     if(mySource >= sourceLimit) {
 698         /* no input, nothing to do */
 699         return;
 700     }
 701
 702     /* write the BOM if necessary */
 703     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
 704         static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
 705         ucnv_fromUWriteBytes(args->converter,
 706                              bom, 4,
 707                              &args->target, args->targetLimit,
 708                              &args->offsets, -1,
 709                              err);
 710         args->converter->fromUnicodeStatus=0;
 711     }
 712
 713     myTarget = (unsigned char *) args->target;
 714     temp[3] = 0;
 715
 716     if (args->converter->fromUChar32)
 717     {
 718         ch = args->converter->fromUChar32;
 719         args->converter->fromUChar32 = 0;
 720         goto lowsurogate;
 721     }
 722
 723     while (mySource < sourceLimit && myTarget < targetLimit)
 724     {
 725         ch = *(mySource++);
 726
 727         if (U16_IS_SURROGATE(ch)) {
 728             if (U16_IS_LEAD(ch))
 729             {
 730 lowsurogate:
 731                 if (mySource < sourceLimit)
 732                 {
 733                     ch2 = *mySource;
 734                     if (U16_IS_TRAIL(ch2)) {
 735                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
 736                         mySource++;
 737                     }
 738                     else {
 739                         /* this is an unmatched trail code unit (2nd surrogate) */
 740                         /* callback(illegal) */
 741                         args->converter->fromUChar32 = ch;
 742                         *err = U_ILLEGAL_CHAR_FOUND;
 743                         break;
 744                     }
 745                 }
 746                 else {
 747                     /* ran out of source */
 748                     args->converter->fromUChar32 = ch;
 749                     if (args->flush) {
 750                         /* this is an unmatched trail code unit (2nd surrogate) */
 751                         /* callback(illegal) */
 752                         *err = U_ILLEGAL_CHAR_FOUND;
 753                     }
 754                     break;
 755                 }
 756             }
 757             else {
 758                 /* this is an unmatched trail code unit (2nd surrogate) */
 759                 /* callback(illegal) */
 760                 args->converter->fromUChar32 = ch;
 761                 *err = U_ILLEGAL_CHAR_FOUND;
 762                 break;
 763             }
 764         }
 765
 766         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
 767         temp[2] = (uint8_t) (ch >> 16 & 0x1F);
 768         temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
 769         temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
 770
 771         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
 772         {
 773             if (myTarget < targetLimit)
 774             {
 775                 *(myTarget++) = temp[indexToWrite];
 776             }
 777             else
 778             {
 779                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
 780                 *err = U_BUFFER_OVERFLOW_ERROR;
 781             }
 782         }
 783     }
 784
 785     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 786     {
 787         *err = U_BUFFER_OVERFLOW_ERROR;
 788     }
 789
 790     args->target = (char *) myTarget;
 791     args->source = mySource;
 792 }
 793
 794 static void
 795 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
 796                                                UErrorCode * err)
 797 {
 798     const UChar *mySource = args->source;
 799     unsigned char *myTarget;
 800     int32_t *myOffsets;
 801     const UChar *sourceLimit = args->sourceLimit;
 802     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
 803     UChar32 ch, ch2;
 804     unsigned int indexToWrite;
 805     unsigned char temp[sizeof(uint32_t)];
 806     int32_t offsetNum = 0;
 807
 808     if(mySource >= sourceLimit) {
 809         /* no input, nothing to do */
 810         return;
 811     }
 812
 813     /* write the BOM if necessary */
 814     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
 815         static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
 816         ucnv_fromUWriteBytes(args->converter,
 817                              bom, 4,
 818                              &args->target, args->targetLimit,
 819                              &args->offsets, -1,
 820                              err);
 821         args->converter->fromUnicodeStatus=0;
 822     }
 823
 824     myTarget = (unsigned char *) args->target;
 825     myOffsets = args->offsets;
 826     temp[3] = 0;
 827
 828     if (args->converter->fromUChar32)
 829     {
 830         ch = args->converter->fromUChar32;
 831         args->converter->fromUChar32 = 0;
 832         goto lowsurogate;
 833     }
 834
 835     while (mySource < sourceLimit && myTarget < targetLimit)
 836     {
 837         ch = *(mySource++);
 838
 839         if (U16_IS_SURROGATE(ch)) {
 840             if (U16_IS_LEAD(ch))
 841             {
 842 lowsurogate:
 843                 if (mySource < sourceLimit)
 844                 {
 845                     ch2 = *mySource;
 846                     if (U16_IS_TRAIL(ch2))
 847                     {
 848                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
 849                         mySource++;
 850                     }
 851                     else {
 852                         /* this is an unmatched trail code unit (2nd surrogate) */
 853                         /* callback(illegal) */
 854                         args->converter->fromUChar32 = ch;
 855                         *err = U_ILLEGAL_CHAR_FOUND;
 856                         break;
 857                     }
 858                 }
 859                 else {
 860                     /* ran out of source */
 861                     args->converter->fromUChar32 = ch;
 862                     if (args->flush) {
 863                         /* this is an unmatched trail code unit (2nd surrogate) */
 864                         /* callback(illegal) */
 865                         *err = U_ILLEGAL_CHAR_FOUND;
 866                     }
 867                     break;
 868                 }
 869             }
 870             else {
 871                 /* this is an unmatched trail code unit (2nd surrogate) */
 872                 /* callback(illegal) */
 873                 args->converter->fromUChar32 = ch;
 874                 *err = U_ILLEGAL_CHAR_FOUND;
 875                 break;
 876             }
 877         }
 878
 879         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
 880         temp[2] = (uint8_t) (ch >> 16 & 0x1F);
 881         temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
 882         temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
 883
 884         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
 885         {
 886             if (myTarget < targetLimit)
 887             {
 888                 *(myTarget++) = temp[indexToWrite];
 889                 *(myOffsets++) = offsetNum;
 890             }
 891             else
 892             {
 893                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
 894                 *err = U_BUFFER_OVERFLOW_ERROR;
 895             }
 896         }
 897         offsetNum = offsetNum + 1 + (temp[2] != 0);
 898     }
 899
 900     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 901     {
 902         *err = U_BUFFER_OVERFLOW_ERROR;
 903     }
 904
 905     args->target = (char *) myTarget;
 906     args->source = mySource;
 907     args->offsets = myOffsets;
 908 }
 909
 910 static UChar32
 911 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
 912                                    UErrorCode* err)
 913 {
 914     const uint8_t *mySource;
 915     UChar32 myUChar;
 916     int32_t length;
 917
 918     mySource = (const uint8_t *)args->source;
 919     if (mySource >= (const uint8_t *)args->sourceLimit)
 920     {
 921         /* no input */
 922         *err = U_INDEX_OUTOFBOUNDS_ERROR;
 923         return 0xffff;
 924     }
 925
 926     length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
 927     if (length < 4)
 928     {
 929         /* got a partial character */
 930         uprv_memcpy(args->converter->toUBytes, mySource, length);
 931         args->converter->toULength = (int8_t)length;
 932         args->source = (const char *)(mySource + length);
 933         *err = U_TRUNCATED_CHAR_FOUND;
 934         return 0xffff;
 935     }
 936
 937     /* Don't even try to do a direct cast because the value may be on an odd address. */
 938     myUChar = ((UChar32)mySource[3] << 24)
 939             | ((UChar32)mySource[2] << 16)
 940             | ((UChar32)mySource[1] << 8)
 941             | ((UChar32)mySource[0]);
 942
 943     args->source = (const char *)(mySource + 4);
 944     if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
 945         return myUChar;
 946     }
 947
 948     uprv_memcpy(args->converter->toUBytes, mySource, 4);
 949     args->converter->toULength = 4;
 950
 951     *err = U_ILLEGAL_CHAR_FOUND;
 952     return 0xffff;
 953 }
 954
 955 static const UConverterImpl _UTF32LEImpl = {
 956     UCNV_UTF32_LittleEndian,
 957
 958     NULL,
 959     NULL,
 960
 961     NULL,
 962     NULL,
 963     NULL,
 964
 965     T_UConverter_toUnicode_UTF32_LE,
 966     T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC,
 967     T_UConverter_fromUnicode_UTF32_LE,
 968     T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
 969     T_UConverter_getNextUChar_UTF32_LE,
 970
 971     NULL,
 972     NULL,
 973     NULL,
 974     NULL,
 975     ucnv_getNonSurrogateUnicodeSet
 976 };
 977
 978 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
 979 static const UConverterStaticData _UTF32LEStaticData = {
 980     sizeof(UConverterStaticData),
 981     "UTF-32LE",
 982     1234,
 983     UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4,
 984     { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE,
 985     0,
 986     0,
 987     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 988 };
 989
 990
 991 const UConverterSharedData _UTF32LEData =
 992         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32LEStaticData, &_UTF32LEImpl);
 993
 994 /* UTF-32 (Detect BOM) ------------------------------------------------------ */
 995
 996 /*
 997  * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
 998  * accordingly.
 999  *
1000  * State values:
1001  * 0    initial state
1002  * 1    saw 00
1003  * 2    saw 00 00
1004  * 3    saw 00 00 FE
1005  * 4    -
1006  * 5    saw FF
1007  * 6    saw FF FE
1008  * 7    saw FF FE 00
1009  * 8    UTF-32BE mode
1010  * 9    UTF-32LE mode
1011  *
1012  * During detection: state&3==number of matching bytes so far.
1013  *
1014  * On output, emit U+FEFF as the first code point.
1015  */
1016
1017 static void
1018 _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
1019     if(choice<=UCNV_RESET_TO_UNICODE) {
1020         /* reset toUnicode: state=0 */
1021         cnv->mode=0;
1022     }
1023     if(choice!=UCNV_RESET_TO_UNICODE) {
1024         /* reset fromUnicode: prepare to output the UTF-32PE BOM */
1025         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1026     }
1027 }
1028
1029 static void
1030 _UTF32Open(UConverter *cnv,
1031            UConverterLoadArgs *pArgs,
1032            UErrorCode *pErrorCode) {
1033     _UTF32Reset(cnv, UCNV_RESET_BOTH);
1034 }
1035
/*
 * Both UTF-32 byte order marks back to back, indexed directly by the
 * detection state: entries 0..3 are matched in states 0..3, entries 4..7 in
 * states 4..7 (state&4 selects the BOM, state&3 the position within it).
 */
static const char utf32BOM[8]={
    0, 0, (char)0xfe, (char)0xff,       /* [0..3] big-endian BOM    00 00 FE FF */
    (char)0xff, (char)0xfe, 0, 0        /* [4..7] little-endian BOM FF FE 00 00 */
};
1037
1038 static void
1039 _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1040                            UErrorCode *pErrorCode) {
1041     UConverter *cnv=pArgs->converter;
1042     const char *source=pArgs->source;
1043     const char *sourceLimit=pArgs->sourceLimit;
1044     int32_t *offsets=pArgs->offsets;
1045
1046     int32_t state, offsetDelta;
1047     char b;
1048
1049     state=cnv->mode;
1050
1051     /*
1052      * If we detect a BOM in this buffer, then we must add the BOM size to the
1053      * offsets because the actual converter function will not see and count the BOM.
1054      * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1055      */
1056     offsetDelta=0;
1057
1058     while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
1059         switch(state) {
1060         case 0:
1061             b=*source;
1062             if(b==0) {
1063                 state=1; /* could be 00 00 FE FF */
1064             } else if(b==(char)0xff) {
1065                 state=5; /* could be FF FE 00 00 */
1066             } else {
1067                 state=8; /* default to UTF-32BE */
1068                 continue;
1069             }
1070             ++source;
1071             break;
1072         case 1:
1073         case 2:
1074         case 3:
1075         case 5:
1076         case 6:
1077         case 7:
1078             if(*source==utf32BOM[state]) {
1079                 ++state;
1080                 ++source;
1081                 if(state==4) {
1082                     state=8; /* detect UTF-32BE */
1083                     offsetDelta=(int32_t)(source-pArgs->source);
1084                 } else if(state==8) {
1085                     state=9; /* detect UTF-32LE */
1086                     offsetDelta=(int32_t)(source-pArgs->source);
1087                 }
1088             } else {
1089                 /* switch to UTF-32BE and pass the previous bytes */
1090                 int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */
1091
1092                 /* reset the source */
1093                 source=pArgs->source;
1094
1095                 if(count==(state&3)) {
1096                     /* simple: all in the same buffer, just reset source */
1097                 } else {
1098                     UBool oldFlush=pArgs->flush;
1099
1100                     /* some of the bytes are from a previous buffer, replay those first */
1101                     pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
1102                     pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */
1103                     pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */
1104
1105                     /* no offsets: bytes from previous buffer, and not enough for output */
1106                     T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1107
1108                     /* restore real pointers; pArgs->source will be set in case 8/9 */
1109                     pArgs->sourceLimit=sourceLimit;
1110                     pArgs->flush=oldFlush;
1111                 }
1112                 state=8;
1113                 continue;
1114             }
1115             break;
1116         case 8:
1117             /* call UTF-32BE */
1118             pArgs->source=source;
1119             if(offsets==NULL) {
1120                 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1121             } else {
1122                 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode);
1123             }
1124             source=pArgs->source;
1125             break;
1126         case 9:
1127             /* call UTF-32LE */
1128             pArgs->source=source;
1129             if(offsets==NULL) {
1130                 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
1131             } else {
1132                 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode);
1133             }
1134             source=pArgs->source;
1135             break;
1136         default:
1137             break; /* does not occur */
1138         }
1139     }
1140
1141     /* add BOM size to offsets - see comment at offsetDelta declaration */
1142     if(offsets!=NULL && offsetDelta!=0) {
1143         int32_t *offsetsLimit=pArgs->offsets;
1144         while(offsets<offsetsLimit) {
1145             *offsets++ += offsetDelta;
1146         }
1147     }
1148
1149     pArgs->source=source;
1150
1151     if(source==sourceLimit && pArgs->flush) {
1152         /* handle truncated input */
1153         switch(state) {
1154         case 0:
1155             break; /* no input at all, nothing to do */
1156         case 8:
1157             T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1158             break;
1159         case 9:
1160             T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
1161             break;
1162         default:
1163             /* handle 0<state<8: call UTF-32BE with too-short input */
1164             pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
1165             pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */
1166
1167             /* no offsets: not enough for output */
1168             T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1169             pArgs->source=source;
1170             pArgs->sourceLimit=sourceLimit;
1171             state=8;
1172             break;
1173         }
1174     }
1175
1176     cnv->mode=state;
1177 }
1178
1179 static UChar32
1180 _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs,
1181                    UErrorCode *pErrorCode) {
1182     switch(pArgs->converter->mode) {
1183     case 8:
1184         return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode);
1185     case 9:
1186         return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);
1187     default:
1188         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1189     }
1190 }
1191
1192 static const UConverterImpl _UTF32Impl = {
1193     UCNV_UTF32,
1194
1195     NULL,
1196     NULL,
1197
1198     _UTF32Open,
1199     NULL,
1200     _UTF32Reset,
1201
1202     _UTF32ToUnicodeWithOffsets,
1203     _UTF32ToUnicodeWithOffsets,
1204 #if U_IS_BIG_ENDIAN
1205     T_UConverter_fromUnicode_UTF32_BE,
1206     T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
1207 #else
1208     T_UConverter_fromUnicode_UTF32_LE,
1209     T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
1210 #endif
1211     _UTF32GetNextUChar,
1212
1213     NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
1214     NULL,
1215     NULL,
1216     NULL,
1217     ucnv_getNonSurrogateUnicodeSet
1218 };
1219
1220 /* The 1236 CCSID refers to any version of Unicode with a BOM sensitive endianess of UTF-32 */
1221 static const UConverterStaticData _UTF32StaticData = {
1222     sizeof(UConverterStaticData),
1223     "UTF-32",
1224     1236,
1225     UCNV_IBM, UCNV_UTF32, 4, 4,
1226 #if U_IS_BIG_ENDIAN
1227     { 0, 0, 0xff, 0xfd }, 4,
1228 #else
1229     { 0xfd, 0xff, 0, 0 }, 4,
1230 #endif
1231     FALSE, FALSE,
1232     0,
1233     0,
1234     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1235 };
1236
1237 const UConverterSharedData _UTF32Data =
1238         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32StaticData, &_UTF32Impl);
1239
1240 #endif