icuSources/common/ucnv_u32.c

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 2002-2011, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   file name:  ucnv_u32.c
   7 *   encoding:   US-ASCII
   8 *   tab size:   8 (not used)
   9 *   indentation:4
  10 *
  11 *   created on: 2002jul01
  12 *   created by: Markus W. Scherer
  13 *
  14 *   UTF-32 converter implementation. Used to be in ucnv_utf.c.
  15 */
  16
  17 #include "unicode/utypes.h"
  18
  19 #if !UCONFIG_NO_CONVERSION
  20
  21 #include "unicode/ucnv.h"
  22 #include "unicode/utf.h"
  23 #include "ucnv_bld.h"
  24 #include "ucnv_cnv.h"
  25 #include "cmemory.h"
  26
  27 #define MAXIMUM_UCS2            0x0000FFFF
  28 #define MAXIMUM_UTF             0x0010FFFF
  29 #define HALF_SHIFT              10
  30 #define HALF_BASE               0x0010000
  31 #define HALF_MASK               0x3FF
  32 #define SURROGATE_HIGH_START    0xD800
  33 #define SURROGATE_LOW_START     0xDC00
  34
  35 /* -SURROGATE_LOW_START + HALF_BASE */
  36 #define SURROGATE_LOW_BASE      9216
  37
  38 enum {
  39     UCNV_NEED_TO_WRITE_BOM=1
  40 };
  41
  42 /* UTF-32BE ----------------------------------------------------------------- */
  43
  44 static void
  45 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
  46                                 UErrorCode * err)
  47 {
  48     const unsigned char *mySource = (unsigned char *) args->source;
  49     UChar *myTarget = args->target;
  50     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
  51     const UChar *targetLimit = args->targetLimit;
  52     unsigned char *toUBytes = args->converter->toUBytes;
  53     uint32_t ch, i;
  54
  55     /* Restore state of current sequence */
  56     if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
  57         i = args->converter->toULength;       /* restore # of bytes consumed */
  58         args->converter->toULength = 0;
  59
  60         ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
  61         args->converter->toUnicodeStatus = 0;
  62         goto morebytes;
  63     }
  64
  65     while (mySource < sourceLimit && myTarget < targetLimit) {
  66         i = 0;
  67         ch = 0;
  68 morebytes:
  69         while (i < sizeof(uint32_t)) {
  70             if (mySource < sourceLimit) {
  71                 ch = (ch << 8) | (uint8_t)(*mySource);
  72                 toUBytes[i++] = (char) *(mySource++);
  73             }
  74             else {
  75                 /* stores a partially calculated target*/
  76                 /* + 1 to make 0 a valid character */
  77                 args->converter->toUnicodeStatus = ch + 1;
  78                 args->converter->toULength = (int8_t) i;
  79                 goto donefornow;
  80             }
  81         }
  82
  83         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
  84             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
  85             if (ch <= MAXIMUM_UCS2)
  86             {
  87                 /* fits in 16 bits */
  88                 *(myTarget++) = (UChar) ch;
  89             }
  90             else {
  91                 /* write out the surrogates */
  92                 *(myTarget++) = U16_LEAD(ch);
  93                 ch = U16_TRAIL(ch);
  94                 if (myTarget < targetLimit) {
  95                     *(myTarget++) = (UChar)ch;
  96                 }
  97                 else {
  98                     /* Put in overflow buffer (not handled here) */
  99                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
 100                     args->converter->UCharErrorBufferLength = 1;
 101                     *err = U_BUFFER_OVERFLOW_ERROR;
 102                     break;
 103                 }
 104             }
 105         }
 106         else {
 107             args->converter->toULength = (int8_t)i;
 108             *err = U_ILLEGAL_CHAR_FOUND;
 109             break;
 110         }
 111     }
 112
 113 donefornow:
 114     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
 115         /* End of target buffer */
 116         *err = U_BUFFER_OVERFLOW_ERROR;
 117     }
 118
 119     args->target = myTarget;
 120     args->source = (const char *) mySource;
 121 }
 122
 123 static void
 124 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
 125                                              UErrorCode * err)
 126 {
 127     const unsigned char *mySource = (unsigned char *) args->source;
 128     UChar *myTarget = args->target;
 129     int32_t *myOffsets = args->offsets;
 130     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
 131     const UChar *targetLimit = args->targetLimit;
 132     unsigned char *toUBytes = args->converter->toUBytes;
 133     uint32_t ch, i;
 134     int32_t offsetNum = 0;
 135
 136     /* Restore state of current sequence */
 137     if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
 138         i = args->converter->toULength;       /* restore # of bytes consumed */
 139         args->converter->toULength = 0;
 140
 141         ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
 142         args->converter->toUnicodeStatus = 0;
 143         goto morebytes;
 144     }
 145
 146     while (mySource < sourceLimit && myTarget < targetLimit) {
 147         i = 0;
 148         ch = 0;
 149 morebytes:
 150         while (i < sizeof(uint32_t)) {
 151             if (mySource < sourceLimit) {
 152                 ch = (ch << 8) | (uint8_t)(*mySource);
 153                 toUBytes[i++] = (char) *(mySource++);
 154             }
 155             else {
 156                 /* stores a partially calculated target*/
 157                 /* + 1 to make 0 a valid character */
 158                 args->converter->toUnicodeStatus = ch + 1;
 159                 args->converter->toULength = (int8_t) i;
 160                 goto donefornow;
 161             }
 162         }
 163
 164         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
 165             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
 166             if (ch <= MAXIMUM_UCS2) {
 167                 /* fits in 16 bits */
 168                 *(myTarget++) = (UChar) ch;
 169                 *(myOffsets++) = offsetNum;
 170             }
 171             else {
 172                 /* write out the surrogates */
 173                 *(myTarget++) = U16_LEAD(ch);
 174                 *myOffsets++ = offsetNum;
 175                 ch = U16_TRAIL(ch);
 176                 if (myTarget < targetLimit)
 177                 {
 178                     *(myTarget++) = (UChar)ch;
 179                     *(myOffsets++) = offsetNum;
 180                 }
 181                 else {
 182                     /* Put in overflow buffer (not handled here) */
 183                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
 184                     args->converter->UCharErrorBufferLength = 1;
 185                     *err = U_BUFFER_OVERFLOW_ERROR;
 186                     break;
 187                 }
 188             }
 189         }
 190         else {
 191             args->converter->toULength = (int8_t)i;
 192             *err = U_ILLEGAL_CHAR_FOUND;
 193             break;
 194         }
 195         offsetNum += i;
 196     }
 197
 198 donefornow:
 199     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 200     {
 201         /* End of target buffer */
 202         *err = U_BUFFER_OVERFLOW_ERROR;
 203     }
 204
 205     args->target = myTarget;
 206     args->source = (const char *) mySource;
 207     args->offsets = myOffsets;
 208 }
 209
 210 static void
 211 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
 212                                   UErrorCode * err)
 213 {
 214     const UChar *mySource = args->source;
 215     unsigned char *myTarget;
 216     const UChar *sourceLimit = args->sourceLimit;
 217     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
 218     UChar32 ch, ch2;
 219     unsigned int indexToWrite;
 220     unsigned char temp[sizeof(uint32_t)];
 221
 222     if(mySource >= sourceLimit) {
 223         /* no input, nothing to do */
 224         return;
 225     }
 226
 227     /* write the BOM if necessary */
 228     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
 229         static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
 230         ucnv_fromUWriteBytes(args->converter,
 231                              bom, 4,
 232                              &args->target, args->targetLimit,
 233                              &args->offsets, -1,
 234                              err);
 235         args->converter->fromUnicodeStatus=0;
 236     }
 237
 238     myTarget = (unsigned char *) args->target;
 239     temp[0] = 0;
 240
 241     if (args->converter->fromUChar32) {
 242         ch = args->converter->fromUChar32;
 243         args->converter->fromUChar32 = 0;
 244         goto lowsurogate;
 245     }
 246
 247     while (mySource < sourceLimit && myTarget < targetLimit) {
 248         ch = *(mySource++);
 249
 250         if (U_IS_SURROGATE(ch)) {
 251             if (U_IS_LEAD(ch)) {
 252 lowsurogate:
 253                 if (mySource < sourceLimit) {
 254                     ch2 = *mySource;
 255                     if (U_IS_TRAIL(ch2)) {
 256                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
 257                         mySource++;
 258                     }
 259                     else {
 260                         /* this is an unmatched trail code unit (2nd surrogate) */
 261                         /* callback(illegal) */
 262                         args->converter->fromUChar32 = ch;
 263                         *err = U_ILLEGAL_CHAR_FOUND;
 264                         break;
 265                     }
 266                 }
 267                 else {
 268                     /* ran out of source */
 269                     args->converter->fromUChar32 = ch;
 270                     if (args->flush) {
 271                         /* this is an unmatched trail code unit (2nd surrogate) */
 272                         /* callback(illegal) */
 273                         *err = U_ILLEGAL_CHAR_FOUND;
 274                     }
 275                     break;
 276                 }
 277             }
 278             else {
 279                 /* this is an unmatched trail code unit (2nd surrogate) */
 280                 /* callback(illegal) */
 281                 args->converter->fromUChar32 = ch;
 282                 *err = U_ILLEGAL_CHAR_FOUND;
 283                 break;
 284             }
 285         }
 286
 287         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
 288         temp[1] = (uint8_t) (ch >> 16 & 0x1F);
 289         temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
 290         temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
 291
 292         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
 293             if (myTarget < targetLimit) {
 294                 *(myTarget++) = temp[indexToWrite];
 295             }
 296             else {
 297                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
 298                 *err = U_BUFFER_OVERFLOW_ERROR;
 299             }
 300         }
 301     }
 302
 303     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
 304         *err = U_BUFFER_OVERFLOW_ERROR;
 305     }
 306
 307     args->target = (char *) myTarget;
 308     args->source = mySource;
 309 }
 310
 311 static void
 312 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
 313                                                UErrorCode * err)
 314 {
 315     const UChar *mySource = args->source;
 316     unsigned char *myTarget;
 317     int32_t *myOffsets;
 318     const UChar *sourceLimit = args->sourceLimit;
 319     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
 320     UChar32 ch, ch2;
 321     int32_t offsetNum = 0;
 322     unsigned int indexToWrite;
 323     unsigned char temp[sizeof(uint32_t)];
 324
 325     if(mySource >= sourceLimit) {
 326         /* no input, nothing to do */
 327         return;
 328     }
 329
 330     /* write the BOM if necessary */
 331     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
 332         static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
 333         ucnv_fromUWriteBytes(args->converter,
 334                              bom, 4,
 335                              &args->target, args->targetLimit,
 336                              &args->offsets, -1,
 337                              err);
 338         args->converter->fromUnicodeStatus=0;
 339     }
 340
 341     myTarget = (unsigned char *) args->target;
 342     myOffsets = args->offsets;
 343     temp[0] = 0;
 344
 345     if (args->converter->fromUChar32) {
 346         ch = args->converter->fromUChar32;
 347         args->converter->fromUChar32 = 0;
 348         goto lowsurogate;
 349     }
 350
 351     while (mySource < sourceLimit && myTarget < targetLimit) {
 352         ch = *(mySource++);
 353
 354         if (U_IS_SURROGATE(ch)) {
 355             if (U_IS_LEAD(ch)) {
 356 lowsurogate:
 357                 if (mySource < sourceLimit) {
 358                     ch2 = *mySource;
 359                     if (U_IS_TRAIL(ch2)) {
 360                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
 361                         mySource++;
 362                     }
 363                     else {
 364                         /* this is an unmatched trail code unit (2nd surrogate) */
 365                         /* callback(illegal) */
 366                         args->converter->fromUChar32 = ch;
 367                         *err = U_ILLEGAL_CHAR_FOUND;
 368                         break;
 369                     }
 370                 }
 371                 else {
 372                     /* ran out of source */
 373                     args->converter->fromUChar32 = ch;
 374                     if (args->flush) {
 375                         /* this is an unmatched trail code unit (2nd surrogate) */
 376                         /* callback(illegal) */
 377                         *err = U_ILLEGAL_CHAR_FOUND;
 378                     }
 379                     break;
 380                 }
 381             }
 382             else {
 383                 /* this is an unmatched trail code unit (2nd surrogate) */
 384                 /* callback(illegal) */
 385                 args->converter->fromUChar32 = ch;
 386                 *err = U_ILLEGAL_CHAR_FOUND;
 387                 break;
 388             }
 389         }
 390
 391         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
 392         temp[1] = (uint8_t) (ch >> 16 & 0x1F);
 393         temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
 394         temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
 395
 396         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
 397             if (myTarget < targetLimit) {
 398                 *(myTarget++) = temp[indexToWrite];
 399                 *(myOffsets++) = offsetNum;
 400             }
 401             else {
 402                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
 403                 *err = U_BUFFER_OVERFLOW_ERROR;
 404             }
 405         }
 406         offsetNum = offsetNum + 1 + (temp[1] != 0);
 407     }
 408
 409     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
 410         *err = U_BUFFER_OVERFLOW_ERROR;
 411     }
 412
 413     args->target = (char *) myTarget;
 414     args->source = mySource;
 415     args->offsets = myOffsets;
 416 }
 417
 418 static UChar32
 419 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
 420                                    UErrorCode* err)
 421 {
 422     const uint8_t *mySource;
 423     UChar32 myUChar;
 424     int32_t length;
 425
 426     mySource = (const uint8_t *)args->source;
 427     if (mySource >= (const uint8_t *)args->sourceLimit)
 428     {
 429         /* no input */
 430         *err = U_INDEX_OUTOFBOUNDS_ERROR;
 431         return 0xffff;
 432     }
 433
 434     length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
 435     if (length < 4)
 436     {
 437         /* got a partial character */
 438         uprv_memcpy(args->converter->toUBytes, mySource, length);
 439         args->converter->toULength = (int8_t)length;
 440         args->source = (const char *)(mySource + length);
 441         *err = U_TRUNCATED_CHAR_FOUND;
 442         return 0xffff;
 443     }
 444
 445     /* Don't even try to do a direct cast because the value may be on an odd address. */
 446     myUChar = ((UChar32)mySource[0] << 24)
 447             | ((UChar32)mySource[1] << 16)
 448             | ((UChar32)mySource[2] << 8)
 449             | ((UChar32)mySource[3]);
 450
 451     args->source = (const char *)(mySource + 4);
 452     if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
 453         return myUChar;
 454     }
 455
 456     uprv_memcpy(args->converter->toUBytes, mySource, 4);
 457     args->converter->toULength = 4;
 458
 459     *err = U_ILLEGAL_CHAR_FOUND;
 460     return 0xffff;
 461 }
 462
 463 static const UConverterImpl _UTF32BEImpl = {
 464     UCNV_UTF32_BigEndian,
 465
 466     NULL,
 467     NULL,
 468
 469     NULL,
 470     NULL,
 471     NULL,
 472
 473     T_UConverter_toUnicode_UTF32_BE,
 474     T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC,
 475     T_UConverter_fromUnicode_UTF32_BE,
 476     T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
 477     T_UConverter_getNextUChar_UTF32_BE,
 478
 479     NULL,
 480     NULL,
 481     NULL,
 482     NULL,
 483     ucnv_getNonSurrogateUnicodeSet
 484 };
 485
 486 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
 487 static const UConverterStaticData _UTF32BEStaticData = {
 488     sizeof(UConverterStaticData),
 489     "UTF-32BE",
 490     1232,
 491     UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,
 492     { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE,
 493     0,
 494     0,
 495     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 496 };
 497
 498 const UConverterSharedData _UTF32BEData = {
 499     sizeof(UConverterSharedData), ~((uint32_t) 0),
 500     NULL, NULL, &_UTF32BEStaticData, FALSE, &_UTF32BEImpl,
 501     0
 502 };
 503
 504 /* UTF-32LE ---------------------------------------------------------- */
 505
 506 static void
 507 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
 508                                 UErrorCode * err)
 509 {
 510     const unsigned char *mySource = (unsigned char *) args->source;
 511     UChar *myTarget = args->target;
 512     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
 513     const UChar *targetLimit = args->targetLimit;
 514     unsigned char *toUBytes = args->converter->toUBytes;
 515     uint32_t ch, i;
 516
 517     /* Restore state of current sequence */
 518     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
 519     {
 520         i = args->converter->toULength;       /* restore # of bytes consumed */
 521         args->converter->toULength = 0;
 522
 523         /* Stores the previously calculated ch from a previous call*/
 524         ch = args->converter->toUnicodeStatus - 1;
 525         args->converter->toUnicodeStatus = 0;
 526         goto morebytes;
 527     }
 528
 529     while (mySource < sourceLimit && myTarget < targetLimit)
 530     {
 531         i = 0;
 532         ch = 0;
 533 morebytes:
 534         while (i < sizeof(uint32_t))
 535         {
 536             if (mySource < sourceLimit)
 537             {
 538                 ch |= ((uint8_t)(*mySource)) << (i * 8);
 539                 toUBytes[i++] = (char) *(mySource++);
 540             }
 541             else
 542             {
 543                 /* stores a partially calculated target*/
 544                 /* + 1 to make 0 a valid character */
 545                 args->converter->toUnicodeStatus = ch + 1;
 546                 args->converter->toULength = (int8_t) i;
 547                 goto donefornow;
 548             }
 549         }
 550
 551         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
 552             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
 553             if (ch <= MAXIMUM_UCS2) {
 554                 /* fits in 16 bits */
 555                 *(myTarget++) = (UChar) ch;
 556             }
 557             else {
 558                 /* write out the surrogates */
 559                 *(myTarget++) = U16_LEAD(ch);
 560                 ch = U16_TRAIL(ch);
 561                 if (myTarget < targetLimit) {
 562                     *(myTarget++) = (UChar)ch;
 563                 }
 564                 else {
 565                     /* Put in overflow buffer (not handled here) */
 566                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
 567                     args->converter->UCharErrorBufferLength = 1;
 568                     *err = U_BUFFER_OVERFLOW_ERROR;
 569                     break;
 570                 }
 571             }
 572         }
 573         else {
 574             args->converter->toULength = (int8_t)i;
 575             *err = U_ILLEGAL_CHAR_FOUND;
 576             break;
 577         }
 578     }
 579
 580 donefornow:
 581     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 582     {
 583         /* End of target buffer */
 584         *err = U_BUFFER_OVERFLOW_ERROR;
 585     }
 586
 587     args->target = myTarget;
 588     args->source = (const char *) mySource;
 589 }
 590
 591 static void
 592 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
 593                                              UErrorCode * err)
 594 {
 595     const unsigned char *mySource = (unsigned char *) args->source;
 596     UChar *myTarget = args->target;
 597     int32_t *myOffsets = args->offsets;
 598     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
 599     const UChar *targetLimit = args->targetLimit;
 600     unsigned char *toUBytes = args->converter->toUBytes;
 601     uint32_t ch, i;
 602     int32_t offsetNum = 0;
 603
 604     /* Restore state of current sequence */
 605     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
 606     {
 607         i = args->converter->toULength;       /* restore # of bytes consumed */
 608         args->converter->toULength = 0;
 609
 610         /* Stores the previously calculated ch from a previous call*/
 611         ch = args->converter->toUnicodeStatus - 1;
 612         args->converter->toUnicodeStatus = 0;
 613         goto morebytes;
 614     }
 615
 616     while (mySource < sourceLimit && myTarget < targetLimit)
 617     {
 618         i = 0;
 619         ch = 0;
 620 morebytes:
 621         while (i < sizeof(uint32_t))
 622         {
 623             if (mySource < sourceLimit)
 624             {
 625                 ch |= ((uint8_t)(*mySource)) << (i * 8);
 626                 toUBytes[i++] = (char) *(mySource++);
 627             }
 628             else
 629             {
 630                 /* stores a partially calculated target*/
 631                 /* + 1 to make 0 a valid character */
 632                 args->converter->toUnicodeStatus = ch + 1;
 633                 args->converter->toULength = (int8_t) i;
 634                 goto donefornow;
 635             }
 636         }
 637
 638         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch))
 639         {
 640             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
 641             if (ch <= MAXIMUM_UCS2)
 642             {
 643                 /* fits in 16 bits */
 644                 *(myTarget++) = (UChar) ch;
 645                 *(myOffsets++) = offsetNum;
 646             }
 647             else {
 648                 /* write out the surrogates */
 649                 *(myTarget++) = U16_LEAD(ch);
 650                 *(myOffsets++) = offsetNum;
 651                 ch = U16_TRAIL(ch);
 652                 if (myTarget < targetLimit)
 653                 {
 654                     *(myTarget++) = (UChar)ch;
 655                     *(myOffsets++) = offsetNum;
 656                 }
 657                 else
 658                 {
 659                     /* Put in overflow buffer (not handled here) */
 660                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
 661                     args->converter->UCharErrorBufferLength = 1;
 662                     *err = U_BUFFER_OVERFLOW_ERROR;
 663                     break;
 664                 }
 665             }
 666         }
 667         else
 668         {
 669             args->converter->toULength = (int8_t)i;
 670             *err = U_ILLEGAL_CHAR_FOUND;
 671             break;
 672         }
 673         offsetNum += i;
 674     }
 675
 676 donefornow:
 677     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 678     {
 679         /* End of target buffer */
 680         *err = U_BUFFER_OVERFLOW_ERROR;
 681     }
 682
 683     args->target = myTarget;
 684     args->source = (const char *) mySource;
 685     args->offsets = myOffsets;
 686 }
 687
 688 static void
 689 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
 690                                   UErrorCode * err)
 691 {
 692     const UChar *mySource = args->source;
 693     unsigned char *myTarget;
 694     const UChar *sourceLimit = args->sourceLimit;
 695     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
 696     UChar32 ch, ch2;
 697     unsigned int indexToWrite;
 698     unsigned char temp[sizeof(uint32_t)];
 699
 700     if(mySource >= sourceLimit) {
 701         /* no input, nothing to do */
 702         return;
 703     }
 704
 705     /* write the BOM if necessary */
 706     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
 707         static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
 708         ucnv_fromUWriteBytes(args->converter,
 709                              bom, 4,
 710                              &args->target, args->targetLimit,
 711                              &args->offsets, -1,
 712                              err);
 713         args->converter->fromUnicodeStatus=0;
 714     }
 715
 716     myTarget = (unsigned char *) args->target;
 717     temp[3] = 0;
 718
 719     if (args->converter->fromUChar32)
 720     {
 721         ch = args->converter->fromUChar32;
 722         args->converter->fromUChar32 = 0;
 723         goto lowsurogate;
 724     }
 725
 726     while (mySource < sourceLimit && myTarget < targetLimit)
 727     {
 728         ch = *(mySource++);
 729
 730         if (U16_IS_SURROGATE(ch)) {
 731             if (U16_IS_LEAD(ch))
 732             {
 733 lowsurogate:
 734                 if (mySource < sourceLimit)
 735                 {
 736                     ch2 = *mySource;
 737                     if (U16_IS_TRAIL(ch2)) {
 738                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
 739                         mySource++;
 740                     }
 741                     else {
 742                         /* this is an unmatched trail code unit (2nd surrogate) */
 743                         /* callback(illegal) */
 744                         args->converter->fromUChar32 = ch;
 745                         *err = U_ILLEGAL_CHAR_FOUND;
 746                         break;
 747                     }
 748                 }
 749                 else {
 750                     /* ran out of source */
 751                     args->converter->fromUChar32 = ch;
 752                     if (args->flush) {
 753                         /* this is an unmatched trail code unit (2nd surrogate) */
 754                         /* callback(illegal) */
 755                         *err = U_ILLEGAL_CHAR_FOUND;
 756                     }
 757                     break;
 758                 }
 759             }
 760             else {
 761                 /* this is an unmatched trail code unit (2nd surrogate) */
 762                 /* callback(illegal) */
 763                 args->converter->fromUChar32 = ch;
 764                 *err = U_ILLEGAL_CHAR_FOUND;
 765                 break;
 766             }
 767         }
 768
 769         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
 770         temp[2] = (uint8_t) (ch >> 16 & 0x1F);
 771         temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
 772         temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
 773
 774         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
 775         {
 776             if (myTarget < targetLimit)
 777             {
 778                 *(myTarget++) = temp[indexToWrite];
 779             }
 780             else
 781             {
 782                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
 783                 *err = U_BUFFER_OVERFLOW_ERROR;
 784             }
 785         }
 786     }
 787
 788     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 789     {
 790         *err = U_BUFFER_OVERFLOW_ERROR;
 791     }
 792
 793     args->target = (char *) myTarget;
 794     args->source = mySource;
 795 }
 796
/*
 * Convert UTF-16 code units from args->source into UTF-32LE bytes at
 * args->target, recording one source offset in args->offsets for every
 * byte written to the target.
 *
 * - If the converter's fromUnicodeStatus is UCNV_NEED_TO_WRITE_BOM, the
 *   bytes FF FE 00 00 are emitted first through ucnv_fromUWriteBytes()
 *   (with -1 passed as its source-index argument) and the status is cleared.
 * - A code unit left in converter->fromUChar32 by an earlier call is picked
 *   up again before any new input is read.
 * - Output bytes that do not fit into the target are appended to
 *   converter->charErrorBuffer and *err becomes U_BUFFER_OVERFLOW_ERROR.
 * - An unpaired surrogate is stored in converter->fromUChar32 and *err is
 *   set to U_ILLEGAL_CHAR_FOUND; a lead surrogate at the very end of the
 *   input is only reported as an error when args->flush is set.
 *
 * Unless the function returns early because there is no input, args->source,
 * args->target and args->offsets are updated to the positions reached.
 *
 * NOTE(review): offsetNum starts at 0 on every call and is advanced by 2
 * for any supplementary code point, including one whose lead surrogate was
 * carried over from a previous call (only one unit of this buffer consumed).
 * Offsets after such a resumed pair look one too high -- confirm whether
 * callers rely on this.
 */
static void
T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
                                               UErrorCode * err)
{
    const UChar *mySource = args->source;
    unsigned char *myTarget;
    int32_t *myOffsets;
    const UChar *sourceLimit = args->sourceLimit;
    const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    UChar32 ch, ch2;
    unsigned int indexToWrite;
    unsigned char temp[sizeof(uint32_t)];   /* one code point, least significant byte first */
    int32_t offsetNum = 0;                  /* source index (in UChars) of the current code point */

    if(mySource >= sourceLimit) {
        /* no input, nothing to do */
        return;
    }

    /* write the BOM if necessary */
    if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
        static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
        ucnv_fromUWriteBytes(args->converter,
                             bom, 4,
                             &args->target, args->targetLimit,
                             &args->offsets, -1,
                             err);
        args->converter->fromUnicodeStatus=0;
    }

    /* read target/offsets only now: the BOM write above may have advanced them */
    myTarget = (unsigned char *) args->target;
    myOffsets = args->offsets;
    temp[3] = 0;    /* the most significant byte is always 0 for code points <= 10FFFF */

    if (args->converter->fromUChar32)
    {
        /* resume with the code unit saved by a previous call; jumps into the loop body */
        ch = args->converter->fromUChar32;
        args->converter->fromUChar32 = 0;
        goto lowsurogate;
    }

    while (mySource < sourceLimit && myTarget < targetLimit)
    {
        ch = *(mySource++);

        if (U16_IS_SURROGATE(ch)) {
            if (U16_IS_LEAD(ch))
            {
lowsurogate:
                if (mySource < sourceLimit)
                {
                    ch2 = *mySource;
                    if (U16_IS_TRAIL(ch2))
                    {
                        /* combine the pair; SURROGATE_LOW_BASE is HALF_BASE - SURROGATE_LOW_START */
                        ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
                        mySource++;
                    }
                    else {
                        /* this is an unmatched lead code unit (1st surrogate): the next unit is not a trail */
                        /* callback(illegal) */
                        args->converter->fromUChar32 = ch;
                        *err = U_ILLEGAL_CHAR_FOUND;
                        break;
                    }
                }
                else {
                    /* ran out of source: keep the lead surrogate for the next call */
                    args->converter->fromUChar32 = ch;
                    if (args->flush) {
                        /* this is an unmatched lead code unit (1st surrogate) at the end of the input */
                        /* callback(illegal) */
                        *err = U_ILLEGAL_CHAR_FOUND;
                    }
                    break;
                }
            }
            else {
                /* this is an unmatched trail code unit (2nd surrogate) */
                /* callback(illegal) */
                args->converter->fromUChar32 = ch;
                *err = U_ILLEGAL_CHAR_FOUND;
                break;
            }
        }

        /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
        temp[2] = (uint8_t) (ch >> 16 & 0x1F);  /* parsed as (ch >> 16) & 0x1F */
        temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
        temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */

        /* emit the four bytes; whatever does not fit goes to the overflow buffer */
        for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
        {
            if (myTarget < targetLimit)
            {
                *(myTarget++) = temp[indexToWrite];
                *(myOffsets++) = offsetNum;
            }
            else
            {
                args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
                *err = U_BUFFER_OVERFLOW_ERROR;
            }
        }
        /* temp[2] != 0 means ch >= 0x10000, i.e. it was built from two UTF-16 units */
        offsetNum = offsetNum + 1 + (temp[2] != 0);
    }

    /* input remains but the target is full and no other error has been reported */
    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    {
        *err = U_BUFFER_OVERFLOW_ERROR;
    }

    args->target = (char *) myTarget;
    args->source = mySource;
    args->offsets = myOffsets;
}
 912
 913 static UChar32
 914 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
 915                                    UErrorCode* err)
 916 {
 917     const uint8_t *mySource;
 918     UChar32 myUChar;
 919     int32_t length;
 920
 921     mySource = (const uint8_t *)args->source;
 922     if (mySource >= (const uint8_t *)args->sourceLimit)
 923     {
 924         /* no input */
 925         *err = U_INDEX_OUTOFBOUNDS_ERROR;
 926         return 0xffff;
 927     }
 928
 929     length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
 930     if (length < 4)
 931     {
 932         /* got a partial character */
 933         uprv_memcpy(args->converter->toUBytes, mySource, length);
 934         args->converter->toULength = (int8_t)length;
 935         args->source = (const char *)(mySource + length);
 936         *err = U_TRUNCATED_CHAR_FOUND;
 937         return 0xffff;
 938     }
 939
 940     /* Don't even try to do a direct cast because the value may be on an odd address. */
 941     myUChar = ((UChar32)mySource[3] << 24)
 942             | ((UChar32)mySource[2] << 16)
 943             | ((UChar32)mySource[1] << 8)
 944             | ((UChar32)mySource[0]);
 945
 946     args->source = (const char *)(mySource + 4);
 947     if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
 948         return myUChar;
 949     }
 950
 951     uprv_memcpy(args->converter->toUBytes, mySource, 4);
 952     args->converter->toULength = 4;
 953
 954     *err = U_ILLEGAL_CHAR_FOUND;
 955     return 0xffff;
 956 }
 957
/*
 * Function table for the UTF-32LE converter.
 *
 * It supplies the UTF-32LE toUnicode and fromUnicode routines (each in a
 * plain and an _OFFSET_LOGIC variant), the UTF-32LE getNextUChar routine and
 * ucnv_getNonSurrogateUnicodeSet; all remaining function slots are NULL.
 * The entries are positional: their meaning is fixed by the UConverterImpl
 * declaration, which is not visible in this part of the file.
 */
static const UConverterImpl _UTF32LEImpl = {
    UCNV_UTF32_LittleEndian,

    NULL,
    NULL,

    NULL,
    NULL,
    NULL,

    T_UConverter_toUnicode_UTF32_LE,
    T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC,
    T_UConverter_fromUnicode_UTF32_LE,
    T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
    T_UConverter_getNextUChar_UTF32_LE,

    NULL,
    NULL,
    NULL,
    NULL,
    ucnv_getNonSurrogateUnicodeSet
};
 980
/* The 1234 CCSID refers to any version of Unicode with a little-endian encoding of UTF-32 */
static const UConverterStaticData _UTF32LEStaticData = {
    sizeof(UConverterStaticData),
    "UTF-32LE",
    1234,
    UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4,
    /*
     * FD FF 00 00 is U+FFFD written in UTF-32LE byte order, followed by its
     * length in bytes; presumably the substitution sequence -- confirm
     * against the UConverterStaticData declaration.
     */
    { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
 992
 993
/*
 * Shared-data record for the UTF-32LE converter (non-static, so it is
 * visible to other translation units).  It ties together the static data
 * (_UTF32LEStaticData) and the function table (_UTF32LEImpl).
 * NOTE(review): ~((uint32_t) 0) is the all-ones 32-bit value; which field
 * it initializes is fixed by UConverterSharedData (declared elsewhere) --
 * confirm its meaning there.
 */
const UConverterSharedData _UTF32LEData = {
    sizeof(UConverterSharedData), ~((uint32_t) 0),
    NULL, NULL, &_UTF32LEStaticData, FALSE, &_UTF32LEImpl,
    0
};
 999
1000 /* UTF-32 (Detect BOM) ------------------------------------------------------ */
1001
1002 /*
1003  * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
1004  * accordingly.
1005  *
1006  * State values:
1007  * 0    initial state
1008  * 1    saw 00
1009  * 2    saw 00 00
1010  * 3    saw 00 00 FE
1011  * 4    -
1012  * 5    saw FF
1013  * 6    saw FF FE
1014  * 7    saw FF FE 00
1015  * 8    UTF-32BE mode
1016  * 9    UTF-32LE mode
1017  *
1018  * During detection: state&3==number of matching bytes so far.
1019  *
1020  * On output, emit U+FEFF as the first code point.
1021  */
1022
1023 static void
1024 _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
1025     if(choice<=UCNV_RESET_TO_UNICODE) {
1026         /* reset toUnicode: state=0 */
1027         cnv->mode=0;
1028     }
1029     if(choice!=UCNV_RESET_TO_UNICODE) {
1030         /* reset fromUnicode: prepare to output the UTF-32PE BOM */
1031         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1032     }
1033 }
1034
1035 static void
1036 _UTF32Open(UConverter *cnv,
1037            UConverterLoadArgs *pArgs,
1038            UErrorCode *pErrorCode) {
1039     _UTF32Reset(cnv, UCNV_RESET_BOTH);
1040 }
1041
/*
 * Both UTF-32 byte order marks, back to back:
 *   bytes 0..3  00 00 FE FF  (UTF-32BE)
 *   bytes 4..7  FF FE 00 00  (UTF-32LE)
 * The layout matches the BOM-detection state numbers, so utf32BOM[state] is
 * the next byte expected in detection states 1..3 and 5..7, and
 * utf32BOM+(state&4) is the start of the BOM currently being matched.
 */
static const char utf32BOM[8]={ 0, 0, (char)0xfe, (char)0xff,    (char)0xff, (char)0xfe, 0, 0 };
1043
/*
 * toUnicode function of the BOM-detecting UTF-32 converter.  It handles
 * both the with-offsets and the without-offsets case by testing
 * pArgs->offsets for NULL.
 *
 * cnv->mode carries the detection state between calls:
 *   0        nothing seen yet;
 *   1..3     a prefix of 00 00 FE FF has been matched;
 *   5..7     a prefix of FF FE 00 00 has been matched;
 *   8 / 9    decided: delegate to the UTF-32BE / UTF-32LE routines.
 *
 * A complete BOM is consumed without producing output.  If the input stops
 * matching a BOM, the converter falls back to UTF-32BE and the bytes matched
 * so far are converted as data: bytes from the current buffer by rewinding
 * the source pointer, bytes from earlier buffers by replaying them out of
 * utf32BOM.
 */
static void
_UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                           UErrorCode *pErrorCode) {
    UConverter *cnv=pArgs->converter;
    const char *source=pArgs->source;
    const char *sourceLimit=pArgs->sourceLimit;
    int32_t *offsets=pArgs->offsets;    /* start of the offsets written by this call */

    int32_t state, offsetDelta;
    char b;

    state=cnv->mode;

    /*
     * If we detect a BOM in this buffer, then we must add the BOM size to the
     * offsets because the actual converter function will not see and count the BOM.
     * offsetDelta will have the number of the BOM bytes that are in the current buffer.
     */
    offsetDelta=0;

    while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
        switch(state) {
        case 0:
            /* first byte of the stream decides which BOM, if any, is possible */
            b=*source;
            if(b==0) {
                state=1; /* could be 00 00 FE FF */
            } else if(b==(char)0xff) {
                state=5; /* could be FF FE 00 00 */
            } else {
                state=8; /* default to UTF-32BE */
                continue; /* byte not consumed: case 8 converts it */
            }
            ++source;
            break;
        case 1:
        case 2:
        case 3:
        case 5:
        case 6:
        case 7:
            if(*source==utf32BOM[state]) {
                ++state;
                ++source;
                /* incrementing out of state 3 or 7 means all four BOM bytes matched */
                if(state==4) {
                    state=8; /* detect UTF-32BE */
                    offsetDelta=(int32_t)(source-pArgs->source);
                } else if(state==8) {
                    state=9; /* detect UTF-32LE */
                    offsetDelta=(int32_t)(source-pArgs->source);
                }
            } else {
                /* switch to UTF-32BE and pass the previous bytes */
                int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */

                /* reset the source */
                source=pArgs->source;

                /* state&3 is the total number of BOM bytes matched so far */
                if(count==(state&3)) {
                    /* simple: all in the same buffer, just reset source */
                } else {
                    UBool oldFlush=pArgs->flush;

                    /* some of the bytes are from a previous buffer, replay those first */
                    pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
                    pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */
                    pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */

                    /* no offsets: bytes from previous buffer, and not enough for output */
                    T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);

                    /* restore real pointers; pArgs->source will be set in case 8/9 */
                    pArgs->sourceLimit=sourceLimit;
                    pArgs->flush=oldFlush;
                }
                state=8;
                continue; /* re-enter the switch in UTF-32BE mode */
            }
            break;
        case 8:
            /* call UTF-32BE */
            pArgs->source=source;
            if(offsets==NULL) {
                T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
            } else {
                T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode);
            }
            source=pArgs->source;
            break;
        case 9:
            /* call UTF-32LE */
            pArgs->source=source;
            if(offsets==NULL) {
                T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
            } else {
                T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode);
            }
            source=pArgs->source;
            break;
        default:
            break; /* does not occur */
        }
    }

    /* add BOM size to offsets - see comment at offsetDelta declaration */
    if(offsets!=NULL && offsetDelta!=0) {
        /* pArgs->offsets now points past the last offset written during this call */
        int32_t *offsetsLimit=pArgs->offsets;
        while(offsets<offsetsLimit) {
            *offsets++ += offsetDelta;
        }
    }

    pArgs->source=source;

    if(source==sourceLimit && pArgs->flush) {
        /* handle truncated input */
        switch(state) {
        case 0:
            break; /* no input at all, nothing to do */
        case 8:
            T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
            break;
        case 9:
            T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
            break;
        default:
            /* handle 0<state<8: call UTF-32BE with too-short input */
            pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
            pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */

            /* no offsets: not enough for output */
            T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
            /* put the caller's real pointers back after the replay */
            pArgs->source=source;
            pArgs->sourceLimit=sourceLimit;
            state=8;
            break;
        }
    }

    /* remember the detection state for the next call */
    cnv->mode=state;
}
1184
1185 static UChar32
1186 _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs,
1187                    UErrorCode *pErrorCode) {
1188     switch(pArgs->converter->mode) {
1189     case 8:
1190         return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode);
1191     case 9:
1192         return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);
1193     default:
1194         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1195     }
1196 }
1197
/*
 * Function table for the BOM-detecting "UTF-32" converter.
 *
 * - _UTF32Open and _UTF32Reset manage the BOM-detection / BOM-writing state.
 * - _UTF32ToUnicodeWithOffsets fills two adjacent slots; it checks
 *   pArgs->offsets for NULL itself.
 * - The fromUnicode routines are the UTF-32BE ones on big-endian platforms
 *   and the UTF-32LE ones otherwise, so output uses the platform byte order.
 * - The entries are positional: their meaning is fixed by the UConverterImpl
 *   declaration, which is not visible in this part of the file.
 */
static const UConverterImpl _UTF32Impl = {
    UCNV_UTF32,

    NULL,
    NULL,

    _UTF32Open,
    NULL,
    _UTF32Reset,

    _UTF32ToUnicodeWithOffsets,
    _UTF32ToUnicodeWithOffsets,
#if U_IS_BIG_ENDIAN
    T_UConverter_fromUnicode_UTF32_BE,
    T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
#else
    T_UConverter_fromUnicode_UTF32_LE,
    T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
#endif
    _UTF32GetNextUChar,

    NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
    NULL,
    NULL,
    NULL,
    ucnv_getNonSurrogateUnicodeSet
};
1225
/* The 1236 CCSID refers to any version of Unicode with a BOM-sensitive endianness of UTF-32 */
static const UConverterStaticData _UTF32StaticData = {
    sizeof(UConverterStaticData),
    "UTF-32",
    1236,
    UCNV_IBM, UCNV_UTF32, 4, 4,
    /*
     * U+FFFD in the platform byte order (00 00 FF FD big-endian, FD FF 00 00
     * little-endian), followed by its length in bytes; presumably the
     * substitution sequence -- confirm against the UConverterStaticData
     * declaration.
     */
#if U_IS_BIG_ENDIAN
    { 0, 0, 0xff, 0xfd }, 4,
#else
    { 0xfd, 0xff, 0, 0 }, 4,
#endif
    FALSE, FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
1242
/*
 * Shared-data record for the BOM-detecting UTF-32 converter (non-static, so
 * it is visible to other translation units).  It ties together the static
 * data (_UTF32StaticData) and the function table (_UTF32Impl).
 * NOTE(review): ~((uint32_t) 0) is the all-ones 32-bit value; which field
 * it initializes is fixed by UConverterSharedData (declared elsewhere) --
 * confirm its meaning there.
 */
const UConverterSharedData _UTF32Data = {
    sizeof(UConverterSharedData), ~((uint32_t) 0),
    NULL, NULL, &_UTF32StaticData, FALSE, &_UTF32Impl,
    0
};
1248
1249 #endif