icuSources/common/ucnv_u32.c

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 2002-2006, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   file name:  ucnv_u32.c
   7 *   encoding:   US-ASCII
   8 *   tab size:   8 (not used)
   9 *   indentation:4
  10 *
  11 *   created on: 2002jul01
  12 *   created by: Markus W. Scherer
  13 *
  14 *   UTF-32 converter implementation. Used to be in ucnv_utf.c.
  15 */
  16
  17 #include "unicode/utypes.h"
  18
  19 #if !UCONFIG_NO_CONVERSION
  20
  21 #include "unicode/ucnv.h"
  22 #include "ucnv_bld.h"
  23 #include "ucnv_cnv.h"
  24 #include "cmemory.h"
  25
  26 #define MAXIMUM_UCS2            0x0000FFFF
  27 #define MAXIMUM_UTF             0x0010FFFF
  28 #define HALF_SHIFT              10
  29 #define HALF_BASE               0x0010000
  30 #define HALF_MASK               0x3FF
  31 #define SURROGATE_HIGH_START    0xD800
  32 #define SURROGATE_LOW_START     0xDC00
  33
  34 /* -SURROGATE_LOW_START + HALF_BASE */
  35 #define SURROGATE_LOW_BASE      9216
  36
  37 enum {
  38     UCNV_NEED_TO_WRITE_BOM=1
  39 };
  40
  41 /* UTF-32BE ----------------------------------------------------------------- */
  42
  43 static void
  44 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
  45                                 UErrorCode * err)
  46 {
  47     const unsigned char *mySource = (unsigned char *) args->source;
  48     UChar *myTarget = args->target;
  49     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
  50     const UChar *targetLimit = args->targetLimit;
  51     unsigned char *toUBytes = args->converter->toUBytes;
  52     uint32_t ch, i;
  53
  54     /* Restore state of current sequence */
  55     if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
  56         i = args->converter->toULength;       /* restore # of bytes consumed */
  57         args->converter->toULength = 0;
  58
  59         ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
  60         args->converter->toUnicodeStatus = 0;
  61         goto morebytes;
  62     }
  63
  64     while (mySource < sourceLimit && myTarget < targetLimit) {
  65         i = 0;
  66         ch = 0;
  67 morebytes:
  68         while (i < sizeof(uint32_t)) {
  69             if (mySource < sourceLimit) {
  70                 ch = (ch << 8) | (uint8_t)(*mySource);
  71                 toUBytes[i++] = (char) *(mySource++);
  72             }
  73             else {
  74                 /* stores a partially calculated target*/
  75                 /* + 1 to make 0 a valid character */
  76                 args->converter->toUnicodeStatus = ch + 1;
  77                 args->converter->toULength = (int8_t) i;
  78                 goto donefornow;
  79             }
  80         }
  81
  82         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
  83             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
  84             if (ch <= MAXIMUM_UCS2)
  85             {
  86                 /* fits in 16 bits */
  87                 *(myTarget++) = (UChar) ch;
  88             }
  89             else {
  90                 /* write out the surrogates */
  91                 *(myTarget++) = U16_LEAD(ch);
  92                 ch = U16_TRAIL(ch);
  93                 if (myTarget < targetLimit) {
  94                     *(myTarget++) = (UChar)ch;
  95                 }
  96                 else {
  97                     /* Put in overflow buffer (not handled here) */
  98                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
  99                     args->converter->UCharErrorBufferLength = 1;
 100                     *err = U_BUFFER_OVERFLOW_ERROR;
 101                     break;
 102                 }
 103             }
 104         }
 105         else {
 106             args->converter->toULength = (int8_t)i;
 107             *err = U_ILLEGAL_CHAR_FOUND;
 108             break;
 109         }
 110     }
 111
 112 donefornow:
 113     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
 114         /* End of target buffer */
 115         *err = U_BUFFER_OVERFLOW_ERROR;
 116     }
 117
 118     args->target = myTarget;
 119     args->source = (const char *) mySource;
 120 }
 121
 122 static void
 123 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
 124                                              UErrorCode * err)
 125 {
 126     const unsigned char *mySource = (unsigned char *) args->source;
 127     UChar *myTarget = args->target;
 128     int32_t *myOffsets = args->offsets;
 129     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
 130     const UChar *targetLimit = args->targetLimit;
 131     unsigned char *toUBytes = args->converter->toUBytes;
 132     uint32_t ch, i;
 133     int32_t offsetNum = 0;
 134
 135     /* Restore state of current sequence */
 136     if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
 137         i = args->converter->toULength;       /* restore # of bytes consumed */
 138         args->converter->toULength = 0;
 139
 140         ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
 141         args->converter->toUnicodeStatus = 0;
 142         goto morebytes;
 143     }
 144
 145     while (mySource < sourceLimit && myTarget < targetLimit) {
 146         i = 0;
 147         ch = 0;
 148 morebytes:
 149         while (i < sizeof(uint32_t)) {
 150             if (mySource < sourceLimit) {
 151                 ch = (ch << 8) | (uint8_t)(*mySource);
 152                 toUBytes[i++] = (char) *(mySource++);
 153             }
 154             else {
 155                 /* stores a partially calculated target*/
 156                 /* + 1 to make 0 a valid character */
 157                 args->converter->toUnicodeStatus = ch + 1;
 158                 args->converter->toULength = (int8_t) i;
 159                 goto donefornow;
 160             }
 161         }
 162
 163         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
 164             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
 165             if (ch <= MAXIMUM_UCS2) {
 166                 /* fits in 16 bits */
 167                 *(myTarget++) = (UChar) ch;
 168                 *(myOffsets++) = offsetNum;
 169             }
 170             else {
 171                 /* write out the surrogates */
 172                 *(myTarget++) = U16_LEAD(ch);
 173                 *myOffsets++ = offsetNum;
 174                 ch = U16_TRAIL(ch);
 175                 if (myTarget < targetLimit)
 176                 {
 177                     *(myTarget++) = (UChar)ch;
 178                     *(myOffsets++) = offsetNum;
 179                 }
 180                 else {
 181                     /* Put in overflow buffer (not handled here) */
 182                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
 183                     args->converter->UCharErrorBufferLength = 1;
 184                     *err = U_BUFFER_OVERFLOW_ERROR;
 185                     break;
 186                 }
 187             }
 188         }
 189         else {
 190             args->converter->toULength = (int8_t)i;
 191             *err = U_ILLEGAL_CHAR_FOUND;
 192             break;
 193         }
 194         offsetNum += i;
 195     }
 196
 197 donefornow:
 198     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 199     {
 200         /* End of target buffer */
 201         *err = U_BUFFER_OVERFLOW_ERROR;
 202     }
 203
 204     args->target = myTarget;
 205     args->source = (const char *) mySource;
 206     args->offsets = myOffsets;
 207 }
 208
 209 static void
 210 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
 211                                   UErrorCode * err)
 212 {
 213     const UChar *mySource = args->source;
 214     unsigned char *myTarget;
 215     const UChar *sourceLimit = args->sourceLimit;
 216     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
 217     UChar32 ch, ch2;
 218     unsigned int indexToWrite;
 219     unsigned char temp[sizeof(uint32_t)];
 220
 221     if(mySource >= sourceLimit) {
 222         /* no input, nothing to do */
 223         return;
 224     }
 225
 226     /* write the BOM if necessary */
 227     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
 228         static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
 229         ucnv_fromUWriteBytes(args->converter,
 230                              bom, 4,
 231                              &args->target, args->targetLimit,
 232                              &args->offsets, -1,
 233                              err);
 234         args->converter->fromUnicodeStatus=0;
 235     }
 236
 237     myTarget = (unsigned char *) args->target;
 238     temp[0] = 0;
 239
 240     if (args->converter->fromUChar32) {
 241         ch = args->converter->fromUChar32;
 242         args->converter->fromUChar32 = 0;
 243         goto lowsurogate;
 244     }
 245
 246     while (mySource < sourceLimit && myTarget < targetLimit) {
 247         ch = *(mySource++);
 248
 249         if (UTF_IS_SURROGATE(ch)) {
 250             if (U_IS_LEAD(ch)) {
 251 lowsurogate:
 252                 if (mySource < sourceLimit) {
 253                     ch2 = *mySource;
 254                     if (U_IS_TRAIL(ch2)) {
 255                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
 256                         mySource++;
 257                     }
 258                     else {
 259                         /* this is an unmatched trail code unit (2nd surrogate) */
 260                         /* callback(illegal) */
 261                         args->converter->fromUChar32 = ch;
 262                         *err = U_ILLEGAL_CHAR_FOUND;
 263                         break;
 264                     }
 265                 }
 266                 else {
 267                     /* ran out of source */
 268                     args->converter->fromUChar32 = ch;
 269                     if (args->flush) {
 270                         /* this is an unmatched trail code unit (2nd surrogate) */
 271                         /* callback(illegal) */
 272                         *err = U_ILLEGAL_CHAR_FOUND;
 273                     }
 274                     break;
 275                 }
 276             }
 277             else {
 278                 /* this is an unmatched trail code unit (2nd surrogate) */
 279                 /* callback(illegal) */
 280                 args->converter->fromUChar32 = ch;
 281                 *err = U_ILLEGAL_CHAR_FOUND;
 282                 break;
 283             }
 284         }
 285
 286         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
 287         temp[1] = (uint8_t) (ch >> 16 & 0x1F);
 288         temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
 289         temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
 290
 291         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
 292             if (myTarget < targetLimit) {
 293                 *(myTarget++) = temp[indexToWrite];
 294             }
 295             else {
 296                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
 297                 *err = U_BUFFER_OVERFLOW_ERROR;
 298             }
 299         }
 300     }
 301
 302     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
 303         *err = U_BUFFER_OVERFLOW_ERROR;
 304     }
 305
 306     args->target = (char *) myTarget;
 307     args->source = mySource;
 308 }
 309
 310 static void
 311 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
 312                                                UErrorCode * err)
 313 {
 314     const UChar *mySource = args->source;
 315     unsigned char *myTarget;
 316     int32_t *myOffsets;
 317     const UChar *sourceLimit = args->sourceLimit;
 318     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
 319     UChar32 ch, ch2;
 320     int32_t offsetNum = 0;
 321     unsigned int indexToWrite;
 322     unsigned char temp[sizeof(uint32_t)];
 323
 324     if(mySource >= sourceLimit) {
 325         /* no input, nothing to do */
 326         return;
 327     }
 328
 329     /* write the BOM if necessary */
 330     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
 331         static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
 332         ucnv_fromUWriteBytes(args->converter,
 333                              bom, 4,
 334                              &args->target, args->targetLimit,
 335                              &args->offsets, -1,
 336                              err);
 337         args->converter->fromUnicodeStatus=0;
 338     }
 339
 340     myTarget = (unsigned char *) args->target;
 341     myOffsets = args->offsets;
 342     temp[0] = 0;
 343
 344     if (args->converter->fromUChar32) {
 345         ch = args->converter->fromUChar32;
 346         args->converter->fromUChar32 = 0;
 347         goto lowsurogate;
 348     }
 349
 350     while (mySource < sourceLimit && myTarget < targetLimit) {
 351         ch = *(mySource++);
 352
 353         if (UTF_IS_SURROGATE(ch)) {
 354             if (U_IS_LEAD(ch)) {
 355 lowsurogate:
 356                 if (mySource < sourceLimit) {
 357                     ch2 = *mySource;
 358                     if (U_IS_TRAIL(ch2)) {
 359                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
 360                         mySource++;
 361                     }
 362                     else {
 363                         /* this is an unmatched trail code unit (2nd surrogate) */
 364                         /* callback(illegal) */
 365                         args->converter->fromUChar32 = ch;
 366                         *err = U_ILLEGAL_CHAR_FOUND;
 367                         break;
 368                     }
 369                 }
 370                 else {
 371                     /* ran out of source */
 372                     args->converter->fromUChar32 = ch;
 373                     if (args->flush) {
 374                         /* this is an unmatched trail code unit (2nd surrogate) */
 375                         /* callback(illegal) */
 376                         *err = U_ILLEGAL_CHAR_FOUND;
 377                     }
 378                     break;
 379                 }
 380             }
 381             else {
 382                 /* this is an unmatched trail code unit (2nd surrogate) */
 383                 /* callback(illegal) */
 384                 args->converter->fromUChar32 = ch;
 385                 *err = U_ILLEGAL_CHAR_FOUND;
 386                 break;
 387             }
 388         }
 389
 390         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
 391         temp[1] = (uint8_t) (ch >> 16 & 0x1F);
 392         temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
 393         temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
 394
 395         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
 396             if (myTarget < targetLimit) {
 397                 *(myTarget++) = temp[indexToWrite];
 398                 *(myOffsets++) = offsetNum;
 399             }
 400             else {
 401                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
 402                 *err = U_BUFFER_OVERFLOW_ERROR;
 403             }
 404         }
 405         offsetNum = offsetNum + 1 + (temp[1] != 0);
 406     }
 407
 408     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
 409         *err = U_BUFFER_OVERFLOW_ERROR;
 410     }
 411
 412     args->target = (char *) myTarget;
 413     args->source = mySource;
 414     args->offsets = myOffsets;
 415 }
 416
 417 static UChar32
 418 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
 419                                    UErrorCode* err)
 420 {
 421     const uint8_t *mySource;
 422     UChar32 myUChar;
 423     int32_t length;
 424
 425     mySource = (const uint8_t *)args->source;
 426     if (mySource >= (const uint8_t *)args->sourceLimit)
 427     {
 428         /* no input */
 429         *err = U_INDEX_OUTOFBOUNDS_ERROR;
 430         return 0xffff;
 431     }
 432
 433     length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
 434     if (length < 4)
 435     {
 436         /* got a partial character */
 437         uprv_memcpy(args->converter->toUBytes, mySource, length);
 438         args->converter->toULength = (int8_t)length;
 439         args->source = (const char *)(mySource + length);
 440         *err = U_TRUNCATED_CHAR_FOUND;
 441         return 0xffff;
 442     }
 443
 444     /* Don't even try to do a direct cast because the value may be on an odd address. */
 445     myUChar = ((UChar32)mySource[0] << 24)
 446             | ((UChar32)mySource[1] << 16)
 447             | ((UChar32)mySource[2] << 8)
 448             | ((UChar32)mySource[3]);
 449
 450     args->source = (const char *)(mySource + 4);
 451     if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
 452         return myUChar;
 453     }
 454
 455     uprv_memcpy(args->converter->toUBytes, mySource, 4);
 456     args->converter->toULength = 4;
 457
 458     *err = U_ILLEGAL_CHAR_FOUND;
 459     return 0xffff;
 460 }
 461
 462 static const UConverterImpl _UTF32BEImpl = {
 463     UCNV_UTF32_BigEndian,
 464
 465     NULL,
 466     NULL,
 467
 468     NULL,
 469     NULL,
 470     NULL,
 471
 472     T_UConverter_toUnicode_UTF32_BE,
 473     T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC,
 474     T_UConverter_fromUnicode_UTF32_BE,
 475     T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
 476     T_UConverter_getNextUChar_UTF32_BE,
 477
 478     NULL,
 479     NULL,
 480     NULL,
 481     NULL,
 482     ucnv_getNonSurrogateUnicodeSet
 483 };
 484
 485 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
 486 static const UConverterStaticData _UTF32BEStaticData = {
 487     sizeof(UConverterStaticData),
 488     "UTF-32BE",
 489     1232,
 490     UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,
 491     { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE,
 492     0,
 493     0,
 494     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 495 };
 496
 497 const UConverterSharedData _UTF32BEData = {
 498     sizeof(UConverterSharedData), ~((uint32_t) 0),
 499     NULL, NULL, &_UTF32BEStaticData, FALSE, &_UTF32BEImpl,
 500     0
 501 };
 502
 503 /* UTF-32LE ---------------------------------------------------------- */
 504
 505 static void
 506 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
 507                                 UErrorCode * err)
 508 {
 509     const unsigned char *mySource = (unsigned char *) args->source;
 510     UChar *myTarget = args->target;
 511     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
 512     const UChar *targetLimit = args->targetLimit;
 513     unsigned char *toUBytes = args->converter->toUBytes;
 514     uint32_t ch, i;
 515
 516     /* Restore state of current sequence */
 517     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
 518     {
 519         i = args->converter->toULength;       /* restore # of bytes consumed */
 520         args->converter->toULength = 0;
 521
 522         /* Stores the previously calculated ch from a previous call*/
 523         ch = args->converter->toUnicodeStatus - 1;
 524         args->converter->toUnicodeStatus = 0;
 525         goto morebytes;
 526     }
 527
 528     while (mySource < sourceLimit && myTarget < targetLimit)
 529     {
 530         i = 0;
 531         ch = 0;
 532 morebytes:
 533         while (i < sizeof(uint32_t))
 534         {
 535             if (mySource < sourceLimit)
 536             {
 537                 ch |= ((uint8_t)(*mySource)) << (i * 8);
 538                 toUBytes[i++] = (char) *(mySource++);
 539             }
 540             else
 541             {
 542                 /* stores a partially calculated target*/
 543                 /* + 1 to make 0 a valid character */
 544                 args->converter->toUnicodeStatus = ch + 1;
 545                 args->converter->toULength = (int8_t) i;
 546                 goto donefornow;
 547             }
 548         }
 549
 550         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
 551             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
 552             if (ch <= MAXIMUM_UCS2) {
 553                 /* fits in 16 bits */
 554                 *(myTarget++) = (UChar) ch;
 555             }
 556             else {
 557                 /* write out the surrogates */
 558                 *(myTarget++) = U16_LEAD(ch);
 559                 ch = U16_TRAIL(ch);
 560                 if (myTarget < targetLimit) {
 561                     *(myTarget++) = (UChar)ch;
 562                 }
 563                 else {
 564                     /* Put in overflow buffer (not handled here) */
 565                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
 566                     args->converter->UCharErrorBufferLength = 1;
 567                     *err = U_BUFFER_OVERFLOW_ERROR;
 568                     break;
 569                 }
 570             }
 571         }
 572         else {
 573             args->converter->toULength = (int8_t)i;
 574             *err = U_ILLEGAL_CHAR_FOUND;
 575             break;
 576         }
 577     }
 578
 579 donefornow:
 580     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 581     {
 582         /* End of target buffer */
 583         *err = U_BUFFER_OVERFLOW_ERROR;
 584     }
 585
 586     args->target = myTarget;
 587     args->source = (const char *) mySource;
 588 }
 589
 590 static void
 591 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
 592                                              UErrorCode * err)
 593 {
 594     const unsigned char *mySource = (unsigned char *) args->source;
 595     UChar *myTarget = args->target;
 596     int32_t *myOffsets = args->offsets;
 597     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
 598     const UChar *targetLimit = args->targetLimit;
 599     unsigned char *toUBytes = args->converter->toUBytes;
 600     uint32_t ch, i;
 601     int32_t offsetNum = 0;
 602
 603     /* Restore state of current sequence */
 604     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
 605     {
 606         i = args->converter->toULength;       /* restore # of bytes consumed */
 607         args->converter->toULength = 0;
 608
 609         /* Stores the previously calculated ch from a previous call*/
 610         ch = args->converter->toUnicodeStatus - 1;
 611         args->converter->toUnicodeStatus = 0;
 612         goto morebytes;
 613     }
 614
 615     while (mySource < sourceLimit && myTarget < targetLimit)
 616     {
 617         i = 0;
 618         ch = 0;
 619 morebytes:
 620         while (i < sizeof(uint32_t))
 621         {
 622             if (mySource < sourceLimit)
 623             {
 624                 ch |= ((uint8_t)(*mySource)) << (i * 8);
 625                 toUBytes[i++] = (char) *(mySource++);
 626             }
 627             else
 628             {
 629                 /* stores a partially calculated target*/
 630                 /* + 1 to make 0 a valid character */
 631                 args->converter->toUnicodeStatus = ch + 1;
 632                 args->converter->toULength = (int8_t) i;
 633                 goto donefornow;
 634             }
 635         }
 636
 637         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch))
 638         {
 639             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
 640             if (ch <= MAXIMUM_UCS2)
 641             {
 642                 /* fits in 16 bits */
 643                 *(myTarget++) = (UChar) ch;
 644                 *(myOffsets++) = offsetNum;
 645             }
 646             else {
 647                 /* write out the surrogates */
 648                 *(myTarget++) = U16_LEAD(ch);
 649                 *(myOffsets++) = offsetNum;
 650                 ch = U16_TRAIL(ch);
 651                 if (myTarget < targetLimit)
 652                 {
 653                     *(myTarget++) = (UChar)ch;
 654                     *(myOffsets++) = offsetNum;
 655                 }
 656                 else
 657                 {
 658                     /* Put in overflow buffer (not handled here) */
 659                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
 660                     args->converter->UCharErrorBufferLength = 1;
 661                     *err = U_BUFFER_OVERFLOW_ERROR;
 662                     break;
 663                 }
 664             }
 665         }
 666         else
 667         {
 668             args->converter->toULength = (int8_t)i;
 669             *err = U_ILLEGAL_CHAR_FOUND;
 670             break;
 671         }
 672         offsetNum += i;
 673     }
 674
 675 donefornow:
 676     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 677     {
 678         /* End of target buffer */
 679         *err = U_BUFFER_OVERFLOW_ERROR;
 680     }
 681
 682     args->target = myTarget;
 683     args->source = (const char *) mySource;
 684     args->offsets = myOffsets;
 685 }
 686
 687 static void
 688 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
 689                                   UErrorCode * err)
 690 {
 691     const UChar *mySource = args->source;
 692     unsigned char *myTarget;
 693     const UChar *sourceLimit = args->sourceLimit;
 694     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
 695     UChar32 ch, ch2;
 696     unsigned int indexToWrite;
 697     unsigned char temp[sizeof(uint32_t)];
 698
 699     if(mySource >= sourceLimit) {
 700         /* no input, nothing to do */
 701         return;
 702     }
 703
 704     /* write the BOM if necessary */
 705     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
 706         static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
 707         ucnv_fromUWriteBytes(args->converter,
 708                              bom, 4,
 709                              &args->target, args->targetLimit,
 710                              &args->offsets, -1,
 711                              err);
 712         args->converter->fromUnicodeStatus=0;
 713     }
 714
 715     myTarget = (unsigned char *) args->target;
 716     temp[3] = 0;
 717
 718     if (args->converter->fromUChar32)
 719     {
 720         ch = args->converter->fromUChar32;
 721         args->converter->fromUChar32 = 0;
 722         goto lowsurogate;
 723     }
 724
 725     while (mySource < sourceLimit && myTarget < targetLimit)
 726     {
 727         ch = *(mySource++);
 728
 729         if (UTF_IS_SURROGATE(ch)) {
 730             if (U_IS_LEAD(ch))
 731             {
 732 lowsurogate:
 733                 if (mySource < sourceLimit)
 734                 {
 735                     ch2 = *mySource;
 736                     if (U_IS_TRAIL(ch2)) {
 737                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
 738                         mySource++;
 739                     }
 740                     else {
 741                         /* this is an unmatched trail code unit (2nd surrogate) */
 742                         /* callback(illegal) */
 743                         args->converter->fromUChar32 = ch;
 744                         *err = U_ILLEGAL_CHAR_FOUND;
 745                         break;
 746                     }
 747                 }
 748                 else {
 749                     /* ran out of source */
 750                     args->converter->fromUChar32 = ch;
 751                     if (args->flush) {
 752                         /* this is an unmatched trail code unit (2nd surrogate) */
 753                         /* callback(illegal) */
 754                         *err = U_ILLEGAL_CHAR_FOUND;
 755                     }
 756                     break;
 757                 }
 758             }
 759             else {
 760                 /* this is an unmatched trail code unit (2nd surrogate) */
 761                 /* callback(illegal) */
 762                 args->converter->fromUChar32 = ch;
 763                 *err = U_ILLEGAL_CHAR_FOUND;
 764                 break;
 765             }
 766         }
 767
 768         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
 769         temp[2] = (uint8_t) (ch >> 16 & 0x1F);
 770         temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
 771         temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
 772
 773         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
 774         {
 775             if (myTarget < targetLimit)
 776             {
 777                 *(myTarget++) = temp[indexToWrite];
 778             }
 779             else
 780             {
 781                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
 782                 *err = U_BUFFER_OVERFLOW_ERROR;
 783             }
 784         }
 785     }
 786
 787     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 788     {
 789         *err = U_BUFFER_OVERFLOW_ERROR;
 790     }
 791
 792     args->target = (char *) myTarget;
 793     args->source = mySource;
 794 }
 795
/*
 * Convert UTF-16 code units from args->source into UTF-32LE bytes in
 * args->target and, for every byte written to the target, store the current
 * source offset counter into args->offsets.
 *
 * - If the converter still has to emit a BOM (fromUnicodeStatus ==
 *   UCNV_NEED_TO_WRITE_BOM), the bytes FF FE 00 00 are written first via
 *   ucnv_fromUWriteBytes() with offset -1, and the status is cleared.
 * - A lead surrogate left over from a previous call (converter->fromUChar32)
 *   is picked up again and combined with the first unit of this buffer.
 * - Unpaired surrogates are stored in converter->fromUChar32 and reported
 *   with U_ILLEGAL_CHAR_FOUND; a lead surrogate at the end of the input is
 *   only reported as an error when args->flush is set.
 * - Output bytes that do not fit into the target are appended to
 *   converter->charErrorBuffer and U_BUFFER_OVERFLOW_ERROR is set.
 *
 * On return args->source, args->target and args->offsets are advanced past
 * what was consumed/produced. Returns immediately, without touching
 * anything, when there is no input.
 */
static void
T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
                                               UErrorCode * err)
{
    const UChar *mySource = args->source;
    unsigned char *myTarget;
    int32_t *myOffsets;
    const UChar *sourceLimit = args->sourceLimit;
    const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    UChar32 ch, ch2;
    unsigned int indexToWrite;
    unsigned char temp[sizeof(uint32_t)];   /* one code point, least significant byte first */
    int32_t offsetNum = 0;                  /* source offset reported for the bytes being written */

    if(mySource >= sourceLimit) {
        /* no input, nothing to do */
        return;
    }

    /* write the BOM if necessary */
    if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
        static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
        ucnv_fromUWriteBytes(args->converter,
                             bom, 4,
                             &args->target, args->targetLimit,
                             &args->offsets, -1,
                             err);
        args->converter->fromUnicodeStatus=0;
    }

    /* read target/offsets only now: the BOM write above may have advanced them */
    myTarget = (unsigned char *) args->target;
    myOffsets = args->offsets;
    temp[3] = 0;    /* most significant byte is always 0 for code points <= 10FFFF */

    if (args->converter->fromUChar32)
    {
        /*
         * Resume with the lead surrogate saved by a previous call; jump
         * straight to the code that looks for its trail surrogate.
         * NOTE(review): offsetNum is still 0 here and is later advanced by 2
         * for the completed pair although only one unit of this buffer was
         * consumed - confirm that the resulting offsets are intended.
         */
        ch = args->converter->fromUChar32;
        args->converter->fromUChar32 = 0;
        goto lowsurogate;
    }

    while (mySource < sourceLimit && myTarget < targetLimit)
    {
        ch = *(mySource++);

        if (UTF_IS_SURROGATE(ch)) {
            if (U_IS_LEAD(ch))
            {
lowsurogate:
                if (mySource < sourceLimit)
                {
                    ch2 = *mySource;
                    if (U_IS_TRAIL(ch2))
                    {
                        /* combine the surrogate pair into one supplementary code point */
                        ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
                        mySource++;
                    }
                    else {
                        /* this is an unmatched lead code unit (1st surrogate) */
                        /* callback(illegal) */
                        args->converter->fromUChar32 = ch;
                        *err = U_ILLEGAL_CHAR_FOUND;
                        break;
                    }
                }
                else {
                    /* ran out of source */
                    args->converter->fromUChar32 = ch;
                    if (args->flush) {
                        /* this is an unmatched lead code unit (1st surrogate) at the end of the input */
                        /* callback(illegal) */
                        *err = U_ILLEGAL_CHAR_FOUND;
                    }
                    break;
                }
            }
            else {
                /* this is an unmatched trail code unit (2nd surrogate) */
                /* callback(illegal) */
                args->converter->fromUChar32 = ch;
                *err = U_ILLEGAL_CHAR_FOUND;
                break;
            }
        }

        /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
        temp[2] = (uint8_t) (ch >> 16 & 0x1F);
        temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
        temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */

        /* emit all four bytes; whatever does not fit goes to the overflow buffer */
        for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
        {
            if (myTarget < targetLimit)
            {
                *(myTarget++) = temp[indexToWrite];
                *(myOffsets++) = offsetNum;
            }
            else
            {
                args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
                *err = U_BUFFER_OVERFLOW_ERROR;
            }
        }
        /* temp[2] != 0 means a supplementary code point, i.e. a surrogate pair: advance by 2 */
        offsetNum = offsetNum + 1 + (temp[2] != 0);
    }

    /* input left over but the target is full and no other error has been set yet */
    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    {
        *err = U_BUFFER_OVERFLOW_ERROR;
    }

    args->target = (char *) myTarget;
    args->source = mySource;
    args->offsets = myOffsets;
}
 911
 912 static UChar32
 913 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
 914                                    UErrorCode* err)
 915 {
 916     const uint8_t *mySource;
 917     UChar32 myUChar;
 918     int32_t length;
 919
 920     mySource = (const uint8_t *)args->source;
 921     if (mySource >= (const uint8_t *)args->sourceLimit)
 922     {
 923         /* no input */
 924         *err = U_INDEX_OUTOFBOUNDS_ERROR;
 925         return 0xffff;
 926     }
 927
 928     length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
 929     if (length < 4)
 930     {
 931         /* got a partial character */
 932         uprv_memcpy(args->converter->toUBytes, mySource, length);
 933         args->converter->toULength = (int8_t)length;
 934         args->source = (const char *)(mySource + length);
 935         *err = U_TRUNCATED_CHAR_FOUND;
 936         return 0xffff;
 937     }
 938
 939     /* Don't even try to do a direct cast because the value may be on an odd address. */
 940     myUChar = ((UChar32)mySource[3] << 24)
 941             | ((UChar32)mySource[2] << 16)
 942             | ((UChar32)mySource[1] << 8)
 943             | ((UChar32)mySource[0]);
 944
 945     args->source = (const char *)(mySource + 4);
 946     if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
 947         return myUChar;
 948     }
 949
 950     uprv_memcpy(args->converter->toUBytes, mySource, 4);
 951     args->converter->toULength = 4;
 952
 953     *err = U_ILLEGAL_CHAR_FOUND;
 954     return 0xffff;
 955 }
 956
/*
 * Implementation table for the UTF-32LE converter.
 * The initializer is positional. NOTE(review): the slot descriptions below
 * are inferred from the functions placed in them - confirm against the
 * UConverterImpl declaration.
 */
static const UConverterImpl _UTF32LEImpl = {
    UCNV_UTF32_LittleEndian,

    NULL,
    NULL,

    NULL,
    NULL,
    NULL,

    /* bytes -> UTF-16, plain and offset-recording variants */
    T_UConverter_toUnicode_UTF32_LE,
    T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC,
    /* UTF-16 -> bytes, plain and offset-recording variants */
    T_UConverter_fromUnicode_UTF32_LE,
    T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
    /* single code point reader */
    T_UConverter_getNextUChar_UTF32_LE,

    NULL,
    NULL,
    NULL,
    NULL,
    ucnv_getNonSurrogateUnicodeSet
};
 979
/*
 * Static description of the UTF-32LE converter.
 * NOTE(review): this comment used to cite CCSID 1232, but the initializer
 * below uses 1234; presumably 1234 is the little-endian CCSID - confirm.
 * The byte sequence FD FF 00 00 is U+FFFD in little-endian order; presumably
 * it is the substitution character, followed by its length (4) - confirm
 * against the UConverterStaticData declaration.
 */
static const UConverterStaticData _UTF32LEStaticData = {
    sizeof(UConverterStaticData),
    "UTF-32LE",
    1234,
    UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4,
    { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
 991
 992
/*
 * Shared-data object for UTF-32LE: ties the static description
 * (_UTF32LEStaticData) to the implementation table (_UTF32LEImpl).
 * NOTE(review): the meaning of the remaining positional fields (the all-ones
 * value, the NULLs, FALSE and the trailing 0) is not visible here - see the
 * UConverterSharedData declaration.
 */
const UConverterSharedData _UTF32LEData = {
    sizeof(UConverterSharedData), ~((uint32_t) 0),
    NULL, NULL, &_UTF32LEStaticData, FALSE, &_UTF32LEImpl,
    0
};
 998
 999 /* UTF-32 (Detect BOM) ------------------------------------------------------ */
1000
1001 /*
1002  * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
1003  * accordingly.
1004  *
1005  * State values:
1006  * 0    initial state
1007  * 1    saw 00
1008  * 2    saw 00 00
1009  * 3    saw 00 00 FE
1010  * 4    -
1011  * 5    saw FF
1012  * 6    saw FF FE
1013  * 7    saw FF FE 00
1014  * 8    UTF-32BE mode
1015  * 9    UTF-32LE mode
1016  *
1017  * During detection: state&3==number of matching bytes so far.
1018  *
1019  * On output, emit U+FEFF as the first code point.
1020  */
1021
1022 static void
1023 _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
1024     if(choice<=UCNV_RESET_TO_UNICODE) {
1025         /* reset toUnicode: state=0 */
1026         cnv->mode=0;
1027     }
1028     if(choice!=UCNV_RESET_TO_UNICODE) {
1029         /* reset fromUnicode: prepare to output the UTF-32PE BOM */
1030         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1031     }
1032 }
1033
1034 static void
1035 _UTF32Open(UConverter *cnv,
1036            const char *name,
1037            const char *locale,
1038            uint32_t options,
1039            UErrorCode *pErrorCode) {
1040     _UTF32Reset(cnv, UCNV_RESET_BOTH);
1041 }
1042
/*
 * Both UTF-32 byte order marks back to back: bytes 0..3 are the big-endian
 * BOM (00 00 FE FF), bytes 4..7 the little-endian BOM (FF FE 00 00).
 * Indexed directly with the detection state (1..3 and 5..7), and with
 * (state&4) to select the start of the BOM that was being matched.
 */
static const char utf32BOM[8]={ 0, 0, (char)0xfe, (char)0xff,    (char)0xff, (char)0xfe, 0, 0 };
1044
/*
 * toUnicode function of the BOM-detecting UTF-32 converter (used both with
 * and without offsets).
 *
 * The detection state is kept in cnv->mode across calls:
 * - state 0 looks at the first byte: 00 starts matching the big-endian BOM
 *   (state 1), FF starts matching the little-endian BOM (state 5), anything
 *   else selects UTF-32BE (state 8) without consuming the byte;
 * - states 1..3 and 5..7 compare the next byte with utf32BOM[state]; a
 *   complete match switches to UTF-32BE (state 8) or UTF-32LE (state 9) and
 *   the BOM bytes are consumed; a mismatch falls back to UTF-32BE and feeds
 *   the bytes matched so far to the BE converter as ordinary input;
 * - states 8 and 9 delegate the rest of the buffer to the BE/LE converter,
 *   using the offset-recording variant when pArgs->offsets is not NULL.
 *
 * If a BOM was completed inside this buffer, the number of bytes consumed up
 * to that point is added to every offset written during this call, because
 * the delegated converter starts counting after the BOM.
 *
 * With pArgs->flush set and all input consumed, the delegated converter is
 * called once more so that truncated input (including an incomplete BOM
 * prefix) is handled by it.
 */
static void
_UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                           UErrorCode *pErrorCode) {
    UConverter *cnv=pArgs->converter;
    const char *source=pArgs->source;
    const char *sourceLimit=pArgs->sourceLimit;
    int32_t *offsets=pArgs->offsets;    /* start of the offsets written by this call */

    int32_t state, offsetDelta;
    char b;

    state=cnv->mode;

    /*
     * If we detect a BOM in this buffer, then we must add the BOM size to the
     * offsets because the actual converter function will not see and count the BOM.
     * offsetDelta will have the number of the BOM bytes that are in the current buffer.
     */
    offsetDelta=0;

    while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
        switch(state) {
        case 0:
            b=*source;
            if(b==0) {
                state=1; /* could be 00 00 FE FF */
            } else if(b==(char)0xff) {
                state=5; /* could be FF FE 00 00 */
            } else {
                state=8; /* default to UTF-32BE */
                continue; /* do not consume the byte; state 8 converts it */
            }
            ++source;
            break;
        case 1:
        case 2:
        case 3:
        case 5:
        case 6:
        case 7:
            if(*source==utf32BOM[state]) {
                ++state;
                ++source;
                if(state==4) {
                    state=8; /* detect UTF-32BE */
                    offsetDelta=(int32_t)(source-pArgs->source);
                } else if(state==8) {
                    state=9; /* detect UTF-32LE */
                    offsetDelta=(int32_t)(source-pArgs->source);
                }
            } else {
                /* switch to UTF-32BE and pass the previous bytes */
                int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */

                /* reset the source */
                source=pArgs->source;

                /* state&3 is the number of BOM bytes matched so far */
                if(count==(state&3)) {
                    /* simple: all in the same buffer, just reset source */
                } else {
                    UBool oldFlush=pArgs->flush;

                    /* some of the bytes are from a previous buffer, replay those first */
                    pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
                    pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */
                    pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */

                    /* no offsets: bytes from previous buffer, and not enough for output */
                    T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);

                    /* restore real pointers; pArgs->source will be set in case 8/9 */
                    pArgs->sourceLimit=sourceLimit;
                    pArgs->flush=oldFlush;
                }
                state=8;
                continue;
            }
            break;
        case 8:
            /* call UTF-32BE */
            pArgs->source=source;
            if(offsets==NULL) {
                T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
            } else {
                T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode);
            }
            source=pArgs->source;
            break;
        case 9:
            /* call UTF-32LE */
            pArgs->source=source;
            if(offsets==NULL) {
                T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
            } else {
                T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode);
            }
            source=pArgs->source;
            break;
        default:
            break; /* does not occur */
        }
    }

    /* add BOM size to offsets - see comment at offsetDelta declaration */
    if(offsets!=NULL && offsetDelta!=0) {
        /* pArgs->offsets now points past the last offset written during this call */
        int32_t *offsetsLimit=pArgs->offsets;
        while(offsets<offsetsLimit) {
            *offsets++ += offsetDelta;
        }
    }

    pArgs->source=source;

    if(source==sourceLimit && pArgs->flush) {
        /* handle truncated input */
        switch(state) {
        case 0:
            break; /* no input at all, nothing to do */
        case 8:
            T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
            break;
        case 9:
            T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
            break;
        default:
            /* handle 0<state<8: call UTF-32BE with too-short input */
            pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
            pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */

            /* no offsets: not enough for output */
            T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
            /* restore the caller's pointers after the replay */
            pArgs->source=source;
            pArgs->sourceLimit=sourceLimit;
            state=8;
            break;
        }
    }

    /* remember the detection state for the next call */
    cnv->mode=state;
}
1185
1186 static UChar32
1187 _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs,
1188                    UErrorCode *pErrorCode) {
1189     switch(pArgs->converter->mode) {
1190     case 8:
1191         return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode);
1192     case 9:
1193         return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);
1194     default:
1195         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1196     }
1197 }
1198
/*
 * Implementation table for the BOM-detecting UTF-32 converter.
 * The initializer is positional. NOTE(review): the slot descriptions below
 * are inferred from the functions placed in them - confirm against the
 * UConverterImpl declaration.
 */
static const UConverterImpl _UTF32Impl = {
    UCNV_UTF32,

    NULL,
    NULL,

    /* open / (unused) / reset hooks */
    _UTF32Open,
    NULL,
    _UTF32Reset,

    /* the same function serves both with and without offsets */
    _UTF32ToUnicodeWithOffsets,
    _UTF32ToUnicodeWithOffsets,
    /* output is written in the byte order of the platform */
#if U_IS_BIG_ENDIAN
    T_UConverter_fromUnicode_UTF32_BE,
    T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
#else
    T_UConverter_fromUnicode_UTF32_LE,
    T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
#endif
    _UTF32GetNextUChar,

    NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
    NULL,
    NULL,
    NULL,
    ucnv_getNonSurrogateUnicodeSet
};
1226
/*
 * Static description of the BOM-detecting UTF-32 converter.
 * The 1236 CCSID refers to any version of Unicode with a BOM-sensitive
 * endianness of UTF-32.
 * The 4-byte sequence chosen by U_IS_BIG_ENDIAN is U+FFFD in the byte order
 * of the platform; presumably it is the substitution character, followed by
 * its length - confirm against the UConverterStaticData declaration.
 */
static const UConverterStaticData _UTF32StaticData = {
    sizeof(UConverterStaticData),
    "UTF-32",
    1236,
    UCNV_IBM, UCNV_UTF32, 4, 4,
#if U_IS_BIG_ENDIAN
    { 0, 0, 0xff, 0xfd }, 4,
#else
    { 0xfd, 0xff, 0, 0 }, 4,
#endif
    FALSE, FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
1243
/*
 * Shared-data object for the BOM-detecting UTF-32 converter: ties the static
 * description (_UTF32StaticData) to the implementation table (_UTF32Impl).
 * NOTE(review): the meaning of the remaining positional fields (the all-ones
 * value, the NULLs, FALSE and the trailing 0) is not visible here - see the
 * UConverterSharedData declaration.
 */
const UConverterSharedData _UTF32Data = {
    sizeof(UConverterSharedData), ~((uint32_t) 0),
    NULL, NULL, &_UTF32StaticData, FALSE, &_UTF32Impl,
    0
};
1249
1250 #endif