icuSources/common/ucnv_u32.c

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 2002-2004, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   file name:  ucnv_u32.c
   7 *   encoding:   US-ASCII
   8 *   tab size:   8 (not used)
   9 *   indentation:4
  10 *
  11 *   created on: 2002jul01
  12 *   created by: Markus W. Scherer
  13 *
  14 *   UTF-32 converter implementation. Used to be in ucnv_utf.c.
  15 */
  16
  17 #include "unicode/utypes.h"
  18
  19 #if !UCONFIG_NO_CONVERSION
  20
  21 #include "unicode/ucnv.h"
  22 #include "ucnv_bld.h"
  23 #include "ucnv_cnv.h"
  24 #include "cmemory.h"
  25
  26 #define MAXIMUM_UCS2            0x0000FFFF  /* largest value the converters below store in a single UChar */
  27 #define MAXIMUM_UTF             0x0010FFFF  /* largest value the toUnicode paths below accept as a code point */
  28 #define HALF_SHIFT              10          /* shift applied to the lead surrogate when a pair is combined */
  29 #define HALF_BASE               0x0010000   /* folded into SURROGATE_LOW_BASE; not referenced directly in the code shown here */
  30 #define HALF_MASK               0x3FF       /* not referenced in the code shown here */
  31 #define SURROGATE_HIGH_START    0xD800      /* subtracted from the lead surrogate when a pair is combined */
  32 #define SURROGATE_LOW_START     0xDC00      /* folded into SURROGATE_LOW_BASE; not referenced directly in the code shown here */
  33
  34 /* -SURROGATE_LOW_START + HALF_BASE */
  35 #define SURROGATE_LOW_BASE      9216        /* 0x10000 - 0xDC00; added together with the trail surrogate */
  36
  37 /* UTF-32BE ----------------------------------------------------------------- */
  38
  39 static void
  40 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
  41                                 UErrorCode * err)
  42 {
  43     const unsigned char *mySource = (unsigned char *) args->source;
  44     UChar *myTarget = args->target;
  45     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
  46     const UChar *targetLimit = args->targetLimit;
  47     unsigned char *toUBytes = args->converter->toUBytes;
  48     uint32_t ch, i;
  49
  50     /* UTF-8 returns here for only non-offset, this needs to change.*/
  51     if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
  52         i = args->converter->toULength;       /* restore # of bytes consumed */
  53
  54         ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
  55         args->converter->toUnicodeStatus = 0;
  56         goto morebytes;
  57     }
  58
  59     while (mySource < sourceLimit && myTarget < targetLimit) {
  60         i = 0;
  61         ch = 0;
  62 morebytes:
  63         while (i < sizeof(uint32_t)) {
  64             if (mySource < sourceLimit) {
  65                 ch = (ch << 8) | (uint8_t)(*mySource);
  66                 toUBytes[i++] = (char) *(mySource++);
  67             }
  68             else {
  69                 /* stores a partially calculated target*/
  70                 /* + 1 to make 0 a valid character */
  71                 args->converter->toUnicodeStatus = ch + 1;
  72                 args->converter->toULength = (int8_t) i;
  73                 goto donefornow;
  74             }
  75         }
  76
  77         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
  78             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
  79             if (ch <= MAXIMUM_UCS2)
  80             {
  81                 /* fits in 16 bits */
  82                 *(myTarget++) = (UChar) ch;
  83             }
  84             else {
  85                 /* write out the surrogates */
  86                 *(myTarget++) = U16_LEAD(ch);
  87                 ch = U16_TRAIL(ch);
  88                 if (myTarget < targetLimit) {
  89                     *(myTarget++) = (UChar)ch;
  90                 }
  91                 else {
  92                     /* Put in overflow buffer (not handled here) */
  93                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
  94                     args->converter->UCharErrorBufferLength = 1;
  95                     *err = U_BUFFER_OVERFLOW_ERROR;
  96                     break;
  97                 }
  98             }
  99         }
 100         else {
 101             args->converter->toULength = (int8_t)i;
 102             *err = U_ILLEGAL_CHAR_FOUND;
 103             break;
 104         }
 105     }
 106
 107 donefornow:
 108     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
 109         /* End of target buffer */
 110         *err = U_BUFFER_OVERFLOW_ERROR;
 111     }
 112
 113     args->target = myTarget;
 114     args->source = (const char *) mySource;
 115 }
 116
 117 static void
 118 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
 119                                              UErrorCode * err)
 120 {
 121     const unsigned char *mySource = (unsigned char *) args->source;
 122     UChar *myTarget = args->target;
 123     int32_t *myOffsets = args->offsets;
 124     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
 125     const UChar *targetLimit = args->targetLimit;
 126     unsigned char *toUBytes = args->converter->toUBytes;
 127     uint32_t ch, i;
 128     int32_t offsetNum = 0;
 129
 130     if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
 131         i = args->converter->toULength;       /* restore # of bytes consumed */
 132
 133         ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
 134         args->converter->toUnicodeStatus = 0;
 135         goto morebytes;
 136     }
 137
 138     while (mySource < sourceLimit && myTarget < targetLimit) {
 139         i = 0;
 140         ch = 0;
 141 morebytes:
 142         while (i < sizeof(uint32_t)) {
 143             if (mySource < sourceLimit) {
 144                 ch = (ch << 8) | (uint8_t)(*mySource);
 145                 toUBytes[i++] = (char) *(mySource++);
 146             }
 147             else {
 148                 /* stores a partially calculated target*/
 149                 /* + 1 to make 0 a valid character */
 150                 args->converter->toUnicodeStatus = ch + 1;
 151                 args->converter->toULength = (int8_t) i;
 152                 goto donefornow;
 153             }
 154         }
 155
 156         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
 157             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
 158             if (ch <= MAXIMUM_UCS2) {
 159                 /* fits in 16 bits */
 160                 *(myTarget++) = (UChar) ch;
 161                 *(myOffsets++) = offsetNum;
 162             }
 163             else {
 164                 /* write out the surrogates */
 165                 *(myTarget++) = U16_LEAD(ch);
 166                 *myOffsets++ = offsetNum;
 167                 ch = U16_TRAIL(ch);
 168                 if (myTarget < targetLimit)
 169                 {
 170                     *(myTarget++) = (UChar)ch;
 171                     *(myOffsets++) = offsetNum;
 172                 }
 173                 else {
 174                     /* Put in overflow buffer (not handled here) */
 175                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
 176                     args->converter->UCharErrorBufferLength = 1;
 177                     *err = U_BUFFER_OVERFLOW_ERROR;
 178                     break;
 179                 }
 180             }
 181         }
 182         else {
 183             args->converter->toULength = (int8_t)i;
 184             *err = U_ILLEGAL_CHAR_FOUND;
 185             break;
 186         }
 187         offsetNum += i;
 188     }
 189
 190 donefornow:
 191     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 192     {
 193         /* End of target buffer */
 194         *err = U_BUFFER_OVERFLOW_ERROR;
 195     }
 196
 197     args->target = myTarget;
 198     args->source = (const char *) mySource;
 199     args->offsets = myOffsets;
 200 }
 201
 202 static void
 203 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
 204                                   UErrorCode * err)
 205 {
 206     const UChar *mySource = args->source;
 207     unsigned char *myTarget = (unsigned char *) args->target;
 208     const UChar *sourceLimit = args->sourceLimit;
 209     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
 210     UChar32 ch, ch2;
 211     unsigned int indexToWrite;
 212     unsigned char temp[sizeof(uint32_t)];
 213
 214     temp[0] = 0;
 215
 216     if (args->converter->fromUChar32) {
 217         ch = args->converter->fromUChar32;
 218         args->converter->fromUChar32 = 0;
 219         goto lowsurogate;
 220     }
 221
 222     while (mySource < sourceLimit && myTarget < targetLimit) {
 223         ch = *(mySource++);
 224
 225         if (UTF_IS_SURROGATE(ch)) {
 226             if (U_IS_LEAD(ch)) {
 227 lowsurogate:
 228                 if (mySource < sourceLimit) {
 229                     ch2 = *mySource;
 230                     if (U_IS_TRAIL(ch2)) {
 231                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
 232                         mySource++;
 233                     }
 234                     else {
 235                         /* this is an unmatched trail code unit (2nd surrogate) */
 236                         /* callback(illegal) */
 237                         args->converter->fromUChar32 = ch;
 238                         *err = U_ILLEGAL_CHAR_FOUND;
 239                         break;
 240                     }
 241                 }
 242                 else {
 243                     /* ran out of source */
 244                     args->converter->fromUChar32 = ch;
 245                     if (args->flush) {
 246                         /* this is an unmatched trail code unit (2nd surrogate) */
 247                         /* callback(illegal) */
 248                         *err = U_ILLEGAL_CHAR_FOUND;
 249                     }
 250                     break;
 251                 }
 252             }
 253             else {
 254                 /* this is an unmatched trail code unit (2nd surrogate) */
 255                 /* callback(illegal) */
 256                 args->converter->fromUChar32 = ch;
 257                 *err = U_ILLEGAL_CHAR_FOUND;
 258                 break;
 259             }
 260         }
 261
 262         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
 263         temp[1] = (uint8_t) (ch >> 16 & 0x1F);
 264         temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
 265         temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
 266
 267         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
 268             if (myTarget < targetLimit) {
 269                 *(myTarget++) = temp[indexToWrite];
 270             }
 271             else {
 272                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
 273                 *err = U_BUFFER_OVERFLOW_ERROR;
 274             }
 275         }
 276     }
 277
 278     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
 279         *err = U_BUFFER_OVERFLOW_ERROR;
 280     }
 281
 282     args->target = (char *) myTarget;
 283     args->source = mySource;
 284 }
 285
 286 static void
 287 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
 288                                                UErrorCode * err)
 289 {
 290     const UChar *mySource = args->source;
 291     unsigned char *myTarget = (unsigned char *) args->target;
 292     int32_t *myOffsets = args->offsets;
 293     const UChar *sourceLimit = args->sourceLimit;
 294     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
 295     UChar32 ch, ch2;
 296     int32_t offsetNum = 0;
 297     unsigned int indexToWrite;
 298     unsigned char temp[sizeof(uint32_t)];
 299
 300     temp[0] = 0;
 301
 302     if (args->converter->fromUChar32) {
 303         ch = args->converter->fromUChar32;
 304         args->converter->fromUChar32 = 0;
 305         goto lowsurogate;
 306     }
 307
 308     while (mySource < sourceLimit && myTarget < targetLimit) {
 309         ch = *(mySource++);
 310
 311         if (UTF_IS_SURROGATE(ch)) {
 312             if (U_IS_LEAD(ch)) {
 313 lowsurogate:
 314                 if (mySource < sourceLimit) {
 315                     ch2 = *mySource;
 316                     if (U_IS_TRAIL(ch2)) {
 317                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
 318                         mySource++;
 319                     }
 320                     else {
 321                         /* this is an unmatched trail code unit (2nd surrogate) */
 322                         /* callback(illegal) */
 323                         args->converter->fromUChar32 = ch;
 324                         *err = U_ILLEGAL_CHAR_FOUND;
 325                         break;
 326                     }
 327                 }
 328                 else {
 329                     /* ran out of source */
 330                     args->converter->fromUChar32 = ch;
 331                     if (args->flush) {
 332                         /* this is an unmatched trail code unit (2nd surrogate) */
 333                         /* callback(illegal) */
 334                         *err = U_ILLEGAL_CHAR_FOUND;
 335                     }
 336                     break;
 337                 }
 338             }
 339             else {
 340                 /* this is an unmatched trail code unit (2nd surrogate) */
 341                 /* callback(illegal) */
 342                 args->converter->fromUChar32 = ch;
 343                 *err = U_ILLEGAL_CHAR_FOUND;
 344                 break;
 345             }
 346         }
 347
 348         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
 349         temp[1] = (uint8_t) (ch >> 16 & 0x1F);
 350         temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
 351         temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
 352
 353         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
 354             if (myTarget < targetLimit) {
 355                 *(myTarget++) = temp[indexToWrite];
 356                 *(myOffsets++) = offsetNum;
 357             }
 358             else {
 359                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
 360                 *err = U_BUFFER_OVERFLOW_ERROR;
 361             }
 362         }
 363         offsetNum++;
 364     }
 365
 366     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
 367         *err = U_BUFFER_OVERFLOW_ERROR;
 368     }
 369
 370     args->target = (char *) myTarget;
 371     args->source = mySource;
 372     args->offsets = myOffsets;
 373 }
 374
 375 static UChar32
 376 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
 377                                    UErrorCode* err)
 378 {
 379     const uint8_t *mySource;
 380     UChar32 myUChar;
 381     int32_t length;
 382
 383     mySource = (const uint8_t *)args->source;
 384     if (mySource >= (const uint8_t *)args->sourceLimit)
 385     {
 386         /* no input */
 387         *err = U_INDEX_OUTOFBOUNDS_ERROR;
 388         return 0xffff;
 389     }
 390
 391     length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
 392     if (length < 4)
 393     {
 394         /* got a partial character */
 395         uprv_memcpy(args->converter->toUBytes, mySource, length);
 396         args->converter->toULength = (int8_t)length;
 397         args->source = (const char *)(mySource + length);
 398         *err = U_TRUNCATED_CHAR_FOUND;
 399         return 0xffff;
 400     }
 401
 402     /* Don't even try to do a direct cast because the value may be on an odd address. */
 403     myUChar = ((UChar32)mySource[0] << 24)
 404             | ((UChar32)mySource[1] << 16)
 405             | ((UChar32)mySource[2] << 8)
 406             | ((UChar32)mySource[3]);
 407
 408     args->source = (const char *)(mySource + 4);
 409     if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
 410         return myUChar;
 411     }
 412
 413     uprv_memcpy(args->converter->toUBytes, mySource, 4);
 414     args->converter->toULength = 4;
 415
 416     *err = U_ILLEGAL_CHAR_FOUND;
 417     return 0xffff;
 418 }
 419
 420 static const UConverterImpl _UTF32BEImpl = { /* function table for the UTF-32BE converter */
 421     UCNV_UTF32_BigEndian,
 422
 423     NULL, /* NOTE(review): the meaning of each NULL slot is fixed by the UConverterImpl declaration, which is not visible here */
 424     NULL,
 425
 426     NULL,
 427     NULL,
 428     NULL,
 429
 430     T_UConverter_toUnicode_UTF32_BE,                /* bytes -> UChars */
 431     T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC,   /* bytes -> UChars, also fills args->offsets */
 432     T_UConverter_fromUnicode_UTF32_BE,              /* UChars -> bytes */
 433     T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC, /* UChars -> bytes, also fills args->offsets */
 434     T_UConverter_getNextUChar_UTF32_BE,             /* reads a single code point */
 435
 436     NULL,
 437     NULL,
 438     NULL,
 439     NULL,
 440     ucnv_getCompleteUnicodeSet
 441 };
 442
 443 /* The 1232 CCSID refers to any version of Unicode with any endianness of UTF-32 */
 444 static const UConverterStaticData _UTF32BEStaticData = {
 445     sizeof(UConverterStaticData),
 446     "UTF-32BE",
 447     1232, /* CCSID, see the comment above */
 448     UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4, /* presumably platform, converter type, min and max bytes per character -- confirm against UConverterStaticData */
 449     { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE, /* presumably the substitution character bytes (0x0000FFFD, big-endian) and their length -- confirm */
 450     0,
 451     0,
 452     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 453 };
 454
 455 const UConverterSharedData _UTF32BEData = { /* non-static: ties the UTF-32BE static data and function table together */
 456     sizeof(UConverterSharedData), ~((uint32_t) 0), /* second field is all bits set; NOTE(review): presumably a reference counter marked "never unload" -- confirm */
 457     NULL, NULL, &_UTF32BEStaticData, FALSE, &_UTF32BEImpl,
 458     0
 459 };
 460
 461 /* UTF-32LE ---------------------------------------------------------- */
 462
 463 static void
 464 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
 465                                 UErrorCode * err)
 466 {
 467     const unsigned char *mySource = (unsigned char *) args->source;
 468     UChar *myTarget = args->target;
 469     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
 470     const UChar *targetLimit = args->targetLimit;
 471     unsigned char *toUBytes = args->converter->toUBytes;
 472     uint32_t ch, i;
 473
 474     /* UTF-8 returns here for only non-offset, this needs to change.*/
 475     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
 476     {
 477         i = args->converter->toULength;       /* restore # of bytes consumed */
 478
 479         /* Stores the previously calculated ch from a previous call*/
 480         ch = args->converter->toUnicodeStatus - 1;
 481         args->converter->toUnicodeStatus = 0;
 482         goto morebytes;
 483     }
 484
 485     while (mySource < sourceLimit && myTarget < targetLimit)
 486     {
 487         i = 0;
 488         ch = 0;
 489 morebytes:
 490         while (i < sizeof(uint32_t))
 491         {
 492             if (mySource < sourceLimit)
 493             {
 494                 ch |= ((uint8_t)(*mySource)) << (i * 8);
 495                 toUBytes[i++] = (char) *(mySource++);
 496             }
 497             else
 498             {
 499                 /* stores a partially calculated target*/
 500                 /* + 1 to make 0 a valid character */
 501                 args->converter->toUnicodeStatus = ch + 1;
 502                 args->converter->toULength = (int8_t) i;
 503                 goto donefornow;
 504             }
 505         }
 506
 507         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
 508             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
 509             if (ch <= MAXIMUM_UCS2) {
 510                 /* fits in 16 bits */
 511                 *(myTarget++) = (UChar) ch;
 512             }
 513             else {
 514                 /* write out the surrogates */
 515                 *(myTarget++) = U16_LEAD(ch);
 516                 ch = U16_TRAIL(ch);
 517                 if (myTarget < targetLimit) {
 518                     *(myTarget++) = (UChar)ch;
 519                 }
 520                 else {
 521                     /* Put in overflow buffer (not handled here) */
 522                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
 523                     args->converter->UCharErrorBufferLength = 1;
 524                     *err = U_BUFFER_OVERFLOW_ERROR;
 525                     break;
 526                 }
 527             }
 528         }
 529         else {
 530             args->converter->toULength = (int8_t)i;
 531             *err = U_ILLEGAL_CHAR_FOUND;
 532             break;
 533         }
 534     }
 535
 536 donefornow:
 537     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 538     {
 539         /* End of target buffer */
 540         *err = U_BUFFER_OVERFLOW_ERROR;
 541     }
 542
 543     args->target = myTarget;
 544     args->source = (const char *) mySource;
 545 }
 546
 547 static void
 548 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
 549                                              UErrorCode * err)
 550 {
 551     const unsigned char *mySource = (unsigned char *) args->source;
 552     UChar *myTarget = args->target;
 553     int32_t *myOffsets = args->offsets;
 554     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
 555     const UChar *targetLimit = args->targetLimit;
 556     unsigned char *toUBytes = args->converter->toUBytes;
 557     uint32_t ch, i;
 558     int32_t offsetNum = 0;
 559
 560     /* UTF-8 returns here for only non-offset, this needs to change.*/
 561     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
 562     {
 563         i = args->converter->toULength;       /* restore # of bytes consumed */
 564
 565         /* Stores the previously calculated ch from a previous call*/
 566         ch = args->converter->toUnicodeStatus - 1;
 567         args->converter->toUnicodeStatus = 0;
 568         goto morebytes;
 569     }
 570
 571     while (mySource < sourceLimit && myTarget < targetLimit)
 572     {
 573         i = 0;
 574         ch = 0;
 575 morebytes:
 576         while (i < sizeof(uint32_t))
 577         {
 578             if (mySource < sourceLimit)
 579             {
 580                 ch |= ((uint8_t)(*mySource)) << (i * 8);
 581                 toUBytes[i++] = (char) *(mySource++);
 582             }
 583             else
 584             {
 585                 /* stores a partially calculated target*/
 586                 /* + 1 to make 0 a valid character */
 587                 args->converter->toUnicodeStatus = ch + 1;
 588                 args->converter->toULength = (int8_t) i;
 589                 goto donefornow;
 590             }
 591         }
 592
 593         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch))
 594         {
 595             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
 596             if (ch <= MAXIMUM_UCS2)
 597             {
 598                 /* fits in 16 bits */
 599                 *(myTarget++) = (UChar) ch;
 600                 *(myOffsets++) = offsetNum;
 601             }
 602             else {
 603                 /* write out the surrogates */
 604                 *(myTarget++) = U16_LEAD(ch);
 605                 *(myOffsets++) = offsetNum;
 606                 ch = U16_TRAIL(ch);
 607                 if (myTarget < targetLimit)
 608                 {
 609                     *(myTarget++) = (UChar)ch;
 610                     *(myOffsets++) = offsetNum;
 611                 }
 612                 else
 613                 {
 614                     /* Put in overflow buffer (not handled here) */
 615                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
 616                     args->converter->UCharErrorBufferLength = 1;
 617                     *err = U_BUFFER_OVERFLOW_ERROR;
 618                     break;
 619                 }
 620             }
 621         }
 622         else
 623         {
 624             args->converter->toULength = (int8_t)i;
 625             *err = U_ILLEGAL_CHAR_FOUND;
 626             break;
 627         }
 628         offsetNum += i;
 629     }
 630
 631 donefornow:
 632     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 633     {
 634         /* End of target buffer */
 635         *err = U_BUFFER_OVERFLOW_ERROR;
 636     }
 637
 638     args->target = myTarget;
 639     args->source = (const char *) mySource;
 640     args->offsets = myOffsets;
 641 }
 642
 643 static void
 644 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
 645                                   UErrorCode * err)
 646 {
 647     const UChar *mySource = args->source;
 648     unsigned char *myTarget = (unsigned char *) args->target;
 649     const UChar *sourceLimit = args->sourceLimit;
 650     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
 651     UChar32 ch, ch2;
 652     unsigned int indexToWrite;
 653     unsigned char temp[sizeof(uint32_t)];
 654
 655     temp[3] = 0;
 656
 657     if (args->converter->fromUChar32)
 658     {
 659         ch = args->converter->fromUChar32;
 660         args->converter->fromUChar32 = 0;
 661         goto lowsurogate;
 662     }
 663
 664     while (mySource < sourceLimit && myTarget < targetLimit)
 665     {
 666         ch = *(mySource++);
 667
 668         if (UTF_IS_SURROGATE(ch)) {
 669             if (U_IS_LEAD(ch))
 670             {
 671 lowsurogate:
 672                 if (mySource < sourceLimit)
 673                 {
 674                     ch2 = *mySource;
 675                     if (U_IS_TRAIL(ch2)) {
 676                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
 677                         mySource++;
 678                     }
 679                     else {
 680                         /* this is an unmatched trail code unit (2nd surrogate) */
 681                         /* callback(illegal) */
 682                         args->converter->fromUChar32 = ch;
 683                         *err = U_ILLEGAL_CHAR_FOUND;
 684                         break;
 685                     }
 686                 }
 687                 else {
 688                     /* ran out of source */
 689                     args->converter->fromUChar32 = ch;
 690                     if (args->flush) {
 691                         /* this is an unmatched trail code unit (2nd surrogate) */
 692                         /* callback(illegal) */
 693                         *err = U_ILLEGAL_CHAR_FOUND;
 694                     }
 695                     break;
 696                 }
 697             }
 698             else {
 699                 /* this is an unmatched trail code unit (2nd surrogate) */
 700                 /* callback(illegal) */
 701                 args->converter->fromUChar32 = ch;
 702                 *err = U_ILLEGAL_CHAR_FOUND;
 703                 break;
 704             }
 705         }
 706
 707         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
 708         temp[2] = (uint8_t) (ch >> 16 & 0x1F);
 709         temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
 710         temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
 711
 712         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
 713         {
 714             if (myTarget < targetLimit)
 715             {
 716                 *(myTarget++) = temp[indexToWrite];
 717             }
 718             else
 719             {
 720                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
 721                 *err = U_BUFFER_OVERFLOW_ERROR;
 722             }
 723         }
 724     }
 725
 726     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 727     {
 728         *err = U_BUFFER_OVERFLOW_ERROR;
 729     }
 730
 731     args->target = (char *) myTarget;
 732     args->source = mySource;
 733 }
 734
 735 static void
 736 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
 737                                                UErrorCode * err)
 738 {
 739     const UChar *mySource = args->source;
 740     unsigned char *myTarget = (unsigned char *) args->target;
 741     int32_t *myOffsets = args->offsets;
 742     const UChar *sourceLimit = args->sourceLimit;
 743     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
 744     UChar32 ch, ch2;
 745     unsigned int indexToWrite;
 746     unsigned char temp[sizeof(uint32_t)];
 747     int32_t offsetNum = 0;
 748
 749     temp[3] = 0;
 750
 751     if (args->converter->fromUChar32)
 752     {
 753         ch = args->converter->fromUChar32;
 754         args->converter->fromUChar32 = 0;
 755         goto lowsurogate;
 756     }
 757
 758     while (mySource < sourceLimit && myTarget < targetLimit)
 759     {
 760         ch = *(mySource++);
 761
 762         if (UTF_IS_SURROGATE(ch)) {
 763             if (U_IS_LEAD(ch))
 764             {
 765 lowsurogate:
 766                 if (mySource < sourceLimit)
 767                 {
 768                     ch2 = *mySource;
 769                     if (U_IS_TRAIL(ch2))
 770                     {
 771                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
 772                         mySource++;
 773                     }
 774                     else {
 775                         /* this is an unmatched trail code unit (2nd surrogate) */
 776                         /* callback(illegal) */
 777                         args->converter->fromUChar32 = ch;
 778                         *err = U_ILLEGAL_CHAR_FOUND;
 779                         break;
 780                     }
 781                 }
 782                 else {
 783                     /* ran out of source */
 784                     args->converter->fromUChar32 = ch;
 785                     if (args->flush) {
 786                         /* this is an unmatched trail code unit (2nd surrogate) */
 787                         /* callback(illegal) */
 788                         *err = U_ILLEGAL_CHAR_FOUND;
 789                     }
 790                     break;
 791                 }
 792             }
 793             else {
 794                 /* this is an unmatched trail code unit (2nd surrogate) */
 795                 /* callback(illegal) */
 796                 args->converter->fromUChar32 = ch;
 797                 *err = U_ILLEGAL_CHAR_FOUND;
 798                 break;
 799             }
 800         }
 801
 802         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
 803         temp[2] = (uint8_t) (ch >> 16 & 0x1F);
 804         temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
 805         temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
 806
 807         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
 808         {
 809             if (myTarget < targetLimit)
 810             {
 811                 *(myTarget++) = temp[indexToWrite];
 812                 *(myOffsets++) = offsetNum;
 813             }
 814             else
 815             {
 816                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
 817                 *err = U_BUFFER_OVERFLOW_ERROR;
 818             }
 819         }
 820         offsetNum++;
 821     }
 822
 823     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 824     {
 825         *err = U_BUFFER_OVERFLOW_ERROR;
 826     }
 827
 828     args->target = (char *) myTarget;
 829     args->source = mySource;
 830     args->offsets = myOffsets;
 831 }
 832
 833 static UChar32
 834 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
 835                                    UErrorCode* err)
 836 {
 837     const uint8_t *mySource;
 838     UChar32 myUChar;
 839     int32_t length;
 840
 841     mySource = (const uint8_t *)args->source;
 842     if (mySource >= (const uint8_t *)args->sourceLimit)
 843     {
 844         /* no input */
 845         *err = U_INDEX_OUTOFBOUNDS_ERROR;
 846         return 0xffff;
 847     }
 848
 849     length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
 850     if (length < 4)
 851     {
 852         /* got a partial character */
 853         uprv_memcpy(args->converter->toUBytes, mySource, length);
 854         args->converter->toULength = (int8_t)length;
 855         args->source = (const char *)(mySource + length);
 856         *err = U_TRUNCATED_CHAR_FOUND;
 857         return 0xffff;
 858     }
 859
 860     /* Don't even try to do a direct cast because the value may be on an odd address. */
 861     myUChar = ((UChar32)mySource[3] << 24)
 862             | ((UChar32)mySource[2] << 16)
 863             | ((UChar32)mySource[1] << 8)
 864             | ((UChar32)mySource[0]);
 865
 866     args->source = (const char *)(mySource + 4);
 867     if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
 868         return myUChar;
 869     }
 870
 871     uprv_memcpy(args->converter->toUBytes, mySource, 4);
 872     args->converter->toULength = 4;
 873
 874     *err = U_ILLEGAL_CHAR_FOUND;
 875     return 0xffff;
 876 }
 877
 878 static const UConverterImpl _UTF32LEImpl = {
 879     UCNV_UTF32_LittleEndian,
 880
 881     NULL,
 882     NULL,
 883
 884     NULL,
 885     NULL,
 886     NULL,
 887
 888     T_UConverter_toUnicode_UTF32_LE,
 889     T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC,
 890     T_UConverter_fromUnicode_UTF32_LE,
 891     T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
 892     T_UConverter_getNextUChar_UTF32_LE,
 893
 894     NULL,
 895     NULL,
 896     NULL,
 897     NULL,
 898     ucnv_getCompleteUnicodeSet
 899 };
 900
 901 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
 902 static const UConverterStaticData _UTF32LEStaticData = {
 903     sizeof(UConverterStaticData),
 904     "UTF-32LE",
 905     1234,
 906     UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4,
 907     { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE,
 908     0,
 909     0,
 910     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 911 };
 912
 913
 914 const UConverterSharedData _UTF32LEData = {
 915     sizeof(UConverterSharedData), ~((uint32_t) 0),
 916     NULL, NULL, &_UTF32LEStaticData, FALSE, &_UTF32LEImpl,
 917     0
 918 };
 919
 920 /* UTF-32 (Detect BOM) ------------------------------------------------------ */
 921
 922 /*
 923  * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
 924  * accordingly.
 925  *
 926  * State values:
 927  * 0    initial state
 928  * 1    saw 00
 929  * 2    saw 00 00
 930  * 3    saw 00 00 FE
 931  * 4    -
 932  * 5    saw FF
 933  * 6    saw FF FE
 934  * 7    saw FF FE 00
 935  * 8    UTF-32BE mode
 936  * 9    UTF-32LE mode
 937  *
 938  * During detection: state&3==number of matching bytes so far.
 939  *
 940  * On output, emit U+FEFF as the first code point.
 941  */
 942
 943 static void
 944 _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
 945     if(choice<=UCNV_RESET_TO_UNICODE) {
 946         /* reset toUnicode: state=0 */
 947         cnv->mode=0;
 948     }
 949     if(choice!=UCNV_RESET_TO_UNICODE) {
 950         /* reset fromUnicode: prepare to output the UTF-32PE BOM */
 951         cnv->charErrorBufferLength=4;
 952 #if U_IS_BIG_ENDIAN
 953         cnv->charErrorBuffer[0]=0;
 954         cnv->charErrorBuffer[1]=0;
 955         cnv->charErrorBuffer[2]=0xfe;
 956         cnv->charErrorBuffer[3]=0xff;
 957 #else
 958         cnv->charErrorBuffer[0]=0xff;
 959         cnv->charErrorBuffer[1]=0xfe;
 960         cnv->charErrorBuffer[2]=0;
 961         cnv->charErrorBuffer[3]=0;
 962 #endif
 963     }
 964 }
 965
 966 static void
 967 _UTF32Open(UConverter *cnv,
 968            const char *name,
 969            const char *locale,
 970            uint32_t options,
 971            UErrorCode *pErrorCode) {
 972     _UTF32Reset(cnv, UCNV_RESET_BOTH);
 973 }
 974
 975 static const char utf32BOM[8]={ 0, 0, (char)0xfe, (char)0xff,    (char)0xff, (char)0xfe, 0, 0 };
 976
 977 static void
 978 _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
 979                            UErrorCode *pErrorCode) {
 980     UConverter *cnv=pArgs->converter;
 981     const char *source=pArgs->source;
 982     const char *sourceLimit=pArgs->sourceLimit;
 983     int32_t *offsets=pArgs->offsets;
 984
 985     int32_t state, offsetDelta;
 986     char b;
 987
 988     state=cnv->mode;
 989
 990     /*
 991      * If we detect a BOM in this buffer, then we must add the BOM size to the
 992      * offsets because the actual converter function will not see and count the BOM.
 993      * offsetDelta will have the number of the BOM bytes that are in the current buffer.
 994      */
 995     offsetDelta=0;
 996
 997     while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
 998         switch(state) {
 999         case 0:
1000             b=*source;
1001             if(b==0) {
1002                 state=1; /* could be 00 00 FE FF */
1003             } else if(b==(char)0xff) {
1004                 state=5; /* could be FF FE 00 00 */
1005             } else {
1006                 state=8; /* default to UTF-32BE */
1007                 continue;
1008             }
1009             ++source;
1010             break;
1011         case 1:
1012         case 2:
1013         case 3:
1014         case 5:
1015         case 6:
1016         case 7:
1017             if(*source==utf32BOM[state]) {
1018                 ++state;
1019                 ++source;
1020                 if(state==4) {
1021                     state=8; /* detect UTF-32BE */
1022                     offsetDelta=source-pArgs->source;
1023                 } else if(state==8) {
1024                     state=9; /* detect UTF-32LE */
1025                     offsetDelta=source-pArgs->source;
1026                 }
1027             } else {
1028                 /* switch to UTF-32BE and pass the previous bytes */
1029                 int32_t count=source-pArgs->source; /* number of bytes from this buffer */
1030
1031                 /* reset the source */
1032                 source=pArgs->source;
1033
1034                 if(count==(state&3)) {
1035                     /* simple: all in the same buffer, just reset source */
1036                 } else {
1037                     UBool oldFlush=pArgs->flush;
1038
1039                     /* some of the bytes are from a previous buffer, replay those first */
1040                     pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
1041                     pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */
1042                     pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */
1043
1044                     /* no offsets: bytes from previous buffer, and not enough for output */
1045                     T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1046
1047                     /* restore real pointers; pArgs->source will be set in case 8/9 */
1048                     pArgs->sourceLimit=sourceLimit;
1049                     pArgs->flush=oldFlush;
1050                 }
1051                 state=8;
1052                 continue;
1053             }
1054             break;
1055         case 8:
1056             /* call UTF-32BE */
1057             pArgs->source=source;
1058             if(offsets==NULL) {
1059                 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1060             } else {
1061                 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode);
1062             }
1063             source=pArgs->source;
1064             break;
1065         case 9:
1066             /* call UTF-32LE */
1067             pArgs->source=source;
1068             if(offsets==NULL) {
1069                 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
1070             } else {
1071                 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode);
1072             }
1073             source=pArgs->source;
1074             break;
1075         default:
1076             break; /* does not occur */
1077         }
1078     }
1079
1080     /* add BOM size to offsets - see comment at offsetDelta declaration */
1081     if(offsets!=NULL && offsetDelta!=0) {
1082         int32_t *offsetsLimit=pArgs->offsets;
1083         while(offsets<offsetsLimit) {
1084             *offsets++ += offsetDelta;
1085         }
1086     }
1087
1088     pArgs->source=source;
1089
1090     if(source==sourceLimit && pArgs->flush) {
1091         /* handle truncated input */
1092         switch(state) {
1093         case 0:
1094             break; /* no input at all, nothing to do */
1095         case 8:
1096             T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1097             break;
1098         case 9:
1099             T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
1100             break;
1101         default:
1102             /* handle 0<state<8: call UTF-32BE with too-short input */
1103             pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
1104             pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */
1105
1106             /* no offsets: not enough for output */
1107             T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1108             pArgs->source=source;
1109             pArgs->sourceLimit=sourceLimit;
1110             state=8;
1111             break;
1112         }
1113     }
1114
1115     cnv->mode=state;
1116 }
1117
1118 static UChar32
1119 _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs,
1120                    UErrorCode *pErrorCode) {
1121     switch(pArgs->converter->mode) {
1122     case 8:
1123         return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode);
1124     case 9:
1125         return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);
1126     default:
1127         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1128     }
1129 }
1130
1131 static const UConverterImpl _UTF32Impl = {
1132     UCNV_UTF32,
1133
1134     NULL,
1135     NULL,
1136
1137     _UTF32Open,
1138     NULL,
1139     _UTF32Reset,
1140
1141     _UTF32ToUnicodeWithOffsets,
1142     _UTF32ToUnicodeWithOffsets,
1143 #if U_IS_BIG_ENDIAN
1144     T_UConverter_fromUnicode_UTF32_BE,
1145     T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
1146 #else
1147     T_UConverter_fromUnicode_UTF32_LE,
1148     T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
1149 #endif
1150     _UTF32GetNextUChar,
1151
1152     NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
1153     NULL,
1154     NULL,
1155     NULL,
1156     ucnv_getCompleteUnicodeSet
1157 };
1158
1159 static const UConverterStaticData _UTF32StaticData = {
1160     sizeof(UConverterStaticData),
1161     "UTF-32",
1162     0, /* ### TODO review correctness of all Unicode CCSIDs */
1163     UCNV_IBM, UCNV_UTF32, 4, 4,
1164 #if U_IS_BIG_ENDIAN
1165     { 0, 0, 0xff, 0xfd }, 4,
1166 #else
1167     { 0xfd, 0xff, 0, 0 }, 4,
1168 #endif
1169     FALSE, FALSE,
1170     0,
1171     0,
1172     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1173 };
1174
1175 const UConverterSharedData _UTF32Data = {
1176     sizeof(UConverterSharedData), ~((uint32_t) 0),
1177     NULL, NULL, &_UTF32StaticData, FALSE, &_UTF32Impl,
1178     0
1179 };
1180
1181 #endif