icuSources/common/ucnv_u8.c

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 2002-2007, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   file name:  ucnv_u8.c
   7 *   encoding:   US-ASCII
   8 *   tab size:   8 (not used)
   9 *   indentation:4
  10 *
  11 *   created on: 2002jul01
  12 *   created by: Markus W. Scherer
  13 *
  14 *   UTF-8 converter implementation. Used to be in ucnv_utf.c.
  15 *
  16 *   Also, CESU-8 implementation, see UTR 26.
  17 *   The CESU-8 converter uses all the same functions as the
  18 *   UTF-8 converter, with a branch for converting supplementary code points.
  19 */
  20
  21 #include "unicode/utypes.h"
  22
  23 #if !UCONFIG_NO_CONVERSION
  24
  25 #include "unicode/ucnv.h"
  26 #include "ucnv_bld.h"
  27 #include "ucnv_cnv.h"
  28 #include "cmemory.h"
  29
  30 /* Prototypes --------------------------------------------------------------- */
  31
  32 /* Keep these here to make finicky compilers happy */
  33
  34 U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
  35                                            UErrorCode *err);
  36 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
  37                                                         UErrorCode *err);
  38
  39
  40 /* UTF-8 -------------------------------------------------------------------- */
  41
  42 /* UTF-8 Conversion DATA
  43  *   for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
  44  */
  45 /*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
  46 #define MAXIMUM_UCS2            0x0000FFFF
  47 #define MAXIMUM_UTF             0x0010FFFF
  48 #define MAXIMUM_UCS4            0x7FFFFFFF
  49 #define HALF_SHIFT              10
  50 #define HALF_BASE               0x0010000
  51 #define HALF_MASK               0x3FF
  52 #define SURROGATE_HIGH_START    0xD800
  53 #define SURROGATE_HIGH_END      0xDBFF
  54 #define SURROGATE_LOW_START     0xDC00
  55 #define SURROGATE_LOW_END       0xDFFF
  56
  57 /* -SURROGATE_LOW_START + HALF_BASE */
  58 #define SURROGATE_LOW_BASE      9216
  59
     /*
      * offsetsFromUTF8[n] is subtracted from the value that the decoders below
      * accumulate over an n-byte sequence with ch = (ch << 6) + nextByte,
      * starting from the raw lead byte. Each entry is the sum of the lead-byte
      * marker bits and the 0x80 marker of every trail byte at their shifted
      * positions, truncated to 32 bits; e.g. for n == 2 it is
      * (0xC0 << 6) + 0x80 == 0x3080.
      * Entries 0 and 1 are zero. The table is indexed with the value taken from
      * bytesFromUTF8[], which can be 0 for a byte that cannot start a sequence.
      */
  60 static const uint32_t offsetsFromUTF8[7] = {0,
  61   (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
  62   (uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080
  63 };
  64
  65 /* END OF UTF-8 Conversion DATA */
  66
     /*
      * Total length in bytes of the sequence introduced by each possible byte
      * value (32 entries per row):
      *   0x00..0x7F -> 1
      *   0x80..0xBF -> 0   (trail bytes)
      *   0xC0..0xDF -> 2
      *   0xE0..0xEF -> 3
      *   0xF0..0xF7 -> 4
      *   0xF8..0xFB -> 5
      *   0xFC..0xFD -> 6
      *   0xFE..0xFF -> 0
      * The decoders in this file report a byte whose entry is 0 as illegal when
      * it appears where a sequence should start.
      */
  67 static const int8_t bytesFromUTF8[256] = {
  68   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  69   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  70   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  71   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  72   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  73   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  74   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  75   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
  76 };
  77
  78 /*
  79  * Starting with Unicode 3.0.1:
  80  * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
  81  * byte sequences with more than 4 bytes are illegal in UTF-8,
  82  * which is tested with impossible values for them
  83  */
  84 static const uint32_t
  85 utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };
  86
     /*
      * Streaming conversion of UTF-8 (or CESU-8) bytes from args->source into
      * UTF-16 code units at args->target; no offsets are written.
      *
      * A sequence that is cut off by the end of the input is saved in the
      * converter: toUnicodeStatus holds the value accumulated so far, mode the
      * expected sequence length and toULength the number of bytes consumed
      * (the bytes themselves are kept in toUBytes). The next call resumes it
      * through the morebytes label, provided the target has room.
      *
      * On return args->source and args->target are advanced past what was
      * consumed and written. *err is set to
      *  - U_ILLEGAL_CHAR_FOUND for a malformed sequence, with toULength set to
      *    the number of bytes consumed for it;
      *  - U_BUFFER_OVERFLOW_ERROR when the target is full while input remains,
      *    or when the trail surrogate of a supplementary code point does not
      *    fit (it is then stored in UCharErrorBuffer).
      */
  87 static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
  88                                   UErrorCode * err)
  89 {
  90     UConverter *cnv = args->converter;
  91     const unsigned char *mySource = (unsigned char *) args->source;
  92     UChar *myTarget = args->target;
  93     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
  94     const UChar *targetLimit = args->targetLimit;
  95     unsigned char *toUBytes = cnv->toUBytes;
  96     UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data);
  97     uint32_t ch, ch2 = 0;
  98     int32_t i, inBytes;
  99
 100     /* Restore size of current sequence */
 101     if (cnv->toUnicodeStatus && myTarget < targetLimit)
 102     {
 103         inBytes = cnv->mode;            /* restore # of bytes to consume */
 104         i = cnv->toULength;             /* restore # of bytes consumed */
 105         cnv->toULength = 0;
 106
 107         ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
 108         cnv->toUnicodeStatus = 0;
             /* continue in the trail-byte loop inside the main loop body below */
 109         goto morebytes;
 110     }
 111
 112
 113     while (mySource < sourceLimit && myTarget < targetLimit)
 114     {
 115         ch = *(mySource++);
 116         if (ch < 0x80)        /* Simple case */
 117         {
 118             *(myTarget++) = (UChar) ch;
 119         }
 120         else
 121         {
 122             /* store the first char */
 123             toUBytes[0] = (char)ch;
 124             inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */
 125             i = 1;
 126
 127 morebytes:
 128             while (i < inBytes)
 129             {
 130                 if (mySource < sourceLimit)
 131                 {
                         /* look at the next byte; it is consumed only if it is a trail byte */
 132                     toUBytes[i] = (char) (ch2 = *mySource);
 133                     if (!UTF8_IS_TRAIL(ch2))
 134                     {
 135                         break; /* i < inBytes */
 136                     }
 137                     ch = (ch << 6) + ch2;
 138                     ++mySource;
 139                     i++;
 140                 }
 141                 else
 142                 {
 143                     /* stores a partially calculated target*/
 144                     cnv->toUnicodeStatus = ch;
 145                     cnv->mode = inBytes;
 146                     cnv->toULength = (int8_t) i;
 147                     goto donefornow;
 148                 }
 149             }
 150
 151             /* Remove the accumulated high bits */
 152             ch -= offsetsFromUTF8[inBytes];
 153
 154             /*
 155              * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
 156              * - use only trail bytes after a lead byte (checked above)
 157              * - use the right number of trail bytes for a given lead byte
 158              * - encode a code point <= U+10ffff
 159              * - use the fewest possible number of bytes for their code points
 160              * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
 161              *
 162              * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
 163              * There are no irregular sequences any more.
 164              * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
 165              */
 166             if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
 167                 (isCESU8 ? i <= 3 : !UTF_IS_SURROGATE(ch)))
 168             {
 169                 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
 170                 if (ch <= MAXIMUM_UCS2)
 171                 {
 172                     /* fits in 16 bits */
 173                     *(myTarget++) = (UChar) ch;
 174                 }
 175                 else
 176                 {
 177                     /* write out the surrogates */
 178                     ch -= HALF_BASE;
 179                     *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
 180                     ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
 181                     if (myTarget < targetLimit)
 182                     {
 183                         *(myTarget++) = (UChar)ch;
 184                     }
 185                     else
 186                     {
 187                         /* Put in overflow buffer (not handled here) */
 188                         cnv->UCharErrorBuffer[0] = (UChar) ch;
 189                         cnv->UCharErrorBufferLength = 1;
 190                         *err = U_BUFFER_OVERFLOW_ERROR;
 191                         break;
 192                     }
 193                 }
 194             }
 195             else
 196             {
                     /* malformed: i bytes were consumed for this sequence and are in toUBytes[0..i-1] */
 197                 cnv->toULength = (int8_t)i;
 198                 *err = U_ILLEGAL_CHAR_FOUND;
 199                 break;
 200             }
 201         }
 202     }
 203
 204 donefornow:
 205     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 206     {
 207         /* End of target buffer */
 208         *err = U_BUFFER_OVERFLOW_ERROR;
 209     }
 210
         /* hand the advanced positions back to the caller */
 211     args->target = myTarget;
 212     args->source = (const char *) mySource;
 213 }
 214
     /*
      * Same conversion as ucnv_toUnicode_UTF8(), but additionally writes one
      * entry to args->offsets for every UChar written to the target.
      *
      * offsetNum starts at 0 on every call, advances by 1 for an ASCII byte and
      * by the sequence length i for a multi-byte sequence; each output UChar
      * (both surrogates of a pair included) gets the offsetNum that was current
      * when its sequence started. No offset is written for a trail surrogate
      * that has to go into UCharErrorBuffer.
      *
      * Partial sequences are saved and resumed through toUnicodeStatus, mode,
      * toULength and toUBytes exactly as in the non-offsets function.
      * Errors: U_ILLEGAL_CHAR_FOUND for a malformed sequence,
      * U_BUFFER_OVERFLOW_ERROR when the target is full.
      *
      * NOTE(review): when a partial sequence is resumed, i also counts bytes
      * consumed by a previous call while offsetNum restarts at 0 - confirm that
      * the resulting offsets are what callers expect.
      */
 215 static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
 216                                                 UErrorCode * err)
 217 {
 218     UConverter *cnv = args->converter;
 219     const unsigned char *mySource = (unsigned char *) args->source;
 220     UChar *myTarget = args->target;
 221     int32_t *myOffsets = args->offsets;
 222     int32_t offsetNum = 0;
 223     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
 224     const UChar *targetLimit = args->targetLimit;
 225     unsigned char *toUBytes = cnv->toUBytes;
 226     UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data);
 227     uint32_t ch, ch2 = 0;
 228     int32_t i, inBytes;
 229
 230     /* Restore size of current sequence */
 231     if (cnv->toUnicodeStatus && myTarget < targetLimit)
 232     {
 233         inBytes = cnv->mode;            /* restore # of bytes to consume */
 234         i = cnv->toULength;             /* restore # of bytes consumed */
 235         cnv->toULength = 0;
 236
 237         ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
 238         cnv->toUnicodeStatus = 0;
             /* continue in the trail-byte loop inside the main loop body below */
 239         goto morebytes;
 240     }
 241
 242     while (mySource < sourceLimit && myTarget < targetLimit)
 243     {
 244         ch = *(mySource++);
 245         if (ch < 0x80)        /* Simple case */
 246         {
 247             *(myTarget++) = (UChar) ch;
 248             *(myOffsets++) = offsetNum++;
 249         }
 250         else
 251         {
                 /* remember the lead byte and look up the expected sequence length */
 252             toUBytes[0] = (char)ch;
 253             inBytes = bytesFromUTF8[ch];
 254             i = 1;
 255
 256 morebytes:
 257             while (i < inBytes)
 258             {
 259                 if (mySource < sourceLimit)
 260                 {
                         /* look at the next byte; it is consumed only if it is a trail byte */
 261                     toUBytes[i] = (char) (ch2 = *mySource);
 262                     if (!UTF8_IS_TRAIL(ch2))
 263                     {
 264                         break; /* i < inBytes */
 265                     }
 266                     ch = (ch << 6) + ch2;
 267                     ++mySource;
 268                     i++;
 269                 }
 270                 else
 271                 {
                         /* input exhausted inside a sequence: save the state for the next call */
 272                     cnv->toUnicodeStatus = ch;
 273                     cnv->mode = inBytes;
 274                     cnv->toULength = (int8_t)i;
 275                     goto donefornow;
 276                 }
 277             }
 278
 279             /* Remove the accumulated high bits */
 280             ch -= offsetsFromUTF8[inBytes];
 281
 282             /*
 283              * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
 284              * - use only trail bytes after a lead byte (checked above)
 285              * - use the right number of trail bytes for a given lead byte
 286              * - encode a code point <= U+10ffff
 287              * - use the fewest possible number of bytes for their code points
 288              * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
 289              *
 290              * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
 291              * There are no irregular sequences any more.
 292              * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
 293              */
 294             if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
 295                 (isCESU8 ? i <= 3 : !UTF_IS_SURROGATE(ch)))
 296             {
 297                 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
 298                 if (ch <= MAXIMUM_UCS2)
 299                 {
 300                     /* fits in 16 bits */
 301                     *(myTarget++) = (UChar) ch;
 302                     *(myOffsets++) = offsetNum;
 303                 }
 304                 else
 305                 {
 306                     /* write out the surrogates */
 307                     ch -= HALF_BASE;
 308                     *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
 309                     *(myOffsets++) = offsetNum;
 310                     ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
 311                     if (myTarget < targetLimit)
 312                     {
 313                         *(myTarget++) = (UChar)ch;
 314                         *(myOffsets++) = offsetNum;
 315                     }
 316                     else
 317                     {
                             /* no room for the trail surrogate: park it in the overflow buffer;
                                the main loop condition then ends the loop because the target is full */
 318                         cnv->UCharErrorBuffer[0] = (UChar) ch;
 319                         cnv->UCharErrorBufferLength = 1;
 320                         *err = U_BUFFER_OVERFLOW_ERROR;
 321                     }
 322                 }
 323                 offsetNum += i;
 324             }
 325             else
 326             {
                     /* malformed: i bytes were consumed for this sequence and are in toUBytes[0..i-1] */
 327                 cnv->toULength = (int8_t)i;
 328                 *err = U_ILLEGAL_CHAR_FOUND;
 329                 break;
 330             }
 331         }
 332     }
 333
 334 donefornow:
 335     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 336     {   /* End of target buffer */
 337         *err = U_BUFFER_OVERFLOW_ERROR;
 338     }
 339
         /* hand the advanced positions back to the caller */
 340     args->target = myTarget;
 341     args->source = (const char *) mySource;
 342     args->offsets = myOffsets;
 343 }
 344
     /*
      * Streaming conversion of UTF-16 code units from args->source into UTF-8
      * bytes at args->target; no offsets are written.
      *
      * For the UTF-8 converter a lead surrogate followed by a trail surrogate is
      * combined into one supplementary code point and written as 4 bytes. For
      * the CESU-8 converter (sharedData == &_CESU8Data) the surrogate test is
      * skipped, so each surrogate goes through the 3-byte branch on its own.
      *
      * A surrogate that is the last unit of the input is stored in
      * cnv->fromUChar32 without an error; the next call resumes at the
      * lowsurrogate label if the target has room. An unpaired surrogate is also
      * stored in cnv->fromUChar32, with *err = U_ILLEGAL_CHAR_FOUND.
      *
      * Output bytes that do not fit into the target are placed in
      * cnv->charErrorBuffer and *err is set to U_BUFFER_OVERFLOW_ERROR.
      * On return args->source and args->target are advanced.
      */
 345 U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
 346                                     UErrorCode * err)
 347 {
 348     UConverter *cnv = args->converter;
 349     const UChar *mySource = args->source;
 350     const UChar *sourceLimit = args->sourceLimit;
 351     uint8_t *myTarget = (uint8_t *) args->target;
 352     const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
 353     uint8_t *tempPtr;
 354     UChar32 ch;
 355     uint8_t tempBuf[4];
 356     int32_t indexToWrite;
 357     UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data);
 358
         /* a surrogate was left pending by a previous call: try to pair it with the new input */
 359     if (cnv->fromUChar32 && myTarget < targetLimit)
 360     {
 361         ch = cnv->fromUChar32;
 362         cnv->fromUChar32 = 0;
 363         goto lowsurrogate;
 364     }
 365
 366     while (mySource < sourceLimit && myTarget < targetLimit)
 367     {
 368         ch = *(mySource++);
 369
 370         if (ch < 0x80)        /* Single byte */
 371         {
 372             *(myTarget++) = (uint8_t) ch;
 373         }
 374         else if (ch < 0x800)  /* Double byte */
 375         {
 376             *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
 377             if (myTarget < targetLimit)
 378             {
 379                 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
 380             }
 381             else
 382             {
                     /* the trail byte does not fit: keep it in the overflow buffer */
 383                 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
 384                 cnv->charErrorBufferLength = 1;
 385                 *err = U_BUFFER_OVERFLOW_ERROR;
 386             }
 387         }
 388         else {
 389             /* Check for surrogates */
 390             if(UTF_IS_SURROGATE(ch) && isNotCESU8) {
 391 lowsurrogate:
 392                 if (mySource < sourceLimit) {
 393                     /* test both code units */
 394                     if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_SECOND_SURROGATE(*mySource)) {
 395                         /* convert and consume this supplementary code point */
 396                         ch=UTF16_GET_PAIR_VALUE(ch, *mySource);
 397                         ++mySource;
 398                         /* exit this condition tree */
 399                     }
 400                     else {
 401                         /* this is an unpaired trail or lead code unit */
 402                         /* callback(illegal) */
 403                         cnv->fromUChar32 = ch;
 404                         *err = U_ILLEGAL_CHAR_FOUND;
 405                         break;
 406                     }
 407                 }
 408                 else {
 409                     /* no more input */
 410                     cnv->fromUChar32 = ch;
 411                     break;
 412                 }
 413             }
 414
 415             /* Do we write the buffer directly for speed,
 416             or do we have to be careful about target buffer space? */
 417             tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
 418
                 /* indexToWrite is the index of the last byte of the sequence (2 or 3) */
 419             if (ch <= MAXIMUM_UCS2) {
 420                 indexToWrite = 2;
 421                 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
 422             }
 423             else {
 424                 indexToWrite = 3;
 425                 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
 426                 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
 427             }
                 /* the last two trail bytes are built the same way for both lengths */
 428             tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
 429             tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
 430
 431             if (tempPtr == myTarget) {
 432                 /* There was enough space to write the codepoint directly. */
 433                 myTarget += (indexToWrite + 1);
 434             }
 435             else {
 436                 /* We might run out of room soon. Write it slowly. */
 437                 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
 438                     if (myTarget < targetLimit) {
 439                         *(myTarget++) = *tempPtr;
 440                     }
 441                     else {
                             /* target full: append the remaining bytes to the overflow buffer */
 442                         cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
 443                         *err = U_BUFFER_OVERFLOW_ERROR;
 444                     }
 445                 }
 446             }
 447         }
 448     }
 449
         /* input left over but no room in the target and no other error reported yet */
 450     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 451     {
 452         *err = U_BUFFER_OVERFLOW_ERROR;
 453     }
 454
 455     args->target = (char *) myTarget;
 456     args->source = mySource;
 457 }
 458
     /*
      * Same conversion as ucnv_fromUnicode_UTF8(), but additionally writes one
      * entry to args->offsets for every byte written to the target.
      *
      * offsetNum is the index, counted from the start of this call's source, of
      * the code unit that starts the character being converted; every output
      * byte of that character gets this value. nextSourceIndex is the value
      * offsetNum takes after a 3- or 4-byte character (one further when a
      * surrogate pair was consumed). When a surrogate pending from a previous
      * call is resumed, offsetNum is -1 for its bytes.
      * No offsets are written for bytes that overflow into charErrorBuffer.
      *
      * Pending/unpaired surrogates and target overflow are handled as in the
      * non-offsets function: cnv->fromUChar32, U_ILLEGAL_CHAR_FOUND and
      * U_BUFFER_OVERFLOW_ERROR.
      */
 459 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
 460                                                   UErrorCode * err)
 461 {
 462     UConverter *cnv = args->converter;
 463     const UChar *mySource = args->source;
 464     int32_t *myOffsets = args->offsets;
 465     const UChar *sourceLimit = args->sourceLimit;
 466     uint8_t *myTarget = (uint8_t *) args->target;
 467     const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
 468     uint8_t *tempPtr;
 469     UChar32 ch;
 470     int32_t offsetNum, nextSourceIndex;
 471     int32_t indexToWrite;
 472     uint8_t tempBuf[4];
 473     UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data);
 474
         /* a surrogate was left pending by a previous call: try to pair it with the new input */
 475     if (cnv->fromUChar32 && myTarget < targetLimit)
 476     {
 477         ch = cnv->fromUChar32;
 478         cnv->fromUChar32 = 0;
             /* the pending unit is not part of this call's source, hence offset -1 */
 479         offsetNum = -1;
 480         nextSourceIndex = 0;
 481         goto lowsurrogate;
 482     } else {
 483         offsetNum = 0;
 484     }
 485
 486     while (mySource < sourceLimit && myTarget < targetLimit)
 487     {
 488         ch = *(mySource++);
 489
 490         if (ch < 0x80)        /* Single byte */
 491         {
 492             *(myOffsets++) = offsetNum++;
 493             *(myTarget++) = (char) ch;
 494         }
 495         else if (ch < 0x800)  /* Double byte */
 496         {
 497             *(myOffsets++) = offsetNum;
 498             *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
 499             if (myTarget < targetLimit)
 500             {
 501                 *(myOffsets++) = offsetNum++;
 502                 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
 503             }
 504             else
 505             {
                     /* the trail byte does not fit: keep it in the overflow buffer */
 506                 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
 507                 cnv->charErrorBufferLength = 1;
 508                 *err = U_BUFFER_OVERFLOW_ERROR;
 509             }
 510         }
 511         else
 512         /* Check for surrogates */
 513         {
 514             nextSourceIndex = offsetNum + 1;
 515
 516             if(UTF_IS_SURROGATE(ch) && isNotCESU8) {
 517 lowsurrogate:
 518                 if (mySource < sourceLimit) {
 519                     /* test both code units */
 520                     if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_SECOND_SURROGATE(*mySource)) {
 521                         /* convert and consume this supplementary code point */
 522                         ch=UTF16_GET_PAIR_VALUE(ch, *mySource);
 523                         ++mySource;
 524                         ++nextSourceIndex;
 525                         /* exit this condition tree */
 526                     }
 527                     else {
 528                         /* this is an unpaired trail or lead code unit */
 529                         /* callback(illegal) */
 530                         cnv->fromUChar32 = ch;
 531                         *err = U_ILLEGAL_CHAR_FOUND;
 532                         break;
 533                     }
 534                 }
 535                 else {
 536                     /* no more input */
 537                     cnv->fromUChar32 = ch;
 538                     break;
 539                 }
 540             }
 541
 542             /* Do we write the buffer directly for speed,
 543             or do we have to be careful about target buffer space? */
 544             tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
 545
                 /* indexToWrite is the index of the last byte of the sequence (2 or 3) */
 546             if (ch <= MAXIMUM_UCS2) {
 547                 indexToWrite = 2;
 548                 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
 549             }
 550             else {
 551                 indexToWrite = 3;
 552                 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
 553                 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
 554             }
                 /* the last two trail bytes are built the same way for both lengths */
 555             tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
 556             tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
 557
 558             if (tempPtr == myTarget) {
 559                 /* There was enough space to write the codepoint directly. */
 560                 myTarget += (indexToWrite + 1);
                     /* one offset per output byte: three always, a fourth for a 4-byte sequence */
 561                 myOffsets[0] = offsetNum;
 562                 myOffsets[1] = offsetNum;
 563                 myOffsets[2] = offsetNum;
 564                 if (indexToWrite >= 3) {
 565                     myOffsets[3] = offsetNum;
 566                 }
 567                 myOffsets += (indexToWrite + 1);
 568             }
 569             else {
 570                 /* We might run out of room soon. Write it slowly. */
 571                 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
 572                     if (myTarget < targetLimit)
 573                     {
 574                         *(myOffsets++) = offsetNum;
 575                         *(myTarget++) = *tempPtr;
 576                     }
 577                     else
 578                     {
                             /* target full: append the remaining bytes to the overflow buffer */
 579                         cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
 580                         *err = U_BUFFER_OVERFLOW_ERROR;
 581                     }
 582                 }
 583             }
 584             offsetNum = nextSourceIndex;
 585         }
 586     }
 587
         /* input left over but no room in the target and no other error reported yet */
 588     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 589     {
 590         *err = U_BUFFER_OVERFLOW_ERROR;
 591     }
 592
 593     args->target = (char *) myTarget;
 594     args->source = mySource;
 595     args->offsets = myOffsets;
 596 }
 597
     /*
      * Reads one UTF-8 sequence from args->source and returns its code point.
      *
      * On success args->source is advanced past the sequence. On failure 0xffff
      * is returned and *err is set to
      *  - U_INDEX_OUTOFBOUNDS_ERROR when there is no input at all;
      *  - U_TRUNCATED_CHAR_FOUND when the input ends inside a sequence whose
      *    remaining bytes are all trail bytes;
      *  - U_ILLEGAL_CHAR_FOUND for a byte that cannot start a sequence, a
      *    non-trail byte inside a sequence, a non-shortest form, a surrogate,
      *    or a value above U+10FFFF.
      * In the truncated and illegal cases the bytes consumed so far are stored
      * in cnv->toUBytes with their count in cnv->toULength, and args->source
      * points behind them.
      *
      * The code point is accumulated in an unsigned 32-bit variable: the 5- and
      * 6-byte lead bytes still listed in bytesFromUTF8[] push bits beyond bit 31
      * while shifting, which is undefined behavior for a signed UChar32 but
      * well-defined (modulo 2^32, matching offsetsFromUTF8[]) for uint32_t.
      */
 598 static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
 599                                                UErrorCode *err) {
 600     UConverter *cnv;
 601     const uint8_t *sourceInitial;
 602     const uint8_t *source;
 603     uint16_t extraBytesToWrite;
 604     uint8_t myByte;
 605     uint32_t ch; /* unsigned accumulator, see the function comment */
 606     int8_t i, isLegalSequence;
 607
 608     /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
 609
 610     cnv = args->converter;
 611     sourceInitial = source = (const uint8_t *)args->source;
 612     if (source >= (const uint8_t *)args->sourceLimit)
 613     {
 614         /* no input */
 615         *err = U_INDEX_OUTOFBOUNDS_ERROR;
 616         return 0xffff;
 617     }
 618
 619     myByte = (uint8_t)*(source++);
 620     if (myByte < 0x80)
 621     {
 622         args->source = (const char *)source;
 623         return (UChar32)myByte;
 624     }
 625
         /* despite its name this is the total sequence length, lead byte included */
 626     extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
 627     if (extraBytesToWrite == 0) {
 628         cnv->toUBytes[0] = myByte;
 629         cnv->toULength = 1;
 630         *err = U_ILLEGAL_CHAR_FOUND;
 631         args->source = (const char *)source;
 632         return 0xffff;
 633     }
 634
 635     /*The byte sequence is longer than the buffer area passed*/
         /* compare lengths instead of forming a pointer that may lie beyond the end of the buffer */
 636     if ((args->sourceLimit - (const char *)source) < (extraBytesToWrite - 1))
 637     {
 638         /* check if all of the remaining bytes are trail bytes */
 639         cnv->toUBytes[0] = myByte;
 640         i = 1;
 641         *err = U_TRUNCATED_CHAR_FOUND;
 642         while(source < (const uint8_t *)args->sourceLimit) {
 643             if(U8_IS_TRAIL(myByte = *source)) {
 644                 cnv->toUBytes[i++] = myByte;
 645                 ++source;
 646             } else {
 647                 /* error even before we run out of input */
 648                 *err = U_ILLEGAL_CHAR_FOUND;
 649                 break;
 650             }
 651         }
 652         cnv->toULength = i;
 653         args->source = (const char *)source;
 654         return 0xffff;
 655     }
 656
 657     isLegalSequence = 1;
 658     ch = myByte << 6;
 659     switch(extraBytesToWrite)
 660     {
 661       /* note: code falls through cases! (sic)*/
 662     case 6:
 663         ch += (myByte = *source);
 664         ch <<= 6;
 665         if (!UTF8_IS_TRAIL(myByte))
 666         {
 667             isLegalSequence = 0;
 668             break;
 669         }
 670         ++source; /* fall through */
 671     case 5:
 672         ch += (myByte = *source);
 673         ch <<= 6;
 674         if (!UTF8_IS_TRAIL(myByte))
 675         {
 676             isLegalSequence = 0;
 677             break;
 678         }
 679         ++source; /* fall through */
 680     case 4:
 681         ch += (myByte = *source);
 682         ch <<= 6;
 683         if (!UTF8_IS_TRAIL(myByte))
 684         {
 685             isLegalSequence = 0;
 686             break;
 687         }
 688         ++source; /* fall through */
 689     case 3:
 690         ch += (myByte = *source);
 691         ch <<= 6;
 692         if (!UTF8_IS_TRAIL(myByte))
 693         {
 694             isLegalSequence = 0;
 695             break;
 696         }
 697         ++source; /* fall through */
 698     case 2:
 699         ch += (myByte = *source);
 700         if (!UTF8_IS_TRAIL(myByte))
 701         {
 702             isLegalSequence = 0;
 703             break;
 704         }
 705         ++source;
 706     };
         /* remove the lead/trail marker bits that were accumulated along with the payload */
 707     ch -= offsetsFromUTF8[extraBytesToWrite];
 708     args->source = (const char *)source;
 709
 710     /*
 711      * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
 712      * - use only trail bytes after a lead byte (checked above)
 713      * - use the right number of trail bytes for a given lead byte
 714      * - encode a code point <= U+10ffff
 715      * - use the fewest possible number of bytes for their code points
 716      * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
 717      *
 718      * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
 719      * There are no irregular sequences any more.
 720      */
 721     if (isLegalSequence &&
 722         (uint32_t)ch <= MAXIMUM_UTF &&
 723         (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
 724         !U_IS_SURROGATE(ch)
 725     ) {
 726         return (UChar32)ch; /* return the code point */
 727     }
 728
         /* illegal: hand the bytes consumed for this sequence to the caller */
 729     for(i = 0; sourceInitial < source; ++i) {
 730         cnv->toUBytes[i] = *sourceInitial++;
 731     }
 732     cnv->toULength = i;
 733     *err = U_ILLEGAL_CHAR_FOUND;
 734     return 0xffff;
 735 }
 736
 737 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
 738
 739 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
 740 static const UChar32
 741 utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
 742
 743 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
     /* The array is declared with 7 elements but only n=0..4 are listed; elements 5 and 6 are implicitly zero. */
 744 static const UChar32
 745 utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
 746
 747 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
 748 static void
 749 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
 750                   UConverterToUnicodeArgs *pToUArgs,
 751                   UErrorCode *pErrorCode) {
 752     UConverter *utf8, *cnv;
 753     const uint8_t *source, *sourceLimit;
 754     uint8_t *target;
 755     int32_t targetCapacity;
 756     int32_t count;
 757
 758     int8_t oldToULength, toULength, toULimit;
 759
 760     UChar32 c;
 761     uint8_t b, t1, t2;
 762
 763     /* set up the local pointers */
 764     utf8=pToUArgs->converter;
 765     cnv=pFromUArgs->converter;
 766     source=(uint8_t *)pToUArgs->source;
 767     sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
 768     target=(uint8_t *)pFromUArgs->target;
 769     targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
 770
 771     /* get the converter state from the UTF-8 UConverter */
 772     c=(UChar32)utf8->toUnicodeStatus;
 773     if(c!=0) {
 774         toULength=oldToULength=utf8->toULength;
 775         toULimit=(int8_t)utf8->mode;
 776     } else {
 777         toULength=oldToULength=toULimit=0;
 778     }
 779
 780     count=(int32_t)(sourceLimit-source)+oldToULength;
 781     if(count<toULimit) {
 782         /*
 783          * Not enough input to complete the partial character.
 784          * Jump to moreBytes below - it will not output to target.
 785          */
 786     } else if(targetCapacity<toULimit) {
 787         /*
 788          * Not enough target capacity to output the partial character.
 789          * Let the standard converter handle this.
 790          */
 791         *pErrorCode=U_USING_DEFAULT_WARNING;
 792         return;
 793     } else {
 794         /*
 795          * Use a single counter for source and target, counting the minimum of
 796          * the source length and the target capacity.
 797          * As a result, the source length is checked only once per multi-byte
 798          * character instead of twice.
 799          *
 800          * Make sure that the last byte sequence is complete, or else
 801          * stop just before it.
 802          * (The longest legal byte sequence has 3 trail bytes.)
 803          * Count oldToULength (number of source bytes from a previous buffer)
 804          * into the source length but reduce the source index by toULimit
 805          * while going back over trail bytes in order to not go back into
 806          * the bytes that will be read for finishing a partial
 807          * sequence from the previous buffer.
 808          * Let the standard converter handle edge cases.
 809          */
 810         int32_t i;
 811
 812         if(count>targetCapacity) {
 813             count=targetCapacity;
 814         }
 815
 816         i=0;
 817         while(i<3 && i<(count-toULimit)) {
 818             b=source[count-oldToULength-i-1];
 819             if(U8_IS_TRAIL(b)) {
 820                 ++i;
 821             } else {
 822                 if(i<utf8_countTrailBytes[b]) {
 823                     /* stop converting before the lead byte if there are not enough trail bytes for it */
 824                     count-=i+1;
 825                 }
 826                 break;
 827             }
 828         }
 829     }
 830
 831     if(c!=0) {
 832         utf8->toUnicodeStatus=0;
 833         utf8->toULength=0;
 834         goto moreBytes;
 835         /* See note in ucnv_SBCSFromUTF8() about this goto. */
 836     }
 837
 838     /* conversion loop */
 839     while(count>0) {
 840         b=*source++;
 841         if((int8_t)b>=0) {
 842             /* convert ASCII */
 843             *target++=b;
 844             --count;
 845             continue;
 846         } else {
 847             if(b>0xe0) {
 848                 if( /* handle U+1000..U+D7FF inline */
 849                     (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) ||
 850                                                (b==0xed && (t1 <= 0x9f))) &&
 851                     (t2=source[1]) >= 0x80 && t2 <= 0xbf
 852                 ) {
 853                     source+=2;
 854                     *target++=b;
 855                     *target++=t1;
 856                     *target++=t2;
 857                     count-=3;
 858                     continue;
 859                 }
 860             } else if(b<0xe0) {
 861                 if( /* handle U+0080..U+07FF inline */
 862                     b>=0xc2 &&
 863                     (t1=*source) >= 0x80 && t1 <= 0xbf
 864                 ) {
 865                     ++source;
 866                     *target++=b;
 867                     *target++=t1;
 868                     count-=2;
 869                     continue;
 870                 }
 871             } else if(b==0xe0) {
 872                 if( /* handle U+0800..U+0FFF inline */
 873                     (t1=source[0]) >= 0xa0 && t1 <= 0xbf &&
 874                     (t2=source[1]) >= 0x80 && t2 <= 0xbf
 875                 ) {
 876                     source+=2;
 877                     *target++=b;
 878                     *target++=t1;
 879                     *target++=t2;
 880                     count-=3;
 881                     continue;
 882                 }
 883             }
 884
 885             /* handle "complicated" and error cases, and continuing partial characters */
 886             oldToULength=0;
 887             toULength=1;
 888             toULimit=utf8_countTrailBytes[b]+1;
 889             c=b;
 890 moreBytes:
 891             while(toULength<toULimit) {
 892                 if(source<sourceLimit) {
 893                     b=*source;
 894                     if(U8_IS_TRAIL(b)) {
 895                         ++source;
 896                         ++toULength;
 897                         c=(c<<6)+b;
 898                     } else {
 899                         break; /* sequence too short, stop with toULength<toULimit */
 900                     }
 901                 } else {
 902                     /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
 903                     source-=(toULength-oldToULength);
 904                     while(oldToULength<toULength) {
 905                         utf8->toUBytes[oldToULength++]=*source++;
 906                     }
 907                     utf8->toUnicodeStatus=c;
 908                     utf8->toULength=toULength;
 909                     utf8->mode=toULimit;
 910                     pToUArgs->source=(char *)source;
 911                     pFromUArgs->target=(char *)target;
 912                     return;
 913                 }
 914             }
 915
 916             if( toULength==toULimit &&      /* consumed all trail bytes */
 917                 (toULength==3 || toULength==2) &&             /* BMP */
 918                 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
 919                 (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
 920             ) {
 921                 /* legal byte sequence for BMP code point */
 922             } else if(
 923                 toULength==toULimit && toULength==4 &&
 924                 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
 925             ) {
 926                 /* legal byte sequence for supplementary code point */
 927             } else {
 928                 /* error handling: illegal UTF-8 byte sequence */
 929                 source-=(toULength-oldToULength);
 930                 while(oldToULength<toULength) {
 931                     utf8->toUBytes[oldToULength++]=*source++;
 932                 }
 933                 utf8->toULength=toULength;
 934                 pToUArgs->source=(char *)source;
 935                 pFromUArgs->target=(char *)target;
 936                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 937                 return;
 938             }
 939
 940             /* copy the legal byte sequence to the target */
 941             {
 942                 int8_t i;
 943
 944                 for(i=0; i<oldToULength; ++i) {
 945                     *target++=utf8->toUBytes[i];
 946                 }
 947                 source-=(toULength-oldToULength);
 948                 for(; i<toULength; ++i) {
 949                     *target++=*source++;
 950                 }
 951                 count-=toULength;
 952             }
 953         }
 954     }
 955
 956     if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
 957         if(target==(const uint8_t *)pFromUArgs->targetLimit) {
 958             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 959         } else {
 960             b=*source;
 961             toULimit=utf8_countTrailBytes[b]+1;
 962             if(toULimit>(sourceLimit-source)) {
 963                 /* collect a truncated byte sequence */
 964                 toULength=0;
 965                 c=b;
 966                 for(;;) {
 967                     utf8->toUBytes[toULength++]=b;
 968                     if(++source==sourceLimit) {
 969                         /* partial byte sequence at end of source */
 970                         utf8->toUnicodeStatus=c;
 971                         utf8->toULength=toULength;
 972                         utf8->mode=toULimit;
 973                         break;
 974                     } else if(!U8_IS_TRAIL(b=*source)) {
 975                         /* lead byte in trail byte position */
 976                         utf8->toULength=toULength;
 977                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 978                         break;
 979                     }
 980                     c=(c<<6)+b;
 981                 }
 982             } else {
 983                 /* partial-sequence target overflow: fall back to the pivoting implementation */
 984                 *pErrorCode=U_USING_DEFAULT_WARNING;
 985             }
 986         }
 987     }
 988
 989     /* write back the updated pointers */
 990     pToUArgs->source=(char *)source;
 991     pFromUArgs->target=(char *)target;
 992 }
 993
 994 /* UTF-8 converter data ----------------------------------------------------- */
 995
 996 static const UConverterImpl _UTF8Impl={ /* function table for the UTF-8 converter; initializers are positional, so their order must match the UConverterImpl declaration */
 997     UCNV_UTF8, /* converter type tag */
 998     /* next two slots are left NULL; presumably load/unload hooks -- confirm against UConverterImpl */
 999     NULL,
1000     NULL,
1001     /* next three slots are left NULL; presumably open/close/reset hooks -- confirm against UConverterImpl */
1002     NULL,
1003     NULL,
1004     NULL,
1005     /* conversion entry points: to-Unicode and from-Unicode, each in a plain and an OFFSETS_LOGIC variant, then the next-UChar function */
1006     ucnv_toUnicode_UTF8,
1007     ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
1008     ucnv_fromUnicode_UTF8,
1009     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
1010     ucnv_getNextUChar_UTF8,
1011     /* four optional slots left NULL, followed by the Unicode-set function */
1012     NULL,
1013     NULL,
1014     NULL,
1015     NULL,
1016     ucnv_getNonSurrogateUnicodeSet, /* by its name, reports the set without surrogates; the CESU-8 table uses the "complete" variant instead */
1017     /* the same UTF-8-to-UTF-8 routine fills both final slots; presumably the toUTF8 and fromUTF8 shortcuts -- confirm */
1018     ucnv_UTF8FromUTF8,
1019     ucnv_UTF8FromUTF8
1020 };
1021
1022 /* CCSID 1208 refers to UTF-8 for any version of Unicode */
1023 static const UConverterStaticData _UTF8StaticData={ /* constant descriptive data for the UTF-8 converter; positional initializers */
1024     sizeof(UConverterStaticData), /* size of this structure */
1025     "UTF-8", /* converter name */
1026     1208, UCNV_IBM, UCNV_UTF8, /* CCSID 1208, then (presumably) the platform that defines that CCSID, then the converter type */
1027     1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
1028     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, /* EF BF BD is U+FFFD in UTF-8; the 3 is presumably its byte length and the sequence is presumably the substitution character -- confirm; meaning of the two FALSE flags is not visible here */
1029     0,
1030     0,
1031     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1032 };
1033
1034
1035 const UConverterSharedData _UTF8Data={ /* non-static shared-data object that ties the UTF-8 static data and function table together */
1036     sizeof(UConverterSharedData), ~((uint32_t) 0), /* structure size, then an all-ones 32-bit value (presumably a reference counter marking the object as permanent -- confirm) */
1037     NULL, NULL, &_UTF8StaticData, FALSE, &_UTF8Impl, /* two unused pointers, the static data above, a FALSE flag, and the function table above */
1038     0
1039 };
1040
1041 /* CESU-8 converter data ---------------------------------------------------- */
1042
1043 static const UConverterImpl _CESU8Impl={ /* function table for the CESU-8 converter; positional initializers, reusing the UTF-8 conversion functions */
1044     UCNV_CESU8, /* converter type tag */
1045     /* next two slots are left NULL; presumably load/unload hooks -- confirm against UConverterImpl */
1046     NULL,
1047     NULL,
1048     /* next three slots are left NULL; presumably open/close/reset hooks -- confirm against UConverterImpl */
1049     NULL,
1050     NULL,
1051     NULL,
1052     /* the same four conversion functions as the UTF-8 table; per the file header they branch internally for supplementary code points */
1053     ucnv_toUnicode_UTF8,
1054     ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
1055     ucnv_fromUnicode_UTF8,
1056     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
1057     NULL, /* unlike the UTF-8 table, no next-UChar function is supplied in this slot */
1058     /* four optional slots left NULL, followed by the Unicode-set function */
1059     NULL,
1060     NULL,
1061     NULL,
1062     NULL,
1063     ucnv_getCompleteUnicodeSet /* last explicit initializer; the two trailing slots that the UTF-8 table fills are omitted here and are therefore zero-initialized */
1064 };
1065
1066 static const UConverterStaticData _CESU8StaticData={ /* constant descriptive data for the CESU-8 converter; positional initializers */
1067     sizeof(UConverterStaticData), /* size of this structure */
1068     "CESU-8", /* converter name */
1069     9400, /* CCSID for CESU-8 */
1070     UCNV_UNKNOWN, UCNV_CESU8, 1, 3, /* platform given as unknown, then the converter type, then the same 1, 3 pair as the UTF-8 table (presumably min/max bytes per UChar -- confirm) */
1071     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, /* EF BF BD is U+FFFD in UTF-8; the 3 is presumably its byte length and the sequence is presumably the substitution character -- confirm; meaning of the two FALSE flags is not visible here */
1072     0,
1073     0,
1074     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1075 };
1076
1077
1078 const UConverterSharedData _CESU8Data={ /* non-static shared-data object that ties the CESU-8 static data and function table together */
1079     sizeof(UConverterSharedData), ~((uint32_t) 0), /* structure size, then an all-ones 32-bit value (presumably a reference counter marking the object as permanent -- confirm) */
1080     NULL, NULL, &_CESU8StaticData, FALSE, &_CESU8Impl, /* two unused pointers, the static data above, a FALSE flag, and the function table above */
1081     0
1082 };
1083
1084 #endif