icuSources/common/ucnv_u8.c

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 2002-2016, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   file name:  ucnv_u8.c
   7 *   encoding:   US-ASCII
   8 *   tab size:   8 (not used)
   9 *   indentation:4
  10 *
  11 *   created on: 2002jul01
  12 *   created by: Markus W. Scherer
  13 *
  14 *   UTF-8 converter implementation. Used to be in ucnv_utf.c.
  15 *
  16 *   Also, CESU-8 implementation, see UTR 26.
  17 *   The CESU-8 converter uses all the same functions as the
  18 *   UTF-8 converter, with a branch for converting supplementary code points.
  19 */
  20
  21 #include "unicode/utypes.h"
  22
  23 #if !UCONFIG_NO_CONVERSION
  24
  25 #include "unicode/ucnv.h"
  26 #include "unicode/utf.h"
  27 #include "unicode/utf8.h"
  28 #include "unicode/utf16.h"
  29 #include "ucnv_bld.h"
  30 #include "ucnv_cnv.h"
  31 #include "cmemory.h"
  32
  33 /* Prototypes --------------------------------------------------------------- */
  34
  35 /* Keep these here to make finicky compilers happy */
  36
  37 U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
  38                                            UErrorCode *err);
  39 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
  40                                                         UErrorCode *err);
  41
  42
  43 /* UTF-8 -------------------------------------------------------------------- */
  44
  45 /* UTF-8 Conversion DATA
  46  *   for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
  47  */
  48 /*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
     /*
      * Code point limits.
      * MAXIMUM_UCS2 is the largest value written as a single UTF-16 code unit,
      * MAXIMUM_UTF is the largest code point accepted by the legality checks below.
      * MAXIMUM_UCS4 is not referenced in the part of the file reviewed here.
      */
  49 #define MAXIMUM_UCS2            0x0000FFFF
  50 #define MAXIMUM_UTF             0x0010FFFF
  51 #define MAXIMUM_UCS4            0x7FFFFFFF
     /*
      * Surrogate pair arithmetic: a supplementary code point has HALF_BASE
      * subtracted, then its upper bits (value >> HALF_SHIFT) are added to
      * SURROGATE_HIGH_START and its low bits (value & HALF_MASK) are added to
      * SURROGATE_LOW_START.
      */
  52 #define HALF_SHIFT              10
  53 #define HALF_BASE               0x0010000
  54 #define HALF_MASK               0x3FF
     /* First and last code units of the lead (high) and trail (low) surrogate ranges. */
  55 #define SURROGATE_HIGH_START    0xD800
  56 #define SURROGATE_HIGH_END      0xDBFF
  57 #define SURROGATE_LOW_START     0xDC00
  58 #define SURROGATE_LOW_END       0xDFFF
  59
  60 /* -SURROGATE_LOW_START + HALF_BASE */
     /* i.e. 0x10000 - 0xDC00 = 0x2400 = 9216 */
  61 #define SURROGATE_LOW_BASE      9216
  62
     /*
      * Indexed by the total length (1..6) of a UTF-8 byte sequence; index 0 is unused filler.
      *
      * The decoders accumulate a sequence as ((lead<<6)+trail)<<6+trail... without
      * masking off the marker bits of the lead and trail bytes. Each entry is the
      * sum of those marker bits in their final positions, e.g. for length 2:
      * (0xC0<<6)+0x80 = 0x3080, for length 3: (0xE0<<12)+(0x80<<6)+0x80 = 0xE2080.
      * Subtracting the entry from the accumulated value leaves the code point.
      * The entries for lengths 5 and 6 are those sums reduced modulo 2^32.
      */
  63 static const uint32_t offsetsFromUTF8[7] = {0,
  64   (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
  65   (uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080
  66 };
  67
  68 /* END OF UTF-8 Conversion DATA */
  69
     /*
      * Indexed by the first byte of a sequence: total number of bytes in the
      * sequence (including that first byte), or 0 if the byte cannot start one.
      *
      * Each row below covers 32 byte values:
      *   0x00..0x7F  1     single byte
      *   0x80..0xBF  0     trail bytes, not valid as a first byte
      *   0xC0..0xDF  2
      *   0xE0..0xEF  3
      *   0xF0..0xF7  4
      *   0xF8..0xFB  5     length is recorded here; such sequences are rejected
      *   0xFC..0xFD  6     later by the utf8_minChar32[] test
      *   0xFE..0xFF  0
      */
  70 static const int8_t bytesFromUTF8[256] = {
  71   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  72   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  73   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  74   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  75   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  76   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  77   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  78   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
  79 };
  80
  81 /*
  82  * Starting with Unicode 3.0.1:
  83  * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
  84  * byte sequences with more than 4 bytes are illegal in UTF-8,
  85  * which is tested with impossible values for them
      *
      * The table is indexed by sequence length (0..6). The decoders test
      * "ch <= MAXIMUM_UTF && ch >= utf8_minChar32[length]"; because the entries
      * for lengths 5 and 6 are 0xffffffff, that combined test can never succeed
      * for such sequences.
  86  */
  87 static const uint32_t
  88 utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };
  89
  90 static UBool hasCESU8Data(const UConverter *cnv)
  91 {
  92 #if UCONFIG_ONLY_HTML_CONVERSION
  93     return FALSE;
  94 #else
  95     return (UBool)(cnv->sharedData == &_CESU8Data);
  96 #endif
  97 }
  98
     /*
      * Converts UTF-8 bytes (or CESU-8 bytes, when hasCESU8Data() is true for the
      * converter) from args->source into UTF-16 code units at args->target.
      * This variant does not write offsets.
      *
      * The loop runs while there is input and target space left, and stops early
      * on an illegal sequence. On return args->source and args->target are
      * advanced past what was consumed and produced.
      *
      * Converter state written here:
      * - toUnicodeStatus / mode / toULength: the partially accumulated value, the
      *   expected sequence length and the number of bytes already consumed, saved
      *   when the input ends inside a multi-byte sequence and restored at the top
      *   of the next call.
      * - toUBytes / toULength: the bytes of the current sequence; toULength is set
      *   when *err becomes U_ILLEGAL_CHAR_FOUND.
      * - UCharErrorBuffer / UCharErrorBufferLength: a trail surrogate that did not
      *   fit into the target.
      *
      * *err is set to U_BUFFER_OVERFLOW_ERROR when the target is full while input
      * remains, or when a trail surrogate had to be put into UCharErrorBuffer.
      */
  99 static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
 100                                   UErrorCode * err)
 101 {
 102     UConverter *cnv = args->converter;
 103     const unsigned char *mySource = (unsigned char *) args->source;
 104     UChar *myTarget = args->target;
 105     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
 106     const UChar *targetLimit = args->targetLimit;
 107     unsigned char *toUBytes = cnv->toUBytes;
 108     UBool isCESU8 = hasCESU8Data(cnv);
 109     uint32_t ch, ch2 = 0;
 110     int32_t i, inBytes;
 111
 112     /* Restore size of current sequence */
 113     if (cnv->toUnicodeStatus && myTarget < targetLimit)
 114     {
 115         inBytes = cnv->mode;            /* restore # of bytes to consume */
 116         i = cnv->toULength;             /* restore # of bytes consumed */
 117         cnv->toULength = 0;
 118
 119         ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
 120         cnv->toUnicodeStatus = 0;
             /* continue collecting trail bytes inside the loop body below */
 121         goto morebytes;
 122     }
 123
 124
 125     while (mySource < sourceLimit && myTarget < targetLimit)
 126     {
 127         ch = *(mySource++);
 128         if (ch < 0x80)        /* Simple case */
 129         {
 130             *(myTarget++) = (UChar) ch;
 131         }
 132         else
 133         {
 134             /* store the first char */
 135             toUBytes[0] = (char)ch;
 136             inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */
                 /*
                  * inBytes is 0 for a byte that cannot start a sequence; then no
                  * trail bytes are read and the i == inBytes test below fails,
                  * so the byte is reported as illegal.
                  */
 137             i = 1;
 138
 139 morebytes:
 140             while (i < inBytes)
 141             {
 142                 if (mySource < sourceLimit)
 143                 {
 144                     toUBytes[i] = (char) (ch2 = *mySource);
 145                     if (!U8_IS_TRAIL(ch2))
 146                     {
                             /* the non-trail byte is not consumed: mySource is not advanced */
 147                         break; /* i < inBytes */
 148                     }
 149                     ch = (ch << 6) + ch2;
 150                     ++mySource;
 151                     i++;
 152                 }
 153                 else
 154                 {
 155                     /* stores a partially calculated target*/
 156                     cnv->toUnicodeStatus = ch;
 157                     cnv->mode = inBytes;
 158                     cnv->toULength = (int8_t) i;
 159                     goto donefornow;
 160                 }
 161             }
 162
 163             /* Remove the accumulated high bits */
 164             ch -= offsetsFromUTF8[inBytes];
 165
 166             /*
 167              * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
 168              * - use only trail bytes after a lead byte (checked above)
 169              * - use the right number of trail bytes for a given lead byte
 170              * - encode a code point <= U+10ffff
 171              * - use the fewest possible number of bytes for their code points
 172              * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
 173              *
 174              * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
 175              * There are no irregular sequences any more.
 176              * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
 177              */
 178             if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
 179                 (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
 180             {
 181                 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
 182                 if (ch <= MAXIMUM_UCS2)
 183                 {
 184                     /* fits in 16 bits */
 185                     *(myTarget++) = (UChar) ch;
 186                 }
 187                 else
 188                 {
 189                     /* write out the surrogates */
 190                     ch -= HALF_BASE;
 191                     *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
 192                     ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
 193                     if (myTarget < targetLimit)
 194                     {
 195                         *(myTarget++) = (UChar)ch;
 196                     }
 197                     else
 198                     {
 199                         /* Put in overflow buffer (not handled here) */
 200                         cnv->UCharErrorBuffer[0] = (UChar) ch;
 201                         cnv->UCharErrorBufferLength = 1;
 202                         *err = U_BUFFER_OVERFLOW_ERROR;
 203                         break;
 204                     }
 205                 }
 206             }
 207             else
 208             {
                     /* toUBytes[0..i-1] hold the bytes of the rejected sequence */
 209                 cnv->toULength = (int8_t)i;
 210                 *err = U_ILLEGAL_CHAR_FOUND;
 211                 break;
 212             }
 213         }
 214     }
 215
 216 donefornow:
         /* Report overflow only if input is left over and no other error was set. */
 217     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 218     {
 219         /* End of target buffer */
 220         *err = U_BUFFER_OVERFLOW_ERROR;
 221     }
 222
 223     args->target = myTarget;
 224     args->source = (const char *) mySource;
 225 }
 226
     /*
      * Same conversion as ucnv_toUnicode_UTF8 (UTF-8 or CESU-8 bytes to UTF-16),
      * but additionally writes one entry to args->offsets for every UChar written
      * to the target: the value of offsetNum, a byte counter that starts at 0 on
      * each call, at the start of the sequence that produced that UChar.
      *
      * On return args->source, args->target and args->offsets are advanced.
      * Partial-sequence state (toUnicodeStatus, mode, toULength), the illegal
      * sequence bytes (toUBytes, toULength) and an overflowing trail surrogate
      * (UCharErrorBuffer) are stored in the converter. No offset is written for a
      * trail surrogate that goes to UCharErrorBuffer.
      *
      * NOTE(review): when a sequence is resumed from a previous call, offsetNum
      * starts at 0 and is then advanced by i, which also counts the bytes
      * consumed in the previous call - confirm that the resulting offsets for the
      * following characters are what callers expect.
      */
 227 static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
 228                                                 UErrorCode * err)
 229 {
 230     UConverter *cnv = args->converter;
 231     const unsigned char *mySource = (unsigned char *) args->source;
 232     UChar *myTarget = args->target;
 233     int32_t *myOffsets = args->offsets;
 234     int32_t offsetNum = 0;
 235     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
 236     const UChar *targetLimit = args->targetLimit;
 237     unsigned char *toUBytes = cnv->toUBytes;
 238     UBool isCESU8 = hasCESU8Data(cnv);
 239     uint32_t ch, ch2 = 0;
 240     int32_t i, inBytes;
 241
 242     /* Restore size of current sequence */
 243     if (cnv->toUnicodeStatus && myTarget < targetLimit)
 244     {
 245         inBytes = cnv->mode;            /* restore # of bytes to consume */
 246         i = cnv->toULength;             /* restore # of bytes consumed */
 247         cnv->toULength = 0;
 248
 249         ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
 250         cnv->toUnicodeStatus = 0;
 251         goto morebytes;
 252     }
 253
 254     while (mySource < sourceLimit && myTarget < targetLimit)
 255     {
 256         ch = *(mySource++);
 257         if (ch < 0x80)        /* Simple case */
 258         {
 259             *(myTarget++) = (UChar) ch;
 260             *(myOffsets++) = offsetNum++;
 261         }
 262         else
 263         {
                 /* remember the first byte and look up the total sequence length */
 264             toUBytes[0] = (char)ch;
 265             inBytes = bytesFromUTF8[ch];
 266             i = 1;
 267
 268 morebytes:
 269             while (i < inBytes)
 270             {
 271                 if (mySource < sourceLimit)
 272                 {
 273                     toUBytes[i] = (char) (ch2 = *mySource);
 274                     if (!U8_IS_TRAIL(ch2))
 275                     {
 276                         break; /* i < inBytes */
 277                     }
 278                     ch = (ch << 6) + ch2;
 279                     ++mySource;
 280                     i++;
 281                 }
 282                 else
 283                 {
                         /* input ended inside the sequence: save the state for the next call */
 284                     cnv->toUnicodeStatus = ch;
 285                     cnv->mode = inBytes;
 286                     cnv->toULength = (int8_t)i;
 287                     goto donefornow;
 288                 }
 289             }
 290
 291             /* Remove the accumulated high bits */
 292             ch -= offsetsFromUTF8[inBytes];
 293
 294             /*
 295              * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
 296              * - use only trail bytes after a lead byte (checked above)
 297              * - use the right number of trail bytes for a given lead byte
 298              * - encode a code point <= U+10ffff
 299              * - use the fewest possible number of bytes for their code points
 300              * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
 301              *
 302              * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
 303              * There are no irregular sequences any more.
 304              * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
 305              */
 306             if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
 307                 (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
 308             {
 309                 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
 310                 if (ch <= MAXIMUM_UCS2)
 311                 {
 312                     /* fits in 16 bits */
 313                     *(myTarget++) = (UChar) ch;
 314                     *(myOffsets++) = offsetNum;
 315                 }
 316                 else
 317                 {
 318                     /* write out the surrogates */
 319                     ch -= HALF_BASE;
 320                     *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
 321                     *(myOffsets++) = offsetNum;
 322                     ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
 323                     if (myTarget < targetLimit)
 324                     {
 325                         *(myTarget++) = (UChar)ch;
 326                         *(myOffsets++) = offsetNum;
 327                     }
 328                     else
 329                     {
                             /*
                              * No break here: the target is full, so the loop
                              * condition ends the loop after offsetNum is updated.
                              */
 330                         cnv->UCharErrorBuffer[0] = (UChar) ch;
 331                         cnv->UCharErrorBufferLength = 1;
 332                         *err = U_BUFFER_OVERFLOW_ERROR;
 333                     }
 334                 }
 335                 offsetNum += i;
 336             }
 337             else
 338             {
 339                 cnv->toULength = (int8_t)i;
 340                 *err = U_ILLEGAL_CHAR_FOUND;
 341                 break;
 342             }
 343         }
 344     }
 345
 346 donefornow:
 347     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 348     {   /* End of target buffer */
 349         *err = U_BUFFER_OVERFLOW_ERROR;
 350     }
 351
 352     args->target = myTarget;
 353     args->source = (const char *) mySource;
 354     args->offsets = myOffsets;
 355 }
 356
     /*
      * Converts UTF-16 code units from args->source into UTF-8 bytes (or CESU-8
      * bytes, when hasCESU8Data() is true for the converter) at args->target.
      * This variant does not write offsets.
      *
      * For UTF-8, a lead surrogate followed by a trail surrogate is combined into
      * one supplementary code point and written as 4 bytes. Any other surrogate
      * is stored in cnv->fromUChar32 and reported with U_ILLEGAL_CHAR_FOUND. A
      * surrogate that is the last unit of the input is stored in
      * cnv->fromUChar32 without an error and picked up by the next call.
      * For CESU-8 the surrogate test is skipped, so each surrogate code unit is
      * written as its own 3-byte sequence.
      *
      * Bytes that do not fit into the target are stored in cnv->charErrorBuffer
      * and *err is set to U_BUFFER_OVERFLOW_ERROR. On return args->source and
      * args->target are advanced.
      */
 357 U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
 358                                     UErrorCode * err)
 359 {
 360     UConverter *cnv = args->converter;
 361     const UChar *mySource = args->source;
 362     const UChar *sourceLimit = args->sourceLimit;
 363     uint8_t *myTarget = (uint8_t *) args->target;
 364     const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
 365     uint8_t *tempPtr;
 366     UChar32 ch;
 367     uint8_t tempBuf[4];
 368     int32_t indexToWrite;
 369     UBool isNotCESU8 = !hasCESU8Data(cnv);
 370
         /* Resume with a surrogate that was left pending by the previous call. */
 371     if (cnv->fromUChar32 && myTarget < targetLimit)
 372     {
 373         ch = cnv->fromUChar32;
 374         cnv->fromUChar32 = 0;
 375         goto lowsurrogate;
 376     }
 377
 378     while (mySource < sourceLimit && myTarget < targetLimit)
 379     {
 380         ch = *(mySource++);
 381
 382         if (ch < 0x80)        /* Single byte */
 383         {
 384             *(myTarget++) = (uint8_t) ch;
 385         }
 386         else if (ch < 0x800)  /* Double byte */
 387         {
 388             *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
 389             if (myTarget < targetLimit)
 390             {
 391                 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
 392             }
 393             else
 394             {
                     /* target is full: the loop condition ends the loop after this */
 395                 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
 396                 cnv->charErrorBufferLength = 1;
 397                 *err = U_BUFFER_OVERFLOW_ERROR;
 398             }
 399         }
 400         else {
 401             /* Check for surrogates */
 402             if(U16_IS_SURROGATE(ch) && isNotCESU8) {
 403 lowsurrogate:
 404                 if (mySource < sourceLimit) {
 405                     /* test both code units */
 406                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
 407                         /* convert and consume this supplementary code point */
 408                         ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
 409                         ++mySource;
 410                         /* exit this condition tree */
 411                     }
 412                     else {
 413                         /* this is an unpaired trail or lead code unit */
 414                         /* callback(illegal) */
 415                         cnv->fromUChar32 = ch;
 416                         *err = U_ILLEGAL_CHAR_FOUND;
 417                         break;
 418                     }
 419                 }
 420                 else {
 421                     /* no more input */
 422                     cnv->fromUChar32 = ch;
 423                     break;
 424                 }
 425             }
 426
 427             /* Do we write the buffer directly for speed,
 428             or do we have to be careful about target buffer space? */
 429             tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
 430
                 /* indexToWrite is the index of the last byte: 2 for a 3-byte, 3 for a 4-byte sequence */
 431             if (ch <= MAXIMUM_UCS2) {
 432                 indexToWrite = 2;
 433                 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
 434             }
 435             else {
 436                 indexToWrite = 3;
 437                 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
 438                 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
 439             }
 440             tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
 441             tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
 442
 443             if (tempPtr == myTarget) {
 444                 /* There was enough space to write the codepoint directly. */
 445                 myTarget += (indexToWrite + 1);
 446             }
 447             else {
 448                 /* We might run out of room soon. Write it slowly. */
                     /*
                      * Bytes that do not fit are appended at the current
                      * charErrorBufferLength, which is not reset here.
                      */
 449                 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
 450                     if (myTarget < targetLimit) {
 451                         *(myTarget++) = *tempPtr;
 452                     }
 453                     else {
 454                         cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
 455                         *err = U_BUFFER_OVERFLOW_ERROR;
 456                     }
 457                 }
 458             }
 459         }
 460     }
 461
         /* Report overflow only if input is left over and no other error was set. */
 462     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 463     {
 464         *err = U_BUFFER_OVERFLOW_ERROR;
 465     }
 466
 467     args->target = (char *) myTarget;
 468     args->source = mySource;
 469 }
 470
     /*
      * Same conversion as ucnv_fromUnicode_UTF8 (UTF-16 to UTF-8 or CESU-8), but
      * additionally writes one entry to args->offsets for every byte written to
      * the target.
      *
      * offsetNum is the index, counted from 0 in this call, of the source code
      * unit that starts the current character; every output byte of that
      * character gets this value. nextSourceIndex is the index following the
      * character (one or two code units later). A character that is completed
      * from a surrogate left pending in cnv->fromUChar32 by a previous call gets
      * the offset -1. Bytes that overflow into cnv->charErrorBuffer get no
      * offsets entry.
      *
      * On return args->source, args->target and args->offsets are advanced.
      * *err is set to U_ILLEGAL_CHAR_FOUND for an unpaired surrogate and to
      * U_BUFFER_OVERFLOW_ERROR when the target is full while output remains.
      */
 471 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
 472                                                   UErrorCode * err)
 473 {
 474     UConverter *cnv = args->converter;
 475     const UChar *mySource = args->source;
 476     int32_t *myOffsets = args->offsets;
 477     const UChar *sourceLimit = args->sourceLimit;
 478     uint8_t *myTarget = (uint8_t *) args->target;
 479     const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
 480     uint8_t *tempPtr;
 481     UChar32 ch;
 482     int32_t offsetNum, nextSourceIndex;
 483     int32_t indexToWrite;
 484     uint8_t tempBuf[4];
 485     UBool isNotCESU8 = !hasCESU8Data(cnv);
 486
         /* Resume with a surrogate that was left pending by the previous call. */
 487     if (cnv->fromUChar32 && myTarget < targetLimit)
 488     {
 489         ch = cnv->fromUChar32;
 490         cnv->fromUChar32 = 0;
             /* the pending unit is not part of this call's source, hence offset -1 */
 491         offsetNum = -1;
 492         nextSourceIndex = 0;
 493         goto lowsurrogate;
 494     } else {
 495         offsetNum = 0;
 496     }
 497
 498     while (mySource < sourceLimit && myTarget < targetLimit)
 499     {
 500         ch = *(mySource++);
 501
 502         if (ch < 0x80)        /* Single byte */
 503         {
 504             *(myOffsets++) = offsetNum++;
 505             *(myTarget++) = (char) ch;
 506         }
 507         else if (ch < 0x800)  /* Double byte */
 508         {
 509             *(myOffsets++) = offsetNum;
 510             *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
 511             if (myTarget < targetLimit)
 512             {
 513                 *(myOffsets++) = offsetNum++;
 514                 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
 515             }
 516             else
 517             {
                     /*
                      * offsetNum is not advanced here; the target is full, so
                      * the loop condition ends the loop after this.
                      */
 518                 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
 519                 cnv->charErrorBufferLength = 1;
 520                 *err = U_BUFFER_OVERFLOW_ERROR;
 521             }
 522         }
 523         else
 524         /* Check for surrogates */
 525         {
 526             nextSourceIndex = offsetNum + 1;
 527
 528             if(U16_IS_SURROGATE(ch) && isNotCESU8) {
 529 lowsurrogate:
 530                 if (mySource < sourceLimit) {
 531                     /* test both code units */
 532                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
 533                         /* convert and consume this supplementary code point */
 534                         ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
 535                         ++mySource;
 536                         ++nextSourceIndex;
 537                         /* exit this condition tree */
 538                     }
 539                     else {
 540                         /* this is an unpaired trail or lead code unit */
 541                         /* callback(illegal) */
 542                         cnv->fromUChar32 = ch;
 543                         *err = U_ILLEGAL_CHAR_FOUND;
 544                         break;
 545                     }
 546                 }
 547                 else {
 548                     /* no more input */
 549                     cnv->fromUChar32 = ch;
 550                     break;
 551                 }
 552             }
 553
 554             /* Do we write the buffer directly for speed,
 555             or do we have to be careful about target buffer space? */
 556             tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
 557
                 /* indexToWrite is the index of the last byte: 2 for a 3-byte, 3 for a 4-byte sequence */
 558             if (ch <= MAXIMUM_UCS2) {
 559                 indexToWrite = 2;
 560                 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
 561             }
 562             else {
 563                 indexToWrite = 3;
 564                 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
 565                 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
 566             }
 567             tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
 568             tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
 569
 570             if (tempPtr == myTarget) {
 571                 /* There was enough space to write the codepoint directly. */
 572                 myTarget += (indexToWrite + 1);
                     /* one offset per output byte: three always, a fourth for 4-byte sequences */
 573                 myOffsets[0] = offsetNum;
 574                 myOffsets[1] = offsetNum;
 575                 myOffsets[2] = offsetNum;
 576                 if (indexToWrite >= 3) {
 577                     myOffsets[3] = offsetNum;
 578                 }
 579                 myOffsets += (indexToWrite + 1);
 580             }
 581             else {
 582                 /* We might run out of room soon. Write it slowly. */
 583                 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
 584                     if (myTarget < targetLimit)
 585                     {
 586                         *(myOffsets++) = offsetNum;
 587                         *(myTarget++) = *tempPtr;
 588                     }
 589                     else
 590                     {
 591                         cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
 592                         *err = U_BUFFER_OVERFLOW_ERROR;
 593                     }
 594                 }
 595             }
 596             offsetNum = nextSourceIndex;
 597         }
 598     }
 599
 600     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 601     {
 602         *err = U_BUFFER_OVERFLOW_ERROR;
 603     }
 604
 605     args->target = (char *) myTarget;
 606     args->source = mySource;
 607     args->offsets = myOffsets;
 608 }
 609
 610 static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
 611                                                UErrorCode *err) {
 612     UConverter *cnv;
 613     const uint8_t *sourceInitial;
 614     const uint8_t *source;
 615     uint16_t extraBytesToWrite;
 616     uint8_t myByte;
 617     UChar32 ch;
 618     int8_t i, isLegalSequence;
 619
 620     /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
 621
 622     cnv = args->converter;
 623     sourceInitial = source = (const uint8_t *)args->source;
 624     if (source >= (const uint8_t *)args->sourceLimit)
 625     {
 626         /* no input */
 627         *err = U_INDEX_OUTOFBOUNDS_ERROR;
 628         return 0xffff;
 629     }
 630
 631     myByte = (uint8_t)*(source++);
 632     if (myByte < 0x80)
 633     {
 634         args->source = (const char *)source;
 635         return (UChar32)myByte;
 636     }
 637
 638     extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
 639     if (extraBytesToWrite == 0) {
 640         cnv->toUBytes[0] = myByte;
 641         cnv->toULength = 1;
 642         *err = U_ILLEGAL_CHAR_FOUND;
 643         args->source = (const char *)source;
 644         return 0xffff;
 645     }
 646
 647     /*The byte sequence is longer than the buffer area passed*/
 648     if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
 649     {
 650         /* check if all of the remaining bytes are trail bytes */
 651         cnv->toUBytes[0] = myByte;
 652         i = 1;
 653         *err = U_TRUNCATED_CHAR_FOUND;
 654         while(source < (const uint8_t *)args->sourceLimit) {
 655             if(U8_IS_TRAIL(myByte = *source)) {
 656                 cnv->toUBytes[i++] = myByte;
 657                 ++source;
 658             } else {
 659                 /* error even before we run out of input */
 660                 *err = U_ILLEGAL_CHAR_FOUND;
 661                 break;
 662             }
 663         }
 664         cnv->toULength = i;
 665         args->source = (const char *)source;
 666         return 0xffff;
 667     }
 668
 669     isLegalSequence = 1;
 670     ch = myByte << 6;
 671     switch(extraBytesToWrite)
 672     {
 673       /* note: code falls through cases! (sic)*/
 674     case 6:
 675         ch += (myByte = *source);
 676         ch <<= 6;
 677         if (!U8_IS_TRAIL(myByte))
 678         {
 679             isLegalSequence = 0;
 680             break;
 681         }
 682         ++source;
 683         U_FALLTHROUGH;
 684     case 5:
 685         ch += (myByte = *source);
 686         ch <<= 6;
 687         if (!U8_IS_TRAIL(myByte))
 688         {
 689             isLegalSequence = 0;
 690             break;
 691         }
 692         ++source;
 693         U_FALLTHROUGH;
 694     case 4:
 695         ch += (myByte = *source);
 696         ch <<= 6;
 697         if (!U8_IS_TRAIL(myByte))
 698         {
 699             isLegalSequence = 0;
 700             break;
 701         }
 702         ++source;
 703         U_FALLTHROUGH;
 704     case 3:
 705         ch += (myByte = *source);
 706         ch <<= 6;
 707         if (!U8_IS_TRAIL(myByte))
 708         {
 709             isLegalSequence = 0;
 710             break;
 711         }
 712         ++source;
 713         U_FALLTHROUGH;
 714     case 2:
 715         ch += (myByte = *source);
 716         if (!U8_IS_TRAIL(myByte))
 717         {
 718             isLegalSequence = 0;
 719             break;
 720         }
 721         ++source;
 722     };
 723     ch -= offsetsFromUTF8[extraBytesToWrite];
 724     args->source = (const char *)source;
 725
 726     /*
 727      * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
 728      * - use only trail bytes after a lead byte (checked above)
 729      * - use the right number of trail bytes for a given lead byte
 730      * - encode a code point <= U+10ffff
 731      * - use the fewest possible number of bytes for their code points
 732      * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
 733      *
 734      * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
 735      * There are no irregular sequences any more.
 736      */
 737     if (isLegalSequence &&
 738         (uint32_t)ch <= MAXIMUM_UTF &&
 739         (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
 740         !U_IS_SURROGATE(ch)
 741     ) {
 742         return ch; /* return the code point */
 743     }
 744
 745     for(i = 0; sourceInitial < source; ++i) {
 746         cnv->toUBytes[i] = *sourceInitial++;
 747     }
 748     cnv->toULength = i;
 749     *err = U_ILLEGAL_CHAR_FOUND;
 750     return 0xffff;
 751 }
 752
 753 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
 754
 755 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
     /* Holds the same values as entries 0..4 of utf8_minChar32[], typed as UChar32. */
 756 static const UChar32
 757 utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
 758
 759 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
     /*
      * Entries 0..4 hold the same values as entries 0..4 of offsetsFromUTF8[].
      * The array is declared with 7 elements but only 5 are initialized
      * explicitly, so entries 5 and 6 are zero.
      */
 760 static const UChar32
 761 utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
 762
 763 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
 764 static void
 765 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
 766                   UConverterToUnicodeArgs *pToUArgs,
 767                   UErrorCode *pErrorCode) {
 768     UConverter *utf8;
 769     const uint8_t *source, *sourceLimit;
 770     uint8_t *target;
 771     int32_t targetCapacity;
 772     int32_t count;
 773
 774     int8_t oldToULength, toULength, toULimit;
 775
 776     UChar32 c;
 777     uint8_t b, t1, t2;
 778
 779     /* set up the local pointers */
 780     utf8=pToUArgs->converter;
 781     source=(uint8_t *)pToUArgs->source;
 782     sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
 783     target=(uint8_t *)pFromUArgs->target;
 784     targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
 785
 786     /* get the converter state from the UTF-8 UConverter */
 787     c=(UChar32)utf8->toUnicodeStatus;
 788     if(c!=0) {
 789         toULength=oldToULength=utf8->toULength;
 790         toULimit=(int8_t)utf8->mode;
 791     } else {
 792         toULength=oldToULength=toULimit=0;
 793     }
 794
 795     count=(int32_t)(sourceLimit-source)+oldToULength;
 796     if(count<toULimit) {
 797         /*
 798          * Not enough input to complete the partial character.
 799          * Jump to moreBytes below - it will not output to target.
 800          */
 801     } else if(targetCapacity<toULimit) {
 802         /*
 803          * Not enough target capacity to output the partial character.
 804          * Let the standard converter handle this.
 805          */
 806         *pErrorCode=U_USING_DEFAULT_WARNING;
 807         return;
 808     } else {
 809         /*
 810          * Use a single counter for source and target, counting the minimum of
 811          * the source length and the target capacity.
 812          * As a result, the source length is checked only once per multi-byte
 813          * character instead of twice.
 814          *
 815          * Make sure that the last byte sequence is complete, or else
 816          * stop just before it.
 817          * (The longest legal byte sequence has 3 trail bytes.)
 818          * Count oldToULength (number of source bytes from a previous buffer)
 819          * into the source length but reduce the source index by toULimit
 820          * while going back over trail bytes in order to not go back into
 821          * the bytes that will be read for finishing a partial
 822          * sequence from the previous buffer.
 823          * Let the standard converter handle edge cases.
 824          */
 825         int32_t i;
 826
 827         if(count>targetCapacity) {
 828             count=targetCapacity;
 829         }
 830
 831         i=0;
 832         while(i<3 && i<(count-toULimit)) {
 833             b=source[count-oldToULength-i-1];
 834             if(U8_IS_TRAIL(b)) {
 835                 ++i;
 836             } else {
 837                 if(i<U8_COUNT_TRAIL_BYTES(b)) {
 838                     /* stop converting before the lead byte if there are not enough trail bytes for it */
 839                     count-=i+1;
 840                 }
 841                 break;
 842             }
 843         }
 844     }
 845
 846     if(c!=0) {
 847         utf8->toUnicodeStatus=0;
 848         utf8->toULength=0;
 849         goto moreBytes;
 850         /* See note in ucnv_SBCSFromUTF8() about this goto. */
 851     }
 852
 853     /* conversion loop */
 854     while(count>0) {
 855         b=*source++;
 856         if((int8_t)b>=0) {
 857             /* convert ASCII */
 858             *target++=b;
 859             --count;
 860             continue;
 861         } else {
 862             if(b>0xe0) {
 863                 if( /* handle U+1000..U+D7FF inline */
 864                     (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) ||
 865                                                (b==0xed && (t1 <= 0x9f))) &&
 866                     (t2=source[1]) >= 0x80 && t2 <= 0xbf
 867                 ) {
 868                     source+=2;
 869                     *target++=b;
 870                     *target++=t1;
 871                     *target++=t2;
 872                     count-=3;
 873                     continue;
 874                 }
 875             } else if(b<0xe0) {
 876                 if( /* handle U+0080..U+07FF inline */
 877                     b>=0xc2 &&
 878                     (t1=*source) >= 0x80 && t1 <= 0xbf
 879                 ) {
 880                     ++source;
 881                     *target++=b;
 882                     *target++=t1;
 883                     count-=2;
 884                     continue;
 885                 }
 886             } else if(b==0xe0) {
 887                 if( /* handle U+0800..U+0FFF inline */
 888                     (t1=source[0]) >= 0xa0 && t1 <= 0xbf &&
 889                     (t2=source[1]) >= 0x80 && t2 <= 0xbf
 890                 ) {
 891                     source+=2;
 892                     *target++=b;
 893                     *target++=t1;
 894                     *target++=t2;
 895                     count-=3;
 896                     continue;
 897                 }
 898             }
 899
 900             /* handle "complicated" and error cases, and continuing partial characters */
 901             oldToULength=0;
 902             toULength=1;
 903             toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
 904             c=b;
 905 moreBytes:
 906             while(toULength<toULimit) {
 907                 if(source<sourceLimit) {
 908                     b=*source;
 909                     if(U8_IS_TRAIL(b)) {
 910                         ++source;
 911                         ++toULength;
 912                         c=(c<<6)+b;
 913                     } else {
 914                         break; /* sequence too short, stop with toULength<toULimit */
 915                     }
 916                 } else {
 917                     /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
 918                     source-=(toULength-oldToULength);
 919                     while(oldToULength<toULength) {
 920                         utf8->toUBytes[oldToULength++]=*source++;
 921                     }
 922                     utf8->toUnicodeStatus=c;
 923                     utf8->toULength=toULength;
 924                     utf8->mode=toULimit;
 925                     pToUArgs->source=(char *)source;
 926                     pFromUArgs->target=(char *)target;
 927                     return;
 928                 }
 929             }
 930
 931             if( toULength==toULimit &&      /* consumed all trail bytes */
 932                 (toULength==3 || toULength==2) &&             /* BMP */
 933                 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
 934                 (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
 935             ) {
 936                 /* legal byte sequence for BMP code point */
 937             } else if(
 938                 toULength==toULimit && toULength==4 &&
 939                 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
 940             ) {
 941                 /* legal byte sequence for supplementary code point */
 942             } else {
 943                 /* error handling: illegal UTF-8 byte sequence */
 944                 source-=(toULength-oldToULength);
 945                 while(oldToULength<toULength) {
 946                     utf8->toUBytes[oldToULength++]=*source++;
 947                 }
 948                 utf8->toULength=toULength;
 949                 pToUArgs->source=(char *)source;
 950                 pFromUArgs->target=(char *)target;
 951                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 952                 return;
 953             }
 954
 955             /* copy the legal byte sequence to the target */
 956             {
 957                 int8_t i;
 958
 959                 for(i=0; i<oldToULength; ++i) {
 960                     *target++=utf8->toUBytes[i];
 961                 }
 962                 source-=(toULength-oldToULength);
 963                 for(; i<toULength; ++i) {
 964                     *target++=*source++;
 965                 }
 966                 count-=toULength;
 967             }
 968         }
 969     }
 970
 971     if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
 972         if(target==(const uint8_t *)pFromUArgs->targetLimit) {
 973             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 974         } else {
 975             b=*source;
 976             toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
 977             if(toULimit>(sourceLimit-source)) {
 978                 /* collect a truncated byte sequence */
 979                 toULength=0;
 980                 c=b;
 981                 for(;;) {
 982                     utf8->toUBytes[toULength++]=b;
 983                     if(++source==sourceLimit) {
 984                         /* partial byte sequence at end of source */
 985                         utf8->toUnicodeStatus=c;
 986                         utf8->toULength=toULength;
 987                         utf8->mode=toULimit;
 988                         break;
 989                     } else if(!U8_IS_TRAIL(b=*source)) {
 990                         /* lead byte in trail byte position */
 991                         utf8->toULength=toULength;
 992                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 993                         break;
 994                     }
 995                     c=(c<<6)+b;
 996                 }
 997             } else {
 998                 /* partial-sequence target overflow: fall back to the pivoting implementation */
 999                 *pErrorCode=U_USING_DEFAULT_WARNING;
1000             }
1001         }
1002     }
1003
1004     /* write back the updated pointers */
1005     pToUArgs->source=(char *)source;
1006     pFromUArgs->target=(char *)target;
1007 }
1008
1009 /* UTF-8 converter data ----------------------------------------------------- */
1010
/*
 * Implementation table for the UTF-8 converter.
 *
 * The initializer is purely positional: the meaning of each slot is fixed by
 * the declaration of UConverterImpl, which is not part of this file
 * (presumably ucnv_cnv.h -- confirm before reordering anything).
 * A NULL entry means this converter supplies nothing for that slot.
 */
static const UConverterImpl _UTF8Impl={
    /* converter type constant; the same constant appears in _UTF8StaticData */
    UCNV_UTF8,

    /* two slots left empty */
    NULL,
    NULL,

    /* three more slots left empty */
    NULL,
    NULL,
    NULL,

    /*
     * Conversion entry points: to-Unicode and from-Unicode, each in a plain
     * and an "OFFSETS_LOGIC" variant (going by the function names), followed
     * by the UTF-8-specific get-next-UChar function.
     */
    ucnv_toUnicode_UTF8,
    ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
    ucnv_fromUnicode_UTF8,
    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
    ucnv_getNextUChar_UTF8,

    /* four empty slots, then the Unicode-set callback (non-surrogate set) */
    NULL,
    NULL,
    NULL,
    NULL,
    ucnv_getNonSurrogateUnicodeSet,

    /* the same UTF-8-to-UTF-8 routine fills both of the last two slots */
    ucnv_UTF8FromUTF8,
    ucnv_UTF8FromUTF8
};
1036
/*
 * Static (constant) description of the UTF-8 converter.
 * CCSID 1208 refers to UTF-8 for any version of Unicode.
 *
 * The initializer is positional; the slot meanings come from the
 * UConverterStaticData declaration, which is not part of this file.
 */
static const UConverterStaticData _UTF8StaticData={
    /* size of the structure itself */
    sizeof(UConverterStaticData),
    /* converter name */
    "UTF-8",
    /* CCSID 1208, followed by the UCNV_IBM and UCNV_UTF8 constants */
    1208, UCNV_IBM, UCNV_UTF8,
    1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
    /*
     * EF BF BD is the UTF-8 encoding of U+FFFD; the 3 that follows matches
     * the number of meaningful bytes in that array (presumably the
     * substitution sequence and its length -- confirm against the struct).
     * The remaining FALSE/0 values are left at their "off"/zero settings.
     */
    { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
1048
1049
/*
 * Shared-data object for the UTF-8 converter. It is built by
 * UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER from the addresses of
 * _UTF8StaticData and _UTF8Impl. Unlike those two tables it is not declared
 * static.
 */
const UConverterSharedData _UTF8Data=
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData, &_UTF8Impl);
1052
1053 /* CESU-8 converter data ---------------------------------------------------- */
1054
/*
 * Implementation table for the CESU-8 converter.
 *
 * CESU-8 reuses the four UTF-8 to/from-Unicode conversion functions (see the
 * file header comment). The initializer is positional; the slot meanings
 * come from the UConverterImpl declaration, which is not part of this file.
 *
 * Compared with _UTF8Impl, this table
 *  - has NULL where _UTF8Impl has ucnv_getNextUChar_UTF8,
 *  - uses ucnv_getCompleteUnicodeSet instead of ucnv_getNonSurrogateUnicodeSet,
 *  - ends after the Unicode-set callback, so any later members of
 *    UConverterImpl are implicitly zero-initialized (standard C rule for
 *    initializer lists shorter than the aggregate).
 */
static const UConverterImpl _CESU8Impl={
    /* converter type constant; the same constant appears in _CESU8StaticData */
    UCNV_CESU8,

    /* two slots left empty */
    NULL,
    NULL,

    /* three more slots left empty */
    NULL,
    NULL,
    NULL,

    /* the same four conversion functions as in _UTF8Impl; fifth slot empty */
    ucnv_toUnicode_UTF8,
    ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
    ucnv_fromUnicode_UTF8,
    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
    NULL,

    /* four empty slots, then the Unicode-set callback (complete set) */
    NULL,
    NULL,
    NULL,
    NULL,
    ucnv_getCompleteUnicodeSet
};
1077
/*
 * Static (constant) description of the CESU-8 converter.
 *
 * The initializer is positional and lists the same sequence of values as
 * _UTF8StaticData, with a different name, CCSID, and constants
 * (UCNV_UNKNOWN and UCNV_CESU8 where the UTF-8 table has UCNV_IBM and
 * UCNV_UTF8).
 */
static const UConverterStaticData _CESU8StaticData={
    /* size of the structure itself */
    sizeof(UConverterStaticData),
    /* converter name */
    "CESU-8",
    9400, /* CCSID for CESU-8 */
    /* the 1, 3 pair has the same values as in _UTF8StaticData */
    UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
    /* EF BF BD is the UTF-8 encoding of U+FFFD; 3 matches its byte count */
    { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
1088
1089
/*
 * Shared-data object for the CESU-8 converter. It is built by
 * UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER from the addresses of
 * _CESU8StaticData and _CESU8Impl. Unlike those two tables it is not
 * declared static.
 */
const UConverterSharedData _CESU8Data=
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData, &_CESU8Impl);
1092
1093 #endif