icuSources/common/ucnv_u8.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 **********************************************************************
   5 *   Copyright (C) 2002-2016, International Business Machines
   6 *   Corporation and others.  All Rights Reserved.
   7 **********************************************************************
   8 *   file name:  ucnv_u8.c
   9 *   encoding:   UTF-8
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2002jul01
  14 *   created by: Markus W. Scherer
  15 *
  16 *   UTF-8 converter implementation. Used to be in ucnv_utf.c.
  17 *
  18 *   Also, CESU-8 implementation, see UTR 26.
  19 *   The CESU-8 converter uses all the same functions as the
  20 *   UTF-8 converter, with a branch for converting supplementary code points.
  21 */
  22
  23 #include "unicode/utypes.h"
  24
  25 #if !UCONFIG_NO_CONVERSION
  26
  27 #include "unicode/ucnv.h"
  28 #include "unicode/utf.h"
  29 #include "unicode/utf8.h"
  30 #include "unicode/utf16.h"
  31 #include "ucnv_bld.h"
  32 #include "ucnv_cnv.h"
  33 #include "cmemory.h"
  34
  35 /* Prototypes --------------------------------------------------------------- */
  36
  37 /* Keep these here to make finicky compilers happy */
  38
  39 U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
  40                                            UErrorCode *err);
  41 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
  42                                                         UErrorCode *err);
  43
  44
  45 /* UTF-8 -------------------------------------------------------------------- */
  46
  47 /* UTF-8 Conversion DATA
  48  *   for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
  49  */
  50 /*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
  51 #define MAXIMUM_UCS2            0x0000FFFF
  52 #define MAXIMUM_UTF             0x0010FFFF
  53 #define MAXIMUM_UCS4            0x7FFFFFFF
  54 #define HALF_SHIFT              10
  55 #define HALF_BASE               0x0010000
  56 #define HALF_MASK               0x3FF
  57 #define SURROGATE_HIGH_START    0xD800
  58 #define SURROGATE_HIGH_END      0xDBFF
  59 #define SURROGATE_LOW_START     0xDC00
  60 #define SURROGATE_LOW_END       0xDFFF
  61
  62 /* -SURROGATE_LOW_START + HALF_BASE */
  63 #define SURROGATE_LOW_BASE      9216
  64
  65 static const uint32_t offsetsFromUTF8[7] = {0,
  66   (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
  67   (uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080
  68 };
  69
  70 /* END OF UTF-8 Conversion DATA */
  71
  72 static const int8_t bytesFromUTF8[256] = {
  73   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  74   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  75   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  76   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  77   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  78   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  79   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  80   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
  81 };
  82
  83 /*
  84  * Starting with Unicode 3.0.1:
  85  * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
  86  * byte sequences with more than 4 bytes are illegal in UTF-8,
  87  * which is tested with impossible values for them
  88  */
  89 static const uint32_t
  90 utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };
  91
  92 static UBool hasCESU8Data(const UConverter *cnv)
  93 {
  94 #if UCONFIG_ONLY_HTML_CONVERSION
  95     return FALSE;
  96 #else
  97     return (UBool)(cnv->sharedData == &_CESU8Data);
  98 #endif
  99 }
 100 U_CDECL_BEGIN
 101 static void  U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
 102                                   UErrorCode * err)
 103 {
 104     UConverter *cnv = args->converter;
 105     const unsigned char *mySource = (unsigned char *) args->source;
 106     UChar *myTarget = args->target;
 107     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
 108     const UChar *targetLimit = args->targetLimit;
 109     unsigned char *toUBytes = cnv->toUBytes;
 110     UBool isCESU8 = hasCESU8Data(cnv);
 111     uint32_t ch, ch2 = 0;
 112     int32_t i, inBytes;
 113
 114     /* Restore size of current sequence */
 115     if (cnv->toUnicodeStatus && myTarget < targetLimit)
 116     {
 117         inBytes = cnv->mode;            /* restore # of bytes to consume */
 118         i = cnv->toULength;             /* restore # of bytes consumed */
 119         cnv->toULength = 0;
 120
 121         ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
 122         cnv->toUnicodeStatus = 0;
 123         goto morebytes;
 124     }
 125
 126
 127     while (mySource < sourceLimit && myTarget < targetLimit)
 128     {
 129         ch = *(mySource++);
 130         if (ch < 0x80)        /* Simple case */
 131         {
 132             *(myTarget++) = (UChar) ch;
 133         }
 134         else
 135         {
 136             /* store the first char */
 137             toUBytes[0] = (char)ch;
 138             inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */
 139             i = 1;
 140
 141 morebytes:
 142             while (i < inBytes)
 143             {
 144                 if (mySource < sourceLimit)
 145                 {
 146                     toUBytes[i] = (char) (ch2 = *mySource);
 147                     if (!U8_IS_TRAIL(ch2))
 148                     {
 149                         break; /* i < inBytes */
 150                     }
 151                     ch = (ch << 6) + ch2;
 152                     ++mySource;
 153                     i++;
 154                 }
 155                 else
 156                 {
 157                     /* stores a partially calculated target*/
 158                     cnv->toUnicodeStatus = ch;
 159                     cnv->mode = inBytes;
 160                     cnv->toULength = (int8_t) i;
 161                     goto donefornow;
 162                 }
 163             }
 164
 165             /* Remove the accumulated high bits */
 166             ch -= offsetsFromUTF8[inBytes];
 167
 168             /*
 169              * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
 170              * - use only trail bytes after a lead byte (checked above)
 171              * - use the right number of trail bytes for a given lead byte
 172              * - encode a code point <= U+10ffff
 173              * - use the fewest possible number of bytes for their code points
 174              * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
 175              *
 176              * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
 177              * There are no irregular sequences any more.
 178              * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
 179              */
 180             if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
 181                 (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
 182             {
 183                 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
 184                 if (ch <= MAXIMUM_UCS2)
 185                 {
 186                     /* fits in 16 bits */
 187                     *(myTarget++) = (UChar) ch;
 188                 }
 189                 else
 190                 {
 191                     /* write out the surrogates */
 192                     ch -= HALF_BASE;
 193                     *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
 194                     ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
 195                     if (myTarget < targetLimit)
 196                     {
 197                         *(myTarget++) = (UChar)ch;
 198                     }
 199                     else
 200                     {
 201                         /* Put in overflow buffer (not handled here) */
 202                         cnv->UCharErrorBuffer[0] = (UChar) ch;
 203                         cnv->UCharErrorBufferLength = 1;
 204                         *err = U_BUFFER_OVERFLOW_ERROR;
 205                         break;
 206                     }
 207                 }
 208             }
 209             else
 210             {
 211                 cnv->toULength = (int8_t)i;
 212                 *err = U_ILLEGAL_CHAR_FOUND;
 213                 break;
 214             }
 215         }
 216     }
 217
 218 donefornow:
 219     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 220     {
 221         /* End of target buffer */
 222         *err = U_BUFFER_OVERFLOW_ERROR;
 223     }
 224
 225     args->target = myTarget;
 226     args->source = (const char *) mySource;
 227 }
 228
 229 static void  U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
 230                                                 UErrorCode * err)
 231 {
 232     UConverter *cnv = args->converter;
 233     const unsigned char *mySource = (unsigned char *) args->source;
 234     UChar *myTarget = args->target;
 235     int32_t *myOffsets = args->offsets;
 236     int32_t offsetNum = 0;
 237     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
 238     const UChar *targetLimit = args->targetLimit;
 239     unsigned char *toUBytes = cnv->toUBytes;
 240     UBool isCESU8 = hasCESU8Data(cnv);
 241     uint32_t ch, ch2 = 0;
 242     int32_t i, inBytes;
 243
 244     /* Restore size of current sequence */
 245     if (cnv->toUnicodeStatus && myTarget < targetLimit)
 246     {
 247         inBytes = cnv->mode;            /* restore # of bytes to consume */
 248         i = cnv->toULength;             /* restore # of bytes consumed */
 249         cnv->toULength = 0;
 250
 251         ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
 252         cnv->toUnicodeStatus = 0;
 253         goto morebytes;
 254     }
 255
 256     while (mySource < sourceLimit && myTarget < targetLimit)
 257     {
 258         ch = *(mySource++);
 259         if (ch < 0x80)        /* Simple case */
 260         {
 261             *(myTarget++) = (UChar) ch;
 262             *(myOffsets++) = offsetNum++;
 263         }
 264         else
 265         {
 266             toUBytes[0] = (char)ch;
 267             inBytes = bytesFromUTF8[ch];
 268             i = 1;
 269
 270 morebytes:
 271             while (i < inBytes)
 272             {
 273                 if (mySource < sourceLimit)
 274                 {
 275                     toUBytes[i] = (char) (ch2 = *mySource);
 276                     if (!U8_IS_TRAIL(ch2))
 277                     {
 278                         break; /* i < inBytes */
 279                     }
 280                     ch = (ch << 6) + ch2;
 281                     ++mySource;
 282                     i++;
 283                 }
 284                 else
 285                 {
 286                     cnv->toUnicodeStatus = ch;
 287                     cnv->mode = inBytes;
 288                     cnv->toULength = (int8_t)i;
 289                     goto donefornow;
 290                 }
 291             }
 292
 293             /* Remove the accumulated high bits */
 294             ch -= offsetsFromUTF8[inBytes];
 295
 296             /*
 297              * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
 298              * - use only trail bytes after a lead byte (checked above)
 299              * - use the right number of trail bytes for a given lead byte
 300              * - encode a code point <= U+10ffff
 301              * - use the fewest possible number of bytes for their code points
 302              * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
 303              *
 304              * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
 305              * There are no irregular sequences any more.
 306              * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
 307              */
 308             if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
 309                 (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
 310             {
 311                 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
 312                 if (ch <= MAXIMUM_UCS2)
 313                 {
 314                     /* fits in 16 bits */
 315                     *(myTarget++) = (UChar) ch;
 316                     *(myOffsets++) = offsetNum;
 317                 }
 318                 else
 319                 {
 320                     /* write out the surrogates */
 321                     ch -= HALF_BASE;
 322                     *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
 323                     *(myOffsets++) = offsetNum;
 324                     ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
 325                     if (myTarget < targetLimit)
 326                     {
 327                         *(myTarget++) = (UChar)ch;
 328                         *(myOffsets++) = offsetNum;
 329                     }
 330                     else
 331                     {
 332                         cnv->UCharErrorBuffer[0] = (UChar) ch;
 333                         cnv->UCharErrorBufferLength = 1;
 334                         *err = U_BUFFER_OVERFLOW_ERROR;
 335                     }
 336                 }
 337                 offsetNum += i;
 338             }
 339             else
 340             {
 341                 cnv->toULength = (int8_t)i;
 342                 *err = U_ILLEGAL_CHAR_FOUND;
 343                 break;
 344             }
 345         }
 346     }
 347
 348 donefornow:
 349     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 350     {   /* End of target buffer */
 351         *err = U_BUFFER_OVERFLOW_ERROR;
 352     }
 353
 354     args->target = myTarget;
 355     args->source = (const char *) mySource;
 356     args->offsets = myOffsets;
 357 }
 358 U_CDECL_END
 359
 360 U_CFUNC void  U_CALLCONV ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
 361                                     UErrorCode * err)
 362 {
 363     UConverter *cnv = args->converter;
 364     const UChar *mySource = args->source;
 365     const UChar *sourceLimit = args->sourceLimit;
 366     uint8_t *myTarget = (uint8_t *) args->target;
 367     const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
 368     uint8_t *tempPtr;
 369     UChar32 ch;
 370     uint8_t tempBuf[4];
 371     int32_t indexToWrite;
 372     UBool isNotCESU8 = !hasCESU8Data(cnv);
 373
 374     if (cnv->fromUChar32 && myTarget < targetLimit)
 375     {
 376         ch = cnv->fromUChar32;
 377         cnv->fromUChar32 = 0;
 378         goto lowsurrogate;
 379     }
 380
 381     while (mySource < sourceLimit && myTarget < targetLimit)
 382     {
 383         ch = *(mySource++);
 384
 385         if (ch < 0x80)        /* Single byte */
 386         {
 387             *(myTarget++) = (uint8_t) ch;
 388         }
 389         else if (ch < 0x800)  /* Double byte */
 390         {
 391             *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
 392             if (myTarget < targetLimit)
 393             {
 394                 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
 395             }
 396             else
 397             {
 398                 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
 399                 cnv->charErrorBufferLength = 1;
 400                 *err = U_BUFFER_OVERFLOW_ERROR;
 401             }
 402         }
 403         else {
 404             /* Check for surrogates */
 405             if(U16_IS_SURROGATE(ch) && isNotCESU8) {
 406 lowsurrogate:
 407                 if (mySource < sourceLimit) {
 408                     /* test both code units */
 409                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
 410                         /* convert and consume this supplementary code point */
 411                         ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
 412                         ++mySource;
 413                         /* exit this condition tree */
 414                     }
 415                     else {
 416                         /* this is an unpaired trail or lead code unit */
 417                         /* callback(illegal) */
 418                         cnv->fromUChar32 = ch;
 419                         *err = U_ILLEGAL_CHAR_FOUND;
 420                         break;
 421                     }
 422                 }
 423                 else {
 424                     /* no more input */
 425                     cnv->fromUChar32 = ch;
 426                     break;
 427                 }
 428             }
 429
 430             /* Do we write the buffer directly for speed,
 431             or do we have to be careful about target buffer space? */
 432             tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
 433
 434             if (ch <= MAXIMUM_UCS2) {
 435                 indexToWrite = 2;
 436                 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
 437             }
 438             else {
 439                 indexToWrite = 3;
 440                 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
 441                 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
 442             }
 443             tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
 444             tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
 445
 446             if (tempPtr == myTarget) {
 447                 /* There was enough space to write the codepoint directly. */
 448                 myTarget += (indexToWrite + 1);
 449             }
 450             else {
 451                 /* We might run out of room soon. Write it slowly. */
 452                 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
 453                     if (myTarget < targetLimit) {
 454                         *(myTarget++) = *tempPtr;
 455                     }
 456                     else {
 457                         cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
 458                         *err = U_BUFFER_OVERFLOW_ERROR;
 459                     }
 460                 }
 461             }
 462         }
 463     }
 464
 465     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 466     {
 467         *err = U_BUFFER_OVERFLOW_ERROR;
 468     }
 469
 470     args->target = (char *) myTarget;
 471     args->source = mySource;
 472 }
 473
 474 U_CFUNC void  U_CALLCONV ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
 475                                                   UErrorCode * err)
 476 {
 477     UConverter *cnv = args->converter;
 478     const UChar *mySource = args->source;
 479     int32_t *myOffsets = args->offsets;
 480     const UChar *sourceLimit = args->sourceLimit;
 481     uint8_t *myTarget = (uint8_t *) args->target;
 482     const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
 483     uint8_t *tempPtr;
 484     UChar32 ch;
 485     int32_t offsetNum, nextSourceIndex;
 486     int32_t indexToWrite;
 487     uint8_t tempBuf[4];
 488     UBool isNotCESU8 = !hasCESU8Data(cnv);
 489
 490     if (cnv->fromUChar32 && myTarget < targetLimit)
 491     {
 492         ch = cnv->fromUChar32;
 493         cnv->fromUChar32 = 0;
 494         offsetNum = -1;
 495         nextSourceIndex = 0;
 496         goto lowsurrogate;
 497     } else {
 498         offsetNum = 0;
 499     }
 500
 501     while (mySource < sourceLimit && myTarget < targetLimit)
 502     {
 503         ch = *(mySource++);
 504
 505         if (ch < 0x80)        /* Single byte */
 506         {
 507             *(myOffsets++) = offsetNum++;
 508             *(myTarget++) = (char) ch;
 509         }
 510         else if (ch < 0x800)  /* Double byte */
 511         {
 512             *(myOffsets++) = offsetNum;
 513             *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
 514             if (myTarget < targetLimit)
 515             {
 516                 *(myOffsets++) = offsetNum++;
 517                 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
 518             }
 519             else
 520             {
 521                 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
 522                 cnv->charErrorBufferLength = 1;
 523                 *err = U_BUFFER_OVERFLOW_ERROR;
 524             }
 525         }
 526         else
 527         /* Check for surrogates */
 528         {
 529             nextSourceIndex = offsetNum + 1;
 530
 531             if(U16_IS_SURROGATE(ch) && isNotCESU8) {
 532 lowsurrogate:
 533                 if (mySource < sourceLimit) {
 534                     /* test both code units */
 535                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
 536                         /* convert and consume this supplementary code point */
 537                         ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
 538                         ++mySource;
 539                         ++nextSourceIndex;
 540                         /* exit this condition tree */
 541                     }
 542                     else {
 543                         /* this is an unpaired trail or lead code unit */
 544                         /* callback(illegal) */
 545                         cnv->fromUChar32 = ch;
 546                         *err = U_ILLEGAL_CHAR_FOUND;
 547                         break;
 548                     }
 549                 }
 550                 else {
 551                     /* no more input */
 552                     cnv->fromUChar32 = ch;
 553                     break;
 554                 }
 555             }
 556
 557             /* Do we write the buffer directly for speed,
 558             or do we have to be careful about target buffer space? */
 559             tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
 560
 561             if (ch <= MAXIMUM_UCS2) {
 562                 indexToWrite = 2;
 563                 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
 564             }
 565             else {
 566                 indexToWrite = 3;
 567                 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
 568                 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
 569             }
 570             tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
 571             tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
 572
 573             if (tempPtr == myTarget) {
 574                 /* There was enough space to write the codepoint directly. */
 575                 myTarget += (indexToWrite + 1);
 576                 myOffsets[0] = offsetNum;
 577                 myOffsets[1] = offsetNum;
 578                 myOffsets[2] = offsetNum;
 579                 if (indexToWrite >= 3) {
 580                     myOffsets[3] = offsetNum;
 581                 }
 582                 myOffsets += (indexToWrite + 1);
 583             }
 584             else {
 585                 /* We might run out of room soon. Write it slowly. */
 586                 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
 587                     if (myTarget < targetLimit)
 588                     {
 589                         *(myOffsets++) = offsetNum;
 590                         *(myTarget++) = *tempPtr;
 591                     }
 592                     else
 593                     {
 594                         cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
 595                         *err = U_BUFFER_OVERFLOW_ERROR;
 596                     }
 597                 }
 598             }
 599             offsetNum = nextSourceIndex;
 600         }
 601     }
 602
 603     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 604     {
 605         *err = U_BUFFER_OVERFLOW_ERROR;
 606     }
 607
 608     args->target = (char *) myTarget;
 609     args->source = mySource;
 610     args->offsets = myOffsets;
 611 }
 612
 613 U_CDECL_BEGIN
 614 static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
 615                                                UErrorCode *err) {
 616     UConverter *cnv;
 617     const uint8_t *sourceInitial;
 618     const uint8_t *source;
 619     uint16_t extraBytesToWrite;
 620     uint8_t myByte;
 621     UChar32 ch;
 622     int8_t i, isLegalSequence;
 623
 624     /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
 625
 626     cnv = args->converter;
 627     sourceInitial = source = (const uint8_t *)args->source;
 628     if (source >= (const uint8_t *)args->sourceLimit)
 629     {
 630         /* no input */
 631         *err = U_INDEX_OUTOFBOUNDS_ERROR;
 632         return 0xffff;
 633     }
 634
 635     myByte = (uint8_t)*(source++);
 636     if (myByte < 0x80)
 637     {
 638         args->source = (const char *)source;
 639         return (UChar32)myByte;
 640     }
 641
 642     extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
 643     if (extraBytesToWrite == 0) {
 644         cnv->toUBytes[0] = myByte;
 645         cnv->toULength = 1;
 646         *err = U_ILLEGAL_CHAR_FOUND;
 647         args->source = (const char *)source;
 648         return 0xffff;
 649     }
 650
 651     /*The byte sequence is longer than the buffer area passed*/
 652     if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
 653     {
 654         /* check if all of the remaining bytes are trail bytes */
 655         cnv->toUBytes[0] = myByte;
 656         i = 1;
 657         *err = U_TRUNCATED_CHAR_FOUND;
 658         while(source < (const uint8_t *)args->sourceLimit) {
 659             if(U8_IS_TRAIL(myByte = *source)) {
 660                 cnv->toUBytes[i++] = myByte;
 661                 ++source;
 662             } else {
 663                 /* error even before we run out of input */
 664                 *err = U_ILLEGAL_CHAR_FOUND;
 665                 break;
 666             }
 667         }
 668         cnv->toULength = i;
 669         args->source = (const char *)source;
 670         return 0xffff;
 671     }
 672
 673     isLegalSequence = 1;
 674     ch = myByte << 6;
 675     switch(extraBytesToWrite)
 676     {
 677       /* note: code falls through cases! (sic)*/
 678     case 6:
 679         ch += (myByte = *source);
 680         ch <<= 6;
 681         if (!U8_IS_TRAIL(myByte))
 682         {
 683             isLegalSequence = 0;
 684             break;
 685         }
 686         ++source;
 687         U_FALLTHROUGH;
 688     case 5:
 689         ch += (myByte = *source);
 690         ch <<= 6;
 691         if (!U8_IS_TRAIL(myByte))
 692         {
 693             isLegalSequence = 0;
 694             break;
 695         }
 696         ++source;
 697         U_FALLTHROUGH;
 698     case 4:
 699         ch += (myByte = *source);
 700         ch <<= 6;
 701         if (!U8_IS_TRAIL(myByte))
 702         {
 703             isLegalSequence = 0;
 704             break;
 705         }
 706         ++source;
 707         U_FALLTHROUGH;
 708     case 3:
 709         ch += (myByte = *source);
 710         ch <<= 6;
 711         if (!U8_IS_TRAIL(myByte))
 712         {
 713             isLegalSequence = 0;
 714             break;
 715         }
 716         ++source;
 717         U_FALLTHROUGH;
 718     case 2:
 719         ch += (myByte = *source);
 720         if (!U8_IS_TRAIL(myByte))
 721         {
 722             isLegalSequence = 0;
 723             break;
 724         }
 725         ++source;
 726     };
 727     ch -= offsetsFromUTF8[extraBytesToWrite];
 728     args->source = (const char *)source;
 729
 730     /*
 731      * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
 732      * - use only trail bytes after a lead byte (checked above)
 733      * - use the right number of trail bytes for a given lead byte
 734      * - encode a code point <= U+10ffff
 735      * - use the fewest possible number of bytes for their code points
 736      * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
 737      *
 738      * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
 739      * There are no irregular sequences any more.
 740      */
 741     if (isLegalSequence &&
 742         (uint32_t)ch <= MAXIMUM_UTF &&
 743         (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
 744         !U_IS_SURROGATE(ch)
 745     ) {
 746         return ch; /* return the code point */
 747     }
 748
 749     for(i = 0; sourceInitial < source; ++i) {
 750         cnv->toUBytes[i] = *sourceInitial++;
 751     }
 752     cnv->toULength = i;
 753     *err = U_ILLEGAL_CHAR_FOUND;
 754     return 0xffff;
 755 }
 756 U_CDECL_END
 757
 758 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
 759
 760 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
 761 static const UChar32
 762 utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
 763
 764 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
 765 static const UChar32
 766 utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
 767
 768 U_CDECL_BEGIN
 769 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
 770 static void U_CALLCONV
 771 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
 772                   UConverterToUnicodeArgs *pToUArgs,
 773                   UErrorCode *pErrorCode) {
 774     UConverter *utf8;
 775     const uint8_t *source, *sourceLimit;
 776     uint8_t *target;
 777     int32_t targetCapacity;
 778     int32_t count;
 779
 780     int8_t oldToULength, toULength, toULimit;
 781
 782     UChar32 c;
 783     uint8_t b, t1, t2;
 784
 785     /* set up the local pointers */
 786     utf8=pToUArgs->converter;
 787     source=(uint8_t *)pToUArgs->source;
 788     sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
 789     target=(uint8_t *)pFromUArgs->target;
 790     targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
 791
 792     /* get the converter state from the UTF-8 UConverter */
 793     c=(UChar32)utf8->toUnicodeStatus;
 794     if(c!=0) {
 795         toULength=oldToULength=utf8->toULength;
 796         toULimit=(int8_t)utf8->mode;
 797     } else {
 798         toULength=oldToULength=toULimit=0;
 799     }
 800
 801     count=(int32_t)(sourceLimit-source)+oldToULength;
 802     if(count<toULimit) {
 803         /*
 804          * Not enough input to complete the partial character.
 805          * Jump to moreBytes below - it will not output to target.
 806          */
 807     } else if(targetCapacity<toULimit) {
 808         /*
 809          * Not enough target capacity to output the partial character.
 810          * Let the standard converter handle this.
 811          */
 812         *pErrorCode=U_USING_DEFAULT_WARNING;
 813         return;
 814     } else {
 815         /*
 816          * Use a single counter for source and target, counting the minimum of
 817          * the source length and the target capacity.
 818          * As a result, the source length is checked only once per multi-byte
 819          * character instead of twice.
 820          *
 821          * Make sure that the last byte sequence is complete, or else
 822          * stop just before it.
 823          * (The longest legal byte sequence has 3 trail bytes.)
 824          * Count oldToULength (number of source bytes from a previous buffer)
 825          * into the source length but reduce the source index by toULimit
 826          * while going back over trail bytes in order to not go back into
 827          * the bytes that will be read for finishing a partial
 828          * sequence from the previous buffer.
 829          * Let the standard converter handle edge cases.
 830          */
 831         int32_t i;
 832
 833         if(count>targetCapacity) {
 834             count=targetCapacity;
 835         }
 836
 837         i=0;
 838         while(i<3 && i<(count-toULimit)) {
 839             b=source[count-oldToULength-i-1];
 840             if(U8_IS_TRAIL(b)) {
 841                 ++i;
 842             } else {
 843                 if(i<U8_COUNT_TRAIL_BYTES(b)) {
 844                     /* stop converting before the lead byte if there are not enough trail bytes for it */
 845                     count-=i+1;
 846                 }
 847                 break;
 848             }
 849         }
 850     }
 851
 852     if(c!=0) {
 853         utf8->toUnicodeStatus=0;
 854         utf8->toULength=0;
 855         goto moreBytes;
 856         /* See note in ucnv_SBCSFromUTF8() about this goto. */
 857     }
 858
 859     /* conversion loop */
 860     while(count>0) {
 861         b=*source++;
 862         if((int8_t)b>=0) {
 863             /* convert ASCII */
 864             *target++=b;
 865             --count;
 866             continue;
 867         } else {
 868             if(b>0xe0) {
 869                 if( /* handle U+1000..U+D7FF inline */
 870                     (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) ||
 871                                                (b==0xed && (t1 <= 0x9f))) &&
 872                     (t2=source[1]) >= 0x80 && t2 <= 0xbf
 873                 ) {
 874                     source+=2;
 875                     *target++=b;
 876                     *target++=t1;
 877                     *target++=t2;
 878                     count-=3;
 879                     continue;
 880                 }
 881             } else if(b<0xe0) {
 882                 if( /* handle U+0080..U+07FF inline */
 883                     b>=0xc2 &&
 884                     (t1=*source) >= 0x80 && t1 <= 0xbf
 885                 ) {
 886                     ++source;
 887                     *target++=b;
 888                     *target++=t1;
 889                     count-=2;
 890                     continue;
 891                 }
 892             } else if(b==0xe0) {
 893                 if( /* handle U+0800..U+0FFF inline */
 894                     (t1=source[0]) >= 0xa0 && t1 <= 0xbf &&
 895                     (t2=source[1]) >= 0x80 && t2 <= 0xbf
 896                 ) {
 897                     source+=2;
 898                     *target++=b;
 899                     *target++=t1;
 900                     *target++=t2;
 901                     count-=3;
 902                     continue;
 903                 }
 904             }
 905
 906             /* handle "complicated" and error cases, and continuing partial characters */
 907             oldToULength=0;
 908             toULength=1;
 909             toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
 910             c=b;
 911 moreBytes:
 912             while(toULength<toULimit) {
 913                 if(source<sourceLimit) {
 914                     b=*source;
 915                     if(U8_IS_TRAIL(b)) {
 916                         ++source;
 917                         ++toULength;
 918                         c=(c<<6)+b;
 919                     } else {
 920                         break; /* sequence too short, stop with toULength<toULimit */
 921                     }
 922                 } else {
 923                     /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
 924                     source-=(toULength-oldToULength);
 925                     while(oldToULength<toULength) {
 926                         utf8->toUBytes[oldToULength++]=*source++;
 927                     }
 928                     utf8->toUnicodeStatus=c;
 929                     utf8->toULength=toULength;
 930                     utf8->mode=toULimit;
 931                     pToUArgs->source=(char *)source;
 932                     pFromUArgs->target=(char *)target;
 933                     return;
 934                 }
 935             }
 936
 937             if( toULength==toULimit &&      /* consumed all trail bytes */
 938                 (toULength==3 || toULength==2) &&             /* BMP */
 939                 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
 940                 (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
 941             ) {
 942                 /* legal byte sequence for BMP code point */
 943             } else if(
 944                 toULength==toULimit && toULength==4 &&
 945                 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
 946             ) {
 947                 /* legal byte sequence for supplementary code point */
 948             } else {
 949                 /* error handling: illegal UTF-8 byte sequence */
 950                 source-=(toULength-oldToULength);
 951                 while(oldToULength<toULength) {
 952                     utf8->toUBytes[oldToULength++]=*source++;
 953                 }
 954                 utf8->toULength=toULength;
 955                 pToUArgs->source=(char *)source;
 956                 pFromUArgs->target=(char *)target;
 957                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 958                 return;
 959             }
 960
 961             /* copy the legal byte sequence to the target */
 962             {
 963                 int8_t i;
 964
 965                 for(i=0; i<oldToULength; ++i) {
 966                     *target++=utf8->toUBytes[i];
 967                 }
 968                 source-=(toULength-oldToULength);
 969                 for(; i<toULength; ++i) {
 970                     *target++=*source++;
 971                 }
 972                 count-=toULength;
 973             }
 974         }
 975     }
 976
 977     if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
 978         if(target==(const uint8_t *)pFromUArgs->targetLimit) {
 979             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 980         } else {
 981             b=*source;
 982             toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
 983             if(toULimit>(sourceLimit-source)) {
 984                 /* collect a truncated byte sequence */
 985                 toULength=0;
 986                 c=b;
 987                 for(;;) {
 988                     utf8->toUBytes[toULength++]=b;
 989                     if(++source==sourceLimit) {
 990                         /* partial byte sequence at end of source */
 991                         utf8->toUnicodeStatus=c;
 992                         utf8->toULength=toULength;
 993                         utf8->mode=toULimit;
 994                         break;
 995                     } else if(!U8_IS_TRAIL(b=*source)) {
 996                         /* lead byte in trail byte position */
 997                         utf8->toULength=toULength;
 998                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 999                         break;
1000                     }
1001                     c=(c<<6)+b;
1002                 }
1003             } else {
1004                 /* partial-sequence target overflow: fall back to the pivoting implementation */
1005                 *pErrorCode=U_USING_DEFAULT_WARNING;
1006             }
1007         }
1008     }
1009
1010     /* write back the updated pointers */
1011     pToUArgs->source=(char *)source;
1012     pFromUArgs->target=(char *)target;
1013 }
1014
1015 U_CDECL_END
1016
1017 /* UTF-8 converter data ----------------------------------------------------- */
1018
1019 static const UConverterImpl _UTF8Impl={
1020     UCNV_UTF8,
1021
1022     NULL,
1023     NULL,
1024
1025     NULL,
1026     NULL,
1027     NULL,
1028
1029     ucnv_toUnicode_UTF8,
1030     ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
1031     ucnv_fromUnicode_UTF8,
1032     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
1033     ucnv_getNextUChar_UTF8,
1034
1035     NULL,
1036     NULL,
1037     NULL,
1038     NULL,
1039     ucnv_getNonSurrogateUnicodeSet,
1040
1041     ucnv_UTF8FromUTF8,
1042     ucnv_UTF8FromUTF8
1043 };
1044
1045 /* The 1208 CCSID refers to any version of Unicode of UTF-8 */
1046 static const UConverterStaticData _UTF8StaticData={
1047     sizeof(UConverterStaticData),
1048     "UTF-8",
1049     1208, UCNV_IBM, UCNV_UTF8,
1050     1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
1051     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
1052     0,
1053     0,
1054     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1055 };
1056
1057
1058 const UConverterSharedData _UTF8Data=
1059         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData, &_UTF8Impl);
1060
1061 /* CESU-8 converter data ---------------------------------------------------- */
1062
1063 static const UConverterImpl _CESU8Impl={
1064     UCNV_CESU8,
1065
1066     NULL,
1067     NULL,
1068
1069     NULL,
1070     NULL,
1071     NULL,
1072
1073     ucnv_toUnicode_UTF8,
1074     ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
1075     ucnv_fromUnicode_UTF8,
1076     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
1077     NULL,
1078
1079     NULL,
1080     NULL,
1081     NULL,
1082     NULL,
1083     ucnv_getCompleteUnicodeSet,
1084
1085     NULL,
1086     NULL
1087 };
1088
1089 static const UConverterStaticData _CESU8StaticData={
1090     sizeof(UConverterStaticData),
1091     "CESU-8",
1092     9400, /* CCSID for CESU-8 */
1093     UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
1094     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
1095     0,
1096     0,
1097     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1098 };
1099
1100
1101 const UConverterSharedData _CESU8Data=
1102         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData, &_CESU8Impl);
1103
1104 #endif