icuSources/common/ucnv_u8.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 **********************************************************************
   5 *   Copyright (C) 2002-2016, International Business Machines
   6 *   Corporation and others.  All Rights Reserved.
   7 **********************************************************************
    8 *   file name:  ucnv_u8.cpp
   9 *   encoding:   UTF-8
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2002jul01
  14 *   created by: Markus W. Scherer
  15 *
  16 *   UTF-8 converter implementation. Used to be in ucnv_utf.c.
  17 *
  18 *   Also, CESU-8 implementation, see UTR 26.
  19 *   The CESU-8 converter uses all the same functions as the
  20 *   UTF-8 converter, with a branch for converting supplementary code points.
  21 */
  22
  23 #include "unicode/utypes.h"
  24
  25 #if !UCONFIG_NO_CONVERSION
  26
  27 #include "unicode/ucnv.h"
  28 #include "unicode/utf.h"
  29 #include "unicode/utf8.h"
  30 #include "unicode/utf16.h"
  31 #include "uassert.h"
  32 #include "ucnv_bld.h"
  33 #include "ucnv_cnv.h"
  34 #include "cmemory.h"
  35 #include "ustr_imp.h"
  36
  37 /* Prototypes --------------------------------------------------------------- */
  38
  39 /* Keep these here to make finicky compilers happy */
  40
  41 U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
  42                                            UErrorCode *err);
  43 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
  44                                                         UErrorCode *err);
  45
  46
  47 /* UTF-8 -------------------------------------------------------------------- */
  48
  49 #define MAXIMUM_UCS2            0x0000FFFF
  50
  51 static const uint32_t offsetsFromUTF8[5] = {0,
  52   (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
  53   (uint32_t) 0x03C82080
  54 };
  55
  56 static UBool hasCESU8Data(const UConverter *cnv)
  57 {
  58 #if UCONFIG_ONLY_HTML_CONVERSION
  59     return FALSE;
  60 #else
  61     return (UBool)(cnv->sharedData == &_CESU8Data);
  62 #endif
  63 }
  64 U_CDECL_BEGIN
   /*
    * Converts UTF-8 bytes (or CESU-8 bytes when hasCESU8Data(cnv) is true) from
    * args->source into UTF-16 code units written to args->target.
    *
    * A multi-byte sequence that is cut off by sourceLimit is parked in the
    * converter: toUBytes holds the bytes seen, toULength their count, mode the
    * expected sequence length and toUnicodeStatus the value accumulated so far.
    * The next call resumes that sequence at the "morebytes" label.
    *
    * On return args->source and args->target are advanced. *err is set to
    * U_ILLEGAL_CHAR_FOUND for a malformed sequence (toULength = number of bytes
    * recorded in toUBytes) and to U_BUFFER_OVERFLOW_ERROR when the target is
    * full while input remains, or when a trail surrogate had to be stored in
    * UCharErrorBuffer.
    */
   65 static void  U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
   66                                   UErrorCode * err)
   67 {
   68     UConverter *cnv = args->converter;
   69     const unsigned char *mySource = (unsigned char *) args->source;
   70     UChar *myTarget = args->target;
   71     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
   72     const UChar *targetLimit = args->targetLimit;
   73     unsigned char *toUBytes = cnv->toUBytes;
   74     UBool isCESU8 = hasCESU8Data(cnv);
   75     uint32_t ch, ch2 = 0;
   76     int32_t i, inBytes;
   77
   78     /* Restore size of current sequence */
   79     if (cnv->toULength > 0 && myTarget < targetLimit)
   80     {
   81         inBytes = cnv->mode;            /* restore # of bytes to consume */
   82         i = cnv->toULength;             /* restore # of bytes consumed */
   83         cnv->toULength = 0;
   84
   85         ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
   86         cnv->toUnicodeStatus = 0;
   87         goto morebytes;
   88     }
   89
   90
   91     while (mySource < sourceLimit && myTarget < targetLimit)
   92     {
   93         ch = *(mySource++);
   94         if (U8_IS_SINGLE(ch))        /* Simple case */
   95         {
   96             *(myTarget++) = (UChar) ch;
   97         }
   98         else
   99         {
  100             /* store the first char */
  101             toUBytes[0] = (char)ch;
  102             inBytes = U8_COUNT_BYTES_NON_ASCII(ch); /* lookup current sequence length */
  103             i = 1;
  104
  105 morebytes:
  106             while (i < inBytes)
  107             {
  108                 if (mySource < sourceLimit)
  109                 {
                          /* The byte is recorded before it is validated and only consumed
                             (mySource advanced) once it has been accepted. */
  110                     toUBytes[i] = (char) (ch2 = *mySource);
                          /* CESU-8 additionally accepts any trail byte directly after the
                             lead byte 0xED, i.e. a directly encoded surrogate. */
  111                     if (!icu::UTF8::isValidTrail(ch, static_cast<uint8_t>(ch2), i, inBytes) &&
  112                             !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2)))
  113                     {
  114                         break; /* i < inBytes */
  115                     }
                          /* Raw accumulation: the lead/trail marker bits stay in ch and are
                             subtracted later through offsetsFromUTF8[]. */
  116                     ch = (ch << 6) + ch2;
  117                     ++mySource;
  118                     i++;
  119                 }
  120                 else
  121                 {
  122                     /* stores a partially calculated target*/
  123                     cnv->toUnicodeStatus = ch;
  124                     cnv->mode = inBytes;
  125                     cnv->toULength = (int8_t) i;
  126                     goto donefornow;
  127                 }
  128             }
  129
  130             // In CESU-8, only surrogates, not supplementary code points, are encoded directly.
                  /* Hence a complete 4-byte sequence (i == 4) takes the error branch
                     below when isCESU8 is set. */
  131             if (i == inBytes && (!isCESU8 || i <= 3))
  132             {
  133                 /* Remove the accumulated high bits */
  134                 ch -= offsetsFromUTF8[inBytes];
  135
  136                 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
  137                 if (ch <= MAXIMUM_UCS2)
  138                 {
  139                     /* fits in 16 bits */
  140                     *(myTarget++) = (UChar) ch;
  141                 }
  142                 else
  143                 {
  144                     /* write out the surrogates */
  145                     *(myTarget++) = U16_LEAD(ch);
  146                     ch = U16_TRAIL(ch);
  147                     if (myTarget < targetLimit)
  148                     {
  149                         *(myTarget++) = (UChar)ch;
  150                     }
  151                     else
  152                     {
  153                         /* Put in overflow buffer (not handled here) */
  154                         cnv->UCharErrorBuffer[0] = (UChar) ch;
  155                         cnv->UCharErrorBufferLength = 1;
  156                         *err = U_BUFFER_OVERFLOW_ERROR;
  157                         break;
  158                     }
  159                 }
  160             }
  161             else
  162             {
                      /* Malformed or disallowed sequence: report how many bytes of it
                         are recorded in toUBytes. */
  163                 cnv->toULength = (int8_t)i;
  164                 *err = U_ILLEGAL_CHAR_FOUND;
  165                 break;
  166             }
  167         }
  168     }
  169
  170 donefornow:
  171     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
  172     {
  173         /* End of target buffer */
  174         *err = U_BUFFER_OVERFLOW_ERROR;
  175     }
  176
      /* Hand the advanced positions back to the caller. */
  177     args->target = myTarget;
  178     args->source = (const char *) mySource;
  179 }
 180
  /*
   * Same byte-to-UTF-16 conversion as ucnv_toUnicode_UTF8, additionally writing
   * one entry to args->offsets for every UChar stored in the target.
   *
   * offsetNum counts the source bytes processed by this call, starting at 0.
   * Each UChar receives the offsetNum of the first byte of the sequence that
   * produced it; both halves of a surrogate pair receive the same value. A
   * trail surrogate that is diverted to UCharErrorBuffer gets no offset entry.
   *
   * Partial-sequence state and error reporting mirror ucnv_toUnicode_UTF8:
   * toUBytes/toULength/mode/toUnicodeStatus carry an incomplete sequence to the
   * next call, U_ILLEGAL_CHAR_FOUND flags a malformed sequence and
   * U_BUFFER_OVERFLOW_ERROR a full target.
   *
   * NOTE(review): when a sequence is resumed from a previous call, offsetNum
   * is still 0 and is later advanced by the full sequence length i, which
   * includes bytes that came from the earlier buffer - confirm this is the
   * intended offset convention.
   */
  181 static void  U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
  182                                                 UErrorCode * err)
  183 {
  184     UConverter *cnv = args->converter;
  185     const unsigned char *mySource = (unsigned char *) args->source;
  186     UChar *myTarget = args->target;
  187     int32_t *myOffsets = args->offsets;
  188     int32_t offsetNum = 0;
  189     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
  190     const UChar *targetLimit = args->targetLimit;
  191     unsigned char *toUBytes = cnv->toUBytes;
  192     UBool isCESU8 = hasCESU8Data(cnv);
  193     uint32_t ch, ch2 = 0;
  194     int32_t i, inBytes;
  195
  196     /* Restore size of current sequence */
  197     if (cnv->toULength > 0 && myTarget < targetLimit)
  198     {
  199         inBytes = cnv->mode;            /* restore # of bytes to consume */
  200         i = cnv->toULength;             /* restore # of bytes consumed */
  201         cnv->toULength = 0;
  202
  203         ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
  204         cnv->toUnicodeStatus = 0;
  205         goto morebytes;
  206     }
  207
  208     while (mySource < sourceLimit && myTarget < targetLimit)
  209     {
  210         ch = *(mySource++);
  211         if (U8_IS_SINGLE(ch))        /* Simple case */
  212         {
  213             *(myTarget++) = (UChar) ch;
  214             *(myOffsets++) = offsetNum++;
  215         }
  216         else
  217         {
                  /* Record the lead byte and look up the expected sequence length. */
  218             toUBytes[0] = (char)ch;
  219             inBytes = U8_COUNT_BYTES_NON_ASCII(ch);
  220             i = 1;
  221
  222 morebytes:
  223             while (i < inBytes)
  224             {
  225                 if (mySource < sourceLimit)
  226                 {
  227                     toUBytes[i] = (char) (ch2 = *mySource);
                          /* CESU-8 additionally accepts any trail byte directly after the
                             lead byte 0xED, i.e. a directly encoded surrogate. */
  228                     if (!icu::UTF8::isValidTrail(ch, static_cast<uint8_t>(ch2), i, inBytes) &&
  229                             !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2)))
  230                     {
  231                         break; /* i < inBytes */
  232                     }
  233                     ch = (ch << 6) + ch2;
  234                     ++mySource;
  235                     i++;
  236                 }
  237                 else
  238                 {
                          /* Input exhausted mid-sequence: park the state for the next call. */
  239                     cnv->toUnicodeStatus = ch;
  240                     cnv->mode = inBytes;
  241                     cnv->toULength = (int8_t)i;
  242                     goto donefornow;
  243                 }
  244             }
  245
  246             // In CESU-8, only surrogates, not supplementary code points, are encoded directly.
  247             if (i == inBytes && (!isCESU8 || i <= 3))
  248             {
  249                 /* Remove the accumulated high bits */
  250                 ch -= offsetsFromUTF8[inBytes];
  251
  252                 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
  253                 if (ch <= MAXIMUM_UCS2)
  254                 {
  255                     /* fits in 16 bits */
  256                     *(myTarget++) = (UChar) ch;
  257                     *(myOffsets++) = offsetNum;
  258                 }
  259                 else
  260                 {
  261                     /* write out the surrogates */
  262                     *(myTarget++) = U16_LEAD(ch);
  263                     *(myOffsets++) = offsetNum;
  264                     ch = U16_TRAIL(ch);
  265                     if (myTarget < targetLimit)
  266                     {
  267                         *(myTarget++) = (UChar)ch;
  268                         *(myOffsets++) = offsetNum;
  269                     }
  270                     else
  271                     {
                              /* No explicit break here: the enclosing while loop ends on its
                                 own because myTarget has reached targetLimit. */
  272                         cnv->UCharErrorBuffer[0] = (UChar) ch;
  273                         cnv->UCharErrorBufferLength = 1;
  274                         *err = U_BUFFER_OVERFLOW_ERROR;
  275                     }
  276                 }
                      /* Advance the running source index past the whole sequence. */
  277                 offsetNum += i;
  278             }
  279             else
  280             {
  281                 cnv->toULength = (int8_t)i;
  282                 *err = U_ILLEGAL_CHAR_FOUND;
  283                 break;
  284             }
  285         }
  286     }
  287
  288 donefornow:
  289     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
  290     {   /* End of target buffer */
  291         *err = U_BUFFER_OVERFLOW_ERROR;
  292     }
  293
  294     args->target = myTarget;
  295     args->source = (const char *) mySource;
  296     args->offsets = myOffsets;
  297 }
 298 U_CDECL_END
 299
  /*
   * Converts UTF-16 code units from args->source into UTF-8 bytes written to
   * args->target.
   *
   * When the converter is not CESU-8 (isNotCESU8), a lead surrogate followed by
   * a trail surrogate is combined into one supplementary code point and written
   * as four bytes. For CESU-8 the pairing step is skipped, so every surrogate
   * code unit goes through the three-byte path by itself.
   *
   * State carried between calls: a lead surrogate that is the last unit of the
   * input is kept in cnv->fromUChar32 and picked up again at "lowsurrogate".
   * An unpaired surrogate is also left in cnv->fromUChar32, with *err set to
   * U_ILLEGAL_CHAR_FOUND. Output bytes that do not fit into the target are
   * placed in cnv->charErrorBuffer and *err is set to U_BUFFER_OVERFLOW_ERROR.
   *
   * On return args->source and args->target are advanced.
   */
  300 U_CFUNC void  U_CALLCONV ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
  301                                     UErrorCode * err)
  302 {
  303     UConverter *cnv = args->converter;
  304     const UChar *mySource = args->source;
  305     const UChar *sourceLimit = args->sourceLimit;
  306     uint8_t *myTarget = (uint8_t *) args->target;
  307     const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
  308     uint8_t *tempPtr;
  309     UChar32 ch;
  310     uint8_t tempBuf[4];
  311     int32_t indexToWrite;
  312     UBool isNotCESU8 = !hasCESU8Data(cnv);
  313
      /* Resume with the surrogate that was left pending by the previous call. */
  314     if (cnv->fromUChar32 && myTarget < targetLimit)
  315     {
  316         ch = cnv->fromUChar32;
  317         cnv->fromUChar32 = 0;
  318         goto lowsurrogate;
  319     }
  320
  321     while (mySource < sourceLimit && myTarget < targetLimit)
  322     {
  323         ch = *(mySource++);
  324
  325         if (ch < 0x80)        /* Single byte */
  326         {
  327             *(myTarget++) = (uint8_t) ch;
  328         }
  329         else if (ch < 0x800)  /* Double byte */
  330         {
  331             *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
  332             if (myTarget < targetLimit)
  333             {
  334                 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
  335             }
  336             else
  337             {
                      /* Second byte does not fit: divert it to the overflow buffer. */
  338                 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
  339                 cnv->charErrorBufferLength = 1;
  340                 *err = U_BUFFER_OVERFLOW_ERROR;
  341             }
  342         }
  343         else {
  344             /* Check for surrogates */
  345             if(U16_IS_SURROGATE(ch) && isNotCESU8) {
  346 lowsurrogate:
  347                 if (mySource < sourceLimit) {
  348                     /* test both code units */
  349                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
  350                         /* convert and consume this supplementary code point */
  351                         ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
  352                         ++mySource;
  353                         /* exit this condition tree */
  354                     }
  355                     else {
  356                         /* this is an unpaired trail or lead code unit */
  357                         /* callback(illegal) */
  358                         cnv->fromUChar32 = ch;
  359                         *err = U_ILLEGAL_CHAR_FOUND;
  360                         break;
  361                     }
  362                 }
  363                 else {
  364                     /* no more input */
  365                     cnv->fromUChar32 = ch;
  366                     break;
  367                 }
  368             }
  369
  370             /* Do we write the buffer directly for speed,
  371             or do we have to be careful about target buffer space? */
  372             tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
  373
              /* indexToWrite is the index of the last byte of the sequence
                 (2 for three bytes, 3 for four bytes). */
  374             if (ch <= MAXIMUM_UCS2) {
  375                 indexToWrite = 2;
  376                 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
  377             }
  378             else {
  379                 indexToWrite = 3;
  380                 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
  381                 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
  382             }
  383             tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
  384             tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
  385
  386             if (tempPtr == myTarget) {
  387                 /* There was enough space to write the codepoint directly. */
  388                 myTarget += (indexToWrite + 1);
  389             }
  390             else {
  391                 /* We might run out of room soon. Write it slowly. */
  392                 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
  393                     if (myTarget < targetLimit) {
  394                         *(myTarget++) = *tempPtr;
  395                     }
  396                     else {
                              /* Appends at the current charErrorBufferLength; assumes that
                                 length is 0 on entry - TODO confirm against the caller. */
  397                         cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
  398                         *err = U_BUFFER_OVERFLOW_ERROR;
  399                     }
  400                 }
  401             }
  402         }
  403     }
  404
      /* Target exhausted with input left and no other error recorded yet. */
  405     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
  406     {
  407         *err = U_BUFFER_OVERFLOW_ERROR;
  408     }
  409
  410     args->target = (char *) myTarget;
  411     args->source = mySource;
  412 }
 413
  /*
   * Same UTF-16-to-UTF-8 conversion as ucnv_fromUnicode_UTF8, additionally
   * writing one entry to args->offsets for every byte stored in the target.
   *
   * offsetNum is the index, within this call's source, of the code unit that
   * starts the current character; every output byte of that character gets
   * this value. nextSourceIndex is the index following the character (one or
   * two code units later) and becomes the new offsetNum afterwards. A character
   * that starts with a lead surrogate pending from the previous call is given
   * the offset -1. Bytes diverted to cnv->charErrorBuffer get no offset entry.
   *
   * Pending-surrogate state (cnv->fromUChar32) and the error codes
   * U_ILLEGAL_CHAR_FOUND / U_BUFFER_OVERFLOW_ERROR are handled as in
   * ucnv_fromUnicode_UTF8.
   */
  414 U_CFUNC void  U_CALLCONV ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
  415                                                   UErrorCode * err)
  416 {
  417     UConverter *cnv = args->converter;
  418     const UChar *mySource = args->source;
  419     int32_t *myOffsets = args->offsets;
  420     const UChar *sourceLimit = args->sourceLimit;
  421     uint8_t *myTarget = (uint8_t *) args->target;
  422     const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
  423     uint8_t *tempPtr;
  424     UChar32 ch;
  425     int32_t offsetNum, nextSourceIndex;
  426     int32_t indexToWrite;
  427     uint8_t tempBuf[4];
  428     UBool isNotCESU8 = !hasCESU8Data(cnv);
  429
  430     if (cnv->fromUChar32 && myTarget < targetLimit)
  431     {
  432         ch = cnv->fromUChar32;
  433         cnv->fromUChar32 = 0;
              /* The pending lead surrogate precedes this buffer, hence offset -1;
                 the code unit that may complete it sits at source index 0. */
  434         offsetNum = -1;
  435         nextSourceIndex = 0;
  436         goto lowsurrogate;
  437     } else {
  438         offsetNum = 0;
  439     }
  440
  441     while (mySource < sourceLimit && myTarget < targetLimit)
  442     {
  443         ch = *(mySource++);
  444
  445         if (ch < 0x80)        /* Single byte */
  446         {
  447             *(myOffsets++) = offsetNum++;
  448             *(myTarget++) = (char) ch;
  449         }
  450         else if (ch < 0x800)  /* Double byte */
  451         {
  452             *(myOffsets++) = offsetNum;
  453             *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
  454             if (myTarget < targetLimit)
  455             {
  456                 *(myOffsets++) = offsetNum++;
  457                 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
  458             }
  459             else
  460             {
                      /* offsetNum is not advanced on this path; the loop ends anyway
                         because myTarget has reached targetLimit. */
  461                 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
  462                 cnv->charErrorBufferLength = 1;
  463                 *err = U_BUFFER_OVERFLOW_ERROR;
  464             }
  465         }
  466         else
  467         /* Check for surrogates */
  468         {
  469             nextSourceIndex = offsetNum + 1;
  470
  471             if(U16_IS_SURROGATE(ch) && isNotCESU8) {
  472 lowsurrogate:
  473                 if (mySource < sourceLimit) {
  474                     /* test both code units */
  475                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
  476                         /* convert and consume this supplementary code point */
  477                         ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
  478                         ++mySource;
  479                         ++nextSourceIndex;
  480                         /* exit this condition tree */
  481                     }
  482                     else {
  483                         /* this is an unpaired trail or lead code unit */
  484                         /* callback(illegal) */
  485                         cnv->fromUChar32 = ch;
  486                         *err = U_ILLEGAL_CHAR_FOUND;
  487                         break;
  488                     }
  489                 }
  490                 else {
  491                     /* no more input */
  492                     cnv->fromUChar32 = ch;
  493                     break;
  494                 }
  495             }
  496
  497             /* Do we write the buffer directly for speed,
  498             or do we have to be careful about target buffer space? */
  499             tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
  500
  501             if (ch <= MAXIMUM_UCS2) {
  502                 indexToWrite = 2;
  503                 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
  504             }
  505             else {
  506                 indexToWrite = 3;
  507                 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
  508                 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
  509             }
  510             tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
  511             tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
  512
  513             if (tempPtr == myTarget) {
  514                 /* There was enough space to write the codepoint directly. */
  515                 myTarget += (indexToWrite + 1);
                      /* Three offsets are always needed; the fourth only for a
                         four-byte sequence. */
  516                 myOffsets[0] = offsetNum;
  517                 myOffsets[1] = offsetNum;
  518                 myOffsets[2] = offsetNum;
  519                 if (indexToWrite >= 3) {
  520                     myOffsets[3] = offsetNum;
  521                 }
  522                 myOffsets += (indexToWrite + 1);
  523             }
  524             else {
  525                 /* We might run out of room soon. Write it slowly. */
  526                 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
  527                     if (myTarget < targetLimit)
  528                     {
  529                         *(myOffsets++) = offsetNum;
  530                         *(myTarget++) = *tempPtr;
  531                     }
  532                     else
  533                     {
                              /* Appends at the current charErrorBufferLength; assumes that
                                 length is 0 on entry - TODO confirm against the caller. */
  534                         cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
  535                         *err = U_BUFFER_OVERFLOW_ERROR;
  536                     }
  537                 }
  538             }
  539             offsetNum = nextSourceIndex;
  540         }
  541     }
  542
  543     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
  544     {
  545         *err = U_BUFFER_OVERFLOW_ERROR;
  546     }
  547
  548     args->target = (char *) myTarget;
  549     args->source = mySource;
  550     args->offsets = myOffsets;
  551 }
 552
 553 U_CDECL_BEGIN
 554 static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
 555                                                UErrorCode *err) {
 556     UConverter *cnv;
 557     const uint8_t *sourceInitial;
 558     const uint8_t *source;
 559     uint8_t myByte;
 560     UChar32 ch;
 561     int8_t i;
 562
 563     /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
 564
 565     cnv = args->converter;
 566     sourceInitial = source = (const uint8_t *)args->source;
 567     if (source >= (const uint8_t *)args->sourceLimit)
 568     {
 569         /* no input */
 570         *err = U_INDEX_OUTOFBOUNDS_ERROR;
 571         return 0xffff;
 572     }
 573
 574     myByte = (uint8_t)*(source++);
 575     if (U8_IS_SINGLE(myByte))
 576     {
 577         args->source = (const char *)source;
 578         return (UChar32)myByte;
 579     }
 580
 581     uint16_t countTrailBytes = U8_COUNT_TRAIL_BYTES(myByte);
 582     if (countTrailBytes == 0) {
 583         cnv->toUBytes[0] = myByte;
 584         cnv->toULength = 1;
 585         *err = U_ILLEGAL_CHAR_FOUND;
 586         args->source = (const char *)source;
 587         return 0xffff;
 588     }
 589
 590     /*The byte sequence is longer than the buffer area passed*/
 591     if (((const char *)source + countTrailBytes) > args->sourceLimit)
 592     {
 593         /* check if all of the remaining bytes are trail bytes */
 594         uint16_t extraBytesToWrite = countTrailBytes + 1;
 595         cnv->toUBytes[0] = myByte;
 596         i = 1;
 597         *err = U_TRUNCATED_CHAR_FOUND;
 598         while(source < (const uint8_t *)args->sourceLimit) {
 599             uint8_t b = *source;
 600             if(icu::UTF8::isValidTrail(myByte, b, i, extraBytesToWrite)) {
 601                 cnv->toUBytes[i++] = b;
 602                 ++source;
 603             } else {
 604                 /* error even before we run out of input */
 605                 *err = U_ILLEGAL_CHAR_FOUND;
 606                 break;
 607             }
 608         }
 609         cnv->toULength = i;
 610         args->source = (const char *)source;
 611         return 0xffff;
 612     }
 613
 614     ch = myByte << 6;
 615     if(countTrailBytes == 2) {
 616         uint8_t t1 = *source, t2;
 617         if(U8_IS_VALID_LEAD3_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source)) {
 618             args->source = (const char *)(source + 1);
 619             return (((ch + t1) << 6) + t2) - offsetsFromUTF8[3];
 620         }
 621     } else if(countTrailBytes == 1) {
 622         uint8_t t1 = *source;
 623         if(U8_IS_TRAIL(t1)) {
 624             args->source = (const char *)(source + 1);
 625             return (ch + t1) - offsetsFromUTF8[2];
 626         }
 627     } else {  // countTrailBytes == 3
 628         uint8_t t1 = *source, t2, t3;
 629         if(U8_IS_VALID_LEAD4_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source) &&
 630                 U8_IS_TRAIL(t3 = *++source)) {
 631             args->source = (const char *)(source + 1);
 632             return (((((ch + t1) << 6) + t2) << 6) + t3) - offsetsFromUTF8[4];
 633         }
 634     }
 635     args->source = (const char *)source;
 636
 637     for(i = 0; sourceInitial < source; ++i) {
 638         cnv->toUBytes[i] = *sourceInitial++;
 639     }
 640     cnv->toULength = i;
 641     *err = U_ILLEGAL_CHAR_FOUND;
 642     return 0xffff;
 643 }
 644 U_CDECL_END
 645
 646 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
 647
 648 U_CDECL_BEGIN
  649 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
  /*
   * Reads bytes from pToUArgs->source and copies each well-formed sequence
   * unchanged to pFromUArgs->target. All converter state is read from and
   * written to pToUArgs->converter (local name utf8); pFromUArgs contributes
   * only target and targetLimit.
   *
   * Outcomes visible to the caller:
   *  - both pointers are advanced past what was copied;
   *  - a sequence cut off by sourceLimit is saved in toUBytes / toULength /
   *    mode / toUnicodeStatus of the converter;
   *  - an ill-formed sequence is saved in toUBytes / toULength and
   *    *pErrorCode becomes U_ILLEGAL_CHAR_FOUND;
   *  - a full target with input remaining yields U_BUFFER_OVERFLOW_ERROR;
   *  - U_USING_DEFAULT_WARNING is set for the cases that the comments below
   *    leave to the standard / pivoting converter.
   */
  650 static void U_CALLCONV
  651 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
  652                   UConverterToUnicodeArgs *pToUArgs,
  653                   UErrorCode *pErrorCode) {
  654     UConverter *utf8;
  655     const uint8_t *source, *sourceLimit;
  656     uint8_t *target;
  657     int32_t targetCapacity;
  658     int32_t count;
  659
      /* oldToULength: bytes of the current sequence that were already stored in
         toUBytes on entry; toULength: bytes of it seen so far; toULimit: its
         expected total length. */
  660     int8_t oldToULength, toULength, toULimit;
  661
  662     UChar32 c;
  663     uint8_t b, t1, t2;
  664
  665     /* set up the local pointers */
  666     utf8=pToUArgs->converter;
  667     source=(uint8_t *)pToUArgs->source;
  668     sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
  669     target=(uint8_t *)pFromUArgs->target;
  670     targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
  671
  672     /* get the converter state from the UTF-8 UConverter */
  673     if(utf8->toULength > 0) {
  674         toULength=oldToULength=utf8->toULength;
  675         toULimit=(int8_t)utf8->mode;
  676         c=(UChar32)utf8->toUnicodeStatus;
  677     } else {
  678         toULength=oldToULength=toULimit=0;
  679         c = 0;
  680     }
  681
  682     count=(int32_t)(sourceLimit-source)+oldToULength;
  683     if(count<toULimit) {
  684         /*
  685          * Not enough input to complete the partial character.
  686          * Jump to moreBytes below - it will not output to target.
  687          */
  688     } else if(targetCapacity<toULimit) {
  689         /*
  690          * Not enough target capacity to output the partial character.
  691          * Let the standard converter handle this.
  692          */
  693         *pErrorCode=U_USING_DEFAULT_WARNING;
  694         return;
  695     } else {
  696         // Use a single counter for source and target, counting the minimum of
  697         // the source length and the target capacity.
  698         // Let the standard converter handle edge cases.
  699         if(count>targetCapacity) {
  700             count=targetCapacity;
  701         }
  702
  703         // The conversion loop checks count>0 only once per character.
  704         // If the buffer ends with a truncated sequence,
  705         // then we reduce the count to stop before that,
  706         // and collect the remaining bytes after the conversion loop.
  707
  708         // Do not go back into the bytes that will be read for finishing a partial
  709         // sequence from the previous buffer.
  710         int32_t length=count-toULimit;
  711         U8_TRUNCATE_IF_INCOMPLETE(source, 0, length);
  712         count=toULimit+length;
  713     }
  714
      /* c was loaded from toUnicodeStatus above and is 0 when toULength was 0
         on entry. */
  715     if(c!=0) {
  716         utf8->toUnicodeStatus=0;
  717         utf8->toULength=0;
  718         goto moreBytes;
  719         /* See note in ucnv_SBCSFromUTF8() about this goto. */
  720     }
  721
  722     /* conversion loop */
  723     while(count>0) {
  724         b=*source++;
  725         if(U8_IS_SINGLE(b)) {
  726             /* convert ASCII */
  727             *target++=b;
  728             --count;
  729             continue;
  730         } else {
  731             if(b>=0xe0) {
  732                 if( /* handle U+0800..U+FFFF inline */
  733                     b<0xf0 &&
  734                     U8_IS_VALID_LEAD3_AND_T1(b, t1=source[0]) &&
  735                     U8_IS_TRAIL(t2=source[1])
  736                 ) {
  737                     source+=2;
  738                     *target++=b;
  739                     *target++=t1;
  740                     *target++=t2;
  741                     count-=3;
  742                     continue;
  743                 }
  744             } else {
  745                 if( /* handle U+0080..U+07FF inline */
  746                     b>=0xc2 &&
  747                     U8_IS_TRAIL(t1=*source)
  748                 ) {
  749                     ++source;
  750                     *target++=b;
  751                     *target++=t1;
  752                     count-=2;
  753                     continue;
  754                 }
  755             }
  756
  757             /* handle "complicated" and error cases, and continuing partial characters */
  758             oldToULength=0;
  759             toULength=1;
  760             toULimit=U8_COUNT_BYTES_NON_ASCII(b);
  761             c=b;
  762 moreBytes:
  763             while(toULength<toULimit) {
  764                 if(source<sourceLimit) {
  765                     b=*source;
  766                     if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
  767                         ++source;
  768                         ++toULength;
  769                         c=(c<<6)+b;
  770                     } else {
  771                         break; /* sequence too short, stop with toULength<toULimit */
  772                     }
  773                 } else {
  774                     /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
                          /* Rewind to the first byte of this sequence that was read from the
                             current buffer, then append those bytes to toUBytes. */
  775                     source-=(toULength-oldToULength);
  776                     while(oldToULength<toULength) {
  777                         utf8->toUBytes[oldToULength++]=*source++;
  778                     }
  779                     utf8->toUnicodeStatus=c;
  780                     utf8->toULength=toULength;
  781                     utf8->mode=toULimit;
  782                     pToUArgs->source=(char *)source;
  783                     pFromUArgs->target=(char *)target;
  784                     return;
  785                 }
  786             }
  787
  788             if(toULength!=toULimit) {
  789                 /* error handling: illegal UTF-8 byte sequence */
  790                 source-=(toULength-oldToULength);
  791                 while(oldToULength<toULength) {
  792                     utf8->toUBytes[oldToULength++]=*source++;
  793                 }
  794                 utf8->toULength=toULength;
  795                 pToUArgs->source=(char *)source;
  796                 pFromUArgs->target=(char *)target;
  797                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  798                 return;
  799             }
  800
  801             /* copy the legal byte sequence to the target */
  802             {
  803                 int8_t i;
  804
                      /* First the bytes kept in toUBytes from the previous buffer,
                         then the bytes that were read from the current buffer. */
  805                 for(i=0; i<oldToULength; ++i) {
  806                     *target++=utf8->toUBytes[i];
  807                 }
  808                 source-=(toULength-oldToULength);
  809                 for(; i<toULength; ++i) {
  810                     *target++=*source++;
  811                 }
  812                 count-=toULength;
  813             }
  814         }
  815     }
  816     U_ASSERT(count>=0);
  817
      /* The loop stopped with input left: either the target is full, or the
         next sequence was excluded from count. */
  818     if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
  819         if(target==(const uint8_t *)pFromUArgs->targetLimit) {
  820             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  821         } else {
  822             b=*source;
  823             toULimit=U8_COUNT_BYTES(b);
  824             if(toULimit>(sourceLimit-source)) {
  825                 /* collect a truncated byte sequence */
  826                 toULength=0;
  827                 c=b;
  828                 for(;;) {
  829                     utf8->toUBytes[toULength++]=b;
  830                     if(++source==sourceLimit) {
  831                         /* partial byte sequence at end of source */
  832                         utf8->toUnicodeStatus=c;
  833                         utf8->toULength=toULength;
  834                         utf8->mode=toULimit;
  835                         break;
  836                     } else if(!icu::UTF8::isValidTrail(c, b=*source, toULength, toULimit)) {
  837                         utf8->toULength=toULength;
  838                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  839                         break;
  840                     }
  841                     c=(c<<6)+b;
  842                 }
  843             } else {
  844                 /* partial-sequence target overflow: fall back to the pivoting implementation */
  845                 *pErrorCode=U_USING_DEFAULT_WARNING;
  846             }
  847         }
  848     }
  849
  850     /* write back the updated pointers */
  851     pToUArgs->source=(char *)source;
  852     pFromUArgs->target=(char *)target;
  853 }
 854
 855 U_CDECL_END
 856
 857 /* UTF-8 converter data ----------------------------------------------------- */
 858
 859 static const UConverterImpl _UTF8Impl={
 860     UCNV_UTF8,
 861
 862     NULL,
 863     NULL,
 864
 865     NULL,
 866     NULL,
 867     NULL,
 868
 869     ucnv_toUnicode_UTF8,
 870     ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
 871     ucnv_fromUnicode_UTF8,
 872     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
 873     ucnv_getNextUChar_UTF8,
 874
 875     NULL,
 876     NULL,
 877     NULL,
 878     NULL,
 879     ucnv_getNonSurrogateUnicodeSet,
 880
 881     ucnv_UTF8FromUTF8,
 882     ucnv_UTF8FromUTF8
 883 };
 884
 885 /* The 1208 CCSID refers to any version of Unicode of UTF-8 */
 886 static const UConverterStaticData _UTF8StaticData={
 887     sizeof(UConverterStaticData),
 888     "UTF-8",
 889     1208, UCNV_IBM, UCNV_UTF8,
 890     1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
 891     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
 892     0,
 893     0,
 894     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 895 };
 896
 897
 898 const UConverterSharedData _UTF8Data=
 899         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData, &_UTF8Impl);
 900
 901 /* CESU-8 converter data ---------------------------------------------------- */
 902
 903 static const UConverterImpl _CESU8Impl={
 904     UCNV_CESU8,
 905
 906     NULL,
 907     NULL,
 908
 909     NULL,
 910     NULL,
 911     NULL,
 912
 913     ucnv_toUnicode_UTF8,
 914     ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
 915     ucnv_fromUnicode_UTF8,
 916     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
 917     NULL,
 918
 919     NULL,
 920     NULL,
 921     NULL,
 922     NULL,
 923     ucnv_getCompleteUnicodeSet,
 924
 925     NULL,
 926     NULL
 927 };
 928
 929 static const UConverterStaticData _CESU8StaticData={
 930     sizeof(UConverterStaticData),
 931     "CESU-8",
 932     9400, /* CCSID for CESU-8 */
 933     UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
 934     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
 935     0,
 936     0,
 937     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 938 };
 939
 940
 941 const UConverterSharedData _CESU8Data=
 942         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData, &_CESU8Impl);
 943
 944 #endif