icuSources/common/ucnv_u8.c

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 2002-2004, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   file name:  ucnv_u8.c
   7 *   encoding:   US-ASCII
   8 *   tab size:   8 (not used)
   9 *   indentation:4
  10 *
  11 *   created on: 2002jul01
  12 *   created by: Markus W. Scherer
  13 *
  14 *   UTF-8 converter implementation. Used to be in ucnv_utf.c.
  15 *
  16 *   Also, CESU-8 implementation, see UTR 26.
  17 *   The CESU-8 converter uses all the same functions as the
  18 *   UTF-8 converter, with a branch for converting supplementary code points.
  19 */
  20
  21 #include "unicode/utypes.h"
  22
  23 #if !UCONFIG_NO_CONVERSION
  24
  25 #include "unicode/ucnv.h"
  26 #include "ucnv_bld.h"
  27 #include "ucnv_cnv.h"
  28 #include "cmemory.h"
  29
  30 /* Prototypes --------------------------------------------------------------- */
  31
  32 /* Keep these here to make finicky compilers happy */
  33
  34 U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
  35                                            UErrorCode *err);
  36 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
  37                                                         UErrorCode *err);
  38
  39
  40 /* UTF-8 -------------------------------------------------------------------- */
  41
  42 /* UTF-8 Conversion DATA
   43  *   for more information see Unicode Standard 2.0, Transformation Formats, Appendix A-9
  44  */
  45 /*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
  46 #define MAXIMUM_UCS2            0x0000FFFF
  47 #define MAXIMUM_UTF             0x0010FFFF
  48 #define MAXIMUM_UCS4            0x7FFFFFFF
  49 #define HALF_SHIFT              10
  50 #define HALF_BASE               0x0010000
  51 #define HALF_MASK               0x3FF
  52 #define SURROGATE_HIGH_START    0xD800
  53 #define SURROGATE_HIGH_END      0xDBFF
  54 #define SURROGATE_LOW_START     0xDC00
  55 #define SURROGATE_LOW_END       0xDFFF
  56
  57 /* -SURROGATE_LOW_START + HALF_BASE */
  58 #define SURROGATE_LOW_BASE      9216
  59
  60 static const uint32_t offsetsFromUTF8[7] = {0,
  61   (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
  62   (uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080
  63 };
  64
  65 /* END OF UTF-8 Conversion DATA */
  66
  67 static const int8_t bytesFromUTF8[256] = {
  68   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  69   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  70   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  71   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  72   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  73   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  74   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  75   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
  76 };
  77
  78 /*
  79  * Starting with Unicode 3.0.1:
  80  * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
  81  * byte sequences with more than 4 bytes are illegal in UTF-8,
  82  * which is tested with impossible values for them
  83  */
  84 static const uint32_t
  85 utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };
  86
  87 static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
  88                                   UErrorCode * err)
  89 {
  90     const unsigned char *mySource = (unsigned char *) args->source;
  91     UChar *myTarget = args->target;
  92     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
  93     const UChar *targetLimit = args->targetLimit;
  94     unsigned char *toUBytes = args->converter->toUBytes;
  95     UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data);
  96     uint32_t ch, ch2 = 0;
  97     int32_t i, inBytes;
  98
  99     /* Restore size of current sequence */
 100     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
 101     {
 102         inBytes = args->converter->mode;            /* restore # of bytes to consume */
 103         i = args->converter->toULength;             /* restore # of bytes consumed */
 104
 105         ch = args->converter->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
 106         args->converter->toUnicodeStatus = 0;
 107         goto morebytes;
 108     }
 109
 110
 111     while (mySource < sourceLimit && myTarget < targetLimit)
 112     {
 113         ch = *(mySource++);
 114         if (ch < 0x80)        /* Simple case */
 115         {
 116             *(myTarget++) = (UChar) ch;
 117         }
 118         else
 119         {
 120             /* store the first char */
 121             toUBytes[0] = (char)ch;
 122             inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */
 123             i = 1;
 124
 125 morebytes:
 126             while (i < inBytes)
 127             {
 128                 if (mySource < sourceLimit)
 129                 {
 130                     toUBytes[i] = (char) (ch2 = *mySource);
 131                     if (!UTF8_IS_TRAIL(ch2))
 132                     {
 133                         break; /* i < inBytes */
 134                     }
 135                     ch = (ch << 6) + ch2;
 136                     ++mySource;
 137                     i++;
 138                 }
 139                 else
 140                 {
 141                     /* stores a partially calculated target*/
 142                     args->converter->toUnicodeStatus = ch;
 143                     args->converter->mode = inBytes;
 144                     args->converter->toULength = (int8_t) i;
 145                     goto donefornow;
 146                 }
 147             }
 148
 149             /* Remove the accumulated high bits */
 150             ch -= offsetsFromUTF8[inBytes];
 151
 152             /*
 153              * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
 154              * - use only trail bytes after a lead byte (checked above)
 155              * - use the right number of trail bytes for a given lead byte
 156              * - encode a code point <= U+10ffff
 157              * - use the fewest possible number of bytes for their code points
 158              * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
 159              *
 160              * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
 161              * There are no irregular sequences any more.
 162              * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
 163              */
 164             if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
 165                 (isCESU8 ? i <= 3 : !UTF_IS_SURROGATE(ch)))
 166             {
 167                 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
 168                 args->converter->toULength = 0;
 169                 if (ch <= MAXIMUM_UCS2)
 170                 {
 171                     /* fits in 16 bits */
 172                     *(myTarget++) = (UChar) ch;
 173                 }
 174                 else
 175                 {
 176                     /* write out the surrogates */
 177                     ch -= HALF_BASE;
 178                     *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
 179                     ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
 180                     if (myTarget < targetLimit)
 181                     {
 182                         *(myTarget++) = (UChar)ch;
 183                     }
 184                     else
 185                     {
 186                         /* Put in overflow buffer (not handled here) */
 187                         args->converter->UCharErrorBuffer[0] = (UChar) ch;
 188                         args->converter->UCharErrorBufferLength = 1;
 189                         *err = U_BUFFER_OVERFLOW_ERROR;
 190                         break;
 191                     }
 192                 }
 193             }
 194             else
 195             {
 196                 args->converter->toULength = (int8_t)i;
 197                 *err = U_ILLEGAL_CHAR_FOUND;
 198                 break;
 199             }
 200         }
 201     }
 202
 203 donefornow:
 204     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 205     {
 206         /* End of target buffer */
 207         *err = U_BUFFER_OVERFLOW_ERROR;
 208     }
 209
 210     args->target = myTarget;
 211     args->source = (const char *) mySource;
 212 }
 213
 214 static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
 215                                                 UErrorCode * err)
 216 {
 217     const unsigned char *mySource = (unsigned char *) args->source;
 218     UChar *myTarget = args->target;
 219     int32_t *myOffsets = args->offsets;
 220     int32_t offsetNum = 0;
 221     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
 222     const UChar *targetLimit = args->targetLimit;
 223     unsigned char *toUBytes = args->converter->toUBytes;
 224     UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data);
 225     uint32_t ch, ch2 = 0;
 226     int32_t i, inBytes;
 227
 228     /* Restore size of current sequence */
 229     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
 230     {
 231         inBytes = args->converter->mode;            /* restore # of bytes to consume */
 232         i = args->converter->toULength;             /* restore # of bytes consumed */
 233
 234         ch = args->converter->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
 235         args->converter->toUnicodeStatus = 0;
 236         goto morebytes;
 237     }
 238
 239     while (mySource < sourceLimit && myTarget < targetLimit)
 240     {
 241         ch = *(mySource++);
 242         if (ch < 0x80)        /* Simple case */
 243         {
 244             *(myTarget++) = (UChar) ch;
 245             *(myOffsets++) = offsetNum++;
 246         }
 247         else
 248         {
 249             toUBytes[0] = (char)ch;
 250             inBytes = bytesFromUTF8[ch];
 251             i = 1;
 252
 253 morebytes:
 254             while (i < inBytes)
 255             {
 256                 if (mySource < sourceLimit)
 257                 {
 258                     toUBytes[i] = (char) (ch2 = *mySource);
 259                     if (!UTF8_IS_TRAIL(ch2))
 260                     {
 261                         break; /* i < inBytes */
 262                     }
 263                     ch = (ch << 6) + ch2;
 264                     ++mySource;
 265                     i++;
 266                 }
 267                 else
 268                 {
 269                     args->converter->toUnicodeStatus = ch;
 270                     args->converter->mode = inBytes;
 271                     args->converter->toULength = (int8_t)i;
 272                     goto donefornow;
 273                 }
 274             }
 275
 276             /* Remove the accumulated high bits */
 277             ch -= offsetsFromUTF8[inBytes];
 278
 279             /*
 280              * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
 281              * - use only trail bytes after a lead byte (checked above)
 282              * - use the right number of trail bytes for a given lead byte
 283              * - encode a code point <= U+10ffff
 284              * - use the fewest possible number of bytes for their code points
 285              * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
 286              *
 287              * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
 288              * There are no irregular sequences any more.
 289              * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
 290              */
 291             if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
 292                 (isCESU8 ? i <= 3 : !UTF_IS_SURROGATE(ch)))
 293             {
 294                 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
 295                 args->converter->toULength = 0;
 296                 if (ch <= MAXIMUM_UCS2)
 297                 {
 298                     /* fits in 16 bits */
 299                     *(myTarget++) = (UChar) ch;
 300                     *(myOffsets++) = offsetNum;
 301                 }
 302                 else
 303                 {
 304                     /* write out the surrogates */
 305                     ch -= HALF_BASE;
 306                     *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
 307                     *(myOffsets++) = offsetNum;
 308                     ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
 309                     if (myTarget < targetLimit)
 310                     {
 311                         *(myTarget++) = (UChar)ch;
 312                         *(myOffsets++) = offsetNum;
 313                     }
 314                     else
 315                     {
 316                         args->converter->UCharErrorBuffer[0] = (UChar) ch;
 317                         args->converter->UCharErrorBufferLength = 1;
 318                         *err = U_BUFFER_OVERFLOW_ERROR;
 319                     }
 320                 }
 321                 offsetNum += i;
 322             }
 323             else
 324             {
 325                 args->converter->toULength = (int8_t)i;
 326                 *err = U_ILLEGAL_CHAR_FOUND;
 327                 break;
 328             }
 329         }
 330     }
 331
 332 donefornow:
 333     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 334     {   /* End of target buffer */
 335         *err = U_BUFFER_OVERFLOW_ERROR;
 336     }
 337
 338     args->target = myTarget;
 339     args->source = (const char *) mySource;
 340     args->offsets = myOffsets;
 341 }
 342
 343 U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
 344                                     UErrorCode * err)
 345 {
 346     UConverter *cnv = args->converter;
 347     const UChar *mySource = args->source;
 348     unsigned char *myTarget = (unsigned char *) args->target;
 349     const UChar *sourceLimit = args->sourceLimit;
 350     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
 351     UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data);
 352     UChar32 ch, ch2;
 353     int16_t indexToWrite;
 354     char temp[4];
 355
 356     if (cnv->fromUChar32 && myTarget < targetLimit)
 357     {
 358         ch = cnv->fromUChar32;
 359         cnv->fromUChar32 = 0;
 360         goto lowsurrogate;
 361     }
 362
 363     while (mySource < sourceLimit && myTarget < targetLimit)
 364     {
 365         ch = *(mySource++);
 366
 367         if (ch < 0x80)        /* Single byte */
 368         {
 369             *(myTarget++) = (char) ch;
 370         }
 371         else if (ch < 0x800)  /* Double byte */
 372         {
 373             *(myTarget++) = (char) ((ch >> 6) | 0xc0);
 374             if (myTarget < targetLimit)
 375             {
 376                 *(myTarget++) = (char) ((ch & 0x3f) | 0x80);
 377             }
 378             else
 379             {
 380                 cnv->charErrorBuffer[0] = (char) ((ch & 0x3f) | 0x80);
 381                 cnv->charErrorBufferLength = 1;
 382                 *err = U_BUFFER_OVERFLOW_ERROR;
 383             }
 384         }
 385         else
 386         /* Check for surrogates */
 387         {
 388             if(UTF_IS_SURROGATE(ch) && !isCESU8) {
 389                 if(UTF_IS_SURROGATE_FIRST(ch)) {
 390 lowsurrogate:
 391                     if (mySource < sourceLimit) {
 392                         /* test the following code unit */
 393                         UChar trail=*mySource;
 394                         if(UTF_IS_SECOND_SURROGATE(trail)) {
 395                             ++mySource;
 396                             ch=UTF16_GET_PAIR_VALUE(ch, trail);
 397                             ch2 = 0;
 398                             /* convert this supplementary code point */
 399                             /* exit this condition tree */
 400                         } else {
 401                             /* this is an unmatched lead code unit (1st surrogate) */
 402                             /* callback(illegal) */
 403                             cnv->fromUChar32 = ch;
 404                             *err = U_ILLEGAL_CHAR_FOUND;
 405                             break;
 406                         }
 407                     } else {
 408                         /* no more input */
 409                         cnv->fromUChar32 = ch;
 410                         break;
 411                     }
 412                 } else {
 413                     /* this is an unmatched trail code unit (2nd surrogate) */
 414                     /* callback(illegal) */
 415                     cnv->fromUChar32 = ch;
 416                     *err = U_ILLEGAL_CHAR_FOUND;
 417                     break;
 418                 }
 419             }
 420
 421             if (ch < 0x10000)
 422             {
 423                 indexToWrite = 2;
 424                 temp[2] = (char) ((ch >> 12) | 0xe0);
 425             }
 426             else
 427             {
 428                 indexToWrite = 3;
 429                 temp[3] = (char) ((ch >> 18) | 0xf0);
 430                 temp[2] = (char) (((ch >> 12) & 0x3f) | 0x80);
 431             }
 432             temp[1] = (char) (((ch >> 6) & 0x3f) | 0x80);
 433             temp[0] = (char) ((ch & 0x3f) | 0x80);
 434
 435             for (; indexToWrite >= 0; indexToWrite--)
 436             {
 437                 if (myTarget < targetLimit)
 438                 {
 439                     *(myTarget++) = temp[indexToWrite];
 440                 }
 441                 else
 442                 {
 443                     cnv->charErrorBuffer[cnv->charErrorBufferLength++] = temp[indexToWrite];
 444                     *err = U_BUFFER_OVERFLOW_ERROR;
 445                 }
 446             }
 447         }
 448     }
 449
 450     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 451     {
 452         *err = U_BUFFER_OVERFLOW_ERROR;
 453     }
 454
 455     args->target = (char *) myTarget;
 456     args->source = mySource;
 457 }
 458
 459 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
 460                                                   UErrorCode * err)
 461 {
 462     UConverter *cnv = args->converter;
 463     const UChar *mySource = args->source;
 464     unsigned char *myTarget = (unsigned char *) args->target;
 465     int32_t *myOffsets = args->offsets;
 466     const UChar *sourceLimit = args->sourceLimit;
 467     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
 468     UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data);
 469     UChar32 ch, ch2;
 470     int32_t offsetNum, nextSourceIndex;
 471     int16_t indexToWrite;
 472     char temp[4];
 473
 474     if (cnv->fromUChar32 && myTarget < targetLimit)
 475     {
 476         ch = cnv->fromUChar32;
 477         cnv->fromUChar32 = 0;
 478         offsetNum = -1;
 479         nextSourceIndex = 0;
 480         goto lowsurrogate;
 481     } else {
 482         offsetNum = 0;
 483     }
 484
 485     while (mySource < sourceLimit && myTarget < targetLimit)
 486     {
 487         ch = *(mySource++);
 488
 489         if (ch < 0x80)        /* Single byte */
 490         {
 491             *(myOffsets++) = offsetNum++;
 492             *(myTarget++) = (char) ch;
 493         }
 494         else if (ch < 0x800)  /* Double byte */
 495         {
 496             *(myOffsets++) = offsetNum;
 497             *(myTarget++) = (char) ((ch >> 6) | 0xc0);
 498             if (myTarget < targetLimit)
 499             {
 500                 *(myOffsets++) = offsetNum++;
 501                 *(myTarget++) = (char) ((ch & 0x3f) | 0x80);
 502             }
 503             else
 504             {
 505                 cnv->charErrorBuffer[0] = (char) ((ch & 0x3f) | 0x80);
 506                 cnv->charErrorBufferLength = 1;
 507                 *err = U_BUFFER_OVERFLOW_ERROR;
 508             }
 509         }
 510         else
 511         /* Check for surrogates */
 512         {
 513             nextSourceIndex = offsetNum + 1;
 514
 515             if(UTF_IS_SURROGATE(ch) && !isCESU8) {
 516                 if(UTF_IS_SURROGATE_FIRST(ch)) {
 517 lowsurrogate:
 518                     if (mySource < sourceLimit) {
 519                         /* test the following code unit */
 520                         UChar trail=*mySource;
 521                         if(UTF_IS_SECOND_SURROGATE(trail)) {
 522                             ++mySource;
 523                             ++nextSourceIndex;
 524                             ch=UTF16_GET_PAIR_VALUE(ch, trail);
 525                             ch2 = 0;
 526                             /* convert this supplementary code point */
 527                             /* exit this condition tree */
 528                         } else {
 529                             /* this is an unmatched lead code unit (1st surrogate) */
 530                             /* callback(illegal) */
 531                             cnv->fromUChar32 = ch;
 532                             *err = U_ILLEGAL_CHAR_FOUND;
 533                             break;
 534                         }
 535                     } else {
 536                         /* no more input */
 537                         cnv->fromUChar32 = ch;
 538                         break;
 539                     }
 540                 } else {
 541                     /* this is an unmatched trail code unit (2nd surrogate) */
 542                     /* callback(illegal) */
 543                     cnv->fromUChar32 = ch;
 544                     *err = U_ILLEGAL_CHAR_FOUND;
 545                     break;
 546                 }
 547             }
 548
 549             if (ch < 0x10000)
 550             {
 551                 indexToWrite = 2;
 552                 temp[2] = (char) ((ch >> 12) | 0xe0);
 553             }
 554             else
 555             {
 556                 indexToWrite = 3;
 557                 temp[3] = (char) ((ch >> 18) | 0xf0);
 558                 temp[2] = (char) (((ch >> 12) & 0x3f) | 0x80);
 559             }
 560             temp[1] = (char) (((ch >> 6) & 0x3f) | 0x80);
 561             temp[0] = (char) ((ch & 0x3f) | 0x80);
 562
 563             for (; indexToWrite >= 0; indexToWrite--)
 564             {
 565                 if (myTarget < targetLimit)
 566                 {
 567                     *(myOffsets++) = offsetNum;
 568                     *(myTarget++) = temp[indexToWrite];
 569                 }
 570                 else
 571                 {
 572                     cnv->charErrorBuffer[cnv->charErrorBufferLength++] = temp[indexToWrite];
 573                     *err = U_BUFFER_OVERFLOW_ERROR;
 574                 }
 575             }
 576             offsetNum = nextSourceIndex;
 577         }
 578     }
 579
 580     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 581     {
 582         *err = U_BUFFER_OVERFLOW_ERROR;
 583     }
 584
 585     args->target = (char *) myTarget;
 586     args->source = mySource;
 587     args->offsets = myOffsets;
 588 }
 589
 590 static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
 591                                                UErrorCode *err) {
 592     UConverter *cnv;
 593     const uint8_t *sourceInitial;
 594     const uint8_t *source;
 595     uint16_t extraBytesToWrite;
 596     uint8_t myByte;
 597     UChar32 ch;
 598     int8_t i, isLegalSequence;
 599
 600     /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
 601
 602     cnv = args->converter;
 603     sourceInitial = source = (const uint8_t *)args->source;
 604     if (source >= (const uint8_t *)args->sourceLimit)
 605     {
 606         /* no input */
 607         *err = U_INDEX_OUTOFBOUNDS_ERROR;
 608         return 0xffff;
 609     }
 610
 611     myByte = (uint8_t)*(source++);
 612     if (myByte < 0x80)
 613     {
 614         args->source = (const char *)source;
 615         return (UChar32)myByte;
 616     }
 617
 618     extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
 619     if (extraBytesToWrite == 0) {
 620         cnv->toUBytes[0] = myByte;
 621         cnv->toULength = 1;
 622         *err = U_ILLEGAL_CHAR_FOUND;
 623         args->source = (const char *)source;
 624         return 0xffff;
 625     }
 626
 627     /*The byte sequence is longer than the buffer area passed*/
 628     if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
 629     {
 630         /* check if all of the remaining bytes are trail bytes */
 631         cnv->toUBytes[0] = myByte;
 632         i = 1;
 633         *err = U_TRUNCATED_CHAR_FOUND;
 634         while(source < (const uint8_t *)args->sourceLimit) {
 635             if(U8_IS_TRAIL(myByte = *source)) {
 636                 cnv->toUBytes[i++] = myByte;
 637                 ++source;
 638             } else {
 639                 /* error even before we run out of input */
 640                 *err = U_ILLEGAL_CHAR_FOUND;
 641                 break;
 642             }
 643         }
 644         cnv->toULength = i;
 645         args->source = (const char *)source;
 646         return 0xffff;
 647     }
 648
 649     isLegalSequence = 1;
 650     ch = myByte << 6;
 651     switch(extraBytesToWrite)
 652     {
 653       /* note: code falls through cases! (sic)*/
 654     case 6:
 655         ch += (myByte = *source);
 656         ch <<= 6;
 657         if (!UTF8_IS_TRAIL(myByte))
 658         {
 659             isLegalSequence = 0;
 660             break;
 661         }
 662         ++source;
 663     case 5:
 664         ch += (myByte = *source);
 665         ch <<= 6;
 666         if (!UTF8_IS_TRAIL(myByte))
 667         {
 668             isLegalSequence = 0;
 669             break;
 670         }
 671         ++source;
 672     case 4:
 673         ch += (myByte = *source);
 674         ch <<= 6;
 675         if (!UTF8_IS_TRAIL(myByte))
 676         {
 677             isLegalSequence = 0;
 678             break;
 679         }
 680         ++source;
 681     case 3:
 682         ch += (myByte = *source);
 683         ch <<= 6;
 684         if (!UTF8_IS_TRAIL(myByte))
 685         {
 686             isLegalSequence = 0;
 687             break;
 688         }
 689         ++source;
 690     case 2:
 691         ch += (myByte = *source);
 692         if (!UTF8_IS_TRAIL(myByte))
 693         {
 694             isLegalSequence = 0;
 695             break;
 696         }
 697         ++source;
 698     };
 699     ch -= offsetsFromUTF8[extraBytesToWrite];
 700     args->source = (const char *)source;
 701
 702     /*
 703      * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
 704      * - use only trail bytes after a lead byte (checked above)
 705      * - use the right number of trail bytes for a given lead byte
 706      * - encode a code point <= U+10ffff
 707      * - use the fewest possible number of bytes for their code points
 708      * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
 709      *
 710      * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
 711      * There are no irregular sequences any more.
 712      */
 713     if (isLegalSequence &&
 714         (uint32_t)ch <= MAXIMUM_UTF &&
 715         (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
 716         !U_IS_SURROGATE(ch)
 717     ) {
 718         return ch; /* return the code point */
 719     }
 720
 721     for(i = 0; sourceInitial < source; ++i) {
 722         cnv->toUBytes[i] = *sourceInitial++;
 723     }
 724     cnv->toULength = i;
 725     *err = U_ILLEGAL_CHAR_FOUND;
 726     return 0xffff;
 727 }
 728
 729 /* UTF-8 converter data ----------------------------------------------------- */
 730
 731 static const UConverterImpl _UTF8Impl={
 732     UCNV_UTF8,
 733
 734     NULL,
 735     NULL,
 736
 737     NULL,
 738     NULL,
 739     NULL,
 740
 741     ucnv_toUnicode_UTF8,
 742     ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
 743     ucnv_fromUnicode_UTF8,
 744     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
 745     ucnv_getNextUChar_UTF8,
 746
 747     NULL,
 748     NULL,
 749     NULL,
 750     NULL,
 751     ucnv_getNonSurrogateUnicodeSet
 752 };
 753
 754 /* The 1208 CCSID refers to any version of Unicode of UTF-8 */
 755 static const UConverterStaticData _UTF8StaticData={
 756     sizeof(UConverterStaticData),
 757     "UTF-8",
 758     1208, UCNV_IBM, UCNV_UTF8,
 759     1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
 760     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
 761     0,
 762     0,
 763     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 764 };
 765
 766
 767 const UConverterSharedData _UTF8Data={
 768     sizeof(UConverterSharedData), ~((uint32_t) 0),
 769     NULL, NULL, &_UTF8StaticData, FALSE, &_UTF8Impl,
 770     0
 771 };
 772
 773 /* CESU-8 converter data ---------------------------------------------------- */
 774
 775 static const UConverterImpl _CESU8Impl={
 776     UCNV_CESU8,
 777
 778     NULL,
 779     NULL,
 780
 781     NULL,
 782     NULL,
 783     NULL,
 784
 785     ucnv_toUnicode_UTF8,
 786     ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
 787     ucnv_fromUnicode_UTF8,
 788     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
 789     NULL,
 790
 791     NULL,
 792     NULL,
 793     NULL,
 794     NULL,
 795     ucnv_getCompleteUnicodeSet
 796 };
 797
 798 static const UConverterStaticData _CESU8StaticData={
 799     sizeof(UConverterStaticData),
 800     "CESU-8",
 801     0, UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
 802     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
 803     0,
 804     0,
 805     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 806 };
 807
 808
 809 const UConverterSharedData _CESU8Data={
 810     sizeof(UConverterSharedData), ~((uint32_t) 0),
 811     NULL, NULL, &_CESU8StaticData, FALSE, &_CESU8Impl,
 812     0
 813 };
 814
 815 #endif