icuSources/common/ustrtrns.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 ******************************************************************************
   5 *
   6 *   Copyright (C) 2001-2016, International Business Machines
   7 *   Corporation and others.  All Rights Reserved.
   8 *
   9 ******************************************************************************
  10 *
  11 * File ustrtrns.cpp
  12 *
  13 * Modification History:
  14 *
  15 *   Date        Name        Description
  16 *   9/10/2001    Ram    Creation.
  17 ******************************************************************************
  18 */
  19
  20 /*******************************************************************************
  21  *
  22  * u_strTo* and u_strFrom* APIs
  23  * WCS functions moved to ustr_wcs.c for better modularization
  24  *
  25  *******************************************************************************
  26  */
  27
  28
  29 #include "unicode/putil.h"
  30 #include "unicode/ustring.h"
  31 #include "unicode/utf.h"
  32 #include "unicode/utf8.h"
  33 #include "unicode/utf16.h"
  34 #include "cstring.h"
  35 #include "cmemory.h"
  36 #include "ustr_imp.h"
  37 #include "uassert.h"
  38
  39 U_CAPI UChar* U_EXPORT2
  40 u_strFromUTF32WithSub(UChar *dest,
  41                int32_t destCapacity,
  42                int32_t *pDestLength,
  43                const UChar32 *src,
  44                int32_t srcLength,
  45                UChar32 subchar, int32_t *pNumSubstitutions,
  46                UErrorCode *pErrorCode) {
  47     const UChar32 *srcLimit;
  48     UChar32 ch;
  49     UChar *destLimit;
  50     UChar *pDest;
  51     int32_t reqLength;
  52     int32_t numSubstitutions;
  53
  54     /* args check */
  55     if(U_FAILURE(*pErrorCode)){
  56         return NULL;
  57     }
  58     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
  59         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
  60         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
  61     ) {
  62         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
  63         return NULL;
  64     }
  65
  66     if(pNumSubstitutions != NULL) {
  67         *pNumSubstitutions = 0;
  68     }
  69
  70     pDest = dest;
  71     destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
  72     reqLength = 0;
  73     numSubstitutions = 0;
  74
  75     if(srcLength < 0) {
  76         /* simple loop for conversion of a NUL-terminated BMP string */
  77         while((ch=*src) != 0 &&
  78               ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
  79             ++src;
  80             if(pDest < destLimit) {
  81                 *pDest++ = (UChar)ch;
  82             } else {
  83                 ++reqLength;
  84             }
  85         }
  86         srcLimit = src;
  87         if(ch != 0) {
  88             /* "complicated" case, find the end of the remaining string */
  89             while(*++srcLimit != 0) {}
  90         }
  91     } else {
  92       srcLimit = (src!=NULL)?(src + srcLength):NULL;
  93     }
  94
  95     /* convert with length */
  96     while(src < srcLimit) {
  97         ch = *src++;
  98         do {
  99             /* usually "loops" once; twice only for writing subchar */
 100             if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
 101                 if(pDest < destLimit) {
 102                     *pDest++ = (UChar)ch;
 103                 } else {
 104                     ++reqLength;
 105                 }
 106                 break;
 107             } else if(0x10000 <= ch && ch <= 0x10ffff) {
 108                 if(pDest!=NULL && ((pDest + 2) <= destLimit)) {
 109                     *pDest++ = U16_LEAD(ch);
 110                     *pDest++ = U16_TRAIL(ch);
 111                 } else {
 112                     reqLength += 2;
 113                 }
 114                 break;
 115             } else if((ch = subchar) < 0) {
 116                 /* surrogate code point, or not a Unicode code point at all */
 117                 *pErrorCode = U_INVALID_CHAR_FOUND;
 118                 return NULL;
 119             } else {
 120                 ++numSubstitutions;
 121             }
 122         } while(TRUE);
 123     }
 124
 125     reqLength += (int32_t)(pDest - dest);
 126     if(pDestLength) {
 127         *pDestLength = reqLength;
 128     }
 129     if(pNumSubstitutions != NULL) {
 130         *pNumSubstitutions = numSubstitutions;
 131     }
 132
 133     /* Terminate the buffer */
 134     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
 135
 136     return dest;
 137 }
 138
 139 U_CAPI UChar* U_EXPORT2
 140 u_strFromUTF32(UChar *dest,
 141                int32_t destCapacity,
 142                int32_t *pDestLength,
 143                const UChar32 *src,
 144                int32_t srcLength,
 145                UErrorCode *pErrorCode) {
 146     return u_strFromUTF32WithSub(
 147             dest, destCapacity, pDestLength,
 148             src, srcLength,
 149             U_SENTINEL, NULL,
 150             pErrorCode);
 151 }
 152
 153 U_CAPI UChar32* U_EXPORT2
 154 u_strToUTF32WithSub(UChar32 *dest,
 155              int32_t destCapacity,
 156              int32_t *pDestLength,
 157              const UChar *src,
 158              int32_t srcLength,
 159              UChar32 subchar, int32_t *pNumSubstitutions,
 160              UErrorCode *pErrorCode) {
 161     const UChar *srcLimit;
 162     UChar32 ch;
 163     UChar ch2;
 164     UChar32 *destLimit;
 165     UChar32 *pDest;
 166     int32_t reqLength;
 167     int32_t numSubstitutions;
 168
 169     /* args check */
 170     if(U_FAILURE(*pErrorCode)){
 171         return NULL;
 172     }
 173     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
 174         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
 175         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
 176     ) {
 177         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
 178         return NULL;
 179     }
 180
 181     if(pNumSubstitutions != NULL) {
 182         *pNumSubstitutions = 0;
 183     }
 184
 185     pDest = dest;
 186     destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
 187     reqLength = 0;
 188     numSubstitutions = 0;
 189
 190     if(srcLength < 0) {
 191         /* simple loop for conversion of a NUL-terminated BMP string */
 192         while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
 193             ++src;
 194             if(pDest < destLimit) {
 195                 *pDest++ = ch;
 196             } else {
 197                 ++reqLength;
 198             }
 199         }
 200         srcLimit = src;
 201         if(ch != 0) {
 202             /* "complicated" case, find the end of the remaining string */
 203             while(*++srcLimit != 0) {}
 204         }
 205     } else {
 206         srcLimit = (src!=NULL)?(src + srcLength):NULL;
 207     }
 208
 209     /* convert with length */
 210     while(src < srcLimit) {
 211         ch = *src++;
 212         if(!U16_IS_SURROGATE(ch)) {
 213             /* write or count ch below */
 214         } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
 215             ++src;
 216             ch = U16_GET_SUPPLEMENTARY(ch, ch2);
 217         } else if((ch = subchar) < 0) {
 218             /* unpaired surrogate */
 219             *pErrorCode = U_INVALID_CHAR_FOUND;
 220             return NULL;
 221         } else {
 222             ++numSubstitutions;
 223         }
 224         if(pDest < destLimit) {
 225             *pDest++ = ch;
 226         } else {
 227             ++reqLength;
 228         }
 229     }
 230
 231     reqLength += (int32_t)(pDest - dest);
 232     if(pDestLength) {
 233         *pDestLength = reqLength;
 234     }
 235     if(pNumSubstitutions != NULL) {
 236         *pNumSubstitutions = numSubstitutions;
 237     }
 238
 239     /* Terminate the buffer */
 240     u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
 241
 242     return dest;
 243 }
 244
 245 U_CAPI UChar32* U_EXPORT2
 246 u_strToUTF32(UChar32 *dest,
 247              int32_t destCapacity,
 248              int32_t *pDestLength,
 249              const UChar *src,
 250              int32_t srcLength,
 251              UErrorCode *pErrorCode) {
 252     return u_strToUTF32WithSub(
 253             dest, destCapacity, pDestLength,
 254             src, srcLength,
 255             U_SENTINEL, NULL,
 256             pErrorCode);
 257 }
 258
 259 /* for utf8_nextCharSafeBodyTerminated() */
 260 static const UChar32
 261 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
 262
 263 /*
 264  * Version of utf8_nextCharSafeBody() with the following differences:
 265  * - checks for NUL termination instead of length
 266  * - works with pointers instead of indexes
 267  * - always strict (strict==-1)
 268  *
 269  * *ps points to after the lead byte and will be moved to after the last trail byte.
 270  * c is the lead byte.
 271  * @return the code point, or U_SENTINEL
 272  */
 273 static UChar32
 274 utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
 275     const uint8_t *s=*ps;
 276     uint8_t trail, illegal=0;
 277     uint8_t count=U8_COUNT_TRAIL_BYTES(c);
 278     U_ASSERT(count<6);
 279     U8_MASK_LEAD_BYTE((c), count);
 280     /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
 281     switch(count) {
 282     /* each branch falls through to the next one */
 283     case 5:
 284     case 4:
 285         /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
 286         illegal=1;
 287         break;
 288     case 3:
 289         trail=(uint8_t)(*s++ - 0x80);
 290         c=(c<<6)|trail;
 291         if(trail>0x3f || c>=0x110) {
 292             /* not a trail byte, or code point>0x10ffff (outside Unicode) */
 293             illegal=1;
 294             break;
 295         }
 296         U_FALLTHROUGH;
 297     case 2:
 298         trail=(uint8_t)(*s++ - 0x80);
 299         if(trail>0x3f) {
 300             /* not a trail byte */
 301             illegal=1;
 302             break;
 303         }
 304         c=(c<<6)|trail;
 305         U_FALLTHROUGH;
 306     case 1:
 307         trail=(uint8_t)(*s++ - 0x80);
 308         if(trail>0x3f) {
 309             /* not a trail byte */
 310             illegal=1;
 311         }
 312         c=(c<<6)|trail;
 313         break;
 314     case 0:
 315         return U_SENTINEL;
 316     /* no default branch to optimize switch()  - all values are covered */
 317     }
 318
 319     /* correct sequence - all trail bytes have (b7..b6)==(10)? */
 320     /* illegal is also set if count>=4 */
 321     if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
 322         /* error handling */
 323         /* don't go beyond this sequence */
 324         s=*ps;
 325         while(count>0 && U8_IS_TRAIL(*s)) {
 326             ++s;
 327             --count;
 328         }
 329         c=U_SENTINEL;
 330     }
 331     *ps=s;
 332     return c;
 333 }
 334
 335 /*
 336  * Version of utf8_nextCharSafeBody() with the following differences:
 337  * - works with pointers instead of indexes
 338  * - always strict (strict==-1)
 339  *
 340  * *ps points to after the lead byte and will be moved to after the last trail byte.
 341  * c is the lead byte.
 342  * @return the code point, or U_SENTINEL
 343  */
 344 static UChar32
 345 utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
 346     const uint8_t *s=*ps;
 347     uint8_t trail, illegal=0;
 348     uint8_t count=U8_COUNT_TRAIL_BYTES(c);
 349     if((limit-s)>=count) {
 350         U8_MASK_LEAD_BYTE((c), count);
 351         /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
 352         switch(count) {
 353         /* each branch falls through to the next one */
 354         case 5:
 355         case 4:
 356             /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
 357             illegal=1;
 358             break;
 359         case 3:
 360             trail=*s++;
 361             c=(c<<6)|(trail&0x3f);
 362             if(c<0x110) {
 363                 illegal|=(trail&0xc0)^0x80;
 364             } else {
 365                 /* code point>0x10ffff, outside Unicode */
 366                 illegal=1;
 367                 break;
 368             }
 369             U_FALLTHROUGH;
 370         case 2:
 371             trail=*s++;
 372             c=(c<<6)|(trail&0x3f);
 373             illegal|=(trail&0xc0)^0x80;
 374             U_FALLTHROUGH;
 375         case 1:
 376             trail=*s++;
 377             c=(c<<6)|(trail&0x3f);
 378             illegal|=(trail&0xc0)^0x80;
 379             break;
 380         case 0:
 381             return U_SENTINEL;
 382         /* no default branch to optimize switch()  - all values are covered */
 383         }
 384     } else {
 385         illegal=1; /* too few bytes left */
 386     }
 387
 388     /* correct sequence - all trail bytes have (b7..b6)==(10)? */
 389     /* illegal is also set if count>=4 */
 390     U_ASSERT(illegal || count<UPRV_LENGTHOF(utf8_minLegal));
 391     if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
 392         /* error handling */
 393         /* don't go beyond this sequence */
 394         s=*ps;
 395         while(count>0 && s<limit && U8_IS_TRAIL(*s)) {
 396             ++s;
 397             --count;
 398         }
 399         c=U_SENTINEL;
 400     }
 401     *ps=s;
 402     return c;
 403 }
 404
 405 U_CAPI UChar* U_EXPORT2
 406 u_strFromUTF8WithSub(UChar *dest,
 407               int32_t destCapacity,
 408               int32_t *pDestLength,
 409               const char* src,
 410               int32_t srcLength,
 411               UChar32 subchar, int32_t *pNumSubstitutions,
 412               UErrorCode *pErrorCode){
 413     UChar *pDest = dest;
 414     UChar *pDestLimit = dest+destCapacity;
 415     UChar32 ch;
 416     int32_t reqLength = 0;
 417     const uint8_t* pSrc = (const uint8_t*) src;
 418     uint8_t t1, t2; /* trail bytes */
 419     int32_t numSubstitutions;
 420
 421     /* args check */
 422     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
 423         return NULL;
 424     }
 425
 426     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
 427         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
 428         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
 429     ) {
 430         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
 431         return NULL;
 432     }
 433
 434     if(pNumSubstitutions!=NULL) {
 435         *pNumSubstitutions=0;
 436     }
 437     numSubstitutions=0;
 438
 439     /*
 440      * Inline processing of UTF-8 byte sequences:
 441      *
 442      * Byte sequences for the most common characters are handled inline in
 443      * the conversion loops. In order to reduce the path lengths for those
 444      * characters, the tests are arranged in a kind of binary search.
 445      * ASCII (<=0x7f) is checked first, followed by the dividing point
 446      * between 2- and 3-byte sequences (0xe0).
 447      * The 3-byte branch is tested first to speed up CJK text.
 448      * The compiler should combine the subtractions for the two tests for 0xe0.
 449      * Each branch then tests for the other end of its range.
 450      */
 451
 452     if(srcLength < 0){
 453         /*
 454          * Transform a NUL-terminated string.
 455          * The code explicitly checks for NULs only in the lead byte position.
 456          * A NUL byte in the trail byte position fails the trail byte range check anyway.
 457          */
 458         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
 459             if(ch <= 0x7f){
 460                 *pDest++=(UChar)ch;
 461                 ++pSrc;
 462             } else {
 463                 if(ch > 0xe0) {
 464                     if( /* handle U+1000..U+CFFF inline */
 465                         ch <= 0xec &&
 466                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
 467                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
 468                     ) {
 469                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 470                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
 471                         pSrc += 3;
 472                         continue;
 473                     }
 474                 } else if(ch < 0xe0) {
 475                     if( /* handle U+0080..U+07FF inline */
 476                         ch >= 0xc2 &&
 477                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
 478                     ) {
 479                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
 480                         pSrc += 2;
 481                         continue;
 482                     }
 483                 }
 484
 485                 /* function call for "complicated" and error cases */
 486                 ++pSrc; /* continue after the lead byte */
 487                 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
 488                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
 489                     *pErrorCode = U_INVALID_CHAR_FOUND;
 490                     return NULL;
 491                 } else if(ch<=0xFFFF) {
 492                     *(pDest++)=(UChar)ch;
 493                 } else {
 494                     *(pDest++)=U16_LEAD(ch);
 495                     if(pDest<pDestLimit) {
 496                         *(pDest++)=U16_TRAIL(ch);
 497                     } else {
 498                         reqLength++;
 499                         break;
 500                     }
 501                 }
 502             }
 503         }
 504
 505         /* Pre-flight the rest of the string. */
 506         while((ch = *pSrc) != 0) {
 507             if(ch <= 0x7f){
 508                 ++reqLength;
 509                 ++pSrc;
 510             } else {
 511                 if(ch > 0xe0) {
 512                     if( /* handle U+1000..U+CFFF inline */
 513                         ch <= 0xec &&
 514                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
 515                         (uint8_t)(pSrc[2] - 0x80) <= 0x3f
 516                     ) {
 517                         ++reqLength;
 518                         pSrc += 3;
 519                         continue;
 520                     }
 521                 } else if(ch < 0xe0) {
 522                     if( /* handle U+0080..U+07FF inline */
 523                         ch >= 0xc2 &&
 524                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f
 525                     ) {
 526                         ++reqLength;
 527                         pSrc += 2;
 528                         continue;
 529                     }
 530                 }
 531
 532                 /* function call for "complicated" and error cases */
 533                 ++pSrc; /* continue after the lead byte */
 534                 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
 535                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
 536                     *pErrorCode = U_INVALID_CHAR_FOUND;
 537                     return NULL;
 538                 }
 539                 reqLength += U16_LENGTH(ch);
 540             }
 541         }
 542     } else /* srcLength >= 0 */ {
 543         const uint8_t *pSrcLimit = pSrc + srcLength;
 544         int32_t count;
 545
 546         /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
 547         for(;;) {
 548             /*
 549              * Each iteration of the inner loop progresses by at most 3 UTF-8
 550              * bytes and one UChar, for most characters.
 551              * For supplementary code points (4 & 2), which are rare,
 552              * there is an additional adjustment.
 553              */
 554             count = (int32_t)(pDestLimit - pDest);
 555             srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
 556             if(count > srcLength) {
 557                 count = srcLength; /* min(remaining dest, remaining src/3) */
 558             }
 559             if(count < 3) {
 560                 /*
 561                  * Too much overhead if we get near the end of the string,
 562                  * continue with the next loop.
 563                  */
 564                 break;
 565             }
 566
 567             do {
 568                 ch = *pSrc;
 569                 if(ch <= 0x7f){
 570                     *pDest++=(UChar)ch;
 571                     ++pSrc;
 572                 } else {
 573                     if(ch > 0xe0) {
 574                         if( /* handle U+1000..U+CFFF inline */
 575                             ch <= 0xec &&
 576                             (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
 577                             (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
 578                         ) {
 579                             /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 580                             *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
 581                             pSrc += 3;
 582                             continue;
 583                         }
 584                     } else if(ch < 0xe0) {
 585                         if( /* handle U+0080..U+07FF inline */
 586                             ch >= 0xc2 &&
 587                             (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
 588                         ) {
 589                             *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
 590                             pSrc += 2;
 591                             continue;
 592                         }
 593                     }
 594
 595                     if(ch >= 0xf0 || subchar > 0xffff) {
 596                         /*
 597                          * We may read up to six bytes and write up to two UChars,
 598                          * which we didn't account for with computing count,
 599                          * so we adjust it here.
 600                          */
 601                         if(--count == 0) {
 602                             break;
 603                         }
 604                     }
 605
 606                     /* function call for "complicated" and error cases */
 607                     ++pSrc; /* continue after the lead byte */
 608                     ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
 609                     if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
 610                         *pErrorCode = U_INVALID_CHAR_FOUND;
 611                         return NULL;
 612                     }else if(ch<=0xFFFF){
 613                         *(pDest++)=(UChar)ch;
 614                     }else{
 615                         *(pDest++)=U16_LEAD(ch);
 616                         *(pDest++)=U16_TRAIL(ch);
 617                     }
 618                 }
 619             } while(--count > 0);
 620         }
 621
 622         while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
 623             ch = *pSrc;
 624             if(ch <= 0x7f){
 625                 *pDest++=(UChar)ch;
 626                 ++pSrc;
 627             } else {
 628                 if(ch > 0xe0) {
 629                     if( /* handle U+1000..U+CFFF inline */
 630                         ch <= 0xec &&
 631                         ((pSrcLimit - pSrc) >= 3) &&
 632                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
 633                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
 634                     ) {
 635                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 636                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
 637                         pSrc += 3;
 638                         continue;
 639                     }
 640                 } else if(ch < 0xe0) {
 641                     if( /* handle U+0080..U+07FF inline */
 642                         ch >= 0xc2 &&
 643                         ((pSrcLimit - pSrc) >= 2) &&
 644                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
 645                     ) {
 646                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
 647                         pSrc += 2;
 648                         continue;
 649                     }
 650                 }
 651
 652                 /* function call for "complicated" and error cases */
 653                 ++pSrc; /* continue after the lead byte */
 654                 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
 655                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
 656                     *pErrorCode = U_INVALID_CHAR_FOUND;
 657                     return NULL;
 658                 }else if(ch<=0xFFFF){
 659                     *(pDest++)=(UChar)ch;
 660                 }else{
 661                     *(pDest++)=U16_LEAD(ch);
 662                     if(pDest<pDestLimit){
 663                         *(pDest++)=U16_TRAIL(ch);
 664                     }else{
 665                         reqLength++;
 666                         break;
 667                     }
 668                 }
 669             }
 670         }
 671         /* do not fill the dest buffer just count the UChars needed */
 672         while(pSrc < pSrcLimit){
 673             ch = *pSrc;
 674             if(ch <= 0x7f){
 675                 reqLength++;
 676                 ++pSrc;
 677             } else {
 678                 if(ch > 0xe0) {
 679                     if( /* handle U+1000..U+CFFF inline */
 680                         ch <= 0xec &&
 681                         ((pSrcLimit - pSrc) >= 3) &&
 682                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
 683                         (uint8_t)(pSrc[2] - 0x80) <= 0x3f
 684                     ) {
 685                         reqLength++;
 686                         pSrc += 3;
 687                         continue;
 688                     }
 689                 } else if(ch < 0xe0) {
 690                     if( /* handle U+0080..U+07FF inline */
 691                         ch >= 0xc2 &&
 692                         ((pSrcLimit - pSrc) >= 2) &&
 693                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f
 694                     ) {
 695                         reqLength++;
 696                         pSrc += 2;
 697                         continue;
 698                     }
 699                 }
 700
 701                 /* function call for "complicated" and error cases */
 702                 ++pSrc; /* continue after the lead byte */
 703                 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
 704                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
 705                     *pErrorCode = U_INVALID_CHAR_FOUND;
 706                     return NULL;
 707                 }
 708                 reqLength+=U16_LENGTH(ch);
 709             }
 710         }
 711     }
 712
 713     reqLength+=(int32_t)(pDest - dest);
 714
 715     if(pNumSubstitutions!=NULL) {
 716         *pNumSubstitutions=numSubstitutions;
 717     }
 718
 719     if(pDestLength){
 720         *pDestLength = reqLength;
 721     }
 722
 723     /* Terminate the buffer */
 724     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
 725
 726     return dest;
 727 }
 728
 729 U_CAPI UChar* U_EXPORT2
 730 u_strFromUTF8(UChar *dest,
 731               int32_t destCapacity,
 732               int32_t *pDestLength,
 733               const char* src,
 734               int32_t srcLength,
 735               UErrorCode *pErrorCode){
 736     return u_strFromUTF8WithSub(
 737             dest, destCapacity, pDestLength,
 738             src, srcLength,
 739             U_SENTINEL, NULL,
 740             pErrorCode);
 741 }
 742
 743 U_CAPI UChar * U_EXPORT2
 744 u_strFromUTF8Lenient(UChar *dest,
 745                      int32_t destCapacity,
 746                      int32_t *pDestLength,
 747                      const char *src,
 748                      int32_t srcLength,
 749                      UErrorCode *pErrorCode) {
 750     UChar *pDest = dest;
 751     UChar32 ch;
 752     int32_t reqLength = 0;
 753     uint8_t* pSrc = (uint8_t*) src;
 754
 755     /* args check */
 756     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
 757         return NULL;
 758     }
 759
 760     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
 761         (destCapacity<0) || (dest == NULL && destCapacity > 0)
 762     ) {
 763         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
 764         return NULL;
 765     }
 766
 767     if(srcLength < 0) {
 768         /* Transform a NUL-terminated string. */
 769         UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
 770         uint8_t t1, t2, t3; /* trail bytes */
 771
 772         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
 773             if(ch < 0xc0) {
 774                 /*
 775                  * ASCII, or a trail byte in lead position which is treated like
 776                  * a single-byte sequence for better character boundary
 777                  * resynchronization after illegal sequences.
 778                  */
 779                 *pDest++=(UChar)ch;
 780                 ++pSrc;
 781                 continue;
 782             } else if(ch < 0xe0) { /* U+0080..U+07FF */
 783                 if((t1 = pSrc[1]) != 0) {
 784                     /* 0x3080 = (0xc0 << 6) + 0x80 */
 785                     *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
 786                     pSrc += 2;
 787                     continue;
 788                 }
 789             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
 790                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
 791                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 792                     /* 0x2080 = (0x80 << 6) + 0x80 */
 793                     *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
 794                     pSrc += 3;
 795                     continue;
 796                 }
 797             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
 798                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
 799                     pSrc += 4;
 800                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
 801                     ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
 802                     *(pDest++) = U16_LEAD(ch);
 803                     if(pDest < pDestLimit) {
 804                         *(pDest++) = U16_TRAIL(ch);
 805                     } else {
 806                         reqLength = 1;
 807                         break;
 808                     }
 809                     continue;
 810                 }
 811             }
 812
 813             /* truncated character at the end */
 814             *pDest++ = 0xfffd;
 815             while(*++pSrc != 0) {}
 816             break;
 817         }
 818
 819         /* Pre-flight the rest of the string. */
 820         while((ch = *pSrc) != 0) {
 821             if(ch < 0xc0) {
 822                 /*
 823                  * ASCII, or a trail byte in lead position which is treated like
 824                  * a single-byte sequence for better character boundary
 825                  * resynchronization after illegal sequences.
 826                  */
 827                 ++reqLength;
 828                 ++pSrc;
 829                 continue;
 830             } else if(ch < 0xe0) { /* U+0080..U+07FF */
 831                 if(pSrc[1] != 0) {
 832                     ++reqLength;
 833                     pSrc += 2;
 834                     continue;
 835                 }
 836             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
 837                 if(pSrc[1] != 0 && pSrc[2] != 0) {
 838                     ++reqLength;
 839                     pSrc += 3;
 840                     continue;
 841                 }
 842             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
 843                 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
 844                     reqLength += 2;
 845                     pSrc += 4;
 846                     continue;
 847                 }
 848             }
 849
 850             /* truncated character at the end */
 851             ++reqLength;
 852             break;
 853         }
 854     } else /* srcLength >= 0 */ {
 855       const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL;
 856
 857         /*
 858          * This function requires that if srcLength is given, then it must be
 859          * destCapatity >= srcLength so that we need not check for
 860          * destination buffer overflow in the loop.
 861          */
 862         if(destCapacity < srcLength) {
 863             if(pDestLength != NULL) {
 864                 *pDestLength = srcLength; /* this likely overestimates the true destLength! */
 865             }
 866             *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
 867             return NULL;
 868         }
 869
 870         if((pSrcLimit - pSrc) >= 4) {
 871             pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
 872
 873             /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
 874             do {
 875                 ch = *pSrc++;
 876                 if(ch < 0xc0) {
 877                     /*
 878                      * ASCII, or a trail byte in lead position which is treated like
 879                      * a single-byte sequence for better character boundary
 880                      * resynchronization after illegal sequences.
 881                      */
 882                     *pDest++=(UChar)ch;
 883                 } else if(ch < 0xe0) { /* U+0080..U+07FF */
 884                     /* 0x3080 = (0xc0 << 6) + 0x80 */
 885                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
 886                 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
 887                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 888                     /* 0x2080 = (0x80 << 6) + 0x80 */
 889                     ch = (ch << 12) + (*pSrc++ << 6);
 890                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
 891                 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
 892                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
 893                     ch = (ch << 18) + (*pSrc++ << 12);
 894                     ch += *pSrc++ << 6;
 895                     ch += *pSrc++ - 0x3c82080;
 896                     *(pDest++) = U16_LEAD(ch);
 897                     *(pDest++) = U16_TRAIL(ch);
 898                 }
 899             } while(pSrc < pSrcLimit);
 900
 901             pSrcLimit += 3; /* restore original pSrcLimit */
 902         }
 903
 904         while(pSrc < pSrcLimit) {
 905             ch = *pSrc++;
 906             if(ch < 0xc0) {
 907                 /*
 908                  * ASCII, or a trail byte in lead position which is treated like
 909                  * a single-byte sequence for better character boundary
 910                  * resynchronization after illegal sequences.
 911                  */
 912                 *pDest++=(UChar)ch;
 913                 continue;
 914             } else if(ch < 0xe0) { /* U+0080..U+07FF */
 915                 if(pSrc < pSrcLimit) {
 916                     /* 0x3080 = (0xc0 << 6) + 0x80 */
 917                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
 918                     continue;
 919                 }
 920             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
 921                 if((pSrcLimit - pSrc) >= 2) {
 922                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 923                     /* 0x2080 = (0x80 << 6) + 0x80 */
 924                     ch = (ch << 12) + (*pSrc++ << 6);
 925                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
 926                     pSrc += 3;
 927                     continue;
 928                 }
 929             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
 930                 if((pSrcLimit - pSrc) >= 3) {
 931                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
 932                     ch = (ch << 18) + (*pSrc++ << 12);
 933                     ch += *pSrc++ << 6;
 934                     ch += *pSrc++ - 0x3c82080;
 935                     *(pDest++) = U16_LEAD(ch);
 936                     *(pDest++) = U16_TRAIL(ch);
 937                     pSrc += 4;
 938                     continue;
 939                 }
 940             }
 941
 942             /* truncated character at the end */
 943             *pDest++ = 0xfffd;
 944             break;
 945         }
 946     }
 947
 948     reqLength+=(int32_t)(pDest - dest);
 949
 950     if(pDestLength){
 951         *pDestLength = reqLength;
 952     }
 953
 954     /* Terminate the buffer */
 955     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
 956
 957     return dest;
 958 }
 959
 960 static inline uint8_t *
 961 _appendUTF8(uint8_t *pDest, UChar32 c) {
 962     /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
 963     if((c)<=0x7f) {
 964         *pDest++=(uint8_t)c;
 965     } else if(c<=0x7ff) {
 966         *pDest++=(uint8_t)((c>>6)|0xc0);
 967         *pDest++=(uint8_t)((c&0x3f)|0x80);
 968     } else if(c<=0xffff) {
 969         *pDest++=(uint8_t)((c>>12)|0xe0);
 970         *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
 971         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
 972     } else /* if((uint32_t)(c)<=0x10ffff) */ {
 973         *pDest++=(uint8_t)(((c)>>18)|0xf0);
 974         *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
 975         *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
 976         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
 977     }
 978     return pDest;
 979 }
 980
 981
 982 U_CAPI char* U_EXPORT2
 983 u_strToUTF8WithSub(char *dest,
 984             int32_t destCapacity,
 985             int32_t *pDestLength,
 986             const UChar *pSrc,
 987             int32_t srcLength,
 988             UChar32 subchar, int32_t *pNumSubstitutions,
 989             UErrorCode *pErrorCode){
 990     int32_t reqLength=0;
 991     uint32_t ch=0,ch2=0;
 992     uint8_t *pDest = (uint8_t *)dest;
 993     uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL;
 994     int32_t numSubstitutions;
 995
 996     /* args check */
 997     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
 998         return NULL;
 999     }
1000
1001     if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
1002         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
1003         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1004     ) {
1005         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1006         return NULL;
1007     }
1008
1009     if(pNumSubstitutions!=NULL) {
1010         *pNumSubstitutions=0;
1011     }
1012     numSubstitutions=0;
1013
1014     if(srcLength==-1) {
1015         while((ch=*pSrc)!=0) {
1016             ++pSrc;
1017             if(ch <= 0x7f) {
1018                 if(pDest<pDestLimit) {
1019                     *pDest++ = (uint8_t)ch;
1020                 } else {
1021                     reqLength = 1;
1022                     break;
1023                 }
1024             } else if(ch <= 0x7ff) {
1025                 if((pDestLimit - pDest) >= 2) {
1026                     *pDest++=(uint8_t)((ch>>6)|0xc0);
1027                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1028                 } else {
1029                     reqLength = 2;
1030                     break;
1031                 }
1032             } else if(ch <= 0xd7ff || ch >= 0xe000) {
1033                 if((pDestLimit - pDest) >= 3) {
1034                     *pDest++=(uint8_t)((ch>>12)|0xe0);
1035                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1036                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1037                 } else {
1038                     reqLength = 3;
1039                     break;
1040                 }
1041             } else /* ch is a surrogate */ {
1042                 int32_t length;
1043
1044                 /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
1045                 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
1046                     ++pSrc;
1047                     ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1048                 } else if(subchar>=0) {
1049                     ch=subchar;
1050                     ++numSubstitutions;
1051                 } else {
1052                     /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1053                     *pErrorCode = U_INVALID_CHAR_FOUND;
1054                     return NULL;
1055                 }
1056
1057                 length = U8_LENGTH(ch);
1058                 if((pDestLimit - pDest) >= length) {
1059                     /* convert and append*/
1060                     pDest=_appendUTF8(pDest, ch);
1061                 } else {
1062                     reqLength = length;
1063                     break;
1064                 }
1065             }
1066         }
1067         while((ch=*pSrc++)!=0) {
1068             if(ch<=0x7f) {
1069                 ++reqLength;
1070             } else if(ch<=0x7ff) {
1071                 reqLength+=2;
1072             } else if(!U16_IS_SURROGATE(ch)) {
1073                 reqLength+=3;
1074             } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
1075                 ++pSrc;
1076                 reqLength+=4;
1077             } else if(subchar>=0) {
1078                 reqLength+=U8_LENGTH(subchar);
1079                 ++numSubstitutions;
1080             } else {
1081                 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1082                 *pErrorCode = U_INVALID_CHAR_FOUND;
1083                 return NULL;
1084             }
1085         }
1086     } else {
1087         const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL;
1088         int32_t count;
1089
1090         /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1091         for(;;) {
1092             /*
1093              * Each iteration of the inner loop progresses by at most 3 UTF-8
1094              * bytes and one UChar, for most characters.
1095              * For supplementary code points (4 & 2), which are rare,
1096              * there is an additional adjustment.
1097              */
1098             count = (int32_t)((pDestLimit - pDest) / 3);
1099             srcLength = (int32_t)(pSrcLimit - pSrc);
1100             if(count > srcLength) {
1101                 count = srcLength; /* min(remaining dest/3, remaining src) */
1102             }
1103             if(count < 3) {
1104                 /*
1105                  * Too much overhead if we get near the end of the string,
1106                  * continue with the next loop.
1107                  */
1108                 break;
1109             }
1110             do {
1111                 ch=*pSrc++;
1112                 if(ch <= 0x7f) {
1113                     *pDest++ = (uint8_t)ch;
1114                 } else if(ch <= 0x7ff) {
1115                     *pDest++=(uint8_t)((ch>>6)|0xc0);
1116                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1117                 } else if(ch <= 0xd7ff || ch >= 0xe000) {
1118                     *pDest++=(uint8_t)((ch>>12)|0xe0);
1119                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1120                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1121                 } else /* ch is a surrogate */ {
1122                     /*
1123                      * We will read two UChars and probably output four bytes,
1124                      * which we didn't account for with computing count,
1125                      * so we adjust it here.
1126                      */
1127                     if(--count == 0) {
1128                         --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
1129                         break;  /* recompute count */
1130                     }
1131
1132                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
1133                         ++pSrc;
1134                         ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1135
1136                         /* writing 4 bytes per 2 UChars is ok */
1137                         *pDest++=(uint8_t)((ch>>18)|0xf0);
1138                         *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
1139                         *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1140                         *pDest++=(uint8_t)((ch&0x3f)|0x80);
1141                     } else  {
1142                         /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1143                         if(subchar>=0) {
1144                             ch=subchar;
1145                             ++numSubstitutions;
1146                         } else {
1147                             *pErrorCode = U_INVALID_CHAR_FOUND;
1148                             return NULL;
1149                         }
1150
1151                         /* convert and append*/
1152                         pDest=_appendUTF8(pDest, ch);
1153                     }
1154                 }
1155             } while(--count > 0);
1156         }
1157
1158         while(pSrc<pSrcLimit) {
1159             ch=*pSrc++;
1160             if(ch <= 0x7f) {
1161                 if(pDest<pDestLimit) {
1162                     *pDest++ = (uint8_t)ch;
1163                 } else {
1164                     reqLength = 1;
1165                     break;
1166                 }
1167             } else if(ch <= 0x7ff) {
1168                 if((pDestLimit - pDest) >= 2) {
1169                     *pDest++=(uint8_t)((ch>>6)|0xc0);
1170                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1171                 } else {
1172                     reqLength = 2;
1173                     break;
1174                 }
1175             } else if(ch <= 0xd7ff || ch >= 0xe000) {
1176                 if((pDestLimit - pDest) >= 3) {
1177                     *pDest++=(uint8_t)((ch>>12)|0xe0);
1178                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1179                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1180                 } else {
1181                     reqLength = 3;
1182                     break;
1183                 }
1184             } else /* ch is a surrogate */ {
1185                 int32_t length;
1186
1187                 if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
1188                     ++pSrc;
1189                     ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1190                 } else if(subchar>=0) {
1191                     ch=subchar;
1192                     ++numSubstitutions;
1193                 } else {
1194                     /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1195                     *pErrorCode = U_INVALID_CHAR_FOUND;
1196                     return NULL;
1197                 }
1198
1199                 length = U8_LENGTH(ch);
1200                 if((pDestLimit - pDest) >= length) {
1201                     /* convert and append*/
1202                     pDest=_appendUTF8(pDest, ch);
1203                 } else {
1204                     reqLength = length;
1205                     break;
1206                 }
1207             }
1208         }
1209         while(pSrc<pSrcLimit) {
1210             ch=*pSrc++;
1211             if(ch<=0x7f) {
1212                 ++reqLength;
1213             } else if(ch<=0x7ff) {
1214                 reqLength+=2;
1215             } else if(!U16_IS_SURROGATE(ch)) {
1216                 reqLength+=3;
1217             } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
1218                 ++pSrc;
1219                 reqLength+=4;
1220             } else if(subchar>=0) {
1221                 reqLength+=U8_LENGTH(subchar);
1222                 ++numSubstitutions;
1223             } else {
1224                 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1225                 *pErrorCode = U_INVALID_CHAR_FOUND;
1226                 return NULL;
1227             }
1228         }
1229     }
1230
1231     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1232
1233     if(pNumSubstitutions!=NULL) {
1234         *pNumSubstitutions=numSubstitutions;
1235     }
1236
1237     if(pDestLength){
1238         *pDestLength = reqLength;
1239     }
1240
1241     /* Terminate the buffer */
1242     u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1243     return dest;
1244 }
1245
1246 U_CAPI char* U_EXPORT2
1247 u_strToUTF8(char *dest,
1248             int32_t destCapacity,
1249             int32_t *pDestLength,
1250             const UChar *pSrc,
1251             int32_t srcLength,
1252             UErrorCode *pErrorCode){
1253     return u_strToUTF8WithSub(
1254             dest, destCapacity, pDestLength,
1255             pSrc, srcLength,
1256             U_SENTINEL, NULL,
1257             pErrorCode);
1258 }
1259
1260 U_CAPI UChar* U_EXPORT2
1261 u_strFromJavaModifiedUTF8WithSub(
1262         UChar *dest,
1263         int32_t destCapacity,
1264         int32_t *pDestLength,
1265         const char *src,
1266         int32_t srcLength,
1267         UChar32 subchar, int32_t *pNumSubstitutions,
1268         UErrorCode *pErrorCode) {
1269     UChar *pDest = dest;
1270     UChar *pDestLimit = dest+destCapacity;
1271     UChar32 ch;
1272     int32_t reqLength = 0;
1273     const uint8_t* pSrc = (const uint8_t*) src;
1274     const uint8_t *pSrcLimit;
1275     int32_t count;
1276     uint8_t t1, t2; /* trail bytes */
1277     int32_t numSubstitutions;
1278
1279     /* args check */
1280     if(U_FAILURE(*pErrorCode)){
1281         return NULL;
1282     }
1283     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1284         (dest==NULL && destCapacity!=0) || destCapacity<0 ||
1285         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1286     ) {
1287         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1288         return NULL;
1289     }
1290
1291     if(pNumSubstitutions!=NULL) {
1292         *pNumSubstitutions=0;
1293     }
1294     numSubstitutions=0;
1295
1296     if(srcLength < 0) {
1297         /*
1298          * Transform a NUL-terminated ASCII string.
1299          * Handle non-ASCII strings with slower code.
1300          */
1301         while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
1302             *pDest++=(UChar)ch;
1303             ++pSrc;
1304         }
1305         if(ch == 0) {
1306             reqLength=(int32_t)(pDest - dest);
1307             if(pDestLength) {
1308                 *pDestLength = reqLength;
1309             }
1310
1311             /* Terminate the buffer */
1312             u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1313             return dest;
1314         }
1315         srcLength = uprv_strlen((const char *)pSrc);
1316     }
1317
1318     /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1319     pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength;
1320     for(;;) {
1321         count = (int32_t)(pDestLimit - pDest);
1322         srcLength = (int32_t)(pSrcLimit - pSrc);
1323         if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
1324             /* fast ASCII loop */
1325             const uint8_t *prevSrc = pSrc;
1326             int32_t delta;
1327             while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
1328                 *pDest++=(UChar)ch;
1329                 ++pSrc;
1330             }
1331             delta = (int32_t)(pSrc - prevSrc);
1332             count -= delta;
1333             srcLength -= delta;
1334         }
1335         /*
1336          * Each iteration of the inner loop progresses by at most 3 UTF-8
1337          * bytes and one UChar.
1338          */
1339         srcLength /= 3;
1340         if(count > srcLength) {
1341             count = srcLength; /* min(remaining dest, remaining src/3) */
1342         }
1343         if(count < 3) {
1344             /*
1345              * Too much overhead if we get near the end of the string,
1346              * continue with the next loop.
1347              */
1348             break;
1349         }
1350         do {
1351             ch = *pSrc;
1352             if(ch <= 0x7f){
1353                 *pDest++=(UChar)ch;
1354                 ++pSrc;
1355             } else {
1356                 if(ch >= 0xe0) {
1357                     if( /* handle U+0000..U+FFFF inline */
1358                         ch <= 0xef &&
1359                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1360                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1361                     ) {
1362                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1363                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1364                         pSrc += 3;
1365                         continue;
1366                     }
1367                 } else {
1368                     if( /* handle U+0000..U+07FF inline */
1369                         ch >= 0xc0 &&
1370                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1371                     ) {
1372                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1373                         pSrc += 2;
1374                         continue;
1375                     }
1376                 }
1377
1378                 if(subchar < 0) {
1379                     *pErrorCode = U_INVALID_CHAR_FOUND;
1380                     return NULL;
1381                 } else if(subchar > 0xffff && --count == 0) {
1382                     /*
1383                      * We need to write two UChars, adjusted count for that,
1384                      * and ran out of space.
1385                      */
1386                     break;
1387                 } else {
1388                     /* function call for error cases */
1389                     ++pSrc; /* continue after the lead byte */
1390                     utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1391                     ++numSubstitutions;
1392                     if(subchar<=0xFFFF) {
1393                         *(pDest++)=(UChar)subchar;
1394                     } else {
1395                         *(pDest++)=U16_LEAD(subchar);
1396                         *(pDest++)=U16_TRAIL(subchar);
1397                     }
1398                 }
1399             }
1400         } while(--count > 0);
1401     }
1402
1403     while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
1404         ch = *pSrc;
1405         if(ch <= 0x7f){
1406             *pDest++=(UChar)ch;
1407             ++pSrc;
1408         } else {
1409             if(ch >= 0xe0) {
1410                 if( /* handle U+0000..U+FFFF inline */
1411                     ch <= 0xef &&
1412                     ((pSrcLimit - pSrc) >= 3) &&
1413                     (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1414                     (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1415                 ) {
1416                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1417                     *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1418                     pSrc += 3;
1419                     continue;
1420                 }
1421             } else {
1422                 if( /* handle U+0000..U+07FF inline */
1423                     ch >= 0xc0 &&
1424                     ((pSrcLimit - pSrc) >= 2) &&
1425                     (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1426                 ) {
1427                     *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1428                     pSrc += 2;
1429                     continue;
1430                 }
1431             }
1432
1433             if(subchar < 0) {
1434                 *pErrorCode = U_INVALID_CHAR_FOUND;
1435                 return NULL;
1436             } else {
1437                 /* function call for error cases */
1438                 ++pSrc; /* continue after the lead byte */
1439                 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1440                 ++numSubstitutions;
1441                 if(subchar<=0xFFFF) {
1442                     *(pDest++)=(UChar)subchar;
1443                 } else {
1444                     *(pDest++)=U16_LEAD(subchar);
1445                     if(pDest<pDestLimit) {
1446                         *(pDest++)=U16_TRAIL(subchar);
1447                     } else {
1448                         reqLength++;
1449                         break;
1450                     }
1451                 }
1452             }
1453         }
1454     }
1455
1456     /* do not fill the dest buffer just count the UChars needed */
1457     while(pSrc < pSrcLimit){
1458         ch = *pSrc;
1459         if(ch <= 0x7f) {
1460             reqLength++;
1461             ++pSrc;
1462         } else {
1463             if(ch >= 0xe0) {
1464                 if( /* handle U+0000..U+FFFF inline */
1465                     ch <= 0xef &&
1466                     ((pSrcLimit - pSrc) >= 3) &&
1467                     (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
1468                     (uint8_t)(pSrc[2] - 0x80) <= 0x3f
1469                 ) {
1470                     reqLength++;
1471                     pSrc += 3;
1472                     continue;
1473                 }
1474             } else {
1475                 if( /* handle U+0000..U+07FF inline */
1476                     ch >= 0xc0 &&
1477                     ((pSrcLimit - pSrc) >= 2) &&
1478                     (uint8_t)(pSrc[1] - 0x80) <= 0x3f
1479                 ) {
1480                     reqLength++;
1481                     pSrc += 2;
1482                     continue;
1483                 }
1484             }
1485
1486             if(subchar < 0) {
1487                 *pErrorCode = U_INVALID_CHAR_FOUND;
1488                 return NULL;
1489             } else {
1490                 /* function call for error cases */
1491                 ++pSrc; /* continue after the lead byte */
1492                 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1493                 ++numSubstitutions;
1494                 reqLength+=U16_LENGTH(ch);
1495             }
1496         }
1497     }
1498
1499     if(pNumSubstitutions!=NULL) {
1500         *pNumSubstitutions=numSubstitutions;
1501     }
1502
1503     reqLength+=(int32_t)(pDest - dest);
1504     if(pDestLength) {
1505         *pDestLength = reqLength;
1506     }
1507
1508     /* Terminate the buffer */
1509     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1510     return dest;
1511 }
1512
1513 U_CAPI char* U_EXPORT2
1514 u_strToJavaModifiedUTF8(
1515         char *dest,
1516         int32_t destCapacity,
1517         int32_t *pDestLength,
1518         const UChar *src,
1519         int32_t srcLength,
1520         UErrorCode *pErrorCode) {
1521     int32_t reqLength=0;
1522     uint32_t ch=0;
1523     uint8_t *pDest = (uint8_t *)dest;
1524     uint8_t *pDestLimit = pDest + destCapacity;
1525     const UChar *pSrcLimit;
1526     int32_t count;
1527
1528     /* args check */
1529     if(U_FAILURE(*pErrorCode)){
1530         return NULL;
1531     }
1532     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1533         (dest==NULL && destCapacity!=0) || destCapacity<0
1534     ) {
1535         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1536         return NULL;
1537     }
1538
1539     if(srcLength==-1) {
1540         /* Convert NUL-terminated ASCII, then find the string length. */
1541         while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
1542             *pDest++ = (uint8_t)ch;
1543             ++src;
1544         }
1545         if(ch == 0) {
1546             reqLength=(int32_t)(pDest - (uint8_t *)dest);
1547             if(pDestLength) {
1548                 *pDestLength = reqLength;
1549             }
1550
1551             /* Terminate the buffer */
1552             u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1553             return dest;
1554         }
1555         srcLength = u_strlen(src);
1556     }
1557
1558     /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1559     pSrcLimit = (src!=NULL)?(src+srcLength):NULL;
1560     for(;;) {
1561         count = (int32_t)(pDestLimit - pDest);
1562         srcLength = (int32_t)(pSrcLimit - src);
1563         if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
1564             /* fast ASCII loop */
1565             const UChar *prevSrc = src;
1566             int32_t delta;
1567             while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
1568                 *pDest++=(uint8_t)ch;
1569                 ++src;
1570             }
1571             delta = (int32_t)(src - prevSrc);
1572             count -= delta;
1573             srcLength -= delta;
1574         }
1575         /*
1576          * Each iteration of the inner loop progresses by at most 3 UTF-8
1577          * bytes and one UChar.
1578          */
1579         count /= 3;
1580         if(count > srcLength) {
1581             count = srcLength; /* min(remaining dest/3, remaining src) */
1582         }
1583         if(count < 3) {
1584             /*
1585              * Too much overhead if we get near the end of the string,
1586              * continue with the next loop.
1587              */
1588             break;
1589         }
1590         do {
1591             ch=*src++;
1592             if(ch <= 0x7f && ch != 0) {
1593                 *pDest++ = (uint8_t)ch;
1594             } else if(ch <= 0x7ff) {
1595                 *pDest++=(uint8_t)((ch>>6)|0xc0);
1596                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1597             } else {
1598                 *pDest++=(uint8_t)((ch>>12)|0xe0);
1599                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1600                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1601             }
1602         } while(--count > 0);
1603     }
1604
1605     while(src<pSrcLimit) {
1606         ch=*src++;
1607         if(ch <= 0x7f && ch != 0) {
1608             if(pDest<pDestLimit) {
1609                 *pDest++ = (uint8_t)ch;
1610             } else {
1611                 reqLength = 1;
1612                 break;
1613             }
1614         } else if(ch <= 0x7ff) {
1615             if((pDestLimit - pDest) >= 2) {
1616                 *pDest++=(uint8_t)((ch>>6)|0xc0);
1617                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1618             } else {
1619                 reqLength = 2;
1620                 break;
1621             }
1622         } else {
1623             if((pDestLimit - pDest) >= 3) {
1624                 *pDest++=(uint8_t)((ch>>12)|0xe0);
1625                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1626                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1627             } else {
1628                 reqLength = 3;
1629                 break;
1630             }
1631         }
1632     }
1633     while(src<pSrcLimit) {
1634         ch=*src++;
1635         if(ch <= 0x7f && ch != 0) {
1636             ++reqLength;
1637         } else if(ch<=0x7ff) {
1638             reqLength+=2;
1639         } else {
1640             reqLength+=3;
1641         }
1642     }
1643
1644     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1645     if(pDestLength){
1646         *pDestLength = reqLength;
1647     }
1648
1649     /* Terminate the buffer */
1650     u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1651     return dest;
1652 }