icuSources/common/ustrtrns.cpp

   1 /*
   2 ******************************************************************************
   3 *
   4 *   Copyright (C) 2001-2013, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 ******************************************************************************
   8 *
   9 * File ustrtrns.cpp
  10 *
  11 * Modification History:
  12 *
  13 *   Date        Name        Description
  14 *   9/10/2001    Ram    Creation.
  15 ******************************************************************************
  16 */
  17
  18 /*******************************************************************************
  19  *
  20  * u_strTo* and u_strFrom* APIs
  21  * WCS functions moved to ustr_wcs.c for better modularization
  22  *
  23  *******************************************************************************
  24  */
  25
  26
  27 #include "unicode/putil.h"
  28 #include "unicode/ustring.h"
  29 #include "unicode/utf.h"
  30 #include "unicode/utf8.h"
  31 #include "unicode/utf16.h"
  32 #include "cstring.h"
  33 #include "cmemory.h"
  34 #include "ustr_imp.h"
  35 #include "uassert.h"
  36
  37 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
  38
  39 U_CAPI UChar* U_EXPORT2
  40 u_strFromUTF32WithSub(UChar *dest,
  41                int32_t destCapacity,
  42                int32_t *pDestLength,
  43                const UChar32 *src,
  44                int32_t srcLength,
  45                UChar32 subchar, int32_t *pNumSubstitutions,
  46                UErrorCode *pErrorCode) {
  47     const UChar32 *srcLimit;
  48     UChar32 ch;
  49     UChar *destLimit;
  50     UChar *pDest;
  51     int32_t reqLength;
  52     int32_t numSubstitutions;
  53
  54     /* args check */
  55     if(U_FAILURE(*pErrorCode)){
  56         return NULL;
  57     }
  58     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
  59         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
  60         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
  61     ) {
  62         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
  63         return NULL;
  64     }
  65
  66     if(pNumSubstitutions != NULL) {
  67         *pNumSubstitutions = 0;
  68     }
  69
  70     pDest = dest;
  71     destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
  72     reqLength = 0;
  73     numSubstitutions = 0;
  74
  75     if(srcLength < 0) {
  76         /* simple loop for conversion of a NUL-terminated BMP string */
  77         while((ch=*src) != 0 &&
  78               ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
  79             ++src;
  80             if(pDest < destLimit) {
  81                 *pDest++ = (UChar)ch;
  82             } else {
  83                 ++reqLength;
  84             }
  85         }
  86         srcLimit = src;
  87         if(ch != 0) {
  88             /* "complicated" case, find the end of the remaining string */
  89             while(*++srcLimit != 0) {}
  90         }
  91     } else {
  92       srcLimit = (src!=NULL)?(src + srcLength):NULL;
  93     }
  94
  95     /* convert with length */
  96     while(src < srcLimit) {
  97         ch = *src++;
  98         do {
  99             /* usually "loops" once; twice only for writing subchar */
 100             if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
 101                 if(pDest < destLimit) {
 102                     *pDest++ = (UChar)ch;
 103                 } else {
 104                     ++reqLength;
 105                 }
 106                 break;
 107             } else if(0x10000 <= ch && ch <= 0x10ffff) {
 108                 if(pDest!=NULL && ((pDest + 2) <= destLimit)) {
 109                     *pDest++ = U16_LEAD(ch);
 110                     *pDest++ = U16_TRAIL(ch);
 111                 } else {
 112                     reqLength += 2;
 113                 }
 114                 break;
 115             } else if((ch = subchar) < 0) {
 116                 /* surrogate code point, or not a Unicode code point at all */
 117                 *pErrorCode = U_INVALID_CHAR_FOUND;
 118                 return NULL;
 119             } else {
 120                 ++numSubstitutions;
 121             }
 122         } while(TRUE);
 123     }
 124
 125     reqLength += (int32_t)(pDest - dest);
 126     if(pDestLength) {
 127         *pDestLength = reqLength;
 128     }
 129     if(pNumSubstitutions != NULL) {
 130         *pNumSubstitutions = numSubstitutions;
 131     }
 132
 133     /* Terminate the buffer */
 134     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
 135
 136     return dest;
 137 }
 138
 139 U_CAPI UChar* U_EXPORT2
 140 u_strFromUTF32(UChar *dest,
 141                int32_t destCapacity,
 142                int32_t *pDestLength,
 143                const UChar32 *src,
 144                int32_t srcLength,
 145                UErrorCode *pErrorCode) {
 146     return u_strFromUTF32WithSub(
 147             dest, destCapacity, pDestLength,
 148             src, srcLength,
 149             U_SENTINEL, NULL,
 150             pErrorCode);
 151 }
 152
 153 U_CAPI UChar32* U_EXPORT2
 154 u_strToUTF32WithSub(UChar32 *dest,
 155              int32_t destCapacity,
 156              int32_t *pDestLength,
 157              const UChar *src,
 158              int32_t srcLength,
 159              UChar32 subchar, int32_t *pNumSubstitutions,
 160              UErrorCode *pErrorCode) {
 161     const UChar *srcLimit;
 162     UChar32 ch;
 163     UChar ch2;
 164     UChar32 *destLimit;
 165     UChar32 *pDest;
 166     int32_t reqLength;
 167     int32_t numSubstitutions;
 168
 169     /* args check */
 170     if(U_FAILURE(*pErrorCode)){
 171         return NULL;
 172     }
 173     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
 174         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
 175         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
 176     ) {
 177         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
 178         return NULL;
 179     }
 180
 181     if(pNumSubstitutions != NULL) {
 182         *pNumSubstitutions = 0;
 183     }
 184
 185     pDest = dest;
 186     destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
 187     reqLength = 0;
 188     numSubstitutions = 0;
 189
 190     if(srcLength < 0) {
 191         /* simple loop for conversion of a NUL-terminated BMP string */
 192         while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
 193             ++src;
 194             if(pDest < destLimit) {
 195                 *pDest++ = ch;
 196             } else {
 197                 ++reqLength;
 198             }
 199         }
 200         srcLimit = src;
 201         if(ch != 0) {
 202             /* "complicated" case, find the end of the remaining string */
 203             while(*++srcLimit != 0) {}
 204         }
 205     } else {
 206         srcLimit = (src!=NULL)?(src + srcLength):NULL;
 207     }
 208
 209     /* convert with length */
 210     while(src < srcLimit) {
 211         ch = *src++;
 212         if(!U16_IS_SURROGATE(ch)) {
 213             /* write or count ch below */
 214         } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
 215             ++src;
 216             ch = U16_GET_SUPPLEMENTARY(ch, ch2);
 217         } else if((ch = subchar) < 0) {
 218             /* unpaired surrogate */
 219             *pErrorCode = U_INVALID_CHAR_FOUND;
 220             return NULL;
 221         } else {
 222             ++numSubstitutions;
 223         }
 224         if(pDest < destLimit) {
 225             *pDest++ = ch;
 226         } else {
 227             ++reqLength;
 228         }
 229     }
 230
 231     reqLength += (int32_t)(pDest - dest);
 232     if(pDestLength) {
 233         *pDestLength = reqLength;
 234     }
 235     if(pNumSubstitutions != NULL) {
 236         *pNumSubstitutions = numSubstitutions;
 237     }
 238
 239     /* Terminate the buffer */
 240     u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
 241
 242     return dest;
 243 }
 244
 245 U_CAPI UChar32* U_EXPORT2
 246 u_strToUTF32(UChar32 *dest,
 247              int32_t destCapacity,
 248              int32_t *pDestLength,
 249              const UChar *src,
 250              int32_t srcLength,
 251              UErrorCode *pErrorCode) {
 252     return u_strToUTF32WithSub(
 253             dest, destCapacity, pDestLength,
 254             src, srcLength,
 255             U_SENTINEL, NULL,
 256             pErrorCode);
 257 }
 258
 259 /* for utf8_nextCharSafeBodyTerminated() */
 260 static const UChar32
 261 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
 262
 263 /*
 264  * Version of utf8_nextCharSafeBody() with the following differences:
 265  * - checks for NUL termination instead of length
 266  * - works with pointers instead of indexes
 267  * - always strict (strict==-1)
 268  *
 269  * *ps points to after the lead byte and will be moved to after the last trail byte.
 270  * c is the lead byte.
 271  * @return the code point, or U_SENTINEL
 272  */
 273 static UChar32
 274 utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
 275     const uint8_t *s=*ps;
 276     uint8_t trail, illegal=0;
 277     uint8_t count=U8_COUNT_TRAIL_BYTES(c);
 278     U_ASSERT(count<6);
 279     U8_MASK_LEAD_BYTE((c), count);
 280     /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
 281     switch(count) {
 282     /* each branch falls through to the next one */
 283     case 5:
 284     case 4:
 285         /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
 286         illegal=1;
 287         break;
 288     case 3:
 289         trail=(uint8_t)(*s++ - 0x80);
 290         c=(c<<6)|trail;
 291         if(trail>0x3f || c>=0x110) {
 292             /* not a trail byte, or code point>0x10ffff (outside Unicode) */
 293             illegal=1;
 294             break;
 295         }
 296     case 2: /*fall through*/
 297         trail=(uint8_t)(*s++ - 0x80);
 298         if(trail>0x3f) {
 299             /* not a trail byte */
 300             illegal=1;
 301             break;
 302         }
 303         c=(c<<6)|trail;
 304     case 1: /*fall through*/
 305         trail=(uint8_t)(*s++ - 0x80);
 306         if(trail>0x3f) {
 307             /* not a trail byte */
 308             illegal=1;
 309         }
 310         c=(c<<6)|trail;
 311         break;
 312     case 0:
 313         return U_SENTINEL;
 314     /* no default branch to optimize switch()  - all values are covered */
 315     }
 316
 317     /* correct sequence - all trail bytes have (b7..b6)==(10)? */
 318     /* illegal is also set if count>=4 */
 319     if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
 320         /* error handling */
 321         /* don't go beyond this sequence */
 322         s=*ps;
 323         while(count>0 && U8_IS_TRAIL(*s)) {
 324             ++s;
 325             --count;
 326         }
 327         c=U_SENTINEL;
 328     }
 329     *ps=s;
 330     return c;
 331 }
 332
 333 /*
 334  * Version of utf8_nextCharSafeBody() with the following differences:
 335  * - works with pointers instead of indexes
 336  * - always strict (strict==-1)
 337  *
 338  * *ps points to after the lead byte and will be moved to after the last trail byte.
 339  * c is the lead byte.
 340  * @return the code point, or U_SENTINEL
 341  */
 342 static UChar32
 343 utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
 344     const uint8_t *s=*ps;
 345     uint8_t trail, illegal=0;
 346     uint8_t count=U8_COUNT_TRAIL_BYTES(c);
 347     if((limit-s)>=count) {
 348         U8_MASK_LEAD_BYTE((c), count);
 349         /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
 350         switch(count) {
 351         /* each branch falls through to the next one */
 352         case 5:
 353         case 4:
 354             /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
 355             illegal=1;
 356             break;
 357         case 3:
 358             trail=*s++;
 359             c=(c<<6)|(trail&0x3f);
 360             if(c<0x110) {
 361                 illegal|=(trail&0xc0)^0x80;
 362             } else {
 363                 /* code point>0x10ffff, outside Unicode */
 364                 illegal=1;
 365                 break;
 366             }
 367         case 2: /*fall through*/
 368             trail=*s++;
 369             c=(c<<6)|(trail&0x3f);
 370             illegal|=(trail&0xc0)^0x80;
 371         case 1: /*fall through*/
 372             trail=*s++;
 373             c=(c<<6)|(trail&0x3f);
 374             illegal|=(trail&0xc0)^0x80;
 375             break;
 376         case 0:
 377             return U_SENTINEL;
 378         /* no default branch to optimize switch()  - all values are covered */
 379         }
 380     } else {
 381         illegal=1; /* too few bytes left */
 382     }
 383
 384     /* correct sequence - all trail bytes have (b7..b6)==(10)? */
 385     /* illegal is also set if count>=4 */
 386     U_ASSERT(illegal || count<LENGTHOF(utf8_minLegal));
 387     if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
 388         /* error handling */
 389         /* don't go beyond this sequence */
 390         s=*ps;
 391         while(count>0 && s<limit && U8_IS_TRAIL(*s)) {
 392             ++s;
 393             --count;
 394         }
 395         c=U_SENTINEL;
 396     }
 397     *ps=s;
 398     return c;
 399 }
 400
 401 U_CAPI UChar* U_EXPORT2
 402 u_strFromUTF8WithSub(UChar *dest,
 403               int32_t destCapacity,
 404               int32_t *pDestLength,
 405               const char* src,
 406               int32_t srcLength,
 407               UChar32 subchar, int32_t *pNumSubstitutions,
 408               UErrorCode *pErrorCode){
 409     UChar *pDest = dest;
 410     UChar *pDestLimit = dest+destCapacity;
 411     UChar32 ch;
 412     int32_t reqLength = 0;
 413     const uint8_t* pSrc = (const uint8_t*) src;
 414     uint8_t t1, t2; /* trail bytes */
 415     int32_t numSubstitutions;
 416
 417     /* args check */
 418     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
 419         return NULL;
 420     }
 421
 422     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
 423         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
 424         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
 425     ) {
 426         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
 427         return NULL;
 428     }
 429
 430     if(pNumSubstitutions!=NULL) {
 431         *pNumSubstitutions=0;
 432     }
 433     numSubstitutions=0;
 434
 435     /*
 436      * Inline processing of UTF-8 byte sequences:
 437      *
 438      * Byte sequences for the most common characters are handled inline in
 439      * the conversion loops. In order to reduce the path lengths for those
 440      * characters, the tests are arranged in a kind of binary search.
 441      * ASCII (<=0x7f) is checked first, followed by the dividing point
 442      * between 2- and 3-byte sequences (0xe0).
 443      * The 3-byte branch is tested first to speed up CJK text.
 444      * The compiler should combine the subtractions for the two tests for 0xe0.
 445      * Each branch then tests for the other end of its range.
 446      */
 447
 448     if(srcLength < 0){
 449         /*
 450          * Transform a NUL-terminated string.
 451          * The code explicitly checks for NULs only in the lead byte position.
 452          * A NUL byte in the trail byte position fails the trail byte range check anyway.
 453          */
 454         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
 455             if(ch <= 0x7f){
 456                 *pDest++=(UChar)ch;
 457                 ++pSrc;
 458             } else {
 459                 if(ch > 0xe0) {
 460                     if( /* handle U+1000..U+CFFF inline */
 461                         ch <= 0xec &&
 462                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
 463                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
 464                     ) {
 465                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 466                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
 467                         pSrc += 3;
 468                         continue;
 469                     }
 470                 } else if(ch < 0xe0) {
 471                     if( /* handle U+0080..U+07FF inline */
 472                         ch >= 0xc2 &&
 473                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
 474                     ) {
 475                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
 476                         pSrc += 2;
 477                         continue;
 478                     }
 479                 }
 480
 481                 /* function call for "complicated" and error cases */
 482                 ++pSrc; /* continue after the lead byte */
 483                 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
 484                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
 485                     *pErrorCode = U_INVALID_CHAR_FOUND;
 486                     return NULL;
 487                 } else if(ch<=0xFFFF) {
 488                     *(pDest++)=(UChar)ch;
 489                 } else {
 490                     *(pDest++)=U16_LEAD(ch);
 491                     if(pDest<pDestLimit) {
 492                         *(pDest++)=U16_TRAIL(ch);
 493                     } else {
 494                         reqLength++;
 495                         break;
 496                     }
 497                 }
 498             }
 499         }
 500
 501         /* Pre-flight the rest of the string. */
 502         while((ch = *pSrc) != 0) {
 503             if(ch <= 0x7f){
 504                 ++reqLength;
 505                 ++pSrc;
 506             } else {
 507                 if(ch > 0xe0) {
 508                     if( /* handle U+1000..U+CFFF inline */
 509                         ch <= 0xec &&
 510                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
 511                         (uint8_t)(pSrc[2] - 0x80) <= 0x3f
 512                     ) {
 513                         ++reqLength;
 514                         pSrc += 3;
 515                         continue;
 516                     }
 517                 } else if(ch < 0xe0) {
 518                     if( /* handle U+0080..U+07FF inline */
 519                         ch >= 0xc2 &&
 520                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f
 521                     ) {
 522                         ++reqLength;
 523                         pSrc += 2;
 524                         continue;
 525                     }
 526                 }
 527
 528                 /* function call for "complicated" and error cases */
 529                 ++pSrc; /* continue after the lead byte */
 530                 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
 531                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
 532                     *pErrorCode = U_INVALID_CHAR_FOUND;
 533                     return NULL;
 534                 }
 535                 reqLength += U16_LENGTH(ch);
 536             }
 537         }
 538     } else /* srcLength >= 0 */ {
 539         const uint8_t *pSrcLimit = pSrc + srcLength;
 540         int32_t count;
 541
 542         /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
 543         for(;;) {
 544             /*
 545              * Each iteration of the inner loop progresses by at most 3 UTF-8
 546              * bytes and one UChar, for most characters.
 547              * For supplementary code points (4 & 2), which are rare,
 548              * there is an additional adjustment.
 549              */
 550             count = (int32_t)(pDestLimit - pDest);
 551             srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
 552             if(count > srcLength) {
 553                 count = srcLength; /* min(remaining dest, remaining src/3) */
 554             }
 555             if(count < 3) {
 556                 /*
 557                  * Too much overhead if we get near the end of the string,
 558                  * continue with the next loop.
 559                  */
 560                 break;
 561             }
 562
 563             do {
 564                 ch = *pSrc;
 565                 if(ch <= 0x7f){
 566                     *pDest++=(UChar)ch;
 567                     ++pSrc;
 568                 } else {
 569                     if(ch > 0xe0) {
 570                         if( /* handle U+1000..U+CFFF inline */
 571                             ch <= 0xec &&
 572                             (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
 573                             (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
 574                         ) {
 575                             /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 576                             *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
 577                             pSrc += 3;
 578                             continue;
 579                         }
 580                     } else if(ch < 0xe0) {
 581                         if( /* handle U+0080..U+07FF inline */
 582                             ch >= 0xc2 &&
 583                             (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
 584                         ) {
 585                             *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
 586                             pSrc += 2;
 587                             continue;
 588                         }
 589                     }
 590
 591                     if(ch >= 0xf0 || subchar > 0xffff) {
 592                         /*
 593                          * We may read up to six bytes and write up to two UChars,
 594                          * which we didn't account for with computing count,
 595                          * so we adjust it here.
 596                          */
 597                         if(--count == 0) {
 598                             break;
 599                         }
 600                     }
 601
 602                     /* function call for "complicated" and error cases */
 603                     ++pSrc; /* continue after the lead byte */
 604                     ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
 605                     if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
 606                         *pErrorCode = U_INVALID_CHAR_FOUND;
 607                         return NULL;
 608                     }else if(ch<=0xFFFF){
 609                         *(pDest++)=(UChar)ch;
 610                     }else{
 611                         *(pDest++)=U16_LEAD(ch);
 612                         *(pDest++)=U16_TRAIL(ch);
 613                     }
 614                 }
 615             } while(--count > 0);
 616         }
 617
 618         while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
 619             ch = *pSrc;
 620             if(ch <= 0x7f){
 621                 *pDest++=(UChar)ch;
 622                 ++pSrc;
 623             } else {
 624                 if(ch > 0xe0) {
 625                     if( /* handle U+1000..U+CFFF inline */
 626                         ch <= 0xec &&
 627                         ((pSrcLimit - pSrc) >= 3) &&
 628                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
 629                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
 630                     ) {
 631                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 632                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
 633                         pSrc += 3;
 634                         continue;
 635                     }
 636                 } else if(ch < 0xe0) {
 637                     if( /* handle U+0080..U+07FF inline */
 638                         ch >= 0xc2 &&
 639                         ((pSrcLimit - pSrc) >= 2) &&
 640                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
 641                     ) {
 642                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
 643                         pSrc += 2;
 644                         continue;
 645                     }
 646                 }
 647
 648                 /* function call for "complicated" and error cases */
 649                 ++pSrc; /* continue after the lead byte */
 650                 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
 651                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
 652                     *pErrorCode = U_INVALID_CHAR_FOUND;
 653                     return NULL;
 654                 }else if(ch<=0xFFFF){
 655                     *(pDest++)=(UChar)ch;
 656                 }else{
 657                     *(pDest++)=U16_LEAD(ch);
 658                     if(pDest<pDestLimit){
 659                         *(pDest++)=U16_TRAIL(ch);
 660                     }else{
 661                         reqLength++;
 662                         break;
 663                     }
 664                 }
 665             }
 666         }
 667         /* do not fill the dest buffer just count the UChars needed */
 668         while(pSrc < pSrcLimit){
 669             ch = *pSrc;
 670             if(ch <= 0x7f){
 671                 reqLength++;
 672                 ++pSrc;
 673             } else {
 674                 if(ch > 0xe0) {
 675                     if( /* handle U+1000..U+CFFF inline */
 676                         ch <= 0xec &&
 677                         ((pSrcLimit - pSrc) >= 3) &&
 678                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
 679                         (uint8_t)(pSrc[2] - 0x80) <= 0x3f
 680                     ) {
 681                         reqLength++;
 682                         pSrc += 3;
 683                         continue;
 684                     }
 685                 } else if(ch < 0xe0) {
 686                     if( /* handle U+0080..U+07FF inline */
 687                         ch >= 0xc2 &&
 688                         ((pSrcLimit - pSrc) >= 2) &&
 689                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f
 690                     ) {
 691                         reqLength++;
 692                         pSrc += 2;
 693                         continue;
 694                     }
 695                 }
 696
 697                 /* function call for "complicated" and error cases */
 698                 ++pSrc; /* continue after the lead byte */
 699                 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
 700                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
 701                     *pErrorCode = U_INVALID_CHAR_FOUND;
 702                     return NULL;
 703                 }
 704                 reqLength+=U16_LENGTH(ch);
 705             }
 706         }
 707     }
 708
 709     reqLength+=(int32_t)(pDest - dest);
 710
 711     if(pNumSubstitutions!=NULL) {
 712         *pNumSubstitutions=numSubstitutions;
 713     }
 714
 715     if(pDestLength){
 716         *pDestLength = reqLength;
 717     }
 718
 719     /* Terminate the buffer */
 720     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
 721
 722     return dest;
 723 }
 724
 725 U_CAPI UChar* U_EXPORT2
 726 u_strFromUTF8(UChar *dest,
 727               int32_t destCapacity,
 728               int32_t *pDestLength,
 729               const char* src,
 730               int32_t srcLength,
 731               UErrorCode *pErrorCode){
 732     return u_strFromUTF8WithSub(
 733             dest, destCapacity, pDestLength,
 734             src, srcLength,
 735             U_SENTINEL, NULL,
 736             pErrorCode);
 737 }
 738
 739 U_CAPI UChar * U_EXPORT2
 740 u_strFromUTF8Lenient(UChar *dest,
 741                      int32_t destCapacity,
 742                      int32_t *pDestLength,
 743                      const char *src,
 744                      int32_t srcLength,
 745                      UErrorCode *pErrorCode) {
 746     UChar *pDest = dest;
 747     UChar32 ch;
 748     int32_t reqLength = 0;
 749     uint8_t* pSrc = (uint8_t*) src;
 750
 751     /* args check */
 752     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
 753         return NULL;
 754     }
 755
 756     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
 757         (destCapacity<0) || (dest == NULL && destCapacity > 0)
 758     ) {
 759         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
 760         return NULL;
 761     }
 762
 763     if(srcLength < 0) {
 764         /* Transform a NUL-terminated string. */
 765         UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
 766         uint8_t t1, t2, t3; /* trail bytes */
 767
 768         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
 769             if(ch < 0xc0) {
 770                 /*
 771                  * ASCII, or a trail byte in lead position which is treated like
 772                  * a single-byte sequence for better character boundary
 773                  * resynchronization after illegal sequences.
 774                  */
 775                 *pDest++=(UChar)ch;
 776                 ++pSrc;
 777                 continue;
 778             } else if(ch < 0xe0) { /* U+0080..U+07FF */
 779                 if((t1 = pSrc[1]) != 0) {
 780                     /* 0x3080 = (0xc0 << 6) + 0x80 */
 781                     *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
 782                     pSrc += 2;
 783                     continue;
 784                 }
 785             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
 786                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
 787                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 788                     /* 0x2080 = (0x80 << 6) + 0x80 */
 789                     *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
 790                     pSrc += 3;
 791                     continue;
 792                 }
 793             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
 794                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
 795                     pSrc += 4;
 796                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
 797                     ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
 798                     *(pDest++) = U16_LEAD(ch);
 799                     if(pDest < pDestLimit) {
 800                         *(pDest++) = U16_TRAIL(ch);
 801                     } else {
 802                         reqLength = 1;
 803                         break;
 804                     }
 805                     continue;
 806                 }
 807             }
 808
 809             /* truncated character at the end */
 810             *pDest++ = 0xfffd;
 811             while(*++pSrc != 0) {}
 812             break;
 813         }
 814
 815         /* Pre-flight the rest of the string. */
 816         while((ch = *pSrc) != 0) {
 817             if(ch < 0xc0) {
 818                 /*
 819                  * ASCII, or a trail byte in lead position which is treated like
 820                  * a single-byte sequence for better character boundary
 821                  * resynchronization after illegal sequences.
 822                  */
 823                 ++reqLength;
 824                 ++pSrc;
 825                 continue;
 826             } else if(ch < 0xe0) { /* U+0080..U+07FF */
 827                 if(pSrc[1] != 0) {
 828                     ++reqLength;
 829                     pSrc += 2;
 830                     continue;
 831                 }
 832             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
 833                 if(pSrc[1] != 0 && pSrc[2] != 0) {
 834                     ++reqLength;
 835                     pSrc += 3;
 836                     continue;
 837                 }
 838             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
 839                 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
 840                     reqLength += 2;
 841                     pSrc += 4;
 842                     continue;
 843                 }
 844             }
 845
 846             /* truncated character at the end */
 847             ++reqLength;
 848             break;
 849         }
 850     } else /* srcLength >= 0 */ {
 851       const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL;
 852
 853         /*
 854          * This function requires that if srcLength is given, then it must be
 855          * destCapatity >= srcLength so that we need not check for
 856          * destination buffer overflow in the loop.
 857          */
 858         if(destCapacity < srcLength) {
 859             if(pDestLength != NULL) {
 860                 *pDestLength = srcLength; /* this likely overestimates the true destLength! */
 861             }
 862             *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
 863             return NULL;
 864         }
 865
 866         if((pSrcLimit - pSrc) >= 4) {
 867             pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
 868
 869             /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
 870             do {
 871                 ch = *pSrc++;
 872                 if(ch < 0xc0) {
 873                     /*
 874                      * ASCII, or a trail byte in lead position which is treated like
 875                      * a single-byte sequence for better character boundary
 876                      * resynchronization after illegal sequences.
 877                      */
 878                     *pDest++=(UChar)ch;
 879                 } else if(ch < 0xe0) { /* U+0080..U+07FF */
 880                     /* 0x3080 = (0xc0 << 6) + 0x80 */
 881                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
 882                 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
 883                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 884                     /* 0x2080 = (0x80 << 6) + 0x80 */
 885                     ch = (ch << 12) + (*pSrc++ << 6);
 886                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
 887                 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
 888                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
 889                     ch = (ch << 18) + (*pSrc++ << 12);
 890                     ch += *pSrc++ << 6;
 891                     ch += *pSrc++ - 0x3c82080;
 892                     *(pDest++) = U16_LEAD(ch);
 893                     *(pDest++) = U16_TRAIL(ch);
 894                 }
 895             } while(pSrc < pSrcLimit);
 896
 897             pSrcLimit += 3; /* restore original pSrcLimit */
 898         }
 899
 900         while(pSrc < pSrcLimit) {
 901             ch = *pSrc++;
 902             if(ch < 0xc0) {
 903                 /*
 904                  * ASCII, or a trail byte in lead position which is treated like
 905                  * a single-byte sequence for better character boundary
 906                  * resynchronization after illegal sequences.
 907                  */
 908                 *pDest++=(UChar)ch;
 909                 continue;
 910             } else if(ch < 0xe0) { /* U+0080..U+07FF */
 911                 if(pSrc < pSrcLimit) {
 912                     /* 0x3080 = (0xc0 << 6) + 0x80 */
 913                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
 914                     continue;
 915                 }
 916             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
 917                 if((pSrcLimit - pSrc) >= 2) {
 918                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 919                     /* 0x2080 = (0x80 << 6) + 0x80 */
 920                     ch = (ch << 12) + (*pSrc++ << 6);
 921                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
 922                     pSrc += 3;
 923                     continue;
 924                 }
 925             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
 926                 if((pSrcLimit - pSrc) >= 3) {
 927                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
 928                     ch = (ch << 18) + (*pSrc++ << 12);
 929                     ch += *pSrc++ << 6;
 930                     ch += *pSrc++ - 0x3c82080;
 931                     *(pDest++) = U16_LEAD(ch);
 932                     *(pDest++) = U16_TRAIL(ch);
 933                     pSrc += 4;
 934                     continue;
 935                 }
 936             }
 937
 938             /* truncated character at the end */
 939             *pDest++ = 0xfffd;
 940             break;
 941         }
 942     }
 943
 944     reqLength+=(int32_t)(pDest - dest);
 945
 946     if(pDestLength){
 947         *pDestLength = reqLength;
 948     }
 949
 950     /* Terminate the buffer */
 951     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
 952
 953     return dest;
 954 }
 955
 956 static inline uint8_t *
 957 _appendUTF8(uint8_t *pDest, UChar32 c) {
 958     /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
 959     if((c)<=0x7f) {
 960         *pDest++=(uint8_t)c;
 961     } else if(c<=0x7ff) {
 962         *pDest++=(uint8_t)((c>>6)|0xc0);
 963         *pDest++=(uint8_t)((c&0x3f)|0x80);
 964     } else if(c<=0xffff) {
 965         *pDest++=(uint8_t)((c>>12)|0xe0);
 966         *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
 967         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
 968     } else /* if((uint32_t)(c)<=0x10ffff) */ {
 969         *pDest++=(uint8_t)(((c)>>18)|0xf0);
 970         *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
 971         *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
 972         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
 973     }
 974     return pDest;
 975 }
 976
 977
 978 U_CAPI char* U_EXPORT2
 979 u_strToUTF8WithSub(char *dest,
 980             int32_t destCapacity,
 981             int32_t *pDestLength,
 982             const UChar *pSrc,
 983             int32_t srcLength,
 984             UChar32 subchar, int32_t *pNumSubstitutions,
 985             UErrorCode *pErrorCode){
 986     int32_t reqLength=0;
 987     uint32_t ch=0,ch2=0;
 988     uint8_t *pDest = (uint8_t *)dest;
 989     uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL;
 990     int32_t numSubstitutions;
 991
 992     /* args check */
 993     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
 994         return NULL;
 995     }
 996
 997     if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
 998         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
 999         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1000     ) {
1001         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1002         return NULL;
1003     }
1004
1005     if(pNumSubstitutions!=NULL) {
1006         *pNumSubstitutions=0;
1007     }
1008     numSubstitutions=0;
1009
1010     if(srcLength==-1) {
1011         while((ch=*pSrc)!=0) {
1012             ++pSrc;
1013             if(ch <= 0x7f) {
1014                 if(pDest<pDestLimit) {
1015                     *pDest++ = (uint8_t)ch;
1016                 } else {
1017                     reqLength = 1;
1018                     break;
1019                 }
1020             } else if(ch <= 0x7ff) {
1021                 if((pDestLimit - pDest) >= 2) {
1022                     *pDest++=(uint8_t)((ch>>6)|0xc0);
1023                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1024                 } else {
1025                     reqLength = 2;
1026                     break;
1027                 }
1028             } else if(ch <= 0xd7ff || ch >= 0xe000) {
1029                 if((pDestLimit - pDest) >= 3) {
1030                     *pDest++=(uint8_t)((ch>>12)|0xe0);
1031                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1032                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1033                 } else {
1034                     reqLength = 3;
1035                     break;
1036                 }
1037             } else /* ch is a surrogate */ {
1038                 int32_t length;
1039
1040                 /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
1041                 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
1042                     ++pSrc;
1043                     ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1044                 } else if(subchar>=0) {
1045                     ch=subchar;
1046                     ++numSubstitutions;
1047                 } else {
1048                     /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1049                     *pErrorCode = U_INVALID_CHAR_FOUND;
1050                     return NULL;
1051                 }
1052
1053                 length = U8_LENGTH(ch);
1054                 if((pDestLimit - pDest) >= length) {
1055                     /* convert and append*/
1056                     pDest=_appendUTF8(pDest, ch);
1057                 } else {
1058                     reqLength = length;
1059                     break;
1060                 }
1061             }
1062         }
1063         while((ch=*pSrc++)!=0) {
1064             if(ch<=0x7f) {
1065                 ++reqLength;
1066             } else if(ch<=0x7ff) {
1067                 reqLength+=2;
1068             } else if(!U16_IS_SURROGATE(ch)) {
1069                 reqLength+=3;
1070             } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
1071                 ++pSrc;
1072                 reqLength+=4;
1073             } else if(subchar>=0) {
1074                 reqLength+=U8_LENGTH(subchar);
1075                 ++numSubstitutions;
1076             } else {
1077                 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1078                 *pErrorCode = U_INVALID_CHAR_FOUND;
1079                 return NULL;
1080             }
1081         }
1082     } else {
1083         const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL;
1084         int32_t count;
1085
1086         /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1087         for(;;) {
1088             /*
1089              * Each iteration of the inner loop progresses by at most 3 UTF-8
1090              * bytes and one UChar, for most characters.
1091              * For supplementary code points (4 & 2), which are rare,
1092              * there is an additional adjustment.
1093              */
1094             count = (int32_t)((pDestLimit - pDest) / 3);
1095             srcLength = (int32_t)(pSrcLimit - pSrc);
1096             if(count > srcLength) {
1097                 count = srcLength; /* min(remaining dest/3, remaining src) */
1098             }
1099             if(count < 3) {
1100                 /*
1101                  * Too much overhead if we get near the end of the string,
1102                  * continue with the next loop.
1103                  */
1104                 break;
1105             }
1106             do {
1107                 ch=*pSrc++;
1108                 if(ch <= 0x7f) {
1109                     *pDest++ = (uint8_t)ch;
1110                 } else if(ch <= 0x7ff) {
1111                     *pDest++=(uint8_t)((ch>>6)|0xc0);
1112                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1113                 } else if(ch <= 0xd7ff || ch >= 0xe000) {
1114                     *pDest++=(uint8_t)((ch>>12)|0xe0);
1115                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1116                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1117                 } else /* ch is a surrogate */ {
1118                     /*
1119                      * We will read two UChars and probably output four bytes,
1120                      * which we didn't account for with computing count,
1121                      * so we adjust it here.
1122                      */
1123                     if(--count == 0) {
1124                         --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
1125                         break;  /* recompute count */
1126                     }
1127
1128                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
1129                         ++pSrc;
1130                         ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1131
1132                         /* writing 4 bytes per 2 UChars is ok */
1133                         *pDest++=(uint8_t)((ch>>18)|0xf0);
1134                         *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
1135                         *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1136                         *pDest++=(uint8_t)((ch&0x3f)|0x80);
1137                     } else  {
1138                         /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1139                         if(subchar>=0) {
1140                             ch=subchar;
1141                             ++numSubstitutions;
1142                         } else {
1143                             *pErrorCode = U_INVALID_CHAR_FOUND;
1144                             return NULL;
1145                         }
1146
1147                         /* convert and append*/
1148                         pDest=_appendUTF8(pDest, ch);
1149                     }
1150                 }
1151             } while(--count > 0);
1152         }
1153
1154         while(pSrc<pSrcLimit) {
1155             ch=*pSrc++;
1156             if(ch <= 0x7f) {
1157                 if(pDest<pDestLimit) {
1158                     *pDest++ = (uint8_t)ch;
1159                 } else {
1160                     reqLength = 1;
1161                     break;
1162                 }
1163             } else if(ch <= 0x7ff) {
1164                 if((pDestLimit - pDest) >= 2) {
1165                     *pDest++=(uint8_t)((ch>>6)|0xc0);
1166                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1167                 } else {
1168                     reqLength = 2;
1169                     break;
1170                 }
1171             } else if(ch <= 0xd7ff || ch >= 0xe000) {
1172                 if((pDestLimit - pDest) >= 3) {
1173                     *pDest++=(uint8_t)((ch>>12)|0xe0);
1174                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1175                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1176                 } else {
1177                     reqLength = 3;
1178                     break;
1179                 }
1180             } else /* ch is a surrogate */ {
1181                 int32_t length;
1182
1183                 if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
1184                     ++pSrc;
1185                     ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1186                 } else if(subchar>=0) {
1187                     ch=subchar;
1188                     ++numSubstitutions;
1189                 } else {
1190                     /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1191                     *pErrorCode = U_INVALID_CHAR_FOUND;
1192                     return NULL;
1193                 }
1194
1195                 length = U8_LENGTH(ch);
1196                 if((pDestLimit - pDest) >= length) {
1197                     /* convert and append*/
1198                     pDest=_appendUTF8(pDest, ch);
1199                 } else {
1200                     reqLength = length;
1201                     break;
1202                 }
1203             }
1204         }
1205         while(pSrc<pSrcLimit) {
1206             ch=*pSrc++;
1207             if(ch<=0x7f) {
1208                 ++reqLength;
1209             } else if(ch<=0x7ff) {
1210                 reqLength+=2;
1211             } else if(!U16_IS_SURROGATE(ch)) {
1212                 reqLength+=3;
1213             } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
1214                 ++pSrc;
1215                 reqLength+=4;
1216             } else if(subchar>=0) {
1217                 reqLength+=U8_LENGTH(subchar);
1218                 ++numSubstitutions;
1219             } else {
1220                 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1221                 *pErrorCode = U_INVALID_CHAR_FOUND;
1222                 return NULL;
1223             }
1224         }
1225     }
1226
1227     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1228
1229     if(pNumSubstitutions!=NULL) {
1230         *pNumSubstitutions=numSubstitutions;
1231     }
1232
1233     if(pDestLength){
1234         *pDestLength = reqLength;
1235     }
1236
1237     /* Terminate the buffer */
1238     u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1239     return dest;
1240 }
1241
1242 U_CAPI char* U_EXPORT2
1243 u_strToUTF8(char *dest,
1244             int32_t destCapacity,
1245             int32_t *pDestLength,
1246             const UChar *pSrc,
1247             int32_t srcLength,
1248             UErrorCode *pErrorCode){
1249     return u_strToUTF8WithSub(
1250             dest, destCapacity, pDestLength,
1251             pSrc, srcLength,
1252             U_SENTINEL, NULL,
1253             pErrorCode);
1254 }
1255
1256 U_CAPI UChar* U_EXPORT2
1257 u_strFromJavaModifiedUTF8WithSub(
1258         UChar *dest,
1259         int32_t destCapacity,
1260         int32_t *pDestLength,
1261         const char *src,
1262         int32_t srcLength,
1263         UChar32 subchar, int32_t *pNumSubstitutions,
1264         UErrorCode *pErrorCode) {
1265     UChar *pDest = dest;
1266     UChar *pDestLimit = dest+destCapacity;
1267     UChar32 ch;
1268     int32_t reqLength = 0;
1269     const uint8_t* pSrc = (const uint8_t*) src;
1270     const uint8_t *pSrcLimit;
1271     int32_t count;
1272     uint8_t t1, t2; /* trail bytes */
1273     int32_t numSubstitutions;
1274
1275     /* args check */
1276     if(U_FAILURE(*pErrorCode)){
1277         return NULL;
1278     }
1279     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1280         (dest==NULL && destCapacity!=0) || destCapacity<0 ||
1281         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1282     ) {
1283         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1284         return NULL;
1285     }
1286
1287     if(pNumSubstitutions!=NULL) {
1288         *pNumSubstitutions=0;
1289     }
1290     numSubstitutions=0;
1291
1292     if(srcLength < 0) {
1293         /*
1294          * Transform a NUL-terminated ASCII string.
1295          * Handle non-ASCII strings with slower code.
1296          */
1297         while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
1298             *pDest++=(UChar)ch;
1299             ++pSrc;
1300         }
1301         if(ch == 0) {
1302             reqLength=(int32_t)(pDest - dest);
1303             if(pDestLength) {
1304                 *pDestLength = reqLength;
1305             }
1306
1307             /* Terminate the buffer */
1308             u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1309             return dest;
1310         }
1311         srcLength = uprv_strlen((const char *)pSrc);
1312     }
1313
1314     /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1315     pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength;
1316     for(;;) {
1317         count = (int32_t)(pDestLimit - pDest);
1318         srcLength = (int32_t)(pSrcLimit - pSrc);
1319         if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
1320             /* fast ASCII loop */
1321             const uint8_t *prevSrc = pSrc;
1322             int32_t delta;
1323             while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
1324                 *pDest++=(UChar)ch;
1325                 ++pSrc;
1326             }
1327             delta = (int32_t)(pSrc - prevSrc);
1328             count -= delta;
1329             srcLength -= delta;
1330         }
1331         /*
1332          * Each iteration of the inner loop progresses by at most 3 UTF-8
1333          * bytes and one UChar.
1334          */
1335         srcLength /= 3;
1336         if(count > srcLength) {
1337             count = srcLength; /* min(remaining dest, remaining src/3) */
1338         }
1339         if(count < 3) {
1340             /*
1341              * Too much overhead if we get near the end of the string,
1342              * continue with the next loop.
1343              */
1344             break;
1345         }
1346         do {
1347             ch = *pSrc;
1348             if(ch <= 0x7f){
1349                 *pDest++=(UChar)ch;
1350                 ++pSrc;
1351             } else {
1352                 if(ch >= 0xe0) {
1353                     if( /* handle U+0000..U+FFFF inline */
1354                         ch <= 0xef &&
1355                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1356                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1357                     ) {
1358                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1359                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1360                         pSrc += 3;
1361                         continue;
1362                     }
1363                 } else {
1364                     if( /* handle U+0000..U+07FF inline */
1365                         ch >= 0xc0 &&
1366                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1367                     ) {
1368                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1369                         pSrc += 2;
1370                         continue;
1371                     }
1372                 }
1373
1374                 if(subchar < 0) {
1375                     *pErrorCode = U_INVALID_CHAR_FOUND;
1376                     return NULL;
1377                 } else if(subchar > 0xffff && --count == 0) {
1378                     /*
1379                      * We need to write two UChars, adjusted count for that,
1380                      * and ran out of space.
1381                      */
1382                     break;
1383                 } else {
1384                     /* function call for error cases */
1385                     ++pSrc; /* continue after the lead byte */
1386                     utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1387                     ++numSubstitutions;
1388                     if(subchar<=0xFFFF) {
1389                         *(pDest++)=(UChar)subchar;
1390                     } else {
1391                         *(pDest++)=U16_LEAD(subchar);
1392                         *(pDest++)=U16_TRAIL(subchar);
1393                     }
1394                 }
1395             }
1396         } while(--count > 0);
1397     }
1398
1399     while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
1400         ch = *pSrc;
1401         if(ch <= 0x7f){
1402             *pDest++=(UChar)ch;
1403             ++pSrc;
1404         } else {
1405             if(ch >= 0xe0) {
1406                 if( /* handle U+0000..U+FFFF inline */
1407                     ch <= 0xef &&
1408                     ((pSrcLimit - pSrc) >= 3) &&
1409                     (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1410                     (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1411                 ) {
1412                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1413                     *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1414                     pSrc += 3;
1415                     continue;
1416                 }
1417             } else {
1418                 if( /* handle U+0000..U+07FF inline */
1419                     ch >= 0xc0 &&
1420                     ((pSrcLimit - pSrc) >= 2) &&
1421                     (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1422                 ) {
1423                     *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1424                     pSrc += 2;
1425                     continue;
1426                 }
1427             }
1428
1429             if(subchar < 0) {
1430                 *pErrorCode = U_INVALID_CHAR_FOUND;
1431                 return NULL;
1432             } else {
1433                 /* function call for error cases */
1434                 ++pSrc; /* continue after the lead byte */
1435                 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1436                 ++numSubstitutions;
1437                 if(subchar<=0xFFFF) {
1438                     *(pDest++)=(UChar)subchar;
1439                 } else {
1440                     *(pDest++)=U16_LEAD(subchar);
1441                     if(pDest<pDestLimit) {
1442                         *(pDest++)=U16_TRAIL(subchar);
1443                     } else {
1444                         reqLength++;
1445                         break;
1446                     }
1447                 }
1448             }
1449         }
1450     }
1451
1452     /* do not fill the dest buffer just count the UChars needed */
1453     while(pSrc < pSrcLimit){
1454         ch = *pSrc;
1455         if(ch <= 0x7f) {
1456             reqLength++;
1457             ++pSrc;
1458         } else {
1459             if(ch >= 0xe0) {
1460                 if( /* handle U+0000..U+FFFF inline */
1461                     ch <= 0xef &&
1462                     ((pSrcLimit - pSrc) >= 3) &&
1463                     (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
1464                     (uint8_t)(pSrc[2] - 0x80) <= 0x3f
1465                 ) {
1466                     reqLength++;
1467                     pSrc += 3;
1468                     continue;
1469                 }
1470             } else {
1471                 if( /* handle U+0000..U+07FF inline */
1472                     ch >= 0xc0 &&
1473                     ((pSrcLimit - pSrc) >= 2) &&
1474                     (uint8_t)(pSrc[1] - 0x80) <= 0x3f
1475                 ) {
1476                     reqLength++;
1477                     pSrc += 2;
1478                     continue;
1479                 }
1480             }
1481
1482             if(subchar < 0) {
1483                 *pErrorCode = U_INVALID_CHAR_FOUND;
1484                 return NULL;
1485             } else {
1486                 /* function call for error cases */
1487                 ++pSrc; /* continue after the lead byte */
1488                 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1489                 ++numSubstitutions;
1490                 reqLength+=U16_LENGTH(ch);
1491             }
1492         }
1493     }
1494
1495     if(pNumSubstitutions!=NULL) {
1496         *pNumSubstitutions=numSubstitutions;
1497     }
1498
1499     reqLength+=(int32_t)(pDest - dest);
1500     if(pDestLength) {
1501         *pDestLength = reqLength;
1502     }
1503
1504     /* Terminate the buffer */
1505     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1506     return dest;
1507 }
1508
1509 U_CAPI char* U_EXPORT2
1510 u_strToJavaModifiedUTF8(
1511         char *dest,
1512         int32_t destCapacity,
1513         int32_t *pDestLength,
1514         const UChar *src,
1515         int32_t srcLength,
1516         UErrorCode *pErrorCode) {
1517     int32_t reqLength=0;
1518     uint32_t ch=0;
1519     uint8_t *pDest = (uint8_t *)dest;
1520     uint8_t *pDestLimit = pDest + destCapacity;
1521     const UChar *pSrcLimit;
1522     int32_t count;
1523
1524     /* args check */
1525     if(U_FAILURE(*pErrorCode)){
1526         return NULL;
1527     }
1528     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1529         (dest==NULL && destCapacity!=0) || destCapacity<0
1530     ) {
1531         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1532         return NULL;
1533     }
1534
1535     if(srcLength==-1) {
1536         /* Convert NUL-terminated ASCII, then find the string length. */
1537         while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
1538             *pDest++ = (uint8_t)ch;
1539             ++src;
1540         }
1541         if(ch == 0) {
1542             reqLength=(int32_t)(pDest - (uint8_t *)dest);
1543             if(pDestLength) {
1544                 *pDestLength = reqLength;
1545             }
1546
1547             /* Terminate the buffer */
1548             u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1549             return dest;
1550         }
1551         srcLength = u_strlen(src);
1552     }
1553
1554     /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1555     pSrcLimit = (src!=NULL)?(src+srcLength):NULL;
1556     for(;;) {
1557         count = (int32_t)(pDestLimit - pDest);
1558         srcLength = (int32_t)(pSrcLimit - src);
1559         if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
1560             /* fast ASCII loop */
1561             const UChar *prevSrc = src;
1562             int32_t delta;
1563             while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
1564                 *pDest++=(uint8_t)ch;
1565                 ++src;
1566             }
1567             delta = (int32_t)(src - prevSrc);
1568             count -= delta;
1569             srcLength -= delta;
1570         }
1571         /*
1572          * Each iteration of the inner loop progresses by at most 3 UTF-8
1573          * bytes and one UChar.
1574          */
1575         count /= 3;
1576         if(count > srcLength) {
1577             count = srcLength; /* min(remaining dest/3, remaining src) */
1578         }
1579         if(count < 3) {
1580             /*
1581              * Too much overhead if we get near the end of the string,
1582              * continue with the next loop.
1583              */
1584             break;
1585         }
1586         do {
1587             ch=*src++;
1588             if(ch <= 0x7f && ch != 0) {
1589                 *pDest++ = (uint8_t)ch;
1590             } else if(ch <= 0x7ff) {
1591                 *pDest++=(uint8_t)((ch>>6)|0xc0);
1592                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1593             } else {
1594                 *pDest++=(uint8_t)((ch>>12)|0xe0);
1595                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1596                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1597             }
1598         } while(--count > 0);
1599     }
1600
1601     while(src<pSrcLimit) {
1602         ch=*src++;
1603         if(ch <= 0x7f && ch != 0) {
1604             if(pDest<pDestLimit) {
1605                 *pDest++ = (uint8_t)ch;
1606             } else {
1607                 reqLength = 1;
1608                 break;
1609             }
1610         } else if(ch <= 0x7ff) {
1611             if((pDestLimit - pDest) >= 2) {
1612                 *pDest++=(uint8_t)((ch>>6)|0xc0);
1613                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1614             } else {
1615                 reqLength = 2;
1616                 break;
1617             }
1618         } else {
1619             if((pDestLimit - pDest) >= 3) {
1620                 *pDest++=(uint8_t)((ch>>12)|0xe0);
1621                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1622                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1623             } else {
1624                 reqLength = 3;
1625                 break;
1626             }
1627         }
1628     }
1629     while(src<pSrcLimit) {
1630         ch=*src++;
1631         if(ch <= 0x7f && ch != 0) {
1632             ++reqLength;
1633         } else if(ch<=0x7ff) {
1634             reqLength+=2;
1635         } else {
1636             reqLength+=3;
1637         }
1638     }
1639
1640     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1641     if(pDestLength){
1642         *pDestLength = reqLength;
1643     }
1644
1645     /* Terminate the buffer */
1646     u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1647     return dest;
1648 }