icuSources/common/ustrtrns.cpp

   1 /*
   2 ******************************************************************************
   3 *
   4 *   Copyright (C) 2001-2016, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 ******************************************************************************
   8 *
   9 * File ustrtrns.cpp
  10 *
  11 * Modification History:
  12 *
  13 *   Date        Name        Description
  14 *   9/10/2001    Ram    Creation.
  15 ******************************************************************************
  16 */
  17
  18 /*******************************************************************************
  19  *
  20  * u_strTo* and u_strFrom* APIs
  21  * WCS functions moved to ustr_wcs.c for better modularization
  22  *
  23  *******************************************************************************
  24  */
  25
  26
  27 #include "unicode/putil.h"
  28 #include "unicode/ustring.h"
  29 #include "unicode/utf.h"
  30 #include "unicode/utf8.h"
  31 #include "unicode/utf16.h"
  32 #include "cstring.h"
  33 #include "cmemory.h"
  34 #include "ustr_imp.h"
  35 #include "uassert.h"
  36
  37 U_CAPI UChar* U_EXPORT2
  38 u_strFromUTF32WithSub(UChar *dest,
  39                int32_t destCapacity,
  40                int32_t *pDestLength,
  41                const UChar32 *src,
  42                int32_t srcLength,
  43                UChar32 subchar, int32_t *pNumSubstitutions,
  44                UErrorCode *pErrorCode) {
  45     const UChar32 *srcLimit;
  46     UChar32 ch;
  47     UChar *destLimit;
  48     UChar *pDest;
  49     int32_t reqLength;
  50     int32_t numSubstitutions;
  51
  52     /* args check */
  53     if(U_FAILURE(*pErrorCode)){
  54         return NULL;
  55     }
  56     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
  57         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
  58         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
  59     ) {
  60         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
  61         return NULL;
  62     }
  63
  64     if(pNumSubstitutions != NULL) {
  65         *pNumSubstitutions = 0;
  66     }
  67
  68     pDest = dest;
  69     destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
  70     reqLength = 0;
  71     numSubstitutions = 0;
  72
  73     if(srcLength < 0) {
  74         /* simple loop for conversion of a NUL-terminated BMP string */
  75         while((ch=*src) != 0 &&
  76               ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
  77             ++src;
  78             if(pDest < destLimit) {
  79                 *pDest++ = (UChar)ch;
  80             } else {
  81                 ++reqLength;
  82             }
  83         }
  84         srcLimit = src;
  85         if(ch != 0) {
  86             /* "complicated" case, find the end of the remaining string */
  87             while(*++srcLimit != 0) {}
  88         }
  89     } else {
  90       srcLimit = (src!=NULL)?(src + srcLength):NULL;
  91     }
  92
  93     /* convert with length */
  94     while(src < srcLimit) {
  95         ch = *src++;
  96         do {
  97             /* usually "loops" once; twice only for writing subchar */
  98             if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
  99                 if(pDest < destLimit) {
 100                     *pDest++ = (UChar)ch;
 101                 } else {
 102                     ++reqLength;
 103                 }
 104                 break;
 105             } else if(0x10000 <= ch && ch <= 0x10ffff) {
 106                 if(pDest!=NULL && ((pDest + 2) <= destLimit)) {
 107                     *pDest++ = U16_LEAD(ch);
 108                     *pDest++ = U16_TRAIL(ch);
 109                 } else {
 110                     reqLength += 2;
 111                 }
 112                 break;
 113             } else if((ch = subchar) < 0) {
 114                 /* surrogate code point, or not a Unicode code point at all */
 115                 *pErrorCode = U_INVALID_CHAR_FOUND;
 116                 return NULL;
 117             } else {
 118                 ++numSubstitutions;
 119             }
 120         } while(TRUE);
 121     }
 122
 123     reqLength += (int32_t)(pDest - dest);
 124     if(pDestLength) {
 125         *pDestLength = reqLength;
 126     }
 127     if(pNumSubstitutions != NULL) {
 128         *pNumSubstitutions = numSubstitutions;
 129     }
 130
 131     /* Terminate the buffer */
 132     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
 133
 134     return dest;
 135 }
 136
 137 U_CAPI UChar* U_EXPORT2
 138 u_strFromUTF32(UChar *dest,
 139                int32_t destCapacity,
 140                int32_t *pDestLength,
 141                const UChar32 *src,
 142                int32_t srcLength,
 143                UErrorCode *pErrorCode) {
 144     return u_strFromUTF32WithSub(
 145             dest, destCapacity, pDestLength,
 146             src, srcLength,
 147             U_SENTINEL, NULL,
 148             pErrorCode);
 149 }
 150
 151 U_CAPI UChar32* U_EXPORT2
 152 u_strToUTF32WithSub(UChar32 *dest,
 153              int32_t destCapacity,
 154              int32_t *pDestLength,
 155              const UChar *src,
 156              int32_t srcLength,
 157              UChar32 subchar, int32_t *pNumSubstitutions,
 158              UErrorCode *pErrorCode) {
 159     const UChar *srcLimit;
 160     UChar32 ch;
 161     UChar ch2;
 162     UChar32 *destLimit;
 163     UChar32 *pDest;
 164     int32_t reqLength;
 165     int32_t numSubstitutions;
 166
 167     /* args check */
 168     if(U_FAILURE(*pErrorCode)){
 169         return NULL;
 170     }
 171     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
 172         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
 173         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
 174     ) {
 175         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
 176         return NULL;
 177     }
 178
 179     if(pNumSubstitutions != NULL) {
 180         *pNumSubstitutions = 0;
 181     }
 182
 183     pDest = dest;
 184     destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
 185     reqLength = 0;
 186     numSubstitutions = 0;
 187
 188     if(srcLength < 0) {
 189         /* simple loop for conversion of a NUL-terminated BMP string */
 190         while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
 191             ++src;
 192             if(pDest < destLimit) {
 193                 *pDest++ = ch;
 194             } else {
 195                 ++reqLength;
 196             }
 197         }
 198         srcLimit = src;
 199         if(ch != 0) {
 200             /* "complicated" case, find the end of the remaining string */
 201             while(*++srcLimit != 0) {}
 202         }
 203     } else {
 204         srcLimit = (src!=NULL)?(src + srcLength):NULL;
 205     }
 206
 207     /* convert with length */
 208     while(src < srcLimit) {
 209         ch = *src++;
 210         if(!U16_IS_SURROGATE(ch)) {
 211             /* write or count ch below */
 212         } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
 213             ++src;
 214             ch = U16_GET_SUPPLEMENTARY(ch, ch2);
 215         } else if((ch = subchar) < 0) {
 216             /* unpaired surrogate */
 217             *pErrorCode = U_INVALID_CHAR_FOUND;
 218             return NULL;
 219         } else {
 220             ++numSubstitutions;
 221         }
 222         if(pDest < destLimit) {
 223             *pDest++ = ch;
 224         } else {
 225             ++reqLength;
 226         }
 227     }
 228
 229     reqLength += (int32_t)(pDest - dest);
 230     if(pDestLength) {
 231         *pDestLength = reqLength;
 232     }
 233     if(pNumSubstitutions != NULL) {
 234         *pNumSubstitutions = numSubstitutions;
 235     }
 236
 237     /* Terminate the buffer */
 238     u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
 239
 240     return dest;
 241 }
 242
 243 U_CAPI UChar32* U_EXPORT2
 244 u_strToUTF32(UChar32 *dest,
 245              int32_t destCapacity,
 246              int32_t *pDestLength,
 247              const UChar *src,
 248              int32_t srcLength,
 249              UErrorCode *pErrorCode) {
 250     return u_strToUTF32WithSub(
 251             dest, destCapacity, pDestLength,
 252             src, srcLength,
 253             U_SENTINEL, NULL,
 254             pErrorCode);
 255 }
 256
 257 /* for utf8_nextCharSafeBodyTerminated() */
 258 static const UChar32
 259 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
 260
 261 /*
 262  * Version of utf8_nextCharSafeBody() with the following differences:
 263  * - checks for NUL termination instead of length
 264  * - works with pointers instead of indexes
 265  * - always strict (strict==-1)
 266  *
 267  * *ps points to after the lead byte and will be moved to after the last trail byte.
 268  * c is the lead byte.
 269  * @return the code point, or U_SENTINEL
 270  */
 271 static UChar32
 272 utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
 273     const uint8_t *s=*ps;
 274     uint8_t trail, illegal=0;
 275     uint8_t count=U8_COUNT_TRAIL_BYTES(c);
 276     U_ASSERT(count<6);
 277     U8_MASK_LEAD_BYTE((c), count);
 278     /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
 279     switch(count) {
 280     /* each branch falls through to the next one */
 281     case 5:
 282     case 4:
 283         /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
 284         illegal=1;
 285         break;
 286     case 3:
 287         trail=(uint8_t)(*s++ - 0x80);
 288         c=(c<<6)|trail;
 289         if(trail>0x3f || c>=0x110) {
 290             /* not a trail byte, or code point>0x10ffff (outside Unicode) */
 291             illegal=1;
 292             break;
 293         }
 294         U_FALLTHROUGH;
 295     case 2:
 296         trail=(uint8_t)(*s++ - 0x80);
 297         if(trail>0x3f) {
 298             /* not a trail byte */
 299             illegal=1;
 300             break;
 301         }
 302         c=(c<<6)|trail;
 303         U_FALLTHROUGH;
 304     case 1:
 305         trail=(uint8_t)(*s++ - 0x80);
 306         if(trail>0x3f) {
 307             /* not a trail byte */
 308             illegal=1;
 309         }
 310         c=(c<<6)|trail;
 311         break;
 312     case 0:
 313         return U_SENTINEL;
 314     /* no default branch to optimize switch()  - all values are covered */
 315     }
 316
 317     /* correct sequence - all trail bytes have (b7..b6)==(10)? */
 318     /* illegal is also set if count>=4 */
 319     if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
 320         /* error handling */
 321         /* don't go beyond this sequence */
 322         s=*ps;
 323         while(count>0 && U8_IS_TRAIL(*s)) {
 324             ++s;
 325             --count;
 326         }
 327         c=U_SENTINEL;
 328     }
 329     *ps=s;
 330     return c;
 331 }
 332
 333 /*
 334  * Version of utf8_nextCharSafeBody() with the following differences:
 335  * - works with pointers instead of indexes
 336  * - always strict (strict==-1)
 337  *
 338  * *ps points to after the lead byte and will be moved to after the last trail byte.
 339  * c is the lead byte.
 340  * @return the code point, or U_SENTINEL
 341  */
 342 static UChar32
 343 utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
 344     const uint8_t *s=*ps;
 345     uint8_t trail, illegal=0;
 346     uint8_t count=U8_COUNT_TRAIL_BYTES(c);
 347     if((limit-s)>=count) {
 348         U8_MASK_LEAD_BYTE((c), count);
 349         /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
 350         switch(count) {
 351         /* each branch falls through to the next one */
 352         case 5:
 353         case 4:
 354             /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
 355             illegal=1;
 356             break;
 357         case 3:
 358             trail=*s++;
 359             c=(c<<6)|(trail&0x3f);
 360             if(c<0x110) {
 361                 illegal|=(trail&0xc0)^0x80;
 362             } else {
 363                 /* code point>0x10ffff, outside Unicode */
 364                 illegal=1;
 365                 break;
 366             }
 367             U_FALLTHROUGH;
 368         case 2:
 369             trail=*s++;
 370             c=(c<<6)|(trail&0x3f);
 371             illegal|=(trail&0xc0)^0x80;
 372             U_FALLTHROUGH;
 373         case 1:
 374             trail=*s++;
 375             c=(c<<6)|(trail&0x3f);
 376             illegal|=(trail&0xc0)^0x80;
 377             break;
 378         case 0:
 379             return U_SENTINEL;
 380         /* no default branch to optimize switch()  - all values are covered */
 381         }
 382     } else {
 383         illegal=1; /* too few bytes left */
 384     }
 385
 386     /* correct sequence - all trail bytes have (b7..b6)==(10)? */
 387     /* illegal is also set if count>=4 */
 388     U_ASSERT(illegal || count<UPRV_LENGTHOF(utf8_minLegal));
 389     if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
 390         /* error handling */
 391         /* don't go beyond this sequence */
 392         s=*ps;
 393         while(count>0 && s<limit && U8_IS_TRAIL(*s)) {
 394             ++s;
 395             --count;
 396         }
 397         c=U_SENTINEL;
 398     }
 399     *ps=s;
 400     return c;
 401 }
 402
 403 U_CAPI UChar* U_EXPORT2
 404 u_strFromUTF8WithSub(UChar *dest,
 405               int32_t destCapacity,
 406               int32_t *pDestLength,
 407               const char* src,
 408               int32_t srcLength,
 409               UChar32 subchar, int32_t *pNumSubstitutions,
 410               UErrorCode *pErrorCode){
 411     UChar *pDest = dest;
 412     UChar *pDestLimit = dest+destCapacity;
 413     UChar32 ch;
 414     int32_t reqLength = 0;
 415     const uint8_t* pSrc = (const uint8_t*) src;
 416     uint8_t t1, t2; /* trail bytes */
 417     int32_t numSubstitutions;
 418
 419     /* args check */
 420     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
 421         return NULL;
 422     }
 423
 424     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
 425         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
 426         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
 427     ) {
 428         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
 429         return NULL;
 430     }
 431
 432     if(pNumSubstitutions!=NULL) {
 433         *pNumSubstitutions=0;
 434     }
 435     numSubstitutions=0;
 436
 437     /*
 438      * Inline processing of UTF-8 byte sequences:
 439      *
 440      * Byte sequences for the most common characters are handled inline in
 441      * the conversion loops. In order to reduce the path lengths for those
 442      * characters, the tests are arranged in a kind of binary search.
 443      * ASCII (<=0x7f) is checked first, followed by the dividing point
 444      * between 2- and 3-byte sequences (0xe0).
 445      * The 3-byte branch is tested first to speed up CJK text.
 446      * The compiler should combine the subtractions for the two tests for 0xe0.
 447      * Each branch then tests for the other end of its range.
 448      */
 449
 450     if(srcLength < 0){
 451         /*
 452          * Transform a NUL-terminated string.
 453          * The code explicitly checks for NULs only in the lead byte position.
 454          * A NUL byte in the trail byte position fails the trail byte range check anyway.
 455          */
 456         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
 457             if(ch <= 0x7f){
 458                 *pDest++=(UChar)ch;
 459                 ++pSrc;
 460             } else {
 461                 if(ch > 0xe0) {
 462                     if( /* handle U+1000..U+CFFF inline */
 463                         ch <= 0xec &&
 464                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
 465                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
 466                     ) {
 467                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 468                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
 469                         pSrc += 3;
 470                         continue;
 471                     }
 472                 } else if(ch < 0xe0) {
 473                     if( /* handle U+0080..U+07FF inline */
 474                         ch >= 0xc2 &&
 475                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
 476                     ) {
 477                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
 478                         pSrc += 2;
 479                         continue;
 480                     }
 481                 }
 482
 483                 /* function call for "complicated" and error cases */
 484                 ++pSrc; /* continue after the lead byte */
 485                 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
 486                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
 487                     *pErrorCode = U_INVALID_CHAR_FOUND;
 488                     return NULL;
 489                 } else if(ch<=0xFFFF) {
 490                     *(pDest++)=(UChar)ch;
 491                 } else {
 492                     *(pDest++)=U16_LEAD(ch);
 493                     if(pDest<pDestLimit) {
 494                         *(pDest++)=U16_TRAIL(ch);
 495                     } else {
 496                         reqLength++;
 497                         break;
 498                     }
 499                 }
 500             }
 501         }
 502
 503         /* Pre-flight the rest of the string. */
 504         while((ch = *pSrc) != 0) {
 505             if(ch <= 0x7f){
 506                 ++reqLength;
 507                 ++pSrc;
 508             } else {
 509                 if(ch > 0xe0) {
 510                     if( /* handle U+1000..U+CFFF inline */
 511                         ch <= 0xec &&
 512                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
 513                         (uint8_t)(pSrc[2] - 0x80) <= 0x3f
 514                     ) {
 515                         ++reqLength;
 516                         pSrc += 3;
 517                         continue;
 518                     }
 519                 } else if(ch < 0xe0) {
 520                     if( /* handle U+0080..U+07FF inline */
 521                         ch >= 0xc2 &&
 522                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f
 523                     ) {
 524                         ++reqLength;
 525                         pSrc += 2;
 526                         continue;
 527                     }
 528                 }
 529
 530                 /* function call for "complicated" and error cases */
 531                 ++pSrc; /* continue after the lead byte */
 532                 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
 533                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
 534                     *pErrorCode = U_INVALID_CHAR_FOUND;
 535                     return NULL;
 536                 }
 537                 reqLength += U16_LENGTH(ch);
 538             }
 539         }
 540     } else /* srcLength >= 0 */ {
 541         const uint8_t *pSrcLimit = pSrc + srcLength;
 542         int32_t count;
 543
 544         /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
 545         for(;;) {
 546             /*
 547              * Each iteration of the inner loop progresses by at most 3 UTF-8
 548              * bytes and one UChar, for most characters.
 549              * For supplementary code points (4 & 2), which are rare,
 550              * there is an additional adjustment.
 551              */
 552             count = (int32_t)(pDestLimit - pDest);
 553             srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
 554             if(count > srcLength) {
 555                 count = srcLength; /* min(remaining dest, remaining src/3) */
 556             }
 557             if(count < 3) {
 558                 /*
 559                  * Too much overhead if we get near the end of the string,
 560                  * continue with the next loop.
 561                  */
 562                 break;
 563             }
 564
 565             do {
 566                 ch = *pSrc;
 567                 if(ch <= 0x7f){
 568                     *pDest++=(UChar)ch;
 569                     ++pSrc;
 570                 } else {
 571                     if(ch > 0xe0) {
 572                         if( /* handle U+1000..U+CFFF inline */
 573                             ch <= 0xec &&
 574                             (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
 575                             (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
 576                         ) {
 577                             /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 578                             *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
 579                             pSrc += 3;
 580                             continue;
 581                         }
 582                     } else if(ch < 0xe0) {
 583                         if( /* handle U+0080..U+07FF inline */
 584                             ch >= 0xc2 &&
 585                             (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
 586                         ) {
 587                             *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
 588                             pSrc += 2;
 589                             continue;
 590                         }
 591                     }
 592
 593                     if(ch >= 0xf0 || subchar > 0xffff) {
 594                         /*
 595                          * We may read up to six bytes and write up to two UChars,
 596                          * which we didn't account for with computing count,
 597                          * so we adjust it here.
 598                          */
 599                         if(--count == 0) {
 600                             break;
 601                         }
 602                     }
 603
 604                     /* function call for "complicated" and error cases */
 605                     ++pSrc; /* continue after the lead byte */
 606                     ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
 607                     if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
 608                         *pErrorCode = U_INVALID_CHAR_FOUND;
 609                         return NULL;
 610                     }else if(ch<=0xFFFF){
 611                         *(pDest++)=(UChar)ch;
 612                     }else{
 613                         *(pDest++)=U16_LEAD(ch);
 614                         *(pDest++)=U16_TRAIL(ch);
 615                     }
 616                 }
 617             } while(--count > 0);
 618         }
 619
 620         while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
 621             ch = *pSrc;
 622             if(ch <= 0x7f){
 623                 *pDest++=(UChar)ch;
 624                 ++pSrc;
 625             } else {
 626                 if(ch > 0xe0) {
 627                     if( /* handle U+1000..U+CFFF inline */
 628                         ch <= 0xec &&
 629                         ((pSrcLimit - pSrc) >= 3) &&
 630                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
 631                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
 632                     ) {
 633                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 634                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
 635                         pSrc += 3;
 636                         continue;
 637                     }
 638                 } else if(ch < 0xe0) {
 639                     if( /* handle U+0080..U+07FF inline */
 640                         ch >= 0xc2 &&
 641                         ((pSrcLimit - pSrc) >= 2) &&
 642                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
 643                     ) {
 644                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
 645                         pSrc += 2;
 646                         continue;
 647                     }
 648                 }
 649
 650                 /* function call for "complicated" and error cases */
 651                 ++pSrc; /* continue after the lead byte */
 652                 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
 653                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
 654                     *pErrorCode = U_INVALID_CHAR_FOUND;
 655                     return NULL;
 656                 }else if(ch<=0xFFFF){
 657                     *(pDest++)=(UChar)ch;
 658                 }else{
 659                     *(pDest++)=U16_LEAD(ch);
 660                     if(pDest<pDestLimit){
 661                         *(pDest++)=U16_TRAIL(ch);
 662                     }else{
 663                         reqLength++;
 664                         break;
 665                     }
 666                 }
 667             }
 668         }
 669         /* do not fill the dest buffer just count the UChars needed */
 670         while(pSrc < pSrcLimit){
 671             ch = *pSrc;
 672             if(ch <= 0x7f){
 673                 reqLength++;
 674                 ++pSrc;
 675             } else {
 676                 if(ch > 0xe0) {
 677                     if( /* handle U+1000..U+CFFF inline */
 678                         ch <= 0xec &&
 679                         ((pSrcLimit - pSrc) >= 3) &&
 680                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
 681                         (uint8_t)(pSrc[2] - 0x80) <= 0x3f
 682                     ) {
 683                         reqLength++;
 684                         pSrc += 3;
 685                         continue;
 686                     }
 687                 } else if(ch < 0xe0) {
 688                     if( /* handle U+0080..U+07FF inline */
 689                         ch >= 0xc2 &&
 690                         ((pSrcLimit - pSrc) >= 2) &&
 691                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f
 692                     ) {
 693                         reqLength++;
 694                         pSrc += 2;
 695                         continue;
 696                     }
 697                 }
 698
 699                 /* function call for "complicated" and error cases */
 700                 ++pSrc; /* continue after the lead byte */
 701                 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
 702                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
 703                     *pErrorCode = U_INVALID_CHAR_FOUND;
 704                     return NULL;
 705                 }
 706                 reqLength+=U16_LENGTH(ch);
 707             }
 708         }
 709     }
 710
 711     reqLength+=(int32_t)(pDest - dest);
 712
 713     if(pNumSubstitutions!=NULL) {
 714         *pNumSubstitutions=numSubstitutions;
 715     }
 716
 717     if(pDestLength){
 718         *pDestLength = reqLength;
 719     }
 720
 721     /* Terminate the buffer */
 722     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
 723
 724     return dest;
 725 }
 726
 727 U_CAPI UChar* U_EXPORT2
 728 u_strFromUTF8(UChar *dest,
 729               int32_t destCapacity,
 730               int32_t *pDestLength,
 731               const char* src,
 732               int32_t srcLength,
 733               UErrorCode *pErrorCode){
 734     return u_strFromUTF8WithSub(
 735             dest, destCapacity, pDestLength,
 736             src, srcLength,
 737             U_SENTINEL, NULL,
 738             pErrorCode);
 739 }
 740
 741 U_CAPI UChar * U_EXPORT2
 742 u_strFromUTF8Lenient(UChar *dest,
 743                      int32_t destCapacity,
 744                      int32_t *pDestLength,
 745                      const char *src,
 746                      int32_t srcLength,
 747                      UErrorCode *pErrorCode) {
 748     UChar *pDest = dest;
 749     UChar32 ch;
 750     int32_t reqLength = 0;
 751     uint8_t* pSrc = (uint8_t*) src;
 752
 753     /* args check */
 754     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
 755         return NULL;
 756     }
 757
 758     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
 759         (destCapacity<0) || (dest == NULL && destCapacity > 0)
 760     ) {
 761         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
 762         return NULL;
 763     }
 764
 765     if(srcLength < 0) {
 766         /* Transform a NUL-terminated string. */
 767         UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
 768         uint8_t t1, t2, t3; /* trail bytes */
 769
 770         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
 771             if(ch < 0xc0) {
 772                 /*
 773                  * ASCII, or a trail byte in lead position which is treated like
 774                  * a single-byte sequence for better character boundary
 775                  * resynchronization after illegal sequences.
 776                  */
 777                 *pDest++=(UChar)ch;
 778                 ++pSrc;
 779                 continue;
 780             } else if(ch < 0xe0) { /* U+0080..U+07FF */
 781                 if((t1 = pSrc[1]) != 0) {
 782                     /* 0x3080 = (0xc0 << 6) + 0x80 */
 783                     *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
 784                     pSrc += 2;
 785                     continue;
 786                 }
 787             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
 788                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
 789                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 790                     /* 0x2080 = (0x80 << 6) + 0x80 */
 791                     *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
 792                     pSrc += 3;
 793                     continue;
 794                 }
 795             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
 796                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
 797                     pSrc += 4;
 798                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
 799                     ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
 800                     *(pDest++) = U16_LEAD(ch);
 801                     if(pDest < pDestLimit) {
 802                         *(pDest++) = U16_TRAIL(ch);
 803                     } else {
 804                         reqLength = 1;
 805                         break;
 806                     }
 807                     continue;
 808                 }
 809             }
 810
 811             /* truncated character at the end */
 812             *pDest++ = 0xfffd;
 813             while(*++pSrc != 0) {}
 814             break;
 815         }
 816
 817         /* Pre-flight the rest of the string. */
 818         while((ch = *pSrc) != 0) {
 819             if(ch < 0xc0) {
 820                 /*
 821                  * ASCII, or a trail byte in lead position which is treated like
 822                  * a single-byte sequence for better character boundary
 823                  * resynchronization after illegal sequences.
 824                  */
 825                 ++reqLength;
 826                 ++pSrc;
 827                 continue;
 828             } else if(ch < 0xe0) { /* U+0080..U+07FF */
 829                 if(pSrc[1] != 0) {
 830                     ++reqLength;
 831                     pSrc += 2;
 832                     continue;
 833                 }
 834             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
 835                 if(pSrc[1] != 0 && pSrc[2] != 0) {
 836                     ++reqLength;
 837                     pSrc += 3;
 838                     continue;
 839                 }
 840             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
 841                 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
 842                     reqLength += 2;
 843                     pSrc += 4;
 844                     continue;
 845                 }
 846             }
 847
 848             /* truncated character at the end */
 849             ++reqLength;
 850             break;
 851         }
 852     } else /* srcLength >= 0 */ {
 853       const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL;
 854
 855         /*
 856          * This function requires that if srcLength is given, then it must be
 857          * destCapatity >= srcLength so that we need not check for
 858          * destination buffer overflow in the loop.
 859          */
 860         if(destCapacity < srcLength) {
 861             if(pDestLength != NULL) {
 862                 *pDestLength = srcLength; /* this likely overestimates the true destLength! */
 863             }
 864             *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
 865             return NULL;
 866         }
 867
 868         if((pSrcLimit - pSrc) >= 4) {
 869             pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
 870
 871             /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
 872             do {
 873                 ch = *pSrc++;
 874                 if(ch < 0xc0) {
 875                     /*
 876                      * ASCII, or a trail byte in lead position which is treated like
 877                      * a single-byte sequence for better character boundary
 878                      * resynchronization after illegal sequences.
 879                      */
 880                     *pDest++=(UChar)ch;
 881                 } else if(ch < 0xe0) { /* U+0080..U+07FF */
 882                     /* 0x3080 = (0xc0 << 6) + 0x80 */
 883                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
 884                 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
 885                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 886                     /* 0x2080 = (0x80 << 6) + 0x80 */
 887                     ch = (ch << 12) + (*pSrc++ << 6);
 888                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
 889                 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
 890                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
 891                     ch = (ch << 18) + (*pSrc++ << 12);
 892                     ch += *pSrc++ << 6;
 893                     ch += *pSrc++ - 0x3c82080;
 894                     *(pDest++) = U16_LEAD(ch);
 895                     *(pDest++) = U16_TRAIL(ch);
 896                 }
 897             } while(pSrc < pSrcLimit);
 898
 899             pSrcLimit += 3; /* restore original pSrcLimit */
 900         }
 901
 902         while(pSrc < pSrcLimit) {
 903             ch = *pSrc++;
 904             if(ch < 0xc0) {
 905                 /*
 906                  * ASCII, or a trail byte in lead position which is treated like
 907                  * a single-byte sequence for better character boundary
 908                  * resynchronization after illegal sequences.
 909                  */
 910                 *pDest++=(UChar)ch;
 911                 continue;
 912             } else if(ch < 0xe0) { /* U+0080..U+07FF */
 913                 if(pSrc < pSrcLimit) {
 914                     /* 0x3080 = (0xc0 << 6) + 0x80 */
 915                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
 916                     continue;
 917                 }
 918             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
 919                 if((pSrcLimit - pSrc) >= 2) {
 920                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 921                     /* 0x2080 = (0x80 << 6) + 0x80 */
 922                     ch = (ch << 12) + (*pSrc++ << 6);
 923                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
 924                     pSrc += 3;
 925                     continue;
 926                 }
 927             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
 928                 if((pSrcLimit - pSrc) >= 3) {
 929                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
 930                     ch = (ch << 18) + (*pSrc++ << 12);
 931                     ch += *pSrc++ << 6;
 932                     ch += *pSrc++ - 0x3c82080;
 933                     *(pDest++) = U16_LEAD(ch);
 934                     *(pDest++) = U16_TRAIL(ch);
 935                     pSrc += 4;
 936                     continue;
 937                 }
 938             }
 939
 940             /* truncated character at the end */
 941             *pDest++ = 0xfffd;
 942             break;
 943         }
 944     }
 945
 946     reqLength+=(int32_t)(pDest - dest);
 947
 948     if(pDestLength){
 949         *pDestLength = reqLength;
 950     }
 951
 952     /* Terminate the buffer */
 953     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
 954
 955     return dest;
 956 }
 957
 958 static inline uint8_t *
 959 _appendUTF8(uint8_t *pDest, UChar32 c) {
 960     /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
 961     if((c)<=0x7f) {
 962         *pDest++=(uint8_t)c;
 963     } else if(c<=0x7ff) {
 964         *pDest++=(uint8_t)((c>>6)|0xc0);
 965         *pDest++=(uint8_t)((c&0x3f)|0x80);
 966     } else if(c<=0xffff) {
 967         *pDest++=(uint8_t)((c>>12)|0xe0);
 968         *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
 969         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
 970     } else /* if((uint32_t)(c)<=0x10ffff) */ {
 971         *pDest++=(uint8_t)(((c)>>18)|0xf0);
 972         *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
 973         *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
 974         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
 975     }
 976     return pDest;
 977 }
 978
 979
/*
 * Converts a UTF-16 string to UTF-8, optionally writing a substitution code
 * point in place of each unpaired surrogate.
 *
 * Parameters:
 *   dest               Output buffer; NULL is accepted only with
 *                      destCapacity==0.
 *   destCapacity       Size of dest in bytes; must not be negative.
 *   pDestLength        If not NULL, receives the number of UTF-8 bytes that
 *                      the whole input needs, even if that is more than
 *                      destCapacity.
 *   pSrc               UTF-16 input; NULL is accepted only with srcLength==0.
 *   srcLength          Number of UChars in pSrc, or -1 if pSrc is
 *                      NUL-terminated.
 *   subchar            Code point written instead of an unpaired surrogate.
 *                      With a negative value there is no substitution and an
 *                      unpaired surrogate is an error. Values above 0x10ffff
 *                      and surrogate code points are rejected.
 *   pNumSubstitutions  If not NULL, set to 0 once the arguments are accepted
 *                      and to the number of substitutions on normal return.
 *   pErrorCode         In/out error code; nothing is done if it is NULL or
 *                      already indicates a failure.
 *
 * Returns dest, or NULL if pErrorCode is NULL or already a failure, if an
 * argument is rejected (U_ILLEGAL_ARGUMENT_ERROR), or if an unpaired surrogate
 * is found while subchar is negative (U_INVALID_CHAR_FOUND).
 *
 * Terminating dest and reporting a too-small buffer are left to
 * u_terminateChars(), which is called with the full required length.
 */
U_CAPI char* U_EXPORT2
u_strToUTF8WithSub(char *dest,
            int32_t destCapacity,
            int32_t *pDestLength,
            const UChar *pSrc,
            int32_t srcLength,
            UChar32 subchar, int32_t *pNumSubstitutions,
            UErrorCode *pErrorCode){
    /*
     * While converting, reqLength holds only the size of the character that
     * did not fit; the pre-flight loops add the sizes of the characters after
     * it, and the number of bytes actually written is added at the very end.
     */
    int32_t reqLength=0;
    uint32_t ch=0,ch2=0;
    uint8_t *pDest = (uint8_t *)dest;
    /*
     * With dest==NULL both pointers are NULL, so every "is there room" test
     * below fails and the function only counts.
     */
    uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL;
    int32_t numSubstitutions;

    /* args check */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
        return NULL;
    }

    if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
        (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
    ) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    if(pNumSubstitutions!=NULL) {
        *pNumSubstitutions=0;
    }
    numSubstitutions=0;

    if(srcLength==-1) {
        /* NUL-terminated input: convert until the NUL or until dest is full. */
        while((ch=*pSrc)!=0) {
            ++pSrc;
            if(ch <= 0x7f) {
                if(pDest<pDestLimit) {
                    *pDest++ = (uint8_t)ch;
                } else {
                    /* does not fit: remember its size, count the rest below */
                    reqLength = 1;
                    break;
                }
            } else if(ch <= 0x7ff) {
                if((pDestLimit - pDest) >= 2) {
                    *pDest++=(uint8_t)((ch>>6)|0xc0);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else {
                    reqLength = 2;
                    break;
                }
            } else if(ch <= 0xd7ff || ch >= 0xe000) {
                /* BMP code point that is not a surrogate: three bytes */
                if((pDestLimit - pDest) >= 3) {
                    *pDest++=(uint8_t)((ch>>12)|0xe0);
                    *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else {
                    reqLength = 3;
                    break;
                }
            } else /* ch is a surrogate */ {
                int32_t length;

                /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
                if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
                    ++pSrc;
                    ch=U16_GET_SUPPLEMENTARY(ch, ch2);
                } else if(subchar>=0) {
                    /* unpaired surrogate: write the substitution character instead */
                    ch=subchar;
                    ++numSubstitutions;
                } else {
                    /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                    *pErrorCode = U_INVALID_CHAR_FOUND;
                    return NULL;
                }

                /* ch is now a supplementary code point or subchar */
                length = U8_LENGTH(ch);
                if((pDestLimit - pDest) >= length) {
                    /* convert and append*/
                    pDest=_appendUTF8(pDest, ch);
                } else {
                    reqLength = length;
                    break;
                }
            }
        }
        /*
         * Pre-flight the rest of the string: only count bytes, write nothing.
         * If the loop above reached the NUL, pSrc still points to it and this
         * loop ends immediately.
         */
        while((ch=*pSrc++)!=0) {
            if(ch<=0x7f) {
                ++reqLength;
            } else if(ch<=0x7ff) {
                reqLength+=2;
            } else if(!U16_IS_SURROGATE(ch)) {
                reqLength+=3;
            } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
                ++pSrc;
                reqLength+=4;
            } else if(subchar>=0) {
                /* substitutions are counted here too, although nothing is written */
                reqLength+=U8_LENGTH(subchar);
                ++numSubstitutions;
            } else {
                /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return NULL;
            }
        }
    } else {
        /* explicit length; pSrc may be NULL here only if srcLength==0 */
        const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL;
        int32_t count;

        /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
        for(;;) {
            /*
             * Each iteration of the inner loop progresses by at most 3 UTF-8
             * bytes and one UChar, for most characters.
             * For supplementary code points (4 & 2), which are rare,
             * there is an additional adjustment.
             */
            count = (int32_t)((pDestLimit - pDest) / 3);
            /* srcLength is reused from here on as "UChars remaining" */
            srcLength = (int32_t)(pSrcLimit - pSrc);
            if(count > srcLength) {
                count = srcLength; /* min(remaining dest/3, remaining src) */
            }
            if(count < 3) {
                /*
                 * Too much overhead if we get near the end of the string,
                 * continue with the next loop.
                 */
                break;
            }
            do {
                ch=*pSrc++;
                if(ch <= 0x7f) {
                    *pDest++ = (uint8_t)ch;
                } else if(ch <= 0x7ff) {
                    *pDest++=(uint8_t)((ch>>6)|0xc0);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else if(ch <= 0xd7ff || ch >= 0xe000) {
                    *pDest++=(uint8_t)((ch>>12)|0xe0);
                    *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else /* ch is a surrogate */ {
                    /*
                     * We will read two UChars and probably output four bytes,
                     * which we didn't account for with computing count,
                     * so we adjust it here.
                     */
                    if(--count == 0) {
                        --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
                        break;  /* recompute count */
                    }

                    /*
                     * count is still at least 1 after the decrement, so the
                     * budget covers one more UChar and *pSrc may be read.
                     */
                    if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
                        ++pSrc;
                        ch=U16_GET_SUPPLEMENTARY(ch, ch2);

                        /* writing 4 bytes per 2 UChars is ok */
                        *pDest++=(uint8_t)((ch>>18)|0xf0);
                        *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
                        *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
                        *pDest++=(uint8_t)((ch&0x3f)|0x80);
                    } else  {
                        /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                        if(subchar>=0) {
                            ch=subchar;
                            ++numSubstitutions;
                        } else {
                            *pErrorCode = U_INVALID_CHAR_FOUND;
                            return NULL;
                        }

                        /*
                         * subchar needs at most 4 bytes; the extra --count
                         * above reserved room for 6.
                         */
                        /* convert and append*/
                        pDest=_appendUTF8(pDest, ch);
                    }
                }
            } while(--count > 0);
        }

        /*
         * Near the end of the source or of the destination: same conversion,
         * but with a limit check before every read and write.
         */
        while(pSrc<pSrcLimit) {
            ch=*pSrc++;
            if(ch <= 0x7f) {
                if(pDest<pDestLimit) {
                    *pDest++ = (uint8_t)ch;
                } else {
                    /* does not fit: remember its size, count the rest below */
                    reqLength = 1;
                    break;
                }
            } else if(ch <= 0x7ff) {
                if((pDestLimit - pDest) >= 2) {
                    *pDest++=(uint8_t)((ch>>6)|0xc0);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else {
                    reqLength = 2;
                    break;
                }
            } else if(ch <= 0xd7ff || ch >= 0xe000) {
                if((pDestLimit - pDest) >= 3) {
                    *pDest++=(uint8_t)((ch>>12)|0xe0);
                    *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else {
                    reqLength = 3;
                    break;
                }
            } else /* ch is a surrogate */ {
                int32_t length;

                /* unlike the NUL-terminated case, the trail read needs a limit check */
                if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
                    ++pSrc;
                    ch=U16_GET_SUPPLEMENTARY(ch, ch2);
                } else if(subchar>=0) {
                    ch=subchar;
                    ++numSubstitutions;
                } else {
                    /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                    *pErrorCode = U_INVALID_CHAR_FOUND;
                    return NULL;
                }

                length = U8_LENGTH(ch);
                if((pDestLimit - pDest) >= length) {
                    /* convert and append*/
                    pDest=_appendUTF8(pDest, ch);
                } else {
                    reqLength = length;
                    break;
                }
            }
        }
        /* Pre-flight the rest of the string: only count bytes, write nothing. */
        while(pSrc<pSrcLimit) {
            ch=*pSrc++;
            if(ch<=0x7f) {
                ++reqLength;
            } else if(ch<=0x7ff) {
                reqLength+=2;
            } else if(!U16_IS_SURROGATE(ch)) {
                reqLength+=3;
            } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
                ++pSrc;
                reqLength+=4;
            } else if(subchar>=0) {
                reqLength+=U8_LENGTH(subchar);
                ++numSubstitutions;
            } else {
                /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return NULL;
            }
        }
    }

    /* add the bytes actually written to the bytes that did not fit */
    reqLength+=(int32_t)(pDest - (uint8_t *)dest);

    if(pNumSubstitutions!=NULL) {
        *pNumSubstitutions=numSubstitutions;
    }

    if(pDestLength){
        *pDestLength = reqLength;
    }

    /* Terminate the buffer */
    u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
    return dest;
}
1243
1244 U_CAPI char* U_EXPORT2
1245 u_strToUTF8(char *dest,
1246             int32_t destCapacity,
1247             int32_t *pDestLength,
1248             const UChar *pSrc,
1249             int32_t srcLength,
1250             UErrorCode *pErrorCode){
1251     return u_strToUTF8WithSub(
1252             dest, destCapacity, pDestLength,
1253             pSrc, srcLength,
1254             U_SENTINEL, NULL,
1255             pErrorCode);
1256 }
1257
1258 U_CAPI UChar* U_EXPORT2
1259 u_strFromJavaModifiedUTF8WithSub(
1260         UChar *dest,
1261         int32_t destCapacity,
1262         int32_t *pDestLength,
1263         const char *src,
1264         int32_t srcLength,
1265         UChar32 subchar, int32_t *pNumSubstitutions,
1266         UErrorCode *pErrorCode) {
1267     UChar *pDest = dest;
1268     UChar *pDestLimit = dest+destCapacity;
1269     UChar32 ch;
1270     int32_t reqLength = 0;
1271     const uint8_t* pSrc = (const uint8_t*) src;
1272     const uint8_t *pSrcLimit;
1273     int32_t count;
1274     uint8_t t1, t2; /* trail bytes */
1275     int32_t numSubstitutions;
1276
1277     /* args check */
1278     if(U_FAILURE(*pErrorCode)){
1279         return NULL;
1280     }
1281     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1282         (dest==NULL && destCapacity!=0) || destCapacity<0 ||
1283         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1284     ) {
1285         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1286         return NULL;
1287     }
1288
1289     if(pNumSubstitutions!=NULL) {
1290         *pNumSubstitutions=0;
1291     }
1292     numSubstitutions=0;
1293
1294     if(srcLength < 0) {
1295         /*
1296          * Transform a NUL-terminated ASCII string.
1297          * Handle non-ASCII strings with slower code.
1298          */
1299         while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
1300             *pDest++=(UChar)ch;
1301             ++pSrc;
1302         }
1303         if(ch == 0) {
1304             reqLength=(int32_t)(pDest - dest);
1305             if(pDestLength) {
1306                 *pDestLength = reqLength;
1307             }
1308
1309             /* Terminate the buffer */
1310             u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1311             return dest;
1312         }
1313         srcLength = uprv_strlen((const char *)pSrc);
1314     }
1315
1316     /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1317     pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength;
1318     for(;;) {
1319         count = (int32_t)(pDestLimit - pDest);
1320         srcLength = (int32_t)(pSrcLimit - pSrc);
1321         if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
1322             /* fast ASCII loop */
1323             const uint8_t *prevSrc = pSrc;
1324             int32_t delta;
1325             while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
1326                 *pDest++=(UChar)ch;
1327                 ++pSrc;
1328             }
1329             delta = (int32_t)(pSrc - prevSrc);
1330             count -= delta;
1331             srcLength -= delta;
1332         }
1333         /*
1334          * Each iteration of the inner loop progresses by at most 3 UTF-8
1335          * bytes and one UChar.
1336          */
1337         srcLength /= 3;
1338         if(count > srcLength) {
1339             count = srcLength; /* min(remaining dest, remaining src/3) */
1340         }
1341         if(count < 3) {
1342             /*
1343              * Too much overhead if we get near the end of the string,
1344              * continue with the next loop.
1345              */
1346             break;
1347         }
1348         do {
1349             ch = *pSrc;
1350             if(ch <= 0x7f){
1351                 *pDest++=(UChar)ch;
1352                 ++pSrc;
1353             } else {
1354                 if(ch >= 0xe0) {
1355                     if( /* handle U+0000..U+FFFF inline */
1356                         ch <= 0xef &&
1357                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1358                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1359                     ) {
1360                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1361                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1362                         pSrc += 3;
1363                         continue;
1364                     }
1365                 } else {
1366                     if( /* handle U+0000..U+07FF inline */
1367                         ch >= 0xc0 &&
1368                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1369                     ) {
1370                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1371                         pSrc += 2;
1372                         continue;
1373                     }
1374                 }
1375
1376                 if(subchar < 0) {
1377                     *pErrorCode = U_INVALID_CHAR_FOUND;
1378                     return NULL;
1379                 } else if(subchar > 0xffff && --count == 0) {
1380                     /*
1381                      * We need to write two UChars, adjusted count for that,
1382                      * and ran out of space.
1383                      */
1384                     break;
1385                 } else {
1386                     /* function call for error cases */
1387                     ++pSrc; /* continue after the lead byte */
1388                     utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1389                     ++numSubstitutions;
1390                     if(subchar<=0xFFFF) {
1391                         *(pDest++)=(UChar)subchar;
1392                     } else {
1393                         *(pDest++)=U16_LEAD(subchar);
1394                         *(pDest++)=U16_TRAIL(subchar);
1395                     }
1396                 }
1397             }
1398         } while(--count > 0);
1399     }
1400
1401     while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
1402         ch = *pSrc;
1403         if(ch <= 0x7f){
1404             *pDest++=(UChar)ch;
1405             ++pSrc;
1406         } else {
1407             if(ch >= 0xe0) {
1408                 if( /* handle U+0000..U+FFFF inline */
1409                     ch <= 0xef &&
1410                     ((pSrcLimit - pSrc) >= 3) &&
1411                     (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1412                     (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1413                 ) {
1414                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1415                     *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1416                     pSrc += 3;
1417                     continue;
1418                 }
1419             } else {
1420                 if( /* handle U+0000..U+07FF inline */
1421                     ch >= 0xc0 &&
1422                     ((pSrcLimit - pSrc) >= 2) &&
1423                     (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1424                 ) {
1425                     *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1426                     pSrc += 2;
1427                     continue;
1428                 }
1429             }
1430
1431             if(subchar < 0) {
1432                 *pErrorCode = U_INVALID_CHAR_FOUND;
1433                 return NULL;
1434             } else {
1435                 /* function call for error cases */
1436                 ++pSrc; /* continue after the lead byte */
1437                 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1438                 ++numSubstitutions;
1439                 if(subchar<=0xFFFF) {
1440                     *(pDest++)=(UChar)subchar;
1441                 } else {
1442                     *(pDest++)=U16_LEAD(subchar);
1443                     if(pDest<pDestLimit) {
1444                         *(pDest++)=U16_TRAIL(subchar);
1445                     } else {
1446                         reqLength++;
1447                         break;
1448                     }
1449                 }
1450             }
1451         }
1452     }
1453
1454     /* do not fill the dest buffer just count the UChars needed */
1455     while(pSrc < pSrcLimit){
1456         ch = *pSrc;
1457         if(ch <= 0x7f) {
1458             reqLength++;
1459             ++pSrc;
1460         } else {
1461             if(ch >= 0xe0) {
1462                 if( /* handle U+0000..U+FFFF inline */
1463                     ch <= 0xef &&
1464                     ((pSrcLimit - pSrc) >= 3) &&
1465                     (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
1466                     (uint8_t)(pSrc[2] - 0x80) <= 0x3f
1467                 ) {
1468                     reqLength++;
1469                     pSrc += 3;
1470                     continue;
1471                 }
1472             } else {
1473                 if( /* handle U+0000..U+07FF inline */
1474                     ch >= 0xc0 &&
1475                     ((pSrcLimit - pSrc) >= 2) &&
1476                     (uint8_t)(pSrc[1] - 0x80) <= 0x3f
1477                 ) {
1478                     reqLength++;
1479                     pSrc += 2;
1480                     continue;
1481                 }
1482             }
1483
1484             if(subchar < 0) {
1485                 *pErrorCode = U_INVALID_CHAR_FOUND;
1486                 return NULL;
1487             } else {
1488                 /* function call for error cases */
1489                 ++pSrc; /* continue after the lead byte */
1490                 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1491                 ++numSubstitutions;
1492                 reqLength+=U16_LENGTH(ch);
1493             }
1494         }
1495     }
1496
1497     if(pNumSubstitutions!=NULL) {
1498         *pNumSubstitutions=numSubstitutions;
1499     }
1500
1501     reqLength+=(int32_t)(pDest - dest);
1502     if(pDestLength) {
1503         *pDestLength = reqLength;
1504     }
1505
1506     /* Terminate the buffer */
1507     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1508     return dest;
1509 }
1510
1511 U_CAPI char* U_EXPORT2
1512 u_strToJavaModifiedUTF8(
1513         char *dest,
1514         int32_t destCapacity,
1515         int32_t *pDestLength,
1516         const UChar *src,
1517         int32_t srcLength,
1518         UErrorCode *pErrorCode) {
1519     int32_t reqLength=0;
1520     uint32_t ch=0;
1521     uint8_t *pDest = (uint8_t *)dest;
1522     uint8_t *pDestLimit = pDest + destCapacity;
1523     const UChar *pSrcLimit;
1524     int32_t count;
1525
1526     /* args check */
1527     if(U_FAILURE(*pErrorCode)){
1528         return NULL;
1529     }
1530     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1531         (dest==NULL && destCapacity!=0) || destCapacity<0
1532     ) {
1533         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1534         return NULL;
1535     }
1536
1537     if(srcLength==-1) {
1538         /* Convert NUL-terminated ASCII, then find the string length. */
1539         while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
1540             *pDest++ = (uint8_t)ch;
1541             ++src;
1542         }
1543         if(ch == 0) {
1544             reqLength=(int32_t)(pDest - (uint8_t *)dest);
1545             if(pDestLength) {
1546                 *pDestLength = reqLength;
1547             }
1548
1549             /* Terminate the buffer */
1550             u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1551             return dest;
1552         }
1553         srcLength = u_strlen(src);
1554     }
1555
1556     /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1557     pSrcLimit = (src!=NULL)?(src+srcLength):NULL;
1558     for(;;) {
1559         count = (int32_t)(pDestLimit - pDest);
1560         srcLength = (int32_t)(pSrcLimit - src);
1561         if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
1562             /* fast ASCII loop */
1563             const UChar *prevSrc = src;
1564             int32_t delta;
1565             while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
1566                 *pDest++=(uint8_t)ch;
1567                 ++src;
1568             }
1569             delta = (int32_t)(src - prevSrc);
1570             count -= delta;
1571             srcLength -= delta;
1572         }
1573         /*
1574          * Each iteration of the inner loop progresses by at most 3 UTF-8
1575          * bytes and one UChar.
1576          */
1577         count /= 3;
1578         if(count > srcLength) {
1579             count = srcLength; /* min(remaining dest/3, remaining src) */
1580         }
1581         if(count < 3) {
1582             /*
1583              * Too much overhead if we get near the end of the string,
1584              * continue with the next loop.
1585              */
1586             break;
1587         }
1588         do {
1589             ch=*src++;
1590             if(ch <= 0x7f && ch != 0) {
1591                 *pDest++ = (uint8_t)ch;
1592             } else if(ch <= 0x7ff) {
1593                 *pDest++=(uint8_t)((ch>>6)|0xc0);
1594                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1595             } else {
1596                 *pDest++=(uint8_t)((ch>>12)|0xe0);
1597                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1598                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1599             }
1600         } while(--count > 0);
1601     }
1602
1603     while(src<pSrcLimit) {
1604         ch=*src++;
1605         if(ch <= 0x7f && ch != 0) {
1606             if(pDest<pDestLimit) {
1607                 *pDest++ = (uint8_t)ch;
1608             } else {
1609                 reqLength = 1;
1610                 break;
1611             }
1612         } else if(ch <= 0x7ff) {
1613             if((pDestLimit - pDest) >= 2) {
1614                 *pDest++=(uint8_t)((ch>>6)|0xc0);
1615                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1616             } else {
1617                 reqLength = 2;
1618                 break;
1619             }
1620         } else {
1621             if((pDestLimit - pDest) >= 3) {
1622                 *pDest++=(uint8_t)((ch>>12)|0xe0);
1623                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1624                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1625             } else {
1626                 reqLength = 3;
1627                 break;
1628             }
1629         }
1630     }
1631     while(src<pSrcLimit) {
1632         ch=*src++;
1633         if(ch <= 0x7f && ch != 0) {
1634             ++reqLength;
1635         } else if(ch<=0x7ff) {
1636             reqLength+=2;
1637         } else {
1638             reqLength+=3;
1639         }
1640     }
1641
1642     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1643     if(pDestLength){
1644         *pDestLength = reqLength;
1645     }
1646
1647     /* Terminate the buffer */
1648     u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1649     return dest;
1650 }