icuSources/common/ustrtrns.c

   1 /*
   2 ******************************************************************************
   3 *
   4 *   Copyright (C) 2001-2010, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 ******************************************************************************
   8 *
   9 * File ustrtrns.c
  10 *
  11 * Modification History:
  12 *
  13 *   Date        Name        Description
  14 *   9/10/2001    Ram    Creation.
  15 ******************************************************************************
  16 */
  17
  18 /*******************************************************************************
  19  *
  20  * u_strTo* and u_strFrom* APIs
  21  * WCS functions moved to ustr_wcs.c for better modularization
  22  *
  23  *******************************************************************************
  24  */
  25
  26
  27 #include "unicode/putil.h"
  28 #include "unicode/ustring.h"
  29 #include "cstring.h"
  30 #include "cmemory.h"
  31 #include "ustr_imp.h"
  32
  33 U_CAPI UChar* U_EXPORT2
  34 u_strFromUTF32WithSub(UChar *dest,
  35                int32_t destCapacity,
  36                int32_t *pDestLength,
  37                const UChar32 *src,
  38                int32_t srcLength,
  39                UChar32 subchar, int32_t *pNumSubstitutions,
  40                UErrorCode *pErrorCode) {
  41     const UChar32 *srcLimit;
  42     UChar32 ch;
  43     UChar *destLimit;
  44     UChar *pDest;
  45     int32_t reqLength;
  46     int32_t numSubstitutions;
  47
  48     /* args check */
  49     if(U_FAILURE(*pErrorCode)){
  50         return NULL;
  51     }
  52     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
  53         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
  54         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
  55     ) {
  56         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
  57         return NULL;
  58     }
  59
  60     if(pNumSubstitutions != NULL) {
  61         *pNumSubstitutions = 0;
  62     }
  63
  64     pDest = dest;
  65     destLimit = dest + destCapacity;
  66     reqLength = 0;
  67     numSubstitutions = 0;
  68
  69     if(srcLength < 0) {
  70         /* simple loop for conversion of a NUL-terminated BMP string */
  71         while((ch=*src) != 0 &&
  72               ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
  73             ++src;
  74             if(pDest < destLimit) {
  75                 *pDest++ = (UChar)ch;
  76             } else {
  77                 ++reqLength;
  78             }
  79         }
  80         srcLimit = src;
  81         if(ch != 0) {
  82             /* "complicated" case, find the end of the remaining string */
  83             while(*++srcLimit != 0) {}
  84         }
  85     } else {
  86         srcLimit = src + srcLength;
  87     }
  88
  89     /* convert with length */
  90     while(src < srcLimit) {
  91         ch = *src++;
  92         do {
  93             /* usually "loops" once; twice only for writing subchar */
  94             if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
  95                 if(pDest < destLimit) {
  96                     *pDest++ = (UChar)ch;
  97                 } else {
  98                     ++reqLength;
  99                 }
 100                 break;
 101             } else if(0x10000 <= ch && ch <= 0x10ffff) {
 102                 if((pDest + 2) <= destLimit) {
 103                     *pDest++ = U16_LEAD(ch);
 104                     *pDest++ = U16_TRAIL(ch);
 105                 } else {
 106                     reqLength += 2;
 107                 }
 108                 break;
 109             } else if((ch = subchar) < 0) {
 110                 /* surrogate code point, or not a Unicode code point at all */
 111                 *pErrorCode = U_INVALID_CHAR_FOUND;
 112                 return NULL;
 113             } else {
 114                 ++numSubstitutions;
 115             }
 116         } while(TRUE);
 117     }
 118
 119     reqLength += (int32_t)(pDest - dest);
 120     if(pDestLength) {
 121         *pDestLength = reqLength;
 122     }
 123     if(pNumSubstitutions != NULL) {
 124         *pNumSubstitutions = numSubstitutions;
 125     }
 126
 127     /* Terminate the buffer */
 128     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
 129
 130     return dest;
 131 }
 132
 133 U_CAPI UChar* U_EXPORT2
 134 u_strFromUTF32(UChar *dest,
 135                int32_t destCapacity,
 136                int32_t *pDestLength,
 137                const UChar32 *src,
 138                int32_t srcLength,
 139                UErrorCode *pErrorCode) {
 140     return u_strFromUTF32WithSub(
 141             dest, destCapacity, pDestLength,
 142             src, srcLength,
 143             U_SENTINEL, NULL,
 144             pErrorCode);
 145 }
 146
 147 U_CAPI UChar32* U_EXPORT2
 148 u_strToUTF32WithSub(UChar32 *dest,
 149              int32_t destCapacity,
 150              int32_t *pDestLength,
 151              const UChar *src,
 152              int32_t srcLength,
 153              UChar32 subchar, int32_t *pNumSubstitutions,
 154              UErrorCode *pErrorCode) {
 155     const UChar *srcLimit;
 156     UChar32 ch;
 157     UChar ch2;
 158     UChar32 *destLimit;
 159     UChar32 *pDest;
 160     int32_t reqLength;
 161     int32_t numSubstitutions;
 162
 163     /* args check */
 164     if(U_FAILURE(*pErrorCode)){
 165         return NULL;
 166     }
 167     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
 168         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
 169         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
 170     ) {
 171         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
 172         return NULL;
 173     }
 174
 175     if(pNumSubstitutions != NULL) {
 176         *pNumSubstitutions = 0;
 177     }
 178
 179     pDest = dest;
 180     destLimit = dest + destCapacity;
 181     reqLength = 0;
 182     numSubstitutions = 0;
 183
 184     if(srcLength < 0) {
 185         /* simple loop for conversion of a NUL-terminated BMP string */
 186         while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
 187             ++src;
 188             if(pDest < destLimit) {
 189                 *pDest++ = ch;
 190             } else {
 191                 ++reqLength;
 192             }
 193         }
 194         srcLimit = src;
 195         if(ch != 0) {
 196             /* "complicated" case, find the end of the remaining string */
 197             while(*++srcLimit != 0) {}
 198         }
 199     } else {
 200         srcLimit = src + srcLength;
 201     }
 202
 203     /* convert with length */
 204     while(src < srcLimit) {
 205         ch = *src++;
 206         if(!U16_IS_SURROGATE(ch)) {
 207             /* write or count ch below */
 208         } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
 209             ++src;
 210             ch = U16_GET_SUPPLEMENTARY(ch, ch2);
 211         } else if((ch = subchar) < 0) {
 212             /* unpaired surrogate */
 213             *pErrorCode = U_INVALID_CHAR_FOUND;
 214             return NULL;
 215         } else {
 216             ++numSubstitutions;
 217         }
 218         if(pDest < destLimit) {
 219             *pDest++ = ch;
 220         } else {
 221             ++reqLength;
 222         }
 223     }
 224
 225     reqLength += (int32_t)(pDest - dest);
 226     if(pDestLength) {
 227         *pDestLength = reqLength;
 228     }
 229     if(pNumSubstitutions != NULL) {
 230         *pNumSubstitutions = numSubstitutions;
 231     }
 232
 233     /* Terminate the buffer */
 234     u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
 235
 236     return dest;
 237 }
 238
 239 U_CAPI UChar32* U_EXPORT2
 240 u_strToUTF32(UChar32 *dest,
 241              int32_t destCapacity,
 242              int32_t *pDestLength,
 243              const UChar *src,
 244              int32_t srcLength,
 245              UErrorCode *pErrorCode) {
 246     return u_strToUTF32WithSub(
 247             dest, destCapacity, pDestLength,
 248             src, srcLength,
 249             U_SENTINEL, NULL,
 250             pErrorCode);
 251 }
 252
 253 /* for utf8_nextCharSafeBodyTerminated() */
 254 static const UChar32
 255 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
 256
 257 /*
 258  * Version of utf8_nextCharSafeBody() with the following differences:
 259  * - checks for NUL termination instead of length
 260  * - works with pointers instead of indexes
 261  * - always strict (strict==-1)
 262  *
 263  * *ps points to after the lead byte and will be moved to after the last trail byte.
 264  * c is the lead byte.
 265  * @return the code point, or U_SENTINEL
 266  */
 267 static UChar32
 268 utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
 269     const uint8_t *s=*ps;
 270     uint8_t trail, illegal=0;
 271     uint8_t count=UTF8_COUNT_TRAIL_BYTES(c);
 272     UTF8_MASK_LEAD_BYTE((c), count);
 273     /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
 274     switch(count) {
 275     /* each branch falls through to the next one */
 276     case 5:
 277     case 4:
 278         /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
 279         illegal=1;
 280         break;
 281     case 3:
 282         trail=(uint8_t)(*s++ - 0x80);
 283         c=(c<<6)|trail;
 284         if(trail>0x3f || c>=0x110) {
 285             /* not a trail byte, or code point>0x10ffff (outside Unicode) */
 286             illegal=1;
 287             break;
 288         }
 289     case 2:
 290         trail=(uint8_t)(*s++ - 0x80);
 291         if(trail>0x3f) {
 292             /* not a trail byte */
 293             illegal=1;
 294             break;
 295         }
 296         c=(c<<6)|trail;
 297     case 1:
 298         trail=(uint8_t)(*s++ - 0x80);
 299         if(trail>0x3f) {
 300             /* not a trail byte */
 301             illegal=1;
 302         }
 303         c=(c<<6)|trail;
 304         break;
 305     case 0:
 306         return U_SENTINEL;
 307     /* no default branch to optimize switch()  - all values are covered */
 308     }
 309
 310     /* correct sequence - all trail bytes have (b7..b6)==(10)? */
 311     /* illegal is also set if count>=4 */
 312     if(illegal || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) {
 313         /* error handling */
 314         /* don't go beyond this sequence */
 315         s=*ps;
 316         while(count>0 && UTF8_IS_TRAIL(*s)) {
 317             ++s;
 318             --count;
 319         }
 320         c=U_SENTINEL;
 321     }
 322     *ps=s;
 323     return c;
 324 }
 325
 326 /*
 327  * Version of utf8_nextCharSafeBody() with the following differences:
 328  * - works with pointers instead of indexes
 329  * - always strict (strict==-1)
 330  *
 331  * *ps points to after the lead byte and will be moved to after the last trail byte.
 332  * c is the lead byte.
 333  * @return the code point, or U_SENTINEL
 334  */
 335 static UChar32
 336 utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
 337     const uint8_t *s=*ps;
 338     uint8_t trail, illegal=0;
 339     uint8_t count=UTF8_COUNT_TRAIL_BYTES(c);
 340     if((limit-s)>=count) {
 341         UTF8_MASK_LEAD_BYTE((c), count);
 342         /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
 343         switch(count) {
 344         /* each branch falls through to the next one */
 345         case 5:
 346         case 4:
 347             /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
 348             illegal=1;
 349             break;
 350         case 3:
 351             trail=*s++;
 352             c=(c<<6)|(trail&0x3f);
 353             if(c<0x110) {
 354                 illegal|=(trail&0xc0)^0x80;
 355             } else {
 356                 /* code point>0x10ffff, outside Unicode */
 357                 illegal=1;
 358                 break;
 359             }
 360         case 2:
 361             trail=*s++;
 362             c=(c<<6)|(trail&0x3f);
 363             illegal|=(trail&0xc0)^0x80;
 364         case 1:
 365             trail=*s++;
 366             c=(c<<6)|(trail&0x3f);
 367             illegal|=(trail&0xc0)^0x80;
 368             break;
 369         case 0:
 370             return U_SENTINEL;
 371         /* no default branch to optimize switch()  - all values are covered */
 372         }
 373     } else {
 374         illegal=1; /* too few bytes left */
 375     }
 376
 377     /* correct sequence - all trail bytes have (b7..b6)==(10)? */
 378     /* illegal is also set if count>=4 */
 379     if(illegal || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) {
 380         /* error handling */
 381         /* don't go beyond this sequence */
 382         s=*ps;
 383         while(count>0 && s<limit && UTF8_IS_TRAIL(*s)) {
 384             ++s;
 385             --count;
 386         }
 387         c=U_SENTINEL;
 388     }
 389     *ps=s;
 390     return c;
 391 }
 392
 393 U_CAPI UChar* U_EXPORT2
 394 u_strFromUTF8WithSub(UChar *dest,
 395               int32_t destCapacity,
 396               int32_t *pDestLength,
 397               const char* src,
 398               int32_t srcLength,
 399               UChar32 subchar, int32_t *pNumSubstitutions,
 400               UErrorCode *pErrorCode){
 401     UChar *pDest = dest;
 402     UChar *pDestLimit = dest+destCapacity;
 403     UChar32 ch;
 404     int32_t reqLength = 0;
 405     const uint8_t* pSrc = (const uint8_t*) src;
 406     uint8_t t1, t2; /* trail bytes */
 407     int32_t numSubstitutions;
 408
 409     /* args check */
 410     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
 411         return NULL;
 412     }
 413
 414     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
 415         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
 416         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
 417     ) {
 418         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
 419         return NULL;
 420     }
 421
 422     if(pNumSubstitutions!=NULL) {
 423         *pNumSubstitutions=0;
 424     }
 425     numSubstitutions=0;
 426
 427     /*
 428      * Inline processing of UTF-8 byte sequences:
 429      *
 430      * Byte sequences for the most common characters are handled inline in
 431      * the conversion loops. In order to reduce the path lengths for those
 432      * characters, the tests are arranged in a kind of binary search.
 433      * ASCII (<=0x7f) is checked first, followed by the dividing point
 434      * between 2- and 3-byte sequences (0xe0).
 435      * The 3-byte branch is tested first to speed up CJK text.
 436      * The compiler should combine the subtractions for the two tests for 0xe0.
 437      * Each branch then tests for the other end of its range.
 438      */
 439
 440     if(srcLength < 0){
 441         /*
 442          * Transform a NUL-terminated string.
 443          * The code explicitly checks for NULs only in the lead byte position.
 444          * A NUL byte in the trail byte position fails the trail byte range check anyway.
 445          */
 446         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
 447             if(ch <= 0x7f){
 448                 *pDest++=(UChar)ch;
 449                 ++pSrc;
 450             } else {
 451                 if(ch > 0xe0) {
 452                     if( /* handle U+1000..U+CFFF inline */
 453                         ch <= 0xec &&
 454                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
 455                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
 456                     ) {
 457                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 458                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
 459                         pSrc += 3;
 460                         continue;
 461                     }
 462                 } else if(ch < 0xe0) {
 463                     if( /* handle U+0080..U+07FF inline */
 464                         ch >= 0xc2 &&
 465                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
 466                     ) {
 467                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
 468                         pSrc += 2;
 469                         continue;
 470                     }
 471                 }
 472
 473                 /* function call for "complicated" and error cases */
 474                 ++pSrc; /* continue after the lead byte */
 475                 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
 476                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
 477                     *pErrorCode = U_INVALID_CHAR_FOUND;
 478                     return NULL;
 479                 } else if(ch<=0xFFFF) {
 480                     *(pDest++)=(UChar)ch;
 481                 } else {
 482                     *(pDest++)=UTF16_LEAD(ch);
 483                     if(pDest<pDestLimit) {
 484                         *(pDest++)=UTF16_TRAIL(ch);
 485                     } else {
 486                         reqLength++;
 487                         break;
 488                     }
 489                 }
 490             }
 491         }
 492
 493         /* Pre-flight the rest of the string. */
 494         while((ch = *pSrc) != 0) {
 495             if(ch <= 0x7f){
 496                 ++reqLength;
 497                 ++pSrc;
 498             } else {
 499                 if(ch > 0xe0) {
 500                     if( /* handle U+1000..U+CFFF inline */
 501                         ch <= 0xec &&
 502                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
 503                         (uint8_t)(pSrc[2] - 0x80) <= 0x3f
 504                     ) {
 505                         ++reqLength;
 506                         pSrc += 3;
 507                         continue;
 508                     }
 509                 } else if(ch < 0xe0) {
 510                     if( /* handle U+0080..U+07FF inline */
 511                         ch >= 0xc2 &&
 512                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f
 513                     ) {
 514                         ++reqLength;
 515                         pSrc += 2;
 516                         continue;
 517                     }
 518                 }
 519
 520                 /* function call for "complicated" and error cases */
 521                 ++pSrc; /* continue after the lead byte */
 522                 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
 523                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
 524                     *pErrorCode = U_INVALID_CHAR_FOUND;
 525                     return NULL;
 526                 }
 527                 reqLength += U16_LENGTH(ch);
 528             }
 529         }
 530     } else /* srcLength >= 0 */ {
 531         const uint8_t *pSrcLimit = pSrc + srcLength;
 532         int32_t count;
 533
 534         /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
 535         for(;;) {
 536             /*
 537              * Each iteration of the inner loop progresses by at most 3 UTF-8
 538              * bytes and one UChar, for most characters.
 539              * For supplementary code points (4 & 2), which are rare,
 540              * there is an additional adjustment.
 541              */
 542             count = (int32_t)(pDestLimit - pDest);
 543             srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
 544             if(count > srcLength) {
 545                 count = srcLength; /* min(remaining dest, remaining src/3) */
 546             }
 547             if(count < 3) {
 548                 /*
 549                  * Too much overhead if we get near the end of the string,
 550                  * continue with the next loop.
 551                  */
 552                 break;
 553             }
 554
 555             do {
 556                 ch = *pSrc;
 557                 if(ch <= 0x7f){
 558                     *pDest++=(UChar)ch;
 559                     ++pSrc;
 560                 } else {
 561                     if(ch > 0xe0) {
 562                         if( /* handle U+1000..U+CFFF inline */
 563                             ch <= 0xec &&
 564                             (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
 565                             (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
 566                         ) {
 567                             /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 568                             *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
 569                             pSrc += 3;
 570                             continue;
 571                         }
 572                     } else if(ch < 0xe0) {
 573                         if( /* handle U+0080..U+07FF inline */
 574                             ch >= 0xc2 &&
 575                             (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
 576                         ) {
 577                             *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
 578                             pSrc += 2;
 579                             continue;
 580                         }
 581                     }
 582
 583                     if(ch >= 0xf0 || subchar > 0xffff) {
 584                         /*
 585                          * We may read up to six bytes and write up to two UChars,
 586                          * which we didn't account for with computing count,
 587                          * so we adjust it here.
 588                          */
 589                         if(--count == 0) {
 590                             break;
 591                         }
 592                     }
 593
 594                     /* function call for "complicated" and error cases */
 595                     ++pSrc; /* continue after the lead byte */
 596                     ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
 597                     if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
 598                         *pErrorCode = U_INVALID_CHAR_FOUND;
 599                         return NULL;
 600                     }else if(ch<=0xFFFF){
 601                         *(pDest++)=(UChar)ch;
 602                     }else{
 603                         *(pDest++)=UTF16_LEAD(ch);
 604                         *(pDest++)=UTF16_TRAIL(ch);
 605                     }
 606                 }
 607             } while(--count > 0);
 608         }
 609
 610         while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
 611             ch = *pSrc;
 612             if(ch <= 0x7f){
 613                 *pDest++=(UChar)ch;
 614                 ++pSrc;
 615             } else {
 616                 if(ch > 0xe0) {
 617                     if( /* handle U+1000..U+CFFF inline */
 618                         ch <= 0xec &&
 619                         ((pSrcLimit - pSrc) >= 3) &&
 620                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
 621                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
 622                     ) {
 623                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 624                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
 625                         pSrc += 3;
 626                         continue;
 627                     }
 628                 } else if(ch < 0xe0) {
 629                     if( /* handle U+0080..U+07FF inline */
 630                         ch >= 0xc2 &&
 631                         ((pSrcLimit - pSrc) >= 2) &&
 632                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
 633                     ) {
 634                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
 635                         pSrc += 2;
 636                         continue;
 637                     }
 638                 }
 639
 640                 /* function call for "complicated" and error cases */
 641                 ++pSrc; /* continue after the lead byte */
 642                 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
 643                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
 644                     *pErrorCode = U_INVALID_CHAR_FOUND;
 645                     return NULL;
 646                 }else if(ch<=0xFFFF){
 647                     *(pDest++)=(UChar)ch;
 648                 }else{
 649                     *(pDest++)=UTF16_LEAD(ch);
 650                     if(pDest<pDestLimit){
 651                         *(pDest++)=UTF16_TRAIL(ch);
 652                     }else{
 653                         reqLength++;
 654                         break;
 655                     }
 656                 }
 657             }
 658         }
 659         /* do not fill the dest buffer just count the UChars needed */
 660         while(pSrc < pSrcLimit){
 661             ch = *pSrc;
 662             if(ch <= 0x7f){
 663                 reqLength++;
 664                 ++pSrc;
 665             } else {
 666                 if(ch > 0xe0) {
 667                     if( /* handle U+1000..U+CFFF inline */
 668                         ch <= 0xec &&
 669                         ((pSrcLimit - pSrc) >= 3) &&
 670                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
 671                         (uint8_t)(pSrc[2] - 0x80) <= 0x3f
 672                     ) {
 673                         reqLength++;
 674                         pSrc += 3;
 675                         continue;
 676                     }
 677                 } else if(ch < 0xe0) {
 678                     if( /* handle U+0080..U+07FF inline */
 679                         ch >= 0xc2 &&
 680                         ((pSrcLimit - pSrc) >= 2) &&
 681                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f
 682                     ) {
 683                         reqLength++;
 684                         pSrc += 2;
 685                         continue;
 686                     }
 687                 }
 688
 689                 /* function call for "complicated" and error cases */
 690                 ++pSrc; /* continue after the lead byte */
 691                 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
 692                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
 693                     *pErrorCode = U_INVALID_CHAR_FOUND;
 694                     return NULL;
 695                 }
 696                 reqLength+=UTF_CHAR_LENGTH(ch);
 697             }
 698         }
 699     }
 700
 701     reqLength+=(int32_t)(pDest - dest);
 702
 703     if(pNumSubstitutions!=NULL) {
 704         *pNumSubstitutions=numSubstitutions;
 705     }
 706
 707     if(pDestLength){
 708         *pDestLength = reqLength;
 709     }
 710
 711     /* Terminate the buffer */
 712     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
 713
 714     return dest;
 715 }
 716
 717 U_CAPI UChar* U_EXPORT2
 718 u_strFromUTF8(UChar *dest,
 719               int32_t destCapacity,
 720               int32_t *pDestLength,
 721               const char* src,
 722               int32_t srcLength,
 723               UErrorCode *pErrorCode){
 724     return u_strFromUTF8WithSub(
 725             dest, destCapacity, pDestLength,
 726             src, srcLength,
 727             U_SENTINEL, NULL,
 728             pErrorCode);
 729 }
 730
 731 U_CAPI UChar * U_EXPORT2
 732 u_strFromUTF8Lenient(UChar *dest,
 733                      int32_t destCapacity,
 734                      int32_t *pDestLength,
 735                      const char *src,
 736                      int32_t srcLength,
 737                      UErrorCode *pErrorCode) {
 738     UChar *pDest = dest;
 739     UChar32 ch;
 740     int32_t reqLength = 0;
 741     uint8_t* pSrc = (uint8_t*) src;
 742
 743     /* args check */
 744     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
 745         return NULL;
 746     }
 747
 748     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
 749         (destCapacity<0) || (dest == NULL && destCapacity > 0)
 750     ) {
 751         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
 752         return NULL;
 753     }
 754
 755     if(srcLength < 0) {
 756         /* Transform a NUL-terminated string. */
 757         UChar *pDestLimit = dest+destCapacity;
 758         uint8_t t1, t2, t3; /* trail bytes */
 759
 760         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
 761             if(ch < 0xc0) {
 762                 /*
 763                  * ASCII, or a trail byte in lead position which is treated like
 764                  * a single-byte sequence for better character boundary
 765                  * resynchronization after illegal sequences.
 766                  */
 767                 *pDest++=(UChar)ch;
 768                 ++pSrc;
 769                 continue;
 770             } else if(ch < 0xe0) { /* U+0080..U+07FF */
 771                 if((t1 = pSrc[1]) != 0) {
 772                     /* 0x3080 = (0xc0 << 6) + 0x80 */
 773                     *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
 774                     pSrc += 2;
 775                     continue;
 776                 }
 777             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
 778                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
 779                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 780                     /* 0x2080 = (0x80 << 6) + 0x80 */
 781                     *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
 782                     pSrc += 3;
 783                     continue;
 784                 }
 785             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
 786                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
 787                     pSrc += 4;
 788                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
 789                     ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
 790                     *(pDest++) = U16_LEAD(ch);
 791                     if(pDest < pDestLimit) {
 792                         *(pDest++) = U16_TRAIL(ch);
 793                     } else {
 794                         reqLength = 1;
 795                         break;
 796                     }
 797                     continue;
 798                 }
 799             }
 800
 801             /* truncated character at the end */
 802             *pDest++ = 0xfffd;
 803             while(*++pSrc != 0) {}
 804             break;
 805         }
 806
 807         /* Pre-flight the rest of the string. */
 808         while((ch = *pSrc) != 0) {
 809             if(ch < 0xc0) {
 810                 /*
 811                  * ASCII, or a trail byte in lead position which is treated like
 812                  * a single-byte sequence for better character boundary
 813                  * resynchronization after illegal sequences.
 814                  */
 815                 ++reqLength;
 816                 ++pSrc;
 817                 continue;
 818             } else if(ch < 0xe0) { /* U+0080..U+07FF */
 819                 if(pSrc[1] != 0) {
 820                     ++reqLength;
 821                     pSrc += 2;
 822                     continue;
 823                 }
 824             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
 825                 if(pSrc[1] != 0 && pSrc[2] != 0) {
 826                     ++reqLength;
 827                     pSrc += 3;
 828                     continue;
 829                 }
 830             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
 831                 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
 832                     reqLength += 2;
 833                     pSrc += 4;
 834                     continue;
 835                 }
 836             }
 837
 838             /* truncated character at the end */
 839             ++reqLength;
 840             break;
 841         }
 842     } else /* srcLength >= 0 */ {
 843         const uint8_t *pSrcLimit = pSrc + srcLength;
 844
 845         /*
 846          * This function requires that if srcLength is given, then it must be
 847          * destCapatity >= srcLength so that we need not check for
 848          * destination buffer overflow in the loop.
 849          */
 850         if(destCapacity < srcLength) {
 851             if(pDestLength != NULL) {
 852                 *pDestLength = srcLength; /* this likely overestimates the true destLength! */
 853             }
 854             *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
 855             return NULL;
 856         }
 857
 858         if((pSrcLimit - pSrc) >= 4) {
 859             pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
 860
 861             /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
 862             do {
 863                 ch = *pSrc++;
 864                 if(ch < 0xc0) {
 865                     /*
 866                      * ASCII, or a trail byte in lead position which is treated like
 867                      * a single-byte sequence for better character boundary
 868                      * resynchronization after illegal sequences.
 869                      */
 870                     *pDest++=(UChar)ch;
 871                 } else if(ch < 0xe0) { /* U+0080..U+07FF */
 872                     /* 0x3080 = (0xc0 << 6) + 0x80 */
 873                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
 874                 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
 875                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 876                     /* 0x2080 = (0x80 << 6) + 0x80 */
 877                     ch = (ch << 12) + (*pSrc++ << 6);
 878                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
 879                 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
 880                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
 881                     ch = (ch << 18) + (*pSrc++ << 12);
 882                     ch += *pSrc++ << 6;
 883                     ch += *pSrc++ - 0x3c82080;
 884                     *(pDest++) = U16_LEAD(ch);
 885                     *(pDest++) = U16_TRAIL(ch);
 886                 }
 887             } while(pSrc < pSrcLimit);
 888
 889             pSrcLimit += 3; /* restore original pSrcLimit */
 890         }
 891
 892         while(pSrc < pSrcLimit) {
 893             ch = *pSrc++;
 894             if(ch < 0xc0) {
 895                 /*
 896                  * ASCII, or a trail byte in lead position which is treated like
 897                  * a single-byte sequence for better character boundary
 898                  * resynchronization after illegal sequences.
 899                  */
 900                 *pDest++=(UChar)ch;
 901                 continue;
 902             } else if(ch < 0xe0) { /* U+0080..U+07FF */
 903                 if(pSrc < pSrcLimit) {
 904                     /* 0x3080 = (0xc0 << 6) + 0x80 */
 905                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
 906                     continue;
 907                 }
 908             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
 909                 if((pSrcLimit - pSrc) >= 2) {
 910                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 911                     /* 0x2080 = (0x80 << 6) + 0x80 */
 912                     ch = (ch << 12) + (*pSrc++ << 6);
 913                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
 914                     pSrc += 3;
 915                     continue;
 916                 }
 917             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
 918                 if((pSrcLimit - pSrc) >= 3) {
 919                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
 920                     ch = (ch << 18) + (*pSrc++ << 12);
 921                     ch += *pSrc++ << 6;
 922                     ch += *pSrc++ - 0x3c82080;
 923                     *(pDest++) = U16_LEAD(ch);
 924                     *(pDest++) = U16_TRAIL(ch);
 925                     pSrc += 4;
 926                     continue;
 927                 }
 928             }
 929
 930             /* truncated character at the end */
 931             *pDest++ = 0xfffd;
 932             break;
 933         }
 934     }
 935
 936     reqLength+=(int32_t)(pDest - dest);
 937
 938     if(pDestLength){
 939         *pDestLength = reqLength;
 940     }
 941
 942     /* Terminate the buffer */
 943     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
 944
 945     return dest;
 946 }
 947
 948 static U_INLINE uint8_t *
 949 _appendUTF8(uint8_t *pDest, UChar32 c) {
 950     /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
 951     if((c)<=0x7f) {
 952         *pDest++=(uint8_t)c;
 953     } else if(c<=0x7ff) {
 954         *pDest++=(uint8_t)((c>>6)|0xc0);
 955         *pDest++=(uint8_t)((c&0x3f)|0x80);
 956     } else if(c<=0xffff) {
 957         *pDest++=(uint8_t)((c>>12)|0xe0);
 958         *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
 959         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
 960     } else /* if((uint32_t)(c)<=0x10ffff) */ {
 961         *pDest++=(uint8_t)(((c)>>18)|0xf0);
 962         *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
 963         *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
 964         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
 965     }
 966     return pDest;
 967 }
 968
 969
 970 U_CAPI char* U_EXPORT2
 971 u_strToUTF8WithSub(char *dest,
 972             int32_t destCapacity,
 973             int32_t *pDestLength,
 974             const UChar *pSrc,
 975             int32_t srcLength,
 976             UChar32 subchar, int32_t *pNumSubstitutions,
 977             UErrorCode *pErrorCode){
 978     int32_t reqLength=0;
 979     uint32_t ch=0,ch2=0;
 980     uint8_t *pDest = (uint8_t *)dest;
 981     uint8_t *pDestLimit = pDest + destCapacity;
 982     int32_t numSubstitutions;
 983
 984     /* args check */
 985     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
 986         return NULL;
 987     }
 988
 989     if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
 990         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
 991         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
 992     ) {
 993         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
 994         return NULL;
 995     }
 996
 997     if(pNumSubstitutions!=NULL) {
 998         *pNumSubstitutions=0;
 999     }
1000     numSubstitutions=0;
1001
1002     if(srcLength==-1) {
1003         while((ch=*pSrc)!=0) {
1004             ++pSrc;
1005             if(ch <= 0x7f) {
1006                 if(pDest<pDestLimit) {
1007                     *pDest++ = (uint8_t)ch;
1008                 } else {
1009                     reqLength = 1;
1010                     break;
1011                 }
1012             } else if(ch <= 0x7ff) {
1013                 if((pDestLimit - pDest) >= 2) {
1014                     *pDest++=(uint8_t)((ch>>6)|0xc0);
1015                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1016                 } else {
1017                     reqLength = 2;
1018                     break;
1019                 }
1020             } else if(ch <= 0xd7ff || ch >= 0xe000) {
1021                 if((pDestLimit - pDest) >= 3) {
1022                     *pDest++=(uint8_t)((ch>>12)|0xe0);
1023                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1024                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1025                 } else {
1026                     reqLength = 3;
1027                     break;
1028                 }
1029             } else /* ch is a surrogate */ {
1030                 int32_t length;
1031
1032                 /*need not check for NUL because NUL fails UTF_IS_TRAIL() anyway*/
1033                 if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
1034                     ++pSrc;
1035                     ch=UTF16_GET_PAIR_VALUE(ch, ch2);
1036                 } else if(subchar>=0) {
1037                     ch=subchar;
1038                     ++numSubstitutions;
1039                 } else {
1040                     /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1041                     *pErrorCode = U_INVALID_CHAR_FOUND;
1042                     return NULL;
1043                 }
1044
1045                 length = U8_LENGTH(ch);
1046                 if((pDestLimit - pDest) >= length) {
1047                     /* convert and append*/
1048                     pDest=_appendUTF8(pDest, ch);
1049                 } else {
1050                     reqLength = length;
1051                     break;
1052                 }
1053             }
1054         }
1055         while((ch=*pSrc++)!=0) {
1056             if(ch<=0x7f) {
1057                 ++reqLength;
1058             } else if(ch<=0x7ff) {
1059                 reqLength+=2;
1060             } else if(!UTF_IS_SURROGATE(ch)) {
1061                 reqLength+=3;
1062             } else if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
1063                 ++pSrc;
1064                 reqLength+=4;
1065             } else if(subchar>=0) {
1066                 reqLength+=U8_LENGTH(subchar);
1067                 ++numSubstitutions;
1068             } else {
1069                 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1070                 *pErrorCode = U_INVALID_CHAR_FOUND;
1071                 return NULL;
1072             }
1073         }
1074     } else {
1075         const UChar *pSrcLimit = pSrc+srcLength;
1076         int32_t count;
1077
1078         /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1079         for(;;) {
1080             /*
1081              * Each iteration of the inner loop progresses by at most 3 UTF-8
1082              * bytes and one UChar, for most characters.
1083              * For supplementary code points (4 & 2), which are rare,
1084              * there is an additional adjustment.
1085              */
1086             count = (int32_t)((pDestLimit - pDest) / 3);
1087             srcLength = (int32_t)(pSrcLimit - pSrc);
1088             if(count > srcLength) {
1089                 count = srcLength; /* min(remaining dest/3, remaining src) */
1090             }
1091             if(count < 3) {
1092                 /*
1093                  * Too much overhead if we get near the end of the string,
1094                  * continue with the next loop.
1095                  */
1096                 break;
1097             }
1098             do {
1099                 ch=*pSrc++;
1100                 if(ch <= 0x7f) {
1101                     *pDest++ = (uint8_t)ch;
1102                 } else if(ch <= 0x7ff) {
1103                     *pDest++=(uint8_t)((ch>>6)|0xc0);
1104                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1105                 } else if(ch <= 0xd7ff || ch >= 0xe000) {
1106                     *pDest++=(uint8_t)((ch>>12)|0xe0);
1107                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1108                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1109                 } else /* ch is a surrogate */ {
1110                     /*
1111                      * We will read two UChars and probably output four bytes,
1112                      * which we didn't account for with computing count,
1113                      * so we adjust it here.
1114                      */
1115                     if(--count == 0) {
1116                         --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
1117                         break;  /* recompute count */
1118                     }
1119
1120                     if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
1121                         ++pSrc;
1122                         ch=UTF16_GET_PAIR_VALUE(ch, ch2);
1123
1124                         /* writing 4 bytes per 2 UChars is ok */
1125                         *pDest++=(uint8_t)((ch>>18)|0xf0);
1126                         *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
1127                         *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1128                         *pDest++=(uint8_t)((ch&0x3f)|0x80);
1129                     } else  {
1130                         /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1131                         if(subchar>=0) {
1132                             ch=subchar;
1133                             ++numSubstitutions;
1134                         } else {
1135                             *pErrorCode = U_INVALID_CHAR_FOUND;
1136                             return NULL;
1137                         }
1138
1139                         /* convert and append*/
1140                         pDest=_appendUTF8(pDest, ch);
1141                     }
1142                 }
1143             } while(--count > 0);
1144         }
1145
1146         while(pSrc<pSrcLimit) {
1147             ch=*pSrc++;
1148             if(ch <= 0x7f) {
1149                 if(pDest<pDestLimit) {
1150                     *pDest++ = (uint8_t)ch;
1151                 } else {
1152                     reqLength = 1;
1153                     break;
1154                 }
1155             } else if(ch <= 0x7ff) {
1156                 if((pDestLimit - pDest) >= 2) {
1157                     *pDest++=(uint8_t)((ch>>6)|0xc0);
1158                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1159                 } else {
1160                     reqLength = 2;
1161                     break;
1162                 }
1163             } else if(ch <= 0xd7ff || ch >= 0xe000) {
1164                 if((pDestLimit - pDest) >= 3) {
1165                     *pDest++=(uint8_t)((ch>>12)|0xe0);
1166                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1167                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1168                 } else {
1169                     reqLength = 3;
1170                     break;
1171                 }
1172             } else /* ch is a surrogate */ {
1173                 int32_t length;
1174
1175                 if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) {
1176                     ++pSrc;
1177                     ch=UTF16_GET_PAIR_VALUE(ch, ch2);
1178                 } else if(subchar>=0) {
1179                     ch=subchar;
1180                     ++numSubstitutions;
1181                 } else {
1182                     /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1183                     *pErrorCode = U_INVALID_CHAR_FOUND;
1184                     return NULL;
1185                 }
1186
1187                 length = U8_LENGTH(ch);
1188                 if((pDestLimit - pDest) >= length) {
1189                     /* convert and append*/
1190                     pDest=_appendUTF8(pDest, ch);
1191                 } else {
1192                     reqLength = length;
1193                     break;
1194                 }
1195             }
1196         }
1197         while(pSrc<pSrcLimit) {
1198             ch=*pSrc++;
1199             if(ch<=0x7f) {
1200                 ++reqLength;
1201             } else if(ch<=0x7ff) {
1202                 reqLength+=2;
1203             } else if(!UTF_IS_SURROGATE(ch)) {
1204                 reqLength+=3;
1205             } else if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) {
1206                 ++pSrc;
1207                 reqLength+=4;
1208             } else if(subchar>=0) {
1209                 reqLength+=U8_LENGTH(subchar);
1210                 ++numSubstitutions;
1211             } else {
1212                 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1213                 *pErrorCode = U_INVALID_CHAR_FOUND;
1214                 return NULL;
1215             }
1216         }
1217     }
1218
1219     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1220
1221     if(pNumSubstitutions!=NULL) {
1222         *pNumSubstitutions=numSubstitutions;
1223     }
1224
1225     if(pDestLength){
1226         *pDestLength = reqLength;
1227     }
1228
1229     /* Terminate the buffer */
1230     u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1231     return dest;
1232 }
1233
1234 U_CAPI char* U_EXPORT2
1235 u_strToUTF8(char *dest,
1236             int32_t destCapacity,
1237             int32_t *pDestLength,
1238             const UChar *pSrc,
1239             int32_t srcLength,
1240             UErrorCode *pErrorCode){
1241     return u_strToUTF8WithSub(
1242             dest, destCapacity, pDestLength,
1243             pSrc, srcLength,
1244             U_SENTINEL, NULL,
1245             pErrorCode);
1246 }
1247
1248 U_CAPI UChar* U_EXPORT2
1249 u_strFromJavaModifiedUTF8WithSub(
1250         UChar *dest,
1251         int32_t destCapacity,
1252         int32_t *pDestLength,
1253         const char *src,
1254         int32_t srcLength,
1255         UChar32 subchar, int32_t *pNumSubstitutions,
1256         UErrorCode *pErrorCode) {
1257     UChar *pDest = dest;
1258     UChar *pDestLimit = dest+destCapacity;
1259     UChar32 ch;
1260     int32_t reqLength = 0;
1261     const uint8_t* pSrc = (const uint8_t*) src;
1262     const uint8_t *pSrcLimit;
1263     int32_t count;
1264     uint8_t t1, t2; /* trail bytes */
1265     int32_t numSubstitutions;
1266
1267     /* args check */
1268     if(U_FAILURE(*pErrorCode)){
1269         return NULL;
1270     }
1271     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1272         (dest==NULL && destCapacity!=0) || destCapacity<0 ||
1273         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1274     ) {
1275         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1276         return NULL;
1277     }
1278
1279     if(pNumSubstitutions!=NULL) {
1280         *pNumSubstitutions=0;
1281     }
1282     numSubstitutions=0;
1283
1284     if(srcLength < 0) {
1285         /*
1286          * Transform a NUL-terminated ASCII string.
1287          * Handle non-ASCII strings with slower code.
1288          */
1289         while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
1290             *pDest++=(UChar)ch;
1291             ++pSrc;
1292         }
1293         if(ch == 0) {
1294             reqLength=(int32_t)(pDest - dest);
1295             if(pDestLength) {
1296                 *pDestLength = reqLength;
1297             }
1298
1299             /* Terminate the buffer */
1300             u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1301             return dest;
1302         }
1303         srcLength = uprv_strlen((const char *)pSrc);
1304     }
1305
1306     /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1307     pSrcLimit = pSrc + srcLength;
1308     for(;;) {
1309         count = (int32_t)(pDestLimit - pDest);
1310         srcLength = (int32_t)(pSrcLimit - pSrc);
1311         if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
1312             /* fast ASCII loop */
1313             const uint8_t *prevSrc = pSrc;
1314             int32_t delta;
1315             while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
1316                 *pDest++=(UChar)ch;
1317                 ++pSrc;
1318             }
1319             delta = (int32_t)(pSrc - prevSrc);
1320             count -= delta;
1321             srcLength -= delta;
1322         }
1323         /*
1324          * Each iteration of the inner loop progresses by at most 3 UTF-8
1325          * bytes and one UChar.
1326          */
1327         srcLength /= 3;
1328         if(count > srcLength) {
1329             count = srcLength; /* min(remaining dest, remaining src/3) */
1330         }
1331         if(count < 3) {
1332             /*
1333              * Too much overhead if we get near the end of the string,
1334              * continue with the next loop.
1335              */
1336             break;
1337         }
1338         do {
1339             ch = *pSrc;
1340             if(ch <= 0x7f){
1341                 *pDest++=(UChar)ch;
1342                 ++pSrc;
1343             } else {
1344                 if(ch >= 0xe0) {
1345                     if( /* handle U+0000..U+FFFF inline */
1346                         ch <= 0xef &&
1347                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1348                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1349                     ) {
1350                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1351                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1352                         pSrc += 3;
1353                         continue;
1354                     }
1355                 } else {
1356                     if( /* handle U+0000..U+07FF inline */
1357                         ch >= 0xc0 &&
1358                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1359                     ) {
1360                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1361                         pSrc += 2;
1362                         continue;
1363                     }
1364                 }
1365
1366                 if(subchar < 0) {
1367                     *pErrorCode = U_INVALID_CHAR_FOUND;
1368                     return NULL;
1369                 } else if(subchar > 0xffff && --count == 0) {
1370                     /*
1371                      * We need to write two UChars, adjusted count for that,
1372                      * and ran out of space.
1373                      */
1374                     break;
1375                 } else {
1376                     /* function call for error cases */
1377                     ++pSrc; /* continue after the lead byte */
1378                     utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1379                     ++numSubstitutions;
1380                     if(subchar<=0xFFFF) {
1381                         *(pDest++)=(UChar)subchar;
1382                     } else {
1383                         *(pDest++)=U16_LEAD(subchar);
1384                         *(pDest++)=U16_TRAIL(subchar);
1385                     }
1386                 }
1387             }
1388         } while(--count > 0);
1389     }
1390
1391     while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
1392         ch = *pSrc;
1393         if(ch <= 0x7f){
1394             *pDest++=(UChar)ch;
1395             ++pSrc;
1396         } else {
1397             if(ch >= 0xe0) {
1398                 if( /* handle U+0000..U+FFFF inline */
1399                     ch <= 0xef &&
1400                     ((pSrcLimit - pSrc) >= 3) &&
1401                     (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1402                     (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1403                 ) {
1404                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1405                     *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1406                     pSrc += 3;
1407                     continue;
1408                 }
1409             } else {
1410                 if( /* handle U+0000..U+07FF inline */
1411                     ch >= 0xc0 &&
1412                     ((pSrcLimit - pSrc) >= 2) &&
1413                     (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1414                 ) {
1415                     *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1416                     pSrc += 2;
1417                     continue;
1418                 }
1419             }
1420
1421             if(subchar < 0) {
1422                 *pErrorCode = U_INVALID_CHAR_FOUND;
1423                 return NULL;
1424             } else {
1425                 /* function call for error cases */
1426                 ++pSrc; /* continue after the lead byte */
1427                 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1428                 ++numSubstitutions;
1429                 if(subchar<=0xFFFF) {
1430                     *(pDest++)=(UChar)subchar;
1431                 } else {
1432                     *(pDest++)=U16_LEAD(subchar);
1433                     if(pDest<pDestLimit) {
1434                         *(pDest++)=U16_TRAIL(subchar);
1435                     } else {
1436                         reqLength++;
1437                         break;
1438                     }
1439                 }
1440             }
1441         }
1442     }
1443
1444     /* do not fill the dest buffer just count the UChars needed */
1445     while(pSrc < pSrcLimit){
1446         ch = *pSrc;
1447         if(ch <= 0x7f) {
1448             reqLength++;
1449             ++pSrc;
1450         } else {
1451             if(ch >= 0xe0) {
1452                 if( /* handle U+0000..U+FFFF inline */
1453                     ch <= 0xef &&
1454                     ((pSrcLimit - pSrc) >= 3) &&
1455                     (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
1456                     (uint8_t)(pSrc[2] - 0x80) <= 0x3f
1457                 ) {
1458                     reqLength++;
1459                     pSrc += 3;
1460                     continue;
1461                 }
1462             } else {
1463                 if( /* handle U+0000..U+07FF inline */
1464                     ch >= 0xc0 &&
1465                     ((pSrcLimit - pSrc) >= 2) &&
1466                     (uint8_t)(pSrc[1] - 0x80) <= 0x3f
1467                 ) {
1468                     reqLength++;
1469                     pSrc += 2;
1470                     continue;
1471                 }
1472             }
1473
1474             if(subchar < 0) {
1475                 *pErrorCode = U_INVALID_CHAR_FOUND;
1476                 return NULL;
1477             } else {
1478                 /* function call for error cases */
1479                 ++pSrc; /* continue after the lead byte */
1480                 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1481                 ++numSubstitutions;
1482                 reqLength+=U16_LENGTH(ch);
1483             }
1484         }
1485     }
1486
1487     if(pNumSubstitutions!=NULL) {
1488         *pNumSubstitutions=numSubstitutions;
1489     }
1490
1491     reqLength+=(int32_t)(pDest - dest);
1492     if(pDestLength) {
1493         *pDestLength = reqLength;
1494     }
1495
1496     /* Terminate the buffer */
1497     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1498     return dest;
1499 }
1500
1501 U_CAPI char* U_EXPORT2
1502 u_strToJavaModifiedUTF8(
1503         char *dest,
1504         int32_t destCapacity,
1505         int32_t *pDestLength,
1506         const UChar *src,
1507         int32_t srcLength,
1508         UErrorCode *pErrorCode) {
1509     int32_t reqLength=0;
1510     uint32_t ch=0;
1511     uint8_t *pDest = (uint8_t *)dest;
1512     uint8_t *pDestLimit = pDest + destCapacity;
1513     const UChar *pSrcLimit;
1514     int32_t count;
1515
1516     /* args check */
1517     if(U_FAILURE(*pErrorCode)){
1518         return NULL;
1519     }
1520     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1521         (dest==NULL && destCapacity!=0) || destCapacity<0
1522     ) {
1523         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1524         return NULL;
1525     }
1526
1527     if(srcLength==-1) {
1528         /* Convert NUL-terminated ASCII, then find the string length. */
1529         while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
1530             *pDest++ = (uint8_t)ch;
1531             ++src;
1532         }
1533         if(ch == 0) {
1534             reqLength=(int32_t)(pDest - (uint8_t *)dest);
1535             if(pDestLength) {
1536                 *pDestLength = reqLength;
1537             }
1538
1539             /* Terminate the buffer */
1540             u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1541             return dest;
1542         }
1543         srcLength = u_strlen(src);
1544     }
1545
1546     /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1547     pSrcLimit = src+srcLength;
1548     for(;;) {
1549         count = (int32_t)(pDestLimit - pDest);
1550         srcLength = (int32_t)(pSrcLimit - src);
1551         if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
1552             /* fast ASCII loop */
1553             const UChar *prevSrc = src;
1554             int32_t delta;
1555             while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
1556                 *pDest++=(uint8_t)ch;
1557                 ++src;
1558             }
1559             delta = (int32_t)(src - prevSrc);
1560             count -= delta;
1561             srcLength -= delta;
1562         }
1563         /*
1564          * Each iteration of the inner loop progresses by at most 3 UTF-8
1565          * bytes and one UChar.
1566          */
1567         count /= 3;
1568         if(count > srcLength) {
1569             count = srcLength; /* min(remaining dest/3, remaining src) */
1570         }
1571         if(count < 3) {
1572             /*
1573              * Too much overhead if we get near the end of the string,
1574              * continue with the next loop.
1575              */
1576             break;
1577         }
1578         do {
1579             ch=*src++;
1580             if(ch <= 0x7f && ch != 0) {
1581                 *pDest++ = (uint8_t)ch;
1582             } else if(ch <= 0x7ff) {
1583                 *pDest++=(uint8_t)((ch>>6)|0xc0);
1584                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1585             } else {
1586                 *pDest++=(uint8_t)((ch>>12)|0xe0);
1587                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1588                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1589             }
1590         } while(--count > 0);
1591     }
1592
1593     while(src<pSrcLimit) {
1594         ch=*src++;
1595         if(ch <= 0x7f && ch != 0) {
1596             if(pDest<pDestLimit) {
1597                 *pDest++ = (uint8_t)ch;
1598             } else {
1599                 reqLength = 1;
1600                 break;
1601             }
1602         } else if(ch <= 0x7ff) {
1603             if((pDestLimit - pDest) >= 2) {
1604                 *pDest++=(uint8_t)((ch>>6)|0xc0);
1605                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1606             } else {
1607                 reqLength = 2;
1608                 break;
1609             }
1610         } else {
1611             if((pDestLimit - pDest) >= 3) {
1612                 *pDest++=(uint8_t)((ch>>12)|0xe0);
1613                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1614                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1615             } else {
1616                 reqLength = 3;
1617                 break;
1618             }
1619         }
1620     }
1621     while(src<pSrcLimit) {
1622         ch=*src++;
1623         if(ch <= 0x7f && ch != 0) {
1624             ++reqLength;
1625         } else if(ch<=0x7ff) {
1626             reqLength+=2;
1627         } else {
1628             reqLength+=3;
1629         }
1630     }
1631
1632     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1633     if(pDestLength){
1634         *pDestLength = reqLength;
1635     }
1636
1637     /* Terminate the buffer */
1638     u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1639     return dest;
1640 }