icuSources/common/ustrtrns.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 ******************************************************************************
   5 *
   6 *   Copyright (C) 2001-2016, International Business Machines
   7 *   Corporation and others.  All Rights Reserved.
   8 *
   9 ******************************************************************************
  10 *
  11 * File ustrtrns.cpp
  12 *
  13 * Modification History:
  14 *
  15 *   Date        Name        Description
  16 *   9/10/2001    Ram    Creation.
  17 ******************************************************************************
  18 */
  19
  20 /*******************************************************************************
  21  *
  22  * u_strTo* and u_strFrom* APIs
  23  * WCS functions moved to ustr_wcs.c for better modularization
  24  *
  25  *******************************************************************************
  26  */
  27
  28
  29 #include "unicode/putil.h"
  30 #include "unicode/ustring.h"
  31 #include "unicode/utf.h"
  32 #include "unicode/utf8.h"
  33 #include "unicode/utf16.h"
  34 #include "cstring.h"
  35 #include "cmemory.h"
  36 #include "ustr_imp.h"
  37 #include "uassert.h"
  38
  39 U_CAPI UChar* U_EXPORT2
  40 u_strFromUTF32WithSub(UChar *dest,
  41                int32_t destCapacity,
  42                int32_t *pDestLength,
  43                const UChar32 *src,
  44                int32_t srcLength,
  45                UChar32 subchar, int32_t *pNumSubstitutions,
  46                UErrorCode *pErrorCode) {
  47     const UChar32 *srcLimit;
  48     UChar32 ch;
  49     UChar *destLimit;
  50     UChar *pDest;
  51     int32_t reqLength;
  52     int32_t numSubstitutions;
  53
  54     /* args check */
  55     if(U_FAILURE(*pErrorCode)){
  56         return NULL;
  57     }
  58     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
  59         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
  60         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
  61     ) {
  62         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
  63         return NULL;
  64     }
  65
  66     if(pNumSubstitutions != NULL) {
  67         *pNumSubstitutions = 0;
  68     }
  69
  70     pDest = dest;
  71     destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
  72     reqLength = 0;
  73     numSubstitutions = 0;
  74
  75     if(srcLength < 0) {
  76         /* simple loop for conversion of a NUL-terminated BMP string */
  77         while((ch=*src) != 0 &&
  78               ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
  79             ++src;
  80             if(pDest < destLimit) {
  81                 *pDest++ = (UChar)ch;
  82             } else {
  83                 ++reqLength;
  84             }
  85         }
  86         srcLimit = src;
  87         if(ch != 0) {
  88             /* "complicated" case, find the end of the remaining string */
  89             while(*++srcLimit != 0) {}
  90         }
  91     } else {
  92       srcLimit = (src!=NULL)?(src + srcLength):NULL;
  93     }
  94
  95     /* convert with length */
  96     while(src < srcLimit) {
  97         ch = *src++;
  98         do {
  99             /* usually "loops" once; twice only for writing subchar */
 100             if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
 101                 if(pDest < destLimit) {
 102                     *pDest++ = (UChar)ch;
 103                 } else {
 104                     ++reqLength;
 105                 }
 106                 break;
 107             } else if(0x10000 <= ch && ch <= 0x10ffff) {
 108                 if(pDest!=NULL && ((pDest + 2) <= destLimit)) {
 109                     *pDest++ = U16_LEAD(ch);
 110                     *pDest++ = U16_TRAIL(ch);
 111                 } else {
 112                     reqLength += 2;
 113                 }
 114                 break;
 115             } else if((ch = subchar) < 0) {
 116                 /* surrogate code point, or not a Unicode code point at all */
 117                 *pErrorCode = U_INVALID_CHAR_FOUND;
 118                 return NULL;
 119             } else {
 120                 ++numSubstitutions;
 121             }
 122         } while(TRUE);
 123     }
 124
 125     reqLength += (int32_t)(pDest - dest);
 126     if(pDestLength) {
 127         *pDestLength = reqLength;
 128     }
 129     if(pNumSubstitutions != NULL) {
 130         *pNumSubstitutions = numSubstitutions;
 131     }
 132
 133     /* Terminate the buffer */
 134     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
 135
 136     return dest;
 137 }
 138
 139 U_CAPI UChar* U_EXPORT2
 140 u_strFromUTF32(UChar *dest,
 141                int32_t destCapacity,
 142                int32_t *pDestLength,
 143                const UChar32 *src,
 144                int32_t srcLength,
 145                UErrorCode *pErrorCode) {
 146     return u_strFromUTF32WithSub(
 147             dest, destCapacity, pDestLength,
 148             src, srcLength,
 149             U_SENTINEL, NULL,
 150             pErrorCode);
 151 }
 152
 153 U_CAPI UChar32* U_EXPORT2
 154 u_strToUTF32WithSub(UChar32 *dest,
 155              int32_t destCapacity,
 156              int32_t *pDestLength,
 157              const UChar *src,
 158              int32_t srcLength,
 159              UChar32 subchar, int32_t *pNumSubstitutions,
 160              UErrorCode *pErrorCode) {
 161     const UChar *srcLimit;
 162     UChar32 ch;
 163     UChar ch2;
 164     UChar32 *destLimit;
 165     UChar32 *pDest;
 166     int32_t reqLength;
 167     int32_t numSubstitutions;
 168
 169     /* args check */
 170     if(U_FAILURE(*pErrorCode)){
 171         return NULL;
 172     }
 173     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
 174         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
 175         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
 176     ) {
 177         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
 178         return NULL;
 179     }
 180
 181     if(pNumSubstitutions != NULL) {
 182         *pNumSubstitutions = 0;
 183     }
 184
 185     pDest = dest;
 186     destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
 187     reqLength = 0;
 188     numSubstitutions = 0;
 189
 190     if(srcLength < 0) {
 191         /* simple loop for conversion of a NUL-terminated BMP string */
 192         while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
 193             ++src;
 194             if(pDest < destLimit) {
 195                 *pDest++ = ch;
 196             } else {
 197                 ++reqLength;
 198             }
 199         }
 200         srcLimit = src;
 201         if(ch != 0) {
 202             /* "complicated" case, find the end of the remaining string */
 203             while(*++srcLimit != 0) {}
 204         }
 205     } else {
 206         srcLimit = (src!=NULL)?(src + srcLength):NULL;
 207     }
 208
 209     /* convert with length */
 210     while(src < srcLimit) {
 211         ch = *src++;
 212         if(!U16_IS_SURROGATE(ch)) {
 213             /* write or count ch below */
 214         } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
 215             ++src;
 216             ch = U16_GET_SUPPLEMENTARY(ch, ch2);
 217         } else if((ch = subchar) < 0) {
 218             /* unpaired surrogate */
 219             *pErrorCode = U_INVALID_CHAR_FOUND;
 220             return NULL;
 221         } else {
 222             ++numSubstitutions;
 223         }
 224         if(pDest < destLimit) {
 225             *pDest++ = ch;
 226         } else {
 227             ++reqLength;
 228         }
 229     }
 230
 231     reqLength += (int32_t)(pDest - dest);
 232     if(pDestLength) {
 233         *pDestLength = reqLength;
 234     }
 235     if(pNumSubstitutions != NULL) {
 236         *pNumSubstitutions = numSubstitutions;
 237     }
 238
 239     /* Terminate the buffer */
 240     u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
 241
 242     return dest;
 243 }
 244
 245 U_CAPI UChar32* U_EXPORT2
 246 u_strToUTF32(UChar32 *dest,
 247              int32_t destCapacity,
 248              int32_t *pDestLength,
 249              const UChar *src,
 250              int32_t srcLength,
 251              UErrorCode *pErrorCode) {
 252     return u_strToUTF32WithSub(
 253             dest, destCapacity, pDestLength,
 254             src, srcLength,
 255             U_SENTINEL, NULL,
 256             pErrorCode);
 257 }
 258
 259 U_CAPI UChar* U_EXPORT2
 260 u_strFromUTF8WithSub(UChar *dest,
 261               int32_t destCapacity,
 262               int32_t *pDestLength,
 263               const char* src,
 264               int32_t srcLength,
 265               UChar32 subchar, int32_t *pNumSubstitutions,
 266               UErrorCode *pErrorCode){
 267     /* args check */
 268     if(U_FAILURE(*pErrorCode)) {
 269         return NULL;
 270     }
 271     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
 272         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
 273         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
 274     ) {
 275         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
 276         return NULL;
 277     }
 278
 279     if(pNumSubstitutions!=NULL) {
 280         *pNumSubstitutions=0;
 281     }
 282     UChar *pDest = dest;
 283     UChar *pDestLimit = dest+destCapacity;
 284     int32_t reqLength = 0;
 285     int32_t numSubstitutions=0;
 286
 287     /*
 288      * Inline processing of UTF-8 byte sequences:
 289      *
 290      * Byte sequences for the most common characters are handled inline in
 291      * the conversion loops. In order to reduce the path lengths for those
 292      * characters, the tests are arranged in a kind of binary search.
 293      * ASCII (<=0x7f) is checked first, followed by the dividing point
 294      * between 2- and 3-byte sequences (0xe0).
 295      * The 3-byte branch is tested first to speed up CJK text.
 296      * The compiler should combine the subtractions for the two tests for 0xe0.
 297      * Each branch then tests for the other end of its range.
 298      */
 299
 300     if(srcLength < 0){
 301         /*
 302          * Transform a NUL-terminated string.
 303          * The code explicitly checks for NULs only in the lead byte position.
 304          * A NUL byte in the trail byte position fails the trail byte range check anyway.
 305          */
 306         int32_t i;
 307         UChar32 c;
 308         for(i = 0; (c = (uint8_t)src[i]) != 0 && (pDest < pDestLimit);) {
 309             // modified copy of U8_NEXT()
 310             ++i;
 311             if(U8_IS_SINGLE(c)) {
 312                 *pDest++=(UChar)c;
 313             } else {
 314                 uint8_t __t1, __t2;
 315                 if( /* handle U+0800..U+FFFF inline */
 316                         (0xe0<=(c) && (c)<0xf0) &&
 317                         U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
 318                         (__t2=src[(i)+1]-0x80)<=0x3f) {
 319                     *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
 320                     i+=2;
 321                 } else if( /* handle U+0080..U+07FF inline */
 322                         ((c)<0xe0 && (c)>=0xc2) &&
 323                         (__t1=src[i]-0x80)<=0x3f) {
 324                     *pDest++ = (((c)&0x1f)<<6)|__t1;
 325                     ++(i);
 326                 } else {
 327                     /* function call for "complicated" and error cases */
 328                     (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
 329                     if(c<0 && (++numSubstitutions, c = subchar) < 0) {
 330                         *pErrorCode = U_INVALID_CHAR_FOUND;
 331                         return NULL;
 332                     } else if(c<=0xFFFF) {
 333                         *(pDest++)=(UChar)c;
 334                     } else {
 335                         *(pDest++)=U16_LEAD(c);
 336                         if(pDest<pDestLimit) {
 337                             *(pDest++)=U16_TRAIL(c);
 338                         } else {
 339                             reqLength++;
 340                             break;
 341                         }
 342                     }
 343                 }
 344             }
 345         }
 346
 347         /* Pre-flight the rest of the string. */
 348         while((c = (uint8_t)src[i]) != 0) {
 349             // modified copy of U8_NEXT()
 350             ++i;
 351             if(U8_IS_SINGLE(c)) {
 352                 ++reqLength;
 353             } else {
 354                 uint8_t __t1, __t2;
 355                 if( /* handle U+0800..U+FFFF inline */
 356                         (0xe0<=(c) && (c)<0xf0) &&
 357                         U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
 358                         (__t2=src[(i)+1]-0x80)<=0x3f) {
 359                     ++reqLength;
 360                     i+=2;
 361                 } else if( /* handle U+0080..U+07FF inline */
 362                         ((c)<0xe0 && (c)>=0xc2) &&
 363                         (__t1=src[i]-0x80)<=0x3f) {
 364                     ++reqLength;
 365                     ++(i);
 366                 } else {
 367                     /* function call for "complicated" and error cases */
 368                     (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
 369                     if(c<0 && (++numSubstitutions, c = subchar) < 0) {
 370                         *pErrorCode = U_INVALID_CHAR_FOUND;
 371                         return NULL;
 372                     }
 373                     reqLength += U16_LENGTH(c);
 374                 }
 375             }
 376         }
 377     } else /* srcLength >= 0 */ {
 378         /* Faster loop without ongoing checking for srcLength and pDestLimit. */
 379         int32_t i = 0;
 380         UChar32 c;
 381         for(;;) {
 382             /*
 383              * Each iteration of the inner loop progresses by at most 3 UTF-8
 384              * bytes and one UChar, for most characters.
 385              * For supplementary code points (4 & 2), which are rare,
 386              * there is an additional adjustment.
 387              */
 388             int32_t count = (int32_t)(pDestLimit - pDest);
 389             int32_t count2 = (srcLength - i) / 3;
 390             if(count > count2) {
 391                 count = count2; /* min(remaining dest, remaining src/3) */
 392             }
 393             if(count < 3) {
 394                 /*
 395                  * Too much overhead if we get near the end of the string,
 396                  * continue with the next loop.
 397                  */
 398                 break;
 399             }
 400
 401             do {
 402                 // modified copy of U8_NEXT()
 403                 c = (uint8_t)src[i++];
 404                 if(U8_IS_SINGLE(c)) {
 405                     *pDest++=(UChar)c;
 406                 } else {
 407                     uint8_t __t1, __t2;
 408                     if( /* handle U+0800..U+FFFF inline */
 409                             (0xe0<=(c) && (c)<0xf0) &&
 410                             ((i)+1)<srcLength &&
 411                             U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
 412                             (__t2=src[(i)+1]-0x80)<=0x3f) {
 413                         *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
 414                         i+=2;
 415                     } else if( /* handle U+0080..U+07FF inline */
 416                             ((c)<0xe0 && (c)>=0xc2) &&
 417                             ((i)!=srcLength) &&
 418                             (__t1=src[i]-0x80)<=0x3f) {
 419                         *pDest++ = (((c)&0x1f)<<6)|__t1;
 420                         ++(i);
 421                     } else {
 422                         if(c >= 0xf0 || subchar > 0xffff) {
 423                             // We may read up to four bytes and write up to two UChars,
 424                             // which we didn't account for with computing count,
 425                             // so we adjust it here.
 426                             if(--count == 0) {
 427                                 --i;  // back out byte c
 428                                 break;
 429                             }
 430                         }
 431
 432                         /* function call for "complicated" and error cases */
 433                         (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
 434                         if(c<0 && (++numSubstitutions, c = subchar) < 0) {
 435                             *pErrorCode = U_INVALID_CHAR_FOUND;
 436                             return NULL;
 437                         } else if(c<=0xFFFF) {
 438                             *(pDest++)=(UChar)c;
 439                         } else {
 440                             *(pDest++)=U16_LEAD(c);
 441                             *(pDest++)=U16_TRAIL(c);
 442                         }
 443                     }
 444                 }
 445             } while(--count > 0);
 446         }
 447
 448         while(i < srcLength && (pDest < pDestLimit)) {
 449             // modified copy of U8_NEXT()
 450             c = (uint8_t)src[i++];
 451             if(U8_IS_SINGLE(c)) {
 452                 *pDest++=(UChar)c;
 453             } else {
 454                 uint8_t __t1, __t2;
 455                 if( /* handle U+0800..U+FFFF inline */
 456                         (0xe0<=(c) && (c)<0xf0) &&
 457                         ((i)+1)<srcLength &&
 458                         U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
 459                         (__t2=src[(i)+1]-0x80)<=0x3f) {
 460                     *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
 461                     i+=2;
 462                 } else if( /* handle U+0080..U+07FF inline */
 463                         ((c)<0xe0 && (c)>=0xc2) &&
 464                         ((i)!=srcLength) &&
 465                         (__t1=src[i]-0x80)<=0x3f) {
 466                     *pDest++ = (((c)&0x1f)<<6)|__t1;
 467                     ++(i);
 468                 } else {
 469                     /* function call for "complicated" and error cases */
 470                     (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
 471                     if(c<0 && (++numSubstitutions, c = subchar) < 0) {
 472                         *pErrorCode = U_INVALID_CHAR_FOUND;
 473                         return NULL;
 474                     } else if(c<=0xFFFF) {
 475                         *(pDest++)=(UChar)c;
 476                     } else {
 477                         *(pDest++)=U16_LEAD(c);
 478                         if(pDest<pDestLimit) {
 479                             *(pDest++)=U16_TRAIL(c);
 480                         } else {
 481                             reqLength++;
 482                             break;
 483                         }
 484                     }
 485                 }
 486             }
 487         }
 488
 489         /* Pre-flight the rest of the string. */
 490         while(i < srcLength) {
 491             // modified copy of U8_NEXT()
 492             c = (uint8_t)src[i++];
 493             if(U8_IS_SINGLE(c)) {
 494                 ++reqLength;
 495             } else {
 496                 uint8_t __t1, __t2;
 497                 if( /* handle U+0800..U+FFFF inline */
 498                         (0xe0<=(c) && (c)<0xf0) &&
 499                         ((i)+1)<srcLength &&
 500                         U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
 501                         (__t2=src[(i)+1]-0x80)<=0x3f) {
 502                     ++reqLength;
 503                     i+=2;
 504                 } else if( /* handle U+0080..U+07FF inline */
 505                         ((c)<0xe0 && (c)>=0xc2) &&
 506                         ((i)!=srcLength) &&
 507                         (__t1=src[i]-0x80)<=0x3f) {
 508                     ++reqLength;
 509                     ++(i);
 510                 } else {
 511                     /* function call for "complicated" and error cases */
 512                     (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
 513                     if(c<0 && (++numSubstitutions, c = subchar) < 0) {
 514                         *pErrorCode = U_INVALID_CHAR_FOUND;
 515                         return NULL;
 516                     }
 517                     reqLength += U16_LENGTH(c);
 518                 }
 519             }
 520         }
 521     }
 522
 523     reqLength+=(int32_t)(pDest - dest);
 524
 525     if(pNumSubstitutions!=NULL) {
 526         *pNumSubstitutions=numSubstitutions;
 527     }
 528
 529     if(pDestLength){
 530         *pDestLength = reqLength;
 531     }
 532
 533     /* Terminate the buffer */
 534     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
 535
 536     return dest;
 537 }
 538
 539 U_CAPI UChar* U_EXPORT2
 540 u_strFromUTF8(UChar *dest,
 541               int32_t destCapacity,
 542               int32_t *pDestLength,
 543               const char* src,
 544               int32_t srcLength,
 545               UErrorCode *pErrorCode){
 546     return u_strFromUTF8WithSub(
 547             dest, destCapacity, pDestLength,
 548             src, srcLength,
 549             U_SENTINEL, NULL,
 550             pErrorCode);
 551 }
 552
 553 U_CAPI UChar * U_EXPORT2
 554 u_strFromUTF8Lenient(UChar *dest,
 555                      int32_t destCapacity,
 556                      int32_t *pDestLength,
 557                      const char *src,
 558                      int32_t srcLength,
 559                      UErrorCode *pErrorCode) {
 560     UChar *pDest = dest;
 561     UChar32 ch;
 562     int32_t reqLength = 0;
 563     uint8_t* pSrc = (uint8_t*) src;
 564
 565     /* args check */
 566     if(U_FAILURE(*pErrorCode)){
 567         return NULL;
 568     }
 569
 570     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
 571         (destCapacity<0) || (dest == NULL && destCapacity > 0)
 572     ) {
 573         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
 574         return NULL;
 575     }
 576
 577     if(srcLength < 0) {
 578         /* Transform a NUL-terminated string. */
 579         UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
 580         uint8_t t1, t2, t3; /* trail bytes */
 581
 582         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
 583             if(ch < 0xc0) {
 584                 /*
 585                  * ASCII, or a trail byte in lead position which is treated like
 586                  * a single-byte sequence for better character boundary
 587                  * resynchronization after illegal sequences.
 588                  */
 589                 *pDest++=(UChar)ch;
 590                 ++pSrc;
 591                 continue;
 592             } else if(ch < 0xe0) { /* U+0080..U+07FF */
 593                 if((t1 = pSrc[1]) != 0) {
 594                     /* 0x3080 = (0xc0 << 6) + 0x80 */
 595                     *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
 596                     pSrc += 2;
 597                     continue;
 598                 }
 599             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
 600                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
 601                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 602                     /* 0x2080 = (0x80 << 6) + 0x80 */
 603                     *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
 604                     pSrc += 3;
 605                     continue;
 606                 }
 607             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
 608                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
 609                     pSrc += 4;
 610                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
 611                     ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
 612                     *(pDest++) = U16_LEAD(ch);
 613                     if(pDest < pDestLimit) {
 614                         *(pDest++) = U16_TRAIL(ch);
 615                     } else {
 616                         reqLength = 1;
 617                         break;
 618                     }
 619                     continue;
 620                 }
 621             }
 622
 623             /* truncated character at the end */
 624             *pDest++ = 0xfffd;
 625             while(*++pSrc != 0) {}
 626             break;
 627         }
 628
 629         /* Pre-flight the rest of the string. */
 630         while((ch = *pSrc) != 0) {
 631             if(ch < 0xc0) {
 632                 /*
 633                  * ASCII, or a trail byte in lead position which is treated like
 634                  * a single-byte sequence for better character boundary
 635                  * resynchronization after illegal sequences.
 636                  */
 637                 ++reqLength;
 638                 ++pSrc;
 639                 continue;
 640             } else if(ch < 0xe0) { /* U+0080..U+07FF */
 641                 if(pSrc[1] != 0) {
 642                     ++reqLength;
 643                     pSrc += 2;
 644                     continue;
 645                 }
 646             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
 647                 if(pSrc[1] != 0 && pSrc[2] != 0) {
 648                     ++reqLength;
 649                     pSrc += 3;
 650                     continue;
 651                 }
 652             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
 653                 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
 654                     reqLength += 2;
 655                     pSrc += 4;
 656                     continue;
 657                 }
 658             }
 659
 660             /* truncated character at the end */
 661             ++reqLength;
 662             break;
 663         }
 664     } else /* srcLength >= 0 */ {
 665       const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL;
 666
 667         /*
 668          * This function requires that if srcLength is given, then it must be
 669          * destCapatity >= srcLength so that we need not check for
 670          * destination buffer overflow in the loop.
 671          */
 672         if(destCapacity < srcLength) {
 673             if(pDestLength != NULL) {
 674                 *pDestLength = srcLength; /* this likely overestimates the true destLength! */
 675             }
 676             *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
 677             return NULL;
 678         }
 679
 680         if((pSrcLimit - pSrc) >= 4) {
 681             pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
 682
 683             /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
 684             do {
 685                 ch = *pSrc++;
 686                 if(ch < 0xc0) {
 687                     /*
 688                      * ASCII, or a trail byte in lead position which is treated like
 689                      * a single-byte sequence for better character boundary
 690                      * resynchronization after illegal sequences.
 691                      */
 692                     *pDest++=(UChar)ch;
 693                 } else if(ch < 0xe0) { /* U+0080..U+07FF */
 694                     /* 0x3080 = (0xc0 << 6) + 0x80 */
 695                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
 696                 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
 697                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 698                     /* 0x2080 = (0x80 << 6) + 0x80 */
 699                     ch = (ch << 12) + (*pSrc++ << 6);
 700                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
 701                 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
 702                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
 703                     ch = (ch << 18) + (*pSrc++ << 12);
 704                     ch += *pSrc++ << 6;
 705                     ch += *pSrc++ - 0x3c82080;
 706                     *(pDest++) = U16_LEAD(ch);
 707                     *(pDest++) = U16_TRAIL(ch);
 708                 }
 709             } while(pSrc < pSrcLimit);
 710
 711             pSrcLimit += 3; /* restore original pSrcLimit */
 712         }
 713
 714         while(pSrc < pSrcLimit) {
 715             ch = *pSrc++;
 716             if(ch < 0xc0) {
 717                 /*
 718                  * ASCII, or a trail byte in lead position which is treated like
 719                  * a single-byte sequence for better character boundary
 720                  * resynchronization after illegal sequences.
 721                  */
 722                 *pDest++=(UChar)ch;
 723                 continue;
 724             } else if(ch < 0xe0) { /* U+0080..U+07FF */
 725                 if(pSrc < pSrcLimit) {
 726                     /* 0x3080 = (0xc0 << 6) + 0x80 */
 727                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
 728                     continue;
 729                 }
 730             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
 731                 if((pSrcLimit - pSrc) >= 2) {
 732                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 733                     /* 0x2080 = (0x80 << 6) + 0x80 */
 734                     ch = (ch << 12) + (*pSrc++ << 6);
 735                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
 736                     pSrc += 3;
 737                     continue;
 738                 }
 739             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
 740                 if((pSrcLimit - pSrc) >= 3) {
 741                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
 742                     ch = (ch << 18) + (*pSrc++ << 12);
 743                     ch += *pSrc++ << 6;
 744                     ch += *pSrc++ - 0x3c82080;
 745                     *(pDest++) = U16_LEAD(ch);
 746                     *(pDest++) = U16_TRAIL(ch);
 747                     pSrc += 4;
 748                     continue;
 749                 }
 750             }
 751
 752             /* truncated character at the end */
 753             *pDest++ = 0xfffd;
 754             break;
 755         }
 756     }
 757
 758     reqLength+=(int32_t)(pDest - dest);
 759
 760     if(pDestLength){
 761         *pDestLength = reqLength;
 762     }
 763
 764     /* Terminate the buffer */
 765     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
 766
 767     return dest;
 768 }
 769
 770 static inline uint8_t *
 771 _appendUTF8(uint8_t *pDest, UChar32 c) {
 772     /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
 773     if((c)<=0x7f) {
 774         *pDest++=(uint8_t)c;
 775     } else if(c<=0x7ff) {
 776         *pDest++=(uint8_t)((c>>6)|0xc0);
 777         *pDest++=(uint8_t)((c&0x3f)|0x80);
 778     } else if(c<=0xffff) {
 779         *pDest++=(uint8_t)((c>>12)|0xe0);
 780         *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
 781         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
 782     } else /* if((uint32_t)(c)<=0x10ffff) */ {
 783         *pDest++=(uint8_t)(((c)>>18)|0xf0);
 784         *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
 785         *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
 786         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
 787     }
 788     return pDest;
 789 }
 790
 791
 792 U_CAPI char* U_EXPORT2
 793 u_strToUTF8WithSub(char *dest,
 794             int32_t destCapacity,
 795             int32_t *pDestLength,
 796             const UChar *pSrc,
 797             int32_t srcLength,
 798             UChar32 subchar, int32_t *pNumSubstitutions,
 799             UErrorCode *pErrorCode){
 800     int32_t reqLength=0;
 801     uint32_t ch=0,ch2=0;
 802     uint8_t *pDest = (uint8_t *)dest;
 803     uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL;
 804     int32_t numSubstitutions;
 805
 806     /* args check */
 807     if(U_FAILURE(*pErrorCode)){
 808         return NULL;
 809     }
 810
 811     if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
 812         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
 813         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
 814     ) {
 815         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
 816         return NULL;
 817     }
 818
 819     if(pNumSubstitutions!=NULL) {
 820         *pNumSubstitutions=0;
 821     }
 822     numSubstitutions=0;
 823
 824     if(srcLength==-1) {
 825         while((ch=*pSrc)!=0) {
 826             ++pSrc;
 827             if(ch <= 0x7f) {
 828                 if(pDest<pDestLimit) {
 829                     *pDest++ = (uint8_t)ch;
 830                 } else {
 831                     reqLength = 1;
 832                     break;
 833                 }
 834             } else if(ch <= 0x7ff) {
 835                 if((pDestLimit - pDest) >= 2) {
 836                     *pDest++=(uint8_t)((ch>>6)|0xc0);
 837                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
 838                 } else {
 839                     reqLength = 2;
 840                     break;
 841                 }
 842             } else if(ch <= 0xd7ff || ch >= 0xe000) {
 843                 if((pDestLimit - pDest) >= 3) {
 844                     *pDest++=(uint8_t)((ch>>12)|0xe0);
 845                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
 846                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
 847                 } else {
 848                     reqLength = 3;
 849                     break;
 850                 }
 851             } else /* ch is a surrogate */ {
 852                 int32_t length;
 853
 854                 /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
 855                 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
 856                     ++pSrc;
 857                     ch=U16_GET_SUPPLEMENTARY(ch, ch2);
 858                 } else if(subchar>=0) {
 859                     ch=subchar;
 860                     ++numSubstitutions;
 861                 } else {
 862                     /* Unicode 3.2 forbids surrogate code points in UTF-8 */
 863                     *pErrorCode = U_INVALID_CHAR_FOUND;
 864                     return NULL;
 865                 }
 866
 867                 length = U8_LENGTH(ch);
 868                 if((pDestLimit - pDest) >= length) {
 869                     /* convert and append*/
 870                     pDest=_appendUTF8(pDest, ch);
 871                 } else {
 872                     reqLength = length;
 873                     break;
 874                 }
 875             }
 876         }
 877         while((ch=*pSrc++)!=0) {
 878             if(ch<=0x7f) {
 879                 ++reqLength;
 880             } else if(ch<=0x7ff) {
 881                 reqLength+=2;
 882             } else if(!U16_IS_SURROGATE(ch)) {
 883                 reqLength+=3;
 884             } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
 885                 ++pSrc;
 886                 reqLength+=4;
 887             } else if(subchar>=0) {
 888                 reqLength+=U8_LENGTH(subchar);
 889                 ++numSubstitutions;
 890             } else {
 891                 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
 892                 *pErrorCode = U_INVALID_CHAR_FOUND;
 893                 return NULL;
 894             }
 895         }
 896     } else {
 897         const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL;
 898         int32_t count;
 899
 900         /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
 901         for(;;) {
 902             /*
 903              * Each iteration of the inner loop progresses by at most 3 UTF-8
 904              * bytes and one UChar, for most characters.
 905              * For supplementary code points (4 & 2), which are rare,
 906              * there is an additional adjustment.
 907              */
 908             count = (int32_t)((pDestLimit - pDest) / 3);
 909             srcLength = (int32_t)(pSrcLimit - pSrc);
 910             if(count > srcLength) {
 911                 count = srcLength; /* min(remaining dest/3, remaining src) */
 912             }
 913             if(count < 3) {
 914                 /*
 915                  * Too much overhead if we get near the end of the string,
 916                  * continue with the next loop.
 917                  */
 918                 break;
 919             }
 920             do {
 921                 ch=*pSrc++;
 922                 if(ch <= 0x7f) {
 923                     *pDest++ = (uint8_t)ch;
 924                 } else if(ch <= 0x7ff) {
 925                     *pDest++=(uint8_t)((ch>>6)|0xc0);
 926                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
 927                 } else if(ch <= 0xd7ff || ch >= 0xe000) {
 928                     *pDest++=(uint8_t)((ch>>12)|0xe0);
 929                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
 930                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
 931                 } else /* ch is a surrogate */ {
 932                     /*
 933                      * We will read two UChars and probably output four bytes,
 934                      * which we didn't account for with computing count,
 935                      * so we adjust it here.
 936                      */
 937                     if(--count == 0) {
 938                         --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
 939                         break;  /* recompute count */
 940                     }
 941
 942                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
 943                         ++pSrc;
 944                         ch=U16_GET_SUPPLEMENTARY(ch, ch2);
 945
 946                         /* writing 4 bytes per 2 UChars is ok */
 947                         *pDest++=(uint8_t)((ch>>18)|0xf0);
 948                         *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
 949                         *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
 950                         *pDest++=(uint8_t)((ch&0x3f)|0x80);
 951                     } else  {
 952                         /* Unicode 3.2 forbids surrogate code points in UTF-8 */
 953                         if(subchar>=0) {
 954                             ch=subchar;
 955                             ++numSubstitutions;
 956                         } else {
 957                             *pErrorCode = U_INVALID_CHAR_FOUND;
 958                             return NULL;
 959                         }
 960
 961                         /* convert and append*/
 962                         pDest=_appendUTF8(pDest, ch);
 963                     }
 964                 }
 965             } while(--count > 0);
 966         }
 967
 968         while(pSrc<pSrcLimit) {
 969             ch=*pSrc++;
 970             if(ch <= 0x7f) {
 971                 if(pDest<pDestLimit) {
 972                     *pDest++ = (uint8_t)ch;
 973                 } else {
 974                     reqLength = 1;
 975                     break;
 976                 }
 977             } else if(ch <= 0x7ff) {
 978                 if((pDestLimit - pDest) >= 2) {
 979                     *pDest++=(uint8_t)((ch>>6)|0xc0);
 980                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
 981                 } else {
 982                     reqLength = 2;
 983                     break;
 984                 }
 985             } else if(ch <= 0xd7ff || ch >= 0xe000) {
 986                 if((pDestLimit - pDest) >= 3) {
 987                     *pDest++=(uint8_t)((ch>>12)|0xe0);
 988                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
 989                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
 990                 } else {
 991                     reqLength = 3;
 992                     break;
 993                 }
 994             } else /* ch is a surrogate */ {
 995                 int32_t length;
 996
 997                 if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
 998                     ++pSrc;
 999                     ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1000                 } else if(subchar>=0) {
1001                     ch=subchar;
1002                     ++numSubstitutions;
1003                 } else {
1004                     /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1005                     *pErrorCode = U_INVALID_CHAR_FOUND;
1006                     return NULL;
1007                 }
1008
1009                 length = U8_LENGTH(ch);
1010                 if((pDestLimit - pDest) >= length) {
1011                     /* convert and append*/
1012                     pDest=_appendUTF8(pDest, ch);
1013                 } else {
1014                     reqLength = length;
1015                     break;
1016                 }
1017             }
1018         }
1019         while(pSrc<pSrcLimit) {
1020             ch=*pSrc++;
1021             if(ch<=0x7f) {
1022                 ++reqLength;
1023             } else if(ch<=0x7ff) {
1024                 reqLength+=2;
1025             } else if(!U16_IS_SURROGATE(ch)) {
1026                 reqLength+=3;
1027             } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
1028                 ++pSrc;
1029                 reqLength+=4;
1030             } else if(subchar>=0) {
1031                 reqLength+=U8_LENGTH(subchar);
1032                 ++numSubstitutions;
1033             } else {
1034                 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1035                 *pErrorCode = U_INVALID_CHAR_FOUND;
1036                 return NULL;
1037             }
1038         }
1039     }
1040
1041     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1042
1043     if(pNumSubstitutions!=NULL) {
1044         *pNumSubstitutions=numSubstitutions;
1045     }
1046
1047     if(pDestLength){
1048         *pDestLength = reqLength;
1049     }
1050
1051     /* Terminate the buffer */
1052     u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1053     return dest;
1054 }
1055
1056 U_CAPI char* U_EXPORT2
1057 u_strToUTF8(char *dest,
1058             int32_t destCapacity,
1059             int32_t *pDestLength,
1060             const UChar *pSrc,
1061             int32_t srcLength,
1062             UErrorCode *pErrorCode){
1063     return u_strToUTF8WithSub(
1064             dest, destCapacity, pDestLength,
1065             pSrc, srcLength,
1066             U_SENTINEL, NULL,
1067             pErrorCode);
1068 }
1069
1070 U_CAPI UChar* U_EXPORT2
1071 u_strFromJavaModifiedUTF8WithSub(
1072         UChar *dest,
1073         int32_t destCapacity,
1074         int32_t *pDestLength,
1075         const char *src,
1076         int32_t srcLength,
1077         UChar32 subchar, int32_t *pNumSubstitutions,
1078         UErrorCode *pErrorCode) {
1079     /* args check */
1080     if(U_FAILURE(*pErrorCode)) {
1081         return NULL;
1082     }
1083     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1084         (dest==NULL && destCapacity!=0) || destCapacity<0 ||
1085         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1086     ) {
1087         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1088         return NULL;
1089     }
1090
1091     if(pNumSubstitutions!=NULL) {
1092         *pNumSubstitutions=0;
1093     }
1094     UChar *pDest = dest;
1095     UChar *pDestLimit = dest+destCapacity;
1096     int32_t reqLength = 0;
1097     int32_t numSubstitutions=0;
1098
1099     if(srcLength < 0) {
1100         /*
1101          * Transform a NUL-terminated ASCII string.
1102          * Handle non-ASCII strings with slower code.
1103          */
1104         UChar32 c;
1105         while(((c = (uint8_t)*src) != 0) && c <= 0x7f && (pDest < pDestLimit)) {
1106             *pDest++=(UChar)c;
1107             ++src;
1108         }
1109         if(c == 0) {
1110             reqLength=(int32_t)(pDest - dest);
1111             if(pDestLength) {
1112                 *pDestLength = reqLength;
1113             }
1114
1115             /* Terminate the buffer */
1116             u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1117             return dest;
1118         }
1119         srcLength = static_cast<int32_t>(uprv_strlen(src));
1120     }
1121
1122     /* Faster loop without ongoing checking for srcLength and pDestLimit. */
1123     UChar32 ch;
1124     uint8_t t1, t2;
1125     int32_t i = 0;
1126     for(;;) {
1127         int32_t count = (int32_t)(pDestLimit - pDest);
1128         int32_t count2 = srcLength - i;
1129         if(count >= count2 && srcLength > 0 && U8_IS_SINGLE(*src)) {
1130             /* fast ASCII loop */
1131             int32_t start = i;
1132             uint8_t b;
1133             while(i < srcLength && U8_IS_SINGLE(b = src[i])) {
1134                 *pDest++=b;
1135                 ++i;
1136             }
1137             int32_t delta = i - start;
1138             count -= delta;
1139             count2 -= delta;
1140         }
1141         /*
1142          * Each iteration of the inner loop progresses by at most 3 UTF-8
1143          * bytes and one UChar.
1144          */
1145         if(subchar > 0xFFFF) {
1146             break;
1147         }
1148         count2 /= 3;
1149         if(count > count2) {
1150             count = count2; /* min(remaining dest, remaining src/3) */
1151         }
1152         if(count < 3) {
1153             /*
1154              * Too much overhead if we get near the end of the string,
1155              * continue with the next loop.
1156              */
1157             break;
1158         }
1159         do {
1160             ch = (uint8_t)src[i++];
1161             if(U8_IS_SINGLE(ch)) {
1162                 *pDest++=(UChar)ch;
1163             } else {
1164                 if(ch >= 0xe0) {
1165                     if( /* handle U+0000..U+FFFF inline */
1166                         ch <= 0xef &&
1167                         (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
1168                         (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
1169                     ) {
1170                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1171                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1172                         i += 2;
1173                         continue;
1174                     }
1175                 } else {
1176                     if( /* handle U+0000..U+07FF inline */
1177                         ch >= 0xc0 &&
1178                         (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
1179                     ) {
1180                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1181                         ++i;
1182                         continue;
1183                     }
1184                 }
1185
1186                 if(subchar < 0) {
1187                     *pErrorCode = U_INVALID_CHAR_FOUND;
1188                     return NULL;
1189                 } else if(subchar > 0xffff && --count == 0) {
1190                     /*
1191                      * We need to write two UChars, adjusted count for that,
1192                      * and ran out of space.
1193                      */
1194                     --i;  // back out byte ch
1195                     break;
1196                 } else {
1197                     /* function call for error cases */
1198                     utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
1199                     ++numSubstitutions;
1200                     *(pDest++)=(UChar)subchar;
1201                 }
1202             }
1203         } while(--count > 0);
1204     }
1205
1206     while(i < srcLength && (pDest < pDestLimit)) {
1207         ch = (uint8_t)src[i++];
1208         if(U8_IS_SINGLE(ch)){
1209             *pDest++=(UChar)ch;
1210         } else {
1211             if(ch >= 0xe0) {
1212                 if( /* handle U+0000..U+FFFF inline */
1213                     ch <= 0xef &&
1214                     (i+1) < srcLength &&
1215                     (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
1216                     (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
1217                 ) {
1218                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1219                     *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1220                     i += 2;
1221                     continue;
1222                 }
1223             } else {
1224                 if( /* handle U+0000..U+07FF inline */
1225                     ch >= 0xc0 &&
1226                     i < srcLength &&
1227                     (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
1228                 ) {
1229                     *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1230                     ++i;
1231                     continue;
1232                 }
1233             }
1234
1235             if(subchar < 0) {
1236                 *pErrorCode = U_INVALID_CHAR_FOUND;
1237                 return NULL;
1238             } else {
1239                 /* function call for error cases */
1240                 utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
1241                 ++numSubstitutions;
1242                 if(subchar<=0xFFFF) {
1243                     *(pDest++)=(UChar)subchar;
1244                 } else {
1245                     *(pDest++)=U16_LEAD(subchar);
1246                     if(pDest<pDestLimit) {
1247                         *(pDest++)=U16_TRAIL(subchar);
1248                     } else {
1249                         reqLength++;
1250                         break;
1251                     }
1252                 }
1253             }
1254         }
1255     }
1256
1257     /* Pre-flight the rest of the string. */
1258     while(i < srcLength) {
1259         ch = (uint8_t)src[i++];
1260         if(U8_IS_SINGLE(ch)) {
1261             reqLength++;
1262         } else {
1263             if(ch >= 0xe0) {
1264                 if( /* handle U+0000..U+FFFF inline */
1265                     ch <= 0xef &&
1266                     (i+1) < srcLength &&
1267                     (uint8_t)(src[i] - 0x80) <= 0x3f &&
1268                     (uint8_t)(src[i+1] - 0x80) <= 0x3f
1269                 ) {
1270                     reqLength++;
1271                     i += 2;
1272                     continue;
1273                 }
1274             } else {
1275                 if( /* handle U+0000..U+07FF inline */
1276                     ch >= 0xc0 &&
1277                     i < srcLength &&
1278                     (uint8_t)(src[i] - 0x80) <= 0x3f
1279                 ) {
1280                     reqLength++;
1281                     ++i;
1282                     continue;
1283                 }
1284             }
1285
1286             if(subchar < 0) {
1287                 *pErrorCode = U_INVALID_CHAR_FOUND;
1288                 return NULL;
1289             } else {
1290                 /* function call for error cases */
1291                 utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
1292                 ++numSubstitutions;
1293                 reqLength+=U16_LENGTH(ch);
1294             }
1295         }
1296     }
1297
1298     if(pNumSubstitutions!=NULL) {
1299         *pNumSubstitutions=numSubstitutions;
1300     }
1301
1302     reqLength+=(int32_t)(pDest - dest);
1303     if(pDestLength) {
1304         *pDestLength = reqLength;
1305     }
1306
1307     /* Terminate the buffer */
1308     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1309     return dest;
1310 }
1311
1312 U_CAPI char* U_EXPORT2
1313 u_strToJavaModifiedUTF8(
1314         char *dest,
1315         int32_t destCapacity,
1316         int32_t *pDestLength,
1317         const UChar *src,
1318         int32_t srcLength,
1319         UErrorCode *pErrorCode) {
1320     int32_t reqLength=0;
1321     uint32_t ch=0;
1322     uint8_t *pDest = (uint8_t *)dest;
1323     uint8_t *pDestLimit = pDest + destCapacity;
1324     const UChar *pSrcLimit;
1325     int32_t count;
1326
1327     /* args check */
1328     if(U_FAILURE(*pErrorCode)){
1329         return NULL;
1330     }
1331     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1332         (dest==NULL && destCapacity!=0) || destCapacity<0
1333     ) {
1334         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1335         return NULL;
1336     }
1337
1338     if(srcLength==-1) {
1339         /* Convert NUL-terminated ASCII, then find the string length. */
1340         while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
1341             *pDest++ = (uint8_t)ch;
1342             ++src;
1343         }
1344         if(ch == 0) {
1345             reqLength=(int32_t)(pDest - (uint8_t *)dest);
1346             if(pDestLength) {
1347                 *pDestLength = reqLength;
1348             }
1349
1350             /* Terminate the buffer */
1351             u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1352             return dest;
1353         }
1354         srcLength = u_strlen(src);
1355     }
1356
1357     /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1358     pSrcLimit = (src!=NULL)?(src+srcLength):NULL;
1359     for(;;) {
1360         count = (int32_t)(pDestLimit - pDest);
1361         srcLength = (int32_t)(pSrcLimit - src);
1362         if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
1363             /* fast ASCII loop */
1364             const UChar *prevSrc = src;
1365             int32_t delta;
1366             while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
1367                 *pDest++=(uint8_t)ch;
1368                 ++src;
1369             }
1370             delta = (int32_t)(src - prevSrc);
1371             count -= delta;
1372             srcLength -= delta;
1373         }
1374         /*
1375          * Each iteration of the inner loop progresses by at most 3 UTF-8
1376          * bytes and one UChar.
1377          */
1378         count /= 3;
1379         if(count > srcLength) {
1380             count = srcLength; /* min(remaining dest/3, remaining src) */
1381         }
1382         if(count < 3) {
1383             /*
1384              * Too much overhead if we get near the end of the string,
1385              * continue with the next loop.
1386              */
1387             break;
1388         }
1389         do {
1390             ch=*src++;
1391             if(ch <= 0x7f && ch != 0) {
1392                 *pDest++ = (uint8_t)ch;
1393             } else if(ch <= 0x7ff) {
1394                 *pDest++=(uint8_t)((ch>>6)|0xc0);
1395                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1396             } else {
1397                 *pDest++=(uint8_t)((ch>>12)|0xe0);
1398                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1399                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1400             }
1401         } while(--count > 0);
1402     }
1403
1404     while(src<pSrcLimit) {
1405         ch=*src++;
1406         if(ch <= 0x7f && ch != 0) {
1407             if(pDest<pDestLimit) {
1408                 *pDest++ = (uint8_t)ch;
1409             } else {
1410                 reqLength = 1;
1411                 break;
1412             }
1413         } else if(ch <= 0x7ff) {
1414             if((pDestLimit - pDest) >= 2) {
1415                 *pDest++=(uint8_t)((ch>>6)|0xc0);
1416                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1417             } else {
1418                 reqLength = 2;
1419                 break;
1420             }
1421         } else {
1422             if((pDestLimit - pDest) >= 3) {
1423                 *pDest++=(uint8_t)((ch>>12)|0xe0);
1424                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1425                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1426             } else {
1427                 reqLength = 3;
1428                 break;
1429             }
1430         }
1431     }
1432     while(src<pSrcLimit) {
1433         ch=*src++;
1434         if(ch <= 0x7f && ch != 0) {
1435             ++reqLength;
1436         } else if(ch<=0x7ff) {
1437             reqLength+=2;
1438         } else {
1439             reqLength+=3;
1440         }
1441     }
1442
1443     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1444     if(pDestLength){
1445         *pDestLength = reqLength;
1446     }
1447
1448     /* Terminate the buffer */
1449     u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1450     return dest;
1451 }