icuSources/common/ustrtrns.cpp

   1 /*
   2 ******************************************************************************
   3 *
   4 *   Copyright (C) 2001-2011, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 ******************************************************************************
   8 *
   9 * File ustrtrns.cpp
  10 *
  11 * Modification History:
  12 *
  13 *   Date        Name        Description
  14 *   9/10/2001    Ram    Creation.
  15 ******************************************************************************
  16 */
  17
  18 /*******************************************************************************
  19  *
  20  * u_strTo* and u_strFrom* APIs
  21  * WCS functions moved to ustr_wcs.c for better modularization
  22  *
  23  *******************************************************************************
  24  */
  25
  26
  27 #include "unicode/putil.h"
  28 #include "unicode/ustring.h"
  29 #include "unicode/utf.h"
  30 #include "unicode/utf8.h"
  31 #include "unicode/utf16.h"
  32 #include "cstring.h"
  33 #include "cmemory.h"
  34 #include "ustr_imp.h"
  35 #include "uassert.h"
  36
  37 U_CAPI UChar* U_EXPORT2
  38 u_strFromUTF32WithSub(UChar *dest,
  39                int32_t destCapacity,
  40                int32_t *pDestLength,
  41                const UChar32 *src,
  42                int32_t srcLength,
  43                UChar32 subchar, int32_t *pNumSubstitutions,
  44                UErrorCode *pErrorCode) {
  45     const UChar32 *srcLimit;
  46     UChar32 ch;
  47     UChar *destLimit;
  48     UChar *pDest;
  49     int32_t reqLength;
  50     int32_t numSubstitutions;
  51
  52     /* args check */
  53     if(U_FAILURE(*pErrorCode)){
  54         return NULL;
  55     }
  56     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
  57         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
  58         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
  59     ) {
  60         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
  61         return NULL;
  62     }
  63
  64     if(pNumSubstitutions != NULL) {
  65         *pNumSubstitutions = 0;
  66     }
  67
  68     pDest = dest;
  69     destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
  70     reqLength = 0;
  71     numSubstitutions = 0;
  72
  73     if(srcLength < 0) {
  74         /* simple loop for conversion of a NUL-terminated BMP string */
  75         while((ch=*src) != 0 &&
  76               ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
  77             ++src;
  78             if(pDest < destLimit) {
  79                 *pDest++ = (UChar)ch;
  80             } else {
  81                 ++reqLength;
  82             }
  83         }
  84         srcLimit = src;
  85         if(ch != 0) {
  86             /* "complicated" case, find the end of the remaining string */
  87             while(*++srcLimit != 0) {}
  88         }
  89     } else {
  90       srcLimit = (src!=NULL)?(src + srcLength):NULL;
  91     }
  92
  93     /* convert with length */
  94     while(src < srcLimit) {
  95         ch = *src++;
  96         do {
  97             /* usually "loops" once; twice only for writing subchar */
  98             if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
  99                 if(pDest < destLimit) {
 100                     *pDest++ = (UChar)ch;
 101                 } else {
 102                     ++reqLength;
 103                 }
 104                 break;
 105             } else if(0x10000 <= ch && ch <= 0x10ffff) {
 106                 if(pDest!=NULL && ((pDest + 2) <= destLimit)) {
 107                     *pDest++ = U16_LEAD(ch);
 108                     *pDest++ = U16_TRAIL(ch);
 109                 } else {
 110                     reqLength += 2;
 111                 }
 112                 break;
 113             } else if((ch = subchar) < 0) {
 114                 /* surrogate code point, or not a Unicode code point at all */
 115                 *pErrorCode = U_INVALID_CHAR_FOUND;
 116                 return NULL;
 117             } else {
 118                 ++numSubstitutions;
 119             }
 120         } while(TRUE);
 121     }
 122
 123     reqLength += (int32_t)(pDest - dest);
 124     if(pDestLength) {
 125         *pDestLength = reqLength;
 126     }
 127     if(pNumSubstitutions != NULL) {
 128         *pNumSubstitutions = numSubstitutions;
 129     }
 130
 131     /* Terminate the buffer */
 132     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
 133
 134     return dest;
 135 }
 136
 137 U_CAPI UChar* U_EXPORT2
 138 u_strFromUTF32(UChar *dest,
 139                int32_t destCapacity,
 140                int32_t *pDestLength,
 141                const UChar32 *src,
 142                int32_t srcLength,
 143                UErrorCode *pErrorCode) {
 144     return u_strFromUTF32WithSub(
 145             dest, destCapacity, pDestLength,
 146             src, srcLength,
 147             U_SENTINEL, NULL,
 148             pErrorCode);
 149 }
 150
 151 U_CAPI UChar32* U_EXPORT2
 152 u_strToUTF32WithSub(UChar32 *dest,
 153              int32_t destCapacity,
 154              int32_t *pDestLength,
 155              const UChar *src,
 156              int32_t srcLength,
 157              UChar32 subchar, int32_t *pNumSubstitutions,
 158              UErrorCode *pErrorCode) {
 159     const UChar *srcLimit;
 160     UChar32 ch;
 161     UChar ch2;
 162     UChar32 *destLimit;
 163     UChar32 *pDest;
 164     int32_t reqLength;
 165     int32_t numSubstitutions;
 166
 167     /* args check */
 168     if(U_FAILURE(*pErrorCode)){
 169         return NULL;
 170     }
 171     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
 172         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
 173         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
 174     ) {
 175         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
 176         return NULL;
 177     }
 178
 179     if(pNumSubstitutions != NULL) {
 180         *pNumSubstitutions = 0;
 181     }
 182
 183     pDest = dest;
 184     destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
 185     reqLength = 0;
 186     numSubstitutions = 0;
 187
 188     if(srcLength < 0) {
 189         /* simple loop for conversion of a NUL-terminated BMP string */
 190         while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
 191             ++src;
 192             if(pDest < destLimit) {
 193                 *pDest++ = ch;
 194             } else {
 195                 ++reqLength;
 196             }
 197         }
 198         srcLimit = src;
 199         if(ch != 0) {
 200             /* "complicated" case, find the end of the remaining string */
 201             while(*++srcLimit != 0) {}
 202         }
 203     } else {
 204         srcLimit = (src!=NULL)?(src + srcLength):NULL;
 205     }
 206
 207     /* convert with length */
 208     while(src < srcLimit) {
 209         ch = *src++;
 210         if(!U16_IS_SURROGATE(ch)) {
 211             /* write or count ch below */
 212         } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
 213             ++src;
 214             ch = U16_GET_SUPPLEMENTARY(ch, ch2);
 215         } else if((ch = subchar) < 0) {
 216             /* unpaired surrogate */
 217             *pErrorCode = U_INVALID_CHAR_FOUND;
 218             return NULL;
 219         } else {
 220             ++numSubstitutions;
 221         }
 222         if(pDest < destLimit) {
 223             *pDest++ = ch;
 224         } else {
 225             ++reqLength;
 226         }
 227     }
 228
 229     reqLength += (int32_t)(pDest - dest);
 230     if(pDestLength) {
 231         *pDestLength = reqLength;
 232     }
 233     if(pNumSubstitutions != NULL) {
 234         *pNumSubstitutions = numSubstitutions;
 235     }
 236
 237     /* Terminate the buffer */
 238     u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
 239
 240     return dest;
 241 }
 242
 243 U_CAPI UChar32* U_EXPORT2
 244 u_strToUTF32(UChar32 *dest,
 245              int32_t destCapacity,
 246              int32_t *pDestLength,
 247              const UChar *src,
 248              int32_t srcLength,
 249              UErrorCode *pErrorCode) {
 250     return u_strToUTF32WithSub(
 251             dest, destCapacity, pDestLength,
 252             src, srcLength,
 253             U_SENTINEL, NULL,
 254             pErrorCode);
 255 }
 256
 257 /* for utf8_nextCharSafeBodyTerminated() */
 258 static const UChar32
 259 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
 260
 261 /*
 262  * Version of utf8_nextCharSafeBody() with the following differences:
 263  * - checks for NUL termination instead of length
 264  * - works with pointers instead of indexes
 265  * - always strict (strict==-1)
 266  *
 267  * *ps points to after the lead byte and will be moved to after the last trail byte.
 268  * c is the lead byte.
 269  * @return the code point, or U_SENTINEL
 270  */
 271 static UChar32
 272 utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
 273     const uint8_t *s=*ps;
 274     uint8_t trail, illegal=0;
 275     uint8_t count=U8_COUNT_TRAIL_BYTES(c);
 276     U_ASSERT(count<6);
 277     U8_MASK_LEAD_BYTE((c), count);
 278     /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
 279     switch(count) {
 280     /* each branch falls through to the next one */
 281     case 5:
 282     case 4:
 283         /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
 284         illegal=1;
 285         break;
 286     case 3:
 287         trail=(uint8_t)(*s++ - 0x80);
 288         c=(c<<6)|trail;
 289         if(trail>0x3f || c>=0x110) {
 290             /* not a trail byte, or code point>0x10ffff (outside Unicode) */
 291             illegal=1;
 292             break;
 293         }
 294     case 2: /*fall through*/
 295         trail=(uint8_t)(*s++ - 0x80);
 296         if(trail>0x3f) {
 297             /* not a trail byte */
 298             illegal=1;
 299             break;
 300         }
 301         c=(c<<6)|trail;
 302     case 1: /*fall through*/
 303         trail=(uint8_t)(*s++ - 0x80);
 304         if(trail>0x3f) {
 305             /* not a trail byte */
 306             illegal=1;
 307         }
 308         c=(c<<6)|trail;
 309         break;
 310     case 0:
 311         return U_SENTINEL;
 312     /* no default branch to optimize switch()  - all values are covered */
 313     }
 314
 315     /* correct sequence - all trail bytes have (b7..b6)==(10)? */
 316     /* illegal is also set if count>=4 */
 317     if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
 318         /* error handling */
 319         /* don't go beyond this sequence */
 320         s=*ps;
 321         while(count>0 && U8_IS_TRAIL(*s)) {
 322             ++s;
 323             --count;
 324         }
 325         c=U_SENTINEL;
 326     }
 327     *ps=s;
 328     return c;
 329 }
 330
 331 /*
 332  * Version of utf8_nextCharSafeBody() with the following differences:
 333  * - works with pointers instead of indexes
 334  * - always strict (strict==-1)
 335  *
 336  * *ps points to after the lead byte and will be moved to after the last trail byte.
 337  * c is the lead byte.
 338  * @return the code point, or U_SENTINEL
 339  */
 340 static UChar32
 341 utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
 342     const uint8_t *s=*ps;
 343     uint8_t trail, illegal=0;
 344     uint8_t count=U8_COUNT_TRAIL_BYTES(c);
 345     if((limit-s)>=count) {
 346         U8_MASK_LEAD_BYTE((c), count);
 347         /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
 348         switch(count) {
 349         /* each branch falls through to the next one */
 350         case 5:
 351         case 4:
 352             /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
 353             illegal=1;
 354             break;
 355         case 3:
 356             trail=*s++;
 357             c=(c<<6)|(trail&0x3f);
 358             if(c<0x110) {
 359                 illegal|=(trail&0xc0)^0x80;
 360             } else {
 361                 /* code point>0x10ffff, outside Unicode */
 362                 illegal=1;
 363                 break;
 364             }
 365         case 2: /*fall through*/
 366             trail=*s++;
 367             c=(c<<6)|(trail&0x3f);
 368             illegal|=(trail&0xc0)^0x80;
 369         case 1: /*fall through*/
 370             trail=*s++;
 371             c=(c<<6)|(trail&0x3f);
 372             illegal|=(trail&0xc0)^0x80;
 373             break;
 374         case 0:
 375             return U_SENTINEL;
 376         /* no default branch to optimize switch()  - all values are covered */
 377         }
 378     } else {
 379         illegal=1; /* too few bytes left */
 380     }
 381
 382     /* correct sequence - all trail bytes have (b7..b6)==(10)? */
 383     /* illegal is also set if count>=4 */
 384     U_ASSERT(count<sizeof(utf8_minLegal)/sizeof(utf8_minLegal[0]));
 385     if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
 386         /* error handling */
 387         /* don't go beyond this sequence */
 388         s=*ps;
 389         while(count>0 && s<limit && U8_IS_TRAIL(*s)) {
 390             ++s;
 391             --count;
 392         }
 393         c=U_SENTINEL;
 394     }
 395     *ps=s;
 396     return c;
 397 }
 398
 399 U_CAPI UChar* U_EXPORT2
 400 u_strFromUTF8WithSub(UChar *dest,
 401               int32_t destCapacity,
 402               int32_t *pDestLength,
 403               const char* src,
 404               int32_t srcLength,
 405               UChar32 subchar, int32_t *pNumSubstitutions,
 406               UErrorCode *pErrorCode){
 407     UChar *pDest = dest;
 408     UChar *pDestLimit = dest+destCapacity;
 409     UChar32 ch;
 410     int32_t reqLength = 0;
 411     const uint8_t* pSrc = (const uint8_t*) src;
 412     uint8_t t1, t2; /* trail bytes */
 413     int32_t numSubstitutions;
 414
 415     /* args check */
 416     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
 417         return NULL;
 418     }
 419
 420     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
 421         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
 422         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
 423     ) {
 424         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
 425         return NULL;
 426     }
 427
 428     if(pNumSubstitutions!=NULL) {
 429         *pNumSubstitutions=0;
 430     }
 431     numSubstitutions=0;
 432
 433     /*
 434      * Inline processing of UTF-8 byte sequences:
 435      *
 436      * Byte sequences for the most common characters are handled inline in
 437      * the conversion loops. In order to reduce the path lengths for those
 438      * characters, the tests are arranged in a kind of binary search.
 439      * ASCII (<=0x7f) is checked first, followed by the dividing point
 440      * between 2- and 3-byte sequences (0xe0).
 441      * The 3-byte branch is tested first to speed up CJK text.
 442      * The compiler should combine the subtractions for the two tests for 0xe0.
 443      * Each branch then tests for the other end of its range.
 444      */
 445
 446     if(srcLength < 0){
 447         /*
 448          * Transform a NUL-terminated string.
 449          * The code explicitly checks for NULs only in the lead byte position.
 450          * A NUL byte in the trail byte position fails the trail byte range check anyway.
 451          */
 452         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
 453             if(ch <= 0x7f){
 454                 *pDest++=(UChar)ch;
 455                 ++pSrc;
 456             } else {
 457                 if(ch > 0xe0) {
 458                     if( /* handle U+1000..U+CFFF inline */
 459                         ch <= 0xec &&
 460                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
 461                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
 462                     ) {
 463                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 464                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
 465                         pSrc += 3;
 466                         continue;
 467                     }
 468                 } else if(ch < 0xe0) {
 469                     if( /* handle U+0080..U+07FF inline */
 470                         ch >= 0xc2 &&
 471                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
 472                     ) {
 473                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
 474                         pSrc += 2;
 475                         continue;
 476                     }
 477                 }
 478
 479                 /* function call for "complicated" and error cases */
 480                 ++pSrc; /* continue after the lead byte */
 481                 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
 482                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
 483                     *pErrorCode = U_INVALID_CHAR_FOUND;
 484                     return NULL;
 485                 } else if(ch<=0xFFFF) {
 486                     *(pDest++)=(UChar)ch;
 487                 } else {
 488                     *(pDest++)=U16_LEAD(ch);
 489                     if(pDest<pDestLimit) {
 490                         *(pDest++)=U16_TRAIL(ch);
 491                     } else {
 492                         reqLength++;
 493                         break;
 494                     }
 495                 }
 496             }
 497         }
 498
 499         /* Pre-flight the rest of the string. */
 500         while((ch = *pSrc) != 0) {
 501             if(ch <= 0x7f){
 502                 ++reqLength;
 503                 ++pSrc;
 504             } else {
 505                 if(ch > 0xe0) {
 506                     if( /* handle U+1000..U+CFFF inline */
 507                         ch <= 0xec &&
 508                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
 509                         (uint8_t)(pSrc[2] - 0x80) <= 0x3f
 510                     ) {
 511                         ++reqLength;
 512                         pSrc += 3;
 513                         continue;
 514                     }
 515                 } else if(ch < 0xe0) {
 516                     if( /* handle U+0080..U+07FF inline */
 517                         ch >= 0xc2 &&
 518                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f
 519                     ) {
 520                         ++reqLength;
 521                         pSrc += 2;
 522                         continue;
 523                     }
 524                 }
 525
 526                 /* function call for "complicated" and error cases */
 527                 ++pSrc; /* continue after the lead byte */
 528                 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
 529                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
 530                     *pErrorCode = U_INVALID_CHAR_FOUND;
 531                     return NULL;
 532                 }
 533                 reqLength += U16_LENGTH(ch);
 534             }
 535         }
 536     } else /* srcLength >= 0 */ {
 537         const uint8_t *pSrcLimit = pSrc + srcLength;
 538         int32_t count;
 539
 540         /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
 541         for(;;) {
 542             /*
 543              * Each iteration of the inner loop progresses by at most 3 UTF-8
 544              * bytes and one UChar, for most characters.
 545              * For supplementary code points (4 & 2), which are rare,
 546              * there is an additional adjustment.
 547              */
 548             count = (int32_t)(pDestLimit - pDest);
 549             srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
 550             if(count > srcLength) {
 551                 count = srcLength; /* min(remaining dest, remaining src/3) */
 552             }
 553             if(count < 3) {
 554                 /*
 555                  * Too much overhead if we get near the end of the string,
 556                  * continue with the next loop.
 557                  */
 558                 break;
 559             }
 560
 561             do {
 562                 ch = *pSrc;
 563                 if(ch <= 0x7f){
 564                     *pDest++=(UChar)ch;
 565                     ++pSrc;
 566                 } else {
 567                     if(ch > 0xe0) {
 568                         if( /* handle U+1000..U+CFFF inline */
 569                             ch <= 0xec &&
 570                             (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
 571                             (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
 572                         ) {
 573                             /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 574                             *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
 575                             pSrc += 3;
 576                             continue;
 577                         }
 578                     } else if(ch < 0xe0) {
 579                         if( /* handle U+0080..U+07FF inline */
 580                             ch >= 0xc2 &&
 581                             (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
 582                         ) {
 583                             *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
 584                             pSrc += 2;
 585                             continue;
 586                         }
 587                     }
 588
 589                     if(ch >= 0xf0 || subchar > 0xffff) {
 590                         /*
 591                          * We may read up to six bytes and write up to two UChars,
 592                          * which we didn't account for with computing count,
 593                          * so we adjust it here.
 594                          */
 595                         if(--count == 0) {
 596                             break;
 597                         }
 598                     }
 599
 600                     /* function call for "complicated" and error cases */
 601                     ++pSrc; /* continue after the lead byte */
 602                     ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
 603                     if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
 604                         *pErrorCode = U_INVALID_CHAR_FOUND;
 605                         return NULL;
 606                     }else if(ch<=0xFFFF){
 607                         *(pDest++)=(UChar)ch;
 608                     }else{
 609                         *(pDest++)=U16_LEAD(ch);
 610                         *(pDest++)=U16_TRAIL(ch);
 611                     }
 612                 }
 613             } while(--count > 0);
 614         }
 615
 616         while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
 617             ch = *pSrc;
 618             if(ch <= 0x7f){
 619                 *pDest++=(UChar)ch;
 620                 ++pSrc;
 621             } else {
 622                 if(ch > 0xe0) {
 623                     if( /* handle U+1000..U+CFFF inline */
 624                         ch <= 0xec &&
 625                         ((pSrcLimit - pSrc) >= 3) &&
 626                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
 627                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
 628                     ) {
 629                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 630                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
 631                         pSrc += 3;
 632                         continue;
 633                     }
 634                 } else if(ch < 0xe0) {
 635                     if( /* handle U+0080..U+07FF inline */
 636                         ch >= 0xc2 &&
 637                         ((pSrcLimit - pSrc) >= 2) &&
 638                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
 639                     ) {
 640                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
 641                         pSrc += 2;
 642                         continue;
 643                     }
 644                 }
 645
 646                 /* function call for "complicated" and error cases */
 647                 ++pSrc; /* continue after the lead byte */
 648                 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
 649                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
 650                     *pErrorCode = U_INVALID_CHAR_FOUND;
 651                     return NULL;
 652                 }else if(ch<=0xFFFF){
 653                     *(pDest++)=(UChar)ch;
 654                 }else{
 655                     *(pDest++)=U16_LEAD(ch);
 656                     if(pDest<pDestLimit){
 657                         *(pDest++)=U16_TRAIL(ch);
 658                     }else{
 659                         reqLength++;
 660                         break;
 661                     }
 662                 }
 663             }
 664         }
 665         /* do not fill the dest buffer just count the UChars needed */
 666         while(pSrc < pSrcLimit){
 667             ch = *pSrc;
 668             if(ch <= 0x7f){
 669                 reqLength++;
 670                 ++pSrc;
 671             } else {
 672                 if(ch > 0xe0) {
 673                     if( /* handle U+1000..U+CFFF inline */
 674                         ch <= 0xec &&
 675                         ((pSrcLimit - pSrc) >= 3) &&
 676                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
 677                         (uint8_t)(pSrc[2] - 0x80) <= 0x3f
 678                     ) {
 679                         reqLength++;
 680                         pSrc += 3;
 681                         continue;
 682                     }
 683                 } else if(ch < 0xe0) {
 684                     if( /* handle U+0080..U+07FF inline */
 685                         ch >= 0xc2 &&
 686                         ((pSrcLimit - pSrc) >= 2) &&
 687                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f
 688                     ) {
 689                         reqLength++;
 690                         pSrc += 2;
 691                         continue;
 692                     }
 693                 }
 694
 695                 /* function call for "complicated" and error cases */
 696                 ++pSrc; /* continue after the lead byte */
 697                 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
 698                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
 699                     *pErrorCode = U_INVALID_CHAR_FOUND;
 700                     return NULL;
 701                 }
 702                 reqLength+=U16_LENGTH(ch);
 703             }
 704         }
 705     }
 706
 707     reqLength+=(int32_t)(pDest - dest);
 708
 709     if(pNumSubstitutions!=NULL) {
 710         *pNumSubstitutions=numSubstitutions;
 711     }
 712
 713     if(pDestLength){
 714         *pDestLength = reqLength;
 715     }
 716
 717     /* Terminate the buffer */
 718     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
 719
 720     return dest;
 721 }
 722
 723 U_CAPI UChar* U_EXPORT2
 724 u_strFromUTF8(UChar *dest,
 725               int32_t destCapacity,
 726               int32_t *pDestLength,
 727               const char* src,
 728               int32_t srcLength,
 729               UErrorCode *pErrorCode){
 730     return u_strFromUTF8WithSub(
 731             dest, destCapacity, pDestLength,
 732             src, srcLength,
 733             U_SENTINEL, NULL,
 734             pErrorCode);
 735 }
 736
 737 U_CAPI UChar * U_EXPORT2
 738 u_strFromUTF8Lenient(UChar *dest,
 739                      int32_t destCapacity,
 740                      int32_t *pDestLength,
 741                      const char *src,
 742                      int32_t srcLength,
 743                      UErrorCode *pErrorCode) {
 744     UChar *pDest = dest;
 745     UChar32 ch;
 746     int32_t reqLength = 0;
 747     uint8_t* pSrc = (uint8_t*) src;
 748
 749     /* args check */
 750     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
 751         return NULL;
 752     }
 753
 754     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
 755         (destCapacity<0) || (dest == NULL && destCapacity > 0)
 756     ) {
 757         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
 758         return NULL;
 759     }
 760
 761     if(srcLength < 0) {
 762         /* Transform a NUL-terminated string. */
 763         UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
 764         uint8_t t1, t2, t3; /* trail bytes */
 765
 766         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
 767             if(ch < 0xc0) {
 768                 /*
 769                  * ASCII, or a trail byte in lead position which is treated like
 770                  * a single-byte sequence for better character boundary
 771                  * resynchronization after illegal sequences.
 772                  */
 773                 *pDest++=(UChar)ch;
 774                 ++pSrc;
 775                 continue;
 776             } else if(ch < 0xe0) { /* U+0080..U+07FF */
 777                 if((t1 = pSrc[1]) != 0) {
 778                     /* 0x3080 = (0xc0 << 6) + 0x80 */
 779                     *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
 780                     pSrc += 2;
 781                     continue;
 782                 }
 783             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
 784                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
 785                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 786                     /* 0x2080 = (0x80 << 6) + 0x80 */
 787                     *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
 788                     pSrc += 3;
 789                     continue;
 790                 }
 791             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
 792                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
 793                     pSrc += 4;
 794                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
 795                     ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
 796                     *(pDest++) = U16_LEAD(ch);
 797                     if(pDest < pDestLimit) {
 798                         *(pDest++) = U16_TRAIL(ch);
 799                     } else {
 800                         reqLength = 1;
 801                         break;
 802                     }
 803                     continue;
 804                 }
 805             }
 806
 807             /* truncated character at the end */
 808             *pDest++ = 0xfffd;
 809             while(*++pSrc != 0) {}
 810             break;
 811         }
 812
 813         /* Pre-flight the rest of the string. */
 814         while((ch = *pSrc) != 0) {
 815             if(ch < 0xc0) {
 816                 /*
 817                  * ASCII, or a trail byte in lead position which is treated like
 818                  * a single-byte sequence for better character boundary
 819                  * resynchronization after illegal sequences.
 820                  */
 821                 ++reqLength;
 822                 ++pSrc;
 823                 continue;
 824             } else if(ch < 0xe0) { /* U+0080..U+07FF */
 825                 if(pSrc[1] != 0) {
 826                     ++reqLength;
 827                     pSrc += 2;
 828                     continue;
 829                 }
 830             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
 831                 if(pSrc[1] != 0 && pSrc[2] != 0) {
 832                     ++reqLength;
 833                     pSrc += 3;
 834                     continue;
 835                 }
 836             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
 837                 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
 838                     reqLength += 2;
 839                     pSrc += 4;
 840                     continue;
 841                 }
 842             }
 843
 844             /* truncated character at the end */
 845             ++reqLength;
 846             break;
 847         }
 848     } else /* srcLength >= 0 */ {
 849       const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL;
 850
 851         /*
 852          * This function requires that if srcLength is given, then it must be
 853          * destCapatity >= srcLength so that we need not check for
 854          * destination buffer overflow in the loop.
 855          */
 856         if(destCapacity < srcLength) {
 857             if(pDestLength != NULL) {
 858                 *pDestLength = srcLength; /* this likely overestimates the true destLength! */
 859             }
 860             *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
 861             return NULL;
 862         }
 863
 864         if((pSrcLimit - pSrc) >= 4) {
 865             pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
 866
 867             /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
 868             do {
 869                 ch = *pSrc++;
 870                 if(ch < 0xc0) {
 871                     /*
 872                      * ASCII, or a trail byte in lead position which is treated like
 873                      * a single-byte sequence for better character boundary
 874                      * resynchronization after illegal sequences.
 875                      */
 876                     *pDest++=(UChar)ch;
 877                 } else if(ch < 0xe0) { /* U+0080..U+07FF */
 878                     /* 0x3080 = (0xc0 << 6) + 0x80 */
 879                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
 880                 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
 881                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 882                     /* 0x2080 = (0x80 << 6) + 0x80 */
 883                     ch = (ch << 12) + (*pSrc++ << 6);
 884                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
 885                 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
 886                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
 887                     ch = (ch << 18) + (*pSrc++ << 12);
 888                     ch += *pSrc++ << 6;
 889                     ch += *pSrc++ - 0x3c82080;
 890                     *(pDest++) = U16_LEAD(ch);
 891                     *(pDest++) = U16_TRAIL(ch);
 892                 }
 893             } while(pSrc < pSrcLimit);
 894
 895             pSrcLimit += 3; /* restore original pSrcLimit */
 896         }
 897
 898         while(pSrc < pSrcLimit) {
 899             ch = *pSrc++;
 900             if(ch < 0xc0) {
 901                 /*
 902                  * ASCII, or a trail byte in lead position which is treated like
 903                  * a single-byte sequence for better character boundary
 904                  * resynchronization after illegal sequences.
 905                  */
 906                 *pDest++=(UChar)ch;
 907                 continue;
 908             } else if(ch < 0xe0) { /* U+0080..U+07FF */
 909                 if(pSrc < pSrcLimit) {
 910                     /* 0x3080 = (0xc0 << 6) + 0x80 */
 911                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
 912                     continue;
 913                 }
 914             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
 915                 if((pSrcLimit - pSrc) >= 2) {
 916                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 917                     /* 0x2080 = (0x80 << 6) + 0x80 */
 918                     ch = (ch << 12) + (*pSrc++ << 6);
 919                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
 920                     pSrc += 3;
 921                     continue;
 922                 }
 923             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
 924                 if((pSrcLimit - pSrc) >= 3) {
 925                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
 926                     ch = (ch << 18) + (*pSrc++ << 12);
 927                     ch += *pSrc++ << 6;
 928                     ch += *pSrc++ - 0x3c82080;
 929                     *(pDest++) = U16_LEAD(ch);
 930                     *(pDest++) = U16_TRAIL(ch);
 931                     pSrc += 4;
 932                     continue;
 933                 }
 934             }
 935
 936             /* truncated character at the end */
 937             *pDest++ = 0xfffd;
 938             break;
 939         }
 940     }
 941
 942     reqLength+=(int32_t)(pDest - dest);
 943
 944     if(pDestLength){
 945         *pDestLength = reqLength;
 946     }
 947
 948     /* Terminate the buffer */
 949     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
 950
 951     return dest;
 952 }
 953
 954 static inline uint8_t *
 955 _appendUTF8(uint8_t *pDest, UChar32 c) {
 956     /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
 957     if((c)<=0x7f) {
 958         *pDest++=(uint8_t)c;
 959     } else if(c<=0x7ff) {
 960         *pDest++=(uint8_t)((c>>6)|0xc0);
 961         *pDest++=(uint8_t)((c&0x3f)|0x80);
 962     } else if(c<=0xffff) {
 963         *pDest++=(uint8_t)((c>>12)|0xe0);
 964         *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
 965         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
 966     } else /* if((uint32_t)(c)<=0x10ffff) */ {
 967         *pDest++=(uint8_t)(((c)>>18)|0xf0);
 968         *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
 969         *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
 970         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
 971     }
 972     return pDest;
 973 }
 974
 975
/*
 * Converts a UTF-16 string to UTF-8, optionally replacing unpaired surrogates
 * with a substitution character.
 *
 * dest, destCapacity: output buffer and its size in bytes; dest may be NULL
 *     only when destCapacity is 0.
 * pDestLength: if not NULL, receives the number of bytes the complete output
 *     needs, including the part that did not fit into dest.
 * pSrc, srcLength: input UChars; srcLength==-1 means NUL-terminated input.
 * subchar: code point written in place of each unpaired surrogate. With a
 *     negative subchar nothing is substituted: the first unpaired surrogate
 *     sets U_INVALID_CHAR_FOUND and the function returns NULL.
 * pNumSubstitutions: if not NULL, set to 0 once the arguments are validated
 *     and to the substitution count at the normal exit.
 * pErrorCode: NULL or an already-failed status makes the function return NULL
 *     without doing anything.
 *
 * Returns dest on the normal exit, NULL on an argument or conversion error.
 *
 * Each of the two input modes (NUL-terminated / counted) first converts into
 * dest and, as soon as a character does not fit, switches to a loop that only
 * counts the bytes still needed (pre-flighting).
 */
U_CAPI char* U_EXPORT2
u_strToUTF8WithSub(char *dest,
            int32_t destCapacity,
            int32_t *pDestLength,
            const UChar *pSrc,
            int32_t srcLength,
            UChar32 subchar, int32_t *pNumSubstitutions,
            UErrorCode *pErrorCode){
    int32_t reqLength=0;   /* bytes needed beyond what was written to dest */
    uint32_t ch=0,ch2=0;
    uint8_t *pDest = (uint8_t *)dest;
    /* keep the limit NULL for a NULL dest so that all "space left" tests yield 0 */
    uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL;
    int32_t numSubstitutions;

    /* args check */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
        return NULL;
    }

    /* subchar must be negative ("none") or a non-surrogate code point up to U+10FFFF */
    if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
        (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
    ) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    if(pNumSubstitutions!=NULL) {
        *pNumSubstitutions=0;
    }
    numSubstitutions=0;

    if(srcLength==-1) {
        /* NUL-terminated input: convert until the NUL or until dest is full */
        while((ch=*pSrc)!=0) {
            ++pSrc;
            if(ch <= 0x7f) {
                if(pDest<pDestLimit) {
                    *pDest++ = (uint8_t)ch;
                } else {
                    /*
                     * Does not fit: remember its size. pSrc is already past
                     * this character, so the counting loop below continues
                     * with the next one.
                     */
                    reqLength = 1;
                    break;
                }
            } else if(ch <= 0x7ff) {
                if((pDestLimit - pDest) >= 2) {
                    *pDest++=(uint8_t)((ch>>6)|0xc0);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else {
                    reqLength = 2;
                    break;
                }
            } else if(ch <= 0xd7ff || ch >= 0xe000) {
                /* BMP code unit that is not a surrogate: three bytes */
                if((pDestLimit - pDest) >= 3) {
                    *pDest++=(uint8_t)((ch>>12)|0xe0);
                    *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else {
                    reqLength = 3;
                    break;
                }
            } else /* ch is a surrogate */ {
                int32_t length;

                /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
                if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
                    /* well-formed pair: combine into one supplementary code point */
                    ++pSrc;
                    ch=U16_GET_SUPPLEMENTARY(ch, ch2);
                } else if(subchar>=0) {
                    /* unpaired surrogate: replace it */
                    ch=subchar;
                    ++numSubstitutions;
                } else {
                    /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                    *pErrorCode = U_INVALID_CHAR_FOUND;
                    return NULL;
                }

                length = U8_LENGTH(ch);
                if((pDestLimit - pDest) >= length) {
                    /* convert and append*/
                    pDest=_appendUTF8(pDest, ch);
                } else {
                    reqLength = length;
                    break;
                }
            }
        }
        /*
         * Pre-flight whatever is left of the NUL-terminated input: only count
         * bytes. This loop does nothing when the loop above reached the NUL.
         */
        while((ch=*pSrc++)!=0) {
            if(ch<=0x7f) {
                ++reqLength;
            } else if(ch<=0x7ff) {
                reqLength+=2;
            } else if(!U16_IS_SURROGATE(ch)) {
                reqLength+=3;
            } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
                ++pSrc;
                reqLength+=4;
            } else if(subchar>=0) {
                reqLength+=U8_LENGTH(subchar);
                ++numSubstitutions;
            } else {
                /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return NULL;
            }
        }
    } else {
        /* counted input */
        const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL;
        int32_t count;

        /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
        for(;;) {
            /*
             * Each iteration of the inner loop progresses by at most 3 UTF-8
             * bytes and one UChar, for most characters.
             * For supplementary code points (4 & 2), which are rare,
             * there is an additional adjustment.
             */
            count = (int32_t)((pDestLimit - pDest) / 3);
            srcLength = (int32_t)(pSrcLimit - pSrc);
            if(count > srcLength) {
                count = srcLength; /* min(remaining dest/3, remaining src) */
            }
            if(count < 3) {
                /*
                 * Too much overhead if we get near the end of the string,
                 * continue with the next loop.
                 */
                break;
            }
            /* count iterations are now safe for both the input and the output */
            do {
                ch=*pSrc++;
                if(ch <= 0x7f) {
                    *pDest++ = (uint8_t)ch;
                } else if(ch <= 0x7ff) {
                    *pDest++=(uint8_t)((ch>>6)|0xc0);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else if(ch <= 0xd7ff || ch >= 0xe000) {
                    *pDest++=(uint8_t)((ch>>12)|0xe0);
                    *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else /* ch is a surrogate */ {
                    /*
                     * We will read two UChars and probably output four bytes,
                     * which we didn't account for with computing count,
                     * so we adjust it here.
                     */
                    if(--count == 0) {
                        --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
                        break;  /* recompute count */
                    }

                    /* count was >0 after the adjustment, so *pSrc is still inside the input */
                    if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
                        ++pSrc;
                        ch=U16_GET_SUPPLEMENTARY(ch, ch2);

                        /* writing 4 bytes per 2 UChars is ok */
                        *pDest++=(uint8_t)((ch>>18)|0xf0);
                        *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
                        *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
                        *pDest++=(uint8_t)((ch&0x3f)|0x80);
                    } else  {
                        /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                        if(subchar>=0) {
                            ch=subchar;
                            ++numSubstitutions;
                        } else {
                            *pErrorCode = U_INVALID_CHAR_FOUND;
                            return NULL;
                        }

                        /* convert and append*/
                        pDest=_appendUTF8(pDest, ch);
                    }
                }
            } while(--count > 0);
        }

        /* Careful loop for the remainder: checks both limits for every character. */
        while(pSrc<pSrcLimit) {
            ch=*pSrc++;
            if(ch <= 0x7f) {
                if(pDest<pDestLimit) {
                    *pDest++ = (uint8_t)ch;
                } else {
                    /* does not fit; pSrc is already past it, counting continues below */
                    reqLength = 1;
                    break;
                }
            } else if(ch <= 0x7ff) {
                if((pDestLimit - pDest) >= 2) {
                    *pDest++=(uint8_t)((ch>>6)|0xc0);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else {
                    reqLength = 2;
                    break;
                }
            } else if(ch <= 0xd7ff || ch >= 0xe000) {
                if((pDestLimit - pDest) >= 3) {
                    *pDest++=(uint8_t)((ch>>12)|0xe0);
                    *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else {
                    reqLength = 3;
                    break;
                }
            } else /* ch is a surrogate */ {
                int32_t length;

                /* unlike the NUL-terminated case, the trail read must be bounded by pSrcLimit */
                if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
                    ++pSrc;
                    ch=U16_GET_SUPPLEMENTARY(ch, ch2);
                } else if(subchar>=0) {
                    ch=subchar;
                    ++numSubstitutions;
                } else {
                    /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                    *pErrorCode = U_INVALID_CHAR_FOUND;
                    return NULL;
                }

                length = U8_LENGTH(ch);
                if((pDestLimit - pDest) >= length) {
                    /* convert and append*/
                    pDest=_appendUTF8(pDest, ch);
                } else {
                    reqLength = length;
                    break;
                }
            }
        }
        /* Pre-flight the rest of the counted input: only count bytes. */
        while(pSrc<pSrcLimit) {
            ch=*pSrc++;
            if(ch<=0x7f) {
                ++reqLength;
            } else if(ch<=0x7ff) {
                reqLength+=2;
            } else if(!U16_IS_SURROGATE(ch)) {
                reqLength+=3;
            } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
                ++pSrc;
                reqLength+=4;
            } else if(subchar>=0) {
                reqLength+=U8_LENGTH(subchar);
                ++numSubstitutions;
            } else {
                /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return NULL;
            }
        }
    }

    /* total = bytes actually written + bytes that were only counted */
    reqLength+=(int32_t)(pDest - (uint8_t *)dest);

    if(pNumSubstitutions!=NULL) {
        *pNumSubstitutions=numSubstitutions;
    }

    if(pDestLength){
        *pDestLength = reqLength;
    }

    /* Terminate the buffer */
    u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
    return dest;
}
1239
/*
 * Converts a UTF-16 string to UTF-8 without substitution.
 *
 * Forwards all arguments unchanged to u_strToUTF8WithSub(), passing
 * U_SENTINEL as the substitution character and NULL for the substitution
 * counter, and returns that function's result.
 * NOTE(review): U_SENTINEL is defined elsewhere; assuming it is negative, an
 * unpaired surrogate in the input yields U_INVALID_CHAR_FOUND instead of a
 * replacement - confirm against its definition.
 */
U_CAPI char* U_EXPORT2
u_strToUTF8(char *dest,
            int32_t destCapacity,
            int32_t *pDestLength,
            const UChar *pSrc,
            int32_t srcLength,
            UErrorCode *pErrorCode){
    return u_strToUTF8WithSub(
            dest, destCapacity, pDestLength,
            pSrc, srcLength,
            U_SENTINEL, NULL,
            pErrorCode);
}
1253
1254 U_CAPI UChar* U_EXPORT2
1255 u_strFromJavaModifiedUTF8WithSub(
1256         UChar *dest,
1257         int32_t destCapacity,
1258         int32_t *pDestLength,
1259         const char *src,
1260         int32_t srcLength,
1261         UChar32 subchar, int32_t *pNumSubstitutions,
1262         UErrorCode *pErrorCode) {
1263     UChar *pDest = dest;
1264     UChar *pDestLimit = dest+destCapacity;
1265     UChar32 ch;
1266     int32_t reqLength = 0;
1267     const uint8_t* pSrc = (const uint8_t*) src;
1268     const uint8_t *pSrcLimit;
1269     int32_t count;
1270     uint8_t t1, t2; /* trail bytes */
1271     int32_t numSubstitutions;
1272
1273     /* args check */
1274     if(U_FAILURE(*pErrorCode)){
1275         return NULL;
1276     }
1277     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1278         (dest==NULL && destCapacity!=0) || destCapacity<0 ||
1279         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1280     ) {
1281         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1282         return NULL;
1283     }
1284
1285     if(pNumSubstitutions!=NULL) {
1286         *pNumSubstitutions=0;
1287     }
1288     numSubstitutions=0;
1289
1290     if(srcLength < 0) {
1291         /*
1292          * Transform a NUL-terminated ASCII string.
1293          * Handle non-ASCII strings with slower code.
1294          */
1295         while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
1296             *pDest++=(UChar)ch;
1297             ++pSrc;
1298         }
1299         if(ch == 0) {
1300             reqLength=(int32_t)(pDest - dest);
1301             if(pDestLength) {
1302                 *pDestLength = reqLength;
1303             }
1304
1305             /* Terminate the buffer */
1306             u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1307             return dest;
1308         }
1309         srcLength = uprv_strlen((const char *)pSrc);
1310     }
1311
1312     /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1313     pSrcLimit = pSrc + srcLength;
1314     for(;;) {
1315         count = (int32_t)(pDestLimit - pDest);
1316         srcLength = (int32_t)(pSrcLimit - pSrc);
1317         if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
1318             /* fast ASCII loop */
1319             const uint8_t *prevSrc = pSrc;
1320             int32_t delta;
1321             while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
1322                 *pDest++=(UChar)ch;
1323                 ++pSrc;
1324             }
1325             delta = (int32_t)(pSrc - prevSrc);
1326             count -= delta;
1327             srcLength -= delta;
1328         }
1329         /*
1330          * Each iteration of the inner loop progresses by at most 3 UTF-8
1331          * bytes and one UChar.
1332          */
1333         srcLength /= 3;
1334         if(count > srcLength) {
1335             count = srcLength; /* min(remaining dest, remaining src/3) */
1336         }
1337         if(count < 3) {
1338             /*
1339              * Too much overhead if we get near the end of the string,
1340              * continue with the next loop.
1341              */
1342             break;
1343         }
1344         do {
1345             ch = *pSrc;
1346             if(ch <= 0x7f){
1347                 *pDest++=(UChar)ch;
1348                 ++pSrc;
1349             } else {
1350                 if(ch >= 0xe0) {
1351                     if( /* handle U+0000..U+FFFF inline */
1352                         ch <= 0xef &&
1353                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1354                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1355                     ) {
1356                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1357                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1358                         pSrc += 3;
1359                         continue;
1360                     }
1361                 } else {
1362                     if( /* handle U+0000..U+07FF inline */
1363                         ch >= 0xc0 &&
1364                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1365                     ) {
1366                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1367                         pSrc += 2;
1368                         continue;
1369                     }
1370                 }
1371
1372                 if(subchar < 0) {
1373                     *pErrorCode = U_INVALID_CHAR_FOUND;
1374                     return NULL;
1375                 } else if(subchar > 0xffff && --count == 0) {
1376                     /*
1377                      * We need to write two UChars, adjusted count for that,
1378                      * and ran out of space.
1379                      */
1380                     break;
1381                 } else {
1382                     /* function call for error cases */
1383                     ++pSrc; /* continue after the lead byte */
1384                     utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1385                     ++numSubstitutions;
1386                     if(subchar<=0xFFFF) {
1387                         *(pDest++)=(UChar)subchar;
1388                     } else {
1389                         *(pDest++)=U16_LEAD(subchar);
1390                         *(pDest++)=U16_TRAIL(subchar);
1391                     }
1392                 }
1393             }
1394         } while(--count > 0);
1395     }
1396
1397     while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
1398         ch = *pSrc;
1399         if(ch <= 0x7f){
1400             *pDest++=(UChar)ch;
1401             ++pSrc;
1402         } else {
1403             if(ch >= 0xe0) {
1404                 if( /* handle U+0000..U+FFFF inline */
1405                     ch <= 0xef &&
1406                     ((pSrcLimit - pSrc) >= 3) &&
1407                     (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1408                     (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1409                 ) {
1410                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1411                     *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1412                     pSrc += 3;
1413                     continue;
1414                 }
1415             } else {
1416                 if( /* handle U+0000..U+07FF inline */
1417                     ch >= 0xc0 &&
1418                     ((pSrcLimit - pSrc) >= 2) &&
1419                     (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1420                 ) {
1421                     *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1422                     pSrc += 2;
1423                     continue;
1424                 }
1425             }
1426
1427             if(subchar < 0) {
1428                 *pErrorCode = U_INVALID_CHAR_FOUND;
1429                 return NULL;
1430             } else {
1431                 /* function call for error cases */
1432                 ++pSrc; /* continue after the lead byte */
1433                 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1434                 ++numSubstitutions;
1435                 if(subchar<=0xFFFF) {
1436                     *(pDest++)=(UChar)subchar;
1437                 } else {
1438                     *(pDest++)=U16_LEAD(subchar);
1439                     if(pDest<pDestLimit) {
1440                         *(pDest++)=U16_TRAIL(subchar);
1441                     } else {
1442                         reqLength++;
1443                         break;
1444                     }
1445                 }
1446             }
1447         }
1448     }
1449
1450     /* do not fill the dest buffer just count the UChars needed */
1451     while(pSrc < pSrcLimit){
1452         ch = *pSrc;
1453         if(ch <= 0x7f) {
1454             reqLength++;
1455             ++pSrc;
1456         } else {
1457             if(ch >= 0xe0) {
1458                 if( /* handle U+0000..U+FFFF inline */
1459                     ch <= 0xef &&
1460                     ((pSrcLimit - pSrc) >= 3) &&
1461                     (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
1462                     (uint8_t)(pSrc[2] - 0x80) <= 0x3f
1463                 ) {
1464                     reqLength++;
1465                     pSrc += 3;
1466                     continue;
1467                 }
1468             } else {
1469                 if( /* handle U+0000..U+07FF inline */
1470                     ch >= 0xc0 &&
1471                     ((pSrcLimit - pSrc) >= 2) &&
1472                     (uint8_t)(pSrc[1] - 0x80) <= 0x3f
1473                 ) {
1474                     reqLength++;
1475                     pSrc += 2;
1476                     continue;
1477                 }
1478             }
1479
1480             if(subchar < 0) {
1481                 *pErrorCode = U_INVALID_CHAR_FOUND;
1482                 return NULL;
1483             } else {
1484                 /* function call for error cases */
1485                 ++pSrc; /* continue after the lead byte */
1486                 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1487                 ++numSubstitutions;
1488                 reqLength+=U16_LENGTH(ch);
1489             }
1490         }
1491     }
1492
1493     if(pNumSubstitutions!=NULL) {
1494         *pNumSubstitutions=numSubstitutions;
1495     }
1496
1497     reqLength+=(int32_t)(pDest - dest);
1498     if(pDestLength) {
1499         *pDestLength = reqLength;
1500     }
1501
1502     /* Terminate the buffer */
1503     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1504     return dest;
1505 }
1506
1507 U_CAPI char* U_EXPORT2
1508 u_strToJavaModifiedUTF8(
1509         char *dest,
1510         int32_t destCapacity,
1511         int32_t *pDestLength,
1512         const UChar *src,
1513         int32_t srcLength,
1514         UErrorCode *pErrorCode) {
1515     int32_t reqLength=0;
1516     uint32_t ch=0;
1517     uint8_t *pDest = (uint8_t *)dest;
1518     uint8_t *pDestLimit = pDest + destCapacity;
1519     const UChar *pSrcLimit;
1520     int32_t count;
1521
1522     /* args check */
1523     if(U_FAILURE(*pErrorCode)){
1524         return NULL;
1525     }
1526     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1527         (dest==NULL && destCapacity!=0) || destCapacity<0
1528     ) {
1529         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1530         return NULL;
1531     }
1532
1533     if(srcLength==-1) {
1534         /* Convert NUL-terminated ASCII, then find the string length. */
1535         while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
1536             *pDest++ = (uint8_t)ch;
1537             ++src;
1538         }
1539         if(ch == 0) {
1540             reqLength=(int32_t)(pDest - (uint8_t *)dest);
1541             if(pDestLength) {
1542                 *pDestLength = reqLength;
1543             }
1544
1545             /* Terminate the buffer */
1546             u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1547             return dest;
1548         }
1549         srcLength = u_strlen(src);
1550     }
1551
1552     /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1553     pSrcLimit = (src!=NULL)?(src+srcLength):NULL;
1554     for(;;) {
1555         count = (int32_t)(pDestLimit - pDest);
1556         srcLength = (int32_t)(pSrcLimit - src);
1557         if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
1558             /* fast ASCII loop */
1559             const UChar *prevSrc = src;
1560             int32_t delta;
1561             while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
1562                 *pDest++=(uint8_t)ch;
1563                 ++src;
1564             }
1565             delta = (int32_t)(src - prevSrc);
1566             count -= delta;
1567             srcLength -= delta;
1568         }
1569         /*
1570          * Each iteration of the inner loop progresses by at most 3 UTF-8
1571          * bytes and one UChar.
1572          */
1573         count /= 3;
1574         if(count > srcLength) {
1575             count = srcLength; /* min(remaining dest/3, remaining src) */
1576         }
1577         if(count < 3) {
1578             /*
1579              * Too much overhead if we get near the end of the string,
1580              * continue with the next loop.
1581              */
1582             break;
1583         }
1584         do {
1585             ch=*src++;
1586             if(ch <= 0x7f && ch != 0) {
1587                 *pDest++ = (uint8_t)ch;
1588             } else if(ch <= 0x7ff) {
1589                 *pDest++=(uint8_t)((ch>>6)|0xc0);
1590                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1591             } else {
1592                 *pDest++=(uint8_t)((ch>>12)|0xe0);
1593                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1594                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1595             }
1596         } while(--count > 0);
1597     }
1598
1599     while(src<pSrcLimit) {
1600         ch=*src++;
1601         if(ch <= 0x7f && ch != 0) {
1602             if(pDest<pDestLimit) {
1603                 *pDest++ = (uint8_t)ch;
1604             } else {
1605                 reqLength = 1;
1606                 break;
1607             }
1608         } else if(ch <= 0x7ff) {
1609             if((pDestLimit - pDest) >= 2) {
1610                 *pDest++=(uint8_t)((ch>>6)|0xc0);
1611                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1612             } else {
1613                 reqLength = 2;
1614                 break;
1615             }
1616         } else {
1617             if((pDestLimit - pDest) >= 3) {
1618                 *pDest++=(uint8_t)((ch>>12)|0xe0);
1619                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1620                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1621             } else {
1622                 reqLength = 3;
1623                 break;
1624             }
1625         }
1626     }
1627     while(src<pSrcLimit) {
1628         ch=*src++;
1629         if(ch <= 0x7f && ch != 0) {
1630             ++reqLength;
1631         } else if(ch<=0x7ff) {
1632             reqLength+=2;
1633         } else {
1634             reqLength+=3;
1635         }
1636     }
1637
1638     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1639     if(pDestLength){
1640         *pDestLength = reqLength;
1641     }
1642
1643     /* Terminate the buffer */
1644     u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1645     return dest;
1646 }