icuSources/common/ustrtrns.c

   1 /*
   2 ******************************************************************************
   3 *
   4 *   Copyright (C) 2001-2007, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 ******************************************************************************
   8 *
   9 * File ustrtrns.c
  10 *
  11 * Modification History:
  12 *
  13 *   Date        Name        Description
  14 *   9/10/2001    Ram    Creation.
  15 ******************************************************************************
  16 */
  17
  18 /*******************************************************************************
  19  *
  20  * u_strTo* and u_strFrom* APIs
  21  * WCS functions moved to ustr_wcs.c for better modularization
  22  *
  23  *******************************************************************************
  24  */
  25
  26
  27 #include "unicode/putil.h"
  28 #include "unicode/ustring.h"
  29 #include "cstring.h"
  30 #include "cmemory.h"
  31 #include "ustr_imp.h"
  32
  33 U_CAPI UChar* U_EXPORT2
  34 u_strFromUTF32(UChar   *dest,
  35                int32_t destCapacity,
  36                int32_t *pDestLength,
  37                const UChar32 *src,
  38                int32_t srcLength,
  39                UErrorCode *pErrorCode)
  40 {
  41     int32_t reqLength = 0;
  42     uint32_t ch =0;
  43     UChar *pDestLimit =dest+destCapacity;
  44     UChar *pDest = dest;
  45     const uint32_t *pSrc = (const uint32_t *)src;
  46
  47     /* args check */
  48     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
  49         return NULL;
  50     }
  51
  52     if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
  53         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
  54         return NULL;
  55     }
  56
  57      /* Check if the source is null terminated */
  58     if(srcLength == -1 ){
  59         while(((ch=*pSrc)!=0) && (pDest < pDestLimit)){
  60             ++pSrc;
  61             if(ch<=0xFFFF){
  62                 *(pDest++)=(UChar)ch;
  63             }else if(ch<=0x10ffff){
  64                 *(pDest++)=UTF16_LEAD(ch);
  65                 if(pDest<pDestLimit){
  66                     *(pDest++)=UTF16_TRAIL(ch);
  67                 }else{
  68                     reqLength++;
  69                     break;
  70                 }
  71             }else{
  72                 *pErrorCode = U_INVALID_CHAR_FOUND;
  73                 return NULL;
  74             }
  75         }
  76         while((ch=*pSrc++) != 0){
  77             reqLength+=UTF_CHAR_LENGTH(ch);
  78         }
  79     }else{
  80         const uint32_t* pSrcLimit = ((const uint32_t*)pSrc) + srcLength;
  81         while((pSrc < pSrcLimit) && (pDest < pDestLimit)){
  82             ch = *pSrc++;
  83             if(ch<=0xFFFF){
  84                 *(pDest++)=(UChar)ch;
  85             }else if(ch<=0x10FFFF){
  86                 *(pDest++)=UTF16_LEAD(ch);
  87                 if(pDest<pDestLimit){
  88                     *(pDest++)=UTF16_TRAIL(ch);
  89                 }else{
  90                     reqLength++;
  91                     break;
  92                 }
  93             }else{
  94                 *pErrorCode = U_INVALID_CHAR_FOUND;
  95                 return NULL;
  96             }
  97         }
  98         while(pSrc <pSrcLimit){
  99             ch = *pSrc++;
 100             reqLength+=UTF_CHAR_LENGTH(ch);
 101         }
 102     }
 103
 104     reqLength += (int32_t)(pDest - dest);
 105     if(pDestLength){
 106         *pDestLength = reqLength;
 107     }
 108
 109     /* Terminate the buffer */
 110     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
 111
 112     return dest;
 113 }
 114
 115
 116 U_CAPI UChar32* U_EXPORT2
 117 u_strToUTF32(UChar32 *dest,
 118              int32_t  destCapacity,
 119              int32_t  *pDestLength,
 120              const UChar *src,
 121              int32_t  srcLength,
 122              UErrorCode *pErrorCode)
 123 {
 124     const UChar* pSrc = src;
 125     const UChar* pSrcLimit;
 126     int32_t reqLength=0;
 127     uint32_t ch=0;
 128     uint32_t *pDest = (uint32_t *)dest;
 129     uint32_t *pDestLimit = pDest + destCapacity;
 130     UChar ch2=0;
 131
 132     /* args check */
 133     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
 134         return NULL;
 135     }
 136
 137
 138     if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
 139         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
 140         return NULL;
 141     }
 142
 143     if(srcLength==-1) {
 144         while((ch=*pSrc)!=0 && pDest!=pDestLimit) {
 145             ++pSrc;
 146             /*need not check for NUL because NUL fails UTF_IS_TRAIL() anyway*/
 147             if(UTF_IS_LEAD(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
 148                 ++pSrc;
 149                 ch=UTF16_GET_PAIR_VALUE(ch, ch2);
 150             }
 151             *(pDest++)= ch;
 152         }
 153         while((ch=*pSrc++)!=0) {
 154             if(UTF_IS_LEAD(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
 155                 ++pSrc;
 156             }
 157             ++reqLength;
 158         }
 159     } else {
 160         pSrcLimit = pSrc+srcLength;
 161         while(pSrc<pSrcLimit && pDest<pDestLimit) {
 162             ch=*pSrc++;
 163             if(UTF_IS_LEAD(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) {
 164                 ++pSrc;
 165                 ch=UTF16_GET_PAIR_VALUE(ch, ch2);
 166             }
 167             *(pDest++)= ch;
 168         }
 169         while(pSrc!=pSrcLimit) {
 170             ch=*pSrc++;
 171             if(UTF_IS_LEAD(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) {
 172                 ++pSrc;
 173             }
 174             ++reqLength;
 175         }
 176     }
 177
 178     reqLength+=(int32_t)(pDest - (uint32_t *)dest);
 179     if(pDestLength){
 180         *pDestLength = reqLength;
 181     }
 182
 183     /* Terminate the buffer */
 184     u_terminateUChar32s(dest,destCapacity,reqLength,pErrorCode);
 185
 186     return dest;
 187 }
 188
 189 /* for utf8_nextCharSafeBodyTerminated() */
 190 static const UChar32
 191 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
 192
 193 /*
 194  * Version of utf8_nextCharSafeBody() with the following differences:
 195  * - checks for NUL termination instead of length
 196  * - works with pointers instead of indexes
 197  * - always strict (strict==-1)
 198  *
 199  * *ps points to after the lead byte and will be moved to after the last trail byte.
 200  * c is the lead byte.
 201  * @return the code point, or U_SENTINEL
 202  */
 203 static UChar32
 204 utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
 205     const uint8_t *s=*ps;
 206     uint8_t trail, illegal=0;
 207     uint8_t count=UTF8_COUNT_TRAIL_BYTES(c);
 208     UTF8_MASK_LEAD_BYTE((c), count);
 209     /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
 210     switch(count) {
 211     /* each branch falls through to the next one */
 212     case 5:
 213     case 4:
 214         /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
 215         illegal=1;
 216         break;
 217     case 3:
 218         trail=(uint8_t)(*s++ - 0x80);
 219         c=(c<<6)|trail;
 220         if(trail>0x3f || c>=0x110) {
 221             /* not a trail byte, or code point>0x10ffff (outside Unicode) */
 222             illegal=1;
 223             break;
 224         }
 225     case 2:
 226         trail=(uint8_t)(*s++ - 0x80);
 227         if(trail>0x3f) {
 228             /* not a trail byte */
 229             illegal=1;
 230             break;
 231         }
 232         c=(c<<6)|trail;
 233     case 1:
 234         trail=(uint8_t)(*s++ - 0x80);
 235         if(trail>0x3f) {
 236             /* not a trail byte */
 237             illegal=1;
 238         }
 239         c=(c<<6)|trail;
 240         break;
 241     case 0:
 242         return U_SENTINEL;
 243     /* no default branch to optimize switch()  - all values are covered */
 244     }
 245
 246     /* correct sequence - all trail bytes have (b7..b6)==(10)? */
 247     /* illegal is also set if count>=4 */
 248     if(illegal || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) {
 249         /* error handling */
 250         /* don't go beyond this sequence */
 251         s=*ps;
 252         while(count>0 && UTF8_IS_TRAIL(*s)) {
 253             ++s;
 254             --count;
 255         }
 256         c=U_SENTINEL;
 257     }
 258     *ps=s;
 259     return c;
 260 }
 261
 262 /*
 263  * Version of utf8_nextCharSafeBody() with the following differences:
 264  * - works with pointers instead of indexes
 265  * - always strict (strict==-1)
 266  *
 267  * *ps points to after the lead byte and will be moved to after the last trail byte.
 268  * c is the lead byte.
 269  * @return the code point, or U_SENTINEL
 270  */
 271 static UChar32
 272 utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
 273     const uint8_t *s=*ps;
 274     uint8_t trail, illegal=0;
 275     uint8_t count=UTF8_COUNT_TRAIL_BYTES(c);
 276     if((limit-s)>=count) {
 277         UTF8_MASK_LEAD_BYTE((c), count);
 278         /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
 279         switch(count) {
 280         /* each branch falls through to the next one */
 281         case 5:
 282         case 4:
 283             /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
 284             illegal=1;
 285             break;
 286         case 3:
 287             trail=*s++;
 288             c=(c<<6)|(trail&0x3f);
 289             if(c<0x110) {
 290                 illegal|=(trail&0xc0)^0x80;
 291             } else {
 292                 /* code point>0x10ffff, outside Unicode */
 293                 illegal=1;
 294                 break;
 295             }
 296         case 2:
 297             trail=*s++;
 298             c=(c<<6)|(trail&0x3f);
 299             illegal|=(trail&0xc0)^0x80;
 300         case 1:
 301             trail=*s++;
 302             c=(c<<6)|(trail&0x3f);
 303             illegal|=(trail&0xc0)^0x80;
 304             break;
 305         case 0:
 306             return U_SENTINEL;
 307         /* no default branch to optimize switch()  - all values are covered */
 308         }
 309     } else {
 310         illegal=1; /* too few bytes left */
 311     }
 312
 313     /* correct sequence - all trail bytes have (b7..b6)==(10)? */
 314     /* illegal is also set if count>=4 */
 315     if(illegal || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) {
 316         /* error handling */
 317         /* don't go beyond this sequence */
 318         s=*ps;
 319         while(count>0 && s<limit && UTF8_IS_TRAIL(*s)) {
 320             ++s;
 321             --count;
 322         }
 323         c=U_SENTINEL;
 324     }
 325     *ps=s;
 326     return c;
 327 }
 328
  329 U_CAPI UChar* U_EXPORT2
  330 u_strFromUTF8WithSub(UChar *dest,
  331               int32_t destCapacity,
  332               int32_t *pDestLength,
  333               const char* src,
  334               int32_t srcLength,
  335               UChar32 subchar, int32_t *pNumSubstitutions,
  336               UErrorCode *pErrorCode){
          /*
           * Convert the UTF-8 bytes in src to UTF-16 in dest.
           *
           * - srcLength<0 selects the NUL-terminated code path, otherwise
           *   exactly srcLength bytes are read.
           * - Each ill-formed byte sequence is replaced by subchar and counted.
           *   If subchar is negative, the first ill-formed sequence instead sets
           *   U_INVALID_CHAR_FOUND and the function returns NULL.
           * - Once dest is full, the rest of src is only decoded and counted
           *   ("pre-flighted") so that the required length can be reported.
           * - On success *pDestLength (if not NULL) receives the required number
           *   of UChars and *pNumSubstitutions (if not NULL) the number of
           *   substitutions; neither is written on the error returns.
           *
           * Returns dest after handing it to u_terminateUChars(), or NULL on
           * failure.
           */
  337
  338     UChar *pDest = dest;
  339     UChar *pDestLimit = dest+destCapacity;
  340     UChar32 ch;
  341     int32_t reqLength = 0;
  342     const uint8_t* pSrc = (const uint8_t*) src;
  343     uint8_t t1, t2; /* trail bytes */
  344     int32_t numSubstitutions;
  345
  346     /* args check */
  347     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
  348         return NULL;
  349     }
  350
          /* A negative subchar passes this check; it is handled where substitutions happen. */
  351     if( (src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0) ||
  352         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
  353     ) {
  354         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
  355         return NULL;
  356     }
  357
  358     numSubstitutions=0;
  359
  360     /*
  361      * Inline processing of UTF-8 byte sequences:
  362      *
  363      * Byte sequences for the most common characters are handled inline in
  364      * the conversion loops. In order to reduce the path lengths for those
  365      * characters, the tests are arranged in a kind of binary search.
  366      * ASCII (<=0x7f) is checked first, followed by the dividing point
  367      * between 2- and 3-byte sequences (0xe0).
  368      * The 3-byte branch is tested first to speed up CJK text.
  369      * The compiler should combine the subtractions for the two tests for 0xe0.
  370      * Each branch then tests for the other end of its range.
  371      */
  372
  373     if(srcLength < 0){
  374         /*
  375          * Transform a NUL-terminated string.
  376          * The code explicitly checks for NULs only in the lead byte position.
  377          * A NUL byte in the trail byte position fails the trail byte range check anyway.
  378          */
  379         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
  380             if(ch <= 0x7f){
  381                 *pDest++=(UChar)ch;
  382                 ++pSrc;
  383             } else {
  384                 if(ch > 0xe0) {
  385                     if( /* handle U+1000..U+CFFF inline */
  386                         ch <= 0xec &&
  387                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
  388                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
  389                     ) {
  390                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
  391                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
  392                         pSrc += 3;
  393                         continue;
  394                     }
  395                 } else if(ch < 0xe0) {
  396                     if( /* handle U+0080..U+07FF inline */
  397                         ch >= 0xc2 &&
  398                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
  399                     ) {
  400                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
  401                         pSrc += 2;
  402                         continue;
  403                     }
  404                 }
  405
  406                 /* function call for "complicated" and error cases */
  407                 ++pSrc; /* continue after the lead byte */
  408                 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
                      /*
                       * ch<0: ill-formed sequence. Count a substitution and use
                       * subchar instead; a negative subchar turns this into an error.
                       */
  409                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
  410                     *pErrorCode = U_INVALID_CHAR_FOUND;
  411                     return NULL;
  412                 } else if(ch<=0xFFFF) {
  413                     *(pDest++)=(UChar)ch;
  414                 } else {
  415                     *(pDest++)=UTF16_LEAD(ch);
  416                     if(pDest<pDestLimit) {
  417                         *(pDest++)=UTF16_TRAIL(ch);
  418                     } else {
                              /* no room for the trail surrogate: count it and go on to pre-flighting */
  419                         reqLength++;
  420                         break;
  421                     }
  422                 }
  423             }
  424         }
  425
  426         /* Pre-flight the rest of the string. */
  427         while((ch = *pSrc) != 0) {
  428             if(ch <= 0x7f){
  429                 ++reqLength;
  430                 ++pSrc;
  431             } else {
  432                 if(ch > 0xe0) {
  433                     if( /* handle U+1000..U+CFFF inline */
  434                         ch <= 0xec &&
  435                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
  436                         (uint8_t)(pSrc[2] - 0x80) <= 0x3f
  437                     ) {
  438                         ++reqLength;
  439                         pSrc += 3;
  440                         continue;
  441                     }
  442                 } else if(ch < 0xe0) {
  443                     if( /* handle U+0080..U+07FF inline */
  444                         ch >= 0xc2 &&
  445                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f
  446                     ) {
  447                         ++reqLength;
  448                         pSrc += 2;
  449                         continue;
  450                     }
  451                 }
  452
  453                 /* function call for "complicated" and error cases */
  454                 ++pSrc; /* continue after the lead byte */
  455                 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
                      /* same substitute-or-fail handling as in the conversion loop above */
  456                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
  457                     *pErrorCode = U_INVALID_CHAR_FOUND;
  458                     return NULL;
  459                 }
  460                 reqLength += U16_LENGTH(ch);
  461             }
  462         }
  463     } else /* srcLength >= 0 */ {
  464         const uint8_t *pSrcLimit = pSrc + srcLength;
  465         int32_t count;
  466
  467         /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
  468         for(;;) {
  469             /*
  470              * Each iteration of the inner loop progresses by at most 3 UTF-8
  471              * bytes and one UChar, for most characters.
  472              * For supplementary code points (4 & 2), which are rare,
  473              * there is an additional adjustment.
  474              */
  475             count = (int32_t)(pDestLimit - pDest);
                  /* from here on the srcLength parameter is reused as scratch space */
  476             srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
  477             if(count > srcLength) {
  478                 count = srcLength; /* min(remaining dest, remaining src/3) */
  479             }
  480             if(count < 3) {
  481                 /*
  482                  * Too much overhead if we get near the end of the string,
  483                  * continue with the next loop.
  484                  */
  485                 break;
  486             }
  487
  488             do {
  489                 ch = *pSrc;
  490                 if(ch <= 0x7f){
  491                     *pDest++=(UChar)ch;
  492                     ++pSrc;
  493                 } else {
  494                     if(ch > 0xe0) {
  495                         if( /* handle U+1000..U+CFFF inline */
  496                             ch <= 0xec &&
  497                             (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
  498                             (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
  499                         ) {
  500                             /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
  501                             *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
  502                             pSrc += 3;
  503                             continue;
  504                         }
  505                     } else if(ch < 0xe0) {
  506                         if( /* handle U+0080..U+07FF inline */
  507                             ch >= 0xc2 &&
  508                             (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
  509                         ) {
  510                             *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
  511                             pSrc += 2;
  512                             continue;
  513                         }
  514                     }
  515
  516                     if(ch >= 0xf0 || subchar > 0xffff) {
  517                         /*
  518                          * We may read up to six bytes and write up to two UChars,
  519                          * which we didn't account for with computing count,
  520                          * so we adjust it here.
  521                          */
                              /*
                               * If the budget is used up, leave the do-while before
                               * consuming anything; the enclosing for(;;) recomputes count.
                               */
  522                         if(--count == 0) {
  523                             break;
  524                         }
  525                     }
  526
  527                     /* function call for "complicated" and error cases */
  528                     ++pSrc; /* continue after the lead byte */
  529                     ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
                          /* ch<0: ill-formed sequence; substitute subchar, or fail if subchar is negative */
  530                     if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
  531                         *pErrorCode = U_INVALID_CHAR_FOUND;
  532                         return NULL;
  533                     }else if(ch<=0xFFFF){
  534                         *(pDest++)=(UChar)ch;
  535                     }else{
  536                         *(pDest++)=UTF16_LEAD(ch);
  537                         if(pDest<pDestLimit){
  538                             *(pDest++)=UTF16_TRAIL(ch);
  539                         }else{
  540                             reqLength++;
  541                             break;
  542                         }
  543                     }
  544                 }
  545             } while(--count > 0);
  546         }
  547
          /* Careful loop for the tail: checks pSrcLimit and pDestLimit for every character. */
  548         while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
  549             ch = *pSrc;
  550             if(ch <= 0x7f){
  551                 *pDest++=(UChar)ch;
  552                 ++pSrc;
  553             } else {
  554                 if(ch > 0xe0) {
  555                     if( /* handle U+1000..U+CFFF inline */
  556                         ch <= 0xec &&
  557                         ((pSrcLimit - pSrc) >= 3) &&
  558                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
  559                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
  560                     ) {
  561                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
  562                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
  563                         pSrc += 3;
  564                         continue;
  565                     }
  566                 } else if(ch < 0xe0) {
  567                     if( /* handle U+0080..U+07FF inline */
  568                         ch >= 0xc2 &&
  569                         ((pSrcLimit - pSrc) >= 2) &&
  570                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
  571                     ) {
  572                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
  573                         pSrc += 2;
  574                         continue;
  575                     }
  576                 }
  577
  578                 /* function call for "complicated" and error cases */
  579                 ++pSrc; /* continue after the lead byte */
  580                 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
                      /* ch<0: ill-formed sequence; substitute subchar, or fail if subchar is negative */
  581                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
  582                     *pErrorCode = U_INVALID_CHAR_FOUND;
  583                     return NULL;
  584                 }else if(ch<=0xFFFF){
  585                     *(pDest++)=(UChar)ch;
  586                 }else{
  587                     *(pDest++)=UTF16_LEAD(ch);
  588                     if(pDest<pDestLimit){
  589                         *(pDest++)=UTF16_TRAIL(ch);
  590                     }else{
                              /* no room for the trail surrogate: count it and go on to pre-flighting */
  591                         reqLength++;
  592                         break;
  593                     }
  594                 }
  595             }
  596         }
  597         /* do not fill the dest buffer, just count the UChars needed */
  598         while(pSrc < pSrcLimit){
  599             ch = *pSrc;
  600             if(ch <= 0x7f){
  601                 reqLength++;
  602                 ++pSrc;
  603             } else {
  604                 if(ch > 0xe0) {
  605                     if( /* handle U+1000..U+CFFF inline */
  606                         ch <= 0xec &&
  607                         ((pSrcLimit - pSrc) >= 3) &&
  608                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
  609                         (uint8_t)(pSrc[2] - 0x80) <= 0x3f
  610                     ) {
  611                         reqLength++;
  612                         pSrc += 3;
  613                         continue;
  614                     }
  615                 } else if(ch < 0xe0) {
  616                     if( /* handle U+0080..U+07FF inline */
  617                         ch >= 0xc2 &&
  618                         ((pSrcLimit - pSrc) >= 2) &&
  619                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f
  620                     ) {
  621                         reqLength++;
  622                         pSrc += 2;
  623                         continue;
  624                     }
  625                 }
  626
  627                 /* function call for "complicated" and error cases */
  628                 ++pSrc; /* continue after the lead byte */
  629                 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
                      /* same substitute-or-fail handling as in the conversion loops above */
  630                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
  631                     *pErrorCode = U_INVALID_CHAR_FOUND;
  632                     return NULL;
  633                 }
  634                 reqLength+=UTF_CHAR_LENGTH(ch);
  635             }
  636         }
  637     }
  638
          /* UChars actually written plus UChars that were only counted */
  639     reqLength+=(int32_t)(pDest - dest);
  640
  641     if(pNumSubstitutions!=NULL) {
  642         *pNumSubstitutions=numSubstitutions;
  643     }
  644
  645     if(pDestLength){
  646         *pDestLength = reqLength;
  647     }
  648
  649     /* Terminate the buffer */
  650     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
  651
  652     return dest;
  653 }
 654
 655 U_CAPI UChar* U_EXPORT2
 656 u_strFromUTF8(UChar *dest,
 657               int32_t destCapacity,
 658               int32_t *pDestLength,
 659               const char* src,
 660               int32_t srcLength,
 661               UErrorCode *pErrorCode){
 662     return u_strFromUTF8WithSub(
 663             dest, destCapacity, pDestLength,
 664             src, srcLength,
 665             U_SENTINEL, NULL,
 666             pErrorCode);
 667 }
 668
 669 U_CAPI UChar * U_EXPORT2
 670 u_strFromUTF8Lenient(UChar *dest,
 671                      int32_t destCapacity,
 672                      int32_t *pDestLength,
 673                      const char *src,
 674                      int32_t srcLength,
 675                      UErrorCode *pErrorCode) {
 676
 677     UChar *pDest = dest;
 678     UChar32 ch;
 679     int32_t reqLength = 0;
 680     uint8_t* pSrc = (uint8_t*) src;
 681
 682     /* args check */
 683     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
 684         return NULL;
 685     }
 686
 687     if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)) {
 688         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
 689         return NULL;
 690     }
 691
 692     if(srcLength < 0) {
 693         /* Transform a NUL-terminated string. */
 694         UChar *pDestLimit = dest+destCapacity;
 695         uint8_t t1, t2, t3; /* trail bytes */
 696
 697         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
 698             if(ch < 0xc0) {
 699                 /*
 700                  * ASCII, or a trail byte in lead position which is treated like
 701                  * a single-byte sequence for better character boundary
 702                  * resynchronization after illegal sequences.
 703                  */
 704                 *pDest++=(UChar)ch;
 705                 ++pSrc;
 706                 continue;
 707             } else if(ch < 0xe0) { /* U+0080..U+07FF */
 708                 if((t1 = pSrc[1]) != 0) {
 709                     /* 0x3080 = (0xc0 << 6) + 0x80 */
 710                     *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
 711                     pSrc += 2;
 712                     continue;
 713                 }
 714             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
 715                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
 716                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 717                     /* 0x2080 = (0x80 << 6) + 0x80 */
 718                     *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
 719                     pSrc += 3;
 720                     continue;
 721                 }
 722             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
 723                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
 724                     pSrc += 4;
 725                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
 726                     ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
 727                     *(pDest++) = U16_LEAD(ch);
 728                     if(pDest < pDestLimit) {
 729                         *(pDest++) = U16_TRAIL(ch);
 730                     } else {
 731                         reqLength = 1;
 732                         break;
 733                     }
 734                     continue;
 735                 }
 736             }
 737
 738             /* truncated character at the end */
 739             *pDest++ = 0xfffd;
 740             while(*++pSrc != 0) {}
 741             break;
 742         }
 743
 744         /* Pre-flight the rest of the string. */
 745         while((ch = *pSrc) != 0) {
 746             if(ch < 0xc0) {
 747                 /*
 748                  * ASCII, or a trail byte in lead position which is treated like
 749                  * a single-byte sequence for better character boundary
 750                  * resynchronization after illegal sequences.
 751                  */
 752                 ++reqLength;
 753                 ++pSrc;
 754                 continue;
 755             } else if(ch < 0xe0) { /* U+0080..U+07FF */
 756                 if(pSrc[1] != 0) {
 757                     ++reqLength;
 758                     pSrc += 2;
 759                     continue;
 760                 }
 761             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
 762                 if(pSrc[1] != 0 && pSrc[2] != 0) {
 763                     ++reqLength;
 764                     pSrc += 3;
 765                     continue;
 766                 }
 767             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
 768                 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
 769                     reqLength += 2;
 770                     pSrc += 4;
 771                     continue;
 772                 }
 773             }
 774
 775             /* truncated character at the end */
 776             ++reqLength;
 777             break;
 778         }
 779     } else /* srcLength >= 0 */ {
 780         const uint8_t *pSrcLimit = pSrc + srcLength;
 781
 782         /*
 783          * This function requires that if srcLength is given, then it must be
  784          * destCapacity >= srcLength so that we need not check for
 785          * destination buffer overflow in the loop.
 786          */
 787         if(destCapacity < srcLength) {
 788             if(pDestLength != NULL) {
 789                 *pDestLength = srcLength; /* this likely overestimates the true destLength! */
 790             }
 791             *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
 792             return NULL;
 793         }
 794
 795         if((pSrcLimit - pSrc) >= 4) {
 796             pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
 797
 798             /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
 799             do {
 800                 ch = *pSrc++;
 801                 if(ch < 0xc0) {
 802                     /*
 803                      * ASCII, or a trail byte in lead position which is treated like
 804                      * a single-byte sequence for better character boundary
 805                      * resynchronization after illegal sequences.
 806                      */
 807                     *pDest++=(UChar)ch;
 808                 } else if(ch < 0xe0) { /* U+0080..U+07FF */
 809                     /* 0x3080 = (0xc0 << 6) + 0x80 */
 810                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
 811                 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
 812                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 813                     /* 0x2080 = (0x80 << 6) + 0x80 */
 814                     ch = (ch << 12) + (*pSrc++ << 6);
 815                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
 816                 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
 817                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
 818                     ch = (ch << 18) + (*pSrc++ << 12);
 819                     ch += *pSrc++ << 6;
 820                     ch += *pSrc++ - 0x3c82080;
 821                     *(pDest++) = U16_LEAD(ch);
 822                     *(pDest++) = U16_TRAIL(ch);
 823                 }
 824             } while(pSrc < pSrcLimit);
 825
 826             pSrcLimit += 3; /* restore original pSrcLimit */
 827         }
 828
 829         while(pSrc < pSrcLimit) {
 830             ch = *pSrc++;
 831             if(ch < 0xc0) {
 832                 /*
 833                  * ASCII, or a trail byte in lead position which is treated like
 834                  * a single-byte sequence for better character boundary
 835                  * resynchronization after illegal sequences.
 836                  */
 837                 *pDest++=(UChar)ch;
 838                 continue;
 839             } else if(ch < 0xe0) { /* U+0080..U+07FF */
 840                 if(pSrc < pSrcLimit) {
 841                     /* 0x3080 = (0xc0 << 6) + 0x80 */
 842                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
 843                     continue;
 844                 }
 845             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
 846                 if((pSrcLimit - pSrc) >= 2) {
 847                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
 848                     /* 0x2080 = (0x80 << 6) + 0x80 */
 849                     ch = (ch << 12) + (*pSrc++ << 6);
 850                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
 851                     pSrc += 3;
 852                     continue;
 853                 }
 854             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
 855                 if((pSrcLimit - pSrc) >= 3) {
 856                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
 857                     ch = (ch << 18) + (*pSrc++ << 12);
 858                     ch += *pSrc++ << 6;
 859                     ch += *pSrc++ - 0x3c82080;
 860                     *(pDest++) = U16_LEAD(ch);
 861                     *(pDest++) = U16_TRAIL(ch);
 862                     pSrc += 4;
 863                     continue;
 864                 }
 865             }
 866
 867             /* truncated character at the end */
 868             *pDest++ = 0xfffd;
 869             break;
 870         }
 871     }
 872
 873     reqLength+=(int32_t)(pDest - dest);
 874
 875     if(pDestLength){
 876         *pDestLength = reqLength;
 877     }
 878
 879     /* Terminate the buffer */
 880     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
 881
 882     return dest;
 883 }
 884
 885 static U_INLINE uint8_t *
 886 _appendUTF8(uint8_t *pDest, UChar32 c) {
 887     /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
 888     if((c)<=0x7f) {
 889         *pDest++=(uint8_t)c;
 890     } else if(c<=0x7ff) {
 891         *pDest++=(uint8_t)((c>>6)|0xc0);
 892         *pDest++=(uint8_t)((c&0x3f)|0x80);
 893     } else if(c<=0xffff) {
 894         *pDest++=(uint8_t)((c>>12)|0xe0);
 895         *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
 896         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
 897     } else /* if((uint32_t)(c)<=0x10ffff) */ {
 898         *pDest++=(uint8_t)(((c)>>18)|0xf0);
 899         *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
 900         *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
 901         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
 902     }
 903     return pDest;
 904 }
 905
 906
 907 U_CAPI char* U_EXPORT2
 908 u_strToUTF8WithSub(char *dest,
 909             int32_t destCapacity,
 910             int32_t *pDestLength,
 911             const UChar *pSrc,
 912             int32_t srcLength,
 913             UChar32 subchar, int32_t *pNumSubstitutions,
 914             UErrorCode *pErrorCode){
 915
 916     int32_t reqLength=0;
 917     uint32_t ch=0,ch2=0;
 918     uint8_t *pDest = (uint8_t *)dest;
 919     uint8_t *pDestLimit = pDest + destCapacity;
 920     int32_t numSubstitutions;
 921
 922     /* args check */
 923     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
 924         return NULL;
 925     }
 926
 927     if( (pSrc==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0) ||
 928         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
 929     ) {
 930         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
 931         return NULL;
 932     }
 933
 934     numSubstitutions=0;
 935
 936     if(srcLength==-1) {
 937         while((ch=*pSrc)!=0) {
 938             ++pSrc;
 939             if(ch <= 0x7f) {
 940                 if(pDest<pDestLimit) {
 941                     *pDest++ = (char)ch;
 942                 } else {
 943                     reqLength = 1;
 944                     break;
 945                 }
 946             } else if(ch <= 0x7ff) {
 947                 if((pDestLimit - pDest) >= 2) {
 948                     *pDest++=(uint8_t)((ch>>6)|0xc0);
 949                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
 950                 } else {
 951                     reqLength = 2;
 952                     break;
 953                 }
 954             } else if(ch <= 0xd7ff || ch >= 0xe000) {
 955                 if((pDestLimit - pDest) >= 3) {
 956                     *pDest++=(uint8_t)((ch>>12)|0xe0);
 957                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
 958                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
 959                 } else {
 960                     reqLength = 3;
 961                     break;
 962                 }
 963             } else /* ch is a surrogate */ {
 964                 int32_t length;
 965
 966                 /*need not check for NUL because NUL fails UTF_IS_TRAIL() anyway*/
 967                 if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
 968                     ++pSrc;
 969                     ch=UTF16_GET_PAIR_VALUE(ch, ch2);
 970                 } else if(subchar>=0) {
 971                     ch=subchar;
 972                     ++numSubstitutions;
 973                 } else {
 974                     /* Unicode 3.2 forbids surrogate code points in UTF-8 */
 975                     *pErrorCode = U_INVALID_CHAR_FOUND;
 976                     return NULL;
 977                 }
 978
 979                 length = U8_LENGTH(ch);
 980                 if((pDestLimit - pDest) >= length) {
 981                     /* convert and append*/
 982                     pDest=_appendUTF8(pDest, ch);
 983                 } else {
 984                     reqLength = length;
 985                     break;
 986                 }
 987             }
 988         }
 989         while((ch=*pSrc++)!=0) {
 990             if(ch<=0x7f) {
 991                 ++reqLength;
 992             } else if(ch<=0x7ff) {
 993                 reqLength+=2;
 994             } else if(!UTF_IS_SURROGATE(ch)) {
 995                 reqLength+=3;
 996             } else if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
 997                 ++pSrc;
 998                 reqLength+=4;
 999             } else if(subchar>=0) {
1000                 reqLength+=U8_LENGTH(subchar);
1001                 ++numSubstitutions;
1002             } else {
1003                 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1004                 *pErrorCode = U_INVALID_CHAR_FOUND;
1005                 return NULL;
1006             }
1007         }
1008     } else {
1009         const UChar *pSrcLimit = pSrc+srcLength;
1010         int32_t count;
1011
1012         /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1013         for(;;) {
1014             /*
1015              * Each iteration of the inner loop progresses by at most 3 UTF-8
1016              * bytes and one UChar, for most characters.
1017              * For supplementary code points (4 & 2), which are rare,
1018              * there is an additional adjustment.
1019              */
1020             count = (int32_t)((pDestLimit - pDest) / 3);
1021             srcLength = (int32_t)(pSrcLimit - pSrc);
1022             if(count > srcLength) {
1023                 count = srcLength; /* min(remaining dest/3, remaining src) */
1024             }
1025             if(count < 3) {
1026                 /*
1027                  * Too much overhead if we get near the end of the string,
1028                  * continue with the next loop.
1029                  */
1030                 break;
1031             }
1032             do {
1033                 ch=*pSrc++;
1034                 if(ch <= 0x7f) {
1035                     *pDest++ = (char)ch;
1036                 } else if(ch <= 0x7ff) {
1037                     *pDest++=(uint8_t)((ch>>6)|0xc0);
1038                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1039                 } else if(ch <= 0xd7ff || ch >= 0xe000) {
1040                     *pDest++=(uint8_t)((ch>>12)|0xe0);
1041                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1042                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1043                 } else /* ch is a surrogate */ {
1044                     /*
1045                      * We will read two UChars and probably output four bytes,
1046                      * which we didn't account for with computing count,
1047                      * so we adjust it here.
1048                      */
1049                     if(--count == 0) {
1050                         --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
1051                         break;  /* recompute count */
1052                     }
1053
1054                     if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
1055                         ++pSrc;
1056                         ch=UTF16_GET_PAIR_VALUE(ch, ch2);
1057
1058                         /* writing 4 bytes per 2 UChars is ok */
1059                         *pDest++=(uint8_t)((ch>>18)|0xf0);
1060                         *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
1061                         *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1062                         *pDest++=(uint8_t)((ch&0x3f)|0x80);
1063                     } else  {
1064                         /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1065                         if(subchar>=0) {
1066                             ch=subchar;
1067                             ++numSubstitutions;
1068                         } else {
1069                             *pErrorCode = U_INVALID_CHAR_FOUND;
1070                             return NULL;
1071                         }
1072
1073                         /* convert and append*/
1074                         pDest=_appendUTF8(pDest, ch);
1075                     }
1076                 }
1077             } while(--count > 0);
1078         }
1079
1080         while(pSrc<pSrcLimit) {
1081             ch=*pSrc++;
1082             if(ch <= 0x7f) {
1083                 if(pDest<pDestLimit) {
1084                     *pDest++ = (char)ch;
1085                 } else {
1086                     reqLength = 1;
1087                     break;
1088                 }
1089             } else if(ch <= 0x7ff) {
1090                 if((pDestLimit - pDest) >= 2) {
1091                     *pDest++=(uint8_t)((ch>>6)|0xc0);
1092                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1093                 } else {
1094                     reqLength = 2;
1095                     break;
1096                 }
1097             } else if(ch <= 0xd7ff || ch >= 0xe000) {
1098                 if((pDestLimit - pDest) >= 3) {
1099                     *pDest++=(uint8_t)((ch>>12)|0xe0);
1100                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1101                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1102                 } else {
1103                     reqLength = 3;
1104                     break;
1105                 }
1106             } else /* ch is a surrogate */ {
1107                 int32_t length;
1108
1109                 if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) {
1110                     ++pSrc;
1111                     ch=UTF16_GET_PAIR_VALUE(ch, ch2);
1112                 } else if(subchar>=0) {
1113                     ch=subchar;
1114                     ++numSubstitutions;
1115                 } else {
1116                     /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1117                     *pErrorCode = U_INVALID_CHAR_FOUND;
1118                     return NULL;
1119                 }
1120
1121                 length = U8_LENGTH(ch);
1122                 if((pDestLimit - pDest) >= length) {
1123                     /* convert and append*/
1124                     pDest=_appendUTF8(pDest, ch);
1125                 } else {
1126                     reqLength = length;
1127                     break;
1128                 }
1129             }
1130         }
1131         while(pSrc<pSrcLimit) {
1132             ch=*pSrc++;
1133             if(ch<=0x7f) {
1134                 ++reqLength;
1135             } else if(ch<=0x7ff) {
1136                 reqLength+=2;
1137             } else if(!UTF_IS_SURROGATE(ch)) {
1138                 reqLength+=3;
1139             } else if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) {
1140                 ++pSrc;
1141                 reqLength+=4;
1142             } else if(subchar>=0) {
1143                 reqLength+=U8_LENGTH(subchar);
1144                 ++numSubstitutions;
1145             } else {
1146                 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1147                 *pErrorCode = U_INVALID_CHAR_FOUND;
1148                 return NULL;
1149             }
1150         }
1151     }
1152
1153     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1154
1155     if(pNumSubstitutions!=NULL) {
1156         *pNumSubstitutions=numSubstitutions;
1157     }
1158
1159     if(pDestLength){
1160         *pDestLength = reqLength;
1161     }
1162
1163     /* Terminate the buffer */
1164     u_terminateChars((char*)dest,destCapacity,reqLength,pErrorCode);
1165
1166     return (char*)dest;
1167 }
1168
1169 U_CAPI char* U_EXPORT2
1170 u_strToUTF8(char *dest,
1171             int32_t destCapacity,
1172             int32_t *pDestLength,
1173             const UChar *pSrc,
1174             int32_t srcLength,
1175             UErrorCode *pErrorCode){
1176     return u_strToUTF8WithSub(
1177             dest, destCapacity, pDestLength,
1178             pSrc, srcLength,
1179             U_SENTINEL, NULL,
1180             pErrorCode);
1181 }