]>
git.saurik.com Git - apple/icu.git/blob - icuSources/common/ustrtrns.cpp
2 ******************************************************************************
4 * Copyright (C) 2001-2016, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
11 * Modification History:
13 * Date Name Description
14 * 9/10/2001 Ram Creation.
15 ******************************************************************************
18 /*******************************************************************************
20 * u_strTo* and u_strFrom* APIs
21 * WCS functions moved to ustr_wcs.c for better modularization
23 *******************************************************************************
27 #include "unicode/putil.h"
28 #include "unicode/ustring.h"
29 #include "unicode/utf.h"
30 #include "unicode/utf8.h"
31 #include "unicode/utf16.h"
37 U_CAPI UChar
* U_EXPORT2
38 u_strFromUTF32WithSub(UChar
*dest
,
43 UChar32 subchar
, int32_t *pNumSubstitutions
,
44 UErrorCode
*pErrorCode
) {
45 const UChar32
*srcLimit
;
50 int32_t numSubstitutions
;
53 if(U_FAILURE(*pErrorCode
)){
56 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
57 (destCapacity
<0) || (dest
== NULL
&& destCapacity
> 0) ||
58 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
60 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
64 if(pNumSubstitutions
!= NULL
) {
65 *pNumSubstitutions
= 0;
69 destLimit
= (dest
!=NULL
)?(dest
+ destCapacity
):NULL
;
74 /* simple loop for conversion of a NUL-terminated BMP string */
75 while((ch
=*src
) != 0 &&
76 ((uint32_t)ch
< 0xd800 || (0xe000 <= ch
&& ch
<= 0xffff))) {
78 if(pDest
< destLimit
) {
86 /* "complicated" case, find the end of the remaining string */
87 while(*++srcLimit
!= 0) {}
90 srcLimit
= (src
!=NULL
)?(src
+ srcLength
):NULL
;
93 /* convert with length */
94 while(src
< srcLimit
) {
97 /* usually "loops" once; twice only for writing subchar */
98 if((uint32_t)ch
< 0xd800 || (0xe000 <= ch
&& ch
<= 0xffff)) {
99 if(pDest
< destLimit
) {
100 *pDest
++ = (UChar
)ch
;
105 } else if(0x10000 <= ch
&& ch
<= 0x10ffff) {
106 if(pDest
!=NULL
&& ((pDest
+ 2) <= destLimit
)) {
107 *pDest
++ = U16_LEAD(ch
);
108 *pDest
++ = U16_TRAIL(ch
);
113 } else if((ch
= subchar
) < 0) {
114 /* surrogate code point, or not a Unicode code point at all */
115 *pErrorCode
= U_INVALID_CHAR_FOUND
;
123 reqLength
+= (int32_t)(pDest
- dest
);
125 *pDestLength
= reqLength
;
127 if(pNumSubstitutions
!= NULL
) {
128 *pNumSubstitutions
= numSubstitutions
;
131 /* Terminate the buffer */
132 u_terminateUChars(dest
, destCapacity
, reqLength
, pErrorCode
);
137 U_CAPI UChar
* U_EXPORT2
138 u_strFromUTF32(UChar
*dest
,
139 int32_t destCapacity
,
140 int32_t *pDestLength
,
143 UErrorCode
*pErrorCode
) {
144 return u_strFromUTF32WithSub(
145 dest
, destCapacity
, pDestLength
,
151 U_CAPI UChar32
* U_EXPORT2
152 u_strToUTF32WithSub(UChar32
*dest
,
153 int32_t destCapacity
,
154 int32_t *pDestLength
,
157 UChar32 subchar
, int32_t *pNumSubstitutions
,
158 UErrorCode
*pErrorCode
) {
159 const UChar
*srcLimit
;
165 int32_t numSubstitutions
;
168 if(U_FAILURE(*pErrorCode
)){
171 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
172 (destCapacity
<0) || (dest
== NULL
&& destCapacity
> 0) ||
173 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
175 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
179 if(pNumSubstitutions
!= NULL
) {
180 *pNumSubstitutions
= 0;
184 destLimit
= (dest
!=NULL
)?(dest
+ destCapacity
):NULL
;
186 numSubstitutions
= 0;
189 /* simple loop for conversion of a NUL-terminated BMP string */
190 while((ch
=*src
) != 0 && !U16_IS_SURROGATE(ch
)) {
192 if(pDest
< destLimit
) {
200 /* "complicated" case, find the end of the remaining string */
201 while(*++srcLimit
!= 0) {}
204 srcLimit
= (src
!=NULL
)?(src
+ srcLength
):NULL
;
207 /* convert with length */
208 while(src
< srcLimit
) {
210 if(!U16_IS_SURROGATE(ch
)) {
211 /* write or count ch below */
212 } else if(U16_IS_SURROGATE_LEAD(ch
) && src
< srcLimit
&& U16_IS_TRAIL(ch2
= *src
)) {
214 ch
= U16_GET_SUPPLEMENTARY(ch
, ch2
);
215 } else if((ch
= subchar
) < 0) {
216 /* unpaired surrogate */
217 *pErrorCode
= U_INVALID_CHAR_FOUND
;
222 if(pDest
< destLimit
) {
229 reqLength
+= (int32_t)(pDest
- dest
);
231 *pDestLength
= reqLength
;
233 if(pNumSubstitutions
!= NULL
) {
234 *pNumSubstitutions
= numSubstitutions
;
237 /* Terminate the buffer */
238 u_terminateUChar32s(dest
, destCapacity
, reqLength
, pErrorCode
);
243 U_CAPI UChar32
* U_EXPORT2
244 u_strToUTF32(UChar32
*dest
,
245 int32_t destCapacity
,
246 int32_t *pDestLength
,
249 UErrorCode
*pErrorCode
) {
250 return u_strToUTF32WithSub(
251 dest
, destCapacity
, pDestLength
,
257 /* for utf8_nextCharSafeBodyTerminated() */
259 utf8_minLegal
[4]={ 0, 0x80, 0x800, 0x10000 };
262 * Version of utf8_nextCharSafeBody() with the following differences:
263 * - checks for NUL termination instead of length
264 * - works with pointers instead of indexes
265 * - always strict (strict==-1)
267 * *ps points to after the lead byte and will be moved to after the last trail byte.
268 * c is the lead byte.
269 * @return the code point, or U_SENTINEL
272 utf8_nextCharSafeBodyTerminated(const uint8_t **ps
, UChar32 c
) {
273 const uint8_t *s
=*ps
;
274 uint8_t trail
, illegal
=0;
275 uint8_t count
=U8_COUNT_TRAIL_BYTES(c
);
277 U8_MASK_LEAD_BYTE((c
), count
);
278 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
280 /* each branch falls through to the next one */
283 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
287 trail
=(uint8_t)(*s
++ - 0x80);
289 if(trail
>0x3f || c
>=0x110) {
290 /* not a trail byte, or code point>0x10ffff (outside Unicode) */
296 trail
=(uint8_t)(*s
++ - 0x80);
298 /* not a trail byte */
305 trail
=(uint8_t)(*s
++ - 0x80);
307 /* not a trail byte */
314 /* no default branch to optimize switch() - all values are covered */
317 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
318 /* illegal is also set if count>=4 */
319 if(illegal
|| c
<utf8_minLegal
[count
] || U_IS_SURROGATE(c
)) {
321 /* don't go beyond this sequence */
323 while(count
>0 && U8_IS_TRAIL(*s
)) {
334 * Version of utf8_nextCharSafeBody() with the following differences:
335 * - works with pointers instead of indexes
336 * - always strict (strict==-1)
338 * *ps points to after the lead byte and will be moved to after the last trail byte.
339 * c is the lead byte.
340 * @return the code point, or U_SENTINEL
343 utf8_nextCharSafeBodyPointer(const uint8_t **ps
, const uint8_t *limit
, UChar32 c
) {
344 const uint8_t *s
=*ps
;
345 uint8_t trail
, illegal
=0;
346 uint8_t count
=U8_COUNT_TRAIL_BYTES(c
);
347 if((limit
-s
)>=count
) {
348 U8_MASK_LEAD_BYTE((c
), count
);
349 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
351 /* each branch falls through to the next one */
354 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
359 c
=(c
<<6)|(trail
&0x3f);
361 illegal
|=(trail
&0xc0)^0x80;
363 /* code point>0x10ffff, outside Unicode */
370 c
=(c
<<6)|(trail
&0x3f);
371 illegal
|=(trail
&0xc0)^0x80;
375 c
=(c
<<6)|(trail
&0x3f);
376 illegal
|=(trail
&0xc0)^0x80;
380 /* no default branch to optimize switch() - all values are covered */
383 illegal
=1; /* too few bytes left */
386 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
387 /* illegal is also set if count>=4 */
388 U_ASSERT(illegal
|| count
<UPRV_LENGTHOF(utf8_minLegal
));
389 if(illegal
|| c
<utf8_minLegal
[count
] || U_IS_SURROGATE(c
)) {
391 /* don't go beyond this sequence */
393 while(count
>0 && s
<limit
&& U8_IS_TRAIL(*s
)) {
403 U_CAPI UChar
* U_EXPORT2
404 u_strFromUTF8WithSub(UChar
*dest
,
405 int32_t destCapacity
,
406 int32_t *pDestLength
,
409 UChar32 subchar
, int32_t *pNumSubstitutions
,
410 UErrorCode
*pErrorCode
){
412 UChar
*pDestLimit
= dest
+destCapacity
;
414 int32_t reqLength
= 0;
415 const uint8_t* pSrc
= (const uint8_t*) src
;
416 uint8_t t1
, t2
; /* trail bytes */
417 int32_t numSubstitutions
;
420 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)){
424 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
425 (destCapacity
<0) || (dest
== NULL
&& destCapacity
> 0) ||
426 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
428 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
432 if(pNumSubstitutions
!=NULL
) {
433 *pNumSubstitutions
=0;
438 * Inline processing of UTF-8 byte sequences:
440 * Byte sequences for the most common characters are handled inline in
441 * the conversion loops. In order to reduce the path lengths for those
442 * characters, the tests are arranged in a kind of binary search.
443 * ASCII (<=0x7f) is checked first, followed by the dividing point
444 * between 2- and 3-byte sequences (0xe0).
445 * The 3-byte branch is tested first to speed up CJK text.
446 * The compiler should combine the subtractions for the two tests for 0xe0.
447 * Each branch then tests for the other end of its range.
452 * Transform a NUL-terminated string.
453 * The code explicitly checks for NULs only in the lead byte position.
454 * A NUL byte in the trail byte position fails the trail byte range check anyway.
456 while(((ch
= *pSrc
) != 0) && (pDest
< pDestLimit
)) {
462 if( /* handle U+1000..U+CFFF inline */
464 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f &&
465 (t2
= (uint8_t)(pSrc
[2] - 0x80)) <= 0x3f
467 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
468 *pDest
++ = (UChar
)((ch
<< 12) | (t1
<< 6) | t2
);
472 } else if(ch
< 0xe0) {
473 if( /* handle U+0080..U+07FF inline */
475 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f
477 *pDest
++ = (UChar
)(((ch
& 0x1f) << 6) | t1
);
483 /* function call for "complicated" and error cases */
484 ++pSrc
; /* continue after the lead byte */
485 ch
=utf8_nextCharSafeBodyTerminated(&pSrc
, ch
);
486 if(ch
<0 && (++numSubstitutions
, ch
= subchar
) < 0) {
487 *pErrorCode
= U_INVALID_CHAR_FOUND
;
489 } else if(ch
<=0xFFFF) {
490 *(pDest
++)=(UChar
)ch
;
492 *(pDest
++)=U16_LEAD(ch
);
493 if(pDest
<pDestLimit
) {
494 *(pDest
++)=U16_TRAIL(ch
);
503 /* Pre-flight the rest of the string. */
504 while((ch
= *pSrc
) != 0) {
510 if( /* handle U+1000..U+CFFF inline */
512 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f &&
513 (uint8_t)(pSrc
[2] - 0x80) <= 0x3f
519 } else if(ch
< 0xe0) {
520 if( /* handle U+0080..U+07FF inline */
522 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f
530 /* function call for "complicated" and error cases */
531 ++pSrc
; /* continue after the lead byte */
532 ch
=utf8_nextCharSafeBodyTerminated(&pSrc
, ch
);
533 if(ch
<0 && (++numSubstitutions
, ch
= subchar
) < 0) {
534 *pErrorCode
= U_INVALID_CHAR_FOUND
;
537 reqLength
+= U16_LENGTH(ch
);
540 } else /* srcLength >= 0 */ {
541 const uint8_t *pSrcLimit
= pSrc
+ srcLength
;
544 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
547 * Each iteration of the inner loop progresses by at most 3 UTF-8
548 * bytes and one UChar, for most characters.
549 * For supplementary code points (4 & 2), which are rare,
550 * there is an additional adjustment.
552 count
= (int32_t)(pDestLimit
- pDest
);
553 srcLength
= (int32_t)((pSrcLimit
- pSrc
) / 3);
554 if(count
> srcLength
) {
555 count
= srcLength
; /* min(remaining dest, remaining src/3) */
559 * Too much overhead if we get near the end of the string,
560 * continue with the next loop.
572 if( /* handle U+1000..U+CFFF inline */
574 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f &&
575 (t2
= (uint8_t)(pSrc
[2] - 0x80)) <= 0x3f
577 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
578 *pDest
++ = (UChar
)((ch
<< 12) | (t1
<< 6) | t2
);
582 } else if(ch
< 0xe0) {
583 if( /* handle U+0080..U+07FF inline */
585 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f
587 *pDest
++ = (UChar
)(((ch
& 0x1f) << 6) | t1
);
593 if(ch
>= 0xf0 || subchar
> 0xffff) {
595 * We may read up to six bytes and write up to two UChars,
596 * which we didn't account for with computing count,
597 * so we adjust it here.
604 /* function call for "complicated" and error cases */
605 ++pSrc
; /* continue after the lead byte */
606 ch
=utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
607 if(ch
<0 && (++numSubstitutions
, ch
= subchar
) < 0){
608 *pErrorCode
= U_INVALID_CHAR_FOUND
;
610 }else if(ch
<=0xFFFF){
611 *(pDest
++)=(UChar
)ch
;
613 *(pDest
++)=U16_LEAD(ch
);
614 *(pDest
++)=U16_TRAIL(ch
);
617 } while(--count
> 0);
620 while((pSrc
<pSrcLimit
) && (pDest
<pDestLimit
)) {
627 if( /* handle U+1000..U+CFFF inline */
629 ((pSrcLimit
- pSrc
) >= 3) &&
630 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f &&
631 (t2
= (uint8_t)(pSrc
[2] - 0x80)) <= 0x3f
633 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
634 *pDest
++ = (UChar
)((ch
<< 12) | (t1
<< 6) | t2
);
638 } else if(ch
< 0xe0) {
639 if( /* handle U+0080..U+07FF inline */
641 ((pSrcLimit
- pSrc
) >= 2) &&
642 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f
644 *pDest
++ = (UChar
)(((ch
& 0x1f) << 6) | t1
);
650 /* function call for "complicated" and error cases */
651 ++pSrc
; /* continue after the lead byte */
652 ch
=utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
653 if(ch
<0 && (++numSubstitutions
, ch
= subchar
) < 0){
654 *pErrorCode
= U_INVALID_CHAR_FOUND
;
656 }else if(ch
<=0xFFFF){
657 *(pDest
++)=(UChar
)ch
;
659 *(pDest
++)=U16_LEAD(ch
);
660 if(pDest
<pDestLimit
){
661 *(pDest
++)=U16_TRAIL(ch
);
669 /* do not fill the dest buffer just count the UChars needed */
670 while(pSrc
< pSrcLimit
){
677 if( /* handle U+1000..U+CFFF inline */
679 ((pSrcLimit
- pSrc
) >= 3) &&
680 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f &&
681 (uint8_t)(pSrc
[2] - 0x80) <= 0x3f
687 } else if(ch
< 0xe0) {
688 if( /* handle U+0080..U+07FF inline */
690 ((pSrcLimit
- pSrc
) >= 2) &&
691 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f
699 /* function call for "complicated" and error cases */
700 ++pSrc
; /* continue after the lead byte */
701 ch
=utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
702 if(ch
<0 && (++numSubstitutions
, ch
= subchar
) < 0){
703 *pErrorCode
= U_INVALID_CHAR_FOUND
;
706 reqLength
+=U16_LENGTH(ch
);
711 reqLength
+=(int32_t)(pDest
- dest
);
713 if(pNumSubstitutions
!=NULL
) {
714 *pNumSubstitutions
=numSubstitutions
;
718 *pDestLength
= reqLength
;
721 /* Terminate the buffer */
722 u_terminateUChars(dest
,destCapacity
,reqLength
,pErrorCode
);
727 U_CAPI UChar
* U_EXPORT2
728 u_strFromUTF8(UChar
*dest
,
729 int32_t destCapacity
,
730 int32_t *pDestLength
,
733 UErrorCode
*pErrorCode
){
734 return u_strFromUTF8WithSub(
735 dest
, destCapacity
, pDestLength
,
741 U_CAPI UChar
* U_EXPORT2
742 u_strFromUTF8Lenient(UChar
*dest
,
743 int32_t destCapacity
,
744 int32_t *pDestLength
,
747 UErrorCode
*pErrorCode
) {
750 int32_t reqLength
= 0;
751 uint8_t* pSrc
= (uint8_t*) src
;
754 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)){
758 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
759 (destCapacity
<0) || (dest
== NULL
&& destCapacity
> 0)
761 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
766 /* Transform a NUL-terminated string. */
767 UChar
*pDestLimit
= (dest
!=NULL
)?(dest
+destCapacity
):NULL
;
768 uint8_t t1
, t2
, t3
; /* trail bytes */
770 while(((ch
= *pSrc
) != 0) && (pDest
< pDestLimit
)) {
773 * ASCII, or a trail byte in lead position which is treated like
774 * a single-byte sequence for better character boundary
775 * resynchronization after illegal sequences.
780 } else if(ch
< 0xe0) { /* U+0080..U+07FF */
781 if((t1
= pSrc
[1]) != 0) {
782 /* 0x3080 = (0xc0 << 6) + 0x80 */
783 *pDest
++ = (UChar
)((ch
<< 6) + t1
- 0x3080);
787 } else if(ch
< 0xf0) { /* U+0800..U+FFFF */
788 if((t1
= pSrc
[1]) != 0 && (t2
= pSrc
[2]) != 0) {
789 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
790 /* 0x2080 = (0x80 << 6) + 0x80 */
791 *pDest
++ = (UChar
)((ch
<< 12) + (t1
<< 6) + t2
- 0x2080);
795 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
796 if((t1
= pSrc
[1]) != 0 && (t2
= pSrc
[2]) != 0 && (t3
= pSrc
[3]) != 0) {
798 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
799 ch
= (ch
<< 18) + (t1
<< 12) + (t2
<< 6) + t3
- 0x3c82080;
800 *(pDest
++) = U16_LEAD(ch
);
801 if(pDest
< pDestLimit
) {
802 *(pDest
++) = U16_TRAIL(ch
);
811 /* truncated character at the end */
813 while(*++pSrc
!= 0) {}
817 /* Pre-flight the rest of the string. */
818 while((ch
= *pSrc
) != 0) {
821 * ASCII, or a trail byte in lead position which is treated like
822 * a single-byte sequence for better character boundary
823 * resynchronization after illegal sequences.
828 } else if(ch
< 0xe0) { /* U+0080..U+07FF */
834 } else if(ch
< 0xf0) { /* U+0800..U+FFFF */
835 if(pSrc
[1] != 0 && pSrc
[2] != 0) {
840 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
841 if(pSrc
[1] != 0 && pSrc
[2] != 0 && pSrc
[3] != 0) {
848 /* truncated character at the end */
852 } else /* srcLength >= 0 */ {
853 const uint8_t *pSrcLimit
= (pSrc
!=NULL
)?(pSrc
+ srcLength
):NULL
;
856 * This function requires that if srcLength is given, then it must be
857 * destCapacity >= srcLength so that we need not check for
858 * destination buffer overflow in the loop.
860 if(destCapacity
< srcLength
) {
861 if(pDestLength
!= NULL
) {
862 *pDestLength
= srcLength
; /* this likely overestimates the true destLength! */
864 *pErrorCode
= U_BUFFER_OVERFLOW_ERROR
;
868 if((pSrcLimit
- pSrc
) >= 4) {
869 pSrcLimit
-= 3; /* temporarily reduce pSrcLimit */
871 /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
876 * ASCII, or a trail byte in lead position which is treated like
877 * a single-byte sequence for better character boundary
878 * resynchronization after illegal sequences.
881 } else if(ch
< 0xe0) { /* U+0080..U+07FF */
882 /* 0x3080 = (0xc0 << 6) + 0x80 */
883 *pDest
++ = (UChar
)((ch
<< 6) + *pSrc
++ - 0x3080);
884 } else if(ch
< 0xf0) { /* U+0800..U+FFFF */
885 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
886 /* 0x2080 = (0x80 << 6) + 0x80 */
887 ch
= (ch
<< 12) + (*pSrc
++ << 6);
888 *pDest
++ = (UChar
)(ch
+ *pSrc
++ - 0x2080);
889 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
890 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
891 ch
= (ch
<< 18) + (*pSrc
++ << 12);
893 ch
+= *pSrc
++ - 0x3c82080;
894 *(pDest
++) = U16_LEAD(ch
);
895 *(pDest
++) = U16_TRAIL(ch
);
897 } while(pSrc
< pSrcLimit
);
899 pSrcLimit
+= 3; /* restore original pSrcLimit */
902 while(pSrc
< pSrcLimit
) {
906 * ASCII, or a trail byte in lead position which is treated like
907 * a single-byte sequence for better character boundary
908 * resynchronization after illegal sequences.
912 } else if(ch
< 0xe0) { /* U+0080..U+07FF */
913 if(pSrc
< pSrcLimit
) {
914 /* 0x3080 = (0xc0 << 6) + 0x80 */
915 *pDest
++ = (UChar
)((ch
<< 6) + *pSrc
++ - 0x3080);
918 } else if(ch
< 0xf0) { /* U+0800..U+FFFF */
919 if((pSrcLimit
- pSrc
) >= 2) {
920 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
921 /* 0x2080 = (0x80 << 6) + 0x80 */
922 ch
= (ch
<< 12) + (*pSrc
++ << 6);
923 *pDest
++ = (UChar
)(ch
+ *pSrc
++ - 0x2080);
927 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
928 if((pSrcLimit
- pSrc
) >= 3) {
929 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
930 ch
= (ch
<< 18) + (*pSrc
++ << 12);
932 ch
+= *pSrc
++ - 0x3c82080;
933 *(pDest
++) = U16_LEAD(ch
);
934 *(pDest
++) = U16_TRAIL(ch
);
940 /* truncated character at the end */
946 reqLength
+=(int32_t)(pDest
- dest
);
949 *pDestLength
= reqLength
;
952 /* Terminate the buffer */
953 u_terminateUChars(dest
,destCapacity
,reqLength
,pErrorCode
);
958 static inline uint8_t *
959 _appendUTF8(uint8_t *pDest
, UChar32 c
) {
960 /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
963 } else if(c
<=0x7ff) {
964 *pDest
++=(uint8_t)((c
>>6)|0xc0);
965 *pDest
++=(uint8_t)((c
&0x3f)|0x80);
966 } else if(c
<=0xffff) {
967 *pDest
++=(uint8_t)((c
>>12)|0xe0);
968 *pDest
++=(uint8_t)(((c
>>6)&0x3f)|0x80);
969 *pDest
++=(uint8_t)(((c
)&0x3f)|0x80);
970 } else /* if((uint32_t)(c)<=0x10ffff) */ {
971 *pDest
++=(uint8_t)(((c
)>>18)|0xf0);
972 *pDest
++=(uint8_t)((((c
)>>12)&0x3f)|0x80);
973 *pDest
++=(uint8_t)((((c
)>>6)&0x3f)|0x80);
974 *pDest
++=(uint8_t)(((c
)&0x3f)|0x80);
980 U_CAPI
char* U_EXPORT2
981 u_strToUTF8WithSub(char *dest
,
982 int32_t destCapacity
,
983 int32_t *pDestLength
,
986 UChar32 subchar
, int32_t *pNumSubstitutions
,
987 UErrorCode
*pErrorCode
){
990 uint8_t *pDest
= (uint8_t *)dest
;
991 uint8_t *pDestLimit
= (pDest
!=NULL
)?(pDest
+ destCapacity
):NULL
;
992 int32_t numSubstitutions
;
995 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)){
999 if( (pSrc
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
1000 (destCapacity
<0) || (dest
== NULL
&& destCapacity
> 0) ||
1001 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
1003 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
1007 if(pNumSubstitutions
!=NULL
) {
1008 *pNumSubstitutions
=0;
1013 while((ch
=*pSrc
)!=0) {
1016 if(pDest
<pDestLimit
) {
1017 *pDest
++ = (uint8_t)ch
;
1022 } else if(ch
<= 0x7ff) {
1023 if((pDestLimit
- pDest
) >= 2) {
1024 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
1025 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1030 } else if(ch
<= 0xd7ff || ch
>= 0xe000) {
1031 if((pDestLimit
- pDest
) >= 3) {
1032 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
1033 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1034 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1039 } else /* ch is a surrogate */ {
1042 /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
1043 if(U16_IS_SURROGATE_LEAD(ch
) && U16_IS_TRAIL(ch2
=*pSrc
)) {
1045 ch
=U16_GET_SUPPLEMENTARY(ch
, ch2
);
1046 } else if(subchar
>=0) {
1050 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1051 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1055 length
= U8_LENGTH(ch
);
1056 if((pDestLimit
- pDest
) >= length
) {
1057 /* convert and append*/
1058 pDest
=_appendUTF8(pDest
, ch
);
1065 while((ch
=*pSrc
++)!=0) {
1068 } else if(ch
<=0x7ff) {
1070 } else if(!U16_IS_SURROGATE(ch
)) {
1072 } else if(U16_IS_SURROGATE_LEAD(ch
) && U16_IS_TRAIL(ch2
=*pSrc
)) {
1075 } else if(subchar
>=0) {
1076 reqLength
+=U8_LENGTH(subchar
);
1079 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1080 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1085 const UChar
*pSrcLimit
= (pSrc
!=NULL
)?(pSrc
+srcLength
):NULL
;
1088 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1091 * Each iteration of the inner loop progresses by at most 3 UTF-8
1092 * bytes and one UChar, for most characters.
1093 * For supplementary code points (4 & 2), which are rare,
1094 * there is an additional adjustment.
1096 count
= (int32_t)((pDestLimit
- pDest
) / 3);
1097 srcLength
= (int32_t)(pSrcLimit
- pSrc
);
1098 if(count
> srcLength
) {
1099 count
= srcLength
; /* min(remaining dest/3, remaining src) */
1103 * Too much overhead if we get near the end of the string,
1104 * continue with the next loop.
1111 *pDest
++ = (uint8_t)ch
;
1112 } else if(ch
<= 0x7ff) {
1113 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
1114 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1115 } else if(ch
<= 0xd7ff || ch
>= 0xe000) {
1116 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
1117 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1118 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1119 } else /* ch is a surrogate */ {
1121 * We will read two UChars and probably output four bytes,
1122 * which we didn't account for with computing count,
1123 * so we adjust it here.
1126 --pSrc
; /* undo ch=*pSrc++ for the lead surrogate */
1127 break; /* recompute count */
1130 if(U16_IS_SURROGATE_LEAD(ch
) && U16_IS_TRAIL(ch2
=*pSrc
)) {
1132 ch
=U16_GET_SUPPLEMENTARY(ch
, ch2
);
1134 /* writing 4 bytes per 2 UChars is ok */
1135 *pDest
++=(uint8_t)((ch
>>18)|0xf0);
1136 *pDest
++=(uint8_t)(((ch
>>12)&0x3f)|0x80);
1137 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1138 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1140 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1145 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1149 /* convert and append*/
1150 pDest
=_appendUTF8(pDest
, ch
);
1153 } while(--count
> 0);
1156 while(pSrc
<pSrcLimit
) {
1159 if(pDest
<pDestLimit
) {
1160 *pDest
++ = (uint8_t)ch
;
1165 } else if(ch
<= 0x7ff) {
1166 if((pDestLimit
- pDest
) >= 2) {
1167 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
1168 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1173 } else if(ch
<= 0xd7ff || ch
>= 0xe000) {
1174 if((pDestLimit
- pDest
) >= 3) {
1175 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
1176 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1177 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1182 } else /* ch is a surrogate */ {
1185 if(U16_IS_SURROGATE_LEAD(ch
) && pSrc
<pSrcLimit
&& U16_IS_TRAIL(ch2
=*pSrc
)) {
1187 ch
=U16_GET_SUPPLEMENTARY(ch
, ch2
);
1188 } else if(subchar
>=0) {
1192 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1193 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1197 length
= U8_LENGTH(ch
);
1198 if((pDestLimit
- pDest
) >= length
) {
1199 /* convert and append*/
1200 pDest
=_appendUTF8(pDest
, ch
);
1207 while(pSrc
<pSrcLimit
) {
1211 } else if(ch
<=0x7ff) {
1213 } else if(!U16_IS_SURROGATE(ch
)) {
1215 } else if(U16_IS_SURROGATE_LEAD(ch
) && pSrc
<pSrcLimit
&& U16_IS_TRAIL(ch2
=*pSrc
)) {
1218 } else if(subchar
>=0) {
1219 reqLength
+=U8_LENGTH(subchar
);
1222 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1223 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1229 reqLength
+=(int32_t)(pDest
- (uint8_t *)dest
);
1231 if(pNumSubstitutions
!=NULL
) {
1232 *pNumSubstitutions
=numSubstitutions
;
1236 *pDestLength
= reqLength
;
1239 /* Terminate the buffer */
1240 u_terminateChars(dest
, destCapacity
, reqLength
, pErrorCode
);
1244 U_CAPI
char* U_EXPORT2
1245 u_strToUTF8(char *dest
,
1246 int32_t destCapacity
,
1247 int32_t *pDestLength
,
1250 UErrorCode
*pErrorCode
){
1251 return u_strToUTF8WithSub(
1252 dest
, destCapacity
, pDestLength
,
1258 U_CAPI UChar
* U_EXPORT2
1259 u_strFromJavaModifiedUTF8WithSub(
1261 int32_t destCapacity
,
1262 int32_t *pDestLength
,
1265 UChar32 subchar
, int32_t *pNumSubstitutions
,
1266 UErrorCode
*pErrorCode
) {
1267 UChar
*pDest
= dest
;
1268 UChar
*pDestLimit
= dest
+destCapacity
;
1270 int32_t reqLength
= 0;
1271 const uint8_t* pSrc
= (const uint8_t*) src
;
1272 const uint8_t *pSrcLimit
;
1274 uint8_t t1
, t2
; /* trail bytes */
1275 int32_t numSubstitutions
;
1278 if(U_FAILURE(*pErrorCode
)){
1281 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
1282 (dest
==NULL
&& destCapacity
!=0) || destCapacity
<0 ||
1283 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
1285 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
1289 if(pNumSubstitutions
!=NULL
) {
1290 *pNumSubstitutions
=0;
1296 * Transform a NUL-terminated ASCII string.
1297 * Handle non-ASCII strings with slower code.
1299 while(((ch
= *pSrc
) != 0) && ch
<= 0x7f && (pDest
< pDestLimit
)) {
1304 reqLength
=(int32_t)(pDest
- dest
);
1306 *pDestLength
= reqLength
;
1309 /* Terminate the buffer */
1310 u_terminateUChars(dest
, destCapacity
, reqLength
, pErrorCode
);
1313 srcLength
= uprv_strlen((const char *)pSrc
);
1316 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1317 pSrcLimit
= (pSrc
== NULL
) ? NULL
: pSrc
+ srcLength
;
1319 count
= (int32_t)(pDestLimit
- pDest
);
1320 srcLength
= (int32_t)(pSrcLimit
- pSrc
);
1321 if(count
>= srcLength
&& srcLength
> 0 && *pSrc
<= 0x7f) {
1322 /* fast ASCII loop */
1323 const uint8_t *prevSrc
= pSrc
;
1325 while(pSrc
< pSrcLimit
&& (ch
= *pSrc
) <= 0x7f) {
1329 delta
= (int32_t)(pSrc
- prevSrc
);
1334 * Each iteration of the inner loop progresses by at most 3 UTF-8
1335 * bytes and one UChar.
1338 if(count
> srcLength
) {
1339 count
= srcLength
; /* min(remaining dest, remaining src/3) */
1343 * Too much overhead if we get near the end of the string,
1344 * continue with the next loop.
1355 if( /* handle U+0000..U+FFFF inline */
1357 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f &&
1358 (t2
= (uint8_t)(pSrc
[2] - 0x80)) <= 0x3f
1360 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1361 *pDest
++ = (UChar
)((ch
<< 12) | (t1
<< 6) | t2
);
1366 if( /* handle U+0000..U+07FF inline */
1368 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f
1370 *pDest
++ = (UChar
)(((ch
& 0x1f) << 6) | t1
);
1377 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1379 } else if(subchar
> 0xffff && --count
== 0) {
1381 * We need to write two UChars, adjusted count for that,
1382 * and ran out of space.
1386 /* function call for error cases */
1387 ++pSrc
; /* continue after the lead byte */
1388 utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
1390 if(subchar
<=0xFFFF) {
1391 *(pDest
++)=(UChar
)subchar
;
1393 *(pDest
++)=U16_LEAD(subchar
);
1394 *(pDest
++)=U16_TRAIL(subchar
);
1398 } while(--count
> 0);
1401 while((pSrc
<pSrcLimit
) && (pDest
<pDestLimit
)) {
1408 if( /* handle U+0000..U+FFFF inline */
1410 ((pSrcLimit
- pSrc
) >= 3) &&
1411 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f &&
1412 (t2
= (uint8_t)(pSrc
[2] - 0x80)) <= 0x3f
1414 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1415 *pDest
++ = (UChar
)((ch
<< 12) | (t1
<< 6) | t2
);
1420 if( /* handle U+0000..U+07FF inline */
1422 ((pSrcLimit
- pSrc
) >= 2) &&
1423 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f
1425 *pDest
++ = (UChar
)(((ch
& 0x1f) << 6) | t1
);
1432 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1435 /* function call for error cases */
1436 ++pSrc
; /* continue after the lead byte */
1437 utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
1439 if(subchar
<=0xFFFF) {
1440 *(pDest
++)=(UChar
)subchar
;
1442 *(pDest
++)=U16_LEAD(subchar
);
1443 if(pDest
<pDestLimit
) {
1444 *(pDest
++)=U16_TRAIL(subchar
);
1454 /* do not fill the dest buffer just count the UChars needed */
1455 while(pSrc
< pSrcLimit
){
1462 if( /* handle U+0000..U+FFFF inline */
1464 ((pSrcLimit
- pSrc
) >= 3) &&
1465 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f &&
1466 (uint8_t)(pSrc
[2] - 0x80) <= 0x3f
1473 if( /* handle U+0000..U+07FF inline */
1475 ((pSrcLimit
- pSrc
) >= 2) &&
1476 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f
1485 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1488 /* function call for error cases */
1489 ++pSrc
; /* continue after the lead byte */
1490 utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
1492 reqLength
+=U16_LENGTH(ch
);
1497 if(pNumSubstitutions
!=NULL
) {
1498 *pNumSubstitutions
=numSubstitutions
;
1501 reqLength
+=(int32_t)(pDest
- dest
);
1503 *pDestLength
= reqLength
;
1506 /* Terminate the buffer */
1507 u_terminateUChars(dest
, destCapacity
, reqLength
, pErrorCode
);
1511 U_CAPI
char* U_EXPORT2
1512 u_strToJavaModifiedUTF8(
1514 int32_t destCapacity
,
1515 int32_t *pDestLength
,
1518 UErrorCode
*pErrorCode
) {
1519 int32_t reqLength
=0;
1521 uint8_t *pDest
= (uint8_t *)dest
;
1522 uint8_t *pDestLimit
= pDest
+ destCapacity
;
1523 const UChar
*pSrcLimit
;
1527 if(U_FAILURE(*pErrorCode
)){
1530 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
1531 (dest
==NULL
&& destCapacity
!=0) || destCapacity
<0
1533 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
1538 /* Convert NUL-terminated ASCII, then find the string length. */
1539 while((ch
=*src
)<=0x7f && ch
!= 0 && pDest
<pDestLimit
) {
1540 *pDest
++ = (uint8_t)ch
;
1544 reqLength
=(int32_t)(pDest
- (uint8_t *)dest
);
1546 *pDestLength
= reqLength
;
1549 /* Terminate the buffer */
1550 u_terminateChars(dest
, destCapacity
, reqLength
, pErrorCode
);
1553 srcLength
= u_strlen(src
);
1556 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1557 pSrcLimit
= (src
!=NULL
)?(src
+srcLength
):NULL
;
1559 count
= (int32_t)(pDestLimit
- pDest
);
1560 srcLength
= (int32_t)(pSrcLimit
- src
);
1561 if(count
>= srcLength
&& srcLength
> 0 && *src
<= 0x7f) {
1562 /* fast ASCII loop */
1563 const UChar
*prevSrc
= src
;
1565 while(src
< pSrcLimit
&& (ch
= *src
) <= 0x7f && ch
!= 0) {
1566 *pDest
++=(uint8_t)ch
;
1569 delta
= (int32_t)(src
- prevSrc
);
1574 * Each iteration of the inner loop progresses by at most 3 UTF-8
1575 * bytes and one UChar.
1578 if(count
> srcLength
) {
1579 count
= srcLength
; /* min(remaining dest/3, remaining src) */
1583 * Too much overhead if we get near the end of the string,
1584 * continue with the next loop.
1590 if(ch
<= 0x7f && ch
!= 0) {
1591 *pDest
++ = (uint8_t)ch
;
1592 } else if(ch
<= 0x7ff) {
1593 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
1594 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1596 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
1597 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1598 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1600 } while(--count
> 0);
1603 while(src
<pSrcLimit
) {
1605 if(ch
<= 0x7f && ch
!= 0) {
1606 if(pDest
<pDestLimit
) {
1607 *pDest
++ = (uint8_t)ch
;
1612 } else if(ch
<= 0x7ff) {
1613 if((pDestLimit
- pDest
) >= 2) {
1614 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
1615 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1621 if((pDestLimit
- pDest
) >= 3) {
1622 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
1623 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1624 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1631 while(src
<pSrcLimit
) {
1633 if(ch
<= 0x7f && ch
!= 0) {
1635 } else if(ch
<=0x7ff) {
1642 reqLength
+=(int32_t)(pDest
- (uint8_t *)dest
);
1644 *pDestLength
= reqLength
;
1647 /* Terminate the buffer */
1648 u_terminateChars(dest
, destCapacity
, reqLength
, pErrorCode
);