]>
git.saurik.com Git - apple/icu.git/blob - icuSources/common/ustrtrns.c
2 ******************************************************************************
4 * Copyright (C) 2001-2007, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
11 * Modification History:
13 * Date Name Description
14 * 9/10/2001 Ram Creation.
15 ******************************************************************************
18 /*******************************************************************************
20 * u_strTo* and u_strFrom* APIs
21 * WCS functions moved to ustr_wcs.c for better modularization
23 *******************************************************************************
27 #include "unicode/putil.h"
28 #include "unicode/ustring.h"
33 U_CAPI UChar
* U_EXPORT2
34 u_strFromUTF32(UChar
*dest
,
39 UErrorCode
*pErrorCode
)
41 int32_t reqLength
= 0;
43 UChar
*pDestLimit
=dest
+destCapacity
;
45 const uint32_t *pSrc
= (const uint32_t *)src
;
48 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)){
52 if((src
==NULL
) || (srcLength
< -1) || (destCapacity
<0) || (!dest
&& destCapacity
> 0)){
53 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
57 /* Check if the source is null terminated */
59 while(((ch
=*pSrc
)!=0) && (pDest
< pDestLimit
)){
63 }else if(ch
<=0x10ffff){
64 *(pDest
++)=UTF16_LEAD(ch
);
66 *(pDest
++)=UTF16_TRAIL(ch
);
72 *pErrorCode
= U_INVALID_CHAR_FOUND
;
76 while((ch
=*pSrc
++) != 0){
77 reqLength
+=UTF_CHAR_LENGTH(ch
);
80 const uint32_t* pSrcLimit
= ((const uint32_t*)pSrc
) + srcLength
;
81 while((pSrc
< pSrcLimit
) && (pDest
< pDestLimit
)){
85 }else if(ch
<=0x10FFFF){
86 *(pDest
++)=UTF16_LEAD(ch
);
88 *(pDest
++)=UTF16_TRAIL(ch
);
94 *pErrorCode
= U_INVALID_CHAR_FOUND
;
98 while(pSrc
<pSrcLimit
){
100 reqLength
+=UTF_CHAR_LENGTH(ch
);
104 reqLength
+= (int32_t)(pDest
- dest
);
106 *pDestLength
= reqLength
;
109 /* Terminate the buffer */
110 u_terminateUChars(dest
,destCapacity
,reqLength
,pErrorCode
);
116 U_CAPI UChar32
* U_EXPORT2
117 u_strToUTF32(UChar32
*dest
,
118 int32_t destCapacity
,
119 int32_t *pDestLength
,
122 UErrorCode
*pErrorCode
)
124 const UChar
* pSrc
= src
;
125 const UChar
* pSrcLimit
;
128 uint32_t *pDest
= (uint32_t *)dest
;
129 uint32_t *pDestLimit
= pDest
+ destCapacity
;
133 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)){
138 if((src
==NULL
) || (srcLength
< -1) || (destCapacity
<0) || (!dest
&& destCapacity
> 0)){
139 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
144 while((ch
=*pSrc
)!=0 && pDest
!=pDestLimit
) {
146 /*need not check for NUL because NUL fails UTF_IS_TRAIL() anyway*/
147 if(UTF_IS_LEAD(ch
) && UTF_IS_TRAIL(ch2
=*pSrc
)) {
149 ch
=UTF16_GET_PAIR_VALUE(ch
, ch2
);
153 while((ch
=*pSrc
++)!=0) {
154 if(UTF_IS_LEAD(ch
) && UTF_IS_TRAIL(ch2
=*pSrc
)) {
160 pSrcLimit
= pSrc
+srcLength
;
161 while(pSrc
<pSrcLimit
&& pDest
<pDestLimit
) {
163 if(UTF_IS_LEAD(ch
) && pSrc
<pSrcLimit
&& UTF_IS_TRAIL(ch2
=*pSrc
)) {
165 ch
=UTF16_GET_PAIR_VALUE(ch
, ch2
);
169 while(pSrc
!=pSrcLimit
) {
171 if(UTF_IS_LEAD(ch
) && pSrc
<pSrcLimit
&& UTF_IS_TRAIL(ch2
=*pSrc
)) {
178 reqLength
+=(int32_t)(pDest
- (uint32_t *)dest
);
180 *pDestLength
= reqLength
;
183 /* Terminate the buffer */
184 u_terminateUChar32s(dest
,destCapacity
,reqLength
,pErrorCode
);
189 /* for utf8_nextCharSafeBodyTerminated() */
191 utf8_minLegal
[4]={ 0, 0x80, 0x800, 0x10000 };
194 * Version of utf8_nextCharSafeBody() with the following differences:
195 * - checks for NUL termination instead of length
196 * - works with pointers instead of indexes
197 * - always strict (strict==-1)
199 * *ps points to after the lead byte and will be moved to after the last trail byte.
200 * c is the lead byte.
201 * @return the code point, or U_SENTINEL
204 utf8_nextCharSafeBodyTerminated(const uint8_t **ps
, UChar32 c
) {
205 const uint8_t *s
=*ps
;
206 uint8_t trail
, illegal
=0;
207 uint8_t count
=UTF8_COUNT_TRAIL_BYTES(c
);
208 UTF8_MASK_LEAD_BYTE((c
), count
);
209 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
211 /* each branch falls through to the next one */
214 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
218 trail
=(uint8_t)(*s
++ - 0x80);
220 if(trail
>0x3f || c
>=0x110) {
221 /* not a trail byte, or code point>0x10ffff (outside Unicode) */
226 trail
=(uint8_t)(*s
++ - 0x80);
228 /* not a trail byte */
234 trail
=(uint8_t)(*s
++ - 0x80);
236 /* not a trail byte */
243 /* no default branch to optimize switch() - all values are covered */
246 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
247 /* illegal is also set if count>=4 */
248 if(illegal
|| c
<utf8_minLegal
[count
] || UTF_IS_SURROGATE(c
)) {
250 /* don't go beyond this sequence */
252 while(count
>0 && UTF8_IS_TRAIL(*s
)) {
263 * Version of utf8_nextCharSafeBody() with the following differences:
264 * - works with pointers instead of indexes
265 * - always strict (strict==-1)
267 * *ps points to after the lead byte and will be moved to after the last trail byte.
268 * c is the lead byte.
269 * @return the code point, or U_SENTINEL
272 utf8_nextCharSafeBodyPointer(const uint8_t **ps
, const uint8_t *limit
, UChar32 c
) {
273 const uint8_t *s
=*ps
;
274 uint8_t trail
, illegal
=0;
275 uint8_t count
=UTF8_COUNT_TRAIL_BYTES(c
);
276 if((limit
-s
)>=count
) {
277 UTF8_MASK_LEAD_BYTE((c
), count
);
278 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
280 /* each branch falls through to the next one */
283 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
288 c
=(c
<<6)|(trail
&0x3f);
290 illegal
|=(trail
&0xc0)^0x80;
292 /* code point>0x10ffff, outside Unicode */
298 c
=(c
<<6)|(trail
&0x3f);
299 illegal
|=(trail
&0xc0)^0x80;
302 c
=(c
<<6)|(trail
&0x3f);
303 illegal
|=(trail
&0xc0)^0x80;
307 /* no default branch to optimize switch() - all values are covered */
310 illegal
=1; /* too few bytes left */
313 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
314 /* illegal is also set if count>=4 */
315 if(illegal
|| c
<utf8_minLegal
[count
] || UTF_IS_SURROGATE(c
)) {
317 /* don't go beyond this sequence */
319 while(count
>0 && s
<limit
&& UTF8_IS_TRAIL(*s
)) {
329 U_CAPI UChar
* U_EXPORT2
330 u_strFromUTF8WithSub(UChar
*dest
,
331 int32_t destCapacity
,
332 int32_t *pDestLength
,
335 UChar32 subchar
, int32_t *pNumSubstitutions
,
336 UErrorCode
*pErrorCode
){
339 UChar
*pDestLimit
= dest
+destCapacity
;
341 int32_t reqLength
= 0;
342 const uint8_t* pSrc
= (const uint8_t*) src
;
343 uint8_t t1
, t2
; /* trail bytes */
344 int32_t numSubstitutions
;
347 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)){
351 if( (src
==NULL
) || (srcLength
< -1) || (destCapacity
<0) || (!dest
&& destCapacity
> 0) ||
352 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
354 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
361 * Inline processing of UTF-8 byte sequences:
363 * Byte sequences for the most common characters are handled inline in
364 * the conversion loops. In order to reduce the path lengths for those
365 * characters, the tests are arranged in a kind of binary search.
366 * ASCII (<=0x7f) is checked first, followed by the dividing point
367 * between 2- and 3-byte sequences (0xe0).
368 * The 3-byte branch is tested first to speed up CJK text.
369 * The compiler should combine the subtractions for the two tests for 0xe0.
370 * Each branch then tests for the other end of its range.
375 * Transform a NUL-terminated string.
376 * The code explicitly checks for NULs only in the lead byte position.
377 * A NUL byte in the trail byte position fails the trail byte range check anyway.
379 while(((ch
= *pSrc
) != 0) && (pDest
< pDestLimit
)) {
385 if( /* handle U+1000..U+CFFF inline */
387 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f &&
388 (t2
= (uint8_t)(pSrc
[2] - 0x80)) <= 0x3f
390 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
391 *pDest
++ = (UChar
)((ch
<< 12) | (t1
<< 6) | t2
);
395 } else if(ch
< 0xe0) {
396 if( /* handle U+0080..U+07FF inline */
398 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f
400 *pDest
++ = (UChar
)(((ch
& 0x1f) << 6) | t1
);
406 /* function call for "complicated" and error cases */
407 ++pSrc
; /* continue after the lead byte */
408 ch
=utf8_nextCharSafeBodyTerminated(&pSrc
, ch
);
409 if(ch
<0 && (++numSubstitutions
, ch
= subchar
) < 0) {
410 *pErrorCode
= U_INVALID_CHAR_FOUND
;
412 } else if(ch
<=0xFFFF) {
413 *(pDest
++)=(UChar
)ch
;
415 *(pDest
++)=UTF16_LEAD(ch
);
416 if(pDest
<pDestLimit
) {
417 *(pDest
++)=UTF16_TRAIL(ch
);
426 /* Pre-flight the rest of the string. */
427 while((ch
= *pSrc
) != 0) {
433 if( /* handle U+1000..U+CFFF inline */
435 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f &&
436 (uint8_t)(pSrc
[2] - 0x80) <= 0x3f
442 } else if(ch
< 0xe0) {
443 if( /* handle U+0080..U+07FF inline */
445 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f
453 /* function call for "complicated" and error cases */
454 ++pSrc
; /* continue after the lead byte */
455 ch
=utf8_nextCharSafeBodyTerminated(&pSrc
, ch
);
456 if(ch
<0 && (++numSubstitutions
, ch
= subchar
) < 0) {
457 *pErrorCode
= U_INVALID_CHAR_FOUND
;
460 reqLength
+= U16_LENGTH(ch
);
463 } else /* srcLength >= 0 */ {
464 const uint8_t *pSrcLimit
= pSrc
+ srcLength
;
467 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
470 * Each iteration of the inner loop progresses by at most 3 UTF-8
471 * bytes and one UChar, for most characters.
472 * For supplementary code points (4 & 2), which are rare,
473 * there is an additional adjustment.
475 count
= (int32_t)(pDestLimit
- pDest
);
476 srcLength
= (int32_t)((pSrcLimit
- pSrc
) / 3);
477 if(count
> srcLength
) {
478 count
= srcLength
; /* min(remaining dest, remaining src/3) */
482 * Too much overhead if we get near the end of the string,
483 * continue with the next loop.
495 if( /* handle U+1000..U+CFFF inline */
497 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f &&
498 (t2
= (uint8_t)(pSrc
[2] - 0x80)) <= 0x3f
500 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
501 *pDest
++ = (UChar
)((ch
<< 12) | (t1
<< 6) | t2
);
505 } else if(ch
< 0xe0) {
506 if( /* handle U+0080..U+07FF inline */
508 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f
510 *pDest
++ = (UChar
)(((ch
& 0x1f) << 6) | t1
);
516 if(ch
>= 0xf0 || subchar
> 0xffff) {
518 * We may read up to six bytes and write up to two UChars,
519 * which we didn't account for with computing count,
520 * so we adjust it here.
527 /* function call for "complicated" and error cases */
528 ++pSrc
; /* continue after the lead byte */
529 ch
=utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
530 if(ch
<0 && (++numSubstitutions
, ch
= subchar
) < 0){
531 *pErrorCode
= U_INVALID_CHAR_FOUND
;
533 }else if(ch
<=0xFFFF){
534 *(pDest
++)=(UChar
)ch
;
536 *(pDest
++)=UTF16_LEAD(ch
);
537 if(pDest
<pDestLimit
){
538 *(pDest
++)=UTF16_TRAIL(ch
);
545 } while(--count
> 0);
548 while((pSrc
<pSrcLimit
) && (pDest
<pDestLimit
)) {
555 if( /* handle U+1000..U+CFFF inline */
557 ((pSrcLimit
- pSrc
) >= 3) &&
558 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f &&
559 (t2
= (uint8_t)(pSrc
[2] - 0x80)) <= 0x3f
561 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
562 *pDest
++ = (UChar
)((ch
<< 12) | (t1
<< 6) | t2
);
566 } else if(ch
< 0xe0) {
567 if( /* handle U+0080..U+07FF inline */
569 ((pSrcLimit
- pSrc
) >= 2) &&
570 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f
572 *pDest
++ = (UChar
)(((ch
& 0x1f) << 6) | t1
);
578 /* function call for "complicated" and error cases */
579 ++pSrc
; /* continue after the lead byte */
580 ch
=utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
581 if(ch
<0 && (++numSubstitutions
, ch
= subchar
) < 0){
582 *pErrorCode
= U_INVALID_CHAR_FOUND
;
584 }else if(ch
<=0xFFFF){
585 *(pDest
++)=(UChar
)ch
;
587 *(pDest
++)=UTF16_LEAD(ch
);
588 if(pDest
<pDestLimit
){
589 *(pDest
++)=UTF16_TRAIL(ch
);
597 /* do not fill the dest buffer, just count the UChars needed */
598 while(pSrc
< pSrcLimit
){
605 if( /* handle U+1000..U+CFFF inline */
607 ((pSrcLimit
- pSrc
) >= 3) &&
608 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f &&
609 (uint8_t)(pSrc
[2] - 0x80) <= 0x3f
615 } else if(ch
< 0xe0) {
616 if( /* handle U+0080..U+07FF inline */
618 ((pSrcLimit
- pSrc
) >= 2) &&
619 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f
627 /* function call for "complicated" and error cases */
628 ++pSrc
; /* continue after the lead byte */
629 ch
=utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
630 if(ch
<0 && (++numSubstitutions
, ch
= subchar
) < 0){
631 *pErrorCode
= U_INVALID_CHAR_FOUND
;
634 reqLength
+=UTF_CHAR_LENGTH(ch
);
639 reqLength
+=(int32_t)(pDest
- dest
);
641 if(pNumSubstitutions
!=NULL
) {
642 *pNumSubstitutions
=numSubstitutions
;
646 *pDestLength
= reqLength
;
649 /* Terminate the buffer */
650 u_terminateUChars(dest
,destCapacity
,reqLength
,pErrorCode
);
655 U_CAPI UChar
* U_EXPORT2
656 u_strFromUTF8(UChar
*dest
,
657 int32_t destCapacity
,
658 int32_t *pDestLength
,
661 UErrorCode
*pErrorCode
){
662 return u_strFromUTF8WithSub(
663 dest
, destCapacity
, pDestLength
,
669 U_CAPI UChar
* U_EXPORT2
670 u_strFromUTF8Lenient(UChar
*dest
,
671 int32_t destCapacity
,
672 int32_t *pDestLength
,
675 UErrorCode
*pErrorCode
) {
679 int32_t reqLength
= 0;
680 uint8_t* pSrc
= (uint8_t*) src
;
683 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)){
687 if((src
==NULL
) || (srcLength
< -1) || (destCapacity
<0) || (!dest
&& destCapacity
> 0)) {
688 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
693 /* Transform a NUL-terminated string. */
694 UChar
*pDestLimit
= dest
+destCapacity
;
695 uint8_t t1
, t2
, t3
; /* trail bytes */
697 while(((ch
= *pSrc
) != 0) && (pDest
< pDestLimit
)) {
700 * ASCII, or a trail byte in lead position which is treated like
701 * a single-byte sequence for better character boundary
702 * resynchronization after illegal sequences.
707 } else if(ch
< 0xe0) { /* U+0080..U+07FF */
708 if((t1
= pSrc
[1]) != 0) {
709 /* 0x3080 = (0xc0 << 6) + 0x80 */
710 *pDest
++ = (UChar
)((ch
<< 6) + t1
- 0x3080);
714 } else if(ch
< 0xf0) { /* U+0800..U+FFFF */
715 if((t1
= pSrc
[1]) != 0 && (t2
= pSrc
[2]) != 0) {
716 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
717 /* 0x2080 = (0x80 << 6) + 0x80 */
718 *pDest
++ = (UChar
)((ch
<< 12) + (t1
<< 6) + t2
- 0x2080);
722 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
723 if((t1
= pSrc
[1]) != 0 && (t2
= pSrc
[2]) != 0 && (t3
= pSrc
[3]) != 0) {
725 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
726 ch
= (ch
<< 18) + (t1
<< 12) + (t2
<< 6) + t3
- 0x3c82080;
727 *(pDest
++) = U16_LEAD(ch
);
728 if(pDest
< pDestLimit
) {
729 *(pDest
++) = U16_TRAIL(ch
);
738 /* truncated character at the end */
740 while(*++pSrc
!= 0) {}
744 /* Pre-flight the rest of the string. */
745 while((ch
= *pSrc
) != 0) {
748 * ASCII, or a trail byte in lead position which is treated like
749 * a single-byte sequence for better character boundary
750 * resynchronization after illegal sequences.
755 } else if(ch
< 0xe0) { /* U+0080..U+07FF */
761 } else if(ch
< 0xf0) { /* U+0800..U+FFFF */
762 if(pSrc
[1] != 0 && pSrc
[2] != 0) {
767 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
768 if(pSrc
[1] != 0 && pSrc
[2] != 0 && pSrc
[3] != 0) {
775 /* truncated character at the end */
779 } else /* srcLength >= 0 */ {
780 const uint8_t *pSrcLimit
= pSrc
+ srcLength
;
783 * This function requires that if srcLength is given, then it must be
784 * destCapacity >= srcLength so that we need not check for
785 * destination buffer overflow in the loop.
787 if(destCapacity
< srcLength
) {
788 if(pDestLength
!= NULL
) {
789 *pDestLength
= srcLength
; /* this likely overestimates the true destLength! */
791 *pErrorCode
= U_BUFFER_OVERFLOW_ERROR
;
795 if((pSrcLimit
- pSrc
) >= 4) {
796 pSrcLimit
-= 3; /* temporarily reduce pSrcLimit */
798 /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
803 * ASCII, or a trail byte in lead position which is treated like
804 * a single-byte sequence for better character boundary
805 * resynchronization after illegal sequences.
808 } else if(ch
< 0xe0) { /* U+0080..U+07FF */
809 /* 0x3080 = (0xc0 << 6) + 0x80 */
810 *pDest
++ = (UChar
)((ch
<< 6) + *pSrc
++ - 0x3080);
811 } else if(ch
< 0xf0) { /* U+0800..U+FFFF */
812 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
813 /* 0x2080 = (0x80 << 6) + 0x80 */
814 ch
= (ch
<< 12) + (*pSrc
++ << 6);
815 *pDest
++ = (UChar
)(ch
+ *pSrc
++ - 0x2080);
816 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
817 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
818 ch
= (ch
<< 18) + (*pSrc
++ << 12);
820 ch
+= *pSrc
++ - 0x3c82080;
821 *(pDest
++) = U16_LEAD(ch
);
822 *(pDest
++) = U16_TRAIL(ch
);
824 } while(pSrc
< pSrcLimit
);
826 pSrcLimit
+= 3; /* restore original pSrcLimit */
829 while(pSrc
< pSrcLimit
) {
833 * ASCII, or a trail byte in lead position which is treated like
834 * a single-byte sequence for better character boundary
835 * resynchronization after illegal sequences.
839 } else if(ch
< 0xe0) { /* U+0080..U+07FF */
840 if(pSrc
< pSrcLimit
) {
841 /* 0x3080 = (0xc0 << 6) + 0x80 */
842 *pDest
++ = (UChar
)((ch
<< 6) + *pSrc
++ - 0x3080);
845 } else if(ch
< 0xf0) { /* U+0800..U+FFFF */
846 if((pSrcLimit
- pSrc
) >= 2) {
847 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
848 /* 0x2080 = (0x80 << 6) + 0x80 */
849 ch
= (ch
<< 12) + (*pSrc
++ << 6);
850 *pDest
++ = (UChar
)(ch
+ *pSrc
++ - 0x2080);
854 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
855 if((pSrcLimit
- pSrc
) >= 3) {
856 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
857 ch
= (ch
<< 18) + (*pSrc
++ << 12);
859 ch
+= *pSrc
++ - 0x3c82080;
860 *(pDest
++) = U16_LEAD(ch
);
861 *(pDest
++) = U16_TRAIL(ch
);
867 /* truncated character at the end */
873 reqLength
+=(int32_t)(pDest
- dest
);
876 *pDestLength
= reqLength
;
879 /* Terminate the buffer */
880 u_terminateUChars(dest
,destCapacity
,reqLength
,pErrorCode
);
885 static U_INLINE
uint8_t *
886 _appendUTF8(uint8_t *pDest
, UChar32 c
) {
887 /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
890 } else if(c
<=0x7ff) {
891 *pDest
++=(uint8_t)((c
>>6)|0xc0);
892 *pDest
++=(uint8_t)((c
&0x3f)|0x80);
893 } else if(c
<=0xffff) {
894 *pDest
++=(uint8_t)((c
>>12)|0xe0);
895 *pDest
++=(uint8_t)(((c
>>6)&0x3f)|0x80);
896 *pDest
++=(uint8_t)(((c
)&0x3f)|0x80);
897 } else /* if((uint32_t)(c)<=0x10ffff) */ {
898 *pDest
++=(uint8_t)(((c
)>>18)|0xf0);
899 *pDest
++=(uint8_t)((((c
)>>12)&0x3f)|0x80);
900 *pDest
++=(uint8_t)((((c
)>>6)&0x3f)|0x80);
901 *pDest
++=(uint8_t)(((c
)&0x3f)|0x80);
907 U_CAPI
char* U_EXPORT2
908 u_strToUTF8WithSub(char *dest
,
909 int32_t destCapacity
,
910 int32_t *pDestLength
,
913 UChar32 subchar
, int32_t *pNumSubstitutions
,
914 UErrorCode
*pErrorCode
){
918 uint8_t *pDest
= (uint8_t *)dest
;
919 uint8_t *pDestLimit
= pDest
+ destCapacity
;
920 int32_t numSubstitutions
;
923 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)){
927 if( (pSrc
==NULL
) || (srcLength
< -1) || (destCapacity
<0) || (!dest
&& destCapacity
> 0) ||
928 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
930 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
937 while((ch
=*pSrc
)!=0) {
940 if(pDest
<pDestLimit
) {
946 } else if(ch
<= 0x7ff) {
947 if((pDestLimit
- pDest
) >= 2) {
948 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
949 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
954 } else if(ch
<= 0xd7ff || ch
>= 0xe000) {
955 if((pDestLimit
- pDest
) >= 3) {
956 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
957 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
958 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
963 } else /* ch is a surrogate */ {
966 /*need not check for NUL because NUL fails UTF_IS_TRAIL() anyway*/
967 if(UTF_IS_SURROGATE_FIRST(ch
) && UTF_IS_TRAIL(ch2
=*pSrc
)) {
969 ch
=UTF16_GET_PAIR_VALUE(ch
, ch2
);
970 } else if(subchar
>=0) {
974 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
975 *pErrorCode
= U_INVALID_CHAR_FOUND
;
979 length
= U8_LENGTH(ch
);
980 if((pDestLimit
- pDest
) >= length
) {
981 /* convert and append*/
982 pDest
=_appendUTF8(pDest
, ch
);
989 while((ch
=*pSrc
++)!=0) {
992 } else if(ch
<=0x7ff) {
994 } else if(!UTF_IS_SURROGATE(ch
)) {
996 } else if(UTF_IS_SURROGATE_FIRST(ch
) && UTF_IS_TRAIL(ch2
=*pSrc
)) {
999 } else if(subchar
>=0) {
1000 reqLength
+=U8_LENGTH(subchar
);
1003 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1004 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1009 const UChar
*pSrcLimit
= pSrc
+srcLength
;
1012 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1015 * Each iteration of the inner loop progresses by at most 3 UTF-8
1016 * bytes and one UChar, for most characters.
1017 * For supplementary code points (4 & 2), which are rare,
1018 * there is an additional adjustment.
1020 count
= (int32_t)((pDestLimit
- pDest
) / 3);
1021 srcLength
= (int32_t)(pSrcLimit
- pSrc
);
1022 if(count
> srcLength
) {
1023 count
= srcLength
; /* min(remaining dest/3, remaining src) */
1027 * Too much overhead if we get near the end of the string,
1028 * continue with the next loop.
1035 *pDest
++ = (char)ch
;
1036 } else if(ch
<= 0x7ff) {
1037 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
1038 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1039 } else if(ch
<= 0xd7ff || ch
>= 0xe000) {
1040 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
1041 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1042 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1043 } else /* ch is a surrogate */ {
1045 * We will read two UChars and probably output four bytes,
1046 * which we didn't account for with computing count,
1047 * so we adjust it here.
1050 --pSrc
; /* undo ch=*pSrc++ for the lead surrogate */
1051 break; /* recompute count */
1054 if(UTF_IS_SURROGATE_FIRST(ch
) && UTF_IS_TRAIL(ch2
=*pSrc
)) {
1056 ch
=UTF16_GET_PAIR_VALUE(ch
, ch2
);
1058 /* writing 4 bytes per 2 UChars is ok */
1059 *pDest
++=(uint8_t)((ch
>>18)|0xf0);
1060 *pDest
++=(uint8_t)(((ch
>>12)&0x3f)|0x80);
1061 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1062 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1064 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1069 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1073 /* convert and append*/
1074 pDest
=_appendUTF8(pDest
, ch
);
1077 } while(--count
> 0);
1080 while(pSrc
<pSrcLimit
) {
1083 if(pDest
<pDestLimit
) {
1084 *pDest
++ = (char)ch
;
1089 } else if(ch
<= 0x7ff) {
1090 if((pDestLimit
- pDest
) >= 2) {
1091 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
1092 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1097 } else if(ch
<= 0xd7ff || ch
>= 0xe000) {
1098 if((pDestLimit
- pDest
) >= 3) {
1099 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
1100 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1101 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1106 } else /* ch is a surrogate */ {
1109 if(UTF_IS_SURROGATE_FIRST(ch
) && pSrc
<pSrcLimit
&& UTF_IS_TRAIL(ch2
=*pSrc
)) {
1111 ch
=UTF16_GET_PAIR_VALUE(ch
, ch2
);
1112 } else if(subchar
>=0) {
1116 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1117 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1121 length
= U8_LENGTH(ch
);
1122 if((pDestLimit
- pDest
) >= length
) {
1123 /* convert and append*/
1124 pDest
=_appendUTF8(pDest
, ch
);
1131 while(pSrc
<pSrcLimit
) {
1135 } else if(ch
<=0x7ff) {
1137 } else if(!UTF_IS_SURROGATE(ch
)) {
1139 } else if(UTF_IS_SURROGATE_FIRST(ch
) && pSrc
<pSrcLimit
&& UTF_IS_TRAIL(ch2
=*pSrc
)) {
1142 } else if(subchar
>=0) {
1143 reqLength
+=U8_LENGTH(subchar
);
1146 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1147 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1153 reqLength
+=(int32_t)(pDest
- (uint8_t *)dest
);
1155 if(pNumSubstitutions
!=NULL
) {
1156 *pNumSubstitutions
=numSubstitutions
;
1160 *pDestLength
= reqLength
;
1163 /* Terminate the buffer */
1164 u_terminateChars((char*)dest
,destCapacity
,reqLength
,pErrorCode
);
1169 U_CAPI
char* U_EXPORT2
1170 u_strToUTF8(char *dest
,
1171 int32_t destCapacity
,
1172 int32_t *pDestLength
,
1175 UErrorCode
*pErrorCode
){
1176 return u_strToUTF8WithSub(
1177 dest
, destCapacity
, pDestLength
,