]>
git.saurik.com Git - apple/icu.git/blob - icuSources/common/ustrtrns.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 ******************************************************************************
6 * Copyright (C) 2001-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 ******************************************************************************
13 * Modification History:
15 * Date Name Description
16 * 9/10/2001 Ram Creation.
17 ******************************************************************************
20 /*******************************************************************************
22 * u_strTo* and u_strFrom* APIs
23 * WCS functions moved to ustr_wcs.c for better modularization
25 *******************************************************************************
29 #include "unicode/putil.h"
30 #include "unicode/ustring.h"
31 #include "unicode/utf.h"
32 #include "unicode/utf8.h"
33 #include "unicode/utf16.h"
39 U_CAPI UChar
* U_EXPORT2
40 u_strFromUTF32WithSub(UChar
*dest
,
45 UChar32 subchar
, int32_t *pNumSubstitutions
,
46 UErrorCode
*pErrorCode
) {
47 const UChar32
*srcLimit
;
52 int32_t numSubstitutions
;
55 if(U_FAILURE(*pErrorCode
)){
58 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
59 (destCapacity
<0) || (dest
== NULL
&& destCapacity
> 0) ||
60 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
62 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
66 if(pNumSubstitutions
!= NULL
) {
67 *pNumSubstitutions
= 0;
71 destLimit
= (dest
!=NULL
)?(dest
+ destCapacity
):NULL
;
76 /* simple loop for conversion of a NUL-terminated BMP string */
77 while((ch
=*src
) != 0 &&
78 ((uint32_t)ch
< 0xd800 || (0xe000 <= ch
&& ch
<= 0xffff))) {
80 if(pDest
< destLimit
) {
88 /* "complicated" case, find the end of the remaining string */
89 while(*++srcLimit
!= 0) {}
92 srcLimit
= (src
!=NULL
)?(src
+ srcLength
):NULL
;
95 /* convert with length */
96 while(src
< srcLimit
) {
99 /* usually "loops" once; twice only for writing subchar */
100 if((uint32_t)ch
< 0xd800 || (0xe000 <= ch
&& ch
<= 0xffff)) {
101 if(pDest
< destLimit
) {
102 *pDest
++ = (UChar
)ch
;
107 } else if(0x10000 <= ch
&& ch
<= 0x10ffff) {
108 if(pDest
!=NULL
&& ((pDest
+ 2) <= destLimit
)) {
109 *pDest
++ = U16_LEAD(ch
);
110 *pDest
++ = U16_TRAIL(ch
);
115 } else if((ch
= subchar
) < 0) {
116 /* surrogate code point, or not a Unicode code point at all */
117 *pErrorCode
= U_INVALID_CHAR_FOUND
;
125 reqLength
+= (int32_t)(pDest
- dest
);
127 *pDestLength
= reqLength
;
129 if(pNumSubstitutions
!= NULL
) {
130 *pNumSubstitutions
= numSubstitutions
;
133 /* Terminate the buffer */
134 u_terminateUChars(dest
, destCapacity
, reqLength
, pErrorCode
);
139 U_CAPI UChar
* U_EXPORT2
140 u_strFromUTF32(UChar
*dest
,
141 int32_t destCapacity
,
142 int32_t *pDestLength
,
145 UErrorCode
*pErrorCode
) {
146 return u_strFromUTF32WithSub(
147 dest
, destCapacity
, pDestLength
,
153 U_CAPI UChar32
* U_EXPORT2
154 u_strToUTF32WithSub(UChar32
*dest
,
155 int32_t destCapacity
,
156 int32_t *pDestLength
,
159 UChar32 subchar
, int32_t *pNumSubstitutions
,
160 UErrorCode
*pErrorCode
) {
161 const UChar
*srcLimit
;
167 int32_t numSubstitutions
;
170 if(U_FAILURE(*pErrorCode
)){
173 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
174 (destCapacity
<0) || (dest
== NULL
&& destCapacity
> 0) ||
175 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
177 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
181 if(pNumSubstitutions
!= NULL
) {
182 *pNumSubstitutions
= 0;
186 destLimit
= (dest
!=NULL
)?(dest
+ destCapacity
):NULL
;
188 numSubstitutions
= 0;
191 /* simple loop for conversion of a NUL-terminated BMP string */
192 while((ch
=*src
) != 0 && !U16_IS_SURROGATE(ch
)) {
194 if(pDest
< destLimit
) {
202 /* "complicated" case, find the end of the remaining string */
203 while(*++srcLimit
!= 0) {}
206 srcLimit
= (src
!=NULL
)?(src
+ srcLength
):NULL
;
209 /* convert with length */
210 while(src
< srcLimit
) {
212 if(!U16_IS_SURROGATE(ch
)) {
213 /* write or count ch below */
214 } else if(U16_IS_SURROGATE_LEAD(ch
) && src
< srcLimit
&& U16_IS_TRAIL(ch2
= *src
)) {
216 ch
= U16_GET_SUPPLEMENTARY(ch
, ch2
);
217 } else if((ch
= subchar
) < 0) {
218 /* unpaired surrogate */
219 *pErrorCode
= U_INVALID_CHAR_FOUND
;
224 if(pDest
< destLimit
) {
231 reqLength
+= (int32_t)(pDest
- dest
);
233 *pDestLength
= reqLength
;
235 if(pNumSubstitutions
!= NULL
) {
236 *pNumSubstitutions
= numSubstitutions
;
239 /* Terminate the buffer */
240 u_terminateUChar32s(dest
, destCapacity
, reqLength
, pErrorCode
);
245 U_CAPI UChar32
* U_EXPORT2
246 u_strToUTF32(UChar32
*dest
,
247 int32_t destCapacity
,
248 int32_t *pDestLength
,
251 UErrorCode
*pErrorCode
) {
252 return u_strToUTF32WithSub(
253 dest
, destCapacity
, pDestLength
,
259 /* for utf8_nextCharSafeBodyTerminated() */
261 utf8_minLegal
[4]={ 0, 0x80, 0x800, 0x10000 };
264 * Version of utf8_nextCharSafeBody() with the following differences:
265 * - checks for NUL termination instead of length
266 * - works with pointers instead of indexes
267 * - always strict (strict==-1)
269 * *ps points to after the lead byte and will be moved to after the last trail byte.
270 * c is the lead byte.
271 * @return the code point, or U_SENTINEL
274 utf8_nextCharSafeBodyTerminated(const uint8_t **ps
, UChar32 c
) {
275 const uint8_t *s
=*ps
;
276 uint8_t trail
, illegal
=0;
277 uint8_t count
=U8_COUNT_TRAIL_BYTES(c
);
279 U8_MASK_LEAD_BYTE((c
), count
);
280 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
282 /* each branch falls through to the next one */
285 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
289 trail
=(uint8_t)(*s
++ - 0x80);
291 if(trail
>0x3f || c
>=0x110) {
292 /* not a trail byte, or code point>0x10ffff (outside Unicode) */
298 trail
=(uint8_t)(*s
++ - 0x80);
300 /* not a trail byte */
307 trail
=(uint8_t)(*s
++ - 0x80);
309 /* not a trail byte */
316 /* no default branch to optimize switch() - all values are covered */
319 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
320 /* illegal is also set if count>=4 */
321 if(illegal
|| c
<utf8_minLegal
[count
] || U_IS_SURROGATE(c
)) {
323 /* don't go beyond this sequence */
325 while(count
>0 && U8_IS_TRAIL(*s
)) {
336 * Version of utf8_nextCharSafeBody() with the following differences:
337 * - works with pointers instead of indexes
338 * - always strict (strict==-1)
340 * *ps points to after the lead byte and will be moved to after the last trail byte.
341 * c is the lead byte.
342 * @return the code point, or U_SENTINEL
345 utf8_nextCharSafeBodyPointer(const uint8_t **ps
, const uint8_t *limit
, UChar32 c
) {
346 const uint8_t *s
=*ps
;
347 uint8_t trail
, illegal
=0;
348 uint8_t count
=U8_COUNT_TRAIL_BYTES(c
);
349 if((limit
-s
)>=count
) {
350 U8_MASK_LEAD_BYTE((c
), count
);
351 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
353 /* each branch falls through to the next one */
356 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
361 c
=(c
<<6)|(trail
&0x3f);
363 illegal
|=(trail
&0xc0)^0x80;
365 /* code point>0x10ffff, outside Unicode */
372 c
=(c
<<6)|(trail
&0x3f);
373 illegal
|=(trail
&0xc0)^0x80;
377 c
=(c
<<6)|(trail
&0x3f);
378 illegal
|=(trail
&0xc0)^0x80;
382 /* no default branch to optimize switch() - all values are covered */
385 illegal
=1; /* too few bytes left */
388 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
389 /* illegal is also set if count>=4 */
390 U_ASSERT(illegal
|| count
<UPRV_LENGTHOF(utf8_minLegal
));
391 if(illegal
|| c
<utf8_minLegal
[count
] || U_IS_SURROGATE(c
)) {
393 /* don't go beyond this sequence */
395 while(count
>0 && s
<limit
&& U8_IS_TRAIL(*s
)) {
405 U_CAPI UChar
* U_EXPORT2
406 u_strFromUTF8WithSub(UChar
*dest
,
407 int32_t destCapacity
,
408 int32_t *pDestLength
,
411 UChar32 subchar
, int32_t *pNumSubstitutions
,
412 UErrorCode
*pErrorCode
){
414 UChar
*pDestLimit
= dest
+destCapacity
;
416 int32_t reqLength
= 0;
417 const uint8_t* pSrc
= (const uint8_t*) src
;
418 uint8_t t1
, t2
; /* trail bytes */
419 int32_t numSubstitutions
;
422 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)){
426 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
427 (destCapacity
<0) || (dest
== NULL
&& destCapacity
> 0) ||
428 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
430 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
434 if(pNumSubstitutions
!=NULL
) {
435 *pNumSubstitutions
=0;
440 * Inline processing of UTF-8 byte sequences:
442 * Byte sequences for the most common characters are handled inline in
443 * the conversion loops. In order to reduce the path lengths for those
444 * characters, the tests are arranged in a kind of binary search.
445 * ASCII (<=0x7f) is checked first, followed by the dividing point
446 * between 2- and 3-byte sequences (0xe0).
447 * The 3-byte branch is tested first to speed up CJK text.
448 * The compiler should combine the subtractions for the two tests for 0xe0.
449 * Each branch then tests for the other end of its range.
454 * Transform a NUL-terminated string.
455 * The code explicitly checks for NULs only in the lead byte position.
456 * A NUL byte in the trail byte position fails the trail byte range check anyway.
458 while(((ch
= *pSrc
) != 0) && (pDest
< pDestLimit
)) {
464 if( /* handle U+1000..U+CFFF inline */
466 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f &&
467 (t2
= (uint8_t)(pSrc
[2] - 0x80)) <= 0x3f
469 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
470 *pDest
++ = (UChar
)((ch
<< 12) | (t1
<< 6) | t2
);
474 } else if(ch
< 0xe0) {
475 if( /* handle U+0080..U+07FF inline */
477 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f
479 *pDest
++ = (UChar
)(((ch
& 0x1f) << 6) | t1
);
485 /* function call for "complicated" and error cases */
486 ++pSrc
; /* continue after the lead byte */
487 ch
=utf8_nextCharSafeBodyTerminated(&pSrc
, ch
);
488 if(ch
<0 && (++numSubstitutions
, ch
= subchar
) < 0) {
489 *pErrorCode
= U_INVALID_CHAR_FOUND
;
491 } else if(ch
<=0xFFFF) {
492 *(pDest
++)=(UChar
)ch
;
494 *(pDest
++)=U16_LEAD(ch
);
495 if(pDest
<pDestLimit
) {
496 *(pDest
++)=U16_TRAIL(ch
);
505 /* Pre-flight the rest of the string. */
506 while((ch
= *pSrc
) != 0) {
512 if( /* handle U+1000..U+CFFF inline */
514 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f &&
515 (uint8_t)(pSrc
[2] - 0x80) <= 0x3f
521 } else if(ch
< 0xe0) {
522 if( /* handle U+0080..U+07FF inline */
524 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f
532 /* function call for "complicated" and error cases */
533 ++pSrc
; /* continue after the lead byte */
534 ch
=utf8_nextCharSafeBodyTerminated(&pSrc
, ch
);
535 if(ch
<0 && (++numSubstitutions
, ch
= subchar
) < 0) {
536 *pErrorCode
= U_INVALID_CHAR_FOUND
;
539 reqLength
+= U16_LENGTH(ch
);
542 } else /* srcLength >= 0 */ {
543 const uint8_t *pSrcLimit
= pSrc
+ srcLength
;
546 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
549 * Each iteration of the inner loop progresses by at most 3 UTF-8
550 * bytes and one UChar, for most characters.
551 * For supplementary code points (4 & 2), which are rare,
552 * there is an additional adjustment.
554 count
= (int32_t)(pDestLimit
- pDest
);
555 srcLength
= (int32_t)((pSrcLimit
- pSrc
) / 3);
556 if(count
> srcLength
) {
557 count
= srcLength
; /* min(remaining dest, remaining src/3) */
561 * Too much overhead if we get near the end of the string,
562 * continue with the next loop.
574 if( /* handle U+1000..U+CFFF inline */
576 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f &&
577 (t2
= (uint8_t)(pSrc
[2] - 0x80)) <= 0x3f
579 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
580 *pDest
++ = (UChar
)((ch
<< 12) | (t1
<< 6) | t2
);
584 } else if(ch
< 0xe0) {
585 if( /* handle U+0080..U+07FF inline */
587 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f
589 *pDest
++ = (UChar
)(((ch
& 0x1f) << 6) | t1
);
595 if(ch
>= 0xf0 || subchar
> 0xffff) {
597 * We may read up to six bytes and write up to two UChars,
598 * which we didn't account for with computing count,
599 * so we adjust it here.
606 /* function call for "complicated" and error cases */
607 ++pSrc
; /* continue after the lead byte */
608 ch
=utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
609 if(ch
<0 && (++numSubstitutions
, ch
= subchar
) < 0){
610 *pErrorCode
= U_INVALID_CHAR_FOUND
;
612 }else if(ch
<=0xFFFF){
613 *(pDest
++)=(UChar
)ch
;
615 *(pDest
++)=U16_LEAD(ch
);
616 *(pDest
++)=U16_TRAIL(ch
);
619 } while(--count
> 0);
622 while((pSrc
<pSrcLimit
) && (pDest
<pDestLimit
)) {
629 if( /* handle U+1000..U+CFFF inline */
631 ((pSrcLimit
- pSrc
) >= 3) &&
632 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f &&
633 (t2
= (uint8_t)(pSrc
[2] - 0x80)) <= 0x3f
635 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
636 *pDest
++ = (UChar
)((ch
<< 12) | (t1
<< 6) | t2
);
640 } else if(ch
< 0xe0) {
641 if( /* handle U+0080..U+07FF inline */
643 ((pSrcLimit
- pSrc
) >= 2) &&
644 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f
646 *pDest
++ = (UChar
)(((ch
& 0x1f) << 6) | t1
);
652 /* function call for "complicated" and error cases */
653 ++pSrc
; /* continue after the lead byte */
654 ch
=utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
655 if(ch
<0 && (++numSubstitutions
, ch
= subchar
) < 0){
656 *pErrorCode
= U_INVALID_CHAR_FOUND
;
658 }else if(ch
<=0xFFFF){
659 *(pDest
++)=(UChar
)ch
;
661 *(pDest
++)=U16_LEAD(ch
);
662 if(pDest
<pDestLimit
){
663 *(pDest
++)=U16_TRAIL(ch
);
671 /* do not fill the dest buffer just count the UChars needed */
672 while(pSrc
< pSrcLimit
){
679 if( /* handle U+1000..U+CFFF inline */
681 ((pSrcLimit
- pSrc
) >= 3) &&
682 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f &&
683 (uint8_t)(pSrc
[2] - 0x80) <= 0x3f
689 } else if(ch
< 0xe0) {
690 if( /* handle U+0080..U+07FF inline */
692 ((pSrcLimit
- pSrc
) >= 2) &&
693 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f
701 /* function call for "complicated" and error cases */
702 ++pSrc
; /* continue after the lead byte */
703 ch
=utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
704 if(ch
<0 && (++numSubstitutions
, ch
= subchar
) < 0){
705 *pErrorCode
= U_INVALID_CHAR_FOUND
;
708 reqLength
+=U16_LENGTH(ch
);
713 reqLength
+=(int32_t)(pDest
- dest
);
715 if(pNumSubstitutions
!=NULL
) {
716 *pNumSubstitutions
=numSubstitutions
;
720 *pDestLength
= reqLength
;
723 /* Terminate the buffer */
724 u_terminateUChars(dest
,destCapacity
,reqLength
,pErrorCode
);
729 U_CAPI UChar
* U_EXPORT2
730 u_strFromUTF8(UChar
*dest
,
731 int32_t destCapacity
,
732 int32_t *pDestLength
,
735 UErrorCode
*pErrorCode
){
736 return u_strFromUTF8WithSub(
737 dest
, destCapacity
, pDestLength
,
743 U_CAPI UChar
* U_EXPORT2
744 u_strFromUTF8Lenient(UChar
*dest
,
745 int32_t destCapacity
,
746 int32_t *pDestLength
,
749 UErrorCode
*pErrorCode
) {
752 int32_t reqLength
= 0;
753 uint8_t* pSrc
= (uint8_t*) src
;
756 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)){
760 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
761 (destCapacity
<0) || (dest
== NULL
&& destCapacity
> 0)
763 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
768 /* Transform a NUL-terminated string. */
769 UChar
*pDestLimit
= (dest
!=NULL
)?(dest
+destCapacity
):NULL
;
770 uint8_t t1
, t2
, t3
; /* trail bytes */
772 while(((ch
= *pSrc
) != 0) && (pDest
< pDestLimit
)) {
775 * ASCII, or a trail byte in lead position which is treated like
776 * a single-byte sequence for better character boundary
777 * resynchronization after illegal sequences.
782 } else if(ch
< 0xe0) { /* U+0080..U+07FF */
783 if((t1
= pSrc
[1]) != 0) {
784 /* 0x3080 = (0xc0 << 6) + 0x80 */
785 *pDest
++ = (UChar
)((ch
<< 6) + t1
- 0x3080);
789 } else if(ch
< 0xf0) { /* U+0800..U+FFFF */
790 if((t1
= pSrc
[1]) != 0 && (t2
= pSrc
[2]) != 0) {
791 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
792 /* 0x2080 = (0x80 << 6) + 0x80 */
793 *pDest
++ = (UChar
)((ch
<< 12) + (t1
<< 6) + t2
- 0x2080);
797 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
798 if((t1
= pSrc
[1]) != 0 && (t2
= pSrc
[2]) != 0 && (t3
= pSrc
[3]) != 0) {
800 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
801 ch
= (ch
<< 18) + (t1
<< 12) + (t2
<< 6) + t3
- 0x3c82080;
802 *(pDest
++) = U16_LEAD(ch
);
803 if(pDest
< pDestLimit
) {
804 *(pDest
++) = U16_TRAIL(ch
);
813 /* truncated character at the end */
815 while(*++pSrc
!= 0) {}
819 /* Pre-flight the rest of the string. */
820 while((ch
= *pSrc
) != 0) {
823 * ASCII, or a trail byte in lead position which is treated like
824 * a single-byte sequence for better character boundary
825 * resynchronization after illegal sequences.
830 } else if(ch
< 0xe0) { /* U+0080..U+07FF */
836 } else if(ch
< 0xf0) { /* U+0800..U+FFFF */
837 if(pSrc
[1] != 0 && pSrc
[2] != 0) {
842 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
843 if(pSrc
[1] != 0 && pSrc
[2] != 0 && pSrc
[3] != 0) {
850 /* truncated character at the end */
854 } else /* srcLength >= 0 */ {
855 const uint8_t *pSrcLimit
= (pSrc
!=NULL
)?(pSrc
+ srcLength
):NULL
;
858 * This function requires that if srcLength is given, then it must be
859 * destCapatity >= srcLength so that we need not check for
860 * destination buffer overflow in the loop.
862 if(destCapacity
< srcLength
) {
863 if(pDestLength
!= NULL
) {
864 *pDestLength
= srcLength
; /* this likely overestimates the true destLength! */
866 *pErrorCode
= U_BUFFER_OVERFLOW_ERROR
;
870 if((pSrcLimit
- pSrc
) >= 4) {
871 pSrcLimit
-= 3; /* temporarily reduce pSrcLimit */
873 /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
878 * ASCII, or a trail byte in lead position which is treated like
879 * a single-byte sequence for better character boundary
880 * resynchronization after illegal sequences.
883 } else if(ch
< 0xe0) { /* U+0080..U+07FF */
884 /* 0x3080 = (0xc0 << 6) + 0x80 */
885 *pDest
++ = (UChar
)((ch
<< 6) + *pSrc
++ - 0x3080);
886 } else if(ch
< 0xf0) { /* U+0800..U+FFFF */
887 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
888 /* 0x2080 = (0x80 << 6) + 0x80 */
889 ch
= (ch
<< 12) + (*pSrc
++ << 6);
890 *pDest
++ = (UChar
)(ch
+ *pSrc
++ - 0x2080);
891 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
892 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
893 ch
= (ch
<< 18) + (*pSrc
++ << 12);
895 ch
+= *pSrc
++ - 0x3c82080;
896 *(pDest
++) = U16_LEAD(ch
);
897 *(pDest
++) = U16_TRAIL(ch
);
899 } while(pSrc
< pSrcLimit
);
901 pSrcLimit
+= 3; /* restore original pSrcLimit */
904 while(pSrc
< pSrcLimit
) {
908 * ASCII, or a trail byte in lead position which is treated like
909 * a single-byte sequence for better character boundary
910 * resynchronization after illegal sequences.
914 } else if(ch
< 0xe0) { /* U+0080..U+07FF */
915 if(pSrc
< pSrcLimit
) {
916 /* 0x3080 = (0xc0 << 6) + 0x80 */
917 *pDest
++ = (UChar
)((ch
<< 6) + *pSrc
++ - 0x3080);
920 } else if(ch
< 0xf0) { /* U+0800..U+FFFF */
921 if((pSrcLimit
- pSrc
) >= 2) {
922 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
923 /* 0x2080 = (0x80 << 6) + 0x80 */
924 ch
= (ch
<< 12) + (*pSrc
++ << 6);
925 *pDest
++ = (UChar
)(ch
+ *pSrc
++ - 0x2080);
929 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
930 if((pSrcLimit
- pSrc
) >= 3) {
931 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
932 ch
= (ch
<< 18) + (*pSrc
++ << 12);
934 ch
+= *pSrc
++ - 0x3c82080;
935 *(pDest
++) = U16_LEAD(ch
);
936 *(pDest
++) = U16_TRAIL(ch
);
942 /* truncated character at the end */
948 reqLength
+=(int32_t)(pDest
- dest
);
951 *pDestLength
= reqLength
;
954 /* Terminate the buffer */
955 u_terminateUChars(dest
,destCapacity
,reqLength
,pErrorCode
);
960 static inline uint8_t *
961 _appendUTF8(uint8_t *pDest
, UChar32 c
) {
962 /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
965 } else if(c
<=0x7ff) {
966 *pDest
++=(uint8_t)((c
>>6)|0xc0);
967 *pDest
++=(uint8_t)((c
&0x3f)|0x80);
968 } else if(c
<=0xffff) {
969 *pDest
++=(uint8_t)((c
>>12)|0xe0);
970 *pDest
++=(uint8_t)(((c
>>6)&0x3f)|0x80);
971 *pDest
++=(uint8_t)(((c
)&0x3f)|0x80);
972 } else /* if((uint32_t)(c)<=0x10ffff) */ {
973 *pDest
++=(uint8_t)(((c
)>>18)|0xf0);
974 *pDest
++=(uint8_t)((((c
)>>12)&0x3f)|0x80);
975 *pDest
++=(uint8_t)((((c
)>>6)&0x3f)|0x80);
976 *pDest
++=(uint8_t)(((c
)&0x3f)|0x80);
982 U_CAPI
char* U_EXPORT2
983 u_strToUTF8WithSub(char *dest
,
984 int32_t destCapacity
,
985 int32_t *pDestLength
,
988 UChar32 subchar
, int32_t *pNumSubstitutions
,
989 UErrorCode
*pErrorCode
){
992 uint8_t *pDest
= (uint8_t *)dest
;
993 uint8_t *pDestLimit
= (pDest
!=NULL
)?(pDest
+ destCapacity
):NULL
;
994 int32_t numSubstitutions
;
997 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)){
1001 if( (pSrc
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
1002 (destCapacity
<0) || (dest
== NULL
&& destCapacity
> 0) ||
1003 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
1005 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
1009 if(pNumSubstitutions
!=NULL
) {
1010 *pNumSubstitutions
=0;
1015 while((ch
=*pSrc
)!=0) {
1018 if(pDest
<pDestLimit
) {
1019 *pDest
++ = (uint8_t)ch
;
1024 } else if(ch
<= 0x7ff) {
1025 if((pDestLimit
- pDest
) >= 2) {
1026 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
1027 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1032 } else if(ch
<= 0xd7ff || ch
>= 0xe000) {
1033 if((pDestLimit
- pDest
) >= 3) {
1034 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
1035 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1036 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1041 } else /* ch is a surrogate */ {
1044 /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
1045 if(U16_IS_SURROGATE_LEAD(ch
) && U16_IS_TRAIL(ch2
=*pSrc
)) {
1047 ch
=U16_GET_SUPPLEMENTARY(ch
, ch2
);
1048 } else if(subchar
>=0) {
1052 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1053 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1057 length
= U8_LENGTH(ch
);
1058 if((pDestLimit
- pDest
) >= length
) {
1059 /* convert and append*/
1060 pDest
=_appendUTF8(pDest
, ch
);
1067 while((ch
=*pSrc
++)!=0) {
1070 } else if(ch
<=0x7ff) {
1072 } else if(!U16_IS_SURROGATE(ch
)) {
1074 } else if(U16_IS_SURROGATE_LEAD(ch
) && U16_IS_TRAIL(ch2
=*pSrc
)) {
1077 } else if(subchar
>=0) {
1078 reqLength
+=U8_LENGTH(subchar
);
1081 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1082 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1087 const UChar
*pSrcLimit
= (pSrc
!=NULL
)?(pSrc
+srcLength
):NULL
;
1090 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1093 * Each iteration of the inner loop progresses by at most 3 UTF-8
1094 * bytes and one UChar, for most characters.
1095 * For supplementary code points (4 & 2), which are rare,
1096 * there is an additional adjustment.
1098 count
= (int32_t)((pDestLimit
- pDest
) / 3);
1099 srcLength
= (int32_t)(pSrcLimit
- pSrc
);
1100 if(count
> srcLength
) {
1101 count
= srcLength
; /* min(remaining dest/3, remaining src) */
1105 * Too much overhead if we get near the end of the string,
1106 * continue with the next loop.
1113 *pDest
++ = (uint8_t)ch
;
1114 } else if(ch
<= 0x7ff) {
1115 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
1116 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1117 } else if(ch
<= 0xd7ff || ch
>= 0xe000) {
1118 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
1119 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1120 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1121 } else /* ch is a surrogate */ {
1123 * We will read two UChars and probably output four bytes,
1124 * which we didn't account for with computing count,
1125 * so we adjust it here.
1128 --pSrc
; /* undo ch=*pSrc++ for the lead surrogate */
1129 break; /* recompute count */
1132 if(U16_IS_SURROGATE_LEAD(ch
) && U16_IS_TRAIL(ch2
=*pSrc
)) {
1134 ch
=U16_GET_SUPPLEMENTARY(ch
, ch2
);
1136 /* writing 4 bytes per 2 UChars is ok */
1137 *pDest
++=(uint8_t)((ch
>>18)|0xf0);
1138 *pDest
++=(uint8_t)(((ch
>>12)&0x3f)|0x80);
1139 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1140 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1142 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1147 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1151 /* convert and append*/
1152 pDest
=_appendUTF8(pDest
, ch
);
1155 } while(--count
> 0);
1158 while(pSrc
<pSrcLimit
) {
1161 if(pDest
<pDestLimit
) {
1162 *pDest
++ = (uint8_t)ch
;
1167 } else if(ch
<= 0x7ff) {
1168 if((pDestLimit
- pDest
) >= 2) {
1169 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
1170 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1175 } else if(ch
<= 0xd7ff || ch
>= 0xe000) {
1176 if((pDestLimit
- pDest
) >= 3) {
1177 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
1178 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1179 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1184 } else /* ch is a surrogate */ {
1187 if(U16_IS_SURROGATE_LEAD(ch
) && pSrc
<pSrcLimit
&& U16_IS_TRAIL(ch2
=*pSrc
)) {
1189 ch
=U16_GET_SUPPLEMENTARY(ch
, ch2
);
1190 } else if(subchar
>=0) {
1194 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1195 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1199 length
= U8_LENGTH(ch
);
1200 if((pDestLimit
- pDest
) >= length
) {
1201 /* convert and append*/
1202 pDest
=_appendUTF8(pDest
, ch
);
1209 while(pSrc
<pSrcLimit
) {
1213 } else if(ch
<=0x7ff) {
1215 } else if(!U16_IS_SURROGATE(ch
)) {
1217 } else if(U16_IS_SURROGATE_LEAD(ch
) && pSrc
<pSrcLimit
&& U16_IS_TRAIL(ch2
=*pSrc
)) {
1220 } else if(subchar
>=0) {
1221 reqLength
+=U8_LENGTH(subchar
);
1224 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1225 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1231 reqLength
+=(int32_t)(pDest
- (uint8_t *)dest
);
1233 if(pNumSubstitutions
!=NULL
) {
1234 *pNumSubstitutions
=numSubstitutions
;
1238 *pDestLength
= reqLength
;
1241 /* Terminate the buffer */
1242 u_terminateChars(dest
, destCapacity
, reqLength
, pErrorCode
);
1246 U_CAPI
char* U_EXPORT2
1247 u_strToUTF8(char *dest
,
1248 int32_t destCapacity
,
1249 int32_t *pDestLength
,
1252 UErrorCode
*pErrorCode
){
1253 return u_strToUTF8WithSub(
1254 dest
, destCapacity
, pDestLength
,
1260 U_CAPI UChar
* U_EXPORT2
1261 u_strFromJavaModifiedUTF8WithSub(
1263 int32_t destCapacity
,
1264 int32_t *pDestLength
,
1267 UChar32 subchar
, int32_t *pNumSubstitutions
,
1268 UErrorCode
*pErrorCode
) {
1269 UChar
*pDest
= dest
;
1270 UChar
*pDestLimit
= dest
+destCapacity
;
1272 int32_t reqLength
= 0;
1273 const uint8_t* pSrc
= (const uint8_t*) src
;
1274 const uint8_t *pSrcLimit
;
1276 uint8_t t1
, t2
; /* trail bytes */
1277 int32_t numSubstitutions
;
1280 if(U_FAILURE(*pErrorCode
)){
1283 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
1284 (dest
==NULL
&& destCapacity
!=0) || destCapacity
<0 ||
1285 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
1287 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
1291 if(pNumSubstitutions
!=NULL
) {
1292 *pNumSubstitutions
=0;
1298 * Transform a NUL-terminated ASCII string.
1299 * Handle non-ASCII strings with slower code.
1301 while(((ch
= *pSrc
) != 0) && ch
<= 0x7f && (pDest
< pDestLimit
)) {
1306 reqLength
=(int32_t)(pDest
- dest
);
1308 *pDestLength
= reqLength
;
1311 /* Terminate the buffer */
1312 u_terminateUChars(dest
, destCapacity
, reqLength
, pErrorCode
);
1315 srcLength
= uprv_strlen((const char *)pSrc
);
1318 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1319 pSrcLimit
= (pSrc
== NULL
) ? NULL
: pSrc
+ srcLength
;
1321 count
= (int32_t)(pDestLimit
- pDest
);
1322 srcLength
= (int32_t)(pSrcLimit
- pSrc
);
1323 if(count
>= srcLength
&& srcLength
> 0 && *pSrc
<= 0x7f) {
1324 /* fast ASCII loop */
1325 const uint8_t *prevSrc
= pSrc
;
1327 while(pSrc
< pSrcLimit
&& (ch
= *pSrc
) <= 0x7f) {
1331 delta
= (int32_t)(pSrc
- prevSrc
);
1336 * Each iteration of the inner loop progresses by at most 3 UTF-8
1337 * bytes and one UChar.
1340 if(count
> srcLength
) {
1341 count
= srcLength
; /* min(remaining dest, remaining src/3) */
1345 * Too much overhead if we get near the end of the string,
1346 * continue with the next loop.
1357 if( /* handle U+0000..U+FFFF inline */
1359 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f &&
1360 (t2
= (uint8_t)(pSrc
[2] - 0x80)) <= 0x3f
1362 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1363 *pDest
++ = (UChar
)((ch
<< 12) | (t1
<< 6) | t2
);
1368 if( /* handle U+0000..U+07FF inline */
1370 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f
1372 *pDest
++ = (UChar
)(((ch
& 0x1f) << 6) | t1
);
1379 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1381 } else if(subchar
> 0xffff && --count
== 0) {
1383 * We need to write two UChars, adjusted count for that,
1384 * and ran out of space.
1388 /* function call for error cases */
1389 ++pSrc
; /* continue after the lead byte */
1390 utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
1392 if(subchar
<=0xFFFF) {
1393 *(pDest
++)=(UChar
)subchar
;
1395 *(pDest
++)=U16_LEAD(subchar
);
1396 *(pDest
++)=U16_TRAIL(subchar
);
1400 } while(--count
> 0);
1403 while((pSrc
<pSrcLimit
) && (pDest
<pDestLimit
)) {
1410 if( /* handle U+0000..U+FFFF inline */
1412 ((pSrcLimit
- pSrc
) >= 3) &&
1413 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f &&
1414 (t2
= (uint8_t)(pSrc
[2] - 0x80)) <= 0x3f
1416 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1417 *pDest
++ = (UChar
)((ch
<< 12) | (t1
<< 6) | t2
);
1422 if( /* handle U+0000..U+07FF inline */
1424 ((pSrcLimit
- pSrc
) >= 2) &&
1425 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f
1427 *pDest
++ = (UChar
)(((ch
& 0x1f) << 6) | t1
);
1434 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1437 /* function call for error cases */
1438 ++pSrc
; /* continue after the lead byte */
1439 utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
1441 if(subchar
<=0xFFFF) {
1442 *(pDest
++)=(UChar
)subchar
;
1444 *(pDest
++)=U16_LEAD(subchar
);
1445 if(pDest
<pDestLimit
) {
1446 *(pDest
++)=U16_TRAIL(subchar
);
1456 /* do not fill the dest buffer just count the UChars needed */
1457 while(pSrc
< pSrcLimit
){
1464 if( /* handle U+0000..U+FFFF inline */
1466 ((pSrcLimit
- pSrc
) >= 3) &&
1467 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f &&
1468 (uint8_t)(pSrc
[2] - 0x80) <= 0x3f
1475 if( /* handle U+0000..U+07FF inline */
1477 ((pSrcLimit
- pSrc
) >= 2) &&
1478 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f
1487 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1490 /* function call for error cases */
1491 ++pSrc
; /* continue after the lead byte */
1492 utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
1494 reqLength
+=U16_LENGTH(ch
);
1499 if(pNumSubstitutions
!=NULL
) {
1500 *pNumSubstitutions
=numSubstitutions
;
1503 reqLength
+=(int32_t)(pDest
- dest
);
1505 *pDestLength
= reqLength
;
1508 /* Terminate the buffer */
1509 u_terminateUChars(dest
, destCapacity
, reqLength
, pErrorCode
);
1513 U_CAPI
char* U_EXPORT2
1514 u_strToJavaModifiedUTF8(
1516 int32_t destCapacity
,
1517 int32_t *pDestLength
,
1520 UErrorCode
*pErrorCode
) {
1521 int32_t reqLength
=0;
1523 uint8_t *pDest
= (uint8_t *)dest
;
1524 uint8_t *pDestLimit
= pDest
+ destCapacity
;
1525 const UChar
*pSrcLimit
;
1529 if(U_FAILURE(*pErrorCode
)){
1532 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
1533 (dest
==NULL
&& destCapacity
!=0) || destCapacity
<0
1535 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
1540 /* Convert NUL-terminated ASCII, then find the string length. */
1541 while((ch
=*src
)<=0x7f && ch
!= 0 && pDest
<pDestLimit
) {
1542 *pDest
++ = (uint8_t)ch
;
1546 reqLength
=(int32_t)(pDest
- (uint8_t *)dest
);
1548 *pDestLength
= reqLength
;
1551 /* Terminate the buffer */
1552 u_terminateChars(dest
, destCapacity
, reqLength
, pErrorCode
);
1555 srcLength
= u_strlen(src
);
1558 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1559 pSrcLimit
= (src
!=NULL
)?(src
+srcLength
):NULL
;
1561 count
= (int32_t)(pDestLimit
- pDest
);
1562 srcLength
= (int32_t)(pSrcLimit
- src
);
1563 if(count
>= srcLength
&& srcLength
> 0 && *src
<= 0x7f) {
1564 /* fast ASCII loop */
1565 const UChar
*prevSrc
= src
;
1567 while(src
< pSrcLimit
&& (ch
= *src
) <= 0x7f && ch
!= 0) {
1568 *pDest
++=(uint8_t)ch
;
1571 delta
= (int32_t)(src
- prevSrc
);
1576 * Each iteration of the inner loop progresses by at most 3 UTF-8
1577 * bytes and one UChar.
1580 if(count
> srcLength
) {
1581 count
= srcLength
; /* min(remaining dest/3, remaining src) */
1585 * Too much overhead if we get near the end of the string,
1586 * continue with the next loop.
1592 if(ch
<= 0x7f && ch
!= 0) {
1593 *pDest
++ = (uint8_t)ch
;
1594 } else if(ch
<= 0x7ff) {
1595 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
1596 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1598 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
1599 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1600 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1602 } while(--count
> 0);
1605 while(src
<pSrcLimit
) {
1607 if(ch
<= 0x7f && ch
!= 0) {
1608 if(pDest
<pDestLimit
) {
1609 *pDest
++ = (uint8_t)ch
;
1614 } else if(ch
<= 0x7ff) {
1615 if((pDestLimit
- pDest
) >= 2) {
1616 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
1617 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1623 if((pDestLimit
- pDest
) >= 3) {
1624 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
1625 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1626 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1633 while(src
<pSrcLimit
) {
1635 if(ch
<= 0x7f && ch
!= 0) {
1637 } else if(ch
<=0x7ff) {
1644 reqLength
+=(int32_t)(pDest
- (uint8_t *)dest
);
1646 *pDestLength
= reqLength
;
1649 /* Terminate the buffer */
1650 u_terminateChars(dest
, destCapacity
, reqLength
, pErrorCode
);