]>
git.saurik.com Git - apple/icu.git/blob - icuSources/common/ustrtrns.cpp
2 ******************************************************************************
4 * Copyright (C) 2001-2011, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
11 * Modification History:
13 * Date Name Description
14 * 9/10/2001 Ram Creation.
15 ******************************************************************************
18 /*******************************************************************************
20 * u_strTo* and u_strFrom* APIs
21 * WCS functions moved to ustr_wcs.c for better modularization
23 *******************************************************************************
27 #include "unicode/putil.h"
28 #include "unicode/ustring.h"
29 #include "unicode/utf.h"
30 #include "unicode/utf8.h"
31 #include "unicode/utf16.h"
37 U_CAPI UChar
* U_EXPORT2
38 u_strFromUTF32WithSub(UChar
*dest
,
43 UChar32 subchar
, int32_t *pNumSubstitutions
,
44 UErrorCode
*pErrorCode
) {
45 const UChar32
*srcLimit
;
50 int32_t numSubstitutions
;
53 if(U_FAILURE(*pErrorCode
)){
56 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
57 (destCapacity
<0) || (dest
== NULL
&& destCapacity
> 0) ||
58 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
60 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
64 if(pNumSubstitutions
!= NULL
) {
65 *pNumSubstitutions
= 0;
69 destLimit
= (dest
!=NULL
)?(dest
+ destCapacity
):NULL
;
74 /* simple loop for conversion of a NUL-terminated BMP string */
75 while((ch
=*src
) != 0 &&
76 ((uint32_t)ch
< 0xd800 || (0xe000 <= ch
&& ch
<= 0xffff))) {
78 if(pDest
< destLimit
) {
86 /* "complicated" case, find the end of the remaining string */
87 while(*++srcLimit
!= 0) {}
90 srcLimit
= (src
!=NULL
)?(src
+ srcLength
):NULL
;
93 /* convert with length */
94 while(src
< srcLimit
) {
97 /* usually "loops" once; twice only for writing subchar */
98 if((uint32_t)ch
< 0xd800 || (0xe000 <= ch
&& ch
<= 0xffff)) {
99 if(pDest
< destLimit
) {
100 *pDest
++ = (UChar
)ch
;
105 } else if(0x10000 <= ch
&& ch
<= 0x10ffff) {
106 if(pDest
!=NULL
&& ((pDest
+ 2) <= destLimit
)) {
107 *pDest
++ = U16_LEAD(ch
);
108 *pDest
++ = U16_TRAIL(ch
);
113 } else if((ch
= subchar
) < 0) {
114 /* surrogate code point, or not a Unicode code point at all */
115 *pErrorCode
= U_INVALID_CHAR_FOUND
;
123 reqLength
+= (int32_t)(pDest
- dest
);
125 *pDestLength
= reqLength
;
127 if(pNumSubstitutions
!= NULL
) {
128 *pNumSubstitutions
= numSubstitutions
;
131 /* Terminate the buffer */
132 u_terminateUChars(dest
, destCapacity
, reqLength
, pErrorCode
);
137 U_CAPI UChar
* U_EXPORT2
138 u_strFromUTF32(UChar
*dest
,
139 int32_t destCapacity
,
140 int32_t *pDestLength
,
143 UErrorCode
*pErrorCode
) {
144 return u_strFromUTF32WithSub(
145 dest
, destCapacity
, pDestLength
,
151 U_CAPI UChar32
* U_EXPORT2
152 u_strToUTF32WithSub(UChar32
*dest
,
153 int32_t destCapacity
,
154 int32_t *pDestLength
,
157 UChar32 subchar
, int32_t *pNumSubstitutions
,
158 UErrorCode
*pErrorCode
) {
159 const UChar
*srcLimit
;
165 int32_t numSubstitutions
;
168 if(U_FAILURE(*pErrorCode
)){
171 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
172 (destCapacity
<0) || (dest
== NULL
&& destCapacity
> 0) ||
173 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
175 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
179 if(pNumSubstitutions
!= NULL
) {
180 *pNumSubstitutions
= 0;
184 destLimit
= (dest
!=NULL
)?(dest
+ destCapacity
):NULL
;
186 numSubstitutions
= 0;
189 /* simple loop for conversion of a NUL-terminated BMP string */
190 while((ch
=*src
) != 0 && !U16_IS_SURROGATE(ch
)) {
192 if(pDest
< destLimit
) {
200 /* "complicated" case, find the end of the remaining string */
201 while(*++srcLimit
!= 0) {}
204 srcLimit
= (src
!=NULL
)?(src
+ srcLength
):NULL
;
207 /* convert with length */
208 while(src
< srcLimit
) {
210 if(!U16_IS_SURROGATE(ch
)) {
211 /* write or count ch below */
212 } else if(U16_IS_SURROGATE_LEAD(ch
) && src
< srcLimit
&& U16_IS_TRAIL(ch2
= *src
)) {
214 ch
= U16_GET_SUPPLEMENTARY(ch
, ch2
);
215 } else if((ch
= subchar
) < 0) {
216 /* unpaired surrogate */
217 *pErrorCode
= U_INVALID_CHAR_FOUND
;
222 if(pDest
< destLimit
) {
229 reqLength
+= (int32_t)(pDest
- dest
);
231 *pDestLength
= reqLength
;
233 if(pNumSubstitutions
!= NULL
) {
234 *pNumSubstitutions
= numSubstitutions
;
237 /* Terminate the buffer */
238 u_terminateUChar32s(dest
, destCapacity
, reqLength
, pErrorCode
);
243 U_CAPI UChar32
* U_EXPORT2
244 u_strToUTF32(UChar32
*dest
,
245 int32_t destCapacity
,
246 int32_t *pDestLength
,
249 UErrorCode
*pErrorCode
) {
250 return u_strToUTF32WithSub(
251 dest
, destCapacity
, pDestLength
,
257 /* for utf8_nextCharSafeBodyTerminated() */
259 utf8_minLegal
[4]={ 0, 0x80, 0x800, 0x10000 };
262 * Version of utf8_nextCharSafeBody() with the following differences:
263 * - checks for NUL termination instead of length
264 * - works with pointers instead of indexes
265 * - always strict (strict==-1)
267 * *ps points to after the lead byte and will be moved to after the last trail byte.
268 * c is the lead byte.
269 * @return the code point, or U_SENTINEL
272 utf8_nextCharSafeBodyTerminated(const uint8_t **ps
, UChar32 c
) {
273 const uint8_t *s
=*ps
;
274 uint8_t trail
, illegal
=0;
275 uint8_t count
=U8_COUNT_TRAIL_BYTES(c
);
277 U8_MASK_LEAD_BYTE((c
), count
);
278 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
280 /* each branch falls through to the next one */
283 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
287 trail
=(uint8_t)(*s
++ - 0x80);
289 if(trail
>0x3f || c
>=0x110) {
290 /* not a trail byte, or code point>0x10ffff (outside Unicode) */
294 case 2: /*fall through*/
295 trail
=(uint8_t)(*s
++ - 0x80);
297 /* not a trail byte */
302 case 1: /*fall through*/
303 trail
=(uint8_t)(*s
++ - 0x80);
305 /* not a trail byte */
312 /* no default branch to optimize switch() - all values are covered */
315 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
316 /* illegal is also set if count>=4 */
317 if(illegal
|| c
<utf8_minLegal
[count
] || U_IS_SURROGATE(c
)) {
319 /* don't go beyond this sequence */
321 while(count
>0 && U8_IS_TRAIL(*s
)) {
332 * Version of utf8_nextCharSafeBody() with the following differences:
333 * - works with pointers instead of indexes
334 * - always strict (strict==-1)
336 * *ps points to after the lead byte and will be moved to after the last trail byte.
337 * c is the lead byte.
338 * @return the code point, or U_SENTINEL
341 utf8_nextCharSafeBodyPointer(const uint8_t **ps
, const uint8_t *limit
, UChar32 c
) {
342 const uint8_t *s
=*ps
;
343 uint8_t trail
, illegal
=0;
344 uint8_t count
=U8_COUNT_TRAIL_BYTES(c
);
345 if((limit
-s
)>=count
) {
346 U8_MASK_LEAD_BYTE((c
), count
);
347 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
349 /* each branch falls through to the next one */
352 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
357 c
=(c
<<6)|(trail
&0x3f);
359 illegal
|=(trail
&0xc0)^0x80;
361 /* code point>0x10ffff, outside Unicode */
365 case 2: /*fall through*/
367 c
=(c
<<6)|(trail
&0x3f);
368 illegal
|=(trail
&0xc0)^0x80;
369 case 1: /*fall through*/
371 c
=(c
<<6)|(trail
&0x3f);
372 illegal
|=(trail
&0xc0)^0x80;
376 /* no default branch to optimize switch() - all values are covered */
379 illegal
=1; /* too few bytes left */
382 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
383 /* illegal is also set if count>=4 */
384 U_ASSERT(count
<sizeof(utf8_minLegal
)/sizeof(utf8_minLegal
[0]));
385 if(illegal
|| c
<utf8_minLegal
[count
] || U_IS_SURROGATE(c
)) {
387 /* don't go beyond this sequence */
389 while(count
>0 && s
<limit
&& U8_IS_TRAIL(*s
)) {
399 U_CAPI UChar
* U_EXPORT2
400 u_strFromUTF8WithSub(UChar
*dest
,
401 int32_t destCapacity
,
402 int32_t *pDestLength
,
405 UChar32 subchar
, int32_t *pNumSubstitutions
,
406 UErrorCode
*pErrorCode
){
408 UChar
*pDestLimit
= dest
+destCapacity
;
410 int32_t reqLength
= 0;
411 const uint8_t* pSrc
= (const uint8_t*) src
;
412 uint8_t t1
, t2
; /* trail bytes */
413 int32_t numSubstitutions
;
416 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)){
420 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
421 (destCapacity
<0) || (dest
== NULL
&& destCapacity
> 0) ||
422 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
424 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
428 if(pNumSubstitutions
!=NULL
) {
429 *pNumSubstitutions
=0;
434 * Inline processing of UTF-8 byte sequences:
436 * Byte sequences for the most common characters are handled inline in
437 * the conversion loops. In order to reduce the path lengths for those
438 * characters, the tests are arranged in a kind of binary search.
439 * ASCII (<=0x7f) is checked first, followed by the dividing point
440 * between 2- and 3-byte sequences (0xe0).
441 * The 3-byte branch is tested first to speed up CJK text.
442 * The compiler should combine the subtractions for the two tests for 0xe0.
443 * Each branch then tests for the other end of its range.
448 * Transform a NUL-terminated string.
449 * The code explicitly checks for NULs only in the lead byte position.
450 * A NUL byte in the trail byte position fails the trail byte range check anyway.
452 while(((ch
= *pSrc
) != 0) && (pDest
< pDestLimit
)) {
458 if( /* handle U+1000..U+CFFF inline */
460 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f &&
461 (t2
= (uint8_t)(pSrc
[2] - 0x80)) <= 0x3f
463 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
464 *pDest
++ = (UChar
)((ch
<< 12) | (t1
<< 6) | t2
);
468 } else if(ch
< 0xe0) {
469 if( /* handle U+0080..U+07FF inline */
471 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f
473 *pDest
++ = (UChar
)(((ch
& 0x1f) << 6) | t1
);
479 /* function call for "complicated" and error cases */
480 ++pSrc
; /* continue after the lead byte */
481 ch
=utf8_nextCharSafeBodyTerminated(&pSrc
, ch
);
482 if(ch
<0 && (++numSubstitutions
, ch
= subchar
) < 0) {
483 *pErrorCode
= U_INVALID_CHAR_FOUND
;
485 } else if(ch
<=0xFFFF) {
486 *(pDest
++)=(UChar
)ch
;
488 *(pDest
++)=U16_LEAD(ch
);
489 if(pDest
<pDestLimit
) {
490 *(pDest
++)=U16_TRAIL(ch
);
499 /* Pre-flight the rest of the string. */
500 while((ch
= *pSrc
) != 0) {
506 if( /* handle U+1000..U+CFFF inline */
508 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f &&
509 (uint8_t)(pSrc
[2] - 0x80) <= 0x3f
515 } else if(ch
< 0xe0) {
516 if( /* handle U+0080..U+07FF inline */
518 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f
526 /* function call for "complicated" and error cases */
527 ++pSrc
; /* continue after the lead byte */
528 ch
=utf8_nextCharSafeBodyTerminated(&pSrc
, ch
);
529 if(ch
<0 && (++numSubstitutions
, ch
= subchar
) < 0) {
530 *pErrorCode
= U_INVALID_CHAR_FOUND
;
533 reqLength
+= U16_LENGTH(ch
);
536 } else /* srcLength >= 0 */ {
537 const uint8_t *pSrcLimit
= pSrc
+ srcLength
;
540 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
543 * Each iteration of the inner loop progresses by at most 3 UTF-8
544 * bytes and one UChar, for most characters.
545 * For supplementary code points (4 & 2), which are rare,
546 * there is an additional adjustment.
548 count
= (int32_t)(pDestLimit
- pDest
);
549 srcLength
= (int32_t)((pSrcLimit
- pSrc
) / 3);
550 if(count
> srcLength
) {
551 count
= srcLength
; /* min(remaining dest, remaining src/3) */
555 * Too much overhead if we get near the end of the string,
556 * continue with the next loop.
568 if( /* handle U+1000..U+CFFF inline */
570 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f &&
571 (t2
= (uint8_t)(pSrc
[2] - 0x80)) <= 0x3f
573 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
574 *pDest
++ = (UChar
)((ch
<< 12) | (t1
<< 6) | t2
);
578 } else if(ch
< 0xe0) {
579 if( /* handle U+0080..U+07FF inline */
581 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f
583 *pDest
++ = (UChar
)(((ch
& 0x1f) << 6) | t1
);
589 if(ch
>= 0xf0 || subchar
> 0xffff) {
591 * We may read up to six bytes and write up to two UChars,
592 * which we didn't account for with computing count,
593 * so we adjust it here.
600 /* function call for "complicated" and error cases */
601 ++pSrc
; /* continue after the lead byte */
602 ch
=utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
603 if(ch
<0 && (++numSubstitutions
, ch
= subchar
) < 0){
604 *pErrorCode
= U_INVALID_CHAR_FOUND
;
606 }else if(ch
<=0xFFFF){
607 *(pDest
++)=(UChar
)ch
;
609 *(pDest
++)=U16_LEAD(ch
);
610 *(pDest
++)=U16_TRAIL(ch
);
613 } while(--count
> 0);
616 while((pSrc
<pSrcLimit
) && (pDest
<pDestLimit
)) {
623 if( /* handle U+1000..U+CFFF inline */
625 ((pSrcLimit
- pSrc
) >= 3) &&
626 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f &&
627 (t2
= (uint8_t)(pSrc
[2] - 0x80)) <= 0x3f
629 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
630 *pDest
++ = (UChar
)((ch
<< 12) | (t1
<< 6) | t2
);
634 } else if(ch
< 0xe0) {
635 if( /* handle U+0080..U+07FF inline */
637 ((pSrcLimit
- pSrc
) >= 2) &&
638 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f
640 *pDest
++ = (UChar
)(((ch
& 0x1f) << 6) | t1
);
646 /* function call for "complicated" and error cases */
647 ++pSrc
; /* continue after the lead byte */
648 ch
=utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
649 if(ch
<0 && (++numSubstitutions
, ch
= subchar
) < 0){
650 *pErrorCode
= U_INVALID_CHAR_FOUND
;
652 }else if(ch
<=0xFFFF){
653 *(pDest
++)=(UChar
)ch
;
655 *(pDest
++)=U16_LEAD(ch
);
656 if(pDest
<pDestLimit
){
657 *(pDest
++)=U16_TRAIL(ch
);
665 /* do not fill the dest buffer; just count the UChars needed */
666 while(pSrc
< pSrcLimit
){
673 if( /* handle U+1000..U+CFFF inline */
675 ((pSrcLimit
- pSrc
) >= 3) &&
676 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f &&
677 (uint8_t)(pSrc
[2] - 0x80) <= 0x3f
683 } else if(ch
< 0xe0) {
684 if( /* handle U+0080..U+07FF inline */
686 ((pSrcLimit
- pSrc
) >= 2) &&
687 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f
695 /* function call for "complicated" and error cases */
696 ++pSrc
; /* continue after the lead byte */
697 ch
=utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
698 if(ch
<0 && (++numSubstitutions
, ch
= subchar
) < 0){
699 *pErrorCode
= U_INVALID_CHAR_FOUND
;
702 reqLength
+=U16_LENGTH(ch
);
707 reqLength
+=(int32_t)(pDest
- dest
);
709 if(pNumSubstitutions
!=NULL
) {
710 *pNumSubstitutions
=numSubstitutions
;
714 *pDestLength
= reqLength
;
717 /* Terminate the buffer */
718 u_terminateUChars(dest
,destCapacity
,reqLength
,pErrorCode
);
723 U_CAPI UChar
* U_EXPORT2
724 u_strFromUTF8(UChar
*dest
,
725 int32_t destCapacity
,
726 int32_t *pDestLength
,
729 UErrorCode
*pErrorCode
){
730 return u_strFromUTF8WithSub(
731 dest
, destCapacity
, pDestLength
,
737 U_CAPI UChar
* U_EXPORT2
738 u_strFromUTF8Lenient(UChar
*dest
,
739 int32_t destCapacity
,
740 int32_t *pDestLength
,
743 UErrorCode
*pErrorCode
) {
746 int32_t reqLength
= 0;
747 uint8_t* pSrc
= (uint8_t*) src
;
750 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)){
754 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
755 (destCapacity
<0) || (dest
== NULL
&& destCapacity
> 0)
757 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
762 /* Transform a NUL-terminated string. */
763 UChar
*pDestLimit
= (dest
!=NULL
)?(dest
+destCapacity
):NULL
;
764 uint8_t t1
, t2
, t3
; /* trail bytes */
766 while(((ch
= *pSrc
) != 0) && (pDest
< pDestLimit
)) {
769 * ASCII, or a trail byte in lead position which is treated like
770 * a single-byte sequence for better character boundary
771 * resynchronization after illegal sequences.
776 } else if(ch
< 0xe0) { /* U+0080..U+07FF */
777 if((t1
= pSrc
[1]) != 0) {
778 /* 0x3080 = (0xc0 << 6) + 0x80 */
779 *pDest
++ = (UChar
)((ch
<< 6) + t1
- 0x3080);
783 } else if(ch
< 0xf0) { /* U+0800..U+FFFF */
784 if((t1
= pSrc
[1]) != 0 && (t2
= pSrc
[2]) != 0) {
785 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
786 /* 0x2080 = (0x80 << 6) + 0x80 */
787 *pDest
++ = (UChar
)((ch
<< 12) + (t1
<< 6) + t2
- 0x2080);
791 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
792 if((t1
= pSrc
[1]) != 0 && (t2
= pSrc
[2]) != 0 && (t3
= pSrc
[3]) != 0) {
794 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
795 ch
= (ch
<< 18) + (t1
<< 12) + (t2
<< 6) + t3
- 0x3c82080;
796 *(pDest
++) = U16_LEAD(ch
);
797 if(pDest
< pDestLimit
) {
798 *(pDest
++) = U16_TRAIL(ch
);
807 /* truncated character at the end */
809 while(*++pSrc
!= 0) {}
813 /* Pre-flight the rest of the string. */
814 while((ch
= *pSrc
) != 0) {
817 * ASCII, or a trail byte in lead position which is treated like
818 * a single-byte sequence for better character boundary
819 * resynchronization after illegal sequences.
824 } else if(ch
< 0xe0) { /* U+0080..U+07FF */
830 } else if(ch
< 0xf0) { /* U+0800..U+FFFF */
831 if(pSrc
[1] != 0 && pSrc
[2] != 0) {
836 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
837 if(pSrc
[1] != 0 && pSrc
[2] != 0 && pSrc
[3] != 0) {
844 /* truncated character at the end */
848 } else /* srcLength >= 0 */ {
849 const uint8_t *pSrcLimit
= (pSrc
!=NULL
)?(pSrc
+ srcLength
):NULL
;
852 * This function requires that if srcLength is given, then it must be
853 * destCapacity >= srcLength so that we need not check for
854 * destination buffer overflow in the loop.
856 if(destCapacity
< srcLength
) {
857 if(pDestLength
!= NULL
) {
858 *pDestLength
= srcLength
; /* this likely overestimates the true destLength! */
860 *pErrorCode
= U_BUFFER_OVERFLOW_ERROR
;
864 if((pSrcLimit
- pSrc
) >= 4) {
865 pSrcLimit
-= 3; /* temporarily reduce pSrcLimit */
867 /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
872 * ASCII, or a trail byte in lead position which is treated like
873 * a single-byte sequence for better character boundary
874 * resynchronization after illegal sequences.
877 } else if(ch
< 0xe0) { /* U+0080..U+07FF */
878 /* 0x3080 = (0xc0 << 6) + 0x80 */
879 *pDest
++ = (UChar
)((ch
<< 6) + *pSrc
++ - 0x3080);
880 } else if(ch
< 0xf0) { /* U+0800..U+FFFF */
881 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
882 /* 0x2080 = (0x80 << 6) + 0x80 */
883 ch
= (ch
<< 12) + (*pSrc
++ << 6);
884 *pDest
++ = (UChar
)(ch
+ *pSrc
++ - 0x2080);
885 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
886 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
887 ch
= (ch
<< 18) + (*pSrc
++ << 12);
889 ch
+= *pSrc
++ - 0x3c82080;
890 *(pDest
++) = U16_LEAD(ch
);
891 *(pDest
++) = U16_TRAIL(ch
);
893 } while(pSrc
< pSrcLimit
);
895 pSrcLimit
+= 3; /* restore original pSrcLimit */
898 while(pSrc
< pSrcLimit
) {
902 * ASCII, or a trail byte in lead position which is treated like
903 * a single-byte sequence for better character boundary
904 * resynchronization after illegal sequences.
908 } else if(ch
< 0xe0) { /* U+0080..U+07FF */
909 if(pSrc
< pSrcLimit
) {
910 /* 0x3080 = (0xc0 << 6) + 0x80 */
911 *pDest
++ = (UChar
)((ch
<< 6) + *pSrc
++ - 0x3080);
914 } else if(ch
< 0xf0) { /* U+0800..U+FFFF */
915 if((pSrcLimit
- pSrc
) >= 2) {
916 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
917 /* 0x2080 = (0x80 << 6) + 0x80 */
918 ch
= (ch
<< 12) + (*pSrc
++ << 6);
919 *pDest
++ = (UChar
)(ch
+ *pSrc
++ - 0x2080);
923 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
924 if((pSrcLimit
- pSrc
) >= 3) {
925 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
926 ch
= (ch
<< 18) + (*pSrc
++ << 12);
928 ch
+= *pSrc
++ - 0x3c82080;
929 *(pDest
++) = U16_LEAD(ch
);
930 *(pDest
++) = U16_TRAIL(ch
);
936 /* truncated character at the end */
942 reqLength
+=(int32_t)(pDest
- dest
);
945 *pDestLength
= reqLength
;
948 /* Terminate the buffer */
949 u_terminateUChars(dest
,destCapacity
,reqLength
,pErrorCode
);
954 static inline uint8_t *
955 _appendUTF8(uint8_t *pDest
, UChar32 c
) {
956 /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
959 } else if(c
<=0x7ff) {
960 *pDest
++=(uint8_t)((c
>>6)|0xc0);
961 *pDest
++=(uint8_t)((c
&0x3f)|0x80);
962 } else if(c
<=0xffff) {
963 *pDest
++=(uint8_t)((c
>>12)|0xe0);
964 *pDest
++=(uint8_t)(((c
>>6)&0x3f)|0x80);
965 *pDest
++=(uint8_t)(((c
)&0x3f)|0x80);
966 } else /* if((uint32_t)(c)<=0x10ffff) */ {
967 *pDest
++=(uint8_t)(((c
)>>18)|0xf0);
968 *pDest
++=(uint8_t)((((c
)>>12)&0x3f)|0x80);
969 *pDest
++=(uint8_t)((((c
)>>6)&0x3f)|0x80);
970 *pDest
++=(uint8_t)(((c
)&0x3f)|0x80);
976 U_CAPI
char* U_EXPORT2
977 u_strToUTF8WithSub(char *dest
,
978 int32_t destCapacity
,
979 int32_t *pDestLength
,
982 UChar32 subchar
, int32_t *pNumSubstitutions
,
983 UErrorCode
*pErrorCode
){
986 uint8_t *pDest
= (uint8_t *)dest
;
987 uint8_t *pDestLimit
= (pDest
!=NULL
)?(pDest
+ destCapacity
):NULL
;
988 int32_t numSubstitutions
;
991 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)){
995 if( (pSrc
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
996 (destCapacity
<0) || (dest
== NULL
&& destCapacity
> 0) ||
997 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
999 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
1003 if(pNumSubstitutions
!=NULL
) {
1004 *pNumSubstitutions
=0;
1009 while((ch
=*pSrc
)!=0) {
1012 if(pDest
<pDestLimit
) {
1013 *pDest
++ = (uint8_t)ch
;
1018 } else if(ch
<= 0x7ff) {
1019 if((pDestLimit
- pDest
) >= 2) {
1020 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
1021 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1026 } else if(ch
<= 0xd7ff || ch
>= 0xe000) {
1027 if((pDestLimit
- pDest
) >= 3) {
1028 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
1029 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1030 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1035 } else /* ch is a surrogate */ {
1038 /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
1039 if(U16_IS_SURROGATE_LEAD(ch
) && U16_IS_TRAIL(ch2
=*pSrc
)) {
1041 ch
=U16_GET_SUPPLEMENTARY(ch
, ch2
);
1042 } else if(subchar
>=0) {
1046 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1047 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1051 length
= U8_LENGTH(ch
);
1052 if((pDestLimit
- pDest
) >= length
) {
1053 /* convert and append*/
1054 pDest
=_appendUTF8(pDest
, ch
);
1061 while((ch
=*pSrc
++)!=0) {
1064 } else if(ch
<=0x7ff) {
1066 } else if(!U16_IS_SURROGATE(ch
)) {
1068 } else if(U16_IS_SURROGATE_LEAD(ch
) && U16_IS_TRAIL(ch2
=*pSrc
)) {
1071 } else if(subchar
>=0) {
1072 reqLength
+=U8_LENGTH(subchar
);
1075 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1076 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1081 const UChar
*pSrcLimit
= (pSrc
!=NULL
)?(pSrc
+srcLength
):NULL
;
1084 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1087 * Each iteration of the inner loop progresses by at most 3 UTF-8
1088 * bytes and one UChar, for most characters.
1089 * For supplementary code points (4 & 2), which are rare,
1090 * there is an additional adjustment.
1092 count
= (int32_t)((pDestLimit
- pDest
) / 3);
1093 srcLength
= (int32_t)(pSrcLimit
- pSrc
);
1094 if(count
> srcLength
) {
1095 count
= srcLength
; /* min(remaining dest/3, remaining src) */
1099 * Too much overhead if we get near the end of the string,
1100 * continue with the next loop.
1107 *pDest
++ = (uint8_t)ch
;
1108 } else if(ch
<= 0x7ff) {
1109 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
1110 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1111 } else if(ch
<= 0xd7ff || ch
>= 0xe000) {
1112 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
1113 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1114 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1115 } else /* ch is a surrogate */ {
1117 * We will read two UChars and probably output four bytes,
1118 * which we didn't account for with computing count,
1119 * so we adjust it here.
1122 --pSrc
; /* undo ch=*pSrc++ for the lead surrogate */
1123 break; /* recompute count */
1126 if(U16_IS_SURROGATE_LEAD(ch
) && U16_IS_TRAIL(ch2
=*pSrc
)) {
1128 ch
=U16_GET_SUPPLEMENTARY(ch
, ch2
);
1130 /* writing 4 bytes per 2 UChars is ok */
1131 *pDest
++=(uint8_t)((ch
>>18)|0xf0);
1132 *pDest
++=(uint8_t)(((ch
>>12)&0x3f)|0x80);
1133 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1134 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1136 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1141 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1145 /* convert and append*/
1146 pDest
=_appendUTF8(pDest
, ch
);
1149 } while(--count
> 0);
1152 while(pSrc
<pSrcLimit
) {
1155 if(pDest
<pDestLimit
) {
1156 *pDest
++ = (uint8_t)ch
;
1161 } else if(ch
<= 0x7ff) {
1162 if((pDestLimit
- pDest
) >= 2) {
1163 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
1164 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1169 } else if(ch
<= 0xd7ff || ch
>= 0xe000) {
1170 if((pDestLimit
- pDest
) >= 3) {
1171 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
1172 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1173 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1178 } else /* ch is a surrogate */ {
1181 if(U16_IS_SURROGATE_LEAD(ch
) && pSrc
<pSrcLimit
&& U16_IS_TRAIL(ch2
=*pSrc
)) {
1183 ch
=U16_GET_SUPPLEMENTARY(ch
, ch2
);
1184 } else if(subchar
>=0) {
1188 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1189 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1193 length
= U8_LENGTH(ch
);
1194 if((pDestLimit
- pDest
) >= length
) {
1195 /* convert and append*/
1196 pDest
=_appendUTF8(pDest
, ch
);
1203 while(pSrc
<pSrcLimit
) {
1207 } else if(ch
<=0x7ff) {
1209 } else if(!U16_IS_SURROGATE(ch
)) {
1211 } else if(U16_IS_SURROGATE_LEAD(ch
) && pSrc
<pSrcLimit
&& U16_IS_TRAIL(ch2
=*pSrc
)) {
1214 } else if(subchar
>=0) {
1215 reqLength
+=U8_LENGTH(subchar
);
1218 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1219 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1225 reqLength
+=(int32_t)(pDest
- (uint8_t *)dest
);
1227 if(pNumSubstitutions
!=NULL
) {
1228 *pNumSubstitutions
=numSubstitutions
;
1232 *pDestLength
= reqLength
;
1235 /* Terminate the buffer */
1236 u_terminateChars(dest
, destCapacity
, reqLength
, pErrorCode
);
1240 U_CAPI
char* U_EXPORT2
1241 u_strToUTF8(char *dest
,
1242 int32_t destCapacity
,
1243 int32_t *pDestLength
,
1246 UErrorCode
*pErrorCode
){
1247 return u_strToUTF8WithSub(
1248 dest
, destCapacity
, pDestLength
,
1254 U_CAPI UChar
* U_EXPORT2
1255 u_strFromJavaModifiedUTF8WithSub(
1257 int32_t destCapacity
,
1258 int32_t *pDestLength
,
1261 UChar32 subchar
, int32_t *pNumSubstitutions
,
1262 UErrorCode
*pErrorCode
) {
1263 UChar
*pDest
= dest
;
1264 UChar
*pDestLimit
= dest
+destCapacity
;
1266 int32_t reqLength
= 0;
1267 const uint8_t* pSrc
= (const uint8_t*) src
;
1268 const uint8_t *pSrcLimit
;
1270 uint8_t t1
, t2
; /* trail bytes */
1271 int32_t numSubstitutions
;
1274 if(U_FAILURE(*pErrorCode
)){
1277 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
1278 (dest
==NULL
&& destCapacity
!=0) || destCapacity
<0 ||
1279 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
1281 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
1285 if(pNumSubstitutions
!=NULL
) {
1286 *pNumSubstitutions
=0;
1292 * Transform a NUL-terminated ASCII string.
1293 * Handle non-ASCII strings with slower code.
1295 while(((ch
= *pSrc
) != 0) && ch
<= 0x7f && (pDest
< pDestLimit
)) {
1300 reqLength
=(int32_t)(pDest
- dest
);
1302 *pDestLength
= reqLength
;
1305 /* Terminate the buffer */
1306 u_terminateUChars(dest
, destCapacity
, reqLength
, pErrorCode
);
1309 srcLength
= uprv_strlen((const char *)pSrc
);
1312 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1313 pSrcLimit
= pSrc
+ srcLength
;
1315 count
= (int32_t)(pDestLimit
- pDest
);
1316 srcLength
= (int32_t)(pSrcLimit
- pSrc
);
1317 if(count
>= srcLength
&& srcLength
> 0 && *pSrc
<= 0x7f) {
1318 /* fast ASCII loop */
1319 const uint8_t *prevSrc
= pSrc
;
1321 while(pSrc
< pSrcLimit
&& (ch
= *pSrc
) <= 0x7f) {
1325 delta
= (int32_t)(pSrc
- prevSrc
);
1330 * Each iteration of the inner loop progresses by at most 3 UTF-8
1331 * bytes and one UChar.
1334 if(count
> srcLength
) {
1335 count
= srcLength
; /* min(remaining dest, remaining src/3) */
1339 * Too much overhead if we get near the end of the string,
1340 * continue with the next loop.
1351 if( /* handle U+0000..U+FFFF inline */
1353 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f &&
1354 (t2
= (uint8_t)(pSrc
[2] - 0x80)) <= 0x3f
1356 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1357 *pDest
++ = (UChar
)((ch
<< 12) | (t1
<< 6) | t2
);
1362 if( /* handle U+0000..U+07FF inline */
1364 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f
1366 *pDest
++ = (UChar
)(((ch
& 0x1f) << 6) | t1
);
1373 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1375 } else if(subchar
> 0xffff && --count
== 0) {
1377 * We need to write two UChars, adjusted count for that,
1378 * and ran out of space.
1382 /* function call for error cases */
1383 ++pSrc
; /* continue after the lead byte */
1384 utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
1386 if(subchar
<=0xFFFF) {
1387 *(pDest
++)=(UChar
)subchar
;
1389 *(pDest
++)=U16_LEAD(subchar
);
1390 *(pDest
++)=U16_TRAIL(subchar
);
1394 } while(--count
> 0);
1397 while((pSrc
<pSrcLimit
) && (pDest
<pDestLimit
)) {
1404 if( /* handle U+0000..U+FFFF inline */
1406 ((pSrcLimit
- pSrc
) >= 3) &&
1407 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f &&
1408 (t2
= (uint8_t)(pSrc
[2] - 0x80)) <= 0x3f
1410 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1411 *pDest
++ = (UChar
)((ch
<< 12) | (t1
<< 6) | t2
);
1416 if( /* handle U+0000..U+07FF inline */
1418 ((pSrcLimit
- pSrc
) >= 2) &&
1419 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f
1421 *pDest
++ = (UChar
)(((ch
& 0x1f) << 6) | t1
);
1428 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1431 /* function call for error cases */
1432 ++pSrc
; /* continue after the lead byte */
1433 utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
1435 if(subchar
<=0xFFFF) {
1436 *(pDest
++)=(UChar
)subchar
;
1438 *(pDest
++)=U16_LEAD(subchar
);
1439 if(pDest
<pDestLimit
) {
1440 *(pDest
++)=U16_TRAIL(subchar
);
1450 /* do not fill the dest buffer; just count the UChars needed */
1451 while(pSrc
< pSrcLimit
){
1458 if( /* handle U+0000..U+FFFF inline */
1460 ((pSrcLimit
- pSrc
) >= 3) &&
1461 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f &&
1462 (uint8_t)(pSrc
[2] - 0x80) <= 0x3f
1469 if( /* handle U+0000..U+07FF inline */
1471 ((pSrcLimit
- pSrc
) >= 2) &&
1472 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f
1481 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1484 /* function call for error cases */
1485 ++pSrc
; /* continue after the lead byte */
1486 utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
1488 reqLength
+=U16_LENGTH(ch
);
1493 if(pNumSubstitutions
!=NULL
) {
1494 *pNumSubstitutions
=numSubstitutions
;
1497 reqLength
+=(int32_t)(pDest
- dest
);
1499 *pDestLength
= reqLength
;
1502 /* Terminate the buffer */
1503 u_terminateUChars(dest
, destCapacity
, reqLength
, pErrorCode
);
1507 U_CAPI
char* U_EXPORT2
1508 u_strToJavaModifiedUTF8(
1510 int32_t destCapacity
,
1511 int32_t *pDestLength
,
1514 UErrorCode
*pErrorCode
) {
1515 int32_t reqLength
=0;
1517 uint8_t *pDest
= (uint8_t *)dest
;
1518 uint8_t *pDestLimit
= pDest
+ destCapacity
;
1519 const UChar
*pSrcLimit
;
1523 if(U_FAILURE(*pErrorCode
)){
1526 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
1527 (dest
==NULL
&& destCapacity
!=0) || destCapacity
<0
1529 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
1534 /* Convert NUL-terminated ASCII, then find the string length. */
1535 while((ch
=*src
)<=0x7f && ch
!= 0 && pDest
<pDestLimit
) {
1536 *pDest
++ = (uint8_t)ch
;
1540 reqLength
=(int32_t)(pDest
- (uint8_t *)dest
);
1542 *pDestLength
= reqLength
;
1545 /* Terminate the buffer */
1546 u_terminateChars(dest
, destCapacity
, reqLength
, pErrorCode
);
1549 srcLength
= u_strlen(src
);
1552 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1553 pSrcLimit
= (src
!=NULL
)?(src
+srcLength
):NULL
;
1555 count
= (int32_t)(pDestLimit
- pDest
);
1556 srcLength
= (int32_t)(pSrcLimit
- src
);
1557 if(count
>= srcLength
&& srcLength
> 0 && *src
<= 0x7f) {
1558 /* fast ASCII loop */
1559 const UChar
*prevSrc
= src
;
1561 while(src
< pSrcLimit
&& (ch
= *src
) <= 0x7f && ch
!= 0) {
1562 *pDest
++=(uint8_t)ch
;
1565 delta
= (int32_t)(src
- prevSrc
);
1570 * Each iteration of the inner loop progresses by at most 3 UTF-8
1571 * bytes and one UChar.
1574 if(count
> srcLength
) {
1575 count
= srcLength
; /* min(remaining dest/3, remaining src) */
1579 * Too much overhead if we get near the end of the string,
1580 * continue with the next loop.
1586 if(ch
<= 0x7f && ch
!= 0) {
1587 *pDest
++ = (uint8_t)ch
;
1588 } else if(ch
<= 0x7ff) {
1589 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
1590 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1592 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
1593 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1594 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1596 } while(--count
> 0);
1599 while(src
<pSrcLimit
) {
1601 if(ch
<= 0x7f && ch
!= 0) {
1602 if(pDest
<pDestLimit
) {
1603 *pDest
++ = (uint8_t)ch
;
1608 } else if(ch
<= 0x7ff) {
1609 if((pDestLimit
- pDest
) >= 2) {
1610 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
1611 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1617 if((pDestLimit
- pDest
) >= 3) {
1618 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
1619 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1620 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1627 while(src
<pSrcLimit
) {
1629 if(ch
<= 0x7f && ch
!= 0) {
1631 } else if(ch
<=0x7ff) {
1638 reqLength
+=(int32_t)(pDest
- (uint8_t *)dest
);
1640 *pDestLength
= reqLength
;
1643 /* Terminate the buffer */
1644 u_terminateChars(dest
, destCapacity
, reqLength
, pErrorCode
);