]>
git.saurik.com Git - apple/icu.git/blob - icuSources/common/ustrtrns.cpp
2 ******************************************************************************
4 * Copyright (C) 2001-2013, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
11 * Modification History:
13 * Date Name Description
14 * 9/10/2001 Ram Creation.
15 ******************************************************************************
18 /*******************************************************************************
20 * u_strTo* and u_strFrom* APIs
21 * WCS functions moved to ustr_wcs.c for better modularization
23 *******************************************************************************
27 #include "unicode/putil.h"
28 #include "unicode/ustring.h"
29 #include "unicode/utf.h"
30 #include "unicode/utf8.h"
31 #include "unicode/utf16.h"
37 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
39 U_CAPI UChar
* U_EXPORT2
40 u_strFromUTF32WithSub(UChar
*dest
,
45 UChar32 subchar
, int32_t *pNumSubstitutions
,
46 UErrorCode
*pErrorCode
) {
47 const UChar32
*srcLimit
;
52 int32_t numSubstitutions
;
55 if(U_FAILURE(*pErrorCode
)){
58 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
59 (destCapacity
<0) || (dest
== NULL
&& destCapacity
> 0) ||
60 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
62 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
66 if(pNumSubstitutions
!= NULL
) {
67 *pNumSubstitutions
= 0;
71 destLimit
= (dest
!=NULL
)?(dest
+ destCapacity
):NULL
;
76 /* simple loop for conversion of a NUL-terminated BMP string */
77 while((ch
=*src
) != 0 &&
78 ((uint32_t)ch
< 0xd800 || (0xe000 <= ch
&& ch
<= 0xffff))) {
80 if(pDest
< destLimit
) {
88 /* "complicated" case, find the end of the remaining string */
89 while(*++srcLimit
!= 0) {}
92 srcLimit
= (src
!=NULL
)?(src
+ srcLength
):NULL
;
95 /* convert with length */
96 while(src
< srcLimit
) {
99 /* usually "loops" once; twice only for writing subchar */
100 if((uint32_t)ch
< 0xd800 || (0xe000 <= ch
&& ch
<= 0xffff)) {
101 if(pDest
< destLimit
) {
102 *pDest
++ = (UChar
)ch
;
107 } else if(0x10000 <= ch
&& ch
<= 0x10ffff) {
108 if(pDest
!=NULL
&& ((pDest
+ 2) <= destLimit
)) {
109 *pDest
++ = U16_LEAD(ch
);
110 *pDest
++ = U16_TRAIL(ch
);
115 } else if((ch
= subchar
) < 0) {
116 /* surrogate code point, or not a Unicode code point at all */
117 *pErrorCode
= U_INVALID_CHAR_FOUND
;
125 reqLength
+= (int32_t)(pDest
- dest
);
127 *pDestLength
= reqLength
;
129 if(pNumSubstitutions
!= NULL
) {
130 *pNumSubstitutions
= numSubstitutions
;
133 /* Terminate the buffer */
134 u_terminateUChars(dest
, destCapacity
, reqLength
, pErrorCode
);
139 U_CAPI UChar
* U_EXPORT2
140 u_strFromUTF32(UChar
*dest
,
141 int32_t destCapacity
,
142 int32_t *pDestLength
,
145 UErrorCode
*pErrorCode
) {
146 return u_strFromUTF32WithSub(
147 dest
, destCapacity
, pDestLength
,
153 U_CAPI UChar32
* U_EXPORT2
154 u_strToUTF32WithSub(UChar32
*dest
,
155 int32_t destCapacity
,
156 int32_t *pDestLength
,
159 UChar32 subchar
, int32_t *pNumSubstitutions
,
160 UErrorCode
*pErrorCode
) {
161 const UChar
*srcLimit
;
167 int32_t numSubstitutions
;
170 if(U_FAILURE(*pErrorCode
)){
173 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
174 (destCapacity
<0) || (dest
== NULL
&& destCapacity
> 0) ||
175 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
177 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
181 if(pNumSubstitutions
!= NULL
) {
182 *pNumSubstitutions
= 0;
186 destLimit
= (dest
!=NULL
)?(dest
+ destCapacity
):NULL
;
188 numSubstitutions
= 0;
191 /* simple loop for conversion of a NUL-terminated BMP string */
192 while((ch
=*src
) != 0 && !U16_IS_SURROGATE(ch
)) {
194 if(pDest
< destLimit
) {
202 /* "complicated" case, find the end of the remaining string */
203 while(*++srcLimit
!= 0) {}
206 srcLimit
= (src
!=NULL
)?(src
+ srcLength
):NULL
;
209 /* convert with length */
210 while(src
< srcLimit
) {
212 if(!U16_IS_SURROGATE(ch
)) {
213 /* write or count ch below */
214 } else if(U16_IS_SURROGATE_LEAD(ch
) && src
< srcLimit
&& U16_IS_TRAIL(ch2
= *src
)) {
216 ch
= U16_GET_SUPPLEMENTARY(ch
, ch2
);
217 } else if((ch
= subchar
) < 0) {
218 /* unpaired surrogate */
219 *pErrorCode
= U_INVALID_CHAR_FOUND
;
224 if(pDest
< destLimit
) {
231 reqLength
+= (int32_t)(pDest
- dest
);
233 *pDestLength
= reqLength
;
235 if(pNumSubstitutions
!= NULL
) {
236 *pNumSubstitutions
= numSubstitutions
;
239 /* Terminate the buffer */
240 u_terminateUChar32s(dest
, destCapacity
, reqLength
, pErrorCode
);
245 U_CAPI UChar32
* U_EXPORT2
246 u_strToUTF32(UChar32
*dest
,
247 int32_t destCapacity
,
248 int32_t *pDestLength
,
251 UErrorCode
*pErrorCode
) {
252 return u_strToUTF32WithSub(
253 dest
, destCapacity
, pDestLength
,
259 /* for utf8_nextCharSafeBodyTerminated() */
261 utf8_minLegal
[4]={ 0, 0x80, 0x800, 0x10000 };
264 * Version of utf8_nextCharSafeBody() with the following differences:
265 * - checks for NUL termination instead of length
266 * - works with pointers instead of indexes
267 * - always strict (strict==-1)
269 * *ps points to after the lead byte and will be moved to after the last trail byte.
270 * c is the lead byte.
271 * @return the code point, or U_SENTINEL
274 utf8_nextCharSafeBodyTerminated(const uint8_t **ps
, UChar32 c
) {
275 const uint8_t *s
=*ps
;
276 uint8_t trail
, illegal
=0;
277 uint8_t count
=U8_COUNT_TRAIL_BYTES(c
);
279 U8_MASK_LEAD_BYTE((c
), count
);
280 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
282 /* each branch falls through to the next one */
285 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
289 trail
=(uint8_t)(*s
++ - 0x80);
291 if(trail
>0x3f || c
>=0x110) {
292 /* not a trail byte, or code point>0x10ffff (outside Unicode) */
296 case 2: /*fall through*/
297 trail
=(uint8_t)(*s
++ - 0x80);
299 /* not a trail byte */
304 case 1: /*fall through*/
305 trail
=(uint8_t)(*s
++ - 0x80);
307 /* not a trail byte */
314 /* no default branch to optimize switch() - all values are covered */
317 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
318 /* illegal is also set if count>=4 */
319 if(illegal
|| c
<utf8_minLegal
[count
] || U_IS_SURROGATE(c
)) {
321 /* don't go beyond this sequence */
323 while(count
>0 && U8_IS_TRAIL(*s
)) {
334 * Version of utf8_nextCharSafeBody() with the following differences:
335 * - works with pointers instead of indexes
336 * - always strict (strict==-1)
338 * *ps points to after the lead byte and will be moved to after the last trail byte.
339 * c is the lead byte.
340 * @return the code point, or U_SENTINEL
343 utf8_nextCharSafeBodyPointer(const uint8_t **ps
, const uint8_t *limit
, UChar32 c
) {
344 const uint8_t *s
=*ps
;
345 uint8_t trail
, illegal
=0;
346 uint8_t count
=U8_COUNT_TRAIL_BYTES(c
);
347 if((limit
-s
)>=count
) {
348 U8_MASK_LEAD_BYTE((c
), count
);
349 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
351 /* each branch falls through to the next one */
354 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
359 c
=(c
<<6)|(trail
&0x3f);
361 illegal
|=(trail
&0xc0)^0x80;
363 /* code point>0x10ffff, outside Unicode */
367 case 2: /*fall through*/
369 c
=(c
<<6)|(trail
&0x3f);
370 illegal
|=(trail
&0xc0)^0x80;
371 case 1: /*fall through*/
373 c
=(c
<<6)|(trail
&0x3f);
374 illegal
|=(trail
&0xc0)^0x80;
378 /* no default branch to optimize switch() - all values are covered */
381 illegal
=1; /* too few bytes left */
384 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
385 /* illegal is also set if count>=4 */
386 U_ASSERT(illegal
|| count
<LENGTHOF(utf8_minLegal
));
387 if(illegal
|| c
<utf8_minLegal
[count
] || U_IS_SURROGATE(c
)) {
389 /* don't go beyond this sequence */
391 while(count
>0 && s
<limit
&& U8_IS_TRAIL(*s
)) {
401 U_CAPI UChar
* U_EXPORT2
402 u_strFromUTF8WithSub(UChar
*dest
,
403 int32_t destCapacity
,
404 int32_t *pDestLength
,
407 UChar32 subchar
, int32_t *pNumSubstitutions
,
408 UErrorCode
*pErrorCode
){
410 UChar
*pDestLimit
= dest
+destCapacity
;
412 int32_t reqLength
= 0;
413 const uint8_t* pSrc
= (const uint8_t*) src
;
414 uint8_t t1
, t2
; /* trail bytes */
415 int32_t numSubstitutions
;
418 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)){
422 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
423 (destCapacity
<0) || (dest
== NULL
&& destCapacity
> 0) ||
424 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
426 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
430 if(pNumSubstitutions
!=NULL
) {
431 *pNumSubstitutions
=0;
436 * Inline processing of UTF-8 byte sequences:
438 * Byte sequences for the most common characters are handled inline in
439 * the conversion loops. In order to reduce the path lengths for those
440 * characters, the tests are arranged in a kind of binary search.
441 * ASCII (<=0x7f) is checked first, followed by the dividing point
442 * between 2- and 3-byte sequences (0xe0).
443 * The 3-byte branch is tested first to speed up CJK text.
444 * The compiler should combine the subtractions for the two tests for 0xe0.
445 * Each branch then tests for the other end of its range.
450 * Transform a NUL-terminated string.
451 * The code explicitly checks for NULs only in the lead byte position.
452 * A NUL byte in the trail byte position fails the trail byte range check anyway.
454 while(((ch
= *pSrc
) != 0) && (pDest
< pDestLimit
)) {
460 if( /* handle U+1000..U+CFFF inline */
462 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f &&
463 (t2
= (uint8_t)(pSrc
[2] - 0x80)) <= 0x3f
465 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
466 *pDest
++ = (UChar
)((ch
<< 12) | (t1
<< 6) | t2
);
470 } else if(ch
< 0xe0) {
471 if( /* handle U+0080..U+07FF inline */
473 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f
475 *pDest
++ = (UChar
)(((ch
& 0x1f) << 6) | t1
);
481 /* function call for "complicated" and error cases */
482 ++pSrc
; /* continue after the lead byte */
483 ch
=utf8_nextCharSafeBodyTerminated(&pSrc
, ch
);
484 if(ch
<0 && (++numSubstitutions
, ch
= subchar
) < 0) {
485 *pErrorCode
= U_INVALID_CHAR_FOUND
;
487 } else if(ch
<=0xFFFF) {
488 *(pDest
++)=(UChar
)ch
;
490 *(pDest
++)=U16_LEAD(ch
);
491 if(pDest
<pDestLimit
) {
492 *(pDest
++)=U16_TRAIL(ch
);
501 /* Pre-flight the rest of the string. */
502 while((ch
= *pSrc
) != 0) {
508 if( /* handle U+1000..U+CFFF inline */
510 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f &&
511 (uint8_t)(pSrc
[2] - 0x80) <= 0x3f
517 } else if(ch
< 0xe0) {
518 if( /* handle U+0080..U+07FF inline */
520 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f
528 /* function call for "complicated" and error cases */
529 ++pSrc
; /* continue after the lead byte */
530 ch
=utf8_nextCharSafeBodyTerminated(&pSrc
, ch
);
531 if(ch
<0 && (++numSubstitutions
, ch
= subchar
) < 0) {
532 *pErrorCode
= U_INVALID_CHAR_FOUND
;
535 reqLength
+= U16_LENGTH(ch
);
538 } else /* srcLength >= 0 */ {
539 const uint8_t *pSrcLimit
= pSrc
+ srcLength
;
542 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
545 * Each iteration of the inner loop progresses by at most 3 UTF-8
546 * bytes and one UChar, for most characters.
547 * For supplementary code points (4 & 2), which are rare,
548 * there is an additional adjustment.
550 count
= (int32_t)(pDestLimit
- pDest
);
551 srcLength
= (int32_t)((pSrcLimit
- pSrc
) / 3);
552 if(count
> srcLength
) {
553 count
= srcLength
; /* min(remaining dest, remaining src/3) */
557 * Too much overhead if we get near the end of the string,
558 * continue with the next loop.
570 if( /* handle U+1000..U+CFFF inline */
572 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f &&
573 (t2
= (uint8_t)(pSrc
[2] - 0x80)) <= 0x3f
575 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
576 *pDest
++ = (UChar
)((ch
<< 12) | (t1
<< 6) | t2
);
580 } else if(ch
< 0xe0) {
581 if( /* handle U+0080..U+07FF inline */
583 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f
585 *pDest
++ = (UChar
)(((ch
& 0x1f) << 6) | t1
);
591 if(ch
>= 0xf0 || subchar
> 0xffff) {
593 * We may read up to six bytes and write up to two UChars,
594 * which we didn't account for with computing count,
595 * so we adjust it here.
602 /* function call for "complicated" and error cases */
603 ++pSrc
; /* continue after the lead byte */
604 ch
=utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
605 if(ch
<0 && (++numSubstitutions
, ch
= subchar
) < 0){
606 *pErrorCode
= U_INVALID_CHAR_FOUND
;
608 }else if(ch
<=0xFFFF){
609 *(pDest
++)=(UChar
)ch
;
611 *(pDest
++)=U16_LEAD(ch
);
612 *(pDest
++)=U16_TRAIL(ch
);
615 } while(--count
> 0);
618 while((pSrc
<pSrcLimit
) && (pDest
<pDestLimit
)) {
625 if( /* handle U+1000..U+CFFF inline */
627 ((pSrcLimit
- pSrc
) >= 3) &&
628 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f &&
629 (t2
= (uint8_t)(pSrc
[2] - 0x80)) <= 0x3f
631 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
632 *pDest
++ = (UChar
)((ch
<< 12) | (t1
<< 6) | t2
);
636 } else if(ch
< 0xe0) {
637 if( /* handle U+0080..U+07FF inline */
639 ((pSrcLimit
- pSrc
) >= 2) &&
640 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f
642 *pDest
++ = (UChar
)(((ch
& 0x1f) << 6) | t1
);
648 /* function call for "complicated" and error cases */
649 ++pSrc
; /* continue after the lead byte */
650 ch
=utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
651 if(ch
<0 && (++numSubstitutions
, ch
= subchar
) < 0){
652 *pErrorCode
= U_INVALID_CHAR_FOUND
;
654 }else if(ch
<=0xFFFF){
655 *(pDest
++)=(UChar
)ch
;
657 *(pDest
++)=U16_LEAD(ch
);
658 if(pDest
<pDestLimit
){
659 *(pDest
++)=U16_TRAIL(ch
);
667 /* do not fill the dest buffer just count the UChars needed */
668 while(pSrc
< pSrcLimit
){
675 if( /* handle U+1000..U+CFFF inline */
677 ((pSrcLimit
- pSrc
) >= 3) &&
678 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f &&
679 (uint8_t)(pSrc
[2] - 0x80) <= 0x3f
685 } else if(ch
< 0xe0) {
686 if( /* handle U+0080..U+07FF inline */
688 ((pSrcLimit
- pSrc
) >= 2) &&
689 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f
697 /* function call for "complicated" and error cases */
698 ++pSrc
; /* continue after the lead byte */
699 ch
=utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
700 if(ch
<0 && (++numSubstitutions
, ch
= subchar
) < 0){
701 *pErrorCode
= U_INVALID_CHAR_FOUND
;
704 reqLength
+=U16_LENGTH(ch
);
709 reqLength
+=(int32_t)(pDest
- dest
);
711 if(pNumSubstitutions
!=NULL
) {
712 *pNumSubstitutions
=numSubstitutions
;
716 *pDestLength
= reqLength
;
719 /* Terminate the buffer */
720 u_terminateUChars(dest
,destCapacity
,reqLength
,pErrorCode
);
725 U_CAPI UChar
* U_EXPORT2
726 u_strFromUTF8(UChar
*dest
,
727 int32_t destCapacity
,
728 int32_t *pDestLength
,
731 UErrorCode
*pErrorCode
){
732 return u_strFromUTF8WithSub(
733 dest
, destCapacity
, pDestLength
,
739 U_CAPI UChar
* U_EXPORT2
740 u_strFromUTF8Lenient(UChar
*dest
,
741 int32_t destCapacity
,
742 int32_t *pDestLength
,
745 UErrorCode
*pErrorCode
) {
748 int32_t reqLength
= 0;
749 uint8_t* pSrc
= (uint8_t*) src
;
752 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)){
756 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
757 (destCapacity
<0) || (dest
== NULL
&& destCapacity
> 0)
759 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
764 /* Transform a NUL-terminated string. */
765 UChar
*pDestLimit
= (dest
!=NULL
)?(dest
+destCapacity
):NULL
;
766 uint8_t t1
, t2
, t3
; /* trail bytes */
768 while(((ch
= *pSrc
) != 0) && (pDest
< pDestLimit
)) {
771 * ASCII, or a trail byte in lead position which is treated like
772 * a single-byte sequence for better character boundary
773 * resynchronization after illegal sequences.
778 } else if(ch
< 0xe0) { /* U+0080..U+07FF */
779 if((t1
= pSrc
[1]) != 0) {
780 /* 0x3080 = (0xc0 << 6) + 0x80 */
781 *pDest
++ = (UChar
)((ch
<< 6) + t1
- 0x3080);
785 } else if(ch
< 0xf0) { /* U+0800..U+FFFF */
786 if((t1
= pSrc
[1]) != 0 && (t2
= pSrc
[2]) != 0) {
787 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
788 /* 0x2080 = (0x80 << 6) + 0x80 */
789 *pDest
++ = (UChar
)((ch
<< 12) + (t1
<< 6) + t2
- 0x2080);
793 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
794 if((t1
= pSrc
[1]) != 0 && (t2
= pSrc
[2]) != 0 && (t3
= pSrc
[3]) != 0) {
796 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
797 ch
= (ch
<< 18) + (t1
<< 12) + (t2
<< 6) + t3
- 0x3c82080;
798 *(pDest
++) = U16_LEAD(ch
);
799 if(pDest
< pDestLimit
) {
800 *(pDest
++) = U16_TRAIL(ch
);
809 /* truncated character at the end */
811 while(*++pSrc
!= 0) {}
815 /* Pre-flight the rest of the string. */
816 while((ch
= *pSrc
) != 0) {
819 * ASCII, or a trail byte in lead position which is treated like
820 * a single-byte sequence for better character boundary
821 * resynchronization after illegal sequences.
826 } else if(ch
< 0xe0) { /* U+0080..U+07FF */
832 } else if(ch
< 0xf0) { /* U+0800..U+FFFF */
833 if(pSrc
[1] != 0 && pSrc
[2] != 0) {
838 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
839 if(pSrc
[1] != 0 && pSrc
[2] != 0 && pSrc
[3] != 0) {
846 /* truncated character at the end */
850 } else /* srcLength >= 0 */ {
851 const uint8_t *pSrcLimit
= (pSrc
!=NULL
)?(pSrc
+ srcLength
):NULL
;
854 * This function requires that if srcLength is given, then it must be
855 * destCapacity >= srcLength so that we need not check for
856 * destination buffer overflow in the loop.
858 if(destCapacity
< srcLength
) {
859 if(pDestLength
!= NULL
) {
860 *pDestLength
= srcLength
; /* this likely overestimates the true destLength! */
862 *pErrorCode
= U_BUFFER_OVERFLOW_ERROR
;
866 if((pSrcLimit
- pSrc
) >= 4) {
867 pSrcLimit
-= 3; /* temporarily reduce pSrcLimit */
869 /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
874 * ASCII, or a trail byte in lead position which is treated like
875 * a single-byte sequence for better character boundary
876 * resynchronization after illegal sequences.
879 } else if(ch
< 0xe0) { /* U+0080..U+07FF */
880 /* 0x3080 = (0xc0 << 6) + 0x80 */
881 *pDest
++ = (UChar
)((ch
<< 6) + *pSrc
++ - 0x3080);
882 } else if(ch
< 0xf0) { /* U+0800..U+FFFF */
883 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
884 /* 0x2080 = (0x80 << 6) + 0x80 */
885 ch
= (ch
<< 12) + (*pSrc
++ << 6);
886 *pDest
++ = (UChar
)(ch
+ *pSrc
++ - 0x2080);
887 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
888 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
889 ch
= (ch
<< 18) + (*pSrc
++ << 12);
891 ch
+= *pSrc
++ - 0x3c82080;
892 *(pDest
++) = U16_LEAD(ch
);
893 *(pDest
++) = U16_TRAIL(ch
);
895 } while(pSrc
< pSrcLimit
);
897 pSrcLimit
+= 3; /* restore original pSrcLimit */
900 while(pSrc
< pSrcLimit
) {
904 * ASCII, or a trail byte in lead position which is treated like
905 * a single-byte sequence for better character boundary
906 * resynchronization after illegal sequences.
910 } else if(ch
< 0xe0) { /* U+0080..U+07FF */
911 if(pSrc
< pSrcLimit
) {
912 /* 0x3080 = (0xc0 << 6) + 0x80 */
913 *pDest
++ = (UChar
)((ch
<< 6) + *pSrc
++ - 0x3080);
916 } else if(ch
< 0xf0) { /* U+0800..U+FFFF */
917 if((pSrcLimit
- pSrc
) >= 2) {
918 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
919 /* 0x2080 = (0x80 << 6) + 0x80 */
920 ch
= (ch
<< 12) + (*pSrc
++ << 6);
921 *pDest
++ = (UChar
)(ch
+ *pSrc
++ - 0x2080);
925 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
926 if((pSrcLimit
- pSrc
) >= 3) {
927 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
928 ch
= (ch
<< 18) + (*pSrc
++ << 12);
930 ch
+= *pSrc
++ - 0x3c82080;
931 *(pDest
++) = U16_LEAD(ch
);
932 *(pDest
++) = U16_TRAIL(ch
);
938 /* truncated character at the end */
944 reqLength
+=(int32_t)(pDest
- dest
);
947 *pDestLength
= reqLength
;
950 /* Terminate the buffer */
951 u_terminateUChars(dest
,destCapacity
,reqLength
,pErrorCode
);
956 static inline uint8_t *
957 _appendUTF8(uint8_t *pDest
, UChar32 c
) {
958 /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
961 } else if(c
<=0x7ff) {
962 *pDest
++=(uint8_t)((c
>>6)|0xc0);
963 *pDest
++=(uint8_t)((c
&0x3f)|0x80);
964 } else if(c
<=0xffff) {
965 *pDest
++=(uint8_t)((c
>>12)|0xe0);
966 *pDest
++=(uint8_t)(((c
>>6)&0x3f)|0x80);
967 *pDest
++=(uint8_t)(((c
)&0x3f)|0x80);
968 } else /* if((uint32_t)(c)<=0x10ffff) */ {
969 *pDest
++=(uint8_t)(((c
)>>18)|0xf0);
970 *pDest
++=(uint8_t)((((c
)>>12)&0x3f)|0x80);
971 *pDest
++=(uint8_t)((((c
)>>6)&0x3f)|0x80);
972 *pDest
++=(uint8_t)(((c
)&0x3f)|0x80);
978 U_CAPI
char* U_EXPORT2
979 u_strToUTF8WithSub(char *dest
,
980 int32_t destCapacity
,
981 int32_t *pDestLength
,
984 UChar32 subchar
, int32_t *pNumSubstitutions
,
985 UErrorCode
*pErrorCode
){
988 uint8_t *pDest
= (uint8_t *)dest
;
989 uint8_t *pDestLimit
= (pDest
!=NULL
)?(pDest
+ destCapacity
):NULL
;
990 int32_t numSubstitutions
;
993 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)){
997 if( (pSrc
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
998 (destCapacity
<0) || (dest
== NULL
&& destCapacity
> 0) ||
999 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
1001 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
1005 if(pNumSubstitutions
!=NULL
) {
1006 *pNumSubstitutions
=0;
1011 while((ch
=*pSrc
)!=0) {
1014 if(pDest
<pDestLimit
) {
1015 *pDest
++ = (uint8_t)ch
;
1020 } else if(ch
<= 0x7ff) {
1021 if((pDestLimit
- pDest
) >= 2) {
1022 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
1023 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1028 } else if(ch
<= 0xd7ff || ch
>= 0xe000) {
1029 if((pDestLimit
- pDest
) >= 3) {
1030 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
1031 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1032 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1037 } else /* ch is a surrogate */ {
1040 /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
1041 if(U16_IS_SURROGATE_LEAD(ch
) && U16_IS_TRAIL(ch2
=*pSrc
)) {
1043 ch
=U16_GET_SUPPLEMENTARY(ch
, ch2
);
1044 } else if(subchar
>=0) {
1048 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1049 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1053 length
= U8_LENGTH(ch
);
1054 if((pDestLimit
- pDest
) >= length
) {
1055 /* convert and append*/
1056 pDest
=_appendUTF8(pDest
, ch
);
1063 while((ch
=*pSrc
++)!=0) {
1066 } else if(ch
<=0x7ff) {
1068 } else if(!U16_IS_SURROGATE(ch
)) {
1070 } else if(U16_IS_SURROGATE_LEAD(ch
) && U16_IS_TRAIL(ch2
=*pSrc
)) {
1073 } else if(subchar
>=0) {
1074 reqLength
+=U8_LENGTH(subchar
);
1077 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1078 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1083 const UChar
*pSrcLimit
= (pSrc
!=NULL
)?(pSrc
+srcLength
):NULL
;
1086 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1089 * Each iteration of the inner loop progresses by at most 3 UTF-8
1090 * bytes and one UChar, for most characters.
1091 * For supplementary code points (4 & 2), which are rare,
1092 * there is an additional adjustment.
1094 count
= (int32_t)((pDestLimit
- pDest
) / 3);
1095 srcLength
= (int32_t)(pSrcLimit
- pSrc
);
1096 if(count
> srcLength
) {
1097 count
= srcLength
; /* min(remaining dest/3, remaining src) */
1101 * Too much overhead if we get near the end of the string,
1102 * continue with the next loop.
1109 *pDest
++ = (uint8_t)ch
;
1110 } else if(ch
<= 0x7ff) {
1111 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
1112 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1113 } else if(ch
<= 0xd7ff || ch
>= 0xe000) {
1114 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
1115 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1116 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1117 } else /* ch is a surrogate */ {
1119 * We will read two UChars and probably output four bytes,
1120 * which we didn't account for with computing count,
1121 * so we adjust it here.
1124 --pSrc
; /* undo ch=*pSrc++ for the lead surrogate */
1125 break; /* recompute count */
1128 if(U16_IS_SURROGATE_LEAD(ch
) && U16_IS_TRAIL(ch2
=*pSrc
)) {
1130 ch
=U16_GET_SUPPLEMENTARY(ch
, ch2
);
1132 /* writing 4 bytes per 2 UChars is ok */
1133 *pDest
++=(uint8_t)((ch
>>18)|0xf0);
1134 *pDest
++=(uint8_t)(((ch
>>12)&0x3f)|0x80);
1135 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1136 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1138 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1143 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1147 /* convert and append*/
1148 pDest
=_appendUTF8(pDest
, ch
);
1151 } while(--count
> 0);
1154 while(pSrc
<pSrcLimit
) {
1157 if(pDest
<pDestLimit
) {
1158 *pDest
++ = (uint8_t)ch
;
1163 } else if(ch
<= 0x7ff) {
1164 if((pDestLimit
- pDest
) >= 2) {
1165 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
1166 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1171 } else if(ch
<= 0xd7ff || ch
>= 0xe000) {
1172 if((pDestLimit
- pDest
) >= 3) {
1173 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
1174 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1175 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1180 } else /* ch is a surrogate */ {
1183 if(U16_IS_SURROGATE_LEAD(ch
) && pSrc
<pSrcLimit
&& U16_IS_TRAIL(ch2
=*pSrc
)) {
1185 ch
=U16_GET_SUPPLEMENTARY(ch
, ch2
);
1186 } else if(subchar
>=0) {
1190 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1191 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1195 length
= U8_LENGTH(ch
);
1196 if((pDestLimit
- pDest
) >= length
) {
1197 /* convert and append*/
1198 pDest
=_appendUTF8(pDest
, ch
);
1205 while(pSrc
<pSrcLimit
) {
1209 } else if(ch
<=0x7ff) {
1211 } else if(!U16_IS_SURROGATE(ch
)) {
1213 } else if(U16_IS_SURROGATE_LEAD(ch
) && pSrc
<pSrcLimit
&& U16_IS_TRAIL(ch2
=*pSrc
)) {
1216 } else if(subchar
>=0) {
1217 reqLength
+=U8_LENGTH(subchar
);
1220 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1221 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1227 reqLength
+=(int32_t)(pDest
- (uint8_t *)dest
);
1229 if(pNumSubstitutions
!=NULL
) {
1230 *pNumSubstitutions
=numSubstitutions
;
1234 *pDestLength
= reqLength
;
1237 /* Terminate the buffer */
1238 u_terminateChars(dest
, destCapacity
, reqLength
, pErrorCode
);
1242 U_CAPI
char* U_EXPORT2
1243 u_strToUTF8(char *dest
,
1244 int32_t destCapacity
,
1245 int32_t *pDestLength
,
1248 UErrorCode
*pErrorCode
){
1249 return u_strToUTF8WithSub(
1250 dest
, destCapacity
, pDestLength
,
1256 U_CAPI UChar
* U_EXPORT2
1257 u_strFromJavaModifiedUTF8WithSub(
1259 int32_t destCapacity
,
1260 int32_t *pDestLength
,
1263 UChar32 subchar
, int32_t *pNumSubstitutions
,
1264 UErrorCode
*pErrorCode
) {
1265 UChar
*pDest
= dest
;
1266 UChar
*pDestLimit
= dest
+destCapacity
;
1268 int32_t reqLength
= 0;
1269 const uint8_t* pSrc
= (const uint8_t*) src
;
1270 const uint8_t *pSrcLimit
;
1272 uint8_t t1
, t2
; /* trail bytes */
1273 int32_t numSubstitutions
;
1276 if(U_FAILURE(*pErrorCode
)){
1279 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
1280 (dest
==NULL
&& destCapacity
!=0) || destCapacity
<0 ||
1281 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
1283 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
1287 if(pNumSubstitutions
!=NULL
) {
1288 *pNumSubstitutions
=0;
1294 * Transform a NUL-terminated ASCII string.
1295 * Handle non-ASCII strings with slower code.
1297 while(((ch
= *pSrc
) != 0) && ch
<= 0x7f && (pDest
< pDestLimit
)) {
1302 reqLength
=(int32_t)(pDest
- dest
);
1304 *pDestLength
= reqLength
;
1307 /* Terminate the buffer */
1308 u_terminateUChars(dest
, destCapacity
, reqLength
, pErrorCode
);
1311 srcLength
= uprv_strlen((const char *)pSrc
);
1314 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1315 pSrcLimit
= (pSrc
== NULL
) ? NULL
: pSrc
+ srcLength
;
1317 count
= (int32_t)(pDestLimit
- pDest
);
1318 srcLength
= (int32_t)(pSrcLimit
- pSrc
);
1319 if(count
>= srcLength
&& srcLength
> 0 && *pSrc
<= 0x7f) {
1320 /* fast ASCII loop */
1321 const uint8_t *prevSrc
= pSrc
;
1323 while(pSrc
< pSrcLimit
&& (ch
= *pSrc
) <= 0x7f) {
1327 delta
= (int32_t)(pSrc
- prevSrc
);
1332 * Each iteration of the inner loop progresses by at most 3 UTF-8
1333 * bytes and one UChar.
1336 if(count
> srcLength
) {
1337 count
= srcLength
; /* min(remaining dest, remaining src/3) */
1341 * Too much overhead if we get near the end of the string,
1342 * continue with the next loop.
1353 if( /* handle U+0000..U+FFFF inline */
1355 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f &&
1356 (t2
= (uint8_t)(pSrc
[2] - 0x80)) <= 0x3f
1358 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1359 *pDest
++ = (UChar
)((ch
<< 12) | (t1
<< 6) | t2
);
1364 if( /* handle U+0000..U+07FF inline */
1366 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f
1368 *pDest
++ = (UChar
)(((ch
& 0x1f) << 6) | t1
);
1375 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1377 } else if(subchar
> 0xffff && --count
== 0) {
1379 * We need to write two UChars, adjusted count for that,
1380 * and ran out of space.
1384 /* function call for error cases */
1385 ++pSrc
; /* continue after the lead byte */
1386 utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
1388 if(subchar
<=0xFFFF) {
1389 *(pDest
++)=(UChar
)subchar
;
1391 *(pDest
++)=U16_LEAD(subchar
);
1392 *(pDest
++)=U16_TRAIL(subchar
);
1396 } while(--count
> 0);
1399 while((pSrc
<pSrcLimit
) && (pDest
<pDestLimit
)) {
1406 if( /* handle U+0000..U+FFFF inline */
1408 ((pSrcLimit
- pSrc
) >= 3) &&
1409 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f &&
1410 (t2
= (uint8_t)(pSrc
[2] - 0x80)) <= 0x3f
1412 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1413 *pDest
++ = (UChar
)((ch
<< 12) | (t1
<< 6) | t2
);
1418 if( /* handle U+0000..U+07FF inline */
1420 ((pSrcLimit
- pSrc
) >= 2) &&
1421 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f
1423 *pDest
++ = (UChar
)(((ch
& 0x1f) << 6) | t1
);
1430 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1433 /* function call for error cases */
1434 ++pSrc
; /* continue after the lead byte */
1435 utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
1437 if(subchar
<=0xFFFF) {
1438 *(pDest
++)=(UChar
)subchar
;
1440 *(pDest
++)=U16_LEAD(subchar
);
1441 if(pDest
<pDestLimit
) {
1442 *(pDest
++)=U16_TRAIL(subchar
);
1452 /* do not fill the dest buffer just count the UChars needed */
1453 while(pSrc
< pSrcLimit
){
1460 if( /* handle U+0000..U+FFFF inline */
1462 ((pSrcLimit
- pSrc
) >= 3) &&
1463 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f &&
1464 (uint8_t)(pSrc
[2] - 0x80) <= 0x3f
1471 if( /* handle U+0000..U+07FF inline */
1473 ((pSrcLimit
- pSrc
) >= 2) &&
1474 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f
1483 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1486 /* function call for error cases */
1487 ++pSrc
; /* continue after the lead byte */
1488 utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
1490 reqLength
+=U16_LENGTH(ch
);
1495 if(pNumSubstitutions
!=NULL
) {
1496 *pNumSubstitutions
=numSubstitutions
;
1499 reqLength
+=(int32_t)(pDest
- dest
);
1501 *pDestLength
= reqLength
;
1504 /* Terminate the buffer */
1505 u_terminateUChars(dest
, destCapacity
, reqLength
, pErrorCode
);
1509 U_CAPI
char* U_EXPORT2
1510 u_strToJavaModifiedUTF8(
1512 int32_t destCapacity
,
1513 int32_t *pDestLength
,
1516 UErrorCode
*pErrorCode
) {
1517 int32_t reqLength
=0;
1519 uint8_t *pDest
= (uint8_t *)dest
;
1520 uint8_t *pDestLimit
= pDest
+ destCapacity
;
1521 const UChar
*pSrcLimit
;
1525 if(U_FAILURE(*pErrorCode
)){
1528 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
1529 (dest
==NULL
&& destCapacity
!=0) || destCapacity
<0
1531 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
1536 /* Convert NUL-terminated ASCII, then find the string length. */
1537 while((ch
=*src
)<=0x7f && ch
!= 0 && pDest
<pDestLimit
) {
1538 *pDest
++ = (uint8_t)ch
;
1542 reqLength
=(int32_t)(pDest
- (uint8_t *)dest
);
1544 *pDestLength
= reqLength
;
1547 /* Terminate the buffer */
1548 u_terminateChars(dest
, destCapacity
, reqLength
, pErrorCode
);
1551 srcLength
= u_strlen(src
);
1554 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1555 pSrcLimit
= (src
!=NULL
)?(src
+srcLength
):NULL
;
1557 count
= (int32_t)(pDestLimit
- pDest
);
1558 srcLength
= (int32_t)(pSrcLimit
- src
);
1559 if(count
>= srcLength
&& srcLength
> 0 && *src
<= 0x7f) {
1560 /* fast ASCII loop */
1561 const UChar
*prevSrc
= src
;
1563 while(src
< pSrcLimit
&& (ch
= *src
) <= 0x7f && ch
!= 0) {
1564 *pDest
++=(uint8_t)ch
;
1567 delta
= (int32_t)(src
- prevSrc
);
1572 * Each iteration of the inner loop progresses by at most 3 UTF-8
1573 * bytes and one UChar.
1576 if(count
> srcLength
) {
1577 count
= srcLength
; /* min(remaining dest/3, remaining src) */
1581 * Too much overhead if we get near the end of the string,
1582 * continue with the next loop.
1588 if(ch
<= 0x7f && ch
!= 0) {
1589 *pDest
++ = (uint8_t)ch
;
1590 } else if(ch
<= 0x7ff) {
1591 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
1592 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1594 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
1595 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1596 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1598 } while(--count
> 0);
1601 while(src
<pSrcLimit
) {
1603 if(ch
<= 0x7f && ch
!= 0) {
1604 if(pDest
<pDestLimit
) {
1605 *pDest
++ = (uint8_t)ch
;
1610 } else if(ch
<= 0x7ff) {
1611 if((pDestLimit
- pDest
) >= 2) {
1612 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
1613 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1619 if((pDestLimit
- pDest
) >= 3) {
1620 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
1621 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1622 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1629 while(src
<pSrcLimit
) {
1631 if(ch
<= 0x7f && ch
!= 0) {
1633 } else if(ch
<=0x7ff) {
1640 reqLength
+=(int32_t)(pDest
- (uint8_t *)dest
);
1642 *pDestLength
= reqLength
;
1645 /* Terminate the buffer */
1646 u_terminateChars(dest
, destCapacity
, reqLength
, pErrorCode
);