]>
git.saurik.com Git - apple/icu.git/blob - icuSources/common/ustrtrns.c
2 ******************************************************************************
4 * Copyright (C) 2001-2010, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
11 * Modification History:
13 * Date Name Description
14 * 9/10/2001 Ram Creation.
15 ******************************************************************************
18 /*******************************************************************************
20 * u_strTo* and u_strFrom* APIs
21 * WCS functions moved to ustr_wcs.c for better modularization
23 *******************************************************************************
27 #include "unicode/putil.h"
28 #include "unicode/ustring.h"
33 U_CAPI UChar
* U_EXPORT2
34 u_strFromUTF32WithSub(UChar
*dest
,
39 UChar32 subchar
, int32_t *pNumSubstitutions
,
40 UErrorCode
*pErrorCode
) {
41 const UChar32
*srcLimit
;
46 int32_t numSubstitutions
;
49 if(U_FAILURE(*pErrorCode
)){
52 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
53 (destCapacity
<0) || (dest
== NULL
&& destCapacity
> 0) ||
54 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
56 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
60 if(pNumSubstitutions
!= NULL
) {
61 *pNumSubstitutions
= 0;
65 destLimit
= dest
+ destCapacity
;
70 /* simple loop for conversion of a NUL-terminated BMP string */
71 while((ch
=*src
) != 0 &&
72 ((uint32_t)ch
< 0xd800 || (0xe000 <= ch
&& ch
<= 0xffff))) {
74 if(pDest
< destLimit
) {
82 /* "complicated" case, find the end of the remaining string */
83 while(*++srcLimit
!= 0) {}
86 srcLimit
= src
+ srcLength
;
89 /* convert with length */
90 while(src
< srcLimit
) {
93 /* usually "loops" once; twice only for writing subchar */
94 if((uint32_t)ch
< 0xd800 || (0xe000 <= ch
&& ch
<= 0xffff)) {
95 if(pDest
< destLimit
) {
101 } else if(0x10000 <= ch
&& ch
<= 0x10ffff) {
102 if((pDest
+ 2) <= destLimit
) {
103 *pDest
++ = U16_LEAD(ch
);
104 *pDest
++ = U16_TRAIL(ch
);
109 } else if((ch
= subchar
) < 0) {
110 /* surrogate code point, or not a Unicode code point at all */
111 *pErrorCode
= U_INVALID_CHAR_FOUND
;
119 reqLength
+= (int32_t)(pDest
- dest
);
121 *pDestLength
= reqLength
;
123 if(pNumSubstitutions
!= NULL
) {
124 *pNumSubstitutions
= numSubstitutions
;
127 /* Terminate the buffer */
128 u_terminateUChars(dest
, destCapacity
, reqLength
, pErrorCode
);
133 U_CAPI UChar
* U_EXPORT2
134 u_strFromUTF32(UChar
*dest
,
135 int32_t destCapacity
,
136 int32_t *pDestLength
,
139 UErrorCode
*pErrorCode
) {
140 return u_strFromUTF32WithSub(
141 dest
, destCapacity
, pDestLength
,
147 U_CAPI UChar32
* U_EXPORT2
148 u_strToUTF32WithSub(UChar32
*dest
,
149 int32_t destCapacity
,
150 int32_t *pDestLength
,
153 UChar32 subchar
, int32_t *pNumSubstitutions
,
154 UErrorCode
*pErrorCode
) {
155 const UChar
*srcLimit
;
161 int32_t numSubstitutions
;
164 if(U_FAILURE(*pErrorCode
)){
167 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
168 (destCapacity
<0) || (dest
== NULL
&& destCapacity
> 0) ||
169 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
171 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
175 if(pNumSubstitutions
!= NULL
) {
176 *pNumSubstitutions
= 0;
180 destLimit
= dest
+ destCapacity
;
182 numSubstitutions
= 0;
185 /* simple loop for conversion of a NUL-terminated BMP string */
186 while((ch
=*src
) != 0 && !U16_IS_SURROGATE(ch
)) {
188 if(pDest
< destLimit
) {
196 /* "complicated" case, find the end of the remaining string */
197 while(*++srcLimit
!= 0) {}
200 srcLimit
= src
+ srcLength
;
203 /* convert with length */
204 while(src
< srcLimit
) {
206 if(!U16_IS_SURROGATE(ch
)) {
207 /* write or count ch below */
208 } else if(U16_IS_SURROGATE_LEAD(ch
) && src
< srcLimit
&& U16_IS_TRAIL(ch2
= *src
)) {
210 ch
= U16_GET_SUPPLEMENTARY(ch
, ch2
);
211 } else if((ch
= subchar
) < 0) {
212 /* unpaired surrogate */
213 *pErrorCode
= U_INVALID_CHAR_FOUND
;
218 if(pDest
< destLimit
) {
225 reqLength
+= (int32_t)(pDest
- dest
);
227 *pDestLength
= reqLength
;
229 if(pNumSubstitutions
!= NULL
) {
230 *pNumSubstitutions
= numSubstitutions
;
233 /* Terminate the buffer */
234 u_terminateUChar32s(dest
, destCapacity
, reqLength
, pErrorCode
);
239 U_CAPI UChar32
* U_EXPORT2
240 u_strToUTF32(UChar32
*dest
,
241 int32_t destCapacity
,
242 int32_t *pDestLength
,
245 UErrorCode
*pErrorCode
) {
246 return u_strToUTF32WithSub(
247 dest
, destCapacity
, pDestLength
,
253 /* for utf8_nextCharSafeBodyTerminated() */
255 utf8_minLegal
[4]={ 0, 0x80, 0x800, 0x10000 };
258 * Version of utf8_nextCharSafeBody() with the following differences:
259 * - checks for NUL termination instead of length
260 * - works with pointers instead of indexes
261 * - always strict (strict==-1)
263 * *ps points to after the lead byte and will be moved to after the last trail byte.
264 * c is the lead byte.
265 * @return the code point, or U_SENTINEL
268 utf8_nextCharSafeBodyTerminated(const uint8_t **ps
, UChar32 c
) {
269 const uint8_t *s
=*ps
;
270 uint8_t trail
, illegal
=0;
271 uint8_t count
=UTF8_COUNT_TRAIL_BYTES(c
);
272 UTF8_MASK_LEAD_BYTE((c
), count
);
273 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
275 /* each branch falls through to the next one */
278 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
282 trail
=(uint8_t)(*s
++ - 0x80);
284 if(trail
>0x3f || c
>=0x110) {
285 /* not a trail byte, or code point>0x10ffff (outside Unicode) */
290 trail
=(uint8_t)(*s
++ - 0x80);
292 /* not a trail byte */
298 trail
=(uint8_t)(*s
++ - 0x80);
300 /* not a trail byte */
307 /* no default branch to optimize switch() - all values are covered */
310 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
311 /* illegal is also set if count>=4 */
312 if(illegal
|| c
<utf8_minLegal
[count
] || UTF_IS_SURROGATE(c
)) {
314 /* don't go beyond this sequence */
316 while(count
>0 && UTF8_IS_TRAIL(*s
)) {
327 * Version of utf8_nextCharSafeBody() with the following differences:
328 * - works with pointers instead of indexes
329 * - always strict (strict==-1)
331 * *ps points to after the lead byte and will be moved to after the last trail byte.
332 * c is the lead byte.
333 * @return the code point, or U_SENTINEL
336 utf8_nextCharSafeBodyPointer(const uint8_t **ps
, const uint8_t *limit
, UChar32 c
) {
337 const uint8_t *s
=*ps
;
338 uint8_t trail
, illegal
=0;
339 uint8_t count
=UTF8_COUNT_TRAIL_BYTES(c
);
340 if((limit
-s
)>=count
) {
341 UTF8_MASK_LEAD_BYTE((c
), count
);
342 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
344 /* each branch falls through to the next one */
347 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
352 c
=(c
<<6)|(trail
&0x3f);
354 illegal
|=(trail
&0xc0)^0x80;
356 /* code point>0x10ffff, outside Unicode */
362 c
=(c
<<6)|(trail
&0x3f);
363 illegal
|=(trail
&0xc0)^0x80;
366 c
=(c
<<6)|(trail
&0x3f);
367 illegal
|=(trail
&0xc0)^0x80;
371 /* no default branch to optimize switch() - all values are covered */
374 illegal
=1; /* too few bytes left */
377 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
378 /* illegal is also set if count>=4 */
379 if(illegal
|| c
<utf8_minLegal
[count
] || UTF_IS_SURROGATE(c
)) {
381 /* don't go beyond this sequence */
383 while(count
>0 && s
<limit
&& UTF8_IS_TRAIL(*s
)) {
393 U_CAPI UChar
* U_EXPORT2
394 u_strFromUTF8WithSub(UChar
*dest
,
395 int32_t destCapacity
,
396 int32_t *pDestLength
,
399 UChar32 subchar
, int32_t *pNumSubstitutions
,
400 UErrorCode
*pErrorCode
){
402 UChar
*pDestLimit
= dest
+destCapacity
;
404 int32_t reqLength
= 0;
405 const uint8_t* pSrc
= (const uint8_t*) src
;
406 uint8_t t1
, t2
; /* trail bytes */
407 int32_t numSubstitutions
;
410 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)){
414 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
415 (destCapacity
<0) || (dest
== NULL
&& destCapacity
> 0) ||
416 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
418 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
422 if(pNumSubstitutions
!=NULL
) {
423 *pNumSubstitutions
=0;
428 * Inline processing of UTF-8 byte sequences:
430 * Byte sequences for the most common characters are handled inline in
431 * the conversion loops. In order to reduce the path lengths for those
432 * characters, the tests are arranged in a kind of binary search.
433 * ASCII (<=0x7f) is checked first, followed by the dividing point
434 * between 2- and 3-byte sequences (0xe0).
435 * The 3-byte branch is tested first to speed up CJK text.
436 * The compiler should combine the subtractions for the two tests for 0xe0.
437 * Each branch then tests for the other end of its range.
442 * Transform a NUL-terminated string.
443 * The code explicitly checks for NULs only in the lead byte position.
444 * A NUL byte in the trail byte position fails the trail byte range check anyway.
446 while(((ch
= *pSrc
) != 0) && (pDest
< pDestLimit
)) {
452 if( /* handle U+1000..U+CFFF inline */
454 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f &&
455 (t2
= (uint8_t)(pSrc
[2] - 0x80)) <= 0x3f
457 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
458 *pDest
++ = (UChar
)((ch
<< 12) | (t1
<< 6) | t2
);
462 } else if(ch
< 0xe0) {
463 if( /* handle U+0080..U+07FF inline */
465 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f
467 *pDest
++ = (UChar
)(((ch
& 0x1f) << 6) | t1
);
473 /* function call for "complicated" and error cases */
474 ++pSrc
; /* continue after the lead byte */
475 ch
=utf8_nextCharSafeBodyTerminated(&pSrc
, ch
);
476 if(ch
<0 && (++numSubstitutions
, ch
= subchar
) < 0) {
477 *pErrorCode
= U_INVALID_CHAR_FOUND
;
479 } else if(ch
<=0xFFFF) {
480 *(pDest
++)=(UChar
)ch
;
482 *(pDest
++)=UTF16_LEAD(ch
);
483 if(pDest
<pDestLimit
) {
484 *(pDest
++)=UTF16_TRAIL(ch
);
493 /* Pre-flight the rest of the string. */
494 while((ch
= *pSrc
) != 0) {
500 if( /* handle U+1000..U+CFFF inline */
502 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f &&
503 (uint8_t)(pSrc
[2] - 0x80) <= 0x3f
509 } else if(ch
< 0xe0) {
510 if( /* handle U+0080..U+07FF inline */
512 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f
520 /* function call for "complicated" and error cases */
521 ++pSrc
; /* continue after the lead byte */
522 ch
=utf8_nextCharSafeBodyTerminated(&pSrc
, ch
);
523 if(ch
<0 && (++numSubstitutions
, ch
= subchar
) < 0) {
524 *pErrorCode
= U_INVALID_CHAR_FOUND
;
527 reqLength
+= U16_LENGTH(ch
);
530 } else /* srcLength >= 0 */ {
531 const uint8_t *pSrcLimit
= pSrc
+ srcLength
;
534 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
537 * Each iteration of the inner loop progresses by at most 3 UTF-8
538 * bytes and one UChar, for most characters.
539 * For supplementary code points (4 & 2), which are rare,
540 * there is an additional adjustment.
542 count
= (int32_t)(pDestLimit
- pDest
);
543 srcLength
= (int32_t)((pSrcLimit
- pSrc
) / 3);
544 if(count
> srcLength
) {
545 count
= srcLength
; /* min(remaining dest, remaining src/3) */
549 * Too much overhead if we get near the end of the string,
550 * continue with the next loop.
562 if( /* handle U+1000..U+CFFF inline */
564 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f &&
565 (t2
= (uint8_t)(pSrc
[2] - 0x80)) <= 0x3f
567 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
568 *pDest
++ = (UChar
)((ch
<< 12) | (t1
<< 6) | t2
);
572 } else if(ch
< 0xe0) {
573 if( /* handle U+0080..U+07FF inline */
575 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f
577 *pDest
++ = (UChar
)(((ch
& 0x1f) << 6) | t1
);
583 if(ch
>= 0xf0 || subchar
> 0xffff) {
585 * We may read up to six bytes and write up to two UChars,
586 * which we didn't account for with computing count,
587 * so we adjust it here.
594 /* function call for "complicated" and error cases */
595 ++pSrc
; /* continue after the lead byte */
596 ch
=utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
597 if(ch
<0 && (++numSubstitutions
, ch
= subchar
) < 0){
598 *pErrorCode
= U_INVALID_CHAR_FOUND
;
600 }else if(ch
<=0xFFFF){
601 *(pDest
++)=(UChar
)ch
;
603 *(pDest
++)=UTF16_LEAD(ch
);
604 *(pDest
++)=UTF16_TRAIL(ch
);
607 } while(--count
> 0);
610 while((pSrc
<pSrcLimit
) && (pDest
<pDestLimit
)) {
617 if( /* handle U+1000..U+CFFF inline */
619 ((pSrcLimit
- pSrc
) >= 3) &&
620 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f &&
621 (t2
= (uint8_t)(pSrc
[2] - 0x80)) <= 0x3f
623 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
624 *pDest
++ = (UChar
)((ch
<< 12) | (t1
<< 6) | t2
);
628 } else if(ch
< 0xe0) {
629 if( /* handle U+0080..U+07FF inline */
631 ((pSrcLimit
- pSrc
) >= 2) &&
632 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f
634 *pDest
++ = (UChar
)(((ch
& 0x1f) << 6) | t1
);
640 /* function call for "complicated" and error cases */
641 ++pSrc
; /* continue after the lead byte */
642 ch
=utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
643 if(ch
<0 && (++numSubstitutions
, ch
= subchar
) < 0){
644 *pErrorCode
= U_INVALID_CHAR_FOUND
;
646 }else if(ch
<=0xFFFF){
647 *(pDest
++)=(UChar
)ch
;
649 *(pDest
++)=UTF16_LEAD(ch
);
650 if(pDest
<pDestLimit
){
651 *(pDest
++)=UTF16_TRAIL(ch
);
659 /* do not fill the dest buffer just count the UChars needed */
660 while(pSrc
< pSrcLimit
){
667 if( /* handle U+1000..U+CFFF inline */
669 ((pSrcLimit
- pSrc
) >= 3) &&
670 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f &&
671 (uint8_t)(pSrc
[2] - 0x80) <= 0x3f
677 } else if(ch
< 0xe0) {
678 if( /* handle U+0080..U+07FF inline */
680 ((pSrcLimit
- pSrc
) >= 2) &&
681 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f
689 /* function call for "complicated" and error cases */
690 ++pSrc
; /* continue after the lead byte */
691 ch
=utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
692 if(ch
<0 && (++numSubstitutions
, ch
= subchar
) < 0){
693 *pErrorCode
= U_INVALID_CHAR_FOUND
;
696 reqLength
+=UTF_CHAR_LENGTH(ch
);
701 reqLength
+=(int32_t)(pDest
- dest
);
703 if(pNumSubstitutions
!=NULL
) {
704 *pNumSubstitutions
=numSubstitutions
;
708 *pDestLength
= reqLength
;
711 /* Terminate the buffer */
712 u_terminateUChars(dest
,destCapacity
,reqLength
,pErrorCode
);
717 U_CAPI UChar
* U_EXPORT2
718 u_strFromUTF8(UChar
*dest
,
719 int32_t destCapacity
,
720 int32_t *pDestLength
,
723 UErrorCode
*pErrorCode
){
724 return u_strFromUTF8WithSub(
725 dest
, destCapacity
, pDestLength
,
731 U_CAPI UChar
* U_EXPORT2
732 u_strFromUTF8Lenient(UChar
*dest
,
733 int32_t destCapacity
,
734 int32_t *pDestLength
,
737 UErrorCode
*pErrorCode
) {
740 int32_t reqLength
= 0;
741 uint8_t* pSrc
= (uint8_t*) src
;
744 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)){
748 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
749 (destCapacity
<0) || (dest
== NULL
&& destCapacity
> 0)
751 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
756 /* Transform a NUL-terminated string. */
757 UChar
*pDestLimit
= dest
+destCapacity
;
758 uint8_t t1
, t2
, t3
; /* trail bytes */
760 while(((ch
= *pSrc
) != 0) && (pDest
< pDestLimit
)) {
763 * ASCII, or a trail byte in lead position which is treated like
764 * a single-byte sequence for better character boundary
765 * resynchronization after illegal sequences.
770 } else if(ch
< 0xe0) { /* U+0080..U+07FF */
771 if((t1
= pSrc
[1]) != 0) {
772 /* 0x3080 = (0xc0 << 6) + 0x80 */
773 *pDest
++ = (UChar
)((ch
<< 6) + t1
- 0x3080);
777 } else if(ch
< 0xf0) { /* U+0800..U+FFFF */
778 if((t1
= pSrc
[1]) != 0 && (t2
= pSrc
[2]) != 0) {
779 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
780 /* 0x2080 = (0x80 << 6) + 0x80 */
781 *pDest
++ = (UChar
)((ch
<< 12) + (t1
<< 6) + t2
- 0x2080);
785 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
786 if((t1
= pSrc
[1]) != 0 && (t2
= pSrc
[2]) != 0 && (t3
= pSrc
[3]) != 0) {
788 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
789 ch
= (ch
<< 18) + (t1
<< 12) + (t2
<< 6) + t3
- 0x3c82080;
790 *(pDest
++) = U16_LEAD(ch
);
791 if(pDest
< pDestLimit
) {
792 *(pDest
++) = U16_TRAIL(ch
);
801 /* truncated character at the end */
803 while(*++pSrc
!= 0) {}
807 /* Pre-flight the rest of the string. */
808 while((ch
= *pSrc
) != 0) {
811 * ASCII, or a trail byte in lead position which is treated like
812 * a single-byte sequence for better character boundary
813 * resynchronization after illegal sequences.
818 } else if(ch
< 0xe0) { /* U+0080..U+07FF */
824 } else if(ch
< 0xf0) { /* U+0800..U+FFFF */
825 if(pSrc
[1] != 0 && pSrc
[2] != 0) {
830 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
831 if(pSrc
[1] != 0 && pSrc
[2] != 0 && pSrc
[3] != 0) {
838 /* truncated character at the end */
842 } else /* srcLength >= 0 */ {
843 const uint8_t *pSrcLimit
= pSrc
+ srcLength
;
846 * This function requires that if srcLength is given, then it must be
847 * destCapacity >= srcLength so that we need not check for
848 * destination buffer overflow in the loop.
850 if(destCapacity
< srcLength
) {
851 if(pDestLength
!= NULL
) {
852 *pDestLength
= srcLength
; /* this likely overestimates the true destLength! */
854 *pErrorCode
= U_BUFFER_OVERFLOW_ERROR
;
858 if((pSrcLimit
- pSrc
) >= 4) {
859 pSrcLimit
-= 3; /* temporarily reduce pSrcLimit */
861 /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
866 * ASCII, or a trail byte in lead position which is treated like
867 * a single-byte sequence for better character boundary
868 * resynchronization after illegal sequences.
871 } else if(ch
< 0xe0) { /* U+0080..U+07FF */
872 /* 0x3080 = (0xc0 << 6) + 0x80 */
873 *pDest
++ = (UChar
)((ch
<< 6) + *pSrc
++ - 0x3080);
874 } else if(ch
< 0xf0) { /* U+0800..U+FFFF */
875 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
876 /* 0x2080 = (0x80 << 6) + 0x80 */
877 ch
= (ch
<< 12) + (*pSrc
++ << 6);
878 *pDest
++ = (UChar
)(ch
+ *pSrc
++ - 0x2080);
879 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
880 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
881 ch
= (ch
<< 18) + (*pSrc
++ << 12);
883 ch
+= *pSrc
++ - 0x3c82080;
884 *(pDest
++) = U16_LEAD(ch
);
885 *(pDest
++) = U16_TRAIL(ch
);
887 } while(pSrc
< pSrcLimit
);
889 pSrcLimit
+= 3; /* restore original pSrcLimit */
892 while(pSrc
< pSrcLimit
) {
896 * ASCII, or a trail byte in lead position which is treated like
897 * a single-byte sequence for better character boundary
898 * resynchronization after illegal sequences.
902 } else if(ch
< 0xe0) { /* U+0080..U+07FF */
903 if(pSrc
< pSrcLimit
) {
904 /* 0x3080 = (0xc0 << 6) + 0x80 */
905 *pDest
++ = (UChar
)((ch
<< 6) + *pSrc
++ - 0x3080);
908 } else if(ch
< 0xf0) { /* U+0800..U+FFFF */
909 if((pSrcLimit
- pSrc
) >= 2) {
910 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
911 /* 0x2080 = (0x80 << 6) + 0x80 */
912 ch
= (ch
<< 12) + (*pSrc
++ << 6);
913 *pDest
++ = (UChar
)(ch
+ *pSrc
++ - 0x2080);
917 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
918 if((pSrcLimit
- pSrc
) >= 3) {
919 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
920 ch
= (ch
<< 18) + (*pSrc
++ << 12);
922 ch
+= *pSrc
++ - 0x3c82080;
923 *(pDest
++) = U16_LEAD(ch
);
924 *(pDest
++) = U16_TRAIL(ch
);
930 /* truncated character at the end */
936 reqLength
+=(int32_t)(pDest
- dest
);
939 *pDestLength
= reqLength
;
942 /* Terminate the buffer */
943 u_terminateUChars(dest
,destCapacity
,reqLength
,pErrorCode
);
948 static U_INLINE
uint8_t *
949 _appendUTF8(uint8_t *pDest
, UChar32 c
) {
950 /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
953 } else if(c
<=0x7ff) {
954 *pDest
++=(uint8_t)((c
>>6)|0xc0);
955 *pDest
++=(uint8_t)((c
&0x3f)|0x80);
956 } else if(c
<=0xffff) {
957 *pDest
++=(uint8_t)((c
>>12)|0xe0);
958 *pDest
++=(uint8_t)(((c
>>6)&0x3f)|0x80);
959 *pDest
++=(uint8_t)(((c
)&0x3f)|0x80);
960 } else /* if((uint32_t)(c)<=0x10ffff) */ {
961 *pDest
++=(uint8_t)(((c
)>>18)|0xf0);
962 *pDest
++=(uint8_t)((((c
)>>12)&0x3f)|0x80);
963 *pDest
++=(uint8_t)((((c
)>>6)&0x3f)|0x80);
964 *pDest
++=(uint8_t)(((c
)&0x3f)|0x80);
970 U_CAPI
char* U_EXPORT2
971 u_strToUTF8WithSub(char *dest
,
972 int32_t destCapacity
,
973 int32_t *pDestLength
,
976 UChar32 subchar
, int32_t *pNumSubstitutions
,
977 UErrorCode
*pErrorCode
){
980 uint8_t *pDest
= (uint8_t *)dest
;
981 uint8_t *pDestLimit
= pDest
+ destCapacity
;
982 int32_t numSubstitutions
;
985 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)){
989 if( (pSrc
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
990 (destCapacity
<0) || (dest
== NULL
&& destCapacity
> 0) ||
991 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
993 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
997 if(pNumSubstitutions
!=NULL
) {
998 *pNumSubstitutions
=0;
1003 while((ch
=*pSrc
)!=0) {
1006 if(pDest
<pDestLimit
) {
1007 *pDest
++ = (uint8_t)ch
;
1012 } else if(ch
<= 0x7ff) {
1013 if((pDestLimit
- pDest
) >= 2) {
1014 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
1015 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1020 } else if(ch
<= 0xd7ff || ch
>= 0xe000) {
1021 if((pDestLimit
- pDest
) >= 3) {
1022 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
1023 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1024 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1029 } else /* ch is a surrogate */ {
1032 /*need not check for NUL because NUL fails UTF_IS_TRAIL() anyway*/
1033 if(UTF_IS_SURROGATE_FIRST(ch
) && UTF_IS_TRAIL(ch2
=*pSrc
)) {
1035 ch
=UTF16_GET_PAIR_VALUE(ch
, ch2
);
1036 } else if(subchar
>=0) {
1040 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1041 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1045 length
= U8_LENGTH(ch
);
1046 if((pDestLimit
- pDest
) >= length
) {
1047 /* convert and append*/
1048 pDest
=_appendUTF8(pDest
, ch
);
1055 while((ch
=*pSrc
++)!=0) {
1058 } else if(ch
<=0x7ff) {
1060 } else if(!UTF_IS_SURROGATE(ch
)) {
1062 } else if(UTF_IS_SURROGATE_FIRST(ch
) && UTF_IS_TRAIL(ch2
=*pSrc
)) {
1065 } else if(subchar
>=0) {
1066 reqLength
+=U8_LENGTH(subchar
);
1069 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1070 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1075 const UChar
*pSrcLimit
= pSrc
+srcLength
;
1078 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1081 * Each iteration of the inner loop progresses by at most 3 UTF-8
1082 * bytes and one UChar, for most characters.
1083 * For supplementary code points (4 & 2), which are rare,
1084 * there is an additional adjustment.
1086 count
= (int32_t)((pDestLimit
- pDest
) / 3);
1087 srcLength
= (int32_t)(pSrcLimit
- pSrc
);
1088 if(count
> srcLength
) {
1089 count
= srcLength
; /* min(remaining dest/3, remaining src) */
1093 * Too much overhead if we get near the end of the string,
1094 * continue with the next loop.
1101 *pDest
++ = (uint8_t)ch
;
1102 } else if(ch
<= 0x7ff) {
1103 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
1104 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1105 } else if(ch
<= 0xd7ff || ch
>= 0xe000) {
1106 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
1107 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1108 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1109 } else /* ch is a surrogate */ {
1111 * We will read two UChars and probably output four bytes,
1112 * which we didn't account for with computing count,
1113 * so we adjust it here.
1116 --pSrc
; /* undo ch=*pSrc++ for the lead surrogate */
1117 break; /* recompute count */
1120 if(UTF_IS_SURROGATE_FIRST(ch
) && UTF_IS_TRAIL(ch2
=*pSrc
)) {
1122 ch
=UTF16_GET_PAIR_VALUE(ch
, ch2
);
1124 /* writing 4 bytes per 2 UChars is ok */
1125 *pDest
++=(uint8_t)((ch
>>18)|0xf0);
1126 *pDest
++=(uint8_t)(((ch
>>12)&0x3f)|0x80);
1127 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1128 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1130 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1135 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1139 /* convert and append*/
1140 pDest
=_appendUTF8(pDest
, ch
);
1143 } while(--count
> 0);
1146 while(pSrc
<pSrcLimit
) {
1149 if(pDest
<pDestLimit
) {
1150 *pDest
++ = (uint8_t)ch
;
1155 } else if(ch
<= 0x7ff) {
1156 if((pDestLimit
- pDest
) >= 2) {
1157 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
1158 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1163 } else if(ch
<= 0xd7ff || ch
>= 0xe000) {
1164 if((pDestLimit
- pDest
) >= 3) {
1165 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
1166 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1167 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1172 } else /* ch is a surrogate */ {
1175 if(UTF_IS_SURROGATE_FIRST(ch
) && pSrc
<pSrcLimit
&& UTF_IS_TRAIL(ch2
=*pSrc
)) {
1177 ch
=UTF16_GET_PAIR_VALUE(ch
, ch2
);
1178 } else if(subchar
>=0) {
1182 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1183 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1187 length
= U8_LENGTH(ch
);
1188 if((pDestLimit
- pDest
) >= length
) {
1189 /* convert and append*/
1190 pDest
=_appendUTF8(pDest
, ch
);
1197 while(pSrc
<pSrcLimit
) {
1201 } else if(ch
<=0x7ff) {
1203 } else if(!UTF_IS_SURROGATE(ch
)) {
1205 } else if(UTF_IS_SURROGATE_FIRST(ch
) && pSrc
<pSrcLimit
&& UTF_IS_TRAIL(ch2
=*pSrc
)) {
1208 } else if(subchar
>=0) {
1209 reqLength
+=U8_LENGTH(subchar
);
1212 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1213 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1219 reqLength
+=(int32_t)(pDest
- (uint8_t *)dest
);
1221 if(pNumSubstitutions
!=NULL
) {
1222 *pNumSubstitutions
=numSubstitutions
;
1226 *pDestLength
= reqLength
;
1229 /* Terminate the buffer */
1230 u_terminateChars(dest
, destCapacity
, reqLength
, pErrorCode
);
1234 U_CAPI
char* U_EXPORT2
1235 u_strToUTF8(char *dest
,
1236 int32_t destCapacity
,
1237 int32_t *pDestLength
,
1240 UErrorCode
*pErrorCode
){
1241 return u_strToUTF8WithSub(
1242 dest
, destCapacity
, pDestLength
,
1248 U_CAPI UChar
* U_EXPORT2
1249 u_strFromJavaModifiedUTF8WithSub(
1251 int32_t destCapacity
,
1252 int32_t *pDestLength
,
1255 UChar32 subchar
, int32_t *pNumSubstitutions
,
1256 UErrorCode
*pErrorCode
) {
1257 UChar
*pDest
= dest
;
1258 UChar
*pDestLimit
= dest
+destCapacity
;
1260 int32_t reqLength
= 0;
1261 const uint8_t* pSrc
= (const uint8_t*) src
;
1262 const uint8_t *pSrcLimit
;
1264 uint8_t t1
, t2
; /* trail bytes */
1265 int32_t numSubstitutions
;
1268 if(U_FAILURE(*pErrorCode
)){
1271 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
1272 (dest
==NULL
&& destCapacity
!=0) || destCapacity
<0 ||
1273 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
1275 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
1279 if(pNumSubstitutions
!=NULL
) {
1280 *pNumSubstitutions
=0;
1286 * Transform a NUL-terminated ASCII string.
1287 * Handle non-ASCII strings with slower code.
1289 while(((ch
= *pSrc
) != 0) && ch
<= 0x7f && (pDest
< pDestLimit
)) {
1294 reqLength
=(int32_t)(pDest
- dest
);
1296 *pDestLength
= reqLength
;
1299 /* Terminate the buffer */
1300 u_terminateUChars(dest
, destCapacity
, reqLength
, pErrorCode
);
1303 srcLength
= uprv_strlen((const char *)pSrc
);
1306 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1307 pSrcLimit
= pSrc
+ srcLength
;
1309 count
= (int32_t)(pDestLimit
- pDest
);
1310 srcLength
= (int32_t)(pSrcLimit
- pSrc
);
1311 if(count
>= srcLength
&& srcLength
> 0 && *pSrc
<= 0x7f) {
1312 /* fast ASCII loop */
1313 const uint8_t *prevSrc
= pSrc
;
1315 while(pSrc
< pSrcLimit
&& (ch
= *pSrc
) <= 0x7f) {
1319 delta
= (int32_t)(pSrc
- prevSrc
);
1324 * Each iteration of the inner loop progresses by at most 3 UTF-8
1325 * bytes and one UChar.
1328 if(count
> srcLength
) {
1329 count
= srcLength
; /* min(remaining dest, remaining src/3) */
1333 * Too much overhead if we get near the end of the string,
1334 * continue with the next loop.
1345 if( /* handle U+0000..U+FFFF inline */
1347 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f &&
1348 (t2
= (uint8_t)(pSrc
[2] - 0x80)) <= 0x3f
1350 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1351 *pDest
++ = (UChar
)((ch
<< 12) | (t1
<< 6) | t2
);
1356 if( /* handle U+0000..U+07FF inline */
1358 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f
1360 *pDest
++ = (UChar
)(((ch
& 0x1f) << 6) | t1
);
1367 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1369 } else if(subchar
> 0xffff && --count
== 0) {
1371 * We need to write two UChars, adjusted count for that,
1372 * and ran out of space.
1376 /* function call for error cases */
1377 ++pSrc
; /* continue after the lead byte */
1378 utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
1380 if(subchar
<=0xFFFF) {
1381 *(pDest
++)=(UChar
)subchar
;
1383 *(pDest
++)=U16_LEAD(subchar
);
1384 *(pDest
++)=U16_TRAIL(subchar
);
1388 } while(--count
> 0);
1391 while((pSrc
<pSrcLimit
) && (pDest
<pDestLimit
)) {
1398 if( /* handle U+0000..U+FFFF inline */
1400 ((pSrcLimit
- pSrc
) >= 3) &&
1401 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f &&
1402 (t2
= (uint8_t)(pSrc
[2] - 0x80)) <= 0x3f
1404 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1405 *pDest
++ = (UChar
)((ch
<< 12) | (t1
<< 6) | t2
);
1410 if( /* handle U+0000..U+07FF inline */
1412 ((pSrcLimit
- pSrc
) >= 2) &&
1413 (t1
= (uint8_t)(pSrc
[1] - 0x80)) <= 0x3f
1415 *pDest
++ = (UChar
)(((ch
& 0x1f) << 6) | t1
);
1422 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1425 /* function call for error cases */
1426 ++pSrc
; /* continue after the lead byte */
1427 utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
1429 if(subchar
<=0xFFFF) {
1430 *(pDest
++)=(UChar
)subchar
;
1432 *(pDest
++)=U16_LEAD(subchar
);
1433 if(pDest
<pDestLimit
) {
1434 *(pDest
++)=U16_TRAIL(subchar
);
1444 /* do not fill the dest buffer just count the UChars needed */
1445 while(pSrc
< pSrcLimit
){
1452 if( /* handle U+0000..U+FFFF inline */
1454 ((pSrcLimit
- pSrc
) >= 3) &&
1455 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f &&
1456 (uint8_t)(pSrc
[2] - 0x80) <= 0x3f
1463 if( /* handle U+0000..U+07FF inline */
1465 ((pSrcLimit
- pSrc
) >= 2) &&
1466 (uint8_t)(pSrc
[1] - 0x80) <= 0x3f
1475 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1478 /* function call for error cases */
1479 ++pSrc
; /* continue after the lead byte */
1480 utf8_nextCharSafeBodyPointer(&pSrc
, pSrcLimit
, ch
);
1482 reqLength
+=U16_LENGTH(ch
);
1487 if(pNumSubstitutions
!=NULL
) {
1488 *pNumSubstitutions
=numSubstitutions
;
1491 reqLength
+=(int32_t)(pDest
- dest
);
1493 *pDestLength
= reqLength
;
1496 /* Terminate the buffer */
1497 u_terminateUChars(dest
, destCapacity
, reqLength
, pErrorCode
);
1501 U_CAPI
char* U_EXPORT2
1502 u_strToJavaModifiedUTF8(
1504 int32_t destCapacity
,
1505 int32_t *pDestLength
,
1508 UErrorCode
*pErrorCode
) {
1509 int32_t reqLength
=0;
1511 uint8_t *pDest
= (uint8_t *)dest
;
1512 uint8_t *pDestLimit
= pDest
+ destCapacity
;
1513 const UChar
*pSrcLimit
;
1517 if(U_FAILURE(*pErrorCode
)){
1520 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
1521 (dest
==NULL
&& destCapacity
!=0) || destCapacity
<0
1523 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
1528 /* Convert NUL-terminated ASCII, then find the string length. */
1529 while((ch
=*src
)<=0x7f && ch
!= 0 && pDest
<pDestLimit
) {
1530 *pDest
++ = (uint8_t)ch
;
1534 reqLength
=(int32_t)(pDest
- (uint8_t *)dest
);
1536 *pDestLength
= reqLength
;
1539 /* Terminate the buffer */
1540 u_terminateChars(dest
, destCapacity
, reqLength
, pErrorCode
);
1543 srcLength
= u_strlen(src
);
1546 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1547 pSrcLimit
= src
+srcLength
;
1549 count
= (int32_t)(pDestLimit
- pDest
);
1550 srcLength
= (int32_t)(pSrcLimit
- src
);
1551 if(count
>= srcLength
&& srcLength
> 0 && *src
<= 0x7f) {
1552 /* fast ASCII loop */
1553 const UChar
*prevSrc
= src
;
1555 while(src
< pSrcLimit
&& (ch
= *src
) <= 0x7f && ch
!= 0) {
1556 *pDest
++=(uint8_t)ch
;
1559 delta
= (int32_t)(src
- prevSrc
);
1564 * Each iteration of the inner loop progresses by at most 3 UTF-8
1565 * bytes and one UChar.
1568 if(count
> srcLength
) {
1569 count
= srcLength
; /* min(remaining dest/3, remaining src) */
1573 * Too much overhead if we get near the end of the string,
1574 * continue with the next loop.
1580 if(ch
<= 0x7f && ch
!= 0) {
1581 *pDest
++ = (uint8_t)ch
;
1582 } else if(ch
<= 0x7ff) {
1583 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
1584 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1586 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
1587 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1588 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1590 } while(--count
> 0);
1593 while(src
<pSrcLimit
) {
1595 if(ch
<= 0x7f && ch
!= 0) {
1596 if(pDest
<pDestLimit
) {
1597 *pDest
++ = (uint8_t)ch
;
1602 } else if(ch
<= 0x7ff) {
1603 if((pDestLimit
- pDest
) >= 2) {
1604 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
1605 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1611 if((pDestLimit
- pDest
) >= 3) {
1612 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
1613 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1614 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1621 while(src
<pSrcLimit
) {
1623 if(ch
<= 0x7f && ch
!= 0) {
1625 } else if(ch
<=0x7ff) {
1632 reqLength
+=(int32_t)(pDest
- (uint8_t *)dest
);
1634 *pDestLength
= reqLength
;
1637 /* Terminate the buffer */
1638 u_terminateChars(dest
, destCapacity
, reqLength
, pErrorCode
);