]>
git.saurik.com Git - apple/icu.git/blob - icuSources/common/ustrtrns.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 ******************************************************************************
6 * Copyright (C) 2001-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 ******************************************************************************
13 * Modification History:
15 * Date Name Description
16 * 9/10/2001 Ram Creation.
17 ******************************************************************************
20 /*******************************************************************************
22 * u_strTo* and u_strFrom* APIs
23 * WCS functions moved to ustr_wcs.c for better modularization
25 *******************************************************************************
29 #include "unicode/putil.h"
30 #include "unicode/ustring.h"
31 #include "unicode/utf.h"
32 #include "unicode/utf8.h"
33 #include "unicode/utf16.h"
39 U_CAPI UChar
* U_EXPORT2
40 u_strFromUTF32WithSub(UChar
*dest
,
45 UChar32 subchar
, int32_t *pNumSubstitutions
,
46 UErrorCode
*pErrorCode
) {
47 const UChar32
*srcLimit
;
52 int32_t numSubstitutions
;
55 if(U_FAILURE(*pErrorCode
)){
58 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
59 (destCapacity
<0) || (dest
== NULL
&& destCapacity
> 0) ||
60 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
62 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
66 if(pNumSubstitutions
!= NULL
) {
67 *pNumSubstitutions
= 0;
71 destLimit
= (dest
!=NULL
)?(dest
+ destCapacity
):NULL
;
76 /* simple loop for conversion of a NUL-terminated BMP string */
77 while((ch
=*src
) != 0 &&
78 ((uint32_t)ch
< 0xd800 || (0xe000 <= ch
&& ch
<= 0xffff))) {
80 if(pDest
< destLimit
) {
88 /* "complicated" case, find the end of the remaining string */
89 while(*++srcLimit
!= 0) {}
92 srcLimit
= (src
!=NULL
)?(src
+ srcLength
):NULL
;
95 /* convert with length */
96 while(src
< srcLimit
) {
99 /* usually "loops" once; twice only for writing subchar */
100 if((uint32_t)ch
< 0xd800 || (0xe000 <= ch
&& ch
<= 0xffff)) {
101 if(pDest
< destLimit
) {
102 *pDest
++ = (UChar
)ch
;
107 } else if(0x10000 <= ch
&& ch
<= 0x10ffff) {
108 if(pDest
!=NULL
&& ((pDest
+ 2) <= destLimit
)) {
109 *pDest
++ = U16_LEAD(ch
);
110 *pDest
++ = U16_TRAIL(ch
);
115 } else if((ch
= subchar
) < 0) {
116 /* surrogate code point, or not a Unicode code point at all */
117 *pErrorCode
= U_INVALID_CHAR_FOUND
;
125 reqLength
+= (int32_t)(pDest
- dest
);
127 *pDestLength
= reqLength
;
129 if(pNumSubstitutions
!= NULL
) {
130 *pNumSubstitutions
= numSubstitutions
;
133 /* Terminate the buffer */
134 u_terminateUChars(dest
, destCapacity
, reqLength
, pErrorCode
);
139 U_CAPI UChar
* U_EXPORT2
140 u_strFromUTF32(UChar
*dest
,
141 int32_t destCapacity
,
142 int32_t *pDestLength
,
145 UErrorCode
*pErrorCode
) {
146 return u_strFromUTF32WithSub(
147 dest
, destCapacity
, pDestLength
,
153 U_CAPI UChar32
* U_EXPORT2
154 u_strToUTF32WithSub(UChar32
*dest
,
155 int32_t destCapacity
,
156 int32_t *pDestLength
,
159 UChar32 subchar
, int32_t *pNumSubstitutions
,
160 UErrorCode
*pErrorCode
) {
161 const UChar
*srcLimit
;
167 int32_t numSubstitutions
;
170 if(U_FAILURE(*pErrorCode
)){
173 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
174 (destCapacity
<0) || (dest
== NULL
&& destCapacity
> 0) ||
175 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
177 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
181 if(pNumSubstitutions
!= NULL
) {
182 *pNumSubstitutions
= 0;
186 destLimit
= (dest
!=NULL
)?(dest
+ destCapacity
):NULL
;
188 numSubstitutions
= 0;
191 /* simple loop for conversion of a NUL-terminated BMP string */
192 while((ch
=*src
) != 0 && !U16_IS_SURROGATE(ch
)) {
194 if(pDest
< destLimit
) {
202 /* "complicated" case, find the end of the remaining string */
203 while(*++srcLimit
!= 0) {}
206 srcLimit
= (src
!=NULL
)?(src
+ srcLength
):NULL
;
209 /* convert with length */
210 while(src
< srcLimit
) {
212 if(!U16_IS_SURROGATE(ch
)) {
213 /* write or count ch below */
214 } else if(U16_IS_SURROGATE_LEAD(ch
) && src
< srcLimit
&& U16_IS_TRAIL(ch2
= *src
)) {
216 ch
= U16_GET_SUPPLEMENTARY(ch
, ch2
);
217 } else if((ch
= subchar
) < 0) {
218 /* unpaired surrogate */
219 *pErrorCode
= U_INVALID_CHAR_FOUND
;
224 if(pDest
< destLimit
) {
231 reqLength
+= (int32_t)(pDest
- dest
);
233 *pDestLength
= reqLength
;
235 if(pNumSubstitutions
!= NULL
) {
236 *pNumSubstitutions
= numSubstitutions
;
239 /* Terminate the buffer */
240 u_terminateUChar32s(dest
, destCapacity
, reqLength
, pErrorCode
);
245 U_CAPI UChar32
* U_EXPORT2
246 u_strToUTF32(UChar32
*dest
,
247 int32_t destCapacity
,
248 int32_t *pDestLength
,
251 UErrorCode
*pErrorCode
) {
252 return u_strToUTF32WithSub(
253 dest
, destCapacity
, pDestLength
,
259 U_CAPI UChar
* U_EXPORT2
260 u_strFromUTF8WithSub(UChar
*dest
,
261 int32_t destCapacity
,
262 int32_t *pDestLength
,
265 UChar32 subchar
, int32_t *pNumSubstitutions
,
266 UErrorCode
*pErrorCode
){
268 if(U_FAILURE(*pErrorCode
)) {
271 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
272 (destCapacity
<0) || (dest
== NULL
&& destCapacity
> 0) ||
273 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
275 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
279 if(pNumSubstitutions
!=NULL
) {
280 *pNumSubstitutions
=0;
283 UChar
*pDestLimit
= dest
+destCapacity
;
284 int32_t reqLength
= 0;
285 int32_t numSubstitutions
=0;
288 * Inline processing of UTF-8 byte sequences:
290 * Byte sequences for the most common characters are handled inline in
291 * the conversion loops. In order to reduce the path lengths for those
292 * characters, the tests are arranged in a kind of binary search.
293 * ASCII (<=0x7f) is checked first, followed by the dividing point
294 * between 2- and 3-byte sequences (0xe0).
295 * The 3-byte branch is tested first to speed up CJK text.
296 * The compiler should combine the subtractions for the two tests for 0xe0.
297 * Each branch then tests for the other end of its range.
302 * Transform a NUL-terminated string.
303 * The code explicitly checks for NULs only in the lead byte position.
304 * A NUL byte in the trail byte position fails the trail byte range check anyway.
308 for(i
= 0; (c
= (uint8_t)src
[i
]) != 0 && (pDest
< pDestLimit
);) {
309 // modified copy of U8_NEXT()
311 if(U8_IS_SINGLE(c
)) {
315 if( /* handle U+0800..U+FFFF inline */
316 (0xe0<=(c
) && (c
)<0xf0) &&
317 U8_IS_VALID_LEAD3_AND_T1((c
), src
[i
]) &&
318 (__t2
=src
[(i
)+1]-0x80)<=0x3f) {
319 *pDest
++ = (((c
)&0xf)<<12)|((src
[i
]&0x3f)<<6)|__t2
;
321 } else if( /* handle U+0080..U+07FF inline */
322 ((c
)<0xe0 && (c
)>=0xc2) &&
323 (__t1
=src
[i
]-0x80)<=0x3f) {
324 *pDest
++ = (((c
)&0x1f)<<6)|__t1
;
327 /* function call for "complicated" and error cases */
328 (c
)=utf8_nextCharSafeBody((const uint8_t *)src
, &(i
), -1, c
, -1);
329 if(c
<0 && (++numSubstitutions
, c
= subchar
) < 0) {
330 *pErrorCode
= U_INVALID_CHAR_FOUND
;
332 } else if(c
<=0xFFFF) {
335 *(pDest
++)=U16_LEAD(c
);
336 if(pDest
<pDestLimit
) {
337 *(pDest
++)=U16_TRAIL(c
);
347 /* Pre-flight the rest of the string. */
348 while((c
= (uint8_t)src
[i
]) != 0) {
349 // modified copy of U8_NEXT()
351 if(U8_IS_SINGLE(c
)) {
355 if( /* handle U+0800..U+FFFF inline */
356 (0xe0<=(c
) && (c
)<0xf0) &&
357 U8_IS_VALID_LEAD3_AND_T1((c
), src
[i
]) &&
358 (__t2
=src
[(i
)+1]-0x80)<=0x3f) {
361 } else if( /* handle U+0080..U+07FF inline */
362 ((c
)<0xe0 && (c
)>=0xc2) &&
363 (__t1
=src
[i
]-0x80)<=0x3f) {
367 /* function call for "complicated" and error cases */
368 (c
)=utf8_nextCharSafeBody((const uint8_t *)src
, &(i
), -1, c
, -1);
369 if(c
<0 && (++numSubstitutions
, c
= subchar
) < 0) {
370 *pErrorCode
= U_INVALID_CHAR_FOUND
;
373 reqLength
+= U16_LENGTH(c
);
377 } else /* srcLength >= 0 */ {
378 /* Faster loop without ongoing checking for srcLength and pDestLimit. */
383 * Each iteration of the inner loop progresses by at most 3 UTF-8
384 * bytes and one UChar, for most characters.
385 * For supplementary code points (4 & 2), which are rare,
386 * there is an additional adjustment.
388 int32_t count
= (int32_t)(pDestLimit
- pDest
);
389 int32_t count2
= (srcLength
- i
) / 3;
391 count
= count2
; /* min(remaining dest, remaining src/3) */
395 * Too much overhead if we get near the end of the string,
396 * continue with the next loop.
402 // modified copy of U8_NEXT()
403 c
= (uint8_t)src
[i
++];
404 if(U8_IS_SINGLE(c
)) {
408 if( /* handle U+0800..U+FFFF inline */
409 (0xe0<=(c
) && (c
)<0xf0) &&
411 U8_IS_VALID_LEAD3_AND_T1((c
), src
[i
]) &&
412 (__t2
=src
[(i
)+1]-0x80)<=0x3f) {
413 *pDest
++ = (((c
)&0xf)<<12)|((src
[i
]&0x3f)<<6)|__t2
;
415 } else if( /* handle U+0080..U+07FF inline */
416 ((c
)<0xe0 && (c
)>=0xc2) &&
418 (__t1
=src
[i
]-0x80)<=0x3f) {
419 *pDest
++ = (((c
)&0x1f)<<6)|__t1
;
422 if(c
>= 0xf0 || subchar
> 0xffff) {
423 // We may read up to four bytes and write up to two UChars,
424 // which we didn't account for with computing count,
425 // so we adjust it here.
427 --i
; // back out byte c
432 /* function call for "complicated" and error cases */
433 (c
)=utf8_nextCharSafeBody((const uint8_t *)src
, &(i
), srcLength
, c
, -1);
434 if(c
<0 && (++numSubstitutions
, c
= subchar
) < 0) {
435 *pErrorCode
= U_INVALID_CHAR_FOUND
;
437 } else if(c
<=0xFFFF) {
440 *(pDest
++)=U16_LEAD(c
);
441 *(pDest
++)=U16_TRAIL(c
);
445 } while(--count
> 0);
448 while(i
< srcLength
&& (pDest
< pDestLimit
)) {
449 // modified copy of U8_NEXT()
450 c
= (uint8_t)src
[i
++];
451 if(U8_IS_SINGLE(c
)) {
455 if( /* handle U+0800..U+FFFF inline */
456 (0xe0<=(c
) && (c
)<0xf0) &&
458 U8_IS_VALID_LEAD3_AND_T1((c
), src
[i
]) &&
459 (__t2
=src
[(i
)+1]-0x80)<=0x3f) {
460 *pDest
++ = (((c
)&0xf)<<12)|((src
[i
]&0x3f)<<6)|__t2
;
462 } else if( /* handle U+0080..U+07FF inline */
463 ((c
)<0xe0 && (c
)>=0xc2) &&
465 (__t1
=src
[i
]-0x80)<=0x3f) {
466 *pDest
++ = (((c
)&0x1f)<<6)|__t1
;
469 /* function call for "complicated" and error cases */
470 (c
)=utf8_nextCharSafeBody((const uint8_t *)src
, &(i
), srcLength
, c
, -1);
471 if(c
<0 && (++numSubstitutions
, c
= subchar
) < 0) {
472 *pErrorCode
= U_INVALID_CHAR_FOUND
;
474 } else if(c
<=0xFFFF) {
477 *(pDest
++)=U16_LEAD(c
);
478 if(pDest
<pDestLimit
) {
479 *(pDest
++)=U16_TRAIL(c
);
489 /* Pre-flight the rest of the string. */
490 while(i
< srcLength
) {
491 // modified copy of U8_NEXT()
492 c
= (uint8_t)src
[i
++];
493 if(U8_IS_SINGLE(c
)) {
497 if( /* handle U+0800..U+FFFF inline */
498 (0xe0<=(c
) && (c
)<0xf0) &&
500 U8_IS_VALID_LEAD3_AND_T1((c
), src
[i
]) &&
501 (__t2
=src
[(i
)+1]-0x80)<=0x3f) {
504 } else if( /* handle U+0080..U+07FF inline */
505 ((c
)<0xe0 && (c
)>=0xc2) &&
507 (__t1
=src
[i
]-0x80)<=0x3f) {
511 /* function call for "complicated" and error cases */
512 (c
)=utf8_nextCharSafeBody((const uint8_t *)src
, &(i
), srcLength
, c
, -1);
513 if(c
<0 && (++numSubstitutions
, c
= subchar
) < 0) {
514 *pErrorCode
= U_INVALID_CHAR_FOUND
;
517 reqLength
+= U16_LENGTH(c
);
523 reqLength
+=(int32_t)(pDest
- dest
);
525 if(pNumSubstitutions
!=NULL
) {
526 *pNumSubstitutions
=numSubstitutions
;
530 *pDestLength
= reqLength
;
533 /* Terminate the buffer */
534 u_terminateUChars(dest
,destCapacity
,reqLength
,pErrorCode
);
539 U_CAPI UChar
* U_EXPORT2
540 u_strFromUTF8(UChar
*dest
,
541 int32_t destCapacity
,
542 int32_t *pDestLength
,
545 UErrorCode
*pErrorCode
){
546 return u_strFromUTF8WithSub(
547 dest
, destCapacity
, pDestLength
,
553 U_CAPI UChar
* U_EXPORT2
554 u_strFromUTF8Lenient(UChar
*dest
,
555 int32_t destCapacity
,
556 int32_t *pDestLength
,
559 UErrorCode
*pErrorCode
) {
562 int32_t reqLength
= 0;
563 uint8_t* pSrc
= (uint8_t*) src
;
566 if(U_FAILURE(*pErrorCode
)){
570 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
571 (destCapacity
<0) || (dest
== NULL
&& destCapacity
> 0)
573 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
578 /* Transform a NUL-terminated string. */
579 UChar
*pDestLimit
= (dest
!=NULL
)?(dest
+destCapacity
):NULL
;
580 uint8_t t1
, t2
, t3
; /* trail bytes */
582 while(((ch
= *pSrc
) != 0) && (pDest
< pDestLimit
)) {
585 * ASCII, or a trail byte in lead position which is treated like
586 * a single-byte sequence for better character boundary
587 * resynchronization after illegal sequences.
592 } else if(ch
< 0xe0) { /* U+0080..U+07FF */
593 if((t1
= pSrc
[1]) != 0) {
594 /* 0x3080 = (0xc0 << 6) + 0x80 */
595 *pDest
++ = (UChar
)((ch
<< 6) + t1
- 0x3080);
599 } else if(ch
< 0xf0) { /* U+0800..U+FFFF */
600 if((t1
= pSrc
[1]) != 0 && (t2
= pSrc
[2]) != 0) {
601 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
602 /* 0x2080 = (0x80 << 6) + 0x80 */
603 *pDest
++ = (UChar
)((ch
<< 12) + (t1
<< 6) + t2
- 0x2080);
607 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
608 if((t1
= pSrc
[1]) != 0 && (t2
= pSrc
[2]) != 0 && (t3
= pSrc
[3]) != 0) {
610 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
611 ch
= (ch
<< 18) + (t1
<< 12) + (t2
<< 6) + t3
- 0x3c82080;
612 *(pDest
++) = U16_LEAD(ch
);
613 if(pDest
< pDestLimit
) {
614 *(pDest
++) = U16_TRAIL(ch
);
623 /* truncated character at the end */
625 while(*++pSrc
!= 0) {}
629 /* Pre-flight the rest of the string. */
630 while((ch
= *pSrc
) != 0) {
633 * ASCII, or a trail byte in lead position which is treated like
634 * a single-byte sequence for better character boundary
635 * resynchronization after illegal sequences.
640 } else if(ch
< 0xe0) { /* U+0080..U+07FF */
646 } else if(ch
< 0xf0) { /* U+0800..U+FFFF */
647 if(pSrc
[1] != 0 && pSrc
[2] != 0) {
652 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
653 if(pSrc
[1] != 0 && pSrc
[2] != 0 && pSrc
[3] != 0) {
660 /* truncated character at the end */
664 } else /* srcLength >= 0 */ {
665 const uint8_t *pSrcLimit
= (pSrc
!=NULL
)?(pSrc
+ srcLength
):NULL
;
668 * This function requires that if srcLength is given, then it must be
669 * destCapatity >= srcLength so that we need not check for
670 * destination buffer overflow in the loop.
672 if(destCapacity
< srcLength
) {
673 if(pDestLength
!= NULL
) {
674 *pDestLength
= srcLength
; /* this likely overestimates the true destLength! */
676 *pErrorCode
= U_BUFFER_OVERFLOW_ERROR
;
680 if((pSrcLimit
- pSrc
) >= 4) {
681 pSrcLimit
-= 3; /* temporarily reduce pSrcLimit */
683 /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
688 * ASCII, or a trail byte in lead position which is treated like
689 * a single-byte sequence for better character boundary
690 * resynchronization after illegal sequences.
693 } else if(ch
< 0xe0) { /* U+0080..U+07FF */
694 /* 0x3080 = (0xc0 << 6) + 0x80 */
695 *pDest
++ = (UChar
)((ch
<< 6) + *pSrc
++ - 0x3080);
696 } else if(ch
< 0xf0) { /* U+0800..U+FFFF */
697 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
698 /* 0x2080 = (0x80 << 6) + 0x80 */
699 ch
= (ch
<< 12) + (*pSrc
++ << 6);
700 *pDest
++ = (UChar
)(ch
+ *pSrc
++ - 0x2080);
701 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
702 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
703 ch
= (ch
<< 18) + (*pSrc
++ << 12);
705 ch
+= *pSrc
++ - 0x3c82080;
706 *(pDest
++) = U16_LEAD(ch
);
707 *(pDest
++) = U16_TRAIL(ch
);
709 } while(pSrc
< pSrcLimit
);
711 pSrcLimit
+= 3; /* restore original pSrcLimit */
714 while(pSrc
< pSrcLimit
) {
718 * ASCII, or a trail byte in lead position which is treated like
719 * a single-byte sequence for better character boundary
720 * resynchronization after illegal sequences.
724 } else if(ch
< 0xe0) { /* U+0080..U+07FF */
725 if(pSrc
< pSrcLimit
) {
726 /* 0x3080 = (0xc0 << 6) + 0x80 */
727 *pDest
++ = (UChar
)((ch
<< 6) + *pSrc
++ - 0x3080);
730 } else if(ch
< 0xf0) { /* U+0800..U+FFFF */
731 if((pSrcLimit
- pSrc
) >= 2) {
732 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
733 /* 0x2080 = (0x80 << 6) + 0x80 */
734 ch
= (ch
<< 12) + (*pSrc
++ << 6);
735 *pDest
++ = (UChar
)(ch
+ *pSrc
++ - 0x2080);
739 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
740 if((pSrcLimit
- pSrc
) >= 3) {
741 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
742 ch
= (ch
<< 18) + (*pSrc
++ << 12);
744 ch
+= *pSrc
++ - 0x3c82080;
745 *(pDest
++) = U16_LEAD(ch
);
746 *(pDest
++) = U16_TRAIL(ch
);
752 /* truncated character at the end */
758 reqLength
+=(int32_t)(pDest
- dest
);
761 *pDestLength
= reqLength
;
764 /* Terminate the buffer */
765 u_terminateUChars(dest
,destCapacity
,reqLength
,pErrorCode
);
770 static inline uint8_t *
771 _appendUTF8(uint8_t *pDest
, UChar32 c
) {
772 /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
775 } else if(c
<=0x7ff) {
776 *pDest
++=(uint8_t)((c
>>6)|0xc0);
777 *pDest
++=(uint8_t)((c
&0x3f)|0x80);
778 } else if(c
<=0xffff) {
779 *pDest
++=(uint8_t)((c
>>12)|0xe0);
780 *pDest
++=(uint8_t)(((c
>>6)&0x3f)|0x80);
781 *pDest
++=(uint8_t)(((c
)&0x3f)|0x80);
782 } else /* if((uint32_t)(c)<=0x10ffff) */ {
783 *pDest
++=(uint8_t)(((c
)>>18)|0xf0);
784 *pDest
++=(uint8_t)((((c
)>>12)&0x3f)|0x80);
785 *pDest
++=(uint8_t)((((c
)>>6)&0x3f)|0x80);
786 *pDest
++=(uint8_t)(((c
)&0x3f)|0x80);
792 U_CAPI
char* U_EXPORT2
793 u_strToUTF8WithSub(char *dest
,
794 int32_t destCapacity
,
795 int32_t *pDestLength
,
798 UChar32 subchar
, int32_t *pNumSubstitutions
,
799 UErrorCode
*pErrorCode
){
802 uint8_t *pDest
= (uint8_t *)dest
;
803 uint8_t *pDestLimit
= (pDest
!=NULL
)?(pDest
+ destCapacity
):NULL
;
804 int32_t numSubstitutions
;
807 if(U_FAILURE(*pErrorCode
)){
811 if( (pSrc
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
812 (destCapacity
<0) || (dest
== NULL
&& destCapacity
> 0) ||
813 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
815 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
819 if(pNumSubstitutions
!=NULL
) {
820 *pNumSubstitutions
=0;
825 while((ch
=*pSrc
)!=0) {
828 if(pDest
<pDestLimit
) {
829 *pDest
++ = (uint8_t)ch
;
834 } else if(ch
<= 0x7ff) {
835 if((pDestLimit
- pDest
) >= 2) {
836 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
837 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
842 } else if(ch
<= 0xd7ff || ch
>= 0xe000) {
843 if((pDestLimit
- pDest
) >= 3) {
844 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
845 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
846 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
851 } else /* ch is a surrogate */ {
854 /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
855 if(U16_IS_SURROGATE_LEAD(ch
) && U16_IS_TRAIL(ch2
=*pSrc
)) {
857 ch
=U16_GET_SUPPLEMENTARY(ch
, ch2
);
858 } else if(subchar
>=0) {
862 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
863 *pErrorCode
= U_INVALID_CHAR_FOUND
;
867 length
= U8_LENGTH(ch
);
868 if((pDestLimit
- pDest
) >= length
) {
869 /* convert and append*/
870 pDest
=_appendUTF8(pDest
, ch
);
877 while((ch
=*pSrc
++)!=0) {
880 } else if(ch
<=0x7ff) {
882 } else if(!U16_IS_SURROGATE(ch
)) {
884 } else if(U16_IS_SURROGATE_LEAD(ch
) && U16_IS_TRAIL(ch2
=*pSrc
)) {
887 } else if(subchar
>=0) {
888 reqLength
+=U8_LENGTH(subchar
);
891 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
892 *pErrorCode
= U_INVALID_CHAR_FOUND
;
897 const UChar
*pSrcLimit
= (pSrc
!=NULL
)?(pSrc
+srcLength
):NULL
;
900 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
903 * Each iteration of the inner loop progresses by at most 3 UTF-8
904 * bytes and one UChar, for most characters.
905 * For supplementary code points (4 & 2), which are rare,
906 * there is an additional adjustment.
908 count
= (int32_t)((pDestLimit
- pDest
) / 3);
909 srcLength
= (int32_t)(pSrcLimit
- pSrc
);
910 if(count
> srcLength
) {
911 count
= srcLength
; /* min(remaining dest/3, remaining src) */
915 * Too much overhead if we get near the end of the string,
916 * continue with the next loop.
923 *pDest
++ = (uint8_t)ch
;
924 } else if(ch
<= 0x7ff) {
925 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
926 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
927 } else if(ch
<= 0xd7ff || ch
>= 0xe000) {
928 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
929 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
930 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
931 } else /* ch is a surrogate */ {
933 * We will read two UChars and probably output four bytes,
934 * which we didn't account for with computing count,
935 * so we adjust it here.
938 --pSrc
; /* undo ch=*pSrc++ for the lead surrogate */
939 break; /* recompute count */
942 if(U16_IS_SURROGATE_LEAD(ch
) && U16_IS_TRAIL(ch2
=*pSrc
)) {
944 ch
=U16_GET_SUPPLEMENTARY(ch
, ch2
);
946 /* writing 4 bytes per 2 UChars is ok */
947 *pDest
++=(uint8_t)((ch
>>18)|0xf0);
948 *pDest
++=(uint8_t)(((ch
>>12)&0x3f)|0x80);
949 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
950 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
952 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
957 *pErrorCode
= U_INVALID_CHAR_FOUND
;
961 /* convert and append*/
962 pDest
=_appendUTF8(pDest
, ch
);
965 } while(--count
> 0);
968 while(pSrc
<pSrcLimit
) {
971 if(pDest
<pDestLimit
) {
972 *pDest
++ = (uint8_t)ch
;
977 } else if(ch
<= 0x7ff) {
978 if((pDestLimit
- pDest
) >= 2) {
979 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
980 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
985 } else if(ch
<= 0xd7ff || ch
>= 0xe000) {
986 if((pDestLimit
- pDest
) >= 3) {
987 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
988 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
989 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
994 } else /* ch is a surrogate */ {
997 if(U16_IS_SURROGATE_LEAD(ch
) && pSrc
<pSrcLimit
&& U16_IS_TRAIL(ch2
=*pSrc
)) {
999 ch
=U16_GET_SUPPLEMENTARY(ch
, ch2
);
1000 } else if(subchar
>=0) {
1004 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1005 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1009 length
= U8_LENGTH(ch
);
1010 if((pDestLimit
- pDest
) >= length
) {
1011 /* convert and append*/
1012 pDest
=_appendUTF8(pDest
, ch
);
1019 while(pSrc
<pSrcLimit
) {
1023 } else if(ch
<=0x7ff) {
1025 } else if(!U16_IS_SURROGATE(ch
)) {
1027 } else if(U16_IS_SURROGATE_LEAD(ch
) && pSrc
<pSrcLimit
&& U16_IS_TRAIL(ch2
=*pSrc
)) {
1030 } else if(subchar
>=0) {
1031 reqLength
+=U8_LENGTH(subchar
);
1034 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1035 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1041 reqLength
+=(int32_t)(pDest
- (uint8_t *)dest
);
1043 if(pNumSubstitutions
!=NULL
) {
1044 *pNumSubstitutions
=numSubstitutions
;
1048 *pDestLength
= reqLength
;
1051 /* Terminate the buffer */
1052 u_terminateChars(dest
, destCapacity
, reqLength
, pErrorCode
);
1056 U_CAPI
char* U_EXPORT2
1057 u_strToUTF8(char *dest
,
1058 int32_t destCapacity
,
1059 int32_t *pDestLength
,
1062 UErrorCode
*pErrorCode
){
1063 return u_strToUTF8WithSub(
1064 dest
, destCapacity
, pDestLength
,
1070 U_CAPI UChar
* U_EXPORT2
1071 u_strFromJavaModifiedUTF8WithSub(
1073 int32_t destCapacity
,
1074 int32_t *pDestLength
,
1077 UChar32 subchar
, int32_t *pNumSubstitutions
,
1078 UErrorCode
*pErrorCode
) {
1080 if(U_FAILURE(*pErrorCode
)) {
1083 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
1084 (dest
==NULL
&& destCapacity
!=0) || destCapacity
<0 ||
1085 subchar
> 0x10ffff || U_IS_SURROGATE(subchar
)
1087 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
1091 if(pNumSubstitutions
!=NULL
) {
1092 *pNumSubstitutions
=0;
1094 UChar
*pDest
= dest
;
1095 UChar
*pDestLimit
= dest
+destCapacity
;
1096 int32_t reqLength
= 0;
1097 int32_t numSubstitutions
=0;
1101 * Transform a NUL-terminated ASCII string.
1102 * Handle non-ASCII strings with slower code.
1105 while(((c
= (uint8_t)*src
) != 0) && c
<= 0x7f && (pDest
< pDestLimit
)) {
1110 reqLength
=(int32_t)(pDest
- dest
);
1112 *pDestLength
= reqLength
;
1115 /* Terminate the buffer */
1116 u_terminateUChars(dest
, destCapacity
, reqLength
, pErrorCode
);
1119 srcLength
= static_cast<int32_t>(uprv_strlen(src
));
1122 /* Faster loop without ongoing checking for srcLength and pDestLimit. */
1127 int32_t count
= (int32_t)(pDestLimit
- pDest
);
1128 int32_t count2
= srcLength
- i
;
1129 if(count
>= count2
&& srcLength
> 0 && U8_IS_SINGLE(*src
)) {
1130 /* fast ASCII loop */
1133 while(i
< srcLength
&& U8_IS_SINGLE(b
= src
[i
])) {
1137 int32_t delta
= i
- start
;
1142 * Each iteration of the inner loop progresses by at most 3 UTF-8
1143 * bytes and one UChar.
1145 if(subchar
> 0xFFFF) {
1149 if(count
> count2
) {
1150 count
= count2
; /* min(remaining dest, remaining src/3) */
1154 * Too much overhead if we get near the end of the string,
1155 * continue with the next loop.
1160 ch
= (uint8_t)src
[i
++];
1161 if(U8_IS_SINGLE(ch
)) {
1165 if( /* handle U+0000..U+FFFF inline */
1167 (t1
= (uint8_t)(src
[i
] - 0x80)) <= 0x3f &&
1168 (t2
= (uint8_t)(src
[i
+1] - 0x80)) <= 0x3f
1170 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1171 *pDest
++ = (UChar
)((ch
<< 12) | (t1
<< 6) | t2
);
1176 if( /* handle U+0000..U+07FF inline */
1178 (t1
= (uint8_t)(src
[i
] - 0x80)) <= 0x3f
1180 *pDest
++ = (UChar
)(((ch
& 0x1f) << 6) | t1
);
1187 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1189 } else if(subchar
> 0xffff && --count
== 0) {
1191 * We need to write two UChars, adjusted count for that,
1192 * and ran out of space.
1194 --i
; // back out byte ch
1197 /* function call for error cases */
1198 utf8_nextCharSafeBody((const uint8_t *)src
, &(i
), srcLength
, ch
, -1);
1200 *(pDest
++)=(UChar
)subchar
;
1203 } while(--count
> 0);
1206 while(i
< srcLength
&& (pDest
< pDestLimit
)) {
1207 ch
= (uint8_t)src
[i
++];
1208 if(U8_IS_SINGLE(ch
)){
1212 if( /* handle U+0000..U+FFFF inline */
1214 (i
+1) < srcLength
&&
1215 (t1
= (uint8_t)(src
[i
] - 0x80)) <= 0x3f &&
1216 (t2
= (uint8_t)(src
[i
+1] - 0x80)) <= 0x3f
1218 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1219 *pDest
++ = (UChar
)((ch
<< 12) | (t1
<< 6) | t2
);
1224 if( /* handle U+0000..U+07FF inline */
1227 (t1
= (uint8_t)(src
[i
] - 0x80)) <= 0x3f
1229 *pDest
++ = (UChar
)(((ch
& 0x1f) << 6) | t1
);
1236 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1239 /* function call for error cases */
1240 utf8_nextCharSafeBody((const uint8_t *)src
, &(i
), srcLength
, ch
, -1);
1242 if(subchar
<=0xFFFF) {
1243 *(pDest
++)=(UChar
)subchar
;
1245 *(pDest
++)=U16_LEAD(subchar
);
1246 if(pDest
<pDestLimit
) {
1247 *(pDest
++)=U16_TRAIL(subchar
);
1257 /* Pre-flight the rest of the string. */
1258 while(i
< srcLength
) {
1259 ch
= (uint8_t)src
[i
++];
1260 if(U8_IS_SINGLE(ch
)) {
1264 if( /* handle U+0000..U+FFFF inline */
1266 (i
+1) < srcLength
&&
1267 (uint8_t)(src
[i
] - 0x80) <= 0x3f &&
1268 (uint8_t)(src
[i
+1] - 0x80) <= 0x3f
1275 if( /* handle U+0000..U+07FF inline */
1278 (uint8_t)(src
[i
] - 0x80) <= 0x3f
1287 *pErrorCode
= U_INVALID_CHAR_FOUND
;
1290 /* function call for error cases */
1291 utf8_nextCharSafeBody((const uint8_t *)src
, &(i
), srcLength
, ch
, -1);
1293 reqLength
+=U16_LENGTH(ch
);
1298 if(pNumSubstitutions
!=NULL
) {
1299 *pNumSubstitutions
=numSubstitutions
;
1302 reqLength
+=(int32_t)(pDest
- dest
);
1304 *pDestLength
= reqLength
;
1307 /* Terminate the buffer */
1308 u_terminateUChars(dest
, destCapacity
, reqLength
, pErrorCode
);
1312 U_CAPI
char* U_EXPORT2
1313 u_strToJavaModifiedUTF8(
1315 int32_t destCapacity
,
1316 int32_t *pDestLength
,
1319 UErrorCode
*pErrorCode
) {
1320 int32_t reqLength
=0;
1322 uint8_t *pDest
= (uint8_t *)dest
;
1323 uint8_t *pDestLimit
= pDest
+ destCapacity
;
1324 const UChar
*pSrcLimit
;
1328 if(U_FAILURE(*pErrorCode
)){
1331 if( (src
==NULL
&& srcLength
!=0) || srcLength
< -1 ||
1332 (dest
==NULL
&& destCapacity
!=0) || destCapacity
<0
1334 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
1339 /* Convert NUL-terminated ASCII, then find the string length. */
1340 while((ch
=*src
)<=0x7f && ch
!= 0 && pDest
<pDestLimit
) {
1341 *pDest
++ = (uint8_t)ch
;
1345 reqLength
=(int32_t)(pDest
- (uint8_t *)dest
);
1347 *pDestLength
= reqLength
;
1350 /* Terminate the buffer */
1351 u_terminateChars(dest
, destCapacity
, reqLength
, pErrorCode
);
1354 srcLength
= u_strlen(src
);
1357 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1358 pSrcLimit
= (src
!=NULL
)?(src
+srcLength
):NULL
;
1360 count
= (int32_t)(pDestLimit
- pDest
);
1361 srcLength
= (int32_t)(pSrcLimit
- src
);
1362 if(count
>= srcLength
&& srcLength
> 0 && *src
<= 0x7f) {
1363 /* fast ASCII loop */
1364 const UChar
*prevSrc
= src
;
1366 while(src
< pSrcLimit
&& (ch
= *src
) <= 0x7f && ch
!= 0) {
1367 *pDest
++=(uint8_t)ch
;
1370 delta
= (int32_t)(src
- prevSrc
);
1375 * Each iteration of the inner loop progresses by at most 3 UTF-8
1376 * bytes and one UChar.
1379 if(count
> srcLength
) {
1380 count
= srcLength
; /* min(remaining dest/3, remaining src) */
1384 * Too much overhead if we get near the end of the string,
1385 * continue with the next loop.
1391 if(ch
<= 0x7f && ch
!= 0) {
1392 *pDest
++ = (uint8_t)ch
;
1393 } else if(ch
<= 0x7ff) {
1394 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
1395 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1397 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
1398 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1399 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1401 } while(--count
> 0);
1404 while(src
<pSrcLimit
) {
1406 if(ch
<= 0x7f && ch
!= 0) {
1407 if(pDest
<pDestLimit
) {
1408 *pDest
++ = (uint8_t)ch
;
1413 } else if(ch
<= 0x7ff) {
1414 if((pDestLimit
- pDest
) >= 2) {
1415 *pDest
++=(uint8_t)((ch
>>6)|0xc0);
1416 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1422 if((pDestLimit
- pDest
) >= 3) {
1423 *pDest
++=(uint8_t)((ch
>>12)|0xe0);
1424 *pDest
++=(uint8_t)(((ch
>>6)&0x3f)|0x80);
1425 *pDest
++=(uint8_t)((ch
&0x3f)|0x80);
1432 while(src
<pSrcLimit
) {
1434 if(ch
<= 0x7f && ch
!= 0) {
1436 } else if(ch
<=0x7ff) {
1443 reqLength
+=(int32_t)(pDest
- (uint8_t *)dest
);
1445 *pDestLength
= reqLength
;
1448 /* Terminate the buffer */
1449 u_terminateChars(dest
, destCapacity
, reqLength
, pErrorCode
);