1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 2002-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
10 * tab size: 8 (not used)
13 * created on: 2002jul01
14 * created by: Markus W. Scherer
16 * UTF-8 converter implementation. Used to be in ucnv_utf.c.
18 * Also, CESU-8 implementation, see UTR 26.
19 * The CESU-8 converter uses all the same functions as the
20 * UTF-8 converter, with a branch for converting supplementary code points.
23 #include "unicode/utypes.h"
25 #if !UCONFIG_NO_CONVERSION
27 #include "unicode/ucnv.h"
28 #include "unicode/utf.h"
29 #include "unicode/utf8.h"
30 #include "unicode/utf16.h"
37 /* Prototypes --------------------------------------------------------------- */
39 /* Keep these here to make finicky compilers happy */
/* Forward declarations for the two from-Unicode entry points that are defined
 * further down in this file (the plain variant and the offsets-tracking variant). */
41 U_CFUNC
void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs
*args
,
43 U_CFUNC
void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs
*args
,
47 /* UTF-8 -------------------------------------------------------------------- */
/* Largest code point that fits in one 16-bit UChar. The to-Unicode functions
 * write larger values as a surrogate pair; the from-Unicode functions pick the
 * 0xE0 lead byte at or below this bound and the 0xF0 lead byte above it. */
49 #define MAXIMUM_UCS2 0x0000FFFF
/* Constants indexed by the total byte count of a sequence. The converters
 * subtract offsetsFromUTF8[length] from the raw accumulated value
 * ("Remove the accumulated high bits" in the to-Unicode functions). */
51 static const uint32_t offsetsFromUTF8
[5] = {0,
52 (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
/* Reports whether cnv is the CESU-8 flavour of this converter, decided by
 * comparing cnv->sharedData against the address of _CESU8Data.
 * A UCONFIG_ONLY_HTML_CONVERSION preprocessor branch is also present; its body
 * is not visible here (presumably it short-circuits the check - TODO confirm). */
56 static UBool
hasCESU8Data(const UConverter
*cnv
)
58 #if UCONFIG_ONLY_HTML_CONVERSION
61 return (UBool
)(cnv
->sharedData
== &_CESU8Data
);
/* Converts UTF-8 (or CESU-8) bytes from args->source into UChars at
 * args->target, stopping when either the source or the target is exhausted.
 *
 * State carried between calls lives in the converter: toULength (bytes already
 * consumed of an unfinished sequence), mode (bytes the sequence needs),
 * toUnicodeStatus (partially accumulated value) and toUBytes (the raw bytes).
 *
 * Outcomes reported through *err in the visible code:
 *   U_BUFFER_OVERFLOW_ERROR - target full while source bytes remain, or the
 *                             second unit of a surrogate pair did not fit;
 *   U_ILLEGAL_CHAR_FOUND    - a sequence was cut short by an invalid byte.
 * On exit args->target and args->source are advanced to the local cursors. */
65 static void U_CALLCONV
ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs
* args
,
68 UConverter
*cnv
= args
->converter
;
69 const unsigned char *mySource
= (unsigned char *) args
->source
;
70 UChar
*myTarget
= args
->target
;
71 const unsigned char *sourceLimit
= (unsigned char *) args
->sourceLimit
;
72 const UChar
*targetLimit
= args
->targetLimit
;
73 unsigned char *toUBytes
= cnv
->toUBytes
;
/* CESU-8 shares this function; the flag relaxes/tightens two checks below. */
74 UBool isCESU8
= hasCESU8Data(cnv
);
78 /* Restore size of current sequence */
/* Only resume when a sequence was left unfinished AND there is room to write. */
79 if (cnv
->toULength
> 0 && myTarget
< targetLimit
)
81 inBytes
= cnv
->mode
; /* restore # of bytes to consume */
82 i
= cnv
->toULength
; /* restore # of bytes consumed */
85 ch
= cnv
->toUnicodeStatus
;/*Stores the previously calculated ch from a previous call*/
86 cnv
->toUnicodeStatus
= 0;
91 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
94 if (U8_IS_SINGLE(ch
)) /* Simple case */
96 *(myTarget
++) = (UChar
) ch
;
100 /* store the first char */
101 toUBytes
[0] = (char)ch
;
102 inBytes
= U8_COUNT_BYTES_NON_ASCII(ch
); /* lookup current sequence length */
108 if (mySource
< sourceLimit
)
110 toUBytes
[i
] = (char) (ch2
= *mySource
);
/* Stop collecting on the first byte that is not a valid trail for this
 * position. CESU-8 additionally lets any trail byte follow a 0xED lead
 * (presumably to admit encoded surrogates - see the comment further down). */
111 if (!icu::UTF8::isValidTrail(ch
, static_cast<uint8_t>(ch2
), i
, inBytes
) &&
112 !(isCESU8
&& i
== 1 && ch
== 0xed && U8_IS_TRAIL(ch2
)))
114 break; /* i < inBytes */
/* Accumulate raw bytes six bits at a time; marker bits are removed later
 * by subtracting offsetsFromUTF8[inBytes]. */
116 ch
= (ch
<< 6) + ch2
;
122 /* stores a partially calculated target*/
123 cnv
->toUnicodeStatus
= ch
;
125 cnv
->toULength
= (int8_t) i
;
130 // In CESU-8, only surrogates, not supplementary code points, are encoded directly.
/* Complete sequence: all inBytes were consumed, and for CESU-8 the sequence
 * is no longer than 3 bytes. */
131 if (i
== inBytes
&& (!isCESU8
|| i
<= 3))
133 /* Remove the accumulated high bits */
134 ch
-= offsetsFromUTF8
[inBytes
];
136 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
137 if (ch
<= MAXIMUM_UCS2
)
139 /* fits in 16 bits */
140 *(myTarget
++) = (UChar
) ch
;
144 /* write out the surrogates */
145 *(myTarget
++) = U16_LEAD(ch
);
/* NOTE(review): the statement between the two stores is not visible in this
 * listing; presumably ch is replaced by the trail surrogate here - confirm. */
147 if (myTarget
< targetLimit
)
149 *(myTarget
++) = (UChar
)ch
;
153 /* Put in overflow buffer (not handled here) */
154 cnv
->UCharErrorBuffer
[0] = (UChar
) ch
;
155 cnv
->UCharErrorBufferLength
= 1;
156 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Incomplete/invalid sequence: record how many bytes were collected in
 * toUBytes and report the error. */
163 cnv
->toULength
= (int8_t)i
;
164 *err
= U_ILLEGAL_CHAR_FOUND
;
/* Input remains but no output room and no earlier error: report overflow. */
171 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
173 /* End of target buffer */
174 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Publish the advanced cursors back to the caller. */
177 args
->target
= myTarget
;
178 args
->source
= (const char *) mySource
;
/* Offsets-tracking twin of ucnv_toUnicode_UTF8: the conversion steps visible
 * here are the same, and in addition every UChar written to the target gets
 * one entry written to args->offsets holding offsetNum (a running source
 * index that starts at 0 for this call).
 *
 * Reports U_BUFFER_OVERFLOW_ERROR and U_ILLEGAL_CHAR_FOUND through *err under
 * the same visible conditions, and on exit writes back args->target,
 * args->source and args->offsets. */
181 static void U_CALLCONV
ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs
* args
,
184 UConverter
*cnv
= args
->converter
;
185 const unsigned char *mySource
= (unsigned char *) args
->source
;
186 UChar
*myTarget
= args
->target
;
187 int32_t *myOffsets
= args
->offsets
;
188 int32_t offsetNum
= 0;
189 const unsigned char *sourceLimit
= (unsigned char *) args
->sourceLimit
;
190 const UChar
*targetLimit
= args
->targetLimit
;
191 unsigned char *toUBytes
= cnv
->toUBytes
;
192 UBool isCESU8
= hasCESU8Data(cnv
);
193 uint32_t ch
, ch2
= 0;
196 /* Restore size of current sequence */
/* Only resume when a sequence was left unfinished AND there is room to write. */
197 if (cnv
->toULength
> 0 && myTarget
< targetLimit
)
199 inBytes
= cnv
->mode
; /* restore # of bytes to consume */
200 i
= cnv
->toULength
; /* restore # of bytes consumed */
203 ch
= cnv
->toUnicodeStatus
;/*Stores the previously calculated ch from a previous call*/
204 cnv
->toUnicodeStatus
= 0;
208 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
211 if (U8_IS_SINGLE(ch
)) /* Simple case */
213 *(myTarget
++) = (UChar
) ch
;
/* Single byte: record its offset and advance the running index by one. */
214 *(myOffsets
++) = offsetNum
++;
218 toUBytes
[0] = (char)ch
;
219 inBytes
= U8_COUNT_BYTES_NON_ASCII(ch
);
225 if (mySource
< sourceLimit
)
227 toUBytes
[i
] = (char) (ch2
= *mySource
);
/* Same acceptance rule as the non-offsets variant: a valid trail byte, or
 * (CESU-8 only) any trail byte directly after a 0xED lead. */
228 if (!icu::UTF8::isValidTrail(ch
, static_cast<uint8_t>(ch2
), i
, inBytes
) &&
229 !(isCESU8
&& i
== 1 && ch
== 0xed && U8_IS_TRAIL(ch2
)))
231 break; /* i < inBytes */
233 ch
= (ch
<< 6) + ch2
;
/* Save the partial value and byte count in the converter state. */
239 cnv
->toUnicodeStatus
= ch
;
241 cnv
->toULength
= (int8_t)i
;
246 // In CESU-8, only surrogates, not supplementary code points, are encoded directly.
247 if (i
== inBytes
&& (!isCESU8
|| i
<= 3))
249 /* Remove the accumulated high bits */
250 ch
-= offsetsFromUTF8
[inBytes
];
252 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
253 if (ch
<= MAXIMUM_UCS2
)
255 /* fits in 16 bits */
256 *(myTarget
++) = (UChar
) ch
;
257 *(myOffsets
++) = offsetNum
;
261 /* write out the surrogates */
/* Both halves of the pair are recorded with the same offsetNum. */
262 *(myTarget
++) = U16_LEAD(ch
);
263 *(myOffsets
++) = offsetNum
;
/* NOTE(review): the statement between the two stores is not visible in this
 * listing; presumably ch becomes the trail surrogate here - confirm. */
265 if (myTarget
< targetLimit
)
267 *(myTarget
++) = (UChar
)ch
;
268 *(myOffsets
++) = offsetNum
;
/* No room for the second unit: park it in the converter's UChar overflow
 * buffer and report overflow. */
272 cnv
->UCharErrorBuffer
[0] = (UChar
) ch
;
273 cnv
->UCharErrorBufferLength
= 1;
274 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Incomplete/invalid sequence: record the collected byte count and fail. */
281 cnv
->toULength
= (int8_t)i
;
282 *err
= U_ILLEGAL_CHAR_FOUND
;
289 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
290 { /* End of target buffer */
291 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Publish the advanced cursors, including the offsets cursor. */
294 args
->target
= myTarget
;
295 args
->source
= (const char *) mySource
;
296 args
->offsets
= myOffsets
;
/* Converts UChars from args->source into UTF-8 (or CESU-8) bytes at
 * args->target, stopping when either the source or the target is exhausted.
 *
 * A pending code unit saved in cnv->fromUChar32 by an earlier call is picked
 * up first when the target has room. Visible encoding branches:
 *   ch < 0x80   - one byte;
 *   ch < 0x800  - two bytes, the second going to cnv->charErrorBuffer with
 *                 U_BUFFER_OVERFLOW_ERROR if the target is full;
 *   otherwise   - surrogate handling (skipped for CESU-8) followed by a
 *                 0xE0-lead or 0xF0-lead sequence.
 * An unpaired surrogate is stored in cnv->fromUChar32 and reported as
 * U_ILLEGAL_CHAR_FOUND. On exit args->target and args->source are updated. */
300 U_CFUNC
void U_CALLCONV
ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs
* args
,
303 UConverter
*cnv
= args
->converter
;
304 const UChar
*mySource
= args
->source
;
305 const UChar
*sourceLimit
= args
->sourceLimit
;
306 uint8_t *myTarget
= (uint8_t *) args
->target
;
307 const uint8_t *targetLimit
= (uint8_t *) args
->targetLimit
;
311 int32_t indexToWrite
;
/* For CESU-8 the surrogate-combining branch below is skipped. */
312 UBool isNotCESU8
= !hasCESU8Data(cnv
);
/* Resume with a code unit left over from the previous call, if any. */
314 if (cnv
->fromUChar32
&& myTarget
< targetLimit
)
316 ch
= cnv
->fromUChar32
;
317 cnv
->fromUChar32
= 0;
321 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
325 if (ch
< 0x80) /* Single byte */
327 *(myTarget
++) = (uint8_t) ch
;
329 else if (ch
< 0x800) /* Double byte */
331 *(myTarget
++) = (uint8_t) ((ch
>> 6) | 0xc0);
332 if (myTarget
< targetLimit
)
334 *(myTarget
++) = (uint8_t) ((ch
& 0x3f) | 0x80);
/* Target filled up after the lead byte: stash the trail byte in the
 * converter's byte overflow buffer. */
338 cnv
->charErrorBuffer
[0] = (uint8_t) ((ch
& 0x3f) | 0x80);
339 cnv
->charErrorBufferLength
= 1;
340 *err
= U_BUFFER_OVERFLOW_ERROR
;
344 /* Check for surrogates */
345 if(U16_IS_SURROGATE(ch
) && isNotCESU8
) {
347 if (mySource
< sourceLimit
) {
348 /* test both code units */
349 if(U16_IS_SURROGATE_LEAD(ch
) && U16_IS_TRAIL(*mySource
)) {
350 /* convert and consume this supplementary code point */
351 ch
=U16_GET_SUPPLEMENTARY(ch
, *mySource
);
353 /* exit this condition tree */
356 /* this is an unpaired trail or lead code unit */
357 /* callback(illegal) */
358 cnv
->fromUChar32
= ch
;
359 *err
= U_ILLEGAL_CHAR_FOUND
;
/* Surrogate with no further input available: keep it in the converter
 * state (no error is set in the visible lines of this branch). */
365 cnv
->fromUChar32
= ch
;
370 /* Do we write the buffer directly for speed,
371 or do we have to be careful about target buffer space? */
/* With at least 4 bytes of room write straight into the target; otherwise
 * stage the bytes in tempBuf and copy them out one by one below. */
372 tempPtr
= (((targetLimit
- myTarget
) >= 4) ? myTarget
: tempBuf
);
374 if (ch
<= MAXIMUM_UCS2
) {
376 tempPtr
[0] = (uint8_t) ((ch
>> 12) | 0xe0);
380 tempPtr
[0] = (uint8_t) ((ch
>> 18) | 0xf0);
381 tempPtr
[1] = (uint8_t) (((ch
>> 12) & 0x3f) | 0x80);
/* indexToWrite is the index of the last byte of the sequence (its
 * assignments are not visible in this listing). */
383 tempPtr
[indexToWrite
-1] = (uint8_t) (((ch
>> 6) & 0x3f) | 0x80);
384 tempPtr
[indexToWrite
] = (uint8_t) ((ch
& 0x3f) | 0x80);
386 if (tempPtr
== myTarget
) {
387 /* There was enough space to write the codepoint directly. */
388 myTarget
+= (indexToWrite
+ 1);
391 /* We might run out of room soon. Write it slowly. */
392 for (; tempPtr
<= (tempBuf
+ indexToWrite
); tempPtr
++) {
393 if (myTarget
< targetLimit
) {
394 *(myTarget
++) = *tempPtr
;
/* Bytes that do not fit are appended to the byte overflow buffer. */
397 cnv
->charErrorBuffer
[cnv
->charErrorBufferLength
++] = *tempPtr
;
398 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Input remains but no output room and no earlier error: report overflow. */
405 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
407 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Publish the advanced cursors back to the caller. */
410 args
->target
= (char *) myTarget
;
411 args
->source
= mySource
;
/* Offsets-tracking twin of ucnv_fromUnicode_UTF8: the visible encoding
 * branches are the same, and in addition each byte written to the target gets
 * one entry in args->offsets holding offsetNum, the running source index of
 * the code unit that produced it.
 *
 * For the 3-/4-byte path nextSourceIndex is set to offsetNum + 1 before the
 * surrogate check and copied back into offsetNum after the bytes are written.
 * On exit args->target, args->source and args->offsets are updated. */
414 U_CFUNC
void U_CALLCONV
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs
* args
,
417 UConverter
*cnv
= args
->converter
;
418 const UChar
*mySource
= args
->source
;
419 int32_t *myOffsets
= args
->offsets
;
420 const UChar
*sourceLimit
= args
->sourceLimit
;
421 uint8_t *myTarget
= (uint8_t *) args
->target
;
422 const uint8_t *targetLimit
= (uint8_t *) args
->targetLimit
;
425 int32_t offsetNum
, nextSourceIndex
;
426 int32_t indexToWrite
;
428 UBool isNotCESU8
= !hasCESU8Data(cnv
);
/* Resume with a code unit left over from the previous call, if any. */
430 if (cnv
->fromUChar32
&& myTarget
< targetLimit
)
432 ch
= cnv
->fromUChar32
;
433 cnv
->fromUChar32
= 0;
441 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
445 if (ch
< 0x80) /* Single byte */
447 *(myOffsets
++) = offsetNum
++;
448 *(myTarget
++) = (char) ch
;
450 else if (ch
< 0x800) /* Double byte */
/* Both bytes carry the same source index; it advances after the second. */
452 *(myOffsets
++) = offsetNum
;
453 *(myTarget
++) = (uint8_t) ((ch
>> 6) | 0xc0);
454 if (myTarget
< targetLimit
)
456 *(myOffsets
++) = offsetNum
++;
457 *(myTarget
++) = (uint8_t) ((ch
& 0x3f) | 0x80);
/* Target filled up after the lead byte: stash the trail byte in the
 * converter's byte overflow buffer (no offset is recorded for it here). */
461 cnv
->charErrorBuffer
[0] = (uint8_t) ((ch
& 0x3f) | 0x80);
462 cnv
->charErrorBufferLength
= 1;
463 *err
= U_BUFFER_OVERFLOW_ERROR
;
467 /* Check for surrogates */
469 nextSourceIndex
= offsetNum
+ 1;
471 if(U16_IS_SURROGATE(ch
) && isNotCESU8
) {
473 if (mySource
< sourceLimit
) {
474 /* test both code units */
475 if(U16_IS_SURROGATE_LEAD(ch
) && U16_IS_TRAIL(*mySource
)) {
476 /* convert and consume this supplementary code point */
477 ch
=U16_GET_SUPPLEMENTARY(ch
, *mySource
);
480 /* exit this condition tree */
483 /* this is an unpaired trail or lead code unit */
484 /* callback(illegal) */
485 cnv
->fromUChar32
= ch
;
486 *err
= U_ILLEGAL_CHAR_FOUND
;
/* Surrogate with no further input available: keep it in the converter
 * state (no error is set in the visible lines of this branch). */
492 cnv
->fromUChar32
= ch
;
497 /* Do we write the buffer directly for speed,
498 or do we have to be careful about target buffer space? */
499 tempPtr
= (((targetLimit
- myTarget
) >= 4) ? myTarget
: tempBuf
);
501 if (ch
<= MAXIMUM_UCS2
) {
503 tempPtr
[0] = (uint8_t) ((ch
>> 12) | 0xe0);
507 tempPtr
[0] = (uint8_t) ((ch
>> 18) | 0xf0);
508 tempPtr
[1] = (uint8_t) (((ch
>> 12) & 0x3f) | 0x80);
510 tempPtr
[indexToWrite
-1] = (uint8_t) (((ch
>> 6) & 0x3f) | 0x80);
511 tempPtr
[indexToWrite
] = (uint8_t) ((ch
& 0x3f) | 0x80);
513 if (tempPtr
== myTarget
) {
514 /* There was enough space to write the codepoint directly. */
515 myTarget
+= (indexToWrite
+ 1);
/* Three offsets are always stored, a fourth only when indexToWrite >= 3;
 * the cursor then advances by the number of bytes actually written. */
516 myOffsets
[0] = offsetNum
;
517 myOffsets
[1] = offsetNum
;
518 myOffsets
[2] = offsetNum
;
519 if (indexToWrite
>= 3) {
520 myOffsets
[3] = offsetNum
;
522 myOffsets
+= (indexToWrite
+ 1);
525 /* We might run out of room soon. Write it slowly. */
526 for (; tempPtr
<= (tempBuf
+ indexToWrite
); tempPtr
++) {
527 if (myTarget
< targetLimit
)
529 *(myOffsets
++) = offsetNum
;
530 *(myTarget
++) = *tempPtr
;
/* Bytes that do not fit are appended to the byte overflow buffer. */
534 cnv
->charErrorBuffer
[cnv
->charErrorBufferLength
++] = *tempPtr
;
535 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Move the running source index past the unit(s) just converted. */
539 offsetNum
= nextSourceIndex
;
543 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
545 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Publish the advanced cursors, including the offsets cursor. */
548 args
->target
= (char *) myTarget
;
549 args
->source
= mySource
;
550 args
->offsets
= myOffsets
;
/* Reads one code point from args->source and advances args->source past it.
 *
 * Visible outcomes:
 *   - empty input: *err = U_INDEX_OUTOFBOUNDS_ERROR;
 *   - single-byte character: returned directly;
 *   - lead byte with zero trail bytes: stored in cnv->toUBytes[0],
 *     *err = U_ILLEGAL_CHAR_FOUND;
 *   - sequence longer than the remaining input: *err = U_TRUNCATED_CHAR_FOUND,
 *     downgraded to U_ILLEGAL_CHAR_FOUND if a remaining byte is not a valid
 *     trail byte;
 *   - otherwise the 1-, 2- or 3-trail-byte sequence is validated and decoded
 *     inline; on failure the consumed bytes are copied into cnv->toUBytes and
 *     *err = U_ILLEGAL_CHAR_FOUND.
 * The values returned on the error paths are not visible in this listing. */
554 static UChar32 U_CALLCONV
ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs
*args
,
557 const uint8_t *sourceInitial
;
558 const uint8_t *source
;
563 /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
565 cnv
= args
->converter
;
/* sourceInitial remembers where this character starts, for error reporting. */
566 sourceInitial
= source
= (const uint8_t *)args
->source
;
567 if (source
>= (const uint8_t *)args
->sourceLimit
)
570 *err
= U_INDEX_OUTOFBOUNDS_ERROR
;
574 myByte
= (uint8_t)*(source
++);
575 if (U8_IS_SINGLE(myByte
))
577 args
->source
= (const char *)source
;
578 return (UChar32
)myByte
;
581 uint16_t countTrailBytes
= U8_COUNT_TRAIL_BYTES(myByte
);
/* A non-single byte that announces no trail bytes cannot start a sequence. */
582 if (countTrailBytes
== 0) {
583 cnv
->toUBytes
[0] = myByte
;
585 *err
= U_ILLEGAL_CHAR_FOUND
;
586 args
->source
= (const char *)source
;
590 /*The byte sequence is longer than the buffer area passed*/
591 if (((const char *)source
+ countTrailBytes
) > args
->sourceLimit
)
593 /* check if all of the remaining bytes are trail bytes */
594 uint16_t extraBytesToWrite
= countTrailBytes
+ 1;
595 cnv
->toUBytes
[0] = myByte
;
/* Assume truncation first; the loop below may turn it into "illegal". */
597 *err
= U_TRUNCATED_CHAR_FOUND
;
598 while(source
< (const uint8_t *)args
->sourceLimit
) {
600 if(icu::UTF8::isValidTrail(myByte
, b
, i
, extraBytesToWrite
)) {
601 cnv
->toUBytes
[i
++] = b
;
604 /* error even before we run out of input */
605 *err
= U_ILLEGAL_CHAR_FOUND
;
610 args
->source
= (const char *)source
;
/* Enough input is available: decode by trail-byte count. In each branch
 * args->source is only advanced when the whole sequence validates.
 * NOTE(review): the initialisation of ch is not visible in this listing. */
615 if(countTrailBytes
== 2) {
616 uint8_t t1
= *source
, t2
;
617 if(U8_IS_VALID_LEAD3_AND_T1(myByte
, t1
) && U8_IS_TRAIL(t2
= *++source
)) {
618 args
->source
= (const char *)(source
+ 1);
619 return (((ch
+ t1
) << 6) + t2
) - offsetsFromUTF8
[3];
621 } else if(countTrailBytes
== 1) {
622 uint8_t t1
= *source
;
623 if(U8_IS_TRAIL(t1
)) {
624 args
->source
= (const char *)(source
+ 1);
625 return (ch
+ t1
) - offsetsFromUTF8
[2];
627 } else { // countTrailBytes == 3
628 uint8_t t1
= *source
, t2
, t3
;
629 if(U8_IS_VALID_LEAD4_AND_T1(myByte
, t1
) && U8_IS_TRAIL(t2
= *++source
) &&
630 U8_IS_TRAIL(t3
= *++source
)) {
631 args
->source
= (const char *)(source
+ 1);
632 return (((((ch
+ t1
) << 6) + t2
) << 6) + t3
) - offsetsFromUTF8
[4];
/* Validation failed: source now points at the offending byte. Copy the
 * bytes consumed so far (sourceInitial up to source) into toUBytes. */
635 args
->source
= (const char *)source
;
637 for(i
= 0; sourceInitial
< source
; ++i
) {
638 cnv
->toUBytes
[i
] = *sourceInitial
++;
641 *err
= U_ILLEGAL_CHAR_FOUND
;
646 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
649 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
/* UTF-8 to UTF-8 fast path: validates byte sequences from pToUArgs->source and
 * copies them to pFromUArgs->target.
 *
 * Partial-sequence state is read from and written to the to-Unicode converter
 * (pToUArgs->converter): toULength, mode, toUnicodeStatus and toUBytes.
 *
 * Outcomes reported through *pErrorCode in the visible code:
 *   U_USING_DEFAULT_WARNING - target capacity too small for a pending or
 *                             truncated sequence (the in-code comments say the
 *                             standard/pivoting converter handles it);
 *   U_ILLEGAL_CHAR_FOUND    - an invalid byte sequence was found;
 *   U_BUFFER_OVERFLOW_ERROR - target exactly full while source bytes remain.
 * On exit pToUArgs->source and pFromUArgs->target are updated.
 * NOTE(review): the loop headers, goto labels and several declarations are not
 * visible in this listing. */
650 static void U_CALLCONV
651 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs
*pFromUArgs
,
652 UConverterToUnicodeArgs
*pToUArgs
,
653 UErrorCode
*pErrorCode
) {
655 const uint8_t *source
, *sourceLimit
;
657 int32_t targetCapacity
;
660 int8_t oldToULength
, toULength
, toULimit
;
665 /* set up the local pointers */
666 utf8
=pToUArgs
->converter
;
667 source
=(uint8_t *)pToUArgs
->source
;
668 sourceLimit
=(uint8_t *)pToUArgs
->sourceLimit
;
669 target
=(uint8_t *)pFromUArgs
->target
;
670 targetCapacity
=(int32_t)(pFromUArgs
->targetLimit
-pFromUArgs
->target
);
672 /* get the converter state from the UTF-8 UConverter */
/* oldToULength = bytes already buffered in toUBytes by an earlier call;
 * toULimit = total bytes the pending sequence needs. All zero when no
 * sequence is pending. */
673 if(utf8
->toULength
> 0) {
674 toULength
=oldToULength
=utf8
->toULength
;
675 toULimit
=(int8_t)utf8
->mode
;
676 c
=(UChar32
)utf8
->toUnicodeStatus
;
678 toULength
=oldToULength
=toULimit
=0;
/* count = source bytes available plus the bytes already buffered. */
682 count
=(int32_t)(sourceLimit
-source
)+oldToULength
;
685 * Not enough input to complete the partial character.
686 * Jump to moreBytes below - it will not output to target.
688 } else if(targetCapacity
<toULimit
) {
690 * Not enough target capacity to output the partial character.
691 * Let the standard converter handle this.
693 *pErrorCode
=U_USING_DEFAULT_WARNING
;
696 // Use a single counter for source and target, counting the minimum of
697 // the source length and the target capacity.
698 // Let the standard converter handle edge cases.
699 if(count
>targetCapacity
) {
700 count
=targetCapacity
;
703 // The conversion loop checks count>0 only once per character.
704 // If the buffer ends with a truncated sequence,
705 // then we reduce the count to stop before that,
706 // and collect the remaining bytes after the conversion loop.
708 // Do not go back into the bytes that will be read for finishing a partial
709 // sequence from the previous buffer.
/* Only the part after the pending sequence (count - toULimit bytes) is
 * subject to truncation; toULimit is added back afterwards. */
710 int32_t length
=count
-toULimit
;
711 U8_TRUNCATE_IF_INCOMPLETE(source
, 0, length
);
712 count
=toULimit
+length
;
716 utf8
->toUnicodeStatus
=0;
719 /* See note in ucnv_SBCSFromUTF8() about this goto. */
722 /* conversion loop */
725 if(U8_IS_SINGLE(b
)) {
732 if( /* handle U+0800..U+FFFF inline */
734 U8_IS_VALID_LEAD3_AND_T1(b
, t1
=source
[0]) &&
735 U8_IS_TRAIL(t2
=source
[1])
745 if( /* handle U+0080..U+07FF inline */
747 U8_IS_TRAIL(t1
=*source
)
757 /* handle "complicated" and error cases, and continuing partial characters */
760 toULimit
=U8_COUNT_BYTES_NON_ASCII(b
);
/* Collect trail bytes until the sequence is complete, the input runs out,
 * or an invalid trail byte is met. */
763 while(toULength
<toULimit
) {
764 if(source
<sourceLimit
) {
766 if(icu::UTF8::isValidTrail(c
, b
, toULength
, toULimit
)) {
771 break; /* sequence too short, stop with toULength<toULimit */
774 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
/* Rewind to the first byte read in this call for this sequence, then append
 * those bytes to toUBytes after the ones buffered earlier. */
775 source
-=(toULength
-oldToULength
);
776 while(oldToULength
<toULength
) {
777 utf8
->toUBytes
[oldToULength
++]=*source
++;
779 utf8
->toUnicodeStatus
=c
;
780 utf8
->toULength
=toULength
;
782 pToUArgs
->source
=(char *)source
;
783 pFromUArgs
->target
=(char *)target
;
788 if(toULength
!=toULimit
) {
789 /* error handling: illegal UTF-8 byte sequence */
790 source
-=(toULength
-oldToULength
);
791 while(oldToULength
<toULength
) {
792 utf8
->toUBytes
[oldToULength
++]=*source
++;
794 utf8
->toULength
=toULength
;
795 pToUArgs
->source
=(char *)source
;
796 pFromUArgs
->target
=(char *)target
;
797 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
801 /* copy the legal byte sequence to the target */
/* First the bytes buffered by an earlier call (from toUBytes), then the
 * bytes read from the current source buffer. */
805 for(i
=0; i
<oldToULength
; ++i
) {
806 *target
++=utf8
->toUBytes
[i
];
808 source
-=(toULength
-oldToULength
);
809 for(; i
<toULength
; ++i
) {
/* After the main loop: deal with whatever source bytes remain. */
818 if(U_SUCCESS(*pErrorCode
) && source
<sourceLimit
) {
819 if(target
==(const uint8_t *)pFromUArgs
->targetLimit
) {
820 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
823 toULimit
=U8_COUNT_BYTES(b
);
824 if(toULimit
>(sourceLimit
-source
)) {
825 /* collect a truncated byte sequence */
829 utf8
->toUBytes
[toULength
++]=b
;
830 if(++source
==sourceLimit
) {
831 /* partial byte sequence at end of source */
832 utf8
->toUnicodeStatus
=c
;
833 utf8
->toULength
=toULength
;
836 } else if(!icu::UTF8::isValidTrail(c
, b
=*source
, toULength
, toULimit
)) {
837 utf8
->toULength
=toULength
;
838 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
844 /* partial-sequence target overflow: fall back to the pivoting implementation */
845 *pErrorCode
=U_USING_DEFAULT_WARNING
;
850 /* write back the updated pointers */
851 pToUArgs
->source
=(char *)source
;
852 pFromUArgs
->target
=(char *)target
;
857 /* UTF-8 converter data ----------------------------------------------------- */
/* Function table for the UTF-8 converter. The entries visible here are the
 * offsets-tracking to-Unicode function, both from-Unicode functions, the
 * single-code-point reader and ucnv_getNonSurrogateUnicodeSet.
 * NOTE(review): other slots of the initializer are not visible in this listing. */
859 static const UConverterImpl _UTF8Impl
={
870 ucnv_toUnicode_UTF8_OFFSETS_LOGIC
,
871 ucnv_fromUnicode_UTF8
,
872 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC
,
873 ucnv_getNextUChar_UTF8
,
879 ucnv_getNonSurrogateUnicodeSet
,
885 /* The 1208 CCSID refers to any version of Unicode of UTF-8 */
/* Static description of the UTF-8 converter: structure size, CCSID 1208 with
 * the UCNV_IBM platform and UCNV_UTF8 type, and a 1..3 byte range per UChar.
 * The byte triple EF BF BD (length 3) is the UTF-8 encoding of U+FFFD;
 * presumably it is the substitution sequence - field names are not visible. */
886 static const UConverterStaticData _UTF8StaticData
={
887 sizeof(UConverterStaticData
),
889 1208, UCNV_IBM
, UCNV_UTF8
,
890 1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
891 { 0xef, 0xbf, 0xbd, 0 },3,FALSE
,FALSE
,
894 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/* Shared-data object for UTF-8, built from the static data and function table
 * above via UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER. */
898 const UConverterSharedData _UTF8Data
=
899 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData
, &_UTF8Impl
);
901 /* CESU-8 converter data ---------------------------------------------------- */
/* Function table for the CESU-8 converter. It reuses the UTF-8 conversion
 * functions (which branch on hasCESU8Data()) and uses
 * ucnv_getCompleteUnicodeSet. No ucnv_getNextUChar_UTF8 entry is visible here,
 * matching that function's comment that the framework handles CESU-8.
 * NOTE(review): other slots of the initializer are not visible in this listing. */
903 static const UConverterImpl _CESU8Impl
={
914 ucnv_toUnicode_UTF8_OFFSETS_LOGIC
,
915 ucnv_fromUnicode_UTF8
,
916 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC
,
923 ucnv_getCompleteUnicodeSet
,
/* Static description of the CESU-8 converter: structure size, CCSID 9400 with
 * the UCNV_UNKNOWN platform and UCNV_CESU8 type, and a 1..3 byte range.
 * The byte triple EF BF BD (length 3) is the UTF-8 encoding of U+FFFD;
 * presumably it is the substitution sequence - field names are not visible. */
929 static const UConverterStaticData _CESU8StaticData
={
930 sizeof(UConverterStaticData
),
932 9400, /* CCSID for CESU-8 */
933 UCNV_UNKNOWN
, UCNV_CESU8
, 1, 3,
934 { 0xef, 0xbf, 0xbd, 0 },3,FALSE
,FALSE
,
937 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/* Shared-data object for CESU-8, built from the CESU-8 static data and
 * function table via UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER. hasCESU8Data()
 * compares a converter's sharedData pointer against this object's address. */
941 const UConverterSharedData _CESU8Data
=
942 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData
, &_CESU8Impl
);