1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 2002-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
10 * tab size: 8 (not used)
13 * created on: 2002jul01
14 * created by: Markus W. Scherer
16 * UTF-8 converter implementation. Used to be in ucnv_utf.c.
18 * Also, CESU-8 implementation, see UTR 26.
19 * The CESU-8 converter uses all the same functions as the
20 * UTF-8 converter, with a branch for converting supplementary code points.
23 #include "unicode/utypes.h"
25 #if !UCONFIG_NO_CONVERSION
27 #include "unicode/ucnv.h"
28 #include "unicode/utf.h"
29 #include "unicode/utf8.h"
30 #include "unicode/utf16.h"
35 /* Prototypes --------------------------------------------------------------- */
37 /* Keep these here to make finicky compilers happy */
/* Forward declarations for the two from-Unicode conversion functions that are
 * defined further down in this file.
 * NOTE(review): each declaration stops after its first parameter in this view;
 * confirm the rest of the parameter list against the complete file. */
39 U_CFUNC
void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs
*args
,
41 U_CFUNC
void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs
*args
,
45 /* UTF-8 -------------------------------------------------------------------- */
47 /* UTF-8 Conversion DATA
48 * for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
50 /*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
/* Largest value that still fits in a single 16-bit code unit; the decoders
 * emit a surrogate pair for anything above it. */
51 #define MAXIMUM_UCS2 0x0000FFFF
/* Largest code point accepted by the validity checks (U+10FFFF). */
52 #define MAXIMUM_UTF 0x0010FFFF
53 #define MAXIMUM_UCS4 0x7FFFFFFF
55 #define HALF_BASE 0x0010000
/* Selects the low 10 bits of a code point; they are added to
 * SURROGATE_LOW_START to form the trail surrogate. */
56 #define HALF_MASK 0x3FF
/* First/last values of the lead (high) and trail (low) surrogate ranges. */
57 #define SURROGATE_HIGH_START 0xD800
58 #define SURROGATE_HIGH_END 0xDBFF
59 #define SURROGATE_LOW_START 0xDC00
60 #define SURROGATE_LOW_END 0xDFFF
62 /* -SURROGATE_LOW_START + HALF_BASE */
/* That is 0x10000 - 0xDC00 = 0x2400 = 9216. */
63 #define SURROGATE_LOW_BASE 9216
/* Correction values indexed by the byte length of a sequence.
 * The decoders fold bytes with ch = (ch << 6) + nextByte, so the marker bits
 * of the bytes accumulate in ch; subtracting the entry for the sequence
 * length (ch -= offsetsFromUTF8[inBytes]) removes them again. */
65 static const uint32_t offsetsFromUTF8
[7] = {0,
66 (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
67 (uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080
70 /* END OF UTF-8 Conversion DATA */
/* Sequence length selected by a first byte, indexed by the byte value
 * (32 entries per row):
 *   0x00..0x7F -> 1, 0x80..0xBF -> 0, 0xC0..0xDF -> 2, 0xE0..0xEF -> 3,
 *   0xF0..0xF7 -> 4, 0xF8..0xFB -> 5, 0xFC..0xFD -> 6, 0xFE..0xFF -> 0.
 * ucnv_getNextUChar_UTF8 reports U_ILLEGAL_CHAR_FOUND when the looked-up
 * length is 0. */
72 static const int8_t bytesFromUTF8
[256] = {
73 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
74 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
75 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
76 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
77 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
78 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
79 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
80 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
84 * Starting with Unicode 3.0.1:
85 * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
86 * byte sequences with more than 4 bytes are illegal in UTF-8,
87 * which is tested with impossible values for them
/* Smallest code point that an N-byte sequence may encode, indexed by N.
 * Entries 5 and 6 are 0xffffffff, so the test ch >= utf8_minChar32[N] can
 * never succeed together with ch <= MAXIMUM_UTF for those lengths. */
90 utf8_minChar32
[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };
/* Tells whether cnv is the CESU-8 variant of the converter by comparing its
 * sharedData pointer with the address of _CESU8Data.
 * NOTE(review): there is a UCONFIG_ONLY_HTML_CONVERSION branch whose body is
 * not visible here; confirm what it returns. */
92 static UBool
hasCESU8Data(const UConverter
*cnv
)
94 #if UCONFIG_ONLY_HTML_CONVERSION
97 return (UBool
)(cnv
->sharedData
== &_CESU8Data
);
/* Converts UTF-8 (or CESU-8) bytes from args->source into UTF-16 code units
 * written to args->target.
 *  - Picks up a partial sequence left in cnv->toUnicodeStatus / toULength /
 *    mode by an earlier call.
 *  - Validates every multi-byte sequence; a bad one sets *err to
 *    U_ILLEGAL_CHAR_FOUND with the byte count in cnv->toULength.
 *  - Sets *err to U_BUFFER_OVERFLOW_ERROR when the target fills up.
 *  - Stores the advanced pointers back into args->target and args->source.
 * NOTE(review): the second parameter (used below as err) is not visible in
 * this view. */
101 static void U_CALLCONV
ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs
* args
,
104 UConverter
*cnv
= args
->converter
;
105 const unsigned char *mySource
= (unsigned char *) args
->source
;
106 UChar
*myTarget
= args
->target
;
107 const unsigned char *sourceLimit
= (unsigned char *) args
->sourceLimit
;
108 const UChar
*targetLimit
= args
->targetLimit
;
/* Raw bytes of the sequence being decoded are collected in cnv->toUBytes. */
109 unsigned char *toUBytes
= cnv
->toUBytes
;
/* The CESU-8 converter shares this function; the flag only affects the
 * validity test after a sequence has been read. */
110 UBool isCESU8
= hasCESU8Data(cnv
);
111 uint32_t ch
, ch2
= 0;
114 /* Restore size of current sequence */
115 if (cnv
->toUnicodeStatus
&& myTarget
< targetLimit
)
117 inBytes
= cnv
->mode
; /* restore # of bytes to consume */
118 i
= cnv
->toULength
; /* restore # of bytes consumed */
121 ch
= cnv
->toUnicodeStatus
;/*Stores the previously calculated ch from a previous call*/
122 cnv
->toUnicodeStatus
= 0;
/* Main loop: runs while there is both input left and room in the target. */
127 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
130 if (ch
< 0x80) /* Simple case */
132 *(myTarget
++) = (UChar
) ch
;
136 /* store the first char */
137 toUBytes
[0] = (char)ch
;
138 inBytes
= bytesFromUTF8
[ch
]; /* lookup current sequence length */
/* Read the following bytes: each one is recorded in toUBytes, must be a
 * trail byte, and is folded into ch six bits at a time. */
144 if (mySource
< sourceLimit
)
146 toUBytes
[i
] = (char) (ch2
= *mySource
);
147 if (!U8_IS_TRAIL(ch2
))
149 break; /* i < inBytes */
151 ch
= (ch
<< 6) + ch2
;
/* Presumably the out-of-input branch: the accumulator and the number of bytes
 * consumed so far are parked in the converter for the next call. */
157 /* stores a partially calculated target*/
158 cnv
->toUnicodeStatus
= ch
;
160 cnv
->toULength
= (int8_t) i
;
165 /* Remove the accumulated high bits */
166 ch
-= offsetsFromUTF8
[inBytes
];
169 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
170 * - use only trail bytes after a lead byte (checked above)
171 * - use the right number of trail bytes for a given lead byte
172 * - encode a code point <= U+10ffff
173 * - use the fewest possible number of bytes for their code points
174 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
176 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
177 * There are no irregular sequences any more.
178 * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
/* Validity test: complete sequence, in range, shortest form; then for CESU-8
 * at most 3 bytes, for UTF-8 not a surrogate code point. */
180 if (i
== inBytes
&& ch
<= MAXIMUM_UTF
&& ch
>= utf8_minChar32
[i
] &&
181 (isCESU8
? i
<= 3 : !U_IS_SURROGATE(ch
)))
183 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
184 if (ch
<= MAXIMUM_UCS2
)
186 /* fits in 16 bits */
187 *(myTarget
++) = (UChar
) ch
;
191 /* write out the surrogates */
193 *(myTarget
++) = (UChar
) ((ch
>> HALF_SHIFT
) + SURROGATE_HIGH_START
);
194 ch
= (ch
& HALF_MASK
) + SURROGATE_LOW_START
;
195 if (myTarget
< targetLimit
)
197 *(myTarget
++) = (UChar
)ch
;
/* No room for the trail surrogate: it goes into the converter's one-unit
 * overflow buffer and the call reports a buffer overflow. */
201 /* Put in overflow buffer (not handled here) */
202 cnv
->UCharErrorBuffer
[0] = (UChar
) ch
;
203 cnv
->UCharErrorBufferLength
= 1;
204 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Invalid sequence: record how many bytes were collected and flag the error. */
211 cnv
->toULength
= (int8_t)i
;
212 *err
= U_ILLEGAL_CHAR_FOUND
;
/* Input remains but the target is full and no other error was set. */
219 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
221 /* End of target buffer */
222 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Hand the advanced positions back to the caller. */
225 args
->target
= myTarget
;
226 args
->source
= (const char *) mySource
;
/* Same conversion as ucnv_toUnicode_UTF8 (UTF-8 or CESU-8 bytes to UTF-16),
 * but additionally writes one entry into args->offsets for every code unit
 * written to the target.
 *  - Picks up a partial sequence left in the converter by an earlier call.
 *  - Sets *err to U_ILLEGAL_CHAR_FOUND for an invalid sequence and to
 *    U_BUFFER_OVERFLOW_ERROR when the target fills up.
 *  - Stores the advanced target, source and offsets pointers back into args.
 * NOTE(review): the second parameter (used below as err) is not visible in
 * this view. */
229 static void U_CALLCONV
ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs
* args
,
232 UConverter
*cnv
= args
->converter
;
233 const unsigned char *mySource
= (unsigned char *) args
->source
;
234 UChar
*myTarget
= args
->target
;
235 int32_t *myOffsets
= args
->offsets
;
/* Offset value recorded for the code units produced from the current
 * sequence; it starts at 0 for this call. */
236 int32_t offsetNum
= 0;
237 const unsigned char *sourceLimit
= (unsigned char *) args
->sourceLimit
;
238 const UChar
*targetLimit
= args
->targetLimit
;
239 unsigned char *toUBytes
= cnv
->toUBytes
;
240 UBool isCESU8
= hasCESU8Data(cnv
);
241 uint32_t ch
, ch2
= 0;
244 /* Restore size of current sequence */
245 if (cnv
->toUnicodeStatus
&& myTarget
< targetLimit
)
247 inBytes
= cnv
->mode
; /* restore # of bytes to consume */
248 i
= cnv
->toULength
; /* restore # of bytes consumed */
251 ch
= cnv
->toUnicodeStatus
;/*Stores the previously calculated ch from a previous call*/
252 cnv
->toUnicodeStatus
= 0;
/* Main loop: runs while there is both input left and room in the target. */
256 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
259 if (ch
< 0x80) /* Simple case */
/* One byte in, one unit out: record the offset and advance it by one. */
261 *(myTarget
++) = (UChar
) ch
;
262 *(myOffsets
++) = offsetNum
++;
/* Keep the first byte and look up the sequence length it selects. */
266 toUBytes
[0] = (char)ch
;
267 inBytes
= bytesFromUTF8
[ch
];
/* Each following byte is recorded, must be a trail byte, and is folded
 * into ch six bits at a time. */
273 if (mySource
< sourceLimit
)
275 toUBytes
[i
] = (char) (ch2
= *mySource
);
276 if (!U8_IS_TRAIL(ch2
))
278 break; /* i < inBytes */
280 ch
= (ch
<< 6) + ch2
;
/* Presumably the out-of-input branch: park the accumulator and the number of
 * bytes consumed so far in the converter for the next call. */
286 cnv
->toUnicodeStatus
= ch
;
288 cnv
->toULength
= (int8_t)i
;
293 /* Remove the accumulated high bits */
294 ch
-= offsetsFromUTF8
[inBytes
];
297 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
298 * - use only trail bytes after a lead byte (checked above)
299 * - use the right number of trail bytes for a given lead byte
300 * - encode a code point <= U+10ffff
301 * - use the fewest possible number of bytes for their code points
302 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
304 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
305 * There are no irregular sequences any more.
306 * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
/* Validity test: complete sequence, in range, shortest form; then for CESU-8
 * at most 3 bytes, for UTF-8 not a surrogate code point. */
308 if (i
== inBytes
&& ch
<= MAXIMUM_UTF
&& ch
>= utf8_minChar32
[i
] &&
309 (isCESU8
? i
<= 3 : !U_IS_SURROGATE(ch
)))
311 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
312 if (ch
<= MAXIMUM_UCS2
)
314 /* fits in 16 bits */
315 *(myTarget
++) = (UChar
) ch
;
316 *(myOffsets
++) = offsetNum
;
/* Supplementary code point: both surrogates get the same offset value. */
320 /* write out the surrogates */
322 *(myTarget
++) = (UChar
) ((ch
>> HALF_SHIFT
) + SURROGATE_HIGH_START
);
323 *(myOffsets
++) = offsetNum
;
324 ch
= (ch
& HALF_MASK
) + SURROGATE_LOW_START
;
325 if (myTarget
< targetLimit
)
327 *(myTarget
++) = (UChar
)ch
;
328 *(myOffsets
++) = offsetNum
;
/* No room for the trail surrogate: it goes into the converter's overflow
 * buffer (no offset is written for it here). */
332 cnv
->UCharErrorBuffer
[0] = (UChar
) ch
;
333 cnv
->UCharErrorBufferLength
= 1;
334 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Invalid sequence: record how many bytes were collected and flag the error. */
341 cnv
->toULength
= (int8_t)i
;
342 *err
= U_ILLEGAL_CHAR_FOUND
;
/* Input remains but the target is full and no other error was set. */
349 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
350 { /* End of target buffer */
351 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Hand the advanced positions back to the caller. */
354 args
->target
= myTarget
;
355 args
->source
= (const char *) mySource
;
356 args
->offsets
= myOffsets
;
/* Converts UTF-16 code units from args->source into UTF-8 (or CESU-8) bytes
 * written to args->target.
 *  - Resumes with a code unit left in cnv->fromUChar32 by an earlier call.
 *  - Values below 0x80 take one byte, below 0x800 two bytes; everything else
 *    takes three or four bytes.
 *  - For the non-CESU-8 converter, surrogates are paired into a supplementary
 *    code point; an unpaired one sets *err to U_ILLEGAL_CHAR_FOUND with the
 *    unit stored in cnv->fromUChar32.
 *  - Bytes that do not fit in the target go to cnv->charErrorBuffer and *err
 *    becomes U_BUFFER_OVERFLOW_ERROR.
 *  - Stores the advanced pointers back into args->target and args->source.
 * NOTE(review): the second parameter (used below as err) is not visible in
 * this view. */
360 U_CFUNC
void U_CALLCONV
ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs
* args
,
363 UConverter
*cnv
= args
->converter
;
364 const UChar
*mySource
= args
->source
;
365 const UChar
*sourceLimit
= args
->sourceLimit
;
366 uint8_t *myTarget
= (uint8_t *) args
->target
;
367 const uint8_t *targetLimit
= (uint8_t *) args
->targetLimit
;
/* Index of the last byte of a 3- or 4-byte sequence in tempPtr. */
371 int32_t indexToWrite
;
/* Surrogate pairing below is only done when this is NOT the CESU-8 converter. */
372 UBool isNotCESU8
= !hasCESU8Data(cnv
);
/* Resume with a code unit saved by a previous call, if there is one and the
 * target has room. */
374 if (cnv
->fromUChar32
&& myTarget
< targetLimit
)
376 ch
= cnv
->fromUChar32
;
377 cnv
->fromUChar32
= 0;
/* Main loop: runs while there is both input left and room in the target. */
381 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
385 if (ch
< 0x80) /* Single byte */
387 *(myTarget
++) = (uint8_t) ch
;
389 else if (ch
< 0x800) /* Double byte */
391 *(myTarget
++) = (uint8_t) ((ch
>> 6) | 0xc0);
392 if (myTarget
< targetLimit
)
394 *(myTarget
++) = (uint8_t) ((ch
& 0x3f) | 0x80);
/* Second byte does not fit: keep it in the overflow buffer. */
398 cnv
->charErrorBuffer
[0] = (uint8_t) ((ch
& 0x3f) | 0x80);
399 cnv
->charErrorBufferLength
= 1;
400 *err
= U_BUFFER_OVERFLOW_ERROR
;
404 /* Check for surrogates */
405 if(U16_IS_SURROGATE(ch
) && isNotCESU8
) {
407 if (mySource
< sourceLimit
) {
408 /* test both code units */
409 if(U16_IS_SURROGATE_LEAD(ch
) && U16_IS_TRAIL(*mySource
)) {
410 /* convert and consume this supplementary code point */
411 ch
=U16_GET_SUPPLEMENTARY(ch
, *mySource
);
413 /* exit this condition tree */
416 /* this is an unpaired trail or lead code unit */
417 /* callback(illegal) */
418 cnv
->fromUChar32
= ch
;
419 *err
= U_ILLEGAL_CHAR_FOUND
;
/* Presumably the no-more-input branch: the surrogate is kept in the converter
 * so that a later call can continue with it. */
425 cnv
->fromUChar32
= ch
;
430 /* Do we write the buffer directly for speed,
431 or do we have to be careful about target buffer space? */
432 tempPtr
= (((targetLimit
- myTarget
) >= 4) ? myTarget
: tempBuf
);
/* Lead byte: 0xe0-based for values up to MAXIMUM_UCS2, otherwise 0xf0-based
 * followed by an extra trail byte. */
434 if (ch
<= MAXIMUM_UCS2
) {
436 tempPtr
[0] = (uint8_t) ((ch
>> 12) | 0xe0);
440 tempPtr
[0] = (uint8_t) ((ch
>> 18) | 0xf0);
441 tempPtr
[1] = (uint8_t) (((ch
>> 12) & 0x3f) | 0x80);
/* The last two trail bytes are common to both lengths. */
443 tempPtr
[indexToWrite
-1] = (uint8_t) (((ch
>> 6) & 0x3f) | 0x80);
444 tempPtr
[indexToWrite
] = (uint8_t) ((ch
& 0x3f) | 0x80);
446 if (tempPtr
== myTarget
) {
447 /* There was enough space to write the codepoint directly. */
448 myTarget
+= (indexToWrite
+ 1);
/* Otherwise copy byte by byte from tempBuf; whatever does not fit in the
 * target is appended to cnv->charErrorBuffer. */
451 /* We might run out of room soon. Write it slowly. */
452 for (; tempPtr
<= (tempBuf
+ indexToWrite
); tempPtr
++) {
453 if (myTarget
< targetLimit
) {
454 *(myTarget
++) = *tempPtr
;
457 cnv
->charErrorBuffer
[cnv
->charErrorBufferLength
++] = *tempPtr
;
458 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Input remains but the target is full and no other error was set. */
465 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
467 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Hand the advanced positions back to the caller. */
470 args
->target
= (char *) myTarget
;
471 args
->source
= mySource
;
/* Same conversion as ucnv_fromUnicode_UTF8 (UTF-16 to UTF-8 or CESU-8), but
 * additionally writes one entry into args->offsets for every byte written to
 * the target.
 *  - Resumes with a code unit left in cnv->fromUChar32 by an earlier call.
 *  - For the non-CESU-8 converter, an unpaired surrogate sets *err to
 *    U_ILLEGAL_CHAR_FOUND with the unit stored in cnv->fromUChar32.
 *  - Bytes that do not fit in the target go to cnv->charErrorBuffer and *err
 *    becomes U_BUFFER_OVERFLOW_ERROR.
 *  - Stores the advanced target, source and offsets pointers back into args.
 * NOTE(review): the second parameter (used below as err) is not visible in
 * this view. */
474 U_CFUNC
void U_CALLCONV
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs
* args
,
477 UConverter
*cnv
= args
->converter
;
478 const UChar
*mySource
= args
->source
;
479 int32_t *myOffsets
= args
->offsets
;
480 const UChar
*sourceLimit
= args
->sourceLimit
;
481 uint8_t *myTarget
= (uint8_t *) args
->target
;
482 const uint8_t *targetLimit
= (uint8_t *) args
->targetLimit
;
/* offsetNum is the value recorded for the bytes of the current character;
 * nextSourceIndex is the value offsetNum takes once that character is done. */
485 int32_t offsetNum
, nextSourceIndex
;
/* Index of the last byte of a 3- or 4-byte sequence in tempPtr. */
486 int32_t indexToWrite
;
/* Surrogate pairing below is only done when this is NOT the CESU-8 converter. */
488 UBool isNotCESU8
= !hasCESU8Data(cnv
);
/* Resume with a code unit saved by a previous call, if there is one and the
 * target has room. */
490 if (cnv
->fromUChar32
&& myTarget
< targetLimit
)
492 ch
= cnv
->fromUChar32
;
493 cnv
->fromUChar32
= 0;
/* Main loop: runs while there is both input left and room in the target. */
501 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
505 if (ch
< 0x80) /* Single byte */
507 *(myOffsets
++) = offsetNum
++;
508 *(myTarget
++) = (char) ch
;
510 else if (ch
< 0x800) /* Double byte */
/* Both bytes get the same offset; offsetNum advances with the second byte. */
512 *(myOffsets
++) = offsetNum
;
513 *(myTarget
++) = (uint8_t) ((ch
>> 6) | 0xc0);
514 if (myTarget
< targetLimit
)
516 *(myOffsets
++) = offsetNum
++;
517 *(myTarget
++) = (uint8_t) ((ch
& 0x3f) | 0x80);
/* Second byte does not fit: keep it in the overflow buffer. */
521 cnv
->charErrorBuffer
[0] = (uint8_t) ((ch
& 0x3f) | 0x80);
522 cnv
->charErrorBufferLength
= 1;
523 *err
= U_BUFFER_OVERFLOW_ERROR
;
527 /* Check for surrogates */
529 nextSourceIndex
= offsetNum
+ 1;
531 if(U16_IS_SURROGATE(ch
) && isNotCESU8
) {
533 if (mySource
< sourceLimit
) {
534 /* test both code units */
535 if(U16_IS_SURROGATE_LEAD(ch
) && U16_IS_TRAIL(*mySource
)) {
536 /* convert and consume this supplementary code point */
537 ch
=U16_GET_SUPPLEMENTARY(ch
, *mySource
);
540 /* exit this condition tree */
543 /* this is an unpaired trail or lead code unit */
544 /* callback(illegal) */
545 cnv
->fromUChar32
= ch
;
546 *err
= U_ILLEGAL_CHAR_FOUND
;
/* Presumably the no-more-input branch: the surrogate is kept in the converter
 * so that a later call can continue with it. */
552 cnv
->fromUChar32
= ch
;
557 /* Do we write the buffer directly for speed,
558 or do we have to be careful about target buffer space? */
559 tempPtr
= (((targetLimit
- myTarget
) >= 4) ? myTarget
: tempBuf
);
/* Lead byte: 0xe0-based for values up to MAXIMUM_UCS2, otherwise 0xf0-based
 * followed by an extra trail byte. */
561 if (ch
<= MAXIMUM_UCS2
) {
563 tempPtr
[0] = (uint8_t) ((ch
>> 12) | 0xe0);
567 tempPtr
[0] = (uint8_t) ((ch
>> 18) | 0xf0);
568 tempPtr
[1] = (uint8_t) (((ch
>> 12) & 0x3f) | 0x80);
/* The last two trail bytes are common to both lengths. */
570 tempPtr
[indexToWrite
-1] = (uint8_t) (((ch
>> 6) & 0x3f) | 0x80);
571 tempPtr
[indexToWrite
] = (uint8_t) ((ch
& 0x3f) | 0x80);
573 if (tempPtr
== myTarget
) {
574 /* There was enough space to write the codepoint directly. */
575 myTarget
+= (indexToWrite
+ 1);
/* Every byte of the sequence gets the same offset; a fourth entry is written
 * only for a 4-byte sequence. */
576 myOffsets
[0] = offsetNum
;
577 myOffsets
[1] = offsetNum
;
578 myOffsets
[2] = offsetNum
;
579 if (indexToWrite
>= 3) {
580 myOffsets
[3] = offsetNum
;
582 myOffsets
+= (indexToWrite
+ 1);
/* Otherwise copy byte by byte from tempBuf; whatever does not fit in the
 * target is appended to cnv->charErrorBuffer without an offset entry. */
585 /* We might run out of room soon. Write it slowly. */
586 for (; tempPtr
<= (tempBuf
+ indexToWrite
); tempPtr
++) {
587 if (myTarget
< targetLimit
)
589 *(myOffsets
++) = offsetNum
;
590 *(myTarget
++) = *tempPtr
;
594 cnv
->charErrorBuffer
[cnv
->charErrorBufferLength
++] = *tempPtr
;
595 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Move on to the offset of the next source character. */
599 offsetNum
= nextSourceIndex
;
/* Input remains but the target is full and no other error was set. */
603 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
605 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Hand the advanced positions back to the caller. */
608 args
->target
= (char *) myTarget
;
609 args
->source
= mySource
;
610 args
->offsets
= myOffsets
;
/* Reads one code point from args->source and returns it.
 *  - Empty input sets *err to U_INDEX_OUTOFBOUNDS_ERROR.
 *  - A first byte whose table length is 0 sets U_ILLEGAL_CHAR_FOUND.
 *  - A sequence that extends past args->sourceLimit sets
 *    U_TRUNCATED_CHAR_FOUND, or U_ILLEGAL_CHAR_FOUND if a non-trail byte shows
 *    up before the input ends.
 *  - A complete sequence is accumulated, corrected with offsetsFromUTF8 and
 *    range-checked; a failed check copies the consumed bytes into
 *    cnv->toUBytes and sets U_ILLEGAL_CHAR_FOUND.
 * args->source is advanced past the bytes that were consumed.
 * NOTE(review): the second parameter (used below as err) and the return
 * values on the error paths are not visible in this view. */
614 static UChar32 U_CALLCONV
ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs
*args
,
/* Start of the sequence, kept so the bytes can be copied on error. */
617 const uint8_t *sourceInitial
;
618 const uint8_t *source
;
/* Despite its name this holds the total sequence length from bytesFromUTF8. */
619 uint16_t extraBytesToWrite
;
622 int8_t i
, isLegalSequence
;
624 /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
626 cnv
= args
->converter
;
627 sourceInitial
= source
= (const uint8_t *)args
->source
;
/* Nothing to read. */
628 if (source
>= (const uint8_t *)args
->sourceLimit
)
631 *err
= U_INDEX_OUTOFBOUNDS_ERROR
;
635 myByte
= (uint8_t)*(source
++);
/* Presumably the single-byte (ASCII) fast path: the byte is the code point. */
638 args
->source
= (const char *)source
;
639 return (UChar32
)myByte
;
642 extraBytesToWrite
= (uint16_t)bytesFromUTF8
[myByte
];
/* Length 0 means this byte cannot start a sequence. */
643 if (extraBytesToWrite
== 0) {
644 cnv
->toUBytes
[0] = myByte
;
646 *err
= U_ILLEGAL_CHAR_FOUND
;
647 args
->source
= (const char *)source
;
/* The first byte has already been consumed, hence the -1. */
651 /*The byte sequence is longer than the buffer area passed*/
652 if (((const char *)source
+ extraBytesToWrite
- 1) > args
->sourceLimit
)
654 /* check if all of the remaining bytes are trail bytes */
655 cnv
->toUBytes
[0] = myByte
;
657 *err
= U_TRUNCATED_CHAR_FOUND
;
658 while(source
< (const uint8_t *)args
->sourceLimit
) {
659 if(U8_IS_TRAIL(myByte
= *source
)) {
660 cnv
->toUBytes
[i
++] = myByte
;
663 /* error even before we run out of input */
664 *err
= U_ILLEGAL_CHAR_FOUND
;
669 args
->source
= (const char *)source
;
/* Complete sequence available: each case adds one more byte to ch and checks
 * that it is a trail byte. */
675 switch(extraBytesToWrite
)
677 /* note: code falls through cases! (sic)*/
679 ch
+= (myByte
= *source
);
681 if (!U8_IS_TRAIL(myByte
))
689 ch
+= (myByte
= *source
);
691 if (!U8_IS_TRAIL(myByte
))
699 ch
+= (myByte
= *source
);
701 if (!U8_IS_TRAIL(myByte
))
709 ch
+= (myByte
= *source
);
711 if (!U8_IS_TRAIL(myByte
))
719 ch
+= (myByte
= *source
);
720 if (!U8_IS_TRAIL(myByte
))
/* Remove the accumulated marker bits for this sequence length. */
727 ch
-= offsetsFromUTF8
[extraBytesToWrite
];
728 args
->source
= (const char *)source
;
731 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
732 * - use only trail bytes after a lead byte (checked above)
733 * - use the right number of trail bytes for a given lead byte
734 * - encode a code point <= U+10ffff
735 * - use the fewest possible number of bytes for their code points
736 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
738 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
739 * There are no irregular sequences any more.
/* Range and shortest-form checks; the last operand of this condition is not
 * visible in this view. */
741 if (isLegalSequence
&&
742 (uint32_t)ch
<= MAXIMUM_UTF
&&
743 (uint32_t)ch
>= utf8_minChar32
[extraBytesToWrite
] &&
746 return ch
; /* return the code point */
/* Illegal sequence: copy every consumed byte into cnv->toUBytes. */
749 for(i
= 0; sourceInitial
< source
; ++i
) {
750 cnv
->toUBytes
[i
] = *sourceInitial
++;
753 *err
= U_ILLEGAL_CHAR_FOUND
;
758 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
760 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
/* Indexed by sequence length; ucnv_UTF8FromUTF8 compares the decoded value
 * against the entry for lengths 2 and 3. */
762 utf8_minLegal
[5]={ 0, 0, 0x80, 0x800, 0x10000 };
764 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
/* Indexed by sequence length and subtracted from the accumulated value in
 * ucnv_UTF8FromUTF8. Only entries 0..4 are initialized explicitly; the
 * array is declared with 7 elements. */
766 utf8_offsets
[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
769 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
/* Validates UTF-8 byte sequences read from pToUArgs->source and copies the
 * legal ones unchanged to pFromUArgs->target.
 *  - Partial-sequence state lives in the to-Unicode converter (toUBytes,
 *    toUnicodeStatus, toULength, mode), in the form the regular UTF-8
 *    converter uses.
 *  - *pErrorCode becomes U_ILLEGAL_CHAR_FOUND for an illegal sequence,
 *    U_BUFFER_OVERFLOW_ERROR when input remains and the target is full, and
 *    U_USING_DEFAULT_WARNING when an edge case is handed back to the standard
 *    (pivoting) conversion path.
 *  - The advanced source and target pointers are written back to the two
 *    argument structs. */
770 static void U_CALLCONV
771 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs
*pFromUArgs
,
772 UConverterToUnicodeArgs
*pToUArgs
,
773 UErrorCode
*pErrorCode
) {
775 const uint8_t *source
, *sourceLimit
;
777 int32_t targetCapacity
;
/* oldToULength: bytes of a partial sequence carried over from a previous
 * buffer; toULength: bytes collected so far; toULimit: bytes the sequence
 * needs in total. */
780 int8_t oldToULength
, toULength
, toULimit
;
785 /* set up the local pointers */
786 utf8
=pToUArgs
->converter
;
787 source
=(uint8_t *)pToUArgs
->source
;
788 sourceLimit
=(uint8_t *)pToUArgs
->sourceLimit
;
789 target
=(uint8_t *)pFromUArgs
->target
;
790 targetCapacity
=(int32_t)(pFromUArgs
->targetLimit
-pFromUArgs
->target
);
792 /* get the converter state from the UTF-8 UConverter */
793 c
=(UChar32
)utf8
->toUnicodeStatus
;
/* Presumably chosen by whether a partial character is pending: either resume
 * its counters or start with all three at zero. */
795 toULength
=oldToULength
=utf8
->toULength
;
796 toULimit
=(int8_t)utf8
->mode
;
798 toULength
=oldToULength
=toULimit
=0;
/* Source length including the bytes carried over from the previous buffer. */
801 count
=(int32_t)(sourceLimit
-source
)+oldToULength
;
804 * Not enough input to complete the partial character.
805 * Jump to moreBytes below - it will not output to target.
807 } else if(targetCapacity
<toULimit
) {
809 * Not enough target capacity to output the partial character.
810 * Let the standard converter handle this.
812 *pErrorCode
=U_USING_DEFAULT_WARNING
;
816 * Use a single counter for source and target, counting the minimum of
817 * the source length and the target capacity.
818 * As a result, the source length is checked only once per multi-byte
819 * character instead of twice.
821 * Make sure that the last byte sequence is complete, or else
822 * stop just before it.
823 * (The longest legal byte sequence has 3 trail bytes.)
824 * Count oldToULength (number of source bytes from a previous buffer)
825 * into the source length but reduce the source index by toULimit
826 * while going back over trail bytes in order to not go back into
827 * the bytes that will be read for finishing a partial
828 * sequence from the previous buffer.
829 * Let the standard converter handle edge cases.
/* count becomes the smaller of source length and target capacity. */
833 if(count
>targetCapacity
) {
834 count
=targetCapacity
;
/* Look back over at most three bytes from the end of the counted range. */
838 while(i
<3 && i
<(count
-toULimit
)) {
839 b
=source
[count
-oldToULength
-i
-1];
843 if(i
<U8_COUNT_TRAIL_BYTES(b
)) {
844 /* stop converting before the lead byte if there are not enough trail bytes for it */
853 utf8
->toUnicodeStatus
=0;
856 /* See note in ucnv_SBCSFromUTF8() about this goto. */
859 /* conversion loop */
/* Inline fast paths: the trail bytes are range-checked directly for the
 * three code point ranges named in the comments. */
869 if( /* handle U+1000..U+D7FF inline */
870 (t1
=source
[0]) >= 0x80 && ((b
<0xed && (t1
<= 0xbf)) ||
871 (b
==0xed && (t1
<= 0x9f))) &&
872 (t2
=source
[1]) >= 0x80 && t2
<= 0xbf
882 if( /* handle U+0080..U+07FF inline */
884 (t1
=*source
) >= 0x80 && t1
<= 0xbf
893 if( /* handle U+0800..U+0FFF inline */
894 (t1
=source
[0]) >= 0xa0 && t1
<= 0xbf &&
895 (t2
=source
[1]) >= 0x80 && t2
<= 0xbf
906 /* handle "complicated" and error cases, and continuing partial characters */
/* Total length of the sequence = trail byte count of the lead byte + 1. */
909 toULimit
=U8_COUNT_TRAIL_BYTES(b
)+1;
912 while(toULength
<toULimit
) {
913 if(source
<sourceLimit
) {
920 break; /* sequence too short, stop with toULength<toULimit */
/* Rewind to the first byte read from this buffer, then copy those bytes after
 * the ones already held in toUBytes. */
923 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
924 source
-=(toULength
-oldToULength
);
925 while(oldToULength
<toULength
) {
926 utf8
->toUBytes
[oldToULength
++]=*source
++;
928 utf8
->toUnicodeStatus
=c
;
929 utf8
->toULength
=toULength
;
931 pToUArgs
->source
=(char *)source
;
932 pFromUArgs
->target
=(char *)target
;
/* Legality checks: 2- or 3-byte sequence in shortest form and not a
 * surrogate, or a 4-byte sequence within U+10000..U+10FFFF. */
937 if( toULength
==toULimit
&& /* consumed all trail bytes */
938 (toULength
==3 || toULength
==2) && /* BMP */
939 (c
-=utf8_offsets
[toULength
])>=utf8_minLegal
[toULength
] &&
940 (c
<=0xd7ff || 0xe000<=c
) /* not a surrogate */
942 /* legal byte sequence for BMP code point */
944 toULength
==toULimit
&& toULength
==4 &&
945 (0x10000<=(c
-=utf8_offsets
[4]) && c
<=0x10ffff)
947 /* legal byte sequence for supplementary code point */
949 /* error handling: illegal UTF-8 byte sequence */
950 source
-=(toULength
-oldToULength
);
951 while(oldToULength
<toULength
) {
952 utf8
->toUBytes
[oldToULength
++]=*source
++;
954 utf8
->toULength
=toULength
;
955 pToUArgs
->source
=(char *)source
;
956 pFromUArgs
->target
=(char *)target
;
957 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
/* Bytes carried over from the previous buffer come from toUBytes; the second
 * loop covers the remaining bytes of the sequence. */
961 /* copy the legal byte sequence to the target */
965 for(i
=0; i
<oldToULength
; ++i
) {
966 *target
++=utf8
->toUBytes
[i
];
968 source
-=(toULength
-oldToULength
);
969 for(; i
<toULength
; ++i
) {
/* After the counted loop: input is still left over. */
977 if(U_SUCCESS(*pErrorCode
) && source
<sourceLimit
) {
978 if(target
==(const uint8_t *)pFromUArgs
->targetLimit
) {
979 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
982 toULimit
=U8_COUNT_TRAIL_BYTES(b
)+1;
983 if(toULimit
>(sourceLimit
-source
)) {
984 /* collect a truncated byte sequence */
988 utf8
->toUBytes
[toULength
++]=b
;
989 if(++source
==sourceLimit
) {
990 /* partial byte sequence at end of source */
991 utf8
->toUnicodeStatus
=c
;
992 utf8
->toULength
=toULength
;
995 } else if(!U8_IS_TRAIL(b
=*source
)) {
996 /* lead byte in trail byte position */
997 utf8
->toULength
=toULength
;
998 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1004 /* partial-sequence target overflow: fall back to the pivoting implementation */
1005 *pErrorCode
=U_USING_DEFAULT_WARNING
;
1010 /* write back the updated pointers */
1011 pToUArgs
->source
=(char *)source
;
1012 pFromUArgs
->target
=(char *)target
;
1017 /* UTF-8 converter data ----------------------------------------------------- */
/* Function table for the UTF-8 converter. The visible entries are the
 * to-Unicode and from-Unicode functions (plain and offset-tracking variants),
 * the get-next-UChar function, and ucnv_getNonSurrogateUnicodeSet.
 * NOTE(review): the other initializer entries are not visible in this view. */
1019 static const UConverterImpl _UTF8Impl
={
1029 ucnv_toUnicode_UTF8
,
1030 ucnv_toUnicode_UTF8_OFFSETS_LOGIC
,
1031 ucnv_fromUnicode_UTF8
,
1032 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC
,
1033 ucnv_getNextUChar_UTF8
,
1039 ucnv_getNonSurrogateUnicodeSet
,
1045 /* CCSID 1208 refers to UTF-8 for any version of Unicode */
/* Static description of the UTF-8 converter: code page 1208 on the UCNV_IBM
 * platform, converter type UCNV_UTF8, 1 to 3 bytes per UChar.
 * The byte triple ef bf bd with length 3 is presumably the substitution
 * character (it is the UTF-8 form of U+FFFD) — confirm against the
 * UConverterStaticData field order. */
1046 static const UConverterStaticData _UTF8StaticData
={
1047 sizeof(UConverterStaticData
),
1049 1208, UCNV_IBM
, UCNV_UTF8
,
1050 1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
1051 { 0xef, 0xbf, 0xbd, 0 },3,FALSE
,FALSE
,
1054 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/* Shared-data object for the UTF-8 converter, built from its static
 * description and its function table. */
1058 const UConverterSharedData _UTF8Data
=
1059 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData
, &_UTF8Impl
);
1061 /* CESU-8 converter data ---------------------------------------------------- */
/* Function table for the CESU-8 converter. It uses the same four conversion
 * functions as the UTF-8 table; among the visible entries there is no
 * get-next-UChar function, and the set function is
 * ucnv_getCompleteUnicodeSet.
 * NOTE(review): the other initializer entries are not visible in this view. */
1063 static const UConverterImpl _CESU8Impl
={
1073 ucnv_toUnicode_UTF8
,
1074 ucnv_toUnicode_UTF8_OFFSETS_LOGIC
,
1075 ucnv_fromUnicode_UTF8
,
1076 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC
,
1083 ucnv_getCompleteUnicodeSet
,
/* Static description of the CESU-8 converter: CCSID 9400, platform
 * UCNV_UNKNOWN, converter type UCNV_CESU8, 1 to 3 bytes per UChar.
 * The byte triple ef bf bd with length 3 matches the UTF-8 description and is
 * presumably the substitution character — confirm against the
 * UConverterStaticData field order. */
1089 static const UConverterStaticData _CESU8StaticData
={
1090 sizeof(UConverterStaticData
),
1092 9400, /* CCSID for CESU-8 */
1093 UCNV_UNKNOWN
, UCNV_CESU8
, 1, 3,
1094 { 0xef, 0xbf, 0xbd, 0 },3,FALSE
,FALSE
,
1097 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/* Shared-data object for the CESU-8 converter, built from its static
 * description and its function table. hasCESU8Data() compares a converter's
 * sharedData pointer against the address of this object. */
1101 const UConverterSharedData _CESU8Data
=
1102 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData
, &_CESU8Impl
);