2 **********************************************************************
3 * Copyright (C) 2002-2016, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
8 * tab size: 8 (not used)
11 * created on: 2002jul01
12 * created by: Markus W. Scherer
14 * UTF-8 converter implementation. Used to be in ucnv_utf.c.
16 * Also, CESU-8 implementation, see UTR 26.
17 * The CESU-8 converter uses all the same functions as the
18 * UTF-8 converter, with a branch for converting supplementary code points.
21 #include "unicode/utypes.h"
23 #if !UCONFIG_NO_CONVERSION
25 #include "unicode/ucnv.h"
26 #include "unicode/utf.h"
27 #include "unicode/utf8.h"
28 #include "unicode/utf16.h"
33 /* Prototypes --------------------------------------------------------------- */
35 /* Keep these here to make finicky compilers happy */
/* Forward declarations for the two from-Unicode entry points that are defined
 * further down in this file.
 * NOTE(review): in this copy each parameter list stops after "args,"; the
 * remaining parameter(s) and the closing of each declaration are not visible. */
37 U_CFUNC
void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs
*args
,
39 U_CFUNC
void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs
*args
,
43 /* UTF-8 -------------------------------------------------------------------- */
45 /* UTF-8 Conversion DATA
46 * for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
48 /*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
/* Range limits. MAXIMUM_UCS2 and MAXIMUM_UTF are compared against code points
 * in the converters below; MAXIMUM_UCS4 is not referenced in the visible code. */
49 #define MAXIMUM_UCS2 0x0000FFFF
50 #define MAXIMUM_UTF 0x0010FFFF
51 #define MAXIMUM_UCS4 0x7FFFFFFF
/* Constants used when splitting a supplementary code point into a UTF-16
 * surrogate pair.
 * NOTE(review): HALF_SHIFT is used by the to-Unicode functions below, but its
 * definition is not visible in this copy. */
53 #define HALF_BASE 0x0010000
54 #define HALF_MASK 0x3FF
55 #define SURROGATE_HIGH_START 0xD800
56 #define SURROGATE_HIGH_END 0xDBFF
57 #define SURROGATE_LOW_START 0xDC00
58 #define SURROGATE_LOW_END 0xDFFF
60 /* -SURROGATE_LOW_START + HALF_BASE */
/* i.e. 0x10000 - 0xDC00 = 0x2400 = 9216 */
61 #define SURROGATE_LOW_BASE 9216
/* Correction values indexed by the byte length of a UTF-8 sequence.
 * The decoders accumulate raw bytes as ch = (ch << 6) + byte and then subtract
 * offsetsFromUTF8[length] to remove the lead/trail marker bits
 * (e.g. index 2: (0xC0 << 6) + 0x80 = 0x3080).
 * NOTE(review): the closing of the initializer is not visible in this copy. */
63 static const uint32_t offsetsFromUTF8
[7] = {0,
64 (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
65 (uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080
68 /* END OF UTF-8 Conversion DATA */
/* Sequence length looked up by the first byte of a sequence (256 entries).
 * A value of 0 marks a byte that the decoders below reject as a lead byte.
 * NOTE(review): the closing of the initializer is not visible in this copy. */
70 static const int8_t bytesFromUTF8
[256] = {
/* 0x00..0x7F: single-byte sequences */
71 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
72 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
73 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
74 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* 0x80..0xBF: length 0 (these values never start a sequence) */
75 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
76 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0xC0..0xDF: two-byte sequences */
77 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
/* 0xE0..0xEF: 3, 0xF0..0xF7: 4, 0xF8..0xFB: 5, 0xFC..0xFD: 6, 0xFE..0xFF: 0 */
78 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
82 * Starting with Unicode 3.0.1:
83 * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
84 * byte sequences with more than 4 bytes are illegal in UTF-8,
85 * which is tested with impossible values for them
/* Smallest code point that an N-byte sequence may encode, indexed by N.
 * Entries 5 and 6 are 0xffffffff, so a "ch >= utf8_minChar32[N]" test can never
 * hold together with "ch <= MAXIMUM_UTF" for 5- and 6-byte forms.
 * NOTE(review): the type specifier of this declaration is not visible in this copy. */
88 utf8_minChar32
[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };
/* Tells whether a converter is the CESU-8 variant by comparing its sharedData
 * pointer with the address of _CESU8Data.
 * NOTE(review): the body for the UCONFIG_ONLY_HTML_CONVERSION case, the
 * matching #else/#endif and the braces are not visible in this copy. */
90 static UBool
hasCESU8Data(const UConverter
*cnv
)
92 #if UCONFIG_ONLY_HTML_CONVERSION
95 return (UBool
)(cnv
->sharedData
== &_CESU8Data
);
/*
 * Decodes UTF-8 (or CESU-8 when hasCESU8Data(cnv) is true) bytes from
 * args->source into UTF-16 code units at args->target, and writes the advanced
 * source/target positions back into args before returning.
 * Errors are reported through *err: U_ILLEGAL_CHAR_FOUND for a rejected byte
 * sequence, U_BUFFER_OVERFLOW_ERROR when the target is full.
 * NOTE(review): this copy is missing lines (the rest of the parameter list,
 * braces, else-branches, labels/gotos, the declarations of i and inBytes, and
 * the byte fetch at the top of the loop). The comments below describe only
 * what is visible.
 */
99 static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs
* args
,
102 UConverter
*cnv
= args
->converter
;
103 const unsigned char *mySource
= (unsigned char *) args
->source
;
104 UChar
*myTarget
= args
->target
;
105 const unsigned char *sourceLimit
= (unsigned char *) args
->sourceLimit
;
106 const UChar
*targetLimit
= args
->targetLimit
;
107 unsigned char *toUBytes
= cnv
->toUBytes
;
108 UBool isCESU8
= hasCESU8Data(cnv
);
109 uint32_t ch
, ch2
= 0;
112 /* Restore size of current sequence */
/* A non-zero toUnicodeStatus holds the partially accumulated value that a
 * previous call saved; it is picked up again here and then cleared. */
113 if (cnv
->toUnicodeStatus
&& myTarget
< targetLimit
)
115 inBytes
= cnv
->mode
; /* restore # of bytes to consume */
116 i
= cnv
->toULength
; /* restore # of bytes consumed */
119 ch
= cnv
->toUnicodeStatus
;/*Stores the previously calculated ch from a previous call*/
120 cnv
->toUnicodeStatus
= 0;
/* Main loop: runs while input remains and the target has room. */
125 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
128 if (ch
< 0x80) /* Simple case */
130 *(myTarget
++) = (UChar
) ch
;
134 /* store the first char */
135 toUBytes
[0] = (char)ch
;
136 inBytes
= bytesFromUTF8
[ch
]; /* lookup current sequence length */
/* Each following byte is recorded in toUBytes[] and folded into ch six bits
 * at a time; a byte that is not a trail byte ends the sequence early. */
142 if (mySource
< sourceLimit
)
144 toUBytes
[i
] = (char) (ch2
= *mySource
);
145 if (!U8_IS_TRAIL(ch2
))
147 break; /* i < inBytes */
149 ch
= (ch
<< 6) + ch2
;
/* Saves the accumulated value and the consumed-byte count in the converter;
 * presumably the branch taken when the source runs out mid-sequence (the
 * enclosing else is not visible in this copy). */
155 /* stores a partially calculated target*/
156 cnv
->toUnicodeStatus
= ch
;
158 cnv
->toULength
= (int8_t) i
;
163 /* Remove the accumulated high bits */
164 ch
-= offsetsFromUTF8
[inBytes
];
/* NOTE(review): the comment delimiters around the following explanatory
 * lines are missing in this copy. */
167 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
168 * - use only trail bytes after a lead byte (checked above)
169 * - use the right number of trail bytes for a given lead byte
170 * - encode a code point <= U+10ffff
171 * - use the fewest possible number of bytes for their code points
172 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
174 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
175 * There are no irregular sequences any more.
176 * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
/* Accept only when all expected bytes were read, the value is at most
 * MAXIMUM_UTF and not below the minimum for its length, and additionally:
 * CESU-8 allows at most 3 bytes, plain UTF-8 forbids surrogate values. */
178 if (i
== inBytes
&& ch
<= MAXIMUM_UTF
&& ch
>= utf8_minChar32
[i
] &&
179 (isCESU8
? i
<= 3 : !U_IS_SURROGATE(ch
)))
181 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
182 if (ch
<= MAXIMUM_UCS2
)
184 /* fits in 16 bits */
185 *(myTarget
++) = (UChar
) ch
;
189 /* write out the surrogates */
/* NOTE(review): a subtraction of HALF_BASE from ch would be expected before
 * the shift below but is not visible in this copy; confirm against the full file. */
191 *(myTarget
++) = (UChar
) ((ch
>> HALF_SHIFT
) + SURROGATE_HIGH_START
);
192 ch
= (ch
& HALF_MASK
) + SURROGATE_LOW_START
;
/* The second surrogate goes to the target if there is room, otherwise into
 * the converter's UCharErrorBuffer together with an overflow error. */
193 if (myTarget
< targetLimit
)
195 *(myTarget
++) = (UChar
)ch
;
199 /* Put in overflow buffer (not handled here) */
200 cnv
->UCharErrorBuffer
[0] = (UChar
) ch
;
201 cnv
->UCharErrorBufferLength
= 1;
202 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Rejected sequence: record how many bytes were collected and flag the error. */
209 cnv
->toULength
= (int8_t)i
;
210 *err
= U_ILLEGAL_CHAR_FOUND
;
/* Input remains, the target is full, and no other error is pending. */
217 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
219 /* End of target buffer */
220 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Publish the updated positions to the caller. */
223 args
->target
= myTarget
;
224 args
->source
= (const char *) mySource
;
/*
 * Same decoding as ucnv_toUnicode_UTF8, with one addition: for every UChar
 * written to the target, the current value of offsetNum is also written to
 * args->offsets. The advanced source, target and offsets pointers are written
 * back into args before returning.
 * NOTE(review): this copy is missing lines (the rest of the parameter list,
 * braces, else-branches, labels/gotos, the declarations of i and inBytes, the
 * byte fetch, and how offsetNum advances after a multi-byte sequence). The
 * comments below describe only what is visible.
 */
227 static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs
* args
,
230 UConverter
*cnv
= args
->converter
;
231 const unsigned char *mySource
= (unsigned char *) args
->source
;
232 UChar
*myTarget
= args
->target
;
233 int32_t *myOffsets
= args
->offsets
;
234 int32_t offsetNum
= 0;
235 const unsigned char *sourceLimit
= (unsigned char *) args
->sourceLimit
;
236 const UChar
*targetLimit
= args
->targetLimit
;
237 unsigned char *toUBytes
= cnv
->toUBytes
;
238 UBool isCESU8
= hasCESU8Data(cnv
);
239 uint32_t ch
, ch2
= 0;
242 /* Restore size of current sequence */
/* A non-zero toUnicodeStatus holds the partially accumulated value that a
 * previous call saved; it is picked up again here and then cleared. */
243 if (cnv
->toUnicodeStatus
&& myTarget
< targetLimit
)
245 inBytes
= cnv
->mode
; /* restore # of bytes to consume */
246 i
= cnv
->toULength
; /* restore # of bytes consumed */
249 ch
= cnv
->toUnicodeStatus
;/*Stores the previously calculated ch from a previous call*/
250 cnv
->toUnicodeStatus
= 0;
/* Main loop: runs while input remains and the target has room. */
254 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
257 if (ch
< 0x80) /* Simple case */
/* One input byte yields one UChar: record the offset, then advance it by one. */
259 *(myTarget
++) = (UChar
) ch
;
260 *(myOffsets
++) = offsetNum
++;
/* Keep the first byte of the sequence and look up the sequence length. */
264 toUBytes
[0] = (char)ch
;
265 inBytes
= bytesFromUTF8
[ch
];
/* Each following byte is recorded in toUBytes[] and folded into ch six bits
 * at a time; a byte that is not a trail byte ends the sequence early. */
271 if (mySource
< sourceLimit
)
273 toUBytes
[i
] = (char) (ch2
= *mySource
);
274 if (!U8_IS_TRAIL(ch2
))
276 break; /* i < inBytes */
278 ch
= (ch
<< 6) + ch2
;
/* Saves the accumulated value and the consumed-byte count in the converter;
 * presumably the branch taken when the source runs out mid-sequence (the
 * enclosing else is not visible in this copy). */
284 cnv
->toUnicodeStatus
= ch
;
286 cnv
->toULength
= (int8_t)i
;
291 /* Remove the accumulated high bits */
292 ch
-= offsetsFromUTF8
[inBytes
];
/* NOTE(review): the comment delimiters around the following explanatory
 * lines are missing in this copy. */
295 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
296 * - use only trail bytes after a lead byte (checked above)
297 * - use the right number of trail bytes for a given lead byte
298 * - encode a code point <= U+10ffff
299 * - use the fewest possible number of bytes for their code points
300 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
302 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
303 * There are no irregular sequences any more.
304 * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
/* Accept only when all expected bytes were read, the value is at most
 * MAXIMUM_UTF and not below the minimum for its length, and additionally:
 * CESU-8 allows at most 3 bytes, plain UTF-8 forbids surrogate values. */
306 if (i
== inBytes
&& ch
<= MAXIMUM_UTF
&& ch
>= utf8_minChar32
[i
] &&
307 (isCESU8
? i
<= 3 : !U_IS_SURROGATE(ch
)))
309 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
310 if (ch
<= MAXIMUM_UCS2
)
312 /* fits in 16 bits */
313 *(myTarget
++) = (UChar
) ch
;
314 *(myOffsets
++) = offsetNum
;
318 /* write out the surrogates */
/* NOTE(review): a subtraction of HALF_BASE from ch would be expected before
 * the shift below but is not visible in this copy; confirm against the full file.
 * Both surrogates written to the target get the same offset value. */
320 *(myTarget
++) = (UChar
) ((ch
>> HALF_SHIFT
) + SURROGATE_HIGH_START
);
321 *(myOffsets
++) = offsetNum
;
322 ch
= (ch
& HALF_MASK
) + SURROGATE_LOW_START
;
323 if (myTarget
< targetLimit
)
325 *(myTarget
++) = (UChar
)ch
;
326 *(myOffsets
++) = offsetNum
;
/* No room for the second surrogate: park it in the converter's
 * UCharErrorBuffer and report overflow. */
330 cnv
->UCharErrorBuffer
[0] = (UChar
) ch
;
331 cnv
->UCharErrorBufferLength
= 1;
332 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Rejected sequence: record how many bytes were collected and flag the error. */
339 cnv
->toULength
= (int8_t)i
;
340 *err
= U_ILLEGAL_CHAR_FOUND
;
/* Input remains, the target is full, and no other error is pending. */
347 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
348 { /* End of target buffer */
349 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Publish the updated positions to the caller. */
352 args
->target
= myTarget
;
353 args
->source
= (const char *) mySource
;
354 args
->offsets
= myOffsets
;
/*
 * Encodes UTF-16 code units from args->source as UTF-8 (or CESU-8) bytes at
 * args->target, and writes the advanced source/target positions back into
 * args before returning.
 * Bytes that do not fit in the target are stored in cnv->charErrorBuffer and
 * *err is set to U_BUFFER_OVERFLOW_ERROR; an unpaired surrogate (UTF-8 mode
 * only) is stored in cnv->fromUChar32 and reported as U_ILLEGAL_CHAR_FOUND.
 * NOTE(review): this copy is missing lines (the rest of the parameter list,
 * braces, else-branches, labels/gotos, the declarations of ch, tempPtr and
 * tempBuf, the code-unit fetch, and the assignments to indexToWrite). The
 * comments below describe only what is visible.
 */
357 U_CFUNC
void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs
* args
,
360 UConverter
*cnv
= args
->converter
;
361 const UChar
*mySource
= args
->source
;
362 const UChar
*sourceLimit
= args
->sourceLimit
;
363 uint8_t *myTarget
= (uint8_t *) args
->target
;
364 const uint8_t *targetLimit
= (uint8_t *) args
->targetLimit
;
368 int32_t indexToWrite
;
369 UBool isNotCESU8
= !hasCESU8Data(cnv
);
/* Resume with a code unit that an earlier call left pending in fromUChar32. */
371 if (cnv
->fromUChar32
&& myTarget
< targetLimit
)
373 ch
= cnv
->fromUChar32
;
374 cnv
->fromUChar32
= 0;
/* Main loop: runs while input remains and the target has room. */
378 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
382 if (ch
< 0x80) /* Single byte */
384 *(myTarget
++) = (uint8_t) ch
;
386 else if (ch
< 0x800) /* Double byte */
388 *(myTarget
++) = (uint8_t) ((ch
>> 6) | 0xc0);
/* The trail byte goes to the target if there is room, otherwise into
 * charErrorBuffer together with an overflow error. */
389 if (myTarget
< targetLimit
)
391 *(myTarget
++) = (uint8_t) ((ch
& 0x3f) | 0x80);
395 cnv
->charErrorBuffer
[0] = (uint8_t) ((ch
& 0x3f) | 0x80);
396 cnv
->charErrorBufferLength
= 1;
397 *err
= U_BUFFER_OVERFLOW_ERROR
;
401 /* Check for surrogates */
/* Surrogate pairing is done only when the converter is not CESU-8; with
 * CESU-8 this branch is skipped and a surrogate code unit continues to the
 * encoding code further down. */
402 if(U16_IS_SURROGATE(ch
) && isNotCESU8
) {
404 if (mySource
< sourceLimit
) {
405 /* test both code units */
406 if(U16_IS_SURROGATE_LEAD(ch
) && U16_IS_TRAIL(*mySource
)) {
407 /* convert and consume this supplementary code point */
408 ch
=U16_GET_SUPPLEMENTARY(ch
, *mySource
);
410 /* exit this condition tree */
413 /* this is an unpaired trail or lead code unit */
414 /* callback(illegal) */
415 cnv
->fromUChar32
= ch
;
416 *err
= U_ILLEGAL_CHAR_FOUND
;
/* Presumably the no-more-input case: keep the surrogate pending for the
 * next call (the enclosing else is not visible in this copy). */
422 cnv
->fromUChar32
= ch
;
427 /* Do we write the buffer directly for speed,
428 or do we have to be careful about target buffer space? */
/* With at least 4 bytes free the sequence is built in place in the target;
 * otherwise it is built in tempBuf and copied out byte by byte below. */
429 tempPtr
= (((targetLimit
- myTarget
) >= 4) ? myTarget
: tempBuf
);
/* Lead byte of a 3-byte form for values up to MAXIMUM_UCS2; otherwise the
 * lead byte and first trail byte of a 4-byte form. */
431 if (ch
<= MAXIMUM_UCS2
) {
433 tempPtr
[0] = (uint8_t) ((ch
>> 12) | 0xe0);
437 tempPtr
[0] = (uint8_t) ((ch
>> 18) | 0xf0);
438 tempPtr
[1] = (uint8_t) (((ch
>> 12) & 0x3f) | 0x80);
/* The last two trail bytes are common to both forms; indexToWrite is used
 * as the index of the final byte. */
440 tempPtr
[indexToWrite
-1] = (uint8_t) (((ch
>> 6) & 0x3f) | 0x80);
441 tempPtr
[indexToWrite
] = (uint8_t) ((ch
& 0x3f) | 0x80);
443 if (tempPtr
== myTarget
) {
444 /* There was enough space to write the codepoint directly. */
445 myTarget
+= (indexToWrite
+ 1);
448 /* We might run out of room soon. Write it slowly. */
/* Bytes that fit go to the target; the rest are appended to charErrorBuffer
 * and an overflow error is reported. */
449 for (; tempPtr
<= (tempBuf
+ indexToWrite
); tempPtr
++) {
450 if (myTarget
< targetLimit
) {
451 *(myTarget
++) = *tempPtr
;
454 cnv
->charErrorBuffer
[cnv
->charErrorBufferLength
++] = *tempPtr
;
455 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Input remains, the target is full, and no other error is pending. */
462 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
464 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Publish the updated positions to the caller. */
467 args
->target
= (char *) myTarget
;
468 args
->source
= mySource
;
/*
 * Same encoding as ucnv_fromUnicode_UTF8, with one addition: for every byte
 * written to the target, the current value of offsetNum is also written to
 * args->offsets. The advanced source, target and offsets pointers are written
 * back into args before returning.
 * NOTE(review): this copy is missing lines (the rest of the parameter list,
 * braces, else-branches, labels/gotos, the declarations of ch, tempPtr and
 * tempBuf, the initial value of offsetNum, the code-unit fetch, and the
 * assignments to indexToWrite). The comments below describe only what is
 * visible.
 */
471 U_CFUNC
void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs
* args
,
474 UConverter
*cnv
= args
->converter
;
475 const UChar
*mySource
= args
->source
;
476 int32_t *myOffsets
= args
->offsets
;
477 const UChar
*sourceLimit
= args
->sourceLimit
;
478 uint8_t *myTarget
= (uint8_t *) args
->target
;
479 const uint8_t *targetLimit
= (uint8_t *) args
->targetLimit
;
482 int32_t offsetNum
, nextSourceIndex
;
483 int32_t indexToWrite
;
485 UBool isNotCESU8
= !hasCESU8Data(cnv
);
/* Resume with a code unit that an earlier call left pending in fromUChar32. */
487 if (cnv
->fromUChar32
&& myTarget
< targetLimit
)
489 ch
= cnv
->fromUChar32
;
490 cnv
->fromUChar32
= 0;
/* Main loop: runs while input remains and the target has room. */
498 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
502 if (ch
< 0x80) /* Single byte */
504 *(myOffsets
++) = offsetNum
++;
505 *(myTarget
++) = (char) ch
;
507 else if (ch
< 0x800) /* Double byte */
/* Both bytes get the same offset; offsetNum advances only with the second
 * byte that is written to the target. */
509 *(myOffsets
++) = offsetNum
;
510 *(myTarget
++) = (uint8_t) ((ch
>> 6) | 0xc0);
511 if (myTarget
< targetLimit
)
513 *(myOffsets
++) = offsetNum
++;
514 *(myTarget
++) = (uint8_t) ((ch
& 0x3f) | 0x80);
/* No room for the trail byte: park it in charErrorBuffer and report overflow. */
518 cnv
->charErrorBuffer
[0] = (uint8_t) ((ch
& 0x3f) | 0x80);
519 cnv
->charErrorBufferLength
= 1;
520 *err
= U_BUFFER_OVERFLOW_ERROR
;
524 /* Check for surrogates */
/* Offset to continue from after this code unit; it is copied back into
 * offsetNum at the end of the iteration. */
526 nextSourceIndex
= offsetNum
+ 1;
/* Surrogate pairing is done only when the converter is not CESU-8. */
528 if(U16_IS_SURROGATE(ch
) && isNotCESU8
) {
530 if (mySource
< sourceLimit
) {
531 /* test both code units */
532 if(U16_IS_SURROGATE_LEAD(ch
) && U16_IS_TRAIL(*mySource
)) {
533 /* convert and consume this supplementary code point */
534 ch
=U16_GET_SUPPLEMENTARY(ch
, *mySource
);
537 /* exit this condition tree */
540 /* this is an unpaired trail or lead code unit */
541 /* callback(illegal) */
542 cnv
->fromUChar32
= ch
;
543 *err
= U_ILLEGAL_CHAR_FOUND
;
/* Presumably the no-more-input case: keep the surrogate pending for the
 * next call (the enclosing else is not visible in this copy). */
549 cnv
->fromUChar32
= ch
;
554 /* Do we write the buffer directly for speed,
555 or do we have to be careful about target buffer space? */
/* With at least 4 bytes free the sequence is built in place in the target;
 * otherwise it is built in tempBuf and copied out byte by byte below. */
556 tempPtr
= (((targetLimit
- myTarget
) >= 4) ? myTarget
: tempBuf
);
/* Lead byte of a 3-byte form for values up to MAXIMUM_UCS2; otherwise the
 * lead byte and first trail byte of a 4-byte form. */
558 if (ch
<= MAXIMUM_UCS2
) {
560 tempPtr
[0] = (uint8_t) ((ch
>> 12) | 0xe0);
564 tempPtr
[0] = (uint8_t) ((ch
>> 18) | 0xf0);
565 tempPtr
[1] = (uint8_t) (((ch
>> 12) & 0x3f) | 0x80);
/* The last two trail bytes are common to both forms; indexToWrite is used
 * as the index of the final byte. */
567 tempPtr
[indexToWrite
-1] = (uint8_t) (((ch
>> 6) & 0x3f) | 0x80);
568 tempPtr
[indexToWrite
] = (uint8_t) ((ch
& 0x3f) | 0x80);
570 if (tempPtr
== myTarget
) {
571 /* There was enough space to write the codepoint directly. */
/* Every byte of the sequence gets the same offset; the fourth entry is
 * written only when indexToWrite is at least 3. */
572 myTarget
+= (indexToWrite
+ 1);
573 myOffsets
[0] = offsetNum
;
574 myOffsets
[1] = offsetNum
;
575 myOffsets
[2] = offsetNum
;
576 if (indexToWrite
>= 3) {
577 myOffsets
[3] = offsetNum
;
579 myOffsets
+= (indexToWrite
+ 1);
582 /* We might run out of room soon. Write it slowly. */
/* Bytes that fit go to the target with their offset; the rest are appended
 * to charErrorBuffer and an overflow error is reported. */
583 for (; tempPtr
<= (tempBuf
+ indexToWrite
); tempPtr
++) {
584 if (myTarget
< targetLimit
)
586 *(myOffsets
++) = offsetNum
;
587 *(myTarget
++) = *tempPtr
;
591 cnv
->charErrorBuffer
[cnv
->charErrorBufferLength
++] = *tempPtr
;
592 *err
= U_BUFFER_OVERFLOW_ERROR
;
596 offsetNum
= nextSourceIndex
;
/* Input remains, the target is full, and no other error is pending. */
600 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
602 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Publish the updated positions to the caller. */
605 args
->target
= (char *) myTarget
;
606 args
->source
= mySource
;
607 args
->offsets
= myOffsets
;
/*
 * Reads one code point from args->source and returns it, advancing
 * args->source past the bytes consumed.
 * Errors are reported through *err: U_INDEX_OUTOFBOUNDS_ERROR when there is no
 * input, U_TRUNCATED_CHAR_FOUND when the input ends inside a sequence, and
 * U_ILLEGAL_CHAR_FOUND for a rejected sequence; in the error cases the
 * offending bytes are copied into cnv->toUBytes.
 * NOTE(review): this copy is missing lines (the rest of the parameter list,
 * braces, the declarations of cnv, ch and myByte, the case labels of the
 * switch, the error-path return statements, and the last term of the final
 * legality test). The comments below describe only what is visible.
 */
610 static UChar32
ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs
*args
,
613 const uint8_t *sourceInitial
;
614 const uint8_t *source
;
/* Despite its name, this holds the sequence length looked up in
 * bytesFromUTF8[] for the first byte. */
615 uint16_t extraBytesToWrite
;
618 int8_t i
, isLegalSequence
;
620 /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
622 cnv
= args
->converter
;
623 sourceInitial
= source
= (const uint8_t *)args
->source
;
/* No input available. */
624 if (source
>= (const uint8_t *)args
->sourceLimit
)
627 *err
= U_INDEX_OUTOFBOUNDS_ERROR
;
631 myByte
= (uint8_t)*(source
++);
/* Returns the first byte itself as the code point; the condition guarding
 * this return is not visible in this copy (presumably the single-byte case). */
634 args
->source
= (const char *)source
;
635 return (UChar32
)myByte
;
/* A table value of 0 means this byte cannot start a sequence. */
638 extraBytesToWrite
= (uint16_t)bytesFromUTF8
[myByte
];
639 if (extraBytesToWrite
== 0) {
640 cnv
->toUBytes
[0] = myByte
;
642 *err
= U_ILLEGAL_CHAR_FOUND
;
643 args
->source
= (const char *)source
;
647 /*The byte sequence is longer than the buffer area passed*/
648 if (((const char *)source
+ extraBytesToWrite
- 1) > args
->sourceLimit
)
650 /* check if all of the remaining bytes are trail bytes */
/* Starts as "truncated"; changed to "illegal" if a remaining byte is not a
 * trail byte. Trail bytes seen are collected in toUBytes. */
651 cnv
->toUBytes
[0] = myByte
;
653 *err
= U_TRUNCATED_CHAR_FOUND
;
654 while(source
< (const uint8_t *)args
->sourceLimit
) {
655 if(U8_IS_TRAIL(myByte
= *source
)) {
656 cnv
->toUBytes
[i
++] = myByte
;
659 /* error even before we run out of input */
660 *err
= U_ILLEGAL_CHAR_FOUND
;
665 args
->source
= (const char *)source
;
/* Enough input for the whole sequence: each step below adds the next byte
 * to ch and checks that it is a trail byte. */
671 switch(extraBytesToWrite
)
673 /* note: code falls through cases! (sic)*/
675 ch
+= (myByte
= *source
);
677 if (!U8_IS_TRAIL(myByte
))
685 ch
+= (myByte
= *source
);
687 if (!U8_IS_TRAIL(myByte
))
695 ch
+= (myByte
= *source
);
697 if (!U8_IS_TRAIL(myByte
))
705 ch
+= (myByte
= *source
);
707 if (!U8_IS_TRAIL(myByte
))
715 ch
+= (myByte
= *source
);
716 if (!U8_IS_TRAIL(myByte
))
/* Remove the lead/trail marker bits that were accumulated along with the payload. */
723 ch
-= offsetsFromUTF8
[extraBytesToWrite
];
724 args
->source
= (const char *)source
;
/* NOTE(review): the comment delimiters around the following explanatory
 * lines are missing in this copy. */
727 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
728 * - use only trail bytes after a lead byte (checked above)
729 * - use the right number of trail bytes for a given lead byte
730 * - encode a code point <= U+10ffff
731 * - use the fewest possible number of bytes for their code points
732 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
734 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
735 * There are no irregular sequences any more.
/* Accept when every byte was a trail byte, the value is at most MAXIMUM_UTF
 * and not below the minimum for its length; the final term of this condition
 * is not visible in this copy. */
737 if (isLegalSequence
&&
738 (uint32_t)ch
<= MAXIMUM_UTF
&&
739 (uint32_t)ch
>= utf8_minChar32
[extraBytesToWrite
] &&
742 return ch
; /* return the code point */
/* Rejected: copy every byte consumed so far into toUBytes and flag the error. */
745 for(i
= 0; sourceInitial
< source
; ++i
) {
746 cnv
->toUBytes
[i
] = *sourceInitial
++;
749 *err
= U_ILLEGAL_CHAR_FOUND
;
753 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
755 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
/* NOTE(review): the type specifiers of both declarations are not visible in
 * this copy. */
757 utf8_minLegal
[5]={ 0, 0, 0x80, 0x800, 0x10000 };
759 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
/* Declared with 7 elements but given 5 initializers; the remaining two are
 * zero-initialized by the language. The five values equal the first five
 * entries of offsetsFromUTF8. */
761 utf8_offsets
[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
763 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
/*
 * Validates UTF-8 bytes from pToUArgs->source and copies legal sequences to
 * pFromUArgs->target, then writes the advanced source/target positions back.
 * Partial sequences are kept in the UTF-8 converter taken from
 * pToUArgs->converter (toUBytes, toUnicodeStatus, toULength).
 * *pErrorCode is set to U_ILLEGAL_CHAR_FOUND for a rejected sequence,
 * U_BUFFER_OVERFLOW_ERROR when the target is full, and
 * U_USING_DEFAULT_WARNING where the comments say the standard converter should
 * take over.
 * NOTE(review): this copy is missing lines (the return type, several local
 * declarations, braces, labels/gotos, and the bodies of the inline fast
 * paths). The comments below describe only what is visible.
 */
765 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs
*pFromUArgs
,
766 UConverterToUnicodeArgs
*pToUArgs
,
767 UErrorCode
*pErrorCode
) {
769 const uint8_t *source
, *sourceLimit
;
771 int32_t targetCapacity
;
774 int8_t oldToULength
, toULength
, toULimit
;
779 /* set up the local pointers */
780 utf8
=pToUArgs
->converter
;
781 source
=(uint8_t *)pToUArgs
->source
;
782 sourceLimit
=(uint8_t *)pToUArgs
->sourceLimit
;
783 target
=(uint8_t *)pFromUArgs
->target
;
784 targetCapacity
=(int32_t)(pFromUArgs
->targetLimit
-pFromUArgs
->target
);
786 /* get the converter state from the UTF-8 UConverter */
787 c
=(UChar32
)utf8
->toUnicodeStatus
;
/* State of a partial character left over from a previous buffer. */
789 toULength
=oldToULength
=utf8
->toULength
;
790 toULimit
=(int8_t)utf8
->mode
;
/* Presumably the no-pending-character case (the condition and else are not
 * visible in this copy). */
792 toULength
=oldToULength
=toULimit
=0;
/* Source length including the bytes already consumed in a previous buffer. */
795 count
=(int32_t)(sourceLimit
-source
)+oldToULength
;
/* NOTE(review): the comment delimiters around the explanatory lines in this
 * part of the function are missing in this copy. */
798 * Not enough input to complete the partial character.
799 * Jump to moreBytes below - it will not output to target.
801 } else if(targetCapacity
<toULimit
) {
803 * Not enough target capacity to output the partial character.
804 * Let the standard converter handle this.
806 *pErrorCode
=U_USING_DEFAULT_WARNING
;
810 * Use a single counter for source and target, counting the minimum of
811 * the source length and the target capacity.
812 * As a result, the source length is checked only once per multi-byte
813 * character instead of twice.
815 * Make sure that the last byte sequence is complete, or else
816 * stop just before it.
817 * (The longest legal byte sequence has 3 trail bytes.)
818 * Count oldToULength (number of source bytes from a previous buffer)
819 * into the source length but reduce the source index by toULimit
820 * while going back over trail bytes in order to not go back into
821 * the bytes that will be read for finishing a partial
822 * sequence from the previous buffer.
823 * Let the standard converter handle edge cases.
/* Clamp the counter to the target capacity. */
827 if(count
>targetCapacity
) {
828 count
=targetCapacity
;
/* Look back over at most 3 bytes at the end of the counted range. */
832 while(i
<3 && i
<(count
-toULimit
)) {
833 b
=source
[count
-oldToULength
-i
-1];
837 if(i
<U8_COUNT_TRAIL_BYTES(b
)) {
838 /* stop converting before the lead byte if there are not enough trail bytes for it */
847 utf8
->toUnicodeStatus
=0;
850 /* See note in ucnv_SBCSFromUTF8() about this goto. */
853 /* conversion loop */
/* Inline fast paths: only their conditions are visible in this copy. */
863 if( /* handle U+1000..U+D7FF inline */
864 (t1
=source
[0]) >= 0x80 && ((b
<0xed && (t1
<= 0xbf)) ||
865 (b
==0xed && (t1
<= 0x9f))) &&
866 (t2
=source
[1]) >= 0x80 && t2
<= 0xbf
876 if( /* handle U+0080..U+07FF inline */
878 (t1
=*source
) >= 0x80 && t1
<= 0xbf
887 if( /* handle U+0800..U+0FFF inline */
888 (t1
=source
[0]) >= 0xa0 && t1
<= 0xbf &&
889 (t2
=source
[1]) >= 0x80 && t2
<= 0xbf
900 /* handle "complicated" and error cases, and continuing partial characters */
/* Total sequence length = number of trail bytes implied by the lead byte + 1. */
903 toULimit
=U8_COUNT_TRAIL_BYTES(b
)+1;
906 while(toULength
<toULimit
) {
907 if(source
<sourceLimit
) {
914 break; /* sequence too short, stop with toULength<toULimit */
917 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
/* Rewind to the first byte read in this call, then copy the new bytes into
 * toUBytes after the ones saved earlier. */
918 source
-=(toULength
-oldToULength
);
919 while(oldToULength
<toULength
) {
920 utf8
->toUBytes
[oldToULength
++]=*source
++;
922 utf8
->toUnicodeStatus
=c
;
923 utf8
->toULength
=toULength
;
925 pToUArgs
->source
=(char *)source
;
926 pFromUArgs
->target
=(char *)target
;
/* Legality test, BMP case: complete 2- or 3-byte sequence, not overlong,
 * not a surrogate. */
931 if( toULength
==toULimit
&& /* consumed all trail bytes */
932 (toULength
==3 || toULength
==2) && /* BMP */
933 (c
-=utf8_offsets
[toULength
])>=utf8_minLegal
[toULength
] &&
934 (c
<=0xd7ff || 0xe000<=c
) /* not a surrogate */
936 /* legal byte sequence for BMP code point */
/* Legality test, supplementary case: complete 4-byte sequence in
 * 0x10000..0x10ffff. */
938 toULength
==toULimit
&& toULength
==4 &&
939 (0x10000<=(c
-=utf8_offsets
[4]) && c
<=0x10ffff)
941 /* legal byte sequence for supplementary code point */
943 /* error handling: illegal UTF-8 byte sequence */
944 source
-=(toULength
-oldToULength
);
945 while(oldToULength
<toULength
) {
946 utf8
->toUBytes
[oldToULength
++]=*source
++;
948 utf8
->toULength
=toULength
;
949 pToUArgs
->source
=(char *)source
;
950 pFromUArgs
->target
=(char *)target
;
951 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
955 /* copy the legal byte sequence to the target */
/* First the bytes saved from a previous buffer, then the bytes read from the
 * current source (the body of the second loop is not visible in this copy). */
959 for(i
=0; i
<oldToULength
; ++i
) {
960 *target
++=utf8
->toUBytes
[i
];
962 source
-=(toULength
-oldToULength
);
963 for(; i
<toULength
; ++i
) {
/* After the main loop: input is left over, so either the target is full or
 * the remaining bytes form a truncated sequence. */
971 if(U_SUCCESS(*pErrorCode
) && source
<sourceLimit
) {
972 if(target
==(const uint8_t *)pFromUArgs
->targetLimit
) {
973 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
976 toULimit
=U8_COUNT_TRAIL_BYTES(b
)+1;
977 if(toULimit
>(sourceLimit
-source
)) {
978 /* collect a truncated byte sequence */
982 utf8
->toUBytes
[toULength
++]=b
;
983 if(++source
==sourceLimit
) {
984 /* partial byte sequence at end of source */
985 utf8
->toUnicodeStatus
=c
;
986 utf8
->toULength
=toULength
;
989 } else if(!U8_IS_TRAIL(b
=*source
)) {
990 /* lead byte in trail byte position */
991 utf8
->toULength
=toULength
;
992 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
998 /* partial-sequence target overflow: fall back to the pivoting implementation */
999 *pErrorCode
=U_USING_DEFAULT_WARNING
;
1004 /* write back the updated pointers */
1005 pToUArgs
->source
=(char *)source
;
1006 pFromUArgs
->target
=(char *)target
;
1009 /* UTF-8 converter data ----------------------------------------------------- */
/* Function table for the UTF-8 converter. The visible entries are the four
 * conversion functions defined in this file, ucnv_getNextUChar_UTF8, and
 * ucnv_getNonSurrogateUnicodeSet.
 * NOTE(review): the other initializer entries and the closing of the
 * initializer are not visible in this copy, so the slot each function occupies
 * cannot be confirmed from here. */
1011 static const UConverterImpl _UTF8Impl
={
1021 ucnv_toUnicode_UTF8
,
1022 ucnv_toUnicode_UTF8_OFFSETS_LOGIC
,
1023 ucnv_fromUnicode_UTF8
,
1024 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC
,
1025 ucnv_getNextUChar_UTF8
,
1031 ucnv_getNonSurrogateUnicodeSet
,
1037 /* CCSID 1208 refers to UTF-8 for any version of Unicode */
/* Static description of the UTF-8 converter.
 * NOTE(review): some initializer entries and the closing of the initializer
 * are not visible in this copy; field meanings below are taken from the
 * existing comments or marked as presumed. */
1038 static const UConverterStaticData _UTF8StaticData
={
1039 sizeof(UConverterStaticData
),
1041 1208, UCNV_IBM
, UCNV_UTF8
,
1042 1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
/* EF BF BD is the UTF-8 encoding of U+FFFD; presumably the substitution
 * sequence followed by its length (3) - confirm against the struct definition. */
1043 { 0xef, 0xbf, 0xbd, 0 },3,FALSE
,FALSE
,
1046 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/* Shared-data object for the UTF-8 converter, initialized from
 * _UTF8StaticData and _UTF8Impl. */
1050 const UConverterSharedData _UTF8Data
=
1051 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData
, &_UTF8Impl
);
1053 /* CESU-8 converter data ---------------------------------------------------- */
/* Function table for the CESU-8 converter. It reuses the same four conversion
 * functions as _UTF8Impl; ucnv_getNextUChar_UTF8 does not appear among the
 * visible entries, and the last visible entry is ucnv_getCompleteUnicodeSet.
 * NOTE(review): the other initializer entries and the closing of the
 * initializer are not visible in this copy. */
1055 static const UConverterImpl _CESU8Impl
={
1065 ucnv_toUnicode_UTF8
,
1066 ucnv_toUnicode_UTF8_OFFSETS_LOGIC
,
1067 ucnv_fromUnicode_UTF8
,
1068 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC
,
1075 ucnv_getCompleteUnicodeSet
/* Static description of the CESU-8 converter.
 * NOTE(review): some initializer entries and the closing of the initializer
 * are not visible in this copy. */
1078 static const UConverterStaticData _CESU8StaticData
={
1079 sizeof(UConverterStaticData
),
1081 9400, /* CCSID for CESU-8 */
1082 UCNV_UNKNOWN
, UCNV_CESU8
, 1, 3,
/* Same byte triple and length as in _UTF8StaticData (EF BF BD encodes U+FFFD);
 * presumably the substitution sequence - confirm against the struct definition. */
1083 { 0xef, 0xbf, 0xbd, 0 },3,FALSE
,FALSE
,
1086 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/* Shared-data object for the CESU-8 converter, initialized from
 * _CESU8StaticData and _CESU8Impl. hasCESU8Data() compares a converter's
 * sharedData pointer against the address of this object. */
1090 const UConverterSharedData _CESU8Data
=
1091 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData
, &_CESU8Impl
);