/*
**********************************************************************
*   Copyright (C) 2002-2012, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   tab size:   8 (not used)
*
*   created on: 2002jul01
*   created by: Markus W. Scherer
*
*   UTF-8 converter implementation. Used to be in ucnv_utf.c.
*
*   Also, CESU-8 implementation, see UTR 26.
*   The CESU-8 converter uses all the same functions as the
*   UTF-8 converter, with a branch for converting supplementary code points.
*/
21 #include "unicode/utypes.h"
23 #if !UCONFIG_NO_CONVERSION
25 #include "unicode/ucnv.h"
26 #include "unicode/utf.h"
27 #include "unicode/utf8.h"
28 #include "unicode/utf16.h"
33 /* Prototypes --------------------------------------------------------------- */
35 /* Keep these here to make finicky compilers happy */
37 U_CFUNC
void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs
*args
,
39 U_CFUNC
void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs
*args
,
/* UTF-8 -------------------------------------------------------------------- */

/* UTF-8 Conversion DATA
 *   for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
 */
/*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
#define MAXIMUM_UCS2            0x0000FFFF
#define MAXIMUM_UTF             0x0010FFFF
#define MAXIMUM_UCS4            0x7FFFFFFF
/*
 * A supplementary code point minus HALF_BASE is split into two 10-bit
 * halves: the upper half (value >> HALF_SHIFT) is added to
 * SURROGATE_HIGH_START, the lower half (value & HALF_MASK) to
 * SURROGATE_LOW_START.  HALF_SHIFT is therefore the bit width of HALF_MASK.
 */
#define HALF_SHIFT              10
#define HALF_BASE               0x0010000
#define HALF_MASK               0x3FF
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_HIGH_END      0xDBFF
#define SURROGATE_LOW_START     0xDC00
#define SURROGATE_LOW_END       0xDFFF

/* -SURROGATE_LOW_START + HALF_BASE */
#define SURROGATE_LOW_BASE      9216
/*
 * offsetsFromUTF8[n] is the constant that must be subtracted after an
 * n-byte sequence has been accumulated as ((lead << 6) + trail) << 6 ...,
 * i.e. the sum of the marker bits of the lead byte and of each trail byte.
 * Index 0 is used for illegal lead bytes and subtracts nothing.
 */
static const uint32_t offsetsFromUTF8[7] = {0,
  (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
  (uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080
};

/* END OF UTF-8 Conversion DATA */
/*
 * bytesFromUTF8[b] is the total length (lead byte included) of the sequence
 * introduced by byte b.  0 marks bytes that cannot start a sequence:
 * the trail-byte range 0x80..0xBF and 0xFE/0xFF.  Lengths 5 and 6 are kept
 * in the table so that such sequences are consumed as a unit; they are
 * rejected later through utf8_minChar32[].
 */
static const int8_t bytesFromUTF8[256] = {
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
/*
 * Starting with Unicode 3.0.1:
 * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
 * byte sequences with more than 4 bytes are illegal in UTF-8,
 * which is tested with impossible values for them
 * (0xffffffff is larger than any value that passes the <= MAXIMUM_UTF check).
 */
static const uint32_t
utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };
90 static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs
* args
,
93 UConverter
*cnv
= args
->converter
;
94 const unsigned char *mySource
= (unsigned char *) args
->source
;
95 UChar
*myTarget
= args
->target
;
96 const unsigned char *sourceLimit
= (unsigned char *) args
->sourceLimit
;
97 const UChar
*targetLimit
= args
->targetLimit
;
98 unsigned char *toUBytes
= cnv
->toUBytes
;
99 UBool isCESU8
= (UBool
)(cnv
->sharedData
== &_CESU8Data
);
100 uint32_t ch
, ch2
= 0;
103 /* Restore size of current sequence */
104 if (cnv
->toUnicodeStatus
&& myTarget
< targetLimit
)
106 inBytes
= cnv
->mode
; /* restore # of bytes to consume */
107 i
= cnv
->toULength
; /* restore # of bytes consumed */
110 ch
= cnv
->toUnicodeStatus
;/*Stores the previously calculated ch from a previous call*/
111 cnv
->toUnicodeStatus
= 0;
116 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
119 if (ch
< 0x80) /* Simple case */
121 *(myTarget
++) = (UChar
) ch
;
125 /* store the first char */
126 toUBytes
[0] = (char)ch
;
127 inBytes
= bytesFromUTF8
[ch
]; /* lookup current sequence length */
133 if (mySource
< sourceLimit
)
135 toUBytes
[i
] = (char) (ch2
= *mySource
);
136 if (!U8_IS_TRAIL(ch2
))
138 break; /* i < inBytes */
140 ch
= (ch
<< 6) + ch2
;
146 /* stores a partially calculated target*/
147 cnv
->toUnicodeStatus
= ch
;
149 cnv
->toULength
= (int8_t) i
;
154 /* Remove the accumulated high bits */
155 ch
-= offsetsFromUTF8
[inBytes
];
158 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
159 * - use only trail bytes after a lead byte (checked above)
160 * - use the right number of trail bytes for a given lead byte
161 * - encode a code point <= U+10ffff
162 * - use the fewest possible number of bytes for their code points
163 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
165 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
166 * There are no irregular sequences any more.
167 * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
169 if (i
== inBytes
&& ch
<= MAXIMUM_UTF
&& ch
>= utf8_minChar32
[i
] &&
170 (isCESU8
? i
<= 3 : !U_IS_SURROGATE(ch
)))
172 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
173 if (ch
<= MAXIMUM_UCS2
)
175 /* fits in 16 bits */
176 *(myTarget
++) = (UChar
) ch
;
180 /* write out the surrogates */
182 *(myTarget
++) = (UChar
) ((ch
>> HALF_SHIFT
) + SURROGATE_HIGH_START
);
183 ch
= (ch
& HALF_MASK
) + SURROGATE_LOW_START
;
184 if (myTarget
< targetLimit
)
186 *(myTarget
++) = (UChar
)ch
;
190 /* Put in overflow buffer (not handled here) */
191 cnv
->UCharErrorBuffer
[0] = (UChar
) ch
;
192 cnv
->UCharErrorBufferLength
= 1;
193 *err
= U_BUFFER_OVERFLOW_ERROR
;
200 cnv
->toULength
= (int8_t)i
;
201 *err
= U_ILLEGAL_CHAR_FOUND
;
208 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
210 /* End of target buffer */
211 *err
= U_BUFFER_OVERFLOW_ERROR
;
214 args
->target
= myTarget
;
215 args
->source
= (const char *) mySource
;
218 static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs
* args
,
221 UConverter
*cnv
= args
->converter
;
222 const unsigned char *mySource
= (unsigned char *) args
->source
;
223 UChar
*myTarget
= args
->target
;
224 int32_t *myOffsets
= args
->offsets
;
225 int32_t offsetNum
= 0;
226 const unsigned char *sourceLimit
= (unsigned char *) args
->sourceLimit
;
227 const UChar
*targetLimit
= args
->targetLimit
;
228 unsigned char *toUBytes
= cnv
->toUBytes
;
229 UBool isCESU8
= (UBool
)(cnv
->sharedData
== &_CESU8Data
);
230 uint32_t ch
, ch2
= 0;
233 /* Restore size of current sequence */
234 if (cnv
->toUnicodeStatus
&& myTarget
< targetLimit
)
236 inBytes
= cnv
->mode
; /* restore # of bytes to consume */
237 i
= cnv
->toULength
; /* restore # of bytes consumed */
240 ch
= cnv
->toUnicodeStatus
;/*Stores the previously calculated ch from a previous call*/
241 cnv
->toUnicodeStatus
= 0;
245 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
248 if (ch
< 0x80) /* Simple case */
250 *(myTarget
++) = (UChar
) ch
;
251 *(myOffsets
++) = offsetNum
++;
255 toUBytes
[0] = (char)ch
;
256 inBytes
= bytesFromUTF8
[ch
];
262 if (mySource
< sourceLimit
)
264 toUBytes
[i
] = (char) (ch2
= *mySource
);
265 if (!U8_IS_TRAIL(ch2
))
267 break; /* i < inBytes */
269 ch
= (ch
<< 6) + ch2
;
275 cnv
->toUnicodeStatus
= ch
;
277 cnv
->toULength
= (int8_t)i
;
282 /* Remove the accumulated high bits */
283 ch
-= offsetsFromUTF8
[inBytes
];
286 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
287 * - use only trail bytes after a lead byte (checked above)
288 * - use the right number of trail bytes for a given lead byte
289 * - encode a code point <= U+10ffff
290 * - use the fewest possible number of bytes for their code points
291 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
293 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
294 * There are no irregular sequences any more.
295 * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
297 if (i
== inBytes
&& ch
<= MAXIMUM_UTF
&& ch
>= utf8_minChar32
[i
] &&
298 (isCESU8
? i
<= 3 : !U_IS_SURROGATE(ch
)))
300 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
301 if (ch
<= MAXIMUM_UCS2
)
303 /* fits in 16 bits */
304 *(myTarget
++) = (UChar
) ch
;
305 *(myOffsets
++) = offsetNum
;
309 /* write out the surrogates */
311 *(myTarget
++) = (UChar
) ((ch
>> HALF_SHIFT
) + SURROGATE_HIGH_START
);
312 *(myOffsets
++) = offsetNum
;
313 ch
= (ch
& HALF_MASK
) + SURROGATE_LOW_START
;
314 if (myTarget
< targetLimit
)
316 *(myTarget
++) = (UChar
)ch
;
317 *(myOffsets
++) = offsetNum
;
321 cnv
->UCharErrorBuffer
[0] = (UChar
) ch
;
322 cnv
->UCharErrorBufferLength
= 1;
323 *err
= U_BUFFER_OVERFLOW_ERROR
;
330 cnv
->toULength
= (int8_t)i
;
331 *err
= U_ILLEGAL_CHAR_FOUND
;
338 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
339 { /* End of target buffer */
340 *err
= U_BUFFER_OVERFLOW_ERROR
;
343 args
->target
= myTarget
;
344 args
->source
= (const char *) mySource
;
345 args
->offsets
= myOffsets
;
348 U_CFUNC
void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs
* args
,
351 UConverter
*cnv
= args
->converter
;
352 const UChar
*mySource
= args
->source
;
353 const UChar
*sourceLimit
= args
->sourceLimit
;
354 uint8_t *myTarget
= (uint8_t *) args
->target
;
355 const uint8_t *targetLimit
= (uint8_t *) args
->targetLimit
;
359 int32_t indexToWrite
;
360 UBool isNotCESU8
= (UBool
)(cnv
->sharedData
!= &_CESU8Data
);
362 if (cnv
->fromUChar32
&& myTarget
< targetLimit
)
364 ch
= cnv
->fromUChar32
;
365 cnv
->fromUChar32
= 0;
369 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
373 if (ch
< 0x80) /* Single byte */
375 *(myTarget
++) = (uint8_t) ch
;
377 else if (ch
< 0x800) /* Double byte */
379 *(myTarget
++) = (uint8_t) ((ch
>> 6) | 0xc0);
380 if (myTarget
< targetLimit
)
382 *(myTarget
++) = (uint8_t) ((ch
& 0x3f) | 0x80);
386 cnv
->charErrorBuffer
[0] = (uint8_t) ((ch
& 0x3f) | 0x80);
387 cnv
->charErrorBufferLength
= 1;
388 *err
= U_BUFFER_OVERFLOW_ERROR
;
392 /* Check for surrogates */
393 if(U16_IS_SURROGATE(ch
) && isNotCESU8
) {
395 if (mySource
< sourceLimit
) {
396 /* test both code units */
397 if(U16_IS_SURROGATE_LEAD(ch
) && U16_IS_TRAIL(*mySource
)) {
398 /* convert and consume this supplementary code point */
399 ch
=U16_GET_SUPPLEMENTARY(ch
, *mySource
);
401 /* exit this condition tree */
404 /* this is an unpaired trail or lead code unit */
405 /* callback(illegal) */
406 cnv
->fromUChar32
= ch
;
407 *err
= U_ILLEGAL_CHAR_FOUND
;
413 cnv
->fromUChar32
= ch
;
418 /* Do we write the buffer directly for speed,
419 or do we have to be careful about target buffer space? */
420 tempPtr
= (((targetLimit
- myTarget
) >= 4) ? myTarget
: tempBuf
);
422 if (ch
<= MAXIMUM_UCS2
) {
424 tempPtr
[0] = (uint8_t) ((ch
>> 12) | 0xe0);
428 tempPtr
[0] = (uint8_t) ((ch
>> 18) | 0xf0);
429 tempPtr
[1] = (uint8_t) (((ch
>> 12) & 0x3f) | 0x80);
431 tempPtr
[indexToWrite
-1] = (uint8_t) (((ch
>> 6) & 0x3f) | 0x80);
432 tempPtr
[indexToWrite
] = (uint8_t) ((ch
& 0x3f) | 0x80);
434 if (tempPtr
== myTarget
) {
435 /* There was enough space to write the codepoint directly. */
436 myTarget
+= (indexToWrite
+ 1);
439 /* We might run out of room soon. Write it slowly. */
440 for (; tempPtr
<= (tempBuf
+ indexToWrite
); tempPtr
++) {
441 if (myTarget
< targetLimit
) {
442 *(myTarget
++) = *tempPtr
;
445 cnv
->charErrorBuffer
[cnv
->charErrorBufferLength
++] = *tempPtr
;
446 *err
= U_BUFFER_OVERFLOW_ERROR
;
453 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
455 *err
= U_BUFFER_OVERFLOW_ERROR
;
458 args
->target
= (char *) myTarget
;
459 args
->source
= mySource
;
462 U_CFUNC
void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs
* args
,
465 UConverter
*cnv
= args
->converter
;
466 const UChar
*mySource
= args
->source
;
467 int32_t *myOffsets
= args
->offsets
;
468 const UChar
*sourceLimit
= args
->sourceLimit
;
469 uint8_t *myTarget
= (uint8_t *) args
->target
;
470 const uint8_t *targetLimit
= (uint8_t *) args
->targetLimit
;
473 int32_t offsetNum
, nextSourceIndex
;
474 int32_t indexToWrite
;
476 UBool isNotCESU8
= (UBool
)(cnv
->sharedData
!= &_CESU8Data
);
478 if (cnv
->fromUChar32
&& myTarget
< targetLimit
)
480 ch
= cnv
->fromUChar32
;
481 cnv
->fromUChar32
= 0;
489 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
493 if (ch
< 0x80) /* Single byte */
495 *(myOffsets
++) = offsetNum
++;
496 *(myTarget
++) = (char) ch
;
498 else if (ch
< 0x800) /* Double byte */
500 *(myOffsets
++) = offsetNum
;
501 *(myTarget
++) = (uint8_t) ((ch
>> 6) | 0xc0);
502 if (myTarget
< targetLimit
)
504 *(myOffsets
++) = offsetNum
++;
505 *(myTarget
++) = (uint8_t) ((ch
& 0x3f) | 0x80);
509 cnv
->charErrorBuffer
[0] = (uint8_t) ((ch
& 0x3f) | 0x80);
510 cnv
->charErrorBufferLength
= 1;
511 *err
= U_BUFFER_OVERFLOW_ERROR
;
515 /* Check for surrogates */
517 nextSourceIndex
= offsetNum
+ 1;
519 if(U16_IS_SURROGATE(ch
) && isNotCESU8
) {
521 if (mySource
< sourceLimit
) {
522 /* test both code units */
523 if(U16_IS_SURROGATE_LEAD(ch
) && U16_IS_TRAIL(*mySource
)) {
524 /* convert and consume this supplementary code point */
525 ch
=U16_GET_SUPPLEMENTARY(ch
, *mySource
);
528 /* exit this condition tree */
531 /* this is an unpaired trail or lead code unit */
532 /* callback(illegal) */
533 cnv
->fromUChar32
= ch
;
534 *err
= U_ILLEGAL_CHAR_FOUND
;
540 cnv
->fromUChar32
= ch
;
545 /* Do we write the buffer directly for speed,
546 or do we have to be careful about target buffer space? */
547 tempPtr
= (((targetLimit
- myTarget
) >= 4) ? myTarget
: tempBuf
);
549 if (ch
<= MAXIMUM_UCS2
) {
551 tempPtr
[0] = (uint8_t) ((ch
>> 12) | 0xe0);
555 tempPtr
[0] = (uint8_t) ((ch
>> 18) | 0xf0);
556 tempPtr
[1] = (uint8_t) (((ch
>> 12) & 0x3f) | 0x80);
558 tempPtr
[indexToWrite
-1] = (uint8_t) (((ch
>> 6) & 0x3f) | 0x80);
559 tempPtr
[indexToWrite
] = (uint8_t) ((ch
& 0x3f) | 0x80);
561 if (tempPtr
== myTarget
) {
562 /* There was enough space to write the codepoint directly. */
563 myTarget
+= (indexToWrite
+ 1);
564 myOffsets
[0] = offsetNum
;
565 myOffsets
[1] = offsetNum
;
566 myOffsets
[2] = offsetNum
;
567 if (indexToWrite
>= 3) {
568 myOffsets
[3] = offsetNum
;
570 myOffsets
+= (indexToWrite
+ 1);
573 /* We might run out of room soon. Write it slowly. */
574 for (; tempPtr
<= (tempBuf
+ indexToWrite
); tempPtr
++) {
575 if (myTarget
< targetLimit
)
577 *(myOffsets
++) = offsetNum
;
578 *(myTarget
++) = *tempPtr
;
582 cnv
->charErrorBuffer
[cnv
->charErrorBufferLength
++] = *tempPtr
;
583 *err
= U_BUFFER_OVERFLOW_ERROR
;
587 offsetNum
= nextSourceIndex
;
591 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
593 *err
= U_BUFFER_OVERFLOW_ERROR
;
596 args
->target
= (char *) myTarget
;
597 args
->source
= mySource
;
598 args
->offsets
= myOffsets
;
601 static UChar32
ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs
*args
,
604 const uint8_t *sourceInitial
;
605 const uint8_t *source
;
606 uint16_t extraBytesToWrite
;
609 int8_t i
, isLegalSequence
;
611 /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
613 cnv
= args
->converter
;
614 sourceInitial
= source
= (const uint8_t *)args
->source
;
615 if (source
>= (const uint8_t *)args
->sourceLimit
)
618 *err
= U_INDEX_OUTOFBOUNDS_ERROR
;
622 myByte
= (uint8_t)*(source
++);
625 args
->source
= (const char *)source
;
626 return (UChar32
)myByte
;
629 extraBytesToWrite
= (uint16_t)bytesFromUTF8
[myByte
];
630 if (extraBytesToWrite
== 0) {
631 cnv
->toUBytes
[0] = myByte
;
633 *err
= U_ILLEGAL_CHAR_FOUND
;
634 args
->source
= (const char *)source
;
638 /*The byte sequence is longer than the buffer area passed*/
639 if (((const char *)source
+ extraBytesToWrite
- 1) > args
->sourceLimit
)
641 /* check if all of the remaining bytes are trail bytes */
642 cnv
->toUBytes
[0] = myByte
;
644 *err
= U_TRUNCATED_CHAR_FOUND
;
645 while(source
< (const uint8_t *)args
->sourceLimit
) {
646 if(U8_IS_TRAIL(myByte
= *source
)) {
647 cnv
->toUBytes
[i
++] = myByte
;
650 /* error even before we run out of input */
651 *err
= U_ILLEGAL_CHAR_FOUND
;
656 args
->source
= (const char *)source
;
662 switch(extraBytesToWrite
)
664 /* note: code falls through cases! (sic)*/
666 ch
+= (myByte
= *source
);
668 if (!U8_IS_TRAIL(myByte
))
674 case 5: /*fall through*/
675 ch
+= (myByte
= *source
);
677 if (!U8_IS_TRAIL(myByte
))
683 case 4: /*fall through*/
684 ch
+= (myByte
= *source
);
686 if (!U8_IS_TRAIL(myByte
))
692 case 3: /*fall through*/
693 ch
+= (myByte
= *source
);
695 if (!U8_IS_TRAIL(myByte
))
701 case 2: /*fall through*/
702 ch
+= (myByte
= *source
);
703 if (!U8_IS_TRAIL(myByte
))
710 ch
-= offsetsFromUTF8
[extraBytesToWrite
];
711 args
->source
= (const char *)source
;
714 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
715 * - use only trail bytes after a lead byte (checked above)
716 * - use the right number of trail bytes for a given lead byte
717 * - encode a code point <= U+10ffff
718 * - use the fewest possible number of bytes for their code points
719 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
721 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
722 * There are no irregular sequences any more.
724 if (isLegalSequence
&&
725 (uint32_t)ch
<= MAXIMUM_UTF
&&
726 (uint32_t)ch
>= utf8_minChar32
[extraBytesToWrite
] &&
729 return ch
; /* return the code point */
732 for(i
= 0; sourceInitial
< source
; ++i
) {
733 cnv
->toUBytes
[i
] = *sourceInitial
++;
736 *err
= U_ILLEGAL_CHAR_FOUND
;
740 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
742 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
744 utf8_minLegal
[5]={ 0, 0, 0x80, 0x800, 0x10000 };
746 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
748 utf8_offsets
[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
750 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
752 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs
*pFromUArgs
,
753 UConverterToUnicodeArgs
*pToUArgs
,
754 UErrorCode
*pErrorCode
) {
756 const uint8_t *source
, *sourceLimit
;
758 int32_t targetCapacity
;
761 int8_t oldToULength
, toULength
, toULimit
;
766 /* set up the local pointers */
767 utf8
=pToUArgs
->converter
;
768 source
=(uint8_t *)pToUArgs
->source
;
769 sourceLimit
=(uint8_t *)pToUArgs
->sourceLimit
;
770 target
=(uint8_t *)pFromUArgs
->target
;
771 targetCapacity
=(int32_t)(pFromUArgs
->targetLimit
-pFromUArgs
->target
);
773 /* get the converter state from the UTF-8 UConverter */
774 c
=(UChar32
)utf8
->toUnicodeStatus
;
776 toULength
=oldToULength
=utf8
->toULength
;
777 toULimit
=(int8_t)utf8
->mode
;
779 toULength
=oldToULength
=toULimit
=0;
782 count
=(int32_t)(sourceLimit
-source
)+oldToULength
;
785 * Not enough input to complete the partial character.
786 * Jump to moreBytes below - it will not output to target.
788 } else if(targetCapacity
<toULimit
) {
790 * Not enough target capacity to output the partial character.
791 * Let the standard converter handle this.
793 *pErrorCode
=U_USING_DEFAULT_WARNING
;
797 * Use a single counter for source and target, counting the minimum of
798 * the source length and the target capacity.
799 * As a result, the source length is checked only once per multi-byte
800 * character instead of twice.
802 * Make sure that the last byte sequence is complete, or else
803 * stop just before it.
804 * (The longest legal byte sequence has 3 trail bytes.)
805 * Count oldToULength (number of source bytes from a previous buffer)
806 * into the source length but reduce the source index by toULimit
807 * while going back over trail bytes in order to not go back into
808 * the bytes that will be read for finishing a partial
809 * sequence from the previous buffer.
810 * Let the standard converter handle edge cases.
814 if(count
>targetCapacity
) {
815 count
=targetCapacity
;
819 while(i
<3 && i
<(count
-toULimit
)) {
820 b
=source
[count
-oldToULength
-i
-1];
824 if(i
<utf8_countTrailBytes
[b
]) {
825 /* stop converting before the lead byte if there are not enough trail bytes for it */
834 utf8
->toUnicodeStatus
=0;
837 /* See note in ucnv_SBCSFromUTF8() about this goto. */
840 /* conversion loop */
850 if( /* handle U+1000..U+D7FF inline */
851 (t1
=source
[0]) >= 0x80 && ((b
<0xed && (t1
<= 0xbf)) ||
852 (b
==0xed && (t1
<= 0x9f))) &&
853 (t2
=source
[1]) >= 0x80 && t2
<= 0xbf
863 if( /* handle U+0080..U+07FF inline */
865 (t1
=*source
) >= 0x80 && t1
<= 0xbf
874 if( /* handle U+0800..U+0FFF inline */
875 (t1
=source
[0]) >= 0xa0 && t1
<= 0xbf &&
876 (t2
=source
[1]) >= 0x80 && t2
<= 0xbf
887 /* handle "complicated" and error cases, and continuing partial characters */
890 toULimit
=utf8_countTrailBytes
[b
]+1;
893 while(toULength
<toULimit
) {
894 if(source
<sourceLimit
) {
901 break; /* sequence too short, stop with toULength<toULimit */
904 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
905 source
-=(toULength
-oldToULength
);
906 while(oldToULength
<toULength
) {
907 utf8
->toUBytes
[oldToULength
++]=*source
++;
909 utf8
->toUnicodeStatus
=c
;
910 utf8
->toULength
=toULength
;
912 pToUArgs
->source
=(char *)source
;
913 pFromUArgs
->target
=(char *)target
;
918 if( toULength
==toULimit
&& /* consumed all trail bytes */
919 (toULength
==3 || toULength
==2) && /* BMP */
920 (c
-=utf8_offsets
[toULength
])>=utf8_minLegal
[toULength
] &&
921 (c
<=0xd7ff || 0xe000<=c
) /* not a surrogate */
923 /* legal byte sequence for BMP code point */
925 toULength
==toULimit
&& toULength
==4 &&
926 (0x10000<=(c
-=utf8_offsets
[4]) && c
<=0x10ffff)
928 /* legal byte sequence for supplementary code point */
930 /* error handling: illegal UTF-8 byte sequence */
931 source
-=(toULength
-oldToULength
);
932 while(oldToULength
<toULength
) {
933 utf8
->toUBytes
[oldToULength
++]=*source
++;
935 utf8
->toULength
=toULength
;
936 pToUArgs
->source
=(char *)source
;
937 pFromUArgs
->target
=(char *)target
;
938 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
942 /* copy the legal byte sequence to the target */
946 for(i
=0; i
<oldToULength
; ++i
) {
947 *target
++=utf8
->toUBytes
[i
];
949 source
-=(toULength
-oldToULength
);
950 for(; i
<toULength
; ++i
) {
958 if(U_SUCCESS(*pErrorCode
) && source
<sourceLimit
) {
959 if(target
==(const uint8_t *)pFromUArgs
->targetLimit
) {
960 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
963 toULimit
=utf8_countTrailBytes
[b
]+1;
964 if(toULimit
>(sourceLimit
-source
)) {
965 /* collect a truncated byte sequence */
969 utf8
->toUBytes
[toULength
++]=b
;
970 if(++source
==sourceLimit
) {
971 /* partial byte sequence at end of source */
972 utf8
->toUnicodeStatus
=c
;
973 utf8
->toULength
=toULength
;
976 } else if(!U8_IS_TRAIL(b
=*source
)) {
977 /* lead byte in trail byte position */
978 utf8
->toULength
=toULength
;
979 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
985 /* partial-sequence target overflow: fall back to the pivoting implementation */
986 *pErrorCode
=U_USING_DEFAULT_WARNING
;
991 /* write back the updated pointers */
992 pToUArgs
->source
=(char *)source
;
993 pFromUArgs
->target
=(char *)target
;
996 /* UTF-8 converter data ----------------------------------------------------- */
998 static const UConverterImpl _UTF8Impl
={
1008 ucnv_toUnicode_UTF8
,
1009 ucnv_toUnicode_UTF8_OFFSETS_LOGIC
,
1010 ucnv_fromUnicode_UTF8
,
1011 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC
,
1012 ucnv_getNextUChar_UTF8
,
1018 ucnv_getNonSurrogateUnicodeSet
,
1024 /* The 1208 CCSID refers to any version of Unicode of UTF-8 */
1025 static const UConverterStaticData _UTF8StaticData
={
1026 sizeof(UConverterStaticData
),
1028 1208, UCNV_IBM
, UCNV_UTF8
,
1029 1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
1030 { 0xef, 0xbf, 0xbd, 0 },3,FALSE
,FALSE
,
1033 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1037 const UConverterSharedData _UTF8Data
={
1038 sizeof(UConverterSharedData
), ~((uint32_t) 0),
1039 NULL
, NULL
, &_UTF8StaticData
, FALSE
, &_UTF8Impl
,
1043 /* CESU-8 converter data ---------------------------------------------------- */
1045 static const UConverterImpl _CESU8Impl
={
1055 ucnv_toUnicode_UTF8
,
1056 ucnv_toUnicode_UTF8_OFFSETS_LOGIC
,
1057 ucnv_fromUnicode_UTF8
,
1058 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC
,
1065 ucnv_getCompleteUnicodeSet
1068 static const UConverterStaticData _CESU8StaticData
={
1069 sizeof(UConverterStaticData
),
1071 9400, /* CCSID for CESU-8 */
1072 UCNV_UNKNOWN
, UCNV_CESU8
, 1, 3,
1073 { 0xef, 0xbf, 0xbd, 0 },3,FALSE
,FALSE
,
1076 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1080 const UConverterSharedData _CESU8Data
={
1081 sizeof(UConverterSharedData
), ~((uint32_t) 0),
1082 NULL
, NULL
, &_CESU8StaticData
, FALSE
, &_CESU8Impl
,