2 **********************************************************************
3 * Copyright (C) 2002-2007, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
8 * tab size: 8 (not used)
11 * created on: 2002jul01
12 * created by: Markus W. Scherer
14 * UTF-8 converter implementation. Used to be in ucnv_utf.c.
16 * Also, CESU-8 implementation, see UTR 26.
17 * The CESU-8 converter uses all the same functions as the
18 * UTF-8 converter, with a branch for converting supplementary code points.
21 #include "unicode/utypes.h"
23 #if !UCONFIG_NO_CONVERSION
25 #include "unicode/ucnv.h"
30 /* Prototypes --------------------------------------------------------------- */
32 /* Keep these here to make finicky compilers happy */
/*
 * Forward declarations of the two U_CFUNC from-Unicode entry points that are
 * defined further down in this file.
 * NOTE(review): the remainder of each parameter list is not visible in this
 * listing - confirm against the full source.
 */
34 U_CFUNC
void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs
*args
,
36 U_CFUNC
void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs
*args
,
40 /* UTF-8 -------------------------------------------------------------------- */
42 /* UTF-8 Conversion DATA
43 * for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
45 /*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
/* Upper bounds used by the range checks in the converters below. */
/* MAXIMUM_UCS2: largest value that fits in one 16-bit UChar. */
46 #define MAXIMUM_UCS2 0x0000FFFF
/* MAXIMUM_UTF: largest code point accepted as legal (U+10FFFF). */
47 #define MAXIMUM_UTF 0x0010FFFF
48 #define MAXIMUM_UCS4 0x7FFFFFFF
/*
 * Constants for splitting a supplementary code point into a surrogate pair.
 * NOTE(review): HALF_SHIFT is used by the to-Unicode functions but its
 * definition is not visible in this listing.
 */
50 #define HALF_BASE 0x0010000
51 #define HALF_MASK 0x3FF
/* Surrogate code unit ranges: high (lead) D800..DBFF, low (trail) DC00..DFFF. */
52 #define SURROGATE_HIGH_START 0xD800
53 #define SURROGATE_HIGH_END 0xDBFF
54 #define SURROGATE_LOW_START 0xDC00
55 #define SURROGATE_LOW_END 0xDFFF
/* 9216 == 0x10000 - 0xDC00, i.e. the value of the expression in the comment below. */
57 /* -SURROGATE_LOW_START + HALF_BASE */
58 #define SURROGATE_LOW_BASE 9216
/*
 * Per-sequence-length correction values, indexed by the number of bytes in
 * the sequence. The decoders accumulate ch = (ch << 6) + trail without
 * masking the lead/trail marker bits and then subtract offsetsFromUTF8[n]
 * to remove those accumulated high bits (e.g. 0x3080 == (0xC0 << 6) + 0x80
 * for two-byte sequences).
 */
60 static const uint32_t offsetsFromUTF8
[7] = {0,
61 (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
62 (uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080
65 /* END OF UTF-8 Conversion DATA */
/*
 * Sequence length looked up by first byte:
 *   0x00..0x7F -> 1, 0x80..0xBF -> 0 (trail bytes cannot start a sequence),
 *   0xC0..0xDF -> 2, 0xE0..0xEF -> 3, 0xF0..0xF7 -> 4,
 *   0xF8..0xFB -> 5, 0xFC..0xFD -> 6, 0xFE..0xFF -> 0.
 * Lengths 5 and 6 are still listed here; the converters reject them later
 * through the utf8_minChar32[] range test.
 */
67 static const int8_t bytesFromUTF8
[256] = {
68 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
69 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
70 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
71 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
72 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
73 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
74 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
75 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
79 * Starting with Unicode 3.0.1:
80 * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
81 * byte sequences with more than 4 bytes are illegal in UTF-8,
82 * which is tested with impossible values for them
/*
 * Smallest code point that an N-byte sequence may encode, indexed by N.
 * Entries 5 and 6 are 0xffffffff so that the "ch >= utf8_minChar32[n]" test
 * combined with "ch <= MAXIMUM_UTF" can never succeed for 5- and 6-byte forms.
 */
85 utf8_minChar32
[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };
/*
 * Converts UTF-8 (or CESU-8) bytes from args->source into UChars at
 * args->target, without producing offsets.
 *
 * Visible behaviour:
 *  - A sequence left incomplete by a previous call is resumed from
 *    cnv->toUnicodeStatus (accumulated value), cnv->mode (bytes to consume)
 *    and cnv->toULength (bytes consumed).
 *  - Bytes below 0x80 are copied straight to the target.
 *  - For other lead bytes the length comes from bytesFromUTF8[], the bytes
 *    are recorded in cnv->toUBytes, trail bytes are accumulated into ch and
 *    offsetsFromUTF8[] is subtracted afterwards.
 *  - The result is range-checked, then written as one UChar or as a
 *    surrogate pair; a trail surrogate that does not fit goes into
 *    cnv->UCharErrorBuffer with U_BUFFER_OVERFLOW_ERROR.
 *  - Sequences that fail the check set U_ILLEGAL_CHAR_FOUND.
 *  - args->target and args->source are written back on exit.
 *
 * NOTE(review): this listing has lost lines (the err parameter, several
 * local declarations, braces, the loop that advances i/mySource); the
 * comments describe only the statements that are visible.
 */
87 static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs
* args
,
90 UConverter
*cnv
= args
->converter
;
91 const unsigned char *mySource
= (unsigned char *) args
->source
;
92 UChar
*myTarget
= args
->target
;
93 const unsigned char *sourceLimit
= (unsigned char *) args
->sourceLimit
;
94 const UChar
*targetLimit
= args
->targetLimit
;
95 unsigned char *toUBytes
= cnv
->toUBytes
;
/* The same function serves both converters; CESU-8 is detected by its shared data. */
96 UBool isCESU8
= (UBool
)(cnv
->sharedData
== &_CESU8Data
);
100 /* Restore size of current sequence */
101 if (cnv
->toUnicodeStatus
&& myTarget
< targetLimit
)
103 inBytes
= cnv
->mode
; /* restore # of bytes to consume */
104 i
= cnv
->toULength
; /* restore # of bytes consumed */
107 ch
= cnv
->toUnicodeStatus
;/*Stores the previously calculated ch from a previous call*/
108 cnv
->toUnicodeStatus
= 0;
/* Main loop: continues while input remains and the target has room. */
113 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
116 if (ch
< 0x80) /* Simple case */
118 *(myTarget
++) = (UChar
) ch
;
122 /* store the first char */
123 toUBytes
[0] = (char)ch
;
124 inBytes
= bytesFromUTF8
[ch
]; /* lookup current sequence length */
/* Collect the next byte; a byte that is not a trail byte ends the sequence early. */
130 if (mySource
< sourceLimit
)
132 toUBytes
[i
] = (char) (ch2
= *mySource
);
133 if (!UTF8_IS_TRAIL(ch2
))
135 break; /* i < inBytes */
137 ch
= (ch
<< 6) + ch2
;
143 /* stores a partially calculated target*/
144 cnv
->toUnicodeStatus
= ch
;
146 cnv
->toULength
= (int8_t) i
;
151 /* Remove the accumulated high bits */
152 ch
-= offsetsFromUTF8
[inBytes
];
155 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
156 * - use only trail bytes after a lead byte (checked above)
157 * - use the right number of trail bytes for a given lead byte
158 * - encode a code point <= U+10ffff
159 * - use the fewest possible number of bytes for their code points
160 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
162 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
163 * There are no irregular sequences any more.
164 * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
/*
 * Validity test: complete sequence, in range, shortest form; CESU-8 allows
 * at most 3 bytes, plain UTF-8 rejects surrogate code points.
 */
166 if (i
== inBytes
&& ch
<= MAXIMUM_UTF
&& ch
>= utf8_minChar32
[i
] &&
167 (isCESU8
? i
<= 3 : !UTF_IS_SURROGATE(ch
)))
169 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
170 if (ch
<= MAXIMUM_UCS2
)
172 /* fits in 16 bits */
173 *(myTarget
++) = (UChar
) ch
;
177 /* write out the surrogates */
179 *(myTarget
++) = (UChar
) ((ch
>> HALF_SHIFT
) + SURROGATE_HIGH_START
);
180 ch
= (ch
& HALF_MASK
) + SURROGATE_LOW_START
;
181 if (myTarget
< targetLimit
)
183 *(myTarget
++) = (UChar
)ch
;
187 /* Put in overflow buffer (not handled here) */
188 cnv
->UCharErrorBuffer
[0] = (UChar
) ch
;
189 cnv
->UCharErrorBufferLength
= 1;
190 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Invalid sequence: record how many bytes were stored in toUBytes and report it. */
197 cnv
->toULength
= (int8_t)i
;
198 *err
= U_ILLEGAL_CHAR_FOUND
;
/* Input left over with a full target and no earlier error is reported as overflow. */
205 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
207 /* End of target buffer */
208 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Publish the advanced pointers back to the caller. */
211 args
->target
= myTarget
;
212 args
->source
= (const char *) mySource
;
/*
 * Offset-producing variant of the UTF-8 / CESU-8 to-Unicode conversion.
 *
 * It follows the same visible steps as ucnv_toUnicode_UTF8 (resume partial
 * state, copy bytes below 0x80, accumulate trail bytes, subtract
 * offsetsFromUTF8[], validate, write one UChar or a surrogate pair) and in
 * addition stores one entry in args->offsets for every UChar written to the
 * target. Both halves of a surrogate pair receive the same offsetNum value.
 * args->target, args->source and args->offsets are written back on exit.
 *
 * NOTE(review): this listing has lost lines (the err parameter, declarations
 * of i/inBytes, braces, and the statements that advance offsetNum for
 * multi-byte sequences); the comments describe only what is visible.
 */
215 static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs
* args
,
218 UConverter
*cnv
= args
->converter
;
219 const unsigned char *mySource
= (unsigned char *) args
->source
;
220 UChar
*myTarget
= args
->target
;
221 int32_t *myOffsets
= args
->offsets
;
/* offsetNum is the running value stored into the offsets array; it starts at 0. */
222 int32_t offsetNum
= 0;
223 const unsigned char *sourceLimit
= (unsigned char *) args
->sourceLimit
;
224 const UChar
*targetLimit
= args
->targetLimit
;
225 unsigned char *toUBytes
= cnv
->toUBytes
;
226 UBool isCESU8
= (UBool
)(cnv
->sharedData
== &_CESU8Data
);
227 uint32_t ch
, ch2
= 0;
230 /* Restore size of current sequence */
231 if (cnv
->toUnicodeStatus
&& myTarget
< targetLimit
)
233 inBytes
= cnv
->mode
; /* restore # of bytes to consume */
234 i
= cnv
->toULength
; /* restore # of bytes consumed */
237 ch
= cnv
->toUnicodeStatus
;/*Stores the previously calculated ch from a previous call*/
238 cnv
->toUnicodeStatus
= 0;
/* Main loop: continues while input remains and the target has room. */
242 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
245 if (ch
< 0x80) /* Simple case */
/* One byte in, one UChar out: store the offset and advance it by one. */
247 *(myTarget
++) = (UChar
) ch
;
248 *(myOffsets
++) = offsetNum
++;
/* Remember the lead byte and look up the sequence length it announces. */
252 toUBytes
[0] = (char)ch
;
253 inBytes
= bytesFromUTF8
[ch
];
259 if (mySource
< sourceLimit
)
261 toUBytes
[i
] = (char) (ch2
= *mySource
);
262 if (!UTF8_IS_TRAIL(ch2
))
264 break; /* i < inBytes */
266 ch
= (ch
<< 6) + ch2
;
/* Save the partially accumulated value and byte count on the converter. */
272 cnv
->toUnicodeStatus
= ch
;
274 cnv
->toULength
= (int8_t)i
;
279 /* Remove the accumulated high bits */
280 ch
-= offsetsFromUTF8
[inBytes
];
283 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
284 * - use only trail bytes after a lead byte (checked above)
285 * - use the right number of trail bytes for a given lead byte
286 * - encode a code point <= U+10ffff
287 * - use the fewest possible number of bytes for their code points
288 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
290 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
291 * There are no irregular sequences any more.
292 * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
/*
 * Validity test: complete sequence, in range, shortest form; CESU-8 allows
 * at most 3 bytes, plain UTF-8 rejects surrogate code points.
 */
294 if (i
== inBytes
&& ch
<= MAXIMUM_UTF
&& ch
>= utf8_minChar32
[i
] &&
295 (isCESU8
? i
<= 3 : !UTF_IS_SURROGATE(ch
)))
297 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
298 if (ch
<= MAXIMUM_UCS2
)
300 /* fits in 16 bits */
301 *(myTarget
++) = (UChar
) ch
;
302 *(myOffsets
++) = offsetNum
;
306 /* write out the surrogates */
308 *(myTarget
++) = (UChar
) ((ch
>> HALF_SHIFT
) + SURROGATE_HIGH_START
);
309 *(myOffsets
++) = offsetNum
;
310 ch
= (ch
& HALF_MASK
) + SURROGATE_LOW_START
;
311 if (myTarget
< targetLimit
)
313 *(myTarget
++) = (UChar
)ch
;
314 *(myOffsets
++) = offsetNum
;
/* No room for the trail surrogate: park it in the overflow buffer (no offset is stored for it here). */
318 cnv
->UCharErrorBuffer
[0] = (UChar
) ch
;
319 cnv
->UCharErrorBufferLength
= 1;
320 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Invalid sequence: record how many bytes were stored in toUBytes and report it. */
327 cnv
->toULength
= (int8_t)i
;
328 *err
= U_ILLEGAL_CHAR_FOUND
;
335 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
336 { /* End of target buffer */
337 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Publish the advanced pointers, including the offsets cursor, back to the caller. */
340 args
->target
= myTarget
;
341 args
->source
= (const char *) mySource
;
342 args
->offsets
= myOffsets
;
/*
 * Converts UTF-16 code units from args->source into UTF-8 (or CESU-8) bytes
 * at args->target, without producing offsets.
 *
 * Visible behaviour:
 *  - A code unit saved in cnv->fromUChar32 by an earlier call is picked up
 *    first and the field is cleared.
 *  - Values below 0x80 produce one byte, values below 0x800 two bytes; when
 *    the second byte does not fit it is placed in cnv->charErrorBuffer and
 *    U_BUFFER_OVERFLOW_ERROR is set.
 *  - For plain UTF-8 (isNotCESU8) a lead surrogate followed by a trail
 *    surrogate is combined into one supplementary code point; an unpaired
 *    surrogate is stored in cnv->fromUChar32 and U_ILLEGAL_CHAR_FOUND is set.
 *    In CESU-8 mode surrogates skip this branch and are encoded like any
 *    other 16-bit value.
 *  - Three- and four-byte forms are built through tempPtr, which points at
 *    the target when at least 4 bytes are free and at tempBuf otherwise; in
 *    the latter case bytes are copied one by one and any that do not fit
 *    are appended to cnv->charErrorBuffer.
 *  - args->target and args->source are written back on exit.
 *
 * NOTE(review): this listing has lost lines (the err parameter, declarations
 * of ch/tempPtr/tempBuf, braces, the assignments to indexToWrite, the goto
 * targets); the comments describe only what is visible.
 */
345 U_CFUNC
void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs
* args
,
348 UConverter
*cnv
= args
->converter
;
349 const UChar
*mySource
= args
->source
;
350 const UChar
*sourceLimit
= args
->sourceLimit
;
351 uint8_t *myTarget
= (uint8_t *) args
->target
;
352 const uint8_t *targetLimit
= (uint8_t *) args
->targetLimit
;
356 int32_t indexToWrite
;
/* True for the plain UTF-8 converter, false when running as CESU-8. */
357 UBool isNotCESU8
= (UBool
)(cnv
->sharedData
!= &_CESU8Data
);
/* Resume with the code unit left pending by a previous call, if any. */
359 if (cnv
->fromUChar32
&& myTarget
< targetLimit
)
361 ch
= cnv
->fromUChar32
;
362 cnv
->fromUChar32
= 0;
366 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
370 if (ch
< 0x80) /* Single byte */
372 *(myTarget
++) = (uint8_t) ch
;
374 else if (ch
< 0x800) /* Double byte */
376 *(myTarget
++) = (uint8_t) ((ch
>> 6) | 0xc0);
377 if (myTarget
< targetLimit
)
379 *(myTarget
++) = (uint8_t) ((ch
& 0x3f) | 0x80);
/* Second byte does not fit: keep it in the converter's overflow buffer. */
383 cnv
->charErrorBuffer
[0] = (uint8_t) ((ch
& 0x3f) | 0x80);
384 cnv
->charErrorBufferLength
= 1;
385 *err
= U_BUFFER_OVERFLOW_ERROR
;
389 /* Check for surrogates */
390 if(UTF_IS_SURROGATE(ch
) && isNotCESU8
) {
392 if (mySource
< sourceLimit
) {
393 /* test both code units */
394 if(UTF_IS_SURROGATE_FIRST(ch
) && UTF_IS_SECOND_SURROGATE(*mySource
)) {
395 /* convert and consume this supplementary code point */
396 ch
=UTF16_GET_PAIR_VALUE(ch
, *mySource
);
398 /* exit this condition tree */
401 /* this is an unpaired trail or lead code unit */
402 /* callback(illegal) */
403 cnv
->fromUChar32
= ch
;
404 *err
= U_ILLEGAL_CHAR_FOUND
;
/* NOTE(review): presumably the no-more-input case - the surrogate is saved for the next call; confirm against the full source. */
410 cnv
->fromUChar32
= ch
;
415 /* Do we write the buffer directly for speed,
416 or do we have to be careful about target buffer space? */
417 tempPtr
= (((targetLimit
- myTarget
) >= 4) ? myTarget
: tempBuf
);
/* Values up to 0xFFFF get a 0xE0-based lead byte; larger values a 0xF0-based lead byte plus an extra trail byte. */
419 if (ch
<= MAXIMUM_UCS2
) {
421 tempPtr
[0] = (uint8_t) ((ch
>> 12) | 0xe0);
425 tempPtr
[0] = (uint8_t) ((ch
>> 18) | 0xf0);
426 tempPtr
[1] = (uint8_t) (((ch
>> 12) & 0x3f) | 0x80);
/* The last two trail bytes are common to both forms; indexToWrite is the index of the final byte. */
428 tempPtr
[indexToWrite
-1] = (uint8_t) (((ch
>> 6) & 0x3f) | 0x80);
429 tempPtr
[indexToWrite
] = (uint8_t) ((ch
& 0x3f) | 0x80);
431 if (tempPtr
== myTarget
) {
432 /* There was enough space to write the codepoint directly. */
433 myTarget
+= (indexToWrite
+ 1);
436 /* We might run out of room soon. Write it slowly. */
437 for (; tempPtr
<= (tempBuf
+ indexToWrite
); tempPtr
++) {
438 if (myTarget
< targetLimit
) {
439 *(myTarget
++) = *tempPtr
;
/* Bytes that no longer fit are appended to the overflow buffer. */
442 cnv
->charErrorBuffer
[cnv
->charErrorBufferLength
++] = *tempPtr
;
443 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Input left over with a full target and no earlier error is reported as overflow. */
450 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
452 *err
= U_BUFFER_OVERFLOW_ERROR
;
455 args
->target
= (char *) myTarget
;
456 args
->source
= mySource
;
/*
 * Offset-producing variant of the UTF-16 to UTF-8 / CESU-8 conversion.
 *
 * It follows the same visible steps as ucnv_fromUnicode_UTF8 (pick up a
 * pending code unit from cnv->fromUChar32, emit 1/2/3/4-byte forms, combine
 * surrogate pairs unless running as CESU-8, spill bytes that do not fit
 * into cnv->charErrorBuffer) and additionally stores one entry in
 * args->offsets for every byte written to the target. All bytes of one
 * character receive the same offsetNum; nextSourceIndex holds the value
 * offsetNum takes after a 3- or 4-byte character.
 * args->target, args->source and args->offsets are written back on exit.
 *
 * NOTE(review): this listing has lost lines (the err parameter, declarations
 * of ch/tempPtr/tempBuf, braces, the initialisation of offsetNum, the
 * assignments to indexToWrite, and presumably an update of nextSourceIndex
 * for surrogate pairs); the comments describe only what is visible.
 */
459 U_CFUNC
void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs
* args
,
462 UConverter
*cnv
= args
->converter
;
463 const UChar
*mySource
= args
->source
;
464 int32_t *myOffsets
= args
->offsets
;
465 const UChar
*sourceLimit
= args
->sourceLimit
;
466 uint8_t *myTarget
= (uint8_t *) args
->target
;
467 const uint8_t *targetLimit
= (uint8_t *) args
->targetLimit
;
470 int32_t offsetNum
, nextSourceIndex
;
471 int32_t indexToWrite
;
/* True for the plain UTF-8 converter, false when running as CESU-8. */
473 UBool isNotCESU8
= (UBool
)(cnv
->sharedData
!= &_CESU8Data
);
/* Resume with the code unit left pending by a previous call, if any. */
475 if (cnv
->fromUChar32
&& myTarget
< targetLimit
)
477 ch
= cnv
->fromUChar32
;
478 cnv
->fromUChar32
= 0;
486 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
490 if (ch
< 0x80) /* Single byte */
492 *(myOffsets
++) = offsetNum
++;
493 *(myTarget
++) = (char) ch
;
495 else if (ch
< 0x800) /* Double byte */
/* Both bytes get the same offset; offsetNum advances only after the second one is written. */
497 *(myOffsets
++) = offsetNum
;
498 *(myTarget
++) = (uint8_t) ((ch
>> 6) | 0xc0);
499 if (myTarget
< targetLimit
)
501 *(myOffsets
++) = offsetNum
++;
502 *(myTarget
++) = (uint8_t) ((ch
& 0x3f) | 0x80);
/* Second byte does not fit: keep it in the converter's overflow buffer. */
506 cnv
->charErrorBuffer
[0] = (uint8_t) ((ch
& 0x3f) | 0x80);
507 cnv
->charErrorBufferLength
= 1;
508 *err
= U_BUFFER_OVERFLOW_ERROR
;
512 /* Check for surrogates */
/* Offset to use for the character after this one, assuming a single code unit was consumed. */
514 nextSourceIndex
= offsetNum
+ 1;
516 if(UTF_IS_SURROGATE(ch
) && isNotCESU8
) {
518 if (mySource
< sourceLimit
) {
519 /* test both code units */
520 if(UTF_IS_SURROGATE_FIRST(ch
) && UTF_IS_SECOND_SURROGATE(*mySource
)) {
521 /* convert and consume this supplementary code point */
522 ch
=UTF16_GET_PAIR_VALUE(ch
, *mySource
);
525 /* exit this condition tree */
528 /* this is an unpaired trail or lead code unit */
529 /* callback(illegal) */
530 cnv
->fromUChar32
= ch
;
531 *err
= U_ILLEGAL_CHAR_FOUND
;
/* NOTE(review): presumably the no-more-input case - the surrogate is saved for the next call; confirm against the full source. */
537 cnv
->fromUChar32
= ch
;
542 /* Do we write the buffer directly for speed,
543 or do we have to be careful about target buffer space? */
544 tempPtr
= (((targetLimit
- myTarget
) >= 4) ? myTarget
: tempBuf
);
/* Values up to 0xFFFF get a 0xE0-based lead byte; larger values a 0xF0-based lead byte plus an extra trail byte. */
546 if (ch
<= MAXIMUM_UCS2
) {
548 tempPtr
[0] = (uint8_t) ((ch
>> 12) | 0xe0);
552 tempPtr
[0] = (uint8_t) ((ch
>> 18) | 0xf0);
553 tempPtr
[1] = (uint8_t) (((ch
>> 12) & 0x3f) | 0x80);
555 tempPtr
[indexToWrite
-1] = (uint8_t) (((ch
>> 6) & 0x3f) | 0x80);
556 tempPtr
[indexToWrite
] = (uint8_t) ((ch
& 0x3f) | 0x80);
558 if (tempPtr
== myTarget
) {
559 /* There was enough space to write the codepoint directly. */
560 myTarget
+= (indexToWrite
+ 1);
/* Three offsets are always stored; the fourth only when a four-byte form was written. */
561 myOffsets
[0] = offsetNum
;
562 myOffsets
[1] = offsetNum
;
563 myOffsets
[2] = offsetNum
;
564 if (indexToWrite
>= 3) {
565 myOffsets
[3] = offsetNum
;
567 myOffsets
+= (indexToWrite
+ 1);
570 /* We might run out of room soon. Write it slowly. */
571 for (; tempPtr
<= (tempBuf
+ indexToWrite
); tempPtr
++) {
572 if (myTarget
< targetLimit
)
574 *(myOffsets
++) = offsetNum
;
575 *(myTarget
++) = *tempPtr
;
/* Bytes that no longer fit are appended to the overflow buffer; no offset is stored for them. */
579 cnv
->charErrorBuffer
[cnv
->charErrorBufferLength
++] = *tempPtr
;
580 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Move on to the offset of the next source character. */
584 offsetNum
= nextSourceIndex
;
588 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
590 *err
= U_BUFFER_OVERFLOW_ERROR
;
593 args
->target
= (char *) myTarget
;
594 args
->source
= mySource
;
595 args
->offsets
= myOffsets
;
/*
 * Reads one code point from args->source and returns it as a UChar32.
 *
 * Visible behaviour:
 *  - Empty input sets U_INDEX_OUTOFBOUNDS_ERROR.
 *  - A single-byte character is returned directly after args->source has
 *    been advanced past it.
 *  - A first byte whose bytesFromUTF8[] entry is 0 is stored in
 *    cnv->toUBytes[0] and reported as U_ILLEGAL_CHAR_FOUND.
 *  - If the announced sequence extends past args->sourceLimit, the bytes
 *    that are present are copied into cnv->toUBytes while they are trail
 *    bytes; the result is U_TRUNCATED_CHAR_FOUND, or U_ILLEGAL_CHAR_FOUND
 *    as soon as a non-trail byte is met.
 *  - Otherwise the trail bytes are accumulated in a fall-through switch,
 *    offsetsFromUTF8[] is subtracted, and the value is returned when it
 *    passes the range checks; if not, the consumed bytes are copied into
 *    cnv->toUBytes and U_ILLEGAL_CHAR_FOUND is set.
 *
 * NOTE(review): this listing has lost lines (the err parameter, declarations
 * of cnv/ch/myByte, the case labels, shifts and isLegalSequence updates,
 * return statements on the error paths, and the last term of the validity
 * condition); the comments describe only what is visible.
 */
598 static UChar32
ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs
*args
,
601 const uint8_t *sourceInitial
;
602 const uint8_t *source
;
603 uint16_t extraBytesToWrite
;
606 int8_t i
, isLegalSequence
;
608 /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
610 cnv
= args
->converter
;
/* sourceInitial remembers where this character started so its bytes can be copied on error. */
611 sourceInitial
= source
= (const uint8_t *)args
->source
;
612 if (source
>= (const uint8_t *)args
->sourceLimit
)
615 *err
= U_INDEX_OUTOFBOUNDS_ERROR
;
619 myByte
= (uint8_t)*(source
++);
622 args
->source
= (const char *)source
;
623 return (UChar32
)myByte
;
/* Despite its name, extraBytesToWrite holds the bytesFromUTF8[] value for the lead byte (0 means no sequence can start with it). */
626 extraBytesToWrite
= (uint16_t)bytesFromUTF8
[myByte
];
627 if (extraBytesToWrite
== 0) {
628 cnv
->toUBytes
[0] = myByte
;
630 *err
= U_ILLEGAL_CHAR_FOUND
;
631 args
->source
= (const char *)source
;
635 /*The byte sequence is longer than the buffer area passed*/
636 if (((const char *)source
+ extraBytesToWrite
- 1) > args
->sourceLimit
)
638 /* check if all of the remaining bytes are trail bytes */
639 cnv
->toUBytes
[0] = myByte
;
/* Assume truncation first; this is downgraded to "illegal" below if a non-trail byte turns up. */
641 *err
= U_TRUNCATED_CHAR_FOUND
;
642 while(source
< (const uint8_t *)args
->sourceLimit
) {
643 if(U8_IS_TRAIL(myByte
= *source
)) {
644 cnv
->toUBytes
[i
++] = myByte
;
647 /* error even before we run out of input */
648 *err
= U_ILLEGAL_CHAR_FOUND
;
653 args
->source
= (const char *)source
;
659 switch(extraBytesToWrite
)
661 /* note: code falls through cases! (sic)*/
663 ch
+= (myByte
= *source
);
665 if (!UTF8_IS_TRAIL(myByte
))
672 ch
+= (myByte
= *source
);
674 if (!UTF8_IS_TRAIL(myByte
))
681 ch
+= (myByte
= *source
);
683 if (!UTF8_IS_TRAIL(myByte
))
690 ch
+= (myByte
= *source
);
692 if (!UTF8_IS_TRAIL(myByte
))
699 ch
+= (myByte
= *source
);
700 if (!UTF8_IS_TRAIL(myByte
))
/* Remove the lead/trail marker bits that were accumulated along with the payload. */
707 ch
-= offsetsFromUTF8
[extraBytesToWrite
];
708 args
->source
= (const char *)source
;
711 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
712 * - use only trail bytes after a lead byte (checked above)
713 * - use the right number of trail bytes for a given lead byte
714 * - encode a code point <= U+10ffff
715 * - use the fewest possible number of bytes for their code points
716 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
718 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
719 * There are no irregular sequences any more.
721 if (isLegalSequence
&&
722 (uint32_t)ch
<= MAXIMUM_UTF
&&
723 (uint32_t)ch
>= utf8_minChar32
[extraBytesToWrite
] &&
726 return ch
; /* return the code point */
/* Failure path: hand the bytes consumed for this character to the error machinery via toUBytes. */
729 for(i
= 0; sourceInitial
< source
; ++i
) {
730 cnv
->toUBytes
[i
] = *sourceInitial
++;
733 *err
= U_ILLEGAL_CHAR_FOUND
;
737 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
/*
 * Lookup tables for ucnv_UTF8FromUTF8() below; both are indexed by the total
 * number of bytes in a sequence.
 */
739 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
741 utf8_minLegal
[5]={ 0, 0, 0x80, 0x800, 0x10000 };
743 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
/*
 * Only indexes 0..4 have explicit initializers; the array is declared with 7
 * elements, so elements 5 and 6 are implicitly zero. The visible uses index
 * it with 2, 3 or 4 only.
 */
745 utf8_offsets
[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
747 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
/*
 * UTF-8 to UTF-8 "conversion": validates the input and copies legal byte
 * sequences from pToUArgs->source to pFromUArgs->target without going
 * through UTF-16.
 *
 * Visible behaviour:
 *  - State of a partial character (accumulated value, bytes consumed, bytes
 *    expected) is restored from the source-side converter (utf8).
 *  - A single counter, the minimum of available input and target capacity,
 *    bounds the fast loop; trailing bytes of a possibly incomplete last
 *    sequence are excluded from it.
 *  - Common 3-byte and 2-byte ranges are validated inline; everything else
 *    goes through the general path, which collects trail bytes, subtracts
 *    utf8_offsets[] and checks utf8_minLegal[], the surrogate range and the
 *    U+10FFFF limit.
 *  - A partial sequence at the end of input is stored in utf8->toUBytes,
 *    utf8->toUnicodeStatus and utf8->toULength.
 *  - An illegal sequence is stored in utf8->toUBytes and reported with
 *    U_ILLEGAL_CHAR_FOUND.
 *  - U_USING_DEFAULT_WARNING is set where the comments say the standard
 *    (pivoting) converter should take over; U_BUFFER_OVERFLOW_ERROR is set
 *    when input remains and the target is full.
 *  - pToUArgs->source and pFromUArgs->target are written back.
 *
 * NOTE(review): this listing has lost many lines (the return type, several
 * declarations, braces, labels and gotos, the copy statements of the fast
 * paths); the comments describe only what is visible.
 */
749 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs
*pFromUArgs
,
750 UConverterToUnicodeArgs
*pToUArgs
,
751 UErrorCode
*pErrorCode
) {
752 UConverter
*utf8
, *cnv
;
753 const uint8_t *source
, *sourceLimit
;
755 int32_t targetCapacity
;
/* oldToULength: bytes already consumed in an earlier buffer; toULength: bytes consumed so far; toULimit: total bytes expected. */
758 int8_t oldToULength
, toULength
, toULimit
;
763 /* set up the local pointers */
764 utf8
=pToUArgs
->converter
;
765 cnv
=pFromUArgs
->converter
;
766 source
=(uint8_t *)pToUArgs
->source
;
767 sourceLimit
=(uint8_t *)pToUArgs
->sourceLimit
;
768 target
=(uint8_t *)pFromUArgs
->target
;
769 targetCapacity
=(int32_t)(pFromUArgs
->targetLimit
-pFromUArgs
->target
);
771 /* get the converter state from the UTF-8 UConverter */
772 c
=(UChar32
)utf8
->toUnicodeStatus
;
774 toULength
=oldToULength
=utf8
->toULength
;
775 toULimit
=(int8_t)utf8
->mode
;
777 toULength
=oldToULength
=toULimit
=0;
/* Bytes available for the current call, counting those carried over from the previous buffer. */
780 count
=(int32_t)(sourceLimit
-source
)+oldToULength
;
783 * Not enough input to complete the partial character.
784 * Jump to moreBytes below - it will not output to target.
786 } else if(targetCapacity
<toULimit
) {
788 * Not enough target capacity to output the partial character.
789 * Let the standard converter handle this.
791 *pErrorCode
=U_USING_DEFAULT_WARNING
;
795 * Use a single counter for source and target, counting the minimum of
796 * the source length and the target capacity.
797 * As a result, the source length is checked only once per multi-byte
798 * character instead of twice.
800 * Make sure that the last byte sequence is complete, or else
801 * stop just before it.
802 * (The longest legal byte sequence has 3 trail bytes.)
803 * Count oldToULength (number of source bytes from a previous buffer)
804 * into the source length but reduce the source index by toULimit
805 * while going back over trail bytes in order to not go back into
806 * the bytes that will be read for finishing a partial
807 * sequence from the previous buffer.
808 * Let the standard converter handle edge cases.
812 if(count
>targetCapacity
) {
813 count
=targetCapacity
;
/* Look back over at most three bytes at the end of the counted range. */
817 while(i
<3 && i
<(count
-toULimit
)) {
818 b
=source
[count
-oldToULength
-i
-1];
822 if(i
<utf8_countTrailBytes
[b
]) {
823 /* stop converting before the lead byte if there are not enough trail bytes for it */
832 utf8
->toUnicodeStatus
=0;
835 /* See note in ucnv_SBCSFromUTF8() about this goto. */
838 /* conversion loop */
/* First trail byte must be 80..BF, or 80..9F after lead byte ED so that surrogates are excluded. */
848 if( /* handle U+1000..U+D7FF inline */
849 (t1
=source
[0]) >= 0x80 && ((b
<0xed && (t1
<= 0xbf)) ||
850 (b
==0xed && (t1
<= 0x9f))) &&
851 (t2
=source
[1]) >= 0x80 && t2
<= 0xbf
861 if( /* handle U+0080..U+07FF inline */
863 (t1
=*source
) >= 0x80 && t1
<= 0xbf
/* First trail byte must be at least A0 here, which rules out non-shortest forms. */
872 if( /* handle U+0800..U+0FFF inline */
873 (t1
=source
[0]) >= 0xa0 && t1
<= 0xbf &&
874 (t2
=source
[1]) >= 0x80 && t2
<= 0xbf
885 /* handle "complicated" and error cases, and continuing partial characters */
888 toULimit
=utf8_countTrailBytes
[b
]+1;
891 while(toULength
<toULimit
) {
892 if(source
<sourceLimit
) {
899 break; /* sequence too short, stop with toULength<toULimit */
902 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
/* Rewind to the first byte read from this buffer, then copy those bytes into toUBytes. */
903 source
-=(toULength
-oldToULength
);
904 while(oldToULength
<toULength
) {
905 utf8
->toUBytes
[oldToULength
++]=*source
++;
907 utf8
->toUnicodeStatus
=c
;
908 utf8
->toULength
=toULength
;
910 pToUArgs
->source
=(char *)source
;
911 pFromUArgs
->target
=(char *)target
;
916 if( toULength
==toULimit
&& /* consumed all trail bytes */
917 (toULength
==3 || toULength
==2) && /* BMP */
918 (c
-=utf8_offsets
[toULength
])>=utf8_minLegal
[toULength
] &&
919 (c
<=0xd7ff || 0xe000<=c
) /* not a surrogate */
921 /* legal byte sequence for BMP code point */
923 toULength
==toULimit
&& toULength
==4 &&
924 (0x10000<=(c
-=utf8_offsets
[4]) && c
<=0x10ffff)
926 /* legal byte sequence for supplementary code point */
928 /* error handling: illegal UTF-8 byte sequence */
929 source
-=(toULength
-oldToULength
);
930 while(oldToULength
<toULength
) {
931 utf8
->toUBytes
[oldToULength
++]=*source
++;
933 utf8
->toULength
=toULength
;
934 pToUArgs
->source
=(char *)source
;
935 pFromUArgs
->target
=(char *)target
;
936 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
940 /* copy the legal byte sequence to the target */
/* Bytes carried over from the previous buffer come from toUBytes; the rest come from the current source. */
944 for(i
=0; i
<oldToULength
; ++i
) {
945 *target
++=utf8
->toUBytes
[i
];
947 source
-=(toULength
-oldToULength
);
948 for(; i
<toULength
; ++i
) {
/* After the fast loop: decide what to do with input that was left unconverted. */
956 if(U_SUCCESS(*pErrorCode
) && source
<sourceLimit
) {
957 if(target
==(const uint8_t *)pFromUArgs
->targetLimit
) {
958 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
961 toULimit
=utf8_countTrailBytes
[b
]+1;
962 if(toULimit
>(sourceLimit
-source
)) {
963 /* collect a truncated byte sequence */
967 utf8
->toUBytes
[toULength
++]=b
;
968 if(++source
==sourceLimit
) {
969 /* partial byte sequence at end of source */
970 utf8
->toUnicodeStatus
=c
;
971 utf8
->toULength
=toULength
;
974 } else if(!U8_IS_TRAIL(b
=*source
)) {
975 /* lead byte in trail byte position */
976 utf8
->toULength
=toULength
;
977 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
983 /* partial-sequence target overflow: fall back to the pivoting implementation */
984 *pErrorCode
=U_USING_DEFAULT_WARNING
;
989 /* write back the updated pointers */
990 pToUArgs
->source
=(char *)source
;
991 pFromUArgs
->target
=(char *)target
;
994 /* UTF-8 converter data ----------------------------------------------------- */
/*
 * Function table for the UTF-8 converter: the four buffer-conversion entry
 * points (with and without offsets), the single-code-point reader, and
 * ucnv_getNonSurrogateUnicodeSet as the Unicode-set callback.
 * NOTE(review): the remaining slots of the initializer are not visible in
 * this listing.
 */
996 static const UConverterImpl _UTF8Impl
={
1006 ucnv_toUnicode_UTF8
,
1007 ucnv_toUnicode_UTF8_OFFSETS_LOGIC
,
1008 ucnv_fromUnicode_UTF8
,
1009 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC
,
1010 ucnv_getNextUChar_UTF8
,
1016 ucnv_getNonSurrogateUnicodeSet
,
1022 /* CCSID 1208 designates UTF-8 regardless of the Unicode version */
/*
 * Static description of the UTF-8 converter.
 * The bytes EF BF BD are the UTF-8 encoding of U+FFFD; the 3 that follows
 * is presumably their length (field names are not visible here - confirm
 * against the UConverterStaticData declaration).
 */
1023 static const UConverterStaticData _UTF8StaticData
={
1024 sizeof(UConverterStaticData
),
1026 1208, UCNV_IBM
, UCNV_UTF8
,
1027 1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
1028 { 0xef, 0xbf, 0xbd, 0 },3,FALSE
,FALSE
,
1031 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/*
 * Shared-data object for the UTF-8 converter; it ties together
 * _UTF8StaticData and _UTF8Impl. It has external linkage (no "static").
 * NOTE(review): the meaning of the ~0 value and of the NULL/FALSE fields is
 * not visible here - confirm against the UConverterSharedData declaration.
 */
1035 const UConverterSharedData _UTF8Data
={
1036 sizeof(UConverterSharedData
), ~((uint32_t) 0),
1037 NULL
, NULL
, &_UTF8StaticData
, FALSE
, &_UTF8Impl
,
1041 /* CESU-8 converter data ---------------------------------------------------- */
/*
 * Function table for the CESU-8 converter. It reuses the four UTF-8
 * buffer-conversion functions, which branch on the converter's shared data
 * to apply CESU-8 rules, and uses ucnv_getCompleteUnicodeSet as the
 * Unicode-set callback. Unlike _UTF8Impl, no ucnv_getNextUChar_UTF8 entry
 * is visible here.
 * NOTE(review): the remaining slots of the initializer are not visible in
 * this listing.
 */
1043 static const UConverterImpl _CESU8Impl
={
1053 ucnv_toUnicode_UTF8
,
1054 ucnv_toUnicode_UTF8_OFFSETS_LOGIC
,
1055 ucnv_fromUnicode_UTF8
,
1056 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC
,
1063 ucnv_getCompleteUnicodeSet
/*
 * Static description of the CESU-8 converter. The values parallel
 * _UTF8StaticData except for the CCSID (9400), the platform (UCNV_UNKNOWN)
 * and the converter type (UCNV_CESU8).
 * The bytes EF BF BD are the UTF-8 encoding of U+FFFD; the 3 that follows
 * is presumably their length (field names are not visible here - confirm
 * against the UConverterStaticData declaration).
 */
1066 static const UConverterStaticData _CESU8StaticData
={
1067 sizeof(UConverterStaticData
),
1069 9400, /* CCSID for CESU-8 */
1070 UCNV_UNKNOWN
, UCNV_CESU8
, 1, 3,
1071 { 0xef, 0xbf, 0xbd, 0 },3,FALSE
,FALSE
,
1074 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1078 const UConverterSharedData _CESU8Data
={
1079 sizeof(UConverterSharedData
), ~((uint32_t) 0),
1080 NULL
, NULL
, &_CESU8StaticData
, FALSE
, &_CESU8Impl
,