1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 2002-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * file name: ucnv_u32.c
10 * tab size: 8 (not used)
13 * created on: 2002jul01
14 * created by: Markus W. Scherer
16 * UTF-32 converter implementation. Used to be in ucnv_utf.c.
19 #include "unicode/utypes.h"
21 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
23 #include "unicode/ucnv.h"
24 #include "unicode/utf.h"
/* Code point range limits and UTF-16 surrogate arithmetic constants used by the converters in this file. */
/* Largest code point that fits in a single 16-bit unit. */
29 #define MAXIMUM_UCS2 0x0000FFFF
/* Largest code point accepted by the toUnicode paths below. */
30 #define MAXIMUM_UTF 0x0010FFFF
32 #define HALF_BASE 0x0010000
33 #define HALF_MASK 0x3FF
34 #define SURROGATE_HIGH_START 0xD800
35 #define SURROGATE_LOW_START 0xDC00
37 /* -SURROGATE_LOW_START + HALF_BASE */
/* That is -0xDC00 + 0x10000 = 0x2400 = 9216. */
38 #define SURROGATE_LOW_BASE 9216
/*
 * fromUnicodeStatus is compared against this value to decide whether a BOM
 * still has to be written.
 * NOTE(review): HALF_SHIFT is used by the fromUnicode functions but its
 * definition is not visible in this listing; the enclosing declaration of
 * UCNV_NEED_TO_WRITE_BOM is also not visible.
 */
41 UCNV_NEED_TO_WRITE_BOM
=1
44 /* UTF-32BE ----------------------------------------------------------------- */
/*
 * Converts UTF-32BE bytes in [args->source, args->sourceLimit) into UTF-16
 * code units written to [args->target, args->targetLimit).
 *
 * - Bytes are accumulated most-significant first into ch, four per code
 *   point, and each consumed byte is also recorded in converter->toUBytes.
 * - A partially read sequence is saved in converter->toUnicodeStatus (as
 *   ch + 1) and converter->toULength, and restored on the next call.
 * - Values above MAXIMUM_UTF or in the surrogate range set
 *   U_ILLEGAL_CHAR_FOUND; supplementary code points are written as two units,
 *   spilling the second into UCharErrorBuffer with U_BUFFER_OVERFLOW_ERROR
 *   when the target is full.
 * - On exit args->target and args->source are updated.
 *
 * NOTE(review): this listing has lost lines (the err parameter, the
 * declaration of ch and i, braces, else branches and jumps); the comments
 * describe only the statements that are visible.
 */
46 static void U_CALLCONV
47 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs
* args
,
50 const unsigned char *mySource
= (unsigned char *) args
->source
;
51 UChar
*myTarget
= args
->target
;
52 const unsigned char *sourceLimit
= (unsigned char *) args
->sourceLimit
;
53 const UChar
*targetLimit
= args
->targetLimit
;
54 unsigned char *toUBytes
= args
->converter
->toUBytes
;
57 /* Restore state of current sequence */
58 if (args
->converter
->toULength
> 0 && myTarget
< targetLimit
) {
59 i
= args
->converter
->toULength
; /* restore # of bytes consumed */
60 args
->converter
->toULength
= 0;
62 ch
= args
->converter
->toUnicodeStatus
- 1;/*Stores the previously calculated ch from a previous call*/
63 args
->converter
->toUnicodeStatus
= 0;
67 while (mySource
< sourceLimit
&& myTarget
< targetLimit
) {
/* Gather bytes until a full 32-bit unit is available; big-endian, so shift left and append. */
71 while (i
< sizeof(uint32_t)) {
72 if (mySource
< sourceLimit
) {
73 ch
= (ch
<< 8) | (uint8_t)(*mySource
);
74 toUBytes
[i
++] = (char) *(mySource
++);
/* NOTE(review): the branch that introduces the save-state path below is not visible in this listing. */
77 /* stores a partially calculated target*/
78 /* + 1 to make 0 a valid character */
79 args
->converter
->toUnicodeStatus
= ch
+ 1;
80 args
->converter
->toULength
= (int8_t) i
;
85 if (ch
<= MAXIMUM_UTF
&& !U_IS_SURROGATE(ch
)) {
86 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
87 if (ch
<= MAXIMUM_UCS2
)
90 *(myTarget
++) = (UChar
) ch
;
93 /* write out the surrogates */
94 *(myTarget
++) = U16_LEAD(ch
);
96 if (myTarget
< targetLimit
) {
97 *(myTarget
++) = (UChar
)ch
;
100 /* Put in overflow buffer (not handled here) */
101 args
->converter
->UCharErrorBuffer
[0] = (UChar
) ch
;
102 args
->converter
->UCharErrorBufferLength
= 1;
103 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Invalid code point: record how many bytes were consumed and report it. */
109 args
->converter
->toULength
= (int8_t)i
;
110 *err
= U_ILLEGAL_CHAR_FOUND
;
116 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
)) {
117 /* End of target buffer */
118 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Publish the advanced pointers back to the caller. */
121 args
->target
= myTarget
;
122 args
->source
= (const char *) mySource
;
/*
 * Same conversion as T_UConverter_toUnicode_UTF32_BE (UTF-32BE bytes to
 * UTF-16 units), but additionally writes one entry to args->offsets for
 * every UTF-16 unit stored in the target, using the running value offsetNum.
 * On exit args->target, args->source and args->offsets are updated.
 *
 * NOTE(review): this listing has lost lines (the err parameter, the
 * declaration of ch and i, braces, else branches, and the statement that
 * advances offsetNum); the comments describe only what is visible.
 */
125 static void U_CALLCONV
126 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs
* args
,
129 const unsigned char *mySource
= (unsigned char *) args
->source
;
130 UChar
*myTarget
= args
->target
;
131 int32_t *myOffsets
= args
->offsets
;
132 const unsigned char *sourceLimit
= (unsigned char *) args
->sourceLimit
;
133 const UChar
*targetLimit
= args
->targetLimit
;
134 unsigned char *toUBytes
= args
->converter
->toUBytes
;
136 int32_t offsetNum
= 0;
138 /* Restore state of current sequence */
139 if (args
->converter
->toULength
> 0 && myTarget
< targetLimit
) {
140 i
= args
->converter
->toULength
; /* restore # of bytes consumed */
141 args
->converter
->toULength
= 0;
143 ch
= args
->converter
->toUnicodeStatus
- 1;/*Stores the previously calculated ch from a previous call*/
144 args
->converter
->toUnicodeStatus
= 0;
148 while (mySource
< sourceLimit
&& myTarget
< targetLimit
) {
/* Gather bytes until a full 32-bit unit is available; big-endian, so shift left and append. */
152 while (i
< sizeof(uint32_t)) {
153 if (mySource
< sourceLimit
) {
154 ch
= (ch
<< 8) | (uint8_t)(*mySource
);
155 toUBytes
[i
++] = (char) *(mySource
++);
158 /* stores a partially calculated target*/
159 /* + 1 to make 0 a valid character */
160 args
->converter
->toUnicodeStatus
= ch
+ 1;
161 args
->converter
->toULength
= (int8_t) i
;
166 if (ch
<= MAXIMUM_UTF
&& !U_IS_SURROGATE(ch
)) {
167 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
168 if (ch
<= MAXIMUM_UCS2
) {
169 /* fits in 16 bits */
170 *(myTarget
++) = (UChar
) ch
;
171 *(myOffsets
++) = offsetNum
;
174 /* write out the surrogates */
175 *(myTarget
++) = U16_LEAD(ch
);
176 *myOffsets
++ = offsetNum
;
178 if (myTarget
< targetLimit
)
180 *(myTarget
++) = (UChar
)ch
;
/* Both units of a surrogate pair report the same source offset. */
181 *(myOffsets
++) = offsetNum
;
184 /* Put in overflow buffer (not handled here) */
185 args
->converter
->UCharErrorBuffer
[0] = (UChar
) ch
;
186 args
->converter
->UCharErrorBufferLength
= 1;
187 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Invalid code point: record how many bytes were consumed and report it. */
193 args
->converter
->toULength
= (int8_t)i
;
194 *err
= U_ILLEGAL_CHAR_FOUND
;
201 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
203 /* End of target buffer */
204 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Publish the advanced pointers back to the caller. */
207 args
->target
= myTarget
;
208 args
->source
= (const char *) mySource
;
209 args
->offsets
= myOffsets
;
/*
 * Converts UTF-16 code units in [args->source, args->sourceLimit) into
 * UTF-32BE bytes written to [args->target, args->targetLimit).
 *
 * - If converter->fromUnicodeStatus equals UCNV_NEED_TO_WRITE_BOM, the bytes
 *   00 00 FE FF are emitted first through ucnv_fromUWriteBytes and the status
 *   is cleared.
 * - A pending code unit saved in converter->fromUChar32 by an earlier call is
 *   picked up and cleared.
 * - A surrogate pair is combined into one code point; unpaired surrogates set
 *   U_ILLEGAL_CHAR_FOUND and leave the offending unit in fromUChar32.
 * - The four output bytes are staged in temp[] and copied to the target;
 *   bytes that do not fit go to converter->charErrorBuffer and
 *   U_BUFFER_OVERFLOW_ERROR is set.
 * - On exit args->target and args->source are updated.
 *
 * NOTE(review): this listing has lost lines (the err parameter, the
 * declaration of ch and ch2, the reads from mySource, the assignment of
 * temp[0], braces, else branches and jumps). The comment "unmatched trail
 * code unit" appears on three error paths; the first two follow the
 * U_IS_TRAIL(ch2) test and the "ran out of source" case and so look like they
 * concern an unpaired first surrogate - confirm against the complete source.
 */
212 static void U_CALLCONV
213 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs
* args
,
216 const UChar
*mySource
= args
->source
;
217 unsigned char *myTarget
;
218 const UChar
*sourceLimit
= args
->sourceLimit
;
219 const unsigned char *targetLimit
= (unsigned char *) args
->targetLimit
;
221 unsigned int indexToWrite
;
222 unsigned char temp
[sizeof(uint32_t)];
224 if(mySource
>= sourceLimit
) {
225 /* no input, nothing to do */
229 /* write the BOM if necessary */
230 if(args
->converter
->fromUnicodeStatus
==UCNV_NEED_TO_WRITE_BOM
) {
231 static const char bom
[]={ 0, 0, (char)0xfe, (char)0xff };
232 ucnv_fromUWriteBytes(args
->converter
,
234 &args
->target
, args
->targetLimit
,
237 args
->converter
->fromUnicodeStatus
=0;
/* Read the target pointer only after the BOM write, which receives &args->target. */
240 myTarget
= (unsigned char *) args
->target
;
/* Resume with a code unit left over from the previous call, if any. */
243 if (args
->converter
->fromUChar32
) {
244 ch
= args
->converter
->fromUChar32
;
245 args
->converter
->fromUChar32
= 0;
249 while (mySource
< sourceLimit
&& myTarget
< targetLimit
) {
252 if (U_IS_SURROGATE(ch
)) {
255 if (mySource
< sourceLimit
) {
257 if (U_IS_TRAIL(ch2
)) {
/* Combine the two surrogates into a single supplementary code point. */
258 ch
= ((ch
- SURROGATE_HIGH_START
) << HALF_SHIFT
) + ch2
+ SURROGATE_LOW_BASE
;
262 /* this is an unmatched trail code unit (2nd surrogate) */
263 /* callback(illegal) */
264 args
->converter
->fromUChar32
= ch
;
265 *err
= U_ILLEGAL_CHAR_FOUND
;
270 /* ran out of source */
271 args
->converter
->fromUChar32
= ch
;
273 /* this is an unmatched trail code unit (2nd surrogate) */
274 /* callback(illegal) */
275 *err
= U_ILLEGAL_CHAR_FOUND
;
281 /* this is an unmatched trail code unit (2nd surrogate) */
282 /* callback(illegal) */
283 args
->converter
->fromUChar32
= ch
;
284 *err
= U_ILLEGAL_CHAR_FOUND
;
289 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
/* Big-endian layout: temp[1] holds bits 16-20, temp[2] bits 8-15, temp[3] bits 0-7. */
290 temp
[1] = (uint8_t) (ch
>> 16 & 0x1F);
291 temp
[2] = (uint8_t) (ch
>> 8); /* unsigned cast implicitly does (ch & FF) */
292 temp
[3] = (uint8_t) (ch
); /* unsigned cast implicitly does (ch & FF) */
/* Copy the staged bytes out; anything past targetLimit goes to the overflow buffer. */
294 for (indexToWrite
= 0; indexToWrite
<= sizeof(uint32_t) - 1; indexToWrite
++) {
295 if (myTarget
< targetLimit
) {
296 *(myTarget
++) = temp
[indexToWrite
];
299 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = temp
[indexToWrite
];
300 *err
= U_BUFFER_OVERFLOW_ERROR
;
305 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
)) {
306 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Publish the advanced pointers back to the caller. */
309 args
->target
= (char *) myTarget
;
310 args
->source
= mySource
;
/*
 * Same conversion as T_UConverter_fromUnicode_UTF32_BE (UTF-16 units to
 * UTF-32BE bytes), but additionally writes one entry to args->offsets for
 * every byte stored in the target. offsetNum is the running source index; it
 * advances by 1 for a BMP code point and by 2 when temp[1] is non-zero
 * (a supplementary code point).
 * On exit args->target, args->source and args->offsets are updated.
 *
 * NOTE(review): this listing has lost lines (the err parameter, the
 * declarations of ch, ch2 and myOffsets, the reads from mySource, the
 * assignment of temp[0], braces, else branches and jumps). The comment
 * "unmatched trail code unit" on the first two error paths looks like it
 * concerns an unpaired first surrogate - confirm against the complete source.
 */
313 static void U_CALLCONV
314 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs
* args
,
317 const UChar
*mySource
= args
->source
;
318 unsigned char *myTarget
;
320 const UChar
*sourceLimit
= args
->sourceLimit
;
321 const unsigned char *targetLimit
= (unsigned char *) args
->targetLimit
;
323 int32_t offsetNum
= 0;
324 unsigned int indexToWrite
;
325 unsigned char temp
[sizeof(uint32_t)];
327 if(mySource
>= sourceLimit
) {
328 /* no input, nothing to do */
332 /* write the BOM if necessary */
333 if(args
->converter
->fromUnicodeStatus
==UCNV_NEED_TO_WRITE_BOM
) {
334 static const char bom
[]={ 0, 0, (char)0xfe, (char)0xff };
335 ucnv_fromUWriteBytes(args
->converter
,
337 &args
->target
, args
->targetLimit
,
340 args
->converter
->fromUnicodeStatus
=0;
/* Read target and offsets only after the BOM write. */
343 myTarget
= (unsigned char *) args
->target
;
344 myOffsets
= args
->offsets
;
/* Resume with a code unit left over from the previous call, if any. */
347 if (args
->converter
->fromUChar32
) {
348 ch
= args
->converter
->fromUChar32
;
349 args
->converter
->fromUChar32
= 0;
353 while (mySource
< sourceLimit
&& myTarget
< targetLimit
) {
356 if (U_IS_SURROGATE(ch
)) {
359 if (mySource
< sourceLimit
) {
361 if (U_IS_TRAIL(ch2
)) {
/* Combine the two surrogates into a single supplementary code point. */
362 ch
= ((ch
- SURROGATE_HIGH_START
) << HALF_SHIFT
) + ch2
+ SURROGATE_LOW_BASE
;
366 /* this is an unmatched trail code unit (2nd surrogate) */
367 /* callback(illegal) */
368 args
->converter
->fromUChar32
= ch
;
369 *err
= U_ILLEGAL_CHAR_FOUND
;
374 /* ran out of source */
375 args
->converter
->fromUChar32
= ch
;
377 /* this is an unmatched trail code unit (2nd surrogate) */
378 /* callback(illegal) */
379 *err
= U_ILLEGAL_CHAR_FOUND
;
385 /* this is an unmatched trail code unit (2nd surrogate) */
386 /* callback(illegal) */
387 args
->converter
->fromUChar32
= ch
;
388 *err
= U_ILLEGAL_CHAR_FOUND
;
393 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
/* Big-endian layout: temp[1] holds bits 16-20, temp[2] bits 8-15, temp[3] bits 0-7. */
394 temp
[1] = (uint8_t) (ch
>> 16 & 0x1F);
395 temp
[2] = (uint8_t) (ch
>> 8); /* unsigned cast implicitly does (ch & FF) */
396 temp
[3] = (uint8_t) (ch
); /* unsigned cast implicitly does (ch & FF) */
/* Copy the staged bytes out; anything past targetLimit goes to the overflow buffer. */
398 for (indexToWrite
= 0; indexToWrite
<= sizeof(uint32_t) - 1; indexToWrite
++) {
399 if (myTarget
< targetLimit
) {
400 *(myTarget
++) = temp
[indexToWrite
];
401 *(myOffsets
++) = offsetNum
;
404 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = temp
[indexToWrite
];
405 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Advance the source index: one unit, or two when the code point needed a surrogate pair. */
408 offsetNum
= offsetNum
+ 1 + (temp
[1] != 0);
411 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
)) {
412 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Publish the advanced pointers back to the caller. */
415 args
->target
= (char *) myTarget
;
416 args
->source
= mySource
;
417 args
->offsets
= myOffsets
;
/*
 * Reads one code point from UTF-32BE input at args->source.
 *
 * - No input left: sets U_INDEX_OUTOFBOUNDS_ERROR.
 * - Fewer bytes than a full unit: copies them to converter->toUBytes, records
 *   the count in toULength, consumes them and sets U_TRUNCATED_CHAR_FOUND.
 * - Otherwise assembles the four bytes most-significant first and advances
 *   args->source by 4. A value above MAXIMUM_UTF or in the surrogate range
 *   has its bytes copied to toUBytes and sets U_ILLEGAL_CHAR_FOUND.
 *
 * NOTE(review): this listing has lost lines (the err parameter, the
 * declarations of myUChar and length, the length test and every return
 * statement), so the returned values are not visible here.
 */
420 static UChar32 U_CALLCONV
421 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs
* args
,
424 const uint8_t *mySource
;
428 mySource
= (const uint8_t *)args
->source
;
429 if (mySource
>= (const uint8_t *)args
->sourceLimit
)
432 *err
= U_INDEX_OUTOFBOUNDS_ERROR
;
/* Number of input bytes still available. */
436 length
= (int32_t)((const uint8_t *)args
->sourceLimit
- mySource
);
439 /* got a partial character */
440 uprv_memcpy(args
->converter
->toUBytes
, mySource
, length
);
441 args
->converter
->toULength
= (int8_t)length
;
442 args
->source
= (const char *)(mySource
+ length
);
443 *err
= U_TRUNCATED_CHAR_FOUND
;
447 /* Don't even try to do a direct cast because the value may be on an odd address. */
448 myUChar
= ((UChar32
)mySource
[0] << 24)
449 | ((UChar32
)mySource
[1] << 16)
450 | ((UChar32
)mySource
[2] << 8)
451 | ((UChar32
)mySource
[3]);
/* The four bytes are consumed whether or not the value turns out to be valid. */
453 args
->source
= (const char *)(mySource
+ 4);
454 if ((uint32_t)myUChar
<= MAXIMUM_UTF
&& !U_IS_SURROGATE(myUChar
)) {
/* Invalid value: keep the offending bytes for the error callback. */
458 uprv_memcpy(args
->converter
->toUBytes
, mySource
, 4);
459 args
->converter
->toULength
= 4;
461 *err
= U_ILLEGAL_CHAR_FOUND
;
/*
 * Implementation table for the UTF-32BE converter: wires the UTF-32BE
 * toUnicode, fromUnicode (plain and offset-tracking) and getNextUChar
 * functions defined above, plus ucnv_getNonSurrogateUnicodeSet.
 * NOTE(review): several initializer entries are missing from this listing.
 */
465 static const UConverterImpl _UTF32BEImpl
= {
466 UCNV_UTF32_BigEndian
,
475 T_UConverter_toUnicode_UTF32_BE
,
476 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC
,
477 T_UConverter_fromUnicode_UTF32_BE
,
478 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC
,
479 T_UConverter_getNextUChar_UTF32_BE
,
485 ucnv_getNonSurrogateUnicodeSet
,
491 /* The 1232 CCSID refers to any version of Unicode with any endianness of UTF-32 */
/*
 * Static description of the UTF-32BE converter.
 * NOTE(review): field meanings are not visible in this listing; the "4, 4"
 * pair presumably gives the minimum and maximum bytes per character, and
 * { 0, 0, 0xff, 0xfd } with length 4 presumably is the substitution character
 * U+FFFD in big-endian byte order - confirm against UConverterStaticData.
 */
492 static const UConverterStaticData _UTF32BEStaticData
= {
493 sizeof(UConverterStaticData
),
496 UCNV_IBM
, UCNV_UTF32_BigEndian
, 4, 4,
497 { 0, 0, 0xff, 0xfd }, 4, FALSE
, FALSE
,
500 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/* Shared-data object for UTF-32BE, built from the static data and implementation table above. */
503 const UConverterSharedData _UTF32BEData
=
504 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32BEStaticData
, &_UTF32BEImpl
);
506 /* UTF-32LE ---------------------------------------------------------- */
/*
 * Converts UTF-32LE bytes in [args->source, args->sourceLimit) into UTF-16
 * code units written to [args->target, args->targetLimit).
 *
 * - Bytes are accumulated least-significant first (byte i is shifted left by
 *   i * 8), four per code point, and each consumed byte is also recorded in
 *   converter->toUBytes.
 * - A partially read sequence is saved in converter->toUnicodeStatus (as
 *   ch + 1) and converter->toULength, and restored on the next call.
 * - Values above MAXIMUM_UTF or in the surrogate range set
 *   U_ILLEGAL_CHAR_FOUND; supplementary code points are written as two units,
 *   spilling the second into UCharErrorBuffer with U_BUFFER_OVERFLOW_ERROR
 *   when the target is full.
 * - On exit args->target and args->source are updated.
 *
 * NOTE(review): this listing has lost lines (the err parameter, the
 * declaration of ch and i, braces, else branches and jumps); the comments
 * describe only the statements that are visible.
 */
508 static void U_CALLCONV
509 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs
* args
,
512 const unsigned char *mySource
= (unsigned char *) args
->source
;
513 UChar
*myTarget
= args
->target
;
514 const unsigned char *sourceLimit
= (unsigned char *) args
->sourceLimit
;
515 const UChar
*targetLimit
= args
->targetLimit
;
516 unsigned char *toUBytes
= args
->converter
->toUBytes
;
519 /* Restore state of current sequence */
520 if (args
->converter
->toULength
> 0 && myTarget
< targetLimit
)
522 i
= args
->converter
->toULength
; /* restore # of bytes consumed */
523 args
->converter
->toULength
= 0;
525 /* Stores the previously calculated ch from a previous call*/
526 ch
= args
->converter
->toUnicodeStatus
- 1;
527 args
->converter
->toUnicodeStatus
= 0;
531 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
/* Gather bytes until a full 32-bit unit is available; little-endian, so byte i lands at bit i * 8. */
536 while (i
< sizeof(uint32_t))
538 if (mySource
< sourceLimit
)
540 ch
|= ((uint8_t)(*mySource
)) << (i
* 8);
541 toUBytes
[i
++] = (char) *(mySource
++);
545 /* stores a partially calculated target*/
546 /* + 1 to make 0 a valid character */
547 args
->converter
->toUnicodeStatus
= ch
+ 1;
548 args
->converter
->toULength
= (int8_t) i
;
553 if (ch
<= MAXIMUM_UTF
&& !U_IS_SURROGATE(ch
)) {
554 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
555 if (ch
<= MAXIMUM_UCS2
) {
556 /* fits in 16 bits */
557 *(myTarget
++) = (UChar
) ch
;
560 /* write out the surrogates */
561 *(myTarget
++) = U16_LEAD(ch
);
563 if (myTarget
< targetLimit
) {
564 *(myTarget
++) = (UChar
)ch
;
567 /* Put in overflow buffer (not handled here) */
568 args
->converter
->UCharErrorBuffer
[0] = (UChar
) ch
;
569 args
->converter
->UCharErrorBufferLength
= 1;
570 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Invalid code point: record how many bytes were consumed and report it. */
576 args
->converter
->toULength
= (int8_t)i
;
577 *err
= U_ILLEGAL_CHAR_FOUND
;
583 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
585 /* End of target buffer */
586 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Publish the advanced pointers back to the caller. */
589 args
->target
= myTarget
;
590 args
->source
= (const char *) mySource
;
/*
 * Same conversion as T_UConverter_toUnicode_UTF32_LE (UTF-32LE bytes to
 * UTF-16 units), but additionally writes one entry to args->offsets for
 * every UTF-16 unit stored in the target, using the running value offsetNum.
 * On exit args->target, args->source and args->offsets are updated.
 *
 * NOTE(review): this listing has lost lines (the err parameter, the
 * declaration of ch and i, braces, else branches, and the statement that
 * advances offsetNum); the comments describe only what is visible.
 */
593 static void U_CALLCONV
594 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs
* args
,
597 const unsigned char *mySource
= (unsigned char *) args
->source
;
598 UChar
*myTarget
= args
->target
;
599 int32_t *myOffsets
= args
->offsets
;
600 const unsigned char *sourceLimit
= (unsigned char *) args
->sourceLimit
;
601 const UChar
*targetLimit
= args
->targetLimit
;
602 unsigned char *toUBytes
= args
->converter
->toUBytes
;
604 int32_t offsetNum
= 0;
606 /* Restore state of current sequence */
607 if (args
->converter
->toULength
> 0 && myTarget
< targetLimit
)
609 i
= args
->converter
->toULength
; /* restore # of bytes consumed */
610 args
->converter
->toULength
= 0;
612 /* Stores the previously calculated ch from a previous call*/
613 ch
= args
->converter
->toUnicodeStatus
- 1;
614 args
->converter
->toUnicodeStatus
= 0;
618 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
/* Gather bytes until a full 32-bit unit is available; little-endian, so byte i lands at bit i * 8. */
623 while (i
< sizeof(uint32_t))
625 if (mySource
< sourceLimit
)
627 ch
|= ((uint8_t)(*mySource
)) << (i
* 8);
628 toUBytes
[i
++] = (char) *(mySource
++);
632 /* stores a partially calculated target*/
633 /* + 1 to make 0 a valid character */
634 args
->converter
->toUnicodeStatus
= ch
+ 1;
635 args
->converter
->toULength
= (int8_t) i
;
640 if (ch
<= MAXIMUM_UTF
&& !U_IS_SURROGATE(ch
))
642 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
643 if (ch
<= MAXIMUM_UCS2
)
645 /* fits in 16 bits */
646 *(myTarget
++) = (UChar
) ch
;
647 *(myOffsets
++) = offsetNum
;
650 /* write out the surrogates */
651 *(myTarget
++) = U16_LEAD(ch
);
652 *(myOffsets
++) = offsetNum
;
654 if (myTarget
< targetLimit
)
656 *(myTarget
++) = (UChar
)ch
;
/* Both units of a surrogate pair report the same source offset. */
657 *(myOffsets
++) = offsetNum
;
661 /* Put in overflow buffer (not handled here) */
662 args
->converter
->UCharErrorBuffer
[0] = (UChar
) ch
;
663 args
->converter
->UCharErrorBufferLength
= 1;
664 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Invalid code point: record how many bytes were consumed and report it. */
671 args
->converter
->toULength
= (int8_t)i
;
672 *err
= U_ILLEGAL_CHAR_FOUND
;
679 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
681 /* End of target buffer */
682 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Publish the advanced pointers back to the caller. */
685 args
->target
= myTarget
;
686 args
->source
= (const char *) mySource
;
687 args
->offsets
= myOffsets
;
/*
 * Converts UTF-16 code units in [args->source, args->sourceLimit) into
 * UTF-32LE bytes written to [args->target, args->targetLimit).
 *
 * - If converter->fromUnicodeStatus equals UCNV_NEED_TO_WRITE_BOM, the bytes
 *   FF FE 00 00 are emitted first through ucnv_fromUWriteBytes and the status
 *   is cleared.
 * - A pending code unit saved in converter->fromUChar32 by an earlier call is
 *   picked up and cleared.
 * - A surrogate pair is combined into one code point; unpaired surrogates set
 *   U_ILLEGAL_CHAR_FOUND and leave the offending unit in fromUChar32.
 * - The four output bytes are staged in temp[] and copied to the target;
 *   bytes that do not fit go to converter->charErrorBuffer and
 *   U_BUFFER_OVERFLOW_ERROR is set.
 * - On exit args->target and args->source are updated.
 *
 * NOTE(review): this listing has lost lines (the err parameter, the
 * declaration of ch and ch2, the reads from mySource, the assignment of
 * temp[3], braces, else branches and jumps). The comment "unmatched trail
 * code unit" on the first two error paths looks like it concerns an unpaired
 * first surrogate - confirm against the complete source.
 */
690 static void U_CALLCONV
691 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs
* args
,
694 const UChar
*mySource
= args
->source
;
695 unsigned char *myTarget
;
696 const UChar
*sourceLimit
= args
->sourceLimit
;
697 const unsigned char *targetLimit
= (unsigned char *) args
->targetLimit
;
699 unsigned int indexToWrite
;
700 unsigned char temp
[sizeof(uint32_t)];
702 if(mySource
>= sourceLimit
) {
703 /* no input, nothing to do */
707 /* write the BOM if necessary */
708 if(args
->converter
->fromUnicodeStatus
==UCNV_NEED_TO_WRITE_BOM
) {
709 static const char bom
[]={ (char)0xff, (char)0xfe, 0, 0 };
710 ucnv_fromUWriteBytes(args
->converter
,
712 &args
->target
, args
->targetLimit
,
715 args
->converter
->fromUnicodeStatus
=0;
/* Read the target pointer only after the BOM write, which receives &args->target. */
718 myTarget
= (unsigned char *) args
->target
;
/* Resume with a code unit left over from the previous call, if any. */
721 if (args
->converter
->fromUChar32
)
723 ch
= args
->converter
->fromUChar32
;
724 args
->converter
->fromUChar32
= 0;
728 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
732 if (U16_IS_SURROGATE(ch
)) {
736 if (mySource
< sourceLimit
)
739 if (U16_IS_TRAIL(ch2
)) {
/* Combine the two surrogates into a single supplementary code point. */
740 ch
= ((ch
- SURROGATE_HIGH_START
) << HALF_SHIFT
) + ch2
+ SURROGATE_LOW_BASE
;
744 /* this is an unmatched trail code unit (2nd surrogate) */
745 /* callback(illegal) */
746 args
->converter
->fromUChar32
= ch
;
747 *err
= U_ILLEGAL_CHAR_FOUND
;
752 /* ran out of source */
753 args
->converter
->fromUChar32
= ch
;
755 /* this is an unmatched trail code unit (2nd surrogate) */
756 /* callback(illegal) */
757 *err
= U_ILLEGAL_CHAR_FOUND
;
763 /* this is an unmatched trail code unit (2nd surrogate) */
764 /* callback(illegal) */
765 args
->converter
->fromUChar32
= ch
;
766 *err
= U_ILLEGAL_CHAR_FOUND
;
771 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
/* Little-endian layout: temp[0] holds bits 0-7, temp[1] bits 8-15, temp[2] bits 16-20. */
772 temp
[2] = (uint8_t) (ch
>> 16 & 0x1F);
773 temp
[1] = (uint8_t) (ch
>> 8); /* unsigned cast implicitly does (ch & FF) */
774 temp
[0] = (uint8_t) (ch
); /* unsigned cast implicitly does (ch & FF) */
/* Copy the staged bytes out; anything past targetLimit goes to the overflow buffer. */
776 for (indexToWrite
= 0; indexToWrite
<= sizeof(uint32_t) - 1; indexToWrite
++)
778 if (myTarget
< targetLimit
)
780 *(myTarget
++) = temp
[indexToWrite
];
784 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = temp
[indexToWrite
];
785 *err
= U_BUFFER_OVERFLOW_ERROR
;
790 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
792 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Publish the advanced pointers back to the caller. */
795 args
->target
= (char *) myTarget
;
796 args
->source
= mySource
;
/*
 * Same conversion as T_UConverter_fromUnicode_UTF32_LE (UTF-16 units to
 * UTF-32LE bytes), but additionally writes one entry to args->offsets for
 * every byte stored in the target. offsetNum is the running source index; it
 * advances by 1 for a BMP code point and by 2 when temp[2] is non-zero
 * (a supplementary code point).
 * On exit args->target, args->source and args->offsets are updated.
 *
 * NOTE(review): this listing has lost lines (the err parameter, the
 * declarations of ch, ch2 and myOffsets, the reads from mySource, the
 * assignment of temp[3], braces, else branches and jumps). The comment
 * "unmatched trail code unit" on the first two error paths looks like it
 * concerns an unpaired first surrogate - confirm against the complete source.
 */
799 static void U_CALLCONV
800 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs
* args
,
803 const UChar
*mySource
= args
->source
;
804 unsigned char *myTarget
;
806 const UChar
*sourceLimit
= args
->sourceLimit
;
807 const unsigned char *targetLimit
= (unsigned char *) args
->targetLimit
;
809 unsigned int indexToWrite
;
810 unsigned char temp
[sizeof(uint32_t)];
811 int32_t offsetNum
= 0;
813 if(mySource
>= sourceLimit
) {
814 /* no input, nothing to do */
818 /* write the BOM if necessary */
819 if(args
->converter
->fromUnicodeStatus
==UCNV_NEED_TO_WRITE_BOM
) {
820 static const char bom
[]={ (char)0xff, (char)0xfe, 0, 0 };
821 ucnv_fromUWriteBytes(args
->converter
,
823 &args
->target
, args
->targetLimit
,
826 args
->converter
->fromUnicodeStatus
=0;
/* Read target and offsets only after the BOM write. */
829 myTarget
= (unsigned char *) args
->target
;
830 myOffsets
= args
->offsets
;
/* Resume with a code unit left over from the previous call, if any. */
833 if (args
->converter
->fromUChar32
)
835 ch
= args
->converter
->fromUChar32
;
836 args
->converter
->fromUChar32
= 0;
840 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
844 if (U16_IS_SURROGATE(ch
)) {
848 if (mySource
< sourceLimit
)
851 if (U16_IS_TRAIL(ch2
))
/* Combine the two surrogates into a single supplementary code point. */
853 ch
= ((ch
- SURROGATE_HIGH_START
) << HALF_SHIFT
) + ch2
+ SURROGATE_LOW_BASE
;
857 /* this is an unmatched trail code unit (2nd surrogate) */
858 /* callback(illegal) */
859 args
->converter
->fromUChar32
= ch
;
860 *err
= U_ILLEGAL_CHAR_FOUND
;
865 /* ran out of source */
866 args
->converter
->fromUChar32
= ch
;
868 /* this is an unmatched trail code unit (2nd surrogate) */
869 /* callback(illegal) */
870 *err
= U_ILLEGAL_CHAR_FOUND
;
876 /* this is an unmatched trail code unit (2nd surrogate) */
877 /* callback(illegal) */
878 args
->converter
->fromUChar32
= ch
;
879 *err
= U_ILLEGAL_CHAR_FOUND
;
884 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
/* Little-endian layout: temp[0] holds bits 0-7, temp[1] bits 8-15, temp[2] bits 16-20. */
885 temp
[2] = (uint8_t) (ch
>> 16 & 0x1F);
886 temp
[1] = (uint8_t) (ch
>> 8); /* unsigned cast implicitly does (ch & FF) */
887 temp
[0] = (uint8_t) (ch
); /* unsigned cast implicitly does (ch & FF) */
/* Copy the staged bytes out; anything past targetLimit goes to the overflow buffer. */
889 for (indexToWrite
= 0; indexToWrite
<= sizeof(uint32_t) - 1; indexToWrite
++)
891 if (myTarget
< targetLimit
)
893 *(myTarget
++) = temp
[indexToWrite
];
894 *(myOffsets
++) = offsetNum
;
898 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = temp
[indexToWrite
];
899 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Advance the source index: one unit, or two when the code point needed a surrogate pair. */
902 offsetNum
= offsetNum
+ 1 + (temp
[2] != 0);
905 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
907 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* Publish the advanced pointers back to the caller. */
910 args
->target
= (char *) myTarget
;
911 args
->source
= mySource
;
912 args
->offsets
= myOffsets
;
/*
 * Reads one code point from UTF-32LE input at args->source.
 *
 * - No input left: sets U_INDEX_OUTOFBOUNDS_ERROR.
 * - Fewer bytes than a full unit: copies them to converter->toUBytes, records
 *   the count in toULength, consumes them and sets U_TRUNCATED_CHAR_FOUND.
 * - Otherwise assembles the four bytes least-significant first and advances
 *   args->source by 4. A value above MAXIMUM_UTF or in the surrogate range
 *   has its bytes copied to toUBytes and sets U_ILLEGAL_CHAR_FOUND.
 *
 * NOTE(review): this listing has lost lines (the err parameter, the
 * declarations of myUChar and length, the length test and every return
 * statement), so the returned values are not visible here.
 */
915 static UChar32 U_CALLCONV
916 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs
* args
,
919 const uint8_t *mySource
;
923 mySource
= (const uint8_t *)args
->source
;
924 if (mySource
>= (const uint8_t *)args
->sourceLimit
)
927 *err
= U_INDEX_OUTOFBOUNDS_ERROR
;
/* Number of input bytes still available. */
931 length
= (int32_t)((const uint8_t *)args
->sourceLimit
- mySource
);
934 /* got a partial character */
935 uprv_memcpy(args
->converter
->toUBytes
, mySource
, length
);
936 args
->converter
->toULength
= (int8_t)length
;
937 args
->source
= (const char *)(mySource
+ length
);
938 *err
= U_TRUNCATED_CHAR_FOUND
;
942 /* Don't even try to do a direct cast because the value may be on an odd address. */
943 myUChar
= ((UChar32
)mySource
[3] << 24)
944 | ((UChar32
)mySource
[2] << 16)
945 | ((UChar32
)mySource
[1] << 8)
946 | ((UChar32
)mySource
[0]);
/* The four bytes are consumed whether or not the value turns out to be valid. */
948 args
->source
= (const char *)(mySource
+ 4);
949 if ((uint32_t)myUChar
<= MAXIMUM_UTF
&& !U_IS_SURROGATE(myUChar
)) {
/* Invalid value: keep the offending bytes for the error callback. */
953 uprv_memcpy(args
->converter
->toUBytes
, mySource
, 4);
954 args
->converter
->toULength
= 4;
956 *err
= U_ILLEGAL_CHAR_FOUND
;
/*
 * Implementation table for the UTF-32LE converter: wires the UTF-32LE
 * toUnicode, fromUnicode (plain and offset-tracking) and getNextUChar
 * functions defined above, plus ucnv_getNonSurrogateUnicodeSet.
 * NOTE(review): several initializer entries are missing from this listing.
 */
960 static const UConverterImpl _UTF32LEImpl
= {
961 UCNV_UTF32_LittleEndian
,
970 T_UConverter_toUnicode_UTF32_LE
,
971 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC
,
972 T_UConverter_fromUnicode_UTF32_LE
,
973 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC
,
974 T_UConverter_getNextUChar_UTF32_LE
,
980 ucnv_getNonSurrogateUnicodeSet
,
986 /* The 1232 CCSID refers to any version of Unicode with any endianness of UTF-32 */
/*
 * Static description of the UTF-32LE converter.
 * NOTE(review): field meanings are not visible in this listing; the "4, 4"
 * pair presumably gives the minimum and maximum bytes per character, and
 * { 0xfd, 0xff, 0, 0 } with length 4 presumably is the substitution character
 * U+FFFD in little-endian byte order - confirm against UConverterStaticData.
 */
987 static const UConverterStaticData _UTF32LEStaticData
= {
988 sizeof(UConverterStaticData
),
991 UCNV_IBM
, UCNV_UTF32_LittleEndian
, 4, 4,
992 { 0xfd, 0xff, 0, 0 }, 4, FALSE
, FALSE
,
995 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/* Shared-data object for UTF-32LE, built from the static data and implementation table above. */
999 const UConverterSharedData _UTF32LEData
=
1000 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32LEStaticData
, &_UTF32LEImpl
);
1002 /* UTF-32 (Detect BOM) ------------------------------------------------------ */
1005 * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
1020 * During detection: state&3==number of matching bytes so far.
1022 * On output, emit U+FEFF as the first code point.
/*
 * Resets the BOM-detecting UTF-32 converter.
 * For choice values up to and including UCNV_RESET_TO_UNICODE the toUnicode
 * side is reset; for every choice other than UCNV_RESET_TO_UNICODE the
 * fromUnicode side is armed to emit a BOM by setting fromUnicodeStatus to
 * UCNV_NEED_TO_WRITE_BOM.
 * NOTE(review): the statement that performs the toUnicode reset is not
 * visible in this listing; the existing comment says it sets state to 0.
 */
1025 static void U_CALLCONV
1026 _UTF32Reset(UConverter
*cnv
, UConverterResetChoice choice
) {
1027 if(choice
<=UCNV_RESET_TO_UNICODE
) {
1028 /* reset toUnicode: state=0 */
1031 if(choice
!=UCNV_RESET_TO_UNICODE
) {
1032 /* reset fromUnicode: prepare to output the UTF-32PE BOM */
1033 cnv
->fromUnicodeStatus
=UCNV_NEED_TO_WRITE_BOM
;
/*
 * Open hook for the BOM-detecting UTF-32 converter: initializes both
 * conversion directions by calling _UTF32Reset with UCNV_RESET_BOTH.
 * pArgs and pErrorCode are not used by the visible statements.
 */
1037 static void U_CALLCONV
1038 _UTF32Open(UConverter
*cnv
,
1039 UConverterLoadArgs
*pArgs
,
1040 UErrorCode
*pErrorCode
) {
1043 _UTF32Reset(cnv
, UCNV_RESET_BOTH
);
/*
 * The two UTF-32 byte order marks back to back: bytes 0-3 are 00 00 FE FF
 * (big-endian), bytes 4-7 are FF FE 00 00 (little-endian). The detector
 * indexes it with its state value and selects a BOM with (state & 4).
 */
1046 static const char utf32BOM
[8]={ 0, 0, (char)0xfe, (char)0xff, (char)0xff, (char)0xfe, 0, 0 };
/*
 * toUnicode entry point for the BOM-detecting UTF-32 converter.
 *
 * Runs a small state machine over the first input bytes, comparing them
 * against utf32BOM:
 * - a first byte that could start 00 00 FE FF moves to state 1, a first byte
 *   of FF moves to state 5, anything else selects state 8 (UTF-32BE);
 * - while bytes keep matching, a completed BOM selects state 8 (UTF-32BE) or
 *   state 9 (UTF-32LE) and records in offsetDelta how many BOM bytes were in
 *   this buffer;
 * - on a mismatch the converter falls back to UTF-32BE and replays the bytes
 *   seen so far, taking those that came from an earlier buffer from utf32BOM;
 * - in state 8 or 9 the remaining input is handed to the UTF-32BE or
 *   UTF-32LE toUnicode function (plain or offset-tracking variant).
 * Afterwards offsetDelta is added to every offset written during this call,
 * pArgs->source is updated, and if the input ended with flush set, a
 * truncated sequence is pushed through the selected converter.
 *
 * NOTE(review): this listing has lost lines (the switch and case labels, the
 * reads of the state and current byte, the state increments, the offsets
 * NULL tests and most braces); the comments describe only what is visible.
 */
1048 static void U_CALLCONV
1049 _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
1050 UErrorCode
*pErrorCode
) {
1051 UConverter
*cnv
=pArgs
->converter
;
1052 const char *source
=pArgs
->source
;
1053 const char *sourceLimit
=pArgs
->sourceLimit
;
1054 int32_t *offsets
=pArgs
->offsets
;
1056 int32_t state
, offsetDelta
;
1062 * If we detect a BOM in this buffer, then we must add the BOM size to the
1063 * offsets because the actual converter function will not see and count the BOM.
1064 * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1068 while(source
<sourceLimit
&& U_SUCCESS(*pErrorCode
)) {
1073 state
=1; /* could be 00 00 FE FF */
1074 } else if(b
==(char)0xff) {
1075 state
=5; /* could be FF FE 00 00 */
1077 state
=8; /* default to UTF-32BE */
/* Compare the next input byte with the BOM byte expected in this state. */
1088 if(*source
==utf32BOM
[state
]) {
1092 state
=8; /* detect UTF-32BE */
1093 offsetDelta
=(int32_t)(source
-pArgs
->source
);
1094 } else if(state
==8) {
1095 state
=9; /* detect UTF-32LE */
1096 offsetDelta
=(int32_t)(source
-pArgs
->source
);
1099 /* switch to UTF-32BE and pass the previous bytes */
1100 int32_t count
=(int32_t)(source
-pArgs
->source
); /* number of bytes from this buffer */
1102 /* reset the source */
1103 source
=pArgs
->source
;
1105 if(count
==(state
&3)) {
1106 /* simple: all in the same buffer, just reset source */
1108 UBool oldFlush
=pArgs
->flush
;
1110 /* some of the bytes are from a previous buffer, replay those first */
1111 pArgs
->source
=utf32BOM
+(state
&4); /* select the correct BOM */
1112 pArgs
->sourceLimit
=pArgs
->source
+((state
&3)-count
); /* replay previous bytes */
1113 pArgs
->flush
=FALSE
; /* this sourceLimit is not the real source stream limit */
1115 /* no offsets: bytes from previous buffer, and not enough for output */
1116 T_UConverter_toUnicode_UTF32_BE(pArgs
, pErrorCode
);
1118 /* restore real pointers; pArgs->source will be set in case 8/9 */
1119 pArgs
->sourceLimit
=sourceLimit
;
1120 pArgs
->flush
=oldFlush
;
/* Big-endian path: hand the rest of the buffer to the UTF-32BE converter. */
1128 pArgs
->source
=source
;
1130 T_UConverter_toUnicode_UTF32_BE(pArgs
, pErrorCode
);
1132 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs
, pErrorCode
);
1134 source
=pArgs
->source
;
/* Little-endian path: hand the rest of the buffer to the UTF-32LE converter. */
1138 pArgs
->source
=source
;
1140 T_UConverter_toUnicode_UTF32_LE(pArgs
, pErrorCode
);
1142 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs
, pErrorCode
);
1144 source
=pArgs
->source
;
1147 break; /* does not occur */
1151 /* add BOM size to offsets - see comment at offsetDelta declaration */
1152 if(offsets
!=NULL
&& offsetDelta
!=0) {
/* pArgs->offsets now points past the last offset written by the callee. */
1153 int32_t *offsetsLimit
=pArgs
->offsets
;
1154 while(offsets
<offsetsLimit
) {
1155 *offsets
++ += offsetDelta
;
1159 pArgs
->source
=source
;
1161 if(source
==sourceLimit
&& pArgs
->flush
) {
1162 /* handle truncated input */
1165 break; /* no input at all, nothing to do */
1167 T_UConverter_toUnicode_UTF32_BE(pArgs
, pErrorCode
);
1170 T_UConverter_toUnicode_UTF32_LE(pArgs
, pErrorCode
);
1173 /* handle 0<state<8: call UTF-32BE with too-short input */
1174 pArgs
->source
=utf32BOM
+(state
&4); /* select the correct BOM */
1175 pArgs
->sourceLimit
=pArgs
->source
+(state
&3); /* replay bytes */
1177 /* no offsets: not enough for output */
1178 T_UConverter_toUnicode_UTF32_BE(pArgs
, pErrorCode
);
/* Restore the caller's real source window after the replay. */
1179 pArgs
->source
=source
;
1180 pArgs
->sourceLimit
=sourceLimit
;
/*
 * getNextUChar entry point for the BOM-detecting UTF-32 converter.
 * Dispatches on pArgs->converter->mode to the UTF-32BE or UTF-32LE
 * single-code-point reader; otherwise returns UCNV_GET_NEXT_UCHAR_USE_TO_U.
 * NOTE(review): the case labels are not visible in this listing; presumably
 * they are the detected BE and LE states (8 and 9) with a default branch
 * for the not-yet-detected case - confirm.
 */
1189 static UChar32 U_CALLCONV
1190 _UTF32GetNextUChar(UConverterToUnicodeArgs
*pArgs
,
1191 UErrorCode
*pErrorCode
) {
1192 switch(pArgs
->converter
->mode
) {
1194 return T_UConverter_getNextUChar_UTF32_BE(pArgs
, pErrorCode
);
1196 return T_UConverter_getNextUChar_UTF32_LE(pArgs
, pErrorCode
);
1198 return UCNV_GET_NEXT_UCHAR_USE_TO_U
;
/*
 * Implementation table for the BOM-detecting UTF-32 converter.
 * _UTF32ToUnicodeWithOffsets is listed twice, filling two consecutive entries
 * (presumably the plain and offset-tracking toUnicode slots, as in the BE and
 * LE tables - confirm).
 * NOTE(review): both the BE and the LE fromUnicode pairs appear here;
 * presumably only one pair is compiled in, selected by a preprocessor
 * conditional that is not visible in this listing. Other initializer entries
 * are missing as well.
 */
1202 static const UConverterImpl _UTF32Impl
= {
1212 _UTF32ToUnicodeWithOffsets
,
1213 _UTF32ToUnicodeWithOffsets
,
1215 T_UConverter_fromUnicode_UTF32_BE
,
1216 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC
,
1218 T_UConverter_fromUnicode_UTF32_LE
,
1219 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC
,
1223 NULL
, /* ### TODO implement getStarters for all Unicode encodings?! */
1227 ucnv_getNonSurrogateUnicodeSet
,
1233 /* The 1236 CCSID refers to any version of Unicode with a BOM sensitive endianness of UTF-32 */
/*
 * Static description of the BOM-detecting UTF-32 converter.
 * NOTE(review): both a big-endian and a little-endian byte sequence appear
 * below; presumably they are the substitution character U+FFFD in each byte
 * order and only one is compiled in, selected by a preprocessor conditional
 * that is not visible in this listing.
 */
1234 static const UConverterStaticData _UTF32StaticData
= {
1235 sizeof(UConverterStaticData
),
1238 UCNV_IBM
, UCNV_UTF32
, 4, 4,
1240 { 0, 0, 0xff, 0xfd }, 4,
1242 { 0xfd, 0xff, 0, 0 }, 4,
1247 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/* Shared-data object for the BOM-detecting UTF-32 converter, built from the static data and implementation table above. */
1250 const UConverterSharedData _UTF32Data
=
1251 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32StaticData
, &_UTF32Impl
);