2 **********************************************************************
3 * Copyright (C) 2002-2004, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv_u32.c
8 * tab size: 8 (not used)
11 * created on: 2002jul01
12 * created by: Markus W. Scherer
14 * UTF-32 converter implementation. Used to be in ucnv_utf.c.
17 #include "unicode/utypes.h"
19 #if !UCONFIG_NO_CONVERSION
21 #include "unicode/ucnv.h"
26 #define MAXIMUM_UCS2 0x0000FFFF
27 #define MAXIMUM_UTF 0x0010FFFF
29 #define HALF_BASE 0x0010000
30 #define HALF_MASK 0x3FF
31 #define SURROGATE_HIGH_START 0xD800
32 #define SURROGATE_LOW_START 0xDC00
34 /* -SURROGATE_LOW_START + HALF_BASE */
35 #define SURROGATE_LOW_BASE 9216
37 /* UTF-32BE ----------------------------------------------------------------- */
40 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs
* args
,
43 const unsigned char *mySource
= (unsigned char *) args
->source
;
44 UChar
*myTarget
= args
->target
;
45 const unsigned char *sourceLimit
= (unsigned char *) args
->sourceLimit
;
46 const UChar
*targetLimit
= args
->targetLimit
;
47 unsigned char *toUBytes
= args
->converter
->toUBytes
;
50 /* UTF-8 returns here for only non-offset, this needs to change.*/
51 if (args
->converter
->toUnicodeStatus
&& myTarget
< targetLimit
) {
52 i
= args
->converter
->toULength
; /* restore # of bytes consumed */
54 ch
= args
->converter
->toUnicodeStatus
- 1;/*Stores the previously calculated ch from a previous call*/
55 args
->converter
->toUnicodeStatus
= 0;
59 while (mySource
< sourceLimit
&& myTarget
< targetLimit
) {
63 while (i
< sizeof(uint32_t)) {
64 if (mySource
< sourceLimit
) {
65 ch
= (ch
<< 8) | (uint8_t)(*mySource
);
66 toUBytes
[i
++] = (char) *(mySource
++);
69 /* stores a partially calculated target*/
70 /* + 1 to make 0 a valid character */
71 args
->converter
->toUnicodeStatus
= ch
+ 1;
72 args
->converter
->toULength
= (int8_t) i
;
77 if (ch
<= MAXIMUM_UTF
&& !U_IS_SURROGATE(ch
)) {
78 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
79 if (ch
<= MAXIMUM_UCS2
)
82 *(myTarget
++) = (UChar
) ch
;
85 /* write out the surrogates */
86 *(myTarget
++) = U16_LEAD(ch
);
88 if (myTarget
< targetLimit
) {
89 *(myTarget
++) = (UChar
)ch
;
92 /* Put in overflow buffer (not handled here) */
93 args
->converter
->UCharErrorBuffer
[0] = (UChar
) ch
;
94 args
->converter
->UCharErrorBufferLength
= 1;
95 *err
= U_BUFFER_OVERFLOW_ERROR
;
101 args
->converter
->toULength
= (int8_t)i
;
102 *err
= U_ILLEGAL_CHAR_FOUND
;
108 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
)) {
109 /* End of target buffer */
110 *err
= U_BUFFER_OVERFLOW_ERROR
;
113 args
->target
= myTarget
;
114 args
->source
= (const char *) mySource
;
118 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs
* args
,
121 const unsigned char *mySource
= (unsigned char *) args
->source
;
122 UChar
*myTarget
= args
->target
;
123 int32_t *myOffsets
= args
->offsets
;
124 const unsigned char *sourceLimit
= (unsigned char *) args
->sourceLimit
;
125 const UChar
*targetLimit
= args
->targetLimit
;
126 unsigned char *toUBytes
= args
->converter
->toUBytes
;
128 int32_t offsetNum
= 0;
130 if (args
->converter
->toUnicodeStatus
&& myTarget
< targetLimit
) {
131 i
= args
->converter
->toULength
; /* restore # of bytes consumed */
133 ch
= args
->converter
->toUnicodeStatus
- 1;/*Stores the previously calculated ch from a previous call*/
134 args
->converter
->toUnicodeStatus
= 0;
138 while (mySource
< sourceLimit
&& myTarget
< targetLimit
) {
142 while (i
< sizeof(uint32_t)) {
143 if (mySource
< sourceLimit
) {
144 ch
= (ch
<< 8) | (uint8_t)(*mySource
);
145 toUBytes
[i
++] = (char) *(mySource
++);
148 /* stores a partially calculated target*/
149 /* + 1 to make 0 a valid character */
150 args
->converter
->toUnicodeStatus
= ch
+ 1;
151 args
->converter
->toULength
= (int8_t) i
;
156 if (ch
<= MAXIMUM_UTF
&& !U_IS_SURROGATE(ch
)) {
157 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
158 if (ch
<= MAXIMUM_UCS2
) {
159 /* fits in 16 bits */
160 *(myTarget
++) = (UChar
) ch
;
161 *(myOffsets
++) = offsetNum
;
164 /* write out the surrogates */
165 *(myTarget
++) = U16_LEAD(ch
);
166 *myOffsets
++ = offsetNum
;
168 if (myTarget
< targetLimit
)
170 *(myTarget
++) = (UChar
)ch
;
171 *(myOffsets
++) = offsetNum
;
174 /* Put in overflow buffer (not handled here) */
175 args
->converter
->UCharErrorBuffer
[0] = (UChar
) ch
;
176 args
->converter
->UCharErrorBufferLength
= 1;
177 *err
= U_BUFFER_OVERFLOW_ERROR
;
183 args
->converter
->toULength
= (int8_t)i
;
184 *err
= U_ILLEGAL_CHAR_FOUND
;
191 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
193 /* End of target buffer */
194 *err
= U_BUFFER_OVERFLOW_ERROR
;
197 args
->target
= myTarget
;
198 args
->source
= (const char *) mySource
;
199 args
->offsets
= myOffsets
;
203 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs
* args
,
206 const UChar
*mySource
= args
->source
;
207 unsigned char *myTarget
= (unsigned char *) args
->target
;
208 const UChar
*sourceLimit
= args
->sourceLimit
;
209 const unsigned char *targetLimit
= (unsigned char *) args
->targetLimit
;
211 unsigned int indexToWrite
;
212 unsigned char temp
[sizeof(uint32_t)];
216 if (args
->converter
->fromUChar32
) {
217 ch
= args
->converter
->fromUChar32
;
218 args
->converter
->fromUChar32
= 0;
222 while (mySource
< sourceLimit
&& myTarget
< targetLimit
) {
225 if (UTF_IS_SURROGATE(ch
)) {
228 if (mySource
< sourceLimit
) {
230 if (U_IS_TRAIL(ch2
)) {
231 ch
= ((ch
- SURROGATE_HIGH_START
) << HALF_SHIFT
) + ch2
+ SURROGATE_LOW_BASE
;
235 /* this is an unmatched trail code unit (2nd surrogate) */
236 /* callback(illegal) */
237 args
->converter
->fromUChar32
= ch
;
238 *err
= U_ILLEGAL_CHAR_FOUND
;
243 /* ran out of source */
244 args
->converter
->fromUChar32
= ch
;
246 /* this is an unmatched trail code unit (2nd surrogate) */
247 /* callback(illegal) */
248 *err
= U_ILLEGAL_CHAR_FOUND
;
254 /* this is an unmatched trail code unit (2nd surrogate) */
255 /* callback(illegal) */
256 args
->converter
->fromUChar32
= ch
;
257 *err
= U_ILLEGAL_CHAR_FOUND
;
262 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
263 temp
[1] = (uint8_t) (ch
>> 16 & 0x1F);
264 temp
[2] = (uint8_t) (ch
>> 8); /* unsigned cast implicitly does (ch & FF) */
265 temp
[3] = (uint8_t) (ch
); /* unsigned cast implicitly does (ch & FF) */
267 for (indexToWrite
= 0; indexToWrite
<= sizeof(uint32_t) - 1; indexToWrite
++) {
268 if (myTarget
< targetLimit
) {
269 *(myTarget
++) = temp
[indexToWrite
];
272 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = temp
[indexToWrite
];
273 *err
= U_BUFFER_OVERFLOW_ERROR
;
278 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
)) {
279 *err
= U_BUFFER_OVERFLOW_ERROR
;
282 args
->target
= (char *) myTarget
;
283 args
->source
= mySource
;
287 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs
* args
,
290 const UChar
*mySource
= args
->source
;
291 unsigned char *myTarget
= (unsigned char *) args
->target
;
292 int32_t *myOffsets
= args
->offsets
;
293 const UChar
*sourceLimit
= args
->sourceLimit
;
294 const unsigned char *targetLimit
= (unsigned char *) args
->targetLimit
;
296 int32_t offsetNum
= 0;
297 unsigned int indexToWrite
;
298 unsigned char temp
[sizeof(uint32_t)];
302 if (args
->converter
->fromUChar32
) {
303 ch
= args
->converter
->fromUChar32
;
304 args
->converter
->fromUChar32
= 0;
308 while (mySource
< sourceLimit
&& myTarget
< targetLimit
) {
311 if (UTF_IS_SURROGATE(ch
)) {
314 if (mySource
< sourceLimit
) {
316 if (U_IS_TRAIL(ch2
)) {
317 ch
= ((ch
- SURROGATE_HIGH_START
) << HALF_SHIFT
) + ch2
+ SURROGATE_LOW_BASE
;
321 /* this is an unmatched trail code unit (2nd surrogate) */
322 /* callback(illegal) */
323 args
->converter
->fromUChar32
= ch
;
324 *err
= U_ILLEGAL_CHAR_FOUND
;
329 /* ran out of source */
330 args
->converter
->fromUChar32
= ch
;
332 /* this is an unmatched trail code unit (2nd surrogate) */
333 /* callback(illegal) */
334 *err
= U_ILLEGAL_CHAR_FOUND
;
340 /* this is an unmatched trail code unit (2nd surrogate) */
341 /* callback(illegal) */
342 args
->converter
->fromUChar32
= ch
;
343 *err
= U_ILLEGAL_CHAR_FOUND
;
348 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
349 temp
[1] = (uint8_t) (ch
>> 16 & 0x1F);
350 temp
[2] = (uint8_t) (ch
>> 8); /* unsigned cast implicitly does (ch & FF) */
351 temp
[3] = (uint8_t) (ch
); /* unsigned cast implicitly does (ch & FF) */
353 for (indexToWrite
= 0; indexToWrite
<= sizeof(uint32_t) - 1; indexToWrite
++) {
354 if (myTarget
< targetLimit
) {
355 *(myTarget
++) = temp
[indexToWrite
];
356 *(myOffsets
++) = offsetNum
;
359 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = temp
[indexToWrite
];
360 *err
= U_BUFFER_OVERFLOW_ERROR
;
366 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
)) {
367 *err
= U_BUFFER_OVERFLOW_ERROR
;
370 args
->target
= (char *) myTarget
;
371 args
->source
= mySource
;
372 args
->offsets
= myOffsets
;
376 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs
* args
,
379 const uint8_t *mySource
;
383 mySource
= (const uint8_t *)args
->source
;
384 if (mySource
>= (const uint8_t *)args
->sourceLimit
)
387 *err
= U_INDEX_OUTOFBOUNDS_ERROR
;
391 length
= (int32_t)((const uint8_t *)args
->sourceLimit
- mySource
);
394 /* got a partial character */
395 uprv_memcpy(args
->converter
->toUBytes
, mySource
, length
);
396 args
->converter
->toULength
= (int8_t)length
;
397 args
->source
= (const char *)(mySource
+ length
);
398 *err
= U_TRUNCATED_CHAR_FOUND
;
402 /* Don't even try to do a direct cast because the value may be on an odd address. */
403 myUChar
= ((UChar32
)mySource
[0] << 24)
404 | ((UChar32
)mySource
[1] << 16)
405 | ((UChar32
)mySource
[2] << 8)
406 | ((UChar32
)mySource
[3]);
408 args
->source
= (const char *)(mySource
+ 4);
409 if ((uint32_t)myUChar
<= MAXIMUM_UTF
&& !U_IS_SURROGATE(myUChar
)) {
413 uprv_memcpy(args
->converter
->toUBytes
, mySource
, 4);
414 args
->converter
->toULength
= 4;
416 *err
= U_ILLEGAL_CHAR_FOUND
;
420 static const UConverterImpl _UTF32BEImpl
= {
421 UCNV_UTF32_BigEndian
,
430 T_UConverter_toUnicode_UTF32_BE
,
431 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC
,
432 T_UConverter_fromUnicode_UTF32_BE
,
433 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC
,
434 T_UConverter_getNextUChar_UTF32_BE
,
440 ucnv_getCompleteUnicodeSet
443 /* The 1232 CCSID refers to any version of Unicode with any endianness of UTF-32 */
444 static const UConverterStaticData _UTF32BEStaticData
= {
445 sizeof(UConverterStaticData
),
448 UCNV_IBM
, UCNV_UTF32_BigEndian
, 4, 4,
449 { 0, 0, 0xff, 0xfd }, 4, FALSE
, FALSE
,
452 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
455 const UConverterSharedData _UTF32BEData
= {
456 sizeof(UConverterSharedData
), ~((uint32_t) 0),
457 NULL
, NULL
, &_UTF32BEStaticData
, FALSE
, &_UTF32BEImpl
,
461 /* UTF-32LE ---------------------------------------------------------- */
464 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs
* args
,
467 const unsigned char *mySource
= (unsigned char *) args
->source
;
468 UChar
*myTarget
= args
->target
;
469 const unsigned char *sourceLimit
= (unsigned char *) args
->sourceLimit
;
470 const UChar
*targetLimit
= args
->targetLimit
;
471 unsigned char *toUBytes
= args
->converter
->toUBytes
;
474 /* UTF-8 returns here for only non-offset, this needs to change.*/
475 if (args
->converter
->toUnicodeStatus
&& myTarget
< targetLimit
)
477 i
= args
->converter
->toULength
; /* restore # of bytes consumed */
479 /* Stores the previously calculated ch from a previous call*/
480 ch
= args
->converter
->toUnicodeStatus
- 1;
481 args
->converter
->toUnicodeStatus
= 0;
485 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
490 while (i
< sizeof(uint32_t))
492 if (mySource
< sourceLimit
)
494 ch
|= ((uint8_t)(*mySource
)) << (i
* 8);
495 toUBytes
[i
++] = (char) *(mySource
++);
499 /* stores a partially calculated target*/
500 /* + 1 to make 0 a valid character */
501 args
->converter
->toUnicodeStatus
= ch
+ 1;
502 args
->converter
->toULength
= (int8_t) i
;
507 if (ch
<= MAXIMUM_UTF
&& !U_IS_SURROGATE(ch
)) {
508 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
509 if (ch
<= MAXIMUM_UCS2
) {
510 /* fits in 16 bits */
511 *(myTarget
++) = (UChar
) ch
;
514 /* write out the surrogates */
515 *(myTarget
++) = U16_LEAD(ch
);
517 if (myTarget
< targetLimit
) {
518 *(myTarget
++) = (UChar
)ch
;
521 /* Put in overflow buffer (not handled here) */
522 args
->converter
->UCharErrorBuffer
[0] = (UChar
) ch
;
523 args
->converter
->UCharErrorBufferLength
= 1;
524 *err
= U_BUFFER_OVERFLOW_ERROR
;
530 args
->converter
->toULength
= (int8_t)i
;
531 *err
= U_ILLEGAL_CHAR_FOUND
;
537 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
539 /* End of target buffer */
540 *err
= U_BUFFER_OVERFLOW_ERROR
;
543 args
->target
= myTarget
;
544 args
->source
= (const char *) mySource
;
548 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs
* args
,
551 const unsigned char *mySource
= (unsigned char *) args
->source
;
552 UChar
*myTarget
= args
->target
;
553 int32_t *myOffsets
= args
->offsets
;
554 const unsigned char *sourceLimit
= (unsigned char *) args
->sourceLimit
;
555 const UChar
*targetLimit
= args
->targetLimit
;
556 unsigned char *toUBytes
= args
->converter
->toUBytes
;
558 int32_t offsetNum
= 0;
560 /* UTF-8 returns here for only non-offset, this needs to change.*/
561 if (args
->converter
->toUnicodeStatus
&& myTarget
< targetLimit
)
563 i
= args
->converter
->toULength
; /* restore # of bytes consumed */
565 /* Stores the previously calculated ch from a previous call*/
566 ch
= args
->converter
->toUnicodeStatus
- 1;
567 args
->converter
->toUnicodeStatus
= 0;
571 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
576 while (i
< sizeof(uint32_t))
578 if (mySource
< sourceLimit
)
580 ch
|= ((uint8_t)(*mySource
)) << (i
* 8);
581 toUBytes
[i
++] = (char) *(mySource
++);
585 /* stores a partially calculated target*/
586 /* + 1 to make 0 a valid character */
587 args
->converter
->toUnicodeStatus
= ch
+ 1;
588 args
->converter
->toULength
= (int8_t) i
;
593 if (ch
<= MAXIMUM_UTF
&& !U_IS_SURROGATE(ch
))
595 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
596 if (ch
<= MAXIMUM_UCS2
)
598 /* fits in 16 bits */
599 *(myTarget
++) = (UChar
) ch
;
600 *(myOffsets
++) = offsetNum
;
603 /* write out the surrogates */
604 *(myTarget
++) = U16_LEAD(ch
);
605 *(myOffsets
++) = offsetNum
;
607 if (myTarget
< targetLimit
)
609 *(myTarget
++) = (UChar
)ch
;
610 *(myOffsets
++) = offsetNum
;
614 /* Put in overflow buffer (not handled here) */
615 args
->converter
->UCharErrorBuffer
[0] = (UChar
) ch
;
616 args
->converter
->UCharErrorBufferLength
= 1;
617 *err
= U_BUFFER_OVERFLOW_ERROR
;
624 args
->converter
->toULength
= (int8_t)i
;
625 *err
= U_ILLEGAL_CHAR_FOUND
;
632 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
634 /* End of target buffer */
635 *err
= U_BUFFER_OVERFLOW_ERROR
;
638 args
->target
= myTarget
;
639 args
->source
= (const char *) mySource
;
640 args
->offsets
= myOffsets
;
644 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs
* args
,
647 const UChar
*mySource
= args
->source
;
648 unsigned char *myTarget
= (unsigned char *) args
->target
;
649 const UChar
*sourceLimit
= args
->sourceLimit
;
650 const unsigned char *targetLimit
= (unsigned char *) args
->targetLimit
;
652 unsigned int indexToWrite
;
653 unsigned char temp
[sizeof(uint32_t)];
657 if (args
->converter
->fromUChar32
)
659 ch
= args
->converter
->fromUChar32
;
660 args
->converter
->fromUChar32
= 0;
664 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
668 if (UTF_IS_SURROGATE(ch
)) {
672 if (mySource
< sourceLimit
)
675 if (U_IS_TRAIL(ch2
)) {
676 ch
= ((ch
- SURROGATE_HIGH_START
) << HALF_SHIFT
) + ch2
+ SURROGATE_LOW_BASE
;
680 /* this is an unmatched trail code unit (2nd surrogate) */
681 /* callback(illegal) */
682 args
->converter
->fromUChar32
= ch
;
683 *err
= U_ILLEGAL_CHAR_FOUND
;
688 /* ran out of source */
689 args
->converter
->fromUChar32
= ch
;
691 /* this is an unmatched trail code unit (2nd surrogate) */
692 /* callback(illegal) */
693 *err
= U_ILLEGAL_CHAR_FOUND
;
699 /* this is an unmatched trail code unit (2nd surrogate) */
700 /* callback(illegal) */
701 args
->converter
->fromUChar32
= ch
;
702 *err
= U_ILLEGAL_CHAR_FOUND
;
707 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
708 temp
[2] = (uint8_t) (ch
>> 16 & 0x1F);
709 temp
[1] = (uint8_t) (ch
>> 8); /* unsigned cast implicitly does (ch & FF) */
710 temp
[0] = (uint8_t) (ch
); /* unsigned cast implicitly does (ch & FF) */
712 for (indexToWrite
= 0; indexToWrite
<= sizeof(uint32_t) - 1; indexToWrite
++)
714 if (myTarget
< targetLimit
)
716 *(myTarget
++) = temp
[indexToWrite
];
720 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = temp
[indexToWrite
];
721 *err
= U_BUFFER_OVERFLOW_ERROR
;
726 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
728 *err
= U_BUFFER_OVERFLOW_ERROR
;
731 args
->target
= (char *) myTarget
;
732 args
->source
= mySource
;
736 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs
* args
,
739 const UChar
*mySource
= args
->source
;
740 unsigned char *myTarget
= (unsigned char *) args
->target
;
741 int32_t *myOffsets
= args
->offsets
;
742 const UChar
*sourceLimit
= args
->sourceLimit
;
743 const unsigned char *targetLimit
= (unsigned char *) args
->targetLimit
;
745 unsigned int indexToWrite
;
746 unsigned char temp
[sizeof(uint32_t)];
747 int32_t offsetNum
= 0;
751 if (args
->converter
->fromUChar32
)
753 ch
= args
->converter
->fromUChar32
;
754 args
->converter
->fromUChar32
= 0;
758 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
762 if (UTF_IS_SURROGATE(ch
)) {
766 if (mySource
< sourceLimit
)
771 ch
= ((ch
- SURROGATE_HIGH_START
) << HALF_SHIFT
) + ch2
+ SURROGATE_LOW_BASE
;
775 /* this is an unmatched trail code unit (2nd surrogate) */
776 /* callback(illegal) */
777 args
->converter
->fromUChar32
= ch
;
778 *err
= U_ILLEGAL_CHAR_FOUND
;
783 /* ran out of source */
784 args
->converter
->fromUChar32
= ch
;
786 /* this is an unmatched trail code unit (2nd surrogate) */
787 /* callback(illegal) */
788 *err
= U_ILLEGAL_CHAR_FOUND
;
794 /* this is an unmatched trail code unit (2nd surrogate) */
795 /* callback(illegal) */
796 args
->converter
->fromUChar32
= ch
;
797 *err
= U_ILLEGAL_CHAR_FOUND
;
802 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
803 temp
[2] = (uint8_t) (ch
>> 16 & 0x1F);
804 temp
[1] = (uint8_t) (ch
>> 8); /* unsigned cast implicitly does (ch & FF) */
805 temp
[0] = (uint8_t) (ch
); /* unsigned cast implicitly does (ch & FF) */
807 for (indexToWrite
= 0; indexToWrite
<= sizeof(uint32_t) - 1; indexToWrite
++)
809 if (myTarget
< targetLimit
)
811 *(myTarget
++) = temp
[indexToWrite
];
812 *(myOffsets
++) = offsetNum
;
816 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = temp
[indexToWrite
];
817 *err
= U_BUFFER_OVERFLOW_ERROR
;
823 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
825 *err
= U_BUFFER_OVERFLOW_ERROR
;
828 args
->target
= (char *) myTarget
;
829 args
->source
= mySource
;
830 args
->offsets
= myOffsets
;
834 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs
* args
,
837 const uint8_t *mySource
;
841 mySource
= (const uint8_t *)args
->source
;
842 if (mySource
>= (const uint8_t *)args
->sourceLimit
)
845 *err
= U_INDEX_OUTOFBOUNDS_ERROR
;
849 length
= (int32_t)((const uint8_t *)args
->sourceLimit
- mySource
);
852 /* got a partial character */
853 uprv_memcpy(args
->converter
->toUBytes
, mySource
, length
);
854 args
->converter
->toULength
= (int8_t)length
;
855 args
->source
= (const char *)(mySource
+ length
);
856 *err
= U_TRUNCATED_CHAR_FOUND
;
860 /* Don't even try to do a direct cast because the value may be on an odd address. */
861 myUChar
= ((UChar32
)mySource
[3] << 24)
862 | ((UChar32
)mySource
[2] << 16)
863 | ((UChar32
)mySource
[1] << 8)
864 | ((UChar32
)mySource
[0]);
866 args
->source
= (const char *)(mySource
+ 4);
867 if ((uint32_t)myUChar
<= MAXIMUM_UTF
&& !U_IS_SURROGATE(myUChar
)) {
871 uprv_memcpy(args
->converter
->toUBytes
, mySource
, 4);
872 args
->converter
->toULength
= 4;
874 *err
= U_ILLEGAL_CHAR_FOUND
;
878 static const UConverterImpl _UTF32LEImpl
= {
879 UCNV_UTF32_LittleEndian
,
888 T_UConverter_toUnicode_UTF32_LE
,
889 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC
,
890 T_UConverter_fromUnicode_UTF32_LE
,
891 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC
,
892 T_UConverter_getNextUChar_UTF32_LE
,
898 ucnv_getCompleteUnicodeSet
901 /* The 1232 CCSID refers to any version of Unicode with any endianness of UTF-32 */
902 static const UConverterStaticData _UTF32LEStaticData
= {
903 sizeof(UConverterStaticData
),
906 UCNV_IBM
, UCNV_UTF32_LittleEndian
, 4, 4,
907 { 0xfd, 0xff, 0, 0 }, 4, FALSE
, FALSE
,
910 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
914 const UConverterSharedData _UTF32LEData
= {
915 sizeof(UConverterSharedData
), ~((uint32_t) 0),
916 NULL
, NULL
, &_UTF32LEStaticData
, FALSE
, &_UTF32LEImpl
,
920 /* UTF-32 (Detect BOM) ------------------------------------------------------ */
923 * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
938 * During detection: state&3==number of matching bytes so far.
940 * On output, emit U+FEFF as the first code point.
944 _UTF32Reset(UConverter
*cnv
, UConverterResetChoice choice
) {
945 if(choice
<=UCNV_RESET_TO_UNICODE
) {
946 /* reset toUnicode: state=0 */
949 if(choice
!=UCNV_RESET_TO_UNICODE
) {
950 /* reset fromUnicode: prepare to output the UTF-32PE BOM */
951 cnv
->charErrorBufferLength
=4;
953 cnv
->charErrorBuffer
[0]=0;
954 cnv
->charErrorBuffer
[1]=0;
955 cnv
->charErrorBuffer
[2]=0xfe;
956 cnv
->charErrorBuffer
[3]=0xff;
958 cnv
->charErrorBuffer
[0]=0xff;
959 cnv
->charErrorBuffer
[1]=0xfe;
960 cnv
->charErrorBuffer
[2]=0;
961 cnv
->charErrorBuffer
[3]=0;
967 _UTF32Open(UConverter
*cnv
,
971 UErrorCode
*pErrorCode
) {
972 _UTF32Reset(cnv
, UCNV_RESET_BOTH
);
975 static const char utf32BOM
[8]={ 0, 0, (char)0xfe, (char)0xff, (char)0xff, (char)0xfe, 0, 0 };
978 _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
979 UErrorCode
*pErrorCode
) {
980 UConverter
*cnv
=pArgs
->converter
;
981 const char *source
=pArgs
->source
;
982 const char *sourceLimit
=pArgs
->sourceLimit
;
983 int32_t *offsets
=pArgs
->offsets
;
985 int32_t state
, offsetDelta
;
991 * If we detect a BOM in this buffer, then we must add the BOM size to the
992 * offsets because the actual converter function will not see and count the BOM.
993 * offsetDelta will have the number of the BOM bytes that are in the current buffer.
997 while(source
<sourceLimit
&& U_SUCCESS(*pErrorCode
)) {
1002 state
=1; /* could be 00 00 FE FF */
1003 } else if(b
==(char)0xff) {
1004 state
=5; /* could be FF FE 00 00 */
1006 state
=8; /* default to UTF-32BE */
1017 if(*source
==utf32BOM
[state
]) {
1021 state
=8; /* detect UTF-32BE */
1022 offsetDelta
=source
-pArgs
->source
;
1023 } else if(state
==8) {
1024 state
=9; /* detect UTF-32LE */
1025 offsetDelta
=source
-pArgs
->source
;
1028 /* switch to UTF-32BE and pass the previous bytes */
1029 int32_t count
=source
-pArgs
->source
; /* number of bytes from this buffer */
1031 /* reset the source */
1032 source
=pArgs
->source
;
1034 if(count
==(state
&3)) {
1035 /* simple: all in the same buffer, just reset source */
1037 UBool oldFlush
=pArgs
->flush
;
1039 /* some of the bytes are from a previous buffer, replay those first */
1040 pArgs
->source
=utf32BOM
+(state
&4); /* select the correct BOM */
1041 pArgs
->sourceLimit
=pArgs
->source
+((state
&3)-count
); /* replay previous bytes */
1042 pArgs
->flush
=FALSE
; /* this sourceLimit is not the real source stream limit */
1044 /* no offsets: bytes from previous buffer, and not enough for output */
1045 T_UConverter_toUnicode_UTF32_BE(pArgs
, pErrorCode
);
1047 /* restore real pointers; pArgs->source will be set in case 8/9 */
1048 pArgs
->sourceLimit
=sourceLimit
;
1049 pArgs
->flush
=oldFlush
;
1057 pArgs
->source
=source
;
1059 T_UConverter_toUnicode_UTF32_BE(pArgs
, pErrorCode
);
1061 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs
, pErrorCode
);
1063 source
=pArgs
->source
;
1067 pArgs
->source
=source
;
1069 T_UConverter_toUnicode_UTF32_LE(pArgs
, pErrorCode
);
1071 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs
, pErrorCode
);
1073 source
=pArgs
->source
;
1076 break; /* does not occur */
1080 /* add BOM size to offsets - see comment at offsetDelta declaration */
1081 if(offsets
!=NULL
&& offsetDelta
!=0) {
1082 int32_t *offsetsLimit
=pArgs
->offsets
;
1083 while(offsets
<offsetsLimit
) {
1084 *offsets
++ += offsetDelta
;
1088 pArgs
->source
=source
;
1090 if(source
==sourceLimit
&& pArgs
->flush
) {
1091 /* handle truncated input */
1094 break; /* no input at all, nothing to do */
1096 T_UConverter_toUnicode_UTF32_BE(pArgs
, pErrorCode
);
1099 T_UConverter_toUnicode_UTF32_LE(pArgs
, pErrorCode
);
1102 /* handle 0<state<8: call UTF-32BE with too-short input */
1103 pArgs
->source
=utf32BOM
+(state
&4); /* select the correct BOM */
1104 pArgs
->sourceLimit
=pArgs
->source
+(state
&3); /* replay bytes */
1106 /* no offsets: not enough for output */
1107 T_UConverter_toUnicode_UTF32_BE(pArgs
, pErrorCode
);
1108 pArgs
->source
=source
;
1109 pArgs
->sourceLimit
=sourceLimit
;
1119 _UTF32GetNextUChar(UConverterToUnicodeArgs
*pArgs
,
1120 UErrorCode
*pErrorCode
) {
1121 switch(pArgs
->converter
->mode
) {
1123 return T_UConverter_getNextUChar_UTF32_BE(pArgs
, pErrorCode
);
1125 return T_UConverter_getNextUChar_UTF32_LE(pArgs
, pErrorCode
);
1127 return UCNV_GET_NEXT_UCHAR_USE_TO_U
;
1131 static const UConverterImpl _UTF32Impl
= {
1141 _UTF32ToUnicodeWithOffsets
,
1142 _UTF32ToUnicodeWithOffsets
,
1144 T_UConverter_fromUnicode_UTF32_BE
,
1145 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC
,
1147 T_UConverter_fromUnicode_UTF32_LE
,
1148 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC
,
1152 NULL
, /* ### TODO implement getStarters for all Unicode encodings?! */
1156 ucnv_getCompleteUnicodeSet
1159 static const UConverterStaticData _UTF32StaticData
= {
1160 sizeof(UConverterStaticData
),
1162 0, /* ### TODO review correctness of all Unicode CCSIDs */
1163 UCNV_IBM
, UCNV_UTF32
, 4, 4,
1165 { 0, 0, 0xff, 0xfd }, 4,
1167 { 0xfd, 0xff, 0, 0 }, 4,
1172 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1175 const UConverterSharedData _UTF32Data
= {
1176 sizeof(UConverterSharedData
), ~((uint32_t) 0),
1177 NULL
, NULL
, &_UTF32StaticData
, FALSE
, &_UTF32Impl
,