/*
**********************************************************************
*   Copyright (C) 2002-2006, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   file name:  ucnv_u32.c
*   tab size:   8 (not used)
*
*   created on: 2002jul01
*   created by: Markus W. Scherer
*
*   UTF-32 converter implementation. Used to be in ucnv_utf.c.
*/
17 #include "unicode/utypes.h"
19 #if !UCONFIG_NO_CONVERSION
21 #include "unicode/ucnv.h"
/* Largest code point that fits in a single UTF-16 code unit. */
#define MAXIMUM_UCS2            0x0000FFFF
/* Largest valid Unicode code point. */
#define MAXIMUM_UTF             0x0010FFFF
/* Number of payload bits carried by each surrogate code unit. */
#define HALF_SHIFT              10
#define HALF_BASE               0x0010000
#define HALF_MASK               0x3FF
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_LOW_START     0xDC00

/* -SURROGATE_LOW_START + HALF_BASE */
#define SURROGATE_LOW_BASE      9216

/* fromUnicodeStatus value meaning "a BOM still has to be written". */
enum {
    UCNV_NEED_TO_WRITE_BOM=1
};
41 /* UTF-32BE ----------------------------------------------------------------- */
44 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs
* args
,
47 const unsigned char *mySource
= (unsigned char *) args
->source
;
48 UChar
*myTarget
= args
->target
;
49 const unsigned char *sourceLimit
= (unsigned char *) args
->sourceLimit
;
50 const UChar
*targetLimit
= args
->targetLimit
;
51 unsigned char *toUBytes
= args
->converter
->toUBytes
;
54 /* Restore state of current sequence */
55 if (args
->converter
->toUnicodeStatus
&& myTarget
< targetLimit
) {
56 i
= args
->converter
->toULength
; /* restore # of bytes consumed */
57 args
->converter
->toULength
= 0;
59 ch
= args
->converter
->toUnicodeStatus
- 1;/*Stores the previously calculated ch from a previous call*/
60 args
->converter
->toUnicodeStatus
= 0;
64 while (mySource
< sourceLimit
&& myTarget
< targetLimit
) {
68 while (i
< sizeof(uint32_t)) {
69 if (mySource
< sourceLimit
) {
70 ch
= (ch
<< 8) | (uint8_t)(*mySource
);
71 toUBytes
[i
++] = (char) *(mySource
++);
74 /* stores a partially calculated target*/
75 /* + 1 to make 0 a valid character */
76 args
->converter
->toUnicodeStatus
= ch
+ 1;
77 args
->converter
->toULength
= (int8_t) i
;
82 if (ch
<= MAXIMUM_UTF
&& !U_IS_SURROGATE(ch
)) {
83 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
84 if (ch
<= MAXIMUM_UCS2
)
87 *(myTarget
++) = (UChar
) ch
;
90 /* write out the surrogates */
91 *(myTarget
++) = U16_LEAD(ch
);
93 if (myTarget
< targetLimit
) {
94 *(myTarget
++) = (UChar
)ch
;
97 /* Put in overflow buffer (not handled here) */
98 args
->converter
->UCharErrorBuffer
[0] = (UChar
) ch
;
99 args
->converter
->UCharErrorBufferLength
= 1;
100 *err
= U_BUFFER_OVERFLOW_ERROR
;
106 args
->converter
->toULength
= (int8_t)i
;
107 *err
= U_ILLEGAL_CHAR_FOUND
;
113 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
)) {
114 /* End of target buffer */
115 *err
= U_BUFFER_OVERFLOW_ERROR
;
118 args
->target
= myTarget
;
119 args
->source
= (const char *) mySource
;
123 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs
* args
,
126 const unsigned char *mySource
= (unsigned char *) args
->source
;
127 UChar
*myTarget
= args
->target
;
128 int32_t *myOffsets
= args
->offsets
;
129 const unsigned char *sourceLimit
= (unsigned char *) args
->sourceLimit
;
130 const UChar
*targetLimit
= args
->targetLimit
;
131 unsigned char *toUBytes
= args
->converter
->toUBytes
;
133 int32_t offsetNum
= 0;
135 /* Restore state of current sequence */
136 if (args
->converter
->toUnicodeStatus
&& myTarget
< targetLimit
) {
137 i
= args
->converter
->toULength
; /* restore # of bytes consumed */
138 args
->converter
->toULength
= 0;
140 ch
= args
->converter
->toUnicodeStatus
- 1;/*Stores the previously calculated ch from a previous call*/
141 args
->converter
->toUnicodeStatus
= 0;
145 while (mySource
< sourceLimit
&& myTarget
< targetLimit
) {
149 while (i
< sizeof(uint32_t)) {
150 if (mySource
< sourceLimit
) {
151 ch
= (ch
<< 8) | (uint8_t)(*mySource
);
152 toUBytes
[i
++] = (char) *(mySource
++);
155 /* stores a partially calculated target*/
156 /* + 1 to make 0 a valid character */
157 args
->converter
->toUnicodeStatus
= ch
+ 1;
158 args
->converter
->toULength
= (int8_t) i
;
163 if (ch
<= MAXIMUM_UTF
&& !U_IS_SURROGATE(ch
)) {
164 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
165 if (ch
<= MAXIMUM_UCS2
) {
166 /* fits in 16 bits */
167 *(myTarget
++) = (UChar
) ch
;
168 *(myOffsets
++) = offsetNum
;
171 /* write out the surrogates */
172 *(myTarget
++) = U16_LEAD(ch
);
173 *myOffsets
++ = offsetNum
;
175 if (myTarget
< targetLimit
)
177 *(myTarget
++) = (UChar
)ch
;
178 *(myOffsets
++) = offsetNum
;
181 /* Put in overflow buffer (not handled here) */
182 args
->converter
->UCharErrorBuffer
[0] = (UChar
) ch
;
183 args
->converter
->UCharErrorBufferLength
= 1;
184 *err
= U_BUFFER_OVERFLOW_ERROR
;
190 args
->converter
->toULength
= (int8_t)i
;
191 *err
= U_ILLEGAL_CHAR_FOUND
;
198 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
200 /* End of target buffer */
201 *err
= U_BUFFER_OVERFLOW_ERROR
;
204 args
->target
= myTarget
;
205 args
->source
= (const char *) mySource
;
206 args
->offsets
= myOffsets
;
210 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs
* args
,
213 const UChar
*mySource
= args
->source
;
214 unsigned char *myTarget
;
215 const UChar
*sourceLimit
= args
->sourceLimit
;
216 const unsigned char *targetLimit
= (unsigned char *) args
->targetLimit
;
218 unsigned int indexToWrite
;
219 unsigned char temp
[sizeof(uint32_t)];
221 if(mySource
>= sourceLimit
) {
222 /* no input, nothing to do */
226 /* write the BOM if necessary */
227 if(args
->converter
->fromUnicodeStatus
==UCNV_NEED_TO_WRITE_BOM
) {
228 static const char bom
[]={ 0, 0, (char)0xfe, (char)0xff };
229 ucnv_fromUWriteBytes(args
->converter
,
231 &args
->target
, args
->targetLimit
,
234 args
->converter
->fromUnicodeStatus
=0;
237 myTarget
= (unsigned char *) args
->target
;
240 if (args
->converter
->fromUChar32
) {
241 ch
= args
->converter
->fromUChar32
;
242 args
->converter
->fromUChar32
= 0;
246 while (mySource
< sourceLimit
&& myTarget
< targetLimit
) {
249 if (UTF_IS_SURROGATE(ch
)) {
252 if (mySource
< sourceLimit
) {
254 if (U_IS_TRAIL(ch2
)) {
255 ch
= ((ch
- SURROGATE_HIGH_START
) << HALF_SHIFT
) + ch2
+ SURROGATE_LOW_BASE
;
259 /* this is an unmatched trail code unit (2nd surrogate) */
260 /* callback(illegal) */
261 args
->converter
->fromUChar32
= ch
;
262 *err
= U_ILLEGAL_CHAR_FOUND
;
267 /* ran out of source */
268 args
->converter
->fromUChar32
= ch
;
270 /* this is an unmatched trail code unit (2nd surrogate) */
271 /* callback(illegal) */
272 *err
= U_ILLEGAL_CHAR_FOUND
;
278 /* this is an unmatched trail code unit (2nd surrogate) */
279 /* callback(illegal) */
280 args
->converter
->fromUChar32
= ch
;
281 *err
= U_ILLEGAL_CHAR_FOUND
;
286 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
287 temp
[1] = (uint8_t) (ch
>> 16 & 0x1F);
288 temp
[2] = (uint8_t) (ch
>> 8); /* unsigned cast implicitly does (ch & FF) */
289 temp
[3] = (uint8_t) (ch
); /* unsigned cast implicitly does (ch & FF) */
291 for (indexToWrite
= 0; indexToWrite
<= sizeof(uint32_t) - 1; indexToWrite
++) {
292 if (myTarget
< targetLimit
) {
293 *(myTarget
++) = temp
[indexToWrite
];
296 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = temp
[indexToWrite
];
297 *err
= U_BUFFER_OVERFLOW_ERROR
;
302 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
)) {
303 *err
= U_BUFFER_OVERFLOW_ERROR
;
306 args
->target
= (char *) myTarget
;
307 args
->source
= mySource
;
311 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs
* args
,
314 const UChar
*mySource
= args
->source
;
315 unsigned char *myTarget
;
317 const UChar
*sourceLimit
= args
->sourceLimit
;
318 const unsigned char *targetLimit
= (unsigned char *) args
->targetLimit
;
320 int32_t offsetNum
= 0;
321 unsigned int indexToWrite
;
322 unsigned char temp
[sizeof(uint32_t)];
324 if(mySource
>= sourceLimit
) {
325 /* no input, nothing to do */
329 /* write the BOM if necessary */
330 if(args
->converter
->fromUnicodeStatus
==UCNV_NEED_TO_WRITE_BOM
) {
331 static const char bom
[]={ 0, 0, (char)0xfe, (char)0xff };
332 ucnv_fromUWriteBytes(args
->converter
,
334 &args
->target
, args
->targetLimit
,
337 args
->converter
->fromUnicodeStatus
=0;
340 myTarget
= (unsigned char *) args
->target
;
341 myOffsets
= args
->offsets
;
344 if (args
->converter
->fromUChar32
) {
345 ch
= args
->converter
->fromUChar32
;
346 args
->converter
->fromUChar32
= 0;
350 while (mySource
< sourceLimit
&& myTarget
< targetLimit
) {
353 if (UTF_IS_SURROGATE(ch
)) {
356 if (mySource
< sourceLimit
) {
358 if (U_IS_TRAIL(ch2
)) {
359 ch
= ((ch
- SURROGATE_HIGH_START
) << HALF_SHIFT
) + ch2
+ SURROGATE_LOW_BASE
;
363 /* this is an unmatched trail code unit (2nd surrogate) */
364 /* callback(illegal) */
365 args
->converter
->fromUChar32
= ch
;
366 *err
= U_ILLEGAL_CHAR_FOUND
;
371 /* ran out of source */
372 args
->converter
->fromUChar32
= ch
;
374 /* this is an unmatched trail code unit (2nd surrogate) */
375 /* callback(illegal) */
376 *err
= U_ILLEGAL_CHAR_FOUND
;
382 /* this is an unmatched trail code unit (2nd surrogate) */
383 /* callback(illegal) */
384 args
->converter
->fromUChar32
= ch
;
385 *err
= U_ILLEGAL_CHAR_FOUND
;
390 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
391 temp
[1] = (uint8_t) (ch
>> 16 & 0x1F);
392 temp
[2] = (uint8_t) (ch
>> 8); /* unsigned cast implicitly does (ch & FF) */
393 temp
[3] = (uint8_t) (ch
); /* unsigned cast implicitly does (ch & FF) */
395 for (indexToWrite
= 0; indexToWrite
<= sizeof(uint32_t) - 1; indexToWrite
++) {
396 if (myTarget
< targetLimit
) {
397 *(myTarget
++) = temp
[indexToWrite
];
398 *(myOffsets
++) = offsetNum
;
401 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = temp
[indexToWrite
];
402 *err
= U_BUFFER_OVERFLOW_ERROR
;
405 offsetNum
= offsetNum
+ 1 + (temp
[1] != 0);
408 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
)) {
409 *err
= U_BUFFER_OVERFLOW_ERROR
;
412 args
->target
= (char *) myTarget
;
413 args
->source
= mySource
;
414 args
->offsets
= myOffsets
;
418 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs
* args
,
421 const uint8_t *mySource
;
425 mySource
= (const uint8_t *)args
->source
;
426 if (mySource
>= (const uint8_t *)args
->sourceLimit
)
429 *err
= U_INDEX_OUTOFBOUNDS_ERROR
;
433 length
= (int32_t)((const uint8_t *)args
->sourceLimit
- mySource
);
436 /* got a partial character */
437 uprv_memcpy(args
->converter
->toUBytes
, mySource
, length
);
438 args
->converter
->toULength
= (int8_t)length
;
439 args
->source
= (const char *)(mySource
+ length
);
440 *err
= U_TRUNCATED_CHAR_FOUND
;
444 /* Don't even try to do a direct cast because the value may be on an odd address. */
445 myUChar
= ((UChar32
)mySource
[0] << 24)
446 | ((UChar32
)mySource
[1] << 16)
447 | ((UChar32
)mySource
[2] << 8)
448 | ((UChar32
)mySource
[3]);
450 args
->source
= (const char *)(mySource
+ 4);
451 if ((uint32_t)myUChar
<= MAXIMUM_UTF
&& !U_IS_SURROGATE(myUChar
)) {
455 uprv_memcpy(args
->converter
->toUBytes
, mySource
, 4);
456 args
->converter
->toULength
= 4;
458 *err
= U_ILLEGAL_CHAR_FOUND
;
462 static const UConverterImpl _UTF32BEImpl
= {
463 UCNV_UTF32_BigEndian
,
472 T_UConverter_toUnicode_UTF32_BE
,
473 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC
,
474 T_UConverter_fromUnicode_UTF32_BE
,
475 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC
,
476 T_UConverter_getNextUChar_UTF32_BE
,
482 ucnv_getNonSurrogateUnicodeSet
485 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
486 static const UConverterStaticData _UTF32BEStaticData
= {
487 sizeof(UConverterStaticData
),
490 UCNV_IBM
, UCNV_UTF32_BigEndian
, 4, 4,
491 { 0, 0, 0xff, 0xfd }, 4, FALSE
, FALSE
,
494 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
497 const UConverterSharedData _UTF32BEData
= {
498 sizeof(UConverterSharedData
), ~((uint32_t) 0),
499 NULL
, NULL
, &_UTF32BEStaticData
, FALSE
, &_UTF32BEImpl
,
503 /* UTF-32LE ---------------------------------------------------------- */
506 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs
* args
,
509 const unsigned char *mySource
= (unsigned char *) args
->source
;
510 UChar
*myTarget
= args
->target
;
511 const unsigned char *sourceLimit
= (unsigned char *) args
->sourceLimit
;
512 const UChar
*targetLimit
= args
->targetLimit
;
513 unsigned char *toUBytes
= args
->converter
->toUBytes
;
516 /* Restore state of current sequence */
517 if (args
->converter
->toUnicodeStatus
&& myTarget
< targetLimit
)
519 i
= args
->converter
->toULength
; /* restore # of bytes consumed */
520 args
->converter
->toULength
= 0;
522 /* Stores the previously calculated ch from a previous call*/
523 ch
= args
->converter
->toUnicodeStatus
- 1;
524 args
->converter
->toUnicodeStatus
= 0;
528 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
533 while (i
< sizeof(uint32_t))
535 if (mySource
< sourceLimit
)
537 ch
|= ((uint8_t)(*mySource
)) << (i
* 8);
538 toUBytes
[i
++] = (char) *(mySource
++);
542 /* stores a partially calculated target*/
543 /* + 1 to make 0 a valid character */
544 args
->converter
->toUnicodeStatus
= ch
+ 1;
545 args
->converter
->toULength
= (int8_t) i
;
550 if (ch
<= MAXIMUM_UTF
&& !U_IS_SURROGATE(ch
)) {
551 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
552 if (ch
<= MAXIMUM_UCS2
) {
553 /* fits in 16 bits */
554 *(myTarget
++) = (UChar
) ch
;
557 /* write out the surrogates */
558 *(myTarget
++) = U16_LEAD(ch
);
560 if (myTarget
< targetLimit
) {
561 *(myTarget
++) = (UChar
)ch
;
564 /* Put in overflow buffer (not handled here) */
565 args
->converter
->UCharErrorBuffer
[0] = (UChar
) ch
;
566 args
->converter
->UCharErrorBufferLength
= 1;
567 *err
= U_BUFFER_OVERFLOW_ERROR
;
573 args
->converter
->toULength
= (int8_t)i
;
574 *err
= U_ILLEGAL_CHAR_FOUND
;
580 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
582 /* End of target buffer */
583 *err
= U_BUFFER_OVERFLOW_ERROR
;
586 args
->target
= myTarget
;
587 args
->source
= (const char *) mySource
;
591 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs
* args
,
594 const unsigned char *mySource
= (unsigned char *) args
->source
;
595 UChar
*myTarget
= args
->target
;
596 int32_t *myOffsets
= args
->offsets
;
597 const unsigned char *sourceLimit
= (unsigned char *) args
->sourceLimit
;
598 const UChar
*targetLimit
= args
->targetLimit
;
599 unsigned char *toUBytes
= args
->converter
->toUBytes
;
601 int32_t offsetNum
= 0;
603 /* Restore state of current sequence */
604 if (args
->converter
->toUnicodeStatus
&& myTarget
< targetLimit
)
606 i
= args
->converter
->toULength
; /* restore # of bytes consumed */
607 args
->converter
->toULength
= 0;
609 /* Stores the previously calculated ch from a previous call*/
610 ch
= args
->converter
->toUnicodeStatus
- 1;
611 args
->converter
->toUnicodeStatus
= 0;
615 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
620 while (i
< sizeof(uint32_t))
622 if (mySource
< sourceLimit
)
624 ch
|= ((uint8_t)(*mySource
)) << (i
* 8);
625 toUBytes
[i
++] = (char) *(mySource
++);
629 /* stores a partially calculated target*/
630 /* + 1 to make 0 a valid character */
631 args
->converter
->toUnicodeStatus
= ch
+ 1;
632 args
->converter
->toULength
= (int8_t) i
;
637 if (ch
<= MAXIMUM_UTF
&& !U_IS_SURROGATE(ch
))
639 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
640 if (ch
<= MAXIMUM_UCS2
)
642 /* fits in 16 bits */
643 *(myTarget
++) = (UChar
) ch
;
644 *(myOffsets
++) = offsetNum
;
647 /* write out the surrogates */
648 *(myTarget
++) = U16_LEAD(ch
);
649 *(myOffsets
++) = offsetNum
;
651 if (myTarget
< targetLimit
)
653 *(myTarget
++) = (UChar
)ch
;
654 *(myOffsets
++) = offsetNum
;
658 /* Put in overflow buffer (not handled here) */
659 args
->converter
->UCharErrorBuffer
[0] = (UChar
) ch
;
660 args
->converter
->UCharErrorBufferLength
= 1;
661 *err
= U_BUFFER_OVERFLOW_ERROR
;
668 args
->converter
->toULength
= (int8_t)i
;
669 *err
= U_ILLEGAL_CHAR_FOUND
;
676 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
678 /* End of target buffer */
679 *err
= U_BUFFER_OVERFLOW_ERROR
;
682 args
->target
= myTarget
;
683 args
->source
= (const char *) mySource
;
684 args
->offsets
= myOffsets
;
688 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs
* args
,
691 const UChar
*mySource
= args
->source
;
692 unsigned char *myTarget
;
693 const UChar
*sourceLimit
= args
->sourceLimit
;
694 const unsigned char *targetLimit
= (unsigned char *) args
->targetLimit
;
696 unsigned int indexToWrite
;
697 unsigned char temp
[sizeof(uint32_t)];
699 if(mySource
>= sourceLimit
) {
700 /* no input, nothing to do */
704 /* write the BOM if necessary */
705 if(args
->converter
->fromUnicodeStatus
==UCNV_NEED_TO_WRITE_BOM
) {
706 static const char bom
[]={ (char)0xff, (char)0xfe, 0, 0 };
707 ucnv_fromUWriteBytes(args
->converter
,
709 &args
->target
, args
->targetLimit
,
712 args
->converter
->fromUnicodeStatus
=0;
715 myTarget
= (unsigned char *) args
->target
;
718 if (args
->converter
->fromUChar32
)
720 ch
= args
->converter
->fromUChar32
;
721 args
->converter
->fromUChar32
= 0;
725 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
729 if (UTF_IS_SURROGATE(ch
)) {
733 if (mySource
< sourceLimit
)
736 if (U_IS_TRAIL(ch2
)) {
737 ch
= ((ch
- SURROGATE_HIGH_START
) << HALF_SHIFT
) + ch2
+ SURROGATE_LOW_BASE
;
741 /* this is an unmatched trail code unit (2nd surrogate) */
742 /* callback(illegal) */
743 args
->converter
->fromUChar32
= ch
;
744 *err
= U_ILLEGAL_CHAR_FOUND
;
749 /* ran out of source */
750 args
->converter
->fromUChar32
= ch
;
752 /* this is an unmatched trail code unit (2nd surrogate) */
753 /* callback(illegal) */
754 *err
= U_ILLEGAL_CHAR_FOUND
;
760 /* this is an unmatched trail code unit (2nd surrogate) */
761 /* callback(illegal) */
762 args
->converter
->fromUChar32
= ch
;
763 *err
= U_ILLEGAL_CHAR_FOUND
;
768 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
769 temp
[2] = (uint8_t) (ch
>> 16 & 0x1F);
770 temp
[1] = (uint8_t) (ch
>> 8); /* unsigned cast implicitly does (ch & FF) */
771 temp
[0] = (uint8_t) (ch
); /* unsigned cast implicitly does (ch & FF) */
773 for (indexToWrite
= 0; indexToWrite
<= sizeof(uint32_t) - 1; indexToWrite
++)
775 if (myTarget
< targetLimit
)
777 *(myTarget
++) = temp
[indexToWrite
];
781 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = temp
[indexToWrite
];
782 *err
= U_BUFFER_OVERFLOW_ERROR
;
787 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
789 *err
= U_BUFFER_OVERFLOW_ERROR
;
792 args
->target
= (char *) myTarget
;
793 args
->source
= mySource
;
797 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs
* args
,
800 const UChar
*mySource
= args
->source
;
801 unsigned char *myTarget
;
803 const UChar
*sourceLimit
= args
->sourceLimit
;
804 const unsigned char *targetLimit
= (unsigned char *) args
->targetLimit
;
806 unsigned int indexToWrite
;
807 unsigned char temp
[sizeof(uint32_t)];
808 int32_t offsetNum
= 0;
810 if(mySource
>= sourceLimit
) {
811 /* no input, nothing to do */
815 /* write the BOM if necessary */
816 if(args
->converter
->fromUnicodeStatus
==UCNV_NEED_TO_WRITE_BOM
) {
817 static const char bom
[]={ (char)0xff, (char)0xfe, 0, 0 };
818 ucnv_fromUWriteBytes(args
->converter
,
820 &args
->target
, args
->targetLimit
,
823 args
->converter
->fromUnicodeStatus
=0;
826 myTarget
= (unsigned char *) args
->target
;
827 myOffsets
= args
->offsets
;
830 if (args
->converter
->fromUChar32
)
832 ch
= args
->converter
->fromUChar32
;
833 args
->converter
->fromUChar32
= 0;
837 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
841 if (UTF_IS_SURROGATE(ch
)) {
845 if (mySource
< sourceLimit
)
850 ch
= ((ch
- SURROGATE_HIGH_START
) << HALF_SHIFT
) + ch2
+ SURROGATE_LOW_BASE
;
854 /* this is an unmatched trail code unit (2nd surrogate) */
855 /* callback(illegal) */
856 args
->converter
->fromUChar32
= ch
;
857 *err
= U_ILLEGAL_CHAR_FOUND
;
862 /* ran out of source */
863 args
->converter
->fromUChar32
= ch
;
865 /* this is an unmatched trail code unit (2nd surrogate) */
866 /* callback(illegal) */
867 *err
= U_ILLEGAL_CHAR_FOUND
;
873 /* this is an unmatched trail code unit (2nd surrogate) */
874 /* callback(illegal) */
875 args
->converter
->fromUChar32
= ch
;
876 *err
= U_ILLEGAL_CHAR_FOUND
;
881 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
882 temp
[2] = (uint8_t) (ch
>> 16 & 0x1F);
883 temp
[1] = (uint8_t) (ch
>> 8); /* unsigned cast implicitly does (ch & FF) */
884 temp
[0] = (uint8_t) (ch
); /* unsigned cast implicitly does (ch & FF) */
886 for (indexToWrite
= 0; indexToWrite
<= sizeof(uint32_t) - 1; indexToWrite
++)
888 if (myTarget
< targetLimit
)
890 *(myTarget
++) = temp
[indexToWrite
];
891 *(myOffsets
++) = offsetNum
;
895 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = temp
[indexToWrite
];
896 *err
= U_BUFFER_OVERFLOW_ERROR
;
899 offsetNum
= offsetNum
+ 1 + (temp
[2] != 0);
902 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
904 *err
= U_BUFFER_OVERFLOW_ERROR
;
907 args
->target
= (char *) myTarget
;
908 args
->source
= mySource
;
909 args
->offsets
= myOffsets
;
913 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs
* args
,
916 const uint8_t *mySource
;
920 mySource
= (const uint8_t *)args
->source
;
921 if (mySource
>= (const uint8_t *)args
->sourceLimit
)
924 *err
= U_INDEX_OUTOFBOUNDS_ERROR
;
928 length
= (int32_t)((const uint8_t *)args
->sourceLimit
- mySource
);
931 /* got a partial character */
932 uprv_memcpy(args
->converter
->toUBytes
, mySource
, length
);
933 args
->converter
->toULength
= (int8_t)length
;
934 args
->source
= (const char *)(mySource
+ length
);
935 *err
= U_TRUNCATED_CHAR_FOUND
;
939 /* Don't even try to do a direct cast because the value may be on an odd address. */
940 myUChar
= ((UChar32
)mySource
[3] << 24)
941 | ((UChar32
)mySource
[2] << 16)
942 | ((UChar32
)mySource
[1] << 8)
943 | ((UChar32
)mySource
[0]);
945 args
->source
= (const char *)(mySource
+ 4);
946 if ((uint32_t)myUChar
<= MAXIMUM_UTF
&& !U_IS_SURROGATE(myUChar
)) {
950 uprv_memcpy(args
->converter
->toUBytes
, mySource
, 4);
951 args
->converter
->toULength
= 4;
953 *err
= U_ILLEGAL_CHAR_FOUND
;
957 static const UConverterImpl _UTF32LEImpl
= {
958 UCNV_UTF32_LittleEndian
,
967 T_UConverter_toUnicode_UTF32_LE
,
968 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC
,
969 T_UConverter_fromUnicode_UTF32_LE
,
970 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC
,
971 T_UConverter_getNextUChar_UTF32_LE
,
977 ucnv_getNonSurrogateUnicodeSet
980 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
981 static const UConverterStaticData _UTF32LEStaticData
= {
982 sizeof(UConverterStaticData
),
985 UCNV_IBM
, UCNV_UTF32_LittleEndian
, 4, 4,
986 { 0xfd, 0xff, 0, 0 }, 4, FALSE
, FALSE
,
989 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
993 const UConverterSharedData _UTF32LEData
= {
994 sizeof(UConverterSharedData
), ~((uint32_t) 0),
995 NULL
, NULL
, &_UTF32LEStaticData
, FALSE
, &_UTF32LEImpl
,
999 /* UTF-32 (Detect BOM) ------------------------------------------------------ */
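/*
 * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
 * accordingly.
 *
 * State values:
 * 0    initial state
 * 1    saw 00
 * 2    saw 00 00
 * 3    saw 00 00 FE
 * 4    -
 * 5    saw FF
 * 6    saw FF FE
 * 7    saw FF FE 00
 * 8    UTF-32BE mode
 * 9    UTF-32LE mode
 *
 * During detection: state&3==number of matching bytes so far.
 *
 * On output, emit U+FEFF as the first code point.
 */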
1023 _UTF32Reset(UConverter
*cnv
, UConverterResetChoice choice
) {
1024 if(choice
<=UCNV_RESET_TO_UNICODE
) {
1025 /* reset toUnicode: state=0 */
1028 if(choice
!=UCNV_RESET_TO_UNICODE
) {
1029 /* reset fromUnicode: prepare to output the UTF-32PE BOM */
1030 cnv
->fromUnicodeStatus
=UCNV_NEED_TO_WRITE_BOM
;
1035 _UTF32Open(UConverter
*cnv
,
1039 UErrorCode
*pErrorCode
) {
1040 _UTF32Reset(cnv
, UCNV_RESET_BOTH
);
/*
 * Both byte order marks back to back: bytes 0..3 are the UTF-32BE BOM,
 * bytes 4..7 the UTF-32LE BOM. The detection state doubles as an index
 * into this table.
 */
static const char utf32BOM[8]={
    0, 0, (char)0xfe, (char)0xff,   /* 00 00 FE FF */
    (char)0xff, (char)0xfe, 0, 0    /* FF FE 00 00 */
};
1046 _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
1047 UErrorCode
*pErrorCode
) {
1048 UConverter
*cnv
=pArgs
->converter
;
1049 const char *source
=pArgs
->source
;
1050 const char *sourceLimit
=pArgs
->sourceLimit
;
1051 int32_t *offsets
=pArgs
->offsets
;
1053 int32_t state
, offsetDelta
;
1059 * If we detect a BOM in this buffer, then we must add the BOM size to the
1060 * offsets because the actual converter function will not see and count the BOM.
1061 * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1065 while(source
<sourceLimit
&& U_SUCCESS(*pErrorCode
)) {
1070 state
=1; /* could be 00 00 FE FF */
1071 } else if(b
==(char)0xff) {
1072 state
=5; /* could be FF FE 00 00 */
1074 state
=8; /* default to UTF-32BE */
1085 if(*source
==utf32BOM
[state
]) {
1089 state
=8; /* detect UTF-32BE */
1090 offsetDelta
=(int32_t)(source
-pArgs
->source
);
1091 } else if(state
==8) {
1092 state
=9; /* detect UTF-32LE */
1093 offsetDelta
=(int32_t)(source
-pArgs
->source
);
1096 /* switch to UTF-32BE and pass the previous bytes */
1097 int32_t count
=(int32_t)(source
-pArgs
->source
); /* number of bytes from this buffer */
1099 /* reset the source */
1100 source
=pArgs
->source
;
1102 if(count
==(state
&3)) {
1103 /* simple: all in the same buffer, just reset source */
1105 UBool oldFlush
=pArgs
->flush
;
1107 /* some of the bytes are from a previous buffer, replay those first */
1108 pArgs
->source
=utf32BOM
+(state
&4); /* select the correct BOM */
1109 pArgs
->sourceLimit
=pArgs
->source
+((state
&3)-count
); /* replay previous bytes */
1110 pArgs
->flush
=FALSE
; /* this sourceLimit is not the real source stream limit */
1112 /* no offsets: bytes from previous buffer, and not enough for output */
1113 T_UConverter_toUnicode_UTF32_BE(pArgs
, pErrorCode
);
1115 /* restore real pointers; pArgs->source will be set in case 8/9 */
1116 pArgs
->sourceLimit
=sourceLimit
;
1117 pArgs
->flush
=oldFlush
;
1125 pArgs
->source
=source
;
1127 T_UConverter_toUnicode_UTF32_BE(pArgs
, pErrorCode
);
1129 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs
, pErrorCode
);
1131 source
=pArgs
->source
;
1135 pArgs
->source
=source
;
1137 T_UConverter_toUnicode_UTF32_LE(pArgs
, pErrorCode
);
1139 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs
, pErrorCode
);
1141 source
=pArgs
->source
;
1144 break; /* does not occur */
1148 /* add BOM size to offsets - see comment at offsetDelta declaration */
1149 if(offsets
!=NULL
&& offsetDelta
!=0) {
1150 int32_t *offsetsLimit
=pArgs
->offsets
;
1151 while(offsets
<offsetsLimit
) {
1152 *offsets
++ += offsetDelta
;
1156 pArgs
->source
=source
;
1158 if(source
==sourceLimit
&& pArgs
->flush
) {
1159 /* handle truncated input */
1162 break; /* no input at all, nothing to do */
1164 T_UConverter_toUnicode_UTF32_BE(pArgs
, pErrorCode
);
1167 T_UConverter_toUnicode_UTF32_LE(pArgs
, pErrorCode
);
1170 /* handle 0<state<8: call UTF-32BE with too-short input */
1171 pArgs
->source
=utf32BOM
+(state
&4); /* select the correct BOM */
1172 pArgs
->sourceLimit
=pArgs
->source
+(state
&3); /* replay bytes */
1174 /* no offsets: not enough for output */
1175 T_UConverter_toUnicode_UTF32_BE(pArgs
, pErrorCode
);
1176 pArgs
->source
=source
;
1177 pArgs
->sourceLimit
=sourceLimit
;
1187 _UTF32GetNextUChar(UConverterToUnicodeArgs
*pArgs
,
1188 UErrorCode
*pErrorCode
) {
1189 switch(pArgs
->converter
->mode
) {
1191 return T_UConverter_getNextUChar_UTF32_BE(pArgs
, pErrorCode
);
1193 return T_UConverter_getNextUChar_UTF32_LE(pArgs
, pErrorCode
);
1195 return UCNV_GET_NEXT_UCHAR_USE_TO_U
;
1199 static const UConverterImpl _UTF32Impl
= {
1209 _UTF32ToUnicodeWithOffsets
,
1210 _UTF32ToUnicodeWithOffsets
,
1212 T_UConverter_fromUnicode_UTF32_BE
,
1213 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC
,
1215 T_UConverter_fromUnicode_UTF32_LE
,
1216 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC
,
1220 NULL
, /* ### TODO implement getStarters for all Unicode encodings?! */
1224 ucnv_getNonSurrogateUnicodeSet
1227 /* The 1236 CCSID refers to any version of Unicode with a BOM sensitive endianess of UTF-32 */
1228 static const UConverterStaticData _UTF32StaticData
= {
1229 sizeof(UConverterStaticData
),
1232 UCNV_IBM
, UCNV_UTF32
, 4, 4,
1234 { 0, 0, 0xff, 0xfd }, 4,
1236 { 0xfd, 0xff, 0, 0 }, 4,
1241 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1244 const UConverterSharedData _UTF32Data
= {
1245 sizeof(UConverterSharedData
), ~((uint32_t) 0),
1246 NULL
, NULL
, &_UTF32StaticData
, FALSE
, &_UTF32Impl
,