2 **********************************************************************
3 * Copyright (C) 2002-2015, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv_u32.c
8 * tab size: 8 (not used)
11 * created on: 2002jul01
12 * created by: Markus W. Scherer
14 * UTF-32 converter implementation. Used to be in ucnv_utf.c.
17 #include "unicode/utypes.h"
19 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
21 #include "unicode/ucnv.h"
22 #include "unicode/utf.h"
27 #define MAXIMUM_UCS2 0x0000FFFF
28 #define MAXIMUM_UTF 0x0010FFFF
30 #define HALF_BASE 0x0010000
31 #define HALF_MASK 0x3FF
32 #define SURROGATE_HIGH_START 0xD800
33 #define SURROGATE_LOW_START 0xDC00
35 /* -SURROGATE_LOW_START + HALF_BASE */
36 #define SURROGATE_LOW_BASE 9216
39 UCNV_NEED_TO_WRITE_BOM
=1
42 /* UTF-32BE ----------------------------------------------------------------- */
45 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs
* args
,
48 const unsigned char *mySource
= (unsigned char *) args
->source
;
49 UChar
*myTarget
= args
->target
;
50 const unsigned char *sourceLimit
= (unsigned char *) args
->sourceLimit
;
51 const UChar
*targetLimit
= args
->targetLimit
;
52 unsigned char *toUBytes
= args
->converter
->toUBytes
;
55 /* Restore state of current sequence */
56 if (args
->converter
->toUnicodeStatus
&& myTarget
< targetLimit
) {
57 i
= args
->converter
->toULength
; /* restore # of bytes consumed */
58 args
->converter
->toULength
= 0;
60 ch
= args
->converter
->toUnicodeStatus
- 1;/*Stores the previously calculated ch from a previous call*/
61 args
->converter
->toUnicodeStatus
= 0;
65 while (mySource
< sourceLimit
&& myTarget
< targetLimit
) {
69 while (i
< sizeof(uint32_t)) {
70 if (mySource
< sourceLimit
) {
71 ch
= (ch
<< 8) | (uint8_t)(*mySource
);
72 toUBytes
[i
++] = (char) *(mySource
++);
75 /* stores a partially calculated target*/
76 /* + 1 to make 0 a valid character */
77 args
->converter
->toUnicodeStatus
= ch
+ 1;
78 args
->converter
->toULength
= (int8_t) i
;
83 if (ch
<= MAXIMUM_UTF
&& !U_IS_SURROGATE(ch
)) {
84 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
85 if (ch
<= MAXIMUM_UCS2
)
88 *(myTarget
++) = (UChar
) ch
;
91 /* write out the surrogates */
92 *(myTarget
++) = U16_LEAD(ch
);
94 if (myTarget
< targetLimit
) {
95 *(myTarget
++) = (UChar
)ch
;
98 /* Put in overflow buffer (not handled here) */
99 args
->converter
->UCharErrorBuffer
[0] = (UChar
) ch
;
100 args
->converter
->UCharErrorBufferLength
= 1;
101 *err
= U_BUFFER_OVERFLOW_ERROR
;
107 args
->converter
->toULength
= (int8_t)i
;
108 *err
= U_ILLEGAL_CHAR_FOUND
;
114 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
)) {
115 /* End of target buffer */
116 *err
= U_BUFFER_OVERFLOW_ERROR
;
119 args
->target
= myTarget
;
120 args
->source
= (const char *) mySource
;
124 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs
* args
,
127 const unsigned char *mySource
= (unsigned char *) args
->source
;
128 UChar
*myTarget
= args
->target
;
129 int32_t *myOffsets
= args
->offsets
;
130 const unsigned char *sourceLimit
= (unsigned char *) args
->sourceLimit
;
131 const UChar
*targetLimit
= args
->targetLimit
;
132 unsigned char *toUBytes
= args
->converter
->toUBytes
;
134 int32_t offsetNum
= 0;
136 /* Restore state of current sequence */
137 if (args
->converter
->toUnicodeStatus
&& myTarget
< targetLimit
) {
138 i
= args
->converter
->toULength
; /* restore # of bytes consumed */
139 args
->converter
->toULength
= 0;
141 ch
= args
->converter
->toUnicodeStatus
- 1;/*Stores the previously calculated ch from a previous call*/
142 args
->converter
->toUnicodeStatus
= 0;
146 while (mySource
< sourceLimit
&& myTarget
< targetLimit
) {
150 while (i
< sizeof(uint32_t)) {
151 if (mySource
< sourceLimit
) {
152 ch
= (ch
<< 8) | (uint8_t)(*mySource
);
153 toUBytes
[i
++] = (char) *(mySource
++);
156 /* stores a partially calculated target*/
157 /* + 1 to make 0 a valid character */
158 args
->converter
->toUnicodeStatus
= ch
+ 1;
159 args
->converter
->toULength
= (int8_t) i
;
164 if (ch
<= MAXIMUM_UTF
&& !U_IS_SURROGATE(ch
)) {
165 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
166 if (ch
<= MAXIMUM_UCS2
) {
167 /* fits in 16 bits */
168 *(myTarget
++) = (UChar
) ch
;
169 *(myOffsets
++) = offsetNum
;
172 /* write out the surrogates */
173 *(myTarget
++) = U16_LEAD(ch
);
174 *myOffsets
++ = offsetNum
;
176 if (myTarget
< targetLimit
)
178 *(myTarget
++) = (UChar
)ch
;
179 *(myOffsets
++) = offsetNum
;
182 /* Put in overflow buffer (not handled here) */
183 args
->converter
->UCharErrorBuffer
[0] = (UChar
) ch
;
184 args
->converter
->UCharErrorBufferLength
= 1;
185 *err
= U_BUFFER_OVERFLOW_ERROR
;
191 args
->converter
->toULength
= (int8_t)i
;
192 *err
= U_ILLEGAL_CHAR_FOUND
;
199 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
201 /* End of target buffer */
202 *err
= U_BUFFER_OVERFLOW_ERROR
;
205 args
->target
= myTarget
;
206 args
->source
= (const char *) mySource
;
207 args
->offsets
= myOffsets
;
211 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs
* args
,
214 const UChar
*mySource
= args
->source
;
215 unsigned char *myTarget
;
216 const UChar
*sourceLimit
= args
->sourceLimit
;
217 const unsigned char *targetLimit
= (unsigned char *) args
->targetLimit
;
219 unsigned int indexToWrite
;
220 unsigned char temp
[sizeof(uint32_t)];
222 if(mySource
>= sourceLimit
) {
223 /* no input, nothing to do */
227 /* write the BOM if necessary */
228 if(args
->converter
->fromUnicodeStatus
==UCNV_NEED_TO_WRITE_BOM
) {
229 static const char bom
[]={ 0, 0, (char)0xfe, (char)0xff };
230 ucnv_fromUWriteBytes(args
->converter
,
232 &args
->target
, args
->targetLimit
,
235 args
->converter
->fromUnicodeStatus
=0;
238 myTarget
= (unsigned char *) args
->target
;
241 if (args
->converter
->fromUChar32
) {
242 ch
= args
->converter
->fromUChar32
;
243 args
->converter
->fromUChar32
= 0;
247 while (mySource
< sourceLimit
&& myTarget
< targetLimit
) {
250 if (U_IS_SURROGATE(ch
)) {
253 if (mySource
< sourceLimit
) {
255 if (U_IS_TRAIL(ch2
)) {
256 ch
= ((ch
- SURROGATE_HIGH_START
) << HALF_SHIFT
) + ch2
+ SURROGATE_LOW_BASE
;
260 /* this is an unmatched trail code unit (2nd surrogate) */
261 /* callback(illegal) */
262 args
->converter
->fromUChar32
= ch
;
263 *err
= U_ILLEGAL_CHAR_FOUND
;
268 /* ran out of source */
269 args
->converter
->fromUChar32
= ch
;
271 /* this is an unmatched trail code unit (2nd surrogate) */
272 /* callback(illegal) */
273 *err
= U_ILLEGAL_CHAR_FOUND
;
279 /* this is an unmatched trail code unit (2nd surrogate) */
280 /* callback(illegal) */
281 args
->converter
->fromUChar32
= ch
;
282 *err
= U_ILLEGAL_CHAR_FOUND
;
287 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
288 temp
[1] = (uint8_t) (ch
>> 16 & 0x1F);
289 temp
[2] = (uint8_t) (ch
>> 8); /* unsigned cast implicitly does (ch & FF) */
290 temp
[3] = (uint8_t) (ch
); /* unsigned cast implicitly does (ch & FF) */
292 for (indexToWrite
= 0; indexToWrite
<= sizeof(uint32_t) - 1; indexToWrite
++) {
293 if (myTarget
< targetLimit
) {
294 *(myTarget
++) = temp
[indexToWrite
];
297 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = temp
[indexToWrite
];
298 *err
= U_BUFFER_OVERFLOW_ERROR
;
303 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
)) {
304 *err
= U_BUFFER_OVERFLOW_ERROR
;
307 args
->target
= (char *) myTarget
;
308 args
->source
= mySource
;
312 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs
* args
,
315 const UChar
*mySource
= args
->source
;
316 unsigned char *myTarget
;
318 const UChar
*sourceLimit
= args
->sourceLimit
;
319 const unsigned char *targetLimit
= (unsigned char *) args
->targetLimit
;
321 int32_t offsetNum
= 0;
322 unsigned int indexToWrite
;
323 unsigned char temp
[sizeof(uint32_t)];
325 if(mySource
>= sourceLimit
) {
326 /* no input, nothing to do */
330 /* write the BOM if necessary */
331 if(args
->converter
->fromUnicodeStatus
==UCNV_NEED_TO_WRITE_BOM
) {
332 static const char bom
[]={ 0, 0, (char)0xfe, (char)0xff };
333 ucnv_fromUWriteBytes(args
->converter
,
335 &args
->target
, args
->targetLimit
,
338 args
->converter
->fromUnicodeStatus
=0;
341 myTarget
= (unsigned char *) args
->target
;
342 myOffsets
= args
->offsets
;
345 if (args
->converter
->fromUChar32
) {
346 ch
= args
->converter
->fromUChar32
;
347 args
->converter
->fromUChar32
= 0;
351 while (mySource
< sourceLimit
&& myTarget
< targetLimit
) {
354 if (U_IS_SURROGATE(ch
)) {
357 if (mySource
< sourceLimit
) {
359 if (U_IS_TRAIL(ch2
)) {
360 ch
= ((ch
- SURROGATE_HIGH_START
) << HALF_SHIFT
) + ch2
+ SURROGATE_LOW_BASE
;
364 /* this is an unmatched trail code unit (2nd surrogate) */
365 /* callback(illegal) */
366 args
->converter
->fromUChar32
= ch
;
367 *err
= U_ILLEGAL_CHAR_FOUND
;
372 /* ran out of source */
373 args
->converter
->fromUChar32
= ch
;
375 /* this is an unmatched trail code unit (2nd surrogate) */
376 /* callback(illegal) */
377 *err
= U_ILLEGAL_CHAR_FOUND
;
383 /* this is an unmatched trail code unit (2nd surrogate) */
384 /* callback(illegal) */
385 args
->converter
->fromUChar32
= ch
;
386 *err
= U_ILLEGAL_CHAR_FOUND
;
391 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
392 temp
[1] = (uint8_t) (ch
>> 16 & 0x1F);
393 temp
[2] = (uint8_t) (ch
>> 8); /* unsigned cast implicitly does (ch & FF) */
394 temp
[3] = (uint8_t) (ch
); /* unsigned cast implicitly does (ch & FF) */
396 for (indexToWrite
= 0; indexToWrite
<= sizeof(uint32_t) - 1; indexToWrite
++) {
397 if (myTarget
< targetLimit
) {
398 *(myTarget
++) = temp
[indexToWrite
];
399 *(myOffsets
++) = offsetNum
;
402 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = temp
[indexToWrite
];
403 *err
= U_BUFFER_OVERFLOW_ERROR
;
406 offsetNum
= offsetNum
+ 1 + (temp
[1] != 0);
409 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
)) {
410 *err
= U_BUFFER_OVERFLOW_ERROR
;
413 args
->target
= (char *) myTarget
;
414 args
->source
= mySource
;
415 args
->offsets
= myOffsets
;
419 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs
* args
,
422 const uint8_t *mySource
;
426 mySource
= (const uint8_t *)args
->source
;
427 if (mySource
>= (const uint8_t *)args
->sourceLimit
)
430 *err
= U_INDEX_OUTOFBOUNDS_ERROR
;
434 length
= (int32_t)((const uint8_t *)args
->sourceLimit
- mySource
);
437 /* got a partial character */
438 uprv_memcpy(args
->converter
->toUBytes
, mySource
, length
);
439 args
->converter
->toULength
= (int8_t)length
;
440 args
->source
= (const char *)(mySource
+ length
);
441 *err
= U_TRUNCATED_CHAR_FOUND
;
445 /* Don't even try to do a direct cast because the value may be on an odd address. */
446 myUChar
= ((UChar32
)mySource
[0] << 24)
447 | ((UChar32
)mySource
[1] << 16)
448 | ((UChar32
)mySource
[2] << 8)
449 | ((UChar32
)mySource
[3]);
451 args
->source
= (const char *)(mySource
+ 4);
452 if ((uint32_t)myUChar
<= MAXIMUM_UTF
&& !U_IS_SURROGATE(myUChar
)) {
456 uprv_memcpy(args
->converter
->toUBytes
, mySource
, 4);
457 args
->converter
->toULength
= 4;
459 *err
= U_ILLEGAL_CHAR_FOUND
;
463 static const UConverterImpl _UTF32BEImpl
= {
464 UCNV_UTF32_BigEndian
,
473 T_UConverter_toUnicode_UTF32_BE
,
474 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC
,
475 T_UConverter_fromUnicode_UTF32_BE
,
476 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC
,
477 T_UConverter_getNextUChar_UTF32_BE
,
483 ucnv_getNonSurrogateUnicodeSet
486 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
487 static const UConverterStaticData _UTF32BEStaticData
= {
488 sizeof(UConverterStaticData
),
491 UCNV_IBM
, UCNV_UTF32_BigEndian
, 4, 4,
492 { 0, 0, 0xff, 0xfd }, 4, FALSE
, FALSE
,
495 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
498 const UConverterSharedData _UTF32BEData
=
499 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32BEStaticData
, &_UTF32BEImpl
);
501 /* UTF-32LE ---------------------------------------------------------- */
504 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs
* args
,
507 const unsigned char *mySource
= (unsigned char *) args
->source
;
508 UChar
*myTarget
= args
->target
;
509 const unsigned char *sourceLimit
= (unsigned char *) args
->sourceLimit
;
510 const UChar
*targetLimit
= args
->targetLimit
;
511 unsigned char *toUBytes
= args
->converter
->toUBytes
;
514 /* Restore state of current sequence */
515 if (args
->converter
->toUnicodeStatus
&& myTarget
< targetLimit
)
517 i
= args
->converter
->toULength
; /* restore # of bytes consumed */
518 args
->converter
->toULength
= 0;
520 /* Stores the previously calculated ch from a previous call*/
521 ch
= args
->converter
->toUnicodeStatus
- 1;
522 args
->converter
->toUnicodeStatus
= 0;
526 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
531 while (i
< sizeof(uint32_t))
533 if (mySource
< sourceLimit
)
535 ch
|= ((uint8_t)(*mySource
)) << (i
* 8);
536 toUBytes
[i
++] = (char) *(mySource
++);
540 /* stores a partially calculated target*/
541 /* + 1 to make 0 a valid character */
542 args
->converter
->toUnicodeStatus
= ch
+ 1;
543 args
->converter
->toULength
= (int8_t) i
;
548 if (ch
<= MAXIMUM_UTF
&& !U_IS_SURROGATE(ch
)) {
549 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
550 if (ch
<= MAXIMUM_UCS2
) {
551 /* fits in 16 bits */
552 *(myTarget
++) = (UChar
) ch
;
555 /* write out the surrogates */
556 *(myTarget
++) = U16_LEAD(ch
);
558 if (myTarget
< targetLimit
) {
559 *(myTarget
++) = (UChar
)ch
;
562 /* Put in overflow buffer (not handled here) */
563 args
->converter
->UCharErrorBuffer
[0] = (UChar
) ch
;
564 args
->converter
->UCharErrorBufferLength
= 1;
565 *err
= U_BUFFER_OVERFLOW_ERROR
;
571 args
->converter
->toULength
= (int8_t)i
;
572 *err
= U_ILLEGAL_CHAR_FOUND
;
578 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
580 /* End of target buffer */
581 *err
= U_BUFFER_OVERFLOW_ERROR
;
584 args
->target
= myTarget
;
585 args
->source
= (const char *) mySource
;
589 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs
* args
,
592 const unsigned char *mySource
= (unsigned char *) args
->source
;
593 UChar
*myTarget
= args
->target
;
594 int32_t *myOffsets
= args
->offsets
;
595 const unsigned char *sourceLimit
= (unsigned char *) args
->sourceLimit
;
596 const UChar
*targetLimit
= args
->targetLimit
;
597 unsigned char *toUBytes
= args
->converter
->toUBytes
;
599 int32_t offsetNum
= 0;
601 /* Restore state of current sequence */
602 if (args
->converter
->toUnicodeStatus
&& myTarget
< targetLimit
)
604 i
= args
->converter
->toULength
; /* restore # of bytes consumed */
605 args
->converter
->toULength
= 0;
607 /* Stores the previously calculated ch from a previous call*/
608 ch
= args
->converter
->toUnicodeStatus
- 1;
609 args
->converter
->toUnicodeStatus
= 0;
613 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
618 while (i
< sizeof(uint32_t))
620 if (mySource
< sourceLimit
)
622 ch
|= ((uint8_t)(*mySource
)) << (i
* 8);
623 toUBytes
[i
++] = (char) *(mySource
++);
627 /* stores a partially calculated target*/
628 /* + 1 to make 0 a valid character */
629 args
->converter
->toUnicodeStatus
= ch
+ 1;
630 args
->converter
->toULength
= (int8_t) i
;
635 if (ch
<= MAXIMUM_UTF
&& !U_IS_SURROGATE(ch
))
637 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
638 if (ch
<= MAXIMUM_UCS2
)
640 /* fits in 16 bits */
641 *(myTarget
++) = (UChar
) ch
;
642 *(myOffsets
++) = offsetNum
;
645 /* write out the surrogates */
646 *(myTarget
++) = U16_LEAD(ch
);
647 *(myOffsets
++) = offsetNum
;
649 if (myTarget
< targetLimit
)
651 *(myTarget
++) = (UChar
)ch
;
652 *(myOffsets
++) = offsetNum
;
656 /* Put in overflow buffer (not handled here) */
657 args
->converter
->UCharErrorBuffer
[0] = (UChar
) ch
;
658 args
->converter
->UCharErrorBufferLength
= 1;
659 *err
= U_BUFFER_OVERFLOW_ERROR
;
666 args
->converter
->toULength
= (int8_t)i
;
667 *err
= U_ILLEGAL_CHAR_FOUND
;
674 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
676 /* End of target buffer */
677 *err
= U_BUFFER_OVERFLOW_ERROR
;
680 args
->target
= myTarget
;
681 args
->source
= (const char *) mySource
;
682 args
->offsets
= myOffsets
;
686 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs
* args
,
689 const UChar
*mySource
= args
->source
;
690 unsigned char *myTarget
;
691 const UChar
*sourceLimit
= args
->sourceLimit
;
692 const unsigned char *targetLimit
= (unsigned char *) args
->targetLimit
;
694 unsigned int indexToWrite
;
695 unsigned char temp
[sizeof(uint32_t)];
697 if(mySource
>= sourceLimit
) {
698 /* no input, nothing to do */
702 /* write the BOM if necessary */
703 if(args
->converter
->fromUnicodeStatus
==UCNV_NEED_TO_WRITE_BOM
) {
704 static const char bom
[]={ (char)0xff, (char)0xfe, 0, 0 };
705 ucnv_fromUWriteBytes(args
->converter
,
707 &args
->target
, args
->targetLimit
,
710 args
->converter
->fromUnicodeStatus
=0;
713 myTarget
= (unsigned char *) args
->target
;
716 if (args
->converter
->fromUChar32
)
718 ch
= args
->converter
->fromUChar32
;
719 args
->converter
->fromUChar32
= 0;
723 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
727 if (U16_IS_SURROGATE(ch
)) {
731 if (mySource
< sourceLimit
)
734 if (U16_IS_TRAIL(ch2
)) {
735 ch
= ((ch
- SURROGATE_HIGH_START
) << HALF_SHIFT
) + ch2
+ SURROGATE_LOW_BASE
;
739 /* this is an unmatched trail code unit (2nd surrogate) */
740 /* callback(illegal) */
741 args
->converter
->fromUChar32
= ch
;
742 *err
= U_ILLEGAL_CHAR_FOUND
;
747 /* ran out of source */
748 args
->converter
->fromUChar32
= ch
;
750 /* this is an unmatched trail code unit (2nd surrogate) */
751 /* callback(illegal) */
752 *err
= U_ILLEGAL_CHAR_FOUND
;
758 /* this is an unmatched trail code unit (2nd surrogate) */
759 /* callback(illegal) */
760 args
->converter
->fromUChar32
= ch
;
761 *err
= U_ILLEGAL_CHAR_FOUND
;
766 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
767 temp
[2] = (uint8_t) (ch
>> 16 & 0x1F);
768 temp
[1] = (uint8_t) (ch
>> 8); /* unsigned cast implicitly does (ch & FF) */
769 temp
[0] = (uint8_t) (ch
); /* unsigned cast implicitly does (ch & FF) */
771 for (indexToWrite
= 0; indexToWrite
<= sizeof(uint32_t) - 1; indexToWrite
++)
773 if (myTarget
< targetLimit
)
775 *(myTarget
++) = temp
[indexToWrite
];
779 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = temp
[indexToWrite
];
780 *err
= U_BUFFER_OVERFLOW_ERROR
;
785 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
787 *err
= U_BUFFER_OVERFLOW_ERROR
;
790 args
->target
= (char *) myTarget
;
791 args
->source
= mySource
;
795 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs
* args
,
798 const UChar
*mySource
= args
->source
;
799 unsigned char *myTarget
;
801 const UChar
*sourceLimit
= args
->sourceLimit
;
802 const unsigned char *targetLimit
= (unsigned char *) args
->targetLimit
;
804 unsigned int indexToWrite
;
805 unsigned char temp
[sizeof(uint32_t)];
806 int32_t offsetNum
= 0;
808 if(mySource
>= sourceLimit
) {
809 /* no input, nothing to do */
813 /* write the BOM if necessary */
814 if(args
->converter
->fromUnicodeStatus
==UCNV_NEED_TO_WRITE_BOM
) {
815 static const char bom
[]={ (char)0xff, (char)0xfe, 0, 0 };
816 ucnv_fromUWriteBytes(args
->converter
,
818 &args
->target
, args
->targetLimit
,
821 args
->converter
->fromUnicodeStatus
=0;
824 myTarget
= (unsigned char *) args
->target
;
825 myOffsets
= args
->offsets
;
828 if (args
->converter
->fromUChar32
)
830 ch
= args
->converter
->fromUChar32
;
831 args
->converter
->fromUChar32
= 0;
835 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
839 if (U16_IS_SURROGATE(ch
)) {
843 if (mySource
< sourceLimit
)
846 if (U16_IS_TRAIL(ch2
))
848 ch
= ((ch
- SURROGATE_HIGH_START
) << HALF_SHIFT
) + ch2
+ SURROGATE_LOW_BASE
;
852 /* this is an unmatched trail code unit (2nd surrogate) */
853 /* callback(illegal) */
854 args
->converter
->fromUChar32
= ch
;
855 *err
= U_ILLEGAL_CHAR_FOUND
;
860 /* ran out of source */
861 args
->converter
->fromUChar32
= ch
;
863 /* this is an unmatched trail code unit (2nd surrogate) */
864 /* callback(illegal) */
865 *err
= U_ILLEGAL_CHAR_FOUND
;
871 /* this is an unmatched trail code unit (2nd surrogate) */
872 /* callback(illegal) */
873 args
->converter
->fromUChar32
= ch
;
874 *err
= U_ILLEGAL_CHAR_FOUND
;
879 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
880 temp
[2] = (uint8_t) (ch
>> 16 & 0x1F);
881 temp
[1] = (uint8_t) (ch
>> 8); /* unsigned cast implicitly does (ch & FF) */
882 temp
[0] = (uint8_t) (ch
); /* unsigned cast implicitly does (ch & FF) */
884 for (indexToWrite
= 0; indexToWrite
<= sizeof(uint32_t) - 1; indexToWrite
++)
886 if (myTarget
< targetLimit
)
888 *(myTarget
++) = temp
[indexToWrite
];
889 *(myOffsets
++) = offsetNum
;
893 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = temp
[indexToWrite
];
894 *err
= U_BUFFER_OVERFLOW_ERROR
;
897 offsetNum
= offsetNum
+ 1 + (temp
[2] != 0);
900 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
902 *err
= U_BUFFER_OVERFLOW_ERROR
;
905 args
->target
= (char *) myTarget
;
906 args
->source
= mySource
;
907 args
->offsets
= myOffsets
;
911 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs
* args
,
914 const uint8_t *mySource
;
918 mySource
= (const uint8_t *)args
->source
;
919 if (mySource
>= (const uint8_t *)args
->sourceLimit
)
922 *err
= U_INDEX_OUTOFBOUNDS_ERROR
;
926 length
= (int32_t)((const uint8_t *)args
->sourceLimit
- mySource
);
929 /* got a partial character */
930 uprv_memcpy(args
->converter
->toUBytes
, mySource
, length
);
931 args
->converter
->toULength
= (int8_t)length
;
932 args
->source
= (const char *)(mySource
+ length
);
933 *err
= U_TRUNCATED_CHAR_FOUND
;
937 /* Don't even try to do a direct cast because the value may be on an odd address. */
938 myUChar
= ((UChar32
)mySource
[3] << 24)
939 | ((UChar32
)mySource
[2] << 16)
940 | ((UChar32
)mySource
[1] << 8)
941 | ((UChar32
)mySource
[0]);
943 args
->source
= (const char *)(mySource
+ 4);
944 if ((uint32_t)myUChar
<= MAXIMUM_UTF
&& !U_IS_SURROGATE(myUChar
)) {
948 uprv_memcpy(args
->converter
->toUBytes
, mySource
, 4);
949 args
->converter
->toULength
= 4;
951 *err
= U_ILLEGAL_CHAR_FOUND
;
955 static const UConverterImpl _UTF32LEImpl
= {
956 UCNV_UTF32_LittleEndian
,
965 T_UConverter_toUnicode_UTF32_LE
,
966 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC
,
967 T_UConverter_fromUnicode_UTF32_LE
,
968 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC
,
969 T_UConverter_getNextUChar_UTF32_LE
,
975 ucnv_getNonSurrogateUnicodeSet
978 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
979 static const UConverterStaticData _UTF32LEStaticData
= {
980 sizeof(UConverterStaticData
),
983 UCNV_IBM
, UCNV_UTF32_LittleEndian
, 4, 4,
984 { 0xfd, 0xff, 0, 0 }, 4, FALSE
, FALSE
,
987 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
991 const UConverterSharedData _UTF32LEData
=
992 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32LEStaticData
, &_UTF32LEImpl
);
994 /* UTF-32 (Detect BOM) ------------------------------------------------------ */
997 * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
1012 * During detection: state&3==number of matching bytes so far.
1014 * On output, emit U+FEFF as the first code point.
1018 _UTF32Reset(UConverter
*cnv
, UConverterResetChoice choice
) {
1019 if(choice
<=UCNV_RESET_TO_UNICODE
) {
1020 /* reset toUnicode: state=0 */
1023 if(choice
!=UCNV_RESET_TO_UNICODE
) {
1024 /* reset fromUnicode: prepare to output the UTF-32PE BOM */
1025 cnv
->fromUnicodeStatus
=UCNV_NEED_TO_WRITE_BOM
;
1030 _UTF32Open(UConverter
*cnv
,
1031 UConverterLoadArgs
*pArgs
,
1032 UErrorCode
*pErrorCode
) {
1033 _UTF32Reset(cnv
, UCNV_RESET_BOTH
);
1036 static const char utf32BOM
[8]={ 0, 0, (char)0xfe, (char)0xff, (char)0xff, (char)0xfe, 0, 0 };
1039 _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
1040 UErrorCode
*pErrorCode
) {
1041 UConverter
*cnv
=pArgs
->converter
;
1042 const char *source
=pArgs
->source
;
1043 const char *sourceLimit
=pArgs
->sourceLimit
;
1044 int32_t *offsets
=pArgs
->offsets
;
1046 int32_t state
, offsetDelta
;
1052 * If we detect a BOM in this buffer, then we must add the BOM size to the
1053 * offsets because the actual converter function will not see and count the BOM.
1054 * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1058 while(source
<sourceLimit
&& U_SUCCESS(*pErrorCode
)) {
1063 state
=1; /* could be 00 00 FE FF */
1064 } else if(b
==(char)0xff) {
1065 state
=5; /* could be FF FE 00 00 */
1067 state
=8; /* default to UTF-32BE */
1078 if(*source
==utf32BOM
[state
]) {
1082 state
=8; /* detect UTF-32BE */
1083 offsetDelta
=(int32_t)(source
-pArgs
->source
);
1084 } else if(state
==8) {
1085 state
=9; /* detect UTF-32LE */
1086 offsetDelta
=(int32_t)(source
-pArgs
->source
);
1089 /* switch to UTF-32BE and pass the previous bytes */
1090 int32_t count
=(int32_t)(source
-pArgs
->source
); /* number of bytes from this buffer */
1092 /* reset the source */
1093 source
=pArgs
->source
;
1095 if(count
==(state
&3)) {
1096 /* simple: all in the same buffer, just reset source */
1098 UBool oldFlush
=pArgs
->flush
;
1100 /* some of the bytes are from a previous buffer, replay those first */
1101 pArgs
->source
=utf32BOM
+(state
&4); /* select the correct BOM */
1102 pArgs
->sourceLimit
=pArgs
->source
+((state
&3)-count
); /* replay previous bytes */
1103 pArgs
->flush
=FALSE
; /* this sourceLimit is not the real source stream limit */
1105 /* no offsets: bytes from previous buffer, and not enough for output */
1106 T_UConverter_toUnicode_UTF32_BE(pArgs
, pErrorCode
);
1108 /* restore real pointers; pArgs->source will be set in case 8/9 */
1109 pArgs
->sourceLimit
=sourceLimit
;
1110 pArgs
->flush
=oldFlush
;
1118 pArgs
->source
=source
;
1120 T_UConverter_toUnicode_UTF32_BE(pArgs
, pErrorCode
);
1122 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs
, pErrorCode
);
1124 source
=pArgs
->source
;
1128 pArgs
->source
=source
;
1130 T_UConverter_toUnicode_UTF32_LE(pArgs
, pErrorCode
);
1132 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs
, pErrorCode
);
1134 source
=pArgs
->source
;
1137 break; /* does not occur */
1141 /* add BOM size to offsets - see comment at offsetDelta declaration */
1142 if(offsets
!=NULL
&& offsetDelta
!=0) {
1143 int32_t *offsetsLimit
=pArgs
->offsets
;
1144 while(offsets
<offsetsLimit
) {
1145 *offsets
++ += offsetDelta
;
1149 pArgs
->source
=source
;
1151 if(source
==sourceLimit
&& pArgs
->flush
) {
1152 /* handle truncated input */
1155 break; /* no input at all, nothing to do */
1157 T_UConverter_toUnicode_UTF32_BE(pArgs
, pErrorCode
);
1160 T_UConverter_toUnicode_UTF32_LE(pArgs
, pErrorCode
);
1163 /* handle 0<state<8: call UTF-32BE with too-short input */
1164 pArgs
->source
=utf32BOM
+(state
&4); /* select the correct BOM */
1165 pArgs
->sourceLimit
=pArgs
->source
+(state
&3); /* replay bytes */
1167 /* no offsets: not enough for output */
1168 T_UConverter_toUnicode_UTF32_BE(pArgs
, pErrorCode
);
1169 pArgs
->source
=source
;
1170 pArgs
->sourceLimit
=sourceLimit
;
1180 _UTF32GetNextUChar(UConverterToUnicodeArgs
*pArgs
,
1181 UErrorCode
*pErrorCode
) {
1182 switch(pArgs
->converter
->mode
) {
1184 return T_UConverter_getNextUChar_UTF32_BE(pArgs
, pErrorCode
);
1186 return T_UConverter_getNextUChar_UTF32_LE(pArgs
, pErrorCode
);
1188 return UCNV_GET_NEXT_UCHAR_USE_TO_U
;
1192 static const UConverterImpl _UTF32Impl
= {
1202 _UTF32ToUnicodeWithOffsets
,
1203 _UTF32ToUnicodeWithOffsets
,
1205 T_UConverter_fromUnicode_UTF32_BE
,
1206 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC
,
1208 T_UConverter_fromUnicode_UTF32_LE
,
1209 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC
,
1213 NULL
, /* ### TODO implement getStarters for all Unicode encodings?! */
1217 ucnv_getNonSurrogateUnicodeSet
1220 /* The 1236 CCSID refers to any version of Unicode with a BOM sensitive endianess of UTF-32 */
1221 static const UConverterStaticData _UTF32StaticData
= {
1222 sizeof(UConverterStaticData
),
1225 UCNV_IBM
, UCNV_UTF32
, 4, 4,
1227 { 0, 0, 0xff, 0xfd }, 4,
1229 { 0xfd, 0xff, 0, 0 }, 4,
1234 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1237 const UConverterSharedData _UTF32Data
=
1238 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32StaticData
, &_UTF32Impl
);