/*
**********************************************************************
*   Copyright (C) 2002-2011, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   file name:  ucnv_u32.c
*   tab size:   8 (not used)
*
*   created on: 2002jul01
*   created by: Markus W. Scherer
*
*   UTF-32 converter implementation. Used to be in ucnv_utf.c.
*/
17 #include "unicode/utypes.h"
19 #if !UCONFIG_NO_CONVERSION
21 #include "unicode/ucnv.h"
22 #include "unicode/utf.h"
/* Largest code point that fits in a single UTF-16 code unit. */
#define MAXIMUM_UCS2            0x0000FFFF
/* Largest valid Unicode code point. */
#define MAXIMUM_UTF             0x0010FFFF
/* Number of payload bits carried by each surrogate (used when combining a pair). */
#define HALF_SHIFT              10
#define HALF_BASE               0x0010000
#define HALF_MASK               0x3FF
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_LOW_START     0xDC00

/* -SURROGATE_LOW_START + HALF_BASE */
#define SURROGATE_LOW_BASE      9216

/* Value of fromUnicodeStatus that means "the BOM has not been written yet". */
enum {
    UCNV_NEED_TO_WRITE_BOM=1
};
42 /* UTF-32BE ----------------------------------------------------------------- */
45 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs
* args
,
48 const unsigned char *mySource
= (unsigned char *) args
->source
;
49 UChar
*myTarget
= args
->target
;
50 const unsigned char *sourceLimit
= (unsigned char *) args
->sourceLimit
;
51 const UChar
*targetLimit
= args
->targetLimit
;
52 unsigned char *toUBytes
= args
->converter
->toUBytes
;
55 /* Restore state of current sequence */
56 if (args
->converter
->toUnicodeStatus
&& myTarget
< targetLimit
) {
57 i
= args
->converter
->toULength
; /* restore # of bytes consumed */
58 args
->converter
->toULength
= 0;
60 ch
= args
->converter
->toUnicodeStatus
- 1;/*Stores the previously calculated ch from a previous call*/
61 args
->converter
->toUnicodeStatus
= 0;
65 while (mySource
< sourceLimit
&& myTarget
< targetLimit
) {
69 while (i
< sizeof(uint32_t)) {
70 if (mySource
< sourceLimit
) {
71 ch
= (ch
<< 8) | (uint8_t)(*mySource
);
72 toUBytes
[i
++] = (char) *(mySource
++);
75 /* stores a partially calculated target*/
76 /* + 1 to make 0 a valid character */
77 args
->converter
->toUnicodeStatus
= ch
+ 1;
78 args
->converter
->toULength
= (int8_t) i
;
83 if (ch
<= MAXIMUM_UTF
&& !U_IS_SURROGATE(ch
)) {
84 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
85 if (ch
<= MAXIMUM_UCS2
)
88 *(myTarget
++) = (UChar
) ch
;
91 /* write out the surrogates */
92 *(myTarget
++) = U16_LEAD(ch
);
94 if (myTarget
< targetLimit
) {
95 *(myTarget
++) = (UChar
)ch
;
98 /* Put in overflow buffer (not handled here) */
99 args
->converter
->UCharErrorBuffer
[0] = (UChar
) ch
;
100 args
->converter
->UCharErrorBufferLength
= 1;
101 *err
= U_BUFFER_OVERFLOW_ERROR
;
107 args
->converter
->toULength
= (int8_t)i
;
108 *err
= U_ILLEGAL_CHAR_FOUND
;
114 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
)) {
115 /* End of target buffer */
116 *err
= U_BUFFER_OVERFLOW_ERROR
;
119 args
->target
= myTarget
;
120 args
->source
= (const char *) mySource
;
124 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs
* args
,
127 const unsigned char *mySource
= (unsigned char *) args
->source
;
128 UChar
*myTarget
= args
->target
;
129 int32_t *myOffsets
= args
->offsets
;
130 const unsigned char *sourceLimit
= (unsigned char *) args
->sourceLimit
;
131 const UChar
*targetLimit
= args
->targetLimit
;
132 unsigned char *toUBytes
= args
->converter
->toUBytes
;
134 int32_t offsetNum
= 0;
136 /* Restore state of current sequence */
137 if (args
->converter
->toUnicodeStatus
&& myTarget
< targetLimit
) {
138 i
= args
->converter
->toULength
; /* restore # of bytes consumed */
139 args
->converter
->toULength
= 0;
141 ch
= args
->converter
->toUnicodeStatus
- 1;/*Stores the previously calculated ch from a previous call*/
142 args
->converter
->toUnicodeStatus
= 0;
146 while (mySource
< sourceLimit
&& myTarget
< targetLimit
) {
150 while (i
< sizeof(uint32_t)) {
151 if (mySource
< sourceLimit
) {
152 ch
= (ch
<< 8) | (uint8_t)(*mySource
);
153 toUBytes
[i
++] = (char) *(mySource
++);
156 /* stores a partially calculated target*/
157 /* + 1 to make 0 a valid character */
158 args
->converter
->toUnicodeStatus
= ch
+ 1;
159 args
->converter
->toULength
= (int8_t) i
;
164 if (ch
<= MAXIMUM_UTF
&& !U_IS_SURROGATE(ch
)) {
165 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
166 if (ch
<= MAXIMUM_UCS2
) {
167 /* fits in 16 bits */
168 *(myTarget
++) = (UChar
) ch
;
169 *(myOffsets
++) = offsetNum
;
172 /* write out the surrogates */
173 *(myTarget
++) = U16_LEAD(ch
);
174 *myOffsets
++ = offsetNum
;
176 if (myTarget
< targetLimit
)
178 *(myTarget
++) = (UChar
)ch
;
179 *(myOffsets
++) = offsetNum
;
182 /* Put in overflow buffer (not handled here) */
183 args
->converter
->UCharErrorBuffer
[0] = (UChar
) ch
;
184 args
->converter
->UCharErrorBufferLength
= 1;
185 *err
= U_BUFFER_OVERFLOW_ERROR
;
191 args
->converter
->toULength
= (int8_t)i
;
192 *err
= U_ILLEGAL_CHAR_FOUND
;
199 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
201 /* End of target buffer */
202 *err
= U_BUFFER_OVERFLOW_ERROR
;
205 args
->target
= myTarget
;
206 args
->source
= (const char *) mySource
;
207 args
->offsets
= myOffsets
;
211 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs
* args
,
214 const UChar
*mySource
= args
->source
;
215 unsigned char *myTarget
;
216 const UChar
*sourceLimit
= args
->sourceLimit
;
217 const unsigned char *targetLimit
= (unsigned char *) args
->targetLimit
;
219 unsigned int indexToWrite
;
220 unsigned char temp
[sizeof(uint32_t)];
222 if(mySource
>= sourceLimit
) {
223 /* no input, nothing to do */
227 /* write the BOM if necessary */
228 if(args
->converter
->fromUnicodeStatus
==UCNV_NEED_TO_WRITE_BOM
) {
229 static const char bom
[]={ 0, 0, (char)0xfe, (char)0xff };
230 ucnv_fromUWriteBytes(args
->converter
,
232 &args
->target
, args
->targetLimit
,
235 args
->converter
->fromUnicodeStatus
=0;
238 myTarget
= (unsigned char *) args
->target
;
241 if (args
->converter
->fromUChar32
) {
242 ch
= args
->converter
->fromUChar32
;
243 args
->converter
->fromUChar32
= 0;
247 while (mySource
< sourceLimit
&& myTarget
< targetLimit
) {
250 if (U_IS_SURROGATE(ch
)) {
253 if (mySource
< sourceLimit
) {
255 if (U_IS_TRAIL(ch2
)) {
256 ch
= ((ch
- SURROGATE_HIGH_START
) << HALF_SHIFT
) + ch2
+ SURROGATE_LOW_BASE
;
260 /* this is an unmatched trail code unit (2nd surrogate) */
261 /* callback(illegal) */
262 args
->converter
->fromUChar32
= ch
;
263 *err
= U_ILLEGAL_CHAR_FOUND
;
268 /* ran out of source */
269 args
->converter
->fromUChar32
= ch
;
271 /* this is an unmatched trail code unit (2nd surrogate) */
272 /* callback(illegal) */
273 *err
= U_ILLEGAL_CHAR_FOUND
;
279 /* this is an unmatched trail code unit (2nd surrogate) */
280 /* callback(illegal) */
281 args
->converter
->fromUChar32
= ch
;
282 *err
= U_ILLEGAL_CHAR_FOUND
;
287 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
288 temp
[1] = (uint8_t) (ch
>> 16 & 0x1F);
289 temp
[2] = (uint8_t) (ch
>> 8); /* unsigned cast implicitly does (ch & FF) */
290 temp
[3] = (uint8_t) (ch
); /* unsigned cast implicitly does (ch & FF) */
292 for (indexToWrite
= 0; indexToWrite
<= sizeof(uint32_t) - 1; indexToWrite
++) {
293 if (myTarget
< targetLimit
) {
294 *(myTarget
++) = temp
[indexToWrite
];
297 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = temp
[indexToWrite
];
298 *err
= U_BUFFER_OVERFLOW_ERROR
;
303 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
)) {
304 *err
= U_BUFFER_OVERFLOW_ERROR
;
307 args
->target
= (char *) myTarget
;
308 args
->source
= mySource
;
312 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs
* args
,
315 const UChar
*mySource
= args
->source
;
316 unsigned char *myTarget
;
318 const UChar
*sourceLimit
= args
->sourceLimit
;
319 const unsigned char *targetLimit
= (unsigned char *) args
->targetLimit
;
321 int32_t offsetNum
= 0;
322 unsigned int indexToWrite
;
323 unsigned char temp
[sizeof(uint32_t)];
325 if(mySource
>= sourceLimit
) {
326 /* no input, nothing to do */
330 /* write the BOM if necessary */
331 if(args
->converter
->fromUnicodeStatus
==UCNV_NEED_TO_WRITE_BOM
) {
332 static const char bom
[]={ 0, 0, (char)0xfe, (char)0xff };
333 ucnv_fromUWriteBytes(args
->converter
,
335 &args
->target
, args
->targetLimit
,
338 args
->converter
->fromUnicodeStatus
=0;
341 myTarget
= (unsigned char *) args
->target
;
342 myOffsets
= args
->offsets
;
345 if (args
->converter
->fromUChar32
) {
346 ch
= args
->converter
->fromUChar32
;
347 args
->converter
->fromUChar32
= 0;
351 while (mySource
< sourceLimit
&& myTarget
< targetLimit
) {
354 if (U_IS_SURROGATE(ch
)) {
357 if (mySource
< sourceLimit
) {
359 if (U_IS_TRAIL(ch2
)) {
360 ch
= ((ch
- SURROGATE_HIGH_START
) << HALF_SHIFT
) + ch2
+ SURROGATE_LOW_BASE
;
364 /* this is an unmatched trail code unit (2nd surrogate) */
365 /* callback(illegal) */
366 args
->converter
->fromUChar32
= ch
;
367 *err
= U_ILLEGAL_CHAR_FOUND
;
372 /* ran out of source */
373 args
->converter
->fromUChar32
= ch
;
375 /* this is an unmatched trail code unit (2nd surrogate) */
376 /* callback(illegal) */
377 *err
= U_ILLEGAL_CHAR_FOUND
;
383 /* this is an unmatched trail code unit (2nd surrogate) */
384 /* callback(illegal) */
385 args
->converter
->fromUChar32
= ch
;
386 *err
= U_ILLEGAL_CHAR_FOUND
;
391 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
392 temp
[1] = (uint8_t) (ch
>> 16 & 0x1F);
393 temp
[2] = (uint8_t) (ch
>> 8); /* unsigned cast implicitly does (ch & FF) */
394 temp
[3] = (uint8_t) (ch
); /* unsigned cast implicitly does (ch & FF) */
396 for (indexToWrite
= 0; indexToWrite
<= sizeof(uint32_t) - 1; indexToWrite
++) {
397 if (myTarget
< targetLimit
) {
398 *(myTarget
++) = temp
[indexToWrite
];
399 *(myOffsets
++) = offsetNum
;
402 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = temp
[indexToWrite
];
403 *err
= U_BUFFER_OVERFLOW_ERROR
;
406 offsetNum
= offsetNum
+ 1 + (temp
[1] != 0);
409 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
)) {
410 *err
= U_BUFFER_OVERFLOW_ERROR
;
413 args
->target
= (char *) myTarget
;
414 args
->source
= mySource
;
415 args
->offsets
= myOffsets
;
419 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs
* args
,
422 const uint8_t *mySource
;
426 mySource
= (const uint8_t *)args
->source
;
427 if (mySource
>= (const uint8_t *)args
->sourceLimit
)
430 *err
= U_INDEX_OUTOFBOUNDS_ERROR
;
434 length
= (int32_t)((const uint8_t *)args
->sourceLimit
- mySource
);
437 /* got a partial character */
438 uprv_memcpy(args
->converter
->toUBytes
, mySource
, length
);
439 args
->converter
->toULength
= (int8_t)length
;
440 args
->source
= (const char *)(mySource
+ length
);
441 *err
= U_TRUNCATED_CHAR_FOUND
;
445 /* Don't even try to do a direct cast because the value may be on an odd address. */
446 myUChar
= ((UChar32
)mySource
[0] << 24)
447 | ((UChar32
)mySource
[1] << 16)
448 | ((UChar32
)mySource
[2] << 8)
449 | ((UChar32
)mySource
[3]);
451 args
->source
= (const char *)(mySource
+ 4);
452 if ((uint32_t)myUChar
<= MAXIMUM_UTF
&& !U_IS_SURROGATE(myUChar
)) {
456 uprv_memcpy(args
->converter
->toUBytes
, mySource
, 4);
457 args
->converter
->toULength
= 4;
459 *err
= U_ILLEGAL_CHAR_FOUND
;
463 static const UConverterImpl _UTF32BEImpl
= {
464 UCNV_UTF32_BigEndian
,
473 T_UConverter_toUnicode_UTF32_BE
,
474 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC
,
475 T_UConverter_fromUnicode_UTF32_BE
,
476 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC
,
477 T_UConverter_getNextUChar_UTF32_BE
,
483 ucnv_getNonSurrogateUnicodeSet
486 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
487 static const UConverterStaticData _UTF32BEStaticData
= {
488 sizeof(UConverterStaticData
),
491 UCNV_IBM
, UCNV_UTF32_BigEndian
, 4, 4,
492 { 0, 0, 0xff, 0xfd }, 4, FALSE
, FALSE
,
495 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
498 const UConverterSharedData _UTF32BEData
= {
499 sizeof(UConverterSharedData
), ~((uint32_t) 0),
500 NULL
, NULL
, &_UTF32BEStaticData
, FALSE
, &_UTF32BEImpl
,
504 /* UTF-32LE ---------------------------------------------------------- */
507 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs
* args
,
510 const unsigned char *mySource
= (unsigned char *) args
->source
;
511 UChar
*myTarget
= args
->target
;
512 const unsigned char *sourceLimit
= (unsigned char *) args
->sourceLimit
;
513 const UChar
*targetLimit
= args
->targetLimit
;
514 unsigned char *toUBytes
= args
->converter
->toUBytes
;
517 /* Restore state of current sequence */
518 if (args
->converter
->toUnicodeStatus
&& myTarget
< targetLimit
)
520 i
= args
->converter
->toULength
; /* restore # of bytes consumed */
521 args
->converter
->toULength
= 0;
523 /* Stores the previously calculated ch from a previous call*/
524 ch
= args
->converter
->toUnicodeStatus
- 1;
525 args
->converter
->toUnicodeStatus
= 0;
529 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
534 while (i
< sizeof(uint32_t))
536 if (mySource
< sourceLimit
)
538 ch
|= ((uint8_t)(*mySource
)) << (i
* 8);
539 toUBytes
[i
++] = (char) *(mySource
++);
543 /* stores a partially calculated target*/
544 /* + 1 to make 0 a valid character */
545 args
->converter
->toUnicodeStatus
= ch
+ 1;
546 args
->converter
->toULength
= (int8_t) i
;
551 if (ch
<= MAXIMUM_UTF
&& !U_IS_SURROGATE(ch
)) {
552 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
553 if (ch
<= MAXIMUM_UCS2
) {
554 /* fits in 16 bits */
555 *(myTarget
++) = (UChar
) ch
;
558 /* write out the surrogates */
559 *(myTarget
++) = U16_LEAD(ch
);
561 if (myTarget
< targetLimit
) {
562 *(myTarget
++) = (UChar
)ch
;
565 /* Put in overflow buffer (not handled here) */
566 args
->converter
->UCharErrorBuffer
[0] = (UChar
) ch
;
567 args
->converter
->UCharErrorBufferLength
= 1;
568 *err
= U_BUFFER_OVERFLOW_ERROR
;
574 args
->converter
->toULength
= (int8_t)i
;
575 *err
= U_ILLEGAL_CHAR_FOUND
;
581 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
583 /* End of target buffer */
584 *err
= U_BUFFER_OVERFLOW_ERROR
;
587 args
->target
= myTarget
;
588 args
->source
= (const char *) mySource
;
592 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs
* args
,
595 const unsigned char *mySource
= (unsigned char *) args
->source
;
596 UChar
*myTarget
= args
->target
;
597 int32_t *myOffsets
= args
->offsets
;
598 const unsigned char *sourceLimit
= (unsigned char *) args
->sourceLimit
;
599 const UChar
*targetLimit
= args
->targetLimit
;
600 unsigned char *toUBytes
= args
->converter
->toUBytes
;
602 int32_t offsetNum
= 0;
604 /* Restore state of current sequence */
605 if (args
->converter
->toUnicodeStatus
&& myTarget
< targetLimit
)
607 i
= args
->converter
->toULength
; /* restore # of bytes consumed */
608 args
->converter
->toULength
= 0;
610 /* Stores the previously calculated ch from a previous call*/
611 ch
= args
->converter
->toUnicodeStatus
- 1;
612 args
->converter
->toUnicodeStatus
= 0;
616 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
621 while (i
< sizeof(uint32_t))
623 if (mySource
< sourceLimit
)
625 ch
|= ((uint8_t)(*mySource
)) << (i
* 8);
626 toUBytes
[i
++] = (char) *(mySource
++);
630 /* stores a partially calculated target*/
631 /* + 1 to make 0 a valid character */
632 args
->converter
->toUnicodeStatus
= ch
+ 1;
633 args
->converter
->toULength
= (int8_t) i
;
638 if (ch
<= MAXIMUM_UTF
&& !U_IS_SURROGATE(ch
))
640 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
641 if (ch
<= MAXIMUM_UCS2
)
643 /* fits in 16 bits */
644 *(myTarget
++) = (UChar
) ch
;
645 *(myOffsets
++) = offsetNum
;
648 /* write out the surrogates */
649 *(myTarget
++) = U16_LEAD(ch
);
650 *(myOffsets
++) = offsetNum
;
652 if (myTarget
< targetLimit
)
654 *(myTarget
++) = (UChar
)ch
;
655 *(myOffsets
++) = offsetNum
;
659 /* Put in overflow buffer (not handled here) */
660 args
->converter
->UCharErrorBuffer
[0] = (UChar
) ch
;
661 args
->converter
->UCharErrorBufferLength
= 1;
662 *err
= U_BUFFER_OVERFLOW_ERROR
;
669 args
->converter
->toULength
= (int8_t)i
;
670 *err
= U_ILLEGAL_CHAR_FOUND
;
677 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
679 /* End of target buffer */
680 *err
= U_BUFFER_OVERFLOW_ERROR
;
683 args
->target
= myTarget
;
684 args
->source
= (const char *) mySource
;
685 args
->offsets
= myOffsets
;
689 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs
* args
,
692 const UChar
*mySource
= args
->source
;
693 unsigned char *myTarget
;
694 const UChar
*sourceLimit
= args
->sourceLimit
;
695 const unsigned char *targetLimit
= (unsigned char *) args
->targetLimit
;
697 unsigned int indexToWrite
;
698 unsigned char temp
[sizeof(uint32_t)];
700 if(mySource
>= sourceLimit
) {
701 /* no input, nothing to do */
705 /* write the BOM if necessary */
706 if(args
->converter
->fromUnicodeStatus
==UCNV_NEED_TO_WRITE_BOM
) {
707 static const char bom
[]={ (char)0xff, (char)0xfe, 0, 0 };
708 ucnv_fromUWriteBytes(args
->converter
,
710 &args
->target
, args
->targetLimit
,
713 args
->converter
->fromUnicodeStatus
=0;
716 myTarget
= (unsigned char *) args
->target
;
719 if (args
->converter
->fromUChar32
)
721 ch
= args
->converter
->fromUChar32
;
722 args
->converter
->fromUChar32
= 0;
726 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
730 if (U16_IS_SURROGATE(ch
)) {
734 if (mySource
< sourceLimit
)
737 if (U16_IS_TRAIL(ch2
)) {
738 ch
= ((ch
- SURROGATE_HIGH_START
) << HALF_SHIFT
) + ch2
+ SURROGATE_LOW_BASE
;
742 /* this is an unmatched trail code unit (2nd surrogate) */
743 /* callback(illegal) */
744 args
->converter
->fromUChar32
= ch
;
745 *err
= U_ILLEGAL_CHAR_FOUND
;
750 /* ran out of source */
751 args
->converter
->fromUChar32
= ch
;
753 /* this is an unmatched trail code unit (2nd surrogate) */
754 /* callback(illegal) */
755 *err
= U_ILLEGAL_CHAR_FOUND
;
761 /* this is an unmatched trail code unit (2nd surrogate) */
762 /* callback(illegal) */
763 args
->converter
->fromUChar32
= ch
;
764 *err
= U_ILLEGAL_CHAR_FOUND
;
769 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
770 temp
[2] = (uint8_t) (ch
>> 16 & 0x1F);
771 temp
[1] = (uint8_t) (ch
>> 8); /* unsigned cast implicitly does (ch & FF) */
772 temp
[0] = (uint8_t) (ch
); /* unsigned cast implicitly does (ch & FF) */
774 for (indexToWrite
= 0; indexToWrite
<= sizeof(uint32_t) - 1; indexToWrite
++)
776 if (myTarget
< targetLimit
)
778 *(myTarget
++) = temp
[indexToWrite
];
782 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = temp
[indexToWrite
];
783 *err
= U_BUFFER_OVERFLOW_ERROR
;
788 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
790 *err
= U_BUFFER_OVERFLOW_ERROR
;
793 args
->target
= (char *) myTarget
;
794 args
->source
= mySource
;
798 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs
* args
,
801 const UChar
*mySource
= args
->source
;
802 unsigned char *myTarget
;
804 const UChar
*sourceLimit
= args
->sourceLimit
;
805 const unsigned char *targetLimit
= (unsigned char *) args
->targetLimit
;
807 unsigned int indexToWrite
;
808 unsigned char temp
[sizeof(uint32_t)];
809 int32_t offsetNum
= 0;
811 if(mySource
>= sourceLimit
) {
812 /* no input, nothing to do */
816 /* write the BOM if necessary */
817 if(args
->converter
->fromUnicodeStatus
==UCNV_NEED_TO_WRITE_BOM
) {
818 static const char bom
[]={ (char)0xff, (char)0xfe, 0, 0 };
819 ucnv_fromUWriteBytes(args
->converter
,
821 &args
->target
, args
->targetLimit
,
824 args
->converter
->fromUnicodeStatus
=0;
827 myTarget
= (unsigned char *) args
->target
;
828 myOffsets
= args
->offsets
;
831 if (args
->converter
->fromUChar32
)
833 ch
= args
->converter
->fromUChar32
;
834 args
->converter
->fromUChar32
= 0;
838 while (mySource
< sourceLimit
&& myTarget
< targetLimit
)
842 if (U16_IS_SURROGATE(ch
)) {
846 if (mySource
< sourceLimit
)
849 if (U16_IS_TRAIL(ch2
))
851 ch
= ((ch
- SURROGATE_HIGH_START
) << HALF_SHIFT
) + ch2
+ SURROGATE_LOW_BASE
;
855 /* this is an unmatched trail code unit (2nd surrogate) */
856 /* callback(illegal) */
857 args
->converter
->fromUChar32
= ch
;
858 *err
= U_ILLEGAL_CHAR_FOUND
;
863 /* ran out of source */
864 args
->converter
->fromUChar32
= ch
;
866 /* this is an unmatched trail code unit (2nd surrogate) */
867 /* callback(illegal) */
868 *err
= U_ILLEGAL_CHAR_FOUND
;
874 /* this is an unmatched trail code unit (2nd surrogate) */
875 /* callback(illegal) */
876 args
->converter
->fromUChar32
= ch
;
877 *err
= U_ILLEGAL_CHAR_FOUND
;
882 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
883 temp
[2] = (uint8_t) (ch
>> 16 & 0x1F);
884 temp
[1] = (uint8_t) (ch
>> 8); /* unsigned cast implicitly does (ch & FF) */
885 temp
[0] = (uint8_t) (ch
); /* unsigned cast implicitly does (ch & FF) */
887 for (indexToWrite
= 0; indexToWrite
<= sizeof(uint32_t) - 1; indexToWrite
++)
889 if (myTarget
< targetLimit
)
891 *(myTarget
++) = temp
[indexToWrite
];
892 *(myOffsets
++) = offsetNum
;
896 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = temp
[indexToWrite
];
897 *err
= U_BUFFER_OVERFLOW_ERROR
;
900 offsetNum
= offsetNum
+ 1 + (temp
[2] != 0);
903 if (mySource
< sourceLimit
&& myTarget
>= targetLimit
&& U_SUCCESS(*err
))
905 *err
= U_BUFFER_OVERFLOW_ERROR
;
908 args
->target
= (char *) myTarget
;
909 args
->source
= mySource
;
910 args
->offsets
= myOffsets
;
914 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs
* args
,
917 const uint8_t *mySource
;
921 mySource
= (const uint8_t *)args
->source
;
922 if (mySource
>= (const uint8_t *)args
->sourceLimit
)
925 *err
= U_INDEX_OUTOFBOUNDS_ERROR
;
929 length
= (int32_t)((const uint8_t *)args
->sourceLimit
- mySource
);
932 /* got a partial character */
933 uprv_memcpy(args
->converter
->toUBytes
, mySource
, length
);
934 args
->converter
->toULength
= (int8_t)length
;
935 args
->source
= (const char *)(mySource
+ length
);
936 *err
= U_TRUNCATED_CHAR_FOUND
;
940 /* Don't even try to do a direct cast because the value may be on an odd address. */
941 myUChar
= ((UChar32
)mySource
[3] << 24)
942 | ((UChar32
)mySource
[2] << 16)
943 | ((UChar32
)mySource
[1] << 8)
944 | ((UChar32
)mySource
[0]);
946 args
->source
= (const char *)(mySource
+ 4);
947 if ((uint32_t)myUChar
<= MAXIMUM_UTF
&& !U_IS_SURROGATE(myUChar
)) {
951 uprv_memcpy(args
->converter
->toUBytes
, mySource
, 4);
952 args
->converter
->toULength
= 4;
954 *err
= U_ILLEGAL_CHAR_FOUND
;
958 static const UConverterImpl _UTF32LEImpl
= {
959 UCNV_UTF32_LittleEndian
,
968 T_UConverter_toUnicode_UTF32_LE
,
969 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC
,
970 T_UConverter_fromUnicode_UTF32_LE
,
971 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC
,
972 T_UConverter_getNextUChar_UTF32_LE
,
978 ucnv_getNonSurrogateUnicodeSet
981 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
982 static const UConverterStaticData _UTF32LEStaticData
= {
983 sizeof(UConverterStaticData
),
986 UCNV_IBM
, UCNV_UTF32_LittleEndian
, 4, 4,
987 { 0xfd, 0xff, 0, 0 }, 4, FALSE
, FALSE
,
990 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
994 const UConverterSharedData _UTF32LEData
= {
995 sizeof(UConverterSharedData
), ~((uint32_t) 0),
996 NULL
, NULL
, &_UTF32LEStaticData
, FALSE
, &_UTF32LEImpl
,
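/* UTF-32 (Detect BOM) ------------------------------------------------------ */

/*
 * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
 * accordingly.
 *
 * State values:
 * 0    initial state
 * 1    saw 00
 * 2    saw 00 00
 * 3    saw 00 00 FE
 * 4    -
 * 5    saw FF
 * 6    saw FF FE
 * 7    saw FF FE 00
 * 8    UTF-32BE mode
 * 9    UTF-32LE mode
 *
 * During detection: state&3==number of matching bytes so far.
 *
 * On output, emit U+FEFF as the first code point.
 */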
1024 _UTF32Reset(UConverter
*cnv
, UConverterResetChoice choice
) {
1025 if(choice
<=UCNV_RESET_TO_UNICODE
) {
1026 /* reset toUnicode: state=0 */
1029 if(choice
!=UCNV_RESET_TO_UNICODE
) {
1030 /* reset fromUnicode: prepare to output the UTF-32PE BOM */
1031 cnv
->fromUnicodeStatus
=UCNV_NEED_TO_WRITE_BOM
;
1036 _UTF32Open(UConverter
*cnv
,
1037 UConverterLoadArgs
*pArgs
,
1038 UErrorCode
*pErrorCode
) {
1039 _UTF32Reset(cnv
, UCNV_RESET_BOTH
);
/*
 * Both UTF-32 byte order marks back to back: bytes 0..3 are the big-endian
 * BOM (00 00 FE FF), bytes 4..7 the little-endian BOM (FF FE 00 00).
 */
static const char utf32BOM[8]={ 0, 0, (char)0xfe, (char)0xff,    (char)0xff, (char)0xfe, 0, 0 };
1045 _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
1046 UErrorCode
*pErrorCode
) {
1047 UConverter
*cnv
=pArgs
->converter
;
1048 const char *source
=pArgs
->source
;
1049 const char *sourceLimit
=pArgs
->sourceLimit
;
1050 int32_t *offsets
=pArgs
->offsets
;
1052 int32_t state
, offsetDelta
;
1058 * If we detect a BOM in this buffer, then we must add the BOM size to the
1059 * offsets because the actual converter function will not see and count the BOM.
1060 * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1064 while(source
<sourceLimit
&& U_SUCCESS(*pErrorCode
)) {
1069 state
=1; /* could be 00 00 FE FF */
1070 } else if(b
==(char)0xff) {
1071 state
=5; /* could be FF FE 00 00 */
1073 state
=8; /* default to UTF-32BE */
1084 if(*source
==utf32BOM
[state
]) {
1088 state
=8; /* detect UTF-32BE */
1089 offsetDelta
=(int32_t)(source
-pArgs
->source
);
1090 } else if(state
==8) {
1091 state
=9; /* detect UTF-32LE */
1092 offsetDelta
=(int32_t)(source
-pArgs
->source
);
1095 /* switch to UTF-32BE and pass the previous bytes */
1096 int32_t count
=(int32_t)(source
-pArgs
->source
); /* number of bytes from this buffer */
1098 /* reset the source */
1099 source
=pArgs
->source
;
1101 if(count
==(state
&3)) {
1102 /* simple: all in the same buffer, just reset source */
1104 UBool oldFlush
=pArgs
->flush
;
1106 /* some of the bytes are from a previous buffer, replay those first */
1107 pArgs
->source
=utf32BOM
+(state
&4); /* select the correct BOM */
1108 pArgs
->sourceLimit
=pArgs
->source
+((state
&3)-count
); /* replay previous bytes */
1109 pArgs
->flush
=FALSE
; /* this sourceLimit is not the real source stream limit */
1111 /* no offsets: bytes from previous buffer, and not enough for output */
1112 T_UConverter_toUnicode_UTF32_BE(pArgs
, pErrorCode
);
1114 /* restore real pointers; pArgs->source will be set in case 8/9 */
1115 pArgs
->sourceLimit
=sourceLimit
;
1116 pArgs
->flush
=oldFlush
;
1124 pArgs
->source
=source
;
1126 T_UConverter_toUnicode_UTF32_BE(pArgs
, pErrorCode
);
1128 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs
, pErrorCode
);
1130 source
=pArgs
->source
;
1134 pArgs
->source
=source
;
1136 T_UConverter_toUnicode_UTF32_LE(pArgs
, pErrorCode
);
1138 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs
, pErrorCode
);
1140 source
=pArgs
->source
;
1143 break; /* does not occur */
1147 /* add BOM size to offsets - see comment at offsetDelta declaration */
1148 if(offsets
!=NULL
&& offsetDelta
!=0) {
1149 int32_t *offsetsLimit
=pArgs
->offsets
;
1150 while(offsets
<offsetsLimit
) {
1151 *offsets
++ += offsetDelta
;
1155 pArgs
->source
=source
;
1157 if(source
==sourceLimit
&& pArgs
->flush
) {
1158 /* handle truncated input */
1161 break; /* no input at all, nothing to do */
1163 T_UConverter_toUnicode_UTF32_BE(pArgs
, pErrorCode
);
1166 T_UConverter_toUnicode_UTF32_LE(pArgs
, pErrorCode
);
1169 /* handle 0<state<8: call UTF-32BE with too-short input */
1170 pArgs
->source
=utf32BOM
+(state
&4); /* select the correct BOM */
1171 pArgs
->sourceLimit
=pArgs
->source
+(state
&3); /* replay bytes */
1173 /* no offsets: not enough for output */
1174 T_UConverter_toUnicode_UTF32_BE(pArgs
, pErrorCode
);
1175 pArgs
->source
=source
;
1176 pArgs
->sourceLimit
=sourceLimit
;
1186 _UTF32GetNextUChar(UConverterToUnicodeArgs
*pArgs
,
1187 UErrorCode
*pErrorCode
) {
1188 switch(pArgs
->converter
->mode
) {
1190 return T_UConverter_getNextUChar_UTF32_BE(pArgs
, pErrorCode
);
1192 return T_UConverter_getNextUChar_UTF32_LE(pArgs
, pErrorCode
);
1194 return UCNV_GET_NEXT_UCHAR_USE_TO_U
;
1198 static const UConverterImpl _UTF32Impl
= {
1208 _UTF32ToUnicodeWithOffsets
,
1209 _UTF32ToUnicodeWithOffsets
,
1211 T_UConverter_fromUnicode_UTF32_BE
,
1212 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC
,
1214 T_UConverter_fromUnicode_UTF32_LE
,
1215 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC
,
1219 NULL
, /* ### TODO implement getStarters for all Unicode encodings?! */
1223 ucnv_getNonSurrogateUnicodeSet
1226 /* The 1236 CCSID refers to any version of Unicode with a BOM sensitive endianess of UTF-32 */
1227 static const UConverterStaticData _UTF32StaticData
= {
1228 sizeof(UConverterStaticData
),
1231 UCNV_IBM
, UCNV_UTF32
, 4, 4,
1233 { 0, 0, 0xff, 0xfd }, 4,
1235 { 0xfd, 0xff, 0, 0 }, 4,
1240 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1243 const UConverterSharedData _UTF32Data
= {
1244 sizeof(UConverterSharedData
), ~((uint32_t) 0),
1245 NULL
, NULL
, &_UTF32StaticData
, FALSE
, &_UTF32Impl
,