2 **********************************************************************
3 * Copyright (C) 2002-2004, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv_u16.c
8 * tab size: 8 (not used)
11 * created on: 2002jul01
12 * created by: Markus W. Scherer
14 * UTF-16 converter implementation. Used to be in ucnv_utf.c.
17 #include "unicode/utypes.h"
19 #if !UCONFIG_NO_CONVERSION
21 #include "unicode/ucnv.h"
26 /* UTF-16BE ----------------------------------------------------------------- */
/*
 * _UTF16PEFromUnicodeWithOffsets is an alias for one of the two
 * endian-specific fromUnicode functions defined in this file.
 * NOTE(review): the two definitions below are presumably the branches of a
 * platform-endianness preprocessor conditional that is not visible in this
 * listing ("PE" presumably = platform endian) - confirm against the full file.
 */
29 # define _UTF16PEFromUnicodeWithOffsets _UTF16BEFromUnicodeWithOffsets
31 # define _UTF16PEFromUnicodeWithOffsets _UTF16LEFromUnicodeWithOffsets
/*
 * Convert UTF-16 code units from pArgs->source into big-endian bytes in
 * pArgs->target, optionally recording source indexes in pArgs->offsets.
 *
 * Visible behavior:
 * - sets U_BUFFER_OVERFLOW_ERROR when there is no target capacity at all;
 * - resumes with a code unit saved in cnv->fromUChar32 from an earlier call;
 * - writes each code unit high byte first (c>>8, then c);
 * - sets U_ILLEGAL_CHAR_FOUND for an unmatched lead or trail surrogate;
 * - output that does not fit is staged in overflow[] and handed to
 *   ucnv_fromUWriteBytes();
 * - writes the advanced source/target/offsets pointers back to pArgs.
 *
 * NOTE(review): this listing is an extraction with fused line numbers and
 * missing lines (declarations, loop headers, closing braces); the comments
 * describe only the statements that are visible.
 */
35 _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
36 UErrorCode
*pErrorCode
) {
42 int32_t targetCapacity
, length
, count
, sourceIndex
;
/* remaining input, as the pointer difference sourceLimit-source */
47 length
=pArgs
->sourceLimit
-source
;
49 /* no input, nothing to do */
/* remaining output room, as the pointer difference targetLimit-target */
53 targetCapacity
=pArgs
->targetLimit
-pArgs
->target
;
54 if(targetCapacity
<=0) {
55 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
60 target
=(uint8_t *)pArgs
->target
;
61 offsets
=pArgs
->offsets
;
64 /* c!=0 indicates in several places outside the main loops that a surrogate was found */
/* a pair needs 4 output bytes, hence the targetCapacity>=4 condition */
66 if((c
=(UChar
)cnv
->fromUChar32
)!=0 && U16_IS_TRAIL(trail
=*source
) && targetCapacity
>=4) {
67 /* the last buffer ended with a lead surrogate, output the surrogate pair */
70 target
[0]=(uint8_t)(c
>>8);
72 target
[2]=(uint8_t)(trail
>>8);
73 target
[3]=(uint8_t)trail
;
86 /* copy an even number of bytes for complete UChars */
88 if(count
>targetCapacity
) {
/* &~1 clears the low bit, rounding the byte count down to an even number */
89 count
=targetCapacity
&~1;
93 targetCapacity
-=count
;
/* branch without offsets: only target bytes are written */
100 if(U16_IS_SINGLE(c
)) {
/* big-endian byte order: high byte first, then low byte */
101 target
[0]=(uint8_t)(c
>>8);
102 target
[1]=(uint8_t)c
;
104 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 && U16_IS_TRAIL(trail
=*source
)) {
107 target
[0]=(uint8_t)(c
>>8);
108 target
[1]=(uint8_t)c
;
109 target
[2]=(uint8_t)(trail
>>8);
110 target
[3]=(uint8_t)trail
;
/* branch with offsets: same byte output plus one offsets entry per byte */
120 if(U16_IS_SINGLE(c
)) {
121 target
[0]=(uint8_t)(c
>>8);
122 target
[1]=(uint8_t)c
;
/* both output bytes map to the same source index; it advances afterwards */
124 *offsets
++=sourceIndex
;
125 *offsets
++=sourceIndex
++;
126 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 && U16_IS_TRAIL(trail
=*source
)) {
129 target
[0]=(uint8_t)(c
>>8);
130 target
[1]=(uint8_t)c
;
131 target
[2]=(uint8_t)(trail
>>8);
132 target
[3]=(uint8_t)trail
;
/* all four bytes of the pair map to the index of the lead surrogate */
134 *offsets
++=sourceIndex
;
135 *offsets
++=sourceIndex
;
136 *offsets
++=sourceIndex
;
137 *offsets
++=sourceIndex
;
147 /* done with the loop for complete UChars */
148 if(length
>0 && targetCapacity
>0) {
150 * there is more input and some target capacity -
151 * it must be targetCapacity==1 because otherwise
152 * the above would have copied more;
153 * prepare for overflow output
155 if(U16_IS_SINGLE(c
=*source
++)) {
156 overflow
[0]=(char)(c
>>8);
158 length
=2; /* 2 bytes to output */
160 /* } else { keep c for surrogate handling, length will be set there */
167 /* keep c for surrogate handling, length will be set there */
168 targetCapacity
+=2*count
;
171 length
=0; /* from here on, length counts the bytes in overflow[] */
176 * c is a surrogate, and
177 * - source or target too short
178 * - or the surrogate is unmatched
181 if(U16_IS_SURROGATE_LEAD(c
)) {
182 if(source
<pArgs
->sourceLimit
) {
183 if(U16_IS_TRAIL(trail
=*source
)) {
184 /* output the surrogate pair, will overflow (see conditions comment above) */
186 overflow
[0]=(char)(c
>>8);
188 overflow
[2]=(char)(trail
>>8);
189 overflow
[3]=(char)trail
;
190 length
=4; /* 4 bytes to output */
193 /* unmatched lead surrogate */
194 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
197 /* see if the trail surrogate is in the next buffer */
200 /* unmatched trail surrogate */
201 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
207 /* output length bytes with overflow (length>targetCapacity>0) */
208 ucnv_fromUWriteBytes(cnv
,
210 (char **)&target
, pArgs
->targetLimit
,
211 &offsets
, sourceIndex
,
/* recompute the remaining room after ucnv_fromUWriteBytes() advanced target */
213 targetCapacity
=pArgs
->targetLimit
-(char *)target
;
/* no error so far, input left, but the target is full: report overflow */
216 if(U_SUCCESS(*pErrorCode
) && source
<pArgs
->sourceLimit
&& targetCapacity
==0) {
217 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
220 /* write back the updated pointers */
221 pArgs
->source
=source
;
222 pArgs
->target
=(char *)target
;
223 pArgs
->offsets
=offsets
;
/*
 * Convert big-endian UTF-16 bytes from pArgs->source into UChars in
 * pArgs->target, optionally recording source indexes in pArgs->offsets.
 *
 * Visible behavior:
 * - sets U_BUFFER_OVERFLOW_ERROR when there is no target capacity;
 * - first completes a partial code unit or pair left over from an earlier
 *   call, using cnv->toUBytes[]/cnv->toULength and cnv->toUnicodeStatus;
 * - assembles each code unit as (byte0<<8)|byte1;
 * - a trail surrogate that does not fit is stored in cnv->UCharErrorBuffer
 *   together with U_BUFFER_OVERFLOW_ERROR;
 * - sets U_ILLEGAL_CHAR_FOUND for unmatched surrogates;
 * - a single leftover source byte is appended to cnv->toUBytes[];
 * - writes the advanced source/target/offsets pointers back to pArgs.
 *
 * NOTE(review): this listing is an extraction with fused line numbers and
 * missing lines; the comments describe only the statements that are visible.
 */
227 _UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
228 UErrorCode
*pErrorCode
) {
230 const uint8_t *source
;
234 int32_t targetCapacity
, length
, count
, sourceIndex
;
237 cnv
=pArgs
->converter
;
238 source
=(const uint8_t *)pArgs
->source
;
/* number of input bytes (difference of uint8_t pointers) */
239 length
=(const uint8_t *)pArgs
->sourceLimit
-source
;
240 if(length
<=0 && cnv
->toUnicodeStatus
==0) {
241 /* no input, nothing to do */
245 targetCapacity
=pArgs
->targetLimit
-pArgs
->target
;
246 if(targetCapacity
<=0) {
247 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
251 target
=pArgs
->target
;
252 offsets
=pArgs
->offsets
;
256 /* complete a partial UChar or pair from the last call */
257 if(cnv
->toUnicodeStatus
!=0) {
259 * special case: single byte from a previous buffer,
260 * where the byte turned out not to belong to a trail surrogate
261 * and the preceding, unmatched lead surrogate was put into toUBytes[]
/* the cast keeps only the low 8 bits of the status: the saved byte */
264 cnv
->toUBytes
[0]=(uint8_t)cnv
->toUnicodeStatus
;
266 cnv
->toUnicodeStatus
=0;
268 if((count
=cnv
->toULength
)!=0) {
269 uint8_t *p
=cnv
->toUBytes
;
/* append the next source byte to the bytes collected so far */
271 p
[count
++]=*source
++;
/* big-endian: p[0] is the high byte of the code unit */
275 c
=((UChar
)p
[0]<<8)|p
[1];
276 if(U16_IS_SINGLE(c
)) {
277 /* output the BMP code point */
286 } else if(U16_IS_SURROGATE_LEAD(c
)) {
287 /* continue collecting bytes for the trail surrogate */
288 c
=0; /* avoid unnecessary surrogate handling below */
290 /* fall through to error handling for an unmatched trail surrogate */
293 } else if(count
==4) {
/* four bytes collected: p[0..1] is the first unit, p[2..3] the second */
294 c
=((UChar
)p
[0]<<8)|p
[1];
295 trail
=((UChar
)p
[2]<<8)|p
[3];
296 if(U16_IS_TRAIL(trail
)) {
297 /* output the surrogate pair */
299 if(targetCapacity
>=2) {
306 } else /* targetCapacity==1 */ {
/* no room for the trail unit: park it in the converter's UChar error buffer */
308 cnv
->UCharErrorBuffer
[0]=trail
;
309 cnv
->UCharErrorBufferLength
=1;
310 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
316 /* unmatched lead surrogate, handle here for consistent toUBytes[] */
317 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
319 /* back out reading the code unit after it */
320 if(((const uint8_t *)pArgs
->source
-source
)>=2) {
324 * if the trail unit's first byte was in a previous buffer, then
325 * we need to put it into a special place because toUBytes[] will be
326 * used for the lead unit's bytes
/* presumably 0x100 keeps the status non-zero even when p[2]==0 - confirm */
328 cnv
->toUnicodeStatus
=0x100|p
[2];
333 /* write back the updated pointers */
334 pArgs
->source
=(const char *)source
;
335 pArgs
->target
=target
;
336 pArgs
->offsets
=offsets
;
341 cnv
->toULength
=(int8_t)count
;
344 /* copy an even number of bytes for complete UChars */
345 count
=2*targetCapacity
;
349 if(c
==0 && count
>0) {
352 targetCapacity
-=count
;
/* branch without offsets */
355 c
=((UChar
)source
[0]<<8)|source
[1];
357 if(U16_IS_SINGLE(c
)) {
359 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 &&
360 U16_IS_TRAIL(trail
=((UChar
)source
[0]<<8)|source
[1])
/* branch with offsets */
372 c
=((UChar
)source
[0]<<8)|source
[1];
374 if(U16_IS_SINGLE(c
)) {
376 *offsets
++=sourceIndex
;
378 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 &&
379 U16_IS_TRAIL(trail
=((UChar
)source
[0]<<8)|source
[1])
/* two offsets entries with the same source index, one per UChar of the pair */
385 *offsets
++=sourceIndex
;
386 *offsets
++=sourceIndex
;
395 /* done with the loop for complete UChars */
398 /* keep c for surrogate handling, trail will be set there */
399 length
+=2*(count
-1); /* one more byte pair was consumed than count decremented */
400 targetCapacity
+=count
;
406 * c is a surrogate, and
407 * - source or target too short
408 * - or the surrogate is unmatched
/* save the bytes of c in toUBytes[], high byte first */
410 cnv
->toUBytes
[0]=(uint8_t)(c
>>8);
411 cnv
->toUBytes
[1]=(uint8_t)c
;
414 if(U16_IS_SURROGATE_LEAD(c
)) {
416 if(U16_IS_TRAIL(trail
=((UChar
)source
[0]<<8)|source
[1])) {
417 /* output the surrogate pair, will overflow (see conditions comment above) */
422 *offsets
++=sourceIndex
;
424 cnv
->UCharErrorBuffer
[0]=trail
;
425 cnv
->UCharErrorBufferLength
=1;
427 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
429 /* unmatched lead surrogate */
430 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
433 /* see if the trail surrogate is in the next buffer */
436 /* unmatched trail surrogate */
437 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
441 if(U_SUCCESS(*pErrorCode
)) {
442 /* check for a remaining source byte */
444 if(targetCapacity
==0) {
445 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
447 /* it must be length==1 because otherwise the above would have copied more */
448 cnv
->toUBytes
[cnv
->toULength
++]=*source
++;
453 /* write back the updated pointers */
454 pArgs
->source
=(const char *)source
;
455 pArgs
->target
=target
;
456 pArgs
->offsets
=offsets
;
/*
 * Read one code point from big-endian UTF-16 bytes at pArgs->source.
 *
 * Visible behavior:
 * - *err is set to U_INDEX_OUTOFBOUNDS_ERROR (the guarding condition is not
 *   visible in this listing);
 * - with fewer than 2 bytes available, the single byte is stored in
 *   toUBytes[] and *err is set to U_TRUNCATED_CHAR_FOUND;
 * - a lead surrogate followed by a trail surrogate is combined with
 *   U16_GET_SUPPLEMENTARY();
 * - a lead surrogate with too few following bytes yields
 *   U_TRUNCATED_CHAR_FOUND;
 * - an unmatched surrogate yields U_ILLEGAL_CHAR_FOUND with toULength=2;
 * - pArgs->source is updated to the read position.
 *
 * NOTE(review): return statements and the return type are not visible here.
 */
460 _UTF16BEGetNextUChar(UConverterToUnicodeArgs
*pArgs
, UErrorCode
*err
) {
461 const uint8_t *s
, *sourceLimit
;
464 s
=(const uint8_t *)pArgs
->source
;
465 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
469 *err
=U_INDEX_OUTOFBOUNDS_ERROR
;
/* fewer than two bytes left: cannot form a complete code unit */
473 if(s
+2>sourceLimit
) {
474 /* only one byte: truncated UChar */
475 pArgs
->converter
->toUBytes
[0]=*s
++;
476 pArgs
->converter
->toULength
=1;
477 pArgs
->source
=(const char *)s
;
478 *err
= U_TRUNCATED_CHAR_FOUND
;
/* big-endian: the first byte is the high byte */
483 c
=((UChar32
)*s
<<8)|s
[1];
486 /* check for a surrogate pair */
487 if(U_IS_SURROGATE(c
)) {
488 if(U16_IS_SURROGATE_LEAD(c
)) {
489 if(s
+2<=sourceLimit
) {
492 /* get a second UChar and see if it is a trail surrogate */
493 trail
=((UChar
)*s
<<8)|s
[1];
494 if(U16_IS_TRAIL(trail
)) {
495 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
498 /* unmatched lead surrogate */
502 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
503 uint8_t *bytes
=pArgs
->converter
->toUBytes
;
/* toULength is set from the pointer difference sourceLimit-s */
505 pArgs
->converter
->toULength
=(int8_t)(sourceLimit
-s
);
508 } while(s
<sourceLimit
);
511 *err
=U_TRUNCATED_CHAR_FOUND
;
514 /* unmatched trail surrogate */
519 /* write the unmatched surrogate */
520 uint8_t *bytes
=pArgs
->converter
->toUBytes
;
521 pArgs
->converter
->toULength
=2;
526 *err
=U_ILLEGAL_CHAR_FOUND
;
530 pArgs
->source
=(const char *)s
;
/*
 * Implementation table for the UTF-16BE converter.
 * The toUnicode and fromUnicode functions are each listed twice.
 * NOTE(review): presumably these fill the plain and the with-offsets slots of
 * UConverterImpl - confirm against its declaration; several initializer
 * lines are not visible in this listing.
 */
534 static const UConverterImpl _UTF16BEImpl
={
535 UCNV_UTF16_BigEndian
,
544 _UTF16BEToUnicodeWithOffsets
,
545 _UTF16BEToUnicodeWithOffsets
,
546 _UTF16BEFromUnicodeWithOffsets
,
547 _UTF16BEFromUnicodeWithOffsets
,
548 _UTF16BEGetNextUChar
,
554 ucnv_getCompleteUnicodeSet
/*
 * Static description of the UTF-16BE converter.
 * NOTE(review): field meanings are not visible here. Presumably 1200 is the
 * codepage number, "2, 2" the min/max bytes per UChar, and { 0xff, 0xfd } with
 * length 2 the substitution character U+FFFD in big-endian byte order -
 * confirm against the UConverterStaticData declaration.
 */
557 static const UConverterStaticData _UTF16BEStaticData
={
558 sizeof(UConverterStaticData
),
560 1200, UCNV_IBM
, UCNV_UTF16_BigEndian
, 2, 2,
561 { 0xff, 0xfd, 0, 0 },2,FALSE
,FALSE
,
564 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/*
 * Shared-data record for UTF-16BE (not static, so it has external linkage).
 * It points at _UTF16BEStaticData and _UTF16BEImpl defined in this file.
 * NOTE(review): the meaning of the other fields (~0, NULL, FALSE) is not
 * visible here - confirm against the UConverterSharedData declaration.
 */
568 const UConverterSharedData _UTF16BEData
={
569 sizeof(UConverterSharedData
), ~((uint32_t) 0),
570 NULL
, NULL
, &_UTF16BEStaticData
, FALSE
, &_UTF16BEImpl
,
574 /* UTF-16LE ----------------------------------------------------------------- */
/*
 * Convert UTF-16 code units from pArgs->source into little-endian bytes in
 * pArgs->target, optionally recording source indexes in pArgs->offsets.
 *
 * Visible behavior:
 * - sets U_BUFFER_OVERFLOW_ERROR when there is no target capacity at all;
 * - resumes with a code unit saved in cnv->fromUChar32 from an earlier call
 *   and clears that field afterwards;
 * - writes each code unit low byte first (c, then c>>8);
 * - sets U_ILLEGAL_CHAR_FOUND for an unmatched lead or trail surrogate;
 * - output that does not fit is staged in overflow[] and handed to
 *   ucnv_fromUWriteBytes();
 * - writes the advanced source/target/offsets pointers back to pArgs.
 *
 * NOTE(review): this listing is an extraction with fused line numbers and
 * missing lines; the comments describe only the statements that are visible.
 */
577 _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
578 UErrorCode
*pErrorCode
) {
584 int32_t targetCapacity
, length
, count
, sourceIndex
;
588 source
=pArgs
->source
;
/* remaining input, as the pointer difference sourceLimit-source */
589 length
=pArgs
->sourceLimit
-source
;
591 /* no input, nothing to do */
595 targetCapacity
=pArgs
->targetLimit
-pArgs
->target
;
596 if(targetCapacity
<=0) {
597 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
601 cnv
=pArgs
->converter
;
602 target
=(uint8_t *)pArgs
->target
;
603 offsets
=pArgs
->offsets
;
606 /* c!=0 indicates in several places outside the main loops that a surrogate was found */
/* a pair needs 4 output bytes, hence the targetCapacity>=4 condition */
608 if((c
=(UChar
)cnv
->fromUChar32
)!=0 && U16_IS_TRAIL(trail
=*source
) && targetCapacity
>=4) {
609 /* the last buffer ended with a lead surrogate, output the surrogate pair */
/* little-endian byte order: low byte first, then high byte */
612 target
[0]=(uint8_t)c
;
613 target
[1]=(uint8_t)(c
>>8);
614 target
[2]=(uint8_t)trail
;
615 target
[3]=(uint8_t)(trail
>>8);
/* the saved lead surrogate has been consumed */
625 cnv
->fromUChar32
=c
=0;
628 /* copy an even number of bytes for complete UChars */
630 if(count
>targetCapacity
) {
/* &~1 clears the low bit, rounding the byte count down to an even number */
631 count
=targetCapacity
&~1;
635 targetCapacity
-=count
;
/* branch without offsets: only target bytes are written */
642 if(U16_IS_SINGLE(c
)) {
643 target
[0]=(uint8_t)c
;
644 target
[1]=(uint8_t)(c
>>8);
646 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 && U16_IS_TRAIL(trail
=*source
)) {
649 target
[0]=(uint8_t)c
;
650 target
[1]=(uint8_t)(c
>>8);
651 target
[2]=(uint8_t)trail
;
652 target
[3]=(uint8_t)(trail
>>8);
/* branch with offsets: same byte output plus one offsets entry per byte */
662 if(U16_IS_SINGLE(c
)) {
663 target
[0]=(uint8_t)c
;
664 target
[1]=(uint8_t)(c
>>8);
/* both output bytes map to the same source index; it advances afterwards */
666 *offsets
++=sourceIndex
;
667 *offsets
++=sourceIndex
++;
668 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 && U16_IS_TRAIL(trail
=*source
)) {
671 target
[0]=(uint8_t)c
;
672 target
[1]=(uint8_t)(c
>>8);
673 target
[2]=(uint8_t)trail
;
674 target
[3]=(uint8_t)(trail
>>8);
/* all four bytes of the pair map to the index of the lead surrogate */
676 *offsets
++=sourceIndex
;
677 *offsets
++=sourceIndex
;
678 *offsets
++=sourceIndex
;
679 *offsets
++=sourceIndex
;
689 /* done with the loop for complete UChars */
690 if(length
>0 && targetCapacity
>0) {
692 * there is more input and some target capacity -
693 * it must be targetCapacity==1 because otherwise
694 * the above would have copied more;
695 * prepare for overflow output
697 if(U16_IS_SINGLE(c
=*source
++)) {
699 overflow
[1]=(char)(c
>>8);
700 length
=2; /* 2 bytes to output */
702 /* } else { keep c for surrogate handling, length will be set there */
709 /* keep c for surrogate handling, length will be set there */
710 targetCapacity
+=2*count
;
713 length
=0; /* from here on, length counts the bytes in overflow[] */
718 * c is a surrogate, and
719 * - source or target too short
720 * - or the surrogate is unmatched
723 if(U16_IS_SURROGATE_LEAD(c
)) {
724 if(source
<pArgs
->sourceLimit
) {
725 if(U16_IS_TRAIL(trail
=*source
)) {
726 /* output the surrogate pair, will overflow (see conditions comment above) */
729 overflow
[1]=(char)(c
>>8);
730 overflow
[2]=(char)trail
;
731 overflow
[3]=(char)(trail
>>8);
732 length
=4; /* 4 bytes to output */
735 /* unmatched lead surrogate */
736 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
739 /* see if the trail surrogate is in the next buffer */
742 /* unmatched trail surrogate */
743 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
749 /* output length bytes with overflow (length>targetCapacity>0) */
750 ucnv_fromUWriteBytes(cnv
,
752 (char **)&target
, pArgs
->targetLimit
,
753 &offsets
, sourceIndex
,
/* recompute the remaining room after ucnv_fromUWriteBytes() advanced target */
755 targetCapacity
=pArgs
->targetLimit
-(char *)target
;
/* no error so far, input left, but the target is full: report overflow */
758 if(U_SUCCESS(*pErrorCode
) && source
<pArgs
->sourceLimit
&& targetCapacity
==0) {
759 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
762 /* write back the updated pointers */
763 pArgs
->source
=source
;
764 pArgs
->target
=(char *)target
;
765 pArgs
->offsets
=offsets
;
/*
 * Convert little-endian UTF-16 bytes from pArgs->source into UChars in
 * pArgs->target, optionally recording source indexes in pArgs->offsets.
 *
 * Visible behavior:
 * - sets U_BUFFER_OVERFLOW_ERROR when there is no target capacity;
 * - first completes a partial code unit or pair left over from an earlier
 *   call, using cnv->toUBytes[]/cnv->toULength and cnv->toUnicodeStatus;
 * - assembles each code unit as (byte1<<8)|byte0;
 * - a trail surrogate that does not fit is stored in cnv->UCharErrorBuffer
 *   together with U_BUFFER_OVERFLOW_ERROR;
 * - sets U_ILLEGAL_CHAR_FOUND for unmatched surrogates;
 * - a single leftover source byte is appended to cnv->toUBytes[];
 * - writes the advanced source/target/offsets pointers back to pArgs.
 *
 * NOTE(review): this listing is an extraction with fused line numbers and
 * missing lines; the comments describe only the statements that are visible.
 */
769 _UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
770 UErrorCode
*pErrorCode
) {
772 const uint8_t *source
;
776 int32_t targetCapacity
, length
, count
, sourceIndex
;
779 cnv
=pArgs
->converter
;
780 source
=(const uint8_t *)pArgs
->source
;
/* number of input bytes (difference of uint8_t pointers) */
781 length
=(const uint8_t *)pArgs
->sourceLimit
-source
;
782 if(length
<=0 && cnv
->toUnicodeStatus
==0) {
783 /* no input, nothing to do */
787 targetCapacity
=pArgs
->targetLimit
-pArgs
->target
;
788 if(targetCapacity
<=0) {
789 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
793 target
=pArgs
->target
;
794 offsets
=pArgs
->offsets
;
798 /* complete a partial UChar or pair from the last call */
799 if(cnv
->toUnicodeStatus
!=0) {
801 * special case: single byte from a previous buffer,
802 * where the byte turned out not to belong to a trail surrogate
803 * and the preceding, unmatched lead surrogate was put into toUBytes[]
/* the cast keeps only the low 8 bits of the status: the saved byte */
806 cnv
->toUBytes
[0]=(uint8_t)cnv
->toUnicodeStatus
;
808 cnv
->toUnicodeStatus
=0;
810 if((count
=cnv
->toULength
)!=0) {
811 uint8_t *p
=cnv
->toUBytes
;
/* append the next source byte to the bytes collected so far */
813 p
[count
++]=*source
++;
/* little-endian: p[1] is the high byte of the code unit */
817 c
=((UChar
)p
[1]<<8)|p
[0];
818 if(U16_IS_SINGLE(c
)) {
819 /* output the BMP code point */
828 } else if(U16_IS_SURROGATE_LEAD(c
)) {
829 /* continue collecting bytes for the trail surrogate */
830 c
=0; /* avoid unnecessary surrogate handling below */
832 /* fall through to error handling for an unmatched trail surrogate */
835 } else if(count
==4) {
/* four bytes collected: p[0..1] is the first unit, p[2..3] the second */
836 c
=((UChar
)p
[1]<<8)|p
[0];
837 trail
=((UChar
)p
[3]<<8)|p
[2];
838 if(U16_IS_TRAIL(trail
)) {
839 /* output the surrogate pair */
841 if(targetCapacity
>=2) {
848 } else /* targetCapacity==1 */ {
/* no room for the trail unit: park it in the converter's UChar error buffer */
850 cnv
->UCharErrorBuffer
[0]=trail
;
851 cnv
->UCharErrorBufferLength
=1;
852 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
858 /* unmatched lead surrogate, handle here for consistent toUBytes[] */
859 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
861 /* back out reading the code unit after it */
862 if(((const uint8_t *)pArgs
->source
-source
)>=2) {
866 * if the trail unit's first byte was in a previous buffer, then
867 * we need to put it into a special place because toUBytes[] will be
868 * used for the lead unit's bytes
/* presumably 0x100 keeps the status non-zero even when p[2]==0 - confirm */
870 cnv
->toUnicodeStatus
=0x100|p
[2];
875 /* write back the updated pointers */
876 pArgs
->source
=(const char *)source
;
877 pArgs
->target
=target
;
878 pArgs
->offsets
=offsets
;
883 cnv
->toULength
=(int8_t)count
;
886 /* copy an even number of bytes for complete UChars */
887 count
=2*targetCapacity
;
891 if(c
==0 && count
>0) {
894 targetCapacity
-=count
;
/* branch without offsets */
897 c
=((UChar
)source
[1]<<8)|source
[0];
899 if(U16_IS_SINGLE(c
)) {
901 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 &&
902 U16_IS_TRAIL(trail
=((UChar
)source
[1]<<8)|source
[0])
/* branch with offsets */
914 c
=((UChar
)source
[1]<<8)|source
[0];
916 if(U16_IS_SINGLE(c
)) {
918 *offsets
++=sourceIndex
;
920 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 &&
921 U16_IS_TRAIL(trail
=((UChar
)source
[1]<<8)|source
[0])
/* two offsets entries with the same source index, one per UChar of the pair */
927 *offsets
++=sourceIndex
;
928 *offsets
++=sourceIndex
;
937 /* done with the loop for complete UChars */
940 /* keep c for surrogate handling, trail will be set there */
941 length
+=2*(count
-1); /* one more byte pair was consumed than count decremented */
942 targetCapacity
+=count
;
948 * c is a surrogate, and
949 * - source or target too short
950 * - or the surrogate is unmatched
/* save the bytes of c in toUBytes[], low byte first */
952 cnv
->toUBytes
[0]=(uint8_t)c
;
953 cnv
->toUBytes
[1]=(uint8_t)(c
>>8);
956 if(U16_IS_SURROGATE_LEAD(c
)) {
958 if(U16_IS_TRAIL(trail
=((UChar
)source
[1]<<8)|source
[0])) {
959 /* output the surrogate pair, will overflow (see conditions comment above) */
964 *offsets
++=sourceIndex
;
966 cnv
->UCharErrorBuffer
[0]=trail
;
967 cnv
->UCharErrorBufferLength
=1;
969 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
971 /* unmatched lead surrogate */
972 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
975 /* see if the trail surrogate is in the next buffer */
978 /* unmatched trail surrogate */
979 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
983 if(U_SUCCESS(*pErrorCode
)) {
984 /* check for a remaining source byte */
986 if(targetCapacity
==0) {
987 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
989 /* it must be length==1 because otherwise the above would have copied more */
990 cnv
->toUBytes
[cnv
->toULength
++]=*source
++;
995 /* write back the updated pointers */
996 pArgs
->source
=(const char *)source
;
997 pArgs
->target
=target
;
998 pArgs
->offsets
=offsets
;
/*
 * Read one code point from little-endian UTF-16 bytes at pArgs->source.
 *
 * Visible behavior:
 * - no input (s>=sourceLimit) sets *err to U_INDEX_OUTOFBOUNDS_ERROR;
 * - with fewer than 2 bytes available, the single byte is stored in
 *   toUBytes[] and *err is set to U_TRUNCATED_CHAR_FOUND;
 * - a lead surrogate followed by a trail surrogate is combined with
 *   U16_GET_SUPPLEMENTARY();
 * - a lead surrogate with too few following bytes yields
 *   U_TRUNCATED_CHAR_FOUND;
 * - an unmatched surrogate yields U_ILLEGAL_CHAR_FOUND with toULength=2;
 * - pArgs->source is updated to the read position.
 *
 * NOTE(review): return statements and the return type are not visible here.
 */
1002 _UTF16LEGetNextUChar(UConverterToUnicodeArgs
*pArgs
, UErrorCode
*err
) {
1003 const uint8_t *s
, *sourceLimit
;
1006 s
=(const uint8_t *)pArgs
->source
;
1007 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
1009 if(s
>=sourceLimit
) {
1011 *err
=U_INDEX_OUTOFBOUNDS_ERROR
;
/* fewer than two bytes left: cannot form a complete code unit */
1015 if(s
+2>sourceLimit
) {
1016 /* only one byte: truncated UChar */
1017 pArgs
->converter
->toUBytes
[0]=*s
++;
1018 pArgs
->converter
->toULength
=1;
1019 pArgs
->source
=(const char *)s
;
1020 *err
= U_TRUNCATED_CHAR_FOUND
;
/* little-endian: the second byte is the high byte */
1025 c
=((UChar32
)s
[1]<<8)|*s
;
1028 /* check for a surrogate pair */
1029 if(U_IS_SURROGATE(c
)) {
1030 if(U16_IS_SURROGATE_LEAD(c
)) {
1031 if(s
+2<=sourceLimit
) {
1034 /* get a second UChar and see if it is a trail surrogate */
1035 trail
=((UChar
)s
[1]<<8)|*s
;
1036 if(U16_IS_TRAIL(trail
)) {
1037 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
1040 /* unmatched lead surrogate */
1044 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
1045 uint8_t *bytes
=pArgs
->converter
->toUBytes
;
/* toULength is set from the pointer difference sourceLimit-s */
1047 pArgs
->converter
->toULength
=(int8_t)(sourceLimit
-s
);
1050 } while(s
<sourceLimit
);
1053 *err
=U_TRUNCATED_CHAR_FOUND
;
1056 /* unmatched trail surrogate */
1061 /* write the unmatched surrogate */
1062 uint8_t *bytes
=pArgs
->converter
->toUBytes
;
1063 pArgs
->converter
->toULength
=2;
1068 *err
=U_ILLEGAL_CHAR_FOUND
;
1072 pArgs
->source
=(const char *)s
;
/*
 * Implementation table for the UTF-16LE converter.
 * The toUnicode and fromUnicode functions are each listed twice.
 * NOTE(review): presumably these fill the plain and the with-offsets slots of
 * UConverterImpl - confirm against its declaration; several initializer
 * lines are not visible in this listing.
 */
1076 static const UConverterImpl _UTF16LEImpl
={
1077 UCNV_UTF16_LittleEndian
,
1086 _UTF16LEToUnicodeWithOffsets
,
1087 _UTF16LEToUnicodeWithOffsets
,
1088 _UTF16LEFromUnicodeWithOffsets
,
1089 _UTF16LEFromUnicodeWithOffsets
,
1090 _UTF16LEGetNextUChar
,
1096 ucnv_getCompleteUnicodeSet
/*
 * Static description of the UTF-16LE converter.
 * NOTE(review): field meanings are not visible here. Presumably 1202 is the
 * codepage number, "2, 2" the min/max bytes per UChar, and { 0xfd, 0xff } with
 * length 2 the substitution character U+FFFD in little-endian byte order -
 * confirm against the UConverterStaticData declaration.
 */
1100 static const UConverterStaticData _UTF16LEStaticData
={
1101 sizeof(UConverterStaticData
),
1103 1202, UCNV_IBM
, UCNV_UTF16_LittleEndian
, 2, 2,
1104 { 0xfd, 0xff, 0, 0 },2,FALSE
,FALSE
,
1107 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/*
 * Shared-data record for UTF-16LE (not static, so it has external linkage).
 * It points at _UTF16LEStaticData and _UTF16LEImpl defined in this file.
 * NOTE(review): the meaning of the other fields (~0, NULL, FALSE) is not
 * visible here - confirm against the UConverterSharedData declaration.
 */
1111 const UConverterSharedData _UTF16LEData
={
1112 sizeof(UConverterSharedData
), ~((uint32_t) 0),
1113 NULL
, NULL
, &_UTF16LEStaticData
, FALSE
, &_UTF16LEImpl
,
1117 /* UTF-16 (Detect BOM) ------------------------------------------------------ */
1120 * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE
1122 * This is a simpler version of the UTF-32 converter below, with
1123 * fewer states for shorter BOMs.
1134 * During detection: state&3==number of matching bytes so far.
1136 * On output, emit U+FEFF as the first code point.
/*
 * Reset the BOM-detecting UTF-16 converter.
 *
 * For choice<=UCNV_RESET_TO_UNICODE the toUnicode side is reset (the
 * statement itself is not visible in this listing). For any choice other
 * than UCNV_RESET_TO_UNICODE the fromUnicode side is reset by preloading
 * cnv->charErrorBuffer with a 2-byte BOM.
 *
 * NOTE(review): two BOM byte orders are assigned below (FE FF and FF FE);
 * presumably a platform-endianness preprocessor conditional selects one of
 * them - the conditional lines are not visible here.
 */
1140 _UTF16Reset(UConverter
*cnv
, UConverterResetChoice choice
) {
1141 if(choice
<=UCNV_RESET_TO_UNICODE
) {
1142 /* reset toUnicode: state=0 */
1145 if(choice
!=UCNV_RESET_TO_UNICODE
) {
1146 /* reset fromUnicode: prepare to output the UTF-16PE BOM */
1147 cnv
->charErrorBufferLength
=2;
/* BOM bytes FE FF */
1149 cnv
->charErrorBuffer
[0]=0xfe;
1150 cnv
->charErrorBuffer
[1]=0xff;
/* BOM bytes FF FE */
1152 cnv
->charErrorBuffer
[0]=0xff;
1153 cnv
->charErrorBuffer
[1]=0xfe;
/*
 * Open callback for the BOM-detecting UTF-16 converter: the only visible
 * action is a full reset of both directions via _UTF16Reset().
 * NOTE(review): some parameter lines between cnv and pErrorCode are not
 * visible in this listing; pErrorCode is not used in the visible body.
 */
1159 _UTF16Open(UConverter
*cnv
,
1163 UErrorCode
*pErrorCode
) {
1164 _UTF16Reset(cnv
, UCNV_RESET_BOTH
);
/*
 * The two BOM byte sequences, each at a 4-byte-aligned position:
 * index 0..1 = FE FF, index 4..5 = FF FE.
 * _UTF16ToUnicodeWithOffsets indexes this with the detection state
 * (utf16BOM[state]) and selects a sequence with utf16BOM+(state&4).
 */
1167 static const char utf16BOM
[8]={ (char)0xfe, (char)0xff, 0, 0, (char)0xff, (char)0xfe, 0, 0 };
/*
 * toUnicode for the BOM-detecting UTF-16 converter.
 *
 * Inspects the first bytes of the stream for a BOM and then delegates to
 * _UTF16BEToUnicodeWithOffsets() or _UTF16LEToUnicodeWithOffsets().
 *
 * State values visible in this function:
 *   1  first byte was FE (could be FE FF)
 *   5  first byte was FF (could be FF FE)
 *   8  UTF-16BE (detected, or the default when no BOM matches)
 *   9  UTF-16LE detected
 * When a BOM is found, offsetDelta records how many BOM bytes were in the
 * current buffer, and the offsets written by the delegate are shifted by
 * that amount. With flush set and the source exhausted, truncated input is
 * passed to the BE/LE function, or the partially matched BOM bytes are
 * replayed through the BE function.
 *
 * NOTE(review): the switch/case skeleton and where the state is loaded from
 * and stored to are not visible in this listing.
 */
1170 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
1171 UErrorCode
*pErrorCode
) {
1172 UConverter
*cnv
=pArgs
->converter
;
1173 const char *source
=pArgs
->source
;
1174 const char *sourceLimit
=pArgs
->sourceLimit
;
/* remember where this call's offsets start, for the BOM adjustment below */
1175 int32_t *offsets
=pArgs
->offsets
;
1177 int32_t state
, offsetDelta
;
1183 * If we detect a BOM in this buffer, then we must add the BOM size to the
1184 * offsets because the actual converter function will not see and count the BOM.
1185 * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1189 while(source
<sourceLimit
&& U_SUCCESS(*pErrorCode
)) {
1194 state
=1; /* could be FE FF */
1195 } else if(b
==(char)0xff) {
1196 state
=5; /* could be FF FE */
1198 state
=8; /* default to UTF-16BE */
/* compare the current byte with the next expected BOM byte */
1205 if(*source
==utf16BOM
[state
]) {
1208 state
=8; /* detect UTF-16BE */
1209 offsetDelta
=source
-pArgs
->source
;
1210 } else if(state
==5) {
1211 state
=9; /* detect UTF-16LE */
1212 offsetDelta
=source
-pArgs
->source
;
1215 /* switch to UTF-16BE and pass the previous bytes */
1216 if(source
!=pArgs
->source
) {
1217 /* just reset the source */
1218 source
=pArgs
->source
;
/* save the caller's flush flag; it is restored after the replay */
1220 UBool oldFlush
=pArgs
->flush
;
1222 /* the first byte is from a previous buffer, replay it first */
1223 pArgs
->source
=utf16BOM
+(state
&4); /* select the correct BOM */
1224 pArgs
->sourceLimit
=pArgs
->source
+1; /* replay previous byte */
1225 pArgs
->flush
=FALSE
; /* this sourceLimit is not the real source stream limit */
1227 _UTF16BEToUnicodeWithOffsets(pArgs
, pErrorCode
);
1229 /* restore real pointers; pArgs->source will be set in case 8/9 */
1230 pArgs
->sourceLimit
=sourceLimit
;
1231 pArgs
->flush
=oldFlush
;
/* delegate to the big-endian converter and pick up its source position */
1239 pArgs
->source
=source
;
1240 _UTF16BEToUnicodeWithOffsets(pArgs
, pErrorCode
);
1241 source
=pArgs
->source
;
/* delegate to the little-endian converter and pick up its source position */
1245 pArgs
->source
=source
;
1246 _UTF16LEToUnicodeWithOffsets(pArgs
, pErrorCode
);
1247 source
=pArgs
->source
;
1250 break; /* does not occur */
1254 /* add BOM size to offsets - see comment at offsetDelta declaration */
1255 if(offsets
!=NULL
&& offsetDelta
!=0) {
/* shift every offset written during this call, up to the current pArgs->offsets */
1256 int32_t *offsetsLimit
=pArgs
->offsets
;
1257 while(offsets
<offsetsLimit
) {
1258 *offsets
++ += offsetDelta
;
1262 pArgs
->source
=source
;
1264 if(source
==sourceLimit
&& pArgs
->flush
) {
1265 /* handle truncated input */
1268 break; /* no input at all, nothing to do */
1270 _UTF16BEToUnicodeWithOffsets(pArgs
, pErrorCode
);
1273 _UTF16LEToUnicodeWithOffsets(pArgs
, pErrorCode
);
1276 /* handle 0<state<8: call UTF-16BE with too-short input */
1277 pArgs
->source
=utf16BOM
+(state
&4); /* select the correct BOM */
1278 pArgs
->sourceLimit
=pArgs
->source
+(state
&3); /* replay bytes */
1280 /* no offsets: not enough for output */
1281 _UTF16BEToUnicodeWithOffsets(pArgs
, pErrorCode
);
/* restore the caller's real source pointers after the replay */
1282 pArgs
->source
=source
;
1283 pArgs
->sourceLimit
=sourceLimit
;
/*
 * getNextUChar for the BOM-detecting UTF-16 converter: dispatches on
 * pArgs->converter->mode to the BE or LE implementation, or returns
 * UCNV_GET_NEXT_UCHAR_USE_TO_U.
 * NOTE(review): the case labels are not visible in this listing; presumably
 * they are the detected states 8 (BE) and 9 (LE), with the last return as
 * the default branch - confirm against the full file.
 */
1293 _UTF16GetNextUChar(UConverterToUnicodeArgs
*pArgs
,
1294 UErrorCode
*pErrorCode
) {
1295 switch(pArgs
->converter
->mode
) {
1297 return _UTF16BEGetNextUChar(pArgs
, pErrorCode
);
1299 return _UTF16LEGetNextUChar(pArgs
, pErrorCode
);
1301 return UCNV_GET_NEXT_UCHAR_USE_TO_U
;
/*
 * Implementation table for the BOM-detecting UTF-16 converter.
 * toUnicode uses the BOM-detecting function; fromUnicode uses the
 * _UTF16PEFromUnicodeWithOffsets alias. Each is listed twice.
 * NOTE(review): presumably these fill the plain and the with-offsets slots
 * of UConverterImpl - confirm against its declaration; several initializer
 * lines are not visible in this listing.
 */
1305 static const UConverterImpl _UTF16Impl
= {
1315 _UTF16ToUnicodeWithOffsets
,
1316 _UTF16ToUnicodeWithOffsets
,
1317 _UTF16PEFromUnicodeWithOffsets
,
1318 _UTF16PEFromUnicodeWithOffsets
,
1321 NULL
, /* ### TODO implement getStarters for all Unicode encodings?! */
1325 ucnv_getCompleteUnicodeSet
/*
 * Static description of the BOM-detecting UTF-16 converter.
 * NOTE(review): two 2-byte sequences appear below (FF FD and FD FF);
 * presumably they are the substitution character in the two byte orders,
 * selected by a preprocessor conditional that is not visible here. Other
 * field meanings should be confirmed against UConverterStaticData.
 */
1328 static const UConverterStaticData _UTF16StaticData
= {
1329 sizeof(UConverterStaticData
),
1331 0, /* ### TODO review correctness of all Unicode CCSIDs */
1332 UCNV_IBM
, UCNV_UTF16
, 2, 2,
1334 { 0xff, 0xfd, 0, 0 }, 2,
1336 { 0xfd, 0xff, 0, 0 }, 2,
1341 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1344 const UConverterSharedData _UTF16Data
= {
1345 sizeof(UConverterSharedData
), ~((uint32_t) 0),
1346 NULL
, NULL
, &_UTF16StaticData
, FALSE
, &_UTF16Impl
,