1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 2002-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * file name: ucnv_u16.c
10 * tab size: 8 (not used)
13 * created on: 2002jul01
14 * created by: Markus W. Scherer
16 * UTF-16 converter implementation. Used to be in ucnv_utf.c.
19 #include "unicode/utypes.h"
21 #if !UCONFIG_NO_CONVERSION
23 #include "unicode/ucnv.h"
24 #include "unicode/uversion.h"
30 UCNV_NEED_TO_WRITE_BOM
=1
35 * The UTF-16 toUnicode implementation is also used for the Java-specific
36 * "with BOM" variants of UTF-16BE and UTF-16LE.
38 static void U_CALLCONV
39 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
40 UErrorCode
*pErrorCode
);
42 /* UTF-16BE ----------------------------------------------------------------- */
45 # define _UTF16PEFromUnicodeWithOffsets _UTF16BEFromUnicodeWithOffsets
47 # define _UTF16PEFromUnicodeWithOffsets _UTF16LEFromUnicodeWithOffsets
51 static void U_CALLCONV
52 _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
53 UErrorCode
*pErrorCode
) {
59 uint32_t targetCapacity
, length
, sourceIndex
;
64 length
=(int32_t)(pArgs
->sourceLimit
-source
);
66 /* no input, nothing to do */
72 /* write the BOM if necessary */
73 if(cnv
->fromUnicodeStatus
==UCNV_NEED_TO_WRITE_BOM
) {
74 static const char bom
[]={ (char)0xfe, (char)0xff };
75 ucnv_fromUWriteBytes(cnv
,
77 &pArgs
->target
, pArgs
->targetLimit
,
80 cnv
->fromUnicodeStatus
=0;
84 if(target
>= pArgs
->targetLimit
) {
85 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
89 targetCapacity
=(uint32_t)(pArgs
->targetLimit
-target
);
90 offsets
=pArgs
->offsets
;
93 /* c!=0 indicates in several places outside the main loops that a surrogate was found */
95 if((c
=(UChar
)cnv
->fromUChar32
)!=0 && U16_IS_TRAIL(trail
=*source
) && targetCapacity
>=4) {
96 /* the last buffer ended with a lead surrogate, output the surrogate pair */
99 target
[0]=(uint8_t)(c
>>8);
100 target
[1]=(uint8_t)c
;
101 target
[2]=(uint8_t)(trail
>>8);
102 target
[3]=(uint8_t)trail
;
112 cnv
->fromUChar32
=c
=0;
116 /* copy an even number of bytes for complete UChars */
117 uint32_t count
=2*length
;
118 if(count
>targetCapacity
) {
119 count
=targetCapacity
&~1;
122 targetCapacity
-=count
;
129 if(U16_IS_SINGLE(c
)) {
130 target
[0]=(uint8_t)(c
>>8);
131 target
[1]=(uint8_t)c
;
133 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 && U16_IS_TRAIL(trail
=*source
)) {
136 target
[0]=(uint8_t)(c
>>8);
137 target
[1]=(uint8_t)c
;
138 target
[2]=(uint8_t)(trail
>>8);
139 target
[3]=(uint8_t)trail
;
149 if(U16_IS_SINGLE(c
)) {
150 target
[0]=(uint8_t)(c
>>8);
151 target
[1]=(uint8_t)c
;
153 *offsets
++=sourceIndex
;
154 *offsets
++=sourceIndex
++;
155 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 && U16_IS_TRAIL(trail
=*source
)) {
158 target
[0]=(uint8_t)(c
>>8);
159 target
[1]=(uint8_t)c
;
160 target
[2]=(uint8_t)(trail
>>8);
161 target
[3]=(uint8_t)trail
;
163 *offsets
++=sourceIndex
;
164 *offsets
++=sourceIndex
;
165 *offsets
++=sourceIndex
;
166 *offsets
++=sourceIndex
;
176 /* done with the loop for complete UChars */
177 if(length
>0 && targetCapacity
>0) {
179 * there is more input and some target capacity -
180 * it must be targetCapacity==1 because otherwise
181 * the above would have copied more;
182 * prepare for overflow output
184 if(U16_IS_SINGLE(c
=*source
++)) {
185 overflow
[0]=(char)(c
>>8);
187 length
=2; /* 2 bytes to output */
189 /* } else { keep c for surrogate handling, length will be set there */
196 /* keep c for surrogate handling, length will be set there */
197 targetCapacity
+=2*count
;
200 length
=0; /* from here on, length counts the bytes in overflow[] */
205 * c is a surrogate, and
206 * - source or target too short
207 * - or the surrogate is unmatched
210 if(U16_IS_SURROGATE_LEAD(c
)) {
211 if(source
<pArgs
->sourceLimit
) {
212 if(U16_IS_TRAIL(trail
=*source
)) {
213 /* output the surrogate pair, will overflow (see conditions comment above) */
215 overflow
[0]=(char)(c
>>8);
217 overflow
[2]=(char)(trail
>>8);
218 overflow
[3]=(char)trail
;
219 length
=4; /* 4 bytes to output */
222 /* unmatched lead surrogate */
223 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
226 /* see if the trail surrogate is in the next buffer */
229 /* unmatched trail surrogate */
230 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
236 /* output length bytes with overflow (length>targetCapacity>0) */
237 ucnv_fromUWriteBytes(cnv
,
239 (char **)&target
, pArgs
->targetLimit
,
240 &offsets
, sourceIndex
,
242 targetCapacity
=(uint32_t)(pArgs
->targetLimit
-(char *)target
);
245 if(U_SUCCESS(*pErrorCode
) && source
<pArgs
->sourceLimit
&& targetCapacity
==0) {
246 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
249 /* write back the updated pointers */
250 pArgs
->source
=source
;
251 pArgs
->target
=(char *)target
;
252 pArgs
->offsets
=offsets
;
255 static void U_CALLCONV
256 _UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
257 UErrorCode
*pErrorCode
) {
259 const uint8_t *source
;
263 uint32_t targetCapacity
, length
, count
, sourceIndex
;
266 if(pArgs
->converter
->mode
<8) {
267 _UTF16ToUnicodeWithOffsets(pArgs
, pErrorCode
);
271 cnv
=pArgs
->converter
;
272 source
=(const uint8_t *)pArgs
->source
;
273 length
=(int32_t)((const uint8_t *)pArgs
->sourceLimit
-source
);
274 if(length
<=0 && cnv
->toUnicodeStatus
==0) {
275 /* no input, nothing to do */
279 target
=pArgs
->target
;
280 if(target
>= pArgs
->targetLimit
) {
281 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
285 targetCapacity
=(uint32_t)(pArgs
->targetLimit
-target
);
286 offsets
=pArgs
->offsets
;
290 /* complete a partial UChar or pair from the last call */
291 if(cnv
->toUnicodeStatus
!=0) {
293 * special case: single byte from a previous buffer,
294 * where the byte turned out not to belong to a trail surrogate
295 * and the preceding, unmatched lead surrogate was put into toUBytes[]
298 cnv
->toUBytes
[0]=(uint8_t)cnv
->toUnicodeStatus
;
300 cnv
->toUnicodeStatus
=0;
302 if((count
=cnv
->toULength
)!=0) {
303 uint8_t *p
=cnv
->toUBytes
;
305 p
[count
++]=*source
++;
309 c
=((UChar
)p
[0]<<8)|p
[1];
310 if(U16_IS_SINGLE(c
)) {
311 /* output the BMP code point */
320 } else if(U16_IS_SURROGATE_LEAD(c
)) {
321 /* continue collecting bytes for the trail surrogate */
322 c
=0; /* avoid unnecessary surrogate handling below */
324 /* fall through to error handling for an unmatched trail surrogate */
327 } else if(count
==4) {
328 c
=((UChar
)p
[0]<<8)|p
[1];
329 trail
=((UChar
)p
[2]<<8)|p
[3];
330 if(U16_IS_TRAIL(trail
)) {
331 /* output the surrogate pair */
333 if(targetCapacity
>=2) {
340 } else /* targetCapacity==1 */ {
342 cnv
->UCharErrorBuffer
[0]=trail
;
343 cnv
->UCharErrorBufferLength
=1;
344 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
350 /* unmatched lead surrogate, handle here for consistent toUBytes[] */
351 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
353 /* back out reading the code unit after it */
354 if(((const uint8_t *)pArgs
->source
-source
)>=2) {
358 * if the trail unit's first byte was in a previous buffer, then
359 * we need to put it into a special place because toUBytes[] will be
360 * used for the lead unit's bytes
362 cnv
->toUnicodeStatus
=0x100|p
[2];
367 /* write back the updated pointers */
368 pArgs
->source
=(const char *)source
;
369 pArgs
->target
=target
;
370 pArgs
->offsets
=offsets
;
375 cnv
->toULength
=(int8_t)count
;
378 /* copy an even number of bytes for complete UChars */
379 count
=2*targetCapacity
;
383 if(c
==0 && count
>0) {
386 targetCapacity
-=count
;
389 c
=((UChar
)source
[0]<<8)|source
[1];
391 if(U16_IS_SINGLE(c
)) {
393 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 &&
394 U16_IS_TRAIL(trail
=((UChar
)source
[0]<<8)|source
[1])
406 c
=((UChar
)source
[0]<<8)|source
[1];
408 if(U16_IS_SINGLE(c
)) {
410 *offsets
++=sourceIndex
;
412 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 &&
413 U16_IS_TRAIL(trail
=((UChar
)source
[0]<<8)|source
[1])
419 *offsets
++=sourceIndex
;
420 *offsets
++=sourceIndex
;
429 /* done with the loop for complete UChars */
432 /* keep c for surrogate handling, trail will be set there */
433 length
+=2*(count
-1); /* one more byte pair was consumed than count decremented */
434 targetCapacity
+=count
;
440 * c is a surrogate, and
441 * - source or target too short
442 * - or the surrogate is unmatched
444 cnv
->toUBytes
[0]=(uint8_t)(c
>>8);
445 cnv
->toUBytes
[1]=(uint8_t)c
;
448 if(U16_IS_SURROGATE_LEAD(c
)) {
450 if(U16_IS_TRAIL(trail
=((UChar
)source
[0]<<8)|source
[1])) {
451 /* output the surrogate pair, will overflow (see conditions comment above) */
456 *offsets
++=sourceIndex
;
458 cnv
->UCharErrorBuffer
[0]=trail
;
459 cnv
->UCharErrorBufferLength
=1;
461 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
463 /* unmatched lead surrogate */
464 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
467 /* see if the trail surrogate is in the next buffer */
470 /* unmatched trail surrogate */
471 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
475 if(U_SUCCESS(*pErrorCode
)) {
476 /* check for a remaining source byte */
478 if(targetCapacity
==0) {
479 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
481 /* it must be length==1 because otherwise the above would have copied more */
482 cnv
->toUBytes
[cnv
->toULength
++]=*source
++;
487 /* write back the updated pointers */
488 pArgs
->source
=(const char *)source
;
489 pArgs
->target
=target
;
490 pArgs
->offsets
=offsets
;
493 static UChar32 U_CALLCONV
494 _UTF16BEGetNextUChar(UConverterToUnicodeArgs
*pArgs
, UErrorCode
*err
) {
495 const uint8_t *s
, *sourceLimit
;
498 if(pArgs
->converter
->mode
<8) {
499 return UCNV_GET_NEXT_UCHAR_USE_TO_U
;
502 s
=(const uint8_t *)pArgs
->source
;
503 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
507 *err
=U_INDEX_OUTOFBOUNDS_ERROR
;
511 if(s
+2>sourceLimit
) {
512 /* only one byte: truncated UChar */
513 pArgs
->converter
->toUBytes
[0]=*s
++;
514 pArgs
->converter
->toULength
=1;
515 pArgs
->source
=(const char *)s
;
516 *err
= U_TRUNCATED_CHAR_FOUND
;
521 c
=((UChar32
)*s
<<8)|s
[1];
524 /* check for a surrogate pair */
525 if(U_IS_SURROGATE(c
)) {
526 if(U16_IS_SURROGATE_LEAD(c
)) {
527 if(s
+2<=sourceLimit
) {
530 /* get a second UChar and see if it is a trail surrogate */
531 trail
=((UChar
)*s
<<8)|s
[1];
532 if(U16_IS_TRAIL(trail
)) {
533 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
536 /* unmatched lead surrogate */
540 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
541 uint8_t *bytes
=pArgs
->converter
->toUBytes
;
543 pArgs
->converter
->toULength
=(int8_t)(sourceLimit
-s
);
546 } while(s
<sourceLimit
);
549 *err
=U_TRUNCATED_CHAR_FOUND
;
552 /* unmatched trail surrogate */
557 /* write the unmatched surrogate */
558 uint8_t *bytes
=pArgs
->converter
->toUBytes
;
559 pArgs
->converter
->toULength
=2;
564 *err
=U_ILLEGAL_CHAR_FOUND
;
568 pArgs
->source
=(const char *)s
;
572 static void U_CALLCONV
573 _UTF16BEReset(UConverter
*cnv
, UConverterResetChoice choice
) {
574 if(choice
<=UCNV_RESET_TO_UNICODE
) {
575 /* reset toUnicode state */
576 if(UCNV_GET_VERSION(cnv
)==0) {
577 cnv
->mode
=8; /* no BOM handling */
579 cnv
->mode
=0; /* Java-specific "UnicodeBig" requires BE BOM or no BOM */
582 if(choice
!=UCNV_RESET_TO_UNICODE
&& UCNV_GET_VERSION(cnv
)==1) {
583 /* reset fromUnicode for "UnicodeBig": prepare to output the UTF-16BE BOM */
584 cnv
->fromUnicodeStatus
=UCNV_NEED_TO_WRITE_BOM
;
588 static void U_CALLCONV
589 _UTF16BEOpen(UConverter
*cnv
,
590 UConverterLoadArgs
*pArgs
,
591 UErrorCode
*pErrorCode
) {
593 if(UCNV_GET_VERSION(cnv
)<=1) {
594 _UTF16BEReset(cnv
, UCNV_RESET_BOTH
);
596 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
600 static const char * U_CALLCONV
601 _UTF16BEGetName(const UConverter
*cnv
) {
602 if(UCNV_GET_VERSION(cnv
)==0) {
605 return "UTF-16BE,version=1";
610 static const UConverterImpl _UTF16BEImpl
={
611 UCNV_UTF16_BigEndian
,
620 _UTF16BEToUnicodeWithOffsets
,
621 _UTF16BEToUnicodeWithOffsets
,
622 _UTF16BEFromUnicodeWithOffsets
,
623 _UTF16BEFromUnicodeWithOffsets
,
624 _UTF16BEGetNextUChar
,
630 ucnv_getNonSurrogateUnicodeSet
,
636 static const UConverterStaticData _UTF16BEStaticData
={
637 sizeof(UConverterStaticData
),
639 1200, UCNV_IBM
, UCNV_UTF16_BigEndian
, 2, 2,
640 { 0xff, 0xfd, 0, 0 },2,FALSE
,FALSE
,
643 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
647 const UConverterSharedData _UTF16BEData
=
648 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16BEStaticData
, &_UTF16BEImpl
);
650 /* UTF-16LE ----------------------------------------------------------------- */
652 static void U_CALLCONV
653 _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
654 UErrorCode
*pErrorCode
) {
660 uint32_t targetCapacity
, length
, sourceIndex
;
664 source
=pArgs
->source
;
665 length
=(int32_t)(pArgs
->sourceLimit
-source
);
667 /* no input, nothing to do */
671 cnv
=pArgs
->converter
;
673 /* write the BOM if necessary */
674 if(cnv
->fromUnicodeStatus
==UCNV_NEED_TO_WRITE_BOM
) {
675 static const char bom
[]={ (char)0xff, (char)0xfe };
676 ucnv_fromUWriteBytes(cnv
,
678 &pArgs
->target
, pArgs
->targetLimit
,
681 cnv
->fromUnicodeStatus
=0;
684 target
=pArgs
->target
;
685 if(target
>= pArgs
->targetLimit
) {
686 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
690 targetCapacity
=(uint32_t)(pArgs
->targetLimit
-pArgs
->target
);
691 offsets
=pArgs
->offsets
;
694 /* c!=0 indicates in several places outside the main loops that a surrogate was found */
696 if((c
=(UChar
)cnv
->fromUChar32
)!=0 && U16_IS_TRAIL(trail
=*source
) && targetCapacity
>=4) {
697 /* the last buffer ended with a lead surrogate, output the surrogate pair */
700 target
[0]=(uint8_t)c
;
701 target
[1]=(uint8_t)(c
>>8);
702 target
[2]=(uint8_t)trail
;
703 target
[3]=(uint8_t)(trail
>>8);
713 cnv
->fromUChar32
=c
=0;
717 /* copy an even number of bytes for complete UChars */
718 uint32_t count
=2*length
;
719 if(count
>targetCapacity
) {
720 count
=targetCapacity
&~1;
723 targetCapacity
-=count
;
730 if(U16_IS_SINGLE(c
)) {
731 target
[0]=(uint8_t)c
;
732 target
[1]=(uint8_t)(c
>>8);
734 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 && U16_IS_TRAIL(trail
=*source
)) {
737 target
[0]=(uint8_t)c
;
738 target
[1]=(uint8_t)(c
>>8);
739 target
[2]=(uint8_t)trail
;
740 target
[3]=(uint8_t)(trail
>>8);
750 if(U16_IS_SINGLE(c
)) {
751 target
[0]=(uint8_t)c
;
752 target
[1]=(uint8_t)(c
>>8);
754 *offsets
++=sourceIndex
;
755 *offsets
++=sourceIndex
++;
756 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 && U16_IS_TRAIL(trail
=*source
)) {
759 target
[0]=(uint8_t)c
;
760 target
[1]=(uint8_t)(c
>>8);
761 target
[2]=(uint8_t)trail
;
762 target
[3]=(uint8_t)(trail
>>8);
764 *offsets
++=sourceIndex
;
765 *offsets
++=sourceIndex
;
766 *offsets
++=sourceIndex
;
767 *offsets
++=sourceIndex
;
777 /* done with the loop for complete UChars */
778 if(length
>0 && targetCapacity
>0) {
780 * there is more input and some target capacity -
781 * it must be targetCapacity==1 because otherwise
782 * the above would have copied more;
783 * prepare for overflow output
785 if(U16_IS_SINGLE(c
=*source
++)) {
787 overflow
[1]=(char)(c
>>8);
788 length
=2; /* 2 bytes to output */
790 /* } else { keep c for surrogate handling, length will be set there */
797 /* keep c for surrogate handling, length will be set there */
798 targetCapacity
+=2*count
;
801 length
=0; /* from here on, length counts the bytes in overflow[] */
806 * c is a surrogate, and
807 * - source or target too short
808 * - or the surrogate is unmatched
811 if(U16_IS_SURROGATE_LEAD(c
)) {
812 if(source
<pArgs
->sourceLimit
) {
813 if(U16_IS_TRAIL(trail
=*source
)) {
814 /* output the surrogate pair, will overflow (see conditions comment above) */
817 overflow
[1]=(char)(c
>>8);
818 overflow
[2]=(char)trail
;
819 overflow
[3]=(char)(trail
>>8);
820 length
=4; /* 4 bytes to output */
823 /* unmatched lead surrogate */
824 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
827 /* see if the trail surrogate is in the next buffer */
830 /* unmatched trail surrogate */
831 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
837 /* output length bytes with overflow (length>targetCapacity>0) */
838 ucnv_fromUWriteBytes(cnv
,
840 &target
, pArgs
->targetLimit
,
841 &offsets
, sourceIndex
,
843 targetCapacity
=(uint32_t)(pArgs
->targetLimit
-(char *)target
);
846 if(U_SUCCESS(*pErrorCode
) && source
<pArgs
->sourceLimit
&& targetCapacity
==0) {
847 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
850 /* write back the updated pointers */
851 pArgs
->source
=source
;
852 pArgs
->target
=target
;
853 pArgs
->offsets
=offsets
;
856 static void U_CALLCONV
857 _UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
858 UErrorCode
*pErrorCode
) {
860 const uint8_t *source
;
864 uint32_t targetCapacity
, length
, count
, sourceIndex
;
867 if(pArgs
->converter
->mode
<8) {
868 _UTF16ToUnicodeWithOffsets(pArgs
, pErrorCode
);
872 cnv
=pArgs
->converter
;
873 source
=(const uint8_t *)pArgs
->source
;
874 length
=(int32_t)((const uint8_t *)pArgs
->sourceLimit
-source
);
875 if(length
<=0 && cnv
->toUnicodeStatus
==0) {
876 /* no input, nothing to do */
880 target
=pArgs
->target
;
881 if(target
>= pArgs
->targetLimit
) {
882 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
886 targetCapacity
=(uint32_t)(pArgs
->targetLimit
-pArgs
->target
);
887 offsets
=pArgs
->offsets
;
891 /* complete a partial UChar or pair from the last call */
892 if(cnv
->toUnicodeStatus
!=0) {
894 * special case: single byte from a previous buffer,
895 * where the byte turned out not to belong to a trail surrogate
896 * and the preceding, unmatched lead surrogate was put into toUBytes[]
899 cnv
->toUBytes
[0]=(uint8_t)cnv
->toUnicodeStatus
;
901 cnv
->toUnicodeStatus
=0;
903 if((count
=cnv
->toULength
)!=0) {
904 uint8_t *p
=cnv
->toUBytes
;
906 p
[count
++]=*source
++;
910 c
=((UChar
)p
[1]<<8)|p
[0];
911 if(U16_IS_SINGLE(c
)) {
912 /* output the BMP code point */
921 } else if(U16_IS_SURROGATE_LEAD(c
)) {
922 /* continue collecting bytes for the trail surrogate */
923 c
=0; /* avoid unnecessary surrogate handling below */
925 /* fall through to error handling for an unmatched trail surrogate */
928 } else if(count
==4) {
929 c
=((UChar
)p
[1]<<8)|p
[0];
930 trail
=((UChar
)p
[3]<<8)|p
[2];
931 if(U16_IS_TRAIL(trail
)) {
932 /* output the surrogate pair */
934 if(targetCapacity
>=2) {
941 } else /* targetCapacity==1 */ {
943 cnv
->UCharErrorBuffer
[0]=trail
;
944 cnv
->UCharErrorBufferLength
=1;
945 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
951 /* unmatched lead surrogate, handle here for consistent toUBytes[] */
952 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
954 /* back out reading the code unit after it */
955 if(((const uint8_t *)pArgs
->source
-source
)>=2) {
959 * if the trail unit's first byte was in a previous buffer, then
960 * we need to put it into a special place because toUBytes[] will be
961 * used for the lead unit's bytes
963 cnv
->toUnicodeStatus
=0x100|p
[2];
968 /* write back the updated pointers */
969 pArgs
->source
=(const char *)source
;
970 pArgs
->target
=target
;
971 pArgs
->offsets
=offsets
;
976 cnv
->toULength
=(int8_t)count
;
979 /* copy an even number of bytes for complete UChars */
980 count
=2*targetCapacity
;
984 if(c
==0 && count
>0) {
987 targetCapacity
-=count
;
990 c
=((UChar
)source
[1]<<8)|source
[0];
992 if(U16_IS_SINGLE(c
)) {
994 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 &&
995 U16_IS_TRAIL(trail
=((UChar
)source
[1]<<8)|source
[0])
1007 c
=((UChar
)source
[1]<<8)|source
[0];
1009 if(U16_IS_SINGLE(c
)) {
1011 *offsets
++=sourceIndex
;
1013 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 &&
1014 U16_IS_TRAIL(trail
=((UChar
)source
[1]<<8)|source
[0])
1020 *offsets
++=sourceIndex
;
1021 *offsets
++=sourceIndex
;
1030 /* done with the loop for complete UChars */
1033 /* keep c for surrogate handling, trail will be set there */
1034 length
+=2*(count
-1); /* one more byte pair was consumed than count decremented */
1035 targetCapacity
+=count
;
1041 * c is a surrogate, and
1042 * - source or target too short
1043 * - or the surrogate is unmatched
1045 cnv
->toUBytes
[0]=(uint8_t)c
;
1046 cnv
->toUBytes
[1]=(uint8_t)(c
>>8);
1049 if(U16_IS_SURROGATE_LEAD(c
)) {
1051 if(U16_IS_TRAIL(trail
=((UChar
)source
[1]<<8)|source
[0])) {
1052 /* output the surrogate pair, will overflow (see conditions comment above) */
1057 *offsets
++=sourceIndex
;
1059 cnv
->UCharErrorBuffer
[0]=trail
;
1060 cnv
->UCharErrorBufferLength
=1;
1062 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1064 /* unmatched lead surrogate */
1065 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1068 /* see if the trail surrogate is in the next buffer */
1071 /* unmatched trail surrogate */
1072 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1076 if(U_SUCCESS(*pErrorCode
)) {
1077 /* check for a remaining source byte */
1079 if(targetCapacity
==0) {
1080 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1082 /* it must be length==1 because otherwise the above would have copied more */
1083 cnv
->toUBytes
[cnv
->toULength
++]=*source
++;
1088 /* write back the updated pointers */
1089 pArgs
->source
=(const char *)source
;
1090 pArgs
->target
=target
;
1091 pArgs
->offsets
=offsets
;
1094 static UChar32 U_CALLCONV
1095 _UTF16LEGetNextUChar(UConverterToUnicodeArgs
*pArgs
, UErrorCode
*err
) {
1096 const uint8_t *s
, *sourceLimit
;
1099 if(pArgs
->converter
->mode
<8) {
1100 return UCNV_GET_NEXT_UCHAR_USE_TO_U
;
1103 s
=(const uint8_t *)pArgs
->source
;
1104 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
1106 if(s
>=sourceLimit
) {
1108 *err
=U_INDEX_OUTOFBOUNDS_ERROR
;
1112 if(s
+2>sourceLimit
) {
1113 /* only one byte: truncated UChar */
1114 pArgs
->converter
->toUBytes
[0]=*s
++;
1115 pArgs
->converter
->toULength
=1;
1116 pArgs
->source
=(const char *)s
;
1117 *err
= U_TRUNCATED_CHAR_FOUND
;
1122 c
=((UChar32
)s
[1]<<8)|*s
;
1125 /* check for a surrogate pair */
1126 if(U_IS_SURROGATE(c
)) {
1127 if(U16_IS_SURROGATE_LEAD(c
)) {
1128 if(s
+2<=sourceLimit
) {
1131 /* get a second UChar and see if it is a trail surrogate */
1132 trail
=((UChar
)s
[1]<<8)|*s
;
1133 if(U16_IS_TRAIL(trail
)) {
1134 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
1137 /* unmatched lead surrogate */
1141 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
1142 uint8_t *bytes
=pArgs
->converter
->toUBytes
;
1144 pArgs
->converter
->toULength
=(int8_t)(sourceLimit
-s
);
1147 } while(s
<sourceLimit
);
1150 *err
=U_TRUNCATED_CHAR_FOUND
;
1153 /* unmatched trail surrogate */
1158 /* write the unmatched surrogate */
1159 uint8_t *bytes
=pArgs
->converter
->toUBytes
;
1160 pArgs
->converter
->toULength
=2;
1165 *err
=U_ILLEGAL_CHAR_FOUND
;
1169 pArgs
->source
=(const char *)s
;
1173 static void U_CALLCONV
1174 _UTF16LEReset(UConverter
*cnv
, UConverterResetChoice choice
) {
1175 if(choice
<=UCNV_RESET_TO_UNICODE
) {
1176 /* reset toUnicode state */
1177 if(UCNV_GET_VERSION(cnv
)==0) {
1178 cnv
->mode
=8; /* no BOM handling */
1180 cnv
->mode
=0; /* Java-specific "UnicodeLittle" requires LE BOM or no BOM */
1183 if(choice
!=UCNV_RESET_TO_UNICODE
&& UCNV_GET_VERSION(cnv
)==1) {
1184 /* reset fromUnicode for "UnicodeLittle": prepare to output the UTF-16LE BOM */
1185 cnv
->fromUnicodeStatus
=UCNV_NEED_TO_WRITE_BOM
;
1189 static void U_CALLCONV
1190 _UTF16LEOpen(UConverter
*cnv
,
1191 UConverterLoadArgs
*pArgs
,
1192 UErrorCode
*pErrorCode
) {
1194 if(UCNV_GET_VERSION(cnv
)<=1) {
1195 _UTF16LEReset(cnv
, UCNV_RESET_BOTH
);
1197 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1201 static const char * U_CALLCONV
1202 _UTF16LEGetName(const UConverter
*cnv
) {
1203 if(UCNV_GET_VERSION(cnv
)==0) {
1206 return "UTF-16LE,version=1";
1211 static const UConverterImpl _UTF16LEImpl
={
1212 UCNV_UTF16_LittleEndian
,
1221 _UTF16LEToUnicodeWithOffsets
,
1222 _UTF16LEToUnicodeWithOffsets
,
1223 _UTF16LEFromUnicodeWithOffsets
,
1224 _UTF16LEFromUnicodeWithOffsets
,
1225 _UTF16LEGetNextUChar
,
1231 ucnv_getNonSurrogateUnicodeSet
,
1238 static const UConverterStaticData _UTF16LEStaticData
={
1239 sizeof(UConverterStaticData
),
1241 1202, UCNV_IBM
, UCNV_UTF16_LittleEndian
, 2, 2,
1242 { 0xfd, 0xff, 0, 0 },2,FALSE
,FALSE
,
1245 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1249 const UConverterSharedData _UTF16LEData
=
1250 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16LEStaticData
, &_UTF16LEImpl
);
1252 /* UTF-16 (Detect BOM) ------------------------------------------------------ */
1255 * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE
1257 * This is a simpler version of the UTF-32 converter, with
1258 * fewer states for shorter BOMs.
1264 * 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1
1268 * During detection: state==number of initial bytes seen so far.
1270 * On output, emit U+FEFF as the first code point.
1273 * - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error.
1274 * - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and
1275 * UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error.
1278 static void U_CALLCONV
1279 _UTF16Reset(UConverter
*cnv
, UConverterResetChoice choice
) {
1280 if(choice
<=UCNV_RESET_TO_UNICODE
) {
1281 /* reset toUnicode: state=0 */
1284 if(choice
!=UCNV_RESET_TO_UNICODE
) {
1285 /* reset fromUnicode: prepare to output the UTF-16PE BOM */
1286 cnv
->fromUnicodeStatus
=UCNV_NEED_TO_WRITE_BOM
;
1290 extern const UConverterSharedData _UTF16v2Data
;
1292 static void U_CALLCONV
1293 _UTF16Open(UConverter
*cnv
,
1294 UConverterLoadArgs
*pArgs
,
1295 UErrorCode
*pErrorCode
) {
1296 if(UCNV_GET_VERSION(cnv
)<=2) {
1297 if(UCNV_GET_VERSION(cnv
)==2 && !pArgs
->onlyTestIsLoadable
) {
1299 * Switch implementation, and switch the staticData that's different
1300 * and was copied into the UConverter.
1301 * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.)
1302 * UTF-16,version=2 fromUnicode() always writes a big-endian byte stream.
1304 cnv
->sharedData
=(UConverterSharedData
*)&_UTF16v2Data
;
1305 uprv_memcpy(cnv
->subChars
, _UTF16v2Data
.staticData
->subChar
, UCNV_MAX_SUBCHAR_LEN
);
1307 _UTF16Reset(cnv
, UCNV_RESET_BOTH
);
1309 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1313 static const char * U_CALLCONV
1314 _UTF16GetName(const UConverter
*cnv
) {
1315 if(UCNV_GET_VERSION(cnv
)==0) {
1317 } else if(UCNV_GET_VERSION(cnv
)==1) {
1318 return "UTF-16,version=1";
1320 return "UTF-16,version=2";
1324 extern const UConverterSharedData _UTF16Data
;
1326 static inline bool IS_UTF16BE(const UConverter
*cnv
) {
1327 return ((cnv
)->sharedData
== &_UTF16BEData
);
1330 static inline bool IS_UTF16LE(const UConverter
*cnv
) {
1331 return ((cnv
)->sharedData
== &_UTF16LEData
);
1334 static inline bool IS_UTF16(const UConverter
*cnv
) {
1335 return ((cnv
)->sharedData
==&_UTF16Data
) || ((cnv
)->sharedData
== &_UTF16v2Data
);
1339 static void U_CALLCONV
1340 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
1341 UErrorCode
*pErrorCode
) {
1342 UConverter
*cnv
=pArgs
->converter
;
1343 const char *source
=pArgs
->source
;
1344 const char *sourceLimit
=pArgs
->sourceLimit
;
1345 int32_t *offsets
=pArgs
->offsets
;
1347 int32_t state
, offsetDelta
;
1353 * If we detect a BOM in this buffer, then we must add the BOM size to the
1354 * offsets because the actual converter function will not see and count the BOM.
1355 * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1359 while(source
<sourceLimit
&& U_SUCCESS(*pErrorCode
)) {
1362 cnv
->toUBytes
[0]=(uint8_t)*source
++;
1368 * Only inside this switch case can the state variable
1369 * temporarily take two additional values:
1370 * 6: BOM error, continue with BE
1371 * 7: BOM error, continue with LE
1374 if(cnv
->toUBytes
[0]==0xfe && b
==0xff) {
1375 if(IS_UTF16LE(cnv
)) {
1376 state
=7; /* illegal reverse BOM for Java "UnicodeLittle" */
1378 state
=8; /* detect UTF-16BE */
1380 } else if(cnv
->toUBytes
[0]==0xff && b
==0xfe) {
1381 if(IS_UTF16BE(cnv
)) {
1382 state
=6; /* illegal reverse BOM for Java "UnicodeBig" */
1384 state
=9; /* detect UTF-16LE */
1386 } else if((IS_UTF16(cnv
) && UCNV_GET_VERSION(cnv
)==1)) {
1387 state
=6; /* illegal missing BOM for Java "Unicode" */
1390 /* BOM detected, consume it */
1393 offsetDelta
=(int32_t)(source
-pArgs
->source
);
1394 } else if(state
<6) {
1395 /* ok: no BOM, and not a reverse BOM */
1396 if(source
!=pArgs
->source
) {
1397 /* reset the source for a correct first offset */
1398 source
=pArgs
->source
;
1401 if(IS_UTF16LE(cnv
)) {
1402 /* Make Java "UnicodeLittle" default to LE. */
1405 /* Make standard UTF-16 and Java "UnicodeBig" default to BE. */
1410 * error: missing BOM, or reverse BOM
1411 * UTF-16,version=1: Java-specific "Unicode" requires a BOM.
1412 * UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM.
1413 * UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM.
1415 /* report the non-BOM or reverse BOM as an illegal sequence */
1418 pArgs
->source
=source
+1;
1419 /* continue with conversion if the callback resets the error */
1421 * Make Java "Unicode" default to BE like standard UTF-16.
1422 * Make Java "UnicodeBig" and "UnicodeLittle" default
1423 * to their normal endiannesses.
1426 *pErrorCode
=U_ILLEGAL_ESCAPE_SEQUENCE
;
1429 /* convert the rest of the stream */
1434 pArgs
->source
=source
;
1435 _UTF16BEToUnicodeWithOffsets(pArgs
, pErrorCode
);
1436 source
=pArgs
->source
;
1440 pArgs
->source
=source
;
1441 _UTF16LEToUnicodeWithOffsets(pArgs
, pErrorCode
);
1442 source
=pArgs
->source
;
1445 break; /* does not occur */
1449 /* add BOM size to offsets - see comment at offsetDelta declaration */
1450 if(offsets
!=NULL
&& offsetDelta
!=0) {
1451 int32_t *offsetsLimit
=pArgs
->offsets
;
1452 while(offsets
<offsetsLimit
) {
1453 *offsets
++ += offsetDelta
;
1457 pArgs
->source
=source
;
1459 if(source
==sourceLimit
&& pArgs
->flush
) {
1460 /* handle truncated input */
1463 break; /* no input at all, nothing to do */
1465 _UTF16BEToUnicodeWithOffsets(pArgs
, pErrorCode
);
1468 _UTF16LEToUnicodeWithOffsets(pArgs
, pErrorCode
);
1471 /* 0<state<8: framework will report truncation, nothing to do here */
1479 static UChar32 U_CALLCONV
1480 _UTF16GetNextUChar(UConverterToUnicodeArgs
*pArgs
,
1481 UErrorCode
*pErrorCode
) {
1482 switch(pArgs
->converter
->mode
) {
1484 return _UTF16BEGetNextUChar(pArgs
, pErrorCode
);
1486 return _UTF16LEGetNextUChar(pArgs
, pErrorCode
);
1488 return UCNV_GET_NEXT_UCHAR_USE_TO_U
;
1493 static const UConverterImpl _UTF16Impl
= {
1503 _UTF16ToUnicodeWithOffsets
,
1504 _UTF16ToUnicodeWithOffsets
,
1505 _UTF16PEFromUnicodeWithOffsets
,
1506 _UTF16PEFromUnicodeWithOffsets
,
1509 NULL
, /* ### TODO implement getStarters for all Unicode encodings?! */
1513 ucnv_getNonSurrogateUnicodeSet
,
1519 static const UConverterStaticData _UTF16StaticData
= {
1520 sizeof(UConverterStaticData
),
1522 1204, /* CCSID for BOM sensitive UTF-16 */
1523 UCNV_IBM
, UCNV_UTF16
, 2, 2,
1525 { 0xff, 0xfd, 0, 0 }, 2,
1527 { 0xfd, 0xff, 0, 0 }, 2,
1532 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1535 const UConverterSharedData _UTF16Data
=
1536 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16StaticData
, &_UTF16Impl
);
1538 static const UConverterImpl _UTF16v2Impl
= {
1548 _UTF16ToUnicodeWithOffsets
,
1549 _UTF16ToUnicodeWithOffsets
,
1550 _UTF16BEFromUnicodeWithOffsets
,
1551 _UTF16BEFromUnicodeWithOffsets
,
1554 NULL
, /* ### TODO implement getStarters for all Unicode encodings?! */
1558 ucnv_getNonSurrogateUnicodeSet
,
1564 static const UConverterStaticData _UTF16v2StaticData
= {
1565 sizeof(UConverterStaticData
),
1567 1204, /* CCSID for BOM sensitive UTF-16 */
1568 UCNV_IBM
, UCNV_UTF16
, 2, 2,
1569 { 0xff, 0xfd, 0, 0 }, 2,
1573 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1576 const UConverterSharedData _UTF16v2Data
=
1577 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16v2StaticData
, &_UTF16v2Impl
);