2 **********************************************************************
3 * Copyright (C) 2002-2015, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv_u16.c
8 * tab size: 8 (not used)
11 * created on: 2002jul01
12 * created by: Markus W. Scherer
14 * UTF-16 converter implementation. Used to be in ucnv_utf.c.
17 #include "unicode/utypes.h"
19 #if !UCONFIG_NO_CONVERSION
21 #include "unicode/ucnv.h"
27 UCNV_NEED_TO_WRITE_BOM
=1
31 * The UTF-16 toUnicode implementation is also used for the Java-specific
32 * "with BOM" variants of UTF-16BE and UTF-16LE.
35 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
36 UErrorCode
*pErrorCode
);
38 /* UTF-16BE ----------------------------------------------------------------- */
41 # define _UTF16PEFromUnicodeWithOffsets _UTF16BEFromUnicodeWithOffsets
43 # define _UTF16PEFromUnicodeWithOffsets _UTF16LEFromUnicodeWithOffsets
48 _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
49 UErrorCode
*pErrorCode
) {
55 uint32_t targetCapacity
, length
, sourceIndex
;
60 length
=(int32_t)(pArgs
->sourceLimit
-source
);
62 /* no input, nothing to do */
68 /* write the BOM if necessary */
69 if(cnv
->fromUnicodeStatus
==UCNV_NEED_TO_WRITE_BOM
) {
70 static const char bom
[]={ (char)0xfe, (char)0xff };
71 ucnv_fromUWriteBytes(cnv
,
73 &pArgs
->target
, pArgs
->targetLimit
,
76 cnv
->fromUnicodeStatus
=0;
80 if(target
>= pArgs
->targetLimit
) {
81 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
85 targetCapacity
=(uint32_t)(pArgs
->targetLimit
-target
);
86 offsets
=pArgs
->offsets
;
89 /* c!=0 indicates in several places outside the main loops that a surrogate was found */
91 if((c
=(UChar
)cnv
->fromUChar32
)!=0 && U16_IS_TRAIL(trail
=*source
) && targetCapacity
>=4) {
92 /* the last buffer ended with a lead surrogate, output the surrogate pair */
95 target
[0]=(uint8_t)(c
>>8);
97 target
[2]=(uint8_t)(trail
>>8);
98 target
[3]=(uint8_t)trail
;
108 cnv
->fromUChar32
=c
=0;
112 /* copy an even number of bytes for complete UChars */
113 uint32_t count
=2*length
;
114 if(count
>targetCapacity
) {
115 count
=targetCapacity
&~1;
118 targetCapacity
-=count
;
125 if(U16_IS_SINGLE(c
)) {
126 target
[0]=(uint8_t)(c
>>8);
127 target
[1]=(uint8_t)c
;
129 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 && U16_IS_TRAIL(trail
=*source
)) {
132 target
[0]=(uint8_t)(c
>>8);
133 target
[1]=(uint8_t)c
;
134 target
[2]=(uint8_t)(trail
>>8);
135 target
[3]=(uint8_t)trail
;
145 if(U16_IS_SINGLE(c
)) {
146 target
[0]=(uint8_t)(c
>>8);
147 target
[1]=(uint8_t)c
;
149 *offsets
++=sourceIndex
;
150 *offsets
++=sourceIndex
++;
151 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 && U16_IS_TRAIL(trail
=*source
)) {
154 target
[0]=(uint8_t)(c
>>8);
155 target
[1]=(uint8_t)c
;
156 target
[2]=(uint8_t)(trail
>>8);
157 target
[3]=(uint8_t)trail
;
159 *offsets
++=sourceIndex
;
160 *offsets
++=sourceIndex
;
161 *offsets
++=sourceIndex
;
162 *offsets
++=sourceIndex
;
172 /* done with the loop for complete UChars */
173 if(length
>0 && targetCapacity
>0) {
175 * there is more input and some target capacity -
176 * it must be targetCapacity==1 because otherwise
177 * the above would have copied more;
178 * prepare for overflow output
180 if(U16_IS_SINGLE(c
=*source
++)) {
181 overflow
[0]=(char)(c
>>8);
183 length
=2; /* 2 bytes to output */
185 /* } else { keep c for surrogate handling, length will be set there */
192 /* keep c for surrogate handling, length will be set there */
193 targetCapacity
+=2*count
;
196 length
=0; /* from here on, length counts the bytes in overflow[] */
201 * c is a surrogate, and
202 * - source or target too short
203 * - or the surrogate is unmatched
206 if(U16_IS_SURROGATE_LEAD(c
)) {
207 if(source
<pArgs
->sourceLimit
) {
208 if(U16_IS_TRAIL(trail
=*source
)) {
209 /* output the surrogate pair, will overflow (see conditions comment above) */
211 overflow
[0]=(char)(c
>>8);
213 overflow
[2]=(char)(trail
>>8);
214 overflow
[3]=(char)trail
;
215 length
=4; /* 4 bytes to output */
218 /* unmatched lead surrogate */
219 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
222 /* see if the trail surrogate is in the next buffer */
225 /* unmatched trail surrogate */
226 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
232 /* output length bytes with overflow (length>targetCapacity>0) */
233 ucnv_fromUWriteBytes(cnv
,
235 (char **)&target
, pArgs
->targetLimit
,
236 &offsets
, sourceIndex
,
238 targetCapacity
=(uint32_t)(pArgs
->targetLimit
-(char *)target
);
241 if(U_SUCCESS(*pErrorCode
) && source
<pArgs
->sourceLimit
&& targetCapacity
==0) {
242 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
245 /* write back the updated pointers */
246 pArgs
->source
=source
;
247 pArgs
->target
=(char *)target
;
248 pArgs
->offsets
=offsets
;
252 _UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
253 UErrorCode
*pErrorCode
) {
255 const uint8_t *source
;
259 uint32_t targetCapacity
, length
, count
, sourceIndex
;
262 if(pArgs
->converter
->mode
<8) {
263 _UTF16ToUnicodeWithOffsets(pArgs
, pErrorCode
);
267 cnv
=pArgs
->converter
;
268 source
=(const uint8_t *)pArgs
->source
;
269 length
=(int32_t)((const uint8_t *)pArgs
->sourceLimit
-source
);
270 if(length
<=0 && cnv
->toUnicodeStatus
==0) {
271 /* no input, nothing to do */
275 target
=pArgs
->target
;
276 if(target
>= pArgs
->targetLimit
) {
277 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
281 targetCapacity
=(uint32_t)(pArgs
->targetLimit
-target
);
282 offsets
=pArgs
->offsets
;
286 /* complete a partial UChar or pair from the last call */
287 if(cnv
->toUnicodeStatus
!=0) {
289 * special case: single byte from a previous buffer,
290 * where the byte turned out not to belong to a trail surrogate
291 * and the preceding, unmatched lead surrogate was put into toUBytes[]
294 cnv
->toUBytes
[0]=(uint8_t)cnv
->toUnicodeStatus
;
296 cnv
->toUnicodeStatus
=0;
298 if((count
=cnv
->toULength
)!=0) {
299 uint8_t *p
=cnv
->toUBytes
;
301 p
[count
++]=*source
++;
305 c
=((UChar
)p
[0]<<8)|p
[1];
306 if(U16_IS_SINGLE(c
)) {
307 /* output the BMP code point */
316 } else if(U16_IS_SURROGATE_LEAD(c
)) {
317 /* continue collecting bytes for the trail surrogate */
318 c
=0; /* avoid unnecessary surrogate handling below */
320 /* fall through to error handling for an unmatched trail surrogate */
323 } else if(count
==4) {
324 c
=((UChar
)p
[0]<<8)|p
[1];
325 trail
=((UChar
)p
[2]<<8)|p
[3];
326 if(U16_IS_TRAIL(trail
)) {
327 /* output the surrogate pair */
329 if(targetCapacity
>=2) {
336 } else /* targetCapacity==1 */ {
338 cnv
->UCharErrorBuffer
[0]=trail
;
339 cnv
->UCharErrorBufferLength
=1;
340 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
346 /* unmatched lead surrogate, handle here for consistent toUBytes[] */
347 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
349 /* back out reading the code unit after it */
350 if(((const uint8_t *)pArgs
->source
-source
)>=2) {
354 * if the trail unit's first byte was in a previous buffer, then
355 * we need to put it into a special place because toUBytes[] will be
356 * used for the lead unit's bytes
358 cnv
->toUnicodeStatus
=0x100|p
[2];
363 /* write back the updated pointers */
364 pArgs
->source
=(const char *)source
;
365 pArgs
->target
=target
;
366 pArgs
->offsets
=offsets
;
371 cnv
->toULength
=(int8_t)count
;
374 /* copy an even number of bytes for complete UChars */
375 count
=2*targetCapacity
;
379 if(c
==0 && count
>0) {
382 targetCapacity
-=count
;
385 c
=((UChar
)source
[0]<<8)|source
[1];
387 if(U16_IS_SINGLE(c
)) {
389 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 &&
390 U16_IS_TRAIL(trail
=((UChar
)source
[0]<<8)|source
[1])
402 c
=((UChar
)source
[0]<<8)|source
[1];
404 if(U16_IS_SINGLE(c
)) {
406 *offsets
++=sourceIndex
;
408 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 &&
409 U16_IS_TRAIL(trail
=((UChar
)source
[0]<<8)|source
[1])
415 *offsets
++=sourceIndex
;
416 *offsets
++=sourceIndex
;
425 /* done with the loop for complete UChars */
428 /* keep c for surrogate handling, trail will be set there */
429 length
+=2*(count
-1); /* one more byte pair was consumed than count decremented */
430 targetCapacity
+=count
;
436 * c is a surrogate, and
437 * - source or target too short
438 * - or the surrogate is unmatched
440 cnv
->toUBytes
[0]=(uint8_t)(c
>>8);
441 cnv
->toUBytes
[1]=(uint8_t)c
;
444 if(U16_IS_SURROGATE_LEAD(c
)) {
446 if(U16_IS_TRAIL(trail
=((UChar
)source
[0]<<8)|source
[1])) {
447 /* output the surrogate pair, will overflow (see conditions comment above) */
452 *offsets
++=sourceIndex
;
454 cnv
->UCharErrorBuffer
[0]=trail
;
455 cnv
->UCharErrorBufferLength
=1;
457 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
459 /* unmatched lead surrogate */
460 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
463 /* see if the trail surrogate is in the next buffer */
466 /* unmatched trail surrogate */
467 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
471 if(U_SUCCESS(*pErrorCode
)) {
472 /* check for a remaining source byte */
474 if(targetCapacity
==0) {
475 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
477 /* it must be length==1 because otherwise the above would have copied more */
478 cnv
->toUBytes
[cnv
->toULength
++]=*source
++;
483 /* write back the updated pointers */
484 pArgs
->source
=(const char *)source
;
485 pArgs
->target
=target
;
486 pArgs
->offsets
=offsets
;
490 _UTF16BEGetNextUChar(UConverterToUnicodeArgs
*pArgs
, UErrorCode
*err
) {
491 const uint8_t *s
, *sourceLimit
;
494 if(pArgs
->converter
->mode
<8) {
495 return UCNV_GET_NEXT_UCHAR_USE_TO_U
;
498 s
=(const uint8_t *)pArgs
->source
;
499 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
503 *err
=U_INDEX_OUTOFBOUNDS_ERROR
;
507 if(s
+2>sourceLimit
) {
508 /* only one byte: truncated UChar */
509 pArgs
->converter
->toUBytes
[0]=*s
++;
510 pArgs
->converter
->toULength
=1;
511 pArgs
->source
=(const char *)s
;
512 *err
= U_TRUNCATED_CHAR_FOUND
;
517 c
=((UChar32
)*s
<<8)|s
[1];
520 /* check for a surrogate pair */
521 if(U_IS_SURROGATE(c
)) {
522 if(U16_IS_SURROGATE_LEAD(c
)) {
523 if(s
+2<=sourceLimit
) {
526 /* get a second UChar and see if it is a trail surrogate */
527 trail
=((UChar
)*s
<<8)|s
[1];
528 if(U16_IS_TRAIL(trail
)) {
529 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
532 /* unmatched lead surrogate */
536 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
537 uint8_t *bytes
=pArgs
->converter
->toUBytes
;
539 pArgs
->converter
->toULength
=(int8_t)(sourceLimit
-s
);
542 } while(s
<sourceLimit
);
545 *err
=U_TRUNCATED_CHAR_FOUND
;
548 /* unmatched trail surrogate */
553 /* write the unmatched surrogate */
554 uint8_t *bytes
=pArgs
->converter
->toUBytes
;
555 pArgs
->converter
->toULength
=2;
560 *err
=U_ILLEGAL_CHAR_FOUND
;
564 pArgs
->source
=(const char *)s
;
569 _UTF16BEReset(UConverter
*cnv
, UConverterResetChoice choice
) {
570 if(choice
<=UCNV_RESET_TO_UNICODE
) {
571 /* reset toUnicode state */
572 if(UCNV_GET_VERSION(cnv
)==0) {
573 cnv
->mode
=8; /* no BOM handling */
575 cnv
->mode
=0; /* Java-specific "UnicodeBig" requires BE BOM or no BOM */
578 if(choice
!=UCNV_RESET_TO_UNICODE
&& UCNV_GET_VERSION(cnv
)==1) {
579 /* reset fromUnicode for "UnicodeBig": prepare to output the UTF-16BE BOM */
580 cnv
->fromUnicodeStatus
=UCNV_NEED_TO_WRITE_BOM
;
585 _UTF16BEOpen(UConverter
*cnv
,
586 UConverterLoadArgs
*pArgs
,
587 UErrorCode
*pErrorCode
) {
588 if(UCNV_GET_VERSION(cnv
)<=1) {
589 _UTF16BEReset(cnv
, UCNV_RESET_BOTH
);
591 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
596 _UTF16BEGetName(const UConverter
*cnv
) {
597 if(UCNV_GET_VERSION(cnv
)==0) {
600 return "UTF-16BE,version=1";
604 static const UConverterImpl _UTF16BEImpl
={
605 UCNV_UTF16_BigEndian
,
614 _UTF16BEToUnicodeWithOffsets
,
615 _UTF16BEToUnicodeWithOffsets
,
616 _UTF16BEFromUnicodeWithOffsets
,
617 _UTF16BEFromUnicodeWithOffsets
,
618 _UTF16BEGetNextUChar
,
624 ucnv_getNonSurrogateUnicodeSet
627 static const UConverterStaticData _UTF16BEStaticData
={
628 sizeof(UConverterStaticData
),
630 1200, UCNV_IBM
, UCNV_UTF16_BigEndian
, 2, 2,
631 { 0xff, 0xfd, 0, 0 },2,FALSE
,FALSE
,
634 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
638 const UConverterSharedData _UTF16BEData
=
639 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16BEStaticData
, &_UTF16BEImpl
);
641 /* UTF-16LE ----------------------------------------------------------------- */
644 _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
645 UErrorCode
*pErrorCode
) {
651 uint32_t targetCapacity
, length
, sourceIndex
;
655 source
=pArgs
->source
;
656 length
=(int32_t)(pArgs
->sourceLimit
-source
);
658 /* no input, nothing to do */
662 cnv
=pArgs
->converter
;
664 /* write the BOM if necessary */
665 if(cnv
->fromUnicodeStatus
==UCNV_NEED_TO_WRITE_BOM
) {
666 static const char bom
[]={ (char)0xff, (char)0xfe };
667 ucnv_fromUWriteBytes(cnv
,
669 &pArgs
->target
, pArgs
->targetLimit
,
672 cnv
->fromUnicodeStatus
=0;
675 target
=pArgs
->target
;
676 if(target
>= pArgs
->targetLimit
) {
677 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
681 targetCapacity
=(uint32_t)(pArgs
->targetLimit
-pArgs
->target
);
682 offsets
=pArgs
->offsets
;
685 /* c!=0 indicates in several places outside the main loops that a surrogate was found */
687 if((c
=(UChar
)cnv
->fromUChar32
)!=0 && U16_IS_TRAIL(trail
=*source
) && targetCapacity
>=4) {
688 /* the last buffer ended with a lead surrogate, output the surrogate pair */
691 target
[0]=(uint8_t)c
;
692 target
[1]=(uint8_t)(c
>>8);
693 target
[2]=(uint8_t)trail
;
694 target
[3]=(uint8_t)(trail
>>8);
704 cnv
->fromUChar32
=c
=0;
708 /* copy an even number of bytes for complete UChars */
709 uint32_t count
=2*length
;
710 if(count
>targetCapacity
) {
711 count
=targetCapacity
&~1;
714 targetCapacity
-=count
;
721 if(U16_IS_SINGLE(c
)) {
722 target
[0]=(uint8_t)c
;
723 target
[1]=(uint8_t)(c
>>8);
725 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 && U16_IS_TRAIL(trail
=*source
)) {
728 target
[0]=(uint8_t)c
;
729 target
[1]=(uint8_t)(c
>>8);
730 target
[2]=(uint8_t)trail
;
731 target
[3]=(uint8_t)(trail
>>8);
741 if(U16_IS_SINGLE(c
)) {
742 target
[0]=(uint8_t)c
;
743 target
[1]=(uint8_t)(c
>>8);
745 *offsets
++=sourceIndex
;
746 *offsets
++=sourceIndex
++;
747 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 && U16_IS_TRAIL(trail
=*source
)) {
750 target
[0]=(uint8_t)c
;
751 target
[1]=(uint8_t)(c
>>8);
752 target
[2]=(uint8_t)trail
;
753 target
[3]=(uint8_t)(trail
>>8);
755 *offsets
++=sourceIndex
;
756 *offsets
++=sourceIndex
;
757 *offsets
++=sourceIndex
;
758 *offsets
++=sourceIndex
;
768 /* done with the loop for complete UChars */
769 if(length
>0 && targetCapacity
>0) {
771 * there is more input and some target capacity -
772 * it must be targetCapacity==1 because otherwise
773 * the above would have copied more;
774 * prepare for overflow output
776 if(U16_IS_SINGLE(c
=*source
++)) {
778 overflow
[1]=(char)(c
>>8);
779 length
=2; /* 2 bytes to output */
781 /* } else { keep c for surrogate handling, length will be set there */
788 /* keep c for surrogate handling, length will be set there */
789 targetCapacity
+=2*count
;
792 length
=0; /* from here on, length counts the bytes in overflow[] */
797 * c is a surrogate, and
798 * - source or target too short
799 * - or the surrogate is unmatched
802 if(U16_IS_SURROGATE_LEAD(c
)) {
803 if(source
<pArgs
->sourceLimit
) {
804 if(U16_IS_TRAIL(trail
=*source
)) {
805 /* output the surrogate pair, will overflow (see conditions comment above) */
808 overflow
[1]=(char)(c
>>8);
809 overflow
[2]=(char)trail
;
810 overflow
[3]=(char)(trail
>>8);
811 length
=4; /* 4 bytes to output */
814 /* unmatched lead surrogate */
815 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
818 /* see if the trail surrogate is in the next buffer */
821 /* unmatched trail surrogate */
822 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
828 /* output length bytes with overflow (length>targetCapacity>0) */
829 ucnv_fromUWriteBytes(cnv
,
831 &target
, pArgs
->targetLimit
,
832 &offsets
, sourceIndex
,
834 targetCapacity
=(uint32_t)(pArgs
->targetLimit
-(char *)target
);
837 if(U_SUCCESS(*pErrorCode
) && source
<pArgs
->sourceLimit
&& targetCapacity
==0) {
838 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
841 /* write back the updated pointers */
842 pArgs
->source
=source
;
843 pArgs
->target
=target
;
844 pArgs
->offsets
=offsets
;
848 _UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
849 UErrorCode
*pErrorCode
) {
851 const uint8_t *source
;
855 uint32_t targetCapacity
, length
, count
, sourceIndex
;
858 if(pArgs
->converter
->mode
<8) {
859 _UTF16ToUnicodeWithOffsets(pArgs
, pErrorCode
);
863 cnv
=pArgs
->converter
;
864 source
=(const uint8_t *)pArgs
->source
;
865 length
=(int32_t)((const uint8_t *)pArgs
->sourceLimit
-source
);
866 if(length
<=0 && cnv
->toUnicodeStatus
==0) {
867 /* no input, nothing to do */
871 target
=pArgs
->target
;
872 if(target
>= pArgs
->targetLimit
) {
873 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
877 targetCapacity
=(uint32_t)(pArgs
->targetLimit
-pArgs
->target
);
878 offsets
=pArgs
->offsets
;
882 /* complete a partial UChar or pair from the last call */
883 if(cnv
->toUnicodeStatus
!=0) {
885 * special case: single byte from a previous buffer,
886 * where the byte turned out not to belong to a trail surrogate
887 * and the preceding, unmatched lead surrogate was put into toUBytes[]
890 cnv
->toUBytes
[0]=(uint8_t)cnv
->toUnicodeStatus
;
892 cnv
->toUnicodeStatus
=0;
894 if((count
=cnv
->toULength
)!=0) {
895 uint8_t *p
=cnv
->toUBytes
;
897 p
[count
++]=*source
++;
901 c
=((UChar
)p
[1]<<8)|p
[0];
902 if(U16_IS_SINGLE(c
)) {
903 /* output the BMP code point */
912 } else if(U16_IS_SURROGATE_LEAD(c
)) {
913 /* continue collecting bytes for the trail surrogate */
914 c
=0; /* avoid unnecessary surrogate handling below */
916 /* fall through to error handling for an unmatched trail surrogate */
919 } else if(count
==4) {
920 c
=((UChar
)p
[1]<<8)|p
[0];
921 trail
=((UChar
)p
[3]<<8)|p
[2];
922 if(U16_IS_TRAIL(trail
)) {
923 /* output the surrogate pair */
925 if(targetCapacity
>=2) {
932 } else /* targetCapacity==1 */ {
934 cnv
->UCharErrorBuffer
[0]=trail
;
935 cnv
->UCharErrorBufferLength
=1;
936 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
942 /* unmatched lead surrogate, handle here for consistent toUBytes[] */
943 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
945 /* back out reading the code unit after it */
946 if(((const uint8_t *)pArgs
->source
-source
)>=2) {
950 * if the trail unit's first byte was in a previous buffer, then
951 * we need to put it into a special place because toUBytes[] will be
952 * used for the lead unit's bytes
954 cnv
->toUnicodeStatus
=0x100|p
[2];
959 /* write back the updated pointers */
960 pArgs
->source
=(const char *)source
;
961 pArgs
->target
=target
;
962 pArgs
->offsets
=offsets
;
967 cnv
->toULength
=(int8_t)count
;
970 /* copy an even number of bytes for complete UChars */
971 count
=2*targetCapacity
;
975 if(c
==0 && count
>0) {
978 targetCapacity
-=count
;
981 c
=((UChar
)source
[1]<<8)|source
[0];
983 if(U16_IS_SINGLE(c
)) {
985 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 &&
986 U16_IS_TRAIL(trail
=((UChar
)source
[1]<<8)|source
[0])
998 c
=((UChar
)source
[1]<<8)|source
[0];
1000 if(U16_IS_SINGLE(c
)) {
1002 *offsets
++=sourceIndex
;
1004 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 &&
1005 U16_IS_TRAIL(trail
=((UChar
)source
[1]<<8)|source
[0])
1011 *offsets
++=sourceIndex
;
1012 *offsets
++=sourceIndex
;
1021 /* done with the loop for complete UChars */
1024 /* keep c for surrogate handling, trail will be set there */
1025 length
+=2*(count
-1); /* one more byte pair was consumed than count decremented */
1026 targetCapacity
+=count
;
1032 * c is a surrogate, and
1033 * - source or target too short
1034 * - or the surrogate is unmatched
1036 cnv
->toUBytes
[0]=(uint8_t)c
;
1037 cnv
->toUBytes
[1]=(uint8_t)(c
>>8);
1040 if(U16_IS_SURROGATE_LEAD(c
)) {
1042 if(U16_IS_TRAIL(trail
=((UChar
)source
[1]<<8)|source
[0])) {
1043 /* output the surrogate pair, will overflow (see conditions comment above) */
1048 *offsets
++=sourceIndex
;
1050 cnv
->UCharErrorBuffer
[0]=trail
;
1051 cnv
->UCharErrorBufferLength
=1;
1053 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1055 /* unmatched lead surrogate */
1056 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1059 /* see if the trail surrogate is in the next buffer */
1062 /* unmatched trail surrogate */
1063 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1067 if(U_SUCCESS(*pErrorCode
)) {
1068 /* check for a remaining source byte */
1070 if(targetCapacity
==0) {
1071 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1073 /* it must be length==1 because otherwise the above would have copied more */
1074 cnv
->toUBytes
[cnv
->toULength
++]=*source
++;
1079 /* write back the updated pointers */
1080 pArgs
->source
=(const char *)source
;
1081 pArgs
->target
=target
;
1082 pArgs
->offsets
=offsets
;
1086 _UTF16LEGetNextUChar(UConverterToUnicodeArgs
*pArgs
, UErrorCode
*err
) {
1087 const uint8_t *s
, *sourceLimit
;
1090 if(pArgs
->converter
->mode
<8) {
1091 return UCNV_GET_NEXT_UCHAR_USE_TO_U
;
1094 s
=(const uint8_t *)pArgs
->source
;
1095 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
1097 if(s
>=sourceLimit
) {
1099 *err
=U_INDEX_OUTOFBOUNDS_ERROR
;
1103 if(s
+2>sourceLimit
) {
1104 /* only one byte: truncated UChar */
1105 pArgs
->converter
->toUBytes
[0]=*s
++;
1106 pArgs
->converter
->toULength
=1;
1107 pArgs
->source
=(const char *)s
;
1108 *err
= U_TRUNCATED_CHAR_FOUND
;
1113 c
=((UChar32
)s
[1]<<8)|*s
;
1116 /* check for a surrogate pair */
1117 if(U_IS_SURROGATE(c
)) {
1118 if(U16_IS_SURROGATE_LEAD(c
)) {
1119 if(s
+2<=sourceLimit
) {
1122 /* get a second UChar and see if it is a trail surrogate */
1123 trail
=((UChar
)s
[1]<<8)|*s
;
1124 if(U16_IS_TRAIL(trail
)) {
1125 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
1128 /* unmatched lead surrogate */
1132 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
1133 uint8_t *bytes
=pArgs
->converter
->toUBytes
;
1135 pArgs
->converter
->toULength
=(int8_t)(sourceLimit
-s
);
1138 } while(s
<sourceLimit
);
1141 *err
=U_TRUNCATED_CHAR_FOUND
;
1144 /* unmatched trail surrogate */
1149 /* write the unmatched surrogate */
1150 uint8_t *bytes
=pArgs
->converter
->toUBytes
;
1151 pArgs
->converter
->toULength
=2;
1156 *err
=U_ILLEGAL_CHAR_FOUND
;
1160 pArgs
->source
=(const char *)s
;
1165 _UTF16LEReset(UConverter
*cnv
, UConverterResetChoice choice
) {
1166 if(choice
<=UCNV_RESET_TO_UNICODE
) {
1167 /* reset toUnicode state */
1168 if(UCNV_GET_VERSION(cnv
)==0) {
1169 cnv
->mode
=8; /* no BOM handling */
1171 cnv
->mode
=0; /* Java-specific "UnicodeLittle" requires LE BOM or no BOM */
1174 if(choice
!=UCNV_RESET_TO_UNICODE
&& UCNV_GET_VERSION(cnv
)==1) {
1175 /* reset fromUnicode for "UnicodeLittle": prepare to output the UTF-16LE BOM */
1176 cnv
->fromUnicodeStatus
=UCNV_NEED_TO_WRITE_BOM
;
1181 _UTF16LEOpen(UConverter
*cnv
,
1182 UConverterLoadArgs
*pArgs
,
1183 UErrorCode
*pErrorCode
) {
1184 if(UCNV_GET_VERSION(cnv
)<=1) {
1185 _UTF16LEReset(cnv
, UCNV_RESET_BOTH
);
1187 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1192 _UTF16LEGetName(const UConverter
*cnv
) {
1193 if(UCNV_GET_VERSION(cnv
)==0) {
1196 return "UTF-16LE,version=1";
1200 static const UConverterImpl _UTF16LEImpl
={
1201 UCNV_UTF16_LittleEndian
,
1210 _UTF16LEToUnicodeWithOffsets
,
1211 _UTF16LEToUnicodeWithOffsets
,
1212 _UTF16LEFromUnicodeWithOffsets
,
1213 _UTF16LEFromUnicodeWithOffsets
,
1214 _UTF16LEGetNextUChar
,
1220 ucnv_getNonSurrogateUnicodeSet
1224 static const UConverterStaticData _UTF16LEStaticData
={
1225 sizeof(UConverterStaticData
),
1227 1202, UCNV_IBM
, UCNV_UTF16_LittleEndian
, 2, 2,
1228 { 0xfd, 0xff, 0, 0 },2,FALSE
,FALSE
,
1231 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1235 const UConverterSharedData _UTF16LEData
=
1236 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16LEStaticData
, &_UTF16LEImpl
);
1238 /* UTF-16 (Detect BOM) ------------------------------------------------------ */
1241 * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE
1243 * This is a simpler version of the UTF-32 converter, with
1244 * fewer states for shorter BOMs.
1250 * 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1
1254 * During detection: state==number of initial bytes seen so far.
1256 * On output, emit U+FEFF as the first code point.
1259 * - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error.
1260 * - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and
1261 * UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error.
1265 _UTF16Reset(UConverter
*cnv
, UConverterResetChoice choice
) {
1266 if(choice
<=UCNV_RESET_TO_UNICODE
) {
1267 /* reset toUnicode: state=0 */
1270 if(choice
!=UCNV_RESET_TO_UNICODE
) {
1271 /* reset fromUnicode: prepare to output the UTF-16PE BOM */
1272 cnv
->fromUnicodeStatus
=UCNV_NEED_TO_WRITE_BOM
;
1276 static const UConverterSharedData _UTF16v2Data
;
1279 _UTF16Open(UConverter
*cnv
,
1280 UConverterLoadArgs
*pArgs
,
1281 UErrorCode
*pErrorCode
) {
1282 if(UCNV_GET_VERSION(cnv
)<=2) {
1283 if(UCNV_GET_VERSION(cnv
)==2 && !pArgs
->onlyTestIsLoadable
) {
1285 * Switch implementation, and switch the staticData that's different
1286 * and was copied into the UConverter.
1287 * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.)
1288 * UTF-16,version=2 fromUnicode() always writes a big-endian byte stream.
1290 cnv
->sharedData
=(UConverterSharedData
*)&_UTF16v2Data
;
1291 uprv_memcpy(cnv
->subChars
, _UTF16v2Data
.staticData
->subChar
, UCNV_MAX_SUBCHAR_LEN
);
1293 _UTF16Reset(cnv
, UCNV_RESET_BOTH
);
1295 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1300 _UTF16GetName(const UConverter
*cnv
) {
1301 if(UCNV_GET_VERSION(cnv
)==0) {
1303 } else if(UCNV_GET_VERSION(cnv
)==1) {
1304 return "UTF-16,version=1";
1306 return "UTF-16,version=2";
1310 const UConverterSharedData _UTF16Data
;
1312 #define IS_UTF16BE(cnv) ((cnv)->sharedData==&_UTF16BEData)
1313 #define IS_UTF16LE(cnv) ((cnv)->sharedData==&_UTF16LEData)
1314 #define IS_UTF16(cnv) ((cnv)->sharedData==&_UTF16Data || (cnv)->sharedData==&_UTF16v2Data)
1317 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
1318 UErrorCode
*pErrorCode
) {
1319 UConverter
*cnv
=pArgs
->converter
;
1320 const char *source
=pArgs
->source
;
1321 const char *sourceLimit
=pArgs
->sourceLimit
;
1322 int32_t *offsets
=pArgs
->offsets
;
1324 int32_t state
, offsetDelta
;
1330 * If we detect a BOM in this buffer, then we must add the BOM size to the
1331 * offsets because the actual converter function will not see and count the BOM.
1332 * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1336 while(source
<sourceLimit
&& U_SUCCESS(*pErrorCode
)) {
1339 cnv
->toUBytes
[0]=(uint8_t)*source
++;
1345 * Only inside this switch case can the state variable
1346 * temporarily take two additional values:
1347 * 6: BOM error, continue with BE
1348 * 7: BOM error, continue with LE
1351 if(cnv
->toUBytes
[0]==0xfe && b
==0xff) {
1352 if(IS_UTF16LE(cnv
)) {
1353 state
=7; /* illegal reverse BOM for Java "UnicodeLittle" */
1355 state
=8; /* detect UTF-16BE */
1357 } else if(cnv
->toUBytes
[0]==0xff && b
==0xfe) {
1358 if(IS_UTF16BE(cnv
)) {
1359 state
=6; /* illegal reverse BOM for Java "UnicodeBig" */
1361 state
=9; /* detect UTF-16LE */
1363 } else if((IS_UTF16(cnv
) && UCNV_GET_VERSION(cnv
)==1)) {
1364 state
=6; /* illegal missing BOM for Java "Unicode" */
1367 /* BOM detected, consume it */
1370 offsetDelta
=(int32_t)(source
-pArgs
->source
);
1371 } else if(state
<6) {
1372 /* ok: no BOM, and not a reverse BOM */
1373 if(source
!=pArgs
->source
) {
1374 /* reset the source for a correct first offset */
1375 source
=pArgs
->source
;
1378 if(IS_UTF16LE(cnv
)) {
1379 /* Make Java "UnicodeLittle" default to LE. */
1382 /* Make standard UTF-16 and Java "UnicodeBig" default to BE. */
1387 * error: missing BOM, or reverse BOM
1388 * UTF-16,version=1: Java-specific "Unicode" requires a BOM.
1389 * UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM.
1390 * UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM.
1392 /* report the non-BOM or reverse BOM as an illegal sequence */
1395 pArgs
->source
=source
+1;
1396 /* continue with conversion if the callback resets the error */
1398 * Make Java "Unicode" default to BE like standard UTF-16.
1399 * Make Java "UnicodeBig" and "UnicodeLittle" default
1400 * to their normal endiannesses.
1403 *pErrorCode
=U_ILLEGAL_ESCAPE_SEQUENCE
;
1406 /* convert the rest of the stream */
1411 pArgs
->source
=source
;
1412 _UTF16BEToUnicodeWithOffsets(pArgs
, pErrorCode
);
1413 source
=pArgs
->source
;
1417 pArgs
->source
=source
;
1418 _UTF16LEToUnicodeWithOffsets(pArgs
, pErrorCode
);
1419 source
=pArgs
->source
;
1422 break; /* does not occur */
1426 /* add BOM size to offsets - see comment at offsetDelta declaration */
1427 if(offsets
!=NULL
&& offsetDelta
!=0) {
1428 int32_t *offsetsLimit
=pArgs
->offsets
;
1429 while(offsets
<offsetsLimit
) {
1430 *offsets
++ += offsetDelta
;
1434 pArgs
->source
=source
;
1436 if(source
==sourceLimit
&& pArgs
->flush
) {
1437 /* handle truncated input */
1440 break; /* no input at all, nothing to do */
1442 _UTF16BEToUnicodeWithOffsets(pArgs
, pErrorCode
);
1445 _UTF16LEToUnicodeWithOffsets(pArgs
, pErrorCode
);
1448 /* 0<state<8: framework will report truncation, nothing to do here */
1457 _UTF16GetNextUChar(UConverterToUnicodeArgs
*pArgs
,
1458 UErrorCode
*pErrorCode
) {
1459 switch(pArgs
->converter
->mode
) {
1461 return _UTF16BEGetNextUChar(pArgs
, pErrorCode
);
1463 return _UTF16LEGetNextUChar(pArgs
, pErrorCode
);
1465 return UCNV_GET_NEXT_UCHAR_USE_TO_U
;
1469 static const UConverterImpl _UTF16Impl
= {
1479 _UTF16ToUnicodeWithOffsets
,
1480 _UTF16ToUnicodeWithOffsets
,
1481 _UTF16PEFromUnicodeWithOffsets
,
1482 _UTF16PEFromUnicodeWithOffsets
,
1485 NULL
, /* ### TODO implement getStarters for all Unicode encodings?! */
1489 ucnv_getNonSurrogateUnicodeSet
1492 static const UConverterStaticData _UTF16StaticData
= {
1493 sizeof(UConverterStaticData
),
1495 1204, /* CCSID for BOM sensitive UTF-16 */
1496 UCNV_IBM
, UCNV_UTF16
, 2, 2,
1498 { 0xff, 0xfd, 0, 0 }, 2,
1500 { 0xfd, 0xff, 0, 0 }, 2,
1505 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1508 const UConverterSharedData _UTF16Data
=
1509 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16StaticData
, &_UTF16Impl
);
1511 static const UConverterImpl _UTF16v2Impl
= {
1521 _UTF16ToUnicodeWithOffsets
,
1522 _UTF16ToUnicodeWithOffsets
,
1523 _UTF16BEFromUnicodeWithOffsets
,
1524 _UTF16BEFromUnicodeWithOffsets
,
1527 NULL
, /* ### TODO implement getStarters for all Unicode encodings?! */
1531 ucnv_getNonSurrogateUnicodeSet
1534 static const UConverterStaticData _UTF16v2StaticData
= {
1535 sizeof(UConverterStaticData
),
1537 1204, /* CCSID for BOM sensitive UTF-16 */
1538 UCNV_IBM
, UCNV_UTF16
, 2, 2,
1539 { 0xff, 0xfd, 0, 0 }, 2,
1543 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1546 static const UConverterSharedData _UTF16v2Data
=
1547 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16v2StaticData
, &_UTF16v2Impl
);