2 **********************************************************************
3 * Copyright (C) 2002-2010, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv_u16.c
8 * tab size: 8 (not used)
11 * created on: 2002jul01
12 * created by: Markus W. Scherer
14 * UTF-16 converter implementation. Used to be in ucnv_utf.c.
17 #include "unicode/utypes.h"
19 #if !UCONFIG_NO_CONVERSION
21 #include "unicode/ucnv.h"
27 UCNV_NEED_TO_WRITE_BOM
=1
31 * The UTF-16 toUnicode implementation is also used for the Java-specific
32 * "with BOM" variants of UTF-16BE and UTF-16LE.
35 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
36 UErrorCode
*pErrorCode
);
38 /* UTF-16BE ----------------------------------------------------------------- */
41 # define _UTF16PEFromUnicodeWithOffsets _UTF16BEFromUnicodeWithOffsets
43 # define _UTF16PEFromUnicodeWithOffsets _UTF16LEFromUnicodeWithOffsets
48 _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
49 UErrorCode
*pErrorCode
) {
55 uint32_t targetCapacity
, length
, sourceIndex
;
60 length
=(int32_t)(pArgs
->sourceLimit
-source
);
62 /* no input, nothing to do */
68 /* write the BOM if necessary */
69 if(cnv
->fromUnicodeStatus
==UCNV_NEED_TO_WRITE_BOM
) {
70 static const char bom
[]={ (char)0xfe, (char)0xff };
71 ucnv_fromUWriteBytes(cnv
,
73 &pArgs
->target
, pArgs
->targetLimit
,
76 cnv
->fromUnicodeStatus
=0;
80 if(target
>= pArgs
->targetLimit
) {
81 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
85 targetCapacity
=(uint32_t)(pArgs
->targetLimit
-target
);
86 offsets
=pArgs
->offsets
;
89 /* c!=0 indicates in several places outside the main loops that a surrogate was found */
91 if((c
=(UChar
)cnv
->fromUChar32
)!=0 && U16_IS_TRAIL(trail
=*source
) && targetCapacity
>=4) {
92 /* the last buffer ended with a lead surrogate, output the surrogate pair */
95 target
[0]=(uint8_t)(c
>>8);
97 target
[2]=(uint8_t)(trail
>>8);
98 target
[3]=(uint8_t)trail
;
108 cnv
->fromUChar32
=c
=0;
112 /* copy an even number of bytes for complete UChars */
113 uint32_t count
=2*length
;
114 if(count
>targetCapacity
) {
115 count
=targetCapacity
&~1;
118 targetCapacity
-=count
;
125 if(U16_IS_SINGLE(c
)) {
126 target
[0]=(uint8_t)(c
>>8);
127 target
[1]=(uint8_t)c
;
129 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 && U16_IS_TRAIL(trail
=*source
)) {
132 target
[0]=(uint8_t)(c
>>8);
133 target
[1]=(uint8_t)c
;
134 target
[2]=(uint8_t)(trail
>>8);
135 target
[3]=(uint8_t)trail
;
145 if(U16_IS_SINGLE(c
)) {
146 target
[0]=(uint8_t)(c
>>8);
147 target
[1]=(uint8_t)c
;
149 *offsets
++=sourceIndex
;
150 *offsets
++=sourceIndex
++;
151 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 && U16_IS_TRAIL(trail
=*source
)) {
154 target
[0]=(uint8_t)(c
>>8);
155 target
[1]=(uint8_t)c
;
156 target
[2]=(uint8_t)(trail
>>8);
157 target
[3]=(uint8_t)trail
;
159 *offsets
++=sourceIndex
;
160 *offsets
++=sourceIndex
;
161 *offsets
++=sourceIndex
;
162 *offsets
++=sourceIndex
;
172 /* done with the loop for complete UChars */
173 if(length
>0 && targetCapacity
>0) {
175 * there is more input and some target capacity -
176 * it must be targetCapacity==1 because otherwise
177 * the above would have copied more;
178 * prepare for overflow output
180 if(U16_IS_SINGLE(c
=*source
++)) {
181 overflow
[0]=(char)(c
>>8);
183 length
=2; /* 2 bytes to output */
185 /* } else { keep c for surrogate handling, length will be set there */
192 /* keep c for surrogate handling, length will be set there */
193 targetCapacity
+=2*count
;
196 length
=0; /* from here on, length counts the bytes in overflow[] */
201 * c is a surrogate, and
202 * - source or target too short
203 * - or the surrogate is unmatched
206 if(U16_IS_SURROGATE_LEAD(c
)) {
207 if(source
<pArgs
->sourceLimit
) {
208 if(U16_IS_TRAIL(trail
=*source
)) {
209 /* output the surrogate pair, will overflow (see conditions comment above) */
211 overflow
[0]=(char)(c
>>8);
213 overflow
[2]=(char)(trail
>>8);
214 overflow
[3]=(char)trail
;
215 length
=4; /* 4 bytes to output */
218 /* unmatched lead surrogate */
219 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
222 /* see if the trail surrogate is in the next buffer */
225 /* unmatched trail surrogate */
226 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
232 /* output length bytes with overflow (length>targetCapacity>0) */
233 ucnv_fromUWriteBytes(cnv
,
235 (char **)&target
, pArgs
->targetLimit
,
236 &offsets
, sourceIndex
,
238 targetCapacity
=(uint32_t)(pArgs
->targetLimit
-(char *)target
);
241 if(U_SUCCESS(*pErrorCode
) && source
<pArgs
->sourceLimit
&& targetCapacity
==0) {
242 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
245 /* write back the updated pointers */
246 pArgs
->source
=source
;
247 pArgs
->target
=(char *)target
;
248 pArgs
->offsets
=offsets
;
252 _UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
253 UErrorCode
*pErrorCode
) {
255 const uint8_t *source
;
259 uint32_t targetCapacity
, length
, count
, sourceIndex
;
262 if(pArgs
->converter
->mode
<8) {
263 _UTF16ToUnicodeWithOffsets(pArgs
, pErrorCode
);
267 cnv
=pArgs
->converter
;
268 source
=(const uint8_t *)pArgs
->source
;
269 length
=(int32_t)((const uint8_t *)pArgs
->sourceLimit
-source
);
270 if(length
<=0 && cnv
->toUnicodeStatus
==0) {
271 /* no input, nothing to do */
275 target
=pArgs
->target
;
276 if(target
>= pArgs
->targetLimit
) {
277 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
281 targetCapacity
=(uint32_t)(pArgs
->targetLimit
-target
);
282 offsets
=pArgs
->offsets
;
286 /* complete a partial UChar or pair from the last call */
287 if(cnv
->toUnicodeStatus
!=0) {
289 * special case: single byte from a previous buffer,
290 * where the byte turned out not to belong to a trail surrogate
291 * and the preceding, unmatched lead surrogate was put into toUBytes[]
294 cnv
->toUBytes
[0]=(uint8_t)cnv
->toUnicodeStatus
;
296 cnv
->toUnicodeStatus
=0;
298 if((count
=cnv
->toULength
)!=0) {
299 uint8_t *p
=cnv
->toUBytes
;
301 p
[count
++]=*source
++;
305 c
=((UChar
)p
[0]<<8)|p
[1];
306 if(U16_IS_SINGLE(c
)) {
307 /* output the BMP code point */
316 } else if(U16_IS_SURROGATE_LEAD(c
)) {
317 /* continue collecting bytes for the trail surrogate */
318 c
=0; /* avoid unnecessary surrogate handling below */
320 /* fall through to error handling for an unmatched trail surrogate */
323 } else if(count
==4) {
324 c
=((UChar
)p
[0]<<8)|p
[1];
325 trail
=((UChar
)p
[2]<<8)|p
[3];
326 if(U16_IS_TRAIL(trail
)) {
327 /* output the surrogate pair */
329 if(targetCapacity
>=2) {
336 } else /* targetCapacity==1 */ {
338 cnv
->UCharErrorBuffer
[0]=trail
;
339 cnv
->UCharErrorBufferLength
=1;
340 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
346 /* unmatched lead surrogate, handle here for consistent toUBytes[] */
347 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
349 /* back out reading the code unit after it */
350 if(((const uint8_t *)pArgs
->source
-source
)>=2) {
354 * if the trail unit's first byte was in a previous buffer, then
355 * we need to put it into a special place because toUBytes[] will be
356 * used for the lead unit's bytes
358 cnv
->toUnicodeStatus
=0x100|p
[2];
363 /* write back the updated pointers */
364 pArgs
->source
=(const char *)source
;
365 pArgs
->target
=target
;
366 pArgs
->offsets
=offsets
;
371 cnv
->toULength
=(int8_t)count
;
374 /* copy an even number of bytes for complete UChars */
375 count
=2*targetCapacity
;
379 if(c
==0 && count
>0) {
382 targetCapacity
-=count
;
385 c
=((UChar
)source
[0]<<8)|source
[1];
387 if(U16_IS_SINGLE(c
)) {
389 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 &&
390 U16_IS_TRAIL(trail
=((UChar
)source
[0]<<8)|source
[1])
402 c
=((UChar
)source
[0]<<8)|source
[1];
404 if(U16_IS_SINGLE(c
)) {
406 *offsets
++=sourceIndex
;
408 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 &&
409 U16_IS_TRAIL(trail
=((UChar
)source
[0]<<8)|source
[1])
415 *offsets
++=sourceIndex
;
416 *offsets
++=sourceIndex
;
425 /* done with the loop for complete UChars */
428 /* keep c for surrogate handling, trail will be set there */
429 length
+=2*(count
-1); /* one more byte pair was consumed than count decremented */
430 targetCapacity
+=count
;
436 * c is a surrogate, and
437 * - source or target too short
438 * - or the surrogate is unmatched
440 cnv
->toUBytes
[0]=(uint8_t)(c
>>8);
441 cnv
->toUBytes
[1]=(uint8_t)c
;
444 if(U16_IS_SURROGATE_LEAD(c
)) {
446 if(U16_IS_TRAIL(trail
=((UChar
)source
[0]<<8)|source
[1])) {
447 /* output the surrogate pair, will overflow (see conditions comment above) */
452 *offsets
++=sourceIndex
;
454 cnv
->UCharErrorBuffer
[0]=trail
;
455 cnv
->UCharErrorBufferLength
=1;
457 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
459 /* unmatched lead surrogate */
460 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
463 /* see if the trail surrogate is in the next buffer */
466 /* unmatched trail surrogate */
467 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
471 if(U_SUCCESS(*pErrorCode
)) {
472 /* check for a remaining source byte */
474 if(targetCapacity
==0) {
475 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
477 /* it must be length==1 because otherwise the above would have copied more */
478 cnv
->toUBytes
[cnv
->toULength
++]=*source
++;
483 /* write back the updated pointers */
484 pArgs
->source
=(const char *)source
;
485 pArgs
->target
=target
;
486 pArgs
->offsets
=offsets
;
490 _UTF16BEGetNextUChar(UConverterToUnicodeArgs
*pArgs
, UErrorCode
*err
) {
491 const uint8_t *s
, *sourceLimit
;
494 if(pArgs
->converter
->mode
<8) {
495 return UCNV_GET_NEXT_UCHAR_USE_TO_U
;
498 s
=(const uint8_t *)pArgs
->source
;
499 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
503 *err
=U_INDEX_OUTOFBOUNDS_ERROR
;
507 if(s
+2>sourceLimit
) {
508 /* only one byte: truncated UChar */
509 pArgs
->converter
->toUBytes
[0]=*s
++;
510 pArgs
->converter
->toULength
=1;
511 pArgs
->source
=(const char *)s
;
512 *err
= U_TRUNCATED_CHAR_FOUND
;
517 c
=((UChar32
)*s
<<8)|s
[1];
520 /* check for a surrogate pair */
521 if(U_IS_SURROGATE(c
)) {
522 if(U16_IS_SURROGATE_LEAD(c
)) {
523 if(s
+2<=sourceLimit
) {
526 /* get a second UChar and see if it is a trail surrogate */
527 trail
=((UChar
)*s
<<8)|s
[1];
528 if(U16_IS_TRAIL(trail
)) {
529 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
532 /* unmatched lead surrogate */
536 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
537 uint8_t *bytes
=pArgs
->converter
->toUBytes
;
539 pArgs
->converter
->toULength
=(int8_t)(sourceLimit
-s
);
542 } while(s
<sourceLimit
);
545 *err
=U_TRUNCATED_CHAR_FOUND
;
548 /* unmatched trail surrogate */
553 /* write the unmatched surrogate */
554 uint8_t *bytes
=pArgs
->converter
->toUBytes
;
555 pArgs
->converter
->toULength
=2;
560 *err
=U_ILLEGAL_CHAR_FOUND
;
564 pArgs
->source
=(const char *)s
;
569 _UTF16BEReset(UConverter
*cnv
, UConverterResetChoice choice
) {
570 if(choice
<=UCNV_RESET_TO_UNICODE
) {
571 /* reset toUnicode state */
572 if(UCNV_GET_VERSION(cnv
)==0) {
573 cnv
->mode
=8; /* no BOM handling */
575 cnv
->mode
=0; /* Java-specific "UnicodeBig" requires BE BOM or no BOM */
578 if(choice
!=UCNV_RESET_TO_UNICODE
&& UCNV_GET_VERSION(cnv
)==1) {
579 /* reset fromUnicode for "UnicodeBig": prepare to output the UTF-16BE BOM */
580 cnv
->fromUnicodeStatus
=UCNV_NEED_TO_WRITE_BOM
;
585 _UTF16BEOpen(UConverter
*cnv
,
586 UConverterLoadArgs
*pArgs
,
587 UErrorCode
*pErrorCode
) {
588 if(UCNV_GET_VERSION(cnv
)<=1) {
589 _UTF16BEReset(cnv
, UCNV_RESET_BOTH
);
591 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
596 _UTF16BEGetName(const UConverter
*cnv
) {
597 if(UCNV_GET_VERSION(cnv
)==0) {
600 return "UTF-16BE,version=1";
604 static const UConverterImpl _UTF16BEImpl
={
605 UCNV_UTF16_BigEndian
,
614 _UTF16BEToUnicodeWithOffsets
,
615 _UTF16BEToUnicodeWithOffsets
,
616 _UTF16BEFromUnicodeWithOffsets
,
617 _UTF16BEFromUnicodeWithOffsets
,
618 _UTF16BEGetNextUChar
,
624 ucnv_getNonSurrogateUnicodeSet
627 static const UConverterStaticData _UTF16BEStaticData
={
628 sizeof(UConverterStaticData
),
630 1200, UCNV_IBM
, UCNV_UTF16_BigEndian
, 2, 2,
631 { 0xff, 0xfd, 0, 0 },2,FALSE
,FALSE
,
634 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
638 const UConverterSharedData _UTF16BEData
={
639 sizeof(UConverterSharedData
), ~((uint32_t) 0),
640 NULL
, NULL
, &_UTF16BEStaticData
, FALSE
, &_UTF16BEImpl
,
644 /* UTF-16LE ----------------------------------------------------------------- */
647 _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
648 UErrorCode
*pErrorCode
) {
654 uint32_t targetCapacity
, length
, sourceIndex
;
658 source
=pArgs
->source
;
659 length
=(int32_t)(pArgs
->sourceLimit
-source
);
661 /* no input, nothing to do */
665 cnv
=pArgs
->converter
;
667 /* write the BOM if necessary */
668 if(cnv
->fromUnicodeStatus
==UCNV_NEED_TO_WRITE_BOM
) {
669 static const char bom
[]={ (char)0xff, (char)0xfe };
670 ucnv_fromUWriteBytes(cnv
,
672 &pArgs
->target
, pArgs
->targetLimit
,
675 cnv
->fromUnicodeStatus
=0;
678 target
=pArgs
->target
;
679 if(target
>= pArgs
->targetLimit
) {
680 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
684 targetCapacity
=(uint32_t)(pArgs
->targetLimit
-pArgs
->target
);
685 offsets
=pArgs
->offsets
;
688 /* c!=0 indicates in several places outside the main loops that a surrogate was found */
690 if((c
=(UChar
)cnv
->fromUChar32
)!=0 && U16_IS_TRAIL(trail
=*source
) && targetCapacity
>=4) {
691 /* the last buffer ended with a lead surrogate, output the surrogate pair */
694 target
[0]=(uint8_t)c
;
695 target
[1]=(uint8_t)(c
>>8);
696 target
[2]=(uint8_t)trail
;
697 target
[3]=(uint8_t)(trail
>>8);
707 cnv
->fromUChar32
=c
=0;
711 /* copy an even number of bytes for complete UChars */
712 uint32_t count
=2*length
;
713 if(count
>targetCapacity
) {
714 count
=targetCapacity
&~1;
717 targetCapacity
-=count
;
724 if(U16_IS_SINGLE(c
)) {
725 target
[0]=(uint8_t)c
;
726 target
[1]=(uint8_t)(c
>>8);
728 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 && U16_IS_TRAIL(trail
=*source
)) {
731 target
[0]=(uint8_t)c
;
732 target
[1]=(uint8_t)(c
>>8);
733 target
[2]=(uint8_t)trail
;
734 target
[3]=(uint8_t)(trail
>>8);
744 if(U16_IS_SINGLE(c
)) {
745 target
[0]=(uint8_t)c
;
746 target
[1]=(uint8_t)(c
>>8);
748 *offsets
++=sourceIndex
;
749 *offsets
++=sourceIndex
++;
750 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 && U16_IS_TRAIL(trail
=*source
)) {
753 target
[0]=(uint8_t)c
;
754 target
[1]=(uint8_t)(c
>>8);
755 target
[2]=(uint8_t)trail
;
756 target
[3]=(uint8_t)(trail
>>8);
758 *offsets
++=sourceIndex
;
759 *offsets
++=sourceIndex
;
760 *offsets
++=sourceIndex
;
761 *offsets
++=sourceIndex
;
771 /* done with the loop for complete UChars */
772 if(length
>0 && targetCapacity
>0) {
774 * there is more input and some target capacity -
775 * it must be targetCapacity==1 because otherwise
776 * the above would have copied more;
777 * prepare for overflow output
779 if(U16_IS_SINGLE(c
=*source
++)) {
781 overflow
[1]=(char)(c
>>8);
782 length
=2; /* 2 bytes to output */
784 /* } else { keep c for surrogate handling, length will be set there */
791 /* keep c for surrogate handling, length will be set there */
792 targetCapacity
+=2*count
;
795 length
=0; /* from here on, length counts the bytes in overflow[] */
800 * c is a surrogate, and
801 * - source or target too short
802 * - or the surrogate is unmatched
805 if(U16_IS_SURROGATE_LEAD(c
)) {
806 if(source
<pArgs
->sourceLimit
) {
807 if(U16_IS_TRAIL(trail
=*source
)) {
808 /* output the surrogate pair, will overflow (see conditions comment above) */
811 overflow
[1]=(char)(c
>>8);
812 overflow
[2]=(char)trail
;
813 overflow
[3]=(char)(trail
>>8);
814 length
=4; /* 4 bytes to output */
817 /* unmatched lead surrogate */
818 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
821 /* see if the trail surrogate is in the next buffer */
824 /* unmatched trail surrogate */
825 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
831 /* output length bytes with overflow (length>targetCapacity>0) */
832 ucnv_fromUWriteBytes(cnv
,
834 &target
, pArgs
->targetLimit
,
835 &offsets
, sourceIndex
,
837 targetCapacity
=(uint32_t)(pArgs
->targetLimit
-(char *)target
);
840 if(U_SUCCESS(*pErrorCode
) && source
<pArgs
->sourceLimit
&& targetCapacity
==0) {
841 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
844 /* write back the updated pointers */
845 pArgs
->source
=source
;
846 pArgs
->target
=target
;
847 pArgs
->offsets
=offsets
;
851 _UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
852 UErrorCode
*pErrorCode
) {
854 const uint8_t *source
;
858 uint32_t targetCapacity
, length
, count
, sourceIndex
;
861 if(pArgs
->converter
->mode
<8) {
862 _UTF16ToUnicodeWithOffsets(pArgs
, pErrorCode
);
866 cnv
=pArgs
->converter
;
867 source
=(const uint8_t *)pArgs
->source
;
868 length
=(int32_t)((const uint8_t *)pArgs
->sourceLimit
-source
);
869 if(length
<=0 && cnv
->toUnicodeStatus
==0) {
870 /* no input, nothing to do */
874 target
=pArgs
->target
;
875 if(target
>= pArgs
->targetLimit
) {
876 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
880 targetCapacity
=(uint32_t)(pArgs
->targetLimit
-pArgs
->target
);
881 offsets
=pArgs
->offsets
;
885 /* complete a partial UChar or pair from the last call */
886 if(cnv
->toUnicodeStatus
!=0) {
888 * special case: single byte from a previous buffer,
889 * where the byte turned out not to belong to a trail surrogate
890 * and the preceding, unmatched lead surrogate was put into toUBytes[]
893 cnv
->toUBytes
[0]=(uint8_t)cnv
->toUnicodeStatus
;
895 cnv
->toUnicodeStatus
=0;
897 if((count
=cnv
->toULength
)!=0) {
898 uint8_t *p
=cnv
->toUBytes
;
900 p
[count
++]=*source
++;
904 c
=((UChar
)p
[1]<<8)|p
[0];
905 if(U16_IS_SINGLE(c
)) {
906 /* output the BMP code point */
915 } else if(U16_IS_SURROGATE_LEAD(c
)) {
916 /* continue collecting bytes for the trail surrogate */
917 c
=0; /* avoid unnecessary surrogate handling below */
919 /* fall through to error handling for an unmatched trail surrogate */
922 } else if(count
==4) {
923 c
=((UChar
)p
[1]<<8)|p
[0];
924 trail
=((UChar
)p
[3]<<8)|p
[2];
925 if(U16_IS_TRAIL(trail
)) {
926 /* output the surrogate pair */
928 if(targetCapacity
>=2) {
935 } else /* targetCapacity==1 */ {
937 cnv
->UCharErrorBuffer
[0]=trail
;
938 cnv
->UCharErrorBufferLength
=1;
939 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
945 /* unmatched lead surrogate, handle here for consistent toUBytes[] */
946 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
948 /* back out reading the code unit after it */
949 if(((const uint8_t *)pArgs
->source
-source
)>=2) {
953 * if the trail unit's first byte was in a previous buffer, then
954 * we need to put it into a special place because toUBytes[] will be
955 * used for the lead unit's bytes
957 cnv
->toUnicodeStatus
=0x100|p
[2];
962 /* write back the updated pointers */
963 pArgs
->source
=(const char *)source
;
964 pArgs
->target
=target
;
965 pArgs
->offsets
=offsets
;
970 cnv
->toULength
=(int8_t)count
;
973 /* copy an even number of bytes for complete UChars */
974 count
=2*targetCapacity
;
978 if(c
==0 && count
>0) {
981 targetCapacity
-=count
;
984 c
=((UChar
)source
[1]<<8)|source
[0];
986 if(U16_IS_SINGLE(c
)) {
988 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 &&
989 U16_IS_TRAIL(trail
=((UChar
)source
[1]<<8)|source
[0])
1001 c
=((UChar
)source
[1]<<8)|source
[0];
1003 if(U16_IS_SINGLE(c
)) {
1005 *offsets
++=sourceIndex
;
1007 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 &&
1008 U16_IS_TRAIL(trail
=((UChar
)source
[1]<<8)|source
[0])
1014 *offsets
++=sourceIndex
;
1015 *offsets
++=sourceIndex
;
1024 /* done with the loop for complete UChars */
1027 /* keep c for surrogate handling, trail will be set there */
1028 length
+=2*(count
-1); /* one more byte pair was consumed than count decremented */
1029 targetCapacity
+=count
;
1035 * c is a surrogate, and
1036 * - source or target too short
1037 * - or the surrogate is unmatched
1039 cnv
->toUBytes
[0]=(uint8_t)c
;
1040 cnv
->toUBytes
[1]=(uint8_t)(c
>>8);
1043 if(U16_IS_SURROGATE_LEAD(c
)) {
1045 if(U16_IS_TRAIL(trail
=((UChar
)source
[1]<<8)|source
[0])) {
1046 /* output the surrogate pair, will overflow (see conditions comment above) */
1051 *offsets
++=sourceIndex
;
1053 cnv
->UCharErrorBuffer
[0]=trail
;
1054 cnv
->UCharErrorBufferLength
=1;
1056 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1058 /* unmatched lead surrogate */
1059 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1062 /* see if the trail surrogate is in the next buffer */
1065 /* unmatched trail surrogate */
1066 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1070 if(U_SUCCESS(*pErrorCode
)) {
1071 /* check for a remaining source byte */
1073 if(targetCapacity
==0) {
1074 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1076 /* it must be length==1 because otherwise the above would have copied more */
1077 cnv
->toUBytes
[cnv
->toULength
++]=*source
++;
1082 /* write back the updated pointers */
1083 pArgs
->source
=(const char *)source
;
1084 pArgs
->target
=target
;
1085 pArgs
->offsets
=offsets
;
1089 _UTF16LEGetNextUChar(UConverterToUnicodeArgs
*pArgs
, UErrorCode
*err
) {
1090 const uint8_t *s
, *sourceLimit
;
1093 if(pArgs
->converter
->mode
<8) {
1094 return UCNV_GET_NEXT_UCHAR_USE_TO_U
;
1097 s
=(const uint8_t *)pArgs
->source
;
1098 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
1100 if(s
>=sourceLimit
) {
1102 *err
=U_INDEX_OUTOFBOUNDS_ERROR
;
1106 if(s
+2>sourceLimit
) {
1107 /* only one byte: truncated UChar */
1108 pArgs
->converter
->toUBytes
[0]=*s
++;
1109 pArgs
->converter
->toULength
=1;
1110 pArgs
->source
=(const char *)s
;
1111 *err
= U_TRUNCATED_CHAR_FOUND
;
1116 c
=((UChar32
)s
[1]<<8)|*s
;
1119 /* check for a surrogate pair */
1120 if(U_IS_SURROGATE(c
)) {
1121 if(U16_IS_SURROGATE_LEAD(c
)) {
1122 if(s
+2<=sourceLimit
) {
1125 /* get a second UChar and see if it is a trail surrogate */
1126 trail
=((UChar
)s
[1]<<8)|*s
;
1127 if(U16_IS_TRAIL(trail
)) {
1128 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
1131 /* unmatched lead surrogate */
1135 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
1136 uint8_t *bytes
=pArgs
->converter
->toUBytes
;
1138 pArgs
->converter
->toULength
=(int8_t)(sourceLimit
-s
);
1141 } while(s
<sourceLimit
);
1144 *err
=U_TRUNCATED_CHAR_FOUND
;
1147 /* unmatched trail surrogate */
1152 /* write the unmatched surrogate */
1153 uint8_t *bytes
=pArgs
->converter
->toUBytes
;
1154 pArgs
->converter
->toULength
=2;
1159 *err
=U_ILLEGAL_CHAR_FOUND
;
1163 pArgs
->source
=(const char *)s
;
1168 _UTF16LEReset(UConverter
*cnv
, UConverterResetChoice choice
) {
1169 if(choice
<=UCNV_RESET_TO_UNICODE
) {
1170 /* reset toUnicode state */
1171 if(UCNV_GET_VERSION(cnv
)==0) {
1172 cnv
->mode
=8; /* no BOM handling */
1174 cnv
->mode
=0; /* Java-specific "UnicodeLittle" requires LE BOM or no BOM */
1177 if(choice
!=UCNV_RESET_TO_UNICODE
&& UCNV_GET_VERSION(cnv
)==1) {
1178 /* reset fromUnicode for "UnicodeLittle": prepare to output the UTF-16LE BOM */
1179 cnv
->fromUnicodeStatus
=UCNV_NEED_TO_WRITE_BOM
;
1184 _UTF16LEOpen(UConverter
*cnv
,
1185 UConverterLoadArgs
*pArgs
,
1186 UErrorCode
*pErrorCode
) {
1187 if(UCNV_GET_VERSION(cnv
)<=1) {
1188 _UTF16LEReset(cnv
, UCNV_RESET_BOTH
);
1190 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1195 _UTF16LEGetName(const UConverter
*cnv
) {
1196 if(UCNV_GET_VERSION(cnv
)==0) {
1199 return "UTF-16LE,version=1";
1203 static const UConverterImpl _UTF16LEImpl
={
1204 UCNV_UTF16_LittleEndian
,
1213 _UTF16LEToUnicodeWithOffsets
,
1214 _UTF16LEToUnicodeWithOffsets
,
1215 _UTF16LEFromUnicodeWithOffsets
,
1216 _UTF16LEFromUnicodeWithOffsets
,
1217 _UTF16LEGetNextUChar
,
1223 ucnv_getNonSurrogateUnicodeSet
1227 static const UConverterStaticData _UTF16LEStaticData
={
1228 sizeof(UConverterStaticData
),
1230 1202, UCNV_IBM
, UCNV_UTF16_LittleEndian
, 2, 2,
1231 { 0xfd, 0xff, 0, 0 },2,FALSE
,FALSE
,
1234 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1238 const UConverterSharedData _UTF16LEData
={
1239 sizeof(UConverterSharedData
), ~((uint32_t) 0),
1240 NULL
, NULL
, &_UTF16LEStaticData
, FALSE
, &_UTF16LEImpl
,
1244 /* UTF-16 (Detect BOM) ------------------------------------------------------ */
1247 * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE
1249 * This is a simpler version of the UTF-32 converter, with
1250 * fewer states for shorter BOMs.
1256 * 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1
1260 * During detection: state==number of initial bytes seen so far.
1262 * On output, emit U+FEFF as the first code point.
1265 * - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error.
1266 * - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and
1267 * UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error.
1271 _UTF16Reset(UConverter
*cnv
, UConverterResetChoice choice
) {
1272 if(choice
<=UCNV_RESET_TO_UNICODE
) {
1273 /* reset toUnicode: state=0 */
1276 if(choice
!=UCNV_RESET_TO_UNICODE
) {
1277 /* reset fromUnicode: prepare to output the UTF-16PE BOM */
1278 cnv
->fromUnicodeStatus
=UCNV_NEED_TO_WRITE_BOM
;
1282 static const UConverterSharedData _UTF16v2Data
;
1285 _UTF16Open(UConverter
*cnv
,
1286 UConverterLoadArgs
*pArgs
,
1287 UErrorCode
*pErrorCode
) {
1288 if(UCNV_GET_VERSION(cnv
)<=2) {
1289 if(UCNV_GET_VERSION(cnv
)==2 && !pArgs
->onlyTestIsLoadable
) {
1291 * Switch implementation, and switch the staticData that's different
1292 * and was copied into the UConverter.
1293 * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.)
1294 * UTF-16,version=2 fromUnicode() always writes a big-endian byte stream.
1296 cnv
->sharedData
=(UConverterSharedData
*)&_UTF16v2Data
;
1297 uprv_memcpy(cnv
->subChars
, _UTF16v2Data
.staticData
->subChar
, UCNV_MAX_SUBCHAR_LEN
);
1299 _UTF16Reset(cnv
, UCNV_RESET_BOTH
);
1301 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1306 _UTF16GetName(const UConverter
*cnv
) {
1307 if(UCNV_GET_VERSION(cnv
)==0) {
1309 } else if(UCNV_GET_VERSION(cnv
)==1) {
1310 return "UTF-16,version=1";
1312 return "UTF-16,version=2";
1316 const UConverterSharedData _UTF16Data
;
1318 #define IS_UTF16BE(cnv) ((cnv)->sharedData==&_UTF16BEData)
1319 #define IS_UTF16LE(cnv) ((cnv)->sharedData==&_UTF16LEData)
1320 #define IS_UTF16(cnv) ((cnv)->sharedData==&_UTF16Data || (cnv)->sharedData==&_UTF16v2Data)
1323 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
1324 UErrorCode
*pErrorCode
) {
1325 UConverter
*cnv
=pArgs
->converter
;
1326 const char *source
=pArgs
->source
;
1327 const char *sourceLimit
=pArgs
->sourceLimit
;
1328 int32_t *offsets
=pArgs
->offsets
;
1330 int32_t state
, offsetDelta
;
1336 * If we detect a BOM in this buffer, then we must add the BOM size to the
1337 * offsets because the actual converter function will not see and count the BOM.
1338 * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1342 while(source
<sourceLimit
&& U_SUCCESS(*pErrorCode
)) {
1345 cnv
->toUBytes
[0]=(uint8_t)*source
++;
1351 * Only inside this switch case can the state variable
1352 * temporarily take two additional values:
1353 * 6: BOM error, continue with BE
1354 * 7: BOM error, continue with LE
1357 if(cnv
->toUBytes
[0]==0xfe && b
==0xff) {
1358 if(IS_UTF16LE(cnv
)) {
1359 state
=7; /* illegal reverse BOM for Java "UnicodeLittle" */
1361 state
=8; /* detect UTF-16BE */
1363 } else if(cnv
->toUBytes
[0]==0xff && b
==0xfe) {
1364 if(IS_UTF16BE(cnv
)) {
1365 state
=6; /* illegal reverse BOM for Java "UnicodeBig" */
1367 state
=9; /* detect UTF-16LE */
1369 } else if((IS_UTF16(cnv
) && UCNV_GET_VERSION(cnv
)==1)) {
1370 state
=6; /* illegal missing BOM for Java "Unicode" */
1373 /* BOM detected, consume it */
1376 offsetDelta
=(int32_t)(source
-pArgs
->source
);
1377 } else if(state
<6) {
1378 /* ok: no BOM, and not a reverse BOM */
1379 if(source
!=pArgs
->source
) {
1380 /* reset the source for a correct first offset */
1381 source
=pArgs
->source
;
1384 if(IS_UTF16LE(cnv
)) {
1385 /* Make Java "UnicodeLittle" default to LE. */
1388 /* Make standard UTF-16 and Java "UnicodeBig" default to BE. */
1393 * error: missing BOM, or reverse BOM
1394 * UTF-16,version=1: Java-specific "Unicode" requires a BOM.
1395 * UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM.
1396 * UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM.
1398 /* report the non-BOM or reverse BOM as an illegal sequence */
1401 pArgs
->source
=source
+1;
1402 /* continue with conversion if the callback resets the error */
1404 * Make Java "Unicode" default to BE like standard UTF-16.
1405 * Make Java "UnicodeBig" and "UnicodeLittle" default
1406 * to their normal endiannesses.
1409 *pErrorCode
=U_ILLEGAL_ESCAPE_SEQUENCE
;
1412 /* convert the rest of the stream */
1417 pArgs
->source
=source
;
1418 _UTF16BEToUnicodeWithOffsets(pArgs
, pErrorCode
);
1419 source
=pArgs
->source
;
1423 pArgs
->source
=source
;
1424 _UTF16LEToUnicodeWithOffsets(pArgs
, pErrorCode
);
1425 source
=pArgs
->source
;
1428 break; /* does not occur */
1432 /* add BOM size to offsets - see comment at offsetDelta declaration */
1433 if(offsets
!=NULL
&& offsetDelta
!=0) {
1434 int32_t *offsetsLimit
=pArgs
->offsets
;
1435 while(offsets
<offsetsLimit
) {
1436 *offsets
++ += offsetDelta
;
1440 pArgs
->source
=source
;
1442 if(source
==sourceLimit
&& pArgs
->flush
) {
1443 /* handle truncated input */
1446 break; /* no input at all, nothing to do */
1448 _UTF16BEToUnicodeWithOffsets(pArgs
, pErrorCode
);
1451 _UTF16LEToUnicodeWithOffsets(pArgs
, pErrorCode
);
1454 /* 0<state<8: framework will report truncation, nothing to do here */
1463 _UTF16GetNextUChar(UConverterToUnicodeArgs
*pArgs
,
1464 UErrorCode
*pErrorCode
) {
1465 switch(pArgs
->converter
->mode
) {
1467 return _UTF16BEGetNextUChar(pArgs
, pErrorCode
);
1469 return _UTF16LEGetNextUChar(pArgs
, pErrorCode
);
1471 return UCNV_GET_NEXT_UCHAR_USE_TO_U
;
1475 static const UConverterImpl _UTF16Impl
= {
1485 _UTF16ToUnicodeWithOffsets
,
1486 _UTF16ToUnicodeWithOffsets
,
1487 _UTF16PEFromUnicodeWithOffsets
,
1488 _UTF16PEFromUnicodeWithOffsets
,
1491 NULL
, /* ### TODO implement getStarters for all Unicode encodings?! */
1495 ucnv_getNonSurrogateUnicodeSet
1498 static const UConverterStaticData _UTF16StaticData
= {
1499 sizeof(UConverterStaticData
),
1501 1204, /* CCSID for BOM sensitive UTF-16 */
1502 UCNV_IBM
, UCNV_UTF16
, 2, 2,
1504 { 0xff, 0xfd, 0, 0 }, 2,
1506 { 0xfd, 0xff, 0, 0 }, 2,
1511 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1514 const UConverterSharedData _UTF16Data
= {
1515 sizeof(UConverterSharedData
), ~((uint32_t) 0),
1516 NULL
, NULL
, &_UTF16StaticData
, FALSE
, &_UTF16Impl
,
1520 static const UConverterImpl _UTF16v2Impl
= {
1530 _UTF16ToUnicodeWithOffsets
,
1531 _UTF16ToUnicodeWithOffsets
,
1532 _UTF16BEFromUnicodeWithOffsets
,
1533 _UTF16BEFromUnicodeWithOffsets
,
1536 NULL
, /* ### TODO implement getStarters for all Unicode encodings?! */
1540 ucnv_getNonSurrogateUnicodeSet
1543 static const UConverterStaticData _UTF16v2StaticData
= {
1544 sizeof(UConverterStaticData
),
1546 1204, /* CCSID for BOM sensitive UTF-16 */
1547 UCNV_IBM
, UCNV_UTF16
, 2, 2,
1548 { 0xff, 0xfd, 0, 0 }, 2,
1552 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1555 static const UConverterSharedData _UTF16v2Data
= {
1556 sizeof(UConverterSharedData
), ~((uint32_t) 0),
1557 NULL
, NULL
, &_UTF16v2StaticData
, FALSE
, &_UTF16v2Impl
,