2 **********************************************************************
3 * Copyright (C) 2002-2008, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv_u16.c
8 * tab size: 8 (not used)
11 * created on: 2002jul01
12 * created by: Markus W. Scherer
14 * UTF-16 converter implementation. Used to be in ucnv_utf.c.
17 #include "unicode/utypes.h"
19 #if !UCONFIG_NO_CONVERSION
21 #include "unicode/ucnv.h"
27 UCNV_NEED_TO_WRITE_BOM
=1
30 /* UTF-16BE ----------------------------------------------------------------- */
33 # define _UTF16PEFromUnicodeWithOffsets _UTF16BEFromUnicodeWithOffsets
35 # define _UTF16PEFromUnicodeWithOffsets _UTF16LEFromUnicodeWithOffsets
40 _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
41 UErrorCode
*pErrorCode
) {
47 uint32_t targetCapacity
, length
, sourceIndex
;
52 length
=(int32_t)(pArgs
->sourceLimit
-source
);
54 /* no input, nothing to do */
60 /* write the BOM if necessary */
61 if(cnv
->fromUnicodeStatus
==UCNV_NEED_TO_WRITE_BOM
) {
62 static const char bom
[]={ (char)0xfe, (char)0xff };
63 ucnv_fromUWriteBytes(cnv
,
65 &pArgs
->target
, pArgs
->targetLimit
,
68 cnv
->fromUnicodeStatus
=0;
72 if(target
>= pArgs
->targetLimit
) {
73 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
77 targetCapacity
=(uint32_t)(pArgs
->targetLimit
-target
);
78 offsets
=pArgs
->offsets
;
81 /* c!=0 indicates in several places outside the main loops that a surrogate was found */
83 if((c
=(UChar
)cnv
->fromUChar32
)!=0 && U16_IS_TRAIL(trail
=*source
) && targetCapacity
>=4) {
84 /* the last buffer ended with a lead surrogate, output the surrogate pair */
87 target
[0]=(uint8_t)(c
>>8);
89 target
[2]=(uint8_t)(trail
>>8);
90 target
[3]=(uint8_t)trail
;
100 cnv
->fromUChar32
=c
=0;
104 /* copy an even number of bytes for complete UChars */
105 uint32_t count
=2*length
;
106 if(count
>targetCapacity
) {
107 count
=targetCapacity
&~1;
110 targetCapacity
-=count
;
117 if(U16_IS_SINGLE(c
)) {
118 target
[0]=(uint8_t)(c
>>8);
119 target
[1]=(uint8_t)c
;
121 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 && U16_IS_TRAIL(trail
=*source
)) {
124 target
[0]=(uint8_t)(c
>>8);
125 target
[1]=(uint8_t)c
;
126 target
[2]=(uint8_t)(trail
>>8);
127 target
[3]=(uint8_t)trail
;
137 if(U16_IS_SINGLE(c
)) {
138 target
[0]=(uint8_t)(c
>>8);
139 target
[1]=(uint8_t)c
;
141 *offsets
++=sourceIndex
;
142 *offsets
++=sourceIndex
++;
143 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 && U16_IS_TRAIL(trail
=*source
)) {
146 target
[0]=(uint8_t)(c
>>8);
147 target
[1]=(uint8_t)c
;
148 target
[2]=(uint8_t)(trail
>>8);
149 target
[3]=(uint8_t)trail
;
151 *offsets
++=sourceIndex
;
152 *offsets
++=sourceIndex
;
153 *offsets
++=sourceIndex
;
154 *offsets
++=sourceIndex
;
164 /* done with the loop for complete UChars */
165 if(length
>0 && targetCapacity
>0) {
167 * there is more input and some target capacity -
168 * it must be targetCapacity==1 because otherwise
169 * the above would have copied more;
170 * prepare for overflow output
172 if(U16_IS_SINGLE(c
=*source
++)) {
173 overflow
[0]=(char)(c
>>8);
175 length
=2; /* 2 bytes to output */
177 /* } else { keep c for surrogate handling, length will be set there */
184 /* keep c for surrogate handling, length will be set there */
185 targetCapacity
+=2*count
;
188 length
=0; /* from here on, length counts the bytes in overflow[] */
193 * c is a surrogate, and
194 * - source or target too short
195 * - or the surrogate is unmatched
198 if(U16_IS_SURROGATE_LEAD(c
)) {
199 if(source
<pArgs
->sourceLimit
) {
200 if(U16_IS_TRAIL(trail
=*source
)) {
201 /* output the surrogate pair, will overflow (see conditions comment above) */
203 overflow
[0]=(char)(c
>>8);
205 overflow
[2]=(char)(trail
>>8);
206 overflow
[3]=(char)trail
;
207 length
=4; /* 4 bytes to output */
210 /* unmatched lead surrogate */
211 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
214 /* see if the trail surrogate is in the next buffer */
217 /* unmatched trail surrogate */
218 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
224 /* output length bytes with overflow (length>targetCapacity>0) */
225 ucnv_fromUWriteBytes(cnv
,
227 (char **)&target
, pArgs
->targetLimit
,
228 &offsets
, sourceIndex
,
230 targetCapacity
=(uint32_t)(pArgs
->targetLimit
-(char *)target
);
233 if(U_SUCCESS(*pErrorCode
) && source
<pArgs
->sourceLimit
&& targetCapacity
==0) {
234 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
237 /* write back the updated pointers */
238 pArgs
->source
=source
;
239 pArgs
->target
=(char *)target
;
240 pArgs
->offsets
=offsets
;
244 _UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
245 UErrorCode
*pErrorCode
) {
247 const uint8_t *source
;
251 uint32_t targetCapacity
, length
, count
, sourceIndex
;
254 cnv
=pArgs
->converter
;
255 source
=(const uint8_t *)pArgs
->source
;
256 length
=(int32_t)((const uint8_t *)pArgs
->sourceLimit
-source
);
257 if(length
<=0 && cnv
->toUnicodeStatus
==0) {
258 /* no input, nothing to do */
262 target
=pArgs
->target
;
263 if(target
>= pArgs
->targetLimit
) {
264 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
268 targetCapacity
=(uint32_t)(pArgs
->targetLimit
-target
);
269 offsets
=pArgs
->offsets
;
273 /* complete a partial UChar or pair from the last call */
274 if(cnv
->toUnicodeStatus
!=0) {
276 * special case: single byte from a previous buffer,
277 * where the byte turned out not to belong to a trail surrogate
278 * and the preceding, unmatched lead surrogate was put into toUBytes[]
281 cnv
->toUBytes
[0]=(uint8_t)cnv
->toUnicodeStatus
;
283 cnv
->toUnicodeStatus
=0;
285 if((count
=cnv
->toULength
)!=0) {
286 uint8_t *p
=cnv
->toUBytes
;
288 p
[count
++]=*source
++;
292 c
=((UChar
)p
[0]<<8)|p
[1];
293 if(U16_IS_SINGLE(c
)) {
294 /* output the BMP code point */
303 } else if(U16_IS_SURROGATE_LEAD(c
)) {
304 /* continue collecting bytes for the trail surrogate */
305 c
=0; /* avoid unnecessary surrogate handling below */
307 /* fall through to error handling for an unmatched trail surrogate */
310 } else if(count
==4) {
311 c
=((UChar
)p
[0]<<8)|p
[1];
312 trail
=((UChar
)p
[2]<<8)|p
[3];
313 if(U16_IS_TRAIL(trail
)) {
314 /* output the surrogate pair */
316 if(targetCapacity
>=2) {
323 } else /* targetCapacity==1 */ {
325 cnv
->UCharErrorBuffer
[0]=trail
;
326 cnv
->UCharErrorBufferLength
=1;
327 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
333 /* unmatched lead surrogate, handle here for consistent toUBytes[] */
334 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
336 /* back out reading the code unit after it */
337 if(((const uint8_t *)pArgs
->source
-source
)>=2) {
341 * if the trail unit's first byte was in a previous buffer, then
342 * we need to put it into a special place because toUBytes[] will be
343 * used for the lead unit's bytes
345 cnv
->toUnicodeStatus
=0x100|p
[2];
350 /* write back the updated pointers */
351 pArgs
->source
=(const char *)source
;
352 pArgs
->target
=target
;
353 pArgs
->offsets
=offsets
;
358 cnv
->toULength
=(int8_t)count
;
361 /* copy an even number of bytes for complete UChars */
362 count
=2*targetCapacity
;
366 if(c
==0 && count
>0) {
369 targetCapacity
-=count
;
372 c
=((UChar
)source
[0]<<8)|source
[1];
374 if(U16_IS_SINGLE(c
)) {
376 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 &&
377 U16_IS_TRAIL(trail
=((UChar
)source
[0]<<8)|source
[1])
389 c
=((UChar
)source
[0]<<8)|source
[1];
391 if(U16_IS_SINGLE(c
)) {
393 *offsets
++=sourceIndex
;
395 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 &&
396 U16_IS_TRAIL(trail
=((UChar
)source
[0]<<8)|source
[1])
402 *offsets
++=sourceIndex
;
403 *offsets
++=sourceIndex
;
412 /* done with the loop for complete UChars */
415 /* keep c for surrogate handling, trail will be set there */
416 length
+=2*(count
-1); /* one more byte pair was consumed than count decremented */
417 targetCapacity
+=count
;
423 * c is a surrogate, and
424 * - source or target too short
425 * - or the surrogate is unmatched
427 cnv
->toUBytes
[0]=(uint8_t)(c
>>8);
428 cnv
->toUBytes
[1]=(uint8_t)c
;
431 if(U16_IS_SURROGATE_LEAD(c
)) {
433 if(U16_IS_TRAIL(trail
=((UChar
)source
[0]<<8)|source
[1])) {
434 /* output the surrogate pair, will overflow (see conditions comment above) */
439 *offsets
++=sourceIndex
;
441 cnv
->UCharErrorBuffer
[0]=trail
;
442 cnv
->UCharErrorBufferLength
=1;
444 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
446 /* unmatched lead surrogate */
447 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
450 /* see if the trail surrogate is in the next buffer */
453 /* unmatched trail surrogate */
454 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
458 if(U_SUCCESS(*pErrorCode
)) {
459 /* check for a remaining source byte */
461 if(targetCapacity
==0) {
462 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
464 /* it must be length==1 because otherwise the above would have copied more */
465 cnv
->toUBytes
[cnv
->toULength
++]=*source
++;
470 /* write back the updated pointers */
471 pArgs
->source
=(const char *)source
;
472 pArgs
->target
=target
;
473 pArgs
->offsets
=offsets
;
477 _UTF16BEGetNextUChar(UConverterToUnicodeArgs
*pArgs
, UErrorCode
*err
) {
478 const uint8_t *s
, *sourceLimit
;
481 s
=(const uint8_t *)pArgs
->source
;
482 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
486 *err
=U_INDEX_OUTOFBOUNDS_ERROR
;
490 if(s
+2>sourceLimit
) {
491 /* only one byte: truncated UChar */
492 pArgs
->converter
->toUBytes
[0]=*s
++;
493 pArgs
->converter
->toULength
=1;
494 pArgs
->source
=(const char *)s
;
495 *err
= U_TRUNCATED_CHAR_FOUND
;
500 c
=((UChar32
)*s
<<8)|s
[1];
503 /* check for a surrogate pair */
504 if(U_IS_SURROGATE(c
)) {
505 if(U16_IS_SURROGATE_LEAD(c
)) {
506 if(s
+2<=sourceLimit
) {
509 /* get a second UChar and see if it is a trail surrogate */
510 trail
=((UChar
)*s
<<8)|s
[1];
511 if(U16_IS_TRAIL(trail
)) {
512 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
515 /* unmatched lead surrogate */
519 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
520 uint8_t *bytes
=pArgs
->converter
->toUBytes
;
522 pArgs
->converter
->toULength
=(int8_t)(sourceLimit
-s
);
525 } while(s
<sourceLimit
);
528 *err
=U_TRUNCATED_CHAR_FOUND
;
531 /* unmatched trail surrogate */
536 /* write the unmatched surrogate */
537 uint8_t *bytes
=pArgs
->converter
->toUBytes
;
538 pArgs
->converter
->toULength
=2;
543 *err
=U_ILLEGAL_CHAR_FOUND
;
547 pArgs
->source
=(const char *)s
;
551 static const UConverterImpl _UTF16BEImpl
={
552 UCNV_UTF16_BigEndian
,
561 _UTF16BEToUnicodeWithOffsets
,
562 _UTF16BEToUnicodeWithOffsets
,
563 _UTF16BEFromUnicodeWithOffsets
,
564 _UTF16BEFromUnicodeWithOffsets
,
565 _UTF16BEGetNextUChar
,
571 ucnv_getNonSurrogateUnicodeSet
574 static const UConverterStaticData _UTF16BEStaticData
={
575 sizeof(UConverterStaticData
),
577 1200, UCNV_IBM
, UCNV_UTF16_BigEndian
, 2, 4,
578 { 0xff, 0xfd, 0, 0 },2,FALSE
,FALSE
,
581 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
585 const UConverterSharedData _UTF16BEData
={
586 sizeof(UConverterSharedData
), ~((uint32_t) 0),
587 NULL
, NULL
, &_UTF16BEStaticData
, FALSE
, &_UTF16BEImpl
,
591 /* UTF-16LE ----------------------------------------------------------------- */
594 _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
595 UErrorCode
*pErrorCode
) {
601 uint32_t targetCapacity
, length
, sourceIndex
;
605 source
=pArgs
->source
;
606 length
=(int32_t)(pArgs
->sourceLimit
-source
);
608 /* no input, nothing to do */
612 cnv
=pArgs
->converter
;
614 /* write the BOM if necessary */
615 if(cnv
->fromUnicodeStatus
==UCNV_NEED_TO_WRITE_BOM
) {
616 static const char bom
[]={ (char)0xff, (char)0xfe };
617 ucnv_fromUWriteBytes(cnv
,
619 &pArgs
->target
, pArgs
->targetLimit
,
622 cnv
->fromUnicodeStatus
=0;
625 target
=pArgs
->target
;
626 if(target
>= pArgs
->targetLimit
) {
627 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
631 targetCapacity
=(uint32_t)(pArgs
->targetLimit
-pArgs
->target
);
632 offsets
=pArgs
->offsets
;
635 /* c!=0 indicates in several places outside the main loops that a surrogate was found */
637 if((c
=(UChar
)cnv
->fromUChar32
)!=0 && U16_IS_TRAIL(trail
=*source
) && targetCapacity
>=4) {
638 /* the last buffer ended with a lead surrogate, output the surrogate pair */
641 target
[0]=(uint8_t)c
;
642 target
[1]=(uint8_t)(c
>>8);
643 target
[2]=(uint8_t)trail
;
644 target
[3]=(uint8_t)(trail
>>8);
654 cnv
->fromUChar32
=c
=0;
658 /* copy an even number of bytes for complete UChars */
659 uint32_t count
=2*length
;
660 if(count
>targetCapacity
) {
661 count
=targetCapacity
&~1;
664 targetCapacity
-=count
;
671 if(U16_IS_SINGLE(c
)) {
672 target
[0]=(uint8_t)c
;
673 target
[1]=(uint8_t)(c
>>8);
675 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 && U16_IS_TRAIL(trail
=*source
)) {
678 target
[0]=(uint8_t)c
;
679 target
[1]=(uint8_t)(c
>>8);
680 target
[2]=(uint8_t)trail
;
681 target
[3]=(uint8_t)(trail
>>8);
691 if(U16_IS_SINGLE(c
)) {
692 target
[0]=(uint8_t)c
;
693 target
[1]=(uint8_t)(c
>>8);
695 *offsets
++=sourceIndex
;
696 *offsets
++=sourceIndex
++;
697 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 && U16_IS_TRAIL(trail
=*source
)) {
700 target
[0]=(uint8_t)c
;
701 target
[1]=(uint8_t)(c
>>8);
702 target
[2]=(uint8_t)trail
;
703 target
[3]=(uint8_t)(trail
>>8);
705 *offsets
++=sourceIndex
;
706 *offsets
++=sourceIndex
;
707 *offsets
++=sourceIndex
;
708 *offsets
++=sourceIndex
;
718 /* done with the loop for complete UChars */
719 if(length
>0 && targetCapacity
>0) {
721 * there is more input and some target capacity -
722 * it must be targetCapacity==1 because otherwise
723 * the above would have copied more;
724 * prepare for overflow output
726 if(U16_IS_SINGLE(c
=*source
++)) {
728 overflow
[1]=(char)(c
>>8);
729 length
=2; /* 2 bytes to output */
731 /* } else { keep c for surrogate handling, length will be set there */
738 /* keep c for surrogate handling, length will be set there */
739 targetCapacity
+=2*count
;
742 length
=0; /* from here on, length counts the bytes in overflow[] */
747 * c is a surrogate, and
748 * - source or target too short
749 * - or the surrogate is unmatched
752 if(U16_IS_SURROGATE_LEAD(c
)) {
753 if(source
<pArgs
->sourceLimit
) {
754 if(U16_IS_TRAIL(trail
=*source
)) {
755 /* output the surrogate pair, will overflow (see conditions comment above) */
758 overflow
[1]=(char)(c
>>8);
759 overflow
[2]=(char)trail
;
760 overflow
[3]=(char)(trail
>>8);
761 length
=4; /* 4 bytes to output */
764 /* unmatched lead surrogate */
765 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
768 /* see if the trail surrogate is in the next buffer */
771 /* unmatched trail surrogate */
772 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
778 /* output length bytes with overflow (length>targetCapacity>0) */
779 ucnv_fromUWriteBytes(cnv
,
781 &target
, pArgs
->targetLimit
,
782 &offsets
, sourceIndex
,
784 targetCapacity
=(uint32_t)(pArgs
->targetLimit
-(char *)target
);
787 if(U_SUCCESS(*pErrorCode
) && source
<pArgs
->sourceLimit
&& targetCapacity
==0) {
788 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
791 /* write back the updated pointers */
792 pArgs
->source
=source
;
793 pArgs
->target
=target
;
794 pArgs
->offsets
=offsets
;
798 _UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
799 UErrorCode
*pErrorCode
) {
801 const uint8_t *source
;
805 uint32_t targetCapacity
, length
, count
, sourceIndex
;
808 cnv
=pArgs
->converter
;
809 source
=(const uint8_t *)pArgs
->source
;
810 length
=(int32_t)((const uint8_t *)pArgs
->sourceLimit
-source
);
811 if(length
<=0 && cnv
->toUnicodeStatus
==0) {
812 /* no input, nothing to do */
816 target
=pArgs
->target
;
817 if(target
>= pArgs
->targetLimit
) {
818 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
822 targetCapacity
=(uint32_t)(pArgs
->targetLimit
-pArgs
->target
);
823 offsets
=pArgs
->offsets
;
827 /* complete a partial UChar or pair from the last call */
828 if(cnv
->toUnicodeStatus
!=0) {
830 * special case: single byte from a previous buffer,
831 * where the byte turned out not to belong to a trail surrogate
832 * and the preceding, unmatched lead surrogate was put into toUBytes[]
835 cnv
->toUBytes
[0]=(uint8_t)cnv
->toUnicodeStatus
;
837 cnv
->toUnicodeStatus
=0;
839 if((count
=cnv
->toULength
)!=0) {
840 uint8_t *p
=cnv
->toUBytes
;
842 p
[count
++]=*source
++;
846 c
=((UChar
)p
[1]<<8)|p
[0];
847 if(U16_IS_SINGLE(c
)) {
848 /* output the BMP code point */
857 } else if(U16_IS_SURROGATE_LEAD(c
)) {
858 /* continue collecting bytes for the trail surrogate */
859 c
=0; /* avoid unnecessary surrogate handling below */
861 /* fall through to error handling for an unmatched trail surrogate */
864 } else if(count
==4) {
865 c
=((UChar
)p
[1]<<8)|p
[0];
866 trail
=((UChar
)p
[3]<<8)|p
[2];
867 if(U16_IS_TRAIL(trail
)) {
868 /* output the surrogate pair */
870 if(targetCapacity
>=2) {
877 } else /* targetCapacity==1 */ {
879 cnv
->UCharErrorBuffer
[0]=trail
;
880 cnv
->UCharErrorBufferLength
=1;
881 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
887 /* unmatched lead surrogate, handle here for consistent toUBytes[] */
888 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
890 /* back out reading the code unit after it */
891 if(((const uint8_t *)pArgs
->source
-source
)>=2) {
895 * if the trail unit's first byte was in a previous buffer, then
896 * we need to put it into a special place because toUBytes[] will be
897 * used for the lead unit's bytes
899 cnv
->toUnicodeStatus
=0x100|p
[2];
904 /* write back the updated pointers */
905 pArgs
->source
=(const char *)source
;
906 pArgs
->target
=target
;
907 pArgs
->offsets
=offsets
;
912 cnv
->toULength
=(int8_t)count
;
915 /* copy an even number of bytes for complete UChars */
916 count
=2*targetCapacity
;
920 if(c
==0 && count
>0) {
923 targetCapacity
-=count
;
926 c
=((UChar
)source
[1]<<8)|source
[0];
928 if(U16_IS_SINGLE(c
)) {
930 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 &&
931 U16_IS_TRAIL(trail
=((UChar
)source
[1]<<8)|source
[0])
943 c
=((UChar
)source
[1]<<8)|source
[0];
945 if(U16_IS_SINGLE(c
)) {
947 *offsets
++=sourceIndex
;
949 } else if(U16_IS_SURROGATE_LEAD(c
) && count
>=2 &&
950 U16_IS_TRAIL(trail
=((UChar
)source
[1]<<8)|source
[0])
956 *offsets
++=sourceIndex
;
957 *offsets
++=sourceIndex
;
966 /* done with the loop for complete UChars */
969 /* keep c for surrogate handling, trail will be set there */
970 length
+=2*(count
-1); /* one more byte pair was consumed than count decremented */
971 targetCapacity
+=count
;
977 * c is a surrogate, and
978 * - source or target too short
979 * - or the surrogate is unmatched
981 cnv
->toUBytes
[0]=(uint8_t)c
;
982 cnv
->toUBytes
[1]=(uint8_t)(c
>>8);
985 if(U16_IS_SURROGATE_LEAD(c
)) {
987 if(U16_IS_TRAIL(trail
=((UChar
)source
[1]<<8)|source
[0])) {
988 /* output the surrogate pair, will overflow (see conditions comment above) */
993 *offsets
++=sourceIndex
;
995 cnv
->UCharErrorBuffer
[0]=trail
;
996 cnv
->UCharErrorBufferLength
=1;
998 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1000 /* unmatched lead surrogate */
1001 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1004 /* see if the trail surrogate is in the next buffer */
1007 /* unmatched trail surrogate */
1008 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1012 if(U_SUCCESS(*pErrorCode
)) {
1013 /* check for a remaining source byte */
1015 if(targetCapacity
==0) {
1016 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1018 /* it must be length==1 because otherwise the above would have copied more */
1019 cnv
->toUBytes
[cnv
->toULength
++]=*source
++;
1024 /* write back the updated pointers */
1025 pArgs
->source
=(const char *)source
;
1026 pArgs
->target
=target
;
1027 pArgs
->offsets
=offsets
;
1031 _UTF16LEGetNextUChar(UConverterToUnicodeArgs
*pArgs
, UErrorCode
*err
) {
1032 const uint8_t *s
, *sourceLimit
;
1035 s
=(const uint8_t *)pArgs
->source
;
1036 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
1038 if(s
>=sourceLimit
) {
1040 *err
=U_INDEX_OUTOFBOUNDS_ERROR
;
1044 if(s
+2>sourceLimit
) {
1045 /* only one byte: truncated UChar */
1046 pArgs
->converter
->toUBytes
[0]=*s
++;
1047 pArgs
->converter
->toULength
=1;
1048 pArgs
->source
=(const char *)s
;
1049 *err
= U_TRUNCATED_CHAR_FOUND
;
1054 c
=((UChar32
)s
[1]<<8)|*s
;
1057 /* check for a surrogate pair */
1058 if(U_IS_SURROGATE(c
)) {
1059 if(U16_IS_SURROGATE_LEAD(c
)) {
1060 if(s
+2<=sourceLimit
) {
1063 /* get a second UChar and see if it is a trail surrogate */
1064 trail
=((UChar
)s
[1]<<8)|*s
;
1065 if(U16_IS_TRAIL(trail
)) {
1066 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
1069 /* unmatched lead surrogate */
1073 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
1074 uint8_t *bytes
=pArgs
->converter
->toUBytes
;
1076 pArgs
->converter
->toULength
=(int8_t)(sourceLimit
-s
);
1079 } while(s
<sourceLimit
);
1082 *err
=U_TRUNCATED_CHAR_FOUND
;
1085 /* unmatched trail surrogate */
1090 /* write the unmatched surrogate */
1091 uint8_t *bytes
=pArgs
->converter
->toUBytes
;
1092 pArgs
->converter
->toULength
=2;
1097 *err
=U_ILLEGAL_CHAR_FOUND
;
1101 pArgs
->source
=(const char *)s
;
1105 static const UConverterImpl _UTF16LEImpl
={
1106 UCNV_UTF16_LittleEndian
,
1115 _UTF16LEToUnicodeWithOffsets
,
1116 _UTF16LEToUnicodeWithOffsets
,
1117 _UTF16LEFromUnicodeWithOffsets
,
1118 _UTF16LEFromUnicodeWithOffsets
,
1119 _UTF16LEGetNextUChar
,
1125 ucnv_getNonSurrogateUnicodeSet
1129 static const UConverterStaticData _UTF16LEStaticData
={
1130 sizeof(UConverterStaticData
),
1132 1202, UCNV_IBM
, UCNV_UTF16_LittleEndian
, 2, 4,
1133 { 0xfd, 0xff, 0, 0 },2,FALSE
,FALSE
,
1136 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1140 const UConverterSharedData _UTF16LEData
={
1141 sizeof(UConverterSharedData
), ~((uint32_t) 0),
1142 NULL
, NULL
, &_UTF16LEStaticData
, FALSE
, &_UTF16LEImpl
,
1146 /* UTF-16 (Detect BOM) ------------------------------------------------------ */
1149 * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE
1151 * This is a simpler version of the UTF-32 converter below, with
1152 * fewer states for shorter BOMs.
1163 * During detection: state&3==number of matching bytes so far.
1165 * On output, emit U+FEFF as the first code point.
1169 _UTF16Reset(UConverter
*cnv
, UConverterResetChoice choice
) {
1170 if(choice
<=UCNV_RESET_TO_UNICODE
) {
1171 /* reset toUnicode: state=0 */
1174 if(choice
!=UCNV_RESET_TO_UNICODE
) {
1175 /* reset fromUnicode: prepare to output the UTF-16PE BOM */
1176 cnv
->fromUnicodeStatus
=UCNV_NEED_TO_WRITE_BOM
;
1181 _UTF16Open(UConverter
*cnv
,
1185 UErrorCode
*pErrorCode
) {
1186 _UTF16Reset(cnv
, UCNV_RESET_BOTH
);
1189 static const char utf16BOM
[8]={ (char)0xfe, (char)0xff, 0, 0, (char)0xff, (char)0xfe, 0, 0 };
1192 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
1193 UErrorCode
*pErrorCode
) {
1194 UConverter
*cnv
=pArgs
->converter
;
1195 const char *source
=pArgs
->source
;
1196 const char *sourceLimit
=pArgs
->sourceLimit
;
1197 int32_t *offsets
=pArgs
->offsets
;
1199 int32_t state
, offsetDelta
;
1205 * If we detect a BOM in this buffer, then we must add the BOM size to the
1206 * offsets because the actual converter function will not see and count the BOM.
1207 * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1211 while(source
<sourceLimit
&& U_SUCCESS(*pErrorCode
)) {
1216 state
=1; /* could be FE FF */
1217 } else if(b
==(char)0xff) {
1218 state
=5; /* could be FF FE */
1220 state
=8; /* default to UTF-16BE */
1227 if(*source
==utf16BOM
[state
]) {
1230 state
=8; /* detect UTF-16BE */
1231 offsetDelta
=(int32_t)(source
-pArgs
->source
);
1232 } else if(state
==5) {
1233 state
=9; /* detect UTF-16LE */
1234 offsetDelta
=(int32_t)(source
-pArgs
->source
);
1237 /* switch to UTF-16BE and pass the previous bytes */
1238 if(source
!=pArgs
->source
) {
1239 /* just reset the source */
1240 source
=pArgs
->source
;
1242 UBool oldFlush
=pArgs
->flush
;
1244 /* the first byte is from a previous buffer, replay it first */
1245 pArgs
->source
=utf16BOM
+(state
&4); /* select the correct BOM */
1246 pArgs
->sourceLimit
=pArgs
->source
+1; /* replay previous byte */
1247 pArgs
->flush
=FALSE
; /* this sourceLimit is not the real source stream limit */
1249 _UTF16BEToUnicodeWithOffsets(pArgs
, pErrorCode
);
1251 /* restore real pointers; pArgs->source will be set in case 8/9 */
1252 pArgs
->sourceLimit
=sourceLimit
;
1253 pArgs
->flush
=oldFlush
;
1261 pArgs
->source
=source
;
1262 _UTF16BEToUnicodeWithOffsets(pArgs
, pErrorCode
);
1263 source
=pArgs
->source
;
1267 pArgs
->source
=source
;
1268 _UTF16LEToUnicodeWithOffsets(pArgs
, pErrorCode
);
1269 source
=pArgs
->source
;
1272 break; /* does not occur */
1276 /* add BOM size to offsets - see comment at offsetDelta declaration */
1277 if(offsets
!=NULL
&& offsetDelta
!=0) {
1278 int32_t *offsetsLimit
=pArgs
->offsets
;
1279 while(offsets
<offsetsLimit
) {
1280 *offsets
++ += offsetDelta
;
1284 pArgs
->source
=source
;
1286 if(source
==sourceLimit
&& pArgs
->flush
) {
1287 /* handle truncated input */
1290 break; /* no input at all, nothing to do */
1292 _UTF16BEToUnicodeWithOffsets(pArgs
, pErrorCode
);
1295 _UTF16LEToUnicodeWithOffsets(pArgs
, pErrorCode
);
1298 /* handle 0<state<8: call UTF-16BE with too-short input */
1299 pArgs
->source
=utf16BOM
+(state
&4); /* select the correct BOM */
1300 pArgs
->sourceLimit
=pArgs
->source
+(state
&3); /* replay bytes */
1302 /* no offsets: not enough for output */
1303 _UTF16BEToUnicodeWithOffsets(pArgs
, pErrorCode
);
1304 pArgs
->source
=source
;
1305 pArgs
->sourceLimit
=sourceLimit
;
1315 _UTF16GetNextUChar(UConverterToUnicodeArgs
*pArgs
,
1316 UErrorCode
*pErrorCode
) {
1317 switch(pArgs
->converter
->mode
) {
1319 return _UTF16BEGetNextUChar(pArgs
, pErrorCode
);
1321 return _UTF16LEGetNextUChar(pArgs
, pErrorCode
);
1323 return UCNV_GET_NEXT_UCHAR_USE_TO_U
;
1327 static const UConverterImpl _UTF16Impl
= {
1337 _UTF16ToUnicodeWithOffsets
,
1338 _UTF16ToUnicodeWithOffsets
,
1339 _UTF16PEFromUnicodeWithOffsets
,
1340 _UTF16PEFromUnicodeWithOffsets
,
1343 NULL
, /* ### TODO implement getStarters for all Unicode encodings?! */
1347 ucnv_getNonSurrogateUnicodeSet
1350 static const UConverterStaticData _UTF16StaticData
= {
1351 sizeof(UConverterStaticData
),
1353 1204, /* CCSID for BOM sensitive UTF-16 */
1354 UCNV_IBM
, UCNV_UTF16
, 2, 4,
1356 { 0xff, 0xfd, 0, 0 }, 2,
1358 { 0xfd, 0xff, 0, 0 }, 2,
1363 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1366 const UConverterSharedData _UTF16Data
= {
1367 sizeof(UConverterSharedData
), ~((uint32_t) 0),
1368 NULL
, NULL
, &_UTF16StaticData
, FALSE
, &_UTF16Impl
,