]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/ucnv_u16.cpp
ICU-64260.0.1.tar.gz
[apple/icu.git] / icuSources / common / ucnv_u16.cpp
CommitLineData
f3c0d7a5
A
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (C) 2002-2015, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   file name:  ucnv_u16.cpp
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002jul01
*   created by: Markus W. Scherer
*
*   UTF-16 converter implementation. Used to be in ucnv_utf.c.
*/
18
19#include "unicode/utypes.h"
374ca955
A
20
21#if !UCONFIG_NO_CONVERSION
22
b75a7d8f 23#include "unicode/ucnv.h"
f3c0d7a5 24#include "unicode/uversion.h"
b75a7d8f
A
25#include "ucnv_bld.h"
26#include "ucnv_cnv.h"
27#include "cmemory.h"
28
73c04bcf
A
/*
 * Value stored in UConverter.fromUnicodeStatus while a byte order mark
 * still has to be written before the first converted output.
 */
enum {
    UCNV_NEED_TO_WRITE_BOM=1
};
32
f3c0d7a5 33U_CDECL_BEGIN
729e4ab9
A
34/*
35 * The UTF-16 toUnicode implementation is also used for the Java-specific
36 * "with BOM" variants of UTF-16BE and UTF-16LE.
37 */
f3c0d7a5 38static void U_CALLCONV
729e4ab9
A
39_UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
40 UErrorCode *pErrorCode);
41
374ca955
A
42/* UTF-16BE ----------------------------------------------------------------- */
43
/* Alias the platform-endian fromUnicode function to the BE or LE implementation. */
#if U_IS_BIG_ENDIAN
#   define _UTF16PEFromUnicodeWithOffsets   _UTF16BEFromUnicodeWithOffsets
#else
#   define _UTF16PEFromUnicodeWithOffsets   _UTF16LEFromUnicodeWithOffsets
#endif
b75a7d8f 49
73c04bcf 50
f3c0d7a5 51static void U_CALLCONV
374ca955
A
52_UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
53 UErrorCode *pErrorCode) {
54 UConverter *cnv;
55 const UChar *source;
73c04bcf 56 char *target;
374ca955
A
57 int32_t *offsets;
58
73c04bcf 59 uint32_t targetCapacity, length, sourceIndex;
374ca955
A
60 UChar c, trail;
61 char overflow[4];
62
63 source=pArgs->source;
73c04bcf 64 length=(int32_t)(pArgs->sourceLimit-source);
374ca955 65 if(length<=0) {
b75a7d8f
A
66 /* no input, nothing to do */
67 return;
68 }
69
73c04bcf
A
70 cnv=pArgs->converter;
71
72 /* write the BOM if necessary */
73 if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
3d1f044b 74 static const char bom[]={ (char)0xfeu, (char)0xffu };
73c04bcf
A
75 ucnv_fromUWriteBytes(cnv,
76 bom, 2,
77 &pArgs->target, pArgs->targetLimit,
78 &pArgs->offsets, -1,
79 pErrorCode);
80 cnv->fromUnicodeStatus=0;
81 }
82
83 target=pArgs->target;
84 if(target >= pArgs->targetLimit) {
b75a7d8f
A
85 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
86 return;
87 }
88
73c04bcf 89 targetCapacity=(uint32_t)(pArgs->targetLimit-target);
374ca955
A
90 offsets=pArgs->offsets;
91 sourceIndex=0;
92
93 /* c!=0 indicates in several places outside the main loops that a surrogate was found */
94
95 if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
96 /* the last buffer ended with a lead surrogate, output the surrogate pair */
97 ++source;
b75a7d8f 98 --length;
374ca955
A
99 target[0]=(uint8_t)(c>>8);
100 target[1]=(uint8_t)c;
101 target[2]=(uint8_t)(trail>>8);
102 target[3]=(uint8_t)trail;
103 target+=4;
104 targetCapacity-=4;
105 if(offsets!=NULL) {
106 *offsets++=-1;
107 *offsets++=-1;
108 *offsets++=-1;
109 *offsets++=-1;
b75a7d8f 110 }
374ca955
A
111 sourceIndex=1;
112 cnv->fromUChar32=c=0;
b75a7d8f
A
113 }
114
374ca955 115 if(c==0) {
73c04bcf
A
116 /* copy an even number of bytes for complete UChars */
117 uint32_t count=2*length;
118 if(count>targetCapacity) {
119 count=targetCapacity&~1;
120 }
121 /* count is even */
374ca955
A
122 targetCapacity-=count;
123 count>>=1;
124 length-=count;
125
126 if(offsets==NULL) {
127 while(count>0) {
128 c=*source++;
129 if(U16_IS_SINGLE(c)) {
130 target[0]=(uint8_t)(c>>8);
131 target[1]=(uint8_t)c;
132 target+=2;
133 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
134 ++source;
135 --count;
136 target[0]=(uint8_t)(c>>8);
137 target[1]=(uint8_t)c;
138 target[2]=(uint8_t)(trail>>8);
139 target[3]=(uint8_t)trail;
140 target+=4;
141 } else {
142 break;
143 }
144 --count;
145 }
146 } else {
147 while(count>0) {
148 c=*source++;
149 if(U16_IS_SINGLE(c)) {
150 target[0]=(uint8_t)(c>>8);
151 target[1]=(uint8_t)c;
152 target+=2;
153 *offsets++=sourceIndex;
154 *offsets++=sourceIndex++;
155 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
156 ++source;
157 --count;
158 target[0]=(uint8_t)(c>>8);
159 target[1]=(uint8_t)c;
160 target[2]=(uint8_t)(trail>>8);
161 target[3]=(uint8_t)trail;
162 target+=4;
163 *offsets++=sourceIndex;
164 *offsets++=sourceIndex;
165 *offsets++=sourceIndex;
166 *offsets++=sourceIndex;
167 sourceIndex+=2;
168 } else {
169 break;
170 }
b75a7d8f
A
171 --count;
172 }
173 }
b75a7d8f 174
374ca955
A
175 if(count==0) {
176 /* done with the loop for complete UChars */
177 if(length>0 && targetCapacity>0) {
178 /*
179 * there is more input and some target capacity -
180 * it must be targetCapacity==1 because otherwise
181 * the above would have copied more;
182 * prepare for overflow output
183 */
184 if(U16_IS_SINGLE(c=*source++)) {
185 overflow[0]=(char)(c>>8);
186 overflow[1]=(char)c;
187 length=2; /* 2 bytes to output */
188 c=0;
189 /* } else { keep c for surrogate handling, length will be set there */
190 }
191 } else {
192 length=0;
193 c=0;
194 }
b75a7d8f 195 } else {
374ca955
A
196 /* keep c for surrogate handling, length will be set there */
197 targetCapacity+=2*count;
b75a7d8f 198 }
374ca955
A
199 } else {
200 length=0; /* from here on, length counts the bytes in overflow[] */
b75a7d8f 201 }
374ca955
A
202
203 if(c!=0) {
204 /*
205 * c is a surrogate, and
206 * - source or target too short
207 * - or the surrogate is unmatched
208 */
209 length=0;
210 if(U16_IS_SURROGATE_LEAD(c)) {
211 if(source<pArgs->sourceLimit) {
212 if(U16_IS_TRAIL(trail=*source)) {
213 /* output the surrogate pair, will overflow (see conditions comment above) */
214 ++source;
215 overflow[0]=(char)(c>>8);
216 overflow[1]=(char)c;
217 overflow[2]=(char)(trail>>8);
218 overflow[3]=(char)trail;
219 length=4; /* 4 bytes to output */
220 c=0;
221 } else {
222 /* unmatched lead surrogate */
223 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
224 }
225 } else {
226 /* see if the trail surrogate is in the next buffer */
227 }
228 } else {
229 /* unmatched trail surrogate */
230 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
b75a7d8f 231 }
374ca955 232 cnv->fromUChar32=c;
b75a7d8f
A
233 }
234
374ca955
A
235 if(length>0) {
236 /* output length bytes with overflow (length>targetCapacity>0) */
237 ucnv_fromUWriteBytes(cnv,
238 overflow, length,
239 (char **)&target, pArgs->targetLimit,
240 &offsets, sourceIndex,
241 pErrorCode);
73c04bcf 242 targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
b75a7d8f
A
243 }
244
374ca955
A
245 if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
246 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
b75a7d8f
A
247 }
248
249 /* write back the updated pointers */
374ca955
A
250 pArgs->source=source;
251 pArgs->target=(char *)target;
252 pArgs->offsets=offsets;
b75a7d8f
A
253}
254
f3c0d7a5 255static void U_CALLCONV
374ca955 256_UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
b75a7d8f 257 UErrorCode *pErrorCode) {
374ca955
A
258 UConverter *cnv;
259 const uint8_t *source;
260 UChar *target;
261 int32_t *offsets;
262
73c04bcf 263 uint32_t targetCapacity, length, count, sourceIndex;
374ca955
A
264 UChar c, trail;
265
729e4ab9
A
266 if(pArgs->converter->mode<8) {
267 _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
268 return;
269 }
270
374ca955
A
271 cnv=pArgs->converter;
272 source=(const uint8_t *)pArgs->source;
73c04bcf 273 length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
374ca955 274 if(length<=0 && cnv->toUnicodeStatus==0) {
b75a7d8f
A
275 /* no input, nothing to do */
276 return;
277 }
278
73c04bcf
A
279 target=pArgs->target;
280 if(target >= pArgs->targetLimit) {
374ca955 281 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
b75a7d8f
A
282 return;
283 }
284
73c04bcf 285 targetCapacity=(uint32_t)(pArgs->targetLimit-target);
374ca955
A
286 offsets=pArgs->offsets;
287 sourceIndex=0;
288 c=0;
289
290 /* complete a partial UChar or pair from the last call */
291 if(cnv->toUnicodeStatus!=0) {
b75a7d8f 292 /*
374ca955
A
293 * special case: single byte from a previous buffer,
294 * where the byte turned out not to belong to a trail surrogate
295 * and the preceding, unmatched lead surrogate was put into toUBytes[]
296 * for error handling
b75a7d8f 297 */
374ca955
A
298 cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
299 cnv->toULength=1;
300 cnv->toUnicodeStatus=0;
b75a7d8f 301 }
374ca955
A
302 if((count=cnv->toULength)!=0) {
303 uint8_t *p=cnv->toUBytes;
304 do {
305 p[count++]=*source++;
306 ++sourceIndex;
307 --length;
308 if(count==2) {
309 c=((UChar)p[0]<<8)|p[1];
310 if(U16_IS_SINGLE(c)) {
311 /* output the BMP code point */
312 *target++=c;
313 if(offsets!=NULL) {
314 *offsets++=-1;
315 }
316 --targetCapacity;
317 count=0;
318 c=0;
319 break;
320 } else if(U16_IS_SURROGATE_LEAD(c)) {
321 /* continue collecting bytes for the trail surrogate */
322 c=0; /* avoid unnecessary surrogate handling below */
323 } else {
324 /* fall through to error handling for an unmatched trail surrogate */
325 break;
326 }
327 } else if(count==4) {
328 c=((UChar)p[0]<<8)|p[1];
329 trail=((UChar)p[2]<<8)|p[3];
330 if(U16_IS_TRAIL(trail)) {
331 /* output the surrogate pair */
332 *target++=c;
333 if(targetCapacity>=2) {
334 *target++=trail;
335 if(offsets!=NULL) {
336 *offsets++=-1;
337 *offsets++=-1;
338 }
339 targetCapacity-=2;
340 } else /* targetCapacity==1 */ {
341 targetCapacity=0;
342 cnv->UCharErrorBuffer[0]=trail;
343 cnv->UCharErrorBufferLength=1;
344 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
345 }
346 count=0;
347 c=0;
348 break;
349 } else {
350 /* unmatched lead surrogate, handle here for consistent toUBytes[] */
351 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
352
353 /* back out reading the code unit after it */
354 if(((const uint8_t *)pArgs->source-source)>=2) {
355 source-=2;
356 } else {
357 /*
358 * if the trail unit's first byte was in a previous buffer, then
359 * we need to put it into a special place because toUBytes[] will be
360 * used for the lead unit's bytes
361 */
362 cnv->toUnicodeStatus=0x100|p[2];
363 --source;
364 }
365 cnv->toULength=2;
366
367 /* write back the updated pointers */
368 pArgs->source=(const char *)source;
369 pArgs->target=target;
370 pArgs->offsets=offsets;
371 return;
372 }
b75a7d8f 373 }
374ca955
A
374 } while(length>0);
375 cnv->toULength=(int8_t)count;
b75a7d8f
A
376 }
377
374ca955
A
378 /* copy an even number of bytes for complete UChars */
379 count=2*targetCapacity;
380 if(count>length) {
381 count=length&~1;
382 }
383 if(c==0 && count>0) {
384 length-=count;
385 count>>=1;
386 targetCapacity-=count;
387 if(offsets==NULL) {
388 do {
389 c=((UChar)source[0]<<8)|source[1];
390 source+=2;
391 if(U16_IS_SINGLE(c)) {
392 *target++=c;
393 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
394 U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
395 ) {
396 source+=2;
397 --count;
398 *target++=c;
399 *target++=trail;
400 } else {
401 break;
402 }
403 } while(--count>0);
b75a7d8f 404 } else {
374ca955
A
405 do {
406 c=((UChar)source[0]<<8)|source[1];
407 source+=2;
408 if(U16_IS_SINGLE(c)) {
409 *target++=c;
410 *offsets++=sourceIndex;
411 sourceIndex+=2;
412 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
413 U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
414 ) {
415 source+=2;
416 --count;
417 *target++=c;
418 *target++=trail;
419 *offsets++=sourceIndex;
420 *offsets++=sourceIndex;
421 sourceIndex+=4;
422 } else {
423 break;
424 }
425 } while(--count>0);
b75a7d8f 426 }
b75a7d8f 427
374ca955
A
428 if(count==0) {
429 /* done with the loop for complete UChars */
430 c=0;
431 } else {
432 /* keep c for surrogate handling, trail will be set there */
433 length+=2*(count-1); /* one more byte pair was consumed than count decremented */
434 targetCapacity+=count;
b75a7d8f
A
435 }
436 }
437
374ca955
A
438 if(c!=0) {
439 /*
440 * c is a surrogate, and
441 * - source or target too short
442 * - or the surrogate is unmatched
443 */
444 cnv->toUBytes[0]=(uint8_t)(c>>8);
445 cnv->toUBytes[1]=(uint8_t)c;
446 cnv->toULength=2;
447
448 if(U16_IS_SURROGATE_LEAD(c)) {
449 if(length>=2) {
450 if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])) {
451 /* output the surrogate pair, will overflow (see conditions comment above) */
452 source+=2;
453 length-=2;
454 *target++=c;
455 if(offsets!=NULL) {
456 *offsets++=sourceIndex;
457 }
458 cnv->UCharErrorBuffer[0]=trail;
459 cnv->UCharErrorBufferLength=1;
460 cnv->toULength=0;
461 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
462 } else {
463 /* unmatched lead surrogate */
464 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
465 }
466 } else {
467 /* see if the trail surrogate is in the next buffer */
b75a7d8f
A
468 }
469 } else {
374ca955
A
470 /* unmatched trail surrogate */
471 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
b75a7d8f 472 }
b75a7d8f
A
473 }
474
374ca955
A
475 if(U_SUCCESS(*pErrorCode)) {
476 /* check for a remaining source byte */
477 if(length>0) {
478 if(targetCapacity==0) {
479 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
480 } else {
481 /* it must be length==1 because otherwise the above would have copied more */
482 cnv->toUBytes[cnv->toULength++]=*source++;
b75a7d8f
A
483 }
484 }
485 }
486
487 /* write back the updated pointers */
374ca955
A
488 pArgs->source=(const char *)source;
489 pArgs->target=target;
490 pArgs->offsets=offsets;
b75a7d8f
A
491}
492
f3c0d7a5 493static UChar32 U_CALLCONV
374ca955
A
494_UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
495 const uint8_t *s, *sourceLimit;
496 UChar32 c;
b75a7d8f 497
729e4ab9
A
498 if(pArgs->converter->mode<8) {
499 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
500 }
501
374ca955
A
502 s=(const uint8_t *)pArgs->source;
503 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
b75a7d8f 504
374ca955
A
505 if(s>=sourceLimit) {
506 /* no input */
507 *err=U_INDEX_OUTOFBOUNDS_ERROR;
b75a7d8f
A
508 return 0xffff;
509 }
510
374ca955
A
511 if(s+2>sourceLimit) {
512 /* only one byte: truncated UChar */
513 pArgs->converter->toUBytes[0]=*s++;
514 pArgs->converter->toULength=1;
515 pArgs->source=(const char *)s;
516 *err = U_TRUNCATED_CHAR_FOUND;
517 return 0xffff;
518 }
b75a7d8f 519
374ca955
A
520 /* get one UChar */
521 c=((UChar32)*s<<8)|s[1];
522 s+=2;
523
524 /* check for a surrogate pair */
525 if(U_IS_SURROGATE(c)) {
526 if(U16_IS_SURROGATE_LEAD(c)) {
527 if(s+2<=sourceLimit) {
528 UChar trail;
529
530 /* get a second UChar and see if it is a trail surrogate */
531 trail=((UChar)*s<<8)|s[1];
532 if(U16_IS_TRAIL(trail)) {
533 c=U16_GET_SUPPLEMENTARY(c, trail);
534 s+=2;
535 } else {
536 /* unmatched lead surrogate */
537 c=-2;
538 }
539 } else {
540 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
541 uint8_t *bytes=pArgs->converter->toUBytes;
542 s-=2;
543 pArgs->converter->toULength=(int8_t)(sourceLimit-s);
544 do {
545 *bytes++=*s++;
546 } while(s<sourceLimit);
547
548 c=0xffff;
549 *err=U_TRUNCATED_CHAR_FOUND;
550 }
551 } else {
552 /* unmatched trail surrogate */
553 c=-2;
b75a7d8f
A
554 }
555
374ca955
A
556 if(c<0) {
557 /* write the unmatched surrogate */
558 uint8_t *bytes=pArgs->converter->toUBytes;
559 pArgs->converter->toULength=2;
560 *bytes=*(s-2);
561 bytes[1]=*(s-1);
b75a7d8f 562
374ca955
A
563 c=0xffff;
564 *err=U_ILLEGAL_CHAR_FOUND;
b75a7d8f
A
565 }
566 }
567
374ca955
A
568 pArgs->source=(const char *)s;
569 return c;
b75a7d8f
A
570}
571
f3c0d7a5 572static void U_CALLCONV
729e4ab9
A
573_UTF16BEReset(UConverter *cnv, UConverterResetChoice choice) {
574 if(choice<=UCNV_RESET_TO_UNICODE) {
575 /* reset toUnicode state */
576 if(UCNV_GET_VERSION(cnv)==0) {
577 cnv->mode=8; /* no BOM handling */
578 } else {
579 cnv->mode=0; /* Java-specific "UnicodeBig" requires BE BOM or no BOM */
580 }
581 }
582 if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {
583 /* reset fromUnicode for "UnicodeBig": prepare to output the UTF-16BE BOM */
584 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
585 }
586}
587
f3c0d7a5 588static void U_CALLCONV
729e4ab9
A
589_UTF16BEOpen(UConverter *cnv,
590 UConverterLoadArgs *pArgs,
591 UErrorCode *pErrorCode) {
f3c0d7a5 592 (void)pArgs;
729e4ab9
A
593 if(UCNV_GET_VERSION(cnv)<=1) {
594 _UTF16BEReset(cnv, UCNV_RESET_BOTH);
595 } else {
596 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
597 }
598}
599
f3c0d7a5 600static const char * U_CALLCONV
729e4ab9
A
601_UTF16BEGetName(const UConverter *cnv) {
602 if(UCNV_GET_VERSION(cnv)==0) {
603 return "UTF-16BE";
604 } else {
605 return "UTF-16BE,version=1";
606 }
607}
f3c0d7a5 608U_CDECL_END
729e4ab9 609
b75a7d8f
A
610static const UConverterImpl _UTF16BEImpl={
611 UCNV_UTF16_BigEndian,
612
613 NULL,
614 NULL,
615
729e4ab9 616 _UTF16BEOpen,
b75a7d8f 617 NULL,
729e4ab9 618 _UTF16BEReset,
b75a7d8f
A
619
620 _UTF16BEToUnicodeWithOffsets,
621 _UTF16BEToUnicodeWithOffsets,
622 _UTF16BEFromUnicodeWithOffsets,
623 _UTF16BEFromUnicodeWithOffsets,
374ca955 624 _UTF16BEGetNextUChar,
b75a7d8f
A
625
626 NULL,
729e4ab9 627 _UTF16BEGetName,
b75a7d8f
A
628 NULL,
629 NULL,
f3c0d7a5
A
630 ucnv_getNonSurrogateUnicodeSet,
631
632 NULL,
633 NULL
b75a7d8f
A
634};
635
b75a7d8f
A
636static const UConverterStaticData _UTF16BEStaticData={
637 sizeof(UConverterStaticData),
638 "UTF-16BE",
729e4ab9 639 1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2,
b75a7d8f
A
640 { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE,
641 0,
642 0,
643 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
644};
645
646
2ca993e8
A
647const UConverterSharedData _UTF16BEData=
648 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16BEStaticData, &_UTF16BEImpl);
b75a7d8f
A
649
650/* UTF-16LE ----------------------------------------------------------------- */
f3c0d7a5
A
651U_CDECL_BEGIN
652static void U_CALLCONV
374ca955
A
653_UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
654 UErrorCode *pErrorCode) {
655 UConverter *cnv;
656 const UChar *source;
73c04bcf 657 char *target;
374ca955
A
658 int32_t *offsets;
659
73c04bcf 660 uint32_t targetCapacity, length, sourceIndex;
374ca955
A
661 UChar c, trail;
662 char overflow[4];
663
664 source=pArgs->source;
73c04bcf 665 length=(int32_t)(pArgs->sourceLimit-source);
374ca955
A
666 if(length<=0) {
667 /* no input, nothing to do */
668 return;
669 }
670
73c04bcf
A
671 cnv=pArgs->converter;
672
673 /* write the BOM if necessary */
674 if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
3d1f044b 675 static const char bom[]={ (char)0xffu, (char)0xfeu };
73c04bcf
A
676 ucnv_fromUWriteBytes(cnv,
677 bom, 2,
678 &pArgs->target, pArgs->targetLimit,
679 &pArgs->offsets, -1,
680 pErrorCode);
681 cnv->fromUnicodeStatus=0;
682 }
683
684 target=pArgs->target;
685 if(target >= pArgs->targetLimit) {
374ca955
A
686 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
687 return;
688 }
689
73c04bcf 690 targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
374ca955
A
691 offsets=pArgs->offsets;
692 sourceIndex=0;
693
694 /* c!=0 indicates in several places outside the main loops that a surrogate was found */
695
696 if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
697 /* the last buffer ended with a lead surrogate, output the surrogate pair */
698 ++source;
699 --length;
700 target[0]=(uint8_t)c;
701 target[1]=(uint8_t)(c>>8);
702 target[2]=(uint8_t)trail;
703 target[3]=(uint8_t)(trail>>8);
704 target+=4;
705 targetCapacity-=4;
706 if(offsets!=NULL) {
707 *offsets++=-1;
708 *offsets++=-1;
709 *offsets++=-1;
710 *offsets++=-1;
b75a7d8f 711 }
374ca955
A
712 sourceIndex=1;
713 cnv->fromUChar32=c=0;
714 }
715
374ca955 716 if(c==0) {
73c04bcf
A
717 /* copy an even number of bytes for complete UChars */
718 uint32_t count=2*length;
719 if(count>targetCapacity) {
720 count=targetCapacity&~1;
721 }
722 /* count is even */
374ca955
A
723 targetCapacity-=count;
724 count>>=1;
725 length-=count;
726
727 if(offsets==NULL) {
728 while(count>0) {
729 c=*source++;
730 if(U16_IS_SINGLE(c)) {
731 target[0]=(uint8_t)c;
732 target[1]=(uint8_t)(c>>8);
733 target+=2;
734 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
735 ++source;
736 --count;
737 target[0]=(uint8_t)c;
738 target[1]=(uint8_t)(c>>8);
739 target[2]=(uint8_t)trail;
740 target[3]=(uint8_t)(trail>>8);
741 target+=4;
742 } else {
743 break;
744 }
745 --count;
746 }
747 } else {
748 while(count>0) {
749 c=*source++;
750 if(U16_IS_SINGLE(c)) {
751 target[0]=(uint8_t)c;
752 target[1]=(uint8_t)(c>>8);
753 target+=2;
754 *offsets++=sourceIndex;
755 *offsets++=sourceIndex++;
756 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
757 ++source;
758 --count;
759 target[0]=(uint8_t)c;
760 target[1]=(uint8_t)(c>>8);
761 target[2]=(uint8_t)trail;
762 target[3]=(uint8_t)(trail>>8);
763 target+=4;
764 *offsets++=sourceIndex;
765 *offsets++=sourceIndex;
766 *offsets++=sourceIndex;
767 *offsets++=sourceIndex;
768 sourceIndex+=2;
769 } else {
770 break;
771 }
772 --count;
773 }
b75a7d8f
A
774 }
775
374ca955
A
776 if(count==0) {
777 /* done with the loop for complete UChars */
778 if(length>0 && targetCapacity>0) {
779 /*
780 * there is more input and some target capacity -
781 * it must be targetCapacity==1 because otherwise
782 * the above would have copied more;
783 * prepare for overflow output
784 */
785 if(U16_IS_SINGLE(c=*source++)) {
786 overflow[0]=(char)c;
787 overflow[1]=(char)(c>>8);
788 length=2; /* 2 bytes to output */
789 c=0;
790 /* } else { keep c for surrogate handling, length will be set there */
791 }
792 } else {
793 length=0;
794 c=0;
795 }
796 } else {
797 /* keep c for surrogate handling, length will be set there */
798 targetCapacity+=2*count;
799 }
800 } else {
801 length=0; /* from here on, length counts the bytes in overflow[] */
802 }
803
804 if(c!=0) {
805 /*
806 * c is a surrogate, and
807 * - source or target too short
808 * - or the surrogate is unmatched
809 */
810 length=0;
811 if(U16_IS_SURROGATE_LEAD(c)) {
812 if(source<pArgs->sourceLimit) {
813 if(U16_IS_TRAIL(trail=*source)) {
814 /* output the surrogate pair, will overflow (see conditions comment above) */
815 ++source;
816 overflow[0]=(char)c;
817 overflow[1]=(char)(c>>8);
818 overflow[2]=(char)trail;
819 overflow[3]=(char)(trail>>8);
820 length=4; /* 4 bytes to output */
821 c=0;
822 } else {
823 /* unmatched lead surrogate */
824 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
825 }
826 } else {
827 /* see if the trail surrogate is in the next buffer */
828 }
829 } else {
830 /* unmatched trail surrogate */
831 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
832 }
833 cnv->fromUChar32=c;
b75a7d8f
A
834 }
835
374ca955
A
836 if(length>0) {
837 /* output length bytes with overflow (length>targetCapacity>0) */
838 ucnv_fromUWriteBytes(cnv,
839 overflow, length,
73c04bcf 840 &target, pArgs->targetLimit,
374ca955
A
841 &offsets, sourceIndex,
842 pErrorCode);
73c04bcf 843 targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
374ca955 844 }
b75a7d8f 845
374ca955
A
846 if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
847 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
848 }
849
850 /* write back the updated pointers */
851 pArgs->source=source;
73c04bcf 852 pArgs->target=target;
374ca955
A
853 pArgs->offsets=offsets;
854}
b75a7d8f 855
f3c0d7a5 856static void U_CALLCONV
374ca955
A
857_UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
858 UErrorCode *pErrorCode) {
859 UConverter *cnv;
860 const uint8_t *source;
861 UChar *target;
862 int32_t *offsets;
863
73c04bcf 864 uint32_t targetCapacity, length, count, sourceIndex;
374ca955
A
865 UChar c, trail;
866
729e4ab9
A
867 if(pArgs->converter->mode<8) {
868 _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
869 return;
870 }
871
374ca955
A
872 cnv=pArgs->converter;
873 source=(const uint8_t *)pArgs->source;
73c04bcf 874 length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
374ca955
A
875 if(length<=0 && cnv->toUnicodeStatus==0) {
876 /* no input, nothing to do */
877 return;
878 }
879
73c04bcf
A
880 target=pArgs->target;
881 if(target >= pArgs->targetLimit) {
374ca955
A
882 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
883 return;
884 }
885
73c04bcf 886 targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
374ca955
A
887 offsets=pArgs->offsets;
888 sourceIndex=0;
889 c=0;
890
891 /* complete a partial UChar or pair from the last call */
892 if(cnv->toUnicodeStatus!=0) {
893 /*
894 * special case: single byte from a previous buffer,
895 * where the byte turned out not to belong to a trail surrogate
896 * and the preceding, unmatched lead surrogate was put into toUBytes[]
897 * for error handling
898 */
899 cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
900 cnv->toULength=1;
901 cnv->toUnicodeStatus=0;
902 }
903 if((count=cnv->toULength)!=0) {
904 uint8_t *p=cnv->toUBytes;
905 do {
906 p[count++]=*source++;
907 ++sourceIndex;
908 --length;
909 if(count==2) {
910 c=((UChar)p[1]<<8)|p[0];
911 if(U16_IS_SINGLE(c)) {
912 /* output the BMP code point */
913 *target++=c;
914 if(offsets!=NULL) {
915 *offsets++=-1;
916 }
917 --targetCapacity;
918 count=0;
919 c=0;
920 break;
921 } else if(U16_IS_SURROGATE_LEAD(c)) {
922 /* continue collecting bytes for the trail surrogate */
923 c=0; /* avoid unnecessary surrogate handling below */
924 } else {
925 /* fall through to error handling for an unmatched trail surrogate */
926 break;
927 }
928 } else if(count==4) {
929 c=((UChar)p[1]<<8)|p[0];
930 trail=((UChar)p[3]<<8)|p[2];
931 if(U16_IS_TRAIL(trail)) {
932 /* output the surrogate pair */
933 *target++=c;
934 if(targetCapacity>=2) {
935 *target++=trail;
936 if(offsets!=NULL) {
937 *offsets++=-1;
938 *offsets++=-1;
939 }
940 targetCapacity-=2;
941 } else /* targetCapacity==1 */ {
942 targetCapacity=0;
943 cnv->UCharErrorBuffer[0]=trail;
944 cnv->UCharErrorBufferLength=1;
945 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
946 }
947 count=0;
948 c=0;
949 break;
950 } else {
951 /* unmatched lead surrogate, handle here for consistent toUBytes[] */
952 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
953
954 /* back out reading the code unit after it */
955 if(((const uint8_t *)pArgs->source-source)>=2) {
956 source-=2;
957 } else {
958 /*
959 * if the trail unit's first byte was in a previous buffer, then
960 * we need to put it into a special place because toUBytes[] will be
961 * used for the lead unit's bytes
962 */
963 cnv->toUnicodeStatus=0x100|p[2];
964 --source;
965 }
966 cnv->toULength=2;
967
968 /* write back the updated pointers */
969 pArgs->source=(const char *)source;
970 pArgs->target=target;
971 pArgs->offsets=offsets;
972 return;
973 }
974 }
975 } while(length>0);
976 cnv->toULength=(int8_t)count;
977 }
978
979 /* copy an even number of bytes for complete UChars */
980 count=2*targetCapacity;
981 if(count>length) {
982 count=length&~1;
983 }
984 if(c==0 && count>0) {
985 length-=count;
986 count>>=1;
987 targetCapacity-=count;
988 if(offsets==NULL) {
989 do {
990 c=((UChar)source[1]<<8)|source[0];
991 source+=2;
992 if(U16_IS_SINGLE(c)) {
993 *target++=c;
994 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
995 U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
996 ) {
997 source+=2;
998 --count;
999 *target++=c;
1000 *target++=trail;
1001 } else {
1002 break;
1003 }
1004 } while(--count>0);
1005 } else {
1006 do {
1007 c=((UChar)source[1]<<8)|source[0];
1008 source+=2;
1009 if(U16_IS_SINGLE(c)) {
1010 *target++=c;
1011 *offsets++=sourceIndex;
1012 sourceIndex+=2;
1013 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
1014 U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
1015 ) {
1016 source+=2;
1017 --count;
1018 *target++=c;
1019 *target++=trail;
1020 *offsets++=sourceIndex;
1021 *offsets++=sourceIndex;
1022 sourceIndex+=4;
1023 } else {
1024 break;
1025 }
1026 } while(--count>0);
b75a7d8f
A
1027 }
1028
374ca955
A
1029 if(count==0) {
1030 /* done with the loop for complete UChars */
1031 c=0;
1032 } else {
1033 /* keep c for surrogate handling, trail will be set there */
1034 length+=2*(count-1); /* one more byte pair was consumed than count decremented */
1035 targetCapacity+=count;
1036 }
1037 }
b75a7d8f 1038
374ca955
A
1039 if(c!=0) {
1040 /*
1041 * c is a surrogate, and
1042 * - source or target too short
1043 * - or the surrogate is unmatched
1044 */
1045 cnv->toUBytes[0]=(uint8_t)c;
1046 cnv->toUBytes[1]=(uint8_t)(c>>8);
1047 cnv->toULength=2;
1048
1049 if(U16_IS_SURROGATE_LEAD(c)) {
1050 if(length>=2) {
1051 if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])) {
1052 /* output the surrogate pair, will overflow (see conditions comment above) */
1053 source+=2;
1054 length-=2;
1055 *target++=c;
1056 if(offsets!=NULL) {
1057 *offsets++=sourceIndex;
1058 }
1059 cnv->UCharErrorBuffer[0]=trail;
1060 cnv->UCharErrorBufferLength=1;
1061 cnv->toULength=0;
1062 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1063 } else {
1064 /* unmatched lead surrogate */
1065 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1066 }
1067 } else {
1068 /* see if the trail surrogate is in the next buffer */
1069 }
1070 } else {
1071 /* unmatched trail surrogate */
1072 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
b75a7d8f
A
1073 }
1074 }
1075
374ca955
A
1076 if(U_SUCCESS(*pErrorCode)) {
1077 /* check for a remaining source byte */
1078 if(length>0) {
1079 if(targetCapacity==0) {
1080 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1081 } else {
1082 /* it must be length==1 because otherwise the above would have copied more */
1083 cnv->toUBytes[cnv->toULength++]=*source++;
1084 }
1085 }
1086 }
1087
1088 /* write back the updated pointers */
1089 pArgs->source=(const char *)source;
1090 pArgs->target=target;
1091 pArgs->offsets=offsets;
1092}
1093
f3c0d7a5 1094static UChar32 U_CALLCONV
374ca955
A
1095_UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
1096 const uint8_t *s, *sourceLimit;
1097 UChar32 c;
1098
729e4ab9
A
1099 if(pArgs->converter->mode<8) {
1100 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1101 }
1102
374ca955
A
1103 s=(const uint8_t *)pArgs->source;
1104 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1105
1106 if(s>=sourceLimit) {
1107 /* no input */
1108 *err=U_INDEX_OUTOFBOUNDS_ERROR;
1109 return 0xffff;
1110 }
1111
1112 if(s+2>sourceLimit) {
1113 /* only one byte: truncated UChar */
1114 pArgs->converter->toUBytes[0]=*s++;
1115 pArgs->converter->toULength=1;
1116 pArgs->source=(const char *)s;
1117 *err = U_TRUNCATED_CHAR_FOUND;
1118 return 0xffff;
1119 }
1120
1121 /* get one UChar */
1122 c=((UChar32)s[1]<<8)|*s;
1123 s+=2;
1124
1125 /* check for a surrogate pair */
1126 if(U_IS_SURROGATE(c)) {
1127 if(U16_IS_SURROGATE_LEAD(c)) {
1128 if(s+2<=sourceLimit) {
1129 UChar trail;
1130
1131 /* get a second UChar and see if it is a trail surrogate */
1132 trail=((UChar)s[1]<<8)|*s;
1133 if(U16_IS_TRAIL(trail)) {
1134 c=U16_GET_SUPPLEMENTARY(c, trail);
1135 s+=2;
1136 } else {
1137 /* unmatched lead surrogate */
1138 c=-2;
1139 }
1140 } else {
1141 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
1142 uint8_t *bytes=pArgs->converter->toUBytes;
1143 s-=2;
1144 pArgs->converter->toULength=(int8_t)(sourceLimit-s);
1145 do {
1146 *bytes++=*s++;
1147 } while(s<sourceLimit);
1148
1149 c=0xffff;
1150 *err=U_TRUNCATED_CHAR_FOUND;
1151 }
1152 } else {
1153 /* unmatched trail surrogate */
1154 c=-2;
1155 }
1156
1157 if(c<0) {
1158 /* write the unmatched surrogate */
1159 uint8_t *bytes=pArgs->converter->toUBytes;
1160 pArgs->converter->toULength=2;
1161 *bytes=*(s-2);
1162 bytes[1]=*(s-1);
1163
1164 c=0xffff;
1165 *err=U_ILLEGAL_CHAR_FOUND;
1166 }
1167 }
1168
1169 pArgs->source=(const char *)s;
1170 return c;
b75a7d8f
A
1171}
1172
f3c0d7a5 1173static void U_CALLCONV
729e4ab9
A
1174_UTF16LEReset(UConverter *cnv, UConverterResetChoice choice) {
1175 if(choice<=UCNV_RESET_TO_UNICODE) {
1176 /* reset toUnicode state */
1177 if(UCNV_GET_VERSION(cnv)==0) {
1178 cnv->mode=8; /* no BOM handling */
1179 } else {
1180 cnv->mode=0; /* Java-specific "UnicodeLittle" requires LE BOM or no BOM */
1181 }
1182 }
1183 if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {
1184 /* reset fromUnicode for "UnicodeLittle": prepare to output the UTF-16LE BOM */
1185 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1186 }
1187}
1188
f3c0d7a5 1189static void U_CALLCONV
729e4ab9
A
1190_UTF16LEOpen(UConverter *cnv,
1191 UConverterLoadArgs *pArgs,
1192 UErrorCode *pErrorCode) {
f3c0d7a5 1193 (void)pArgs;
729e4ab9
A
1194 if(UCNV_GET_VERSION(cnv)<=1) {
1195 _UTF16LEReset(cnv, UCNV_RESET_BOTH);
1196 } else {
1197 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1198 }
1199}
1200
f3c0d7a5 1201static const char * U_CALLCONV
729e4ab9
A
1202_UTF16LEGetName(const UConverter *cnv) {
1203 if(UCNV_GET_VERSION(cnv)==0) {
1204 return "UTF-16LE";
1205 } else {
1206 return "UTF-16LE,version=1";
1207 }
1208}
f3c0d7a5 1209U_CDECL_END
729e4ab9 1210
b75a7d8f
A
1211static const UConverterImpl _UTF16LEImpl={
1212 UCNV_UTF16_LittleEndian,
1213
1214 NULL,
1215 NULL,
1216
729e4ab9 1217 _UTF16LEOpen,
b75a7d8f 1218 NULL,
729e4ab9 1219 _UTF16LEReset,
b75a7d8f
A
1220
1221 _UTF16LEToUnicodeWithOffsets,
1222 _UTF16LEToUnicodeWithOffsets,
1223 _UTF16LEFromUnicodeWithOffsets,
1224 _UTF16LEFromUnicodeWithOffsets,
374ca955 1225 _UTF16LEGetNextUChar,
b75a7d8f
A
1226
1227 NULL,
729e4ab9 1228 _UTF16LEGetName,
b75a7d8f
A
1229 NULL,
1230 NULL,
f3c0d7a5
A
1231 ucnv_getNonSurrogateUnicodeSet,
1232
1233 NULL,
1234 NULL
b75a7d8f
A
1235};
1236
1237
b75a7d8f
A
1238static const UConverterStaticData _UTF16LEStaticData={
1239 sizeof(UConverterStaticData),
1240 "UTF-16LE",
729e4ab9 1241 1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2,
b75a7d8f
A
1242 { 0xfd, 0xff, 0, 0 },2,FALSE,FALSE,
1243 0,
1244 0,
1245 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1246};
1247
1248
2ca993e8
A
1249const UConverterSharedData _UTF16LEData=
1250 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16LEStaticData, &_UTF16LEImpl);
b75a7d8f
A
1251
/* UTF-16 (Detect BOM) ------------------------------------------------------ */

/*
 * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE
 * accordingly.
 * This is a simpler version of the UTF-32 converter, with
 * fewer states for shorter BOMs.
 *
 * State values:
 * 0    initial state
 * 1    saw first byte
 * 2..5 (not used)
 * 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1
 * 8    UTF-16BE mode
 * 9    UTF-16LE mode
 *
 * During detection: state==number of initial bytes seen so far.
 *
 * On output, emit U+FEFF as the first code point.
 *
 * Variants:
 * - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error.
 * - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and
 *   UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error.
 */
f3c0d7a5
A
1277U_CDECL_BEGIN
1278static void U_CALLCONV
b75a7d8f
A
1279_UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {
1280 if(choice<=UCNV_RESET_TO_UNICODE) {
1281 /* reset toUnicode: state=0 */
1282 cnv->mode=0;
1283 }
1284 if(choice!=UCNV_RESET_TO_UNICODE) {
1285 /* reset fromUnicode: prepare to output the UTF-16PE BOM */
73c04bcf 1286 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
b75a7d8f
A
1287 }
1288}
f3c0d7a5
A
1289U_CDECL_END
1290extern const UConverterSharedData _UTF16v2Data;
1291U_CDECL_BEGIN
1292static void U_CALLCONV
b75a7d8f 1293_UTF16Open(UConverter *cnv,
729e4ab9 1294 UConverterLoadArgs *pArgs,
b75a7d8f 1295 UErrorCode *pErrorCode) {
729e4ab9
A
1296 if(UCNV_GET_VERSION(cnv)<=2) {
1297 if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) {
1298 /*
1299 * Switch implementation, and switch the staticData that's different
1300 * and was copied into the UConverter.
1301 * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.)
1302 * UTF-16,version=2 fromUnicode() always writes a big-endian byte stream.
1303 */
1304 cnv->sharedData=(UConverterSharedData*)&_UTF16v2Data;
1305 uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MAX_SUBCHAR_LEN);
1306 }
1307 _UTF16Reset(cnv, UCNV_RESET_BOTH);
1308 } else {
1309 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1310 }
b75a7d8f
A
1311}
1312
f3c0d7a5 1313static const char * U_CALLCONV
729e4ab9
A
1314_UTF16GetName(const UConverter *cnv) {
1315 if(UCNV_GET_VERSION(cnv)==0) {
1316 return "UTF-16";
1317 } else if(UCNV_GET_VERSION(cnv)==1) {
1318 return "UTF-16,version=1";
1319 } else {
1320 return "UTF-16,version=2";
1321 }
1322}
f3c0d7a5
A
1323U_CDECL_END
1324extern const UConverterSharedData _UTF16Data;
729e4ab9 1325
0f5d89e8
A
1326static inline bool IS_UTF16BE(const UConverter *cnv) {
1327 return ((cnv)->sharedData == &_UTF16BEData);
1328}
1329
1330static inline bool IS_UTF16LE(const UConverter *cnv) {
1331 return ((cnv)->sharedData == &_UTF16LEData);
1332}
1333
1334static inline bool IS_UTF16(const UConverter *cnv) {
1335 return ((cnv)->sharedData==&_UTF16Data) || ((cnv)->sharedData == &_UTF16v2Data);
1336}
b75a7d8f 1337
f3c0d7a5
A
1338U_CDECL_BEGIN
1339static void U_CALLCONV
b75a7d8f
A
1340_UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1341 UErrorCode *pErrorCode) {
1342 UConverter *cnv=pArgs->converter;
1343 const char *source=pArgs->source;
1344 const char *sourceLimit=pArgs->sourceLimit;
1345 int32_t *offsets=pArgs->offsets;
1346
1347 int32_t state, offsetDelta;
729e4ab9 1348 uint8_t b;
b75a7d8f
A
1349
1350 state=cnv->mode;
1351
1352 /*
1353 * If we detect a BOM in this buffer, then we must add the BOM size to the
1354 * offsets because the actual converter function will not see and count the BOM.
1355 * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1356 */
1357 offsetDelta=0;
1358
1359 while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
1360 switch(state) {
1361 case 0:
729e4ab9
A
1362 cnv->toUBytes[0]=(uint8_t)*source++;
1363 cnv->toULength=1;
1364 state=1;
b75a7d8f
A
1365 break;
1366 case 1:
729e4ab9
A
1367 /*
1368 * Only inside this switch case can the state variable
1369 * temporarily take two additional values:
1370 * 6: BOM error, continue with BE
1371 * 7: BOM error, continue with LE
1372 */
1373 b=*source;
1374 if(cnv->toUBytes[0]==0xfe && b==0xff) {
1375 if(IS_UTF16LE(cnv)) {
1376 state=7; /* illegal reverse BOM for Java "UnicodeLittle" */
1377 } else {
b75a7d8f 1378 state=8; /* detect UTF-16BE */
729e4ab9
A
1379 }
1380 } else if(cnv->toUBytes[0]==0xff && b==0xfe) {
1381 if(IS_UTF16BE(cnv)) {
1382 state=6; /* illegal reverse BOM for Java "UnicodeBig" */
1383 } else {
b75a7d8f 1384 state=9; /* detect UTF-16LE */
b75a7d8f 1385 }
729e4ab9
A
1386 } else if((IS_UTF16(cnv) && UCNV_GET_VERSION(cnv)==1)) {
1387 state=6; /* illegal missing BOM for Java "Unicode" */
1388 }
1389 if(state>=8) {
1390 /* BOM detected, consume it */
1391 ++source;
1392 cnv->toULength=0;
1393 offsetDelta=(int32_t)(source-pArgs->source);
1394 } else if(state<6) {
1395 /* ok: no BOM, and not a reverse BOM */
b75a7d8f 1396 if(source!=pArgs->source) {
729e4ab9 1397 /* reset the source for a correct first offset */
b75a7d8f 1398 source=pArgs->source;
729e4ab9
A
1399 cnv->toULength=0;
1400 }
1401 if(IS_UTF16LE(cnv)) {
1402 /* Make Java "UnicodeLittle" default to LE. */
1403 state=9;
b75a7d8f 1404 } else {
729e4ab9
A
1405 /* Make standard UTF-16 and Java "UnicodeBig" default to BE. */
1406 state=8;
b75a7d8f 1407 }
729e4ab9
A
1408 } else {
1409 /*
1410 * error: missing BOM, or reverse BOM
1411 * UTF-16,version=1: Java-specific "Unicode" requires a BOM.
1412 * UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM.
1413 * UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM.
1414 */
1415 /* report the non-BOM or reverse BOM as an illegal sequence */
1416 cnv->toUBytes[1]=b;
1417 cnv->toULength=2;
1418 pArgs->source=source+1;
1419 /* continue with conversion if the callback resets the error */
1420 /*
1421 * Make Java "Unicode" default to BE like standard UTF-16.
1422 * Make Java "UnicodeBig" and "UnicodeLittle" default
1423 * to their normal endiannesses.
1424 */
1425 cnv->mode=state+2;
1426 *pErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE;
1427 return;
b75a7d8f 1428 }
729e4ab9
A
1429 /* convert the rest of the stream */
1430 cnv->mode=state;
1431 continue;
b75a7d8f
A
1432 case 8:
1433 /* call UTF-16BE */
1434 pArgs->source=source;
1435 _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
1436 source=pArgs->source;
1437 break;
1438 case 9:
1439 /* call UTF-16LE */
1440 pArgs->source=source;
1441 _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
1442 source=pArgs->source;
1443 break;
1444 default:
1445 break; /* does not occur */
1446 }
1447 }
1448
1449 /* add BOM size to offsets - see comment at offsetDelta declaration */
1450 if(offsets!=NULL && offsetDelta!=0) {
1451 int32_t *offsetsLimit=pArgs->offsets;
1452 while(offsets<offsetsLimit) {
1453 *offsets++ += offsetDelta;
1454 }
1455 }
1456
1457 pArgs->source=source;
1458
1459 if(source==sourceLimit && pArgs->flush) {
1460 /* handle truncated input */
1461 switch(state) {
1462 case 0:
1463 break; /* no input at all, nothing to do */
1464 case 8:
1465 _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
1466 break;
1467 case 9:
1468 _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
1469 break;
1470 default:
729e4ab9 1471 /* 0<state<8: framework will report truncation, nothing to do here */
b75a7d8f
A
1472 break;
1473 }
b75a7d8f 1474 }
374ca955
A
1475
1476 cnv->mode=state;
b75a7d8f
A
1477}
1478
f3c0d7a5 1479static UChar32 U_CALLCONV
b75a7d8f
A
1480_UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs,
1481 UErrorCode *pErrorCode) {
1482 switch(pArgs->converter->mode) {
1483 case 8:
374ca955 1484 return _UTF16BEGetNextUChar(pArgs, pErrorCode);
b75a7d8f 1485 case 9:
374ca955 1486 return _UTF16LEGetNextUChar(pArgs, pErrorCode);
b75a7d8f 1487 default:
374ca955 1488 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
b75a7d8f
A
1489 }
1490}
f3c0d7a5 1491U_CDECL_END
b75a7d8f
A
1492
1493static const UConverterImpl _UTF16Impl = {
1494 UCNV_UTF16,
1495
1496 NULL,
1497 NULL,
1498
1499 _UTF16Open,
1500 NULL,
1501 _UTF16Reset,
1502
1503 _UTF16ToUnicodeWithOffsets,
1504 _UTF16ToUnicodeWithOffsets,
1505 _UTF16PEFromUnicodeWithOffsets,
1506 _UTF16PEFromUnicodeWithOffsets,
1507 _UTF16GetNextUChar,
1508
1509 NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
729e4ab9 1510 _UTF16GetName,
b75a7d8f
A
1511 NULL,
1512 NULL,
f3c0d7a5
A
1513 ucnv_getNonSurrogateUnicodeSet,
1514
1515 NULL,
1516 NULL
b75a7d8f
A
1517};
1518
1519static const UConverterStaticData _UTF16StaticData = {
1520 sizeof(UConverterStaticData),
1521 "UTF-16",
73c04bcf 1522 1204, /* CCSID for BOM sensitive UTF-16 */
729e4ab9 1523 UCNV_IBM, UCNV_UTF16, 2, 2,
b75a7d8f
A
1524#if U_IS_BIG_ENDIAN
1525 { 0xff, 0xfd, 0, 0 }, 2,
1526#else
1527 { 0xfd, 0xff, 0, 0 }, 2,
1528#endif
1529 FALSE, FALSE,
1530 0,
1531 0,
1532 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1533};
1534
2ca993e8
A
1535const UConverterSharedData _UTF16Data =
1536 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16StaticData, &_UTF16Impl);
374ca955 1537
729e4ab9
A
1538static const UConverterImpl _UTF16v2Impl = {
1539 UCNV_UTF16,
1540
1541 NULL,
1542 NULL,
1543
1544 _UTF16Open,
1545 NULL,
1546 _UTF16Reset,
1547
1548 _UTF16ToUnicodeWithOffsets,
1549 _UTF16ToUnicodeWithOffsets,
1550 _UTF16BEFromUnicodeWithOffsets,
1551 _UTF16BEFromUnicodeWithOffsets,
1552 _UTF16GetNextUChar,
1553
1554 NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
1555 _UTF16GetName,
1556 NULL,
1557 NULL,
f3c0d7a5
A
1558 ucnv_getNonSurrogateUnicodeSet,
1559
1560 NULL,
1561 NULL
729e4ab9
A
1562};
1563
1564static const UConverterStaticData _UTF16v2StaticData = {
1565 sizeof(UConverterStaticData),
1566 "UTF-16,version=2",
1567 1204, /* CCSID for BOM sensitive UTF-16 */
1568 UCNV_IBM, UCNV_UTF16, 2, 2,
1569 { 0xff, 0xfd, 0, 0 }, 2,
1570 FALSE, FALSE,
1571 0,
1572 0,
1573 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1574};
1575
f3c0d7a5 1576const UConverterSharedData _UTF16v2Data =
2ca993e8 1577 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16v2StaticData, &_UTF16v2Impl);
729e4ab9 1578
374ca955 1579#endif