]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/ucnv_u16.c
ICU-6.2.4.tar.gz
[apple/icu.git] / icuSources / common / ucnv_u16.c
CommitLineData
b75a7d8f
A
1/*
2**********************************************************************
374ca955 3* Copyright (C) 2002-2004, International Business Machines
b75a7d8f
A
4* Corporation and others. All Rights Reserved.
5**********************************************************************
6* file name: ucnv_u16.c
7* encoding: US-ASCII
8* tab size: 8 (not used)
9* indentation:4
10*
11* created on: 2002jul01
12* created by: Markus W. Scherer
13*
14* UTF-16 converter implementation. Used to be in ucnv_utf.c.
15*/
16
17#include "unicode/utypes.h"
374ca955
A
18
19#if !UCONFIG_NO_CONVERSION
20
b75a7d8f 21#include "unicode/ucnv.h"
b75a7d8f
A
22#include "ucnv_bld.h"
23#include "ucnv_cnv.h"
24#include "cmemory.h"
25
374ca955
A
26/* UTF-16BE ----------------------------------------------------------------- */
27
/*
 * "PE" = platform endianness: alias the from-Unicode function whose byte
 * order matches the build target (big-endian when U_IS_BIG_ENDIAN is
 * non-zero, little-endian otherwise).
 */
#if !U_IS_BIG_ENDIAN
#   define _UTF16PEFromUnicodeWithOffsets _UTF16LEFromUnicodeWithOffsets
#else
#   define _UTF16PEFromUnicodeWithOffsets _UTF16BEFromUnicodeWithOffsets
#endif
b75a7d8f
A
33
34static void
374ca955
A
35_UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
36 UErrorCode *pErrorCode) {
37 UConverter *cnv;
38 const UChar *source;
39 uint8_t *target;
40 int32_t *offsets;
41
42 int32_t targetCapacity, length, count, sourceIndex;
43 UChar c, trail;
44 char overflow[4];
45
46 source=pArgs->source;
47 length=pArgs->sourceLimit-source;
48 if(length<=0) {
b75a7d8f
A
49 /* no input, nothing to do */
50 return;
51 }
52
374ca955
A
53 targetCapacity=pArgs->targetLimit-pArgs->target;
54 if(targetCapacity<=0) {
b75a7d8f
A
55 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
56 return;
57 }
58
374ca955
A
59 cnv=pArgs->converter;
60 target=(uint8_t *)pArgs->target;
61 offsets=pArgs->offsets;
62 sourceIndex=0;
63
64 /* c!=0 indicates in several places outside the main loops that a surrogate was found */
65
66 if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
67 /* the last buffer ended with a lead surrogate, output the surrogate pair */
68 ++source;
b75a7d8f 69 --length;
374ca955
A
70 target[0]=(uint8_t)(c>>8);
71 target[1]=(uint8_t)c;
72 target[2]=(uint8_t)(trail>>8);
73 target[3]=(uint8_t)trail;
74 target+=4;
75 targetCapacity-=4;
76 if(offsets!=NULL) {
77 *offsets++=-1;
78 *offsets++=-1;
79 *offsets++=-1;
80 *offsets++=-1;
b75a7d8f 81 }
374ca955
A
82 sourceIndex=1;
83 cnv->fromUChar32=c=0;
b75a7d8f
A
84 }
85
86 /* copy an even number of bytes for complete UChars */
374ca955
A
87 count=2*length;
88 if(count>targetCapacity) {
89 count=targetCapacity&~1;
90 }
91 /* count is even */
92 if(c==0) {
93 targetCapacity-=count;
94 count>>=1;
95 length-=count;
96
97 if(offsets==NULL) {
98 while(count>0) {
99 c=*source++;
100 if(U16_IS_SINGLE(c)) {
101 target[0]=(uint8_t)(c>>8);
102 target[1]=(uint8_t)c;
103 target+=2;
104 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
105 ++source;
106 --count;
107 target[0]=(uint8_t)(c>>8);
108 target[1]=(uint8_t)c;
109 target[2]=(uint8_t)(trail>>8);
110 target[3]=(uint8_t)trail;
111 target+=4;
112 } else {
113 break;
114 }
115 --count;
116 }
117 } else {
118 while(count>0) {
119 c=*source++;
120 if(U16_IS_SINGLE(c)) {
121 target[0]=(uint8_t)(c>>8);
122 target[1]=(uint8_t)c;
123 target+=2;
124 *offsets++=sourceIndex;
125 *offsets++=sourceIndex++;
126 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
127 ++source;
128 --count;
129 target[0]=(uint8_t)(c>>8);
130 target[1]=(uint8_t)c;
131 target[2]=(uint8_t)(trail>>8);
132 target[3]=(uint8_t)trail;
133 target+=4;
134 *offsets++=sourceIndex;
135 *offsets++=sourceIndex;
136 *offsets++=sourceIndex;
137 *offsets++=sourceIndex;
138 sourceIndex+=2;
139 } else {
140 break;
141 }
b75a7d8f
A
142 --count;
143 }
144 }
b75a7d8f 145
374ca955
A
146 if(count==0) {
147 /* done with the loop for complete UChars */
148 if(length>0 && targetCapacity>0) {
149 /*
150 * there is more input and some target capacity -
151 * it must be targetCapacity==1 because otherwise
152 * the above would have copied more;
153 * prepare for overflow output
154 */
155 if(U16_IS_SINGLE(c=*source++)) {
156 overflow[0]=(char)(c>>8);
157 overflow[1]=(char)c;
158 length=2; /* 2 bytes to output */
159 c=0;
160 /* } else { keep c for surrogate handling, length will be set there */
161 }
162 } else {
163 length=0;
164 c=0;
165 }
b75a7d8f 166 } else {
374ca955
A
167 /* keep c for surrogate handling, length will be set there */
168 targetCapacity+=2*count;
b75a7d8f 169 }
374ca955
A
170 } else {
171 length=0; /* from here on, length counts the bytes in overflow[] */
b75a7d8f 172 }
374ca955
A
173
174 if(c!=0) {
175 /*
176 * c is a surrogate, and
177 * - source or target too short
178 * - or the surrogate is unmatched
179 */
180 length=0;
181 if(U16_IS_SURROGATE_LEAD(c)) {
182 if(source<pArgs->sourceLimit) {
183 if(U16_IS_TRAIL(trail=*source)) {
184 /* output the surrogate pair, will overflow (see conditions comment above) */
185 ++source;
186 overflow[0]=(char)(c>>8);
187 overflow[1]=(char)c;
188 overflow[2]=(char)(trail>>8);
189 overflow[3]=(char)trail;
190 length=4; /* 4 bytes to output */
191 c=0;
192 } else {
193 /* unmatched lead surrogate */
194 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
195 }
196 } else {
197 /* see if the trail surrogate is in the next buffer */
198 }
199 } else {
200 /* unmatched trail surrogate */
201 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
b75a7d8f 202 }
374ca955 203 cnv->fromUChar32=c;
b75a7d8f
A
204 }
205
374ca955
A
206 if(length>0) {
207 /* output length bytes with overflow (length>targetCapacity>0) */
208 ucnv_fromUWriteBytes(cnv,
209 overflow, length,
210 (char **)&target, pArgs->targetLimit,
211 &offsets, sourceIndex,
212 pErrorCode);
213 targetCapacity=pArgs->targetLimit-(char *)target;
b75a7d8f
A
214 }
215
374ca955
A
216 if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
217 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
b75a7d8f
A
218 }
219
220 /* write back the updated pointers */
374ca955
A
221 pArgs->source=source;
222 pArgs->target=(char *)target;
223 pArgs->offsets=offsets;
b75a7d8f
A
224}
225
b75a7d8f 226static void
374ca955 227_UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
b75a7d8f 228 UErrorCode *pErrorCode) {
374ca955
A
229 UConverter *cnv;
230 const uint8_t *source;
231 UChar *target;
232 int32_t *offsets;
233
234 int32_t targetCapacity, length, count, sourceIndex;
235 UChar c, trail;
236
237 cnv=pArgs->converter;
238 source=(const uint8_t *)pArgs->source;
239 length=(const uint8_t *)pArgs->sourceLimit-source;
240 if(length<=0 && cnv->toUnicodeStatus==0) {
b75a7d8f
A
241 /* no input, nothing to do */
242 return;
243 }
244
374ca955
A
245 targetCapacity=pArgs->targetLimit-pArgs->target;
246 if(targetCapacity<=0) {
247 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
b75a7d8f
A
248 return;
249 }
250
374ca955
A
251 target=pArgs->target;
252 offsets=pArgs->offsets;
253 sourceIndex=0;
254 c=0;
255
256 /* complete a partial UChar or pair from the last call */
257 if(cnv->toUnicodeStatus!=0) {
b75a7d8f 258 /*
374ca955
A
259 * special case: single byte from a previous buffer,
260 * where the byte turned out not to belong to a trail surrogate
261 * and the preceding, unmatched lead surrogate was put into toUBytes[]
262 * for error handling
b75a7d8f 263 */
374ca955
A
264 cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
265 cnv->toULength=1;
266 cnv->toUnicodeStatus=0;
b75a7d8f 267 }
374ca955
A
268 if((count=cnv->toULength)!=0) {
269 uint8_t *p=cnv->toUBytes;
270 do {
271 p[count++]=*source++;
272 ++sourceIndex;
273 --length;
274 if(count==2) {
275 c=((UChar)p[0]<<8)|p[1];
276 if(U16_IS_SINGLE(c)) {
277 /* output the BMP code point */
278 *target++=c;
279 if(offsets!=NULL) {
280 *offsets++=-1;
281 }
282 --targetCapacity;
283 count=0;
284 c=0;
285 break;
286 } else if(U16_IS_SURROGATE_LEAD(c)) {
287 /* continue collecting bytes for the trail surrogate */
288 c=0; /* avoid unnecessary surrogate handling below */
289 } else {
290 /* fall through to error handling for an unmatched trail surrogate */
291 break;
292 }
293 } else if(count==4) {
294 c=((UChar)p[0]<<8)|p[1];
295 trail=((UChar)p[2]<<8)|p[3];
296 if(U16_IS_TRAIL(trail)) {
297 /* output the surrogate pair */
298 *target++=c;
299 if(targetCapacity>=2) {
300 *target++=trail;
301 if(offsets!=NULL) {
302 *offsets++=-1;
303 *offsets++=-1;
304 }
305 targetCapacity-=2;
306 } else /* targetCapacity==1 */ {
307 targetCapacity=0;
308 cnv->UCharErrorBuffer[0]=trail;
309 cnv->UCharErrorBufferLength=1;
310 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
311 }
312 count=0;
313 c=0;
314 break;
315 } else {
316 /* unmatched lead surrogate, handle here for consistent toUBytes[] */
317 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
318
319 /* back out reading the code unit after it */
320 if(((const uint8_t *)pArgs->source-source)>=2) {
321 source-=2;
322 } else {
323 /*
324 * if the trail unit's first byte was in a previous buffer, then
325 * we need to put it into a special place because toUBytes[] will be
326 * used for the lead unit's bytes
327 */
328 cnv->toUnicodeStatus=0x100|p[2];
329 --source;
330 }
331 cnv->toULength=2;
332
333 /* write back the updated pointers */
334 pArgs->source=(const char *)source;
335 pArgs->target=target;
336 pArgs->offsets=offsets;
337 return;
338 }
b75a7d8f 339 }
374ca955
A
340 } while(length>0);
341 cnv->toULength=(int8_t)count;
b75a7d8f
A
342 }
343
374ca955
A
344 /* copy an even number of bytes for complete UChars */
345 count=2*targetCapacity;
346 if(count>length) {
347 count=length&~1;
348 }
349 if(c==0 && count>0) {
350 length-=count;
351 count>>=1;
352 targetCapacity-=count;
353 if(offsets==NULL) {
354 do {
355 c=((UChar)source[0]<<8)|source[1];
356 source+=2;
357 if(U16_IS_SINGLE(c)) {
358 *target++=c;
359 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
360 U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
361 ) {
362 source+=2;
363 --count;
364 *target++=c;
365 *target++=trail;
366 } else {
367 break;
368 }
369 } while(--count>0);
b75a7d8f 370 } else {
374ca955
A
371 do {
372 c=((UChar)source[0]<<8)|source[1];
373 source+=2;
374 if(U16_IS_SINGLE(c)) {
375 *target++=c;
376 *offsets++=sourceIndex;
377 sourceIndex+=2;
378 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
379 U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
380 ) {
381 source+=2;
382 --count;
383 *target++=c;
384 *target++=trail;
385 *offsets++=sourceIndex;
386 *offsets++=sourceIndex;
387 sourceIndex+=4;
388 } else {
389 break;
390 }
391 } while(--count>0);
b75a7d8f 392 }
b75a7d8f 393
374ca955
A
394 if(count==0) {
395 /* done with the loop for complete UChars */
396 c=0;
397 } else {
398 /* keep c for surrogate handling, trail will be set there */
399 length+=2*(count-1); /* one more byte pair was consumed than count decremented */
400 targetCapacity+=count;
b75a7d8f
A
401 }
402 }
403
374ca955
A
404 if(c!=0) {
405 /*
406 * c is a surrogate, and
407 * - source or target too short
408 * - or the surrogate is unmatched
409 */
410 cnv->toUBytes[0]=(uint8_t)(c>>8);
411 cnv->toUBytes[1]=(uint8_t)c;
412 cnv->toULength=2;
413
414 if(U16_IS_SURROGATE_LEAD(c)) {
415 if(length>=2) {
416 if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])) {
417 /* output the surrogate pair, will overflow (see conditions comment above) */
418 source+=2;
419 length-=2;
420 *target++=c;
421 if(offsets!=NULL) {
422 *offsets++=sourceIndex;
423 }
424 cnv->UCharErrorBuffer[0]=trail;
425 cnv->UCharErrorBufferLength=1;
426 cnv->toULength=0;
427 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
428 } else {
429 /* unmatched lead surrogate */
430 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
431 }
432 } else {
433 /* see if the trail surrogate is in the next buffer */
b75a7d8f
A
434 }
435 } else {
374ca955
A
436 /* unmatched trail surrogate */
437 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
b75a7d8f 438 }
b75a7d8f
A
439 }
440
374ca955
A
441 if(U_SUCCESS(*pErrorCode)) {
442 /* check for a remaining source byte */
443 if(length>0) {
444 if(targetCapacity==0) {
445 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
446 } else {
447 /* it must be length==1 because otherwise the above would have copied more */
448 cnv->toUBytes[cnv->toULength++]=*source++;
b75a7d8f
A
449 }
450 }
451 }
452
453 /* write back the updated pointers */
374ca955
A
454 pArgs->source=(const char *)source;
455 pArgs->target=target;
456 pArgs->offsets=offsets;
b75a7d8f
A
457}
458
374ca955
A
459static UChar32
460_UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
461 const uint8_t *s, *sourceLimit;
462 UChar32 c;
b75a7d8f 463
374ca955
A
464 s=(const uint8_t *)pArgs->source;
465 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
b75a7d8f 466
374ca955
A
467 if(s>=sourceLimit) {
468 /* no input */
469 *err=U_INDEX_OUTOFBOUNDS_ERROR;
b75a7d8f
A
470 return 0xffff;
471 }
472
374ca955
A
473 if(s+2>sourceLimit) {
474 /* only one byte: truncated UChar */
475 pArgs->converter->toUBytes[0]=*s++;
476 pArgs->converter->toULength=1;
477 pArgs->source=(const char *)s;
478 *err = U_TRUNCATED_CHAR_FOUND;
479 return 0xffff;
480 }
b75a7d8f 481
374ca955
A
482 /* get one UChar */
483 c=((UChar32)*s<<8)|s[1];
484 s+=2;
485
486 /* check for a surrogate pair */
487 if(U_IS_SURROGATE(c)) {
488 if(U16_IS_SURROGATE_LEAD(c)) {
489 if(s+2<=sourceLimit) {
490 UChar trail;
491
492 /* get a second UChar and see if it is a trail surrogate */
493 trail=((UChar)*s<<8)|s[1];
494 if(U16_IS_TRAIL(trail)) {
495 c=U16_GET_SUPPLEMENTARY(c, trail);
496 s+=2;
497 } else {
498 /* unmatched lead surrogate */
499 c=-2;
500 }
501 } else {
502 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
503 uint8_t *bytes=pArgs->converter->toUBytes;
504 s-=2;
505 pArgs->converter->toULength=(int8_t)(sourceLimit-s);
506 do {
507 *bytes++=*s++;
508 } while(s<sourceLimit);
509
510 c=0xffff;
511 *err=U_TRUNCATED_CHAR_FOUND;
512 }
513 } else {
514 /* unmatched trail surrogate */
515 c=-2;
b75a7d8f
A
516 }
517
374ca955
A
518 if(c<0) {
519 /* write the unmatched surrogate */
520 uint8_t *bytes=pArgs->converter->toUBytes;
521 pArgs->converter->toULength=2;
522 *bytes=*(s-2);
523 bytes[1]=*(s-1);
b75a7d8f 524
374ca955
A
525 c=0xffff;
526 *err=U_ILLEGAL_CHAR_FOUND;
b75a7d8f
A
527 }
528 }
529
374ca955
A
530 pArgs->source=(const char *)s;
531 return c;
b75a7d8f
A
532}
533
534static const UConverterImpl _UTF16BEImpl={
535 UCNV_UTF16_BigEndian,
536
537 NULL,
538 NULL,
539
540 NULL,
541 NULL,
542 NULL,
543
544 _UTF16BEToUnicodeWithOffsets,
545 _UTF16BEToUnicodeWithOffsets,
546 _UTF16BEFromUnicodeWithOffsets,
547 _UTF16BEFromUnicodeWithOffsets,
374ca955 548 _UTF16BEGetNextUChar,
b75a7d8f
A
549
550 NULL,
551 NULL,
552 NULL,
553 NULL,
554 ucnv_getCompleteUnicodeSet
555};
556
b75a7d8f
A
557static const UConverterStaticData _UTF16BEStaticData={
558 sizeof(UConverterStaticData),
559 "UTF-16BE",
560 1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2,
561 { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE,
562 0,
563 0,
564 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
565};
566
567
568const UConverterSharedData _UTF16BEData={
569 sizeof(UConverterSharedData), ~((uint32_t) 0),
570 NULL, NULL, &_UTF16BEStaticData, FALSE, &_UTF16BEImpl,
571 0
572};
573
574/* UTF-16LE ----------------------------------------------------------------- */
575
374ca955
A
576static void
577_UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
578 UErrorCode *pErrorCode) {
579 UConverter *cnv;
580 const UChar *source;
581 uint8_t *target;
582 int32_t *offsets;
583
584 int32_t targetCapacity, length, count, sourceIndex;
585 UChar c, trail;
586 char overflow[4];
587
588 source=pArgs->source;
589 length=pArgs->sourceLimit-source;
590 if(length<=0) {
591 /* no input, nothing to do */
592 return;
593 }
594
595 targetCapacity=pArgs->targetLimit-pArgs->target;
596 if(targetCapacity<=0) {
597 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
598 return;
599 }
600
601 cnv=pArgs->converter;
602 target=(uint8_t *)pArgs->target;
603 offsets=pArgs->offsets;
604 sourceIndex=0;
605
606 /* c!=0 indicates in several places outside the main loops that a surrogate was found */
607
608 if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
609 /* the last buffer ended with a lead surrogate, output the surrogate pair */
610 ++source;
611 --length;
612 target[0]=(uint8_t)c;
613 target[1]=(uint8_t)(c>>8);
614 target[2]=(uint8_t)trail;
615 target[3]=(uint8_t)(trail>>8);
616 target+=4;
617 targetCapacity-=4;
618 if(offsets!=NULL) {
619 *offsets++=-1;
620 *offsets++=-1;
621 *offsets++=-1;
622 *offsets++=-1;
b75a7d8f 623 }
374ca955
A
624 sourceIndex=1;
625 cnv->fromUChar32=c=0;
626 }
627
628 /* copy an even number of bytes for complete UChars */
629 count=2*length;
630 if(count>targetCapacity) {
631 count=targetCapacity&~1;
632 }
633 /* count is even */
634 if(c==0) {
635 targetCapacity-=count;
636 count>>=1;
637 length-=count;
638
639 if(offsets==NULL) {
640 while(count>0) {
641 c=*source++;
642 if(U16_IS_SINGLE(c)) {
643 target[0]=(uint8_t)c;
644 target[1]=(uint8_t)(c>>8);
645 target+=2;
646 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
647 ++source;
648 --count;
649 target[0]=(uint8_t)c;
650 target[1]=(uint8_t)(c>>8);
651 target[2]=(uint8_t)trail;
652 target[3]=(uint8_t)(trail>>8);
653 target+=4;
654 } else {
655 break;
656 }
657 --count;
658 }
659 } else {
660 while(count>0) {
661 c=*source++;
662 if(U16_IS_SINGLE(c)) {
663 target[0]=(uint8_t)c;
664 target[1]=(uint8_t)(c>>8);
665 target+=2;
666 *offsets++=sourceIndex;
667 *offsets++=sourceIndex++;
668 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
669 ++source;
670 --count;
671 target[0]=(uint8_t)c;
672 target[1]=(uint8_t)(c>>8);
673 target[2]=(uint8_t)trail;
674 target[3]=(uint8_t)(trail>>8);
675 target+=4;
676 *offsets++=sourceIndex;
677 *offsets++=sourceIndex;
678 *offsets++=sourceIndex;
679 *offsets++=sourceIndex;
680 sourceIndex+=2;
681 } else {
682 break;
683 }
684 --count;
685 }
b75a7d8f
A
686 }
687
374ca955
A
688 if(count==0) {
689 /* done with the loop for complete UChars */
690 if(length>0 && targetCapacity>0) {
691 /*
692 * there is more input and some target capacity -
693 * it must be targetCapacity==1 because otherwise
694 * the above would have copied more;
695 * prepare for overflow output
696 */
697 if(U16_IS_SINGLE(c=*source++)) {
698 overflow[0]=(char)c;
699 overflow[1]=(char)(c>>8);
700 length=2; /* 2 bytes to output */
701 c=0;
702 /* } else { keep c for surrogate handling, length will be set there */
703 }
704 } else {
705 length=0;
706 c=0;
707 }
708 } else {
709 /* keep c for surrogate handling, length will be set there */
710 targetCapacity+=2*count;
711 }
712 } else {
713 length=0; /* from here on, length counts the bytes in overflow[] */
714 }
715
716 if(c!=0) {
717 /*
718 * c is a surrogate, and
719 * - source or target too short
720 * - or the surrogate is unmatched
721 */
722 length=0;
723 if(U16_IS_SURROGATE_LEAD(c)) {
724 if(source<pArgs->sourceLimit) {
725 if(U16_IS_TRAIL(trail=*source)) {
726 /* output the surrogate pair, will overflow (see conditions comment above) */
727 ++source;
728 overflow[0]=(char)c;
729 overflow[1]=(char)(c>>8);
730 overflow[2]=(char)trail;
731 overflow[3]=(char)(trail>>8);
732 length=4; /* 4 bytes to output */
733 c=0;
734 } else {
735 /* unmatched lead surrogate */
736 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
737 }
738 } else {
739 /* see if the trail surrogate is in the next buffer */
740 }
741 } else {
742 /* unmatched trail surrogate */
743 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
744 }
745 cnv->fromUChar32=c;
b75a7d8f
A
746 }
747
374ca955
A
748 if(length>0) {
749 /* output length bytes with overflow (length>targetCapacity>0) */
750 ucnv_fromUWriteBytes(cnv,
751 overflow, length,
752 (char **)&target, pArgs->targetLimit,
753 &offsets, sourceIndex,
754 pErrorCode);
755 targetCapacity=pArgs->targetLimit-(char *)target;
756 }
b75a7d8f 757
374ca955
A
758 if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
759 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
760 }
761
762 /* write back the updated pointers */
763 pArgs->source=source;
764 pArgs->target=(char *)target;
765 pArgs->offsets=offsets;
766}
b75a7d8f 767
374ca955
A
768static void
769_UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
770 UErrorCode *pErrorCode) {
771 UConverter *cnv;
772 const uint8_t *source;
773 UChar *target;
774 int32_t *offsets;
775
776 int32_t targetCapacity, length, count, sourceIndex;
777 UChar c, trail;
778
779 cnv=pArgs->converter;
780 source=(const uint8_t *)pArgs->source;
781 length=(const uint8_t *)pArgs->sourceLimit-source;
782 if(length<=0 && cnv->toUnicodeStatus==0) {
783 /* no input, nothing to do */
784 return;
785 }
786
787 targetCapacity=pArgs->targetLimit-pArgs->target;
788 if(targetCapacity<=0) {
789 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
790 return;
791 }
792
793 target=pArgs->target;
794 offsets=pArgs->offsets;
795 sourceIndex=0;
796 c=0;
797
798 /* complete a partial UChar or pair from the last call */
799 if(cnv->toUnicodeStatus!=0) {
800 /*
801 * special case: single byte from a previous buffer,
802 * where the byte turned out not to belong to a trail surrogate
803 * and the preceding, unmatched lead surrogate was put into toUBytes[]
804 * for error handling
805 */
806 cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
807 cnv->toULength=1;
808 cnv->toUnicodeStatus=0;
809 }
810 if((count=cnv->toULength)!=0) {
811 uint8_t *p=cnv->toUBytes;
812 do {
813 p[count++]=*source++;
814 ++sourceIndex;
815 --length;
816 if(count==2) {
817 c=((UChar)p[1]<<8)|p[0];
818 if(U16_IS_SINGLE(c)) {
819 /* output the BMP code point */
820 *target++=c;
821 if(offsets!=NULL) {
822 *offsets++=-1;
823 }
824 --targetCapacity;
825 count=0;
826 c=0;
827 break;
828 } else if(U16_IS_SURROGATE_LEAD(c)) {
829 /* continue collecting bytes for the trail surrogate */
830 c=0; /* avoid unnecessary surrogate handling below */
831 } else {
832 /* fall through to error handling for an unmatched trail surrogate */
833 break;
834 }
835 } else if(count==4) {
836 c=((UChar)p[1]<<8)|p[0];
837 trail=((UChar)p[3]<<8)|p[2];
838 if(U16_IS_TRAIL(trail)) {
839 /* output the surrogate pair */
840 *target++=c;
841 if(targetCapacity>=2) {
842 *target++=trail;
843 if(offsets!=NULL) {
844 *offsets++=-1;
845 *offsets++=-1;
846 }
847 targetCapacity-=2;
848 } else /* targetCapacity==1 */ {
849 targetCapacity=0;
850 cnv->UCharErrorBuffer[0]=trail;
851 cnv->UCharErrorBufferLength=1;
852 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
853 }
854 count=0;
855 c=0;
856 break;
857 } else {
858 /* unmatched lead surrogate, handle here for consistent toUBytes[] */
859 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
860
861 /* back out reading the code unit after it */
862 if(((const uint8_t *)pArgs->source-source)>=2) {
863 source-=2;
864 } else {
865 /*
866 * if the trail unit's first byte was in a previous buffer, then
867 * we need to put it into a special place because toUBytes[] will be
868 * used for the lead unit's bytes
869 */
870 cnv->toUnicodeStatus=0x100|p[2];
871 --source;
872 }
873 cnv->toULength=2;
874
875 /* write back the updated pointers */
876 pArgs->source=(const char *)source;
877 pArgs->target=target;
878 pArgs->offsets=offsets;
879 return;
880 }
881 }
882 } while(length>0);
883 cnv->toULength=(int8_t)count;
884 }
885
886 /* copy an even number of bytes for complete UChars */
887 count=2*targetCapacity;
888 if(count>length) {
889 count=length&~1;
890 }
891 if(c==0 && count>0) {
892 length-=count;
893 count>>=1;
894 targetCapacity-=count;
895 if(offsets==NULL) {
896 do {
897 c=((UChar)source[1]<<8)|source[0];
898 source+=2;
899 if(U16_IS_SINGLE(c)) {
900 *target++=c;
901 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
902 U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
903 ) {
904 source+=2;
905 --count;
906 *target++=c;
907 *target++=trail;
908 } else {
909 break;
910 }
911 } while(--count>0);
912 } else {
913 do {
914 c=((UChar)source[1]<<8)|source[0];
915 source+=2;
916 if(U16_IS_SINGLE(c)) {
917 *target++=c;
918 *offsets++=sourceIndex;
919 sourceIndex+=2;
920 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
921 U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
922 ) {
923 source+=2;
924 --count;
925 *target++=c;
926 *target++=trail;
927 *offsets++=sourceIndex;
928 *offsets++=sourceIndex;
929 sourceIndex+=4;
930 } else {
931 break;
932 }
933 } while(--count>0);
b75a7d8f
A
934 }
935
374ca955
A
936 if(count==0) {
937 /* done with the loop for complete UChars */
938 c=0;
939 } else {
940 /* keep c for surrogate handling, trail will be set there */
941 length+=2*(count-1); /* one more byte pair was consumed than count decremented */
942 targetCapacity+=count;
943 }
944 }
b75a7d8f 945
374ca955
A
946 if(c!=0) {
947 /*
948 * c is a surrogate, and
949 * - source or target too short
950 * - or the surrogate is unmatched
951 */
952 cnv->toUBytes[0]=(uint8_t)c;
953 cnv->toUBytes[1]=(uint8_t)(c>>8);
954 cnv->toULength=2;
955
956 if(U16_IS_SURROGATE_LEAD(c)) {
957 if(length>=2) {
958 if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])) {
959 /* output the surrogate pair, will overflow (see conditions comment above) */
960 source+=2;
961 length-=2;
962 *target++=c;
963 if(offsets!=NULL) {
964 *offsets++=sourceIndex;
965 }
966 cnv->UCharErrorBuffer[0]=trail;
967 cnv->UCharErrorBufferLength=1;
968 cnv->toULength=0;
969 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
970 } else {
971 /* unmatched lead surrogate */
972 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
973 }
974 } else {
975 /* see if the trail surrogate is in the next buffer */
976 }
977 } else {
978 /* unmatched trail surrogate */
979 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
b75a7d8f
A
980 }
981 }
982
374ca955
A
983 if(U_SUCCESS(*pErrorCode)) {
984 /* check for a remaining source byte */
985 if(length>0) {
986 if(targetCapacity==0) {
987 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
988 } else {
989 /* it must be length==1 because otherwise the above would have copied more */
990 cnv->toUBytes[cnv->toULength++]=*source++;
991 }
992 }
993 }
994
995 /* write back the updated pointers */
996 pArgs->source=(const char *)source;
997 pArgs->target=target;
998 pArgs->offsets=offsets;
999}
1000
1001static UChar32
1002_UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
1003 const uint8_t *s, *sourceLimit;
1004 UChar32 c;
1005
1006 s=(const uint8_t *)pArgs->source;
1007 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1008
1009 if(s>=sourceLimit) {
1010 /* no input */
1011 *err=U_INDEX_OUTOFBOUNDS_ERROR;
1012 return 0xffff;
1013 }
1014
1015 if(s+2>sourceLimit) {
1016 /* only one byte: truncated UChar */
1017 pArgs->converter->toUBytes[0]=*s++;
1018 pArgs->converter->toULength=1;
1019 pArgs->source=(const char *)s;
1020 *err = U_TRUNCATED_CHAR_FOUND;
1021 return 0xffff;
1022 }
1023
1024 /* get one UChar */
1025 c=((UChar32)s[1]<<8)|*s;
1026 s+=2;
1027
1028 /* check for a surrogate pair */
1029 if(U_IS_SURROGATE(c)) {
1030 if(U16_IS_SURROGATE_LEAD(c)) {
1031 if(s+2<=sourceLimit) {
1032 UChar trail;
1033
1034 /* get a second UChar and see if it is a trail surrogate */
1035 trail=((UChar)s[1]<<8)|*s;
1036 if(U16_IS_TRAIL(trail)) {
1037 c=U16_GET_SUPPLEMENTARY(c, trail);
1038 s+=2;
1039 } else {
1040 /* unmatched lead surrogate */
1041 c=-2;
1042 }
1043 } else {
1044 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
1045 uint8_t *bytes=pArgs->converter->toUBytes;
1046 s-=2;
1047 pArgs->converter->toULength=(int8_t)(sourceLimit-s);
1048 do {
1049 *bytes++=*s++;
1050 } while(s<sourceLimit);
1051
1052 c=0xffff;
1053 *err=U_TRUNCATED_CHAR_FOUND;
1054 }
1055 } else {
1056 /* unmatched trail surrogate */
1057 c=-2;
1058 }
1059
1060 if(c<0) {
1061 /* write the unmatched surrogate */
1062 uint8_t *bytes=pArgs->converter->toUBytes;
1063 pArgs->converter->toULength=2;
1064 *bytes=*(s-2);
1065 bytes[1]=*(s-1);
1066
1067 c=0xffff;
1068 *err=U_ILLEGAL_CHAR_FOUND;
1069 }
1070 }
1071
1072 pArgs->source=(const char *)s;
1073 return c;
b75a7d8f
A
1074}
1075
1076static const UConverterImpl _UTF16LEImpl={
1077 UCNV_UTF16_LittleEndian,
1078
1079 NULL,
1080 NULL,
1081
1082 NULL,
1083 NULL,
1084 NULL,
1085
1086 _UTF16LEToUnicodeWithOffsets,
1087 _UTF16LEToUnicodeWithOffsets,
1088 _UTF16LEFromUnicodeWithOffsets,
1089 _UTF16LEFromUnicodeWithOffsets,
374ca955 1090 _UTF16LEGetNextUChar,
b75a7d8f
A
1091
1092 NULL,
1093 NULL,
1094 NULL,
1095 NULL,
1096 ucnv_getCompleteUnicodeSet
1097};
1098
1099
b75a7d8f
A
1100static const UConverterStaticData _UTF16LEStaticData={
1101 sizeof(UConverterStaticData),
1102 "UTF-16LE",
1103 1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2,
1104 { 0xfd, 0xff, 0, 0 },2,FALSE,FALSE,
1105 0,
1106 0,
1107 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1108};
1109
1110
1111const UConverterSharedData _UTF16LEData={
1112 sizeof(UConverterSharedData), ~((uint32_t) 0),
1113 NULL, NULL, &_UTF16LEStaticData, FALSE, &_UTF16LEImpl,
1114 0
1115};
1116
1117/* UTF-16 (Detect BOM) ------------------------------------------------------ */
1118
1119/*
1120 * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE
1121 * accordingly.
1122 * This is a simpler version of the UTF-32 converter below, with
1123 * fewer states for shorter BOMs.
1124 *
1125 * State values:
1126 * 0 initial state
1127 * 1 saw FE
1128 * 2..4 -
1129 * 5 saw FF
1130 * 6..7 -
1131 * 8 UTF-16BE mode
1132 * 9 UTF-16LE mode
1133 *
1134 * During detection: state&3==number of matching bytes so far.
1135 *
1136 * On output, emit U+FEFF as the first code point.
1137 */
1138
1139static void
1140_UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {
1141 if(choice<=UCNV_RESET_TO_UNICODE) {
1142 /* reset toUnicode: state=0 */
1143 cnv->mode=0;
1144 }
1145 if(choice!=UCNV_RESET_TO_UNICODE) {
1146 /* reset fromUnicode: prepare to output the UTF-16PE BOM */
1147 cnv->charErrorBufferLength=2;
1148#if U_IS_BIG_ENDIAN
1149 cnv->charErrorBuffer[0]=0xfe;
1150 cnv->charErrorBuffer[1]=0xff;
1151#else
1152 cnv->charErrorBuffer[0]=0xff;
1153 cnv->charErrorBuffer[1]=0xfe;
1154#endif
1155 }
1156}
1157
1158static void
1159_UTF16Open(UConverter *cnv,
1160 const char *name,
1161 const char *locale,
1162 uint32_t options,
1163 UErrorCode *pErrorCode) {
1164 _UTF16Reset(cnv, UCNV_RESET_BOTH);
1165}
1166
/*
 * Byte order marks, one per 4-byte row so that a detection state can index
 * the table directly: row 0 (states 1..3) is the big-endian BOM FE FF,
 * row 1 (states 5..7) is the little-endian BOM FF FE.
 * utf16BOM[state] is the next byte expected in that state, and
 * utf16BOM+(state&4) is the start of the row the state belongs to.
 */
static const char utf16BOM[8]={
    (char)0xfe, (char)0xff, 0, 0,   /* UTF-16BE */
    (char)0xff, (char)0xfe, 0, 0    /* UTF-16LE */
};
1168
1169static void
1170_UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1171 UErrorCode *pErrorCode) {
1172 UConverter *cnv=pArgs->converter;
1173 const char *source=pArgs->source;
1174 const char *sourceLimit=pArgs->sourceLimit;
1175 int32_t *offsets=pArgs->offsets;
1176
1177 int32_t state, offsetDelta;
1178 char b;
1179
1180 state=cnv->mode;
1181
1182 /*
1183 * If we detect a BOM in this buffer, then we must add the BOM size to the
1184 * offsets because the actual converter function will not see and count the BOM.
1185 * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1186 */
1187 offsetDelta=0;
1188
1189 while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
1190 switch(state) {
1191 case 0:
1192 b=*source;
1193 if(b==(char)0xfe) {
1194 state=1; /* could be FE FF */
1195 } else if(b==(char)0xff) {
1196 state=5; /* could be FF FE */
1197 } else {
1198 state=8; /* default to UTF-16BE */
1199 continue;
1200 }
1201 ++source;
1202 break;
1203 case 1:
1204 case 5:
1205 if(*source==utf16BOM[state]) {
1206 ++source;
1207 if(state==1) {
1208 state=8; /* detect UTF-16BE */
1209 offsetDelta=source-pArgs->source;
1210 } else if(state==5) {
1211 state=9; /* detect UTF-16LE */
1212 offsetDelta=source-pArgs->source;
1213 }
1214 } else {
1215 /* switch to UTF-16BE and pass the previous bytes */
1216 if(source!=pArgs->source) {
1217 /* just reset the source */
1218 source=pArgs->source;
1219 } else {
1220 UBool oldFlush=pArgs->flush;
1221
1222 /* the first byte is from a previous buffer, replay it first */
1223 pArgs->source=utf16BOM+(state&4); /* select the correct BOM */
1224 pArgs->sourceLimit=pArgs->source+1; /* replay previous byte */
1225 pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */
1226
1227 _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
1228
1229 /* restore real pointers; pArgs->source will be set in case 8/9 */
1230 pArgs->sourceLimit=sourceLimit;
1231 pArgs->flush=oldFlush;
1232 }
1233 state=8;
1234 continue;
1235 }
1236 break;
1237 case 8:
1238 /* call UTF-16BE */
1239 pArgs->source=source;
1240 _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
1241 source=pArgs->source;
1242 break;
1243 case 9:
1244 /* call UTF-16LE */
1245 pArgs->source=source;
1246 _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
1247 source=pArgs->source;
1248 break;
1249 default:
1250 break; /* does not occur */
1251 }
1252 }
1253
1254 /* add BOM size to offsets - see comment at offsetDelta declaration */
1255 if(offsets!=NULL && offsetDelta!=0) {
1256 int32_t *offsetsLimit=pArgs->offsets;
1257 while(offsets<offsetsLimit) {
1258 *offsets++ += offsetDelta;
1259 }
1260 }
1261
1262 pArgs->source=source;
1263
1264 if(source==sourceLimit && pArgs->flush) {
1265 /* handle truncated input */
1266 switch(state) {
1267 case 0:
1268 break; /* no input at all, nothing to do */
1269 case 8:
1270 _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
1271 break;
1272 case 9:
1273 _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
1274 break;
1275 default:
1276 /* handle 0<state<8: call UTF-16BE with too-short input */
1277 pArgs->source=utf16BOM+(state&4); /* select the correct BOM */
1278 pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */
1279
1280 /* no offsets: not enough for output */
1281 _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
1282 pArgs->source=source;
1283 pArgs->sourceLimit=sourceLimit;
374ca955 1284 state=8;
b75a7d8f
A
1285 break;
1286 }
b75a7d8f 1287 }
374ca955
A
1288
1289 cnv->mode=state;
b75a7d8f
A
1290}
1291
1292static UChar32
1293_UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs,
1294 UErrorCode *pErrorCode) {
1295 switch(pArgs->converter->mode) {
1296 case 8:
374ca955 1297 return _UTF16BEGetNextUChar(pArgs, pErrorCode);
b75a7d8f 1298 case 9:
374ca955 1299 return _UTF16LEGetNextUChar(pArgs, pErrorCode);
b75a7d8f 1300 default:
374ca955 1301 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
b75a7d8f
A
1302 }
1303}
1304
1305static const UConverterImpl _UTF16Impl = {
1306 UCNV_UTF16,
1307
1308 NULL,
1309 NULL,
1310
1311 _UTF16Open,
1312 NULL,
1313 _UTF16Reset,
1314
1315 _UTF16ToUnicodeWithOffsets,
1316 _UTF16ToUnicodeWithOffsets,
1317 _UTF16PEFromUnicodeWithOffsets,
1318 _UTF16PEFromUnicodeWithOffsets,
1319 _UTF16GetNextUChar,
1320
1321 NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
1322 NULL,
1323 NULL,
1324 NULL,
1325 ucnv_getCompleteUnicodeSet
1326};
1327
1328static const UConverterStaticData _UTF16StaticData = {
1329 sizeof(UConverterStaticData),
1330 "UTF-16",
1331 0, /* ### TODO review correctness of all Unicode CCSIDs */
1332 UCNV_IBM, UCNV_UTF16, 2, 2,
1333#if U_IS_BIG_ENDIAN
1334 { 0xff, 0xfd, 0, 0 }, 2,
1335#else
1336 { 0xfd, 0xff, 0, 0 }, 2,
1337#endif
1338 FALSE, FALSE,
1339 0,
1340 0,
1341 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1342};
1343
1344const UConverterSharedData _UTF16Data = {
1345 sizeof(UConverterSharedData), ~((uint32_t) 0),
1346 NULL, NULL, &_UTF16StaticData, FALSE, &_UTF16Impl,
1347 0
1348};
374ca955
A
1349
1350#endif