icuSources/common/ucnv_u16.c

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 2002-2004, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   file name:  ucnv_u16.c
   7 *   encoding:   US-ASCII
   8 *   tab size:   8 (not used)
   9 *   indentation:4
  10 *
  11 *   created on: 2002jul01
  12 *   created by: Markus W. Scherer
  13 *
  14 *   UTF-16 converter implementation. Used to be in ucnv_utf.c.
  15 */
  16
  17 #include "unicode/utypes.h"
  18
  19 #if !UCONFIG_NO_CONVERSION
  20
  21 #include "unicode/ucnv.h"
  22 #include "ucnv_bld.h"
  23 #include "ucnv_cnv.h"
  24 #include "cmemory.h"
  25
  26 /* UTF-16BE ----------------------------------------------------------------- */
  27
  28 #if U_IS_BIG_ENDIAN
  29 #   define _UTF16PEFromUnicodeWithOffsets   _UTF16BEFromUnicodeWithOffsets
  30 #else
  31 #   define _UTF16PEFromUnicodeWithOffsets   _UTF16LEFromUnicodeWithOffsets
  32 #endif
  33
  34 static void
  35 _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
  36                                UErrorCode *pErrorCode) {
  37     UConverter *cnv;
  38     const UChar *source;
  39     uint8_t *target;
  40     int32_t *offsets;
  41
  42     int32_t targetCapacity, length, count, sourceIndex;
  43     UChar c, trail;
  44     char overflow[4];
  45
  46     source=pArgs->source;
  47     length=pArgs->sourceLimit-source;
  48     if(length<=0) {
  49         /* no input, nothing to do */
  50         return;
  51     }
  52
  53     targetCapacity=pArgs->targetLimit-pArgs->target;
  54     if(targetCapacity<=0) {
  55         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  56         return;
  57     }
  58
  59     cnv=pArgs->converter;
  60     target=(uint8_t *)pArgs->target;
  61     offsets=pArgs->offsets;
  62     sourceIndex=0;
  63
  64     /* c!=0 indicates in several places outside the main loops that a surrogate was found */
  65
  66     if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
  67         /* the last buffer ended with a lead surrogate, output the surrogate pair */
  68         ++source;
  69         --length;
  70         target[0]=(uint8_t)(c>>8);
  71         target[1]=(uint8_t)c;
  72         target[2]=(uint8_t)(trail>>8);
  73         target[3]=(uint8_t)trail;
  74         target+=4;
  75         targetCapacity-=4;
  76         if(offsets!=NULL) {
  77             *offsets++=-1;
  78             *offsets++=-1;
  79             *offsets++=-1;
  80             *offsets++=-1;
  81         }
  82         sourceIndex=1;
  83         cnv->fromUChar32=c=0;
  84     }
  85
  86     /* copy an even number of bytes for complete UChars */
  87     count=2*length;
  88     if(count>targetCapacity) {
  89         count=targetCapacity&~1;
  90     }
  91     /* count is even */
  92     if(c==0) {
  93         targetCapacity-=count;
  94         count>>=1;
  95         length-=count;
  96
  97         if(offsets==NULL) {
  98             while(count>0) {
  99                 c=*source++;
 100                 if(U16_IS_SINGLE(c)) {
 101                     target[0]=(uint8_t)(c>>8);
 102                     target[1]=(uint8_t)c;
 103                     target+=2;
 104                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
 105                     ++source;
 106                     --count;
 107                     target[0]=(uint8_t)(c>>8);
 108                     target[1]=(uint8_t)c;
 109                     target[2]=(uint8_t)(trail>>8);
 110                     target[3]=(uint8_t)trail;
 111                     target+=4;
 112                 } else {
 113                     break;
 114                 }
 115                 --count;
 116             }
 117         } else {
 118             while(count>0) {
 119                 c=*source++;
 120                 if(U16_IS_SINGLE(c)) {
 121                     target[0]=(uint8_t)(c>>8);
 122                     target[1]=(uint8_t)c;
 123                     target+=2;
 124                     *offsets++=sourceIndex;
 125                     *offsets++=sourceIndex++;
 126                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
 127                     ++source;
 128                     --count;
 129                     target[0]=(uint8_t)(c>>8);
 130                     target[1]=(uint8_t)c;
 131                     target[2]=(uint8_t)(trail>>8);
 132                     target[3]=(uint8_t)trail;
 133                     target+=4;
 134                     *offsets++=sourceIndex;
 135                     *offsets++=sourceIndex;
 136                     *offsets++=sourceIndex;
 137                     *offsets++=sourceIndex;
 138                     sourceIndex+=2;
 139                 } else {
 140                     break;
 141                 }
 142                 --count;
 143             }
 144         }
 145
 146         if(count==0) {
 147             /* done with the loop for complete UChars */
 148             if(length>0 && targetCapacity>0) {
 149                 /*
 150                  * there is more input and some target capacity -
 151                  * it must be targetCapacity==1 because otherwise
 152                  * the above would have copied more;
 153                  * prepare for overflow output
 154                  */
 155                 if(U16_IS_SINGLE(c=*source++)) {
 156                     overflow[0]=(char)(c>>8);
 157                     overflow[1]=(char)c;
 158                     length=2; /* 2 bytes to output */
 159                     c=0;
 160                 /* } else { keep c for surrogate handling, length will be set there */
 161                 }
 162             } else {
 163                 length=0;
 164                 c=0;
 165             }
 166         } else {
 167             /* keep c for surrogate handling, length will be set there */
 168             targetCapacity+=2*count;
 169         }
 170     } else {
 171         length=0; /* from here on, length counts the bytes in overflow[] */
 172     }
 173
 174     if(c!=0) {
 175         /*
 176          * c is a surrogate, and
 177          * - source or target too short
 178          * - or the surrogate is unmatched
 179          */
 180         length=0;
 181         if(U16_IS_SURROGATE_LEAD(c)) {
 182             if(source<pArgs->sourceLimit) {
 183                 if(U16_IS_TRAIL(trail=*source)) {
 184                     /* output the surrogate pair, will overflow (see conditions comment above) */
 185                     ++source;
 186                     overflow[0]=(char)(c>>8);
 187                     overflow[1]=(char)c;
 188                     overflow[2]=(char)(trail>>8);
 189                     overflow[3]=(char)trail;
 190                     length=4; /* 4 bytes to output */
 191                     c=0;
 192                 } else {
 193                     /* unmatched lead surrogate */
 194                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 195                 }
 196             } else {
 197                 /* see if the trail surrogate is in the next buffer */
 198             }
 199         } else {
 200             /* unmatched trail surrogate */
 201             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 202         }
 203         cnv->fromUChar32=c;
 204     }
 205
 206     if(length>0) {
 207         /* output length bytes with overflow (length>targetCapacity>0) */
 208         ucnv_fromUWriteBytes(cnv,
 209                              overflow, length,
 210                              (char **)&target, pArgs->targetLimit,
 211                              &offsets, sourceIndex,
 212                              pErrorCode);
 213         targetCapacity=pArgs->targetLimit-(char *)target;
 214     }
 215
 216     if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
 217         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 218     }
 219
 220     /* write back the updated pointers */
 221     pArgs->source=source;
 222     pArgs->target=(char *)target;
 223     pArgs->offsets=offsets;
 224 }
 225
 226 static void
 227 _UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
 228                              UErrorCode *pErrorCode) {
 229     UConverter *cnv;
 230     const uint8_t *source;
 231     UChar *target;
 232     int32_t *offsets;
 233
 234     int32_t targetCapacity, length, count, sourceIndex;
 235     UChar c, trail;
 236
 237     cnv=pArgs->converter;
 238     source=(const uint8_t *)pArgs->source;
 239     length=(const uint8_t *)pArgs->sourceLimit-source;
 240     if(length<=0 && cnv->toUnicodeStatus==0) {
 241         /* no input, nothing to do */
 242         return;
 243     }
 244
 245     targetCapacity=pArgs->targetLimit-pArgs->target;
 246     if(targetCapacity<=0) {
 247         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 248         return;
 249     }
 250
 251     target=pArgs->target;
 252     offsets=pArgs->offsets;
 253     sourceIndex=0;
 254     c=0;
 255
 256     /* complete a partial UChar or pair from the last call */
 257     if(cnv->toUnicodeStatus!=0) {
 258         /*
 259          * special case: single byte from a previous buffer,
 260          * where the byte turned out not to belong to a trail surrogate
 261          * and the preceding, unmatched lead surrogate was put into toUBytes[]
 262          * for error handling
 263          */
 264         cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
 265         cnv->toULength=1;
 266         cnv->toUnicodeStatus=0;
 267     }
 268     if((count=cnv->toULength)!=0) {
 269         uint8_t *p=cnv->toUBytes;
 270         do {
 271             p[count++]=*source++;
 272             ++sourceIndex;
 273             --length;
 274             if(count==2) {
 275                 c=((UChar)p[0]<<8)|p[1];
 276                 if(U16_IS_SINGLE(c)) {
 277                     /* output the BMP code point */
 278                     *target++=c;
 279                     if(offsets!=NULL) {
 280                         *offsets++=-1;
 281                     }
 282                     --targetCapacity;
 283                     count=0;
 284                     c=0;
 285                     break;
 286                 } else if(U16_IS_SURROGATE_LEAD(c)) {
 287                     /* continue collecting bytes for the trail surrogate */
 288                     c=0; /* avoid unnecessary surrogate handling below */
 289                 } else {
 290                     /* fall through to error handling for an unmatched trail surrogate */
 291                     break;
 292                 }
 293             } else if(count==4) {
 294                 c=((UChar)p[0]<<8)|p[1];
 295                 trail=((UChar)p[2]<<8)|p[3];
 296                 if(U16_IS_TRAIL(trail)) {
 297                     /* output the surrogate pair */
 298                     *target++=c;
 299                     if(targetCapacity>=2) {
 300                         *target++=trail;
 301                         if(offsets!=NULL) {
 302                             *offsets++=-1;
 303                             *offsets++=-1;
 304                         }
 305                         targetCapacity-=2;
 306                     } else /* targetCapacity==1 */ {
 307                         targetCapacity=0;
 308                         cnv->UCharErrorBuffer[0]=trail;
 309                         cnv->UCharErrorBufferLength=1;
 310                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 311                     }
 312                     count=0;
 313                     c=0;
 314                     break;
 315                 } else {
 316                     /* unmatched lead surrogate, handle here for consistent toUBytes[] */
 317                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 318
 319                     /* back out reading the code unit after it */
 320                     if(((const uint8_t *)pArgs->source-source)>=2) {
 321                         source-=2;
 322                     } else {
 323                         /*
 324                          * if the trail unit's first byte was in a previous buffer, then
 325                          * we need to put it into a special place because toUBytes[] will be
 326                          * used for the lead unit's bytes
 327                          */
 328                         cnv->toUnicodeStatus=0x100|p[2];
 329                         --source;
 330                     }
 331                     cnv->toULength=2;
 332
 333                     /* write back the updated pointers */
 334                     pArgs->source=(const char *)source;
 335                     pArgs->target=target;
 336                     pArgs->offsets=offsets;
 337                     return;
 338                 }
 339             }
 340         } while(length>0);
 341         cnv->toULength=(int8_t)count;
 342     }
 343
 344     /* copy an even number of bytes for complete UChars */
 345     count=2*targetCapacity;
 346     if(count>length) {
 347         count=length&~1;
 348     }
 349     if(c==0 && count>0) {
 350         length-=count;
 351         count>>=1;
 352         targetCapacity-=count;
 353         if(offsets==NULL) {
 354             do {
 355                 c=((UChar)source[0]<<8)|source[1];
 356                 source+=2;
 357                 if(U16_IS_SINGLE(c)) {
 358                     *target++=c;
 359                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
 360                           U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
 361                 ) {
 362                     source+=2;
 363                     --count;
 364                     *target++=c;
 365                     *target++=trail;
 366                 } else {
 367                     break;
 368                 }
 369             } while(--count>0);
 370         } else {
 371             do {
 372                 c=((UChar)source[0]<<8)|source[1];
 373                 source+=2;
 374                 if(U16_IS_SINGLE(c)) {
 375                     *target++=c;
 376                     *offsets++=sourceIndex;
 377                     sourceIndex+=2;
 378                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
 379                           U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
 380                 ) {
 381                     source+=2;
 382                     --count;
 383                     *target++=c;
 384                     *target++=trail;
 385                     *offsets++=sourceIndex;
 386                     *offsets++=sourceIndex;
 387                     sourceIndex+=4;
 388                 } else {
 389                     break;
 390                 }
 391             } while(--count>0);
 392         }
 393
 394         if(count==0) {
 395             /* done with the loop for complete UChars */
 396             c=0;
 397         } else {
 398             /* keep c for surrogate handling, trail will be set there */
 399             length+=2*(count-1); /* one more byte pair was consumed than count decremented */
 400             targetCapacity+=count;
 401         }
 402     }
 403
 404     if(c!=0) {
 405         /*
 406          * c is a surrogate, and
 407          * - source or target too short
 408          * - or the surrogate is unmatched
 409          */
 410         cnv->toUBytes[0]=(uint8_t)(c>>8);
 411         cnv->toUBytes[1]=(uint8_t)c;
 412         cnv->toULength=2;
 413
 414         if(U16_IS_SURROGATE_LEAD(c)) {
 415             if(length>=2) {
 416                 if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])) {
 417                     /* output the surrogate pair, will overflow (see conditions comment above) */
 418                     source+=2;
 419                     length-=2;
 420                     *target++=c;
 421                     if(offsets!=NULL) {
 422                         *offsets++=sourceIndex;
 423                     }
 424                     cnv->UCharErrorBuffer[0]=trail;
 425                     cnv->UCharErrorBufferLength=1;
 426                     cnv->toULength=0;
 427                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 428                 } else {
 429                     /* unmatched lead surrogate */
 430                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 431                 }
 432             } else {
 433                 /* see if the trail surrogate is in the next buffer */
 434             }
 435         } else {
 436             /* unmatched trail surrogate */
 437             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 438         }
 439     }
 440
 441     if(U_SUCCESS(*pErrorCode)) {
 442         /* check for a remaining source byte */
 443         if(length>0) {
 444             if(targetCapacity==0) {
 445                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 446             } else {
 447                 /* it must be length==1 because otherwise the above would have copied more */
 448                 cnv->toUBytes[cnv->toULength++]=*source++;
 449             }
 450         }
 451     }
 452
 453     /* write back the updated pointers */
 454     pArgs->source=(const char *)source;
 455     pArgs->target=target;
 456     pArgs->offsets=offsets;
 457 }
 458
 459 static UChar32
 460 _UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
 461     const uint8_t *s, *sourceLimit;
 462     UChar32 c;
 463
 464     s=(const uint8_t *)pArgs->source;
 465     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
 466
 467     if(s>=sourceLimit) {
 468         /* no input */
 469         *err=U_INDEX_OUTOFBOUNDS_ERROR;
 470         return 0xffff;
 471     }
 472
 473     if(s+2>sourceLimit) {
 474         /* only one byte: truncated UChar */
 475         pArgs->converter->toUBytes[0]=*s++;
 476         pArgs->converter->toULength=1;
 477         pArgs->source=(const char *)s;
 478         *err = U_TRUNCATED_CHAR_FOUND;
 479         return 0xffff;
 480     }
 481
 482     /* get one UChar */
 483     c=((UChar32)*s<<8)|s[1];
 484     s+=2;
 485
 486     /* check for a surrogate pair */
 487     if(U_IS_SURROGATE(c)) {
 488         if(U16_IS_SURROGATE_LEAD(c)) {
 489             if(s+2<=sourceLimit) {
 490                 UChar trail;
 491
 492                 /* get a second UChar and see if it is a trail surrogate */
 493                 trail=((UChar)*s<<8)|s[1];
 494                 if(U16_IS_TRAIL(trail)) {
 495                     c=U16_GET_SUPPLEMENTARY(c, trail);
 496                     s+=2;
 497                 } else {
 498                     /* unmatched lead surrogate */
 499                     c=-2;
 500                 }
 501             } else {
 502                 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
 503                 uint8_t *bytes=pArgs->converter->toUBytes;
 504                 s-=2;
 505                 pArgs->converter->toULength=(int8_t)(sourceLimit-s);
 506                 do {
 507                     *bytes++=*s++;
 508                 } while(s<sourceLimit);
 509
 510                 c=0xffff;
 511                 *err=U_TRUNCATED_CHAR_FOUND;
 512             }
 513         } else {
 514             /* unmatched trail surrogate */
 515             c=-2;
 516         }
 517
 518         if(c<0) {
 519             /* write the unmatched surrogate */
 520             uint8_t *bytes=pArgs->converter->toUBytes;
 521             pArgs->converter->toULength=2;
 522             *bytes=*(s-2);
 523             bytes[1]=*(s-1);
 524
 525             c=0xffff;
 526             *err=U_ILLEGAL_CHAR_FOUND;
 527         }
 528     }
 529
 530     pArgs->source=(const char *)s;
 531     return c;
 532 }
 533
 534 static const UConverterImpl _UTF16BEImpl={
 535     UCNV_UTF16_BigEndian,
 536
 537     NULL,
 538     NULL,
 539
 540     NULL,
 541     NULL,
 542     NULL,
 543
 544     _UTF16BEToUnicodeWithOffsets,
 545     _UTF16BEToUnicodeWithOffsets,
 546     _UTF16BEFromUnicodeWithOffsets,
 547     _UTF16BEFromUnicodeWithOffsets,
 548     _UTF16BEGetNextUChar,
 549
 550     NULL,
 551     NULL,
 552     NULL,
 553     NULL,
 554     ucnv_getCompleteUnicodeSet
 555 };
 556
 557 static const UConverterStaticData _UTF16BEStaticData={
 558     sizeof(UConverterStaticData),
 559     "UTF-16BE",
 560     1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2,
 561     { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE,
 562     0,
 563     0,
 564     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 565 };
 566
 567
 568 const UConverterSharedData _UTF16BEData={
 569     sizeof(UConverterSharedData), ~((uint32_t) 0),
 570     NULL, NULL, &_UTF16BEStaticData, FALSE, &_UTF16BEImpl,
 571     0
 572 };
 573
 574 /* UTF-16LE ----------------------------------------------------------------- */
 575
 576 static void
 577 _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
 578                                UErrorCode *pErrorCode) {
 579     UConverter *cnv;
 580     const UChar *source;
 581     uint8_t *target;
 582     int32_t *offsets;
 583
 584     int32_t targetCapacity, length, count, sourceIndex;
 585     UChar c, trail;
 586     char overflow[4];
 587
 588     source=pArgs->source;
 589     length=pArgs->sourceLimit-source;
 590     if(length<=0) {
 591         /* no input, nothing to do */
 592         return;
 593     }
 594
 595     targetCapacity=pArgs->targetLimit-pArgs->target;
 596     if(targetCapacity<=0) {
 597         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 598         return;
 599     }
 600
 601     cnv=pArgs->converter;
 602     target=(uint8_t *)pArgs->target;
 603     offsets=pArgs->offsets;
 604     sourceIndex=0;
 605
 606     /* c!=0 indicates in several places outside the main loops that a surrogate was found */
 607
 608     if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
 609         /* the last buffer ended with a lead surrogate, output the surrogate pair */
 610         ++source;
 611         --length;
 612         target[0]=(uint8_t)c;
 613         target[1]=(uint8_t)(c>>8);
 614         target[2]=(uint8_t)trail;
 615         target[3]=(uint8_t)(trail>>8);
 616         target+=4;
 617         targetCapacity-=4;
 618         if(offsets!=NULL) {
 619             *offsets++=-1;
 620             *offsets++=-1;
 621             *offsets++=-1;
 622             *offsets++=-1;
 623         }
 624         sourceIndex=1;
 625         cnv->fromUChar32=c=0;
 626     }
 627
 628     /* copy an even number of bytes for complete UChars */
 629     count=2*length;
 630     if(count>targetCapacity) {
 631         count=targetCapacity&~1;
 632     }
 633     /* count is even */
 634     if(c==0) {
 635         targetCapacity-=count;
 636         count>>=1;
 637         length-=count;
 638
 639         if(offsets==NULL) {
 640             while(count>0) {
 641                 c=*source++;
 642                 if(U16_IS_SINGLE(c)) {
 643                     target[0]=(uint8_t)c;
 644                     target[1]=(uint8_t)(c>>8);
 645                     target+=2;
 646                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
 647                     ++source;
 648                     --count;
 649                     target[0]=(uint8_t)c;
 650                     target[1]=(uint8_t)(c>>8);
 651                     target[2]=(uint8_t)trail;
 652                     target[3]=(uint8_t)(trail>>8);
 653                     target+=4;
 654                 } else {
 655                     break;
 656                 }
 657                 --count;
 658             }
 659         } else {
 660             while(count>0) {
 661                 c=*source++;
 662                 if(U16_IS_SINGLE(c)) {
 663                     target[0]=(uint8_t)c;
 664                     target[1]=(uint8_t)(c>>8);
 665                     target+=2;
 666                     *offsets++=sourceIndex;
 667                     *offsets++=sourceIndex++;
 668                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
 669                     ++source;
 670                     --count;
 671                     target[0]=(uint8_t)c;
 672                     target[1]=(uint8_t)(c>>8);
 673                     target[2]=(uint8_t)trail;
 674                     target[3]=(uint8_t)(trail>>8);
 675                     target+=4;
 676                     *offsets++=sourceIndex;
 677                     *offsets++=sourceIndex;
 678                     *offsets++=sourceIndex;
 679                     *offsets++=sourceIndex;
 680                     sourceIndex+=2;
 681                 } else {
 682                     break;
 683                 }
 684                 --count;
 685             }
 686         }
 687
 688         if(count==0) {
 689             /* done with the loop for complete UChars */
 690             if(length>0 && targetCapacity>0) {
 691                 /*
 692                  * there is more input and some target capacity -
 693                  * it must be targetCapacity==1 because otherwise
 694                  * the above would have copied more;
 695                  * prepare for overflow output
 696                  */
 697                 if(U16_IS_SINGLE(c=*source++)) {
 698                     overflow[0]=(char)c;
 699                     overflow[1]=(char)(c>>8);
 700                     length=2; /* 2 bytes to output */
 701                     c=0;
 702                 /* } else { keep c for surrogate handling, length will be set there */
 703                 }
 704             } else {
 705                 length=0;
 706                 c=0;
 707             }
 708         } else {
 709             /* keep c for surrogate handling, length will be set there */
 710             targetCapacity+=2*count;
 711         }
 712     } else {
 713         length=0; /* from here on, length counts the bytes in overflow[] */
 714     }
 715
 716     if(c!=0) {
 717         /*
 718          * c is a surrogate, and
 719          * - source or target too short
 720          * - or the surrogate is unmatched
 721          */
 722         length=0;
 723         if(U16_IS_SURROGATE_LEAD(c)) {
 724             if(source<pArgs->sourceLimit) {
 725                 if(U16_IS_TRAIL(trail=*source)) {
 726                     /* output the surrogate pair, will overflow (see conditions comment above) */
 727                     ++source;
 728                     overflow[0]=(char)c;
 729                     overflow[1]=(char)(c>>8);
 730                     overflow[2]=(char)trail;
 731                     overflow[3]=(char)(trail>>8);
 732                     length=4; /* 4 bytes to output */
 733                     c=0;
 734                 } else {
 735                     /* unmatched lead surrogate */
 736                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 737                 }
 738             } else {
 739                 /* see if the trail surrogate is in the next buffer */
 740             }
 741         } else {
 742             /* unmatched trail surrogate */
 743             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 744         }
 745         cnv->fromUChar32=c;
 746     }
 747
 748     if(length>0) {
 749         /* output length bytes with overflow (length>targetCapacity>0) */
 750         ucnv_fromUWriteBytes(cnv,
 751                              overflow, length,
 752                              (char **)&target, pArgs->targetLimit,
 753                              &offsets, sourceIndex,
 754                              pErrorCode);
 755         targetCapacity=pArgs->targetLimit-(char *)target;
 756     }
 757
 758     if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
 759         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 760     }
 761
 762     /* write back the updated pointers */
 763     pArgs->source=source;
 764     pArgs->target=(char *)target;
 765     pArgs->offsets=offsets;
 766 }
 767
 768 static void
 769 _UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
 770                              UErrorCode *pErrorCode) {
 771     UConverter *cnv;
 772     const uint8_t *source;
 773     UChar *target;
 774     int32_t *offsets;
 775
 776     int32_t targetCapacity, length, count, sourceIndex;
 777     UChar c, trail;
 778
 779     cnv=pArgs->converter;
 780     source=(const uint8_t *)pArgs->source;
 781     length=(const uint8_t *)pArgs->sourceLimit-source;
 782     if(length<=0 && cnv->toUnicodeStatus==0) {
 783         /* no input, nothing to do */
 784         return;
 785     }
 786
 787     targetCapacity=pArgs->targetLimit-pArgs->target;
 788     if(targetCapacity<=0) {
 789         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 790         return;
 791     }
 792
 793     target=pArgs->target;
 794     offsets=pArgs->offsets;
 795     sourceIndex=0;
 796     c=0;
 797
 798     /* complete a partial UChar or pair from the last call */
 799     if(cnv->toUnicodeStatus!=0) {
 800         /*
 801          * special case: single byte from a previous buffer,
 802          * where the byte turned out not to belong to a trail surrogate
 803          * and the preceding, unmatched lead surrogate was put into toUBytes[]
 804          * for error handling
 805          */
 806         cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
 807         cnv->toULength=1;
 808         cnv->toUnicodeStatus=0;
 809     }
 810     if((count=cnv->toULength)!=0) {
 811         uint8_t *p=cnv->toUBytes;
 812         do {
 813             p[count++]=*source++;
 814             ++sourceIndex;
 815             --length;
 816             if(count==2) {
 817                 c=((UChar)p[1]<<8)|p[0];
 818                 if(U16_IS_SINGLE(c)) {
 819                     /* output the BMP code point */
 820                     *target++=c;
 821                     if(offsets!=NULL) {
 822                         *offsets++=-1;
 823                     }
 824                     --targetCapacity;
 825                     count=0;
 826                     c=0;
 827                     break;
 828                 } else if(U16_IS_SURROGATE_LEAD(c)) {
 829                     /* continue collecting bytes for the trail surrogate */
 830                     c=0; /* avoid unnecessary surrogate handling below */
 831                 } else {
 832                     /* fall through to error handling for an unmatched trail surrogate */
 833                     break;
 834                 }
 835             } else if(count==4) {
 836                 c=((UChar)p[1]<<8)|p[0];
 837                 trail=((UChar)p[3]<<8)|p[2];
 838                 if(U16_IS_TRAIL(trail)) {
 839                     /* output the surrogate pair */
 840                     *target++=c;
 841                     if(targetCapacity>=2) {
 842                         *target++=trail;
 843                         if(offsets!=NULL) {
 844                             *offsets++=-1;
 845                             *offsets++=-1;
 846                         }
 847                         targetCapacity-=2;
 848                     } else /* targetCapacity==1 */ {
 849                         targetCapacity=0;
 850                         cnv->UCharErrorBuffer[0]=trail;
 851                         cnv->UCharErrorBufferLength=1;
 852                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 853                     }
 854                     count=0;
 855                     c=0;
 856                     break;
 857                 } else {
 858                     /* unmatched lead surrogate, handle here for consistent toUBytes[] */
 859                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 860
 861                     /* back out reading the code unit after it */
 862                     if(((const uint8_t *)pArgs->source-source)>=2) {
 863                         source-=2;
 864                     } else {
 865                         /*
 866                          * if the trail unit's first byte was in a previous buffer, then
 867                          * we need to put it into a special place because toUBytes[] will be
 868                          * used for the lead unit's bytes
 869                          */
 870                         cnv->toUnicodeStatus=0x100|p[2];
 871                         --source;
 872                     }
 873                     cnv->toULength=2;
 874
 875                     /* write back the updated pointers */
 876                     pArgs->source=(const char *)source;
 877                     pArgs->target=target;
 878                     pArgs->offsets=offsets;
 879                     return;
 880                 }
 881             }
 882         } while(length>0);
 883         cnv->toULength=(int8_t)count;
 884     }
 885
 886     /* copy an even number of bytes for complete UChars */
 887     count=2*targetCapacity;
 888     if(count>length) {
 889         count=length&~1;
 890     }
 891     if(c==0 && count>0) {
 892         length-=count;
 893         count>>=1;
 894         targetCapacity-=count;
 895         if(offsets==NULL) {
 896             do {
 897                 c=((UChar)source[1]<<8)|source[0];
 898                 source+=2;
 899                 if(U16_IS_SINGLE(c)) {
 900                     *target++=c;
 901                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
 902                           U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
 903                 ) {
 904                     source+=2;
 905                     --count;
 906                     *target++=c;
 907                     *target++=trail;
 908                 } else {
 909                     break;
 910                 }
 911             } while(--count>0);
 912         } else {
 913             do {
 914                 c=((UChar)source[1]<<8)|source[0];
 915                 source+=2;
 916                 if(U16_IS_SINGLE(c)) {
 917                     *target++=c;
 918                     *offsets++=sourceIndex;
 919                     sourceIndex+=2;
 920                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
 921                           U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
 922                 ) {
 923                     source+=2;
 924                     --count;
 925                     *target++=c;
 926                     *target++=trail;
 927                     *offsets++=sourceIndex;
 928                     *offsets++=sourceIndex;
 929                     sourceIndex+=4;
 930                 } else {
 931                     break;
 932                 }
 933             } while(--count>0);
 934         }
 935
 936         if(count==0) {
 937             /* done with the loop for complete UChars */
 938             c=0;
 939         } else {
 940             /* keep c for surrogate handling, trail will be set there */
 941             length+=2*(count-1); /* one more byte pair was consumed than count decremented */
 942             targetCapacity+=count;
 943         }
 944     }
 945
 946     if(c!=0) {
 947         /*
 948          * c is a surrogate, and
 949          * - source or target too short
 950          * - or the surrogate is unmatched
 951          */
 952         cnv->toUBytes[0]=(uint8_t)c;
 953         cnv->toUBytes[1]=(uint8_t)(c>>8);
 954         cnv->toULength=2;
 955
 956         if(U16_IS_SURROGATE_LEAD(c)) {
 957             if(length>=2) {
 958                 if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])) {
 959                     /* output the surrogate pair, will overflow (see conditions comment above) */
 960                     source+=2;
 961                     length-=2;
 962                     *target++=c;
 963                     if(offsets!=NULL) {
 964                         *offsets++=sourceIndex;
 965                     }
 966                     cnv->UCharErrorBuffer[0]=trail;
 967                     cnv->UCharErrorBufferLength=1;
 968                     cnv->toULength=0;
 969                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 970                 } else {
 971                     /* unmatched lead surrogate */
 972                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 973                 }
 974             } else {
 975                 /* see if the trail surrogate is in the next buffer */
 976             }
 977         } else {
 978             /* unmatched trail surrogate */
 979             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 980         }
 981     }
 982
 983     if(U_SUCCESS(*pErrorCode)) {
 984         /* check for a remaining source byte */
 985         if(length>0) {
 986             if(targetCapacity==0) {
 987                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 988             } else {
 989                 /* it must be length==1 because otherwise the above would have copied more */
 990                 cnv->toUBytes[cnv->toULength++]=*source++;
 991             }
 992         }
 993     }
 994
 995     /* write back the updated pointers */
 996     pArgs->source=(const char *)source;
 997     pArgs->target=target;
 998     pArgs->offsets=offsets;
 999 }
1000
1001 static UChar32
1002 _UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
1003     const uint8_t *s, *sourceLimit;
1004     UChar32 c;
1005
1006     s=(const uint8_t *)pArgs->source;
1007     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1008
1009     if(s>=sourceLimit) {
1010         /* no input */
1011         *err=U_INDEX_OUTOFBOUNDS_ERROR;
1012         return 0xffff;
1013     }
1014
1015     if(s+2>sourceLimit) {
1016         /* only one byte: truncated UChar */
1017         pArgs->converter->toUBytes[0]=*s++;
1018         pArgs->converter->toULength=1;
1019         pArgs->source=(const char *)s;
1020         *err = U_TRUNCATED_CHAR_FOUND;
1021         return 0xffff;
1022     }
1023
1024     /* get one UChar */
1025     c=((UChar32)s[1]<<8)|*s;
1026     s+=2;
1027
1028     /* check for a surrogate pair */
1029     if(U_IS_SURROGATE(c)) {
1030         if(U16_IS_SURROGATE_LEAD(c)) {
1031             if(s+2<=sourceLimit) {
1032                 UChar trail;
1033
1034                 /* get a second UChar and see if it is a trail surrogate */
1035                 trail=((UChar)s[1]<<8)|*s;
1036                 if(U16_IS_TRAIL(trail)) {
1037                     c=U16_GET_SUPPLEMENTARY(c, trail);
1038                     s+=2;
1039                 } else {
1040                     /* unmatched lead surrogate */
1041                     c=-2;
1042                 }
1043             } else {
1044                 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
1045                 uint8_t *bytes=pArgs->converter->toUBytes;
1046                 s-=2;
1047                 pArgs->converter->toULength=(int8_t)(sourceLimit-s);
1048                 do {
1049                     *bytes++=*s++;
1050                 } while(s<sourceLimit);
1051
1052                 c=0xffff;
1053                 *err=U_TRUNCATED_CHAR_FOUND;
1054             }
1055         } else {
1056             /* unmatched trail surrogate */
1057             c=-2;
1058         }
1059
1060         if(c<0) {
1061             /* write the unmatched surrogate */
1062             uint8_t *bytes=pArgs->converter->toUBytes;
1063             pArgs->converter->toULength=2;
1064             *bytes=*(s-2);
1065             bytes[1]=*(s-1);
1066
1067             c=0xffff;
1068             *err=U_ILLEGAL_CHAR_FOUND;
1069         }
1070     }
1071
1072     pArgs->source=(const char *)s;
1073     return c;
1074 }
1075
1076 static const UConverterImpl _UTF16LEImpl={
1077     UCNV_UTF16_LittleEndian,
1078
1079     NULL,
1080     NULL,
1081
1082     NULL,
1083     NULL,
1084     NULL,
1085
1086     _UTF16LEToUnicodeWithOffsets,
1087     _UTF16LEToUnicodeWithOffsets,
1088     _UTF16LEFromUnicodeWithOffsets,
1089     _UTF16LEFromUnicodeWithOffsets,
1090     _UTF16LEGetNextUChar,
1091
1092     NULL,
1093     NULL,
1094     NULL,
1095     NULL,
1096     ucnv_getCompleteUnicodeSet
1097 };
1098
1099
1100 static const UConverterStaticData _UTF16LEStaticData={
1101     sizeof(UConverterStaticData),
1102     "UTF-16LE",
1103     1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2,
1104     { 0xfd, 0xff, 0, 0 },2,FALSE,FALSE,
1105     0,
1106     0,
1107     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1108 };
1109
1110
1111 const UConverterSharedData _UTF16LEData={
1112     sizeof(UConverterSharedData), ~((uint32_t) 0),
1113     NULL, NULL, &_UTF16LEStaticData, FALSE, &_UTF16LEImpl,
1114     0
1115 };
1116
1117 /* UTF-16 (Detect BOM) ------------------------------------------------------ */
1118
1119 /*
1120  * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE
1121  * accordingly.
1122  * This is a simpler version of the BOM detection in the UTF-32 converter
1123  * (which is not part of this file), with fewer states for shorter BOMs.
1124  *
1125  * State values:
1126  * 0    initial state
1127  * 1    saw FE
1128  * 2..4 -
1129  * 5    saw FF
1130  * 6..7 -
1131  * 8    UTF-16BE mode
1132  * 9    UTF-16LE mode
1133  *
1134  * During detection: state&3==number of matching bytes so far.
1135  *
1136  * On output, emit U+FEFF as the first code point.
1137  */
1138
1139 static void
1140 _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {
1141     if(choice<=UCNV_RESET_TO_UNICODE) {
1142         /* reset toUnicode: state=0 */
1143         cnv->mode=0;
1144     }
1145     if(choice!=UCNV_RESET_TO_UNICODE) {
1146         /* reset fromUnicode: prepare to output the UTF-16PE BOM */
1147         cnv->charErrorBufferLength=2;
1148 #if U_IS_BIG_ENDIAN
1149         cnv->charErrorBuffer[0]=0xfe;
1150         cnv->charErrorBuffer[1]=0xff;
1151 #else
1152         cnv->charErrorBuffer[0]=0xff;
1153         cnv->charErrorBuffer[1]=0xfe;
1154 #endif
1155     }
1156 }
1157
1158 static void
1159 _UTF16Open(UConverter *cnv,
1160            const char *name,
1161            const char *locale,
1162            uint32_t options,
1163            UErrorCode *pErrorCode) {
1164     _UTF16Reset(cnv, UCNV_RESET_BOTH);
1165 }
1166
/*
 * The two UTF-16 byte order marks, each padded with zeros to 4 bytes.
 * _UTF16ToUnicodeWithOffsets() indexes this table with its detection state
 * (state 1 expects utf16BOM[1], state 5 expects utf16BOM[5]) and uses
 * utf16BOM+(state&4) to replay already-consumed BOM bytes.
 */
static const char utf16BOM[8]={
    /* [0..3] big-endian BOM: FE FF */
    (char)0xfe, (char)0xff, 0, 0,
    /* [4..7] little-endian BOM: FF FE */
    (char)0xff, (char)0xfe, 0, 0
};
1168
1169 static void
1170 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1171                            UErrorCode *pErrorCode) {
1172     UConverter *cnv=pArgs->converter;
1173     const char *source=pArgs->source;
1174     const char *sourceLimit=pArgs->sourceLimit;
1175     int32_t *offsets=pArgs->offsets;
1176
1177     int32_t state, offsetDelta;
1178     char b;
1179
1180     state=cnv->mode;
1181
1182     /*
1183      * If we detect a BOM in this buffer, then we must add the BOM size to the
1184      * offsets because the actual converter function will not see and count the BOM.
1185      * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1186      */
1187     offsetDelta=0;
1188
1189     while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
1190         switch(state) {
1191         case 0:
1192             b=*source;
1193             if(b==(char)0xfe) {
1194                 state=1; /* could be FE FF */
1195             } else if(b==(char)0xff) {
1196                 state=5; /* could be FF FE */
1197             } else {
1198                 state=8; /* default to UTF-16BE */
1199                 continue;
1200             }
1201             ++source;
1202             break;
1203         case 1:
1204         case 5:
1205             if(*source==utf16BOM[state]) {
1206                 ++source;
1207                 if(state==1) {
1208                     state=8; /* detect UTF-16BE */
1209                     offsetDelta=source-pArgs->source;
1210                 } else if(state==5) {
1211                     state=9; /* detect UTF-16LE */
1212                     offsetDelta=source-pArgs->source;
1213                 }
1214             } else {
1215                 /* switch to UTF-16BE and pass the previous bytes */
1216                 if(source!=pArgs->source) {
1217                     /* just reset the source */
1218                     source=pArgs->source;
1219                 } else {
1220                     UBool oldFlush=pArgs->flush;
1221
1222                     /* the first byte is from a previous buffer, replay it first */
1223                     pArgs->source=utf16BOM+(state&4); /* select the correct BOM */
1224                     pArgs->sourceLimit=pArgs->source+1; /* replay previous byte */
1225                     pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */
1226
1227                     _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
1228
1229                     /* restore real pointers; pArgs->source will be set in case 8/9 */
1230                     pArgs->sourceLimit=sourceLimit;
1231                     pArgs->flush=oldFlush;
1232                 }
1233                 state=8;
1234                 continue;
1235             }
1236             break;
1237         case 8:
1238             /* call UTF-16BE */
1239             pArgs->source=source;
1240             _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
1241             source=pArgs->source;
1242             break;
1243         case 9:
1244             /* call UTF-16LE */
1245             pArgs->source=source;
1246             _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
1247             source=pArgs->source;
1248             break;
1249         default:
1250             break; /* does not occur */
1251         }
1252     }
1253
1254     /* add BOM size to offsets - see comment at offsetDelta declaration */
1255     if(offsets!=NULL && offsetDelta!=0) {
1256         int32_t *offsetsLimit=pArgs->offsets;
1257         while(offsets<offsetsLimit) {
1258             *offsets++ += offsetDelta;
1259         }
1260     }
1261
1262     pArgs->source=source;
1263
1264     if(source==sourceLimit && pArgs->flush) {
1265         /* handle truncated input */
1266         switch(state) {
1267         case 0:
1268             break; /* no input at all, nothing to do */
1269         case 8:
1270             _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
1271             break;
1272         case 9:
1273             _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
1274             break;
1275         default:
1276             /* handle 0<state<8: call UTF-16BE with too-short input */
1277             pArgs->source=utf16BOM+(state&4); /* select the correct BOM */
1278             pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */
1279
1280             /* no offsets: not enough for output */
1281             _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
1282             pArgs->source=source;
1283             pArgs->sourceLimit=sourceLimit;
1284             state=8;
1285             break;
1286         }
1287     }
1288
1289     cnv->mode=state;
1290 }
1291
1292 static UChar32
1293 _UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs,
1294                    UErrorCode *pErrorCode) {
1295     switch(pArgs->converter->mode) {
1296     case 8:
1297         return _UTF16BEGetNextUChar(pArgs, pErrorCode);
1298     case 9:
1299         return _UTF16LEGetNextUChar(pArgs, pErrorCode);
1300     default:
1301         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1302     }
1303 }
1304
1305 static const UConverterImpl _UTF16Impl = {
1306     UCNV_UTF16,
1307
1308     NULL,
1309     NULL,
1310
1311     _UTF16Open,
1312     NULL,
1313     _UTF16Reset,
1314
1315     _UTF16ToUnicodeWithOffsets,
1316     _UTF16ToUnicodeWithOffsets,
1317     _UTF16PEFromUnicodeWithOffsets,
1318     _UTF16PEFromUnicodeWithOffsets,
1319     _UTF16GetNextUChar,
1320
1321     NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
1322     NULL,
1323     NULL,
1324     NULL,
1325     ucnv_getCompleteUnicodeSet
1326 };
1327
1328 static const UConverterStaticData _UTF16StaticData = {
1329     sizeof(UConverterStaticData),
1330     "UTF-16",
1331     0, /* ### TODO review correctness of all Unicode CCSIDs */
1332     UCNV_IBM, UCNV_UTF16, 2, 2,
1333 #if U_IS_BIG_ENDIAN
1334     { 0xff, 0xfd, 0, 0 }, 2,
1335 #else
1336     { 0xfd, 0xff, 0, 0 }, 2,
1337 #endif
1338     FALSE, FALSE,
1339     0,
1340     0,
1341     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1342 };
1343
1344 const UConverterSharedData _UTF16Data = {
1345     sizeof(UConverterSharedData), ~((uint32_t) 0),
1346     NULL, NULL, &_UTF16StaticData, FALSE, &_UTF16Impl,
1347     0
1348 };
1349
1350 #endif