icuSources/common/ucnv_u16.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 **********************************************************************
   5 *   Copyright (C) 2002-2015, International Business Machines
   6 *   Corporation and others.  All Rights Reserved.
   7 **********************************************************************
   8 *   file name:  ucnv_u16.cpp
   9 *   encoding:   UTF-8
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2002jul01
  14 *   created by: Markus W. Scherer
  15 *
  16 *   UTF-16 converter implementation. Used to be in ucnv_utf.c.
  17 */
  18
  19 #include "unicode/utypes.h"
  20
  21 #if !UCONFIG_NO_CONVERSION
  22
  23 #include "unicode/ucnv.h"
  24 #include "unicode/uversion.h"
  25 #include "ucnv_bld.h"
  26 #include "ucnv_cnv.h"
  27 #include "cmemory.h"
  28
  29 enum {
         /*
          * Marker value for cnv->fromUnicodeStatus: the fromUnicode functions in
          * this file write a BOM first when they find it, then clear the status
          * to 0. _UTF16BEReset() sets it for converter version 1.
          */
  30     UCNV_NEED_TO_WRITE_BOM=1
  31 };
  32
  33 U_CDECL_BEGIN
  34 /*
  35  * The UTF-16 toUnicode implementation is also used for the Java-specific
  36  * "with BOM" variants of UTF-16BE and UTF-16LE.
      *
      * Forward declaration: the UTF-16BE and UTF-16LE toUnicode functions below
      * pass their arguments on to this function while the converter's mode
      * is below 8.
  37  */
  38 static void  U_CALLCONV
  39 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
  40                            UErrorCode *pErrorCode);
  41
  42 /* UTF-16BE ----------------------------------------------------------------- */
  43
  44 #if U_IS_BIG_ENDIAN
     /*
      * Alias for the fromUnicode function that matches the byte order selected by
      * U_IS_BIG_ENDIAN ("PE" presumably stands for platform endianness - confirm):
      * the BE function when the macro is nonzero, the LE function otherwise.
      */
  45 #   define _UTF16PEFromUnicodeWithOffsets   _UTF16BEFromUnicodeWithOffsets
  46 #else
  47 #   define _UTF16PEFromUnicodeWithOffsets   _UTF16LEFromUnicodeWithOffsets
  48 #endif
  49
  50
  51 static void  U_CALLCONV
  52 _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
  53                                UErrorCode *pErrorCode) {
         /*
          * Writes the UChars in [pArgs->source, pArgs->sourceLimit) as big-endian
          * UTF-16 bytes (high byte first) to [pArgs->target, pArgs->targetLimit).
          *
          * - If cnv->fromUnicodeStatus is UCNV_NEED_TO_WRITE_BOM, the bytes FE FF
          *   are written first through ucnv_fromUWriteBytes() (with -1 passed as
          *   the source index) and the status is cleared.
          * - If pArgs->offsets is not NULL, one entry is written per output byte:
          *   the source index of the code unit (of the lead unit for a pair), or
          *   -1 for a pair whose lead surrogate was left over from an earlier call.
          * - A lead surrogate that is the last input unit is kept in
          *   cnv->fromUChar32 without an error, so that the next call can pair it.
          * - An unmatched surrogate is stored in cnv->fromUChar32 and
          *   *pErrorCode is set to U_ILLEGAL_CHAR_FOUND.
          * - The bytes of a character that does not fit completely are handed to
          *   ucnv_fromUWriteBytes(); U_BUFFER_OVERFLOW_ERROR is set when input
          *   remains and no target capacity is left.
          *
          * pArgs->source, ->target and ->offsets are written back on the normal
          * exit path; the two early returns below happen before any input is
          * consumed and do not write the local pointers back.
          */
  54     UConverter *cnv;
  55     const UChar *source;
  56     char *target;
  57     int32_t *offsets;
  58
  59     uint32_t targetCapacity, length, sourceIndex;
  60     UChar c, trail;
  61     char overflow[4];
  62
  63     source=pArgs->source;
  64     length=(int32_t)(pArgs->sourceLimit-source);
         /* length is unsigned (uint32_t), so the next test is effectively length==0 */
  65     if(length<=0) {
  66         /* no input, nothing to do */
  67         return;
  68     }
  69
  70     cnv=pArgs->converter;
  71
  72     /* write the BOM if necessary */
  73     if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
  74         static const char bom[]={ (char)0xfe, (char)0xff };
             /* this call advances pArgs->target and pArgs->offsets directly */
  75         ucnv_fromUWriteBytes(cnv,
  76                              bom, 2,
  77                              &pArgs->target, pArgs->targetLimit,
  78                              &pArgs->offsets, -1,
  79                              pErrorCode);
  80         cnv->fromUnicodeStatus=0;
  81     }
  82
  83     target=pArgs->target;
  84     if(target >= pArgs->targetLimit) {
  85         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  86         return;
  87     }
  88
  89     targetCapacity=(uint32_t)(pArgs->targetLimit-target);
  90     offsets=pArgs->offsets;
  91     sourceIndex=0;
  92
  93     /* c!=0 indicates in several places outside the main loops that a surrogate was found */
  94
  95     if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
  96         /* the last buffer ended with a lead surrogate, output the surrogate pair */
  97         ++source;
  98         --length;
  99         target[0]=(uint8_t)(c>>8);
 100         target[1]=(uint8_t)c;
 101         target[2]=(uint8_t)(trail>>8);
 102         target[3]=(uint8_t)trail;
 103         target+=4;
 104         targetCapacity-=4;
 105         if(offsets!=NULL) {
                 /* the pair started in an earlier buffer, so it has no index in this one */
 106             *offsets++=-1;
 107             *offsets++=-1;
 108             *offsets++=-1;
 109             *offsets++=-1;
 110         }
             /* the trail unit at source index 0 has been consumed */
 111         sourceIndex=1;
 112         cnv->fromUChar32=c=0;
 113     }
 114
 115     if(c==0) {
 116         /* copy an even number of bytes for complete UChars */
 117         uint32_t count=2*length;
 118         if(count>targetCapacity) {
 119             count=targetCapacity&~1;
 120         }
 121         /* count is even */
             /*
              * targetCapacity and length are reduced up front for everything the
              * loops below are expected to copy; count becomes a number of UChars.
              * If a loop stops early, targetCapacity is corrected after the loops.
              */
 122         targetCapacity-=count;
 123         count>>=1;
 124         length-=count;
 125
 126         if(offsets==NULL) {
 127             while(count>0) {
 128                 c=*source++;
 129                 if(U16_IS_SINGLE(c)) {
 130                     target[0]=(uint8_t)(c>>8);
 131                     target[1]=(uint8_t)c;
 132                     target+=2;
                     /* count>=2: the trail unit must also be within the units budgeted above */
 133                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
 134                     ++source;
 135                     --count;
 136                     target[0]=(uint8_t)(c>>8);
 137                     target[1]=(uint8_t)c;
 138                     target[2]=(uint8_t)(trail>>8);
 139                     target[3]=(uint8_t)trail;
 140                     target+=4;
 141                 } else {
                         /* c is a surrogate that cannot be written here; handled after the loops */
 142                     break;
 143                 }
 144                 --count;
 145             }
 146         } else {
                 /* same loop as above, additionally writing one offset per output byte */
 147             while(count>0) {
 148                 c=*source++;
 149                 if(U16_IS_SINGLE(c)) {
 150                     target[0]=(uint8_t)(c>>8);
 151                     target[1]=(uint8_t)c;
 152                     target+=2;
 153                     *offsets++=sourceIndex;
 154                     *offsets++=sourceIndex++;
 155                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
 156                     ++source;
 157                     --count;
 158                     target[0]=(uint8_t)(c>>8);
 159                     target[1]=(uint8_t)c;
 160                     target[2]=(uint8_t)(trail>>8);
 161                     target[3]=(uint8_t)trail;
 162                     target+=4;
                         /* all four bytes of the pair map to the index of the lead unit */
 163                     *offsets++=sourceIndex;
 164                     *offsets++=sourceIndex;
 165                     *offsets++=sourceIndex;
 166                     *offsets++=sourceIndex;
 167                     sourceIndex+=2;
 168                 } else {
 169                     break;
 170                 }
 171                 --count;
 172             }
 173         }
 174
 175         if(count==0) {
 176             /* done with the loop for complete UChars */
 177             if(length>0 && targetCapacity>0) {
 178                 /*
 179                  * there is more input and some target capacity -
 180                  * it must be targetCapacity==1 because otherwise
 181                  * the above would have copied more;
 182                  * prepare for overflow output
 183                  */
 184                 if(U16_IS_SINGLE(c=*source++)) {
 185                     overflow[0]=(char)(c>>8);
 186                     overflow[1]=(char)c;
 187                     length=2; /* 2 bytes to output */
 188                     c=0;
 189                 /* } else { keep c for surrogate handling, length will be set there */
 190                 }
 191             } else {
 192                 length=0;
 193                 c=0;
 194             }
 195         } else {
 196             /* keep c for surrogate handling, length will be set there */
                 /* give back the capacity that was reserved for the units not written (including c) */
 197             targetCapacity+=2*count;
 198         }
 199     } else {
 200         length=0; /* from here on, length counts the bytes in overflow[] */
 201     }
 202
 203     if(c!=0) {
 204         /*
 205          * c is a surrogate, and
 206          * - source or target too short
 207          * - or the surrogate is unmatched
 208          */
 209         length=0;
 210         if(U16_IS_SURROGATE_LEAD(c)) {
 211             if(source<pArgs->sourceLimit) {
 212                 if(U16_IS_TRAIL(trail=*source)) {
 213                     /* output the surrogate pair, will overflow (see conditions comment above) */
 214                     ++source;
 215                     overflow[0]=(char)(c>>8);
 216                     overflow[1]=(char)c;
 217                     overflow[2]=(char)(trail>>8);
 218                     overflow[3]=(char)trail;
 219                     length=4; /* 4 bytes to output */
 220                     c=0;
 221                 } else {
 222                     /* unmatched lead surrogate */
 223                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 224                 }
 225             } else {
 226                 /* see if the trail surrogate is in the next buffer */
 227             }
 228         } else {
 229             /* unmatched trail surrogate */
 230             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 231         }
             /* c is 0 after a complete pair, otherwise the pending or unmatched surrogate */
 232         cnv->fromUChar32=c;
 233     }
 234
 235     if(length>0) {
 236         /* output length bytes with overflow (length>targetCapacity>0) */
 237         ucnv_fromUWriteBytes(cnv,
 238                              overflow, length,
 239                              (char **)&target, pArgs->targetLimit,
 240                              &offsets, sourceIndex,
 241                              pErrorCode);
 242         targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
 243     }
 244
         /* report an overflow only if no other error is set and unread input remains */
 245     if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
 246         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 247     }
 248
 249     /* write back the updated pointers */
 250     pArgs->source=source;
 251     pArgs->target=(char *)target;
 252     pArgs->offsets=offsets;
 253 }
 254
 255 static void  U_CALLCONV
 256 _UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
 257                              UErrorCode *pErrorCode) {
         /*
          * Converts the big-endian UTF-16 bytes in [pArgs->source, pArgs->sourceLimit)
          * to UChars in [pArgs->target, pArgs->targetLimit).
          *
          * While the converter's mode is below 8, the call is forwarded unchanged
          * to _UTF16ToUnicodeWithOffsets().
          *
          * State kept in the converter between calls:
          * - toUBytes[]/toULength: 1 to 3 bytes of a code unit or surrogate pair
          *   that is not complete yet
          * - toUnicodeStatus: 0x100|byte for one byte that was read after an
          *   unmatched lead surrogate and still has to be processed
          * - UCharErrorBuffer[0]: a trail surrogate for which the target had no room
          *
          * If pArgs->offsets is not NULL, one entry is written per output UChar:
          * the source byte index at which the character starts, or -1 when the
          * character was completed from bytes stored by an earlier call.
          *
          * Errors: U_BUFFER_OVERFLOW_ERROR when the target is full;
          * U_ILLEGAL_CHAR_FOUND for an unmatched surrogate, in which case
          * toUBytes[0..1] hold its two bytes and toULength is 2.
          */
 258     UConverter *cnv;
 259     const uint8_t *source;
 260     UChar *target;
 261     int32_t *offsets;
 262
 263     uint32_t targetCapacity, length, count, sourceIndex;
 264     UChar c, trail;
 265
 266     if(pArgs->converter->mode<8) {
 267         _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
 268         return;
 269     }
 270
 271     cnv=pArgs->converter;
 272     source=(const uint8_t *)pArgs->source;
 273     length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
         /* length is unsigned (uint32_t), so length<=0 is effectively length==0 */
 274     if(length<=0 && cnv->toUnicodeStatus==0) {
 275         /* no input, nothing to do */
 276         return;
 277     }
 278
 279     target=pArgs->target;
 280     if(target >= pArgs->targetLimit) {
 281         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 282         return;
 283     }
 284
 285     targetCapacity=(uint32_t)(pArgs->targetLimit-target);
 286     offsets=pArgs->offsets;
 287     sourceIndex=0;
 288     c=0;
 289
 290     /* complete a partial UChar or pair from the last call */
 291     if(cnv->toUnicodeStatus!=0) {
 292         /*
 293          * special case: single byte from a previous buffer,
 294          * where the byte turned out not to belong to a trail surrogate
 295          * and the preceding, unmatched lead surrogate was put into toUBytes[]
 296          * for error handling
 297          */
 298         cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
 299         cnv->toULength=1;
 300         cnv->toUnicodeStatus=0;
 301     }
 302     if((count=cnv->toULength)!=0) {
 303         uint8_t *p=cnv->toUBytes;
             /*
              * NOTE(review): the loop body runs before length is tested. With
              * toUnicodeStatus!=0 and no input (which the early-return test above
              * lets through) it would read *source at sourceLimit and wrap the
              * unsigned length - presumably callers never call in that state; confirm.
              */
 304         do {
 305             p[count++]=*source++;
 306             ++sourceIndex;
 307             --length;
 308             if(count==2) {
 309                 c=((UChar)p[0]<<8)|p[1];
 310                 if(U16_IS_SINGLE(c)) {
 311                     /* output the BMP code point */
 312                     *target++=c;
 313                     if(offsets!=NULL) {
 314                         *offsets++=-1;
 315                     }
 316                     --targetCapacity;
 317                     count=0;
 318                     c=0;
 319                     break;
 320                 } else if(U16_IS_SURROGATE_LEAD(c)) {
 321                     /* continue collecting bytes for the trail surrogate */
 322                     c=0; /* avoid unnecessary surrogate handling below */
 323                 } else {
 324                     /* fall through to error handling for an unmatched trail surrogate */
 325                     break;
 326                 }
 327             } else if(count==4) {
 328                 c=((UChar)p[0]<<8)|p[1];
 329                 trail=((UChar)p[2]<<8)|p[3];
 330                 if(U16_IS_TRAIL(trail)) {
 331                     /* output the surrogate pair */
 332                     *target++=c;
 333                     if(targetCapacity>=2) {
 334                         *target++=trail;
 335                         if(offsets!=NULL) {
 336                             *offsets++=-1;
 337                             *offsets++=-1;
 338                         }
 339                         targetCapacity-=2;
 340                     } else /* targetCapacity==1 */ {
                             /* no room for the trail unit: park it in the error buffer */
 341                         targetCapacity=0;
 342                         cnv->UCharErrorBuffer[0]=trail;
 343                         cnv->UCharErrorBufferLength=1;
 344                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 345                     }
 346                     count=0;
 347                     c=0;
 348                     break;
 349                 } else {
 350                     /* unmatched lead surrogate, handle here for consistent toUBytes[] */
 351                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 352
 353                     /* back out reading the code unit after it */
                         /*
                          * NOTE(review): source only advances from pArgs->source in this
                          * function, so pArgs->source-source cannot be positive and this
                          * branch looks unreachable; the else branch then handles every
                          * case. The operands may have been meant the other way round -
                          * confirm before changing.
                          */
 354                     if(((const uint8_t *)pArgs->source-source)>=2) {
 355                         source-=2;
 356                     } else {
 357                         /*
 358                          * if the trail unit's first byte was in a previous buffer, then
 359                          * we need to put it into a special place because toUBytes[] will be
 360                          * used for the lead unit's bytes
 361                          */
                             /* 0x100 keeps the status nonzero even when the saved byte is 0 */
 362                         cnv->toUnicodeStatus=0x100|p[2];
 363                         --source;
 364                     }
 365                     cnv->toULength=2;
 366
 367                     /* write back the updated pointers */
 368                     pArgs->source=(const char *)source;
 369                     pArgs->target=target;
 370                     pArgs->offsets=offsets;
 371                     return;
 372                 }
 373             }
 374         } while(length>0);
 375         cnv->toULength=(int8_t)count;
 376     }
 377
 378     /* copy an even number of bytes for complete UChars */
         /* count is first a number of source bytes, limited by the target capacity in UChars */
 379     count=2*targetCapacity;
 380     if(count>length) {
 381         count=length&~1;
 382     }
 383     if(c==0 && count>0) {
             /*
              * length and targetCapacity are reduced up front for everything the
              * loops below are expected to copy; count becomes a number of UChars.
              * Both are corrected after the loops if a loop stops early.
              */
 384         length-=count;
 385         count>>=1;
 386         targetCapacity-=count;
 387         if(offsets==NULL) {
 388             do {
 389                 c=((UChar)source[0]<<8)|source[1];
 390                 source+=2;
 391                 if(U16_IS_SINGLE(c)) {
 392                     *target++=c;
 393                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
 394                           U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
 395                 ) {
 396                     source+=2;
 397                     --count;
 398                     *target++=c;
 399                     *target++=trail;
 400                 } else {
                         /* c is a surrogate that cannot be written here; handled after the loops */
 401                     break;
 402                 }
 403             } while(--count>0);
 404         } else {
                 /* same loop as above, additionally writing one offset per output UChar */
 405             do {
 406                 c=((UChar)source[0]<<8)|source[1];
 407                 source+=2;
 408                 if(U16_IS_SINGLE(c)) {
 409                     *target++=c;
 410                     *offsets++=sourceIndex;
 411                     sourceIndex+=2;
 412                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
 413                           U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
 414                 ) {
 415                     source+=2;
 416                     --count;
 417                     *target++=c;
 418                     *target++=trail;
                         /* both units of the pair map to the byte index of the lead unit */
 419                     *offsets++=sourceIndex;
 420                     *offsets++=sourceIndex;
 421                     sourceIndex+=4;
 422                 } else {
 423                     break;
 424                 }
 425             } while(--count>0);
 426         }
 427
 428         if(count==0) {
 429             /* done with the loop for complete UChars */
 430             c=0;
 431         } else {
 432             /* keep c for surrogate handling, trail will be set there */
 433             length+=2*(count-1); /* one more byte pair was consumed than count decremented */
 434             targetCapacity+=count;
 435         }
 436     }
 437
 438     if(c!=0) {
 439         /*
 440          * c is a surrogate, and
 441          * - source or target too short
 442          * - or the surrogate is unmatched
 443          */
             /* store the bytes of c; they stay pending unless a full pair is output below */
 444         cnv->toUBytes[0]=(uint8_t)(c>>8);
 445         cnv->toUBytes[1]=(uint8_t)c;
 446         cnv->toULength=2;
 447
 448         if(U16_IS_SURROGATE_LEAD(c)) {
 449             if(length>=2) {
 450                 if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])) {
 451                     /* output the surrogate pair, will overflow (see conditions comment above) */
 452                     source+=2;
 453                     length-=2;
 454                     *target++=c;
 455                     if(offsets!=NULL) {
 456                         *offsets++=sourceIndex;
 457                     }
                         /* the trail unit goes to the error buffer instead of the target */
 458                     cnv->UCharErrorBuffer[0]=trail;
 459                     cnv->UCharErrorBufferLength=1;
 460                     cnv->toULength=0;
 461                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 462                 } else {
 463                     /* unmatched lead surrogate */
 464                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 465                 }
 466             } else {
 467                 /* see if the trail surrogate is in the next buffer */
 468             }
 469         } else {
 470             /* unmatched trail surrogate */
 471             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 472         }
 473     }
 474
 475     if(U_SUCCESS(*pErrorCode)) {
 476         /* check for a remaining source byte */
 477         if(length>0) {
 478             if(targetCapacity==0) {
 479                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 480             } else {
 481                 /* it must be length==1 because otherwise the above would have copied more */
 482                 cnv->toUBytes[cnv->toULength++]=*source++;
 483             }
 484         }
 485     }
 486
 487     /* write back the updated pointers */
 488     pArgs->source=(const char *)source;
 489     pArgs->target=target;
 490     pArgs->offsets=offsets;
 491 }
 492
 493 static UChar32  U_CALLCONV
 494 _UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
 495     /*
 496      * Reads one code point from the big-endian UTF-16 bytes at pArgs->source
 497      * and advances pArgs->source past it. Failures return 0xffff with *err set:
 498      * - U_INDEX_OUTOFBOUNDS_ERROR: no input (pArgs->source is left untouched)
 499      * - U_TRUNCATED_CHAR_FOUND: input ends inside a code unit or surrogate pair;
 500      *   the 1 to 3 leftover bytes are copied to toUBytes[] and consumed
 501      * - U_ILLEGAL_CHAR_FOUND: unmatched surrogate; its 2 bytes are copied to
 502      *   toUBytes[] and consumed
 503      * While the converter's mode is below 8 nothing is read and
 504      * UCNV_GET_NEXT_UCHAR_USE_TO_U is returned instead.
 505      */
 506     UConverter *cnv=pArgs->converter;
 507     const uint8_t *src=(const uint8_t *)pArgs->source;
 508     const uint8_t *limit=(const uint8_t *)pArgs->sourceLimit;
 509     UChar32 unit;
 510     UChar second;
 511
 512     if(cnv->mode<8) {
 513         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
 514     }
 515
 516     if(src>=limit) {
 517         /* empty input */
 518         *err=U_INDEX_OUTOFBOUNDS_ERROR;
 519         return 0xffff;
 520     }
 521
 522     if(src+2>limit) {
 523         /* a lone byte cannot form a UChar: keep it and report truncation */
 524         cnv->toUBytes[0]=src[0];
 525         cnv->toULength=1;
 526         pArgs->source=(const char *)(src+1);
 527         *err=U_TRUNCATED_CHAR_FOUND;
 528         return 0xffff;
 529     }
 530
 531     /* assemble the first code unit, high byte first */
 532     unit=((UChar32)src[0]<<8)|src[1];
 533     src+=2;
 534
 535     if(!U_IS_SURROGATE(unit)) {
 536         /* not a surrogate: the unit is the code point */
 537         pArgs->source=(const char *)src;
 538         return unit;
 539     }
 540
 541     if(U16_IS_SURROGATE_LEAD(unit)) {
 542         if(src+2>limit) {
 543             /* only 2 or 3 bytes of a surrogate pair: keep them all, truncated */
 544             uint8_t *stash=cnv->toUBytes;
 545             src-=2;
 546             cnv->toULength=(int8_t)(limit-src);
 547             while(src<limit) {
 548                 *stash++=*src++;
 549             }
 550             pArgs->source=(const char *)src;
 551             *err=U_TRUNCATED_CHAR_FOUND;
 552             return 0xffff;
 553         }
 554
 555         /* a following trail surrogate completes a supplementary code point */
 556         second=((UChar)src[0]<<8)|src[1];
 557         if(U16_IS_TRAIL(second)) {
 558             pArgs->source=(const char *)(src+2);
 559             return U16_GET_SUPPLEMENTARY(unit, second);
 560         }
 561     }
 562
 563     /* unmatched lead or trail surrogate: record its two bytes */
 564     cnv->toUBytes[0]=src[-2];
 565     cnv->toUBytes[1]=src[-1];
 566     cnv->toULength=2;
 567     pArgs->source=(const char *)src;
 568     *err=U_ILLEGAL_CHAR_FOUND;
 569     return 0xffff;
 570 }
 571
 572 static void  U_CALLCONV
 573 _UTF16BEReset(UConverter *cnv, UConverterResetChoice choice) {
 574     /*
 575      * Restores the initial state for the direction(s) selected by choice;
 576      * what gets restored depends on the converter version (0 or 1).
 577      */
 578     if(choice<=UCNV_RESET_TO_UNICODE) {
 579         /* toUnicode: mode 8 = no BOM handling, mode 0 = "UnicodeBig" BOM check */
 580         cnv->mode=(UCNV_GET_VERSION(cnv)==0) ? 8 : 0;
 581     }
 582     if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {
 583         /* fromUnicode for "UnicodeBig": the output has to start with the BE BOM */
 584         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
 585     }
 586 }
 587
 588 static void  U_CALLCONV
 589 _UTF16BEOpen(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *pErrorCode) {
 590     /* Opens the converter: versions 0 and 1 start from a full reset, others are rejected. */
 591     (void)pArgs; /* not needed here */
 592
 593     if(UCNV_GET_VERSION(cnv)>1) {
 594         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
 595         return;
 596     }
 597     _UTF16BEReset(cnv, UCNV_RESET_BOTH);
 598 }
 599
 600 static const char *  U_CALLCONV
 601 _UTF16BEGetName(const UConverter *cnv) {
 602     /*
 603      * Returns the converter name; any version other than 0 is reported
 604      * with the ",version=1" option suffix.
 605      */
 606     return (UCNV_GET_VERSION(cnv)==0) ? "UTF-16BE" : "UTF-16BE,version=1";
 607 }
 608 U_CDECL_END
 609
 610 static const UConverterImpl _UTF16BEImpl={
         /*
          * Function table of the UTF-16BE converter. The initializers are
          * positional, so their order has to match the members of UConverterImpl,
          * which is declared outside this file (presumably in ucnv_cnv.h - confirm
          * the slot names there). NULL leaves a slot without a function.
          */
 611     UCNV_UTF16_BigEndian,
 612
 613     NULL,
 614     NULL,
 615
 616     _UTF16BEOpen,
 617     NULL,
 618     _UTF16BEReset,
 619
         /*
          * each conversion function is listed twice - presumably once for the
          * plain slot and once for the with-offsets slot of its direction; confirm
          */
 620     _UTF16BEToUnicodeWithOffsets,
 621     _UTF16BEToUnicodeWithOffsets,
 622     _UTF16BEFromUnicodeWithOffsets,
 623     _UTF16BEFromUnicodeWithOffsets,
 624     _UTF16BEGetNextUChar,
 625
 626     NULL,
 627     _UTF16BEGetName,
 628     NULL,
 629     NULL,
 630     ucnv_getNonSurrogateUnicodeSet,
 631
 632     NULL,
 633     NULL
 634 };
 635
 636 static const UConverterStaticData _UTF16BEStaticData={
         /*
          * Static description of the UTF-16BE converter. The initializers are
          * positional; the field meanings noted below are assumptions that need
          * to be checked against UConverterStaticData, which is declared outside
          * this file.
          */
 637     sizeof(UConverterStaticData),
 638     "UTF-16BE",
         /* presumably: codepage number, platform, converter type, min and max bytes per character - confirm */
 639     1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2,
         /* FF FD is U+FFFD in big-endian byte order, followed by the length 2 - presumably the substitution character; confirm */
 640     { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE,
 641     0,
 642     0,
 643     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 644 };
 645
 646
 647 const UConverterSharedData _UTF16BEData=
         /*
          * Shared data object of the UTF-16BE converter (not static, unlike the
          * two tables it points to): combines the static description with the
          * function table.
          */
 648         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16BEStaticData, &_UTF16BEImpl);
 649
 650 /* UTF-16LE ----------------------------------------------------------------- */
 651 U_CDECL_BEGIN
 652 static void  U_CALLCONV
 653 _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
 654                                UErrorCode *pErrorCode) {
         /*
          * Writes the UChars in [pArgs->source, pArgs->sourceLimit) as little-endian
          * UTF-16 bytes (low byte first) to [pArgs->target, pArgs->targetLimit).
          * The control flow mirrors _UTF16BEFromUnicodeWithOffsets(); only the
          * byte order of the output and of the BOM differs.
          *
          * - If cnv->fromUnicodeStatus is UCNV_NEED_TO_WRITE_BOM, the bytes FF FE
          *   are written first through ucnv_fromUWriteBytes() (with -1 passed as
          *   the source index) and the status is cleared.
          * - If pArgs->offsets is not NULL, one entry is written per output byte:
          *   the source index of the code unit (of the lead unit for a pair), or
          *   -1 for a pair whose lead surrogate was left over from an earlier call.
          * - A lead surrogate that is the last input unit is kept in
          *   cnv->fromUChar32 without an error, so that the next call can pair it.
          * - An unmatched surrogate is stored in cnv->fromUChar32 and
          *   *pErrorCode is set to U_ILLEGAL_CHAR_FOUND.
          * - The bytes of a character that does not fit completely are handed to
          *   ucnv_fromUWriteBytes(); U_BUFFER_OVERFLOW_ERROR is set when input
          *   remains and no target capacity is left.
          *
          * pArgs->source, ->target and ->offsets are written back on the normal
          * exit path; the two early returns below happen before any input is
          * consumed and do not write the local pointers back.
          */
 655     UConverter *cnv;
 656     const UChar *source;
 657     char *target;
 658     int32_t *offsets;
 659
 660     uint32_t targetCapacity, length, sourceIndex;
 661     UChar c, trail;
 662     char overflow[4];
 663
 664     source=pArgs->source;
 665     length=(int32_t)(pArgs->sourceLimit-source);
         /* length is unsigned (uint32_t), so the next test is effectively length==0 */
 666     if(length<=0) {
 667         /* no input, nothing to do */
 668         return;
 669     }
 670
 671     cnv=pArgs->converter;
 672
 673     /* write the BOM if necessary */
 674     if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
 675         static const char bom[]={ (char)0xff, (char)0xfe };
             /* this call advances pArgs->target and pArgs->offsets directly */
 676         ucnv_fromUWriteBytes(cnv,
 677                              bom, 2,
 678                              &pArgs->target, pArgs->targetLimit,
 679                              &pArgs->offsets, -1,
 680                              pErrorCode);
 681         cnv->fromUnicodeStatus=0;
 682     }
 683
 684     target=pArgs->target;
 685     if(target >= pArgs->targetLimit) {
 686         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 687         return;
 688     }
 689
 690     targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
 691     offsets=pArgs->offsets;
 692     sourceIndex=0;
 693
 694     /* c!=0 indicates in several places outside the main loops that a surrogate was found */
 695
 696     if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
 697         /* the last buffer ended with a lead surrogate, output the surrogate pair */
 698         ++source;
 699         --length;
 700         target[0]=(uint8_t)c;
 701         target[1]=(uint8_t)(c>>8);
 702         target[2]=(uint8_t)trail;
 703         target[3]=(uint8_t)(trail>>8);
 704         target+=4;
 705         targetCapacity-=4;
 706         if(offsets!=NULL) {
                 /* the pair started in an earlier buffer, so it has no index in this one */
 707             *offsets++=-1;
 708             *offsets++=-1;
 709             *offsets++=-1;
 710             *offsets++=-1;
 711         }
             /* the trail unit at source index 0 has been consumed */
 712         sourceIndex=1;
 713         cnv->fromUChar32=c=0;
 714     }
 715
 716     if(c==0) {
 717         /* copy an even number of bytes for complete UChars */
 718         uint32_t count=2*length;
 719         if(count>targetCapacity) {
 720             count=targetCapacity&~1;
 721         }
 722         /* count is even */
             /*
              * targetCapacity and length are reduced up front for everything the
              * loops below are expected to copy; count becomes a number of UChars.
              * If a loop stops early, targetCapacity is corrected after the loops.
              */
 723         targetCapacity-=count;
 724         count>>=1;
 725         length-=count;
 726
 727         if(offsets==NULL) {
 728             while(count>0) {
 729                 c=*source++;
 730                 if(U16_IS_SINGLE(c)) {
 731                     target[0]=(uint8_t)c;
 732                     target[1]=(uint8_t)(c>>8);
 733                     target+=2;
                     /* count>=2: the trail unit must also be within the units budgeted above */
 734                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
 735                     ++source;
 736                     --count;
 737                     target[0]=(uint8_t)c;
 738                     target[1]=(uint8_t)(c>>8);
 739                     target[2]=(uint8_t)trail;
 740                     target[3]=(uint8_t)(trail>>8);
 741                     target+=4;
 742                 } else {
                         /* c is a surrogate that cannot be written here; handled after the loops */
 743                     break;
 744                 }
 745                 --count;
 746             }
 747         } else {
                 /* same loop as above, additionally writing one offset per output byte */
 748             while(count>0) {
 749                 c=*source++;
 750                 if(U16_IS_SINGLE(c)) {
 751                     target[0]=(uint8_t)c;
 752                     target[1]=(uint8_t)(c>>8);
 753                     target+=2;
 754                     *offsets++=sourceIndex;
 755                     *offsets++=sourceIndex++;
 756                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
 757                     ++source;
 758                     --count;
 759                     target[0]=(uint8_t)c;
 760                     target[1]=(uint8_t)(c>>8);
 761                     target[2]=(uint8_t)trail;
 762                     target[3]=(uint8_t)(trail>>8);
 763                     target+=4;
                         /* all four bytes of the pair map to the index of the lead unit */
 764                     *offsets++=sourceIndex;
 765                     *offsets++=sourceIndex;
 766                     *offsets++=sourceIndex;
 767                     *offsets++=sourceIndex;
 768                     sourceIndex+=2;
 769                 } else {
 770                     break;
 771                 }
 772                 --count;
 773             }
 774         }
 775
 776         if(count==0) {
 777             /* done with the loop for complete UChars */
 778             if(length>0 && targetCapacity>0) {
 779                 /*
 780                  * there is more input and some target capacity -
 781                  * it must be targetCapacity==1 because otherwise
 782                  * the above would have copied more;
 783                  * prepare for overflow output
 784                  */
 785                 if(U16_IS_SINGLE(c=*source++)) {
 786                     overflow[0]=(char)c;
 787                     overflow[1]=(char)(c>>8);
 788                     length=2; /* 2 bytes to output */
 789                     c=0;
 790                 /* } else { keep c for surrogate handling, length will be set there */
 791                 }
 792             } else {
 793                 length=0;
 794                 c=0;
 795             }
 796         } else {
 797             /* keep c for surrogate handling, length will be set there */
                 /* give back the capacity that was reserved for the units not written (including c) */
 798             targetCapacity+=2*count;
 799         }
 800     } else {
 801         length=0; /* from here on, length counts the bytes in overflow[] */
 802     }
 803
 804     if(c!=0) {
 805         /*
 806          * c is a surrogate, and
 807          * - source or target too short
 808          * - or the surrogate is unmatched
 809          */
 810         length=0;
 811         if(U16_IS_SURROGATE_LEAD(c)) {
 812             if(source<pArgs->sourceLimit) {
 813                 if(U16_IS_TRAIL(trail=*source)) {
 814                     /* output the surrogate pair, will overflow (see conditions comment above) */
 815                     ++source;
 816                     overflow[0]=(char)c;
 817                     overflow[1]=(char)(c>>8);
 818                     overflow[2]=(char)trail;
 819                     overflow[3]=(char)(trail>>8);
 820                     length=4; /* 4 bytes to output */
 821                     c=0;
 822                 } else {
 823                     /* unmatched lead surrogate */
 824                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 825                 }
 826             } else {
 827                 /* see if the trail surrogate is in the next buffer */
 828             }
 829         } else {
 830             /* unmatched trail surrogate */
 831             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 832         }
             /* c is 0 after a complete pair, otherwise the pending or unmatched surrogate */
 833         cnv->fromUChar32=c;
 834     }
 835
 836     if(length>0) {
 837         /* output length bytes with overflow (length>targetCapacity>0) */
 838         ucnv_fromUWriteBytes(cnv,
 839                              overflow, length,
 840                              &target, pArgs->targetLimit,
 841                              &offsets, sourceIndex,
 842                              pErrorCode);
 843         targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
 844     }
 845
         /* report an overflow only if no other error is set and unread input remains */
 846     if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
 847         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 848     }
 849
 850     /* write back the updated pointers */
 851     pArgs->source=source;
 852     pArgs->target=target;
 853     pArgs->offsets=offsets;
 854 }
 855
 856 static void  U_CALLCONV
 857 _UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
 858                              UErrorCode *pErrorCode) {
 859     UConverter *cnv;
 860     const uint8_t *source;
 861     UChar *target;
 862     int32_t *offsets;
 863
 864     uint32_t targetCapacity, length, count, sourceIndex;
 865     UChar c, trail;
 866
 867     if(pArgs->converter->mode<8) {
 868         _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
 869         return;
 870     }
 871
 872     cnv=pArgs->converter;
 873     source=(const uint8_t *)pArgs->source;
 874     length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
 875     if(length<=0 && cnv->toUnicodeStatus==0) {
 876         /* no input, nothing to do */
 877         return;
 878     }
 879
 880     target=pArgs->target;
 881     if(target >= pArgs->targetLimit) {
 882         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 883         return;
 884     }
 885
 886     targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
 887     offsets=pArgs->offsets;
 888     sourceIndex=0;
 889     c=0;
 890
 891     /* complete a partial UChar or pair from the last call */
 892     if(cnv->toUnicodeStatus!=0) {
 893         /*
 894          * special case: single byte from a previous buffer,
 895          * where the byte turned out not to belong to a trail surrogate
 896          * and the preceding, unmatched lead surrogate was put into toUBytes[]
 897          * for error handling
 898          */
 899         cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
 900         cnv->toULength=1;
 901         cnv->toUnicodeStatus=0;
 902     }
 903     if((count=cnv->toULength)!=0) {
 904         uint8_t *p=cnv->toUBytes;
 905         do {
 906             p[count++]=*source++;
 907             ++sourceIndex;
 908             --length;
 909             if(count==2) {
 910                 c=((UChar)p[1]<<8)|p[0];
 911                 if(U16_IS_SINGLE(c)) {
 912                     /* output the BMP code point */
 913                     *target++=c;
 914                     if(offsets!=NULL) {
 915                         *offsets++=-1;
 916                     }
 917                     --targetCapacity;
 918                     count=0;
 919                     c=0;
 920                     break;
 921                 } else if(U16_IS_SURROGATE_LEAD(c)) {
 922                     /* continue collecting bytes for the trail surrogate */
 923                     c=0; /* avoid unnecessary surrogate handling below */
 924                 } else {
 925                     /* fall through to error handling for an unmatched trail surrogate */
 926                     break;
 927                 }
 928             } else if(count==4) {
 929                 c=((UChar)p[1]<<8)|p[0];
 930                 trail=((UChar)p[3]<<8)|p[2];
 931                 if(U16_IS_TRAIL(trail)) {
 932                     /* output the surrogate pair */
 933                     *target++=c;
 934                     if(targetCapacity>=2) {
 935                         *target++=trail;
 936                         if(offsets!=NULL) {
 937                             *offsets++=-1;
 938                             *offsets++=-1;
 939                         }
 940                         targetCapacity-=2;
 941                     } else /* targetCapacity==1 */ {
 942                         targetCapacity=0;
 943                         cnv->UCharErrorBuffer[0]=trail;
 944                         cnv->UCharErrorBufferLength=1;
 945                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 946                     }
 947                     count=0;
 948                     c=0;
 949                     break;
 950                 } else {
 951                     /* unmatched lead surrogate, handle here for consistent toUBytes[] */
 952                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 953
 954                     /* back out reading the code unit after it */
 955                     if(((const uint8_t *)pArgs->source-source)>=2) {
 956                         source-=2;
 957                     } else {
 958                         /*
 959                          * if the trail unit's first byte was in a previous buffer, then
 960                          * we need to put it into a special place because toUBytes[] will be
 961                          * used for the lead unit's bytes
 962                          */
 963                         cnv->toUnicodeStatus=0x100|p[2];
 964                         --source;
 965                     }
 966                     cnv->toULength=2;
 967
 968                     /* write back the updated pointers */
 969                     pArgs->source=(const char *)source;
 970                     pArgs->target=target;
 971                     pArgs->offsets=offsets;
 972                     return;
 973                 }
 974             }
 975         } while(length>0);
 976         cnv->toULength=(int8_t)count;
 977     }
 978
 979     /* copy an even number of bytes for complete UChars */
 980     count=2*targetCapacity;
 981     if(count>length) {
 982         count=length&~1;
 983     }
 984     if(c==0 && count>0) {
 985         length-=count;
 986         count>>=1;
 987         targetCapacity-=count;
 988         if(offsets==NULL) {
 989             do {
 990                 c=((UChar)source[1]<<8)|source[0];
 991                 source+=2;
 992                 if(U16_IS_SINGLE(c)) {
 993                     *target++=c;
 994                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
 995                           U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
 996                 ) {
 997                     source+=2;
 998                     --count;
 999                     *target++=c;
1000                     *target++=trail;
1001                 } else {
1002                     break;
1003                 }
1004             } while(--count>0);
1005         } else {
1006             do {
1007                 c=((UChar)source[1]<<8)|source[0];
1008                 source+=2;
1009                 if(U16_IS_SINGLE(c)) {
1010                     *target++=c;
1011                     *offsets++=sourceIndex;
1012                     sourceIndex+=2;
1013                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
1014                           U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
1015                 ) {
1016                     source+=2;
1017                     --count;
1018                     *target++=c;
1019                     *target++=trail;
1020                     *offsets++=sourceIndex;
1021                     *offsets++=sourceIndex;
1022                     sourceIndex+=4;
1023                 } else {
1024                     break;
1025                 }
1026             } while(--count>0);
1027         }
1028
1029         if(count==0) {
1030             /* done with the loop for complete UChars */
1031             c=0;
1032         } else {
1033             /* keep c for surrogate handling, trail will be set there */
1034             length+=2*(count-1); /* one more byte pair was consumed than count decremented */
1035             targetCapacity+=count;
1036         }
1037     }
1038
1039     if(c!=0) {
1040         /*
1041          * c is a surrogate, and
1042          * - source or target too short
1043          * - or the surrogate is unmatched
1044          */
1045         cnv->toUBytes[0]=(uint8_t)c;
1046         cnv->toUBytes[1]=(uint8_t)(c>>8);
1047         cnv->toULength=2;
1048
1049         if(U16_IS_SURROGATE_LEAD(c)) {
1050             if(length>=2) {
1051                 if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])) {
1052                     /* output the surrogate pair, will overflow (see conditions comment above) */
1053                     source+=2;
1054                     length-=2;
1055                     *target++=c;
1056                     if(offsets!=NULL) {
1057                         *offsets++=sourceIndex;
1058                     }
1059                     cnv->UCharErrorBuffer[0]=trail;
1060                     cnv->UCharErrorBufferLength=1;
1061                     cnv->toULength=0;
1062                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1063                 } else {
1064                     /* unmatched lead surrogate */
1065                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1066                 }
1067             } else {
1068                 /* see if the trail surrogate is in the next buffer */
1069             }
1070         } else {
1071             /* unmatched trail surrogate */
1072             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1073         }
1074     }
1075
1076     if(U_SUCCESS(*pErrorCode)) {
1077         /* check for a remaining source byte */
1078         if(length>0) {
1079             if(targetCapacity==0) {
1080                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1081             } else {
1082                 /* it must be length==1 because otherwise the above would have copied more */
1083                 cnv->toUBytes[cnv->toULength++]=*source++;
1084             }
1085         }
1086     }
1087
1088     /* write back the updated pointers */
1089     pArgs->source=(const char *)source;
1090     pArgs->target=target;
1091     pArgs->offsets=offsets;
1092 }
1093
1094 static UChar32  U_CALLCONV
1095 _UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
1096     const uint8_t *s, *sourceLimit;
1097     UChar32 c;
1098
1099     if(pArgs->converter->mode<8) {
1100         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1101     }
1102
1103     s=(const uint8_t *)pArgs->source;
1104     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1105
1106     if(s>=sourceLimit) {
1107         /* no input */
1108         *err=U_INDEX_OUTOFBOUNDS_ERROR;
1109         return 0xffff;
1110     }
1111
1112     if(s+2>sourceLimit) {
1113         /* only one byte: truncated UChar */
1114         pArgs->converter->toUBytes[0]=*s++;
1115         pArgs->converter->toULength=1;
1116         pArgs->source=(const char *)s;
1117         *err = U_TRUNCATED_CHAR_FOUND;
1118         return 0xffff;
1119     }
1120
1121     /* get one UChar */
1122     c=((UChar32)s[1]<<8)|*s;
1123     s+=2;
1124
1125     /* check for a surrogate pair */
1126     if(U_IS_SURROGATE(c)) {
1127         if(U16_IS_SURROGATE_LEAD(c)) {
1128             if(s+2<=sourceLimit) {
1129                 UChar trail;
1130
1131                 /* get a second UChar and see if it is a trail surrogate */
1132                 trail=((UChar)s[1]<<8)|*s;
1133                 if(U16_IS_TRAIL(trail)) {
1134                     c=U16_GET_SUPPLEMENTARY(c, trail);
1135                     s+=2;
1136                 } else {
1137                     /* unmatched lead surrogate */
1138                     c=-2;
1139                 }
1140             } else {
1141                 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
1142                 uint8_t *bytes=pArgs->converter->toUBytes;
1143                 s-=2;
1144                 pArgs->converter->toULength=(int8_t)(sourceLimit-s);
1145                 do {
1146                     *bytes++=*s++;
1147                 } while(s<sourceLimit);
1148
1149                 c=0xffff;
1150                 *err=U_TRUNCATED_CHAR_FOUND;
1151             }
1152         } else {
1153             /* unmatched trail surrogate */
1154             c=-2;
1155         }
1156
1157         if(c<0) {
1158             /* write the unmatched surrogate */
1159             uint8_t *bytes=pArgs->converter->toUBytes;
1160             pArgs->converter->toULength=2;
1161             *bytes=*(s-2);
1162             bytes[1]=*(s-1);
1163
1164             c=0xffff;
1165             *err=U_ILLEGAL_CHAR_FOUND;
1166         }
1167     }
1168
1169     pArgs->source=(const char *)s;
1170     return c;
1171 }
1172
1173 static void  U_CALLCONV
1174 _UTF16LEReset(UConverter *cnv, UConverterResetChoice choice) {
1175     if(choice<=UCNV_RESET_TO_UNICODE) {
1176         /* reset toUnicode state */
1177         if(UCNV_GET_VERSION(cnv)==0) {
1178             cnv->mode=8; /* no BOM handling */
1179         } else {
1180             cnv->mode=0; /* Java-specific "UnicodeLittle" requires LE BOM or no BOM */
1181         }
1182     }
1183     if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {
1184         /* reset fromUnicode for "UnicodeLittle": prepare to output the UTF-16LE BOM */
1185         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1186     }
1187 }
1188
1189 static void  U_CALLCONV
1190 _UTF16LEOpen(UConverter *cnv,
1191              UConverterLoadArgs *pArgs,
1192              UErrorCode *pErrorCode) {
1193     (void)pArgs;
1194     if(UCNV_GET_VERSION(cnv)<=1) {
1195         _UTF16LEReset(cnv, UCNV_RESET_BOTH);
1196     } else {
1197         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1198     }
1199 }
1200
1201 static const char *  U_CALLCONV
1202 _UTF16LEGetName(const UConverter *cnv) {
1203     if(UCNV_GET_VERSION(cnv)==0) {
1204         return "UTF-16LE";
1205     } else {
1206         return "UTF-16LE,version=1";
1207     }
1208 }
1209 U_CDECL_END
1210
1211 static const UConverterImpl _UTF16LEImpl={
1212     UCNV_UTF16_LittleEndian,
1213
1214     NULL,
1215     NULL,
1216
1217     _UTF16LEOpen,
1218     NULL,
1219     _UTF16LEReset,
1220
1221     _UTF16LEToUnicodeWithOffsets,
1222     _UTF16LEToUnicodeWithOffsets,
1223     _UTF16LEFromUnicodeWithOffsets,
1224     _UTF16LEFromUnicodeWithOffsets,
1225     _UTF16LEGetNextUChar,
1226
1227     NULL,
1228     _UTF16LEGetName,
1229     NULL,
1230     NULL,
1231     ucnv_getNonSurrogateUnicodeSet,
1232
1233     NULL,
1234     NULL
1235 };
1236
1237
1238 static const UConverterStaticData _UTF16LEStaticData={
1239     sizeof(UConverterStaticData),
1240     "UTF-16LE",
1241     1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2,
1242     { 0xfd, 0xff, 0, 0 },2,FALSE,FALSE,
1243     0,
1244     0,
1245     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1246 };
1247
1248
1249 const UConverterSharedData _UTF16LEData=
1250         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16LEStaticData, &_UTF16LEImpl);
1251
1252 /* UTF-16 (Detect BOM) ------------------------------------------------------ */
1253
1254 /*
1255  * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE
1256  * accordingly.
1257  * This is a simpler version of the UTF-32 converter, with
1258  * fewer states for shorter BOMs.
1259  *
1260  * State values:
1261  * 0    initial state
1262  * 1    saw first byte
1263  * 2..5 -
1264  * 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1
1265  * 8    UTF-16BE mode
1266  * 9    UTF-16LE mode
1267  *
1268  * During detection: state==number of initial bytes seen so far.
1269  *
1270  * On output, emit U+FEFF as the first code point.
1271  *
1272  * Variants:
1273  * - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error.
1274  * - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and
1275  *   UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error.
1276  */
1277 U_CDECL_BEGIN
1278 static void  U_CALLCONV
1279 _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {
1280     if(choice<=UCNV_RESET_TO_UNICODE) {
1281         /* reset toUnicode: state=0 */
1282         cnv->mode=0;
1283     }
1284     if(choice!=UCNV_RESET_TO_UNICODE) {
1285         /* reset fromUnicode: prepare to output the UTF-16PE BOM */
1286         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1287     }
1288 }
1289 U_CDECL_END
1290 extern const UConverterSharedData _UTF16v2Data;
1291 U_CDECL_BEGIN
1292 static void U_CALLCONV
1293 _UTF16Open(UConverter *cnv,
1294            UConverterLoadArgs *pArgs,
1295            UErrorCode *pErrorCode) {
1296     if(UCNV_GET_VERSION(cnv)<=2) {
1297         if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) {
1298             /*
1299              * Switch implementation, and switch the staticData that's different
1300              * and was copied into the UConverter.
1301              * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.)
1302              * UTF-16,version=2 fromUnicode() always writes a big-endian byte stream.
1303              */
1304             cnv->sharedData=(UConverterSharedData*)&_UTF16v2Data;
1305             uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MAX_SUBCHAR_LEN);
1306         }
1307         _UTF16Reset(cnv, UCNV_RESET_BOTH);
1308     } else {
1309         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1310     }
1311 }
1312
1313 static const char *  U_CALLCONV
1314 _UTF16GetName(const UConverter *cnv) {
1315     if(UCNV_GET_VERSION(cnv)==0) {
1316         return "UTF-16";
1317     } else if(UCNV_GET_VERSION(cnv)==1) {
1318         return "UTF-16,version=1";
1319     } else {
1320         return "UTF-16,version=2";
1321     }
1322 }
1323 U_CDECL_END
1324 extern const UConverterSharedData _UTF16Data;
1325
1326 static inline bool IS_UTF16BE(const UConverter *cnv) {
1327     return ((cnv)->sharedData == &_UTF16BEData);
1328 }
1329
1330 static inline bool IS_UTF16LE(const UConverter *cnv) {
1331     return ((cnv)->sharedData == &_UTF16LEData);
1332 }
1333
1334 static inline bool IS_UTF16(const UConverter *cnv) {
1335     return ((cnv)->sharedData==&_UTF16Data) || ((cnv)->sharedData == &_UTF16v2Data);
1336 }
1337
1338 U_CDECL_BEGIN
1339 static void U_CALLCONV
1340 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1341                            UErrorCode *pErrorCode) {
1342     UConverter *cnv=pArgs->converter;
1343     const char *source=pArgs->source;
1344     const char *sourceLimit=pArgs->sourceLimit;
1345     int32_t *offsets=pArgs->offsets;
1346
1347     int32_t state, offsetDelta;
1348     uint8_t b;
1349
1350     state=cnv->mode;
1351
1352     /*
1353      * If we detect a BOM in this buffer, then we must add the BOM size to the
1354      * offsets because the actual converter function will not see and count the BOM.
1355      * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1356      */
1357     offsetDelta=0;
1358
1359     while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
1360         switch(state) {
1361         case 0:
1362             cnv->toUBytes[0]=(uint8_t)*source++;
1363             cnv->toULength=1;
1364             state=1;
1365             break;
1366         case 1:
1367             /*
1368              * Only inside this switch case can the state variable
1369              * temporarily take two additional values:
1370              * 6: BOM error, continue with BE
1371              * 7: BOM error, continue with LE
1372              */
1373             b=*source;
1374             if(cnv->toUBytes[0]==0xfe && b==0xff) {
1375                 if(IS_UTF16LE(cnv)) {
1376                     state=7; /* illegal reverse BOM for Java "UnicodeLittle" */
1377                 } else {
1378                     state=8; /* detect UTF-16BE */
1379                 }
1380             } else if(cnv->toUBytes[0]==0xff && b==0xfe) {
1381                 if(IS_UTF16BE(cnv)) {
1382                     state=6; /* illegal reverse BOM for Java "UnicodeBig" */
1383                 } else {
1384                     state=9; /* detect UTF-16LE */
1385                 }
1386             } else if((IS_UTF16(cnv) && UCNV_GET_VERSION(cnv)==1)) {
1387                 state=6; /* illegal missing BOM for Java "Unicode" */
1388             }
1389             if(state>=8) {
1390                 /* BOM detected, consume it */
1391                 ++source;
1392                 cnv->toULength=0;
1393                 offsetDelta=(int32_t)(source-pArgs->source);
1394             } else if(state<6) {
1395                 /* ok: no BOM, and not a reverse BOM */
1396                 if(source!=pArgs->source) {
1397                     /* reset the source for a correct first offset */
1398                     source=pArgs->source;
1399                     cnv->toULength=0;
1400                 }
1401                 if(IS_UTF16LE(cnv)) {
1402                     /* Make Java "UnicodeLittle" default to LE. */
1403                     state=9;
1404                 } else {
1405                     /* Make standard UTF-16 and Java "UnicodeBig" default to BE. */
1406                     state=8;
1407                 }
1408             } else {
1409                 /*
1410                  * error: missing BOM, or reverse BOM
1411                  * UTF-16,version=1: Java-specific "Unicode" requires a BOM.
1412                  * UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM.
1413                  * UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM.
1414                  */
1415                 /* report the non-BOM or reverse BOM as an illegal sequence */
1416                 cnv->toUBytes[1]=b;
1417                 cnv->toULength=2;
1418                 pArgs->source=source+1;
1419                 /* continue with conversion if the callback resets the error */
1420                 /*
1421                  * Make Java "Unicode" default to BE like standard UTF-16.
1422                  * Make Java "UnicodeBig" and "UnicodeLittle" default
1423                  * to their normal endiannesses.
1424                  */
1425                 cnv->mode=state+2;
1426                 *pErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE;
1427                 return;
1428             }
1429             /* convert the rest of the stream */
1430             cnv->mode=state;
1431             continue;
1432         case 8:
1433             /* call UTF-16BE */
1434             pArgs->source=source;
1435             _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
1436             source=pArgs->source;
1437             break;
1438         case 9:
1439             /* call UTF-16LE */
1440             pArgs->source=source;
1441             _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
1442             source=pArgs->source;
1443             break;
1444         default:
1445             break; /* does not occur */
1446         }
1447     }
1448
1449     /* add BOM size to offsets - see comment at offsetDelta declaration */
1450     if(offsets!=NULL && offsetDelta!=0) {
1451         int32_t *offsetsLimit=pArgs->offsets;
1452         while(offsets<offsetsLimit) {
1453             *offsets++ += offsetDelta;
1454         }
1455     }
1456
1457     pArgs->source=source;
1458
1459     if(source==sourceLimit && pArgs->flush) {
1460         /* handle truncated input */
1461         switch(state) {
1462         case 0:
1463             break; /* no input at all, nothing to do */
1464         case 8:
1465             _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
1466             break;
1467         case 9:
1468             _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
1469             break;
1470         default:
1471             /* 0<state<8: framework will report truncation, nothing to do here */
1472             break;
1473         }
1474     }
1475
1476     cnv->mode=state;
1477 }
1478
1479 static UChar32 U_CALLCONV
1480 _UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs,
1481                    UErrorCode *pErrorCode) {
1482     switch(pArgs->converter->mode) {
1483     case 8:
1484         return _UTF16BEGetNextUChar(pArgs, pErrorCode);
1485     case 9:
1486         return _UTF16LEGetNextUChar(pArgs, pErrorCode);
1487     default:
1488         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1489     }
1490 }
1491 U_CDECL_END
1492
1493 static const UConverterImpl _UTF16Impl = {
1494     UCNV_UTF16,
1495
1496     NULL,
1497     NULL,
1498
1499     _UTF16Open,
1500     NULL,
1501     _UTF16Reset,
1502
1503     _UTF16ToUnicodeWithOffsets,
1504     _UTF16ToUnicodeWithOffsets,
1505     _UTF16PEFromUnicodeWithOffsets,
1506     _UTF16PEFromUnicodeWithOffsets,
1507     _UTF16GetNextUChar,
1508
1509     NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
1510     _UTF16GetName,
1511     NULL,
1512     NULL,
1513     ucnv_getNonSurrogateUnicodeSet,
1514
1515     NULL,
1516     NULL
1517 };
1518
1519 static const UConverterStaticData _UTF16StaticData = {
1520     sizeof(UConverterStaticData),
1521     "UTF-16",
1522     1204, /* CCSID for BOM sensitive UTF-16 */
1523     UCNV_IBM, UCNV_UTF16, 2, 2,
1524 #if U_IS_BIG_ENDIAN
1525     { 0xff, 0xfd, 0, 0 }, 2,
1526 #else
1527     { 0xfd, 0xff, 0, 0 }, 2,
1528 #endif
1529     FALSE, FALSE,
1530     0,
1531     0,
1532     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1533 };
1534
1535 const UConverterSharedData _UTF16Data =
1536         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16StaticData, &_UTF16Impl);
1537
1538 static const UConverterImpl _UTF16v2Impl = {
1539     UCNV_UTF16,
1540
1541     NULL,
1542     NULL,
1543
1544     _UTF16Open,
1545     NULL,
1546     _UTF16Reset,
1547
1548     _UTF16ToUnicodeWithOffsets,
1549     _UTF16ToUnicodeWithOffsets,
1550     _UTF16BEFromUnicodeWithOffsets,
1551     _UTF16BEFromUnicodeWithOffsets,
1552     _UTF16GetNextUChar,
1553
1554     NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
1555     _UTF16GetName,
1556     NULL,
1557     NULL,
1558     ucnv_getNonSurrogateUnicodeSet,
1559
1560     NULL,
1561     NULL
1562 };
1563
1564 static const UConverterStaticData _UTF16v2StaticData = {
1565     sizeof(UConverterStaticData),
1566     "UTF-16,version=2",
1567     1204, /* CCSID for BOM sensitive UTF-16 */
1568     UCNV_IBM, UCNV_UTF16, 2, 2,
1569     { 0xff, 0xfd, 0, 0 }, 2,
1570     FALSE, FALSE,
1571     0,
1572     0,
1573     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1574 };
1575
1576 const UConverterSharedData _UTF16v2Data =
1577         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16v2StaticData, &_UTF16v2Impl);
1578
1579 #endif