icuSources/common/ucnvbocu.c

   1 /*
   2 ******************************************************************************
   3 *
   4 *   Copyright (C) 2002-2004, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 ******************************************************************************
   8 *   file name:  ucnvbocu.c
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2002mar27
  14 *   created by: Markus W. Scherer
  15 *
  16 *   This is an implementation of the Binary Ordered Compression for Unicode,
  17 *   in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
  18 */
  19
  20 #include "unicode/utypes.h"
  21
  22 #if !UCONFIG_NO_CONVERSION
  23
  24 #include "unicode/ucnv.h"
  25 #include "unicode/ucnv_cb.h"
  26 #include "ucnv_bld.h"
  27 #include "ucnv_cnv.h"
  28
  29 /* BOCU-1 constants and macros ---------------------------------------------- */
  30
  31 /*
  32  * BOCU-1 encodes the code points of a Unicode string as
  33  * a sequence of byte-encoded differences (slope detection),
  34  * preserving lexical order.
  35  *
  36  * Optimize the difference-taking for runs of Unicode text within
  37  * small scripts:
  38  *
  39  * Most small scripts are allocated within aligned 128-blocks of Unicode
  40  * code points. Lexical order is preserved if the "previous code point" state
  41  * is always moved into the middle of such a block.
  42  *
  43  * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
  44  * areas into the middle of those areas.
  45  *
  46  * C0 control codes and space are encoded with their US-ASCII bytes.
  47  * "prev" is reset for C0 controls but not for space.
  48  */
  49
  50 /* initial value for "prev": middle of the ASCII range */
  51 #define BOCU1_ASCII_PREV        0x40
  52
  53 /*
      * bounding byte values for differences
      * BOCU1_MIN is the smallest lead byte, BOCU1_MIDDLE encodes diff==0,
      * and BOCU1_RESET has the same value as BOCU1_MAX_TRAIL (0xff).
      */
  54 #define BOCU1_MIN               0x21
  55 #define BOCU1_MIDDLE            0x90
  56 #define BOCU1_MAX_LEAD          0xfe
  57 #define BOCU1_MAX_TRAIL         0xff
  58 #define BOCU1_RESET             0xff
  59
  60 /* number of lead bytes: 0xfe-0x21+1 = 222 */
  61 #define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)
  62
  63 /*
      * adjust trail byte counts for the use of some C0 control byte values
      * BOCU1_TRAIL_BYTE_OFFSET evaluates to 0x21-20 = 13: trail values >=20
      * are turned into external bytes by adding this offset
      * (see BOCU1_TRAIL_TO_BYTE).
      */
  64 #define BOCU1_TRAIL_CONTROLS_COUNT  20
  65 #define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)
  66
  67 /* number of trail bytes: (0xff-0x21+1)+20 = 243 */
  68 #define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)
  69
  70 /*
  71  * number of positive and negative single-byte codes
  72  * (counting 0==BOCU1_MIDDLE among the positive ones)
  73  */
  74 #define BOCU1_SINGLE            64
  75
  76 /* number of lead bytes for positive and negative 2/3/4-byte sequences */
  77 #define BOCU1_LEAD_2            43
  78 #define BOCU1_LEAD_3            3
  79 #define BOCU1_LEAD_4            1
  80
  81 /* The difference value range for single-byters: -64..63 */
  82 #define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
  83 #define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)
  84
  85 /* The difference value range for double-byters: -10513..10512 (43*243 values beyond the single-byte reach on each side) */
  86 #define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
  87 #define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
  88
  89 /* The difference value range for 3-byters: -187660..187659 (3*243*243 values beyond the double-byte reach on each side) */
  90 #define BOCU1_REACH_POS_3   \
  91     (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
  92
  93 #define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
  94
  95 /*
      * The lead byte start values.
      * Positive side: 2-byte leads start at 0xd0, 3-byte leads at 0xfb,
      * and the single 4-byte lead is 0xfe.
      */
  96 #define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
  97 #define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
  98 #define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
  99      /* ==BOCU1_MAX_LEAD */
 100
     /*
      * Negative side: these are the lowest lead bytes that do NOT belong to the
      * next-longer negative form: single bytes start at 0x50, 2-byte leads at
      * 0x25, 3-byte leads at 0x22; the 4-byte negative lead is BOCU1_MIN (0x21).
      */
 101 #define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
 102 #define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
 103 #define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
 104      /* ==BOCU1_MIN+1 */
 105
 106 /*
      * The length of a byte sequence, according to the lead byte (!=BOCU1_RESET).
      * The three range tests nest: 0x50..0xcf -> 1, else 0x25..0xfa -> 2,
      * else 0x22..0xfd -> 3, anything else (0x21 or 0xfe) -> 4.
      * The argument is evaluated several times.
      */
 107 #define BOCU1_LENGTH_FROM_LEAD(lead) \
 108     ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
 109      (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
 110      (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)
 111
 112 /*
      * The length of a byte sequence, according to its packed form.
      * Packed 1..3-byte sequences carry their length in the top byte (0x01..0x03);
      * a packed 4-byte sequence carries its lead byte there instead (see packDiff()).
      * The comparison is done on the uint32_t value so that a top byte with
      * the sign bit set (lead byte 0xfe) still selects the 4-byte case.
      */
 113 #define BOCU1_LENGTH_FROM_PACKED(packed) \
 114     ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
 115
 116 /*
 117  * 12 commonly used C0 control codes (and space) are only used to encode
 118  * themselves directly,
 119  * which makes BOCU-1 MIME-usable and reasonably safe for
 120  * ASCII-oriented software.
 121  *
 122  * These controls are
 123  *  0   NUL
 124  *
 125  *  7   BEL
 126  *  8   BS
 127  *
 128  *  9   TAB
 129  *  a   LF
 130  *  b   VT
 131  *  c   FF
 132  *  d   CR
 133  *
 134  *  e   SO
 135  *  f   SI
 136  *
 137  * 1a   SUB
 138  * 1b   ESC
 139  *
 140  * The other 20 C0 controls are also encoded directly (to preserve order)
 141  * but are also used as trail bytes in difference encoding
 142  * (for better compression).
      *
      * BOCU1_TRAIL_TO_BYTE(t) turns a trail value t (0..BOCU1_TRAIL_COUNT-1, as
      * used in the difference calculation) into the external byte value:
      * values 0..19 are looked up in bocu1TrailToByte[] (the 20 control bytes),
      * values >=20 are shifted by BOCU1_TRAIL_BYTE_OFFSET into 0x21..0xff.
      * The argument t is evaluated more than once and is used as an array
      * index without a lower-bound check, so it must be a side-effect-free
      * expression with t>=0.
 143  */
 144 #define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
 145
 146 /*
 147  * Byte value map for control codes,
 148  * from external byte values 0x00..0x20
 149  * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
 150  * External byte values that are illegal as trail bytes are mapped to -1.
      *
      * The table has BOCU1_MIN (0x21) entries, one per byte value below BOCU1_MIN.
      * The 13 entries holding -1 are the 12 directly-encoded controls
      * (00, 07..0f, 1a, 1b) and space (20).
      * This table is not referenced by the encoder functions in this part of
      * the file; presumably the decoder uses it -- confirm there.
 151  */
 152 static const int8_t
 153 bocu1ByteToTrail[BOCU1_MIN]={
 154 /*  0     1     2     3     4     5     6     7    */
 155     -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
 156
 157 /*  8     9     a     b     c     d     e     f    */
 158     -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
 159
 160 /*  10    11    12    13    14    15    16    17   */
 161     0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
 162
 163 /*  18    19    1a    1b    1c    1d    1e    1f   */
 164     0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,
 165
 166 /*  20   */
 167     -1
 168 };
 169
 170 /*
 171  * Byte value map for control codes,
 172  * from trail byte values 0..19 (0..0x13) as used in the difference calculation
 173  * to external byte values 0x00..0x20.
      *
      * This is the inverse of the non-negative entries of bocu1ByteToTrail[]:
      * the 20 results are the control bytes 01..06, 10..19 and 1c..1f.
      * It is indexed by BOCU1_TRAIL_TO_BYTE() for trail values below
      * BOCU1_TRAIL_CONTROLS_COUNT.
 174  */
 175 static const int8_t
 176 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
 177 /*  0     1     2     3     4     5     6     7    */
 178     0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
 179
 180 /*  8     9     a     b     c     d     e     f    */
 181     0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
 182
 183 /*  10    11    12    13   */
 184     0x1c, 0x1d, 0x1e, 0x1f
 185 };
 186
 187 /**
 188  * Integer division and modulo with negative numerators
 189  * yields negative modulo results and quotients that are one more than
 190  * what we need here.
 191  * This macro adjust the results so that the modulo-value m is always >=0.
 192  *
 193  * For positive n, the if() condition is always FALSE.
 194  *
 195  * @param n Number to be split into quotient and rest.
 196  *          Will be modified to contain the quotient.
 197  * @param d Divisor.
 198  * @param m Output variable for the rest (modulo result).
 199  */
 200 #define NEGDIVMOD(n, d, m) { \
 201     (m)=(n)%(d); \
 202     (n)/=(d); \
 203     if((m)<0) { \
 204         --(n); \
 205         (m)+=(d); \
 206     } \
 207 }
 208
 209 /* BOCU-1 implementation functions ------------------------------------------ */
 210 /* "prev" for small scripts: start of c's aligned 128-block ((c)&~0x7f) plus BOCU1_ASCII_PREV (0x40), i.e. the middle of that block. */
 211 #define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
 212
 213 /**
 214  * Compute the next "previous" value for differencing
 215  * from the current code point.
 216  *
 217  * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
 218  * @return "previous code point" state value
 219  */
 220 static U_INLINE int32_t
 221 bocu1Prev(int32_t c) {
 222     /* compute new prev */
 223     if(/* 0x3040<=c && */ c<=0x309f) {
 224         /* Hiragana is not 128-aligned */
 225         return 0x3070;
 226     } else if(0x4e00<=c && c<=0x9fa5) {
 227         /* CJK Unihan */
 228         return 0x4e00-BOCU1_REACH_NEG_2;
 229     } else if(0xac00<=c /* && c<=0xd7a3 */) {
 230         /* Korean Hangul */
 231         return (0xd7a3+0xac00)/2;
 232     } else {
 233         /* mostly small scripts */
 234         return BOCU1_SIMPLE_PREV(c);
 235     }
 236 }
 237
 238 /**
      * Fast version of bocu1Prev() for most scripts.
      * Code points below U+3040 or above U+D7A3 get BOCU1_SIMPLE_PREV(c) inline;
      * only the range in between calls bocu1Prev().
      * The argument c is evaluated more than once.
      */
 239 #define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
 240
 241 /*
 242  * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
 243  * The UConverter fields are used as follows:
 244  *
 245  * fromUnicodeStatus    encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
 246  *
 247  * toUnicodeStatus      decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
 248  * mode                 decoder's incomplete (diff<<2)|count (ignored when toULength==0)
 249  */
 250
 251 /* BOCU-1-from-Unicode conversion functions --------------------------------- */
 252
 253 /**
 254  * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
 255  * and return a packed integer with them.
 256  *
 257  * The encoding favors small absolut differences with short encodings
 258  * to compress runs of same-script characters.
 259  *
 260  * Optimized version with unrolled loops and fewer floating-point operations
 261  * than the standard packDiff().
 262  *
 263  * @param diff difference value -0x10ffff..0x10ffff
 264  * @return
 265  *      0x010000zz for 1-byte sequence zz
 266  *      0x0200yyzz for 2-byte sequence yy zz
 267  *      0x03xxyyzz for 3-byte sequence xx yy zz
 268  *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
 269  */
 270 static int32_t
 271 packDiff(int32_t diff) {
 272     int32_t result, m;
 273
 274     if(diff>=BOCU1_REACH_NEG_1) {
 275         /* mostly positive differences, and single-byte negative ones */
 276 #if 0   /* single-byte case handled in macros, see below */
 277         if(diff<=BOCU1_REACH_POS_1) {
 278             /* single byte */
 279             return 0x01000000|(BOCU1_MIDDLE+diff);
 280         } else
 281 #endif
 282         if(diff<=BOCU1_REACH_POS_2) {
 283             /* two bytes */
 284             diff-=BOCU1_REACH_POS_1+1;
 285             result=0x02000000;
 286
 287             m=diff%BOCU1_TRAIL_COUNT;
 288             diff/=BOCU1_TRAIL_COUNT;
 289             result|=BOCU1_TRAIL_TO_BYTE(m);
 290
 291             result|=(BOCU1_START_POS_2+diff)<<8;
 292         } else if(diff<=BOCU1_REACH_POS_3) {
 293             /* three bytes */
 294             diff-=BOCU1_REACH_POS_2+1;
 295             result=0x03000000;
 296
 297             m=diff%BOCU1_TRAIL_COUNT;
 298             diff/=BOCU1_TRAIL_COUNT;
 299             result|=BOCU1_TRAIL_TO_BYTE(m);
 300
 301             m=diff%BOCU1_TRAIL_COUNT;
 302             diff/=BOCU1_TRAIL_COUNT;
 303             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
 304
 305             result|=(BOCU1_START_POS_3+diff)<<16;
 306         } else {
 307             /* four bytes */
 308             diff-=BOCU1_REACH_POS_3+1;
 309
 310             m=diff%BOCU1_TRAIL_COUNT;
 311             diff/=BOCU1_TRAIL_COUNT;
 312             result=BOCU1_TRAIL_TO_BYTE(m);
 313
 314             m=diff%BOCU1_TRAIL_COUNT;
 315             diff/=BOCU1_TRAIL_COUNT;
 316             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
 317
 318             /*
 319              * We know that / and % would deliver quotient 0 and rest=diff.
 320              * Avoid division and modulo for performance.
 321              */
 322             result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
 323
 324             result|=((uint32_t)BOCU1_START_POS_4)<<24;
 325         }
 326     } else {
 327         /* two- to four-byte negative differences */
 328         if(diff>=BOCU1_REACH_NEG_2) {
 329             /* two bytes */
 330             diff-=BOCU1_REACH_NEG_1;
 331             result=0x02000000;
 332
 333             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
 334             result|=BOCU1_TRAIL_TO_BYTE(m);
 335
 336             result|=(BOCU1_START_NEG_2+diff)<<8;
 337         } else if(diff>=BOCU1_REACH_NEG_3) {
 338             /* three bytes */
 339             diff-=BOCU1_REACH_NEG_2;
 340             result=0x03000000;
 341
 342             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
 343             result|=BOCU1_TRAIL_TO_BYTE(m);
 344
 345             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
 346             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
 347
 348             result|=(BOCU1_START_NEG_3+diff)<<16;
 349         } else {
 350             /* four bytes */
 351             diff-=BOCU1_REACH_NEG_3;
 352
 353             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
 354             result=BOCU1_TRAIL_TO_BYTE(m);
 355
 356             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
 357             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
 358
 359             /*
 360              * We know that NEGDIVMOD would deliver
 361              * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
 362              * Avoid division and modulo for performance.
 363              */
 364             m=diff+BOCU1_TRAIL_COUNT;
 365             result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
 366
 367             result|=BOCU1_MIN<<24;
 368         }
 369     }
 370     return result;
 371 }
 372
 373 /* Faster versions of packDiff() for single-byte-encoded diff values. */
 374
 375 /** Is a diff value encodable in a single byte? True for -64..63. The argument is evaluated twice. */
 376 #define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)
 377
 378 /** Encode a diff value in a single byte. For diff in -64..63 the result is BOCU1_MIDDLE+diff, i.e. 0x50..0xcf. */
 379 #define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))
 380
 381 /**
      * Is a diff value encodable in two bytes?
      * The tested range -10513..10512 also contains the single-byte range,
      * so this is only meaningful after DIFF_IS_SINGLE() has been ruled out.
      * The argument is evaluated twice.
      */
 382 #define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)
 383 /**
      * BOCU-1 encoder from UTF-16 that also writes, for every output byte,
      * the source index of the code point that produced it.
      *
      * Converter state used across calls:
      *  - cnv->fromUChar32: a lead surrogate that was the last unit of the
      *    previous source buffer, or 0. While pending inside this function it is
      *    held negated in c and written back as a positive value at the end.
      *  - cnv->fromUnicodeStatus: the BOCU-1 "prev" value; 0 is read as
      *    BOCU1_ASCII_PREV.
      *
      * Encoding rules as implemented below:
      *  - U+0000..U+0020 are written as their own byte; all of them except
      *    U+0020 reset prev to BOCU1_ASCII_PREV.
      *  - Any other code point is written as the 1..4-byte encoding of c-prev,
      *    and prev is then recomputed from c.
      *  - A lead surrogate followed by a trail surrogate is combined into one
      *    code point; surrogates that are not part of such a pair are encoded
      *    like any other code unit (no error is reported for them here).
      *
      * When the target fills up, *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR;
      * if this happens in the middle of a multi-byte sequence, the bytes that
      * do not fit are stored in cnv->charErrorBuffer and counted in
      * cnv->charErrorBufferLength.
      *
      * pArgs->offsets is written through unconditionally.
      * NOTE(review): presumably this function is only selected when
      * pArgs->offsets!=NULL -- confirm against the converter dispatch code.
      *
      * @param pArgs      in/out: converter, source/sourceLimit, target/targetLimit
      *                   and offsets; source, target and offsets are advanced
      * @param pErrorCode out: set to U_BUFFER_OVERFLOW_ERROR on target overflow;
      *                   not otherwise modified by this function
      */
 384 static void
 385 _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
 386                              UErrorCode *pErrorCode) {
 387     UConverter *cnv;
 388     const UChar *source, *sourceLimit;
 389     uint8_t *target;
 390     int32_t targetCapacity;
 391     int32_t *offsets;
 392
 393     int32_t prev, c, diff;
 394
 395     int32_t sourceIndex, nextSourceIndex;
 396
 397 U_ALIGN_CODE(16)
 398
 399     /* set up the local pointers */
 400     cnv=pArgs->converter;
 401     source=pArgs->source;
 402     sourceLimit=pArgs->sourceLimit;
 403     target=(uint8_t *)pArgs->target;
 404     targetCapacity=pArgs->targetLimit-pArgs->target;
 405     offsets=pArgs->offsets;
 406
 407     /* get the converter state from UConverter */
 408     c=cnv->fromUChar32;
 409     prev=(int32_t)cnv->fromUnicodeStatus;
 410     if(prev==0) {
 411         prev=BOCU1_ASCII_PREV;
 412     }
 413
 414     /* sourceIndex=-1 if the current character began in the previous buffer */
 415     sourceIndex= c==0 ? 0 : -1;
 416     nextSourceIndex=0;
 417
 418     /* conversion loop */
         /* c!=0: a lead surrogate is pending from the previous call; jump into the regular loop to look for its trail */
 419     if(c!=0 && targetCapacity>0) {
 420         goto getTrail;
 421     }
 422
 423 fastSingle:
 424     /* fast loop for single-byte differences */
 425     /* use only one loop counter variable, targetCapacity, not also source */
         /* targetCapacity is temporarily clamped to the number of remaining source units; the real value is restored after the loop */
 426     diff=sourceLimit-source;
 427     if(targetCapacity>diff) {
 428         targetCapacity=diff;
 429     }
         /* below U+3000 the next prev is always BOCU1_SIMPLE_PREV(c), so bocu1Prev() is never needed in this loop */
 430     while(targetCapacity>0 && (c=*source)<0x3000) {
 431         if(c<=0x20) {
 432             if(c!=0x20) {
 433                 prev=BOCU1_ASCII_PREV;
 434             }
 435             *target++=(uint8_t)c;
 436             *offsets++=nextSourceIndex++;
 437             ++source;
 438             --targetCapacity;
 439         } else {
 440             diff=c-prev;
 441             if(DIFF_IS_SINGLE(diff)) {
 442                 prev=BOCU1_SIMPLE_PREV(c);
 443                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
 444                 *offsets++=nextSourceIndex++;
 445                 ++source;
 446                 --targetCapacity;
 447             } else {
                     /* needs more than one byte: leave source on this unit and let the regular loop handle it */
 448                 break;
 449             }
 450         }
 451     }
 452     /* restore real values */
 453     targetCapacity=(const uint8_t *)pArgs->targetLimit-target;
 454     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
 455
 456     /* regular loop for all cases */
 457     while(source<sourceLimit) {
 458         if(targetCapacity>0) {
 459             c=*source++;
 460             ++nextSourceIndex;
 461
 462             if(c<=0x20) {
 463                 /*
 464                  * ISO C0 control & space:
 465                  * Encode directly for MIME compatibility,
 466                  * and reset state except for space, to not disrupt compression.
 467                  */
 468                 if(c!=0x20) {
 469                     prev=BOCU1_ASCII_PREV;
 470                 }
 471                 *target++=(uint8_t)c;
 472                 *offsets++=sourceIndex;
 473                 --targetCapacity;
 474
 475                 sourceIndex=nextSourceIndex;
 476                 continue;
 477             }
 478
 479             if(UTF_IS_LEAD(c)) {
 480 getTrail:
 481                 if(source<sourceLimit) {
 482                     /* test the following code unit */
 483                     UChar trail=*source;
 484                     if(UTF_IS_SECOND_SURROGATE(trail)) {
 485                         ++source;
 486                         ++nextSourceIndex;
 487                         c=UTF16_GET_PAIR_VALUE(c, trail);
 488                     }
                         /* otherwise c stays the unpaired lead surrogate and is encoded as is */
 489                 } else {
 490                     /* no more input */
 491                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
 492                     break;
 493                 }
 494             }
 495
 496             /*
 497              * all other Unicode code points c==U+0021..U+10ffff
 498              * are encoded with the difference c-prev
 499              *
 500              * a new prev is computed from c,
 501              * placed in the middle of a 0x80-block (for most small scripts) or
 502              * in the middle of the Unihan and Hangul blocks
 503              * to statistically minimize the following difference
 504              */
 505             diff=c-prev;
 506             prev=BOCU1_PREV(c);
 507             if(DIFF_IS_SINGLE(diff)) {
 508                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
 509                 *offsets++=sourceIndex;
 510                 --targetCapacity;
 511                 sourceIndex=nextSourceIndex;
 512                 if(c<0x3000) {
                         /* back in the range the fast loop handles: re-enter it */
 513                     goto fastSingle;
 514                 }
 515             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
 516                 /* optimize 2-byte case */
                     /* same arithmetic as the two-byte branches of packDiff(), written straight to the target */
 517                 int32_t m;
 518
 519                 if(diff>=0) {
 520                     diff-=BOCU1_REACH_POS_1+1;
 521                     m=diff%BOCU1_TRAIL_COUNT;
 522                     diff/=BOCU1_TRAIL_COUNT;
 523                     diff+=BOCU1_START_POS_2;
 524                 } else {
 525                     diff-=BOCU1_REACH_NEG_1;
 526                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
 527                     diff+=BOCU1_START_NEG_2;
 528                 }
 529                 *target++=(uint8_t)diff;
 530                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
 531                 *offsets++=sourceIndex;
 532                 *offsets++=sourceIndex;
 533                 targetCapacity-=2;
 534                 sourceIndex=nextSourceIndex;
 535             } else {
 536                 int32_t length; /* will be 2..4 */
 537
                     /* from here on diff holds the packed byte sequence, no longer the difference */
 538                 diff=packDiff(diff);
 539                 length=BOCU1_LENGTH_FROM_PACKED(diff);
 540
 541                 /* write the output character bytes from diff and length */
 542                 /* from the first if in the loop we know that targetCapacity>0 */
 543                 if(length<=targetCapacity) {
 544                     switch(length) {
 545                         /* each branch falls through to the next one */
 546                     case 4:
 547                         *target++=(uint8_t)(diff>>24);
 548                         *offsets++=sourceIndex;
 549                     case 3:
 550                         *target++=(uint8_t)(diff>>16);
 551                         *offsets++=sourceIndex;
 552                     case 2:
 553                         *target++=(uint8_t)(diff>>8);
 554                         *offsets++=sourceIndex;
 555                     /* case 1: handled above */
 556                         *target++=(uint8_t)diff;
 557                         *offsets++=sourceIndex;
 558                     default:
 559                         /* will never occur */
 560                         break;
 561                     }
 562                     targetCapacity-=length;
 563                     sourceIndex=nextSourceIndex;
 564                 } else {
 565                     uint8_t *charErrorBuffer;
 566
 567                     /*
 568                      * We actually do this backwards here:
 569                      * In order to save an intermediate variable, we output
 570                      * first to the overflow buffer what does not fit into the
 571                      * regular target.
 572                      */
 573                     /* we know that 1<=targetCapacity<length<=4 */
 574                     length-=targetCapacity;
                         /* length is now the number of trailing bytes that go to the overflow buffer (1..3) */
 575                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
 576                     switch(length) {
 577                         /* each branch falls through to the next one */
 578                     case 3:
 579                         *charErrorBuffer++=(uint8_t)(diff>>16);
 580                     case 2:
 581                         *charErrorBuffer++=(uint8_t)(diff>>8);
 582                     case 1:
 583                         *charErrorBuffer=(uint8_t)diff;
 584                     default:
 585                         /* will never occur */
 586                         break;
 587                     }
 588                     cnv->charErrorBufferLength=(int8_t)length;
 589
 590                     /* now output what fits into the regular target */
 591                     diff>>=8*length; /* length was reduced by targetCapacity */
 592                     switch(targetCapacity) {
 593                         /* each branch falls through to the next one */
 594                     case 3:
 595                         *target++=(uint8_t)(diff>>16);
 596                         *offsets++=sourceIndex;
 597                     case 2:
 598                         *target++=(uint8_t)(diff>>8);
 599                         *offsets++=sourceIndex;
 600                     case 1:
 601                         *target++=(uint8_t)diff;
 602                         *offsets++=sourceIndex;
 603                     default:
 604                         /* will never occur */
 605                         break;
 606                     }
 607
 608                     /* target overflow */
 609                     targetCapacity=0;
 610                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 611                     break;
 612                 }
 613             }
 614         } else {
 615             /* target is full */
 616             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 617             break;
 618         }
 619     }
 620
 621     /* set the converter state back into UConverter */
         /* c<0 only after the "no more input" break above: keep the lead surrogate for the next call */
 622     cnv->fromUChar32= c<0 ? -c : 0;
 623     cnv->fromUnicodeStatus=(uint32_t)prev;
 624
 625     /* write back the updated pointers */
 626     pArgs->source=source;
 627     pArgs->target=(char *)target;
 628     pArgs->offsets=offsets;
 629 }
 630
 631 /*
 632  * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
 633  * If a change is made in the original function, then either
 634  * change this function the same way or
 635  * re-copy the original function and remove the variables
 636  * offsets, sourceIndex, and nextSourceIndex.
      *
      * Summary of what the code below does:
      *  - reads the pending lead surrogate from cnv->fromUChar32 and "prev" from
      *    cnv->fromUnicodeStatus (0 is read as BOCU1_ASCII_PREV);
      *  - writes U+0000..U+0020 as their own byte (resetting prev except for
      *    U+0020) and every other code point as the 1..4-byte encoding of c-prev;
      *  - on a full target sets *pErrorCode to U_BUFFER_OVERFLOW_ERROR and stores
      *    the bytes of a split multi-byte sequence that do not fit in
      *    cnv->charErrorBuffer / cnv->charErrorBufferLength;
      *  - writes the state back to the converter and the advanced source and
      *    target pointers back to pArgs. pArgs->offsets is not touched.
      *
      * Visible differences from the offsets version besides the removed variables:
      * the fast loop shares one ++source/--targetCapacity pair for both branches,
      * and the non-overflow switch has no "case 2" label (two-byte sequences that
      * fit are handled by the DIFF_IS_DOUBLE branch before it).
 637  */
 638 static void
 639 _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
 640                   UErrorCode *pErrorCode) {
 641     UConverter *cnv;
 642     const UChar *source, *sourceLimit;
 643     uint8_t *target;
 644     int32_t targetCapacity;
 645
 646     int32_t prev, c, diff;
 647
 648     /* set up the local pointers */
 649     cnv=pArgs->converter;
 650     source=pArgs->source;
 651     sourceLimit=pArgs->sourceLimit;
 652     target=(uint8_t *)pArgs->target;
 653     targetCapacity=pArgs->targetLimit-pArgs->target;
 654
 655     /* get the converter state from UConverter */
 656     c=cnv->fromUChar32;
 657     prev=(int32_t)cnv->fromUnicodeStatus;
 658     if(prev==0) {
 659         prev=BOCU1_ASCII_PREV;
 660     }
 661
 662     /* conversion loop */
         /* c!=0: a lead surrogate is pending from the previous call; jump into the regular loop to look for its trail */
 663     if(c!=0 && targetCapacity>0) {
 664         goto getTrail;
 665     }
 666
 667 fastSingle:
 668     /* fast loop for single-byte differences */
 669     /* use only one loop counter variable, targetCapacity, not also source */
         /* targetCapacity is temporarily clamped to the number of remaining source units; the real value is restored after the loop */
 670     diff=sourceLimit-source;
 671     if(targetCapacity>diff) {
 672         targetCapacity=diff;
 673     }
 674     while(targetCapacity>0 && (c=*source)<0x3000) {
 675         if(c<=0x20) {
 676             if(c!=0x20) {
 677                 prev=BOCU1_ASCII_PREV;
 678             }
 679             *target++=(uint8_t)c;
 680         } else {
 681             diff=c-prev;
 682             if(DIFF_IS_SINGLE(diff)) {
 683                 prev=BOCU1_SIMPLE_PREV(c);
 684                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
 685             } else {
                     /* needs more than one byte: leave source on this unit and let the regular loop handle it */
 686                 break;
 687             }
 688         }
             /* reached only when exactly one byte was written for one source unit */
 689         ++source;
 690         --targetCapacity;
 691     }
 692     /* restore real values */
 693     targetCapacity=(const uint8_t *)pArgs->targetLimit-target;
 694
 695     /* regular loop for all cases */
 696     while(source<sourceLimit) {
 697         if(targetCapacity>0) {
 698             c=*source++;
 699
 700             if(c<=0x20) {
 701                 /*
 702                  * ISO C0 control & space:
 703                  * Encode directly for MIME compatibility,
 704                  * and reset state except for space, to not disrupt compression.
 705                  */
 706                 if(c!=0x20) {
 707                     prev=BOCU1_ASCII_PREV;
 708                 }
 709                 *target++=(uint8_t)c;
 710                 --targetCapacity;
 711                 continue;
 712             }
 713
 714             if(UTF_IS_LEAD(c)) {
 715 getTrail:
 716                 if(source<sourceLimit) {
 717                     /* test the following code unit */
 718                     UChar trail=*source;
 719                     if(UTF_IS_SECOND_SURROGATE(trail)) {
 720                         ++source;
 721                         c=UTF16_GET_PAIR_VALUE(c, trail);
 722                     }
                         /* otherwise c stays the unpaired lead surrogate and is encoded as is */
 723                 } else {
 724                     /* no more input */
 725                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
 726                     break;
 727                 }
 728             }
 729
 730             /*
 731              * all other Unicode code points c==U+0021..U+10ffff
 732              * are encoded with the difference c-prev
 733              *
 734              * a new prev is computed from c,
 735              * placed in the middle of a 0x80-block (for most small scripts) or
 736              * in the middle of the Unihan and Hangul blocks
 737              * to statistically minimize the following difference
 738              */
 739             diff=c-prev;
 740             prev=BOCU1_PREV(c);
 741             if(DIFF_IS_SINGLE(diff)) {
 742                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
 743                 --targetCapacity;
 744                 if(c<0x3000) {
                         /* back in the range the fast loop handles: re-enter it */
 745                     goto fastSingle;
 746                 }
 747             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
 748                 /* optimize 2-byte case */
 749                 int32_t m;
 750
 751                 if(diff>=0) {
 752                     diff-=BOCU1_REACH_POS_1+1;
 753                     m=diff%BOCU1_TRAIL_COUNT;
 754                     diff/=BOCU1_TRAIL_COUNT;
 755                     diff+=BOCU1_START_POS_2;
 756                 } else {
 757                     diff-=BOCU1_REACH_NEG_1;
 758                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
 759                     diff+=BOCU1_START_NEG_2;
 760                 }
 761                 *target++=(uint8_t)diff;
 762                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
 763                 targetCapacity-=2;
 764             } else {
 765                 int32_t length; /* will be 2..4 */
 766
                     /* from here on diff holds the packed byte sequence, no longer the difference */
 767                 diff=packDiff(diff);
 768                 length=BOCU1_LENGTH_FROM_PACKED(diff);
 769
 770                 /* write the output character bytes from diff and length */
 771                 /* from the first if in the loop we know that targetCapacity>0 */
 772                 if(length<=targetCapacity) {
 773                     switch(length) {
 774                         /* each branch falls through to the next one */
 775                     case 4:
 776                         *target++=(uint8_t)(diff>>24);
 777                     case 3:
 778                         *target++=(uint8_t)(diff>>16);
 779                     /* case 2: handled above */
 780                         *target++=(uint8_t)(diff>>8);
 781                     /* case 1: handled above */
 782                         *target++=(uint8_t)diff;
 783                     default:
 784                         /* will never occur */
 785                         break;
 786                     }
 787                     targetCapacity-=length;
 788                 } else {
 789                     uint8_t *charErrorBuffer;
 790
 791                     /*
 792                      * We actually do this backwards here:
 793                      * In order to save an intermediate variable, we output
 794                      * first to the overflow buffer what does not fit into the
 795                      * regular target.
 796                      */
 797                     /* we know that 1<=targetCapacity<length<=4 */
 798                     length-=targetCapacity;
                         /* length is now the number of trailing bytes that go to the overflow buffer (1..3) */
 799                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
 800                     switch(length) {
 801                         /* each branch falls through to the next one */
 802                     case 3:
 803                         *charErrorBuffer++=(uint8_t)(diff>>16);
 804                     case 2:
 805                         *charErrorBuffer++=(uint8_t)(diff>>8);
 806                     case 1:
 807                         *charErrorBuffer=(uint8_t)diff;
 808                     default:
 809                         /* will never occur */
 810                         break;
 811                     }
 812                     cnv->charErrorBufferLength=(int8_t)length;
 813
 814                     /* now output what fits into the regular target */
 815                     diff>>=8*length; /* length was reduced by targetCapacity */
 816                     switch(targetCapacity) {
 817                         /* each branch falls through to the next one */
 818                     case 3:
 819                         *target++=(uint8_t)(diff>>16);
 820                     case 2:
 821                         *target++=(uint8_t)(diff>>8);
 822                     case 1:
 823                         *target++=(uint8_t)diff;
 824                     default:
 825                         /* will never occur */
 826                         break;
 827                     }
 828
 829                     /* target overflow */
 830                     targetCapacity=0;
 831                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 832                     break;
 833                 }
 834             }
 835         } else {
 836             /* target is full */
 837             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 838             break;
 839         }
 840     }
 841
 842     /* set the converter state back into UConverter */
         /* c<0 only after the "no more input" break above: keep the lead surrogate for the next call */
 843     cnv->fromUChar32= c<0 ? -c : 0;
 844     cnv->fromUnicodeStatus=(uint32_t)prev;
 845
 846     /* write back the updated pointers */
 847     pArgs->source=source;
 848     pArgs->target=(char *)target;
 849 }
 850
 851 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
 852
 853 /**
 854  * Function for BOCU-1 decoder; handles multi-byte lead bytes.
 855  * Determines from the lead byte the partial difference and the number of
 856  * trail bytes (1..3) that follow, and packs both into one state value.
 857  *
 858  * @param b lead byte of a multi-byte difference; the callers handle
 859  *          single-byte differences, bytes <=0x20 and BOCU1_RESET themselves
 860  * @return (diff<<2)|count, computed on the unsigned bit pattern of diff
 861  */
 862 static U_INLINE int32_t
 863 decodeBocu1LeadByte(int32_t b) {
 864     int32_t diff, count;
 865
 866     if(b>=BOCU1_START_NEG_2) {
 867         /* positive difference */
 868         if(b<BOCU1_START_POS_3) {
 869             /* two bytes */
 870             diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
 871             count=1;
 872         } else if(b<BOCU1_START_POS_4) {
 873             /* three bytes */
 874             diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
 875             count=2;
 876         } else {
 877             /* four bytes */
 878             diff=BOCU1_REACH_POS_3+1;
 879             count=3;
 880         }
 881     } else if(b>=BOCU1_START_NEG_3) {
 882         /* negative difference, two bytes */
 883         diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
 884         count=1;
 885     } else if(b>BOCU1_MIN) {
 886         /* negative difference, three bytes */
 887         diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
 888         count=2;
 889     } else {
 890         /* negative difference, four bytes */
 891         diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
 892         count=3;
 893     }
 894
 895     /* Pack the state for the trail byte(s); count (1..3) fits into the low two bits.
 896      * Left-shifting a negative signed diff is undefined in C: shift it as unsigned. */
 897     return (int32_t)((uint32_t)diff<<2)|count;
 898 }
 899
 900 /**
 901  * BOCU-1 decoder helper: turns one trail byte of a multi-byte difference
 902  * into its weighted contribution to that difference.
 903  *
 904  * @param count number of trail bytes still expected, counting this one
 905  * @param b the trail byte value
 906  * @return amount to add to diff for b; <0 signals an illegal trail byte
 907  *
 908  * @see decodeBocu1
 909  */
 910 static U_INLINE int32_t
 911 decodeBocu1TrailByte(int32_t count, int32_t b) {
 912     if(b>0x20) {
 913 #if BOCU1_MAX_TRAIL<0xff
 914         if(b>BOCU1_MAX_TRAIL) {
 915             return -99;
 916         }
 917 #endif
 918         /* ordinary trail byte: shift it down into the contiguous trail range */
 919         b-=BOCU1_TRAIL_BYTE_OFFSET;
 920     } else {
 921         /* C0 range: table lookup; b<0 for an illegal byte makes the result <0 */
 922         b=bocu1ByteToTrail[b];
 923     }
 924
 925     /* scale by the position of this trail byte; the last one counts once */
 926     switch(count) {
 927     case 1:  return b;
 928     case 2:  return b*BOCU1_TRAIL_COUNT;
 929     default: return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT); /* count==3 */
 930     }
 931 }
 932
     /*
      * BOCU-1-to-Unicode conversion that also records source offsets.
      *
      * Consumes bytes from pArgs->source up to pArgs->sourceLimit and writes
      * UChars to pArgs->target up to pArgs->targetLimit. For every UChar written,
      * one entry is appended to pArgs->offsets: the index (counted from the start
      * of this call's source) of the first byte of the character it came from,
      * or -1 if that character began in a previous buffer.
      *
      * The decoder state is read from and written back to the converter
      * (toUnicodeStatus, mode, toUBytes/toULength) so that a multi-byte sequence
      * cut off at sourceLimit can be continued by the next call.
      *
      * *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR when the target is full and
      * to U_ILLEGAL_CHAR_FOUND for an illegal trail byte or a result >0x10ffff.
      * offsets is dereferenced without a NULL check.
      *
      * @param pArgs converter, source/target pointers with their limits, offsets
      * @param pErrorCode receives the error conditions listed above
      */
 933 static void
 934 _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
 935                            UErrorCode *pErrorCode) {
 936     UConverter *cnv;
 937     const uint8_t *source, *sourceLimit;
 938     UChar *target;
 939     const UChar *targetLimit;
 940     int32_t *offsets;
 941
         /*
          * prev:  value that each decoded difference is added to
          * count: number of trail bytes still expected for the current character;
          *        reused as the iteration counter of the fast loop
          * diff:  difference accumulated so far for the current character
          * c:     the current byte, later the decoded code point
          */
 942     int32_t prev, count, diff, c;
 943
         /* bytes of an unfinished or illegal sequence are kept in cnv->toUBytes */
 944     int8_t byteIndex;
 945     uint8_t *bytes;
 946
         /* indexes (relative to this call's source) of the current character's
          * first byte and of the next byte to be read */
 947     int32_t sourceIndex, nextSourceIndex;
 948
 949     /* set up the local pointers */
 950     cnv=pArgs->converter;
 951     source=(const uint8_t *)pArgs->source;
 952     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
 953     target=pArgs->target;
 954     targetLimit=pArgs->targetLimit;
 955     offsets=pArgs->offsets;
 956
 957     /* get the converter state from UConverter */
 958     prev=(int32_t)cnv->toUnicodeStatus;
 959     if(prev==0) {
             /* a stored status of 0 stands for the initial state */
 960         prev=BOCU1_ASCII_PREV;
 961     }
         /* mode holds (diff<<2)|count as written at the end of this function */
 962     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
 963     count=diff&3;
 964     diff>>=2;
 965
 966     byteIndex=cnv->toULength;
 967     bytes=cnv->toUBytes;
 968
 969     /* sourceIndex=-1 if the current character began in the previous buffer */
 970     sourceIndex=byteIndex==0 ? 0 : -1;
 971     nextSourceIndex=0;
 972
 973     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
         /*
          * Resume a multi-byte sequence that was cut off at the end of the
          * previous buffer: trail bytes are outstanding and the bytes seen so
          * far are in toUBytes.
          * NOTE(review): when such a state is pending but target==targetLimit,
          * control falls into the fast-loop setup below, which overwrites diff
          * and count; the pending count is then not written back. Confirm that
          * callers never pass a full target together with a pending sequence.
          */
 974     if(count>0 && byteIndex>0 && target<targetLimit) {
 975         goto getTrail;
 976     }
 977
 978 fastSingle:
 979     /* fast loop for single-byte differences */
 980     /* use count as the only loop counter variable */
         /*
          * count=min(remaining source bytes, remaining target units): each
          * iteration that does not break consumes one byte and writes one
          * UChar, so the loop needs no other limit checks.
          */
 981     diff=sourceLimit-source;
 982     count=pArgs->targetLimit-target;
 983     if(count>diff) {
 984         count=diff;
 985     }
 986     while(count>0) {
 987         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
                 /* single-byte difference relative to prev */
 988             c=prev+(c-BOCU1_MIDDLE);
 989             if(c<0x3000) {
 990                 *target++=(UChar)c;
 991                 *offsets++=nextSourceIndex++;
 992                 prev=BOCU1_SIMPLE_PREV(c);
 993             } else {
                     /* leave the byte unconsumed; the general loop below re-reads it */
 994                 break;
 995             }
 996         } else if(c<=0x20) {
                 /* C0 control or space is output as is; only controls reset prev */
 997             if(c!=0x20) {
 998                 prev=BOCU1_ASCII_PREV;
 999             }
1000             *target++=(UChar)c;
1001             *offsets++=nextSourceIndex++;
1002         } else {
                 /* any other byte is handled by the general loop below */
1003             break;
1004         }
1005         ++source;
1006         --count;
1007     }
1008     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
1009
1010     /* decode a sequence of single and lead bytes */
1011     while(source<sourceLimit) {
1012         if(target>=targetLimit) {
1013             /* target is full */
1014             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1015             break;
1016         }
1017
1018         ++nextSourceIndex;
1019         c=*source++;
1020         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1021             /* Write a code point directly from a single-byte difference. */
1022             c=prev+(c-BOCU1_MIDDLE);
1023             if(c<0x3000) {
1024                 *target++=(UChar)c;
1025                 *offsets++=sourceIndex;
1026                 prev=BOCU1_SIMPLE_PREV(c);
1027                 sourceIndex=nextSourceIndex;
                     /* back to the fast loop, which recomputes its counter */
1028                 goto fastSingle;
1029             }
                 /* c>=0x3000: fall out of the if-chain to the common output code */
1030         } else if(c<=0x20) {
1031             /*
1032              * Direct-encoded C0 control code or space.
1033              * Reset prev for C0 control codes but not for space.
1034              */
1035             if(c!=0x20) {
1036                 prev=BOCU1_ASCII_PREV;
1037             }
1038             *target++=(UChar)c;
1039             *offsets++=sourceIndex;
1040             sourceIndex=nextSourceIndex;
1041             continue;
             /*
              * The two-byte shortcut is only taken when the trail byte is available
              * in this buffer (source<sourceLimit); otherwise the general lead byte
              * code in the last else-branch is used, which can suspend.
              */
1042         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1043             /* Optimize two-byte case. */
1044             if(c>=BOCU1_MIDDLE) {
1045                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1046             } else {
1047                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1048             }
1049
1050             /* trail byte */
1051             ++nextSourceIndex;
1052             c=decodeBocu1TrailByte(1, *source++);
1053             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
                     /* store both bytes of the illegal sequence in toUBytes */
1054                 bytes[0]=source[-2];
1055                 bytes[1]=source[-1];
1056                 byteIndex=2;
1057                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1058                 break;
1059             }
1060         } else if(c==BOCU1_RESET) {
1061             /* only reset the state, no code point */
1062             prev=BOCU1_ASCII_PREV;
1063             sourceIndex=nextSourceIndex;
1064             continue;
1065         } else {
1066             /*
1067              * For multi-byte difference lead bytes, set the decoder state
1068              * with the partial difference value from the lead byte and
1069              * with the number of trail bytes.
1070              */
1071             bytes[0]=(uint8_t)c;
1072             byteIndex=1;
1073
1074             diff=decodeBocu1LeadByte(c);
1075             count=diff&3;
1076             diff>>=2;
1077 getTrail:
1078             for(;;) {
1079                 if(source>=sourceLimit) {
                         /* input ends inside a character: diff, count and bytes
                          * are written back below for the next call */
1080                     goto endloop;
1081                 }
1082                 ++nextSourceIndex;
1083                 c=bytes[byteIndex++]=*source++;
1084
1085                 /* trail byte in any position */
1086                 c=decodeBocu1TrailByte(count, c);
1087                 if(c<0) {
1088                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1089                     goto endloop;
1090                 }
1091
1092                 diff+=c;
1093                 if(--count==0) {
1094                     /* final trail byte, deliver a code point */
                         /*
                          * NOTE(review): byteIndex is reset before the range check,
                          * so on the error path below toULength is stored as 0 although
                          * bytes[] still holds the sequence - confirm this is intended.
                          */
1095                     byteIndex=0;
1096                     c=prev+diff;
1097                     if((uint32_t)c>0x10ffff) {
1098                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1099                         goto endloop;
1100                     }
1101                     break;
1102                 }
1103             }
1104         }
1105
1106         /* calculate the next prev and output c */
1107         prev=BOCU1_PREV(c);
1108         if(c<=0xffff) {
1109             *target++=(UChar)c;
1110             *offsets++=sourceIndex;
1111         } else {
1112             /* output surrogate pair */
1113             *target++=UTF16_LEAD(c);
1114             if(target<targetLimit) {
1115                 *target++=UTF16_TRAIL(c);
                     /* both surrogates get the same source index */
1116                 *offsets++=sourceIndex;
1117                 *offsets++=sourceIndex;
1118             } else {
1119                 /* target overflow */
                     /* only the lead surrogate fit; keep the trail surrogate in
                      * the converter's UCharErrorBuffer */
1120                 *offsets++=sourceIndex;
1121                 cnv->UCharErrorBuffer[0]=UTF16_TRAIL(c);
1122                 cnv->UCharErrorBufferLength=1;
1123                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1124                 break;
1125             }
1126         }
1127         sourceIndex=nextSourceIndex;
1128     }
1129 endloop:
1130
1131     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1132         /* set the converter state in UConverter to deal with the next character */
1133         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1134         cnv->mode=0;
1135     } else {
1136         /* set the converter state back into UConverter */
             /*
              * After the fast loop diff and count hold loop bookkeeping rather than
              * decoder state; the resume check at the top only uses mode when
              * toULength>0.
              * NOTE(review): diff may be negative here (negative differences);
              * left-shifting a negative signed value is undefined behavior in C -
              * consider shifting the value as uint32_t.
              */
1137         cnv->toUnicodeStatus=(uint32_t)prev;
1138         cnv->mode=(diff<<2)|count;
1139     }
1140     cnv->toULength=byteIndex;
1141
1142     /* write back the updated pointers */
1143     pArgs->source=(const char *)source;
1144     pArgs->target=target;
1145     pArgs->offsets=offsets;
1146     return;
1147 }
1148
1149 /*
1150  * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
1151  * If a change is made in the original function, then either
1152  * change this function the same way or
1153  * re-copy the original function and remove the variables
1154  * offsets, sourceIndex, and nextSourceIndex.
      *
      * Consumes bytes from pArgs->source up to pArgs->sourceLimit and writes
      * UChars to pArgs->target up to pArgs->targetLimit. The decoder state is
      * read from and written back to the converter (toUnicodeStatus, mode,
      * toUBytes/toULength) so that a multi-byte sequence cut off at sourceLimit
      * can be continued by the next call.
      *
      * *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR when the target is full and
      * to U_ILLEGAL_CHAR_FOUND for an illegal trail byte or a result >0x10ffff.
      *
      * Unlike the offsets variant, this copy also contains U_ALIGN_CODE(16).
1155  */
1156 static void
1157 _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
1158                 UErrorCode *pErrorCode) {
1159     UConverter *cnv;
1160     const uint8_t *source, *sourceLimit;
1161     UChar *target;
1162     const UChar *targetLimit;
1163
         /*
          * prev:  value that each decoded difference is added to
          * count: number of trail bytes still expected for the current character;
          *        reused as the iteration counter of the fast loop
          * diff:  difference accumulated so far for the current character
          * c:     the current byte, later the decoded code point
          */
1164     int32_t prev, count, diff, c;
1165
         /* bytes of an unfinished or illegal sequence are kept in cnv->toUBytes */
1166     int8_t byteIndex;
1167     uint8_t *bytes;
1168
         /* project macro defined elsewhere; presumably a code alignment hint - TODO confirm */
1169 U_ALIGN_CODE(16)
1170
1171     /* set up the local pointers */
1172     cnv=pArgs->converter;
1173     source=(const uint8_t *)pArgs->source;
1174     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1175     target=pArgs->target;
1176     targetLimit=pArgs->targetLimit;
1177
1178     /* get the converter state from UConverter */
1179     prev=(int32_t)cnv->toUnicodeStatus;
1180     if(prev==0) {
             /* a stored status of 0 stands for the initial state */
1181         prev=BOCU1_ASCII_PREV;
1182     }
         /* mode holds (diff<<2)|count as written at the end of this function */
1183     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
1184     count=diff&3;
1185     diff>>=2;
1186
1187     byteIndex=cnv->toULength;
1188     bytes=cnv->toUBytes;
1189
1190     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
         /*
          * Resume a multi-byte sequence that was cut off at the end of the
          * previous buffer: trail bytes are outstanding and the bytes seen so
          * far are in toUBytes.
          * NOTE(review): when such a state is pending but target==targetLimit,
          * control falls into the fast-loop setup below, which overwrites diff
          * and count; the pending count is then not written back. Confirm that
          * callers never pass a full target together with a pending sequence.
          */
1191     if(count>0 && byteIndex>0 && target<targetLimit) {
1192         goto getTrail;
1193     }
1194
1195 fastSingle:
1196     /* fast loop for single-byte differences */
1197     /* use count as the only loop counter variable */
         /*
          * count=min(remaining source bytes, remaining target units): each
          * iteration that does not break consumes one byte and writes one
          * UChar, so the loop needs no other limit checks.
          */
1198     diff=sourceLimit-source;
1199     count=pArgs->targetLimit-target;
1200     if(count>diff) {
1201         count=diff;
1202     }
1203     while(count>0) {
1204         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
                 /* single-byte difference relative to prev */
1205             c=prev+(c-BOCU1_MIDDLE);
1206             if(c<0x3000) {
1207                 *target++=(UChar)c;
1208                 prev=BOCU1_SIMPLE_PREV(c);
1209             } else {
                     /* leave the byte unconsumed; the general loop below re-reads it */
1210                 break;
1211             }
1212         } else if(c<=0x20) {
                 /* C0 control or space is output as is; only controls reset prev */
1213             if(c!=0x20) {
1214                 prev=BOCU1_ASCII_PREV;
1215             }
1216             *target++=(UChar)c;
1217         } else {
                 /* any other byte is handled by the general loop below */
1218             break;
1219         }
1220         ++source;
1221         --count;
1222     }
1223
1224     /* decode a sequence of single and lead bytes */
1225     while(source<sourceLimit) {
1226         if(target>=targetLimit) {
1227             /* target is full */
1228             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1229             break;
1230         }
1231
1232         c=*source++;
1233         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1234             /* Write a code point directly from a single-byte difference. */
1235             c=prev+(c-BOCU1_MIDDLE);
1236             if(c<0x3000) {
1237                 *target++=(UChar)c;
1238                 prev=BOCU1_SIMPLE_PREV(c);
                     /* back to the fast loop, which recomputes its counter */
1239                 goto fastSingle;
1240             }
                 /* c>=0x3000: fall out of the if-chain to the common output code */
1241         } else if(c<=0x20) {
1242             /*
1243              * Direct-encoded C0 control code or space.
1244              * Reset prev for C0 control codes but not for space.
1245              */
1246             if(c!=0x20) {
1247                 prev=BOCU1_ASCII_PREV;
1248             }
1249             *target++=(UChar)c;
1250             continue;
             /*
              * The two-byte shortcut is only taken when the trail byte is available
              * in this buffer (source<sourceLimit); otherwise the general lead byte
              * code in the last else-branch is used, which can suspend.
              */
1251         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1252             /* Optimize two-byte case. */
1253             if(c>=BOCU1_MIDDLE) {
1254                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1255             } else {
1256                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1257             }
1258
1259             /* trail byte */
1260             c=decodeBocu1TrailByte(1, *source++);
1261             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
                     /* store both bytes of the illegal sequence in toUBytes */
1262                 bytes[0]=source[-2];
1263                 bytes[1]=source[-1];
1264                 byteIndex=2;
1265                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1266                 break;
1267             }
1268         } else if(c==BOCU1_RESET) {
1269             /* only reset the state, no code point */
1270             prev=BOCU1_ASCII_PREV;
1271             continue;
1272         } else {
1273             /*
1274              * For multi-byte difference lead bytes, set the decoder state
1275              * with the partial difference value from the lead byte and
1276              * with the number of trail bytes.
1277              */
1278             bytes[0]=(uint8_t)c;
1279             byteIndex=1;
1280
1281             diff=decodeBocu1LeadByte(c);
1282             count=diff&3;
1283             diff>>=2;
1284 getTrail:
1285             for(;;) {
1286                 if(source>=sourceLimit) {
                         /* input ends inside a character: diff, count and bytes
                          * are written back below for the next call */
1287                     goto endloop;
1288                 }
1289                 c=bytes[byteIndex++]=*source++;
1290
1291                 /* trail byte in any position */
1292                 c=decodeBocu1TrailByte(count, c);
1293                 if(c<0) {
1294                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1295                     goto endloop;
1296                 }
1297
1298                 diff+=c;
1299                 if(--count==0) {
1300                     /* final trail byte, deliver a code point */
                         /*
                          * NOTE(review): byteIndex is reset before the range check,
                          * so on the error path below toULength is stored as 0 although
                          * bytes[] still holds the sequence - confirm this is intended.
                          */
1301                     byteIndex=0;
1302                     c=prev+diff;
1303                     if((uint32_t)c>0x10ffff) {
1304                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1305                         goto endloop;
1306                     }
1307                     break;
1308                 }
1309             }
1310         }
1311
1312         /* calculate the next prev and output c */
1313         prev=BOCU1_PREV(c);
1314         if(c<=0xffff) {
1315             *target++=(UChar)c;
1316         } else {
1317             /* output surrogate pair */
1318             *target++=UTF16_LEAD(c);
1319             if(target<targetLimit) {
1320                 *target++=UTF16_TRAIL(c);
1321             } else {
1322                 /* target overflow */
                     /* only the lead surrogate fit; keep the trail surrogate in
                      * the converter's UCharErrorBuffer */
1323                 cnv->UCharErrorBuffer[0]=UTF16_TRAIL(c);
1324                 cnv->UCharErrorBufferLength=1;
1325                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1326                 break;
1327             }
1328         }
1329     }
1330 endloop:
1331
1332     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1333         /* set the converter state in UConverter to deal with the next character */
1334         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1335         cnv->mode=0;
1336     } else {
1337         /* set the converter state back into UConverter */
             /*
              * After the fast loop diff and count hold loop bookkeeping rather than
              * decoder state; the resume check at the top only uses mode when
              * toULength>0.
              * NOTE(review): diff may be negative here (negative differences);
              * left-shifting a negative signed value is undefined behavior in C -
              * consider shifting the value as uint32_t.
              */
1338         cnv->toUnicodeStatus=(uint32_t)prev;
1339         cnv->mode=(diff<<2)|count;
1340     }
1341     cnv->toULength=byteIndex;
1342
1343     /* write back the updated pointers */
1344     pArgs->source=(const char *)source;
1345     pArgs->target=target;
1346     return;
1347 }
1348
1349 /* miscellaneous ------------------------------------------------------------ */
1350
     /*
      * Implementation table of the BOCU-1 converter.
      * This is a positional initializer: the meaning of each slot is fixed by the
      * declaration of UConverterImpl, which is not part of this file.
      * NOTE(review): the NULL slots are presumably optional hooks that this
      * converter does not need - confirm against the UConverterImpl declaration.
      */
1351 static const UConverterImpl _Bocu1Impl={
1352     UCNV_BOCU1,
1353
1354     NULL,
1355     NULL,
1356
1357     NULL,
1358     NULL,
1359     NULL,
1360
         /* the four conversion functions of this file */
1361     _Bocu1ToUnicode,
1362     _Bocu1ToUnicodeWithOffsets,
1363     _Bocu1FromUnicode,
1364     _Bocu1FromUnicodeWithOffsets,
1365     NULL,
1366
1367     NULL,
1368     NULL,
1369     NULL,
1370     NULL,
         /* generic helper defined outside this file */
1371     ucnv_getCompleteUnicodeSet
1372 };
1373
     /*
      * Static description of the BOCU-1 converter: its name, its type and
      * its byte-length limits.
      * Positional initializer; the field order is given by the declaration of
      * UConverterStaticData, which is not part of this file.
      */
1374 static const UConverterStaticData _Bocu1StaticData={
1375     sizeof(UConverterStaticData),
1376     "BOCU-1",
1377     0, /* CCSID for BOCU-1 */
1378     UCNV_IBM, UCNV_BOCU1,
1379     1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
1380     { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
1381     FALSE, FALSE,
1382     0,
1383     0,
1384     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1385 };
1386
     /*
      * Shared data object of the BOCU-1 converter; it ties _Bocu1StaticData and
      * _Bocu1Impl together. Not declared static, so it has external linkage.
      * NOTE(review): the second field is all-ones; presumably a reference counter
      * value marking a statically allocated object - confirm against the
      * declaration of UConverterSharedData.
      */
1387 const UConverterSharedData _Bocu1Data={
1388     sizeof(UConverterSharedData), ~((uint32_t)0),
1389     NULL, NULL, &_Bocu1StaticData, FALSE, &_Bocu1Impl,
1390     0
1391 };
1392
1393 #endif