icuSources/common/ucnvbocu.cpp

   1 /*
   2 ******************************************************************************
   3 *
   4 *   Copyright (C) 2002-2011, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 ******************************************************************************
   8 *   file name:  ucnvbocu.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2002mar27
  14 *   created by: Markus W. Scherer
  15 *
  16 *   This is an implementation of the Binary Ordered Compression for Unicode,
  17 *   in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
  18 */
  19
  20 #include "unicode/utypes.h"
  21
  22 #if !UCONFIG_NO_CONVERSION
  23
  24 #include "unicode/ucnv.h"
  25 #include "unicode/ucnv_cb.h"
  26 #include "unicode/utf16.h"
  27 #include "putilimp.h"
  28 #include "ucnv_bld.h"
  29 #include "ucnv_cnv.h"
  30 #include "uassert.h"
  31
  32 /* BOCU-1 constants and macros ---------------------------------------------- */
  33
  34 /*
  35  * BOCU-1 encodes the code points of a Unicode string as
  36  * a sequence of byte-encoded differences (slope detection),
  37  * preserving lexical order.
  38  *
  39  * Optimize the difference-taking for runs of Unicode text within
  40  * small scripts:
  41  *
  42  * Most small scripts are allocated within aligned 128-blocks of Unicode
  43  * code points. Lexical order is preserved if the "previous code point" state
  44  * is always moved into the middle of such a block.
  45  *
  46  * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
  47  * areas into the middle of those areas.
  48  *
  49  * C0 control codes and space are encoded with their US-ASCII bytes.
  50  * "prev" is reset for C0 controls but not for space.
  51  */
  52
  53 /* initial value for "prev": middle of the ASCII range */
  54 #define BOCU1_ASCII_PREV        0x40
  55
  56 /* bounding byte values for differences */
  57 #define BOCU1_MIN               0x21
  58 #define BOCU1_MIDDLE            0x90
  59 #define BOCU1_MAX_LEAD          0xfe
  60 #define BOCU1_MAX_TRAIL         0xff
  61 #define BOCU1_RESET             0xff
  62
  63 /* number of lead bytes */
  64 #define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)
  65
  66 /* adjust trail byte counts for the use of some C0 control byte values */
  67 #define BOCU1_TRAIL_CONTROLS_COUNT  20
  68 #define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)
  69
  70 /* number of trail bytes */
  71 #define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)
  72
  73 /*
  74  * number of positive and negative single-byte codes
  75  * (counting 0==BOCU1_MIDDLE among the positive ones)
  76  */
  77 #define BOCU1_SINGLE            64
  78
  79 /* number of lead bytes for positive and negative 2/3/4-byte sequences */
  80 #define BOCU1_LEAD_2            43
  81 #define BOCU1_LEAD_3            3
  82 #define BOCU1_LEAD_4            1
  83
  84 /* The difference value range for single-byters. */
  85 #define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
  86 #define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)
  87
  88 /* The difference value range for double-byters. */
  89 #define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
  90 #define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
  91
  92 /* The difference value range for 3-byters. */
  93 #define BOCU1_REACH_POS_3   \
  94     (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
  95
  96 #define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
  97
  98 /* The lead byte start values. */
  99 #define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
 100 #define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
 101 #define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
 102      /* ==BOCU1_MAX_LEAD */
 103
 104 #define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
 105 #define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
 106 #define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
 107      /* ==BOCU1_MIN+1 */
 108
 109 /* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */
 110 #define BOCU1_LENGTH_FROM_LEAD(lead) \
 111     ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
 112      (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
 113      (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)
 114
 115 /* The length of a byte sequence, according to its packed form. */
 116 #define BOCU1_LENGTH_FROM_PACKED(packed) \
 117     ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
 118
 119 /*
 120  * 12 commonly used C0 control codes (and space) are only used to encode
 121  * themselves directly,
 122  * which makes BOCU-1 MIME-usable and reasonably safe for
 123  * ASCII-oriented software.
 124  *
 125  * These controls are
 126  *  0   NUL
 127  *
 128  *  7   BEL
 129  *  8   BS
 130  *
 131  *  9   TAB
 132  *  a   LF
 133  *  b   VT
 134  *  c   FF
 135  *  d   CR
 136  *
 137  *  e   SO
 138  *  f   SI
 139  *
 140  * 1a   SUB
 141  * 1b   ESC
 142  *
 143  * The other 20 C0 controls are also encoded directly (to preserve order)
 144  * but are also used as trail bytes in difference encoding
 145  * (for better compression).
 146  */
 147 #define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
 148
 149 /*
 150  * Byte value map for control codes,
 151  * from external byte values 0x00..0x20
 152  * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
 153  * External byte values that are illegal as trail bytes are mapped to -1.
 154  */
 155 static const int8_t
 156 bocu1ByteToTrail[BOCU1_MIN]={
 157 /*  0     1     2     3     4     5     6     7    */
 158     -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
 159
 160 /*  8     9     a     b     c     d     e     f    */
 161     -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
 162
 163 /*  10    11    12    13    14    15    16    17   */
 164     0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
 165
 166 /*  18    19    1a    1b    1c    1d    1e    1f   */
 167     0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,
 168
 169 /*  20   */
 170     -1
 171 };
 172
 173 /*
 174  * Byte value map for control codes,
 175  * from trail byte values 0..19 (0..0x13) as used in the difference calculation
 176  * to external byte values 0x00..0x20.
 177  */
 178 static const int8_t
 179 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
 180 /*  0     1     2     3     4     5     6     7    */
 181     0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
 182
 183 /*  8     9     a     b     c     d     e     f    */
 184     0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
 185
 186 /*  10    11    12    13   */
 187     0x1c, 0x1d, 0x1e, 0x1f
 188 };
 189
 190 /**
 191  * Integer division and modulo with negative numerators
 192  * yields negative modulo results and quotients that are one more than
 193  * what we need here.
 194  * This macro adjust the results so that the modulo-value m is always >=0.
 195  *
 196  * For positive n, the if() condition is always FALSE.
 197  *
 198  * @param n Number to be split into quotient and rest.
 199  *          Will be modified to contain the quotient.
 200  * @param d Divisor.
 201  * @param m Output variable for the rest (modulo result).
 202  */
 203 #define NEGDIVMOD(n, d, m) { \
 204     (m)=(n)%(d); \
 205     (n)/=(d); \
 206     if((m)<0) { \
 207         --(n); \
 208         (m)+=(d); \
 209     } \
 210 }
 211
 212 /* Faster versions of packDiff() for single-byte-encoded diff values. */
 213
 214 /** Is a diff value encodable in a single byte? */
 215 #define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)
 216
 217 /** Encode a diff value in a single byte. */
 218 #define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))
 219
 220 /** Is a diff value encodable in two bytes? */
 221 #define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)
 222
 223 /* BOCU-1 implementation functions ------------------------------------------ */
 224
 225 #define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
 226
 227 /**
 228  * Compute the next "previous" value for differencing
 229  * from the current code point.
 230  *
 231  * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
 232  * @return "previous code point" state value
 233  */
 234 static inline int32_t
 235 bocu1Prev(int32_t c) {
 236     /* compute new prev */
 237     if(/* 0x3040<=c && */ c<=0x309f) {
 238         /* Hiragana is not 128-aligned */
 239         return 0x3070;
 240     } else if(0x4e00<=c && c<=0x9fa5) {
 241         /* CJK Unihan */
 242         return 0x4e00-BOCU1_REACH_NEG_2;
 243     } else if(0xac00<=c /* && c<=0xd7a3 */) {
 244         /* Korean Hangul */
 245         return (0xd7a3+0xac00)/2;
 246     } else {
 247         /* mostly small scripts */
 248         return BOCU1_SIMPLE_PREV(c);
 249     }
 250 }
 251
 252 /** Fast version of bocu1Prev() for most scripts. */
 253 #define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
 254
 255 /*
 256  * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
 257  * The UConverter fields are used as follows:
 258  *
 259  * fromUnicodeStatus    encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
 260  *
 261  * toUnicodeStatus      decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
 262  * mode                 decoder's incomplete (diff<<2)|count (ignored when toULength==0)
 263  */
 264
 265 /* BOCU-1-from-Unicode conversion functions --------------------------------- */
 266
 267 /**
 268  * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
 269  * and return a packed integer with them.
 270  *
 271  * The encoding favors small absolute differences with short encodings
 272  * to compress runs of same-script characters.
 273  *
 274  * Optimized version with unrolled loops and fewer floating-point operations
 275  * than the standard packDiff().
 276  *
 277  * @param diff difference value -0x10ffff..0x10ffff
 278  * @return
 279  *      0x010000zz for 1-byte sequence zz
 280  *      0x0200yyzz for 2-byte sequence yy zz
 281  *      0x03xxyyzz for 3-byte sequence xx yy zz
 282  *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
 283  */
 284 static int32_t
 285 packDiff(int32_t diff) {
 286     int32_t result, m;
 287
 288     U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */
 289     if(diff>=BOCU1_REACH_NEG_1) {
 290         /* mostly positive differences, and single-byte negative ones */
 291 #if 0   /* single-byte case handled in macros, see below */
 292         if(diff<=BOCU1_REACH_POS_1) {
 293             /* single byte */
 294             return 0x01000000|(BOCU1_MIDDLE+diff);
 295         } else
 296 #endif
 297         if(diff<=BOCU1_REACH_POS_2) {
 298             /* two bytes */
 299             diff-=BOCU1_REACH_POS_1+1;
 300             result=0x02000000;
 301
 302             m=diff%BOCU1_TRAIL_COUNT;
 303             diff/=BOCU1_TRAIL_COUNT;
 304             result|=BOCU1_TRAIL_TO_BYTE(m);
 305
 306             result|=(BOCU1_START_POS_2+diff)<<8;
 307         } else if(diff<=BOCU1_REACH_POS_3) {
 308             /* three bytes */
 309             diff-=BOCU1_REACH_POS_2+1;
 310             result=0x03000000;
 311
 312             m=diff%BOCU1_TRAIL_COUNT;
 313             diff/=BOCU1_TRAIL_COUNT;
 314             result|=BOCU1_TRAIL_TO_BYTE(m);
 315
 316             m=diff%BOCU1_TRAIL_COUNT;
 317             diff/=BOCU1_TRAIL_COUNT;
 318             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
 319
 320             result|=(BOCU1_START_POS_3+diff)<<16;
 321         } else {
 322             /* four bytes */
 323             diff-=BOCU1_REACH_POS_3+1;
 324
 325             m=diff%BOCU1_TRAIL_COUNT;
 326             diff/=BOCU1_TRAIL_COUNT;
 327             result=BOCU1_TRAIL_TO_BYTE(m);
 328
 329             m=diff%BOCU1_TRAIL_COUNT;
 330             diff/=BOCU1_TRAIL_COUNT;
 331             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
 332
 333             /*
 334              * We know that / and % would deliver quotient 0 and rest=diff.
 335              * Avoid division and modulo for performance.
 336              */
 337             result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
 338
 339             result|=((uint32_t)BOCU1_START_POS_4)<<24;
 340         }
 341     } else {
 342         /* two- to four-byte negative differences */
 343         if(diff>=BOCU1_REACH_NEG_2) {
 344             /* two bytes */
 345             diff-=BOCU1_REACH_NEG_1;
 346             result=0x02000000;
 347
 348             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
 349             result|=BOCU1_TRAIL_TO_BYTE(m);
 350
 351             result|=(BOCU1_START_NEG_2+diff)<<8;
 352         } else if(diff>=BOCU1_REACH_NEG_3) {
 353             /* three bytes */
 354             diff-=BOCU1_REACH_NEG_2;
 355             result=0x03000000;
 356
 357             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
 358             result|=BOCU1_TRAIL_TO_BYTE(m);
 359
 360             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
 361             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
 362
 363             result|=(BOCU1_START_NEG_3+diff)<<16;
 364         } else {
 365             /* four bytes */
 366             diff-=BOCU1_REACH_NEG_3;
 367
 368             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
 369             result=BOCU1_TRAIL_TO_BYTE(m);
 370
 371             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
 372             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
 373
 374             /*
 375              * We know that NEGDIVMOD would deliver
 376              * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
 377              * Avoid division and modulo for performance.
 378              */
 379             m=diff+BOCU1_TRAIL_COUNT;
 380             result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
 381
 382             result|=BOCU1_MIN<<24;
 383         }
 384     }
 385     return result;
 386 }
 387
 388
 389 static void
 390 _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
 391                              UErrorCode *pErrorCode) {
 392     UConverter *cnv;
 393     const UChar *source, *sourceLimit;
 394     uint8_t *target;
 395     int32_t targetCapacity;
 396     int32_t *offsets;
 397
 398     int32_t prev, c, diff;
 399
 400     int32_t sourceIndex, nextSourceIndex;
 401
 402 U_ALIGN_CODE(16)
 403
 404     /* set up the local pointers */
 405     cnv=pArgs->converter;
 406     source=pArgs->source;
 407     sourceLimit=pArgs->sourceLimit;
 408     target=(uint8_t *)pArgs->target;
 409     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
 410     offsets=pArgs->offsets;
 411
 412     /* get the converter state from UConverter */
 413     c=cnv->fromUChar32;
 414     prev=(int32_t)cnv->fromUnicodeStatus;
 415     if(prev==0) {
 416         prev=BOCU1_ASCII_PREV;
 417     }
 418
 419     /* sourceIndex=-1 if the current character began in the previous buffer */
 420     sourceIndex= c==0 ? 0 : -1;
 421     nextSourceIndex=0;
 422
 423     /* conversion loop */
 424     if(c!=0 && targetCapacity>0) {
 425         goto getTrail;
 426     }
 427
 428 fastSingle:
 429     /* fast loop for single-byte differences */
 430     /* use only one loop counter variable, targetCapacity, not also source */
 431     diff=(int32_t)(sourceLimit-source);
 432     if(targetCapacity>diff) {
 433         targetCapacity=diff;
 434     }
 435     while(targetCapacity>0 && (c=*source)<0x3000) {
 436         if(c<=0x20) {
 437             if(c!=0x20) {
 438                 prev=BOCU1_ASCII_PREV;
 439             }
 440             *target++=(uint8_t)c;
 441             *offsets++=nextSourceIndex++;
 442             ++source;
 443             --targetCapacity;
 444         } else {
 445             diff=c-prev;
 446             if(DIFF_IS_SINGLE(diff)) {
 447                 prev=BOCU1_SIMPLE_PREV(c);
 448                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
 449                 *offsets++=nextSourceIndex++;
 450                 ++source;
 451                 --targetCapacity;
 452             } else {
 453                 break;
 454             }
 455         }
 456     }
 457     /* restore real values */
 458     targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
 459     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
 460
 461     /* regular loop for all cases */
 462     while(source<sourceLimit) {
 463         if(targetCapacity>0) {
 464             c=*source++;
 465             ++nextSourceIndex;
 466
 467             if(c<=0x20) {
 468                 /*
 469                  * ISO C0 control & space:
 470                  * Encode directly for MIME compatibility,
 471                  * and reset state except for space, to not disrupt compression.
 472                  */
 473                 if(c!=0x20) {
 474                     prev=BOCU1_ASCII_PREV;
 475                 }
 476                 *target++=(uint8_t)c;
 477                 *offsets++=sourceIndex;
 478                 --targetCapacity;
 479
 480                 sourceIndex=nextSourceIndex;
 481                 continue;
 482             }
 483
 484             if(U16_IS_LEAD(c)) {
 485 getTrail:
 486                 if(source<sourceLimit) {
 487                     /* test the following code unit */
 488                     UChar trail=*source;
 489                     if(U16_IS_TRAIL(trail)) {
 490                         ++source;
 491                         ++nextSourceIndex;
 492                         c=U16_GET_SUPPLEMENTARY(c, trail);
 493                     }
 494                 } else {
 495                     /* no more input */
 496                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
 497                     break;
 498                 }
 499             }
 500
 501             /*
 502              * all other Unicode code points c==U+0021..U+10ffff
 503              * are encoded with the difference c-prev
 504              *
 505              * a new prev is computed from c,
 506              * placed in the middle of a 0x80-block (for most small scripts) or
 507              * in the middle of the Unihan and Hangul blocks
 508              * to statistically minimize the following difference
 509              */
 510             diff=c-prev;
 511             prev=BOCU1_PREV(c);
 512             if(DIFF_IS_SINGLE(diff)) {
 513                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
 514                 *offsets++=sourceIndex;
 515                 --targetCapacity;
 516                 sourceIndex=nextSourceIndex;
 517                 if(c<0x3000) {
 518                     goto fastSingle;
 519                 }
 520             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
 521                 /* optimize 2-byte case */
 522                 int32_t m;
 523
 524                 if(diff>=0) {
 525                     diff-=BOCU1_REACH_POS_1+1;
 526                     m=diff%BOCU1_TRAIL_COUNT;
 527                     diff/=BOCU1_TRAIL_COUNT;
 528                     diff+=BOCU1_START_POS_2;
 529                 } else {
 530                     diff-=BOCU1_REACH_NEG_1;
 531                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
 532                     diff+=BOCU1_START_NEG_2;
 533                 }
 534                 *target++=(uint8_t)diff;
 535                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
 536                 *offsets++=sourceIndex;
 537                 *offsets++=sourceIndex;
 538                 targetCapacity-=2;
 539                 sourceIndex=nextSourceIndex;
 540             } else {
 541                 int32_t length; /* will be 2..4 */
 542
 543                 diff=packDiff(diff);
 544                 length=BOCU1_LENGTH_FROM_PACKED(diff);
 545
 546                 /* write the output character bytes from diff and length */
 547                 /* from the first if in the loop we know that targetCapacity>0 */
 548                 if(length<=targetCapacity) {
 549                     switch(length) {
 550                         /* each branch falls through to the next one */
 551                     case 4:
 552                         *target++=(uint8_t)(diff>>24);
 553                         *offsets++=sourceIndex;
 554                     case 3: /*fall through*/
 555                         *target++=(uint8_t)(diff>>16);
 556                         *offsets++=sourceIndex;
 557                     case 2: /*fall through*/
 558                         *target++=(uint8_t)(diff>>8);
 559                         *offsets++=sourceIndex;
 560                     /* case 1: handled above */
 561                         *target++=(uint8_t)diff;
 562                         *offsets++=sourceIndex;
 563                     default:
 564                         /* will never occur */
 565                         break;
 566                     }
 567                     targetCapacity-=length;
 568                     sourceIndex=nextSourceIndex;
 569                 } else {
 570                     uint8_t *charErrorBuffer;
 571
 572                     /*
 573                      * We actually do this backwards here:
 574                      * In order to save an intermediate variable, we output
 575                      * first to the overflow buffer what does not fit into the
 576                      * regular target.
 577                      */
 578                     /* we know that 1<=targetCapacity<length<=4 */
 579                     length-=targetCapacity;
 580                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
 581                     switch(length) {
 582                         /* each branch falls through to the next one */
 583                     case 3:
 584                         *charErrorBuffer++=(uint8_t)(diff>>16);
 585                     case 2: /*fall through*/
 586                         *charErrorBuffer++=(uint8_t)(diff>>8);
 587                     case 1: /*fall through*/
 588                         *charErrorBuffer=(uint8_t)diff;
 589                     default:
 590                         /* will never occur */
 591                         break;
 592                     }
 593                     cnv->charErrorBufferLength=(int8_t)length;
 594
 595                     /* now output what fits into the regular target */
 596                     diff>>=8*length; /* length was reduced by targetCapacity */
 597                     switch(targetCapacity) {
 598                         /* each branch falls through to the next one */
 599                     case 3:
 600                         *target++=(uint8_t)(diff>>16);
 601                         *offsets++=sourceIndex;
 602                     case 2: /*fall through*/
 603                         *target++=(uint8_t)(diff>>8);
 604                         *offsets++=sourceIndex;
 605                     case 1: /*fall through*/
 606                         *target++=(uint8_t)diff;
 607                         *offsets++=sourceIndex;
 608                     default:
 609                         /* will never occur */
 610                         break;
 611                     }
 612
 613                     /* target overflow */
 614                     targetCapacity=0;
 615                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 616                     break;
 617                 }
 618             }
 619         } else {
 620             /* target is full */
 621             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 622             break;
 623         }
 624     }
 625
 626     /* set the converter state back into UConverter */
 627     cnv->fromUChar32= c<0 ? -c : 0;
 628     cnv->fromUnicodeStatus=(uint32_t)prev;
 629
 630     /* write back the updated pointers */
 631     pArgs->source=source;
 632     pArgs->target=(char *)target;
 633     pArgs->offsets=offsets;
 634 }
 635
 636 /*
 637  * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
 638  * If a change is made in the original function, then either
 639  * change this function the same way or
 640  * re-copy the original function and remove the variables
 641  * offsets, sourceIndex, and nextSourceIndex.
 642  */
 643 static void
 644 _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
 645                   UErrorCode *pErrorCode) {
 646     UConverter *cnv;
 647     const UChar *source, *sourceLimit;
 648     uint8_t *target;
 649     int32_t targetCapacity;
 650
 651     int32_t prev, c, diff;
 652
 653     /* set up the local pointers */
 654     cnv=pArgs->converter;
 655     source=pArgs->source;
 656     sourceLimit=pArgs->sourceLimit;
 657     target=(uint8_t *)pArgs->target;
 658     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
 659
 660     /* get the converter state from UConverter */
 661     c=cnv->fromUChar32;
 662     prev=(int32_t)cnv->fromUnicodeStatus;
 663     if(prev==0) {
 664         prev=BOCU1_ASCII_PREV;
 665     }
 666
 667     /* conversion loop */
 668     if(c!=0 && targetCapacity>0) {
 669         goto getTrail;
 670     }
 671
 672 fastSingle:
 673     /* fast loop for single-byte differences */
 674     /* use only one loop counter variable, targetCapacity, not also source */
 675     diff=(int32_t)(sourceLimit-source);
 676     if(targetCapacity>diff) {
 677         targetCapacity=diff;
 678     }
 679     while(targetCapacity>0 && (c=*source)<0x3000) {
 680         if(c<=0x20) {
 681             if(c!=0x20) {
 682                 prev=BOCU1_ASCII_PREV;
 683             }
 684             *target++=(uint8_t)c;
 685         } else {
 686             diff=c-prev;
 687             if(DIFF_IS_SINGLE(diff)) {
 688                 prev=BOCU1_SIMPLE_PREV(c);
 689                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
 690             } else {
 691                 break;
 692             }
 693         }
 694         ++source;
 695         --targetCapacity;
 696     }
 697     /* restore real values */
 698     targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
 699
 700     /* regular loop for all cases */
 701     while(source<sourceLimit) {
 702         if(targetCapacity>0) {
 703             c=*source++;
 704
 705             if(c<=0x20) {
 706                 /*
 707                  * ISO C0 control & space:
 708                  * Encode directly for MIME compatibility,
 709                  * and reset state except for space, to not disrupt compression.
 710                  */
 711                 if(c!=0x20) {
 712                     prev=BOCU1_ASCII_PREV;
 713                 }
 714                 *target++=(uint8_t)c;
 715                 --targetCapacity;
 716                 continue;
 717             }
 718
 719             if(U16_IS_LEAD(c)) {
 720 getTrail:
 721                 if(source<sourceLimit) {
 722                     /* test the following code unit */
 723                     UChar trail=*source;
 724                     if(U16_IS_TRAIL(trail)) {
 725                         ++source;
 726                         c=U16_GET_SUPPLEMENTARY(c, trail);
 727                     }
 728                 } else {
 729                     /* no more input */
 730                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
 731                     break;
 732                 }
 733             }
 734
 735             /*
 736              * all other Unicode code points c==U+0021..U+10ffff
 737              * are encoded with the difference c-prev
 738              *
 739              * a new prev is computed from c,
 740              * placed in the middle of a 0x80-block (for most small scripts) or
 741              * in the middle of the Unihan and Hangul blocks
 742              * to statistically minimize the following difference
 743              */
 744             diff=c-prev;
 745             prev=BOCU1_PREV(c);
 746             if(DIFF_IS_SINGLE(diff)) {
 747                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
 748                 --targetCapacity;
 749                 if(c<0x3000) {
 750                     goto fastSingle;
 751                 }
 752             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
 753                 /* optimize 2-byte case */
 754                 int32_t m;
 755
 756                 if(diff>=0) {
 757                     diff-=BOCU1_REACH_POS_1+1;
 758                     m=diff%BOCU1_TRAIL_COUNT;
 759                     diff/=BOCU1_TRAIL_COUNT;
 760                     diff+=BOCU1_START_POS_2;
 761                 } else {
 762                     diff-=BOCU1_REACH_NEG_1;
 763                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
 764                     diff+=BOCU1_START_NEG_2;
 765                 }
 766                 *target++=(uint8_t)diff;
 767                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
 768                 targetCapacity-=2;
 769             } else {
 770                 int32_t length; /* will be 2..4 */
 771
 772                 diff=packDiff(diff);
 773                 length=BOCU1_LENGTH_FROM_PACKED(diff);
 774
 775                 /* write the output character bytes from diff and length */
 776                 /* from the first if in the loop we know that targetCapacity>0 */
 777                 if(length<=targetCapacity) {
 778                     switch(length) {
 779                         /* each branch falls through to the next one */
 780                     case 4:
 781                         *target++=(uint8_t)(diff>>24);
 782                     case 3: /*fall through*/
 783                         *target++=(uint8_t)(diff>>16);
 784                     /* case 2: handled above */
 785                         *target++=(uint8_t)(diff>>8);
 786                     /* case 1: handled above */
 787                         *target++=(uint8_t)diff;
 788                     default:
 789                         /* will never occur */
 790                         break;
 791                     }
 792                     targetCapacity-=length;
 793                 } else {
 794                     uint8_t *charErrorBuffer;
 795
 796                     /*
 797                      * We actually do this backwards here:
 798                      * In order to save an intermediate variable, we output
 799                      * first to the overflow buffer what does not fit into the
 800                      * regular target.
 801                      */
 802                     /* we know that 1<=targetCapacity<length<=4 */
 803                     length-=targetCapacity;
 804                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
 805                     switch(length) {
 806                         /* each branch falls through to the next one */
 807                     case 3:
 808                         *charErrorBuffer++=(uint8_t)(diff>>16);
 809                     case 2: /*fall through*/
 810                         *charErrorBuffer++=(uint8_t)(diff>>8);
 811                     case 1: /*fall through*/
 812                         *charErrorBuffer=(uint8_t)diff;
 813                     default:
 814                         /* will never occur */
 815                         break;
 816                     }
 817                     cnv->charErrorBufferLength=(int8_t)length;
 818
 819                     /* now output what fits into the regular target */
 820                     diff>>=8*length; /* length was reduced by targetCapacity */
 821                     switch(targetCapacity) {
 822                         /* each branch falls through to the next one */
 823                     case 3:
 824                         *target++=(uint8_t)(diff>>16);
 825                     case 2: /*fall through*/
 826                         *target++=(uint8_t)(diff>>8);
 827                     case 1: /*fall through*/
 828                         *target++=(uint8_t)diff;
 829                     default:
 830                         /* will never occur */
 831                         break;
 832                     }
 833
 834                     /* target overflow */
 835                     targetCapacity=0;
 836                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 837                     break;
 838                 }
 839             }
 840         } else {
 841             /* target is full */
 842             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 843             break;
 844         }
 845     }
 846
 847     /* set the converter state back into UConverter */
 848     cnv->fromUChar32= c<0 ? -c : 0;
 849     cnv->fromUnicodeStatus=(uint32_t)prev;
 850
 851     /* write back the updated pointers */
 852     pArgs->source=source;
 853     pArgs->target=(char *)target;
 854 }
 855
 856 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
 857
 858 /**
 859  * Function for BOCU-1 decoder; handles multi-byte lead bytes.
 860  *
 861  * @param b lead byte;
 862  *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
 863  * @return (diff<<2)|count
 864  */
 865 static inline int32_t
 866 decodeBocu1LeadByte(int32_t b) {
 867     int32_t diff, count;
 868
 869     if(b>=BOCU1_START_NEG_2) {
 870         /* positive difference */
 871         if(b<BOCU1_START_POS_3) {
 872             /* two bytes */
 873             diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
 874             count=1;
 875         } else if(b<BOCU1_START_POS_4) {
 876             /* three bytes */
 877             diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
 878             count=2;
 879         } else {
 880             /* four bytes */
 881             diff=BOCU1_REACH_POS_3+1;
 882             count=3;
 883         }
 884     } else {
 885         /* negative difference */
 886         if(b>=BOCU1_START_NEG_3) {
 887             /* two bytes */
 888             diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
 889             count=1;
 890         } else if(b>BOCU1_MIN) {
 891             /* three bytes */
 892             diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
 893             count=2;
 894         } else {
 895             /* four bytes */
 896             diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
 897             count=3;
 898         }
 899     }
 900
 901     /* return the state for decoding the trail byte(s) */
 902     return (diff<<2)|count;
 903 }
 904
 905 /**
 906  * Function for BOCU-1 decoder; handles multi-byte trail bytes.
 907  *
 908  * @param count number of remaining trail bytes including this one
 909  * @param b trail byte
 910  * @return new delta for diff including b - <0 indicates an error
 911  *
 912  * @see decodeBocu1
 913  */
 914 static inline int32_t
 915 decodeBocu1TrailByte(int32_t count, int32_t b) {
 916     if(b<=0x20) {
 917         /* skip some C0 controls and make the trail byte range contiguous */
 918         b=bocu1ByteToTrail[b];
 919         /* b<0 for an illegal trail byte value will result in return<0 below */
 920 #if BOCU1_MAX_TRAIL<0xff
 921     } else if(b>BOCU1_MAX_TRAIL) {
 922         return -99;
 923 #endif
 924     } else {
 925         b-=BOCU1_TRAIL_BYTE_OFFSET;
 926     }
 927
 928     /* add trail byte into difference and decrement count */
 929     if(count==1) {
 930         return b;
 931     } else if(count==2) {
 932         return b*BOCU1_TRAIL_COUNT;
 933     } else /* count==3 */ {
 934         return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
 935     }
 936 }
 937
 938 static void
 939 _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
 940                            UErrorCode *pErrorCode) {
 941     UConverter *cnv;
 942     const uint8_t *source, *sourceLimit;
 943     UChar *target;
 944     const UChar *targetLimit;
 945     int32_t *offsets;
 946
 947     int32_t prev, count, diff, c;
 948
 949     int8_t byteIndex;
 950     uint8_t *bytes;
 951
 952     int32_t sourceIndex, nextSourceIndex;
 953
 954     /* set up the local pointers */
 955     cnv=pArgs->converter;
 956     source=(const uint8_t *)pArgs->source;
 957     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
 958     target=pArgs->target;
 959     targetLimit=pArgs->targetLimit;
 960     offsets=pArgs->offsets;
 961
 962     /* get the converter state from UConverter */
 963     prev=(int32_t)cnv->toUnicodeStatus;
 964     if(prev==0) {
 965         prev=BOCU1_ASCII_PREV;
 966     }
 967     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
 968     count=diff&3;
 969     diff>>=2;
 970
 971     byteIndex=cnv->toULength;
 972     bytes=cnv->toUBytes;
 973
 974     /* sourceIndex=-1 if the current character began in the previous buffer */
 975     sourceIndex=byteIndex==0 ? 0 : -1;
 976     nextSourceIndex=0;
 977
 978     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
 979     if(count>0 && byteIndex>0 && target<targetLimit) {
 980         goto getTrail;
 981     }
 982
 983 fastSingle:
 984     /* fast loop for single-byte differences */
 985     /* use count as the only loop counter variable */
 986     diff=(int32_t)(sourceLimit-source);
 987     count=(int32_t)(pArgs->targetLimit-target);
 988     if(count>diff) {
 989         count=diff;
 990     }
 991     while(count>0) {
 992         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
 993             c=prev+(c-BOCU1_MIDDLE);
 994             if(c<0x3000) {
 995                 *target++=(UChar)c;
 996                 *offsets++=nextSourceIndex++;
 997                 prev=BOCU1_SIMPLE_PREV(c);
 998             } else {
 999                 break;
1000             }
1001         } else if(c<=0x20) {
1002             if(c!=0x20) {
1003                 prev=BOCU1_ASCII_PREV;
1004             }
1005             *target++=(UChar)c;
1006             *offsets++=nextSourceIndex++;
1007         } else {
1008             break;
1009         }
1010         ++source;
1011         --count;
1012     }
1013     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
1014
1015     /* decode a sequence of single and lead bytes */
1016     while(source<sourceLimit) {
1017         if(target>=targetLimit) {
1018             /* target is full */
1019             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1020             break;
1021         }
1022
1023         ++nextSourceIndex;
1024         c=*source++;
1025         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1026             /* Write a code point directly from a single-byte difference. */
1027             c=prev+(c-BOCU1_MIDDLE);
1028             if(c<0x3000) {
1029                 *target++=(UChar)c;
1030                 *offsets++=sourceIndex;
1031                 prev=BOCU1_SIMPLE_PREV(c);
1032                 sourceIndex=nextSourceIndex;
1033                 goto fastSingle;
1034             }
1035         } else if(c<=0x20) {
1036             /*
1037              * Direct-encoded C0 control code or space.
1038              * Reset prev for C0 control codes but not for space.
1039              */
1040             if(c!=0x20) {
1041                 prev=BOCU1_ASCII_PREV;
1042             }
1043             *target++=(UChar)c;
1044             *offsets++=sourceIndex;
1045             sourceIndex=nextSourceIndex;
1046             continue;
1047         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1048             /* Optimize two-byte case. */
1049             if(c>=BOCU1_MIDDLE) {
1050                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1051             } else {
1052                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1053             }
1054
1055             /* trail byte */
1056             ++nextSourceIndex;
1057             c=decodeBocu1TrailByte(1, *source++);
1058             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1059                 bytes[0]=source[-2];
1060                 bytes[1]=source[-1];
1061                 byteIndex=2;
1062                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1063                 break;
1064             }
1065         } else if(c==BOCU1_RESET) {
1066             /* only reset the state, no code point */
1067             prev=BOCU1_ASCII_PREV;
1068             sourceIndex=nextSourceIndex;
1069             continue;
1070         } else {
1071             /*
1072              * For multi-byte difference lead bytes, set the decoder state
1073              * with the partial difference value from the lead byte and
1074              * with the number of trail bytes.
1075              */
1076             bytes[0]=(uint8_t)c;
1077             byteIndex=1;
1078
1079             diff=decodeBocu1LeadByte(c);
1080             count=diff&3;
1081             diff>>=2;
1082 getTrail:
1083             for(;;) {
1084                 if(source>=sourceLimit) {
1085                     goto endloop;
1086                 }
1087                 ++nextSourceIndex;
1088                 c=bytes[byteIndex++]=*source++;
1089
1090                 /* trail byte in any position */
1091                 c=decodeBocu1TrailByte(count, c);
1092                 if(c<0) {
1093                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1094                     goto endloop;
1095                 }
1096
1097                 diff+=c;
1098                 if(--count==0) {
1099                     /* final trail byte, deliver a code point */
1100                     byteIndex=0;
1101                     c=prev+diff;
1102                     if((uint32_t)c>0x10ffff) {
1103                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1104                         goto endloop;
1105                     }
1106                     break;
1107                 }
1108             }
1109         }
1110
1111         /* calculate the next prev and output c */
1112         prev=BOCU1_PREV(c);
1113         if(c<=0xffff) {
1114             *target++=(UChar)c;
1115             *offsets++=sourceIndex;
1116         } else {
1117             /* output surrogate pair */
1118             *target++=U16_LEAD(c);
1119             if(target<targetLimit) {
1120                 *target++=U16_TRAIL(c);
1121                 *offsets++=sourceIndex;
1122                 *offsets++=sourceIndex;
1123             } else {
1124                 /* target overflow */
1125                 *offsets++=sourceIndex;
1126                 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
1127                 cnv->UCharErrorBufferLength=1;
1128                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1129                 break;
1130             }
1131         }
1132         sourceIndex=nextSourceIndex;
1133     }
1134 endloop:
1135
1136     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1137         /* set the converter state in UConverter to deal with the next character */
1138         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1139         cnv->mode=0;
1140     } else {
1141         /* set the converter state back into UConverter */
1142         cnv->toUnicodeStatus=(uint32_t)prev;
1143         cnv->mode=(diff<<2)|count;
1144     }
1145     cnv->toULength=byteIndex;
1146
1147     /* write back the updated pointers */
1148     pArgs->source=(const char *)source;
1149     pArgs->target=target;
1150     pArgs->offsets=offsets;
1151     return;
1152 }
1153
1154 /*
1155  * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
1156  * If a change is made in the original function, then either
1157  * change this function the same way or
1158  * re-copy the original function and remove the variables
1159  * offsets, sourceIndex, and nextSourceIndex.
1160  */
1161 static void
1162 _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
1163                 UErrorCode *pErrorCode) {
1164     UConverter *cnv;
1165     const uint8_t *source, *sourceLimit;
1166     UChar *target;
1167     const UChar *targetLimit;
1168
1169     int32_t prev, count, diff, c;
1170
1171     int8_t byteIndex;
1172     uint8_t *bytes;
1173
1174 U_ALIGN_CODE(16)
1175
1176     /* set up the local pointers */
1177     cnv=pArgs->converter;
1178     source=(const uint8_t *)pArgs->source;
1179     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1180     target=pArgs->target;
1181     targetLimit=pArgs->targetLimit;
1182
1183     /* get the converter state from UConverter */
1184     prev=(int32_t)cnv->toUnicodeStatus;
1185     if(prev==0) {
1186         prev=BOCU1_ASCII_PREV;
1187     }
1188     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
1189     count=diff&3;
1190     diff>>=2;
1191
1192     byteIndex=cnv->toULength;
1193     bytes=cnv->toUBytes;
1194
1195     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
1196     if(count>0 && byteIndex>0 && target<targetLimit) {
1197         goto getTrail;
1198     }
1199
1200 fastSingle:
1201     /* fast loop for single-byte differences */
1202     /* use count as the only loop counter variable */
1203     diff=(int32_t)(sourceLimit-source);
1204     count=(int32_t)(pArgs->targetLimit-target);
1205     if(count>diff) {
1206         count=diff;
1207     }
1208     while(count>0) {
1209         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
1210             c=prev+(c-BOCU1_MIDDLE);
1211             if(c<0x3000) {
1212                 *target++=(UChar)c;
1213                 prev=BOCU1_SIMPLE_PREV(c);
1214             } else {
1215                 break;
1216             }
1217         } else if(c<=0x20) {
1218             if(c!=0x20) {
1219                 prev=BOCU1_ASCII_PREV;
1220             }
1221             *target++=(UChar)c;
1222         } else {
1223             break;
1224         }
1225         ++source;
1226         --count;
1227     }
1228
1229     /* decode a sequence of single and lead bytes */
1230     while(source<sourceLimit) {
1231         if(target>=targetLimit) {
1232             /* target is full */
1233             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1234             break;
1235         }
1236
1237         c=*source++;
1238         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1239             /* Write a code point directly from a single-byte difference. */
1240             c=prev+(c-BOCU1_MIDDLE);
1241             if(c<0x3000) {
1242                 *target++=(UChar)c;
1243                 prev=BOCU1_SIMPLE_PREV(c);
1244                 goto fastSingle;
1245             }
1246         } else if(c<=0x20) {
1247             /*
1248              * Direct-encoded C0 control code or space.
1249              * Reset prev for C0 control codes but not for space.
1250              */
1251             if(c!=0x20) {
1252                 prev=BOCU1_ASCII_PREV;
1253             }
1254             *target++=(UChar)c;
1255             continue;
1256         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1257             /* Optimize two-byte case. */
1258             if(c>=BOCU1_MIDDLE) {
1259                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1260             } else {
1261                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1262             }
1263
1264             /* trail byte */
1265             c=decodeBocu1TrailByte(1, *source++);
1266             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1267                 bytes[0]=source[-2];
1268                 bytes[1]=source[-1];
1269                 byteIndex=2;
1270                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1271                 break;
1272             }
1273         } else if(c==BOCU1_RESET) {
1274             /* only reset the state, no code point */
1275             prev=BOCU1_ASCII_PREV;
1276             continue;
1277         } else {
1278             /*
1279              * For multi-byte difference lead bytes, set the decoder state
1280              * with the partial difference value from the lead byte and
1281              * with the number of trail bytes.
1282              */
1283             bytes[0]=(uint8_t)c;
1284             byteIndex=1;
1285
1286             diff=decodeBocu1LeadByte(c);
1287             count=diff&3;
1288             diff>>=2;
1289 getTrail:
1290             for(;;) {
1291                 if(source>=sourceLimit) {
1292                     goto endloop;
1293                 }
1294                 c=bytes[byteIndex++]=*source++;
1295
1296                 /* trail byte in any position */
1297                 c=decodeBocu1TrailByte(count, c);
1298                 if(c<0) {
1299                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1300                     goto endloop;
1301                 }
1302
1303                 diff+=c;
1304                 if(--count==0) {
1305                     /* final trail byte, deliver a code point */
1306                     byteIndex=0;
1307                     c=prev+diff;
1308                     if((uint32_t)c>0x10ffff) {
1309                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1310                         goto endloop;
1311                     }
1312                     break;
1313                 }
1314             }
1315         }
1316
1317         /* calculate the next prev and output c */
1318         prev=BOCU1_PREV(c);
1319         if(c<=0xffff) {
1320             *target++=(UChar)c;
1321         } else {
1322             /* output surrogate pair */
1323             *target++=U16_LEAD(c);
1324             if(target<targetLimit) {
1325                 *target++=U16_TRAIL(c);
1326             } else {
1327                 /* target overflow */
1328                 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
1329                 cnv->UCharErrorBufferLength=1;
1330                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1331                 break;
1332             }
1333         }
1334     }
1335 endloop:
1336
1337     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1338         /* set the converter state in UConverter to deal with the next character */
1339         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1340         cnv->mode=0;
1341     } else {
1342         /* set the converter state back into UConverter */
1343         cnv->toUnicodeStatus=(uint32_t)prev;
1344         cnv->mode=(diff<<2)|count;
1345     }
1346     cnv->toULength=byteIndex;
1347
1348     /* write back the updated pointers */
1349     pArgs->source=(const char *)source;
1350     pArgs->target=target;
1351     return;
1352 }
1353
1354 /* miscellaneous ------------------------------------------------------------ */
1355
1356 static const UConverterImpl _Bocu1Impl={
1357     UCNV_BOCU1,
1358
1359     NULL,
1360     NULL,
1361
1362     NULL,
1363     NULL,
1364     NULL,
1365
1366     _Bocu1ToUnicode,
1367     _Bocu1ToUnicodeWithOffsets,
1368     _Bocu1FromUnicode,
1369     _Bocu1FromUnicodeWithOffsets,
1370     NULL,
1371
1372     NULL,
1373     NULL,
1374     NULL,
1375     NULL,
1376     ucnv_getCompleteUnicodeSet,
1377
1378     NULL,
1379     NULL
1380 };
1381
1382 static const UConverterStaticData _Bocu1StaticData={
1383     sizeof(UConverterStaticData),
1384     "BOCU-1",
1385     1214, /* CCSID for BOCU-1 */
1386     UCNV_IBM, UCNV_BOCU1,
1387     1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
1388     { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
1389     FALSE, FALSE,
1390     0,
1391     0,
1392     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1393 };
1394
1395 const UConverterSharedData _Bocu1Data={
1396     sizeof(UConverterSharedData), ~((uint32_t)0),
1397     NULL, NULL, &_Bocu1StaticData, FALSE, &_Bocu1Impl,
1398     0,
1399     UCNV_MBCS_TABLE_INITIALIZER
1400 };
1401
1402 #endif