icuSources/common/ucnvbocu.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 ******************************************************************************
   5 *
   6 *   Copyright (C) 2002-2016, International Business Machines
   7 *   Corporation and others.  All Rights Reserved.
   8 *
   9 ******************************************************************************
  10 *   file name:  ucnvbocu.cpp
  11 *   encoding:   UTF-8
  12 *   tab size:   8 (not used)
  13 *   indentation:4
  14 *
  15 *   created on: 2002mar27
  16 *   created by: Markus W. Scherer
  17 *
  18 *   This is an implementation of the Binary Ordered Compression for Unicode,
  19 *   in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
  20 */
  21
  22 #include "unicode/utypes.h"
  23
  24 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
  25
  26 #include "unicode/ucnv.h"
  27 #include "unicode/ucnv_cb.h"
  28 #include "unicode/utf16.h"
  29 #include "putilimp.h"
  30 #include "ucnv_bld.h"
  31 #include "ucnv_cnv.h"
  32 #include "uassert.h"
  33
  34 /* BOCU-1 constants and macros ---------------------------------------------- */
  35
  36 /*
  37  * BOCU-1 encodes the code points of a Unicode string as
  38  * a sequence of byte-encoded differences (slope detection),
  39  * preserving lexical order.
  40  *
  41  * Optimize the difference-taking for runs of Unicode text within
  42  * small scripts:
  43  *
  44  * Most small scripts are allocated within aligned 128-blocks of Unicode
  45  * code points. Lexical order is preserved if the "previous code point" state
  46  * is always moved into the middle of such a block.
  47  *
  48  * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
  49  * areas into the middle of those areas.
  50  *
  51  * C0 control codes and space are encoded with their US-ASCII bytes.
  52  * "prev" is reset for C0 controls but not for space.
  53  */
  54
/*
 * Byte-value layout constants for BOCU-1.
 * The values given in the comments below are the macro definitions
 * evaluated by hand; the macros themselves are the authoritative source.
 */

/* initial value for "prev": middle of the ASCII range */
#define BOCU1_ASCII_PREV        0x40

/*
 * bounding byte values for differences
 *
 * BOCU1_MIN..BOCU1_MAX_LEAD is the lead byte range and
 * BOCU1_MIN..BOCU1_MAX_TRAIL the range of non-control trail bytes
 * (see BOCU1_COUNT and BOCU1_TRAIL_COUNT below).
 * BOCU1_MIDDLE is the single byte that encodes a difference of 0
 * (see PACK_SINGLE_DIFF).
 * BOCU1_RESET has the same value as BOCU1_MAX_TRAIL and is not referenced by
 * the encoder functions in this part of the file -- presumably it is the
 * state-reset byte recognized by the decoder (TODO confirm).
 */
#define BOCU1_MIN               0x21
#define BOCU1_MIDDLE            0x90
#define BOCU1_MAX_LEAD          0xfe
#define BOCU1_MAX_TRAIL         0xff
#define BOCU1_RESET             0xff

/* number of lead bytes (0xfe-0x21+1 == 222) */
#define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/*
 * adjust trail byte counts for the use of some C0 control byte values
 *
 * Trail values 0..BOCU1_TRAIL_CONTROLS_COUNT-1 are mapped to control bytes
 * through a lookup table; higher trail values are mapped to bytes by adding
 * BOCU1_TRAIL_BYTE_OFFSET (0x21-20 == 13), see BOCU1_TRAIL_TO_BYTE.
 */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* number of trail bytes ((0xff-0x21+1)+20 == 243) */
#define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * number of positive and negative single-byte codes
 * (counting 0==BOCU1_MIDDLE among the positive ones)
 */
#define BOCU1_SINGLE            64

/*
 * number of lead bytes for positive and negative 2/3/4-byte sequences
 * (2*(64+43+3+1) == 222 == BOCU1_COUNT)
 */
#define BOCU1_LEAD_2            43
#define BOCU1_LEAD_3            3
#define BOCU1_LEAD_4            1

/* The difference value range for single-byters (-64..63). */
#define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)

/* The difference value range for double-byters (-10513..10512). */
#define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* The difference value range for 3-byters (-187660..187659). */
#define BOCU1_REACH_POS_3   \
    (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

#define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/*
 * The lead byte start values.
 *
 * For positive differences BOCU1_START_POS_n is the first (lowest) lead byte
 * of the n-byte sequences: 0xd0, 0xfb, 0xfe.
 */
#define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
     /* ==BOCU1_MAX_LEAD */

/*
 * For negative differences packDiff() adds a negative quotient to
 * BOCU1_START_NEG_n, so each value is one more than the highest lead byte
 * of the n-byte sequences: 0x50, 0x25, 0x22.
 * The only negative 4-byte lead byte is therefore BOCU1_MIN itself.
 */
#define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
     /* ==BOCU1_MIN+1 */
 110
/*
 * The length of a byte sequence, according to the lead byte (!=BOCU1_RESET).
 *
 * The lead byte ranges are nested around BOCU1_MIDDLE: the innermost range
 * [BOCU1_START_NEG_2, BOCU1_START_POS_2) holds the single bytes, the next
 * wider ranges the 2- and 3-byte lead bytes, and everything outside is a
 * 4-byte lead byte. The tests must therefore be made from the inside out.
 * The argument is evaluated several times; pass an expression without
 * side effects.
 */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

/*
 * The length of a byte sequence, according to its packed form.
 *
 * packDiff() stores the length 1..3 in the top byte of shorter sequences;
 * for 4-byte sequences the top byte is the lead byte, which is always
 * greater than 0x03. The comparison is made on the uint32_t value so that
 * packed values with the sign bit set are classified as 4-byte sequences.
 * The argument is evaluated twice.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
 120
/*
 * 12 commonly used C0 control codes (and space) are only used to encode
 * themselves directly,
 * which makes BOCU-1 MIME-usable and reasonably safe for
 * ASCII-oriented software.
 *
 * These controls are
 *  0   NUL
 *
 *  7   BEL
 *  8   BS
 *
 *  9   TAB
 *  a   LF
 *  b   VT
 *  c   FF
 *  d   CR
 *
 *  e   SO
 *  f   SI
 *
 * 1a   SUB
 * 1b   ESC
 *
 * The other 20 C0 controls are also encoded directly (to preserve order)
 * but are also used as trail bytes in difference encoding
 * (for better compression).
 *
 * BOCU1_TRAIL_TO_BYTE maps a trail value 0..BOCU1_TRAIL_COUNT-1 to its
 * external byte value: trail values below BOCU1_TRAIL_CONTROLS_COUNT are
 * looked up in bocu1TrailToByte[], all others are shifted by
 * BOCU1_TRAIL_BYTE_OFFSET (so that 20 maps to BOCU1_MIN and 242 to 0xff).
 * The macro does no range check, and its argument is evaluated more than
 * once; pass an in-range expression without side effects.
 */
#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
 150
/*
 * Byte value map for control codes,
 * from external byte values 0x00..0x20
 * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
 * External byte values that are illegal as trail bytes are mapped to -1.
 *
 * The table has BOCU1_MIN (0x21) entries and is indexed by the byte value.
 * The -1 entries are the 12 controls that only encode themselves
 * (0, 7..0xf, 0x1a, 0x1b) and space (0x20).
 * This is the inverse of bocu1TrailToByte[].
 * The table is not referenced by the encoder functions in this part of the
 * file -- presumably it is used by the decoder (TODO confirm).
 */
static const int8_t
bocu1ByteToTrail[BOCU1_MIN]={
/*  0     1     2     3     4     5     6     7    */
    -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,

/*  8     9     a     b     c     d     e     f    */
    -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,

/*  10    11    12    13    14    15    16    17   */
    0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,

/*  18    19    1a    1b    1c    1d    1e    1f   */
    0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,

/*  20   */
    -1
};
 174
/*
 * Byte value map for control codes,
 * from trail byte values 0..19 (0..0x13) as used in the difference calculation
 * to external byte values 0x00..0x20.
 *
 * The table has BOCU1_TRAIL_CONTROLS_COUNT entries and is indexed by the
 * trail value; it is read through BOCU1_TRAIL_TO_BYTE().
 * The entries are the 20 C0 control byte values 0x01..0x06, 0x10..0x19 and
 * 0x1c..0x1f in ascending order, so the mapping preserves the order of the
 * trail values.
 */
static const int8_t
bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
/*  0     1     2     3     4     5     6     7    */
    0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,

/*  8     9     a     b     c     d     e     f    */
    0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,

/*  10    11    12    13   */
    0x1c, 0x1d, 0x1e, 0x1f
};
 191
 192 /**
 193  * Integer division and modulo with negative numerators
 194  * yields negative modulo results and quotients that are one more than
 195  * what we need here.
 196  * This macro adjust the results so that the modulo-value m is always >=0.
 197  *
 198  * For positive n, the if() condition is always FALSE.
 199  *
 200  * @param n Number to be split into quotient and rest.
 201  *          Will be modified to contain the quotient.
 202  * @param d Divisor.
 203  * @param m Output variable for the rest (modulo result).
 204  */
 205 #define NEGDIVMOD(n, d, m) { \
 206     (m)=(n)%(d); \
 207     (n)/=(d); \
 208     if((m)<0) { \
 209         --(n); \
 210         (m)+=(d); \
 211     } \
 212 }
 213
/*
 * Faster versions of packDiff() for single-byte-encoded diff values.
 *
 * The conversion loops use these macros to encode the most common cases
 * inline and call packDiff() only for the remaining ones.
 * The range tests evaluate their argument twice.
 */

/** Is a diff value encodable in a single byte? */
#define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)

/**
 * Encode a diff value in a single byte.
 * Only meaningful if DIFF_IS_SINGLE(diff); the result is then in the range
 * BOCU1_START_NEG_2..BOCU1_START_POS_2-1.
 */
#define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))

/**
 * Is a diff value encodable in two bytes?
 * The tested range includes the single-byte range, so this macro answers
 * "at most two bytes"; test DIFF_IS_SINGLE() first.
 */
#define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)
 224
 225 /* BOCU-1 implementation functions ------------------------------------------ */
 226
/*
 * Default "prev" computation: the middle of the aligned 128-block that
 * contains c. The low 7 bits of c are cleared and BOCU1_ASCII_PREV (0x40)
 * is added.
 */
#define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
 228
 229 /**
 230  * Compute the next "previous" value for differencing
 231  * from the current code point.
 232  *
 233  * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
 234  * @return "previous code point" state value
 235  */
 236 static inline int32_t
 237 bocu1Prev(int32_t c) {
 238     /* compute new prev */
 239     if(/* 0x3040<=c && */ c<=0x309f) {
 240         /* Hiragana is not 128-aligned */
 241         return 0x3070;
 242     } else if(0x4e00<=c && c<=0x9fa5) {
 243         /* CJK Unihan */
 244         return 0x4e00-BOCU1_REACH_NEG_2;
 245     } else if(0xac00<=c /* && c<=0xd7a3 */) {
 246         /* Korean Hangul */
 247         return (0xd7a3+0xac00)/2;
 248     } else {
 249         /* mostly small scripts */
 250         return BOCU1_SIMPLE_PREV(c);
 251     }
 252 }
 253
/**
 * Fast version of bocu1Prev() for most scripts.
 *
 * Code points outside 0x3040..0xd7a3 use BOCU1_SIMPLE_PREV() inline;
 * only code points inside that range pay for the call to bocu1Prev().
 * The argument is evaluated several times; pass an expression without
 * side effects.
 */
#define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
 256
 257 /*
 258  * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
 259  * The UConverter fields are used as follows:
 260  *
 261  * fromUnicodeStatus    encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
 262  *
 263  * toUnicodeStatus      decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
 264  * mode                 decoder's incomplete (diff<<2)|count (ignored when toULength==0)
 265  */
 266
 267 /* BOCU-1-from-Unicode conversion functions --------------------------------- */
 268
 269 /**
 270  * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
 271  * and return a packed integer with them.
 272  *
 273  * The encoding favors small absolute differences with short encodings
 274  * to compress runs of same-script characters.
 275  *
 276  * Optimized version with unrolled loops and fewer floating-point operations
 277  * than the standard packDiff().
 278  *
 279  * @param diff difference value -0x10ffff..0x10ffff
 280  * @return
 281  *      0x010000zz for 1-byte sequence zz
 282  *      0x0200yyzz for 2-byte sequence yy zz
 283  *      0x03xxyyzz for 3-byte sequence xx yy zz
 284  *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
 285  */
 286 static int32_t
 287 packDiff(int32_t diff) {
 288     int32_t result, m;
 289
 290     U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */
 291     if(diff>=BOCU1_REACH_NEG_1) {
 292         /* mostly positive differences, and single-byte negative ones */
 293 #if 0   /* single-byte case handled in macros, see below */
 294         if(diff<=BOCU1_REACH_POS_1) {
 295             /* single byte */
 296             return 0x01000000|(BOCU1_MIDDLE+diff);
 297         } else
 298 #endif
 299         if(diff<=BOCU1_REACH_POS_2) {
 300             /* two bytes */
 301             diff-=BOCU1_REACH_POS_1+1;
 302             result=0x02000000;
 303
 304             m=diff%BOCU1_TRAIL_COUNT;
 305             diff/=BOCU1_TRAIL_COUNT;
 306             result|=BOCU1_TRAIL_TO_BYTE(m);
 307
 308             result|=(BOCU1_START_POS_2+diff)<<8;
 309         } else if(diff<=BOCU1_REACH_POS_3) {
 310             /* three bytes */
 311             diff-=BOCU1_REACH_POS_2+1;
 312             result=0x03000000;
 313
 314             m=diff%BOCU1_TRAIL_COUNT;
 315             diff/=BOCU1_TRAIL_COUNT;
 316             result|=BOCU1_TRAIL_TO_BYTE(m);
 317
 318             m=diff%BOCU1_TRAIL_COUNT;
 319             diff/=BOCU1_TRAIL_COUNT;
 320             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
 321
 322             result|=(BOCU1_START_POS_3+diff)<<16;
 323         } else {
 324             /* four bytes */
 325             diff-=BOCU1_REACH_POS_3+1;
 326
 327             m=diff%BOCU1_TRAIL_COUNT;
 328             diff/=BOCU1_TRAIL_COUNT;
 329             result=BOCU1_TRAIL_TO_BYTE(m);
 330
 331             m=diff%BOCU1_TRAIL_COUNT;
 332             diff/=BOCU1_TRAIL_COUNT;
 333             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
 334
 335             /*
 336              * We know that / and % would deliver quotient 0 and rest=diff.
 337              * Avoid division and modulo for performance.
 338              */
 339             result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
 340
 341             result|=((uint32_t)BOCU1_START_POS_4)<<24;
 342         }
 343     } else {
 344         /* two- to four-byte negative differences */
 345         if(diff>=BOCU1_REACH_NEG_2) {
 346             /* two bytes */
 347             diff-=BOCU1_REACH_NEG_1;
 348             result=0x02000000;
 349
 350             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
 351             result|=BOCU1_TRAIL_TO_BYTE(m);
 352
 353             result|=(BOCU1_START_NEG_2+diff)<<8;
 354         } else if(diff>=BOCU1_REACH_NEG_3) {
 355             /* three bytes */
 356             diff-=BOCU1_REACH_NEG_2;
 357             result=0x03000000;
 358
 359             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
 360             result|=BOCU1_TRAIL_TO_BYTE(m);
 361
 362             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
 363             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
 364
 365             result|=(BOCU1_START_NEG_3+diff)<<16;
 366         } else {
 367             /* four bytes */
 368             diff-=BOCU1_REACH_NEG_3;
 369
 370             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
 371             result=BOCU1_TRAIL_TO_BYTE(m);
 372
 373             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
 374             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
 375
 376             /*
 377              * We know that NEGDIVMOD would deliver
 378              * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
 379              * Avoid division and modulo for performance.
 380              */
 381             m=diff+BOCU1_TRAIL_COUNT;
 382             result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
 383
 384             result|=BOCU1_MIN<<24;
 385         }
 386     }
 387     return result;
 388 }
 389
 390
 391 static void U_CALLCONV
 392 _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
 393                              UErrorCode *pErrorCode) {
 394     UConverter *cnv;
 395     const UChar *source, *sourceLimit;
 396     uint8_t *target;
 397     int32_t targetCapacity;
 398     int32_t *offsets;
 399
 400     int32_t prev, c, diff;
 401
 402     int32_t sourceIndex, nextSourceIndex;
 403
 404     /* set up the local pointers */
 405     cnv=pArgs->converter;
 406     source=pArgs->source;
 407     sourceLimit=pArgs->sourceLimit;
 408     target=(uint8_t *)pArgs->target;
 409     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
 410     offsets=pArgs->offsets;
 411
 412     /* get the converter state from UConverter */
 413     c=cnv->fromUChar32;
 414     prev=(int32_t)cnv->fromUnicodeStatus;
 415     if(prev==0) {
 416         prev=BOCU1_ASCII_PREV;
 417     }
 418
 419     /* sourceIndex=-1 if the current character began in the previous buffer */
 420     sourceIndex= c==0 ? 0 : -1;
 421     nextSourceIndex=0;
 422
 423     /* conversion loop */
 424     if(c!=0 && targetCapacity>0) {
 425         goto getTrail;
 426     }
 427
 428 fastSingle:
 429     /* fast loop for single-byte differences */
 430     /* use only one loop counter variable, targetCapacity, not also source */
 431     diff=(int32_t)(sourceLimit-source);
 432     if(targetCapacity>diff) {
 433         targetCapacity=diff;
 434     }
 435     while(targetCapacity>0 && (c=*source)<0x3000) {
 436         if(c<=0x20) {
 437             if(c!=0x20) {
 438                 prev=BOCU1_ASCII_PREV;
 439             }
 440             *target++=(uint8_t)c;
 441             *offsets++=nextSourceIndex++;
 442             ++source;
 443             --targetCapacity;
 444         } else {
 445             diff=c-prev;
 446             if(DIFF_IS_SINGLE(diff)) {
 447                 prev=BOCU1_SIMPLE_PREV(c);
 448                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
 449                 *offsets++=nextSourceIndex++;
 450                 ++source;
 451                 --targetCapacity;
 452             } else {
 453                 break;
 454             }
 455         }
 456     }
 457     /* restore real values */
 458     targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
 459     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
 460
 461     /* regular loop for all cases */
 462     while(source<sourceLimit) {
 463         if(targetCapacity>0) {
 464             c=*source++;
 465             ++nextSourceIndex;
 466
 467             if(c<=0x20) {
 468                 /*
 469                  * ISO C0 control & space:
 470                  * Encode directly for MIME compatibility,
 471                  * and reset state except for space, to not disrupt compression.
 472                  */
 473                 if(c!=0x20) {
 474                     prev=BOCU1_ASCII_PREV;
 475                 }
 476                 *target++=(uint8_t)c;
 477                 *offsets++=sourceIndex;
 478                 --targetCapacity;
 479
 480                 sourceIndex=nextSourceIndex;
 481                 continue;
 482             }
 483
 484             if(U16_IS_LEAD(c)) {
 485 getTrail:
 486                 if(source<sourceLimit) {
 487                     /* test the following code unit */
 488                     UChar trail=*source;
 489                     if(U16_IS_TRAIL(trail)) {
 490                         ++source;
 491                         ++nextSourceIndex;
 492                         c=U16_GET_SUPPLEMENTARY(c, trail);
 493                     }
 494                 } else {
 495                     /* no more input */
 496                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
 497                     break;
 498                 }
 499             }
 500
 501             /*
 502              * all other Unicode code points c==U+0021..U+10ffff
 503              * are encoded with the difference c-prev
 504              *
 505              * a new prev is computed from c,
 506              * placed in the middle of a 0x80-block (for most small scripts) or
 507              * in the middle of the Unihan and Hangul blocks
 508              * to statistically minimize the following difference
 509              */
 510             diff=c-prev;
 511             prev=BOCU1_PREV(c);
 512             if(DIFF_IS_SINGLE(diff)) {
 513                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
 514                 *offsets++=sourceIndex;
 515                 --targetCapacity;
 516                 sourceIndex=nextSourceIndex;
 517                 if(c<0x3000) {
 518                     goto fastSingle;
 519                 }
 520             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
 521                 /* optimize 2-byte case */
 522                 int32_t m;
 523
 524                 if(diff>=0) {
 525                     diff-=BOCU1_REACH_POS_1+1;
 526                     m=diff%BOCU1_TRAIL_COUNT;
 527                     diff/=BOCU1_TRAIL_COUNT;
 528                     diff+=BOCU1_START_POS_2;
 529                 } else {
 530                     diff-=BOCU1_REACH_NEG_1;
 531                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
 532                     diff+=BOCU1_START_NEG_2;
 533                 }
 534                 *target++=(uint8_t)diff;
 535                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
 536                 *offsets++=sourceIndex;
 537                 *offsets++=sourceIndex;
 538                 targetCapacity-=2;
 539                 sourceIndex=nextSourceIndex;
 540             } else {
 541                 int32_t length; /* will be 2..4 */
 542
 543                 diff=packDiff(diff);
 544                 length=BOCU1_LENGTH_FROM_PACKED(diff);
 545
 546                 /* write the output character bytes from diff and length */
 547                 /* from the first if in the loop we know that targetCapacity>0 */
 548                 if(length<=targetCapacity) {
 549                     switch(length) {
 550                         /* each branch falls through to the next one */
 551                     case 4:
 552                         *target++=(uint8_t)(diff>>24);
 553                         *offsets++=sourceIndex;
 554                         U_FALLTHROUGH;
 555                     case 3:
 556                         *target++=(uint8_t)(diff>>16);
 557                         *offsets++=sourceIndex;
 558                         U_FALLTHROUGH;
 559                     case 2:
 560                         *target++=(uint8_t)(diff>>8);
 561                         *offsets++=sourceIndex;
 562                     /* case 1: handled above */
 563                         *target++=(uint8_t)diff;
 564                         *offsets++=sourceIndex;
 565                         U_FALLTHROUGH;
 566                     default:
 567                         /* will never occur */
 568                         break;
 569                     }
 570                     targetCapacity-=length;
 571                     sourceIndex=nextSourceIndex;
 572                 } else {
 573                     uint8_t *charErrorBuffer;
 574
 575                     /*
 576                      * We actually do this backwards here:
 577                      * In order to save an intermediate variable, we output
 578                      * first to the overflow buffer what does not fit into the
 579                      * regular target.
 580                      */
 581                     /* we know that 1<=targetCapacity<length<=4 */
 582                     length-=targetCapacity;
 583                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
 584                     switch(length) {
 585                         /* each branch falls through to the next one */
 586                     case 3:
 587                         *charErrorBuffer++=(uint8_t)(diff>>16);
 588                         U_FALLTHROUGH;
 589                     case 2:
 590                         *charErrorBuffer++=(uint8_t)(diff>>8);
 591                         U_FALLTHROUGH;
 592                     case 1:
 593                         *charErrorBuffer=(uint8_t)diff;
 594                         U_FALLTHROUGH;
 595                     default:
 596                         /* will never occur */
 597                         break;
 598                     }
 599                     cnv->charErrorBufferLength=(int8_t)length;
 600
 601                     /* now output what fits into the regular target */
 602                     diff>>=8*length; /* length was reduced by targetCapacity */
 603                     switch(targetCapacity) {
 604                         /* each branch falls through to the next one */
 605                     case 3:
 606                         *target++=(uint8_t)(diff>>16);
 607                         *offsets++=sourceIndex;
 608                         U_FALLTHROUGH;
 609                     case 2:
 610                         *target++=(uint8_t)(diff>>8);
 611                         *offsets++=sourceIndex;
 612                         U_FALLTHROUGH;
 613                     case 1:
 614                         *target++=(uint8_t)diff;
 615                         *offsets++=sourceIndex;
 616                         U_FALLTHROUGH;
 617                     default:
 618                         /* will never occur */
 619                         break;
 620                     }
 621
 622                     /* target overflow */
 623                     targetCapacity=0;
 624                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 625                     break;
 626                 }
 627             }
 628         } else {
 629             /* target is full */
 630             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 631             break;
 632         }
 633     }
 634
 635     /* set the converter state back into UConverter */
 636     cnv->fromUChar32= c<0 ? -c : 0;
 637     cnv->fromUnicodeStatus=(uint32_t)prev;
 638
 639     /* write back the updated pointers */
 640     pArgs->source=source;
 641     pArgs->target=(char *)target;
 642     pArgs->offsets=offsets;
 643 }
 644
 645 /*
 646  * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
 647  * If a change is made in the original function, then either
 648  * change this function the same way or
 649  * re-copy the original function and remove the variables
 650  * offsets, sourceIndex, and nextSourceIndex.
 651  */
 652 static void U_CALLCONV
 653 _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
 654                   UErrorCode *pErrorCode) {
 655     UConverter *cnv;
 656     const UChar *source, *sourceLimit;
 657     uint8_t *target;
 658     int32_t targetCapacity;
 659
 660     int32_t prev, c, diff;
 661
 662     /* set up the local pointers */
 663     cnv=pArgs->converter;
 664     source=pArgs->source;
 665     sourceLimit=pArgs->sourceLimit;
 666     target=(uint8_t *)pArgs->target;
 667     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
 668
 669     /* get the converter state from UConverter */
 670     c=cnv->fromUChar32;
 671     prev=(int32_t)cnv->fromUnicodeStatus;
 672     if(prev==0) {
 673         prev=BOCU1_ASCII_PREV;
 674     }
 675
 676     /* conversion loop */
 677     if(c!=0 && targetCapacity>0) {
 678         goto getTrail;
 679     }
 680
 681 fastSingle:
 682     /* fast loop for single-byte differences */
 683     /* use only one loop counter variable, targetCapacity, not also source */
 684     diff=(int32_t)(sourceLimit-source);
 685     if(targetCapacity>diff) {
 686         targetCapacity=diff;
 687     }
 688     while(targetCapacity>0 && (c=*source)<0x3000) {
 689         if(c<=0x20) {
 690             if(c!=0x20) {
 691                 prev=BOCU1_ASCII_PREV;
 692             }
 693             *target++=(uint8_t)c;
 694         } else {
 695             diff=c-prev;
 696             if(DIFF_IS_SINGLE(diff)) {
 697                 prev=BOCU1_SIMPLE_PREV(c);
 698                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
 699             } else {
 700                 break;
 701             }
 702         }
 703         ++source;
 704         --targetCapacity;
 705     }
 706     /* restore real values */
 707     targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
 708
 709     /* regular loop for all cases */
 710     while(source<sourceLimit) {
 711         if(targetCapacity>0) {
 712             c=*source++;
 713
 714             if(c<=0x20) {
 715                 /*
 716                  * ISO C0 control & space:
 717                  * Encode directly for MIME compatibility,
 718                  * and reset state except for space, to not disrupt compression.
 719                  */
 720                 if(c!=0x20) {
 721                     prev=BOCU1_ASCII_PREV;
 722                 }
 723                 *target++=(uint8_t)c;
 724                 --targetCapacity;
 725                 continue;
 726             }
 727
 728             if(U16_IS_LEAD(c)) {
 729 getTrail:
 730                 if(source<sourceLimit) {
 731                     /* test the following code unit */
 732                     UChar trail=*source;
 733                     if(U16_IS_TRAIL(trail)) {
 734                         ++source;
 735                         c=U16_GET_SUPPLEMENTARY(c, trail);
 736                     }
 737                 } else {
 738                     /* no more input */
 739                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
 740                     break;
 741                 }
 742             }
 743
 744             /*
 745              * all other Unicode code points c==U+0021..U+10ffff
 746              * are encoded with the difference c-prev
 747              *
 748              * a new prev is computed from c,
 749              * placed in the middle of a 0x80-block (for most small scripts) or
 750              * in the middle of the Unihan and Hangul blocks
 751              * to statistically minimize the following difference
 752              */
 753             diff=c-prev;
 754             prev=BOCU1_PREV(c);
 755             if(DIFF_IS_SINGLE(diff)) {
 756                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
 757                 --targetCapacity;
 758                 if(c<0x3000) {
 759                     goto fastSingle;
 760                 }
 761             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
 762                 /* optimize 2-byte case */
 763                 int32_t m;
 764
 765                 if(diff>=0) {
 766                     diff-=BOCU1_REACH_POS_1+1;
 767                     m=diff%BOCU1_TRAIL_COUNT;
 768                     diff/=BOCU1_TRAIL_COUNT;
 769                     diff+=BOCU1_START_POS_2;
 770                 } else {
 771                     diff-=BOCU1_REACH_NEG_1;
 772                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
 773                     diff+=BOCU1_START_NEG_2;
 774                 }
 775                 *target++=(uint8_t)diff;
 776                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
 777                 targetCapacity-=2;
 778             } else {
 779                 int32_t length; /* will be 2..4 */
 780
 781                 diff=packDiff(diff);
 782                 length=BOCU1_LENGTH_FROM_PACKED(diff);
 783
 784                 /* write the output character bytes from diff and length */
 785                 /* from the first if in the loop we know that targetCapacity>0 */
 786                 if(length<=targetCapacity) {
 787                     switch(length) {
 788                         /* each branch falls through to the next one */
 789                     case 4:
 790                         *target++=(uint8_t)(diff>>24);
 791                         U_FALLTHROUGH;
 792                     case 3:
 793                         *target++=(uint8_t)(diff>>16);
 794                     /* case 2: handled above */
 795                         *target++=(uint8_t)(diff>>8);
 796                     /* case 1: handled above */
 797                         *target++=(uint8_t)diff;
 798                         U_FALLTHROUGH;
 799                     default:
 800                         /* will never occur */
 801                         break;
 802                     }
 803                     targetCapacity-=length;
 804                 } else {
 805                     uint8_t *charErrorBuffer;
 806
 807                     /*
 808                      * We actually do this backwards here:
 809                      * In order to save an intermediate variable, we output
 810                      * first to the overflow buffer what does not fit into the
 811                      * regular target.
 812                      */
 813                     /* we know that 1<=targetCapacity<length<=4 */
 814                     length-=targetCapacity;
 815                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
 816                     switch(length) {
 817                         /* each branch falls through to the next one */
 818                     case 3:
 819                         *charErrorBuffer++=(uint8_t)(diff>>16);
 820                         U_FALLTHROUGH;
 821                     case 2:
 822                         *charErrorBuffer++=(uint8_t)(diff>>8);
 823                         U_FALLTHROUGH;
 824                     case 1:
 825                         *charErrorBuffer=(uint8_t)diff;
 826                         U_FALLTHROUGH;
 827                     default:
 828                         /* will never occur */
 829                         break;
 830                     }
 831                     cnv->charErrorBufferLength=(int8_t)length;
 832
 833                     /* now output what fits into the regular target */
 834                     diff>>=8*length; /* length was reduced by targetCapacity */
 835                     switch(targetCapacity) {
 836                         /* each branch falls through to the next one */
 837                     case 3:
 838                         *target++=(uint8_t)(diff>>16);
 839                         U_FALLTHROUGH;
 840                     case 2:
 841                         *target++=(uint8_t)(diff>>8);
 842                         U_FALLTHROUGH;
 843                     case 1:
 844                         *target++=(uint8_t)diff;
 845                         U_FALLTHROUGH;
 846                     default:
 847                         /* will never occur */
 848                         break;
 849                     }
 850
 851                     /* target overflow */
 852                     targetCapacity=0;
 853                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 854                     break;
 855                 }
 856             }
 857         } else {
 858             /* target is full */
 859             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 860             break;
 861         }
 862     }
 863
 864     /* set the converter state back into UConverter */
 865     cnv->fromUChar32= c<0 ? -c : 0;
 866     cnv->fromUnicodeStatus=(uint32_t)prev;
 867
 868     /* write back the updated pointers */
 869     pArgs->source=source;
 870     pArgs->target=(char *)target;
 871 }
 872
 873 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
 874
 875 /**
 876  * Function for BOCU-1 decoder; handles multi-byte lead bytes.
 877  *
 878  * @param b lead byte;
 879  *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
 880  * @return (diff<<2)|count
 881  */
 882 static inline int32_t
 883 decodeBocu1LeadByte(int32_t b) {
 884     int32_t diff, count;
 885
 886     if(b>=BOCU1_START_NEG_2) {
 887         /* positive difference */
 888         if(b<BOCU1_START_POS_3) {
 889             /* two bytes */
 890             diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
 891             count=1;
 892         } else if(b<BOCU1_START_POS_4) {
 893             /* three bytes */
 894             diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
 895             count=2;
 896         } else {
 897             /* four bytes */
 898             diff=BOCU1_REACH_POS_3+1;
 899             count=3;
 900         }
 901     } else {
 902         /* negative difference */
 903         if(b>=BOCU1_START_NEG_3) {
 904             /* two bytes */
 905             diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
 906             count=1;
 907         } else if(b>BOCU1_MIN) {
 908             /* three bytes */
 909             diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
 910             count=2;
 911         } else {
 912             /* four bytes */
 913             diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
 914             count=3;
 915         }
 916     }
 917
 918     /* return the state for decoding the trail byte(s) */
 919     return (diff<<2)|count;
 920 }
 921
 922 /**
 923  * Function for BOCU-1 decoder; handles multi-byte trail bytes.
 924  *
 925  * @param count number of remaining trail bytes including this one
 926  * @param b trail byte
 927  * @return new delta for diff including b - <0 indicates an error
 928  *
 929  * @see decodeBocu1
 930  */
 931 static inline int32_t
 932 decodeBocu1TrailByte(int32_t count, int32_t b) {
 933     if(b<=0x20) {
 934         /* skip some C0 controls and make the trail byte range contiguous */
 935         b=bocu1ByteToTrail[b];
 936         /* b<0 for an illegal trail byte value will result in return<0 below */
 937 #if BOCU1_MAX_TRAIL<0xff
 938     } else if(b>BOCU1_MAX_TRAIL) {
 939         return -99;
 940 #endif
 941     } else {
 942         b-=BOCU1_TRAIL_BYTE_OFFSET;
 943     }
 944
 945     /* add trail byte into difference and decrement count */
 946     if(count==1) {
 947         return b;
 948     } else if(count==2) {
 949         return b*BOCU1_TRAIL_COUNT;
 950     } else /* count==3 */ {
 951         return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
 952     }
 953 }
 954
 955 static void U_CALLCONV
 956 _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
 957                            UErrorCode *pErrorCode) {
 958     UConverter *cnv;
 959     const uint8_t *source, *sourceLimit;
 960     UChar *target;
 961     const UChar *targetLimit;
 962     int32_t *offsets;
 963
 964     int32_t prev, count, diff, c;
 965
 966     int8_t byteIndex;
 967     uint8_t *bytes;
 968
 969     int32_t sourceIndex, nextSourceIndex;
 970
 971     /* set up the local pointers */
 972     cnv=pArgs->converter;
 973     source=(const uint8_t *)pArgs->source;
 974     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
 975     target=pArgs->target;
 976     targetLimit=pArgs->targetLimit;
 977     offsets=pArgs->offsets;
 978
 979     /* get the converter state from UConverter */
 980     prev=(int32_t)cnv->toUnicodeStatus;
 981     if(prev==0) {
 982         prev=BOCU1_ASCII_PREV;
 983     }
 984     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
 985     count=diff&3;
 986     diff>>=2;
 987
 988     byteIndex=cnv->toULength;
 989     bytes=cnv->toUBytes;
 990
 991     /* sourceIndex=-1 if the current character began in the previous buffer */
 992     sourceIndex=byteIndex==0 ? 0 : -1;
 993     nextSourceIndex=0;
 994
 995     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
 996     if(count>0 && byteIndex>0 && target<targetLimit) {
 997         goto getTrail;
 998     }
 999
1000 fastSingle:
1001     /* fast loop for single-byte differences */
1002     /* use count as the only loop counter variable */
1003     diff=(int32_t)(sourceLimit-source);
1004     count=(int32_t)(pArgs->targetLimit-target);
1005     if(count>diff) {
1006         count=diff;
1007     }
1008     while(count>0) {
1009         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
1010             c=prev+(c-BOCU1_MIDDLE);
1011             if(c<0x3000) {
1012                 *target++=(UChar)c;
1013                 *offsets++=nextSourceIndex++;
1014                 prev=BOCU1_SIMPLE_PREV(c);
1015             } else {
1016                 break;
1017             }
1018         } else if(c<=0x20) {
1019             if(c!=0x20) {
1020                 prev=BOCU1_ASCII_PREV;
1021             }
1022             *target++=(UChar)c;
1023             *offsets++=nextSourceIndex++;
1024         } else {
1025             break;
1026         }
1027         ++source;
1028         --count;
1029     }
1030     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
1031
1032     /* decode a sequence of single and lead bytes */
1033     while(source<sourceLimit) {
1034         if(target>=targetLimit) {
1035             /* target is full */
1036             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1037             break;
1038         }
1039
1040         ++nextSourceIndex;
1041         c=*source++;
1042         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1043             /* Write a code point directly from a single-byte difference. */
1044             c=prev+(c-BOCU1_MIDDLE);
1045             if(c<0x3000) {
1046                 *target++=(UChar)c;
1047                 *offsets++=sourceIndex;
1048                 prev=BOCU1_SIMPLE_PREV(c);
1049                 sourceIndex=nextSourceIndex;
1050                 goto fastSingle;
1051             }
1052         } else if(c<=0x20) {
1053             /*
1054              * Direct-encoded C0 control code or space.
1055              * Reset prev for C0 control codes but not for space.
1056              */
1057             if(c!=0x20) {
1058                 prev=BOCU1_ASCII_PREV;
1059             }
1060             *target++=(UChar)c;
1061             *offsets++=sourceIndex;
1062             sourceIndex=nextSourceIndex;
1063             continue;
1064         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1065             /* Optimize two-byte case. */
1066             if(c>=BOCU1_MIDDLE) {
1067                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1068             } else {
1069                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1070             }
1071
1072             /* trail byte */
1073             ++nextSourceIndex;
1074             c=decodeBocu1TrailByte(1, *source++);
1075             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1076                 bytes[0]=source[-2];
1077                 bytes[1]=source[-1];
1078                 byteIndex=2;
1079                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1080                 break;
1081             }
1082         } else if(c==BOCU1_RESET) {
1083             /* only reset the state, no code point */
1084             prev=BOCU1_ASCII_PREV;
1085             sourceIndex=nextSourceIndex;
1086             continue;
1087         } else {
1088             /*
1089              * For multi-byte difference lead bytes, set the decoder state
1090              * with the partial difference value from the lead byte and
1091              * with the number of trail bytes.
1092              */
1093             bytes[0]=(uint8_t)c;
1094             byteIndex=1;
1095
1096             diff=decodeBocu1LeadByte(c);
1097             count=diff&3;
1098             diff>>=2;
1099 getTrail:
1100             for(;;) {
1101                 if(source>=sourceLimit) {
1102                     goto endloop;
1103                 }
1104                 ++nextSourceIndex;
1105                 c=bytes[byteIndex++]=*source++;
1106
1107                 /* trail byte in any position */
1108                 c=decodeBocu1TrailByte(count, c);
1109                 if(c<0) {
1110                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1111                     goto endloop;
1112                 }
1113
1114                 diff+=c;
1115                 if(--count==0) {
1116                     /* final trail byte, deliver a code point */
1117                     byteIndex=0;
1118                     c=prev+diff;
1119                     if((uint32_t)c>0x10ffff) {
1120                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1121                         goto endloop;
1122                     }
1123                     break;
1124                 }
1125             }
1126         }
1127
1128         /* calculate the next prev and output c */
1129         prev=BOCU1_PREV(c);
1130         if(c<=0xffff) {
1131             *target++=(UChar)c;
1132             *offsets++=sourceIndex;
1133         } else {
1134             /* output surrogate pair */
1135             *target++=U16_LEAD(c);
1136             if(target<targetLimit) {
1137                 *target++=U16_TRAIL(c);
1138                 *offsets++=sourceIndex;
1139                 *offsets++=sourceIndex;
1140             } else {
1141                 /* target overflow */
1142                 *offsets++=sourceIndex;
1143                 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
1144                 cnv->UCharErrorBufferLength=1;
1145                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1146                 break;
1147             }
1148         }
1149         sourceIndex=nextSourceIndex;
1150     }
1151 endloop:
1152
1153     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1154         /* set the converter state in UConverter to deal with the next character */
1155         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1156         cnv->mode=0;
1157     } else {
1158         /* set the converter state back into UConverter */
1159         cnv->toUnicodeStatus=(uint32_t)prev;
1160         cnv->mode=(diff<<2)|count;
1161     }
1162     cnv->toULength=byteIndex;
1163
1164     /* write back the updated pointers */
1165     pArgs->source=(const char *)source;
1166     pArgs->target=target;
1167     pArgs->offsets=offsets;
1168     return;
1169 }
1170
1171 /*
1172  * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
1173  * If a change is made in the original function, then either
1174  * change this function the same way or
1175  * re-copy the original function and remove the variables
1176  * offsets, sourceIndex, and nextSourceIndex.
1177  */
1178 static void U_CALLCONV
1179 _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
1180                 UErrorCode *pErrorCode) {
1181     UConverter *cnv;
1182     const uint8_t *source, *sourceLimit;
1183     UChar *target;
1184     const UChar *targetLimit;
1185
1186     int32_t prev, count, diff, c;
1187
1188     int8_t byteIndex;
1189     uint8_t *bytes;
1190
1191     /* set up the local pointers */
1192     cnv=pArgs->converter;
1193     source=(const uint8_t *)pArgs->source;
1194     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1195     target=pArgs->target;
1196     targetLimit=pArgs->targetLimit;
1197
1198     /* get the converter state from UConverter */
1199     prev=(int32_t)cnv->toUnicodeStatus;
1200     if(prev==0) {
1201         prev=BOCU1_ASCII_PREV;
1202     }
1203     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
1204     count=diff&3;
1205     diff>>=2;
1206
1207     byteIndex=cnv->toULength;
1208     bytes=cnv->toUBytes;
1209
1210     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
1211     if(count>0 && byteIndex>0 && target<targetLimit) {
1212         goto getTrail;
1213     }
1214
1215 fastSingle:
1216     /* fast loop for single-byte differences */
1217     /* use count as the only loop counter variable */
1218     diff=(int32_t)(sourceLimit-source);
1219     count=(int32_t)(pArgs->targetLimit-target);
1220     if(count>diff) {
1221         count=diff;
1222     }
1223     while(count>0) {
1224         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
1225             c=prev+(c-BOCU1_MIDDLE);
1226             if(c<0x3000) {
1227                 *target++=(UChar)c;
1228                 prev=BOCU1_SIMPLE_PREV(c);
1229             } else {
1230                 break;
1231             }
1232         } else if(c<=0x20) {
1233             if(c!=0x20) {
1234                 prev=BOCU1_ASCII_PREV;
1235             }
1236             *target++=(UChar)c;
1237         } else {
1238             break;
1239         }
1240         ++source;
1241         --count;
1242     }
1243
1244     /* decode a sequence of single and lead bytes */
1245     while(source<sourceLimit) {
1246         if(target>=targetLimit) {
1247             /* target is full */
1248             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1249             break;
1250         }
1251
1252         c=*source++;
1253         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1254             /* Write a code point directly from a single-byte difference. */
1255             c=prev+(c-BOCU1_MIDDLE);
1256             if(c<0x3000) {
1257                 *target++=(UChar)c;
1258                 prev=BOCU1_SIMPLE_PREV(c);
1259                 goto fastSingle;
1260             }
1261         } else if(c<=0x20) {
1262             /*
1263              * Direct-encoded C0 control code or space.
1264              * Reset prev for C0 control codes but not for space.
1265              */
1266             if(c!=0x20) {
1267                 prev=BOCU1_ASCII_PREV;
1268             }
1269             *target++=(UChar)c;
1270             continue;
1271         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1272             /* Optimize two-byte case. */
1273             if(c>=BOCU1_MIDDLE) {
1274                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1275             } else {
1276                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1277             }
1278
1279             /* trail byte */
1280             c=decodeBocu1TrailByte(1, *source++);
1281             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1282                 bytes[0]=source[-2];
1283                 bytes[1]=source[-1];
1284                 byteIndex=2;
1285                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1286                 break;
1287             }
1288         } else if(c==BOCU1_RESET) {
1289             /* only reset the state, no code point */
1290             prev=BOCU1_ASCII_PREV;
1291             continue;
1292         } else {
1293             /*
1294              * For multi-byte difference lead bytes, set the decoder state
1295              * with the partial difference value from the lead byte and
1296              * with the number of trail bytes.
1297              */
1298             bytes[0]=(uint8_t)c;
1299             byteIndex=1;
1300
1301             diff=decodeBocu1LeadByte(c);
1302             count=diff&3;
1303             diff>>=2;
1304 getTrail:
1305             for(;;) {
1306                 if(source>=sourceLimit) {
1307                     goto endloop;
1308                 }
1309                 c=bytes[byteIndex++]=*source++;
1310
1311                 /* trail byte in any position */
1312                 c=decodeBocu1TrailByte(count, c);
1313                 if(c<0) {
1314                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1315                     goto endloop;
1316                 }
1317
1318                 diff+=c;
1319                 if(--count==0) {
1320                     /* final trail byte, deliver a code point */
1321                     byteIndex=0;
1322                     c=prev+diff;
1323                     if((uint32_t)c>0x10ffff) {
1324                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1325                         goto endloop;
1326                     }
1327                     break;
1328                 }
1329             }
1330         }
1331
1332         /* calculate the next prev and output c */
1333         prev=BOCU1_PREV(c);
1334         if(c<=0xffff) {
1335             *target++=(UChar)c;
1336         } else {
1337             /* output surrogate pair */
1338             *target++=U16_LEAD(c);
1339             if(target<targetLimit) {
1340                 *target++=U16_TRAIL(c);
1341             } else {
1342                 /* target overflow */
1343                 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
1344                 cnv->UCharErrorBufferLength=1;
1345                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1346                 break;
1347             }
1348         }
1349     }
1350 endloop:
1351
1352     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1353         /* set the converter state in UConverter to deal with the next character */
1354         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1355         cnv->mode=0;
1356     } else {
1357         /* set the converter state back into UConverter */
1358         cnv->toUnicodeStatus=(uint32_t)prev;
1359         cnv->mode=(diff<<2)|count;
1360     }
1361     cnv->toULength=byteIndex;
1362
1363     /* write back the updated pointers */
1364     pArgs->source=(const char *)source;
1365     pArgs->target=target;
1366     return;
1367 }
1368
1369 /* miscellaneous ------------------------------------------------------------ */
1370
/*
 * Implementation table for the BOCU-1 converter.
 * Besides the converter type it provides only the four conversion
 * functions (to/from Unicode, each with and without offsets) and
 * ucnv_getCompleteUnicodeSet; every other slot is NULL.
 * NOTE(review): this is a positional initializer - the meaning of each
 * slot is fixed by the UConverterImpl declaration, which is not visible
 * here. Confirm the order against that declaration before editing.
 */
static const UConverterImpl _Bocu1Impl={
    UCNV_BOCU1,

    NULL,
    NULL,

    NULL,
    NULL,
    NULL,

    /* conversion entry points */
    _Bocu1ToUnicode,
    _Bocu1ToUnicodeWithOffsets,
    _Bocu1FromUnicode,
    _Bocu1FromUnicodeWithOffsets,
    NULL,

    NULL,
    NULL,
    NULL,
    NULL,
    ucnv_getCompleteUnicodeSet,

    NULL,
    NULL
};
1396
/*
 * Static description of the BOCU-1 converter: its name, CCSID,
 * platform/type codes, byte-length bounds and substitution bytes.
 * NOTE(review): positional initializer - field order is given by the
 * UConverterStaticData declaration, which is not visible here; the
 * unlabeled values below (FALSE, FALSE, 0, 0) should be checked against
 * that declaration before changing them.
 */
static const UConverterStaticData _Bocu1StaticData={
    sizeof(UConverterStaticData),
    "BOCU-1",
    1214, /* CCSID for BOCU-1 */
    UCNV_IBM, UCNV_BOCU1,
    1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
    { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
    FALSE, FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
1409
/*
 * Shared-data object for the BOCU-1 converter (non-static, so visible
 * outside this file); built from _Bocu1StaticData and _Bocu1Impl.
 */
const UConverterSharedData _Bocu1Data=
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Bocu1StaticData, &_Bocu1Impl);
1412
1413 #endif