icuSources/common/ucnvbocu.cpp

   1 /*
   2 ******************************************************************************
   3 *
   4 *   Copyright (C) 2002-2016, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 ******************************************************************************
   8 *   file name:  ucnvbocu.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2002mar27
  14 *   created by: Markus W. Scherer
  15 *
  16 *   This is an implementation of the Binary Ordered Compression for Unicode,
  17 *   in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
  18 */
  19
  20 #include "unicode/utypes.h"
  21
  22 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
  23
  24 #include "unicode/ucnv.h"
  25 #include "unicode/ucnv_cb.h"
  26 #include "unicode/utf16.h"
  27 #include "putilimp.h"
  28 #include "ucnv_bld.h"
  29 #include "ucnv_cnv.h"
  30 #include "uassert.h"
  31
  32 /* BOCU-1 constants and macros ---------------------------------------------- */
  33
  34 /*
  35  * BOCU-1 encodes the code points of a Unicode string as
  36  * a sequence of byte-encoded differences (slope detection),
  37  * preserving lexical order.
  38  *
  39  * Optimize the difference-taking for runs of Unicode text within
  40  * small scripts:
  41  *
  42  * Most small scripts are allocated within aligned 128-blocks of Unicode
  43  * code points. Lexical order is preserved if the "previous code point" state
  44  * is always moved into the middle of such a block.
  45  *
  46  * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
  47  * areas into the middle of those areas.
  48  *
  49  * C0 control codes and space are encoded with their US-ASCII bytes.
  50  * "prev" is reset for C0 controls but not for space.
  51  */
  52
  53 /* initial value for "prev": middle of the ASCII range */
  54 #define BOCU1_ASCII_PREV        0x40
  55
  56 /* bounding byte values for differences */
  57 #define BOCU1_MIN               0x21
  58 #define BOCU1_MIDDLE            0x90
  59 #define BOCU1_MAX_LEAD          0xfe
  60 #define BOCU1_MAX_TRAIL         0xff
  61 #define BOCU1_RESET             0xff
  62
  63 /* number of lead bytes: 0xfe-0x21+1 = 222 */
  64 #define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)
  65
  66 /* adjust trail byte counts for the use of some C0 control byte values; the offset is 0x21-20 = 13 */
  67 #define BOCU1_TRAIL_CONTROLS_COUNT  20
  68 #define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)
  69
  70 /* number of trail bytes: (0xff-0x21+1)+20 = 243 */
  71 #define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)
  72
  73 /*
  74  * number of positive and negative single-byte codes
  75  * (counting 0==BOCU1_MIDDLE among the positive ones)
  76  */
  77 #define BOCU1_SINGLE            64
  78
  79 /* number of lead bytes for positive and negative 2/3/4-byte sequences */
  80 #define BOCU1_LEAD_2            43
  81 #define BOCU1_LEAD_3            3
  82 #define BOCU1_LEAD_4            1
  83
  84 /* The difference value range for single-byters: -64..63. */
  85 #define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
  86 #define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)
  87
  88 /* The difference value range for double-byters: -10513..10512. */
  89 #define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
  90 #define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
  91
  92 /* The difference value range for 3-byters: -187660..187659. */
  93 #define BOCU1_REACH_POS_3   \
  94     (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
  95
  96 #define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
  97
  98 /* The lead byte start values: 0xd0, 0xfb, 0xfe for positive differences. */
  99 #define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
 100 #define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
 101 #define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
 102      /* ==BOCU1_MAX_LEAD */
 103 /* The lead byte start values for negative differences: 0x50, 0x25, 0x22. */
 104 #define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
 105 #define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
 106 #define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
 107      /* ==BOCU1_MIN+1 */
 108
 109 /* The length (1..4) of a byte sequence, according to the lead byte (!=BOCU1_RESET); lead is evaluated several times. */
 110 #define BOCU1_LENGTH_FROM_LEAD(lead) \
 111     ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
 112      (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
 113      (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)
 114
 115 /* The length of a byte sequence, according to its packed form: the top byte holds the count 1..3, anything larger means 4 bytes. */
 116 #define BOCU1_LENGTH_FROM_PACKED(packed) \
 117     ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
 118
 119 /*
 120  * 12 commonly used C0 control codes (and space) are only used to encode
 121  * themselves directly,
 122  * which makes BOCU-1 MIME-usable and reasonably safe for
 123  * ASCII-oriented software.
 124  *
 125  * These controls are
 126  *  0   NUL
 127  *
 128  *  7   BEL
 129  *  8   BS
 130  *
 131  *  9   TAB
 132  *  a   LF
 133  *  b   VT
 134  *  c   FF
 135  *  d   CR
 136  *
 137  *  e   SO
 138  *  f   SI
 139  *
 140  * 1a   SUB
 141  * 1b   ESC
 142  *
 143  * The other 20 C0 controls are also encoded directly (to preserve order)
 144  * but are also used as trail bytes in difference encoding
 145  * (for better compression).
 146  */
 147 #define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t]) /* t is evaluated more than once */
 148
 149 /*
 150  * Lookup from an external byte value 0x00..0x20 to the trail byte value
 151  * 0..19 (0..0x13) that it stands for in the difference calculation.
 152  * Byte values that are illegal as trail bytes yield -1.
 153  * Each full row below covers eight consecutive byte values.
 154  */
 155 static const int8_t bocu1ByteToTrail[BOCU1_MIN]=
 156 {
 157     /* bytes 0x00..0x07 */
 158     -1, 0, 1, 2, 3, 4, 5, -1,
 159
 160     /* bytes 0x08..0x0f: none of them is a trail byte */
 161     -1, -1, -1, -1, -1, -1, -1, -1,
 162
 163     /* bytes 0x10..0x17 */
 164     6, 7, 8, 9, 10, 11, 12, 13,
 165
 166     /* bytes 0x18..0x1f */
 167     14, 15, -1, -1, 16, 17, 18, 19,
 168
 169     /* byte 0x20 */
 170     -1
 171 };
 172
 173 /*
 174  * Inverse lookup: from a trail byte value 0..19 (0..0x13) as used in the
 175  * difference calculation to the external control byte that carries it.
 176  * Every entry lies within 0x01..0x1f.
 177  */
 178 static const int8_t bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]=
 179 {
 180     /* trail values 0x00..0x07 */
 181     0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
 182
 183     /* trail values 0x08..0x0f */
 184     0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
 185
 186     /* trail values 0x10..0x13 */
 187     0x1c, 0x1d, 0x1e, 0x1f
 188 };
 189
 190 /**
 191  * Integer division and modulo with negative numerators
 192  * yield negative modulo results and quotients that are one more than
 193  * what we need here.
 194  * This macro adjusts the results so that the modulo-value m is always >=0.
 195  * Every argument is evaluated more than once; the expansion is a brace block.
 196  * For positive n, the if() condition is always FALSE.
 197  *
 198  * @param n Number to be split into quotient and rest.
 199  *          Will be modified to contain the quotient.
 200  * @param d Divisor.
 201  * @param m Output variable for the rest (modulo result).
 202  */
 203 #define NEGDIVMOD(n, d, m) { \
 204     (m)=(n)%(d); \
 205     (n)/=(d); \
 206     if((m)<0) { \
 207         --(n); \
 208         (m)+=(d); \
 209     } \
 210 }
 211
 212 /* Faster versions of packDiff() for single-byte-encoded diff values. */
 213
 214 /** Is a diff value encodable in a single byte, i.e. within BOCU1_REACH_NEG_1..BOCU1_REACH_POS_1? */
 215 #define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)
 216
 217 /** Encode a diff value in a single byte: the byte is the offset from BOCU1_MIDDLE. */
 218 #define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))
 219
 220 /** Is a diff value encodable in at most two bytes? (Also true for single-byte values.) */
 221 #define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)
 222
 223 /* BOCU-1 implementation functions ------------------------------------------ */
 224 /** Default "prev": offset BOCU1_ASCII_PREV within the aligned 128-block that contains c. */
 225 #define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
 226
 227 /**
 228  * Pick the "previous" state value that the next code point will be
 229  * differenced against, based on the code point that was just handled.
 230  *
 231  * @param c current code point, 0x3040..0xd7a3 (BOCU1_PREV() handles the rest)
 232  * @return "previous code point" state value
 233  */
 234 static inline int32_t
 235 bocu1Prev(int32_t c) {
 236     /* default, for mostly small scripts: the middle of c's 128-block */
 237     int32_t next=BOCU1_SIMPLE_PREV(c);
 238
 239     if(c<=0x309f) {
 240         /* Hiragana is not 128-aligned; the lower bound 0x3040 is not tested here */
 241         next=0x3070;
 242     } else if(c>=0xac00) {
 243         /* Korean Hangul; the upper bound 0xd7a3 is not tested here */
 244         next=(0xac00+0xd7a3)/2;
 245     } else if(c>=0x4e00 && c<=0x9fa5) {
 246         /* CJK Unihan */
 247         next=0x4e00-BOCU1_REACH_NEG_2;
 248     }
 249     return next;
 250 }
 251
 252 /** Fast version of bocu1Prev() for most scripts: only 0x3040..0xd7a3 goes through the function; c is evaluated several times. */
 253 #define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
 254
 255 /*
 256  * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
 257  * The UConverter fields are used as follows:
 258  *
 259  * fromUnicodeStatus    encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
 260  *
 261  * toUnicodeStatus      decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
 262  * mode                 decoder's incomplete (diff<<2)|count (ignored when toULength==0)
 263  */
 264
 265 /* BOCU-1-from-Unicode conversion functions --------------------------------- */
 266
 267 /**
 268  * Encode a difference -0x10ffff..0x10ffff in 2..4 bytes
 269  * and return a packed integer with them.
 270  * Single-byte differences must be handled by the caller (see the U_ASSERT).
 271  * The encoding favors small absolute differences with short encodings
 272  * to compress runs of same-script characters.
 273  *
 274  * Optimized version with unrolled loops and fewer division/modulo operations
 275  * than the standard packDiff().
 276  *
 277  * @param diff difference value -0x10ffff..0x10ffff, outside the single-byte range
 278  * @return
 279  *      0x010000zz for 1-byte sequence zz (not produced: that branch is under #if 0)
 280  *      0x0200yyzz for 2-byte sequence yy zz
 281  *      0x03xxyyzz for 3-byte sequence xx yy zz
 282  *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
 283  */
 284 static int32_t
 285 packDiff(int32_t diff) {
 286     int32_t result, m;
 287
 288     U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */
 289     if(diff>=BOCU1_REACH_NEG_1) {
 290         /* mostly positive differences, and single-byte negative ones */
 291 #if 0   /* single-byte case handled in macros, see below */
 292         if(diff<=BOCU1_REACH_POS_1) {
 293             /* single byte */
 294             return 0x01000000|(BOCU1_MIDDLE+diff);
 295         } else
 296 #endif
 297         if(diff<=BOCU1_REACH_POS_2) {
 298             /* two bytes */
 299             diff-=BOCU1_REACH_POS_1+1;
 300             result=0x02000000;
 301
 302             m=diff%BOCU1_TRAIL_COUNT;
 303             diff/=BOCU1_TRAIL_COUNT;
 304             result|=BOCU1_TRAIL_TO_BYTE(m);
 305
 306             result|=(BOCU1_START_POS_2+diff)<<8;
 307         } else if(diff<=BOCU1_REACH_POS_3) {
 308             /* three bytes */
 309             diff-=BOCU1_REACH_POS_2+1;
 310             result=0x03000000;
 311
 312             m=diff%BOCU1_TRAIL_COUNT;
 313             diff/=BOCU1_TRAIL_COUNT;
 314             result|=BOCU1_TRAIL_TO_BYTE(m);
 315
 316             m=diff%BOCU1_TRAIL_COUNT;
 317             diff/=BOCU1_TRAIL_COUNT;
 318             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
 319
 320             result|=(BOCU1_START_POS_3+diff)<<16;
 321         } else {
 322             /* four bytes */
 323             diff-=BOCU1_REACH_POS_3+1;
 324
 325             m=diff%BOCU1_TRAIL_COUNT;
 326             diff/=BOCU1_TRAIL_COUNT;
 327             result=BOCU1_TRAIL_TO_BYTE(m);
 328
 329             m=diff%BOCU1_TRAIL_COUNT;
 330             diff/=BOCU1_TRAIL_COUNT;
 331             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
 332
 333             /*
 334              * We know that / and % would deliver quotient 0 and rest=diff.
 335              * Avoid division and modulo for performance.
 336              */
 337             result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
 338
 339             result|=((uint32_t)BOCU1_START_POS_4)<<24;
 340         }
 341     } else {
 342         /* two- to four-byte negative differences */
 343         if(diff>=BOCU1_REACH_NEG_2) {
 344             /* two bytes */
 345             diff-=BOCU1_REACH_NEG_1;
 346             result=0x02000000;
 347
 348             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
 349             result|=BOCU1_TRAIL_TO_BYTE(m);
 350
 351             result|=(BOCU1_START_NEG_2+diff)<<8;
 352         } else if(diff>=BOCU1_REACH_NEG_3) {
 353             /* three bytes */
 354             diff-=BOCU1_REACH_NEG_2;
 355             result=0x03000000;
 356
 357             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
 358             result|=BOCU1_TRAIL_TO_BYTE(m);
 359
 360             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
 361             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
 362
 363             result|=(BOCU1_START_NEG_3+diff)<<16;
 364         } else {
 365             /* four bytes */
 366             diff-=BOCU1_REACH_NEG_3;
 367
 368             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
 369             result=BOCU1_TRAIL_TO_BYTE(m);
 370
 371             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
 372             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
 373
 374             /*
 375              * We know that NEGDIVMOD would deliver
 376              * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
 377              * Avoid division and modulo for performance.
 378              */
 379             m=diff+BOCU1_TRAIL_COUNT;
 380             result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
 381
 382             result|=BOCU1_MIN<<24;
 383         }
 384     }
 385     return result;
 386 }
 387 /* Convert the UTF-16 input in pArgs to BOCU-1 and store, for every output byte, the source index of the code point it encodes.
 388  * Bytes that do not fit go to cnv->charErrorBuffer with U_BUFFER_OVERFLOW_ERROR; an unfinished lead surrogate stays in cnv->fromUChar32. */
 389 static void
 390 _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
 391                              UErrorCode *pErrorCode) {
 392     UConverter *cnv;
 393     const UChar *source, *sourceLimit;
 394     uint8_t *target;
 395     int32_t targetCapacity;
 396     int32_t *offsets;
 397
 398     int32_t prev, c, diff;
 399
 400     int32_t sourceIndex, nextSourceIndex;
 401
 402     /* set up the local pointers */
 403     cnv=pArgs->converter;
 404     source=pArgs->source;
 405     sourceLimit=pArgs->sourceLimit;
 406     target=(uint8_t *)pArgs->target;
 407     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
 408     offsets=pArgs->offsets;
 409
 410     /* get the converter state from UConverter */
 411     c=cnv->fromUChar32;
 412     prev=(int32_t)cnv->fromUnicodeStatus;
 413     if(prev==0) {
 414         prev=BOCU1_ASCII_PREV;
 415     }
 416
 417     /* sourceIndex=-1 if the current character began in the previous buffer */
 418     sourceIndex= c==0 ? 0 : -1;
 419     nextSourceIndex=0;
 420
 421     /* conversion loop: first resume a lead surrogate that the previous call left pending */
 422     if(c!=0) {
 423         if(targetCapacity>0) { goto getTrail; }
 424         c=-c; /* no target space to resume now: mark it incomplete so that the state write-back below keeps it pending */
 425     }
 426 fastSingle:
 427     /* fast loop for single-byte differences */
 428     /* use only one loop counter variable, targetCapacity, not also source */
 429     diff=(int32_t)(sourceLimit-source);
 430     if(targetCapacity>diff) {
 431         targetCapacity=diff;
 432     }
 433     while(targetCapacity>0 && (c=*source)<0x3000) {
 434         if(c<=0x20) {
 435             if(c!=0x20) {
 436                 prev=BOCU1_ASCII_PREV;
 437             }
 438             *target++=(uint8_t)c;
 439             *offsets++=nextSourceIndex++;
 440             ++source;
 441             --targetCapacity;
 442         } else {
 443             diff=c-prev;
 444             if(DIFF_IS_SINGLE(diff)) {
 445                 prev=BOCU1_SIMPLE_PREV(c);
 446                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
 447                 *offsets++=nextSourceIndex++;
 448                 ++source;
 449                 --targetCapacity;
 450             } else {
 451                 break;
 452             }
 453         }
 454     }
 455     /* restore real values */
 456     targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
 457     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
 458
 459     /* regular loop for all cases */
 460     while(source<sourceLimit) {
 461         if(targetCapacity>0) {
 462             c=*source++;
 463             ++nextSourceIndex;
 464
 465             if(c<=0x20) {
 466                 /*
 467                  * ISO C0 control & space:
 468                  * Encode directly for MIME compatibility,
 469                  * and reset state except for space, to not disrupt compression.
 470                  */
 471                 if(c!=0x20) {
 472                     prev=BOCU1_ASCII_PREV;
 473                 }
 474                 *target++=(uint8_t)c;
 475                 *offsets++=sourceIndex;
 476                 --targetCapacity;
 477
 478                 sourceIndex=nextSourceIndex;
 479                 continue;
 480             }
 481
 482             if(U16_IS_LEAD(c)) {
 483 getTrail:
 484                 if(source<sourceLimit) {
 485                     /* test the following code unit */
 486                     UChar trail=*source;
 487                     if(U16_IS_TRAIL(trail)) {
 488                         ++source;
 489                         ++nextSourceIndex;
 490                         c=U16_GET_SUPPLEMENTARY(c, trail);
 491                     }
 492                 } else {
 493                     /* no more input */
 494                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
 495                     break;
 496                 }
 497             }
 498
 499             /*
 500              * all other Unicode code points c==U+0021..U+10ffff
 501              * are encoded with the difference c-prev
 502              *
 503              * a new prev is computed from c,
 504              * placed in the middle of a 0x80-block (for most small scripts) or
 505              * in the middle of the Unihan and Hangul blocks
 506              * to statistically minimize the following difference
 507              */
 508             diff=c-prev;
 509             prev=BOCU1_PREV(c);
 510             if(DIFF_IS_SINGLE(diff)) {
 511                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
 512                 *offsets++=sourceIndex;
 513                 --targetCapacity;
 514                 sourceIndex=nextSourceIndex;
 515                 if(c<0x3000) {
 516                     goto fastSingle;
 517                 }
 518             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
 519                 /* optimize 2-byte case */
 520                 int32_t m;
 521
 522                 if(diff>=0) {
 523                     diff-=BOCU1_REACH_POS_1+1;
 524                     m=diff%BOCU1_TRAIL_COUNT;
 525                     diff/=BOCU1_TRAIL_COUNT;
 526                     diff+=BOCU1_START_POS_2;
 527                 } else {
 528                     diff-=BOCU1_REACH_NEG_1;
 529                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
 530                     diff+=BOCU1_START_NEG_2;
 531                 }
 532                 *target++=(uint8_t)diff;
 533                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
 534                 *offsets++=sourceIndex;
 535                 *offsets++=sourceIndex;
 536                 targetCapacity-=2;
 537                 sourceIndex=nextSourceIndex;
 538             } else {
 539                 int32_t length; /* will be 2..4 */
 540
 541                 diff=packDiff(diff);
 542                 length=BOCU1_LENGTH_FROM_PACKED(diff);
 543
 544                 /* write the output character bytes from diff and length */
 545                 /* from the first if in the loop we know that targetCapacity>0 */
 546                 if(length<=targetCapacity) {
 547                     switch(length) {
 548                         /* each branch falls through to the next one */
 549                     case 4:
 550                         *target++=(uint8_t)(diff>>24);
 551                         *offsets++=sourceIndex;
 552                         U_FALLTHROUGH;
 553                     case 3:
 554                         *target++=(uint8_t)(diff>>16);
 555                         *offsets++=sourceIndex;
 556                         U_FALLTHROUGH;
 557                     case 2:
 558                         *target++=(uint8_t)(diff>>8);
 559                         *offsets++=sourceIndex;
 560                     /* case 1: handled above */
 561                         *target++=(uint8_t)diff;
 562                         *offsets++=sourceIndex;
 563                         U_FALLTHROUGH;
 564                     default:
 565                         /* will never occur */
 566                         break;
 567                     }
 568                     targetCapacity-=length;
 569                     sourceIndex=nextSourceIndex;
 570                 } else {
 571                     uint8_t *charErrorBuffer;
 572
 573                     /*
 574                      * We actually do this backwards here:
 575                      * In order to save an intermediate variable, we output
 576                      * first to the overflow buffer what does not fit into the
 577                      * regular target.
 578                      */
 579                     /* we know that 1<=targetCapacity<length<=4 */
 580                     length-=targetCapacity;
 581                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
 582                     switch(length) {
 583                         /* each branch falls through to the next one */
 584                     case 3:
 585                         *charErrorBuffer++=(uint8_t)(diff>>16);
 586                         U_FALLTHROUGH;
 587                     case 2:
 588                         *charErrorBuffer++=(uint8_t)(diff>>8);
 589                         U_FALLTHROUGH;
 590                     case 1:
 591                         *charErrorBuffer=(uint8_t)diff;
 592                         U_FALLTHROUGH;
 593                     default:
 594                         /* will never occur */
 595                         break;
 596                     }
 597                     cnv->charErrorBufferLength=(int8_t)length;
 598
 599                     /* now output what fits into the regular target */
 600                     diff>>=8*length; /* length was reduced by targetCapacity */
 601                     switch(targetCapacity) {
 602                         /* each branch falls through to the next one */
 603                     case 3:
 604                         *target++=(uint8_t)(diff>>16);
 605                         *offsets++=sourceIndex;
 606                         U_FALLTHROUGH;
 607                     case 2:
 608                         *target++=(uint8_t)(diff>>8);
 609                         *offsets++=sourceIndex;
 610                         U_FALLTHROUGH;
 611                     case 1:
 612                         *target++=(uint8_t)diff;
 613                         *offsets++=sourceIndex;
 614                         U_FALLTHROUGH;
 615                     default:
 616                         /* will never occur */
 617                         break;
 618                     }
 619
 620                     /* target overflow */
 621                     targetCapacity=0;
 622                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 623                     break;
 624                 }
 625             }
 626         } else {
 627             /* target is full */
 628             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 629             break;
 630         }
 631     }
 632
 633     /* set the converter state back into UConverter: a negative c is a lead surrogate that is still incomplete */
 634     cnv->fromUChar32= c<0 ? -c : 0;
 635     cnv->fromUnicodeStatus=(uint32_t)prev;
 636
 637     /* write back the updated pointers */
 638     pArgs->source=source;
 639     pArgs->target=(char *)target;
 640     pArgs->offsets=offsets;
 641 }
 642
 643 /*
 644  * Same conversion as _Bocu1FromUnicodeWithOffsets but without offset handling.
 645  * If a change is made in either function, then either change the other one
 646  * the same way or re-copy it, adding or removing the variables
 647  * offsets, sourceIndex, and nextSourceIndex.
 648  * A lead surrogate pending in cnv->fromUChar32 stays pending if the target is full.
 649  */
 650 static void
 651 _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
 652                   UErrorCode *pErrorCode) {
 653     UConverter *cnv;
 654     const UChar *source, *sourceLimit;
 655     uint8_t *target;
 656     int32_t targetCapacity;
 657
 658     int32_t prev, c, diff;
 659
 660     /* set up the local pointers */
 661     cnv=pArgs->converter;
 662     source=pArgs->source;
 663     sourceLimit=pArgs->sourceLimit;
 664     target=(uint8_t *)pArgs->target;
 665     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
 666
 667     /* get the converter state from UConverter */
 668     c=cnv->fromUChar32;
 669     prev=(int32_t)cnv->fromUnicodeStatus;
 670     if(prev==0) {
 671         prev=BOCU1_ASCII_PREV;
 672     }
 673
 674     /* conversion loop: first resume a lead surrogate that the previous call left pending */
 675     if(c!=0) {
 676         if(targetCapacity>0) { goto getTrail; }
 677         c=-c; /* no target space to resume now: mark it incomplete so that the state write-back below keeps it pending */
 678     }
 679 fastSingle:
 680     /* fast loop for single-byte differences */
 681     /* use only one loop counter variable, targetCapacity, not also source */
 682     diff=(int32_t)(sourceLimit-source);
 683     if(targetCapacity>diff) {
 684         targetCapacity=diff;
 685     }
 686     while(targetCapacity>0 && (c=*source)<0x3000) {
 687         if(c<=0x20) {
 688             if(c!=0x20) {
 689                 prev=BOCU1_ASCII_PREV;
 690             }
 691             *target++=(uint8_t)c;
 692         } else {
 693             diff=c-prev;
 694             if(DIFF_IS_SINGLE(diff)) {
 695                 prev=BOCU1_SIMPLE_PREV(c);
 696                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
 697             } else {
 698                 break;
 699             }
 700         }
 701         ++source;
 702         --targetCapacity;
 703     }
 704     /* restore real values */
 705     targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
 706
 707     /* regular loop for all cases */
 708     while(source<sourceLimit) {
 709         if(targetCapacity>0) {
 710             c=*source++;
 711
 712             if(c<=0x20) {
 713                 /*
 714                  * ISO C0 control & space:
 715                  * Encode directly for MIME compatibility,
 716                  * and reset state except for space, to not disrupt compression.
 717                  */
 718                 if(c!=0x20) {
 719                     prev=BOCU1_ASCII_PREV;
 720                 }
 721                 *target++=(uint8_t)c;
 722                 --targetCapacity;
 723                 continue;
 724             }
 725
 726             if(U16_IS_LEAD(c)) {
 727 getTrail:
 728                 if(source<sourceLimit) {
 729                     /* test the following code unit */
 730                     UChar trail=*source;
 731                     if(U16_IS_TRAIL(trail)) {
 732                         ++source;
 733                         c=U16_GET_SUPPLEMENTARY(c, trail);
 734                     }
 735                 } else {
 736                     /* no more input */
 737                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
 738                     break;
 739                 }
 740             }
 741
 742             /*
 743              * all other Unicode code points c==U+0021..U+10ffff
 744              * are encoded with the difference c-prev
 745              *
 746              * a new prev is computed from c,
 747              * placed in the middle of a 0x80-block (for most small scripts) or
 748              * in the middle of the Unihan and Hangul blocks
 749              * to statistically minimize the following difference
 750              */
 751             diff=c-prev;
 752             prev=BOCU1_PREV(c);
 753             if(DIFF_IS_SINGLE(diff)) {
 754                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
 755                 --targetCapacity;
 756                 if(c<0x3000) {
 757                     goto fastSingle;
 758                 }
 759             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
 760                 /* optimize 2-byte case */
 761                 int32_t m;
 762
 763                 if(diff>=0) {
 764                     diff-=BOCU1_REACH_POS_1+1;
 765                     m=diff%BOCU1_TRAIL_COUNT;
 766                     diff/=BOCU1_TRAIL_COUNT;
 767                     diff+=BOCU1_START_POS_2;
 768                 } else {
 769                     diff-=BOCU1_REACH_NEG_1;
 770                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
 771                     diff+=BOCU1_START_NEG_2;
 772                 }
 773                 *target++=(uint8_t)diff;
 774                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
 775                 targetCapacity-=2;
 776             } else {
 777                 int32_t length; /* will be 2..4 */
 778
 779                 diff=packDiff(diff);
 780                 length=BOCU1_LENGTH_FROM_PACKED(diff);
 781
 782                 /* write the output character bytes from diff and length */
 783                 /* from the first if in the loop we know that targetCapacity>0 */
 784                 if(length<=targetCapacity) {
 785                     switch(length) {
 786                         /* each branch falls through to the next one */
 787                     case 4:
 788                         *target++=(uint8_t)(diff>>24);
 789                         U_FALLTHROUGH;
 790                     case 3:
 791                         *target++=(uint8_t)(diff>>16);
 792                     /* case 2: handled above */
 793                         *target++=(uint8_t)(diff>>8);
 794                     /* case 1: handled above */
 795                         *target++=(uint8_t)diff;
 796                         U_FALLTHROUGH;
 797                     default:
 798                         /* will never occur */
 799                         break;
 800                     }
 801                     targetCapacity-=length;
 802                 } else {
 803                     uint8_t *charErrorBuffer;
 804
 805                     /*
 806                      * We actually do this backwards here:
 807                      * In order to save an intermediate variable, we output
 808                      * first to the overflow buffer what does not fit into the
 809                      * regular target.
 810                      */
 811                     /* we know that 1<=targetCapacity<length<=4 */
 812                     length-=targetCapacity;
 813                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
 814                     switch(length) {
 815                         /* each branch falls through to the next one */
 816                     case 3:
 817                         *charErrorBuffer++=(uint8_t)(diff>>16);
 818                         U_FALLTHROUGH;
 819                     case 2:
 820                         *charErrorBuffer++=(uint8_t)(diff>>8);
 821                         U_FALLTHROUGH;
 822                     case 1:
 823                         *charErrorBuffer=(uint8_t)diff;
 824                         U_FALLTHROUGH;
 825                     default:
 826                         /* will never occur */
 827                         break;
 828                     }
 829                     cnv->charErrorBufferLength=(int8_t)length;
 830
 831                     /* now output what fits into the regular target */
 832                     diff>>=8*length; /* length was reduced by targetCapacity */
 833                     switch(targetCapacity) {
 834                         /* each branch falls through to the next one */
 835                     case 3:
 836                         *target++=(uint8_t)(diff>>16);
 837                         U_FALLTHROUGH;
 838                     case 2:
 839                         *target++=(uint8_t)(diff>>8);
 840                         U_FALLTHROUGH;
 841                     case 1:
 842                         *target++=(uint8_t)diff;
 843                         U_FALLTHROUGH;
 844                     default:
 845                         /* will never occur */
 846                         break;
 847                     }
 848
 849                     /* target overflow */
 850                     targetCapacity=0;
 851                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 852                     break;
 853                 }
 854             }
 855         } else {
 856             /* target is full */
 857             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 858             break;
 859         }
 860     }
 861
 862     /* set the converter state back into UConverter: a negative c is a lead surrogate that is still incomplete */
 863     cnv->fromUChar32= c<0 ? -c : 0;
 864     cnv->fromUnicodeStatus=(uint32_t)prev;
 865
 866     /* write back the updated pointers */
 867     pArgs->source=source;
 868     pArgs->target=(char *)target;
 869 }
 870
 871 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
 872
 873 /**
 874  * Function for BOCU-1 decoder; handles multi-byte lead bytes.
 875  *
 876  * @param b lead byte;
 877  *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
 878  * @return (diff<<2)|count
 879  */
 880 static inline int32_t
 881 decodeBocu1LeadByte(int32_t b) {
 882     int32_t diff, count;
 883
 884     if(b>=BOCU1_START_NEG_2) {
 885         /* positive difference */
 886         if(b<BOCU1_START_POS_3) {
 887             /* two bytes */
 888             diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
 889             count=1;
 890         } else if(b<BOCU1_START_POS_4) {
 891             /* three bytes */
 892             diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
 893             count=2;
 894         } else {
 895             /* four bytes */
 896             diff=BOCU1_REACH_POS_3+1;
 897             count=3;
 898         }
 899     } else {
 900         /* negative difference */
 901         if(b>=BOCU1_START_NEG_3) {
 902             /* two bytes */
 903             diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
 904             count=1;
 905         } else if(b>BOCU1_MIN) {
 906             /* three bytes */
 907             diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
 908             count=2;
 909         } else {
 910             /* four bytes */
 911             diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
 912             count=3;
 913         }
 914     }
 915
 916     /* return the state for decoding the trail byte(s) */
 917     return (diff<<2)|count;
 918 }
 919
 920 /**
 921  * Function for BOCU-1 decoder; handles multi-byte trail bytes.
 922  *
 923  * @param count number of remaining trail bytes including this one
 924  * @param b trail byte
 925  * @return new delta for diff including b - <0 indicates an error
 926  *
 927  * @see decodeBocu1
 928  */
 929 static inline int32_t
 930 decodeBocu1TrailByte(int32_t count, int32_t b) {
 931     if(b<=0x20) {
 932         /* skip some C0 controls and make the trail byte range contiguous */
 933         b=bocu1ByteToTrail[b];
 934         /* b<0 for an illegal trail byte value will result in return<0 below */
 935 #if BOCU1_MAX_TRAIL<0xff
 936     } else if(b>BOCU1_MAX_TRAIL) {
 937         return -99;
 938 #endif
 939     } else {
 940         b-=BOCU1_TRAIL_BYTE_OFFSET;
 941     }
 942
 943     /* add trail byte into difference and decrement count */
 944     if(count==1) {
 945         return b;
 946     } else if(count==2) {
 947         return b*BOCU1_TRAIL_COUNT;
 948     } else /* count==3 */ {
 949         return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
 950     }
 951 }
 952
 953 static void
 954 _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
 955                            UErrorCode *pErrorCode) {
 956     UConverter *cnv;
 957     const uint8_t *source, *sourceLimit;
 958     UChar *target;
 959     const UChar *targetLimit;
 960     int32_t *offsets;
 961
 962     int32_t prev, count, diff, c;
 963
 964     int8_t byteIndex;
 965     uint8_t *bytes;
 966
 967     int32_t sourceIndex, nextSourceIndex;
 968
 969     /* set up the local pointers */
 970     cnv=pArgs->converter;
 971     source=(const uint8_t *)pArgs->source;
 972     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
 973     target=pArgs->target;
 974     targetLimit=pArgs->targetLimit;
 975     offsets=pArgs->offsets;
 976
 977     /* get the converter state from UConverter */
 978     prev=(int32_t)cnv->toUnicodeStatus;
 979     if(prev==0) {
 980         prev=BOCU1_ASCII_PREV;
 981     }
 982     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
 983     count=diff&3;
 984     diff>>=2;
 985
 986     byteIndex=cnv->toULength;
 987     bytes=cnv->toUBytes;
 988
 989     /* sourceIndex=-1 if the current character began in the previous buffer */
 990     sourceIndex=byteIndex==0 ? 0 : -1;
 991     nextSourceIndex=0;
 992
 993     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
 994     if(count>0 && byteIndex>0 && target<targetLimit) {
 995         goto getTrail;
 996     }
 997
 998 fastSingle:
 999     /* fast loop for single-byte differences */
1000     /* use count as the only loop counter variable */
1001     diff=(int32_t)(sourceLimit-source);
1002     count=(int32_t)(pArgs->targetLimit-target);
1003     if(count>diff) {
1004         count=diff;
1005     }
1006     while(count>0) {
1007         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
1008             c=prev+(c-BOCU1_MIDDLE);
1009             if(c<0x3000) {
1010                 *target++=(UChar)c;
1011                 *offsets++=nextSourceIndex++;
1012                 prev=BOCU1_SIMPLE_PREV(c);
1013             } else {
1014                 break;
1015             }
1016         } else if(c<=0x20) {
1017             if(c!=0x20) {
1018                 prev=BOCU1_ASCII_PREV;
1019             }
1020             *target++=(UChar)c;
1021             *offsets++=nextSourceIndex++;
1022         } else {
1023             break;
1024         }
1025         ++source;
1026         --count;
1027     }
1028     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
1029
1030     /* decode a sequence of single and lead bytes */
1031     while(source<sourceLimit) {
1032         if(target>=targetLimit) {
1033             /* target is full */
1034             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1035             break;
1036         }
1037
1038         ++nextSourceIndex;
1039         c=*source++;
1040         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1041             /* Write a code point directly from a single-byte difference. */
1042             c=prev+(c-BOCU1_MIDDLE);
1043             if(c<0x3000) {
1044                 *target++=(UChar)c;
1045                 *offsets++=sourceIndex;
1046                 prev=BOCU1_SIMPLE_PREV(c);
1047                 sourceIndex=nextSourceIndex;
1048                 goto fastSingle;
1049             }
1050         } else if(c<=0x20) {
1051             /*
1052              * Direct-encoded C0 control code or space.
1053              * Reset prev for C0 control codes but not for space.
1054              */
1055             if(c!=0x20) {
1056                 prev=BOCU1_ASCII_PREV;
1057             }
1058             *target++=(UChar)c;
1059             *offsets++=sourceIndex;
1060             sourceIndex=nextSourceIndex;
1061             continue;
1062         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1063             /* Optimize two-byte case. */
1064             if(c>=BOCU1_MIDDLE) {
1065                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1066             } else {
1067                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1068             }
1069
1070             /* trail byte */
1071             ++nextSourceIndex;
1072             c=decodeBocu1TrailByte(1, *source++);
1073             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1074                 bytes[0]=source[-2];
1075                 bytes[1]=source[-1];
1076                 byteIndex=2;
1077                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1078                 break;
1079             }
1080         } else if(c==BOCU1_RESET) {
1081             /* only reset the state, no code point */
1082             prev=BOCU1_ASCII_PREV;
1083             sourceIndex=nextSourceIndex;
1084             continue;
1085         } else {
1086             /*
1087              * For multi-byte difference lead bytes, set the decoder state
1088              * with the partial difference value from the lead byte and
1089              * with the number of trail bytes.
1090              */
1091             bytes[0]=(uint8_t)c;
1092             byteIndex=1;
1093
1094             diff=decodeBocu1LeadByte(c);
1095             count=diff&3;
1096             diff>>=2;
1097 getTrail:
1098             for(;;) {
1099                 if(source>=sourceLimit) {
1100                     goto endloop;
1101                 }
1102                 ++nextSourceIndex;
1103                 c=bytes[byteIndex++]=*source++;
1104
1105                 /* trail byte in any position */
1106                 c=decodeBocu1TrailByte(count, c);
1107                 if(c<0) {
1108                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1109                     goto endloop;
1110                 }
1111
1112                 diff+=c;
1113                 if(--count==0) {
1114                     /* final trail byte, deliver a code point */
1115                     byteIndex=0;
1116                     c=prev+diff;
1117                     if((uint32_t)c>0x10ffff) {
1118                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1119                         goto endloop;
1120                     }
1121                     break;
1122                 }
1123             }
1124         }
1125
1126         /* calculate the next prev and output c */
1127         prev=BOCU1_PREV(c);
1128         if(c<=0xffff) {
1129             *target++=(UChar)c;
1130             *offsets++=sourceIndex;
1131         } else {
1132             /* output surrogate pair */
1133             *target++=U16_LEAD(c);
1134             if(target<targetLimit) {
1135                 *target++=U16_TRAIL(c);
1136                 *offsets++=sourceIndex;
1137                 *offsets++=sourceIndex;
1138             } else {
1139                 /* target overflow */
1140                 *offsets++=sourceIndex;
1141                 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
1142                 cnv->UCharErrorBufferLength=1;
1143                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1144                 break;
1145             }
1146         }
1147         sourceIndex=nextSourceIndex;
1148     }
1149 endloop:
1150
1151     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1152         /* set the converter state in UConverter to deal with the next character */
1153         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1154         cnv->mode=0;
1155     } else {
1156         /* set the converter state back into UConverter */
1157         cnv->toUnicodeStatus=(uint32_t)prev;
1158         cnv->mode=(diff<<2)|count;
1159     }
1160     cnv->toULength=byteIndex;
1161
1162     /* write back the updated pointers */
1163     pArgs->source=(const char *)source;
1164     pArgs->target=target;
1165     pArgs->offsets=offsets;
1166     return;
1167 }
1168
1169 /*
1170  * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
1171  * If a change is made in the original function, then either
1172  * change this function the same way or
1173  * re-copy the original function and remove the variables
1174  * offsets, sourceIndex, and nextSourceIndex.
1175  */
1176 static void
1177 _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
1178                 UErrorCode *pErrorCode) {
1179     UConverter *cnv;
1180     const uint8_t *source, *sourceLimit;
1181     UChar *target;
1182     const UChar *targetLimit;
1183
1184     int32_t prev, count, diff, c;
1185
1186     int8_t byteIndex;
1187     uint8_t *bytes;
1188
1189     /* set up the local pointers */
1190     cnv=pArgs->converter;
1191     source=(const uint8_t *)pArgs->source;
1192     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1193     target=pArgs->target;
1194     targetLimit=pArgs->targetLimit;
1195
1196     /* get the converter state from UConverter */
1197     prev=(int32_t)cnv->toUnicodeStatus;
1198     if(prev==0) {
1199         prev=BOCU1_ASCII_PREV;
1200     }
1201     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
1202     count=diff&3;
1203     diff>>=2;
1204
1205     byteIndex=cnv->toULength;
1206     bytes=cnv->toUBytes;
1207
1208     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
1209     if(count>0 && byteIndex>0 && target<targetLimit) {
1210         goto getTrail;
1211     }
1212
1213 fastSingle:
1214     /* fast loop for single-byte differences */
1215     /* use count as the only loop counter variable */
1216     diff=(int32_t)(sourceLimit-source);
1217     count=(int32_t)(pArgs->targetLimit-target);
1218     if(count>diff) {
1219         count=diff;
1220     }
1221     while(count>0) {
1222         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
1223             c=prev+(c-BOCU1_MIDDLE);
1224             if(c<0x3000) {
1225                 *target++=(UChar)c;
1226                 prev=BOCU1_SIMPLE_PREV(c);
1227             } else {
1228                 break;
1229             }
1230         } else if(c<=0x20) {
1231             if(c!=0x20) {
1232                 prev=BOCU1_ASCII_PREV;
1233             }
1234             *target++=(UChar)c;
1235         } else {
1236             break;
1237         }
1238         ++source;
1239         --count;
1240     }
1241
1242     /* decode a sequence of single and lead bytes */
1243     while(source<sourceLimit) {
1244         if(target>=targetLimit) {
1245             /* target is full */
1246             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1247             break;
1248         }
1249
1250         c=*source++;
1251         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1252             /* Write a code point directly from a single-byte difference. */
1253             c=prev+(c-BOCU1_MIDDLE);
1254             if(c<0x3000) {
1255                 *target++=(UChar)c;
1256                 prev=BOCU1_SIMPLE_PREV(c);
1257                 goto fastSingle;
1258             }
1259         } else if(c<=0x20) {
1260             /*
1261              * Direct-encoded C0 control code or space.
1262              * Reset prev for C0 control codes but not for space.
1263              */
1264             if(c!=0x20) {
1265                 prev=BOCU1_ASCII_PREV;
1266             }
1267             *target++=(UChar)c;
1268             continue;
1269         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1270             /* Optimize two-byte case. */
1271             if(c>=BOCU1_MIDDLE) {
1272                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1273             } else {
1274                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1275             }
1276
1277             /* trail byte */
1278             c=decodeBocu1TrailByte(1, *source++);
1279             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1280                 bytes[0]=source[-2];
1281                 bytes[1]=source[-1];
1282                 byteIndex=2;
1283                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1284                 break;
1285             }
1286         } else if(c==BOCU1_RESET) {
1287             /* only reset the state, no code point */
1288             prev=BOCU1_ASCII_PREV;
1289             continue;
1290         } else {
1291             /*
1292              * For multi-byte difference lead bytes, set the decoder state
1293              * with the partial difference value from the lead byte and
1294              * with the number of trail bytes.
1295              */
1296             bytes[0]=(uint8_t)c;
1297             byteIndex=1;
1298
1299             diff=decodeBocu1LeadByte(c);
1300             count=diff&3;
1301             diff>>=2;
1302 getTrail:
1303             for(;;) {
1304                 if(source>=sourceLimit) {
1305                     goto endloop;
1306                 }
1307                 c=bytes[byteIndex++]=*source++;
1308
1309                 /* trail byte in any position */
1310                 c=decodeBocu1TrailByte(count, c);
1311                 if(c<0) {
1312                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1313                     goto endloop;
1314                 }
1315
1316                 diff+=c;
1317                 if(--count==0) {
1318                     /* final trail byte, deliver a code point */
1319                     byteIndex=0;
1320                     c=prev+diff;
1321                     if((uint32_t)c>0x10ffff) {
1322                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1323                         goto endloop;
1324                     }
1325                     break;
1326                 }
1327             }
1328         }
1329
1330         /* calculate the next prev and output c */
1331         prev=BOCU1_PREV(c);
1332         if(c<=0xffff) {
1333             *target++=(UChar)c;
1334         } else {
1335             /* output surrogate pair */
1336             *target++=U16_LEAD(c);
1337             if(target<targetLimit) {
1338                 *target++=U16_TRAIL(c);
1339             } else {
1340                 /* target overflow */
1341                 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
1342                 cnv->UCharErrorBufferLength=1;
1343                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1344                 break;
1345             }
1346         }
1347     }
1348 endloop:
1349
1350     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1351         /* set the converter state in UConverter to deal with the next character */
1352         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1353         cnv->mode=0;
1354     } else {
1355         /* set the converter state back into UConverter */
1356         cnv->toUnicodeStatus=(uint32_t)prev;
1357         cnv->mode=(diff<<2)|count;
1358     }
1359     cnv->toULength=byteIndex;
1360
1361     /* write back the updated pointers */
1362     pArgs->source=(const char *)source;
1363     pArgs->target=target;
1364     return;
1365 }
1366
1367 /* miscellaneous ------------------------------------------------------------ */
1368
/*
 * Implementation table for the BOCU-1 converter.
 * The initializer is positional, so the order of the entries must match the
 * field order of UConverterImpl, which is declared outside this file
 * (presumably in ucnv_cnv.h -- TODO confirm). NULL entries leave the
 * corresponding slot unset.
 */
static const UConverterImpl _Bocu1Impl={
    UCNV_BOCU1,

    NULL,
    NULL,

    NULL,
    NULL,
    NULL,

    /* to-Unicode and from-Unicode entry points, each without and with offsets */
    _Bocu1ToUnicode,
    _Bocu1ToUnicodeWithOffsets,
    _Bocu1FromUnicode,
    _Bocu1FromUnicodeWithOffsets,
    NULL,

    NULL,
    NULL,
    NULL,
    NULL,
    ucnv_getCompleteUnicodeSet,

    NULL,
    NULL
};
1394
/*
 * Static description of the BOCU-1 converter: name, CCSID, converter type,
 * bytes-per-UChar limits and the substitution byte sequence.
 * The initializer is positional and must follow the field order of
 * UConverterStaticData, which is declared outside this file.
 */
static const UConverterStaticData _Bocu1StaticData={
    sizeof(UConverterStaticData),
    "BOCU-1",
    1214, /* CCSID for BOCU-1 */
    UCNV_IBM, UCNV_BOCU1,
    1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
    { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
    FALSE, FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
1407
/*
 * Shared data object for BOCU-1, built by
 * UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER from the static description
 * (_Bocu1StaticData) and the implementation table (_Bocu1Impl).
 */
const UConverterSharedData _Bocu1Data=
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Bocu1StaticData, &_Bocu1Impl);
1410
1411 #endif