icuSources/test/cintltst/bocu1tst.c

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 ******************************************************************************
   5 *
   6 *   Copyright (C) 2002-2015, International Business Machines
   7 *   Corporation and others.  All Rights Reserved.
   8 *
   9 ******************************************************************************
  10 *   file name:  bocu1tst.c
  11 *   encoding:   UTF-8
  12 *   tab size:   8 (not used)
  13 *   indentation:4
  14 *
  15 *   created on: 2002may27
  16 *   created by: Markus W. Scherer
  17 *
  18 *   This is the reference implementation of BOCU-1,
  19 *   the MIME-friendly form of the Binary Ordered Compression for Unicode,
  20 *   taken directly from ### http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/bocu1/
  21 *   The files bocu1.h and bocu1.c from the design folder are taken
  22 *   verbatim (minus copyright and #include) and copied together into this file.
*   The reference code and parts of the reference bocu1tst.c
*   are modified to run as part of the ICU cintltst
*   test framework (minus main(), log_ln() etc. instead of printf()).
  26 *
  27 *   This reference implementation is used here to verify
  28 *   the ICU BOCU-1 implementation, which is
  29 *   adapted for ICU conversion APIs and optimized.
  30 *   ### links in design doc to here and to ucnvbocu.c
  31 */
  32
  33 #include "unicode/utypes.h"
  34 #include "unicode/ustring.h"
  35 #include "unicode/ucnv.h"
  36 #include "unicode/utf16.h"
  37 #include "cmemory.h"
  38 #include "cintltst.h"
  39
  40 /* icuhtml/design/conversion/bocu1/bocu1.h ---------------------------------- */
  41
  42 /* BOCU-1 constants and macros ---------------------------------------------- */
  43
  44 /*
  45  * BOCU-1 encodes the code points of a Unicode string as
  46  * a sequence of byte-encoded differences (slope detection),
  47  * preserving lexical order.
  48  *
  49  * Optimize the difference-taking for runs of Unicode text within
  50  * small scripts:
  51  *
  52  * Most small scripts are allocated within aligned 128-blocks of Unicode
  53  * code points. Lexical order is preserved if the "previous code point" state
  54  * is always moved into the middle of such a block.
  55  *
  56  * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
  57  * areas into the middle of those areas.
  58  *
  59  * C0 control codes and space are encoded with their US-ASCII bytes.
  60  * "prev" is reset for C0 controls but not for space.
  61  */
  62
/* initial value for "prev": middle of the ASCII range */
#define BOCU1_ASCII_PREV        0x40

/*
 * bounding byte values for differences:
 * BOCU1_MIN..BOCU1_MAX_LEAD are the lead bytes of difference sequences,
 * BOCU1_MIDDLE is the single byte that encodes a difference of 0
 */
#define BOCU1_MIN               0x21
#define BOCU1_MIDDLE            0x90
#define BOCU1_MAX_LEAD          0xfe

/* add the L suffix to make computations with BOCU1_MAX_TRAIL work on 16-bit compilers */
#define BOCU1_MAX_TRAIL         0xffL
/* in lead position this byte only resets the decoder's "prev" state, see decodeBocu1() */
#define BOCU1_RESET             0xff

/* number of lead bytes (0xfe-0x21+1 == 222) */
#define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/*
 * adjust trail byte counts for the use of some C0 control byte values:
 * 20 control byte values double as trail bytes, so trail values 20 and up
 * map to bytes BOCU1_MIN and up via BOCU1_TRAIL_BYTE_OFFSET (0x21-20 == 13)
 */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* number of trail bytes ((0xff-0x21+1)+20 == 243); has type long because of BOCU1_MAX_TRAIL */
#define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * number of positive and negative single-byte codes
 * (counting 0==BOCU1_MIDDLE among the positive ones)
 */
#define BOCU1_SINGLE            64

/*
 * number of lead bytes for positive and negative 2/3/4-byte sequences
 * (2*(64+43+3+1) == 222 == BOCU1_COUNT)
 */
#define BOCU1_LEAD_2            43
#define BOCU1_LEAD_3            3
#define BOCU1_LEAD_4            1

/* The difference value range for single-byters (-64..63). */
#define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)

/* The difference value range for double-byters (-10513..10512). */
#define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* The difference value range for 3-byters (-187660..187659). */
#define BOCU1_REACH_POS_3   \
    (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

#define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/*
 * The lead byte start values.
 * Positive side: 0xd0 (2 bytes), 0xfb (3 bytes), 0xfe (4 bytes).
 */
#define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
     /* ==BOCU1_MAX_LEAD */

/*
 * Negative side: 0x50 is the first single-byte lead, 0x25 the first 2-byte lead,
 * 0x22 the first 3-byte lead; the only 4-byte negative lead is BOCU1_MIN (0x21).
 */
#define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
     /* ==BOCU1_MIN+1 */
 120
/*
 * The length of a byte sequence, according to the lead byte (!=BOCU1_RESET).
 * Evaluates "lead" several times, so pass an expression without side effects.
 * Any value outside BOCU1_START_NEG_4..BOCU1_START_POS_4-1 yields 4, which makes
 * the result meaningful only for difference lead bytes BOCU1_MIN..BOCU1_MAX_LEAD
 * (not for directly encoded bytes 0x00..0x20).
 */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

/*
 * The length of a byte sequence, according to its packed form (see packDiff()).
 * For 1..3-byte sequences the top byte of the packed value holds the length;
 * any larger top byte is itself the lead byte of a 4-byte sequence.
 * A packed value of 0 (the encodeBocu1() error result) yields length 0.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
 130
 131 /*
 132  * 12 commonly used C0 control codes (and space) are only used to encode
 133  * themselves directly,
 134  * which makes BOCU-1 MIME-usable and reasonably safe for
 135  * ASCII-oriented software.
 136  *
 137  * These controls are
 138  *  0   NUL
 139  *
 140  *  7   BEL
 141  *  8   BS
 142  *
 143  *  9   TAB
 144  *  a   LF
 145  *  b   VT
 146  *  c   FF
 147  *  d   CR
 148  *
 149  *  e   SO
 150  *  f   SI
 151  *
 152  * 1a   SUB
 153  * 1b   ESC
 154  *
 155  * The other 20 C0 controls are also encoded directly (to preserve order)
 156  * but are also used as trail bytes in difference encoding
 157  * (for better compression).
 158  */
/*
 * Map a trail value to its external byte value:
 * values below BOCU1_TRAIL_CONTROLS_COUNT are looked up in bocu1TrailToByte[],
 * all others are shifted up by BOCU1_TRAIL_BYTE_OFFSET.
 * Evaluates "t" more than once.
 */
#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
 160
 161 /*
 162  * Byte value map for control codes,
 163  * from external byte values 0x00..0x20
 164  * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
 165  * External byte values that are illegal as trail bytes are mapped to -1.
 166  */
 167 static const int8_t
 168 bocu1ByteToTrail[BOCU1_MIN]={
 169 /*  0     1     2     3     4     5     6     7    */
 170     -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
 171
 172 /*  8     9     a     b     c     d     e     f    */
 173     -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
 174
 175 /*  10    11    12    13    14    15    16    17   */
 176     0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
 177
 178 /*  18    19    1a    1b    1c    1d    1e    1f   */
 179     0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,
 180
 181 /*  20   */
 182     -1
 183 };
 184
 185 /*
 186  * Byte value map for control codes,
 187  * from trail byte values 0..19 (0..0x13) as used in the difference calculation
 188  * to external byte values 0x00..0x20.
 189  */
 190 static const int8_t
 191 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
 192 /*  0     1     2     3     4     5     6     7    */
 193     0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
 194
 195 /*  8     9     a     b     c     d     e     f    */
 196     0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
 197
 198 /*  10    11    12    13   */
 199     0x1c, 0x1d, 0x1e, 0x1f
 200 };
 201
/**
 * Integer division and modulo with negative numerators
 * yield negative modulo results and quotients that are one more than
 * what we need here.
 * This macro adjusts the results so that the modulo-value m is always >=0.
 *
 * For positive n, the if() condition is always FALSE.
 *
 * All three arguments are evaluated more than once, and n and m are assigned to,
 * so pass plain variables for them.
 *
 * @param n Number to be split into quotient and rest.
 *          Will be modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the rest (modulo result).
 */
#define NEGDIVMOD(n, d, m) UPRV_BLOCK_MACRO_BEGIN { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        --(n); \
        (m)+=(d); \
    } \
} UPRV_BLOCK_MACRO_END
 223
 224 /* State for BOCU-1 decoder function. */
 225 struct Bocu1Rx {
 226     int32_t prev, count, diff;
 227 };
 228
 229 typedef struct Bocu1Rx Bocu1Rx;
 230
 231 /* Function prototypes ------------------------------------------------------ */
 232
 233 /* see bocu1.c */
 234 U_CFUNC int32_t
 235 packDiff(int32_t diff);
 236
 237 U_CFUNC int32_t
 238 encodeBocu1(int32_t *pPrev, int32_t c);
 239
 240 U_CFUNC int32_t
 241 decodeBocu1(Bocu1Rx *pRx, uint8_t b);
 242
 243 /* icuhtml/design/conversion/bocu1/bocu1.c ---------------------------------- */
 244
 245 /* BOCU-1 implementation functions ------------------------------------------ */
 246
 247 /**
 248  * Compute the next "previous" value for differencing
 249  * from the current code point.
 250  *
 251  * @param c current code point, 0..0x10ffff
 252  * @return "previous code point" state value
 253  */
 254 static int32_t
 255 bocu1Prev(int32_t c) {
 256     /* compute new prev */
 257     if(0x3040<=c && c<=0x309f) {
 258         /* Hiragana is not 128-aligned */
 259         return 0x3070;
 260     } else if(0x4e00<=c && c<=0x9fa5) {
 261         /* CJK Unihan */
 262         return 0x4e00-BOCU1_REACH_NEG_2;
 263     } else if(0xac00<=c && c<=0xd7a3) {
 264         /* Korean Hangul (cast to int32_t to avoid wraparound on 16-bit compilers) */
 265         return ((int32_t)0xd7a3+(int32_t)0xac00)/2;
 266     } else {
 267         /* mostly small scripts */
 268         return (c&~0x7f)+BOCU1_ASCII_PREV;
 269     }
 270 }
 271
 272 /**
 273  * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
 274  * and return a packed integer with them.
 275  *
 276  * The encoding favors small absolut differences with short encodings
 277  * to compress runs of same-script characters.
 278  *
 279  * @param diff difference value -0x10ffff..0x10ffff
 280  * @return
 281  *      0x010000zz for 1-byte sequence zz
 282  *      0x0200yyzz for 2-byte sequence yy zz
 283  *      0x03xxyyzz for 3-byte sequence xx yy zz
 284  *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
 285  */
 286 U_CFUNC int32_t
 287 packDiff(int32_t diff) {
 288     int32_t result, m, lead, count, shift;
 289
 290     if(diff>=BOCU1_REACH_NEG_1) {
 291         /* mostly positive differences, and single-byte negative ones */
 292         if(diff<=BOCU1_REACH_POS_1) {
 293             /* single byte */
 294             return 0x01000000|(BOCU1_MIDDLE+diff);
 295         } else if(diff<=BOCU1_REACH_POS_2) {
 296             /* two bytes */
 297             diff-=BOCU1_REACH_POS_1+1;
 298             lead=BOCU1_START_POS_2;
 299             count=1;
 300         } else if(diff<=BOCU1_REACH_POS_3) {
 301             /* three bytes */
 302             diff-=BOCU1_REACH_POS_2+1;
 303             lead=BOCU1_START_POS_3;
 304             count=2;
 305         } else {
 306             /* four bytes */
 307             diff-=BOCU1_REACH_POS_3+1;
 308             lead=BOCU1_START_POS_4;
 309             count=3;
 310         }
 311     } else {
 312         /* two- and four-byte negative differences */
 313         if(diff>=BOCU1_REACH_NEG_2) {
 314             /* two bytes */
 315             diff-=BOCU1_REACH_NEG_1;
 316             lead=BOCU1_START_NEG_2;
 317             count=1;
 318         } else if(diff>=BOCU1_REACH_NEG_3) {
 319             /* three bytes */
 320             diff-=BOCU1_REACH_NEG_2;
 321             lead=BOCU1_START_NEG_3;
 322             count=2;
 323         } else {
 324             /* four bytes */
 325             diff-=BOCU1_REACH_NEG_3;
 326             lead=BOCU1_START_NEG_4;
 327             count=3;
 328         }
 329     }
 330
 331     /* encode the length of the packed result */
 332     if(count<3) {
 333         result=(count+1)<<24;
 334     } else /* count==3, MSB used for the lead byte */ {
 335         result=0;
 336     }
 337
 338     /* calculate trail bytes like digits in itoa() */
 339     shift=0;
 340     do {
 341         NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
 342         result|=BOCU1_TRAIL_TO_BYTE(m)<<shift;
 343         shift+=8;
 344     } while(--count>0);
 345
 346     /* add lead byte */
 347     result |= (uint32_t)(lead+diff)<<shift;
 348
 349     return result;
 350 }
 351
 352 /**
 353  * BOCU-1 encoder function.
 354  *
 355  * @param pPrev pointer to the integer that holds
 356  *        the "previous code point" state;
 357  *        the initial value should be 0 which
 358  *        encodeBocu1 will set to the actual BOCU-1 initial state value
 359  * @param c the code point to encode
 360  * @return the packed 1/2/3/4-byte encoding, see packDiff(),
 361  *         or 0 if an error occurs
 362  *
 363  * @see packDiff
 364  */
 365 U_CFUNC int32_t
 366 encodeBocu1(int32_t *pPrev, int32_t c) {
 367     int32_t prev;
 368
 369     if(pPrev==NULL || c<0 || c>0x10ffff) {
 370         /* illegal argument */
 371         return 0;
 372     }
 373
 374     prev=*pPrev;
 375     if(prev==0) {
 376         /* lenient handling of initial value 0 */
 377         prev=*pPrev=BOCU1_ASCII_PREV;
 378     }
 379
 380     if(c<=0x20) {
 381         /*
 382          * ISO C0 control & space:
 383          * Encode directly for MIME compatibility,
 384          * and reset state except for space, to not disrupt compression.
 385          */
 386         if(c!=0x20) {
 387             *pPrev=BOCU1_ASCII_PREV;
 388         }
 389         return 0x01000000|c;
 390     }
 391
 392     /*
 393      * all other Unicode code points c==U+0021..U+10ffff
 394      * are encoded with the difference c-prev
 395      *
 396      * a new prev is computed from c,
 397      * placed in the middle of a 0x80-block (for most small scripts) or
 398      * in the middle of the Unihan and Hangul blocks
 399      * to statistically minimize the following difference
 400      */
 401     *pPrev=bocu1Prev(c);
 402     return packDiff(c-prev);
 403 }
 404
 405 /**
 406  * Function for BOCU-1 decoder; handles multi-byte lead bytes.
 407  *
 408  * @param pRx pointer to the decoder state structure
 409  * @param b lead byte;
 410  *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<=BOCU1_MAX_LEAD
 411  * @return -1 (state change only)
 412  *
 413  * @see decodeBocu1
 414  */
 415 static int32_t
 416 decodeBocu1LeadByte(Bocu1Rx *pRx, uint8_t b) {
 417     int32_t c, count;
 418
 419     if(b>=BOCU1_START_NEG_2) {
 420         /* positive difference */
 421         if(b<BOCU1_START_POS_3) {
 422             /* two bytes */
 423             c=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
 424             count=1;
 425         } else if(b<BOCU1_START_POS_4) {
 426             /* three bytes */
 427             c=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
 428             count=2;
 429         } else {
 430             /* four bytes */
 431             c=BOCU1_REACH_POS_3+1;
 432             count=3;
 433         }
 434     } else {
 435         /* negative difference */
 436         if(b>=BOCU1_START_NEG_3) {
 437             /* two bytes */
 438             c=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
 439             count=1;
 440         } else if(b>BOCU1_MIN) {
 441             /* three bytes */
 442             c=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
 443             count=2;
 444         } else {
 445             /* four bytes */
 446             c=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
 447             count=3;
 448         }
 449     }
 450
 451     /* set the state for decoding the trail byte(s) */
 452     pRx->diff=c;
 453     pRx->count=count;
 454     return -1;
 455 }
 456
 457 /**
 458  * Function for BOCU-1 decoder; handles multi-byte trail bytes.
 459  *
 460  * @param pRx pointer to the decoder state structure
 461  * @param b trail byte
 462  * @return result value, same as decodeBocu1
 463  *
 464  * @see decodeBocu1
 465  */
 466 static int32_t
 467 decodeBocu1TrailByte(Bocu1Rx *pRx, uint8_t b) {
 468     int32_t t, c, count;
 469
 470     if(b<=0x20) {
 471         /* skip some C0 controls and make the trail byte range contiguous */
 472         t=bocu1ByteToTrail[b];
 473         if(t<0) {
 474             /* illegal trail byte value */
 475             pRx->prev=BOCU1_ASCII_PREV;
 476             pRx->count=0;
 477             return -99;
 478         }
 479 #if BOCU1_MAX_TRAIL<0xff
 480     } else if(b>BOCU1_MAX_TRAIL) {
 481         return -99;
 482 #endif
 483     } else {
 484         t=(int32_t)b-BOCU1_TRAIL_BYTE_OFFSET;
 485     }
 486
 487     /* add trail byte into difference and decrement count */
 488     c=pRx->diff;
 489     count=pRx->count;
 490
 491     if(count==1) {
 492         /* final trail byte, deliver a code point */
 493         c=pRx->prev+c+t;
 494         if(0<=c && c<=0x10ffff) {
 495             /* valid code point result */
 496             pRx->prev=bocu1Prev(c);
 497             pRx->count=0;
 498             return c;
 499         } else {
 500             /* illegal code point result */
 501             pRx->prev=BOCU1_ASCII_PREV;
 502             pRx->count=0;
 503             return -99;
 504         }
 505     }
 506
 507     /* intermediate trail byte */
 508     if(count==2) {
 509         pRx->diff=c+t*BOCU1_TRAIL_COUNT;
 510     } else /* count==3 */ {
 511         pRx->diff=c+t*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT;
 512     }
 513     pRx->count=count-1;
 514     return -1;
 515 }
 516
 517 /**
 518  * BOCU-1 decoder function.
 519  *
 520  * @param pRx pointer to the decoder state structure;
 521  *        the initial values should be 0 which
 522  *        decodeBocu1 will set to actual initial state values
 523  * @param b an input byte
 524  * @return
 525  *      0..0x10ffff for a result code point
 526  *      -1 if only the state changed without code point output
 527  *     <-1 if an error occurs
 528  */
 529 U_CFUNC int32_t
 530 decodeBocu1(Bocu1Rx *pRx, uint8_t b) {
 531     int32_t prev, c, count;
 532
 533     if(pRx==NULL) {
 534         /* illegal argument */
 535         return -99;
 536     }
 537
 538     prev=pRx->prev;
 539     if(prev==0) {
 540         /* lenient handling of initial 0 values */
 541         prev=pRx->prev=BOCU1_ASCII_PREV;
 542         count=pRx->count=0;
 543     } else {
 544         count=pRx->count;
 545     }
 546
 547     if(count==0) {
 548         /* byte in lead position */
 549         if(b<=0x20) {
 550             /*
 551              * Direct-encoded C0 control code or space.
 552              * Reset prev for C0 control codes but not for space.
 553              */
 554             if(b!=0x20) {
 555                 pRx->prev=BOCU1_ASCII_PREV;
 556             }
 557             return b;
 558         }
 559
 560         /*
 561          * b is a difference lead byte.
 562          *
 563          * Return a code point directly from a single-byte difference.
 564          *
 565          * For multi-byte difference lead bytes, set the decoder state
 566          * with the partial difference value from the lead byte and
 567          * with the number of trail bytes.
 568          *
 569          * For four-byte differences, the signedness also affects the
 570          * first trail byte, which has special handling farther below.
 571          */
 572         if(b>=BOCU1_START_NEG_2 && b<BOCU1_START_POS_2) {
 573             /* single-byte difference */
 574             c=prev+((int32_t)b-BOCU1_MIDDLE);
 575             pRx->prev=bocu1Prev(c);
 576             return c;
 577         } else if(b==BOCU1_RESET) {
 578             /* only reset the state, no code point */
 579             pRx->prev=BOCU1_ASCII_PREV;
 580             return -1;
 581         } else {
 582             return decodeBocu1LeadByte(pRx, b);
 583         }
 584     } else {
 585         /* trail byte in any position */
 586         return decodeBocu1TrailByte(pRx, b);
 587     }
 588 }
 589
 590 /* icuhtml/design/conversion/bocu1/bocu1tst.c ------------------------------- */
 591
 592 /* test code ---------------------------------------------------------------- */
 593
 594 /* test code options */
 595
/*
 * ignore comma when processing name lists in testText()
 * NOTE(review): neither testText() nor any use of this macro appears in the
 * part of the file reviewed here - confirm that it is still needed.
 */
#define TEST_IGNORE_COMMA       1
 598
 599 /**
 600  * Write a packed BOCU-1 byte sequence into a byte array,
 601  * without overflow check.
 602  * Test function.
 603  *
 604  * @param packed packed BOCU-1 byte sequence, see packDiff()
 605  * @param p pointer to byte array
 606  * @return number of bytes
 607  *
 608  * @see packDiff
 609  */
 610 static int32_t
 611 writePacked(int32_t packed, uint8_t *p) {
 612     int32_t count=BOCU1_LENGTH_FROM_PACKED(packed);
 613     switch(count) {
 614     case 4:
 615         *p++=(uint8_t)(packed>>24);
 616     case 3:
 617         *p++=(uint8_t)(packed>>16);
 618     case 2:
 619         *p++=(uint8_t)(packed>>8);
 620     case 1:
 621         *p++=(uint8_t)packed;
 622     default:
 623         break;
 624     }
 625
 626     return count;
 627 }
 628
 629 /**
 630  * Unpack a packed BOCU-1 non-C0/space byte sequence and get
 631  * the difference to initialPrev.
 632  * Used only for round-trip testing of the difference encoding and decoding.
 633  * Test function.
 634  *
 635  * @param initialPrev bogus "previous code point" value to make sure that
 636  *                    the resulting code point is in the range 0..0x10ffff
 637  * @param packed packed BOCU-1 byte sequence
 638  * @return the difference to initialPrev
 639  *
 640  * @see packDiff
 641  * @see writeDiff
 642  */
 643 static int32_t
 644 unpackDiff(int32_t initialPrev, int32_t packed) {
 645     Bocu1Rx rx={ 0, 0, 0 };
 646     int32_t count;
 647
 648     rx.prev=initialPrev;
 649     count=BOCU1_LENGTH_FROM_PACKED(packed);
 650     switch(count) {
 651     case 4:
 652         decodeBocu1(&rx, (uint8_t)(packed>>24));
 653     case 3:
 654         decodeBocu1(&rx, (uint8_t)(packed>>16));
 655     case 2:
 656         decodeBocu1(&rx, (uint8_t)(packed>>8));
 657     case 1:
 658         /* subtract initial prev */
 659         return decodeBocu1(&rx, (uint8_t)packed)-initialPrev;
 660     default:
 661         return -0x7fffffff;
 662     }
 663 }
 664
 665 /**
 666  * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes,
 667  * preserving lexical order.
 668  * Also checks for roundtripping of the difference encoding.
 669  * Test function.
 670  *
 671  * @param diff difference value to test, -0x10ffff..0x10ffff
 672  * @param p pointer to output byte array
 673  * @return p advanced by number of bytes output
 674  *
 675  * @see unpackDiff
 676  */
 677 static uint8_t *
 678 writeDiff(int32_t diff, uint8_t *p) {
 679     /* generate the difference as a packed value and serialize it */
 680     int32_t packed, initialPrev;
 681
 682     packed=packDiff(diff);
 683
 684     /*
 685      * bogus initial "prev" to work around
 686      * code point range check in decodeBocu1()
 687      */
 688     if(diff<=0) {
 689         initialPrev=0x10ffff;
 690     } else {
 691         initialPrev=-1;
 692     }
 693
 694     if(diff!=unpackDiff(initialPrev, packed)) {
 695         log_err("error: unpackDiff(packDiff(diff=%ld)=0x%08lx)=%ld!=diff\n",
 696                 diff, packed, unpackDiff(initialPrev, packed));
 697     }
 698     return p+writePacked(packed, p);
 699 }
 700
 701 /**
 702  * Encode a UTF-16 string in BOCU-1.
 703  * Does not check for overflows, but otherwise useful function.
 704  *
 705  * @param s input UTF-16 string
 706  * @param length number of UChar code units in s
 707  * @param p pointer to output byte array
 708  * @return number of bytes output
 709  */
 710 static int32_t
 711 writeString(const UChar *s, int32_t length, uint8_t *p) {
 712     uint8_t *p0;
 713     int32_t c, prev, i;
 714
 715     prev=0;
 716     p0=p;
 717     i=0;
 718     while(i<length) {
 719         U16_NEXT(s, i, length, c);
 720         p+=writePacked(encodeBocu1(&prev, c), p);
 721     }
 722     return (int32_t)(p-p0);
 723 }
 724
 725 /**
 726  * Decode a BOCU-1 byte sequence to a UTF-16 string.
 727  * Does not check for overflows, but otherwise useful function.
 728  *
 729  * @param p pointer to input BOCU-1 bytes
 730  * @param length number of input bytes
 731  * @param s point to output UTF-16 string array
 732  * @return number of UChar code units output
 733  */
 734 static int32_t
 735 readString(const uint8_t *p, int32_t length, UChar *s) {
 736     Bocu1Rx rx={ 0, 0, 0 };
 737     int32_t c, i, sLength;
 738
 739     i=sLength=0;
 740     while(i<length) {
 741         c=decodeBocu1(&rx, p[i++]);
 742         if(c<-1) {
 743             log_err("error: readString detects encoding error at string index %ld\n", i);
 744             return -1;
 745         }
 746         if(c>=0) {
 747             U16_APPEND_UNSAFE(s, sLength, c);
 748         }
 749     }
 750     return sLength;
 751 }
 752
 753 static char
 754 hexDigit(uint8_t digit) {
 755     return digit<=9 ? (char)('0'+digit) : (char)('a'-10+digit);
 756 }
 757
 758 /**
 759  * Pretty-print 0-terminated byte values.
 760  * Helper function for test output.
 761  *
 762  * @param bytes 0-terminated byte array to print
 763  */
 764 static void
 765 printBytes(uint8_t *bytes, char *out) {
 766     int i;
 767     uint8_t b;
 768
 769     i=0;
 770     while((b=*bytes++)!=0) {
 771         *out++=' ';
 772         *out++=hexDigit((uint8_t)(b>>4));
 773         *out++=hexDigit((uint8_t)(b&0xf));
 774         ++i;
 775     }
 776     i=3*(5-i);
 777     while(i>0) {
 778         *out++=' ';
 779         --i;
 780     }
 781     *out=0;
 782 }
 783
 784 /**
 785  * Basic BOCU-1 test function, called when there are no command line arguments.
 786  * Prints some of the #define values and performs round-trip tests of the
 787  * difference encoding and decoding.
 788  */
 789 static void
 790 TestBOCU1RefDiff(void) {
 791     char buf1[80], buf2[80];
 792     uint8_t prev[5], level[5];
 793     int32_t i, cmp, countErrors;
 794
 795     log_verbose("reach of single bytes: %ld\n", 1+BOCU1_REACH_POS_1-BOCU1_REACH_NEG_1);
 796     log_verbose("reach of 2 bytes     : %ld\n", 1+BOCU1_REACH_POS_2-BOCU1_REACH_NEG_2);
 797     log_verbose("reach of 3 bytes     : %ld\n\n", 1+BOCU1_REACH_POS_3-BOCU1_REACH_NEG_3);
 798
 799     log_verbose("    BOCU1_REACH_NEG_1 %8ld    BOCU1_REACH_POS_1 %8ld\n", BOCU1_REACH_NEG_1, BOCU1_REACH_POS_1);
 800     log_verbose("    BOCU1_REACH_NEG_2 %8ld    BOCU1_REACH_POS_2 %8ld\n", BOCU1_REACH_NEG_2, BOCU1_REACH_POS_2);
 801     log_verbose("    BOCU1_REACH_NEG_3 %8ld    BOCU1_REACH_POS_3 %8ld\n\n", BOCU1_REACH_NEG_3, BOCU1_REACH_POS_3);
 802
 803     log_verbose("    BOCU1_MIDDLE      0x%02x\n", BOCU1_MIDDLE);
 804     log_verbose("    BOCU1_START_NEG_2 0x%02x    BOCU1_START_POS_2 0x%02x\n", BOCU1_START_NEG_2, BOCU1_START_POS_2);
 805     log_verbose("    BOCU1_START_NEG_3 0x%02x    BOCU1_START_POS_3 0x%02x\n\n", BOCU1_START_NEG_3, BOCU1_START_POS_3);
 806
 807     /* test packDiff() & unpackDiff() with some specific values */
 808     writeDiff(0, level);
 809     writeDiff(1, level);
 810     writeDiff(65, level);
 811     writeDiff(130, level);
 812     writeDiff(30000, level);
 813     writeDiff(1000000, level);
 814     writeDiff(-65, level);
 815     writeDiff(-130, level);
 816     writeDiff(-30000, level);
 817     writeDiff(-1000000, level);
 818
 819     /* test that each value is smaller than any following one */
 820     countErrors=0;
 821     i=-0x10ffff;
 822     *writeDiff(i, prev)=0;
 823
 824     /* show first number and bytes */
 825     printBytes(prev, buf1);
 826     log_verbose("              wD(%8ld)                    %s\n", i, buf1);
 827
 828     for(++i; i<=0x10ffff; ++i) {
 829         *writeDiff(i, level)=0;
 830         cmp=strcmp((const char *)prev, (const char *)level);
 831         if(BOCU1_LENGTH_FROM_LEAD(level[0])!=(int32_t)strlen((const char *)level)) {
 832             log_verbose("BOCU1_LENGTH_FROM_LEAD(0x%02x)=%ld!=%ld=strlen(writeDiff(%ld))\n",
 833                    level[0], BOCU1_LENGTH_FROM_LEAD(level[0]), strlen((const char *)level), i);
 834         }
 835         if(cmp<0) {
 836             if(i==0 || i==1 || strlen((const char *)prev)!=strlen((const char *)level)) {
 837                 /*
 838                  * if the result is good, then print only if the length changed
 839                  * to get little but interesting output
 840                  */
 841                 printBytes(prev, buf1);
 842                 printBytes(level, buf2);
 843                 log_verbose("ok:    strcmp(wD(%8ld), wD(%8ld))=%2d  %s%s\n", i-1, i, cmp, buf1, buf2);
 844             }
 845         } else {
 846             ++countErrors;
 847             printBytes(prev, buf1);
 848             printBytes(level, buf2);
 849             log_verbose("wrong: strcmp(wD(%8ld), wD(%8ld))=%2d  %s%s\n", i-1, i, cmp, buf1, buf2);
 850         }
 851         /* remember the previous bytes */
 852         memcpy(prev, level, 4);
 853     }
 854
 855     /* show last number and bytes */
 856     printBytes((uint8_t *)"", buf1);
 857     printBytes(prev, buf2);
 858     log_verbose("                            wD(%8ld)      %s%s\n", i-1, buf1, buf2);
 859
 860     if(countErrors==0) {
 861         log_verbose("writeDiff(-0x10ffff..0x10ffff) works fine\n");
 862     } else {
 863         log_err("writeDiff(-0x10ffff..0x10ffff) violates lexical ordering in %d cases\n", countErrors);
 864     }
 865
 866     /* output signature byte sequence */
 867     i=0;
 868     writePacked(encodeBocu1(&i, 0xfeff), level);
 869     log_verbose("\nBOCU-1 signature byte sequence: %02x %02x %02x\n",
 870             level[0], level[1], level[2]);
 871 }
 872
 873 /* cintltst code ------------------------------------------------------------ */
 874
/*
 * Capacity of each scratch buffer allocated by roundtripBOCU1():
 * counted in bytes for the BOCU-1 buffers and in UChars for the UTF-16 buffers.
 */
static const int32_t DEFAULT_BUFFER_SIZE = 30000;
 876
 877
 878 /* test one string with the ICU and the reference BOCU-1 implementations */
 879 static void
 880 roundtripBOCU1(UConverter *bocu1, int32_t number, const UChar *text, int32_t length) {
 881     UChar *roundtripRef, *roundtripICU;
 882     char *bocu1Ref, *bocu1ICU;
 883
 884     int32_t bocu1RefLength, bocu1ICULength, roundtripRefLength, roundtripICULength;
 885     UErrorCode errorCode;
 886
 887     roundtripRef = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));
 888     roundtripICU = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));
 889     bocu1Ref = malloc(DEFAULT_BUFFER_SIZE);
 890     bocu1ICU = malloc(DEFAULT_BUFFER_SIZE);
 891
 892     /* Unicode -> BOCU-1 */
 893     bocu1RefLength=writeString(text, length, (uint8_t *)bocu1Ref);
 894
 895     errorCode=U_ZERO_ERROR;
 896     bocu1ICULength=ucnv_fromUChars(bocu1, bocu1ICU, DEFAULT_BUFFER_SIZE, text, length, &errorCode);
 897     if(U_FAILURE(errorCode)) {
 898         log_err("ucnv_fromUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode));
 899         goto cleanup;
 900     }
 901
 902     if(bocu1RefLength!=bocu1ICULength || 0!=uprv_memcmp(bocu1Ref, bocu1ICU, bocu1RefLength)) {
 903         log_err("Unicode(%d)[%d] -> BOCU-1: reference[%d]!=ICU[%d]\n", number, length, bocu1RefLength, bocu1ICULength);
 904         goto cleanup;
 905     }
 906
 907     /* BOCU-1 -> Unicode */
 908     roundtripRefLength=readString((uint8_t *)bocu1Ref, bocu1RefLength, roundtripRef);
 909     if(roundtripRefLength<0) {
 910         goto cleanup; /* readString() found an error and reported it */
 911     }
 912
 913     roundtripICULength=ucnv_toUChars(bocu1, roundtripICU, DEFAULT_BUFFER_SIZE, bocu1ICU, bocu1ICULength, &errorCode);
 914     if(U_FAILURE(errorCode)) {
 915         log_err("ucnv_toUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode));
 916         goto cleanup;
 917     }
 918
 919     if(length!=roundtripRefLength || 0!=u_memcmp(text, roundtripRef, length)) {
 920         log_err("BOCU-1 -> Unicode: original(%d)[%d]!=reference[%d]\n", number, length, roundtripRefLength);
 921         goto cleanup;
 922     }
 923     if(roundtripRefLength!=roundtripICULength || 0!=u_memcmp(roundtripRef, roundtripICU, roundtripRefLength)) {
 924         log_err("BOCU-1 -> Unicode: reference(%d)[%d]!=ICU[%d]\n", number, roundtripRefLength, roundtripICULength);
 925         goto cleanup;
 926     }
 927 cleanup:
 928     free(roundtripRef);
 929     free(roundtripICU);
 930     free(bocu1Ref);
 931     free(bocu1ICU);
 932 }
 933
 934 static const UChar feff[]={ 0xfeff };
 935 static const UChar ascii[]={ 0x61, 0x62, 0x20, 0x63, 0x61 };
 936 static const UChar crlf[]={ 0xd, 0xa, 0x20 };
 937 static const UChar nul[]={ 0 };
 938 static const UChar latin[]={ 0xdf, 0xe6 };
 939 static const UChar devanagari[]={ 0x930, 0x20, 0x918, 0x909 };
 940 static const UChar hiragana[]={ 0x3086, 0x304d, 0x20, 0x3053, 0x4000 };
 941 static const UChar unihan[]={ 0x4e00, 0x7777, 0x20, 0x9fa5, 0x4e00 };
 942 static const UChar hangul[]={ 0xac00, 0xbcde, 0x20, 0xd7a3 };
 943 static const UChar surrogates[]={ 0xdc00, 0xd800 }; /* single surrogates, unmatched! */
 944 static const UChar plane1[]={ 0xd800, 0xdc00 };
 945 static const UChar plane2[]={ 0xd845, 0xdddd };
 946 static const UChar plane15[]={ 0xdbbb, 0xddee, 0x20 };
 947 static const UChar plane16[]={ 0xdbff, 0xdfff };
 948 static const UChar c0[]={ 1, 0xe40, 0x20, 9 };
 949
 950 static const struct {
 951     const UChar *s;
 952     int32_t length;
 953 } strings[]={
 954     { feff,         UPRV_LENGTHOF(feff) },
 955     { ascii,        UPRV_LENGTHOF(ascii) },
 956     { crlf,         UPRV_LENGTHOF(crlf) },
 957     { nul,          UPRV_LENGTHOF(nul) },
 958     { latin,        UPRV_LENGTHOF(latin) },
 959     { devanagari,   UPRV_LENGTHOF(devanagari) },
 960     { hiragana,     UPRV_LENGTHOF(hiragana) },
 961     { unihan,       UPRV_LENGTHOF(unihan) },
 962     { hangul,       UPRV_LENGTHOF(hangul) },
 963     { surrogates,   UPRV_LENGTHOF(surrogates) },
 964     { plane1,       UPRV_LENGTHOF(plane1) },
 965     { plane2,       UPRV_LENGTHOF(plane2) },
 966     { plane15,      UPRV_LENGTHOF(plane15) },
 967     { plane16,      UPRV_LENGTHOF(plane16) },
 968     { c0,           UPRV_LENGTHOF(c0) }
 969 };
 970
 971 /*
 972  * Verify that the ICU BOCU-1 implementation produces the same results as
 973  * the reference implementation from the design folder.
 974  * Generate some texts and convert them with both converters, verifying
 975  * identical results and roundtripping.
 976  */
 977 static void
 978 TestBOCU1(void) {
 979     UChar *text;
 980     int32_t i, length;
 981
 982     UConverter *bocu1;
 983     UErrorCode errorCode;
 984
 985     errorCode=U_ZERO_ERROR;
 986     bocu1=ucnv_open("BOCU-1", &errorCode);
 987     if(U_FAILURE(errorCode)) {
 988         log_data_err("error: unable to open BOCU-1 converter: %s\n", u_errorName(errorCode));
 989         return;
 990     }
 991
 992     text = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));
 993
 994     /* text 1: each of strings[] once */
 995     length=0;
 996     for(i=0; i<UPRV_LENGTHOF(strings); ++i) {
 997         u_memcpy(text+length, strings[i].s, strings[i].length);
 998         length+=strings[i].length;
 999     }
1000     roundtripBOCU1(bocu1, 1, text, length);
1001
1002     /* text 2: each of strings[] twice */
1003     length=0;
1004     for(i=0; i<UPRV_LENGTHOF(strings); ++i) {
1005         u_memcpy(text+length, strings[i].s, strings[i].length);
1006         length+=strings[i].length;
1007         u_memcpy(text+length, strings[i].s, strings[i].length);
1008         length+=strings[i].length;
1009     }
1010     roundtripBOCU1(bocu1, 2, text, length);
1011
1012     /* text 3: each of strings[] many times (set step vs. |strings| so that all strings are used) */
1013     length=0;
1014     for(i=1; length<5000; i+=7) {
1015         if(i>=UPRV_LENGTHOF(strings)) {
1016             i-=UPRV_LENGTHOF(strings);
1017         }
1018         u_memcpy(text+length, strings[i].s, strings[i].length);
1019         length+=strings[i].length;
1020     }
1021     roundtripBOCU1(bocu1, 3, text, length);
1022
1023     ucnv_close(bocu1);
1024     free(text);
1025 }
1026
1027 U_CFUNC void addBOCU1Tests(TestNode** root);
1028
1029 U_CFUNC void
1030 addBOCU1Tests(TestNode** root) {
1031     addTest(root, TestBOCU1RefDiff, "tsconv/bocu1tst/TestBOCU1RefDiff");
1032     addTest(root, TestBOCU1, "tsconv/bocu1tst/TestBOCU1");
1033 }