icuSources/test/cintltst/bocu1tst.c

   1 /*
   2 ******************************************************************************
   3 *
   4 *   Copyright (C) 2002, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 ******************************************************************************
   8 *   file name:  bocu1tst.c
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2002may27
  14 *   created by: Markus W. Scherer
  15 *
  16 *   This is the reference implementation of BOCU-1,
  17 *   the MIME-friendly form of the Binary Ordered Compression for Unicode,
  18 *   taken directly from ### http://oss.software.ibm.com/cvs/icu/icuhtml/design/conversion/bocu1/
  19 *   The files bocu1.h and bocu1.c from the design folder are taken
  20 *   verbatim (minus copyright and #include) and copied together into this file.
  21 *   The reference code and some of the reference bocu1tst.c
  22 *   is modified to run as part of the ICU cintltst
  23 *   test framework (minus main(), log_ln() etc. instead of printf()).
  24 *
  25 *   This reference implementation is used here to verify
  26 *   the ICU BOCU-1 implementation, which is
  27 *   adapted for ICU conversion APIs and optimized.
  28 *   ### links in design doc to here and to ucnvbocu.c
  29 */
  30
  31 #include "unicode/utypes.h"
  32 #include "unicode/ustring.h"
  33 #include "unicode/ucnv.h"
  34 #include "cmemory.h"
  35 #include "cintltst.h"
  36
  37 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
  38
  39 /* icuhtml/design/conversion/bocu1/bocu1.h ---------------------------------- */
  40
  41 /* BOCU-1 constants and macros ---------------------------------------------- */
  42
  43 /*
  44  * BOCU-1 encodes the code points of a Unicode string as
  45  * a sequence of byte-encoded differences (slope detection),
  46  * preserving lexical order.
  47  *
  48  * Optimize the difference-taking for runs of Unicode text within
  49  * small scripts:
  50  *
  51  * Most small scripts are allocated within aligned 128-blocks of Unicode
  52  * code points. Lexical order is preserved if the "previous code point" state
  53  * is always moved into the middle of such a block.
  54  *
  55  * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
  56  * areas into the middle of those areas.
  57  *
  58  * C0 control codes and space are encoded with their US-ASCII bytes.
  59  * "prev" is reset for C0 controls but not for space.
  60  */
  61
  62 /* initial value for "prev": middle of the ASCII range */
  63 #define BOCU1_ASCII_PREV        0x40
  64
  65 /* bounding byte values for differences */
  66 #define BOCU1_MIN               0x21
  67 #define BOCU1_MIDDLE            0x90
  68 #define BOCU1_MAX_LEAD          0xfe
  69 #define BOCU1_MAX_TRAIL         0xff
  70 #define BOCU1_RESET             0xff
  71
  72 /* number of lead bytes */
  73 #define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)
  74
  75 /* adjust trail byte counts for the use of some C0 control byte values */
  76 #define BOCU1_TRAIL_CONTROLS_COUNT  20
  77 #define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)
  78
  79 /* number of trail bytes */
  80 #define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)
  81
  82 /*
  83  * number of positive and negative single-byte codes
  84  * (counting 0==BOCU1_MIDDLE among the positive ones)
  85  */
  86 #define BOCU1_SINGLE            64
  87
  88 /* number of lead bytes for positive and negative 2/3/4-byte sequences */
  89 #define BOCU1_LEAD_2            43
  90 #define BOCU1_LEAD_3            3
  91 #define BOCU1_LEAD_4            1
  92
  93 /* The difference value range for single-byters. */
  94 #define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
  95 #define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)
  96
  97 /* The difference value range for double-byters. */
  98 #define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
  99 #define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
 100
 101 /* The difference value range for 3-byters. */
 102 #define BOCU1_REACH_POS_3   \
 103     (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
 104
 105 #define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
 106
 107 /* The lead byte start values. */
 108 #define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
 109 #define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
 110 #define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
 111      /* ==BOCU1_MAX_LEAD */
 112
 113 #define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
 114 #define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
 115 #define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
 116      /* ==BOCU1_MIN+1 */
 117
 118 /* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */
 119 #define BOCU1_LENGTH_FROM_LEAD(lead) \
 120     ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
 121      (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
 122      (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)
 123
 124 /* The length of a byte sequence, according to its packed form. */
 125 #define BOCU1_LENGTH_FROM_PACKED(packed) \
 126     ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
 127
 128 /*
 129  * 12 commonly used C0 control codes (and space) are only used to encode
 130  * themselves directly,
 131  * which makes BOCU-1 MIME-usable and reasonably safe for
 132  * ASCII-oriented software.
 133  *
 134  * These controls are
 135  *  0   NUL
 136  *
 137  *  7   BEL
 138  *  8   BS
 139  *
 140  *  9   TAB
 141  *  a   LF
 142  *  b   VT
 143  *  c   FF
 144  *  d   CR
 145  *
 146  *  e   SO
 147  *  f   SI
 148  *
 149  * 1a   SUB
 150  * 1b   ESC
 151  *
 152  * The other 20 C0 controls are also encoded directly (to preserve order)
 153  * but are also used as trail bytes in difference encoding
 154  * (for better compression).
 155  */
 156 #define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
 157
 158 /*
 159  * Byte value map for control codes,
 160  * from external byte values 0x00..0x20
 161  * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
 162  * External byte values that are illegal as trail bytes are mapped to -1.
 163  */
 164 static int8_t
 165 bocu1ByteToTrail[BOCU1_MIN]={
 166 /*  0     1     2     3     4     5     6     7    */
 167     -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
 168
 169 /*  8     9     a     b     c     d     e     f    */
 170     -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
 171
 172 /*  10    11    12    13    14    15    16    17   */
 173     0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
 174
 175 /*  18    19    1a    1b    1c    1d    1e    1f   */
 176     0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,
 177
 178 /*  20   */
 179     -1
 180 };
 181
 182 /*
 183  * Byte value map for control codes,
 184  * from trail byte values 0..19 (0..0x13) as used in the difference calculation
 185  * to external byte values 0x00..0x20.
 186  */
 187 static int8_t
 188 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
 189 /*  0     1     2     3     4     5     6     7    */
 190     0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
 191
 192 /*  8     9     a     b     c     d     e     f    */
 193     0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
 194
 195 /*  10    11    12    13   */
 196     0x1c, 0x1d, 0x1e, 0x1f
 197 };
 198
 199 /**
 200  * Integer division and modulo with negative numerators
 201  * yields negative modulo results and quotients that are one more than
 202  * what we need here.
 203  * This macro adjust the results so that the modulo-value m is always >=0.
 204  *
 205  * For positive n, the if() condition is always FALSE.
 206  *
 207  * @param n Number to be split into quotient and rest.
 208  *          Will be modified to contain the quotient.
 209  * @param d Divisor.
 210  * @param m Output variable for the rest (modulo result).
 211  */
 212 #define NEGDIVMOD(n, d, m) { \
 213     (m)=(n)%(d); \
 214     (n)/=(d); \
 215     if((m)<0) { \
 216         --(n); \
 217         (m)+=(d); \
 218     } \
 219 }
 220
 221 /* State for BOCU-1 decoder function. */
 222 struct Bocu1Rx {
 223     int32_t prev, count, diff;
 224 };
 225
 226 typedef struct Bocu1Rx Bocu1Rx;
 227
 228 /* Function prototypes ------------------------------------------------------ */
 229
 230 /* see bocu1.c */
 231 U_CFUNC int32_t
 232 packDiff(int32_t diff);
 233
 234 U_CFUNC int32_t
 235 encodeBocu1(int32_t *pPrev, int32_t c);
 236
 237 U_CFUNC int32_t
 238 decodeBocu1(Bocu1Rx *pRx, uint8_t b);
 239
 240 /* icuhtml/design/conversion/bocu1/bocu1.c ---------------------------------- */
 241
 242 /* BOCU-1 implementation functions ------------------------------------------ */
 243
 244 /**
 245  * Compute the next "previous" value for differencing
 246  * from the current code point.
 247  *
 248  * @param c current code point, 0..0x10ffff
 249  * @return "previous code point" state value
 250  */
 251 static U_INLINE int32_t
 252 bocu1Prev(int32_t c) {
 253     /* compute new prev */
 254     if(0x3040<=c && c<=0x309f) {
 255         /* Hiragana is not 128-aligned */
 256         return 0x3070;
 257     } else if(0x4e00<=c && c<=0x9fa5) {
 258         /* CJK Unihan */
 259         return 0x4e00-BOCU1_REACH_NEG_2;
 260     } else if(0xac00<=c && c<=0xd7a3) {
 261         /* Korean Hangul */
 262         return (0xd7a3+0xac00)/2;
 263     } else {
 264         /* mostly small scripts */
 265         return (c&~0x7f)+BOCU1_ASCII_PREV;
 266     }
 267 }
 268
 269 /**
 270  * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
 271  * and return a packed integer with them.
 272  *
 273  * The encoding favors small absolut differences with short encodings
 274  * to compress runs of same-script characters.
 275  *
 276  * @param diff difference value -0x10ffff..0x10ffff
 277  * @return
 278  *      0x010000zz for 1-byte sequence zz
 279  *      0x0200yyzz for 2-byte sequence yy zz
 280  *      0x03xxyyzz for 3-byte sequence xx yy zz
 281  *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
 282  */
 283 U_CFUNC int32_t
 284 packDiff(int32_t diff) {
 285     int32_t result, m, lead, count, shift;
 286
 287     if(diff>=BOCU1_REACH_NEG_1) {
 288         /* mostly positive differences, and single-byte negative ones */
 289         if(diff<=BOCU1_REACH_POS_1) {
 290             /* single byte */
 291             return 0x01000000|(BOCU1_MIDDLE+diff);
 292         } else if(diff<=BOCU1_REACH_POS_2) {
 293             /* two bytes */
 294             diff-=BOCU1_REACH_POS_1+1;
 295             lead=BOCU1_START_POS_2;
 296             count=1;
 297         } else if(diff<=BOCU1_REACH_POS_3) {
 298             /* three bytes */
 299             diff-=BOCU1_REACH_POS_2+1;
 300             lead=BOCU1_START_POS_3;
 301             count=2;
 302         } else {
 303             /* four bytes */
 304             diff-=BOCU1_REACH_POS_3+1;
 305             lead=BOCU1_START_POS_4;
 306             count=3;
 307         }
 308     } else {
 309         /* two- and four-byte negative differences */
 310         if(diff>=BOCU1_REACH_NEG_2) {
 311             /* two bytes */
 312             diff-=BOCU1_REACH_NEG_1;
 313             lead=BOCU1_START_NEG_2;
 314             count=1;
 315         } else if(diff>=BOCU1_REACH_NEG_3) {
 316             /* three bytes */
 317             diff-=BOCU1_REACH_NEG_2;
 318             lead=BOCU1_START_NEG_3;
 319             count=2;
 320         } else {
 321             /* four bytes */
 322             diff-=BOCU1_REACH_NEG_3;
 323             lead=BOCU1_START_NEG_4;
 324             count=3;
 325         }
 326     }
 327
 328     /* encode the length of the packed result */
 329     if(count<3) {
 330         result=(count+1)<<24;
 331     } else /* count==3, MSB used for the lead byte */ {
 332         result=0;
 333     }
 334
 335     /* calculate trail bytes like digits in itoa() */
 336     shift=0;
 337     do {
 338         NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
 339         result|=BOCU1_TRAIL_TO_BYTE(m)<<shift;
 340         shift+=8;
 341     } while(--count>0);
 342
 343     /* add lead byte */
 344     result|=(lead+diff)<<shift;
 345
 346     return result;
 347 }
 348
 349 /**
 350  * BOCU-1 encoder function.
 351  *
 352  * @param pPrev pointer to the integer that holds
 353  *        the "previous code point" state;
 354  *        the initial value should be 0 which
 355  *        encodeBocu1 will set to the actual BOCU-1 initial state value
 356  * @param c the code point to encode
 357  * @return the packed 1/2/3/4-byte encoding, see packDiff(),
 358  *         or 0 if an error occurs
 359  *
 360  * @see packDiff
 361  */
 362 U_CFUNC int32_t
 363 encodeBocu1(int32_t *pPrev, int32_t c) {
 364     int32_t prev;
 365
 366     if(pPrev==NULL || c<0 || c>0x10ffff) {
 367         /* illegal argument */
 368         return 0;
 369     }
 370
 371     prev=*pPrev;
 372     if(prev==0) {
 373         /* lenient handling of initial value 0 */
 374         prev=*pPrev=BOCU1_ASCII_PREV;
 375     }
 376
 377     if(c<=0x20) {
 378         /*
 379          * ISO C0 control & space:
 380          * Encode directly for MIME compatibility,
 381          * and reset state except for space, to not disrupt compression.
 382          */
 383         if(c!=0x20) {
 384             *pPrev=BOCU1_ASCII_PREV;
 385         }
 386         return 0x01000000|c;
 387     }
 388
 389     /*
 390      * all other Unicode code points c==U+0021..U+10ffff
 391      * are encoded with the difference c-prev
 392      *
 393      * a new prev is computed from c,
 394      * placed in the middle of a 0x80-block (for most small scripts) or
 395      * in the middle of the Unihan and Hangul blocks
 396      * to statistically minimize the following difference
 397      */
 398     *pPrev=bocu1Prev(c);
 399     return packDiff(c-prev);
 400 }
 401
 402 /**
 403  * Function for BOCU-1 decoder; handles multi-byte lead bytes.
 404  *
 405  * @param pRx pointer to the decoder state structure
 406  * @param b lead byte;
 407  *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<=BOCU1_MAX_LEAD
 408  * @return -1 (state change only)
 409  *
 410  * @see decodeBocu1
 411  */
 412 static int32_t
 413 decodeBocu1LeadByte(Bocu1Rx *pRx, uint8_t b) {
 414     int32_t c, count;
 415
 416     if(b>=BOCU1_START_NEG_2) {
 417         /* positive difference */
 418         if(b<BOCU1_START_POS_3) {
 419             /* two bytes */
 420             c=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
 421             count=1;
 422         } else if(b<BOCU1_START_POS_4) {
 423             /* three bytes */
 424             c=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
 425             count=2;
 426         } else {
 427             /* four bytes */
 428             c=BOCU1_REACH_POS_3+1;
 429             count=3;
 430         }
 431     } else {
 432         /* negative difference */
 433         if(b>=BOCU1_START_NEG_3) {
 434             /* two bytes */
 435             c=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
 436             count=1;
 437         } else if(b>BOCU1_MIN) {
 438             /* three bytes */
 439             c=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
 440             count=2;
 441         } else {
 442             /* four bytes */
 443             c=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
 444             count=3;
 445         }
 446     }
 447
 448     /* set the state for decoding the trail byte(s) */
 449     pRx->diff=c;
 450     pRx->count=count;
 451     return -1;
 452 }
 453
 454 /**
 455  * Function for BOCU-1 decoder; handles multi-byte trail bytes.
 456  *
 457  * @param pRx pointer to the decoder state structure
 458  * @param b trail byte
 459  * @return result value, same as decodeBocu1
 460  *
 461  * @see decodeBocu1
 462  */
 463 static int32_t
 464 decodeBocu1TrailByte(Bocu1Rx *pRx, uint8_t b) {
 465     int32_t t, c, count;
 466
 467     if(b<=0x20) {
 468         /* skip some C0 controls and make the trail byte range contiguous */
 469         t=bocu1ByteToTrail[b];
 470         if(t<0) {
 471             /* illegal trail byte value */
 472             pRx->prev=BOCU1_ASCII_PREV;
 473             pRx->count=0;
 474             return -99;
 475         }
 476 #if BOCU1_MAX_TRAIL<0xff
 477     } else if(b>BOCU1_MAX_TRAIL) {
 478         return -99;
 479 #endif
 480     } else {
 481         t=(int32_t)b-BOCU1_TRAIL_BYTE_OFFSET;
 482     }
 483
 484     /* add trail byte into difference and decrement count */
 485     c=pRx->diff;
 486     count=pRx->count;
 487
 488     if(count==1) {
 489         /* final trail byte, deliver a code point */
 490         c=pRx->prev+c+t;
 491         if(0<=c && c<=0x10ffff) {
 492             /* valid code point result */
 493             pRx->prev=bocu1Prev(c);
 494             pRx->count=0;
 495             return c;
 496         } else {
 497             /* illegal code point result */
 498             pRx->prev=BOCU1_ASCII_PREV;
 499             pRx->count=0;
 500             return -99;
 501         }
 502     }
 503
 504     /* intermediate trail byte */
 505     if(count==2) {
 506         pRx->diff=c+t*BOCU1_TRAIL_COUNT;
 507     } else /* count==3 */ {
 508         pRx->diff=c+t*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT;
 509     }
 510     pRx->count=count-1;
 511     return -1;
 512 }
 513
 514 /**
 515  * BOCU-1 decoder function.
 516  *
 517  * @param pRx pointer to the decoder state structure;
 518  *        the initial values should be 0 which
 519  *        decodeBocu1 will set to actual initial state values
 520  * @param b an input byte
 521  * @return
 522  *      0..0x10ffff for a result code point
 523  *      -1 if only the state changed without code point output
 524  *     <-1 if an error occurs
 525  */
 526 U_CFUNC int32_t
 527 decodeBocu1(Bocu1Rx *pRx, uint8_t b) {
 528     int32_t prev, c, count;
 529
 530     if(pRx==NULL) {
 531         /* illegal argument */
 532         return -99;
 533     }
 534
 535     prev=pRx->prev;
 536     if(prev==0) {
 537         /* lenient handling of initial 0 values */
 538         prev=pRx->prev=BOCU1_ASCII_PREV;
 539         count=pRx->count=0;
 540     } else {
 541         count=pRx->count;
 542     }
 543
 544     if(count==0) {
 545         /* byte in lead position */
 546         if(b<=0x20) {
 547             /*
 548              * Direct-encoded C0 control code or space.
 549              * Reset prev for C0 control codes but not for space.
 550              */
 551             if(b!=0x20) {
 552                 pRx->prev=BOCU1_ASCII_PREV;
 553             }
 554             return b;
 555         }
 556
 557         /*
 558          * b is a difference lead byte.
 559          *
 560          * Return a code point directly from a single-byte difference.
 561          *
 562          * For multi-byte difference lead bytes, set the decoder state
 563          * with the partial difference value from the lead byte and
 564          * with the number of trail bytes.
 565          *
 566          * For four-byte differences, the signedness also affects the
 567          * first trail byte, which has special handling farther below.
 568          */
 569         if(b>=BOCU1_START_NEG_2 && b<BOCU1_START_POS_2) {
 570             /* single-byte difference */
 571             c=prev+((int32_t)b-BOCU1_MIDDLE);
 572             pRx->prev=bocu1Prev(c);
 573             return c;
 574         } else if(b==BOCU1_RESET) {
 575             /* only reset the state, no code point */
 576             pRx->prev=BOCU1_ASCII_PREV;
 577             return -1;
 578         } else {
 579             return decodeBocu1LeadByte(pRx, b);
 580         }
 581     } else {
 582         /* trail byte in any position */
 583         return decodeBocu1TrailByte(pRx, b);
 584     }
 585 }
 586
 587 /* icuhtml/design/conversion/bocu1/bocu1tst.c ------------------------------- */
 588
 589 /* test code ---------------------------------------------------------------- */
 590
 591 /* test code options */
 592
/*
 * Test option: ignore commas when processing name lists in testText().
 * NOTE(review): testText() is not defined in the part of this file reviewed
 * here; presumably it exists only in the stand-alone reference test program.
 * Confirm whether this option is still consumed anywhere.
 */
#define TEST_IGNORE_COMMA       1
 595
 596 /**
 597  * Write a packed BOCU-1 byte sequence into a byte array,
 598  * without overflow check.
 599  * Test function.
 600  *
 601  * @param packed packed BOCU-1 byte sequence, see packDiff()
 602  * @param p pointer to byte array
 603  * @return number of bytes
 604  *
 605  * @see packDiff
 606  */
 607 static int32_t
 608 writePacked(int32_t packed, uint8_t *p) {
 609     int32_t count=BOCU1_LENGTH_FROM_PACKED(packed);
 610     switch(count) {
 611     case 4:
 612         *p++=(uint8_t)(packed>>24);
 613     case 3:
 614         *p++=(uint8_t)(packed>>16);
 615     case 2:
 616         *p++=(uint8_t)(packed>>8);
 617     case 1:
 618         *p++=(uint8_t)packed;
 619     default:
 620         break;
 621     }
 622
 623     return count;
 624 }
 625
 626 /**
 627  * Unpack a packed BOCU-1 non-C0/space byte sequence and get
 628  * the difference to initialPrev.
 629  * Used only for round-trip testing of the difference encoding and decoding.
 630  * Test function.
 631  *
 632  * @param initialPrev bogus "previous code point" value to make sure that
 633  *                    the resulting code point is in the range 0..0x10ffff
 634  * @param packed packed BOCU-1 byte sequence
 635  * @return the difference to initialPrev
 636  *
 637  * @see packDiff
 638  * @see writeDiff
 639  */
 640 static int32_t
 641 unpackDiff(int32_t initialPrev, int32_t packed) {
 642     Bocu1Rx rx={ 0, 0, 0 };
 643     int32_t count;
 644
 645     rx.prev=initialPrev;
 646     count=BOCU1_LENGTH_FROM_PACKED(packed);
 647     switch(count) {
 648     case 4:
 649         decodeBocu1(&rx, (uint8_t)(packed>>24));
 650     case 3:
 651         decodeBocu1(&rx, (uint8_t)(packed>>16));
 652     case 2:
 653         decodeBocu1(&rx, (uint8_t)(packed>>8));
 654     case 1:
 655         /* subtract initial prev */
 656         return decodeBocu1(&rx, (uint8_t)packed)-initialPrev;
 657     default:
 658         return -0x7fffffff;
 659     }
 660 }
 661
 662 /**
 663  * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes,
 664  * preserving lexical order.
 665  * Also checks for roundtripping of the difference encoding.
 666  * Test function.
 667  *
 668  * @param diff difference value to test, -0x10ffff..0x10ffff
 669  * @param p pointer to output byte array
 670  * @return p advanced by number of bytes output
 671  *
 672  * @see unpackDiff
 673  */
 674 static uint8_t *
 675 writeDiff(int32_t diff, uint8_t *p) {
 676     /* generate the difference as a packed value and serialize it */
 677     int32_t packed, initialPrev;
 678
 679     packed=packDiff(diff);
 680
 681     /*
 682      * bogus initial "prev" to work around
 683      * code point range check in decodeBocu1()
 684      */
 685     if(diff<=0) {
 686         initialPrev=0x10ffff;
 687     } else {
 688         initialPrev=-1;
 689     }
 690
 691     if(diff!=unpackDiff(initialPrev, packed)) {
 692         log_err("error: unpackDiff(packDiff(diff=%ld)=0x%08lx)=%ld!=diff\n",
 693                 diff, packed, unpackDiff(initialPrev, packed));
 694     }
 695     return p+writePacked(packed, p);
 696 }
 697
 698 /**
 699  * Encode a UTF-16 string in BOCU-1.
 700  * Does not check for overflows, but otherwise useful function.
 701  *
 702  * @param s input UTF-16 string
 703  * @param length number of UChar code units in s
 704  * @param p pointer to output byte array
 705  * @return number of bytes output
 706  */
 707 static int32_t
 708 writeString(const UChar *s, int32_t length, uint8_t *p) {
 709     uint8_t *p0;
 710     int32_t c, prev, i;
 711
 712     prev=0;
 713     p0=p;
 714     i=0;
 715     while(i<length) {
 716         UTF_NEXT_CHAR(s, i, length, c);
 717         p+=writePacked(encodeBocu1(&prev, c), p);
 718     }
 719     return p-p0;
 720 }
 721
 722 /**
 723  * Decode a BOCU-1 byte sequence to a UTF-16 string.
 724  * Does not check for overflows, but otherwise useful function.
 725  *
 726  * @param p pointer to input BOCU-1 bytes
 727  * @param length number of input bytes
 728  * @param s point to output UTF-16 string array
 729  * @return number of UChar code units output
 730  */
 731 static int32_t
 732 readString(const uint8_t *p, int32_t length, UChar *s) {
 733     Bocu1Rx rx={ 0, 0, 0 };
 734     int32_t c, i, sLength;
 735
 736     i=sLength=0;
 737     while(i<length) {
 738         c=decodeBocu1(&rx, p[i++]);
 739         if(c<-1) {
 740             log_err("error: readString detects encoding error at string index %ld\n", i);
 741             return -1;
 742         }
 743         if(c>=0) {
 744             UTF_APPEND_CHAR_UNSAFE(s, sLength, c);
 745         }
 746     }
 747     return sLength;
 748 }
 749
 750 static U_INLINE char
 751 hexDigit(uint8_t digit) {
 752     return digit<=9 ? (char)('0'+digit) : (char)('a'-10+digit);
 753 }
 754
 755 /**
 756  * Pretty-print 0-terminated byte values.
 757  * Helper function for test output.
 758  *
 759  * @param bytes 0-terminated byte array to print
 760  */
 761 static void
 762 printBytes(uint8_t *bytes, char *out) {
 763     int i;
 764     uint8_t b;
 765
 766     i=0;
 767     while((b=*bytes++)!=0) {
 768         *out++=' ';
 769         *out++=hexDigit((uint8_t)(b>>4));
 770         *out++=hexDigit((uint8_t)(b&0xf));
 771         ++i;
 772     }
 773     i=3*(5-i);
 774     while(i>0) {
 775         *out++=' ';
 776         --i;
 777     }
 778     *out=0;
 779 }
 780
 781 /**
 782  * Basic BOCU-1 test function, called when there are no command line arguments.
 783  * Prints some of the #define values and performs round-trip tests of the
 784  * difference encoding and decoding.
 785  */
 786 static void
 787 TestBOCU1RefDiff(void) {
 788     char buf1[80], buf2[80];
 789     uint8_t prev[5], level[5];
 790     int32_t i, cmp, countErrors;
 791
 792     log_verbose("reach of single bytes: %ld\n", 1+BOCU1_REACH_POS_1-BOCU1_REACH_NEG_1);
 793     log_verbose("reach of 2 bytes     : %ld\n", 1+BOCU1_REACH_POS_2-BOCU1_REACH_NEG_2);
 794     log_verbose("reach of 3 bytes     : %ld\n\n", 1+BOCU1_REACH_POS_3-BOCU1_REACH_NEG_3);
 795
 796     log_verbose("    BOCU1_REACH_NEG_1 %8ld    BOCU1_REACH_POS_1 %8ld\n", BOCU1_REACH_NEG_1, BOCU1_REACH_POS_1);
 797     log_verbose("    BOCU1_REACH_NEG_2 %8ld    BOCU1_REACH_POS_2 %8ld\n", BOCU1_REACH_NEG_2, BOCU1_REACH_POS_2);
 798     log_verbose("    BOCU1_REACH_NEG_3 %8ld    BOCU1_REACH_POS_3 %8ld\n\n", BOCU1_REACH_NEG_3, BOCU1_REACH_POS_3);
 799
 800     log_verbose("    BOCU1_MIDDLE      0x%02x\n", BOCU1_MIDDLE);
 801     log_verbose("    BOCU1_START_NEG_2 0x%02x    BOCU1_START_POS_2 0x%02x\n", BOCU1_START_NEG_2, BOCU1_START_POS_2);
 802     log_verbose("    BOCU1_START_NEG_3 0x%02x    BOCU1_START_POS_3 0x%02x\n\n", BOCU1_START_NEG_3, BOCU1_START_POS_3);
 803
 804     /* test packDiff() & unpackDiff() with some specific values */
 805     writeDiff(0, level);
 806     writeDiff(1, level);
 807     writeDiff(65, level);
 808     writeDiff(130, level);
 809     writeDiff(30000, level);
 810     writeDiff(1000000, level);
 811     writeDiff(-65, level);
 812     writeDiff(-130, level);
 813     writeDiff(-30000, level);
 814     writeDiff(-1000000, level);
 815
 816     /* test that each value is smaller than any following one */
 817     countErrors=0;
 818     i=-0x10ffff;
 819     *writeDiff(i, prev)=0;
 820
 821     /* show first number and bytes */
 822     printBytes(prev, buf1);
 823     log_verbose("              wD(%8ld)                    %s\n", i, buf1);
 824
 825     for(++i; i<=0x10ffff; ++i) {
 826         *writeDiff(i, level)=0;
 827         cmp=strcmp((const char *)prev, (const char *)level);
 828         if(BOCU1_LENGTH_FROM_LEAD(level[0])!=(int32_t)strlen((const char *)level)) {
 829             log_verbose("BOCU1_LENGTH_FROM_LEAD(0x%02x)=%ld!=%ld=strlen(writeDiff(%ld))\n",
 830                    level[0], BOCU1_LENGTH_FROM_LEAD(level[0]), strlen((const char *)level), i);
 831         }
 832         if(cmp<0) {
 833             if(i==0 || i==1 || strlen((const char *)prev)!=strlen((const char *)level)) {
 834                 /*
 835                  * if the result is good, then print only if the length changed
 836                  * to get little but interesting output
 837                  */
 838                 printBytes(prev, buf1);
 839                 printBytes(level, buf2);
 840                 log_verbose("ok:    strcmp(wD(%8ld), wD(%8ld))=%2d  %s%s\n", i-1, i, cmp, buf1, buf2);
 841             }
 842         } else {
 843             ++countErrors;
 844             printBytes(prev, buf1);
 845             printBytes(level, buf2);
 846             log_verbose("wrong: strcmp(wD(%8ld), wD(%8ld))=%2d  %s%s\n", i-1, i, cmp, buf1, buf2);
 847         }
 848         /* remember the previous bytes */
 849         memcpy(prev, level, 4);
 850     }
 851
 852     /* show last number and bytes */
 853     printBytes((uint8_t *)"", buf1);
 854     printBytes(prev, buf2);
 855     log_verbose("                            wD(%8ld)      %s%s\n", i-1, buf1, buf2);
 856
 857     if(countErrors==0) {
 858         log_verbose("writeDiff(-0x10ffff..0x10ffff) works fine\n");
 859     } else {
 860         log_err("writeDiff(-0x10ffff..0x10ffff) violates lexical ordering in %d cases\n", countErrors);
 861     }
 862
 863     /* output signature byte sequence */
 864     i=0;
 865     writePacked(encodeBocu1(&i, 0xfeff), level);
 866     log_verbose("\nBOCU-1 signature byte sequence: %02x %02x %02x\n",
 867             level[0], level[1], level[2]);
 868 }
 869
 870 /* cintltst code ------------------------------------------------------------ */
 871
 872 /* test one string with the ICU and the reference BOCU-1 implementations */
 873 static void
 874 roundtripBOCU1(UConverter *bocu1, int32_t number, const UChar *text, int32_t length) {
 875     static UChar roundtripRef[30000], roundtripICU[30000];
 876     static char bocu1Ref[30000], bocu1ICU[30000];
 877
 878     int32_t bocu1RefLength, bocu1ICULength, roundtripRefLength, roundtripICULength;
 879     UErrorCode errorCode;
 880
 881     /* Unicode -> BOCU-1 */
 882     bocu1RefLength=writeString(text, length, (uint8_t *)bocu1Ref);
 883
 884     errorCode=U_ZERO_ERROR;
 885     bocu1ICULength=ucnv_fromUChars(bocu1, bocu1ICU, sizeof(bocu1ICU), text, length, &errorCode);
 886     if(U_FAILURE(errorCode)) {
 887         log_err("ucnv_fromUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode));
 888         return;
 889     }
 890
 891     if(bocu1RefLength!=bocu1ICULength || 0!=uprv_memcmp(bocu1Ref, bocu1ICU, bocu1RefLength)) {
 892         log_err("Unicode(%d)[%d] -> BOCU-1: reference[%d]!=ICU[%d]\n", number, length, bocu1RefLength, bocu1ICULength);
 893         return;
 894     }
 895
 896     /* BOCU-1 -> Unicode */
 897     roundtripRefLength=readString((uint8_t *)bocu1Ref, bocu1RefLength, roundtripRef);
 898     if(roundtripRefLength<0) {
 899         return; /* readString() found an error and reported it */
 900     }
 901
 902     roundtripICULength=ucnv_toUChars(bocu1, roundtripICU, sizeof(roundtripICU)/U_SIZEOF_UCHAR, bocu1ICU, bocu1ICULength, &errorCode);
 903     if(U_FAILURE(errorCode)) {
 904         log_err("ucnv_toUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode));
 905         return;
 906     }
 907
 908     if(length!=roundtripRefLength || 0!=u_memcmp(text, roundtripRef, length)) {
 909         log_err("BOCU-1 -> Unicode: original(%d)[%d]!=reference[%d]\n", number, length, roundtripRefLength);
 910         return;
 911     }
 912     if(roundtripRefLength!=roundtripICULength || 0!=u_memcmp(roundtripRef, roundtripICU, roundtripRefLength)) {
 913         log_err("BOCU-1 -> Unicode: reference(%d)[%d]!=ICU[%d]\n", number, roundtripRefLength, roundtripICULength);
 914         return;
 915     }
 916 }
 917
 918 static const UChar feff[]={ 0xfeff };
 919 static const UChar ascii[]={ 0x61, 0x62, 0x20, 0x63, 0x61 };
 920 static const UChar crlf[]={ 0xd, 0xa, 0x20 };
 921 static const UChar nul[]={ 0 };
 922 static const UChar latin[]={ 0xdf, 0xe6 };
 923 static const UChar devanagari[]={ 0x930, 0x20, 0x918, 0x909 };
 924 static const UChar hiragana[]={ 0x3086, 0x304d, 0x20, 0x3053, 0x4000 };
 925 static const UChar unihan[]={ 0x4e00, 0x7777, 0x20, 0x9fa5, 0x4e00 };
 926 static const UChar hangul[]={ 0xac00, 0xbcde, 0x20, 0xd7a3 };
 927 static const UChar surrogates[]={ 0xdc00, 0xd800 }; /* single surrogates, unmatched! */
 928 static const UChar plane1[]={ 0xd800, 0xdc00 };
 929 static const UChar plane2[]={ 0xd845, 0xdddd };
 930 static const UChar plane15[]={ 0xdbbb, 0xddee, 0x20 };
 931 static const UChar plane16[]={ 0xdbff, 0xdfff };
 932 static const UChar c0[]={ 1, 0xe40, 0x20, 9 };
 933
 934 static const struct {
 935     const UChar *s;
 936     int32_t length;
 937 } strings[]={
 938     { feff,         LENGTHOF(feff) },
 939     { ascii,        LENGTHOF(ascii) },
 940     { crlf,         LENGTHOF(crlf) },
 941     { nul,          LENGTHOF(nul) },
 942     { latin,        LENGTHOF(latin) },
 943     { devanagari,   LENGTHOF(devanagari) },
 944     { hiragana,     LENGTHOF(hiragana) },
 945     { unihan,       LENGTHOF(unihan) },
 946     { hangul,       LENGTHOF(hangul) },
 947     { surrogates,   LENGTHOF(surrogates) },
 948     { plane1,       LENGTHOF(plane1) },
 949     { plane2,       LENGTHOF(plane2) },
 950     { plane15,      LENGTHOF(plane15) },
 951     { plane16,      LENGTHOF(plane16) },
 952     { c0,           LENGTHOF(c0) }
 953 };
 954
 955 /*
 956  * Verify that the ICU BOCU-1 implementation produces the same results as
 957  * the reference implementation from the design folder.
 958  * Generate some texts and convert them with both converters, verifying
 959  * identical results and roundtripping.
 960  */
 961 static void
 962 TestBOCU1(void) {
 963     UChar text[30000];
 964     int32_t i, length;
 965
 966     UConverter *bocu1;
 967     UErrorCode errorCode;
 968
 969     errorCode=U_ZERO_ERROR;
 970     bocu1=ucnv_open("BOCU-1", &errorCode);
 971     if(U_FAILURE(errorCode)) {
 972         log_err("error: unable to open BOCU-1 converter: %s\n", u_errorName(errorCode));
 973         return;
 974     }
 975
 976     /* text 1: each of strings[] once */
 977     length=0;
 978     for(i=0; i<LENGTHOF(strings); ++i) {
 979         u_memcpy(text+length, strings[i].s, strings[i].length);
 980         length+=strings[i].length;
 981     }
 982     roundtripBOCU1(bocu1, 1, text, length);
 983
 984     /* text 2: each of strings[] twice */
 985     length=0;
 986     for(i=0; i<LENGTHOF(strings); ++i) {
 987         u_memcpy(text+length, strings[i].s, strings[i].length);
 988         length+=strings[i].length;
 989         u_memcpy(text+length, strings[i].s, strings[i].length);
 990         length+=strings[i].length;
 991     }
 992     roundtripBOCU1(bocu1, 2, text, length);
 993
 994     /* text 3: each of strings[] many times (set step vs. |strings| so that all strings are used) */
 995     length=0;
 996     for(i=1; length<5000; i+=7) {
 997         if(i>=LENGTHOF(strings)) {
 998             i-=LENGTHOF(strings);
 999         }
1000         u_memcpy(text+length, strings[i].s, strings[i].length);
1001         length+=strings[i].length;
1002     }
1003     roundtripBOCU1(bocu1, 3, text, length);
1004
1005     ucnv_close(bocu1);
1006 }
1007
1008 U_CFUNC void addBOCU1Tests(TestNode** root);
1009
1010 U_CFUNC void
1011 addBOCU1Tests(TestNode** root) {
1012     addTest(root, TestBOCU1RefDiff, "tsconv/bocu1tst/TestBOCU1RefDiff");
1013     addTest(root, TestBOCU1, "tsconv/bocu1tst/TestBOCU1");
1014 }