1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 ******************************************************************************
6 * Copyright (C) 2002-2015, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 ******************************************************************************
10 * file name: bocu1tst.c
12 * tab size: 8 (not used)
15 * created on: 2002may27
16 * created by: Markus W. Scherer
18 * This is the reference implementation of BOCU-1,
19 * the MIME-friendly form of the Binary Ordered Compression for Unicode,
20 * taken directly from ### http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/bocu1/
21 * The files bocu1.h and bocu1.c from the design folder are taken
22 * verbatim (minus copyright and #include) and copied together into this file.
23 * The reference code and some of the reference bocu1tst.c
24 * is modified to run as part of the ICU cintltst
25 * test framework (minus main(), log_ln() etc. instead of printf()).
27 * This reference implementation is used here to verify
28 * the ICU BOCU-1 implementation, which is
29 * adapted for ICU conversion APIs and optimized.
30 * ### links in design doc to here and to ucnvbocu.c
33 #include "unicode/utypes.h"
34 #include "unicode/ustring.h"
35 #include "unicode/ucnv.h"
36 #include "unicode/utf16.h"
40 /* icuhtml/design/conversion/bocu1/bocu1.h ---------------------------------- */
42 /* BOCU-1 constants and macros ---------------------------------------------- */
45 * BOCU-1 encodes the code points of a Unicode string as
46 * a sequence of byte-encoded differences (slope detection),
47 * preserving lexical order.
49 * Optimize the difference-taking for runs of Unicode text within small scripts:
52 * Most small scripts are allocated within aligned 128-blocks of Unicode
53 * code points. Lexical order is preserved if the "previous code point" state
54 * is always moved into the middle of such a block.
56 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
57 * areas into the middle of those areas.
59 * C0 control codes and space are encoded with their US-ASCII bytes.
60 * "prev" is reset for C0 controls but not for space.
/* initial value for "prev": middle of the ASCII range */
#define BOCU1_ASCII_PREV        0x40

/* bounding byte values for differences */
#define BOCU1_MIN               0x21
#define BOCU1_MIDDLE            0x90
#define BOCU1_MAX_LEAD          0xfe

/* add the L suffix to make computations with BOCU1_MAX_TRAIL work on 16-bit compilers */
#define BOCU1_MAX_TRAIL         0xffL
#define BOCU1_RESET             0xff

/* number of lead bytes */
#define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/* adjust trail byte counts for the use of some C0 control byte values */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* number of trail bytes */
#define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * number of positive and negative single-byte codes
 * (counting 0==BOCU1_MIDDLE among the positive ones)
 */
#define BOCU1_SINGLE            64

/* number of lead bytes for positive and negative 2/3/4-byte sequences */
#define BOCU1_LEAD_2            43
#define BOCU1_LEAD_3            3
#define BOCU1_LEAD_4            1

/* The difference value range for single-byters. */
#define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)

/* The difference value range for double-byters. */
#define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* The difference value range for 3-byters. */
#define BOCU1_REACH_POS_3   \
    (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

#define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/* The lead byte start values. */
#define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
     /* ==BOCU1_MAX_LEAD */

#define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)

/*
 * The length of a byte sequence, according to the lead byte (!=BOCU1_RESET).
 * The nested ranges are tested from the innermost (single bytes around
 * BOCU1_MIDDLE) outwards, so each test only needs the outer bounds.
 */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

/*
 * The length of a byte sequence, according to its packed form.
 * Packed values below 0x04000000 carry the length (1..3) in the top byte;
 * anything else is a 4-byte sequence whose top byte is the lead byte.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
/*
 * 12 commonly used C0 control codes (and space) are only used to encode
 * themselves directly,
 * which makes BOCU-1 MIME-usable and reasonably safe for
 * ASCII-oriented software.
 *
 * The other 20 C0 controls are also encoded directly (to preserve order)
 * but are also used as trail bytes in difference encoding
 * (for better compression).
 *
 * Maps a trail value t (0..BOCU1_TRAIL_COUNT-1) to its external byte:
 * the first BOCU1_TRAIL_CONTROLS_COUNT values go through the
 * bocu1TrailToByte[] table, all others are shifted by a fixed offset.
 */
#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
162 * Byte value map for control codes,
163 * from external byte values 0x00..0x20
164 * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
165 * External byte values that are illegal as trail bytes are mapped to -1.
168 bocu1ByteToTrail
[BOCU1_MIN
]={
169 /* 0 1 2 3 4 5 6 7 */
170 -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
172 /* 8 9 a b c d e f */
173 -1, -1, -1, -1, -1, -1, -1, -1,
175 /* 10 11 12 13 14 15 16 17 */
176 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
178 /* 18 19 1a 1b 1c 1d 1e 1f */
179 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13,
186 * Byte value map for control codes,
187 * from trail byte values 0..19 (0..0x13) as used in the difference calculation
188 * to external byte values 0x00..0x20.
191 bocu1TrailToByte
[BOCU1_TRAIL_CONTROLS_COUNT
]={
192 /* 0 1 2 3 4 5 6 7 */
193 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
195 /* 8 9 a b c d e f */
196 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
199 0x1c, 0x1d, 0x1e, 0x1f
/*
 * Integer division and modulo with negative numerators
 * yield negative modulo results and quotients that are one more than
 * what is needed here.
 * This macro adjusts the results so that the modulo-value m is always >=0.
 *
 * For positive n, the if() condition is always FALSE.
 *
 * @param n Number to be split into quotient and rest.
 *          Will be modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the rest (modulo result).
 *
 * Wrapped in do { } while(0) so that an invocation followed by a semicolon
 * is a single statement in every context.
 */
#define NEGDIVMOD(n, d, m) do { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        --(n); \
        (m)+=(d); \
    } \
} while(0)
/*
 * State for BOCU-1 decoder function.
 *
 * prev:  the "previous code point" state that differences are applied to
 * count: number of trail bytes still expected (0 when at a lead byte)
 * diff:  partial difference accumulated from the bytes read so far
 */
struct Bocu1Rx {
    int32_t prev, count, diff;
};

typedef struct Bocu1Rx Bocu1Rx;
231 /* Function prototypes ------------------------------------------------------ */
235 packDiff(int32_t diff
);
238 encodeBocu1(int32_t *pPrev
, int32_t c
);
241 decodeBocu1(Bocu1Rx
*pRx
, uint8_t b
);
243 /* icuhtml/design/conversion/bocu1/bocu1.c ---------------------------------- */
245 /* BOCU-1 implementation functions ------------------------------------------ */
248 * Compute the next "previous" value for differencing
249 * from the current code point.
251 * @param c current code point, 0..0x10ffff
252 * @return "previous code point" state value
255 bocu1Prev(int32_t c
) {
256 /* compute new prev */
257 if(0x3040<=c
&& c
<=0x309f) {
258 /* Hiragana is not 128-aligned */
260 } else if(0x4e00<=c
&& c
<=0x9fa5) {
262 return 0x4e00-BOCU1_REACH_NEG_2
;
263 } else if(0xac00<=c
&& c
<=0xd7a3) {
264 /* Korean Hangul (cast to int32_t to avoid wraparound on 16-bit compilers) */
265 return ((int32_t)0xd7a3+(int32_t)0xac00)/2;
267 /* mostly small scripts */
268 return (c
&~0x7f)+BOCU1_ASCII_PREV
;
273 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
274 * and return a packed integer with them.
276 * The encoding favors small absolut differences with short encodings
277 * to compress runs of same-script characters.
279 * @param diff difference value -0x10ffff..0x10ffff
281 * 0x010000zz for 1-byte sequence zz
282 * 0x0200yyzz for 2-byte sequence yy zz
283 * 0x03xxyyzz for 3-byte sequence xx yy zz
284 * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
287 packDiff(int32_t diff
) {
288 int32_t result
, m
, lead
, count
, shift
;
290 if(diff
>=BOCU1_REACH_NEG_1
) {
291 /* mostly positive differences, and single-byte negative ones */
292 if(diff
<=BOCU1_REACH_POS_1
) {
294 return 0x01000000|(BOCU1_MIDDLE
+diff
);
295 } else if(diff
<=BOCU1_REACH_POS_2
) {
297 diff
-=BOCU1_REACH_POS_1
+1;
298 lead
=BOCU1_START_POS_2
;
300 } else if(diff
<=BOCU1_REACH_POS_3
) {
302 diff
-=BOCU1_REACH_POS_2
+1;
303 lead
=BOCU1_START_POS_3
;
307 diff
-=BOCU1_REACH_POS_3
+1;
308 lead
=BOCU1_START_POS_4
;
312 /* two- and four-byte negative differences */
313 if(diff
>=BOCU1_REACH_NEG_2
) {
315 diff
-=BOCU1_REACH_NEG_1
;
316 lead
=BOCU1_START_NEG_2
;
318 } else if(diff
>=BOCU1_REACH_NEG_3
) {
320 diff
-=BOCU1_REACH_NEG_2
;
321 lead
=BOCU1_START_NEG_3
;
325 diff
-=BOCU1_REACH_NEG_3
;
326 lead
=BOCU1_START_NEG_4
;
331 /* encode the length of the packed result */
333 result
=(count
+1)<<24;
334 } else /* count==3, MSB used for the lead byte */ {
338 /* calculate trail bytes like digits in itoa() */
341 NEGDIVMOD(diff
, BOCU1_TRAIL_COUNT
, m
);
342 result
|=BOCU1_TRAIL_TO_BYTE(m
)<<shift
;
347 result
|= (uint32_t)(lead
+diff
)<<shift
;
353 * BOCU-1 encoder function.
355 * @param pPrev pointer to the integer that holds
356 * the "previous code point" state;
357 * the initial value should be 0 which
358 * encodeBocu1 will set to the actual BOCU-1 initial state value
359 * @param c the code point to encode
360 * @return the packed 1/2/3/4-byte encoding, see packDiff(),
361 * or 0 if an error occurs
366 encodeBocu1(int32_t *pPrev
, int32_t c
) {
369 if(pPrev
==NULL
|| c
<0 || c
>0x10ffff) {
370 /* illegal argument */
376 /* lenient handling of initial value 0 */
377 prev
=*pPrev
=BOCU1_ASCII_PREV
;
382 * ISO C0 control & space:
383 * Encode directly for MIME compatibility,
384 * and reset state except for space, to not disrupt compression.
387 *pPrev
=BOCU1_ASCII_PREV
;
393 * all other Unicode code points c==U+0021..U+10ffff
394 * are encoded with the difference c-prev
396 * a new prev is computed from c,
397 * placed in the middle of a 0x80-block (for most small scripts) or
398 * in the middle of the Unihan and Hangul blocks
399 * to statistically minimize the following difference
402 return packDiff(c
-prev
);
406 * Function for BOCU-1 decoder; handles multi-byte lead bytes.
408 * @param pRx pointer to the decoder state structure
409 * @param b lead byte;
410 * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<=BOCU1_MAX_LEAD
411 * @return -1 (state change only)
416 decodeBocu1LeadByte(Bocu1Rx
*pRx
, uint8_t b
) {
419 if(b
>=BOCU1_START_NEG_2
) {
420 /* positive difference */
421 if(b
<BOCU1_START_POS_3
) {
423 c
=((int32_t)b
-BOCU1_START_POS_2
)*BOCU1_TRAIL_COUNT
+BOCU1_REACH_POS_1
+1;
425 } else if(b
<BOCU1_START_POS_4
) {
427 c
=((int32_t)b
-BOCU1_START_POS_3
)*BOCU1_TRAIL_COUNT
*BOCU1_TRAIL_COUNT
+BOCU1_REACH_POS_2
+1;
431 c
=BOCU1_REACH_POS_3
+1;
435 /* negative difference */
436 if(b
>=BOCU1_START_NEG_3
) {
438 c
=((int32_t)b
-BOCU1_START_NEG_2
)*BOCU1_TRAIL_COUNT
+BOCU1_REACH_NEG_1
;
440 } else if(b
>BOCU1_MIN
) {
442 c
=((int32_t)b
-BOCU1_START_NEG_3
)*BOCU1_TRAIL_COUNT
*BOCU1_TRAIL_COUNT
+BOCU1_REACH_NEG_2
;
446 c
=-BOCU1_TRAIL_COUNT
*BOCU1_TRAIL_COUNT
*BOCU1_TRAIL_COUNT
+BOCU1_REACH_NEG_3
;
451 /* set the state for decoding the trail byte(s) */
458 * Function for BOCU-1 decoder; handles multi-byte trail bytes.
460 * @param pRx pointer to the decoder state structure
461 * @param b trail byte
462 * @return result value, same as decodeBocu1
467 decodeBocu1TrailByte(Bocu1Rx
*pRx
, uint8_t b
) {
471 /* skip some C0 controls and make the trail byte range contiguous */
472 t
=bocu1ByteToTrail
[b
];
474 /* illegal trail byte value */
475 pRx
->prev
=BOCU1_ASCII_PREV
;
479 #if BOCU1_MAX_TRAIL<0xff
480 } else if(b
>BOCU1_MAX_TRAIL
) {
484 t
=(int32_t)b
-BOCU1_TRAIL_BYTE_OFFSET
;
487 /* add trail byte into difference and decrement count */
492 /* final trail byte, deliver a code point */
494 if(0<=c
&& c
<=0x10ffff) {
495 /* valid code point result */
496 pRx
->prev
=bocu1Prev(c
);
500 /* illegal code point result */
501 pRx
->prev
=BOCU1_ASCII_PREV
;
507 /* intermediate trail byte */
509 pRx
->diff
=c
+t
*BOCU1_TRAIL_COUNT
;
510 } else /* count==3 */ {
511 pRx
->diff
=c
+t
*BOCU1_TRAIL_COUNT
*BOCU1_TRAIL_COUNT
;
518 * BOCU-1 decoder function.
520 * @param pRx pointer to the decoder state structure;
521 * the initial values should be 0 which
522 * decodeBocu1 will set to actual initial state values
523 * @param b an input byte
525 * 0..0x10ffff for a result code point
526 * -1 if only the state changed without code point output
527 * <-1 if an error occurs
530 decodeBocu1(Bocu1Rx
*pRx
, uint8_t b
) {
531 int32_t prev
, c
, count
;
534 /* illegal argument */
540 /* lenient handling of initial 0 values */
541 prev
=pRx
->prev
=BOCU1_ASCII_PREV
;
548 /* byte in lead position */
551 * Direct-encoded C0 control code or space.
552 * Reset prev for C0 control codes but not for space.
555 pRx
->prev
=BOCU1_ASCII_PREV
;
561 * b is a difference lead byte.
563 * Return a code point directly from a single-byte difference.
565 * For multi-byte difference lead bytes, set the decoder state
566 * with the partial difference value from the lead byte and
567 * with the number of trail bytes.
569 * For four-byte differences, the signedness also affects the
570 * first trail byte, which has special handling farther below.
572 if(b
>=BOCU1_START_NEG_2
&& b
<BOCU1_START_POS_2
) {
573 /* single-byte difference */
574 c
=prev
+((int32_t)b
-BOCU1_MIDDLE
);
575 pRx
->prev
=bocu1Prev(c
);
577 } else if(b
==BOCU1_RESET
) {
578 /* only reset the state, no code point */
579 pRx
->prev
=BOCU1_ASCII_PREV
;
582 return decodeBocu1LeadByte(pRx
, b
);
585 /* trail byte in any position */
586 return decodeBocu1TrailByte(pRx
, b
);
590 /* icuhtml/design/conversion/bocu1/bocu1tst.c ------------------------------- */
592 /* test code ---------------------------------------------------------------- */
/* test code options */

/* ignore comma when processing name lists in testText() */
#define TEST_IGNORE_COMMA       1
/**
 * Write a packed BOCU-1 byte sequence into a byte array,
 * without overflow check.
 * Test function.
 *
 * @param packed packed BOCU-1 byte sequence, see packDiff()
 * @param p pointer to byte array
 * @return number of bytes
 *
 * @see packDiff
 */
static int32_t
writePacked(int32_t packed, uint8_t *p) {
    int32_t count=BOCU1_LENGTH_FROM_PACKED(packed);

    /* each case emits one byte and deliberately continues with the next lower one */
    switch(count) {
    case 4:
        *p++=(uint8_t)(packed>>24);
        /* fallthrough */
    case 3:
        *p++=(uint8_t)(packed>>16);
        /* fallthrough */
    case 2:
        *p++=(uint8_t)(packed>>8);
        /* fallthrough */
    case 1:
        *p++=(uint8_t)packed;
        /* fallthrough */
    default:
        break;
    }

    return count;
}
630 * Unpack a packed BOCU-1 non-C0/space byte sequence and get
631 * the difference to initialPrev.
632 * Used only for round-trip testing of the difference encoding and decoding.
635 * @param initialPrev bogus "previous code point" value to make sure that
636 * the resulting code point is in the range 0..0x10ffff
637 * @param packed packed BOCU-1 byte sequence
638 * @return the difference to initialPrev
644 unpackDiff(int32_t initialPrev
, int32_t packed
) {
645 Bocu1Rx rx
={ 0, 0, 0 };
649 count
=BOCU1_LENGTH_FROM_PACKED(packed
);
652 decodeBocu1(&rx
, (uint8_t)(packed
>>24));
654 decodeBocu1(&rx
, (uint8_t)(packed
>>16));
656 decodeBocu1(&rx
, (uint8_t)(packed
>>8));
658 /* subtract initial prev */
659 return decodeBocu1(&rx
, (uint8_t)packed
)-initialPrev
;
/**
 * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes,
 * preserving lexical order.
 * Also checks for roundtripping of the difference encoding.
 * Test function.
 *
 * @param diff difference value to test, -0x10ffff..0x10ffff
 * @param p pointer to output byte array
 * @return p advanced by number of bytes output
 *
 * @see unpackDiff
 */
static uint8_t *
writeDiff(int32_t diff, uint8_t *p) {
    /* generate the difference as a packed value and serialize it */
    int32_t packed, initialPrev, decoded;

    packed=packDiff(diff);

    /*
     * bogus initial "prev" to work around
     * code point range check in decodeBocu1()
     */
    if(diff<=0) {
        initialPrev=0x10ffff;
    } else {
        initialPrev=-1;
    }

    decoded=unpackDiff(initialPrev, packed);
    if(diff!=decoded) {
        /* the casts make the arguments match the %ld/%lx conversions */
        log_err("error: unpackDiff(packDiff(diff=%ld)=0x%08lx)=%ld!=diff\n",
                (long)diff, (unsigned long)(uint32_t)packed, (long)decoded);
    }
    return p+writePacked(packed, p);
}
702 * Encode a UTF-16 string in BOCU-1.
703 * Does not check for overflows, but otherwise useful function.
705 * @param s input UTF-16 string
706 * @param length number of UChar code units in s
707 * @param p pointer to output byte array
708 * @return number of bytes output
711 writeString(const UChar
*s
, int32_t length
, uint8_t *p
) {
719 U16_NEXT(s
, i
, length
, c
);
720 p
+=writePacked(encodeBocu1(&prev
, c
), p
);
722 return (int32_t)(p
-p0
);
726 * Decode a BOCU-1 byte sequence to a UTF-16 string.
727 * Does not check for overflows, but otherwise useful function.
729 * @param p pointer to input BOCU-1 bytes
730 * @param length number of input bytes
731 * @param s point to output UTF-16 string array
732 * @return number of UChar code units output
735 readString(const uint8_t *p
, int32_t length
, UChar
*s
) {
736 Bocu1Rx rx
={ 0, 0, 0 };
737 int32_t c
, i
, sLength
;
741 c
=decodeBocu1(&rx
, p
[i
++]);
743 log_err("error: readString detects encoding error at string index %ld\n", i
);
747 U16_APPEND_UNSAFE(s
, sLength
, c
);
/* Map a value 0..15 to its lowercase hexadecimal digit character. */
static char
hexDigit(uint8_t digit) {
    return digit<=9 ? (char)('0'+digit) : (char)('a'-10+digit);
}

/**
 * Pretty-print 0-terminated byte values.
 * Helper function for test output.
 *
 * Each byte is written as a space followed by two hex digits; the output is
 * then padded with spaces to the width of 5 bytes and NUL-terminated.
 *
 * @param bytes 0-terminated byte array to print
 * @param out output buffer; the caller must provide enough room
 *            (16 chars for up to 5 input bytes)
 */
static void
printBytes(uint8_t *bytes, char *out) {
    int i;
    uint8_t b;

    i=0;
    while((b=*bytes++)!=0) {
        *out++=' ';
        *out++=hexDigit((uint8_t)(b>>4));
        *out++=hexDigit((uint8_t)(b&0xf));
        ++i;
    }

    /* pad to the column width of 5 bytes (3 chars each) */
    i=3*(5-i);
    while(i>0) {
        *out++=' ';
        --i;
    }
    *out=0;
}
785 * Basic BOCU-1 test function, called when there are no command line arguments.
786 * Prints some of the #define values and performs round-trip tests of the
787 * difference encoding and decoding.
790 TestBOCU1RefDiff(void) {
791 char buf1
[80], buf2
[80];
792 uint8_t prev
[5], level
[5];
793 int32_t i
, cmp
, countErrors
;
795 log_verbose("reach of single bytes: %ld\n", 1+BOCU1_REACH_POS_1
-BOCU1_REACH_NEG_1
);
796 log_verbose("reach of 2 bytes : %ld\n", 1+BOCU1_REACH_POS_2
-BOCU1_REACH_NEG_2
);
797 log_verbose("reach of 3 bytes : %ld\n\n", 1+BOCU1_REACH_POS_3
-BOCU1_REACH_NEG_3
);
799 log_verbose(" BOCU1_REACH_NEG_1 %8ld BOCU1_REACH_POS_1 %8ld\n", BOCU1_REACH_NEG_1
, BOCU1_REACH_POS_1
);
800 log_verbose(" BOCU1_REACH_NEG_2 %8ld BOCU1_REACH_POS_2 %8ld\n", BOCU1_REACH_NEG_2
, BOCU1_REACH_POS_2
);
801 log_verbose(" BOCU1_REACH_NEG_3 %8ld BOCU1_REACH_POS_3 %8ld\n\n", BOCU1_REACH_NEG_3
, BOCU1_REACH_POS_3
);
803 log_verbose(" BOCU1_MIDDLE 0x%02x\n", BOCU1_MIDDLE
);
804 log_verbose(" BOCU1_START_NEG_2 0x%02x BOCU1_START_POS_2 0x%02x\n", BOCU1_START_NEG_2
, BOCU1_START_POS_2
);
805 log_verbose(" BOCU1_START_NEG_3 0x%02x BOCU1_START_POS_3 0x%02x\n\n", BOCU1_START_NEG_3
, BOCU1_START_POS_3
);
807 /* test packDiff() & unpackDiff() with some specific values */
810 writeDiff(65, level
);
811 writeDiff(130, level
);
812 writeDiff(30000, level
);
813 writeDiff(1000000, level
);
814 writeDiff(-65, level
);
815 writeDiff(-130, level
);
816 writeDiff(-30000, level
);
817 writeDiff(-1000000, level
);
819 /* test that each value is smaller than any following one */
822 *writeDiff(i
, prev
)=0;
824 /* show first number and bytes */
825 printBytes(prev
, buf1
);
826 log_verbose(" wD(%8ld) %s\n", i
, buf1
);
828 for(++i
; i
<=0x10ffff; ++i
) {
829 *writeDiff(i
, level
)=0;
830 cmp
=strcmp((const char *)prev
, (const char *)level
);
831 if(BOCU1_LENGTH_FROM_LEAD(level
[0])!=(int32_t)strlen((const char *)level
)) {
832 log_verbose("BOCU1_LENGTH_FROM_LEAD(0x%02x)=%ld!=%ld=strlen(writeDiff(%ld))\n",
833 level
[0], BOCU1_LENGTH_FROM_LEAD(level
[0]), strlen((const char *)level
), i
);
836 if(i
==0 || i
==1 || strlen((const char *)prev
)!=strlen((const char *)level
)) {
838 * if the result is good, then print only if the length changed
839 * to get little but interesting output
841 printBytes(prev
, buf1
);
842 printBytes(level
, buf2
);
843 log_verbose("ok: strcmp(wD(%8ld), wD(%8ld))=%2d %s%s\n", i
-1, i
, cmp
, buf1
, buf2
);
847 printBytes(prev
, buf1
);
848 printBytes(level
, buf2
);
849 log_verbose("wrong: strcmp(wD(%8ld), wD(%8ld))=%2d %s%s\n", i
-1, i
, cmp
, buf1
, buf2
);
851 /* remember the previous bytes */
852 memcpy(prev
, level
, 4);
855 /* show last number and bytes */
856 printBytes((uint8_t *)"", buf1
);
857 printBytes(prev
, buf2
);
858 log_verbose(" wD(%8ld) %s%s\n", i
-1, buf1
, buf2
);
861 log_verbose("writeDiff(-0x10ffff..0x10ffff) works fine\n");
863 log_err("writeDiff(-0x10ffff..0x10ffff) violates lexical ordering in %d cases\n", countErrors
);
866 /* output signature byte sequence */
868 writePacked(encodeBocu1(&i
, 0xfeff), level
);
869 log_verbose("\nBOCU-1 signature byte sequence: %02x %02x %02x\n",
870 level
[0], level
[1], level
[2]);
873 /* cintltst code ------------------------------------------------------------ */
/* size, in elements, of every scratch buffer allocated by the tests below */
static const int32_t DEFAULT_BUFFER_SIZE = 30000;
878 /* test one string with the ICU and the reference BOCU-1 implementations */
880 roundtripBOCU1(UConverter
*bocu1
, int32_t number
, const UChar
*text
, int32_t length
) {
881 UChar
*roundtripRef
, *roundtripICU
;
882 char *bocu1Ref
, *bocu1ICU
;
884 int32_t bocu1RefLength
, bocu1ICULength
, roundtripRefLength
, roundtripICULength
;
885 UErrorCode errorCode
;
887 roundtripRef
= malloc(DEFAULT_BUFFER_SIZE
* sizeof(UChar
));
888 roundtripICU
= malloc(DEFAULT_BUFFER_SIZE
* sizeof(UChar
));
889 bocu1Ref
= malloc(DEFAULT_BUFFER_SIZE
);
890 bocu1ICU
= malloc(DEFAULT_BUFFER_SIZE
);
892 /* Unicode -> BOCU-1 */
893 bocu1RefLength
=writeString(text
, length
, (uint8_t *)bocu1Ref
);
895 errorCode
=U_ZERO_ERROR
;
896 bocu1ICULength
=ucnv_fromUChars(bocu1
, bocu1ICU
, DEFAULT_BUFFER_SIZE
, text
, length
, &errorCode
);
897 if(U_FAILURE(errorCode
)) {
898 log_err("ucnv_fromUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number
, length
, u_errorName(errorCode
));
902 if(bocu1RefLength
!=bocu1ICULength
|| 0!=uprv_memcmp(bocu1Ref
, bocu1ICU
, bocu1RefLength
)) {
903 log_err("Unicode(%d)[%d] -> BOCU-1: reference[%d]!=ICU[%d]\n", number
, length
, bocu1RefLength
, bocu1ICULength
);
907 /* BOCU-1 -> Unicode */
908 roundtripRefLength
=readString((uint8_t *)bocu1Ref
, bocu1RefLength
, roundtripRef
);
909 if(roundtripRefLength
<0) {
910 goto cleanup
; /* readString() found an error and reported it */
913 roundtripICULength
=ucnv_toUChars(bocu1
, roundtripICU
, DEFAULT_BUFFER_SIZE
, bocu1ICU
, bocu1ICULength
, &errorCode
);
914 if(U_FAILURE(errorCode
)) {
915 log_err("ucnv_toUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number
, length
, u_errorName(errorCode
));
919 if(length
!=roundtripRefLength
|| 0!=u_memcmp(text
, roundtripRef
, length
)) {
920 log_err("BOCU-1 -> Unicode: original(%d)[%d]!=reference[%d]\n", number
, length
, roundtripRefLength
);
923 if(roundtripRefLength
!=roundtripICULength
|| 0!=u_memcmp(roundtripRef
, roundtripICU
, roundtripRefLength
)) {
924 log_err("BOCU-1 -> Unicode: reference(%d)[%d]!=ICU[%d]\n", number
, roundtripRefLength
, roundtripICULength
);
934 static const UChar feff
[]={ 0xfeff };
935 static const UChar ascii
[]={ 0x61, 0x62, 0x20, 0x63, 0x61 };
936 static const UChar crlf
[]={ 0xd, 0xa, 0x20 };
937 static const UChar nul
[]={ 0 };
938 static const UChar latin
[]={ 0xdf, 0xe6 };
939 static const UChar devanagari
[]={ 0x930, 0x20, 0x918, 0x909 };
940 static const UChar hiragana
[]={ 0x3086, 0x304d, 0x20, 0x3053, 0x4000 };
941 static const UChar unihan
[]={ 0x4e00, 0x7777, 0x20, 0x9fa5, 0x4e00 };
942 static const UChar hangul
[]={ 0xac00, 0xbcde, 0x20, 0xd7a3 };
943 static const UChar surrogates
[]={ 0xdc00, 0xd800 }; /* single surrogates, unmatched! */
944 static const UChar plane1
[]={ 0xd800, 0xdc00 };
945 static const UChar plane2
[]={ 0xd845, 0xdddd };
946 static const UChar plane15
[]={ 0xdbbb, 0xddee, 0x20 };
947 static const UChar plane16
[]={ 0xdbff, 0xdfff };
948 static const UChar c0
[]={ 1, 0xe40, 0x20, 9 };
950 static const struct {
954 { feff
, UPRV_LENGTHOF(feff
) },
955 { ascii
, UPRV_LENGTHOF(ascii
) },
956 { crlf
, UPRV_LENGTHOF(crlf
) },
957 { nul
, UPRV_LENGTHOF(nul
) },
958 { latin
, UPRV_LENGTHOF(latin
) },
959 { devanagari
, UPRV_LENGTHOF(devanagari
) },
960 { hiragana
, UPRV_LENGTHOF(hiragana
) },
961 { unihan
, UPRV_LENGTHOF(unihan
) },
962 { hangul
, UPRV_LENGTHOF(hangul
) },
963 { surrogates
, UPRV_LENGTHOF(surrogates
) },
964 { plane1
, UPRV_LENGTHOF(plane1
) },
965 { plane2
, UPRV_LENGTHOF(plane2
) },
966 { plane15
, UPRV_LENGTHOF(plane15
) },
967 { plane16
, UPRV_LENGTHOF(plane16
) },
968 { c0
, UPRV_LENGTHOF(c0
) }
972 * Verify that the ICU BOCU-1 implementation produces the same results as
973 * the reference implementation from the design folder.
974 * Generate some texts and convert them with both converters, verifying
975 * identical results and roundtripping.
983 UErrorCode errorCode
;
985 errorCode
=U_ZERO_ERROR
;
986 bocu1
=ucnv_open("BOCU-1", &errorCode
);
987 if(U_FAILURE(errorCode
)) {
988 log_data_err("error: unable to open BOCU-1 converter: %s\n", u_errorName(errorCode
));
992 text
= malloc(DEFAULT_BUFFER_SIZE
* sizeof(UChar
));
994 /* text 1: each of strings[] once */
996 for(i
=0; i
<UPRV_LENGTHOF(strings
); ++i
) {
997 u_memcpy(text
+length
, strings
[i
].s
, strings
[i
].length
);
998 length
+=strings
[i
].length
;
1000 roundtripBOCU1(bocu1
, 1, text
, length
);
1002 /* text 2: each of strings[] twice */
1004 for(i
=0; i
<UPRV_LENGTHOF(strings
); ++i
) {
1005 u_memcpy(text
+length
, strings
[i
].s
, strings
[i
].length
);
1006 length
+=strings
[i
].length
;
1007 u_memcpy(text
+length
, strings
[i
].s
, strings
[i
].length
);
1008 length
+=strings
[i
].length
;
1010 roundtripBOCU1(bocu1
, 2, text
, length
);
1012 /* text 3: each of strings[] many times (set step vs. |strings| so that all strings are used) */
1014 for(i
=1; length
<5000; i
+=7) {
1015 if(i
>=UPRV_LENGTHOF(strings
)) {
1016 i
-=UPRV_LENGTHOF(strings
);
1018 u_memcpy(text
+length
, strings
[i
].s
, strings
[i
].length
);
1019 length
+=strings
[i
].length
;
1021 roundtripBOCU1(bocu1
, 3, text
, length
);
1027 U_CFUNC
void addBOCU1Tests(TestNode
** root
);
1030 addBOCU1Tests(TestNode
** root
) {
1031 addTest(root
, TestBOCU1RefDiff
, "tsconv/bocu1tst/TestBOCU1RefDiff");
1032 addTest(root
, TestBOCU1
, "tsconv/bocu1tst/TestBOCU1");