git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/test/cintltst/bocu1tst.c

... / ...

Commit	Line	Data
	1	/*
	2	******************************************************************************
	3	*
	4	* Copyright (C) 2002-2003, International Business Machines
	5	* Corporation and others. All Rights Reserved.
	6	*
	7	******************************************************************************
	8	* file name: bocu1tst.c
	9	* encoding: US-ASCII
	10	* tab size: 8 (not used)
	11	* indentation:4
	12	*
	13	* created on: 2002may27
	14	* created by: Markus W. Scherer
	15	*
	16	* This is the reference implementation of BOCU-1,
	17	* the MIME-friendly form of the Binary Ordered Compression for Unicode,
	18	* taken directly from ### http://oss.software.ibm.com/cvs/icu/icuhtml/design/conversion/bocu1/
	19	* The files bocu1.h and bocu1.c from the design folder are taken
	20	* verbatim (minus copyright and #include) and copied together into this file.
	21	* The reference code and some of the reference bocu1tst.c
	22	* is modified to run as part of the ICU cintltst
	23	* test framework (minus main(), log_ln() etc. instead of printf()).
	24	*
	25	* This reference implementation is used here to verify
	26	* the ICU BOCU-1 implementation, which is
	27	* adapted for ICU conversion APIs and optimized.
	28	* ### links in design doc to here and to ucnvbocu.c
	29	*/
	30
	31	#include "unicode/utypes.h"
	32	#include "unicode/ustring.h"
	33	#include "unicode/ucnv.h"
	34	#include "cmemory.h"
	35	#include "cintltst.h"
	36
	/* Number of elements of a true array; not valid for pointers or array parameters. */
	#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
	38
	39	/* icuhtml/design/conversion/bocu1/bocu1.h ---------------------------------- */
	40
	41	/* BOCU-1 constants and macros ---------------------------------------------- */
	42
	43	/*
	44	* BOCU-1 encodes the code points of a Unicode string as
	45	* a sequence of byte-encoded differences (slope detection),
	46	* preserving lexical order.
	47	*
	48	* Optimize the difference-taking for runs of Unicode text within
	49	* small scripts:
	50	*
	51	* Most small scripts are allocated within aligned 128-blocks of Unicode
	52	* code points. Lexical order is preserved if the "previous code point" state
	53	* is always moved into the middle of such a block.
	54	*
	55	* Additionally, "prev" is moved from anywhere in the Unihan and Hangul
	56	* areas into the middle of those areas.
	57	*
	58	* C0 control codes and space are encoded with their US-ASCII bytes.
	59	* "prev" is reset for C0 controls but not for space.
	60	*/
	61
	/* initial value for "prev": middle of the ASCII range */
	#define BOCU1_ASCII_PREV        0x40

	/* bounding byte values for differences */
	#define BOCU1_MIN               0x21
	#define BOCU1_MIDDLE            0x90
	#define BOCU1_MAX_LEAD          0xfe

	/* add the L suffix to make computations with BOCU1_MAX_TRAIL work on 16-bit compilers */
	#define BOCU1_MAX_TRAIL         0xffL
	#define BOCU1_RESET             0xff

	/* number of lead bytes */
	#define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)

	/* adjust trail byte counts for the use of some C0 control byte values */
	#define BOCU1_TRAIL_CONTROLS_COUNT  20
	#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

	/* number of trail bytes */
	#define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

	/*
	 * number of positive and negative single-byte codes
	 * (counting 0==BOCU1_MIDDLE among the positive ones)
	 */
	#define BOCU1_SINGLE            64

	/* number of lead bytes for positive and negative 2/3/4-byte sequences */
	#define BOCU1_LEAD_2            43
	#define BOCU1_LEAD_3            3
	#define BOCU1_LEAD_4            1

	/* The difference value range for single-byters. */
	#define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
	#define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)

	/* The difference value range for double-byters. */
	#define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
	#define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

	/*
	 * The difference value range for 3-byters:
	 * each of the BOCU1_LEAD_3 lead bytes covers TRAIL_COUNT^2 values.
	 */
	#define BOCU1_REACH_POS_3   \
	    (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

	#define BOCU1_REACH_NEG_3   \
	    (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

	/* The lead byte start values. */
	#define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
	#define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
	#define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
	     /* ==BOCU1_MAX_LEAD */

	#define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
	#define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
	#define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
	     /* ==BOCU1_MIN+1 */

	/* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */
	#define BOCU1_LENGTH_FROM_LEAD(lead) \
	    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
	     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
	     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

	/*
	 * The length of a byte sequence, according to its packed form:
	 * lengths 1..3 are stored in the top byte, while a 4-byte sequence
	 * uses the top byte for its lead byte (always >0x03).
	 */
	#define BOCU1_LENGTH_FROM_PACKED(packed) \
	    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
	129
	130	/*
	131	* 12 commonly used C0 control codes (and space) are only used to encode
	132	* themselves directly,
	133	* which makes BOCU-1 MIME-usable and reasonably safe for
	134	* ASCII-oriented software.
	135	*
	136	* These controls are
	137	* 0 NUL
	138	*
	139	* 7 BEL
	140	* 8 BS
	141	*
	142	* 9 TAB
	143	* a LF
	144	* b VT
	145	* c FF
	146	* d CR
	147	*
	148	* e SO
	149	* f SI
	150	*
	151	* 1a SUB
	152	* 1b ESC
	153	*
	154	* The other 20 C0 controls are also encoded directly (to preserve order)
	155	* but are also used as trail bytes in difference encoding
	156	* (for better compression).
	157	*/
	/*
	 * Map a trail value t (0..BOCU1_TRAIL_COUNT-1) to its external byte:
	 * values below BOCU1_TRAIL_CONTROLS_COUNT go through the bocu1TrailToByte[]
	 * table, all others are shifted by BOCU1_TRAIL_BYTE_OFFSET.
	 * Note that t is evaluated more than once.
	 */
	#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
	159
	160	/*
	161	* Byte value map for control codes,
	162	* from external byte values 0x00..0x20
	163	* to trail byte values 0..19 (0..0x13) as used in the difference calculation.
	164	* External byte values that are illegal as trail bytes are mapped to -1.
	165	*/
	166	static int8_t
	167	bocu1ByteToTrail[BOCU1_MIN]={
	168	/* 0 1 2 3 4 5 6 7 */
	169	-1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
	170
	171	/* 8 9 a b c d e f */
	172	-1, -1, -1, -1, -1, -1, -1, -1,
	173
	174	/* 10 11 12 13 14 15 16 17 */
	175	0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
	176
	177	/* 18 19 1a 1b 1c 1d 1e 1f */
	178	0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13,
	179
	180	/* 20 */
	181	-1
	182	};
	183
	184	/*
	185	* Byte value map for control codes,
	186	* from trail byte values 0..19 (0..0x13) as used in the difference calculation
	187	* to external byte values 0x00..0x20.
	188	*/
	189	static int8_t
	190	bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
	191	/* 0 1 2 3 4 5 6 7 */
	192	0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
	193
	194	/* 8 9 a b c d e f */
	195	0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
	196
	197	/* 10 11 12 13 */
	198	0x1c, 0x1d, 0x1e, 0x1f
	199	};
	200
	/**
	 * Integer division and modulo with negative numerators
	 * yields negative modulo results and quotients that are one more than
	 * what we need here.
	 * This macro adjusts the results so that the modulo-value m is always >=0.
	 *
	 * For positive n, the if() condition is always FALSE.
	 *
	 * The expansion is wrapped in do { } while(0) so that it behaves as a
	 * single statement; invoke it with a trailing semicolon.
	 *
	 * @param n Number to be split into quotient and rest.
	 *          Will be modified to contain the quotient.
	 * @param d Divisor.
	 * @param m Output variable for the rest (modulo result).
	 */
	#define NEGDIVMOD(n, d, m) do { \
	    (m)=(n)%(d); \
	    (n)/=(d); \
	    if((m)<0) { \
	        --(n); \
	        (m)+=(d); \
	    } \
	} while(0)
	222
	/*
	 * State for BOCU-1 decoder function.
	 *   prev  - "previous code point" state used for differencing
	 *   count - number of trail bytes still expected (0 = next byte is a lead byte)
	 *   diff  - partial difference accumulated from the lead and trail bytes so far
	 */
	struct Bocu1Rx {
	    int32_t prev, count, diff;
	};

	typedef struct Bocu1Rx Bocu1Rx;
	229
	230	/* Function prototypes ------------------------------------------------------ */
	231
	232	/* see bocu1.c */
	233	U_CFUNC int32_t
	234	packDiff(int32_t diff);
	235
	236	U_CFUNC int32_t
	237	encodeBocu1(int32_t *pPrev, int32_t c);
	238
	239	U_CFUNC int32_t
	240	decodeBocu1(Bocu1Rx *pRx, uint8_t b);
	241
	242	/* icuhtml/design/conversion/bocu1/bocu1.c ---------------------------------- */
	243
	244	/* BOCU-1 implementation functions ------------------------------------------ */
	245
	246	/**
	247	* Compute the next "previous" value for differencing
	248	* from the current code point.
	249	*
	250	* @param c current code point, 0..0x10ffff
	251	* @return "previous code point" state value
	252	*/
	253	static U_INLINE int32_t
	254	bocu1Prev(int32_t c) {
	255	/* compute new prev */
	256	if(0x3040<=c && c<=0x309f) {
	257	/* Hiragana is not 128-aligned */
	258	return 0x3070;
	259	} else if(0x4e00<=c && c<=0x9fa5) {
	260	/* CJK Unihan */
	261	return 0x4e00-BOCU1_REACH_NEG_2;
	262	} else if(0xac00<=c && c<=0xd7a3) {
	263	/* Korean Hangul (cast to int32_t to avoid wraparound on 16-bit compilers) */
	264	return ((int32_t)0xd7a3+(int32_t)0xac00)/2;
	265	} else {
	266	/* mostly small scripts */
	267	return (c&~0x7f)+BOCU1_ASCII_PREV;
	268	}
	269	}
	270
	271	/**
	272	* Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
	273	* and return a packed integer with them.
	274	*
	275	* The encoding favors small absolut differences with short encodings
	276	* to compress runs of same-script characters.
	277	*
	278	* @param diff difference value -0x10ffff..0x10ffff
	279	* @return
	280	* 0x010000zz for 1-byte sequence zz
	281	* 0x0200yyzz for 2-byte sequence yy zz
	282	* 0x03xxyyzz for 3-byte sequence xx yy zz
	283	* 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
	284	*/
	285	U_CFUNC int32_t
	286	packDiff(int32_t diff) {
	287	int32_t result, m, lead, count, shift;
	288
	289	if(diff>=BOCU1_REACH_NEG_1) {
	290	/* mostly positive differences, and single-byte negative ones */
	291	if(diff<=BOCU1_REACH_POS_1) {
	292	/* single byte */
	293	return 0x01000000\|(BOCU1_MIDDLE+diff);
	294	} else if(diff<=BOCU1_REACH_POS_2) {
	295	/* two bytes */
	296	diff-=BOCU1_REACH_POS_1+1;
	297	lead=BOCU1_START_POS_2;
	298	count=1;
	299	} else if(diff<=BOCU1_REACH_POS_3) {
	300	/* three bytes */
	301	diff-=BOCU1_REACH_POS_2+1;
	302	lead=BOCU1_START_POS_3;
	303	count=2;
	304	} else {
	305	/* four bytes */
	306	diff-=BOCU1_REACH_POS_3+1;
	307	lead=BOCU1_START_POS_4;
	308	count=3;
	309	}
	310	} else {
	311	/* two- and four-byte negative differences */
	312	if(diff>=BOCU1_REACH_NEG_2) {
	313	/* two bytes */
	314	diff-=BOCU1_REACH_NEG_1;
	315	lead=BOCU1_START_NEG_2;
	316	count=1;
	317	} else if(diff>=BOCU1_REACH_NEG_3) {
	318	/* three bytes */
	319	diff-=BOCU1_REACH_NEG_2;
	320	lead=BOCU1_START_NEG_3;
	321	count=2;
	322	} else {
	323	/* four bytes */
	324	diff-=BOCU1_REACH_NEG_3;
	325	lead=BOCU1_START_NEG_4;
	326	count=3;
	327	}
	328	}
	329
	330	/* encode the length of the packed result */
	331	if(count<3) {
	332	result=(count+1)<<24;
	333	} else /* count==3, MSB used for the lead byte */ {
	334	result=0;
	335	}
	336
	337	/* calculate trail bytes like digits in itoa() */
	338	shift=0;
	339	do {
	340	NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
	341	result\|=BOCU1_TRAIL_TO_BYTE(m)<<shift;
	342	shift+=8;
	343	} while(--count>0);
	344
	345	/* add lead byte */
	346	result\|=(lead+diff)<<shift;
	347
	348	return result;
	349	}
	350
	351	/**
	352	* BOCU-1 encoder function.
	353	*
	354	* @param pPrev pointer to the integer that holds
	355	* the "previous code point" state;
	356	* the initial value should be 0 which
	357	* encodeBocu1 will set to the actual BOCU-1 initial state value
	358	* @param c the code point to encode
	359	* @return the packed 1/2/3/4-byte encoding, see packDiff(),
	360	* or 0 if an error occurs
	361	*
	362	* @see packDiff
	363	*/
	364	U_CFUNC int32_t
	365	encodeBocu1(int32_t *pPrev, int32_t c) {
	366	int32_t prev;
	367
	368	if(pPrev==NULL \|\| c<0 \|\| c>0x10ffff) {
	369	/* illegal argument */
	370	return 0;
	371	}
	372
	373	prev=*pPrev;
	374	if(prev==0) {
	375	/* lenient handling of initial value 0 */
	376	prev=*pPrev=BOCU1_ASCII_PREV;
	377	}
	378
	379	if(c<=0x20) {
	380	/*
	381	* ISO C0 control & space:
	382	* Encode directly for MIME compatibility,
	383	* and reset state except for space, to not disrupt compression.
	384	*/
	385	if(c!=0x20) {
	386	*pPrev=BOCU1_ASCII_PREV;
	387	}
	388	return 0x01000000\|c;
	389	}
	390
	391	/*
	392	* all other Unicode code points c==U+0021..U+10ffff
	393	* are encoded with the difference c-prev
	394	*
	395	* a new prev is computed from c,
	396	* placed in the middle of a 0x80-block (for most small scripts) or
	397	* in the middle of the Unihan and Hangul blocks
	398	* to statistically minimize the following difference
	399	*/
	400	*pPrev=bocu1Prev(c);
	401	return packDiff(c-prev);
	402	}
	403
	404	/**
	405	* Function for BOCU-1 decoder; handles multi-byte lead bytes.
	406	*
	407	* @param pRx pointer to the decoder state structure
	408	* @param b lead byte;
	409	* BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<=BOCU1_MAX_LEAD
	410	* @return -1 (state change only)
	411	*
	412	* @see decodeBocu1
	413	*/
	414	static int32_t
	415	decodeBocu1LeadByte(Bocu1Rx *pRx, uint8_t b) {
	416	int32_t c, count;
	417
	418	if(b>=BOCU1_START_NEG_2) {
	419	/* positive difference */
	420	if(b<BOCU1_START_POS_3) {
	421	/* two bytes */
	422	c=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
	423	count=1;
	424	} else if(b<BOCU1_START_POS_4) {
	425	/* three bytes */
	426	c=((int32_t)b-BOCU1_START_POS_3)BOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
	427	count=2;
	428	} else {
	429	/* four bytes */
	430	c=BOCU1_REACH_POS_3+1;
	431	count=3;
	432	}
	433	} else {
	434	/* negative difference */
	435	if(b>=BOCU1_START_NEG_3) {
	436	/* two bytes */
	437	c=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
	438	count=1;
	439	} else if(b>BOCU1_MIN) {
	440	/* three bytes */
	441	c=((int32_t)b-BOCU1_START_NEG_3)BOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
	442	count=2;
	443	} else {
	444	/* four bytes */
	445	c=-BOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
	446	count=3;
	447	}
	448	}
	449
	450	/* set the state for decoding the trail byte(s) */
	451	pRx->diff=c;
	452	pRx->count=count;
	453	return -1;
	454	}
	455
	456	/**
	457	* Function for BOCU-1 decoder; handles multi-byte trail bytes.
	458	*
	459	* @param pRx pointer to the decoder state structure
	460	* @param b trail byte
	461	* @return result value, same as decodeBocu1
	462	*
	463	* @see decodeBocu1
	464	*/
	465	static int32_t
	466	decodeBocu1TrailByte(Bocu1Rx *pRx, uint8_t b) {
	467	int32_t t, c, count;
	468
	469	if(b<=0x20) {
	470	/* skip some C0 controls and make the trail byte range contiguous */
	471	t=bocu1ByteToTrail[b];
	472	if(t<0) {
	473	/* illegal trail byte value */
	474	pRx->prev=BOCU1_ASCII_PREV;
	475	pRx->count=0;
	476	return -99;
	477	}
	478	#if BOCU1_MAX_TRAIL<0xff
	479	} else if(b>BOCU1_MAX_TRAIL) {
	480	return -99;
	481	#endif
	482	} else {
	483	t=(int32_t)b-BOCU1_TRAIL_BYTE_OFFSET;
	484	}
	485
	486	/* add trail byte into difference and decrement count */
	487	c=pRx->diff;
	488	count=pRx->count;
	489
	490	if(count==1) {
	491	/* final trail byte, deliver a code point */
	492	c=pRx->prev+c+t;
	493	if(0<=c && c<=0x10ffff) {
	494	/* valid code point result */
	495	pRx->prev=bocu1Prev(c);
	496	pRx->count=0;
	497	return c;
	498	} else {
	499	/* illegal code point result */
	500	pRx->prev=BOCU1_ASCII_PREV;
	501	pRx->count=0;
	502	return -99;
	503	}
	504	}
	505
	506	/* intermediate trail byte */
	507	if(count==2) {
	508	pRx->diff=c+t*BOCU1_TRAIL_COUNT;
	509	} else /* count==3 */ {
	510	pRx->diff=c+tBOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNT;
	511	}
	512	pRx->count=count-1;
	513	return -1;
	514	}
	515
	516	/**
	517	* BOCU-1 decoder function.
	518	*
	519	* @param pRx pointer to the decoder state structure;
	520	* the initial values should be 0 which
	521	* decodeBocu1 will set to actual initial state values
	522	* @param b an input byte
	523	* @return
	524	* 0..0x10ffff for a result code point
	525	* -1 if only the state changed without code point output
	526	* <-1 if an error occurs
	527	*/
	528	U_CFUNC int32_t
	529	decodeBocu1(Bocu1Rx *pRx, uint8_t b) {
	530	int32_t prev, c, count;
	531
	532	if(pRx==NULL) {
	533	/* illegal argument */
	534	return -99;
	535	}
	536
	537	prev=pRx->prev;
	538	if(prev==0) {
	539	/* lenient handling of initial 0 values */
	540	prev=pRx->prev=BOCU1_ASCII_PREV;
	541	count=pRx->count=0;
	542	} else {
	543	count=pRx->count;
	544	}
	545
	546	if(count==0) {
	547	/* byte in lead position */
	548	if(b<=0x20) {
	549	/*
	550	* Direct-encoded C0 control code or space.
	551	* Reset prev for C0 control codes but not for space.
	552	*/
	553	if(b!=0x20) {
	554	pRx->prev=BOCU1_ASCII_PREV;
	555	}
	556	return b;
	557	}
	558
	559	/*
	560	* b is a difference lead byte.
	561	*
	562	* Return a code point directly from a single-byte difference.
	563	*
	564	* For multi-byte difference lead bytes, set the decoder state
	565	* with the partial difference value from the lead byte and
	566	* with the number of trail bytes.
	567	*
	568	* For four-byte differences, the signedness also affects the
	569	* first trail byte, which has special handling farther below.
	570	*/
	571	if(b>=BOCU1_START_NEG_2 && b<BOCU1_START_POS_2) {
	572	/* single-byte difference */
	573	c=prev+((int32_t)b-BOCU1_MIDDLE);
	574	pRx->prev=bocu1Prev(c);
	575	return c;
	576	} else if(b==BOCU1_RESET) {
	577	/* only reset the state, no code point */
	578	pRx->prev=BOCU1_ASCII_PREV;
	579	return -1;
	580	} else {
	581	return decodeBocu1LeadByte(pRx, b);
	582	}
	583	} else {
	584	/* trail byte in any position */
	585	return decodeBocu1TrailByte(pRx, b);
	586	}
	587	}
	588
	589	/* icuhtml/design/conversion/bocu1/bocu1tst.c ------------------------------- */
	590
	591	/* test code ---------------------------------------------------------------- */
	592
	593	/* test code options */
	594
	595	/* ignore comma when processing name lists in testText() */
	596	#define TEST_IGNORE_COMMA 1
	597
	/**
	 * Write a packed BOCU-1 byte sequence into a byte array,
	 * without overflow check.
	 * Test function.
	 *
	 * @param packed packed BOCU-1 byte sequence, see packDiff()
	 * @param p pointer to byte array
	 * @return number of bytes
	 *
	 * @see packDiff
	 */
	static int32_t
	writePacked(int32_t packed, uint8_t *p) {
	    int32_t count=BOCU1_LENGTH_FROM_PACKED(packed);

	    /* emit the bytes most-significant first; each case continues into the next */
	    switch(count) {
	    case 4:
	        *p++=(uint8_t)(packed>>24);
	        /* fallthrough */
	    case 3:
	        *p++=(uint8_t)(packed>>16);
	        /* fallthrough */
	    case 2:
	        *p++=(uint8_t)(packed>>8);
	        /* fallthrough */
	    case 1:
	        *p++=(uint8_t)packed;
	        /* fallthrough */
	    default:
	        break;
	    }

	    return count;
	}
	627
	628	/**
	629	* Unpack a packed BOCU-1 non-C0/space byte sequence and get
	630	* the difference to initialPrev.
	631	* Used only for round-trip testing of the difference encoding and decoding.
	632	* Test function.
	633	*
	634	* @param initialPrev bogus "previous code point" value to make sure that
	635	* the resulting code point is in the range 0..0x10ffff
	636	* @param packed packed BOCU-1 byte sequence
	637	* @return the difference to initialPrev
	638	*
	639	* @see packDiff
	640	* @see writeDiff
	641	*/
	642	static int32_t
	643	unpackDiff(int32_t initialPrev, int32_t packed) {
	644	Bocu1Rx rx={ 0, 0, 0 };
	645	int32_t count;
	646
	647	rx.prev=initialPrev;
	648	count=BOCU1_LENGTH_FROM_PACKED(packed);
	649	switch(count) {
	650	case 4:
	651	decodeBocu1(&rx, (uint8_t)(packed>>24));
	652	case 3:
	653	decodeBocu1(&rx, (uint8_t)(packed>>16));
	654	case 2:
	655	decodeBocu1(&rx, (uint8_t)(packed>>8));
	656	case 1:
	657	/* subtract initial prev */
	658	return decodeBocu1(&rx, (uint8_t)packed)-initialPrev;
	659	default:
	660	return -0x7fffffff;
	661	}
	662	}
	663
	/**
	 * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes,
	 * preserving lexical order.
	 * Also checks for roundtripping of the difference encoding.
	 * Test function.
	 *
	 * @param diff difference value to test, -0x10ffff..0x10ffff
	 * @param p pointer to output byte array
	 * @return p advanced by number of bytes output
	 *
	 * @see unpackDiff
	 */
	static uint8_t *
	writeDiff(int32_t diff, uint8_t *p) {
	    /* generate the difference as a packed value and serialize it */
	    int32_t packed, initialPrev, roundtrip;

	    packed=packDiff(diff);

	    /*
	     * bogus initial "prev" to work around
	     * code point range check in decodeBocu1()
	     */
	    if(diff<=0) {
	        initialPrev=0x10ffff;
	    } else {
	        initialPrev=-1;
	    }

	    roundtrip=unpackDiff(initialPrev, packed);
	    if(diff!=roundtrip) {
	        /* int32_t need not be long: cast explicitly to match %ld/%lx */
	        log_err("error: unpackDiff(packDiff(diff=%ld)=0x%08lx)=%ld!=diff\n",
	                (long)diff, (unsigned long)(uint32_t)packed, (long)roundtrip);
	    }
	    return p+writePacked(packed, p);
	}
	699
	700	/**
	701	* Encode a UTF-16 string in BOCU-1.
	702	* Does not check for overflows, but otherwise useful function.
	703	*
	704	* @param s input UTF-16 string
	705	* @param length number of UChar code units in s
	706	* @param p pointer to output byte array
	707	* @return number of bytes output
	708	*/
	709	static int32_t
	710	writeString(const UChar s, int32_t length, uint8_t p) {
	711	uint8_t *p0;
	712	int32_t c, prev, i;
	713
	714	prev=0;
	715	p0=p;
	716	i=0;
	717	while(i<length) {
	718	UTF_NEXT_CHAR(s, i, length, c);
	719	p+=writePacked(encodeBocu1(&prev, c), p);
	720	}
	721	return p-p0;
	722	}
	723
	724	/**
	725	* Decode a BOCU-1 byte sequence to a UTF-16 string.
	726	* Does not check for overflows, but otherwise useful function.
	727	*
	728	* @param p pointer to input BOCU-1 bytes
	729	* @param length number of input bytes
	730	* @param s point to output UTF-16 string array
	731	* @return number of UChar code units output
	732	*/
	733	static int32_t
	734	readString(const uint8_t p, int32_t length, UChar s) {
	735	Bocu1Rx rx={ 0, 0, 0 };
	736	int32_t c, i, sLength;
	737
	738	i=sLength=0;
	739	while(i<length) {
	740	c=decodeBocu1(&rx, p[i++]);
	741	if(c<-1) {
	742	log_err("error: readString detects encoding error at string index %ld\n", i);
	743	return -1;
	744	}
	745	if(c>=0) {
	746	UTF_APPEND_CHAR_UNSAFE(s, sLength, c);
	747	}
	748	}
	749	return sLength;
	750	}
	751
	752	static U_INLINE char
	753	hexDigit(uint8_t digit) {
	754	return digit<=9 ? (char)('0'+digit) : (char)('a'-10+digit);
	755	}
	756
	/**
	 * Pretty-print 0-terminated byte values.
	 * Helper function for test output.
	 * Each byte is written as " xx"; the output is then padded with spaces
	 * to the width of 5 bytes and 0-terminated.
	 *
	 * @param bytes 0-terminated byte array to print
	 * @param out output character buffer, not checked for overflow
	 */
	static void
	printBytes(const uint8_t *bytes, char *out) {
	    int i;
	    uint8_t b;

	    i=0;
	    while((b=*bytes++)!=0) {
	        *out++=' ';
	        *out++=hexDigit((uint8_t)(b>>4));
	        *out++=hexDigit((uint8_t)(b&0xf));
	        ++i;
	    }
	    /* pad to 5 columns of 3 characters each */
	    i=3*(5-i);
	    while(i>0) {
	        *out++=' ';
	        --i;
	    }
	    *out=0;
	}
	782
	783	/**
	784	* Basic BOCU-1 test function, called when there are no command line arguments.
	785	* Prints some of the #define values and performs round-trip tests of the
	786	* difference encoding and decoding.
	787	*/
	788	static void
	789	TestBOCU1RefDiff(void) {
	790	char buf1[80], buf2[80];
	791	uint8_t prev[5], level[5];
	792	int32_t i, cmp, countErrors;
	793
	794	log_verbose("reach of single bytes: %ld\n", 1+BOCU1_REACH_POS_1-BOCU1_REACH_NEG_1);
	795	log_verbose("reach of 2 bytes : %ld\n", 1+BOCU1_REACH_POS_2-BOCU1_REACH_NEG_2);
	796	log_verbose("reach of 3 bytes : %ld\n\n", 1+BOCU1_REACH_POS_3-BOCU1_REACH_NEG_3);
	797
	798	log_verbose(" BOCU1_REACH_NEG_1 %8ld BOCU1_REACH_POS_1 %8ld\n", BOCU1_REACH_NEG_1, BOCU1_REACH_POS_1);
	799	log_verbose(" BOCU1_REACH_NEG_2 %8ld BOCU1_REACH_POS_2 %8ld\n", BOCU1_REACH_NEG_2, BOCU1_REACH_POS_2);
	800	log_verbose(" BOCU1_REACH_NEG_3 %8ld BOCU1_REACH_POS_3 %8ld\n\n", BOCU1_REACH_NEG_3, BOCU1_REACH_POS_3);
	801
	802	log_verbose(" BOCU1_MIDDLE 0x%02x\n", BOCU1_MIDDLE);
	803	log_verbose(" BOCU1_START_NEG_2 0x%02x BOCU1_START_POS_2 0x%02x\n", BOCU1_START_NEG_2, BOCU1_START_POS_2);
	804	log_verbose(" BOCU1_START_NEG_3 0x%02x BOCU1_START_POS_3 0x%02x\n\n", BOCU1_START_NEG_3, BOCU1_START_POS_3);
	805
	806	/* test packDiff() & unpackDiff() with some specific values */
	807	writeDiff(0, level);
	808	writeDiff(1, level);
	809	writeDiff(65, level);
	810	writeDiff(130, level);
	811	writeDiff(30000, level);
	812	writeDiff(1000000, level);
	813	writeDiff(-65, level);
	814	writeDiff(-130, level);
	815	writeDiff(-30000, level);
	816	writeDiff(-1000000, level);
	817
	818	/* test that each value is smaller than any following one */
	819	countErrors=0;
	820	i=-0x10ffff;
	821	*writeDiff(i, prev)=0;
	822
	823	/* show first number and bytes */
	824	printBytes(prev, buf1);
	825	log_verbose(" wD(%8ld) %s\n", i, buf1);
	826
	827	for(++i; i<=0x10ffff; ++i) {
	828	*writeDiff(i, level)=0;
	829	cmp=strcmp((const char )prev, (const char )level);
	830	if(BOCU1_LENGTH_FROM_LEAD(level[0])!=(int32_t)strlen((const char *)level)) {
	831	log_verbose("BOCU1_LENGTH_FROM_LEAD(0x%02x)=%ld!=%ld=strlen(writeDiff(%ld))\n",
	832	level[0], BOCU1_LENGTH_FROM_LEAD(level[0]), strlen((const char *)level), i);
	833	}
	834	if(cmp<0) {
	835	if(i==0 \|\| i==1 \|\| strlen((const char )prev)!=strlen((const char )level)) {
	836	/*
	837	* if the result is good, then print only if the length changed
	838	* to get little but interesting output
	839	*/
	840	printBytes(prev, buf1);
	841	printBytes(level, buf2);
	842	log_verbose("ok: strcmp(wD(%8ld), wD(%8ld))=%2d %s%s\n", i-1, i, cmp, buf1, buf2);
	843	}
	844	} else {
	845	++countErrors;
	846	printBytes(prev, buf1);
	847	printBytes(level, buf2);
	848	log_verbose("wrong: strcmp(wD(%8ld), wD(%8ld))=%2d %s%s\n", i-1, i, cmp, buf1, buf2);
	849	}
	850	/* remember the previous bytes */
	851	memcpy(prev, level, 4);
	852	}
	853
	854	/* show last number and bytes */
	855	printBytes((uint8_t *)"", buf1);
	856	printBytes(prev, buf2);
	857	log_verbose(" wD(%8ld) %s%s\n", i-1, buf1, buf2);
	858
	859	if(countErrors==0) {
	860	log_verbose("writeDiff(-0x10ffff..0x10ffff) works fine\n");
	861	} else {
	862	log_err("writeDiff(-0x10ffff..0x10ffff) violates lexical ordering in %d cases\n", countErrors);
	863	}
	864
	865	/* output signature byte sequence */
	866	i=0;
	867	writePacked(encodeBocu1(&i, 0xfeff), level);
	868	log_verbose("\nBOCU-1 signature byte sequence: %02x %02x %02x\n",
	869	level[0], level[1], level[2]);
	870	}
	871
	872	/* cintltst code ------------------------------------------------------------ */
	873
	874	/* test one string with the ICU and the reference BOCU-1 implementations */
	875	static void
	876	roundtripBOCU1(UConverter bocu1, int32_t number, const UChar text, int32_t length) {
	877	static UChar roundtripRef[30000], roundtripICU[30000];
	878	static char bocu1Ref[30000], bocu1ICU[30000];
	879
	880	int32_t bocu1RefLength, bocu1ICULength, roundtripRefLength, roundtripICULength;
	881	UErrorCode errorCode;
	882
	883	/* Unicode -> BOCU-1 */
	884	bocu1RefLength=writeString(text, length, (uint8_t *)bocu1Ref);
	885
	886	errorCode=U_ZERO_ERROR;
	887	bocu1ICULength=ucnv_fromUChars(bocu1, bocu1ICU, sizeof(bocu1ICU), text, length, &errorCode);
	888	if(U_FAILURE(errorCode)) {
	889	log_err("ucnv_fromUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode));
	890	return;
	891	}
	892
	893	if(bocu1RefLength!=bocu1ICULength \|\| 0!=uprv_memcmp(bocu1Ref, bocu1ICU, bocu1RefLength)) {
	894	log_err("Unicode(%d)[%d] -> BOCU-1: reference[%d]!=ICU[%d]\n", number, length, bocu1RefLength, bocu1ICULength);
	895	return;
	896	}
	897
	898	/* BOCU-1 -> Unicode */
	899	roundtripRefLength=readString((uint8_t *)bocu1Ref, bocu1RefLength, roundtripRef);
	900	if(roundtripRefLength<0) {
	901	return; /* readString() found an error and reported it */
	902	}
	903
	904	roundtripICULength=ucnv_toUChars(bocu1, roundtripICU, sizeof(roundtripICU)/U_SIZEOF_UCHAR, bocu1ICU, bocu1ICULength, &errorCode);
	905	if(U_FAILURE(errorCode)) {
	906	log_err("ucnv_toUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode));
	907	return;
	908	}
	909
	910	if(length!=roundtripRefLength \|\| 0!=u_memcmp(text, roundtripRef, length)) {
	911	log_err("BOCU-1 -> Unicode: original(%d)[%d]!=reference[%d]\n", number, length, roundtripRefLength);
	912	return;
	913	}
	914	if(roundtripRefLength!=roundtripICULength \|\| 0!=u_memcmp(roundtripRef, roundtripICU, roundtripRefLength)) {
	915	log_err("BOCU-1 -> Unicode: reference(%d)[%d]!=ICU[%d]\n", number, roundtripRefLength, roundtripICULength);
	916	return;
	917	}
	918	}
	919
	920	static const UChar feff[]={ 0xfeff };
	921	static const UChar ascii[]={ 0x61, 0x62, 0x20, 0x63, 0x61 };
	922	static const UChar crlf[]={ 0xd, 0xa, 0x20 };
	923	static const UChar nul[]={ 0 };
	924	static const UChar latin[]={ 0xdf, 0xe6 };
	925	static const UChar devanagari[]={ 0x930, 0x20, 0x918, 0x909 };
	926	static const UChar hiragana[]={ 0x3086, 0x304d, 0x20, 0x3053, 0x4000 };
	927	static const UChar unihan[]={ 0x4e00, 0x7777, 0x20, 0x9fa5, 0x4e00 };
	928	static const UChar hangul[]={ 0xac00, 0xbcde, 0x20, 0xd7a3 };
	929	static const UChar surrogates[]={ 0xdc00, 0xd800 }; /* single surrogates, unmatched! */
	930	static const UChar plane1[]={ 0xd800, 0xdc00 };
	931	static const UChar plane2[]={ 0xd845, 0xdddd };
	932	static const UChar plane15[]={ 0xdbbb, 0xddee, 0x20 };
	933	static const UChar plane16[]={ 0xdbff, 0xdfff };
	934	static const UChar c0[]={ 1, 0xe40, 0x20, 9 };
	935
	936	static const struct {
	937	const UChar *s;
	938	int32_t length;
	939	} strings[]={
	940	{ feff, LENGTHOF(feff) },
	941	{ ascii, LENGTHOF(ascii) },
	942	{ crlf, LENGTHOF(crlf) },
	943	{ nul, LENGTHOF(nul) },
	944	{ latin, LENGTHOF(latin) },
	945	{ devanagari, LENGTHOF(devanagari) },
	946	{ hiragana, LENGTHOF(hiragana) },
	947	{ unihan, LENGTHOF(unihan) },
	948	{ hangul, LENGTHOF(hangul) },
	949	{ surrogates, LENGTHOF(surrogates) },
	950	{ plane1, LENGTHOF(plane1) },
	951	{ plane2, LENGTHOF(plane2) },
	952	{ plane15, LENGTHOF(plane15) },
	953	{ plane16, LENGTHOF(plane16) },
	954	{ c0, LENGTHOF(c0) }
	955	};
	956
	957	/*
	958	* Verify that the ICU BOCU-1 implementation produces the same results as
	959	* the reference implementation from the design folder.
	960	* Generate some texts and convert them with both converters, verifying
	961	* identical results and roundtripping.
	962	*/
	963	static void
	964	TestBOCU1(void) {
	965	UChar text[30000];
	966	int32_t i, length;
	967
	968	UConverter *bocu1;
	969	UErrorCode errorCode;
	970
	971	errorCode=U_ZERO_ERROR;
	972	bocu1=ucnv_open("BOCU-1", &errorCode);
	973	if(U_FAILURE(errorCode)) {
	974	log_err("error: unable to open BOCU-1 converter: %s\n", u_errorName(errorCode));
	975	return;
	976	}
	977
	978	/* text 1: each of strings[] once */
	979	length=0;
	980	for(i=0; i<LENGTHOF(strings); ++i) {
	981	u_memcpy(text+length, strings[i].s, strings[i].length);
	982	length+=strings[i].length;
	983	}
	984	roundtripBOCU1(bocu1, 1, text, length);
	985
	986	/* text 2: each of strings[] twice */
	987	length=0;
	988	for(i=0; i<LENGTHOF(strings); ++i) {
	989	u_memcpy(text+length, strings[i].s, strings[i].length);
	990	length+=strings[i].length;
	991	u_memcpy(text+length, strings[i].s, strings[i].length);
	992	length+=strings[i].length;
	993	}
	994	roundtripBOCU1(bocu1, 2, text, length);
	995
	996	/* text 3: each of strings[] many times (set step vs. \|strings\| so that all strings are used) */
	997	length=0;
	998	for(i=1; length<5000; i+=7) {
	999	if(i>=LENGTHOF(strings)) {
	1000	i-=LENGTHOF(strings);
	1001	}
	1002	u_memcpy(text+length, strings[i].s, strings[i].length);
	1003	length+=strings[i].length;
	1004	}
	1005	roundtripBOCU1(bocu1, 3, text, length);
	1006
	1007	ucnv_close(bocu1);
	1008	}
	1009
	1010	U_CFUNC void addBOCU1Tests(TestNode** root);
	1011
	1012	U_CFUNC void
	1013	addBOCU1Tests(TestNode** root) {
	1014	addTest(root, TestBOCU1RefDiff, "tsconv/bocu1tst/TestBOCU1RefDiff");
	1015	addTest(root, TestBOCU1, "tsconv/bocu1tst/TestBOCU1");
	1016	}