]> git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/test/cintltst/bocu1tst.c
ICU-6.2.22.tar.gz
[apple/icu.git] / icuSources / test / cintltst / bocu1tst.c
... / ...
CommitLineData
1/*
2******************************************************************************
3*
4* Copyright (C) 2002-2003, International Business Machines
5* Corporation and others. All Rights Reserved.
6*
7******************************************************************************
8* file name: bocu1tst.c
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2002may27
14* created by: Markus W. Scherer
15*
16* This is the reference implementation of BOCU-1,
17* the MIME-friendly form of the Binary Ordered Compression for Unicode,
18* taken directly from ### http://oss.software.ibm.com/cvs/icu/icuhtml/design/conversion/bocu1/
19* The files bocu1.h and bocu1.c from the design folder are taken
20* verbatim (minus copyright and #include) and copied together into this file.
21* The reference code and some of the reference bocu1tst.c
22* is modified to run as part of the ICU cintltst
23* test framework (minus main(), log_ln() etc. instead of printf()).
24*
25* This reference implementation is used here to verify
26* the ICU BOCU-1 implementation, which is
27* adapted for ICU conversion APIs and optimized.
28* ### links in design doc to here and to ucnvbocu.c
29*/
30
31#include "unicode/utypes.h"
32#include "unicode/ustring.h"
33#include "unicode/ucnv.h"
34#include "cmemory.h"
35#include "cintltst.h"
36
/*
 * Number of elements in a true array (must not be applied to a pointer).
 * The result is cast to int32_t so that comparisons against the signed
 * int32_t counters used in this file are not silently performed in
 * unsigned arithmetic.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
38
39/* icuhtml/design/conversion/bocu1/bocu1.h ---------------------------------- */
40
41/* BOCU-1 constants and macros ---------------------------------------------- */
42
43/*
44 * BOCU-1 encodes the code points of a Unicode string as
45 * a sequence of byte-encoded differences (slope detection),
46 * preserving lexical order.
47 *
48 * Optimize the difference-taking for runs of Unicode text within
49 * small scripts:
50 *
51 * Most small scripts are allocated within aligned 128-blocks of Unicode
52 * code points. Lexical order is preserved if the "previous code point" state
53 * is always moved into the middle of such a block.
54 *
55 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
56 * areas into the middle of those areas.
57 *
58 * C0 control codes and space are encoded with their US-ASCII bytes.
59 * "prev" is reset for C0 controls but not for space.
60 */
61
62/* initial value for "prev": middle of the ASCII range */
/* "prev" starts out in the middle of the ASCII range */
#define BOCU1_ASCII_PREV            0x40

/* limits of the byte values that encode differences */
#define BOCU1_MIN                   0x21
#define BOCU1_MIDDLE                0x90
#define BOCU1_MAX_LEAD              0xfe

/* the L suffix keeps computations with BOCU1_MAX_TRAIL working on 16-bit compilers */
#define BOCU1_MAX_TRAIL             0xffL
#define BOCU1_RESET                 0xff

/* how many lead byte values exist */
#define BOCU1_COUNT                 (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/* some C0 control byte values double as trail bytes; account for them */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* how many trail byte values exist */
#define BOCU1_TRAIL_COUNT \
    ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * How many single-byte codes exist on the positive and on the negative side
 * (the value 0==BOCU1_MIDDLE is counted as positive).
 */
#define BOCU1_SINGLE                64

/* lead byte counts for positive and for negative 2-, 3- and 4-byte sequences */
#define BOCU1_LEAD_2                43
#define BOCU1_LEAD_3                3
#define BOCU1_LEAD_4                1

/* range of differences covered by one byte */
#define BOCU1_REACH_POS_1           (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1           (-BOCU1_SINGLE)

/* range of differences covered by two bytes */
#define BOCU1_REACH_POS_2 \
    (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2 \
    (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* range of differences covered by three bytes */
#define BOCU1_REACH_POS_3 \
    (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_3 \
    (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/* first lead byte of each positive multi-byte group */
#define BOCU1_START_POS_2           (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3           (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4           (BOCU1_START_POS_3+BOCU1_LEAD_3)
     /* BOCU1_START_POS_4==BOCU1_MAX_LEAD */

/* boundary lead bytes of the negative multi-byte groups */
#define BOCU1_START_NEG_2           (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3           (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4           (BOCU1_START_NEG_3-BOCU1_LEAD_3)
     /* BOCU1_START_NEG_4==BOCU1_MIN+1 */

/*
 * Sequence length (1..4) implied by a lead byte.
 * Not meaningful for lead==BOCU1_RESET. Evaluates its argument several times.
 */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

/*
 * Sequence length of a packed value as returned by packDiff():
 * the top byte holds the length for 1..3 bytes, anything else is a 4-byte sequence.
 * Evaluates its argument twice.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
129
130/*
131 * 12 commonly used C0 control codes (and space) are only used to encode
132 * themselves directly,
133 * which makes BOCU-1 MIME-usable and reasonably safe for
134 * ASCII-oriented software.
135 *
136 * These controls are
137 * 0 NUL
138 *
139 * 7 BEL
140 * 8 BS
141 *
142 * 9 TAB
143 * a LF
144 * b VT
145 * c FF
146 * d CR
147 *
148 * e SO
149 * f SI
150 *
151 * 1a SUB
152 * 1b ESC
153 *
154 * The other 20 C0 controls are also encoded directly (to preserve order)
155 * but are also used as trail bytes in difference encoding
156 * (for better compression).
157 */
/*
 * Map a trail value t (as used in the difference calculation) to its
 * external byte value: the first BOCU1_TRAIL_CONTROLS_COUNT values go
 * through the bocu1TrailToByte[] table, all others are simply shifted by
 * BOCU1_TRAIL_BYTE_OFFSET. Evaluates t more than once.
 */
#define BOCU1_TRAIL_TO_BYTE(t) \
    ((t)<BOCU1_TRAIL_CONTROLS_COUNT ? bocu1TrailToByte[t] : (t)+BOCU1_TRAIL_BYTE_OFFSET)
159
160/*
161 * Byte value map for control codes,
162 * from external byte values 0x00..0x20
163 * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
164 * External byte values that are illegal as trail bytes are mapped to -1.
165 */
166static int8_t
167bocu1ByteToTrail[BOCU1_MIN]={
168/* 0 1 2 3 4 5 6 7 */
169 -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
170
171/* 8 9 a b c d e f */
172 -1, -1, -1, -1, -1, -1, -1, -1,
173
174/* 10 11 12 13 14 15 16 17 */
175 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
176
177/* 18 19 1a 1b 1c 1d 1e 1f */
178 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13,
179
180/* 20 */
181 -1
182};
183
184/*
185 * Byte value map for control codes,
186 * from trail byte values 0..19 (0..0x13) as used in the difference calculation
187 * to external byte values 0x00..0x20.
188 */
189static int8_t
190bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
191/* 0 1 2 3 4 5 6 7 */
192 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
193
194/* 8 9 a b c d e f */
195 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
196
197/* 10 11 12 13 */
198 0x1c, 0x1d, 0x1e, 0x1f
199};
200
201/**
202 * Integer division and modulo with negative numerators
203 * yields negative modulo results and quotients that are one more than
204 * what we need here.
205 * This macro adjusts the results so that the modulo-value m is always >=0.
206 *
207 * For positive n, the if() condition is always FALSE.
208 *
209 * @param n Number to be split into quotient and rest.
210 * Will be modified to contain the quotient.
211 * @param d Divisor.
212 * @param m Output variable for the rest (modulo result).
213 */
/*
 * Floor-style division: n becomes the quotient and m the remainder,
 * corrected so that m is never negative when d is positive.
 * Wrapped in do { } while(0) so that "NEGDIVMOD(...);" is a single
 * statement and can be used safely in an unbraced if/else.
 * n, d and m are evaluated more than once.
 */
#define NEGDIVMOD(n, d, m) do { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        /* C truncates toward zero; step down to the floor */ \
        --(n); \
        (m)+=(d); \
    } \
} while(0)
222
223/* State for BOCU-1 decoder function. */
/* Decoder state carried between calls to decodeBocu1(). */
typedef struct Bocu1Rx {
    int32_t prev;   /* "previous code point" state used for differencing; 0 means "not yet initialized" */
    int32_t count;  /* number of trail bytes still expected; 0 when the next byte is a lead byte */
    int32_t diff;   /* partial difference accumulated from the lead and trail bytes seen so far */
} Bocu1Rx;
229
230/* Function prototypes ------------------------------------------------------ */
231
232/* see bocu1.c */
233U_CFUNC int32_t
234packDiff(int32_t diff);
235
236U_CFUNC int32_t
237encodeBocu1(int32_t *pPrev, int32_t c);
238
239U_CFUNC int32_t
240decodeBocu1(Bocu1Rx *pRx, uint8_t b);
241
242/* icuhtml/design/conversion/bocu1/bocu1.c ---------------------------------- */
243
244/* BOCU-1 implementation functions ------------------------------------------ */
245
246/**
247 * Compute the next "previous" value for differencing
248 * from the current code point.
249 *
250 * @param c current code point, 0..0x10ffff
251 * @return "previous code point" state value
252 */
253static U_INLINE int32_t
254bocu1Prev(int32_t c) {
255 /* compute new prev */
256 if(0x3040<=c && c<=0x309f) {
257 /* Hiragana is not 128-aligned */
258 return 0x3070;
259 } else if(0x4e00<=c && c<=0x9fa5) {
260 /* CJK Unihan */
261 return 0x4e00-BOCU1_REACH_NEG_2;
262 } else if(0xac00<=c && c<=0xd7a3) {
263 /* Korean Hangul (cast to int32_t to avoid wraparound on 16-bit compilers) */
264 return ((int32_t)0xd7a3+(int32_t)0xac00)/2;
265 } else {
266 /* mostly small scripts */
267 return (c&~0x7f)+BOCU1_ASCII_PREV;
268 }
269}
270
271/**
272 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
273 * and return a packed integer with them.
274 *
275 * The encoding favors small absolute differences with short encodings
276 * to compress runs of same-script characters.
277 *
278 * @param diff difference value -0x10ffff..0x10ffff
279 * @return
280 * 0x010000zz for 1-byte sequence zz
281 * 0x0200yyzz for 2-byte sequence yy zz
282 * 0x03xxyyzz for 3-byte sequence xx yy zz
283 * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
284 */
285U_CFUNC int32_t
286packDiff(int32_t diff) {
287 int32_t result, m, lead, count, shift;
288
289 if(diff>=BOCU1_REACH_NEG_1) {
290 /* mostly positive differences, and single-byte negative ones */
291 if(diff<=BOCU1_REACH_POS_1) {
292 /* single byte */
293 return 0x01000000|(BOCU1_MIDDLE+diff);
294 } else if(diff<=BOCU1_REACH_POS_2) {
295 /* two bytes */
296 diff-=BOCU1_REACH_POS_1+1;
297 lead=BOCU1_START_POS_2;
298 count=1;
299 } else if(diff<=BOCU1_REACH_POS_3) {
300 /* three bytes */
301 diff-=BOCU1_REACH_POS_2+1;
302 lead=BOCU1_START_POS_3;
303 count=2;
304 } else {
305 /* four bytes */
306 diff-=BOCU1_REACH_POS_3+1;
307 lead=BOCU1_START_POS_4;
308 count=3;
309 }
310 } else {
311 /* two- and four-byte negative differences */
312 if(diff>=BOCU1_REACH_NEG_2) {
313 /* two bytes */
314 diff-=BOCU1_REACH_NEG_1;
315 lead=BOCU1_START_NEG_2;
316 count=1;
317 } else if(diff>=BOCU1_REACH_NEG_3) {
318 /* three bytes */
319 diff-=BOCU1_REACH_NEG_2;
320 lead=BOCU1_START_NEG_3;
321 count=2;
322 } else {
323 /* four bytes */
324 diff-=BOCU1_REACH_NEG_3;
325 lead=BOCU1_START_NEG_4;
326 count=3;
327 }
328 }
329
330 /* encode the length of the packed result */
331 if(count<3) {
332 result=(count+1)<<24;
333 } else /* count==3, MSB used for the lead byte */ {
334 result=0;
335 }
336
337 /* calculate trail bytes like digits in itoa() */
338 shift=0;
339 do {
340 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
341 result|=BOCU1_TRAIL_TO_BYTE(m)<<shift;
342 shift+=8;
343 } while(--count>0);
344
345 /* add lead byte */
346 result|=(lead+diff)<<shift;
347
348 return result;
349}
350
351/**
352 * BOCU-1 encoder function.
353 *
354 * @param pPrev pointer to the integer that holds
355 * the "previous code point" state;
356 * the initial value should be 0 which
357 * encodeBocu1 will set to the actual BOCU-1 initial state value
358 * @param c the code point to encode
359 * @return the packed 1/2/3/4-byte encoding, see packDiff(),
360 * or 0 if an error occurs
361 *
362 * @see packDiff
363 */
364U_CFUNC int32_t
365encodeBocu1(int32_t *pPrev, int32_t c) {
366 int32_t prev;
367
368 if(pPrev==NULL || c<0 || c>0x10ffff) {
369 /* illegal argument */
370 return 0;
371 }
372
373 prev=*pPrev;
374 if(prev==0) {
375 /* lenient handling of initial value 0 */
376 prev=*pPrev=BOCU1_ASCII_PREV;
377 }
378
379 if(c<=0x20) {
380 /*
381 * ISO C0 control & space:
382 * Encode directly for MIME compatibility,
383 * and reset state except for space, to not disrupt compression.
384 */
385 if(c!=0x20) {
386 *pPrev=BOCU1_ASCII_PREV;
387 }
388 return 0x01000000|c;
389 }
390
391 /*
392 * all other Unicode code points c==U+0021..U+10ffff
393 * are encoded with the difference c-prev
394 *
395 * a new prev is computed from c,
396 * placed in the middle of a 0x80-block (for most small scripts) or
397 * in the middle of the Unihan and Hangul blocks
398 * to statistically minimize the following difference
399 */
400 *pPrev=bocu1Prev(c);
401 return packDiff(c-prev);
402}
403
404/**
405 * Function for BOCU-1 decoder; handles multi-byte lead bytes.
406 *
407 * @param pRx pointer to the decoder state structure
408 * @param b lead byte;
409 * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<=BOCU1_MAX_LEAD
410 * @return -1 (state change only)
411 *
412 * @see decodeBocu1
413 */
414static int32_t
415decodeBocu1LeadByte(Bocu1Rx *pRx, uint8_t b) {
416 int32_t c, count;
417
418 if(b>=BOCU1_START_NEG_2) {
419 /* positive difference */
420 if(b<BOCU1_START_POS_3) {
421 /* two bytes */
422 c=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
423 count=1;
424 } else if(b<BOCU1_START_POS_4) {
425 /* three bytes */
426 c=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
427 count=2;
428 } else {
429 /* four bytes */
430 c=BOCU1_REACH_POS_3+1;
431 count=3;
432 }
433 } else {
434 /* negative difference */
435 if(b>=BOCU1_START_NEG_3) {
436 /* two bytes */
437 c=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
438 count=1;
439 } else if(b>BOCU1_MIN) {
440 /* three bytes */
441 c=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
442 count=2;
443 } else {
444 /* four bytes */
445 c=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
446 count=3;
447 }
448 }
449
450 /* set the state for decoding the trail byte(s) */
451 pRx->diff=c;
452 pRx->count=count;
453 return -1;
454}
455
456/**
457 * Function for BOCU-1 decoder; handles multi-byte trail bytes.
458 *
459 * @param pRx pointer to the decoder state structure
460 * @param b trail byte
461 * @return result value, same as decodeBocu1
462 *
463 * @see decodeBocu1
464 */
465static int32_t
466decodeBocu1TrailByte(Bocu1Rx *pRx, uint8_t b) {
467 int32_t t, c, count;
468
469 if(b<=0x20) {
470 /* skip some C0 controls and make the trail byte range contiguous */
471 t=bocu1ByteToTrail[b];
472 if(t<0) {
473 /* illegal trail byte value */
474 pRx->prev=BOCU1_ASCII_PREV;
475 pRx->count=0;
476 return -99;
477 }
478#if BOCU1_MAX_TRAIL<0xff
479 } else if(b>BOCU1_MAX_TRAIL) {
480 return -99;
481#endif
482 } else {
483 t=(int32_t)b-BOCU1_TRAIL_BYTE_OFFSET;
484 }
485
486 /* add trail byte into difference and decrement count */
487 c=pRx->diff;
488 count=pRx->count;
489
490 if(count==1) {
491 /* final trail byte, deliver a code point */
492 c=pRx->prev+c+t;
493 if(0<=c && c<=0x10ffff) {
494 /* valid code point result */
495 pRx->prev=bocu1Prev(c);
496 pRx->count=0;
497 return c;
498 } else {
499 /* illegal code point result */
500 pRx->prev=BOCU1_ASCII_PREV;
501 pRx->count=0;
502 return -99;
503 }
504 }
505
506 /* intermediate trail byte */
507 if(count==2) {
508 pRx->diff=c+t*BOCU1_TRAIL_COUNT;
509 } else /* count==3 */ {
510 pRx->diff=c+t*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT;
511 }
512 pRx->count=count-1;
513 return -1;
514}
515
516/**
517 * BOCU-1 decoder function.
518 *
519 * @param pRx pointer to the decoder state structure;
520 * the initial values should be 0 which
521 * decodeBocu1 will set to actual initial state values
522 * @param b an input byte
523 * @return
524 * 0..0x10ffff for a result code point
525 * -1 if only the state changed without code point output
526 * <-1 if an error occurs
527 */
528U_CFUNC int32_t
529decodeBocu1(Bocu1Rx *pRx, uint8_t b) {
530 int32_t prev, c, count;
531
532 if(pRx==NULL) {
533 /* illegal argument */
534 return -99;
535 }
536
537 prev=pRx->prev;
538 if(prev==0) {
539 /* lenient handling of initial 0 values */
540 prev=pRx->prev=BOCU1_ASCII_PREV;
541 count=pRx->count=0;
542 } else {
543 count=pRx->count;
544 }
545
546 if(count==0) {
547 /* byte in lead position */
548 if(b<=0x20) {
549 /*
550 * Direct-encoded C0 control code or space.
551 * Reset prev for C0 control codes but not for space.
552 */
553 if(b!=0x20) {
554 pRx->prev=BOCU1_ASCII_PREV;
555 }
556 return b;
557 }
558
559 /*
560 * b is a difference lead byte.
561 *
562 * Return a code point directly from a single-byte difference.
563 *
564 * For multi-byte difference lead bytes, set the decoder state
565 * with the partial difference value from the lead byte and
566 * with the number of trail bytes.
567 *
568 * For four-byte differences, the signedness also affects the
569 * first trail byte, which has special handling farther below.
570 */
571 if(b>=BOCU1_START_NEG_2 && b<BOCU1_START_POS_2) {
572 /* single-byte difference */
573 c=prev+((int32_t)b-BOCU1_MIDDLE);
574 pRx->prev=bocu1Prev(c);
575 return c;
576 } else if(b==BOCU1_RESET) {
577 /* only reset the state, no code point */
578 pRx->prev=BOCU1_ASCII_PREV;
579 return -1;
580 } else {
581 return decodeBocu1LeadByte(pRx, b);
582 }
583 } else {
584 /* trail byte in any position */
585 return decodeBocu1TrailByte(pRx, b);
586 }
587}
588
589/* icuhtml/design/conversion/bocu1/bocu1tst.c ------------------------------- */
590
591/* test code ---------------------------------------------------------------- */
592
593/* test code options */
594
595/* ignore comma when processing name lists in testText() */
/* NOTE(review): not referenced anywhere in the visible part of this file; testText() is not defined here */
#define TEST_IGNORE_COMMA   1
597
598/**
599 * Write a packed BOCU-1 byte sequence into a byte array,
600 * without overflow check.
601 * Test function.
602 *
603 * @param packed packed BOCU-1 byte sequence, see packDiff()
604 * @param p pointer to byte array
605 * @return number of bytes
606 *
607 * @see packDiff
608 */
static int32_t
writePacked(int32_t packed, uint8_t *p) {
    int32_t length=BOCU1_LENGTH_FROM_PACKED(packed);
    int32_t shift;

    /*
     * Emit the low "length" bytes, most significant first.
     * For length==0 the start value is negative and nothing is written.
     */
    for(shift=8*(length-1); shift>=0; shift-=8) {
        *p++=(uint8_t)(packed>>shift);
    }

    return length;
}
627
628/**
629 * Unpack a packed BOCU-1 non-C0/space byte sequence and get
630 * the difference to initialPrev.
631 * Used only for round-trip testing of the difference encoding and decoding.
632 * Test function.
633 *
634 * @param initialPrev bogus "previous code point" value to make sure that
635 * the resulting code point is in the range 0..0x10ffff
636 * @param packed packed BOCU-1 byte sequence
637 * @return the difference to initialPrev
638 *
639 * @see packDiff
640 * @see writeDiff
641 */
642static int32_t
643unpackDiff(int32_t initialPrev, int32_t packed) {
644 Bocu1Rx rx={ 0, 0, 0 };
645 int32_t count;
646
647 rx.prev=initialPrev;
648 count=BOCU1_LENGTH_FROM_PACKED(packed);
649 switch(count) {
650 case 4:
651 decodeBocu1(&rx, (uint8_t)(packed>>24));
652 case 3:
653 decodeBocu1(&rx, (uint8_t)(packed>>16));
654 case 2:
655 decodeBocu1(&rx, (uint8_t)(packed>>8));
656 case 1:
657 /* subtract initial prev */
658 return decodeBocu1(&rx, (uint8_t)packed)-initialPrev;
659 default:
660 return -0x7fffffff;
661 }
662}
663
664/**
665 * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes,
666 * preserving lexical order.
667 * Also checks for roundtripping of the difference encoding.
668 * Test function.
669 *
670 * @param diff difference value to test, -0x10ffff..0x10ffff
671 * @param p pointer to output byte array
672 * @return p advanced by number of bytes output
673 *
674 * @see unpackDiff
675 */
/*
 * Pack diff, verify that it unpacks to the same value, and serialize the
 * bytes at p. Returns p advanced past the bytes written.
 *
 * The log arguments are cast to long/unsigned long so that they match the
 * %ld/%lx conversions on platforms where int32_t is narrower than long.
 */
static uint8_t *
writeDiff(int32_t diff, uint8_t *p) {
    int32_t packed, initialPrev, roundtrip;

    /* generate the difference as a packed value */
    packed=packDiff(diff);

    /*
     * bogus initial "prev" to work around
     * code point range check in decodeBocu1()
     */
    if(diff<=0) {
        initialPrev=0x10ffff;
    } else {
        initialPrev=-1;
    }

    /* decode once and reuse the result for both the check and the message */
    roundtrip=unpackDiff(initialPrev, packed);
    if(diff!=roundtrip) {
        log_err("error: unpackDiff(packDiff(diff=%ld)=0x%08lx)=%ld!=diff\n",
                (long)diff, (unsigned long)(uint32_t)packed, (long)roundtrip);
    }

    /* serialize the packed value */
    return p+writePacked(packed, p);
}
699
700/**
701 * Encode a UTF-16 string in BOCU-1.
702 * Does not check for overflows, but otherwise useful function.
703 *
704 * @param s input UTF-16 string
705 * @param length number of UChar code units in s
706 * @param p pointer to output byte array
707 * @return number of bytes output
708 */
709static int32_t
710writeString(const UChar *s, int32_t length, uint8_t *p) {
711 uint8_t *p0;
712 int32_t c, prev, i;
713
714 prev=0;
715 p0=p;
716 i=0;
717 while(i<length) {
718 UTF_NEXT_CHAR(s, i, length, c);
719 p+=writePacked(encodeBocu1(&prev, c), p);
720 }
721 return p-p0;
722}
723
724/**
725 * Decode a BOCU-1 byte sequence to a UTF-16 string.
726 * Does not check for overflows, but otherwise useful function.
727 *
728 * @param p pointer to input BOCU-1 bytes
729 * @param length number of input bytes
730 * @param s pointer to output UTF-16 string array
731 * @return number of UChar code units output
732 */
733static int32_t
734readString(const uint8_t *p, int32_t length, UChar *s) {
735 Bocu1Rx rx={ 0, 0, 0 };
736 int32_t c, i, sLength;
737
738 i=sLength=0;
739 while(i<length) {
740 c=decodeBocu1(&rx, p[i++]);
741 if(c<-1) {
742 log_err("error: readString detects encoding error at string index %ld\n", i);
743 return -1;
744 }
745 if(c>=0) {
746 UTF_APPEND_CHAR_UNSAFE(s, sLength, c);
747 }
748 }
749 return sLength;
750}
751
752static U_INLINE char
753hexDigit(uint8_t digit) {
754 return digit<=9 ? (char)('0'+digit) : (char)('a'-10+digit);
755}
756
757/**
758 * Pretty-print 0-terminated byte values.
759 * Helper function for test output.
760 *
761 * @param bytes 0-terminated byte array to print
762 */
static void
printBytes(uint8_t *bytes, char *out) {
    int printed=0;
    int padding;
    uint8_t value;

    /* " xx" for every byte up to (not including) the terminating 0 */
    for(value=*bytes++; value!=0; value=*bytes++) {
        *out++=' ';
        *out++=hexDigit((uint8_t)(value>>4));
        *out++=hexDigit((uint8_t)(value&0xf));
        ++printed;
    }

    /* pad with spaces to the width of five printed bytes */
    for(padding=3*(5-printed); padding>0; --padding) {
        *out++=' ';
    }

    *out=0;
}
782
783/**
784 * Basic BOCU-1 test function, called when there are no command line arguments.
785 * Prints some of the #define values and performs round-trip tests of the
786 * difference encoding and decoding.
787 */
788static void
789TestBOCU1RefDiff(void) {
790 char buf1[80], buf2[80];
791 uint8_t prev[5], level[5];
792 int32_t i, cmp, countErrors;
793
794 log_verbose("reach of single bytes: %ld\n", 1+BOCU1_REACH_POS_1-BOCU1_REACH_NEG_1);
795 log_verbose("reach of 2 bytes : %ld\n", 1+BOCU1_REACH_POS_2-BOCU1_REACH_NEG_2);
796 log_verbose("reach of 3 bytes : %ld\n\n", 1+BOCU1_REACH_POS_3-BOCU1_REACH_NEG_3);
797
798 log_verbose(" BOCU1_REACH_NEG_1 %8ld BOCU1_REACH_POS_1 %8ld\n", BOCU1_REACH_NEG_1, BOCU1_REACH_POS_1);
799 log_verbose(" BOCU1_REACH_NEG_2 %8ld BOCU1_REACH_POS_2 %8ld\n", BOCU1_REACH_NEG_2, BOCU1_REACH_POS_2);
800 log_verbose(" BOCU1_REACH_NEG_3 %8ld BOCU1_REACH_POS_3 %8ld\n\n", BOCU1_REACH_NEG_3, BOCU1_REACH_POS_3);
801
802 log_verbose(" BOCU1_MIDDLE 0x%02x\n", BOCU1_MIDDLE);
803 log_verbose(" BOCU1_START_NEG_2 0x%02x BOCU1_START_POS_2 0x%02x\n", BOCU1_START_NEG_2, BOCU1_START_POS_2);
804 log_verbose(" BOCU1_START_NEG_3 0x%02x BOCU1_START_POS_3 0x%02x\n\n", BOCU1_START_NEG_3, BOCU1_START_POS_3);
805
806 /* test packDiff() & unpackDiff() with some specific values */
807 writeDiff(0, level);
808 writeDiff(1, level);
809 writeDiff(65, level);
810 writeDiff(130, level);
811 writeDiff(30000, level);
812 writeDiff(1000000, level);
813 writeDiff(-65, level);
814 writeDiff(-130, level);
815 writeDiff(-30000, level);
816 writeDiff(-1000000, level);
817
818 /* test that each value is smaller than any following one */
819 countErrors=0;
820 i=-0x10ffff;
821 *writeDiff(i, prev)=0;
822
823 /* show first number and bytes */
824 printBytes(prev, buf1);
825 log_verbose(" wD(%8ld) %s\n", i, buf1);
826
827 for(++i; i<=0x10ffff; ++i) {
828 *writeDiff(i, level)=0;
829 cmp=strcmp((const char *)prev, (const char *)level);
830 if(BOCU1_LENGTH_FROM_LEAD(level[0])!=(int32_t)strlen((const char *)level)) {
831 log_verbose("BOCU1_LENGTH_FROM_LEAD(0x%02x)=%ld!=%ld=strlen(writeDiff(%ld))\n",
832 level[0], BOCU1_LENGTH_FROM_LEAD(level[0]), strlen((const char *)level), i);
833 }
834 if(cmp<0) {
835 if(i==0 || i==1 || strlen((const char *)prev)!=strlen((const char *)level)) {
836 /*
837 * if the result is good, then print only if the length changed
838 * to get little but interesting output
839 */
840 printBytes(prev, buf1);
841 printBytes(level, buf2);
842 log_verbose("ok: strcmp(wD(%8ld), wD(%8ld))=%2d %s%s\n", i-1, i, cmp, buf1, buf2);
843 }
844 } else {
845 ++countErrors;
846 printBytes(prev, buf1);
847 printBytes(level, buf2);
848 log_verbose("wrong: strcmp(wD(%8ld), wD(%8ld))=%2d %s%s\n", i-1, i, cmp, buf1, buf2);
849 }
850 /* remember the previous bytes */
851 memcpy(prev, level, 4);
852 }
853
854 /* show last number and bytes */
855 printBytes((uint8_t *)"", buf1);
856 printBytes(prev, buf2);
857 log_verbose(" wD(%8ld) %s%s\n", i-1, buf1, buf2);
858
859 if(countErrors==0) {
860 log_verbose("writeDiff(-0x10ffff..0x10ffff) works fine\n");
861 } else {
862 log_err("writeDiff(-0x10ffff..0x10ffff) violates lexical ordering in %d cases\n", countErrors);
863 }
864
865 /* output signature byte sequence */
866 i=0;
867 writePacked(encodeBocu1(&i, 0xfeff), level);
868 log_verbose("\nBOCU-1 signature byte sequence: %02x %02x %02x\n",
869 level[0], level[1], level[2]);
870}
871
872/* cintltst code ------------------------------------------------------------ */
873
874/* test one string with the ICU and the reference BOCU-1 implementations */
875static void
876roundtripBOCU1(UConverter *bocu1, int32_t number, const UChar *text, int32_t length) {
877 static UChar roundtripRef[30000], roundtripICU[30000];
878 static char bocu1Ref[30000], bocu1ICU[30000];
879
880 int32_t bocu1RefLength, bocu1ICULength, roundtripRefLength, roundtripICULength;
881 UErrorCode errorCode;
882
883 /* Unicode -> BOCU-1 */
884 bocu1RefLength=writeString(text, length, (uint8_t *)bocu1Ref);
885
886 errorCode=U_ZERO_ERROR;
887 bocu1ICULength=ucnv_fromUChars(bocu1, bocu1ICU, sizeof(bocu1ICU), text, length, &errorCode);
888 if(U_FAILURE(errorCode)) {
889 log_err("ucnv_fromUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode));
890 return;
891 }
892
893 if(bocu1RefLength!=bocu1ICULength || 0!=uprv_memcmp(bocu1Ref, bocu1ICU, bocu1RefLength)) {
894 log_err("Unicode(%d)[%d] -> BOCU-1: reference[%d]!=ICU[%d]\n", number, length, bocu1RefLength, bocu1ICULength);
895 return;
896 }
897
898 /* BOCU-1 -> Unicode */
899 roundtripRefLength=readString((uint8_t *)bocu1Ref, bocu1RefLength, roundtripRef);
900 if(roundtripRefLength<0) {
901 return; /* readString() found an error and reported it */
902 }
903
904 roundtripICULength=ucnv_toUChars(bocu1, roundtripICU, sizeof(roundtripICU)/U_SIZEOF_UCHAR, bocu1ICU, bocu1ICULength, &errorCode);
905 if(U_FAILURE(errorCode)) {
906 log_err("ucnv_toUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode));
907 return;
908 }
909
910 if(length!=roundtripRefLength || 0!=u_memcmp(text, roundtripRef, length)) {
911 log_err("BOCU-1 -> Unicode: original(%d)[%d]!=reference[%d]\n", number, length, roundtripRefLength);
912 return;
913 }
914 if(roundtripRefLength!=roundtripICULength || 0!=u_memcmp(roundtripRef, roundtripICU, roundtripRefLength)) {
915 log_err("BOCU-1 -> Unicode: reference(%d)[%d]!=ICU[%d]\n", number, roundtripRefLength, roundtripICULength);
916 return;
917 }
918}
919
920static const UChar feff[]={ 0xfeff };
921static const UChar ascii[]={ 0x61, 0x62, 0x20, 0x63, 0x61 };
922static const UChar crlf[]={ 0xd, 0xa, 0x20 };
923static const UChar nul[]={ 0 };
924static const UChar latin[]={ 0xdf, 0xe6 };
925static const UChar devanagari[]={ 0x930, 0x20, 0x918, 0x909 };
926static const UChar hiragana[]={ 0x3086, 0x304d, 0x20, 0x3053, 0x4000 };
927static const UChar unihan[]={ 0x4e00, 0x7777, 0x20, 0x9fa5, 0x4e00 };
928static const UChar hangul[]={ 0xac00, 0xbcde, 0x20, 0xd7a3 };
929static const UChar surrogates[]={ 0xdc00, 0xd800 }; /* single surrogates, unmatched! */
930static const UChar plane1[]={ 0xd800, 0xdc00 };
931static const UChar plane2[]={ 0xd845, 0xdddd };
932static const UChar plane15[]={ 0xdbbb, 0xddee, 0x20 };
933static const UChar plane16[]={ 0xdbff, 0xdfff };
934static const UChar c0[]={ 1, 0xe40, 0x20, 9 };
935
936static const struct {
937 const UChar *s;
938 int32_t length;
939} strings[]={
940 { feff, LENGTHOF(feff) },
941 { ascii, LENGTHOF(ascii) },
942 { crlf, LENGTHOF(crlf) },
943 { nul, LENGTHOF(nul) },
944 { latin, LENGTHOF(latin) },
945 { devanagari, LENGTHOF(devanagari) },
946 { hiragana, LENGTHOF(hiragana) },
947 { unihan, LENGTHOF(unihan) },
948 { hangul, LENGTHOF(hangul) },
949 { surrogates, LENGTHOF(surrogates) },
950 { plane1, LENGTHOF(plane1) },
951 { plane2, LENGTHOF(plane2) },
952 { plane15, LENGTHOF(plane15) },
953 { plane16, LENGTHOF(plane16) },
954 { c0, LENGTHOF(c0) }
955};
956
957/*
958 * Verify that the ICU BOCU-1 implementation produces the same results as
959 * the reference implementation from the design folder.
960 * Generate some texts and convert them with both converters, verifying
961 * identical results and roundtripping.
962 */
963static void
964TestBOCU1(void) {
965 UChar text[30000];
966 int32_t i, length;
967
968 UConverter *bocu1;
969 UErrorCode errorCode;
970
971 errorCode=U_ZERO_ERROR;
972 bocu1=ucnv_open("BOCU-1", &errorCode);
973 if(U_FAILURE(errorCode)) {
974 log_err("error: unable to open BOCU-1 converter: %s\n", u_errorName(errorCode));
975 return;
976 }
977
978 /* text 1: each of strings[] once */
979 length=0;
980 for(i=0; i<LENGTHOF(strings); ++i) {
981 u_memcpy(text+length, strings[i].s, strings[i].length);
982 length+=strings[i].length;
983 }
984 roundtripBOCU1(bocu1, 1, text, length);
985
986 /* text 2: each of strings[] twice */
987 length=0;
988 for(i=0; i<LENGTHOF(strings); ++i) {
989 u_memcpy(text+length, strings[i].s, strings[i].length);
990 length+=strings[i].length;
991 u_memcpy(text+length, strings[i].s, strings[i].length);
992 length+=strings[i].length;
993 }
994 roundtripBOCU1(bocu1, 2, text, length);
995
996 /* text 3: each of strings[] many times (set step vs. |strings| so that all strings are used) */
997 length=0;
998 for(i=1; length<5000; i+=7) {
999 if(i>=LENGTHOF(strings)) {
1000 i-=LENGTHOF(strings);
1001 }
1002 u_memcpy(text+length, strings[i].s, strings[i].length);
1003 length+=strings[i].length;
1004 }
1005 roundtripBOCU1(bocu1, 3, text, length);
1006
1007 ucnv_close(bocu1);
1008}
1009
1010U_CFUNC void addBOCU1Tests(TestNode** root);
1011
1012U_CFUNC void
1013addBOCU1Tests(TestNode** root) {
1014 addTest(root, TestBOCU1RefDiff, "tsconv/bocu1tst/TestBOCU1RefDiff");
1015 addTest(root, TestBOCU1, "tsconv/bocu1tst/TestBOCU1");
1016}