]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/cintltst/bocu1tst.c
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / test / cintltst / bocu1tst.c
1 /*
2 ******************************************************************************
3 *
4 * Copyright (C) 2002, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 ******************************************************************************
8 * file name: bocu1tst.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2002may27
14 * created by: Markus W. Scherer
15 *
16 * This is the reference implementation of BOCU-1,
17 * the MIME-friendly form of the Binary Ordered Compression for Unicode,
18 * taken directly from ### http://oss.software.ibm.com/cvs/icu/icuhtml/design/conversion/bocu1/
19 * The files bocu1.h and bocu1.c from the design folder are taken
20 * verbatim (minus copyright and #include) and copied together into this file.
21 * The reference code and some of the reference bocu1tst.c
22 * is modified to run as part of the ICU cintltst
23 * test framework (minus main(), log_ln() etc. instead of printf()).
24 *
25 * This reference implementation is used here to verify
26 * the ICU BOCU-1 implementation, which is
27 * adapted for ICU conversion APIs and optimized.
28 * ### links in design doc to here and to ucnvbocu.c
29 */
30
31 #include "unicode/utypes.h"
32 #include "unicode/ustring.h"
33 #include "unicode/ucnv.h"
34 #include "cmemory.h"
35 #include "cintltst.h"
36
/* Element count of a true array (not a pointer); the result has type size_t. */
#define LENGTHOF(array) (sizeof(array)/sizeof(*(array)))
38
39 /* icuhtml/design/conversion/bocu1/bocu1.h ---------------------------------- */
40
41 /* BOCU-1 constants and macros ---------------------------------------------- */
42
43 /*
44 * BOCU-1 encodes the code points of a Unicode string as
45 * a sequence of byte-encoded differences (slope detection),
46 * preserving lexical order.
47 *
48 * Optimize the difference-taking for runs of Unicode text within
49 * small scripts:
50 *
51 * Most small scripts are allocated within aligned 128-blocks of Unicode
52 * code points. Lexical order is preserved if the "previous code point" state
53 * is always moved into the middle of such a block.
54 *
55 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
56 * areas into the middle of those areas.
57 *
58 * C0 control codes and space are encoded with their US-ASCII bytes.
59 * "prev" is reset for C0 controls but not for space.
60 */
61
/* Start value of the "prev" state: the middle of the ASCII range. */
#define BOCU1_ASCII_PREV 0x40

/* Byte values that bound the difference encoding. */
#define BOCU1_MIN 0x21
#define BOCU1_MIDDLE 0x90
#define BOCU1_MAX_LEAD 0xfe
#define BOCU1_MAX_TRAIL 0xff
#define BOCU1_RESET 0xff

/* How many distinct lead byte values exist (BOCU1_MIN..BOCU1_MAX_LEAD). */
#define BOCU1_COUNT (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/* Some C0 control byte values double as trail bytes; account for them. */
#define BOCU1_TRAIL_CONTROLS_COUNT 20
#define BOCU1_TRAIL_BYTE_OFFSET (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* How many distinct trail byte values exist (the radix of the trail "digits"). */
#define BOCU1_TRAIL_COUNT ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * Number of single-byte codes on each side of the middle
 * (the zero difference, byte BOCU1_MIDDLE, counts as positive).
 */
#define BOCU1_SINGLE 64

/* Lead byte counts per sign for 2-, 3- and 4-byte sequences. */
#define BOCU1_LEAD_2 43
#define BOCU1_LEAD_3 3
#define BOCU1_LEAD_4 1

/* Largest/smallest difference that fits into one byte. */
#define BOCU1_REACH_POS_1 (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1 (-BOCU1_SINGLE)

/* Largest/smallest difference that fits into two bytes. */
#define BOCU1_REACH_POS_2 (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2 (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* Largest/smallest difference that fits into three bytes. */
#define BOCU1_REACH_POS_3 (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_3 (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/* First lead byte of each positive multi-byte group. */
#define BOCU1_START_POS_2 (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3 (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4 (BOCU1_START_POS_3+BOCU1_LEAD_3)
/* BOCU1_START_POS_4==BOCU1_MAX_LEAD */

/* Boundary lead bytes of the negative multi-byte groups. */
#define BOCU1_START_NEG_2 (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3 (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4 (BOCU1_START_NEG_3-BOCU1_LEAD_3)
/* BOCU1_START_NEG_4==BOCU1_MIN+1 */

/* Sequence length (1..4) implied by a lead byte; lead must not be BOCU1_RESET. */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

/*
 * Sequence length stored in a packed value, see packDiff():
 * for 1..3 bytes the top byte holds the length, otherwise the top byte
 * is the lead byte of a 4-byte sequence.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)>=0x04000000 ? 4 : (packed)>>24)
127
/*
 * Twelve frequently used C0 control codes (and space) only ever stand for
 * themselves, which keeps BOCU-1 MIME-usable and reasonably safe for
 * ASCII-oriented software:
 *
 *    0 NUL
 *    7 BEL    8 BS
 *    9 TAB    a LF    b VT    c FF    d CR
 *    e SO     f SI
 *   1a SUB   1b ESC
 *
 * The remaining 20 C0 controls are encoded directly as well (to preserve
 * order), but their byte values additionally serve as trail bytes in the
 * difference encoding (for better compression).
 *
 * This macro maps a trail "digit" t (0..BOCU1_TRAIL_COUNT-1) to its external
 * byte value: digits below BOCU1_TRAIL_CONTROLS_COUNT go through the
 * bocu1TrailToByte[] table, all others are offset linearly.
 */
#define BOCU1_TRAIL_TO_BYTE(t) ((t)<BOCU1_TRAIL_CONTROLS_COUNT ? bocu1TrailToByte[t] : (t)+BOCU1_TRAIL_BYTE_OFFSET)
157
158 /*
159 * Byte value map for control codes,
160 * from external byte values 0x00..0x20
161 * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
162 * External byte values that are illegal as trail bytes are mapped to -1.
163 */
164 static int8_t
165 bocu1ByteToTrail[BOCU1_MIN]={
166 /* 0 1 2 3 4 5 6 7 */
167 -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
168
169 /* 8 9 a b c d e f */
170 -1, -1, -1, -1, -1, -1, -1, -1,
171
172 /* 10 11 12 13 14 15 16 17 */
173 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
174
175 /* 18 19 1a 1b 1c 1d 1e 1f */
176 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13,
177
178 /* 20 */
179 -1
180 };
181
182 /*
183 * Byte value map for control codes,
184 * from trail byte values 0..19 (0..0x13) as used in the difference calculation
185 * to external byte values 0x00..0x20.
186 */
187 static int8_t
188 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
189 /* 0 1 2 3 4 5 6 7 */
190 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
191
192 /* 8 9 a b c d e f */
193 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
194
195 /* 10 11 12 13 */
196 0x1c, 0x1d, 0x1e, 0x1f
197 };
198
/**
 * Integer division and modulo with negative numerators
 * yields negative modulo results and quotients that are one more than
 * what we need here.
 * This macro adjusts the results so that the modulo-value m is always >=0.
 *
 * For positive n, the if() condition is always FALSE.
 *
 * The body is wrapped in do { } while(0) so that the macro behaves like a
 * single statement and can be used safely in unbraced if/else bodies;
 * invoke it with a trailing semicolon.
 *
 * @param n Number to be split into quotient and rest.
 *          Will be modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the rest (modulo result).
 */
#define NEGDIVMOD(n, d, m) do { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        --(n); \
        (m)+=(d); \
    } \
} while(0)
220
/*
 * Decoder state carried between calls to decodeBocu1().
 *
 * prev  - "previous code point" state that differences are applied to
 * count - number of trail bytes still expected (0 when at a lead byte)
 * diff  - partial difference accumulated from the bytes seen so far
 */
typedef struct Bocu1Rx {
    int32_t prev;
    int32_t count;
    int32_t diff;
} Bocu1Rx;
227
228 /* Function prototypes ------------------------------------------------------ */
229
230 /* see bocu1.c */
231 U_CFUNC int32_t
232 packDiff(int32_t diff);
233
234 U_CFUNC int32_t
235 encodeBocu1(int32_t *pPrev, int32_t c);
236
237 U_CFUNC int32_t
238 decodeBocu1(Bocu1Rx *pRx, uint8_t b);
239
240 /* icuhtml/design/conversion/bocu1/bocu1.c ---------------------------------- */
241
242 /* BOCU-1 implementation functions ------------------------------------------ */
243
244 /**
245 * Compute the next "previous" value for differencing
246 * from the current code point.
247 *
248 * @param c current code point, 0..0x10ffff
249 * @return "previous code point" state value
250 */
251 static U_INLINE int32_t
252 bocu1Prev(int32_t c) {
253 /* compute new prev */
254 if(0x3040<=c && c<=0x309f) {
255 /* Hiragana is not 128-aligned */
256 return 0x3070;
257 } else if(0x4e00<=c && c<=0x9fa5) {
258 /* CJK Unihan */
259 return 0x4e00-BOCU1_REACH_NEG_2;
260 } else if(0xac00<=c && c<=0xd7a3) {
261 /* Korean Hangul */
262 return (0xd7a3+0xac00)/2;
263 } else {
264 /* mostly small scripts */
265 return (c&~0x7f)+BOCU1_ASCII_PREV;
266 }
267 }
268
269 /**
270 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
271 * and return a packed integer with them.
272 *
273 * The encoding favors small absolut differences with short encodings
274 * to compress runs of same-script characters.
275 *
276 * @param diff difference value -0x10ffff..0x10ffff
277 * @return
278 * 0x010000zz for 1-byte sequence zz
279 * 0x0200yyzz for 2-byte sequence yy zz
280 * 0x03xxyyzz for 3-byte sequence xx yy zz
281 * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
282 */
283 U_CFUNC int32_t
284 packDiff(int32_t diff) {
285 int32_t result, m, lead, count, shift;
286
287 if(diff>=BOCU1_REACH_NEG_1) {
288 /* mostly positive differences, and single-byte negative ones */
289 if(diff<=BOCU1_REACH_POS_1) {
290 /* single byte */
291 return 0x01000000|(BOCU1_MIDDLE+diff);
292 } else if(diff<=BOCU1_REACH_POS_2) {
293 /* two bytes */
294 diff-=BOCU1_REACH_POS_1+1;
295 lead=BOCU1_START_POS_2;
296 count=1;
297 } else if(diff<=BOCU1_REACH_POS_3) {
298 /* three bytes */
299 diff-=BOCU1_REACH_POS_2+1;
300 lead=BOCU1_START_POS_3;
301 count=2;
302 } else {
303 /* four bytes */
304 diff-=BOCU1_REACH_POS_3+1;
305 lead=BOCU1_START_POS_4;
306 count=3;
307 }
308 } else {
309 /* two- and four-byte negative differences */
310 if(diff>=BOCU1_REACH_NEG_2) {
311 /* two bytes */
312 diff-=BOCU1_REACH_NEG_1;
313 lead=BOCU1_START_NEG_2;
314 count=1;
315 } else if(diff>=BOCU1_REACH_NEG_3) {
316 /* three bytes */
317 diff-=BOCU1_REACH_NEG_2;
318 lead=BOCU1_START_NEG_3;
319 count=2;
320 } else {
321 /* four bytes */
322 diff-=BOCU1_REACH_NEG_3;
323 lead=BOCU1_START_NEG_4;
324 count=3;
325 }
326 }
327
328 /* encode the length of the packed result */
329 if(count<3) {
330 result=(count+1)<<24;
331 } else /* count==3, MSB used for the lead byte */ {
332 result=0;
333 }
334
335 /* calculate trail bytes like digits in itoa() */
336 shift=0;
337 do {
338 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
339 result|=BOCU1_TRAIL_TO_BYTE(m)<<shift;
340 shift+=8;
341 } while(--count>0);
342
343 /* add lead byte */
344 result|=(lead+diff)<<shift;
345
346 return result;
347 }
348
349 /**
350 * BOCU-1 encoder function.
351 *
352 * @param pPrev pointer to the integer that holds
353 * the "previous code point" state;
354 * the initial value should be 0 which
355 * encodeBocu1 will set to the actual BOCU-1 initial state value
356 * @param c the code point to encode
357 * @return the packed 1/2/3/4-byte encoding, see packDiff(),
358 * or 0 if an error occurs
359 *
360 * @see packDiff
361 */
362 U_CFUNC int32_t
363 encodeBocu1(int32_t *pPrev, int32_t c) {
364 int32_t prev;
365
366 if(pPrev==NULL || c<0 || c>0x10ffff) {
367 /* illegal argument */
368 return 0;
369 }
370
371 prev=*pPrev;
372 if(prev==0) {
373 /* lenient handling of initial value 0 */
374 prev=*pPrev=BOCU1_ASCII_PREV;
375 }
376
377 if(c<=0x20) {
378 /*
379 * ISO C0 control & space:
380 * Encode directly for MIME compatibility,
381 * and reset state except for space, to not disrupt compression.
382 */
383 if(c!=0x20) {
384 *pPrev=BOCU1_ASCII_PREV;
385 }
386 return 0x01000000|c;
387 }
388
389 /*
390 * all other Unicode code points c==U+0021..U+10ffff
391 * are encoded with the difference c-prev
392 *
393 * a new prev is computed from c,
394 * placed in the middle of a 0x80-block (for most small scripts) or
395 * in the middle of the Unihan and Hangul blocks
396 * to statistically minimize the following difference
397 */
398 *pPrev=bocu1Prev(c);
399 return packDiff(c-prev);
400 }
401
402 /**
403 * Function for BOCU-1 decoder; handles multi-byte lead bytes.
404 *
405 * @param pRx pointer to the decoder state structure
406 * @param b lead byte;
407 * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<=BOCU1_MAX_LEAD
408 * @return -1 (state change only)
409 *
410 * @see decodeBocu1
411 */
412 static int32_t
413 decodeBocu1LeadByte(Bocu1Rx *pRx, uint8_t b) {
414 int32_t c, count;
415
416 if(b>=BOCU1_START_NEG_2) {
417 /* positive difference */
418 if(b<BOCU1_START_POS_3) {
419 /* two bytes */
420 c=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
421 count=1;
422 } else if(b<BOCU1_START_POS_4) {
423 /* three bytes */
424 c=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
425 count=2;
426 } else {
427 /* four bytes */
428 c=BOCU1_REACH_POS_3+1;
429 count=3;
430 }
431 } else {
432 /* negative difference */
433 if(b>=BOCU1_START_NEG_3) {
434 /* two bytes */
435 c=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
436 count=1;
437 } else if(b>BOCU1_MIN) {
438 /* three bytes */
439 c=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
440 count=2;
441 } else {
442 /* four bytes */
443 c=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
444 count=3;
445 }
446 }
447
448 /* set the state for decoding the trail byte(s) */
449 pRx->diff=c;
450 pRx->count=count;
451 return -1;
452 }
453
454 /**
455 * Function for BOCU-1 decoder; handles multi-byte trail bytes.
456 *
457 * @param pRx pointer to the decoder state structure
458 * @param b trail byte
459 * @return result value, same as decodeBocu1
460 *
461 * @see decodeBocu1
462 */
463 static int32_t
464 decodeBocu1TrailByte(Bocu1Rx *pRx, uint8_t b) {
465 int32_t t, c, count;
466
467 if(b<=0x20) {
468 /* skip some C0 controls and make the trail byte range contiguous */
469 t=bocu1ByteToTrail[b];
470 if(t<0) {
471 /* illegal trail byte value */
472 pRx->prev=BOCU1_ASCII_PREV;
473 pRx->count=0;
474 return -99;
475 }
476 #if BOCU1_MAX_TRAIL<0xff
477 } else if(b>BOCU1_MAX_TRAIL) {
478 return -99;
479 #endif
480 } else {
481 t=(int32_t)b-BOCU1_TRAIL_BYTE_OFFSET;
482 }
483
484 /* add trail byte into difference and decrement count */
485 c=pRx->diff;
486 count=pRx->count;
487
488 if(count==1) {
489 /* final trail byte, deliver a code point */
490 c=pRx->prev+c+t;
491 if(0<=c && c<=0x10ffff) {
492 /* valid code point result */
493 pRx->prev=bocu1Prev(c);
494 pRx->count=0;
495 return c;
496 } else {
497 /* illegal code point result */
498 pRx->prev=BOCU1_ASCII_PREV;
499 pRx->count=0;
500 return -99;
501 }
502 }
503
504 /* intermediate trail byte */
505 if(count==2) {
506 pRx->diff=c+t*BOCU1_TRAIL_COUNT;
507 } else /* count==3 */ {
508 pRx->diff=c+t*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT;
509 }
510 pRx->count=count-1;
511 return -1;
512 }
513
514 /**
515 * BOCU-1 decoder function.
516 *
517 * @param pRx pointer to the decoder state structure;
518 * the initial values should be 0 which
519 * decodeBocu1 will set to actual initial state values
520 * @param b an input byte
521 * @return
522 * 0..0x10ffff for a result code point
523 * -1 if only the state changed without code point output
524 * <-1 if an error occurs
525 */
526 U_CFUNC int32_t
527 decodeBocu1(Bocu1Rx *pRx, uint8_t b) {
528 int32_t prev, c, count;
529
530 if(pRx==NULL) {
531 /* illegal argument */
532 return -99;
533 }
534
535 prev=pRx->prev;
536 if(prev==0) {
537 /* lenient handling of initial 0 values */
538 prev=pRx->prev=BOCU1_ASCII_PREV;
539 count=pRx->count=0;
540 } else {
541 count=pRx->count;
542 }
543
544 if(count==0) {
545 /* byte in lead position */
546 if(b<=0x20) {
547 /*
548 * Direct-encoded C0 control code or space.
549 * Reset prev for C0 control codes but not for space.
550 */
551 if(b!=0x20) {
552 pRx->prev=BOCU1_ASCII_PREV;
553 }
554 return b;
555 }
556
557 /*
558 * b is a difference lead byte.
559 *
560 * Return a code point directly from a single-byte difference.
561 *
562 * For multi-byte difference lead bytes, set the decoder state
563 * with the partial difference value from the lead byte and
564 * with the number of trail bytes.
565 *
566 * For four-byte differences, the signedness also affects the
567 * first trail byte, which has special handling farther below.
568 */
569 if(b>=BOCU1_START_NEG_2 && b<BOCU1_START_POS_2) {
570 /* single-byte difference */
571 c=prev+((int32_t)b-BOCU1_MIDDLE);
572 pRx->prev=bocu1Prev(c);
573 return c;
574 } else if(b==BOCU1_RESET) {
575 /* only reset the state, no code point */
576 pRx->prev=BOCU1_ASCII_PREV;
577 return -1;
578 } else {
579 return decodeBocu1LeadByte(pRx, b);
580 }
581 } else {
582 /* trail byte in any position */
583 return decodeBocu1TrailByte(pRx, b);
584 }
585 }
586
587 /* icuhtml/design/conversion/bocu1/bocu1tst.c ------------------------------- */
588
589 /* test code ---------------------------------------------------------------- */
590
/* Options for the test code. */

/*
 * Ignore commas when processing name lists in testText().
 * NOTE(review): no testText() is visible in this file, so this option
 * appears to be unused here - confirm before removing.
 */
#define TEST_IGNORE_COMMA 1
595
/**
 * Serialize a packed BOCU-1 byte sequence into a byte array,
 * most significant sequence byte first.
 * Performs no overflow check on the output array.
 * Test function.
 *
 * @param packed packed BOCU-1 byte sequence, see packDiff()
 * @param p pointer to byte array
 * @return number of bytes, as given by BOCU1_LENGTH_FROM_PACKED(packed);
 *         nothing is written if that length is 0
 *
 * @see packDiff
 */
static int32_t
writePacked(int32_t packed, uint8_t *p) {
    int32_t count=BOCU1_LENGTH_FROM_PACKED(packed);
    int32_t shift;

    /* the last sequence byte lives in bits 7..0, the one before in 15..8, etc. */
    for(shift=8*(count-1); shift>=0; shift-=8) {
        *p++=(uint8_t)(packed>>shift);
    }

    return count;
}
625
626 /**
627 * Unpack a packed BOCU-1 non-C0/space byte sequence and get
628 * the difference to initialPrev.
629 * Used only for round-trip testing of the difference encoding and decoding.
630 * Test function.
631 *
632 * @param initialPrev bogus "previous code point" value to make sure that
633 * the resulting code point is in the range 0..0x10ffff
634 * @param packed packed BOCU-1 byte sequence
635 * @return the difference to initialPrev
636 *
637 * @see packDiff
638 * @see writeDiff
639 */
640 static int32_t
641 unpackDiff(int32_t initialPrev, int32_t packed) {
642 Bocu1Rx rx={ 0, 0, 0 };
643 int32_t count;
644
645 rx.prev=initialPrev;
646 count=BOCU1_LENGTH_FROM_PACKED(packed);
647 switch(count) {
648 case 4:
649 decodeBocu1(&rx, (uint8_t)(packed>>24));
650 case 3:
651 decodeBocu1(&rx, (uint8_t)(packed>>16));
652 case 2:
653 decodeBocu1(&rx, (uint8_t)(packed>>8));
654 case 1:
655 /* subtract initial prev */
656 return decodeBocu1(&rx, (uint8_t)packed)-initialPrev;
657 default:
658 return -0x7fffffff;
659 }
660 }
661
/**
 * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes,
 * preserving lexical order.
 * Also checks for roundtripping of the difference encoding and reports
 * a mismatch with log_err().
 * Test function.
 *
 * @param diff difference value to test, -0x10ffff..0x10ffff
 * @param p pointer to output byte array (no overflow check)
 * @return p advanced by number of bytes output
 *
 * @see unpackDiff
 */
static uint8_t *
writeDiff(int32_t diff, uint8_t *p) {
    /* generate the difference as a packed value and serialize it */
    int32_t packed, initialPrev, roundtrip;

    packed=packDiff(diff);

    /*
     * bogus initial "prev" to work around
     * code point range check in decodeBocu1()
     */
    if(diff<=0) {
        initialPrev=0x10ffff;
    } else {
        initialPrev=-1;
    }

    /* decode once and reuse the result for both the check and the message */
    roundtrip=unpackDiff(initialPrev, packed);
    if(diff!=roundtrip) {
        /* the conversions use the l length modifier, so pass (unsigned) long arguments */
        log_err("error: unpackDiff(packDiff(diff=%ld)=0x%08lx)=%ld!=diff\n",
                (long)diff, (unsigned long)(uint32_t)packed, (long)roundtrip);
    }
    return p+writePacked(packed, p);
}
697
698 /**
699 * Encode a UTF-16 string in BOCU-1.
700 * Does not check for overflows, but otherwise useful function.
701 *
702 * @param s input UTF-16 string
703 * @param length number of UChar code units in s
704 * @param p pointer to output byte array
705 * @return number of bytes output
706 */
707 static int32_t
708 writeString(const UChar *s, int32_t length, uint8_t *p) {
709 uint8_t *p0;
710 int32_t c, prev, i;
711
712 prev=0;
713 p0=p;
714 i=0;
715 while(i<length) {
716 UTF_NEXT_CHAR(s, i, length, c);
717 p+=writePacked(encodeBocu1(&prev, c), p);
718 }
719 return p-p0;
720 }
721
722 /**
723 * Decode a BOCU-1 byte sequence to a UTF-16 string.
724 * Does not check for overflows, but otherwise useful function.
725 *
726 * @param p pointer to input BOCU-1 bytes
727 * @param length number of input bytes
728 * @param s point to output UTF-16 string array
729 * @return number of UChar code units output
730 */
731 static int32_t
732 readString(const uint8_t *p, int32_t length, UChar *s) {
733 Bocu1Rx rx={ 0, 0, 0 };
734 int32_t c, i, sLength;
735
736 i=sLength=0;
737 while(i<length) {
738 c=decodeBocu1(&rx, p[i++]);
739 if(c<-1) {
740 log_err("error: readString detects encoding error at string index %ld\n", i);
741 return -1;
742 }
743 if(c>=0) {
744 UTF_APPEND_CHAR_UNSAFE(s, sLength, c);
745 }
746 }
747 return sLength;
748 }
749
750 static U_INLINE char
751 hexDigit(uint8_t digit) {
752 return digit<=9 ? (char)('0'+digit) : (char)('a'-10+digit);
753 }
754
/**
 * Pretty-print 0-terminated byte values as " xx" groups.
 * Output for fewer than 5 bytes is padded with spaces to the width of
 * 5 groups (15 characters) so that columns line up.
 * Helper function for test output.
 *
 * @param bytes 0-terminated byte array to print
 * @param out output buffer that receives the 0-terminated text;
 *            no overflow check is performed
 */
static void
printBytes(uint8_t *bytes, char *out) {
    int printed, padding;
    uint8_t b;

    printed=0;
    for(; (b=*bytes)!=0; ++bytes) {
        *out++=' ';
        *out++=hexDigit((uint8_t)(b>>4));
        *out++=hexDigit((uint8_t)(b&0xf));
        ++printed;
    }

    /* three characters per byte that was not there */
    for(padding=3*(5-printed); padding>0; --padding) {
        *out++=' ';
    }
    *out=0;
}
780
781 /**
782 * Basic BOCU-1 test function, called when there are no command line arguments.
783 * Prints some of the #define values and performs round-trip tests of the
784 * difference encoding and decoding.
785 */
786 static void
787 TestBOCU1RefDiff(void) {
788 char buf1[80], buf2[80];
789 uint8_t prev[5], level[5];
790 int32_t i, cmp, countErrors;
791
792 log_verbose("reach of single bytes: %ld\n", 1+BOCU1_REACH_POS_1-BOCU1_REACH_NEG_1);
793 log_verbose("reach of 2 bytes : %ld\n", 1+BOCU1_REACH_POS_2-BOCU1_REACH_NEG_2);
794 log_verbose("reach of 3 bytes : %ld\n\n", 1+BOCU1_REACH_POS_3-BOCU1_REACH_NEG_3);
795
796 log_verbose(" BOCU1_REACH_NEG_1 %8ld BOCU1_REACH_POS_1 %8ld\n", BOCU1_REACH_NEG_1, BOCU1_REACH_POS_1);
797 log_verbose(" BOCU1_REACH_NEG_2 %8ld BOCU1_REACH_POS_2 %8ld\n", BOCU1_REACH_NEG_2, BOCU1_REACH_POS_2);
798 log_verbose(" BOCU1_REACH_NEG_3 %8ld BOCU1_REACH_POS_3 %8ld\n\n", BOCU1_REACH_NEG_3, BOCU1_REACH_POS_3);
799
800 log_verbose(" BOCU1_MIDDLE 0x%02x\n", BOCU1_MIDDLE);
801 log_verbose(" BOCU1_START_NEG_2 0x%02x BOCU1_START_POS_2 0x%02x\n", BOCU1_START_NEG_2, BOCU1_START_POS_2);
802 log_verbose(" BOCU1_START_NEG_3 0x%02x BOCU1_START_POS_3 0x%02x\n\n", BOCU1_START_NEG_3, BOCU1_START_POS_3);
803
804 /* test packDiff() & unpackDiff() with some specific values */
805 writeDiff(0, level);
806 writeDiff(1, level);
807 writeDiff(65, level);
808 writeDiff(130, level);
809 writeDiff(30000, level);
810 writeDiff(1000000, level);
811 writeDiff(-65, level);
812 writeDiff(-130, level);
813 writeDiff(-30000, level);
814 writeDiff(-1000000, level);
815
816 /* test that each value is smaller than any following one */
817 countErrors=0;
818 i=-0x10ffff;
819 *writeDiff(i, prev)=0;
820
821 /* show first number and bytes */
822 printBytes(prev, buf1);
823 log_verbose(" wD(%8ld) %s\n", i, buf1);
824
825 for(++i; i<=0x10ffff; ++i) {
826 *writeDiff(i, level)=0;
827 cmp=strcmp((const char *)prev, (const char *)level);
828 if(BOCU1_LENGTH_FROM_LEAD(level[0])!=(int32_t)strlen((const char *)level)) {
829 log_verbose("BOCU1_LENGTH_FROM_LEAD(0x%02x)=%ld!=%ld=strlen(writeDiff(%ld))\n",
830 level[0], BOCU1_LENGTH_FROM_LEAD(level[0]), strlen((const char *)level), i);
831 }
832 if(cmp<0) {
833 if(i==0 || i==1 || strlen((const char *)prev)!=strlen((const char *)level)) {
834 /*
835 * if the result is good, then print only if the length changed
836 * to get little but interesting output
837 */
838 printBytes(prev, buf1);
839 printBytes(level, buf2);
840 log_verbose("ok: strcmp(wD(%8ld), wD(%8ld))=%2d %s%s\n", i-1, i, cmp, buf1, buf2);
841 }
842 } else {
843 ++countErrors;
844 printBytes(prev, buf1);
845 printBytes(level, buf2);
846 log_verbose("wrong: strcmp(wD(%8ld), wD(%8ld))=%2d %s%s\n", i-1, i, cmp, buf1, buf2);
847 }
848 /* remember the previous bytes */
849 memcpy(prev, level, 4);
850 }
851
852 /* show last number and bytes */
853 printBytes((uint8_t *)"", buf1);
854 printBytes(prev, buf2);
855 log_verbose(" wD(%8ld) %s%s\n", i-1, buf1, buf2);
856
857 if(countErrors==0) {
858 log_verbose("writeDiff(-0x10ffff..0x10ffff) works fine\n");
859 } else {
860 log_err("writeDiff(-0x10ffff..0x10ffff) violates lexical ordering in %d cases\n", countErrors);
861 }
862
863 /* output signature byte sequence */
864 i=0;
865 writePacked(encodeBocu1(&i, 0xfeff), level);
866 log_verbose("\nBOCU-1 signature byte sequence: %02x %02x %02x\n",
867 level[0], level[1], level[2]);
868 }
869
870 /* cintltst code ------------------------------------------------------------ */
871
872 /* test one string with the ICU and the reference BOCU-1 implementations */
873 static void
874 roundtripBOCU1(UConverter *bocu1, int32_t number, const UChar *text, int32_t length) {
875 static UChar roundtripRef[30000], roundtripICU[30000];
876 static char bocu1Ref[30000], bocu1ICU[30000];
877
878 int32_t bocu1RefLength, bocu1ICULength, roundtripRefLength, roundtripICULength;
879 UErrorCode errorCode;
880
881 /* Unicode -> BOCU-1 */
882 bocu1RefLength=writeString(text, length, (uint8_t *)bocu1Ref);
883
884 errorCode=U_ZERO_ERROR;
885 bocu1ICULength=ucnv_fromUChars(bocu1, bocu1ICU, sizeof(bocu1ICU), text, length, &errorCode);
886 if(U_FAILURE(errorCode)) {
887 log_err("ucnv_fromUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode));
888 return;
889 }
890
891 if(bocu1RefLength!=bocu1ICULength || 0!=uprv_memcmp(bocu1Ref, bocu1ICU, bocu1RefLength)) {
892 log_err("Unicode(%d)[%d] -> BOCU-1: reference[%d]!=ICU[%d]\n", number, length, bocu1RefLength, bocu1ICULength);
893 return;
894 }
895
896 /* BOCU-1 -> Unicode */
897 roundtripRefLength=readString((uint8_t *)bocu1Ref, bocu1RefLength, roundtripRef);
898 if(roundtripRefLength<0) {
899 return; /* readString() found an error and reported it */
900 }
901
902 roundtripICULength=ucnv_toUChars(bocu1, roundtripICU, sizeof(roundtripICU)/U_SIZEOF_UCHAR, bocu1ICU, bocu1ICULength, &errorCode);
903 if(U_FAILURE(errorCode)) {
904 log_err("ucnv_toUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode));
905 return;
906 }
907
908 if(length!=roundtripRefLength || 0!=u_memcmp(text, roundtripRef, length)) {
909 log_err("BOCU-1 -> Unicode: original(%d)[%d]!=reference[%d]\n", number, length, roundtripRefLength);
910 return;
911 }
912 if(roundtripRefLength!=roundtripICULength || 0!=u_memcmp(roundtripRef, roundtripICU, roundtripRefLength)) {
913 log_err("BOCU-1 -> Unicode: reference(%d)[%d]!=ICU[%d]\n", number, roundtripRefLength, roundtripICULength);
914 return;
915 }
916 }
917
918 static const UChar feff[]={ 0xfeff };
919 static const UChar ascii[]={ 0x61, 0x62, 0x20, 0x63, 0x61 };
920 static const UChar crlf[]={ 0xd, 0xa, 0x20 };
921 static const UChar nul[]={ 0 };
922 static const UChar latin[]={ 0xdf, 0xe6 };
923 static const UChar devanagari[]={ 0x930, 0x20, 0x918, 0x909 };
924 static const UChar hiragana[]={ 0x3086, 0x304d, 0x20, 0x3053, 0x4000 };
925 static const UChar unihan[]={ 0x4e00, 0x7777, 0x20, 0x9fa5, 0x4e00 };
926 static const UChar hangul[]={ 0xac00, 0xbcde, 0x20, 0xd7a3 };
927 static const UChar surrogates[]={ 0xdc00, 0xd800 }; /* single surrogates, unmatched! */
928 static const UChar plane1[]={ 0xd800, 0xdc00 };
929 static const UChar plane2[]={ 0xd845, 0xdddd };
930 static const UChar plane15[]={ 0xdbbb, 0xddee, 0x20 };
931 static const UChar plane16[]={ 0xdbff, 0xdfff };
932 static const UChar c0[]={ 1, 0xe40, 0x20, 9 };
933
934 static const struct {
935 const UChar *s;
936 int32_t length;
937 } strings[]={
938 { feff, LENGTHOF(feff) },
939 { ascii, LENGTHOF(ascii) },
940 { crlf, LENGTHOF(crlf) },
941 { nul, LENGTHOF(nul) },
942 { latin, LENGTHOF(latin) },
943 { devanagari, LENGTHOF(devanagari) },
944 { hiragana, LENGTHOF(hiragana) },
945 { unihan, LENGTHOF(unihan) },
946 { hangul, LENGTHOF(hangul) },
947 { surrogates, LENGTHOF(surrogates) },
948 { plane1, LENGTHOF(plane1) },
949 { plane2, LENGTHOF(plane2) },
950 { plane15, LENGTHOF(plane15) },
951 { plane16, LENGTHOF(plane16) },
952 { c0, LENGTHOF(c0) }
953 };
954
955 /*
956 * Verify that the ICU BOCU-1 implementation produces the same results as
957 * the reference implementation from the design folder.
958 * Generate some texts and convert them with both converters, verifying
959 * identical results and roundtripping.
960 */
961 static void
962 TestBOCU1(void) {
963 UChar text[30000];
964 int32_t i, length;
965
966 UConverter *bocu1;
967 UErrorCode errorCode;
968
969 errorCode=U_ZERO_ERROR;
970 bocu1=ucnv_open("BOCU-1", &errorCode);
971 if(U_FAILURE(errorCode)) {
972 log_err("error: unable to open BOCU-1 converter: %s\n", u_errorName(errorCode));
973 return;
974 }
975
976 /* text 1: each of strings[] once */
977 length=0;
978 for(i=0; i<LENGTHOF(strings); ++i) {
979 u_memcpy(text+length, strings[i].s, strings[i].length);
980 length+=strings[i].length;
981 }
982 roundtripBOCU1(bocu1, 1, text, length);
983
984 /* text 2: each of strings[] twice */
985 length=0;
986 for(i=0; i<LENGTHOF(strings); ++i) {
987 u_memcpy(text+length, strings[i].s, strings[i].length);
988 length+=strings[i].length;
989 u_memcpy(text+length, strings[i].s, strings[i].length);
990 length+=strings[i].length;
991 }
992 roundtripBOCU1(bocu1, 2, text, length);
993
994 /* text 3: each of strings[] many times (set step vs. |strings| so that all strings are used) */
995 length=0;
996 for(i=1; length<5000; i+=7) {
997 if(i>=LENGTHOF(strings)) {
998 i-=LENGTHOF(strings);
999 }
1000 u_memcpy(text+length, strings[i].s, strings[i].length);
1001 length+=strings[i].length;
1002 }
1003 roundtripBOCU1(bocu1, 3, text, length);
1004
1005 ucnv_close(bocu1);
1006 }
1007
1008 U_CFUNC void addBOCU1Tests(TestNode** root);
1009
1010 U_CFUNC void
1011 addBOCU1Tests(TestNode** root) {
1012 addTest(root, TestBOCU1RefDiff, "tsconv/bocu1tst/TestBOCU1RefDiff");
1013 addTest(root, TestBOCU1, "tsconv/bocu1tst/TestBOCU1");
1014 }