2 ******************************************************************************
4 * Copyright (C) 2002-2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
8 * file name: ucnvbocu.c
10 * tab size: 8 (not used)
13 * created on: 2002mar27
14 * created by: Markus W. Scherer
16 * This is an implementation of the Binary Ordered Compression for Unicode,
17 * in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
20 #include "unicode/utypes.h"
22 #if !UCONFIG_NO_CONVERSION
24 #include "unicode/ucnv.h"
25 #include "unicode/ucnv_cb.h"
29 /* BOCU-1 constants and macros ---------------------------------------------- */
32 * BOCU-1 encodes the code points of a Unicode string as
33 * a sequence of byte-encoded differences (slope detection),
34 * preserving lexical order.
36 * Optimize the difference-taking for runs of Unicode text within small scripts:
39 * Most small scripts are allocated within aligned 128-blocks of Unicode
40 * code points. Lexical order is preserved if the "previous code point" state
41 * is always moved into the middle of such a block.
43 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
44 * areas into the middle of those areas.
46 * C0 control codes and space are encoded with their US-ASCII bytes.
47 * "prev" is reset for C0 controls but not for space.
50 /* initial value for "prev": middle of the ASCII range */
51 #define BOCU1_ASCII_PREV 0x40
53 /* bounding byte values for differences */
54 #define BOCU1_MIN 0x21
55 #define BOCU1_MIDDLE 0x90
56 #define BOCU1_MAX_LEAD 0xfe
57 #define BOCU1_MAX_TRAIL 0xff
58 #define BOCU1_RESET 0xff
60 /* number of lead bytes */
61 #define BOCU1_COUNT (BOCU1_MAX_LEAD-BOCU1_MIN+1)
63 /* adjust trail byte counts for the use of some C0 control byte values */
64 #define BOCU1_TRAIL_CONTROLS_COUNT 20
65 #define BOCU1_TRAIL_BYTE_OFFSET (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)
67 /* number of trail bytes */
68 #define BOCU1_TRAIL_COUNT ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)
71 * number of positive and negative single-byte codes
72 * (counting 0==BOCU1_MIDDLE among the positive ones)
74 #define BOCU1_SINGLE 64
76 /* number of lead bytes for positive and negative 2/3/4-byte sequences */
77 #define BOCU1_LEAD_2 43
78 #define BOCU1_LEAD_3 3
79 #define BOCU1_LEAD_4 1
81 /* The difference value range for single-byters. */
82 #define BOCU1_REACH_POS_1 (BOCU1_SINGLE-1)
83 #define BOCU1_REACH_NEG_1 (-BOCU1_SINGLE)
85 /* The difference value range for double-byters. */
86 #define BOCU1_REACH_POS_2 (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
87 #define BOCU1_REACH_NEG_2 (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
89 /* The difference value range for 3-byters. */
90 #define BOCU1_REACH_POS_3 \
91 (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
93 #define BOCU1_REACH_NEG_3 (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
95 /* The lead byte start values. */
96 #define BOCU1_START_POS_2 (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
97 #define BOCU1_START_POS_3 (BOCU1_START_POS_2+BOCU1_LEAD_2)
98 #define BOCU1_START_POS_4 (BOCU1_START_POS_3+BOCU1_LEAD_3)
99 /* ==BOCU1_MAX_LEAD */
101 #define BOCU1_START_NEG_2 (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
102 #define BOCU1_START_NEG_3 (BOCU1_START_NEG_2-BOCU1_LEAD_2)
103 #define BOCU1_START_NEG_4 (BOCU1_START_NEG_3-BOCU1_LEAD_3)
106 /* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */
107 #define BOCU1_LENGTH_FROM_LEAD(lead) \
108 ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
109 (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
110 (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)
112 /* The length of a byte sequence, according to its packed form. */
113 #define BOCU1_LENGTH_FROM_PACKED(packed) \
114 ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
117 * 12 commonly used C0 control codes (and space) are only used to encode
118 * themselves directly,
119 * which makes BOCU-1 MIME-usable and reasonably safe for
120 * ASCII-oriented software.
140 * The other 20 C0 controls are also encoded directly (to preserve order)
141 * but are also used as trail bytes in difference encoding
142 * (for better compression).
144 #define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
147 * Byte value map for control codes,
148 * from external byte values 0x00..0x20
149 * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
150 * External byte values that are illegal as trail bytes are mapped to -1.
153 bocu1ByteToTrail
[BOCU1_MIN
]={
154 /* 0 1 2 3 4 5 6 7 */
155 -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
157 /* 8 9 a b c d e f */
158 -1, -1, -1, -1, -1, -1, -1, -1,
160 /* 10 11 12 13 14 15 16 17 */
161 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
163 /* 18 19 1a 1b 1c 1d 1e 1f */
164 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13,
171 * Byte value map for control codes,
172 * from trail byte values 0..19 (0..0x13) as used in the difference calculation
173 * to external byte values 0x00..0x20.
176 bocu1TrailToByte
[BOCU1_TRAIL_CONTROLS_COUNT
]={
177 /* 0 1 2 3 4 5 6 7 */
178 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
180 /* 8 9 a b c d e f */
181 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
184 0x1c, 0x1d, 0x1e, 0x1f
188 * Integer division and modulo with negative numerators
189 * yield negative modulo results and quotients that are one more than the floored quotient needed here.
191 * This macro adjusts the results so that the modulo-value m is always >=0.
193 * For positive n, the if() condition is always FALSE.
195 * @param n Number to be split into quotient and rest.
196 * Will be modified to contain the quotient.
198 * @param m Output variable for the rest (modulo result).
200 #define NEGDIVMOD(n, d, m) { \
209 /* BOCU-1 implementation functions ------------------------------------------ */
211 #define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
214 * Compute the next "previous" value for differencing
215 * from the current code point.
217 * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
218 * @return "previous code point" state value
220 static U_INLINE
int32_t
221 bocu1Prev(int32_t c
) {
222 /* compute new prev */
223 if(/* 0x3040<=c && */ c
<=0x309f) {
224 /* Hiragana is not 128-aligned */
226 } else if(0x4e00<=c
&& c
<=0x9fa5) {
228 return 0x4e00-BOCU1_REACH_NEG_2
;
229 } else if(0xac00<=c
/* && c<=0xd7a3 */) {
231 return (0xd7a3+0xac00)/2;
233 /* mostly small scripts */
234 return BOCU1_SIMPLE_PREV(c
);
238 /** Fast version of bocu1Prev() for most scripts. */
239 #define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
242 * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
243 * The UConverter fields are used as follows:
245 * fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
247 * toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
248 * mode decoder's incomplete (diff<<2)|count (ignored when toULength==0)
251 /* BOCU-1-from-Unicode conversion functions --------------------------------- */
254 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
255 * and return a packed integer with them.
257 * The encoding favors small absolute differences with short encodings
258 * to compress runs of same-script characters.
260 * Optimized version with unrolled loops and fewer division and modulo operations
261 * than the standard packDiff().
263 * @param diff difference value -0x10ffff..0x10ffff
265 * 0x010000zz for 1-byte sequence zz
266 * 0x0200yyzz for 2-byte sequence yy zz
267 * 0x03xxyyzz for 3-byte sequence xx yy zz
268 * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
271 packDiff(int32_t diff
) {
274 if(diff
>=BOCU1_REACH_NEG_1
) {
275 /* mostly positive differences, and single-byte negative ones */
276 #if 0 /* single-byte case handled in macros, see below */
277 if(diff
<=BOCU1_REACH_POS_1
) {
279 return 0x01000000|(BOCU1_MIDDLE
+diff
);
282 if(diff
<=BOCU1_REACH_POS_2
) {
284 diff
-=BOCU1_REACH_POS_1
+1;
287 m
=diff%BOCU
1_TRAIL_COUNT
;
288 diff
/=BOCU1_TRAIL_COUNT
;
289 result
|=BOCU1_TRAIL_TO_BYTE(m
);
291 result
|=(BOCU1_START_POS_2
+diff
)<<8;
292 } else if(diff
<=BOCU1_REACH_POS_3
) {
294 diff
-=BOCU1_REACH_POS_2
+1;
297 m
=diff%BOCU
1_TRAIL_COUNT
;
298 diff
/=BOCU1_TRAIL_COUNT
;
299 result
|=BOCU1_TRAIL_TO_BYTE(m
);
301 m
=diff%BOCU
1_TRAIL_COUNT
;
302 diff
/=BOCU1_TRAIL_COUNT
;
303 result
|=BOCU1_TRAIL_TO_BYTE(m
)<<8;
305 result
|=(BOCU1_START_POS_3
+diff
)<<16;
308 diff
-=BOCU1_REACH_POS_3
+1;
310 m
=diff%BOCU
1_TRAIL_COUNT
;
311 diff
/=BOCU1_TRAIL_COUNT
;
312 result
=BOCU1_TRAIL_TO_BYTE(m
);
314 m
=diff%BOCU
1_TRAIL_COUNT
;
315 diff
/=BOCU1_TRAIL_COUNT
;
316 result
|=BOCU1_TRAIL_TO_BYTE(m
)<<8;
319 * We know that / and % would deliver quotient 0 and rest=diff.
320 * Avoid division and modulo for performance.
322 result
|=BOCU1_TRAIL_TO_BYTE(diff
)<<16;
324 result
|=((uint32_t)BOCU1_START_POS_4
)<<24;
327 /* two- to four-byte negative differences */
328 if(diff
>=BOCU1_REACH_NEG_2
) {
330 diff
-=BOCU1_REACH_NEG_1
;
333 NEGDIVMOD(diff
, BOCU1_TRAIL_COUNT
, m
);
334 result
|=BOCU1_TRAIL_TO_BYTE(m
);
336 result
|=(BOCU1_START_NEG_2
+diff
)<<8;
337 } else if(diff
>=BOCU1_REACH_NEG_3
) {
339 diff
-=BOCU1_REACH_NEG_2
;
342 NEGDIVMOD(diff
, BOCU1_TRAIL_COUNT
, m
);
343 result
|=BOCU1_TRAIL_TO_BYTE(m
);
345 NEGDIVMOD(diff
, BOCU1_TRAIL_COUNT
, m
);
346 result
|=BOCU1_TRAIL_TO_BYTE(m
)<<8;
348 result
|=(BOCU1_START_NEG_3
+diff
)<<16;
351 diff
-=BOCU1_REACH_NEG_3
;
353 NEGDIVMOD(diff
, BOCU1_TRAIL_COUNT
, m
);
354 result
=BOCU1_TRAIL_TO_BYTE(m
);
356 NEGDIVMOD(diff
, BOCU1_TRAIL_COUNT
, m
);
357 result
|=BOCU1_TRAIL_TO_BYTE(m
)<<8;
360 * We know that NEGDIVMOD would deliver
361 * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
362 * Avoid division and modulo for performance.
364 m
=diff
+BOCU1_TRAIL_COUNT
;
365 result
|=BOCU1_TRAIL_TO_BYTE(m
)<<16;
367 result
|=BOCU1_MIN
<<24;
373 /* Faster versions of packDiff() for single-byte-encoded diff values. */
375 /** Is a diff value encodable in a single byte? */
376 #define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)
378 /** Encode a diff value in a single byte. */
379 #define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))
381 /** Is a diff value encodable in two bytes? */
382 #define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)
385 _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
386 UErrorCode
*pErrorCode
) {
388 const UChar
*source
, *sourceLimit
;
390 int32_t targetCapacity
;
393 int32_t prev
, c
, diff
;
395 int32_t sourceIndex
, nextSourceIndex
;
399 /* set up the local pointers */
400 cnv
=pArgs
->converter
;
401 source
=pArgs
->source
;
402 sourceLimit
=pArgs
->sourceLimit
;
403 target
=(uint8_t *)pArgs
->target
;
404 targetCapacity
=pArgs
->targetLimit
-pArgs
->target
;
405 offsets
=pArgs
->offsets
;
407 /* get the converter state from UConverter */
409 prev
=(int32_t)cnv
->fromUnicodeStatus
;
411 prev
=BOCU1_ASCII_PREV
;
414 /* sourceIndex=-1 if the current character began in the previous buffer */
415 sourceIndex
= c
==0 ? 0 : -1;
418 /* conversion loop */
419 if(c
!=0 && targetCapacity
>0) {
424 /* fast loop for single-byte differences */
425 /* use only one loop counter variable, targetCapacity, not also source */
426 diff
=sourceLimit
-source
;
427 if(targetCapacity
>diff
) {
430 while(targetCapacity
>0 && (c
=*source
)<0x3000) {
433 prev
=BOCU1_ASCII_PREV
;
435 *target
++=(uint8_t)c
;
436 *offsets
++=nextSourceIndex
++;
441 if(DIFF_IS_SINGLE(diff
)) {
442 prev
=BOCU1_SIMPLE_PREV(c
);
443 *target
++=(uint8_t)PACK_SINGLE_DIFF(diff
);
444 *offsets
++=nextSourceIndex
++;
452 /* restore real values */
453 targetCapacity
=(const uint8_t *)pArgs
->targetLimit
-target
;
454 sourceIndex
=nextSourceIndex
; /* wrong if offsets==NULL but does not matter */
456 /* regular loop for all cases */
457 while(source
<sourceLimit
) {
458 if(targetCapacity
>0) {
464 * ISO C0 control & space:
465 * Encode directly for MIME compatibility,
466 * and reset state except for space, to not disrupt compression.
469 prev
=BOCU1_ASCII_PREV
;
471 *target
++=(uint8_t)c
;
472 *offsets
++=sourceIndex
;
475 sourceIndex
=nextSourceIndex
;
481 if(source
<sourceLimit
) {
482 /* test the following code unit */
484 if(UTF_IS_SECOND_SURROGATE(trail
)) {
487 c
=UTF16_GET_PAIR_VALUE(c
, trail
);
491 c
=-c
; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
497 * all other Unicode code points c==U+0021..U+10ffff
498 * are encoded with the difference c-prev
500 * a new prev is computed from c,
501 * placed in the middle of a 0x80-block (for most small scripts) or
502 * in the middle of the Unihan and Hangul blocks
503 * to statistically minimize the following difference
507 if(DIFF_IS_SINGLE(diff
)) {
508 *target
++=(uint8_t)PACK_SINGLE_DIFF(diff
);
509 *offsets
++=sourceIndex
;
511 sourceIndex
=nextSourceIndex
;
515 } else if(DIFF_IS_DOUBLE(diff
) && 2<=targetCapacity
) {
516 /* optimize 2-byte case */
520 diff
-=BOCU1_REACH_POS_1
+1;
521 m
=diff%BOCU
1_TRAIL_COUNT
;
522 diff
/=BOCU1_TRAIL_COUNT
;
523 diff
+=BOCU1_START_POS_2
;
525 diff
-=BOCU1_REACH_NEG_1
;
526 NEGDIVMOD(diff
, BOCU1_TRAIL_COUNT
, m
);
527 diff
+=BOCU1_START_NEG_2
;
529 *target
++=(uint8_t)diff
;
530 *target
++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m
);
531 *offsets
++=sourceIndex
;
532 *offsets
++=sourceIndex
;
534 sourceIndex
=nextSourceIndex
;
536 int32_t length
; /* will be 2..4 */
539 length
=BOCU1_LENGTH_FROM_PACKED(diff
);
541 /* write the output character bytes from diff and length */
542 /* from the first if in the loop we know that targetCapacity>0 */
543 if(length
<=targetCapacity
) {
545 /* each branch falls through to the next one */
547 *target
++=(uint8_t)(diff
>>24);
548 *offsets
++=sourceIndex
;
550 *target
++=(uint8_t)(diff
>>16);
551 *offsets
++=sourceIndex
;
553 *target
++=(uint8_t)(diff
>>8);
554 *offsets
++=sourceIndex
;
555 /* case 1: handled above */
556 *target
++=(uint8_t)diff
;
557 *offsets
++=sourceIndex
;
559 /* will never occur */
562 targetCapacity
-=length
;
563 sourceIndex
=nextSourceIndex
;
565 uint8_t *charErrorBuffer
;
568 * We actually do this backwards here:
569 * In order to save an intermediate variable, we output
570 * first to the overflow buffer what does not fit into the regular target.
573 /* we know that 1<=targetCapacity<length<=4 */
574 length
-=targetCapacity
;
575 charErrorBuffer
=(uint8_t *)cnv
->charErrorBuffer
;
577 /* each branch falls through to the next one */
579 *charErrorBuffer
++=(uint8_t)(diff
>>16);
581 *charErrorBuffer
++=(uint8_t)(diff
>>8);
583 *charErrorBuffer
=(uint8_t)diff
;
585 /* will never occur */
588 cnv
->charErrorBufferLength
=(int8_t)length
;
590 /* now output what fits into the regular target */
591 diff
>>=8*length
; /* length was reduced by targetCapacity */
592 switch(targetCapacity
) {
593 /* each branch falls through to the next one */
595 *target
++=(uint8_t)(diff
>>16);
596 *offsets
++=sourceIndex
;
598 *target
++=(uint8_t)(diff
>>8);
599 *offsets
++=sourceIndex
;
601 *target
++=(uint8_t)diff
;
602 *offsets
++=sourceIndex
;
604 /* will never occur */
608 /* target overflow */
610 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
616 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
621 /* set the converter state back into UConverter */
622 cnv
->fromUChar32
= c
<0 ? -c
: 0;
623 cnv
->fromUnicodeStatus
=(uint32_t)prev
;
625 /* write back the updated pointers */
626 pArgs
->source
=source
;
627 pArgs
->target
=(char *)target
;
628 pArgs
->offsets
=offsets
;
632 * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
633 * If a change is made in the original function, then either
634 * change this function the same way or
635 * re-copy the original function and remove the variables
636 * offsets, sourceIndex, and nextSourceIndex.
639 _Bocu1FromUnicode(UConverterFromUnicodeArgs
*pArgs
,
640 UErrorCode
*pErrorCode
) {
642 const UChar
*source
, *sourceLimit
;
644 int32_t targetCapacity
;
646 int32_t prev
, c
, diff
;
648 /* set up the local pointers */
649 cnv
=pArgs
->converter
;
650 source
=pArgs
->source
;
651 sourceLimit
=pArgs
->sourceLimit
;
652 target
=(uint8_t *)pArgs
->target
;
653 targetCapacity
=pArgs
->targetLimit
-pArgs
->target
;
655 /* get the converter state from UConverter */
657 prev
=(int32_t)cnv
->fromUnicodeStatus
;
659 prev
=BOCU1_ASCII_PREV
;
662 /* conversion loop */
663 if(c
!=0 && targetCapacity
>0) {
668 /* fast loop for single-byte differences */
669 /* use only one loop counter variable, targetCapacity, not also source */
670 diff
=sourceLimit
-source
;
671 if(targetCapacity
>diff
) {
674 while(targetCapacity
>0 && (c
=*source
)<0x3000) {
677 prev
=BOCU1_ASCII_PREV
;
679 *target
++=(uint8_t)c
;
682 if(DIFF_IS_SINGLE(diff
)) {
683 prev
=BOCU1_SIMPLE_PREV(c
);
684 *target
++=(uint8_t)PACK_SINGLE_DIFF(diff
);
692 /* restore real values */
693 targetCapacity
=(const uint8_t *)pArgs
->targetLimit
-target
;
695 /* regular loop for all cases */
696 while(source
<sourceLimit
) {
697 if(targetCapacity
>0) {
702 * ISO C0 control & space:
703 * Encode directly for MIME compatibility,
704 * and reset state except for space, to not disrupt compression.
707 prev
=BOCU1_ASCII_PREV
;
709 *target
++=(uint8_t)c
;
716 if(source
<sourceLimit
) {
717 /* test the following code unit */
719 if(UTF_IS_SECOND_SURROGATE(trail
)) {
721 c
=UTF16_GET_PAIR_VALUE(c
, trail
);
725 c
=-c
; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
731 * all other Unicode code points c==U+0021..U+10ffff
732 * are encoded with the difference c-prev
734 * a new prev is computed from c,
735 * placed in the middle of a 0x80-block (for most small scripts) or
736 * in the middle of the Unihan and Hangul blocks
737 * to statistically minimize the following difference
741 if(DIFF_IS_SINGLE(diff
)) {
742 *target
++=(uint8_t)PACK_SINGLE_DIFF(diff
);
747 } else if(DIFF_IS_DOUBLE(diff
) && 2<=targetCapacity
) {
748 /* optimize 2-byte case */
752 diff
-=BOCU1_REACH_POS_1
+1;
753 m
=diff%BOCU
1_TRAIL_COUNT
;
754 diff
/=BOCU1_TRAIL_COUNT
;
755 diff
+=BOCU1_START_POS_2
;
757 diff
-=BOCU1_REACH_NEG_1
;
758 NEGDIVMOD(diff
, BOCU1_TRAIL_COUNT
, m
);
759 diff
+=BOCU1_START_NEG_2
;
761 *target
++=(uint8_t)diff
;
762 *target
++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m
);
765 int32_t length
; /* will be 2..4 */
768 length
=BOCU1_LENGTH_FROM_PACKED(diff
);
770 /* write the output character bytes from diff and length */
771 /* from the first if in the loop we know that targetCapacity>0 */
772 if(length
<=targetCapacity
) {
774 /* each branch falls through to the next one */
776 *target
++=(uint8_t)(diff
>>24);
778 *target
++=(uint8_t)(diff
>>16);
779 /* case 2: handled above */
780 *target
++=(uint8_t)(diff
>>8);
781 /* case 1: handled above */
782 *target
++=(uint8_t)diff
;
784 /* will never occur */
787 targetCapacity
-=length
;
789 uint8_t *charErrorBuffer
;
792 * We actually do this backwards here:
793 * In order to save an intermediate variable, we output
794 * first to the overflow buffer what does not fit into the regular target.
797 /* we know that 1<=targetCapacity<length<=4 */
798 length
-=targetCapacity
;
799 charErrorBuffer
=(uint8_t *)cnv
->charErrorBuffer
;
801 /* each branch falls through to the next one */
803 *charErrorBuffer
++=(uint8_t)(diff
>>16);
805 *charErrorBuffer
++=(uint8_t)(diff
>>8);
807 *charErrorBuffer
=(uint8_t)diff
;
809 /* will never occur */
812 cnv
->charErrorBufferLength
=(int8_t)length
;
814 /* now output what fits into the regular target */
815 diff
>>=8*length
; /* length was reduced by targetCapacity */
816 switch(targetCapacity
) {
817 /* each branch falls through to the next one */
819 *target
++=(uint8_t)(diff
>>16);
821 *target
++=(uint8_t)(diff
>>8);
823 *target
++=(uint8_t)diff
;
825 /* will never occur */
829 /* target overflow */
831 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
837 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
842 /* set the converter state back into UConverter */
843 cnv
->fromUChar32
= c
<0 ? -c
: 0;
844 cnv
->fromUnicodeStatus
=(uint32_t)prev
;
846 /* write back the updated pointers */
847 pArgs
->source
=source
;
848 pArgs
->target
=(char *)target
;
851 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
854 * Function for BOCU-1 decoder; handles multi-byte lead bytes.
856 * @param b lead byte;
857 * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
858 * @return (diff<<2)|count
860 static U_INLINE
int32_t
861 decodeBocu1LeadByte(int32_t b
) {
864 if(b
>=BOCU1_START_NEG_2
) {
865 /* positive difference */
866 if(b
<BOCU1_START_POS_3
) {
868 diff
=((int32_t)b
-BOCU1_START_POS_2
)*BOCU1_TRAIL_COUNT
+BOCU1_REACH_POS_1
+1;
870 } else if(b
<BOCU1_START_POS_4
) {
872 diff
=((int32_t)b
-BOCU1_START_POS_3
)*BOCU1_TRAIL_COUNT
*BOCU1_TRAIL_COUNT
+BOCU1_REACH_POS_2
+1;
876 diff
=BOCU1_REACH_POS_3
+1;
880 /* negative difference */
881 if(b
>=BOCU1_START_NEG_3
) {
883 diff
=((int32_t)b
-BOCU1_START_NEG_2
)*BOCU1_TRAIL_COUNT
+BOCU1_REACH_NEG_1
;
885 } else if(b
>BOCU1_MIN
) {
887 diff
=((int32_t)b
-BOCU1_START_NEG_3
)*BOCU1_TRAIL_COUNT
*BOCU1_TRAIL_COUNT
+BOCU1_REACH_NEG_2
;
891 diff
=-BOCU1_TRAIL_COUNT
*BOCU1_TRAIL_COUNT
*BOCU1_TRAIL_COUNT
+BOCU1_REACH_NEG_3
;
896 /* return the state for decoding the trail byte(s) */
897 return (diff
<<2)|count
;
901 * Function for BOCU-1 decoder; handles multi-byte trail bytes.
903 * @param count number of remaining trail bytes including this one
904 * @param b trail byte
905 * @return new delta for diff including b - <0 indicates an error
909 static U_INLINE
int32_t
910 decodeBocu1TrailByte(int32_t count
, int32_t b
) {
912 /* skip some C0 controls and make the trail byte range contiguous */
913 b
=bocu1ByteToTrail
[b
];
914 /* b<0 for an illegal trail byte value will result in return<0 below */
915 #if BOCU1_MAX_TRAIL<0xff
916 } else if(b
>BOCU1_MAX_TRAIL
) {
920 b
-=BOCU1_TRAIL_BYTE_OFFSET
;
923 /* add trail byte into difference and decrement count */
926 } else if(count
==2) {
927 return b
*BOCU1_TRAIL_COUNT
;
928 } else /* count==3 */ {
929 return b
*(BOCU1_TRAIL_COUNT
*BOCU1_TRAIL_COUNT
);
934 _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
935 UErrorCode
*pErrorCode
) {
937 const uint8_t *source
, *sourceLimit
;
939 const UChar
*targetLimit
;
942 int32_t prev
, count
, diff
, c
;
947 int32_t sourceIndex
, nextSourceIndex
;
949 /* set up the local pointers */
950 cnv
=pArgs
->converter
;
951 source
=(const uint8_t *)pArgs
->source
;
952 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
953 target
=pArgs
->target
;
954 targetLimit
=pArgs
->targetLimit
;
955 offsets
=pArgs
->offsets
;
957 /* get the converter state from UConverter */
958 prev
=(int32_t)cnv
->toUnicodeStatus
;
960 prev
=BOCU1_ASCII_PREV
;
962 diff
=cnv
->mode
; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
966 byteIndex
=cnv
->toULength
;
969 /* sourceIndex=-1 if the current character began in the previous buffer */
970 sourceIndex
=byteIndex
==0 ? 0 : -1;
973 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
974 if(count
>0 && byteIndex
>0 && target
<targetLimit
) {
979 /* fast loop for single-byte differences */
980 /* use count as the only loop counter variable */
981 diff
=sourceLimit
-source
;
982 count
=pArgs
->targetLimit
-target
;
987 if(BOCU1_START_NEG_2
<=(c
=*source
) && c
<BOCU1_START_POS_2
) {
988 c
=prev
+(c
-BOCU1_MIDDLE
);
991 *offsets
++=nextSourceIndex
++;
992 prev
=BOCU1_SIMPLE_PREV(c
);
998 prev
=BOCU1_ASCII_PREV
;
1001 *offsets
++=nextSourceIndex
++;
1008 sourceIndex
=nextSourceIndex
; /* wrong if offsets==NULL but does not matter */
1010 /* decode a sequence of single and lead bytes */
1011 while(source
<sourceLimit
) {
1012 if(target
>=targetLimit
) {
1013 /* target is full */
1014 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1020 if(BOCU1_START_NEG_2
<=c
&& c
<BOCU1_START_POS_2
) {
1021 /* Write a code point directly from a single-byte difference. */
1022 c
=prev
+(c
-BOCU1_MIDDLE
);
1025 *offsets
++=sourceIndex
;
1026 prev
=BOCU1_SIMPLE_PREV(c
);
1027 sourceIndex
=nextSourceIndex
;
1030 } else if(c
<=0x20) {
1032 * Direct-encoded C0 control code or space.
1033 * Reset prev for C0 control codes but not for space.
1036 prev
=BOCU1_ASCII_PREV
;
1039 *offsets
++=sourceIndex
;
1040 sourceIndex
=nextSourceIndex
;
1042 } else if(BOCU1_START_NEG_3
<=c
&& c
<BOCU1_START_POS_3
&& source
<sourceLimit
) {
1043 /* Optimize two-byte case. */
1044 if(c
>=BOCU1_MIDDLE
) {
1045 diff
=((int32_t)c
-BOCU1_START_POS_2
)*BOCU1_TRAIL_COUNT
+BOCU1_REACH_POS_1
+1;
1047 diff
=((int32_t)c
-BOCU1_START_NEG_2
)*BOCU1_TRAIL_COUNT
+BOCU1_REACH_NEG_1
;
1052 c
=decodeBocu1TrailByte(1, *source
++);
1053 if(c
<0 || (uint32_t)(c
=prev
+diff
+c
)>0x10ffff) {
1054 bytes
[0]=source
[-2];
1055 bytes
[1]=source
[-1];
1057 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1060 } else if(c
==BOCU1_RESET
) {
1061 /* only reset the state, no code point */
1062 prev
=BOCU1_ASCII_PREV
;
1063 sourceIndex
=nextSourceIndex
;
1067 * For multi-byte difference lead bytes, set the decoder state
1068 * with the partial difference value from the lead byte and
1069 * with the number of trail bytes.
1071 bytes
[0]=(uint8_t)c
;
1074 diff
=decodeBocu1LeadByte(c
);
1079 if(source
>=sourceLimit
) {
1083 c
=bytes
[byteIndex
++]=*source
++;
1085 /* trail byte in any position */
1086 c
=decodeBocu1TrailByte(count
, c
);
1088 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1094 /* final trail byte, deliver a code point */
1097 if((uint32_t)c
>0x10ffff) {
1098 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1106 /* calculate the next prev and output c */
1110 *offsets
++=sourceIndex
;
1112 /* output surrogate pair */
1113 *target
++=UTF16_LEAD(c
);
1114 if(target
<targetLimit
) {
1115 *target
++=UTF16_TRAIL(c
);
1116 *offsets
++=sourceIndex
;
1117 *offsets
++=sourceIndex
;
1119 /* target overflow */
1120 *offsets
++=sourceIndex
;
1121 cnv
->UCharErrorBuffer
[0]=UTF16_TRAIL(c
);
1122 cnv
->UCharErrorBufferLength
=1;
1123 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1127 sourceIndex
=nextSourceIndex
;
1131 if(*pErrorCode
==U_ILLEGAL_CHAR_FOUND
) {
1132 /* set the converter state in UConverter to deal with the next character */
1133 cnv
->toUnicodeStatus
=BOCU1_ASCII_PREV
;
1136 /* set the converter state back into UConverter */
1137 cnv
->toUnicodeStatus
=(uint32_t)prev
;
1138 cnv
->mode
=(diff
<<2)|count
;
1140 cnv
->toULength
=byteIndex
;
1142 /* write back the updated pointers */
1143 pArgs
->source
=(const char *)source
;
1144 pArgs
->target
=target
;
1145 pArgs
->offsets
=offsets
;
1150 * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
1151 * If a change is made in the original function, then either
1152 * change this function the same way or
1153 * re-copy the original function and remove the variables
1154 * offsets, sourceIndex, and nextSourceIndex.
1157 _Bocu1ToUnicode(UConverterToUnicodeArgs
*pArgs
,
1158 UErrorCode
*pErrorCode
) {
1160 const uint8_t *source
, *sourceLimit
;
1162 const UChar
*targetLimit
;
1164 int32_t prev
, count
, diff
, c
;
1171 /* set up the local pointers */
1172 cnv
=pArgs
->converter
;
1173 source
=(const uint8_t *)pArgs
->source
;
1174 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
1175 target
=pArgs
->target
;
1176 targetLimit
=pArgs
->targetLimit
;
1178 /* get the converter state from UConverter */
1179 prev
=(int32_t)cnv
->toUnicodeStatus
;
1181 prev
=BOCU1_ASCII_PREV
;
1183 diff
=cnv
->mode
; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
1187 byteIndex
=cnv
->toULength
;
1188 bytes
=cnv
->toUBytes
;
1190 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
1191 if(count
>0 && byteIndex
>0 && target
<targetLimit
) {
1196 /* fast loop for single-byte differences */
1197 /* use count as the only loop counter variable */
1198 diff
=sourceLimit
-source
;
1199 count
=pArgs
->targetLimit
-target
;
1204 if(BOCU1_START_NEG_2
<=(c
=*source
) && c
<BOCU1_START_POS_2
) {
1205 c
=prev
+(c
-BOCU1_MIDDLE
);
1208 prev
=BOCU1_SIMPLE_PREV(c
);
1212 } else if(c
<=0x20) {
1214 prev
=BOCU1_ASCII_PREV
;
1224 /* decode a sequence of single and lead bytes */
1225 while(source
<sourceLimit
) {
1226 if(target
>=targetLimit
) {
1227 /* target is full */
1228 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1233 if(BOCU1_START_NEG_2
<=c
&& c
<BOCU1_START_POS_2
) {
1234 /* Write a code point directly from a single-byte difference. */
1235 c
=prev
+(c
-BOCU1_MIDDLE
);
1238 prev
=BOCU1_SIMPLE_PREV(c
);
1241 } else if(c
<=0x20) {
1243 * Direct-encoded C0 control code or space.
1244 * Reset prev for C0 control codes but not for space.
1247 prev
=BOCU1_ASCII_PREV
;
1251 } else if(BOCU1_START_NEG_3
<=c
&& c
<BOCU1_START_POS_3
&& source
<sourceLimit
) {
1252 /* Optimize two-byte case. */
1253 if(c
>=BOCU1_MIDDLE
) {
1254 diff
=((int32_t)c
-BOCU1_START_POS_2
)*BOCU1_TRAIL_COUNT
+BOCU1_REACH_POS_1
+1;
1256 diff
=((int32_t)c
-BOCU1_START_NEG_2
)*BOCU1_TRAIL_COUNT
+BOCU1_REACH_NEG_1
;
1260 c
=decodeBocu1TrailByte(1, *source
++);
1261 if(c
<0 || (uint32_t)(c
=prev
+diff
+c
)>0x10ffff) {
1262 bytes
[0]=source
[-2];
1263 bytes
[1]=source
[-1];
1265 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1268 } else if(c
==BOCU1_RESET
) {
1269 /* only reset the state, no code point */
1270 prev
=BOCU1_ASCII_PREV
;
1274 * For multi-byte difference lead bytes, set the decoder state
1275 * with the partial difference value from the lead byte and
1276 * with the number of trail bytes.
1278 bytes
[0]=(uint8_t)c
;
1281 diff
=decodeBocu1LeadByte(c
);
1286 if(source
>=sourceLimit
) {
1289 c
=bytes
[byteIndex
++]=*source
++;
1291 /* trail byte in any position */
1292 c
=decodeBocu1TrailByte(count
, c
);
1294 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1300 /* final trail byte, deliver a code point */
1303 if((uint32_t)c
>0x10ffff) {
1304 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1312 /* calculate the next prev and output c */
1317 /* output surrogate pair */
1318 *target
++=UTF16_LEAD(c
);
1319 if(target
<targetLimit
) {
1320 *target
++=UTF16_TRAIL(c
);
1322 /* target overflow */
1323 cnv
->UCharErrorBuffer
[0]=UTF16_TRAIL(c
);
1324 cnv
->UCharErrorBufferLength
=1;
1325 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1332 if(*pErrorCode
==U_ILLEGAL_CHAR_FOUND
) {
1333 /* set the converter state in UConverter to deal with the next character */
1334 cnv
->toUnicodeStatus
=BOCU1_ASCII_PREV
;
1337 /* set the converter state back into UConverter */
1338 cnv
->toUnicodeStatus
=(uint32_t)prev
;
1339 cnv
->mode
=(diff
<<2)|count
;
1341 cnv
->toULength
=byteIndex
;
1343 /* write back the updated pointers */
1344 pArgs
->source
=(const char *)source
;
1345 pArgs
->target
=target
;
1349 /* miscellaneous ------------------------------------------------------------ */
1351 static const UConverterImpl _Bocu1Impl
={
1362 _Bocu1ToUnicodeWithOffsets
,
1364 _Bocu1FromUnicodeWithOffsets
,
1371 ucnv_getCompleteUnicodeSet
1374 static const UConverterStaticData _Bocu1StaticData
={
1375 sizeof(UConverterStaticData
),
1377 0, /* CCSID for BOCU-1 */
1378 UCNV_IBM
, UCNV_BOCU1
,
1379 1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
1380 { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
1384 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1387 const UConverterSharedData _Bocu1Data
={
1388 sizeof(UConverterSharedData
), ~((uint32_t)0),
1389 NULL
, NULL
, &_Bocu1StaticData
, FALSE
, &_Bocu1Impl
,