2 ******************************************************************************
4 * Copyright (C) 2002-2011, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
8 * file name: ucnvbocu.cpp
10 * tab size: 8 (not used)
13 * created on: 2002mar27
14 * created by: Markus W. Scherer
16 * This is an implementation of the Binary Ordered Compression for Unicode,
17 * in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
20 #include "unicode/utypes.h"
22 #if !UCONFIG_NO_CONVERSION
24 #include "unicode/ucnv.h"
25 #include "unicode/ucnv_cb.h"
26 #include "unicode/utf16.h"
32 /* BOCU-1 constants and macros ---------------------------------------------- */
35 * BOCU-1 encodes the code points of a Unicode string as
36 * a sequence of byte-encoded differences (slope detection),
37 * preserving lexical order.
39 * Optimize the difference-taking for runs of Unicode text within
42 * Most small scripts are allocated within aligned 128-blocks of Unicode
43 * code points. Lexical order is preserved if the "previous code point" state
44 * is always moved into the middle of such a block.
46 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
47 * areas into the middle of those areas.
49 * C0 control codes and space are encoded with their US-ASCII bytes.
50 * "prev" is reset for C0 controls but not for space.
53 /* initial value for "prev": middle of the ASCII range */
/* The converters in this file also assign this value to "prev" when they reset their state. */
54 #define BOCU1_ASCII_PREV 0x40
56 /* bounding byte values for differences */
/*
 * BOCU1_MIN        lowest byte value used as a difference lead byte
 * BOCU1_MIDDLE     byte that encodes a difference of 0 (see PACK_SINGLE_DIFF)
 * BOCU1_MAX_LEAD   highest lead byte value
 * BOCU1_MAX_TRAIL  highest trail byte value
 * BOCU1_RESET      byte that only resets the decoder state and yields no code point;
 *                  it has the same numeric value as BOCU1_MAX_TRAIL
 */
57 #define BOCU1_MIN 0x21
58 #define BOCU1_MIDDLE 0x90
59 #define BOCU1_MAX_LEAD 0xfe
60 #define BOCU1_MAX_TRAIL 0xff
61 #define BOCU1_RESET 0xff
63 /* number of lead bytes */
/* With the constants above this evaluates to 0xfe-0x21+1 = 222. */
64 #define BOCU1_COUNT (BOCU1_MAX_LEAD-BOCU1_MIN+1)
66 /* adjust trail byte counts for the use of some C0 control byte values */
/*
 * 20 control byte values serve as additional trail bytes; they take the
 * trail values 0..19, so a byte b>=BOCU1_MIN has the trail value
 * b-BOCU1_TRAIL_BYTE_OFFSET (the offset evaluates to 0x21-20 = 13).
 */
67 #define BOCU1_TRAIL_CONTROLS_COUNT 20
68 #define BOCU1_TRAIL_BYTE_OFFSET (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)
70 /* number of trail bytes */
/* Evaluates to (0xff-0x21+1)+20 = 243; this is the radix of the trail "digits". */
71 #define BOCU1_TRAIL_COUNT ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)
74 * number of positive and negative single-byte codes
75 * (counting 0==BOCU1_MIDDLE among the positive ones)
/* 64 single-byte codes on each side of BOCU1_MIDDLE (the positive side includes 0). */
77 #define BOCU1_SINGLE 64
79 /* number of lead bytes for positive and negative 2/3/4-byte sequences */
/*
 * Per sign: 64 single bytes + 43 + 3 + 1 lead bytes = 111,
 * and 2*111 = 222 = BOCU1_COUNT, so the lead byte range is used completely.
 */
80 #define BOCU1_LEAD_2 43
81 #define BOCU1_LEAD_3 3
82 #define BOCU1_LEAD_4 1
/*
 * "Reach" = the largest (POS) and smallest (NEG) difference value that can
 * still be encoded with the given number of bytes. Each longer form extends
 * the previous reach by (number of lead bytes)*(BOCU1_TRAIL_COUNT^trail bytes).
 */
84 /* The difference value range for single-byters. */
/* Evaluates to -64..63. */
85 #define BOCU1_REACH_POS_1 (BOCU1_SINGLE-1)
86 #define BOCU1_REACH_NEG_1 (-BOCU1_SINGLE)
88 /* The difference value range for double-byters. */
/* Evaluates to -10513..10512 (43*243 = 10449 values beyond the single-byte reach on each side). */
89 #define BOCU1_REACH_POS_2 (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
90 #define BOCU1_REACH_NEG_2 (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
92 /* The difference value range for 3-byters. */
/* Evaluates to -187660..187659 (3*243*243 = 177147 values beyond the two-byte reach on each side). */
93 #define BOCU1_REACH_POS_3 \
94 (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
96 #define BOCU1_REACH_NEG_3 (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
98 /* The lead byte start values. */
/*
 * Positive side: first lead byte of the 2-, 3- and 4-byte forms.
 * With the constants above these evaluate to 0xd0, 0xfb and 0xfe.
 */
99 #define BOCU1_START_POS_2 (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
100 #define BOCU1_START_POS_3 (BOCU1_START_POS_2+BOCU1_LEAD_2)
101 #define BOCU1_START_POS_4 (BOCU1_START_POS_3+BOCU1_LEAD_3)
102 /* ==BOCU1_MAX_LEAD */
/*
 * Negative side: these evaluate to 0x50, 0x25 and 0x22.
 * BOCU1_START_NEG_2 is the lowest single-byte code; lead bytes below it
 * start negative two-byte sequences, those below BOCU1_START_NEG_3 start
 * three-byte sequences, and those below BOCU1_START_NEG_4 (i.e. BOCU1_MIN)
 * start four-byte sequences.
 */
104 #define BOCU1_START_NEG_2 (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
105 #define BOCU1_START_NEG_3 (BOCU1_START_NEG_2-BOCU1_LEAD_2)
106 #define BOCU1_START_NEG_4 (BOCU1_START_NEG_3-BOCU1_LEAD_3)
109 /* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */
/*
 * The lead byte ranges are nested around BOCU1_MIDDLE, so the tests go from
 * the innermost (1-byte) range outwards; anything outside the 3-byte range
 * is reported as length 4.
 * Note: the argument is evaluated more than once; pass an expression
 * without side effects.
 */
110 #define BOCU1_LENGTH_FROM_LEAD(lead) \
111 ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
112 (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
113 (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)
115 /* The length of a byte sequence, according to its packed form. */
/*
 * For packed values below 0x04000000 the top byte is taken as the length
 * (packDiff() documents 0x01/0x02/0x03 there for 1..3-byte sequences);
 * any larger value is a 4-byte sequence whose top byte is its lead byte.
 * Note: the argument is evaluated twice, and only the comparison uses the
 * unsigned cast.
 */
116 #define BOCU1_LENGTH_FROM_PACKED(packed) \
117 ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
120 * 12 commonly used C0 control codes (and space) are only used to encode
121 * themselves directly,
122 * which makes BOCU-1 MIME-usable and reasonably safe for
123 * ASCII-oriented software.
143 * The other 20 C0 controls are also encoded directly (to preserve order)
144 * but are also used as trail bytes in difference encoding
145 * (for better compression).
/*
 * Map a trail value t (as used in the difference calculation) to the byte
 * that is written: values from BOCU1_TRAIL_CONTROLS_COUNT upwards are shifted
 * by BOCU1_TRAIL_BYTE_OFFSET, smaller ones are looked up in bocu1TrailToByte[].
 * Note: t is evaluated more than once and is used as an array index,
 * so it must not be negative.
 */
147 #define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
150 * Byte value map for control codes,
151 * from external byte values 0x00..0x20
152 * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
153 * External byte values that are illegal as trail bytes are mapped to -1.
156 bocu1ByteToTrail
[BOCU1_MIN
]={
157 /* 0 1 2 3 4 5 6 7 */
158 -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
160 /* 8 9 a b c d e f */
161 -1, -1, -1, -1, -1, -1, -1, -1,
163 /* 10 11 12 13 14 15 16 17 */
164 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
166 /* 18 19 1a 1b 1c 1d 1e 1f */
167 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13,
174 * Byte value map for control codes,
175 * from trail byte values 0..19 (0..0x13) as used in the difference calculation
176 * to external byte values 0x00..0x20.
179 bocu1TrailToByte
[BOCU1_TRAIL_CONTROLS_COUNT
]={
180 /* 0 1 2 3 4 5 6 7 */
181 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
183 /* 8 9 a b c d e f */
184 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
187 0x1c, 0x1d, 0x1e, 0x1f
191 * Integer division and modulo with negative numerators
192 * yield negative modulo results and quotients that are one more than
194 * This macro adjusts the results so that the modulo-value m is always >=0.
196 * For positive n, the if() condition is always FALSE.
198 * @param n Number to be split into quotient and rest.
199 * Will be modified to contain the quotient.
201 * @param m Output variable for the rest (modulo result).
203 #define NEGDIVMOD(n, d, m) { \
212 /* Faster versions of packDiff() for single-byte-encoded diff values. */
/* Note: each of these macros may evaluate its argument more than once. */
214 /** Is a diff value encodable in a single byte? */
/* True for BOCU1_REACH_NEG_1..BOCU1_REACH_POS_1, which evaluates to -64..63. */
215 #define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)
217 /** Encode a diff value in a single byte. */
/* Only meaningful when DIFF_IS_SINGLE(diff) holds; the result is then in 0x50..0xcf. */
218 #define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))
220 /** Is a diff value encodable in two bytes? */
/*
 * True for BOCU1_REACH_NEG_2..BOCU1_REACH_POS_2, which includes the
 * single-byte range; the converters below test DIFF_IS_SINGLE() first.
 */
221 #define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)
223 /* BOCU-1 implementation functions ------------------------------------------ */
/*
 * "prev" for code points in ordinary 128-aligned blocks: clear the low
 * 7 bits of c and add 0x40, i.e. the middle of the block that contains c.
 */
225 #define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
228 * Compute the next "previous" value for differencing
229 * from the current code point.
231 * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
232 * @return "previous code point" state value
234 static inline int32_t
235 bocu1Prev(int32_t c
) {
236 /* compute new prev */
237 if(/* 0x3040<=c && */ c
<=0x309f) {
238 /* Hiragana is not 128-aligned */
240 } else if(0x4e00<=c
&& c
<=0x9fa5) {
242 return 0x4e00-BOCU1_REACH_NEG_2
;
243 } else if(0xac00<=c
/* && c<=0xd7a3 */) {
245 return (0xd7a3+0xac00)/2;
247 /* mostly small scripts */
248 return BOCU1_SIMPLE_PREV(c
);
252 /** Fast version of bocu1Prev() for most scripts. */
/*
 * Code points below 0x3040 or above 0xd7a3 use BOCU1_SIMPLE_PREV() inline;
 * only the range 0x3040..0xd7a3 calls bocu1Prev(), which special-cases
 * parts of that range.
 * Note: c is evaluated more than once.
 */
253 #define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
256 * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
257 * The UConverter fields are used as follows:
259 * fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
261 * toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
262 * mode decoder's incomplete (diff<<2)|count (ignored when toULength==0)
265 /* BOCU-1-from-Unicode conversion functions --------------------------------- */
268 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
269 * and return a packed integer with them.
271 * The encoding favors small absolute differences with short encodings
272 * to compress runs of same-script characters.
274 * Optimized version with unrolled loops and fewer floating-point operations
275 * than the standard packDiff().
277 * @param diff difference value -0x10ffff..0x10ffff
279 * 0x010000zz for 1-byte sequence zz
280 * 0x0200yyzz for 2-byte sequence yy zz
281 * 0x03xxyyzz for 3-byte sequence xx yy zz
282 * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
285 packDiff(int32_t diff
) {
288 U_ASSERT(!DIFF_IS_SINGLE(diff
)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */
289 if(diff
>=BOCU1_REACH_NEG_1
) {
290 /* mostly positive differences, and single-byte negative ones */
291 #if 0 /* single-byte case handled in macros, see below */
292 if(diff
<=BOCU1_REACH_POS_1
) {
294 return 0x01000000|(BOCU1_MIDDLE
+diff
);
297 if(diff
<=BOCU1_REACH_POS_2
) {
299 diff
-=BOCU1_REACH_POS_1
+1;
302 m
=diff%BOCU
1_TRAIL_COUNT
;
303 diff
/=BOCU1_TRAIL_COUNT
;
304 result
|=BOCU1_TRAIL_TO_BYTE(m
);
306 result
|=(BOCU1_START_POS_2
+diff
)<<8;
307 } else if(diff
<=BOCU1_REACH_POS_3
) {
309 diff
-=BOCU1_REACH_POS_2
+1;
312 m
=diff%BOCU
1_TRAIL_COUNT
;
313 diff
/=BOCU1_TRAIL_COUNT
;
314 result
|=BOCU1_TRAIL_TO_BYTE(m
);
316 m
=diff%BOCU
1_TRAIL_COUNT
;
317 diff
/=BOCU1_TRAIL_COUNT
;
318 result
|=BOCU1_TRAIL_TO_BYTE(m
)<<8;
320 result
|=(BOCU1_START_POS_3
+diff
)<<16;
323 diff
-=BOCU1_REACH_POS_3
+1;
325 m
=diff%BOCU
1_TRAIL_COUNT
;
326 diff
/=BOCU1_TRAIL_COUNT
;
327 result
=BOCU1_TRAIL_TO_BYTE(m
);
329 m
=diff%BOCU
1_TRAIL_COUNT
;
330 diff
/=BOCU1_TRAIL_COUNT
;
331 result
|=BOCU1_TRAIL_TO_BYTE(m
)<<8;
334 * We know that / and % would deliver quotient 0 and rest=diff.
335 * Avoid division and modulo for performance.
337 result
|=BOCU1_TRAIL_TO_BYTE(diff
)<<16;
339 result
|=((uint32_t)BOCU1_START_POS_4
)<<24;
342 /* two- to four-byte negative differences */
343 if(diff
>=BOCU1_REACH_NEG_2
) {
345 diff
-=BOCU1_REACH_NEG_1
;
348 NEGDIVMOD(diff
, BOCU1_TRAIL_COUNT
, m
);
349 result
|=BOCU1_TRAIL_TO_BYTE(m
);
351 result
|=(BOCU1_START_NEG_2
+diff
)<<8;
352 } else if(diff
>=BOCU1_REACH_NEG_3
) {
354 diff
-=BOCU1_REACH_NEG_2
;
357 NEGDIVMOD(diff
, BOCU1_TRAIL_COUNT
, m
);
358 result
|=BOCU1_TRAIL_TO_BYTE(m
);
360 NEGDIVMOD(diff
, BOCU1_TRAIL_COUNT
, m
);
361 result
|=BOCU1_TRAIL_TO_BYTE(m
)<<8;
363 result
|=(BOCU1_START_NEG_3
+diff
)<<16;
366 diff
-=BOCU1_REACH_NEG_3
;
368 NEGDIVMOD(diff
, BOCU1_TRAIL_COUNT
, m
);
369 result
=BOCU1_TRAIL_TO_BYTE(m
);
371 NEGDIVMOD(diff
, BOCU1_TRAIL_COUNT
, m
);
372 result
|=BOCU1_TRAIL_TO_BYTE(m
)<<8;
375 * We know that NEGDIVMOD would deliver
376 * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
377 * Avoid division and modulo for performance.
379 m
=diff
+BOCU1_TRAIL_COUNT
;
380 result
|=BOCU1_TRAIL_TO_BYTE(m
)<<16;
382 result
|=BOCU1_MIN
<<24;
390 _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
391 UErrorCode
*pErrorCode
) {
393 const UChar
*source
, *sourceLimit
;
395 int32_t targetCapacity
;
398 int32_t prev
, c
, diff
;
400 int32_t sourceIndex
, nextSourceIndex
;
404 /* set up the local pointers */
405 cnv
=pArgs
->converter
;
406 source
=pArgs
->source
;
407 sourceLimit
=pArgs
->sourceLimit
;
408 target
=(uint8_t *)pArgs
->target
;
409 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
410 offsets
=pArgs
->offsets
;
412 /* get the converter state from UConverter */
414 prev
=(int32_t)cnv
->fromUnicodeStatus
;
416 prev
=BOCU1_ASCII_PREV
;
419 /* sourceIndex=-1 if the current character began in the previous buffer */
420 sourceIndex
= c
==0 ? 0 : -1;
423 /* conversion loop */
424 if(c
!=0 && targetCapacity
>0) {
429 /* fast loop for single-byte differences */
430 /* use only one loop counter variable, targetCapacity, not also source */
431 diff
=(int32_t)(sourceLimit
-source
);
432 if(targetCapacity
>diff
) {
435 while(targetCapacity
>0 && (c
=*source
)<0x3000) {
438 prev
=BOCU1_ASCII_PREV
;
440 *target
++=(uint8_t)c
;
441 *offsets
++=nextSourceIndex
++;
446 if(DIFF_IS_SINGLE(diff
)) {
447 prev
=BOCU1_SIMPLE_PREV(c
);
448 *target
++=(uint8_t)PACK_SINGLE_DIFF(diff
);
449 *offsets
++=nextSourceIndex
++;
457 /* restore real values */
458 targetCapacity
=(int32_t)((const uint8_t *)pArgs
->targetLimit
-target
);
459 sourceIndex
=nextSourceIndex
; /* wrong if offsets==NULL but does not matter */
461 /* regular loop for all cases */
462 while(source
<sourceLimit
) {
463 if(targetCapacity
>0) {
469 * ISO C0 control & space:
470 * Encode directly for MIME compatibility,
471 * and reset state except for space, to not disrupt compression.
474 prev
=BOCU1_ASCII_PREV
;
476 *target
++=(uint8_t)c
;
477 *offsets
++=sourceIndex
;
480 sourceIndex
=nextSourceIndex
;
486 if(source
<sourceLimit
) {
487 /* test the following code unit */
489 if(U16_IS_TRAIL(trail
)) {
492 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
496 c
=-c
; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
502 * all other Unicode code points c==U+0021..U+10ffff
503 * are encoded with the difference c-prev
505 * a new prev is computed from c,
506 * placed in the middle of a 0x80-block (for most small scripts) or
507 * in the middle of the Unihan and Hangul blocks
508 * to statistically minimize the following difference
512 if(DIFF_IS_SINGLE(diff
)) {
513 *target
++=(uint8_t)PACK_SINGLE_DIFF(diff
);
514 *offsets
++=sourceIndex
;
516 sourceIndex
=nextSourceIndex
;
520 } else if(DIFF_IS_DOUBLE(diff
) && 2<=targetCapacity
) {
521 /* optimize 2-byte case */
525 diff
-=BOCU1_REACH_POS_1
+1;
526 m
=diff%BOCU
1_TRAIL_COUNT
;
527 diff
/=BOCU1_TRAIL_COUNT
;
528 diff
+=BOCU1_START_POS_2
;
530 diff
-=BOCU1_REACH_NEG_1
;
531 NEGDIVMOD(diff
, BOCU1_TRAIL_COUNT
, m
);
532 diff
+=BOCU1_START_NEG_2
;
534 *target
++=(uint8_t)diff
;
535 *target
++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m
);
536 *offsets
++=sourceIndex
;
537 *offsets
++=sourceIndex
;
539 sourceIndex
=nextSourceIndex
;
541 int32_t length
; /* will be 2..4 */
544 length
=BOCU1_LENGTH_FROM_PACKED(diff
);
546 /* write the output character bytes from diff and length */
547 /* from the first if in the loop we know that targetCapacity>0 */
548 if(length
<=targetCapacity
) {
550 /* each branch falls through to the next one */
552 *target
++=(uint8_t)(diff
>>24);
553 *offsets
++=sourceIndex
;
554 case 3: /*fall through*/
555 *target
++=(uint8_t)(diff
>>16);
556 *offsets
++=sourceIndex
;
557 case 2: /*fall through*/
558 *target
++=(uint8_t)(diff
>>8);
559 *offsets
++=sourceIndex
;
560 /* case 1: handled above */
561 *target
++=(uint8_t)diff
;
562 *offsets
++=sourceIndex
;
564 /* will never occur */
567 targetCapacity
-=length
;
568 sourceIndex
=nextSourceIndex
;
570 uint8_t *charErrorBuffer
;
573 * We actually do this backwards here:
574 * In order to save an intermediate variable, we output
575 * first to the overflow buffer what does not fit into the
578 /* we know that 1<=targetCapacity<length<=4 */
579 length
-=targetCapacity
;
580 charErrorBuffer
=(uint8_t *)cnv
->charErrorBuffer
;
582 /* each branch falls through to the next one */
584 *charErrorBuffer
++=(uint8_t)(diff
>>16);
585 case 2: /*fall through*/
586 *charErrorBuffer
++=(uint8_t)(diff
>>8);
587 case 1: /*fall through*/
588 *charErrorBuffer
=(uint8_t)diff
;
590 /* will never occur */
593 cnv
->charErrorBufferLength
=(int8_t)length
;
595 /* now output what fits into the regular target */
596 diff
>>=8*length
; /* length was reduced by targetCapacity */
597 switch(targetCapacity
) {
598 /* each branch falls through to the next one */
600 *target
++=(uint8_t)(diff
>>16);
601 *offsets
++=sourceIndex
;
602 case 2: /*fall through*/
603 *target
++=(uint8_t)(diff
>>8);
604 *offsets
++=sourceIndex
;
605 case 1: /*fall through*/
606 *target
++=(uint8_t)diff
;
607 *offsets
++=sourceIndex
;
609 /* will never occur */
613 /* target overflow */
615 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
621 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
626 /* set the converter state back into UConverter */
627 cnv
->fromUChar32
= c
<0 ? -c
: 0;
628 cnv
->fromUnicodeStatus
=(uint32_t)prev
;
630 /* write back the updated pointers */
631 pArgs
->source
=source
;
632 pArgs
->target
=(char *)target
;
633 pArgs
->offsets
=offsets
;
637 * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
638 * If a change is made in the original function, then either
639 * change this function the same way or
640 * re-copy the original function and remove the variables
641 * offsets, sourceIndex, and nextSourceIndex.
644 _Bocu1FromUnicode(UConverterFromUnicodeArgs
*pArgs
,
645 UErrorCode
*pErrorCode
) {
647 const UChar
*source
, *sourceLimit
;
649 int32_t targetCapacity
;
651 int32_t prev
, c
, diff
;
653 /* set up the local pointers */
654 cnv
=pArgs
->converter
;
655 source
=pArgs
->source
;
656 sourceLimit
=pArgs
->sourceLimit
;
657 target
=(uint8_t *)pArgs
->target
;
658 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
660 /* get the converter state from UConverter */
662 prev
=(int32_t)cnv
->fromUnicodeStatus
;
664 prev
=BOCU1_ASCII_PREV
;
667 /* conversion loop */
668 if(c
!=0 && targetCapacity
>0) {
673 /* fast loop for single-byte differences */
674 /* use only one loop counter variable, targetCapacity, not also source */
675 diff
=(int32_t)(sourceLimit
-source
);
676 if(targetCapacity
>diff
) {
679 while(targetCapacity
>0 && (c
=*source
)<0x3000) {
682 prev
=BOCU1_ASCII_PREV
;
684 *target
++=(uint8_t)c
;
687 if(DIFF_IS_SINGLE(diff
)) {
688 prev
=BOCU1_SIMPLE_PREV(c
);
689 *target
++=(uint8_t)PACK_SINGLE_DIFF(diff
);
697 /* restore real values */
698 targetCapacity
=(int32_t)((const uint8_t *)pArgs
->targetLimit
-target
);
700 /* regular loop for all cases */
701 while(source
<sourceLimit
) {
702 if(targetCapacity
>0) {
707 * ISO C0 control & space:
708 * Encode directly for MIME compatibility,
709 * and reset state except for space, to not disrupt compression.
712 prev
=BOCU1_ASCII_PREV
;
714 *target
++=(uint8_t)c
;
721 if(source
<sourceLimit
) {
722 /* test the following code unit */
724 if(U16_IS_TRAIL(trail
)) {
726 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
730 c
=-c
; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
736 * all other Unicode code points c==U+0021..U+10ffff
737 * are encoded with the difference c-prev
739 * a new prev is computed from c,
740 * placed in the middle of a 0x80-block (for most small scripts) or
741 * in the middle of the Unihan and Hangul blocks
742 * to statistically minimize the following difference
746 if(DIFF_IS_SINGLE(diff
)) {
747 *target
++=(uint8_t)PACK_SINGLE_DIFF(diff
);
752 } else if(DIFF_IS_DOUBLE(diff
) && 2<=targetCapacity
) {
753 /* optimize 2-byte case */
757 diff
-=BOCU1_REACH_POS_1
+1;
758 m
=diff%BOCU
1_TRAIL_COUNT
;
759 diff
/=BOCU1_TRAIL_COUNT
;
760 diff
+=BOCU1_START_POS_2
;
762 diff
-=BOCU1_REACH_NEG_1
;
763 NEGDIVMOD(diff
, BOCU1_TRAIL_COUNT
, m
);
764 diff
+=BOCU1_START_NEG_2
;
766 *target
++=(uint8_t)diff
;
767 *target
++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m
);
770 int32_t length
; /* will be 2..4 */
773 length
=BOCU1_LENGTH_FROM_PACKED(diff
);
775 /* write the output character bytes from diff and length */
776 /* from the first if in the loop we know that targetCapacity>0 */
777 if(length
<=targetCapacity
) {
779 /* each branch falls through to the next one */
781 *target
++=(uint8_t)(diff
>>24);
782 case 3: /*fall through*/
783 *target
++=(uint8_t)(diff
>>16);
784 /* case 2: handled above */
785 *target
++=(uint8_t)(diff
>>8);
786 /* case 1: handled above */
787 *target
++=(uint8_t)diff
;
789 /* will never occur */
792 targetCapacity
-=length
;
794 uint8_t *charErrorBuffer
;
797 * We actually do this backwards here:
798 * In order to save an intermediate variable, we output
799 * first to the overflow buffer what does not fit into the
802 /* we know that 1<=targetCapacity<length<=4 */
803 length
-=targetCapacity
;
804 charErrorBuffer
=(uint8_t *)cnv
->charErrorBuffer
;
806 /* each branch falls through to the next one */
808 *charErrorBuffer
++=(uint8_t)(diff
>>16);
809 case 2: /*fall through*/
810 *charErrorBuffer
++=(uint8_t)(diff
>>8);
811 case 1: /*fall through*/
812 *charErrorBuffer
=(uint8_t)diff
;
814 /* will never occur */
817 cnv
->charErrorBufferLength
=(int8_t)length
;
819 /* now output what fits into the regular target */
820 diff
>>=8*length
; /* length was reduced by targetCapacity */
821 switch(targetCapacity
) {
822 /* each branch falls through to the next one */
824 *target
++=(uint8_t)(diff
>>16);
825 case 2: /*fall through*/
826 *target
++=(uint8_t)(diff
>>8);
827 case 1: /*fall through*/
828 *target
++=(uint8_t)diff
;
830 /* will never occur */
834 /* target overflow */
836 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
842 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
847 /* set the converter state back into UConverter */
848 cnv
->fromUChar32
= c
<0 ? -c
: 0;
849 cnv
->fromUnicodeStatus
=(uint32_t)prev
;
851 /* write back the updated pointers */
852 pArgs
->source
=source
;
853 pArgs
->target
=(char *)target
;
856 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
859 * Function for BOCU-1 decoder; handles multi-byte lead bytes.
861 * @param b lead byte;
862 * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
863 * @return (diff<<2)|count
865 static inline int32_t
866 decodeBocu1LeadByte(int32_t b
) {
869 if(b
>=BOCU1_START_NEG_2
) {
870 /* positive difference */
871 if(b
<BOCU1_START_POS_3
) {
873 diff
=((int32_t)b
-BOCU1_START_POS_2
)*BOCU1_TRAIL_COUNT
+BOCU1_REACH_POS_1
+1;
875 } else if(b
<BOCU1_START_POS_4
) {
877 diff
=((int32_t)b
-BOCU1_START_POS_3
)*BOCU1_TRAIL_COUNT
*BOCU1_TRAIL_COUNT
+BOCU1_REACH_POS_2
+1;
881 diff
=BOCU1_REACH_POS_3
+1;
885 /* negative difference */
886 if(b
>=BOCU1_START_NEG_3
) {
888 diff
=((int32_t)b
-BOCU1_START_NEG_2
)*BOCU1_TRAIL_COUNT
+BOCU1_REACH_NEG_1
;
890 } else if(b
>BOCU1_MIN
) {
892 diff
=((int32_t)b
-BOCU1_START_NEG_3
)*BOCU1_TRAIL_COUNT
*BOCU1_TRAIL_COUNT
+BOCU1_REACH_NEG_2
;
896 diff
=-BOCU1_TRAIL_COUNT
*BOCU1_TRAIL_COUNT
*BOCU1_TRAIL_COUNT
+BOCU1_REACH_NEG_3
;
901 /* return the state for decoding the trail byte(s) */
902 return (diff
<<2)|count
;
906 * Function for BOCU-1 decoder; handles multi-byte trail bytes.
908 * @param count number of remaining trail bytes including this one
909 * @param b trail byte
910 * @return new delta for diff including b - <0 indicates an error
914 static inline int32_t
915 decodeBocu1TrailByte(int32_t count
, int32_t b
) {
917 /* skip some C0 controls and make the trail byte range contiguous */
918 b
=bocu1ByteToTrail
[b
];
919 /* b<0 for an illegal trail byte value will result in return<0 below */
920 #if BOCU1_MAX_TRAIL<0xff
921 } else if(b
>BOCU1_MAX_TRAIL
) {
925 b
-=BOCU1_TRAIL_BYTE_OFFSET
;
928 /* add trail byte into difference and decrement count */
931 } else if(count
==2) {
932 return b
*BOCU1_TRAIL_COUNT
;
933 } else /* count==3 */ {
934 return b
*(BOCU1_TRAIL_COUNT
*BOCU1_TRAIL_COUNT
);
939 _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
940 UErrorCode
*pErrorCode
) {
942 const uint8_t *source
, *sourceLimit
;
944 const UChar
*targetLimit
;
947 int32_t prev
, count
, diff
, c
;
952 int32_t sourceIndex
, nextSourceIndex
;
954 /* set up the local pointers */
955 cnv
=pArgs
->converter
;
956 source
=(const uint8_t *)pArgs
->source
;
957 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
958 target
=pArgs
->target
;
959 targetLimit
=pArgs
->targetLimit
;
960 offsets
=pArgs
->offsets
;
962 /* get the converter state from UConverter */
963 prev
=(int32_t)cnv
->toUnicodeStatus
;
965 prev
=BOCU1_ASCII_PREV
;
967 diff
=cnv
->mode
; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
971 byteIndex
=cnv
->toULength
;
974 /* sourceIndex=-1 if the current character began in the previous buffer */
975 sourceIndex
=byteIndex
==0 ? 0 : -1;
978 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
979 if(count
>0 && byteIndex
>0 && target
<targetLimit
) {
984 /* fast loop for single-byte differences */
985 /* use count as the only loop counter variable */
986 diff
=(int32_t)(sourceLimit
-source
);
987 count
=(int32_t)(pArgs
->targetLimit
-target
);
992 if(BOCU1_START_NEG_2
<=(c
=*source
) && c
<BOCU1_START_POS_2
) {
993 c
=prev
+(c
-BOCU1_MIDDLE
);
996 *offsets
++=nextSourceIndex
++;
997 prev
=BOCU1_SIMPLE_PREV(c
);
1001 } else if(c
<=0x20) {
1003 prev
=BOCU1_ASCII_PREV
;
1006 *offsets
++=nextSourceIndex
++;
1013 sourceIndex
=nextSourceIndex
; /* wrong if offsets==NULL but does not matter */
1015 /* decode a sequence of single and lead bytes */
1016 while(source
<sourceLimit
) {
1017 if(target
>=targetLimit
) {
1018 /* target is full */
1019 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1025 if(BOCU1_START_NEG_2
<=c
&& c
<BOCU1_START_POS_2
) {
1026 /* Write a code point directly from a single-byte difference. */
1027 c
=prev
+(c
-BOCU1_MIDDLE
);
1030 *offsets
++=sourceIndex
;
1031 prev
=BOCU1_SIMPLE_PREV(c
);
1032 sourceIndex
=nextSourceIndex
;
1035 } else if(c
<=0x20) {
1037 * Direct-encoded C0 control code or space.
1038 * Reset prev for C0 control codes but not for space.
1041 prev
=BOCU1_ASCII_PREV
;
1044 *offsets
++=sourceIndex
;
1045 sourceIndex
=nextSourceIndex
;
1047 } else if(BOCU1_START_NEG_3
<=c
&& c
<BOCU1_START_POS_3
&& source
<sourceLimit
) {
1048 /* Optimize two-byte case. */
1049 if(c
>=BOCU1_MIDDLE
) {
1050 diff
=((int32_t)c
-BOCU1_START_POS_2
)*BOCU1_TRAIL_COUNT
+BOCU1_REACH_POS_1
+1;
1052 diff
=((int32_t)c
-BOCU1_START_NEG_2
)*BOCU1_TRAIL_COUNT
+BOCU1_REACH_NEG_1
;
1057 c
=decodeBocu1TrailByte(1, *source
++);
1058 if(c
<0 || (uint32_t)(c
=prev
+diff
+c
)>0x10ffff) {
1059 bytes
[0]=source
[-2];
1060 bytes
[1]=source
[-1];
1062 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1065 } else if(c
==BOCU1_RESET
) {
1066 /* only reset the state, no code point */
1067 prev
=BOCU1_ASCII_PREV
;
1068 sourceIndex
=nextSourceIndex
;
1072 * For multi-byte difference lead bytes, set the decoder state
1073 * with the partial difference value from the lead byte and
1074 * with the number of trail bytes.
1076 bytes
[0]=(uint8_t)c
;
1079 diff
=decodeBocu1LeadByte(c
);
1084 if(source
>=sourceLimit
) {
1088 c
=bytes
[byteIndex
++]=*source
++;
1090 /* trail byte in any position */
1091 c
=decodeBocu1TrailByte(count
, c
);
1093 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1099 /* final trail byte, deliver a code point */
1102 if((uint32_t)c
>0x10ffff) {
1103 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1111 /* calculate the next prev and output c */
1115 *offsets
++=sourceIndex
;
1117 /* output surrogate pair */
1118 *target
++=U16_LEAD(c
);
1119 if(target
<targetLimit
) {
1120 *target
++=U16_TRAIL(c
);
1121 *offsets
++=sourceIndex
;
1122 *offsets
++=sourceIndex
;
1124 /* target overflow */
1125 *offsets
++=sourceIndex
;
1126 cnv
->UCharErrorBuffer
[0]=U16_TRAIL(c
);
1127 cnv
->UCharErrorBufferLength
=1;
1128 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1132 sourceIndex
=nextSourceIndex
;
1136 if(*pErrorCode
==U_ILLEGAL_CHAR_FOUND
) {
1137 /* set the converter state in UConverter to deal with the next character */
1138 cnv
->toUnicodeStatus
=BOCU1_ASCII_PREV
;
1141 /* set the converter state back into UConverter */
1142 cnv
->toUnicodeStatus
=(uint32_t)prev
;
1143 cnv
->mode
=(diff
<<2)|count
;
1145 cnv
->toULength
=byteIndex
;
1147 /* write back the updated pointers */
1148 pArgs
->source
=(const char *)source
;
1149 pArgs
->target
=target
;
1150 pArgs
->offsets
=offsets
;
1155 * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
1156 * If a change is made in the original function, then either
1157 * change this function the same way or
1158 * re-copy the original function and remove the variables
1159 * offsets, sourceIndex, and nextSourceIndex.
1162 _Bocu1ToUnicode(UConverterToUnicodeArgs
*pArgs
,
1163 UErrorCode
*pErrorCode
) {
1165 const uint8_t *source
, *sourceLimit
;
1167 const UChar
*targetLimit
;
1169 int32_t prev
, count
, diff
, c
;
1176 /* set up the local pointers */
1177 cnv
=pArgs
->converter
;
1178 source
=(const uint8_t *)pArgs
->source
;
1179 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
1180 target
=pArgs
->target
;
1181 targetLimit
=pArgs
->targetLimit
;
1183 /* get the converter state from UConverter */
1184 prev
=(int32_t)cnv
->toUnicodeStatus
;
1186 prev
=BOCU1_ASCII_PREV
;
1188 diff
=cnv
->mode
; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
1192 byteIndex
=cnv
->toULength
;
1193 bytes
=cnv
->toUBytes
;
1195 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
1196 if(count
>0 && byteIndex
>0 && target
<targetLimit
) {
1201 /* fast loop for single-byte differences */
1202 /* use count as the only loop counter variable */
1203 diff
=(int32_t)(sourceLimit
-source
);
1204 count
=(int32_t)(pArgs
->targetLimit
-target
);
1209 if(BOCU1_START_NEG_2
<=(c
=*source
) && c
<BOCU1_START_POS_2
) {
1210 c
=prev
+(c
-BOCU1_MIDDLE
);
1213 prev
=BOCU1_SIMPLE_PREV(c
);
1217 } else if(c
<=0x20) {
1219 prev
=BOCU1_ASCII_PREV
;
1229 /* decode a sequence of single and lead bytes */
1230 while(source
<sourceLimit
) {
1231 if(target
>=targetLimit
) {
1232 /* target is full */
1233 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1238 if(BOCU1_START_NEG_2
<=c
&& c
<BOCU1_START_POS_2
) {
1239 /* Write a code point directly from a single-byte difference. */
1240 c
=prev
+(c
-BOCU1_MIDDLE
);
1243 prev
=BOCU1_SIMPLE_PREV(c
);
1246 } else if(c
<=0x20) {
1248 * Direct-encoded C0 control code or space.
1249 * Reset prev for C0 control codes but not for space.
1252 prev
=BOCU1_ASCII_PREV
;
1256 } else if(BOCU1_START_NEG_3
<=c
&& c
<BOCU1_START_POS_3
&& source
<sourceLimit
) {
1257 /* Optimize two-byte case. */
1258 if(c
>=BOCU1_MIDDLE
) {
1259 diff
=((int32_t)c
-BOCU1_START_POS_2
)*BOCU1_TRAIL_COUNT
+BOCU1_REACH_POS_1
+1;
1261 diff
=((int32_t)c
-BOCU1_START_NEG_2
)*BOCU1_TRAIL_COUNT
+BOCU1_REACH_NEG_1
;
1265 c
=decodeBocu1TrailByte(1, *source
++);
1266 if(c
<0 || (uint32_t)(c
=prev
+diff
+c
)>0x10ffff) {
1267 bytes
[0]=source
[-2];
1268 bytes
[1]=source
[-1];
1270 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1273 } else if(c
==BOCU1_RESET
) {
1274 /* only reset the state, no code point */
1275 prev
=BOCU1_ASCII_PREV
;
1279 * For multi-byte difference lead bytes, set the decoder state
1280 * with the partial difference value from the lead byte and
1281 * with the number of trail bytes.
1283 bytes
[0]=(uint8_t)c
;
1286 diff
=decodeBocu1LeadByte(c
);
1291 if(source
>=sourceLimit
) {
1294 c
=bytes
[byteIndex
++]=*source
++;
1296 /* trail byte in any position */
1297 c
=decodeBocu1TrailByte(count
, c
);
1299 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1305 /* final trail byte, deliver a code point */
1308 if((uint32_t)c
>0x10ffff) {
1309 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1317 /* calculate the next prev and output c */
1322 /* output surrogate pair */
1323 *target
++=U16_LEAD(c
);
1324 if(target
<targetLimit
) {
1325 *target
++=U16_TRAIL(c
);
1327 /* target overflow */
1328 cnv
->UCharErrorBuffer
[0]=U16_TRAIL(c
);
1329 cnv
->UCharErrorBufferLength
=1;
1330 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1337 if(*pErrorCode
==U_ILLEGAL_CHAR_FOUND
) {
1338 /* set the converter state in UConverter to deal with the next character */
1339 cnv
->toUnicodeStatus
=BOCU1_ASCII_PREV
;
1342 /* set the converter state back into UConverter */
1343 cnv
->toUnicodeStatus
=(uint32_t)prev
;
1344 cnv
->mode
=(diff
<<2)|count
;
1346 cnv
->toULength
=byteIndex
;
1348 /* write back the updated pointers */
1349 pArgs
->source
=(const char *)source
;
1350 pArgs
->target
=target
;
1354 /* miscellaneous ------------------------------------------------------------ */
1356 static const UConverterImpl _Bocu1Impl
={
1367 _Bocu1ToUnicodeWithOffsets
,
1369 _Bocu1FromUnicodeWithOffsets
,
1376 ucnv_getCompleteUnicodeSet
,
1382 static const UConverterStaticData _Bocu1StaticData
={
1383 sizeof(UConverterStaticData
),
1385 1214, /* CCSID for BOCU-1 */
1386 UCNV_IBM
, UCNV_BOCU1
,
1387 1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
1388 { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
1392 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1395 const UConverterSharedData _Bocu1Data
={
1396 sizeof(UConverterSharedData
), ~((uint32_t)0),
1397 NULL
, NULL
, &_Bocu1StaticData
, FALSE
, &_Bocu1Impl
,
1399 UCNV_MBCS_TABLE_INITIALIZER