2 ******************************************************************************
4 * Copyright (C) 2002-2016, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
8 * file name: ucnvbocu.cpp
10 * tab size: 8 (not used)
13 * created on: 2002mar27
14 * created by: Markus W. Scherer
16 * This is an implementation of the Binary Ordered Compression for Unicode,
17 * in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
20 #include "unicode/utypes.h"
22 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
24 #include "unicode/ucnv.h"
25 #include "unicode/ucnv_cb.h"
26 #include "unicode/utf16.h"
32 /* BOCU-1 constants and macros ---------------------------------------------- */
35 * BOCU-1 encodes the code points of a Unicode string as
36 * a sequence of byte-encoded differences (slope detection),
37 * preserving lexical order.
39 * Optimize the difference-taking for runs of Unicode text within small scripts:
42 * Most small scripts are allocated within aligned 128-blocks of Unicode
43 * code points. Lexical order is preserved if the "previous code point" state
44 * is always moved into the middle of such a block.
46 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
47 * areas into the middle of those areas.
49 * C0 control codes and space are encoded with their US-ASCII bytes.
50 * "prev" is reset for C0 controls but not for space.
/* Starting value of the "prev" state: the middle of the ASCII range. */
#define BOCU1_ASCII_PREV            0x40

/* Byte values that bound the encoding of differences. */
#define BOCU1_MIN                   0x21
#define BOCU1_MIDDLE                0x90
#define BOCU1_MAX_LEAD              0xfe
#define BOCU1_MAX_TRAIL             0xff
#define BOCU1_RESET                 0xff

/* How many distinct lead byte values there are. */
#define BOCU1_COUNT                 (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/* Some C0 control byte values double as trail bytes; account for them. */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* How many distinct trail byte values there are. */
#define BOCU1_TRAIL_COUNT           ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * Count of single-byte codes on the positive side and on the negative side
 * (the zero difference, encoded as BOCU1_MIDDLE, counts as positive).
 */
#define BOCU1_SINGLE                64

/* Lead bytes available, per sign, for 2-, 3- and 4-byte sequences. */
#define BOCU1_LEAD_2                43
#define BOCU1_LEAD_3                3
#define BOCU1_LEAD_4                1

/* Largest and smallest difference that fits a single byte. */
#define BOCU1_REACH_POS_1           (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1           (-BOCU1_SINGLE)

/* Largest and smallest difference that fits two bytes. */
#define BOCU1_REACH_POS_2           (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2           (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* Largest and smallest difference that fits three bytes. */
#define BOCU1_REACH_POS_3           (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_3           (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/* First lead byte of each positive multi-byte range. */
#define BOCU1_START_POS_2           (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3           (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4           (BOCU1_START_POS_3+BOCU1_LEAD_3)
     /* BOCU1_START_POS_4==BOCU1_MAX_LEAD */

/* Lower lead byte boundary of the single-byte and negative multi-byte ranges. */
#define BOCU1_START_NEG_2           (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3           (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4           (BOCU1_START_NEG_3-BOCU1_LEAD_3)
/*
 * Number of bytes (1..4) in a sequence, derived from its lead byte.
 * The lead byte must not be BOCU1_RESET.
 * The argument is evaluated more than once.
 */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    (((lead)>=BOCU1_START_NEG_2 && (lead)<BOCU1_START_POS_2) ? 1 : \
     ((lead)>=BOCU1_START_NEG_3 && (lead)<BOCU1_START_POS_3) ? 2 : \
     ((lead)>=BOCU1_START_NEG_4 && (lead)<BOCU1_START_POS_4) ? 3 : 4)
/*
 * Number of bytes in a sequence, derived from its packed form:
 * a packed value below 0x04000000 holds the length in its top byte,
 * any other value is a 4-byte sequence.
 * The argument is evaluated more than once.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)>=0x04000000 ? 4 : (packed)>>24)
120 * 12 commonly used C0 control codes (and space) are only used to encode
121 * themselves directly,
122 * which makes BOCU-1 MIME-usable and reasonably safe for
123 * ASCII-oriented software.
143 * The other 20 C0 controls are also encoded directly (to preserve order)
144 * but are also used as trail bytes in difference encoding
145 * (for better compression).
/*
 * Convert a trail value from the difference calculation to its external byte:
 * values below BOCU1_TRAIL_CONTROLS_COUNT are looked up in bocu1TrailToByte[],
 * all other values are shifted up by BOCU1_TRAIL_BYTE_OFFSET.
 * The argument is evaluated more than once.
 */
#define BOCU1_TRAIL_TO_BYTE(t) \
    ((t)<BOCU1_TRAIL_CONTROLS_COUNT ? bocu1TrailToByte[t] : (t)+BOCU1_TRAIL_BYTE_OFFSET)
150 * Byte value map for control codes,
151 * from external byte values 0x00..0x20
152 * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
153 * External byte values that are illegal as trail bytes are mapped to -1.
156 bocu1ByteToTrail
[BOCU1_MIN
]={
157 /* 0 1 2 3 4 5 6 7 */
158 -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
160 /* 8 9 a b c d e f */
161 -1, -1, -1, -1, -1, -1, -1, -1,
163 /* 10 11 12 13 14 15 16 17 */
164 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
166 /* 18 19 1a 1b 1c 1d 1e 1f */
167 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13,
174 * Byte value map for control codes,
175 * from trail byte values 0..19 (0..0x13) as used in the difference calculation
176 * to external byte values 0x00..0x20.
179 bocu1TrailToByte
[BOCU1_TRAIL_CONTROLS_COUNT
]={
180 /* 0 1 2 3 4 5 6 7 */
181 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
183 /* 8 9 a b c d e f */
184 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
187 0x1c, 0x1d, 0x1e, 0x1f
191 * Integer division and modulo with negative numerators
192 * yield negative modulo results and quotients that are one more than needed here.
194 * This macro adjusts the results so that the modulo-value m is always >=0.
196 * For positive n, the if() condition is always FALSE.
198 * @param n Number to be split into quotient and rest.
199 * Will be modified to contain the quotient.
201 * @param m Output variable for the rest (modulo result).
203 #define NEGDIVMOD(n, d, m) { \
212 /* Faster versions of packDiff() for single-byte-encoded diff values. */
/** True if diff lies within the single-byte range BOCU1_REACH_NEG_1..BOCU1_REACH_POS_1. */
#define DIFF_IS_SINGLE(diff) ((diff)>=BOCU1_REACH_NEG_1 && (diff)<=BOCU1_REACH_POS_1)

/** The single byte for a diff value: BOCU1_MIDDLE offset by diff. */
#define PACK_SINGLE_DIFF(diff) ((diff)+BOCU1_MIDDLE)

/** True if diff lies within the two-byte range BOCU1_REACH_NEG_2..BOCU1_REACH_POS_2. */
#define DIFF_IS_DOUBLE(diff) ((diff)>=BOCU1_REACH_NEG_2 && (diff)<=BOCU1_REACH_POS_2)
223 /* BOCU-1 implementation functions ------------------------------------------ */
/*
 * "prev" value for most scripts: clear the low 7 bits of c (start of its
 * 128-aligned block) and add BOCU1_ASCII_PREV.
 */
#define BOCU1_SIMPLE_PREV(c) (BOCU1_ASCII_PREV+((c)&~0x7f))
228 * Compute the next "previous" value for differencing
229 * from the current code point.
231 * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
232 * @return "previous code point" state value
234 static inline int32_t
235 bocu1Prev(int32_t c
) {
236 /* compute new prev */
237 if(/* 0x3040<=c && */ c
<=0x309f) {
238 /* Hiragana is not 128-aligned */
240 } else if(0x4e00<=c
&& c
<=0x9fa5) {
242 return 0x4e00-BOCU1_REACH_NEG_2
;
243 } else if(0xac00<=c
/* && c<=0xd7a3 */) {
245 return (0xd7a3+0xac00)/2;
247 /* mostly small scripts */
248 return BOCU1_SIMPLE_PREV(c
);
/**
 * Next "prev" value for code point c.
 * Code points in 0x3040..0xd7a3 are delegated to bocu1Prev();
 * everything else uses BOCU1_SIMPLE_PREV() directly.
 * The argument is evaluated more than once.
 */
#define BOCU1_PREV(c) ((c)>=0x3040 && (c)<=0xd7a3 ? bocu1Prev(c) : BOCU1_SIMPLE_PREV(c))
256 * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
257 * The UConverter fields are used as follows:
259 * fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
261 * toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
262 * mode decoder's incomplete (diff<<2)|count (ignored when toULength==0)
265 /* BOCU-1-from-Unicode conversion functions --------------------------------- */
268 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
269 * and return a packed integer with them.
271 * The encoding favors small absolute differences with short encodings
272 * to compress runs of same-script characters.
274 * Optimized version with unrolled loops and fewer division and modulo operations
275 * than the standard packDiff().
277 * @param diff difference value -0x10ffff..0x10ffff
279 * 0x010000zz for 1-byte sequence zz
280 * 0x0200yyzz for 2-byte sequence yy zz
281 * 0x03xxyyzz for 3-byte sequence xx yy zz
282 * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
285 packDiff(int32_t diff
) {
288 U_ASSERT(!DIFF_IS_SINGLE(diff
)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */
289 if(diff
>=BOCU1_REACH_NEG_1
) {
290 /* mostly positive differences, and single-byte negative ones */
291 #if 0 /* single-byte case handled in macros, see below */
292 if(diff
<=BOCU1_REACH_POS_1
) {
294 return 0x01000000|(BOCU1_MIDDLE
+diff
);
297 if(diff
<=BOCU1_REACH_POS_2
) {
299 diff
-=BOCU1_REACH_POS_1
+1;
302 m
=diff%BOCU
1_TRAIL_COUNT
;
303 diff
/=BOCU1_TRAIL_COUNT
;
304 result
|=BOCU1_TRAIL_TO_BYTE(m
);
306 result
|=(BOCU1_START_POS_2
+diff
)<<8;
307 } else if(diff
<=BOCU1_REACH_POS_3
) {
309 diff
-=BOCU1_REACH_POS_2
+1;
312 m
=diff%BOCU
1_TRAIL_COUNT
;
313 diff
/=BOCU1_TRAIL_COUNT
;
314 result
|=BOCU1_TRAIL_TO_BYTE(m
);
316 m
=diff%BOCU
1_TRAIL_COUNT
;
317 diff
/=BOCU1_TRAIL_COUNT
;
318 result
|=BOCU1_TRAIL_TO_BYTE(m
)<<8;
320 result
|=(BOCU1_START_POS_3
+diff
)<<16;
323 diff
-=BOCU1_REACH_POS_3
+1;
325 m
=diff%BOCU
1_TRAIL_COUNT
;
326 diff
/=BOCU1_TRAIL_COUNT
;
327 result
=BOCU1_TRAIL_TO_BYTE(m
);
329 m
=diff%BOCU
1_TRAIL_COUNT
;
330 diff
/=BOCU1_TRAIL_COUNT
;
331 result
|=BOCU1_TRAIL_TO_BYTE(m
)<<8;
334 * We know that / and % would deliver quotient 0 and rest=diff.
335 * Avoid division and modulo for performance.
337 result
|=BOCU1_TRAIL_TO_BYTE(diff
)<<16;
339 result
|=((uint32_t)BOCU1_START_POS_4
)<<24;
342 /* two- to four-byte negative differences */
343 if(diff
>=BOCU1_REACH_NEG_2
) {
345 diff
-=BOCU1_REACH_NEG_1
;
348 NEGDIVMOD(diff
, BOCU1_TRAIL_COUNT
, m
);
349 result
|=BOCU1_TRAIL_TO_BYTE(m
);
351 result
|=(BOCU1_START_NEG_2
+diff
)<<8;
352 } else if(diff
>=BOCU1_REACH_NEG_3
) {
354 diff
-=BOCU1_REACH_NEG_2
;
357 NEGDIVMOD(diff
, BOCU1_TRAIL_COUNT
, m
);
358 result
|=BOCU1_TRAIL_TO_BYTE(m
);
360 NEGDIVMOD(diff
, BOCU1_TRAIL_COUNT
, m
);
361 result
|=BOCU1_TRAIL_TO_BYTE(m
)<<8;
363 result
|=(BOCU1_START_NEG_3
+diff
)<<16;
366 diff
-=BOCU1_REACH_NEG_3
;
368 NEGDIVMOD(diff
, BOCU1_TRAIL_COUNT
, m
);
369 result
=BOCU1_TRAIL_TO_BYTE(m
);
371 NEGDIVMOD(diff
, BOCU1_TRAIL_COUNT
, m
);
372 result
|=BOCU1_TRAIL_TO_BYTE(m
)<<8;
375 * We know that NEGDIVMOD would deliver
376 * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
377 * Avoid division and modulo for performance.
379 m
=diff
+BOCU1_TRAIL_COUNT
;
380 result
|=BOCU1_TRAIL_TO_BYTE(m
)<<16;
382 result
|=BOCU1_MIN
<<24;
390 _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
391 UErrorCode
*pErrorCode
) {
393 const UChar
*source
, *sourceLimit
;
395 int32_t targetCapacity
;
398 int32_t prev
, c
, diff
;
400 int32_t sourceIndex
, nextSourceIndex
;
402 /* set up the local pointers */
403 cnv
=pArgs
->converter
;
404 source
=pArgs
->source
;
405 sourceLimit
=pArgs
->sourceLimit
;
406 target
=(uint8_t *)pArgs
->target
;
407 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
408 offsets
=pArgs
->offsets
;
410 /* get the converter state from UConverter */
412 prev
=(int32_t)cnv
->fromUnicodeStatus
;
414 prev
=BOCU1_ASCII_PREV
;
417 /* sourceIndex=-1 if the current character began in the previous buffer */
418 sourceIndex
= c
==0 ? 0 : -1;
421 /* conversion loop */
422 if(c
!=0 && targetCapacity
>0) {
427 /* fast loop for single-byte differences */
428 /* use only one loop counter variable, targetCapacity, not also source */
429 diff
=(int32_t)(sourceLimit
-source
);
430 if(targetCapacity
>diff
) {
433 while(targetCapacity
>0 && (c
=*source
)<0x3000) {
436 prev
=BOCU1_ASCII_PREV
;
438 *target
++=(uint8_t)c
;
439 *offsets
++=nextSourceIndex
++;
444 if(DIFF_IS_SINGLE(diff
)) {
445 prev
=BOCU1_SIMPLE_PREV(c
);
446 *target
++=(uint8_t)PACK_SINGLE_DIFF(diff
);
447 *offsets
++=nextSourceIndex
++;
455 /* restore real values */
456 targetCapacity
=(int32_t)((const uint8_t *)pArgs
->targetLimit
-target
);
457 sourceIndex
=nextSourceIndex
; /* wrong if offsets==NULL but does not matter */
459 /* regular loop for all cases */
460 while(source
<sourceLimit
) {
461 if(targetCapacity
>0) {
467 * ISO C0 control & space:
468 * Encode directly for MIME compatibility,
469 * and reset state except for space, to not disrupt compression.
472 prev
=BOCU1_ASCII_PREV
;
474 *target
++=(uint8_t)c
;
475 *offsets
++=sourceIndex
;
478 sourceIndex
=nextSourceIndex
;
484 if(source
<sourceLimit
) {
485 /* test the following code unit */
487 if(U16_IS_TRAIL(trail
)) {
490 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
494 c
=-c
; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
500 * all other Unicode code points c==U+0021..U+10ffff
501 * are encoded with the difference c-prev
503 * a new prev is computed from c,
504 * placed in the middle of a 0x80-block (for most small scripts) or
505 * in the middle of the Unihan and Hangul blocks
506 * to statistically minimize the following difference
510 if(DIFF_IS_SINGLE(diff
)) {
511 *target
++=(uint8_t)PACK_SINGLE_DIFF(diff
);
512 *offsets
++=sourceIndex
;
514 sourceIndex
=nextSourceIndex
;
518 } else if(DIFF_IS_DOUBLE(diff
) && 2<=targetCapacity
) {
519 /* optimize 2-byte case */
523 diff
-=BOCU1_REACH_POS_1
+1;
524 m
=diff%BOCU
1_TRAIL_COUNT
;
525 diff
/=BOCU1_TRAIL_COUNT
;
526 diff
+=BOCU1_START_POS_2
;
528 diff
-=BOCU1_REACH_NEG_1
;
529 NEGDIVMOD(diff
, BOCU1_TRAIL_COUNT
, m
);
530 diff
+=BOCU1_START_NEG_2
;
532 *target
++=(uint8_t)diff
;
533 *target
++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m
);
534 *offsets
++=sourceIndex
;
535 *offsets
++=sourceIndex
;
537 sourceIndex
=nextSourceIndex
;
539 int32_t length
; /* will be 2..4 */
542 length
=BOCU1_LENGTH_FROM_PACKED(diff
);
544 /* write the output character bytes from diff and length */
545 /* from the first if in the loop we know that targetCapacity>0 */
546 if(length
<=targetCapacity
) {
548 /* each branch falls through to the next one */
550 *target
++=(uint8_t)(diff
>>24);
551 *offsets
++=sourceIndex
;
554 *target
++=(uint8_t)(diff
>>16);
555 *offsets
++=sourceIndex
;
558 *target
++=(uint8_t)(diff
>>8);
559 *offsets
++=sourceIndex
;
560 /* case 1: handled above */
561 *target
++=(uint8_t)diff
;
562 *offsets
++=sourceIndex
;
565 /* will never occur */
568 targetCapacity
-=length
;
569 sourceIndex
=nextSourceIndex
;
571 uint8_t *charErrorBuffer
;
574 * We actually do this backwards here:
575 * In order to save an intermediate variable, we output
576 * first to the overflow buffer what does not fit into the regular target.
579 /* we know that 1<=targetCapacity<length<=4 */
580 length
-=targetCapacity
;
581 charErrorBuffer
=(uint8_t *)cnv
->charErrorBuffer
;
583 /* each branch falls through to the next one */
585 *charErrorBuffer
++=(uint8_t)(diff
>>16);
588 *charErrorBuffer
++=(uint8_t)(diff
>>8);
591 *charErrorBuffer
=(uint8_t)diff
;
594 /* will never occur */
597 cnv
->charErrorBufferLength
=(int8_t)length
;
599 /* now output what fits into the regular target */
600 diff
>>=8*length
; /* length was reduced by targetCapacity */
601 switch(targetCapacity
) {
602 /* each branch falls through to the next one */
604 *target
++=(uint8_t)(diff
>>16);
605 *offsets
++=sourceIndex
;
608 *target
++=(uint8_t)(diff
>>8);
609 *offsets
++=sourceIndex
;
612 *target
++=(uint8_t)diff
;
613 *offsets
++=sourceIndex
;
616 /* will never occur */
620 /* target overflow */
622 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
628 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
633 /* set the converter state back into UConverter */
634 cnv
->fromUChar32
= c
<0 ? -c
: 0;
635 cnv
->fromUnicodeStatus
=(uint32_t)prev
;
637 /* write back the updated pointers */
638 pArgs
->source
=source
;
639 pArgs
->target
=(char *)target
;
640 pArgs
->offsets
=offsets
;
644 * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
645 * If a change is made in the original function, then either
646 * change this function the same way or
647 * re-copy the original function and remove the variables
648 * offsets, sourceIndex, and nextSourceIndex.
651 _Bocu1FromUnicode(UConverterFromUnicodeArgs
*pArgs
,
652 UErrorCode
*pErrorCode
) {
654 const UChar
*source
, *sourceLimit
;
656 int32_t targetCapacity
;
658 int32_t prev
, c
, diff
;
660 /* set up the local pointers */
661 cnv
=pArgs
->converter
;
662 source
=pArgs
->source
;
663 sourceLimit
=pArgs
->sourceLimit
;
664 target
=(uint8_t *)pArgs
->target
;
665 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
667 /* get the converter state from UConverter */
669 prev
=(int32_t)cnv
->fromUnicodeStatus
;
671 prev
=BOCU1_ASCII_PREV
;
674 /* conversion loop */
675 if(c
!=0 && targetCapacity
>0) {
680 /* fast loop for single-byte differences */
681 /* use only one loop counter variable, targetCapacity, not also source */
682 diff
=(int32_t)(sourceLimit
-source
);
683 if(targetCapacity
>diff
) {
686 while(targetCapacity
>0 && (c
=*source
)<0x3000) {
689 prev
=BOCU1_ASCII_PREV
;
691 *target
++=(uint8_t)c
;
694 if(DIFF_IS_SINGLE(diff
)) {
695 prev
=BOCU1_SIMPLE_PREV(c
);
696 *target
++=(uint8_t)PACK_SINGLE_DIFF(diff
);
704 /* restore real values */
705 targetCapacity
=(int32_t)((const uint8_t *)pArgs
->targetLimit
-target
);
707 /* regular loop for all cases */
708 while(source
<sourceLimit
) {
709 if(targetCapacity
>0) {
714 * ISO C0 control & space:
715 * Encode directly for MIME compatibility,
716 * and reset state except for space, to not disrupt compression.
719 prev
=BOCU1_ASCII_PREV
;
721 *target
++=(uint8_t)c
;
728 if(source
<sourceLimit
) {
729 /* test the following code unit */
731 if(U16_IS_TRAIL(trail
)) {
733 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
737 c
=-c
; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
743 * all other Unicode code points c==U+0021..U+10ffff
744 * are encoded with the difference c-prev
746 * a new prev is computed from c,
747 * placed in the middle of a 0x80-block (for most small scripts) or
748 * in the middle of the Unihan and Hangul blocks
749 * to statistically minimize the following difference
753 if(DIFF_IS_SINGLE(diff
)) {
754 *target
++=(uint8_t)PACK_SINGLE_DIFF(diff
);
759 } else if(DIFF_IS_DOUBLE(diff
) && 2<=targetCapacity
) {
760 /* optimize 2-byte case */
764 diff
-=BOCU1_REACH_POS_1
+1;
765 m
=diff%BOCU
1_TRAIL_COUNT
;
766 diff
/=BOCU1_TRAIL_COUNT
;
767 diff
+=BOCU1_START_POS_2
;
769 diff
-=BOCU1_REACH_NEG_1
;
770 NEGDIVMOD(diff
, BOCU1_TRAIL_COUNT
, m
);
771 diff
+=BOCU1_START_NEG_2
;
773 *target
++=(uint8_t)diff
;
774 *target
++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m
);
777 int32_t length
; /* will be 2..4 */
780 length
=BOCU1_LENGTH_FROM_PACKED(diff
);
782 /* write the output character bytes from diff and length */
783 /* from the first if in the loop we know that targetCapacity>0 */
784 if(length
<=targetCapacity
) {
786 /* each branch falls through to the next one */
788 *target
++=(uint8_t)(diff
>>24);
791 *target
++=(uint8_t)(diff
>>16);
792 /* case 2: handled above */
793 *target
++=(uint8_t)(diff
>>8);
794 /* case 1: handled above */
795 *target
++=(uint8_t)diff
;
798 /* will never occur */
801 targetCapacity
-=length
;
803 uint8_t *charErrorBuffer
;
806 * We actually do this backwards here:
807 * In order to save an intermediate variable, we output
808 * first to the overflow buffer what does not fit into the regular target.
811 /* we know that 1<=targetCapacity<length<=4 */
812 length
-=targetCapacity
;
813 charErrorBuffer
=(uint8_t *)cnv
->charErrorBuffer
;
815 /* each branch falls through to the next one */
817 *charErrorBuffer
++=(uint8_t)(diff
>>16);
820 *charErrorBuffer
++=(uint8_t)(diff
>>8);
823 *charErrorBuffer
=(uint8_t)diff
;
826 /* will never occur */
829 cnv
->charErrorBufferLength
=(int8_t)length
;
831 /* now output what fits into the regular target */
832 diff
>>=8*length
; /* length was reduced by targetCapacity */
833 switch(targetCapacity
) {
834 /* each branch falls through to the next one */
836 *target
++=(uint8_t)(diff
>>16);
839 *target
++=(uint8_t)(diff
>>8);
842 *target
++=(uint8_t)diff
;
845 /* will never occur */
849 /* target overflow */
851 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
857 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
862 /* set the converter state back into UConverter */
863 cnv
->fromUChar32
= c
<0 ? -c
: 0;
864 cnv
->fromUnicodeStatus
=(uint32_t)prev
;
866 /* write back the updated pointers */
867 pArgs
->source
=source
;
868 pArgs
->target
=(char *)target
;
871 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
874 * Function for BOCU-1 decoder; handles multi-byte lead bytes.
876 * @param b lead byte;
877 * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
878 * @return (diff<<2)|count
880 static inline int32_t
881 decodeBocu1LeadByte(int32_t b
) {
884 if(b
>=BOCU1_START_NEG_2
) {
885 /* positive difference */
886 if(b
<BOCU1_START_POS_3
) {
888 diff
=((int32_t)b
-BOCU1_START_POS_2
)*BOCU1_TRAIL_COUNT
+BOCU1_REACH_POS_1
+1;
890 } else if(b
<BOCU1_START_POS_4
) {
892 diff
=((int32_t)b
-BOCU1_START_POS_3
)*BOCU1_TRAIL_COUNT
*BOCU1_TRAIL_COUNT
+BOCU1_REACH_POS_2
+1;
896 diff
=BOCU1_REACH_POS_3
+1;
900 /* negative difference */
901 if(b
>=BOCU1_START_NEG_3
) {
903 diff
=((int32_t)b
-BOCU1_START_NEG_2
)*BOCU1_TRAIL_COUNT
+BOCU1_REACH_NEG_1
;
905 } else if(b
>BOCU1_MIN
) {
907 diff
=((int32_t)b
-BOCU1_START_NEG_3
)*BOCU1_TRAIL_COUNT
*BOCU1_TRAIL_COUNT
+BOCU1_REACH_NEG_2
;
911 diff
=-BOCU1_TRAIL_COUNT
*BOCU1_TRAIL_COUNT
*BOCU1_TRAIL_COUNT
+BOCU1_REACH_NEG_3
;
916 /* return the state for decoding the trail byte(s) */
917 return (diff
<<2)|count
;
921 * Function for BOCU-1 decoder; handles multi-byte trail bytes.
923 * @param count number of remaining trail bytes including this one
924 * @param b trail byte
925 * @return new delta for diff including b - <0 indicates an error
929 static inline int32_t
930 decodeBocu1TrailByte(int32_t count
, int32_t b
) {
932 /* skip some C0 controls and make the trail byte range contiguous */
933 b
=bocu1ByteToTrail
[b
];
934 /* b<0 for an illegal trail byte value will result in return<0 below */
935 #if BOCU1_MAX_TRAIL<0xff
936 } else if(b
>BOCU1_MAX_TRAIL
) {
940 b
-=BOCU1_TRAIL_BYTE_OFFSET
;
943 /* add trail byte into difference and decrement count */
946 } else if(count
==2) {
947 return b
*BOCU1_TRAIL_COUNT
;
948 } else /* count==3 */ {
949 return b
*(BOCU1_TRAIL_COUNT
*BOCU1_TRAIL_COUNT
);
954 _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
955 UErrorCode
*pErrorCode
) {
957 const uint8_t *source
, *sourceLimit
;
959 const UChar
*targetLimit
;
962 int32_t prev
, count
, diff
, c
;
967 int32_t sourceIndex
, nextSourceIndex
;
969 /* set up the local pointers */
970 cnv
=pArgs
->converter
;
971 source
=(const uint8_t *)pArgs
->source
;
972 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
973 target
=pArgs
->target
;
974 targetLimit
=pArgs
->targetLimit
;
975 offsets
=pArgs
->offsets
;
977 /* get the converter state from UConverter */
978 prev
=(int32_t)cnv
->toUnicodeStatus
;
980 prev
=BOCU1_ASCII_PREV
;
982 diff
=cnv
->mode
; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
986 byteIndex
=cnv
->toULength
;
989 /* sourceIndex=-1 if the current character began in the previous buffer */
990 sourceIndex
=byteIndex
==0 ? 0 : -1;
993 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
994 if(count
>0 && byteIndex
>0 && target
<targetLimit
) {
999 /* fast loop for single-byte differences */
1000 /* use count as the only loop counter variable */
1001 diff
=(int32_t)(sourceLimit
-source
);
1002 count
=(int32_t)(pArgs
->targetLimit
-target
);
1007 if(BOCU1_START_NEG_2
<=(c
=*source
) && c
<BOCU1_START_POS_2
) {
1008 c
=prev
+(c
-BOCU1_MIDDLE
);
1011 *offsets
++=nextSourceIndex
++;
1012 prev
=BOCU1_SIMPLE_PREV(c
);
1016 } else if(c
<=0x20) {
1018 prev
=BOCU1_ASCII_PREV
;
1021 *offsets
++=nextSourceIndex
++;
1028 sourceIndex
=nextSourceIndex
; /* wrong if offsets==NULL but does not matter */
1030 /* decode a sequence of single and lead bytes */
1031 while(source
<sourceLimit
) {
1032 if(target
>=targetLimit
) {
1033 /* target is full */
1034 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1040 if(BOCU1_START_NEG_2
<=c
&& c
<BOCU1_START_POS_2
) {
1041 /* Write a code point directly from a single-byte difference. */
1042 c
=prev
+(c
-BOCU1_MIDDLE
);
1045 *offsets
++=sourceIndex
;
1046 prev
=BOCU1_SIMPLE_PREV(c
);
1047 sourceIndex
=nextSourceIndex
;
1050 } else if(c
<=0x20) {
1052 * Direct-encoded C0 control code or space.
1053 * Reset prev for C0 control codes but not for space.
1056 prev
=BOCU1_ASCII_PREV
;
1059 *offsets
++=sourceIndex
;
1060 sourceIndex
=nextSourceIndex
;
1062 } else if(BOCU1_START_NEG_3
<=c
&& c
<BOCU1_START_POS_3
&& source
<sourceLimit
) {
1063 /* Optimize two-byte case. */
1064 if(c
>=BOCU1_MIDDLE
) {
1065 diff
=((int32_t)c
-BOCU1_START_POS_2
)*BOCU1_TRAIL_COUNT
+BOCU1_REACH_POS_1
+1;
1067 diff
=((int32_t)c
-BOCU1_START_NEG_2
)*BOCU1_TRAIL_COUNT
+BOCU1_REACH_NEG_1
;
1072 c
=decodeBocu1TrailByte(1, *source
++);
1073 if(c
<0 || (uint32_t)(c
=prev
+diff
+c
)>0x10ffff) {
1074 bytes
[0]=source
[-2];
1075 bytes
[1]=source
[-1];
1077 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1080 } else if(c
==BOCU1_RESET
) {
1081 /* only reset the state, no code point */
1082 prev
=BOCU1_ASCII_PREV
;
1083 sourceIndex
=nextSourceIndex
;
1087 * For multi-byte difference lead bytes, set the decoder state
1088 * with the partial difference value from the lead byte and
1089 * with the number of trail bytes.
1091 bytes
[0]=(uint8_t)c
;
1094 diff
=decodeBocu1LeadByte(c
);
1099 if(source
>=sourceLimit
) {
1103 c
=bytes
[byteIndex
++]=*source
++;
1105 /* trail byte in any position */
1106 c
=decodeBocu1TrailByte(count
, c
);
1108 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1114 /* final trail byte, deliver a code point */
1117 if((uint32_t)c
>0x10ffff) {
1118 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1126 /* calculate the next prev and output c */
1130 *offsets
++=sourceIndex
;
1132 /* output surrogate pair */
1133 *target
++=U16_LEAD(c
);
1134 if(target
<targetLimit
) {
1135 *target
++=U16_TRAIL(c
);
1136 *offsets
++=sourceIndex
;
1137 *offsets
++=sourceIndex
;
1139 /* target overflow */
1140 *offsets
++=sourceIndex
;
1141 cnv
->UCharErrorBuffer
[0]=U16_TRAIL(c
);
1142 cnv
->UCharErrorBufferLength
=1;
1143 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1147 sourceIndex
=nextSourceIndex
;
1151 if(*pErrorCode
==U_ILLEGAL_CHAR_FOUND
) {
1152 /* set the converter state in UConverter to deal with the next character */
1153 cnv
->toUnicodeStatus
=BOCU1_ASCII_PREV
;
1156 /* set the converter state back into UConverter */
1157 cnv
->toUnicodeStatus
=(uint32_t)prev
;
1158 cnv
->mode
=(diff
<<2)|count
;
1160 cnv
->toULength
=byteIndex
;
1162 /* write back the updated pointers */
1163 pArgs
->source
=(const char *)source
;
1164 pArgs
->target
=target
;
1165 pArgs
->offsets
=offsets
;
1170 * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
1171 * If a change is made in the original function, then either
1172 * change this function the same way or
1173 * re-copy the original function and remove the variables
1174 * offsets, sourceIndex, and nextSourceIndex.
1177 _Bocu1ToUnicode(UConverterToUnicodeArgs
*pArgs
,
1178 UErrorCode
*pErrorCode
) {
1180 const uint8_t *source
, *sourceLimit
;
1182 const UChar
*targetLimit
;
1184 int32_t prev
, count
, diff
, c
;
1189 /* set up the local pointers */
1190 cnv
=pArgs
->converter
;
1191 source
=(const uint8_t *)pArgs
->source
;
1192 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
1193 target
=pArgs
->target
;
1194 targetLimit
=pArgs
->targetLimit
;
1196 /* get the converter state from UConverter */
1197 prev
=(int32_t)cnv
->toUnicodeStatus
;
1199 prev
=BOCU1_ASCII_PREV
;
1201 diff
=cnv
->mode
; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
1205 byteIndex
=cnv
->toULength
;
1206 bytes
=cnv
->toUBytes
;
1208 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
1209 if(count
>0 && byteIndex
>0 && target
<targetLimit
) {
1214 /* fast loop for single-byte differences */
1215 /* use count as the only loop counter variable */
1216 diff
=(int32_t)(sourceLimit
-source
);
1217 count
=(int32_t)(pArgs
->targetLimit
-target
);
1222 if(BOCU1_START_NEG_2
<=(c
=*source
) && c
<BOCU1_START_POS_2
) {
1223 c
=prev
+(c
-BOCU1_MIDDLE
);
1226 prev
=BOCU1_SIMPLE_PREV(c
);
1230 } else if(c
<=0x20) {
1232 prev
=BOCU1_ASCII_PREV
;
1242 /* decode a sequence of single and lead bytes */
1243 while(source
<sourceLimit
) {
1244 if(target
>=targetLimit
) {
1245 /* target is full */
1246 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1251 if(BOCU1_START_NEG_2
<=c
&& c
<BOCU1_START_POS_2
) {
1252 /* Write a code point directly from a single-byte difference. */
1253 c
=prev
+(c
-BOCU1_MIDDLE
);
1256 prev
=BOCU1_SIMPLE_PREV(c
);
1259 } else if(c
<=0x20) {
1261 * Direct-encoded C0 control code or space.
1262 * Reset prev for C0 control codes but not for space.
1265 prev
=BOCU1_ASCII_PREV
;
1269 } else if(BOCU1_START_NEG_3
<=c
&& c
<BOCU1_START_POS_3
&& source
<sourceLimit
) {
1270 /* Optimize two-byte case. */
1271 if(c
>=BOCU1_MIDDLE
) {
1272 diff
=((int32_t)c
-BOCU1_START_POS_2
)*BOCU1_TRAIL_COUNT
+BOCU1_REACH_POS_1
+1;
1274 diff
=((int32_t)c
-BOCU1_START_NEG_2
)*BOCU1_TRAIL_COUNT
+BOCU1_REACH_NEG_1
;
1278 c
=decodeBocu1TrailByte(1, *source
++);
1279 if(c
<0 || (uint32_t)(c
=prev
+diff
+c
)>0x10ffff) {
1280 bytes
[0]=source
[-2];
1281 bytes
[1]=source
[-1];
1283 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1286 } else if(c
==BOCU1_RESET
) {
1287 /* only reset the state, no code point */
1288 prev
=BOCU1_ASCII_PREV
;
1292 * For multi-byte difference lead bytes, set the decoder state
1293 * with the partial difference value from the lead byte and
1294 * with the number of trail bytes.
1296 bytes
[0]=(uint8_t)c
;
1299 diff
=decodeBocu1LeadByte(c
);
1304 if(source
>=sourceLimit
) {
1307 c
=bytes
[byteIndex
++]=*source
++;
1309 /* trail byte in any position */
1310 c
=decodeBocu1TrailByte(count
, c
);
1312 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1318 /* final trail byte, deliver a code point */
1321 if((uint32_t)c
>0x10ffff) {
1322 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1330 /* calculate the next prev and output c */
1335 /* output surrogate pair */
1336 *target
++=U16_LEAD(c
);
1337 if(target
<targetLimit
) {
1338 *target
++=U16_TRAIL(c
);
1340 /* target overflow */
1341 cnv
->UCharErrorBuffer
[0]=U16_TRAIL(c
);
1342 cnv
->UCharErrorBufferLength
=1;
1343 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1350 if(*pErrorCode
==U_ILLEGAL_CHAR_FOUND
) {
1351 /* set the converter state in UConverter to deal with the next character */
1352 cnv
->toUnicodeStatus
=BOCU1_ASCII_PREV
;
1355 /* set the converter state back into UConverter */
1356 cnv
->toUnicodeStatus
=(uint32_t)prev
;
1357 cnv
->mode
=(diff
<<2)|count
;
1359 cnv
->toULength
=byteIndex
;
1361 /* write back the updated pointers */
1362 pArgs
->source
=(const char *)source
;
1363 pArgs
->target
=target
;
1367 /* miscellaneous ------------------------------------------------------------ */
1369 static const UConverterImpl _Bocu1Impl
={
1380 _Bocu1ToUnicodeWithOffsets
,
1382 _Bocu1FromUnicodeWithOffsets
,
1389 ucnv_getCompleteUnicodeSet
,
1395 static const UConverterStaticData _Bocu1StaticData
={
1396 sizeof(UConverterStaticData
),
1398 1214, /* CCSID for BOCU-1 */
1399 UCNV_IBM
, UCNV_BOCU1
,
1400 1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
1401 { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
1405 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1408 const UConverterSharedData _Bocu1Data
=
1409 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Bocu1StaticData
, &_Bocu1Impl
);