1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*   Copyright (C) 2002-2016, International Business Machines
*   Corporation and others.  All Rights Reserved.
******************************************************************************
*   file name:  ucnvbocu.cpp
*   tab size:   8 (not used)
*   created on: 2002mar27
*   created by: Markus W. Scherer
*
*   This is an implementation of the Binary Ordered Compression for Unicode,
*   in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
*/
22 #include "unicode/utypes.h"
24 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
26 #include "unicode/ucnv.h"
27 #include "unicode/ucnv_cb.h"
28 #include "unicode/utf16.h"
34 /* BOCU-1 constants and macros ---------------------------------------------- */
/*
 * BOCU-1 encodes the code points of a Unicode string as
 * a sequence of byte-encoded differences (slope detection),
 * preserving lexical order.
 *
 * Optimize the difference-taking for runs of Unicode text within
 * small scripts:
 *
 * Most small scripts are allocated within aligned 128-blocks of Unicode
 * code points. Lexical order is preserved if the "previous code point" state
 * is always moved into the middle of such a block.
 *
 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
 * areas into the middle of those areas.
 *
 * C0 control codes and space are encoded with their US-ASCII bytes.
 * "prev" is reset for C0 controls but not for space.
 */
/* initial value for "prev": middle of the ASCII range */
#define BOCU1_ASCII_PREV        0x40

/* bounding byte values for differences */
#define BOCU1_MIN               0x21
#define BOCU1_MIDDLE            0x90
#define BOCU1_MAX_LEAD          0xfe
#define BOCU1_MAX_TRAIL         0xff
#define BOCU1_RESET             0xff

/* number of lead bytes */
#define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/* adjust trail byte counts for the use of some C0 control byte values */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* number of trail bytes */
#define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * number of positive and negative single-byte codes
 * (counting 0==BOCU1_MIDDLE among the positive ones)
 */
#define BOCU1_SINGLE            64

/* number of lead bytes for positive and negative 2/3/4-byte sequences */
#define BOCU1_LEAD_2            43
#define BOCU1_LEAD_3            3
#define BOCU1_LEAD_4            1

/* The difference value range for single-byters. */
#define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)

/* The difference value range for double-byters. */
#define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* The difference value range for 3-byters. */
#define BOCU1_REACH_POS_3   \
    (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

#define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/* The lead byte start values. */
#define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
     /* ==BOCU1_MAX_LEAD */

#define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
     /* ==BOCU1_MIN+1 */

/* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

/*
 * The length of a byte sequence, according to its packed form.
 * Packed values below 0x04000000 carry the length in their top byte;
 * anything larger is a 4-byte sequence whose top byte is the lead byte.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)

/*
 * 12 commonly used C0 control codes (and space) are only used to encode
 * themselves directly,
 * which makes BOCU-1 MIME-usable and reasonably safe for
 * ASCII-oriented software.
 *
 * These are the byte values that bocu1ByteToTrail[] maps to -1:
 * 0x00, 0x07..0x0f, 0x1a, 0x1b, and 0x20 (space).
 *
 * The other 20 C0 controls are also encoded directly (to preserve order)
 * but are also used as trail bytes in difference encoding
 * (for better compression).
 */
#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])

/*
 * Byte value map for control codes,
 * from external byte values 0x00..0x20
 * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
 * External byte values that are illegal as trail bytes are mapped to -1.
 */
static const int8_t
bocu1ByteToTrail[BOCU1_MIN]={
/*  0     1     2     3     4     5     6     7    */
    -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,

/*  8     9     a     b     c     d     e     f    */
    -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,

/*  10    11    12    13    14    15    16    17   */
    0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,

/*  18    19    1a    1b    1c    1d    1e    1f   */
    0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,

/*  20 (space is never a trail byte) */
    -1
};

/*
 * Byte value map for control codes,
 * from trail byte values 0..19 (0..0x13) as used in the difference calculation
 * to external byte values 0x00..0x20.
 */
static const int8_t
bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
/*  0     1     2     3     4     5     6     7    */
    0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,

/*  8     9     a     b     c     d     e     f    */
    0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,

/*  10    11    12    13   */
    0x1c, 0x1d, 0x1e, 0x1f
};

/**
 * Integer division and modulo with negative numerators
 * yield negative modulo results and quotients that are one more than
 * what we need here.
 * This macro adjusts the results so that the modulo-value m is always >=0.
 *
 * For positive n, the if() condition is always false.
 *
 * Wrapped in do{}while(0) so that "NEGDIVMOD(...);" is a single statement
 * and stays safe inside an unbraced if/else.
 *
 * @param n Number to be split into quotient and rest.
 *          Will be modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the rest (modulo result).
 */
#define NEGDIVMOD(n, d, m) do { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        --(n); \
        (m)+=(d); \
    } \
} while(0)

/* Faster versions of packDiff() for single-byte-encoded diff values. */

/** Is a diff value encodable in a single byte? */
#define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)

/** Encode a diff value in a single byte. */
#define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))

/** Is a diff value encodable in two bytes? */
#define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)
225 /* BOCU-1 implementation functions ------------------------------------------ */
227 #define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
230 * Compute the next "previous" value for differencing
231 * from the current code point.
233 * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
234 * @return "previous code point" state value
236 static inline int32_t
237 bocu1Prev(int32_t c
) {
238 /* compute new prev */
239 if(/* 0x3040<=c && */ c
<=0x309f) {
240 /* Hiragana is not 128-aligned */
242 } else if(0x4e00<=c
&& c
<=0x9fa5) {
244 return 0x4e00-BOCU1_REACH_NEG_2
;
245 } else if(0xac00<=c
/* && c<=0xd7a3 */) {
247 return (0xd7a3+0xac00)/2;
249 /* mostly small scripts */
250 return BOCU1_SIMPLE_PREV(c
);
254 /** Fast version of bocu1Prev() for most scripts. */
255 #define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
/*
 * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
 * The UConverter fields are used as follows:
 *
 * fromUnicodeStatus    encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
 *
 * toUnicodeStatus      decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
 * mode                 decoder's incomplete (diff<<2)|count (ignored when toULength==0)
 */
267 /* BOCU-1-from-Unicode conversion functions --------------------------------- */
270 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
271 * and return a packed integer with them.
273 * The encoding favors small absolute differences with short encodings
274 * to compress runs of same-script characters.
276 * Optimized version with unrolled loops and fewer floating-point operations
277 * than the standard packDiff().
279 * @param diff difference value -0x10ffff..0x10ffff
281 * 0x010000zz for 1-byte sequence zz
282 * 0x0200yyzz for 2-byte sequence yy zz
283 * 0x03xxyyzz for 3-byte sequence xx yy zz
284 * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
287 packDiff(int32_t diff
) {
290 U_ASSERT(!DIFF_IS_SINGLE(diff
)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */
291 if(diff
>=BOCU1_REACH_NEG_1
) {
292 /* mostly positive differences, and single-byte negative ones */
293 #if 0 /* single-byte case handled in macros, see below */
294 if(diff
<=BOCU1_REACH_POS_1
) {
296 return 0x01000000|(BOCU1_MIDDLE
+diff
);
299 if(diff
<=BOCU1_REACH_POS_2
) {
301 diff
-=BOCU1_REACH_POS_1
+1;
304 m
=diff%BOCU
1_TRAIL_COUNT
;
305 diff
/=BOCU1_TRAIL_COUNT
;
306 result
|=BOCU1_TRAIL_TO_BYTE(m
);
308 result
|=(BOCU1_START_POS_2
+diff
)<<8;
309 } else if(diff
<=BOCU1_REACH_POS_3
) {
311 diff
-=BOCU1_REACH_POS_2
+1;
314 m
=diff%BOCU
1_TRAIL_COUNT
;
315 diff
/=BOCU1_TRAIL_COUNT
;
316 result
|=BOCU1_TRAIL_TO_BYTE(m
);
318 m
=diff%BOCU
1_TRAIL_COUNT
;
319 diff
/=BOCU1_TRAIL_COUNT
;
320 result
|=BOCU1_TRAIL_TO_BYTE(m
)<<8;
322 result
|=(BOCU1_START_POS_3
+diff
)<<16;
325 diff
-=BOCU1_REACH_POS_3
+1;
327 m
=diff%BOCU
1_TRAIL_COUNT
;
328 diff
/=BOCU1_TRAIL_COUNT
;
329 result
=BOCU1_TRAIL_TO_BYTE(m
);
331 m
=diff%BOCU
1_TRAIL_COUNT
;
332 diff
/=BOCU1_TRAIL_COUNT
;
333 result
|=BOCU1_TRAIL_TO_BYTE(m
)<<8;
336 * We know that / and % would deliver quotient 0 and rest=diff.
337 * Avoid division and modulo for performance.
339 result
|=BOCU1_TRAIL_TO_BYTE(diff
)<<16;
341 result
|=((uint32_t)BOCU1_START_POS_4
)<<24;
344 /* two- to four-byte negative differences */
345 if(diff
>=BOCU1_REACH_NEG_2
) {
347 diff
-=BOCU1_REACH_NEG_1
;
350 NEGDIVMOD(diff
, BOCU1_TRAIL_COUNT
, m
);
351 result
|=BOCU1_TRAIL_TO_BYTE(m
);
353 result
|=(BOCU1_START_NEG_2
+diff
)<<8;
354 } else if(diff
>=BOCU1_REACH_NEG_3
) {
356 diff
-=BOCU1_REACH_NEG_2
;
359 NEGDIVMOD(diff
, BOCU1_TRAIL_COUNT
, m
);
360 result
|=BOCU1_TRAIL_TO_BYTE(m
);
362 NEGDIVMOD(diff
, BOCU1_TRAIL_COUNT
, m
);
363 result
|=BOCU1_TRAIL_TO_BYTE(m
)<<8;
365 result
|=(BOCU1_START_NEG_3
+diff
)<<16;
368 diff
-=BOCU1_REACH_NEG_3
;
370 NEGDIVMOD(diff
, BOCU1_TRAIL_COUNT
, m
);
371 result
=BOCU1_TRAIL_TO_BYTE(m
);
373 NEGDIVMOD(diff
, BOCU1_TRAIL_COUNT
, m
);
374 result
|=BOCU1_TRAIL_TO_BYTE(m
)<<8;
377 * We know that NEGDIVMOD would deliver
378 * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
379 * Avoid division and modulo for performance.
381 m
=diff
+BOCU1_TRAIL_COUNT
;
382 result
|=BOCU1_TRAIL_TO_BYTE(m
)<<16;
384 result
|=BOCU1_MIN
<<24;
391 static void U_CALLCONV
392 _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
393 UErrorCode
*pErrorCode
) {
395 const UChar
*source
, *sourceLimit
;
397 int32_t targetCapacity
;
400 int32_t prev
, c
, diff
;
402 int32_t sourceIndex
, nextSourceIndex
;
404 /* set up the local pointers */
405 cnv
=pArgs
->converter
;
406 source
=pArgs
->source
;
407 sourceLimit
=pArgs
->sourceLimit
;
408 target
=(uint8_t *)pArgs
->target
;
409 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
410 offsets
=pArgs
->offsets
;
412 /* get the converter state from UConverter */
414 prev
=(int32_t)cnv
->fromUnicodeStatus
;
416 prev
=BOCU1_ASCII_PREV
;
419 /* sourceIndex=-1 if the current character began in the previous buffer */
420 sourceIndex
= c
==0 ? 0 : -1;
423 /* conversion loop */
424 if(c
!=0 && targetCapacity
>0) {
429 /* fast loop for single-byte differences */
430 /* use only one loop counter variable, targetCapacity, not also source */
431 diff
=(int32_t)(sourceLimit
-source
);
432 if(targetCapacity
>diff
) {
435 while(targetCapacity
>0 && (c
=*source
)<0x3000) {
438 prev
=BOCU1_ASCII_PREV
;
440 *target
++=(uint8_t)c
;
441 *offsets
++=nextSourceIndex
++;
446 if(DIFF_IS_SINGLE(diff
)) {
447 prev
=BOCU1_SIMPLE_PREV(c
);
448 *target
++=(uint8_t)PACK_SINGLE_DIFF(diff
);
449 *offsets
++=nextSourceIndex
++;
457 /* restore real values */
458 targetCapacity
=(int32_t)((const uint8_t *)pArgs
->targetLimit
-target
);
459 sourceIndex
=nextSourceIndex
; /* wrong if offsets==NULL but does not matter */
461 /* regular loop for all cases */
462 while(source
<sourceLimit
) {
463 if(targetCapacity
>0) {
469 * ISO C0 control & space:
470 * Encode directly for MIME compatibility,
471 * and reset state except for space, to not disrupt compression.
474 prev
=BOCU1_ASCII_PREV
;
476 *target
++=(uint8_t)c
;
477 *offsets
++=sourceIndex
;
480 sourceIndex
=nextSourceIndex
;
486 if(source
<sourceLimit
) {
487 /* test the following code unit */
489 if(U16_IS_TRAIL(trail
)) {
492 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
496 c
=-c
; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
502 * all other Unicode code points c==U+0021..U+10ffff
503 * are encoded with the difference c-prev
505 * a new prev is computed from c,
506 * placed in the middle of a 0x80-block (for most small scripts) or
507 * in the middle of the Unihan and Hangul blocks
508 * to statistically minimize the following difference
512 if(DIFF_IS_SINGLE(diff
)) {
513 *target
++=(uint8_t)PACK_SINGLE_DIFF(diff
);
514 *offsets
++=sourceIndex
;
516 sourceIndex
=nextSourceIndex
;
520 } else if(DIFF_IS_DOUBLE(diff
) && 2<=targetCapacity
) {
521 /* optimize 2-byte case */
525 diff
-=BOCU1_REACH_POS_1
+1;
526 m
=diff%BOCU
1_TRAIL_COUNT
;
527 diff
/=BOCU1_TRAIL_COUNT
;
528 diff
+=BOCU1_START_POS_2
;
530 diff
-=BOCU1_REACH_NEG_1
;
531 NEGDIVMOD(diff
, BOCU1_TRAIL_COUNT
, m
);
532 diff
+=BOCU1_START_NEG_2
;
534 *target
++=(uint8_t)diff
;
535 *target
++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m
);
536 *offsets
++=sourceIndex
;
537 *offsets
++=sourceIndex
;
539 sourceIndex
=nextSourceIndex
;
541 int32_t length
; /* will be 2..4 */
544 length
=BOCU1_LENGTH_FROM_PACKED(diff
);
546 /* write the output character bytes from diff and length */
547 /* from the first if in the loop we know that targetCapacity>0 */
548 if(length
<=targetCapacity
) {
550 /* each branch falls through to the next one */
552 *target
++=(uint8_t)(diff
>>24);
553 *offsets
++=sourceIndex
;
556 *target
++=(uint8_t)(diff
>>16);
557 *offsets
++=sourceIndex
;
560 *target
++=(uint8_t)(diff
>>8);
561 *offsets
++=sourceIndex
;
562 /* case 1: handled above */
563 *target
++=(uint8_t)diff
;
564 *offsets
++=sourceIndex
;
567 /* will never occur */
570 targetCapacity
-=length
;
571 sourceIndex
=nextSourceIndex
;
573 uint8_t *charErrorBuffer
;
576 * We actually do this backwards here:
577 * In order to save an intermediate variable, we output
578 * first to the overflow buffer what does not fit into the
581 /* we know that 1<=targetCapacity<length<=4 */
582 length
-=targetCapacity
;
583 charErrorBuffer
=(uint8_t *)cnv
->charErrorBuffer
;
585 /* each branch falls through to the next one */
587 *charErrorBuffer
++=(uint8_t)(diff
>>16);
590 *charErrorBuffer
++=(uint8_t)(diff
>>8);
593 *charErrorBuffer
=(uint8_t)diff
;
596 /* will never occur */
599 cnv
->charErrorBufferLength
=(int8_t)length
;
601 /* now output what fits into the regular target */
602 diff
>>=8*length
; /* length was reduced by targetCapacity */
603 switch(targetCapacity
) {
604 /* each branch falls through to the next one */
606 *target
++=(uint8_t)(diff
>>16);
607 *offsets
++=sourceIndex
;
610 *target
++=(uint8_t)(diff
>>8);
611 *offsets
++=sourceIndex
;
614 *target
++=(uint8_t)diff
;
615 *offsets
++=sourceIndex
;
618 /* will never occur */
622 /* target overflow */
624 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
630 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
635 /* set the converter state back into UConverter */
636 cnv
->fromUChar32
= c
<0 ? -c
: 0;
637 cnv
->fromUnicodeStatus
=(uint32_t)prev
;
639 /* write back the updated pointers */
640 pArgs
->source
=source
;
641 pArgs
->target
=(char *)target
;
642 pArgs
->offsets
=offsets
;
646 * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
647 * If a change is made in the original function, then either
648 * change this function the same way or
649 * re-copy the original function and remove the variables
650 * offsets, sourceIndex, and nextSourceIndex.
652 static void U_CALLCONV
653 _Bocu1FromUnicode(UConverterFromUnicodeArgs
*pArgs
,
654 UErrorCode
*pErrorCode
) {
656 const UChar
*source
, *sourceLimit
;
658 int32_t targetCapacity
;
660 int32_t prev
, c
, diff
;
662 /* set up the local pointers */
663 cnv
=pArgs
->converter
;
664 source
=pArgs
->source
;
665 sourceLimit
=pArgs
->sourceLimit
;
666 target
=(uint8_t *)pArgs
->target
;
667 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
669 /* get the converter state from UConverter */
671 prev
=(int32_t)cnv
->fromUnicodeStatus
;
673 prev
=BOCU1_ASCII_PREV
;
676 /* conversion loop */
677 if(c
!=0 && targetCapacity
>0) {
682 /* fast loop for single-byte differences */
683 /* use only one loop counter variable, targetCapacity, not also source */
684 diff
=(int32_t)(sourceLimit
-source
);
685 if(targetCapacity
>diff
) {
688 while(targetCapacity
>0 && (c
=*source
)<0x3000) {
691 prev
=BOCU1_ASCII_PREV
;
693 *target
++=(uint8_t)c
;
696 if(DIFF_IS_SINGLE(diff
)) {
697 prev
=BOCU1_SIMPLE_PREV(c
);
698 *target
++=(uint8_t)PACK_SINGLE_DIFF(diff
);
706 /* restore real values */
707 targetCapacity
=(int32_t)((const uint8_t *)pArgs
->targetLimit
-target
);
709 /* regular loop for all cases */
710 while(source
<sourceLimit
) {
711 if(targetCapacity
>0) {
716 * ISO C0 control & space:
717 * Encode directly for MIME compatibility,
718 * and reset state except for space, to not disrupt compression.
721 prev
=BOCU1_ASCII_PREV
;
723 *target
++=(uint8_t)c
;
730 if(source
<sourceLimit
) {
731 /* test the following code unit */
733 if(U16_IS_TRAIL(trail
)) {
735 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
739 c
=-c
; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
745 * all other Unicode code points c==U+0021..U+10ffff
746 * are encoded with the difference c-prev
748 * a new prev is computed from c,
749 * placed in the middle of a 0x80-block (for most small scripts) or
750 * in the middle of the Unihan and Hangul blocks
751 * to statistically minimize the following difference
755 if(DIFF_IS_SINGLE(diff
)) {
756 *target
++=(uint8_t)PACK_SINGLE_DIFF(diff
);
761 } else if(DIFF_IS_DOUBLE(diff
) && 2<=targetCapacity
) {
762 /* optimize 2-byte case */
766 diff
-=BOCU1_REACH_POS_1
+1;
767 m
=diff%BOCU
1_TRAIL_COUNT
;
768 diff
/=BOCU1_TRAIL_COUNT
;
769 diff
+=BOCU1_START_POS_2
;
771 diff
-=BOCU1_REACH_NEG_1
;
772 NEGDIVMOD(diff
, BOCU1_TRAIL_COUNT
, m
);
773 diff
+=BOCU1_START_NEG_2
;
775 *target
++=(uint8_t)diff
;
776 *target
++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m
);
779 int32_t length
; /* will be 2..4 */
782 length
=BOCU1_LENGTH_FROM_PACKED(diff
);
784 /* write the output character bytes from diff and length */
785 /* from the first if in the loop we know that targetCapacity>0 */
786 if(length
<=targetCapacity
) {
788 /* each branch falls through to the next one */
790 *target
++=(uint8_t)(diff
>>24);
793 *target
++=(uint8_t)(diff
>>16);
794 /* case 2: handled above */
795 *target
++=(uint8_t)(diff
>>8);
796 /* case 1: handled above */
797 *target
++=(uint8_t)diff
;
800 /* will never occur */
803 targetCapacity
-=length
;
805 uint8_t *charErrorBuffer
;
808 * We actually do this backwards here:
809 * In order to save an intermediate variable, we output
810 * first to the overflow buffer what does not fit into the
813 /* we know that 1<=targetCapacity<length<=4 */
814 length
-=targetCapacity
;
815 charErrorBuffer
=(uint8_t *)cnv
->charErrorBuffer
;
817 /* each branch falls through to the next one */
819 *charErrorBuffer
++=(uint8_t)(diff
>>16);
822 *charErrorBuffer
++=(uint8_t)(diff
>>8);
825 *charErrorBuffer
=(uint8_t)diff
;
828 /* will never occur */
831 cnv
->charErrorBufferLength
=(int8_t)length
;
833 /* now output what fits into the regular target */
834 diff
>>=8*length
; /* length was reduced by targetCapacity */
835 switch(targetCapacity
) {
836 /* each branch falls through to the next one */
838 *target
++=(uint8_t)(diff
>>16);
841 *target
++=(uint8_t)(diff
>>8);
844 *target
++=(uint8_t)diff
;
847 /* will never occur */
851 /* target overflow */
853 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
859 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
864 /* set the converter state back into UConverter */
865 cnv
->fromUChar32
= c
<0 ? -c
: 0;
866 cnv
->fromUnicodeStatus
=(uint32_t)prev
;
868 /* write back the updated pointers */
869 pArgs
->source
=source
;
870 pArgs
->target
=(char *)target
;
873 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
876 * Function for BOCU-1 decoder; handles multi-byte lead bytes.
878 * @param b lead byte;
879 * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
880 * @return (diff<<2)|count
882 static inline int32_t
883 decodeBocu1LeadByte(int32_t b
) {
886 if(b
>=BOCU1_START_NEG_2
) {
887 /* positive difference */
888 if(b
<BOCU1_START_POS_3
) {
890 diff
=((int32_t)b
-BOCU1_START_POS_2
)*BOCU1_TRAIL_COUNT
+BOCU1_REACH_POS_1
+1;
892 } else if(b
<BOCU1_START_POS_4
) {
894 diff
=((int32_t)b
-BOCU1_START_POS_3
)*BOCU1_TRAIL_COUNT
*BOCU1_TRAIL_COUNT
+BOCU1_REACH_POS_2
+1;
898 diff
=BOCU1_REACH_POS_3
+1;
902 /* negative difference */
903 if(b
>=BOCU1_START_NEG_3
) {
905 diff
=((int32_t)b
-BOCU1_START_NEG_2
)*BOCU1_TRAIL_COUNT
+BOCU1_REACH_NEG_1
;
907 } else if(b
>BOCU1_MIN
) {
909 diff
=((int32_t)b
-BOCU1_START_NEG_3
)*BOCU1_TRAIL_COUNT
*BOCU1_TRAIL_COUNT
+BOCU1_REACH_NEG_2
;
913 diff
=-BOCU1_TRAIL_COUNT
*BOCU1_TRAIL_COUNT
*BOCU1_TRAIL_COUNT
+BOCU1_REACH_NEG_3
;
918 /* return the state for decoding the trail byte(s) */
919 return (diff
<<2)|count
;
923 * Function for BOCU-1 decoder; handles multi-byte trail bytes.
925 * @param count number of remaining trail bytes including this one
926 * @param b trail byte
927 * @return new delta for diff including b - <0 indicates an error
931 static inline int32_t
932 decodeBocu1TrailByte(int32_t count
, int32_t b
) {
934 /* skip some C0 controls and make the trail byte range contiguous */
935 b
=bocu1ByteToTrail
[b
];
936 /* b<0 for an illegal trail byte value will result in return<0 below */
937 #if BOCU1_MAX_TRAIL<0xff
938 } else if(b
>BOCU1_MAX_TRAIL
) {
942 b
-=BOCU1_TRAIL_BYTE_OFFSET
;
945 /* add trail byte into difference and decrement count */
948 } else if(count
==2) {
949 return b
*BOCU1_TRAIL_COUNT
;
950 } else /* count==3 */ {
951 return b
*(BOCU1_TRAIL_COUNT
*BOCU1_TRAIL_COUNT
);
955 static void U_CALLCONV
956 _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
957 UErrorCode
*pErrorCode
) {
959 const uint8_t *source
, *sourceLimit
;
961 const UChar
*targetLimit
;
964 int32_t prev
, count
, diff
, c
;
969 int32_t sourceIndex
, nextSourceIndex
;
971 /* set up the local pointers */
972 cnv
=pArgs
->converter
;
973 source
=(const uint8_t *)pArgs
->source
;
974 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
975 target
=pArgs
->target
;
976 targetLimit
=pArgs
->targetLimit
;
977 offsets
=pArgs
->offsets
;
979 /* get the converter state from UConverter */
980 prev
=(int32_t)cnv
->toUnicodeStatus
;
982 prev
=BOCU1_ASCII_PREV
;
984 diff
=cnv
->mode
; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
988 byteIndex
=cnv
->toULength
;
991 /* sourceIndex=-1 if the current character began in the previous buffer */
992 sourceIndex
=byteIndex
==0 ? 0 : -1;
995 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
996 if(count
>0 && byteIndex
>0 && target
<targetLimit
) {
1001 /* fast loop for single-byte differences */
1002 /* use count as the only loop counter variable */
1003 diff
=(int32_t)(sourceLimit
-source
);
1004 count
=(int32_t)(pArgs
->targetLimit
-target
);
1009 if(BOCU1_START_NEG_2
<=(c
=*source
) && c
<BOCU1_START_POS_2
) {
1010 c
=prev
+(c
-BOCU1_MIDDLE
);
1013 *offsets
++=nextSourceIndex
++;
1014 prev
=BOCU1_SIMPLE_PREV(c
);
1018 } else if(c
<=0x20) {
1020 prev
=BOCU1_ASCII_PREV
;
1023 *offsets
++=nextSourceIndex
++;
1030 sourceIndex
=nextSourceIndex
; /* wrong if offsets==NULL but does not matter */
1032 /* decode a sequence of single and lead bytes */
1033 while(source
<sourceLimit
) {
1034 if(target
>=targetLimit
) {
1035 /* target is full */
1036 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1042 if(BOCU1_START_NEG_2
<=c
&& c
<BOCU1_START_POS_2
) {
1043 /* Write a code point directly from a single-byte difference. */
1044 c
=prev
+(c
-BOCU1_MIDDLE
);
1047 *offsets
++=sourceIndex
;
1048 prev
=BOCU1_SIMPLE_PREV(c
);
1049 sourceIndex
=nextSourceIndex
;
1052 } else if(c
<=0x20) {
1054 * Direct-encoded C0 control code or space.
1055 * Reset prev for C0 control codes but not for space.
1058 prev
=BOCU1_ASCII_PREV
;
1061 *offsets
++=sourceIndex
;
1062 sourceIndex
=nextSourceIndex
;
1064 } else if(BOCU1_START_NEG_3
<=c
&& c
<BOCU1_START_POS_3
&& source
<sourceLimit
) {
1065 /* Optimize two-byte case. */
1066 if(c
>=BOCU1_MIDDLE
) {
1067 diff
=((int32_t)c
-BOCU1_START_POS_2
)*BOCU1_TRAIL_COUNT
+BOCU1_REACH_POS_1
+1;
1069 diff
=((int32_t)c
-BOCU1_START_NEG_2
)*BOCU1_TRAIL_COUNT
+BOCU1_REACH_NEG_1
;
1074 c
=decodeBocu1TrailByte(1, *source
++);
1075 if(c
<0 || (uint32_t)(c
=prev
+diff
+c
)>0x10ffff) {
1076 bytes
[0]=source
[-2];
1077 bytes
[1]=source
[-1];
1079 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1082 } else if(c
==BOCU1_RESET
) {
1083 /* only reset the state, no code point */
1084 prev
=BOCU1_ASCII_PREV
;
1085 sourceIndex
=nextSourceIndex
;
1089 * For multi-byte difference lead bytes, set the decoder state
1090 * with the partial difference value from the lead byte and
1091 * with the number of trail bytes.
1093 bytes
[0]=(uint8_t)c
;
1096 diff
=decodeBocu1LeadByte(c
);
1101 if(source
>=sourceLimit
) {
1105 c
=bytes
[byteIndex
++]=*source
++;
1107 /* trail byte in any position */
1108 c
=decodeBocu1TrailByte(count
, c
);
1110 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1116 /* final trail byte, deliver a code point */
1119 if((uint32_t)c
>0x10ffff) {
1120 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1128 /* calculate the next prev and output c */
1132 *offsets
++=sourceIndex
;
1134 /* output surrogate pair */
1135 *target
++=U16_LEAD(c
);
1136 if(target
<targetLimit
) {
1137 *target
++=U16_TRAIL(c
);
1138 *offsets
++=sourceIndex
;
1139 *offsets
++=sourceIndex
;
1141 /* target overflow */
1142 *offsets
++=sourceIndex
;
1143 cnv
->UCharErrorBuffer
[0]=U16_TRAIL(c
);
1144 cnv
->UCharErrorBufferLength
=1;
1145 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1149 sourceIndex
=nextSourceIndex
;
1153 if(*pErrorCode
==U_ILLEGAL_CHAR_FOUND
) {
1154 /* set the converter state in UConverter to deal with the next character */
1155 cnv
->toUnicodeStatus
=BOCU1_ASCII_PREV
;
1158 /* set the converter state back into UConverter */
1159 cnv
->toUnicodeStatus
=(uint32_t)prev
;
1160 cnv
->mode
=(diff
<<2)|count
;
1162 cnv
->toULength
=byteIndex
;
1164 /* write back the updated pointers */
1165 pArgs
->source
=(const char *)source
;
1166 pArgs
->target
=target
;
1167 pArgs
->offsets
=offsets
;
1172 * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
1173 * If a change is made in the original function, then either
1174 * change this function the same way or
1175 * re-copy the original function and remove the variables
1176 * offsets, sourceIndex, and nextSourceIndex.
1178 static void U_CALLCONV
1179 _Bocu1ToUnicode(UConverterToUnicodeArgs
*pArgs
,
1180 UErrorCode
*pErrorCode
) {
1182 const uint8_t *source
, *sourceLimit
;
1184 const UChar
*targetLimit
;
1186 int32_t prev
, count
, diff
, c
;
1191 /* set up the local pointers */
1192 cnv
=pArgs
->converter
;
1193 source
=(const uint8_t *)pArgs
->source
;
1194 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
1195 target
=pArgs
->target
;
1196 targetLimit
=pArgs
->targetLimit
;
1198 /* get the converter state from UConverter */
1199 prev
=(int32_t)cnv
->toUnicodeStatus
;
1201 prev
=BOCU1_ASCII_PREV
;
1203 diff
=cnv
->mode
; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
1207 byteIndex
=cnv
->toULength
;
1208 bytes
=cnv
->toUBytes
;
1210 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
1211 if(count
>0 && byteIndex
>0 && target
<targetLimit
) {
1216 /* fast loop for single-byte differences */
1217 /* use count as the only loop counter variable */
1218 diff
=(int32_t)(sourceLimit
-source
);
1219 count
=(int32_t)(pArgs
->targetLimit
-target
);
1224 if(BOCU1_START_NEG_2
<=(c
=*source
) && c
<BOCU1_START_POS_2
) {
1225 c
=prev
+(c
-BOCU1_MIDDLE
);
1228 prev
=BOCU1_SIMPLE_PREV(c
);
1232 } else if(c
<=0x20) {
1234 prev
=BOCU1_ASCII_PREV
;
1244 /* decode a sequence of single and lead bytes */
1245 while(source
<sourceLimit
) {
1246 if(target
>=targetLimit
) {
1247 /* target is full */
1248 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1253 if(BOCU1_START_NEG_2
<=c
&& c
<BOCU1_START_POS_2
) {
1254 /* Write a code point directly from a single-byte difference. */
1255 c
=prev
+(c
-BOCU1_MIDDLE
);
1258 prev
=BOCU1_SIMPLE_PREV(c
);
1261 } else if(c
<=0x20) {
1263 * Direct-encoded C0 control code or space.
1264 * Reset prev for C0 control codes but not for space.
1267 prev
=BOCU1_ASCII_PREV
;
1271 } else if(BOCU1_START_NEG_3
<=c
&& c
<BOCU1_START_POS_3
&& source
<sourceLimit
) {
1272 /* Optimize two-byte case. */
1273 if(c
>=BOCU1_MIDDLE
) {
1274 diff
=((int32_t)c
-BOCU1_START_POS_2
)*BOCU1_TRAIL_COUNT
+BOCU1_REACH_POS_1
+1;
1276 diff
=((int32_t)c
-BOCU1_START_NEG_2
)*BOCU1_TRAIL_COUNT
+BOCU1_REACH_NEG_1
;
1280 c
=decodeBocu1TrailByte(1, *source
++);
1281 if(c
<0 || (uint32_t)(c
=prev
+diff
+c
)>0x10ffff) {
1282 bytes
[0]=source
[-2];
1283 bytes
[1]=source
[-1];
1285 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1288 } else if(c
==BOCU1_RESET
) {
1289 /* only reset the state, no code point */
1290 prev
=BOCU1_ASCII_PREV
;
1294 * For multi-byte difference lead bytes, set the decoder state
1295 * with the partial difference value from the lead byte and
1296 * with the number of trail bytes.
1298 bytes
[0]=(uint8_t)c
;
1301 diff
=decodeBocu1LeadByte(c
);
1306 if(source
>=sourceLimit
) {
1309 c
=bytes
[byteIndex
++]=*source
++;
1311 /* trail byte in any position */
1312 c
=decodeBocu1TrailByte(count
, c
);
1314 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1320 /* final trail byte, deliver a code point */
1323 if((uint32_t)c
>0x10ffff) {
1324 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1332 /* calculate the next prev and output c */
1337 /* output surrogate pair */
1338 *target
++=U16_LEAD(c
);
1339 if(target
<targetLimit
) {
1340 *target
++=U16_TRAIL(c
);
1342 /* target overflow */
1343 cnv
->UCharErrorBuffer
[0]=U16_TRAIL(c
);
1344 cnv
->UCharErrorBufferLength
=1;
1345 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1352 if(*pErrorCode
==U_ILLEGAL_CHAR_FOUND
) {
1353 /* set the converter state in UConverter to deal with the next character */
1354 cnv
->toUnicodeStatus
=BOCU1_ASCII_PREV
;
1357 /* set the converter state back into UConverter */
1358 cnv
->toUnicodeStatus
=(uint32_t)prev
;
1359 cnv
->mode
=(diff
<<2)|count
;
1361 cnv
->toULength
=byteIndex
;
1363 /* write back the updated pointers */
1364 pArgs
->source
=(const char *)source
;
1365 pArgs
->target
=target
;
1369 /* miscellaneous ------------------------------------------------------------ */
1371 static const UConverterImpl _Bocu1Impl
={
1382 _Bocu1ToUnicodeWithOffsets
,
1384 _Bocu1FromUnicodeWithOffsets
,
1391 ucnv_getCompleteUnicodeSet
,
1397 static const UConverterStaticData _Bocu1StaticData
={
1398 sizeof(UConverterStaticData
),
1400 1214, /* CCSID for BOCU-1 */
1401 UCNV_IBM
, UCNV_BOCU1
,
1402 1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
1403 { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
1407 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1410 const UConverterSharedData _Bocu1Data
=
1411 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Bocu1StaticData
, &_Bocu1Impl
);