2 ******************************************************************************
4 * Copyright (C) 2000-2016, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
8 * file name: ucnvscsu.c
10 * tab size: 8 (not used)
13 * created on: 2000nov18
14 * created by: Markus W. Scherer
16 * This is an implementation of the Standard Compression Scheme for Unicode
17 * as defined in http://www.unicode.org/unicode/reports/tr6/ .
18 * Reserved commands and window settings are treated as illegal sequences and
19 * will result in callback calls.
22 #include "unicode/utypes.h"
24 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
26 #include "unicode/ucnv.h"
27 #include "unicode/ucnv_cb.h"
28 #include "unicode/utf16.h"
33 /* SCSU definitions --------------------------------------------------------- */
35 /* SCSU command byte values */
37 SQ0
=0x01, /* Quote from window pair 0 */
38 SQ7
=0x08, /* Quote from window pair 7 */
39 SDX
=0x0B, /* Define a window as extended */
40 Srs
=0x0C, /* reserved */
41 SQU
=0x0E, /* Quote a single Unicode character */
42 SCU
=0x0F, /* Change to Unicode mode */
43 SC0
=0x10, /* Select window 0 */
44 SC7
=0x17, /* Select window 7 */
45 SD0
=0x18, /* Define and select window 0 */
46 SD7
=0x1F, /* Define and select window 7 */
48 UC0
=0xE0, /* Select window 0 */
49 UC7
=0xE7, /* Select window 7 */
50 UD0
=0xE8, /* Define and select window 0 */
51 UD7
=0xEF, /* Define and select window 7 */
52 UQU
=0xF0, /* Quote a single Unicode character */
53 UDX
=0xF1, /* Define a Window as extended */
54 Urs
=0xF2 /* reserved */
59 * Unicode code points from 3400 to E000 are not addressable by
60 * dynamic window, since in these areas no short run alphabets are
61 * found. Therefore add gapOffset to all values from gapThreshold.
66 /* values between reservedStart and fixedThreshold are reserved */
69 /* use table of predefined fixed offsets for values from fixedThreshold */
73 /* constant offsets for the 8 static windows */
74 static const uint32_t staticOffsets
[8]={
75 0x0000, /* ASCII for quoted tags */
76 0x0080, /* Latin - 1 Supplement (for access to punctuation) */
77 0x0100, /* Latin Extended-A */
78 0x0300, /* Combining Diacritical Marks */
79 0x2000, /* General Punctuation */
80 0x2080, /* Currency Symbols */
81 0x2100, /* Letterlike Symbols and Number Forms */
82 0x3000 /* CJK Symbols and punctuation */
85 /* initial offsets for the 8 dynamic (sliding) windows */
86 static const uint32_t initialDynamicOffsets
[8]={
88 0x00C0, /* Latin Extended A */
89 0x0400, /* Cyrillic */
91 0x0900, /* Devanagari */
92 0x3040, /* Hiragana */
93 0x30A0, /* Katakana */
94 0xFF00 /* Fullwidth ASCII */
97 /* Table of fixed predefined Offsets */
98 static const uint32_t fixedOffsets
[]={
99 /* 0xF9 */ 0x00C0, /* Latin-1 Letters + half of Latin Extended A */
100 /* 0xFA */ 0x0250, /* IPA extensions */
101 /* 0xFB */ 0x0370, /* Greek */
102 /* 0xFC */ 0x0530, /* Armenian */
103 /* 0xFD */ 0x3040, /* Hiragana */
104 /* 0xFE */ 0x30A0, /* Katakana */
105 /* 0xFF */ 0xFF60 /* Halfwidth Katakana */
119 typedef struct SCSUData
{
120 /* dynamic window offsets, initialized to default values from initialDynamicOffsets */
121 uint32_t toUDynamicOffsets
[8];
122 uint32_t fromUDynamicOffsets
[8];
124 /* state machine state - toUnicode */
125 UBool toUIsSingleByteMode
;
127 int8_t toUQuoteWindow
, toUDynamicWindow
;
129 uint8_t toUPadding
[3];
131 /* state machine state - fromUnicode */
132 UBool fromUIsSingleByteMode
;
133 int8_t fromUDynamicWindow
;
136 * windowUse[] keeps track of the use of the dynamic windows:
137 * At nextWindowUseIndex there is the least recently used window,
138 * and the following windows (in a wrapping manner) are more and more recently used.
140 * At nextWindowUseIndex-1 there is the most recently used window.
143 int8_t nextWindowUseIndex
;
147 static const int8_t initialWindowUse
[8]={ 7, 0, 3, 2, 4, 5, 6, 1 };
148 static const int8_t initialWindowUse_ja
[8]={ 3, 2, 4, 1, 0, 7, 5, 6 };
154 /* SCSU setup functions ----------------------------------------------------- */
/*
 * Reset the SCSU state kept in cnv->extraInfo (an SCSUData).
 * choice selects which direction(s) to reset: the first block restores the
 * toUnicode state, the second block restores the fromUnicode state.
 * NOTE(review): the first test uses "<=" on the enum value, so it relies on
 * the ordering of the UConverterResetChoice constants (presumably
 * UCNV_RESET_BOTH sorts below UCNV_RESET_TO_UNICODE) - confirm against the enum.
 */
157 _SCSUReset(UConverter
*cnv
, UConverterResetChoice choice
) {
158 SCSUData
*scsu
=(SCSUData
*)cnv
->extraInfo
;
160 if(choice
<=UCNV_RESET_TO_UNICODE
) {
161 /* reset toUnicode */
/* 32 bytes == the 8 uint32_t entries of initialDynamicOffsets[] */
162 uprv_memcpy(scsu
->toUDynamicOffsets
, initialDynamicOffsets
, 32);
/* decoding restarts in single-byte mode, expecting a command, with window 0 selected */
164 scsu
->toUIsSingleByteMode
=TRUE
;
165 scsu
->toUState
=readCommand
;
166 scsu
->toUQuoteWindow
=scsu
->toUDynamicWindow
=0;
171 if(choice
!=UCNV_RESET_TO_UNICODE
) {
172 /* reset fromUnicode */
/* 32 bytes == the 8 uint32_t entries of initialDynamicOffsets[] */
173 uprv_memcpy(scsu
->fromUDynamicOffsets
, initialDynamicOffsets
, 32);
/* encoding restarts in single-byte mode with dynamic window 0 selected */
175 scsu
->fromUIsSingleByteMode
=TRUE
;
176 scsu
->fromUDynamicWindow
=0;
/* restart the least-recently-used rotation at index 0 of windowUse[] */
178 scsu
->nextWindowUseIndex
=0;
/*
 * Pick the initial window-use order by locale; each copy is 8 bytes == 8 int8_t entries.
 * NOTE(review): the case labels are not visible here; presumably the _ja table
 * belongs to the l_ja locale and the other one is the default - confirm.
 */
179 switch(scsu
->locale
) {
181 uprv_memcpy(scsu
->windowUse
, initialWindowUse_ja
, 8);
184 uprv_memcpy(scsu
->windowUse
, initialWindowUse
, 8);
/*
 * Open an SCSU converter: allocate the SCSUData state block into
 * cnv->extraInfo, record the locale variant, and reset both conversion
 * directions. Sets *pErrorCode to U_MEMORY_ALLOCATION_ERROR on the
 * allocation-failure path.
 */
193 _SCSUOpen(UConverter
*cnv
,
194 UConverterLoadArgs
*pArgs
,
195 UErrorCode
*pErrorCode
) {
196 const char *locale
=pArgs
->locale
;
/* NOTE(review): the body of this branch is not visible in this excerpt - presumably an early return before anything is allocated; confirm */
197 if(pArgs
->onlyTestIsLoadable
) {
200 cnv
->extraInfo
=uprv_malloc(sizeof(SCSUData
));
201 if(cnv
->extraInfo
!=NULL
) {
/* matches a locale ID that is exactly "ja" or that starts with "ja_" */
202 if(locale
!=NULL
&& locale
[0]=='j' && locale
[1]=='a' && (locale
[2]==0 || locale
[2]=='_')) {
203 ((SCSUData
*)cnv
->extraInfo
)->locale
=l_ja
;
/* any other locale (including NULL) - presumably the else branch, whose keyword line is not shown */
205 ((SCSUData
*)cnv
->extraInfo
)->locale
=lGeneric
;
/* initialize both the toUnicode and the fromUnicode state; _SCSUReset reads the locale stored above */
207 _SCSUReset(cnv
, UCNV_RESET_BOTH
);
/* error code for a failed uprv_malloc() - presumably in the else of the NULL check above */
209 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
212 /* Set the substitution character U+fffd as a Unicode string. */
213 cnv
->subUChars
[0]=0xfffd;
/*
 * Close an SCSU converter: release the state block in cnv->extraInfo
 * (allocated with uprv_malloc() in _SCSUOpen).
 */
218 _SCSUClose(UConverter
*cnv
) {
219 if(cnv
->extraInfo
!=NULL
) {
/* the block is freed only when the converter is not flagged isExtraLocal */
220 if(!cnv
->isExtraLocal
) {
221 uprv_free(cnv
->extraInfo
);
227 /* SCSU-to-Unicode conversion functions ------------------------------------- */
230 _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
231 UErrorCode
*pErrorCode
) {
234 const uint8_t *source
, *sourceLimit
;
236 const UChar
*targetLimit
;
238 UBool isSingleByteMode
;
239 uint8_t state
, byteOne
;
240 int8_t quoteWindow
, dynamicWindow
;
242 int32_t sourceIndex
, nextSourceIndex
;
246 /* set up the local pointers */
247 cnv
=pArgs
->converter
;
248 scsu
=(SCSUData
*)cnv
->extraInfo
;
250 source
=(const uint8_t *)pArgs
->source
;
251 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
252 target
=pArgs
->target
;
253 targetLimit
=pArgs
->targetLimit
;
254 offsets
=pArgs
->offsets
;
256 /* get the state machine state */
257 isSingleByteMode
=scsu
->toUIsSingleByteMode
;
258 state
=scsu
->toUState
;
259 quoteWindow
=scsu
->toUQuoteWindow
;
260 dynamicWindow
=scsu
->toUDynamicWindow
;
261 byteOne
=scsu
->toUByteOne
;
263 /* sourceIndex=-1 if the current character began in the previous buffer */
264 sourceIndex
=state
==readCommand
? 0 : -1;
270 * For performance, this is not a normal C loop.
271 * Instead, there are two code blocks for the two SCSU modes.
272 * The function branches to either one, and a change of the mode is done with a goto to the other branch.
275 * Each branch has two conventional loops:
276 * - a fast-path loop for the most common codes in the mode
277 * - a loop for all other codes in the mode
278 * When the fast-path runs into a code that it cannot handle, its loop ends and it
279 * runs into the following loop to handle the other codes.
280 * The end of the input or output buffer is also handled by the slower loop.
281 * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
283 * The callback handling is done by returning with an error code.
284 * The conversion framework actually calls the callback function.
286 if(isSingleByteMode
) {
287 /* fast path for single-byte mode */
288 if(state
==readCommand
) {
290 while(source
<sourceLimit
&& target
<targetLimit
&& (b
=*source
)>=0x20) {
294 /* write US-ASCII graphic character or DEL */
297 *offsets
++=sourceIndex
;
300 /* write from dynamic window */
301 uint32_t c
=scsu
->toUDynamicOffsets
[dynamicWindow
]+(b
&0x7f);
305 *offsets
++=sourceIndex
;
308 /* output surrogate pair */
309 *target
++=(UChar
)(0xd7c0+(c
>>10));
310 if(target
<targetLimit
) {
311 *target
++=(UChar
)(0xdc00|(c
&0x3ff));
313 *offsets
++=sourceIndex
;
314 *offsets
++=sourceIndex
;
317 /* target overflow */
319 *offsets
++=sourceIndex
;
321 cnv
->UCharErrorBuffer
[0]=(UChar
)(0xdc00|(c
&0x3ff));
322 cnv
->UCharErrorBufferLength
=1;
323 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
328 sourceIndex
=nextSourceIndex
;
332 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
334 while(source
<sourceLimit
) {
335 if(target
>=targetLimit
) {
337 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
344 /* redundant conditions are commented out */
345 /* here: b<0x20 because otherwise we would be in fastSingle */
346 if((1UL<<b
)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
350 *offsets
++=sourceIndex
;
352 sourceIndex
=nextSourceIndex
;
356 dynamicWindow
=(int8_t)(b
-SC0
);
357 sourceIndex
=nextSourceIndex
;
359 } else /* if(SD0<=b && b<=SD7) */ {
360 dynamicWindow
=(int8_t)(b
-SD0
);
363 } else if(/* SQ0<=b && */ b
<=SQ7
) {
364 quoteWindow
=(int8_t)(b
-SQ0
);
371 sourceIndex
=nextSourceIndex
;
372 isSingleByteMode
=FALSE
;
375 /* callback(illegal) */
376 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
382 /* store the first byte of a multibyte sequence in toUBytes[] */
393 *target
++=(UChar
)((byteOne
<<8)|b
);
395 *offsets
++=sourceIndex
;
397 sourceIndex
=nextSourceIndex
;
402 /* all static offsets are in the BMP */
403 *target
++=(UChar
)(staticOffsets
[quoteWindow
]+b
);
405 *offsets
++=sourceIndex
;
408 /* write from dynamic window */
409 uint32_t c
=scsu
->toUDynamicOffsets
[quoteWindow
]+(b
&0x7f);
413 *offsets
++=sourceIndex
;
416 /* output surrogate pair */
417 *target
++=(UChar
)(0xd7c0+(c
>>10));
418 if(target
<targetLimit
) {
419 *target
++=(UChar
)(0xdc00|(c
&0x3ff));
421 *offsets
++=sourceIndex
;
422 *offsets
++=sourceIndex
;
425 /* target overflow */
427 *offsets
++=sourceIndex
;
429 cnv
->UCharErrorBuffer
[0]=(UChar
)(0xdc00|(c
&0x3ff));
430 cnv
->UCharErrorBufferLength
=1;
431 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
436 sourceIndex
=nextSourceIndex
;
440 dynamicWindow
=(int8_t)((b
>>5)&7);
441 byteOne
=(uint8_t)(b
&0x1f);
447 scsu
->toUDynamicOffsets
[dynamicWindow
]=0x10000+(byteOne
<<15UL | b
<<7UL);
448 sourceIndex
=nextSourceIndex
;
453 /* callback(illegal): Reserved window offset value 0 */
457 } else if(b
<gapThreshold
) {
458 scsu
->toUDynamicOffsets
[dynamicWindow
]=b
<<7UL;
459 } else if((uint8_t)(b
-gapThreshold
)<(reservedStart
-gapThreshold
)) {
460 scsu
->toUDynamicOffsets
[dynamicWindow
]=(b
<<7UL)+gapOffset
;
461 } else if(b
>=fixedThreshold
) {
462 scsu
->toUDynamicOffsets
[dynamicWindow
]=fixedOffsets
[b
-fixedThreshold
];
464 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
469 sourceIndex
=nextSourceIndex
;
475 /* fast path for Unicode mode */
476 if(state
==readCommand
) {
478 while(source
+1<sourceLimit
&& target
<targetLimit
&& (uint8_t)((b
=*source
)-UC0
)>(Urs
-UC0
)) {
479 *target
++=(UChar
)((b
<<8)|source
[1]);
481 *offsets
++=sourceIndex
;
483 sourceIndex
=nextSourceIndex
;
489 /* normal state machine for Unicode mode */
490 /* unicodeByteMode: */
491 while(source
<sourceLimit
) {
492 if(target
>=targetLimit
) {
494 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
501 if((uint8_t)(b
-UC0
)>(Urs
-UC0
)) {
506 } else if(/* UC0<=b && */ b
<=UC7
) {
507 dynamicWindow
=(int8_t)(b
-UC0
);
508 sourceIndex
=nextSourceIndex
;
509 isSingleByteMode
=TRUE
;
511 } else if(/* UD0<=b && */ b
<=UD7
) {
512 dynamicWindow
=(int8_t)(b
-UD0
);
513 isSingleByteMode
=TRUE
;
519 isSingleByteMode
=TRUE
;
529 /* callback(illegal) */
530 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
543 *target
++=(UChar
)((byteOne
<<8)|b
);
545 *offsets
++=sourceIndex
;
547 sourceIndex
=nextSourceIndex
;
555 /* set the converter state back into UConverter */
556 if(U_FAILURE(*pErrorCode
) && *pErrorCode
!=U_BUFFER_OVERFLOW_ERROR
) {
557 /* reset to deal with the next character */
559 } else if(state
==readCommand
) {
560 /* not in a multi-byte sequence, reset toULength */
563 scsu
->toUIsSingleByteMode
=isSingleByteMode
;
564 scsu
->toUState
=state
;
565 scsu
->toUQuoteWindow
=quoteWindow
;
566 scsu
->toUDynamicWindow
=dynamicWindow
;
567 scsu
->toUByteOne
=byteOne
;
569 /* write back the updated pointers */
570 pArgs
->source
=(const char *)source
;
571 pArgs
->target
=target
;
572 pArgs
->offsets
=offsets
;
577 * Identical to _SCSUToUnicodeWithOffsets but without offset handling.
578 * If a change is made in the original function, then either
579 * change this function the same way or
580 * re-copy the original function and remove the variables
581 * offsets, sourceIndex, and nextSourceIndex.
584 _SCSUToUnicode(UConverterToUnicodeArgs
*pArgs
,
585 UErrorCode
*pErrorCode
) {
588 const uint8_t *source
, *sourceLimit
;
590 const UChar
*targetLimit
;
591 UBool isSingleByteMode
;
592 uint8_t state
, byteOne
;
593 int8_t quoteWindow
, dynamicWindow
;
597 /* set up the local pointers */
598 cnv
=pArgs
->converter
;
599 scsu
=(SCSUData
*)cnv
->extraInfo
;
601 source
=(const uint8_t *)pArgs
->source
;
602 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
603 target
=pArgs
->target
;
604 targetLimit
=pArgs
->targetLimit
;
606 /* get the state machine state */
607 isSingleByteMode
=scsu
->toUIsSingleByteMode
;
608 state
=scsu
->toUState
;
609 quoteWindow
=scsu
->toUQuoteWindow
;
610 dynamicWindow
=scsu
->toUDynamicWindow
;
611 byteOne
=scsu
->toUByteOne
;
616 * For performance, this is not a normal C loop.
617 * Instead, there are two code blocks for the two SCSU modes.
618 * The function branches to either one, and a change of the mode is done with a goto to the other branch.
621 * Each branch has two conventional loops:
622 * - a fast-path loop for the most common codes in the mode
623 * - a loop for all other codes in the mode
624 * When the fast-path runs into a code that it cannot handle, its loop ends and it
625 * runs into the following loop to handle the other codes.
626 * The end of the input or output buffer is also handled by the slower loop.
627 * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
629 * The callback handling is done by returning with an error code.
630 * The conversion framework actually calls the callback function.
632 if(isSingleByteMode
) {
633 /* fast path for single-byte mode */
634 if(state
==readCommand
) {
636 while(source
<sourceLimit
&& target
<targetLimit
&& (b
=*source
)>=0x20) {
639 /* write US-ASCII graphic character or DEL */
642 /* write from dynamic window */
643 uint32_t c
=scsu
->toUDynamicOffsets
[dynamicWindow
]+(b
&0x7f);
647 /* output surrogate pair */
648 *target
++=(UChar
)(0xd7c0+(c
>>10));
649 if(target
<targetLimit
) {
650 *target
++=(UChar
)(0xdc00|(c
&0x3ff));
652 /* target overflow */
653 cnv
->UCharErrorBuffer
[0]=(UChar
)(0xdc00|(c
&0x3ff));
654 cnv
->UCharErrorBufferLength
=1;
655 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
663 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
665 while(source
<sourceLimit
) {
666 if(target
>=targetLimit
) {
668 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
674 /* redundant conditions are commented out */
675 /* here: b<0x20 because otherwise we would be in fastSingle */
676 if((1UL<<b
)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
682 dynamicWindow
=(int8_t)(b
-SC0
);
684 } else /* if(SD0<=b && b<=SD7) */ {
685 dynamicWindow
=(int8_t)(b
-SD0
);
688 } else if(/* SQ0<=b && */ b
<=SQ7
) {
689 quoteWindow
=(int8_t)(b
-SQ0
);
696 isSingleByteMode
=FALSE
;
699 /* callback(illegal) */
700 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
706 /* store the first byte of a multibyte sequence in toUBytes[] */
717 *target
++=(UChar
)((byteOne
<<8)|b
);
722 /* all static offsets are in the BMP */
723 *target
++=(UChar
)(staticOffsets
[quoteWindow
]+b
);
725 /* write from dynamic window */
726 uint32_t c
=scsu
->toUDynamicOffsets
[quoteWindow
]+(b
&0x7f);
730 /* output surrogate pair */
731 *target
++=(UChar
)(0xd7c0+(c
>>10));
732 if(target
<targetLimit
) {
733 *target
++=(UChar
)(0xdc00|(c
&0x3ff));
735 /* target overflow */
736 cnv
->UCharErrorBuffer
[0]=(UChar
)(0xdc00|(c
&0x3ff));
737 cnv
->UCharErrorBufferLength
=1;
738 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
746 dynamicWindow
=(int8_t)((b
>>5)&7);
747 byteOne
=(uint8_t)(b
&0x1f);
753 scsu
->toUDynamicOffsets
[dynamicWindow
]=0x10000+(byteOne
<<15UL | b
<<7UL);
758 /* callback(illegal): Reserved window offset value 0 */
762 } else if(b
<gapThreshold
) {
763 scsu
->toUDynamicOffsets
[dynamicWindow
]=b
<<7UL;
764 } else if((uint8_t)(b
-gapThreshold
)<(reservedStart
-gapThreshold
)) {
765 scsu
->toUDynamicOffsets
[dynamicWindow
]=(b
<<7UL)+gapOffset
;
766 } else if(b
>=fixedThreshold
) {
767 scsu
->toUDynamicOffsets
[dynamicWindow
]=fixedOffsets
[b
-fixedThreshold
];
769 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
779 /* fast path for Unicode mode */
780 if(state
==readCommand
) {
782 while(source
+1<sourceLimit
&& target
<targetLimit
&& (uint8_t)((b
=*source
)-UC0
)>(Urs
-UC0
)) {
783 *target
++=(UChar
)((b
<<8)|source
[1]);
788 /* normal state machine for Unicode mode */
789 /* unicodeByteMode: */
790 while(source
<sourceLimit
) {
791 if(target
>=targetLimit
) {
793 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
799 if((uint8_t)(b
-UC0
)>(Urs
-UC0
)) {
804 } else if(/* UC0<=b && */ b
<=UC7
) {
805 dynamicWindow
=(int8_t)(b
-UC0
);
806 isSingleByteMode
=TRUE
;
808 } else if(/* UD0<=b && */ b
<=UD7
) {
809 dynamicWindow
=(int8_t)(b
-UD0
);
810 isSingleByteMode
=TRUE
;
816 isSingleByteMode
=TRUE
;
826 /* callback(illegal) */
827 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
840 *target
++=(UChar
)((byteOne
<<8)|b
);
848 /* set the converter state back into UConverter */
849 if(U_FAILURE(*pErrorCode
) && *pErrorCode
!=U_BUFFER_OVERFLOW_ERROR
) {
850 /* reset to deal with the next character */
852 } else if(state
==readCommand
) {
853 /* not in a multi-byte sequence, reset toULength */
856 scsu
->toUIsSingleByteMode
=isSingleByteMode
;
857 scsu
->toUState
=state
;
858 scsu
->toUQuoteWindow
=quoteWindow
;
859 scsu
->toUDynamicWindow
=dynamicWindow
;
860 scsu
->toUByteOne
=byteOne
;
862 /* write back the updated pointers */
863 pArgs
->source
=(const char *)source
;
864 pArgs
->target
=target
;
868 /* SCSU-from-Unicode conversion functions ----------------------------------- */
871 * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve
872 * reasonable results. The lookahead is minimal.
873 * Many cases are simple:
874 * A character fits directly into the current mode, a dynamic or static window,
875 * or is not compressible. These cases are tested first.
876 * Real compression heuristics are applied to the rest, in code branches for
877 * single/Unicode mode and BMP/supplementary code points.
878 * The heuristics used here are extremely simple.
881 /*
 * get the number of the window that this character is in, or -1
 * offsets: the 8 window start offsets to search
 * c: the code point to look up
 */
883 getWindow(const uint32_t offsets
[8], uint32_t c
) {
/*
 * Single range check via unsigned arithmetic: if c<offsets[i] the
 * subtraction wraps around to a large value, so the comparison is true
 * exactly when offsets[i]<=c<=offsets[i]+0x7f (a window spans 0x80 code points).
 */
886 if((uint32_t)(c
-offsets
[i
])<=0x7f) {
893 /*
 * is the character in the dynamic window starting at the offset, or in the direct-encoded range?
 * offset: start of the dynamic window
 * c: the code point to test
 * returns a UBool: nonzero if either test succeeds
 */
895 isInOffsetWindowOrDirect(uint32_t offset
, uint32_t c
) {
/*
 * In the window:   offset<=c<=offset+0x7f.
 * Direct-encoded:  c<=0x7f and either c>=0x20 or c is one of the four
 *                  control codes selected by the bit mask 0x2601.
 * The shift 1UL<<c is only evaluated when c<0x20, so it stays in range.
 */
896 return (UBool
)(c
<=offset
+0x7f &&
897 (c
>=offset
|| (c
<=0x7f &&
898 (c
>=0x20 || (1UL<<c
)&0x2601))));
899 /* binary 0010 0110 0000 0001,
900 check for c==0xd || c==0xa || c==9 || c==0 */
904 * getNextDynamicWindow returns the next dynamic window to be redefined
/*
 * Reads the window number stored at windowUse[nextWindowUseIndex] and then
 * advances nextWindowUseIndex by one, wrapping from 8 back to 0.
 * scsu: the converter state whose window-use bookkeeping is read and updated
 */
907 getNextDynamicWindow(SCSUData
*scsu
) {
/* fetch the window number before the index is advanced */
908 int8_t window
=scsu
->windowUse
[scsu
->nextWindowUseIndex
];
/* windowUse[] has 8 slots: step to the next one, wrapping around at the end */
909 if(++scsu
->nextWindowUseIndex
==8) {
910 scsu
->nextWindowUseIndex
=0;
916 * useDynamicWindow() adjusts
917 * windowUse[] and nextWindowUseIndex for the algorithm to choose
918 * the next dynamic window to be defined;
919 * a subclass may override it and provide its own algorithm.
/*
 * scsu: the converter state whose windowUse[] ordering is updated
 * window: the dynamic window number that was just used
 * NOTE(review): the search loop below ends only when windowUse[i]==window,
 * so it assumes that window is always present in windowUse[]; the initial
 * tables are permutations of 0..7 - confirm that callers only pass 0..7.
 */
922 useDynamicWindow(SCSUData
*scsu
, int8_t window
) {
924 * move the existing window, which just became the most recently used one,
925 * up in windowUse[] to nextWindowUseIndex-1
928 /* first, find the index of the window - backwards to favor the more recently used windows */
931 i
=scsu
->nextWindowUseIndex
;
936 } while(scsu
->windowUse
[i
]!=window
);
938 /* now copy each windowUse[i+1] to [i] */
/* the shifting stops once j reaches nextWindowUseIndex, the slot of the least recently used window */
943 while(j
!=scsu
->nextWindowUseIndex
) {
944 scsu
->windowUse
[i
]=scsu
->windowUse
[j
];
949 /* finally, set the window into the most recently used index */
950 scsu
->windowUse
[i
]=window
;
954 * calculate the offset and the code for a dynamic window that contains the character
955 * takes fixed offsets into account
956 * the offset of the window is stored in the offset variable,
957 * the code is returned
959 * return offset code: -1 if none; <=0xff is a code for SDn/UDn; anything else is a code for SDX/UDX, subtract 0x200 to get the true code
/*
 * c: the code point that the new window must contain
 * pOffset: output, receives the start offset of the chosen window
 */
962 getDynamicOffset(uint32_t c
, uint32_t *pOffset
) {
/* first try the predefined fixedOffsets[] windows: unsigned range check for fixedOffsets[i]<=c<=fixedOffsets[i]+0x7f */
966 if((uint32_t)(c
-fixedOffsets
[i
])<=0x7f) {
967 *pOffset
=fixedOffsets
[i
];
973 /* No dynamic window for US-ASCII. */
/* windowable ranges tested here: c<0x3400, U+10000..U+13FFF, and U+1D000..U+1FFFF */
975 } else if(c
<0x3400 ||
976 (uint32_t)(c
-0x10000)<(0x14000-0x10000) ||
977 (uint32_t)(c
-0x1d000)<=(0x1ffff-0x1d000)
979 /* This character is in a code range for a "small", i.e., reasonably windowable, script. */
/* clearing the low 7 bits aligns the window start to a multiple of 0x80, so c lies within offset..offset+0x7f */
980 *pOffset
=c
&0x7fffff80;
/* U+E000..U+FFEF except U+FEFF */
982 } else if(0xe000<=c
&& c
!=0xfeff && c
<0xfff0) {
983 /* For these characters we need to take the gapOffset into account. */
984 *pOffset
=c
&0x7fffff80;
/* the returned code is the 0x80-block index of c after gapOffset has been subtracted */
985 return (int)((c
-gapOffset
)>>7);
992 * Idea for compression:
993 * - save SCSUData and other state before really starting work
994 * - at endloop, see if compression could be better with just unicode mode
995 * - don't do this if a callback has been called
996 * - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning
997 * - different buffer handling!
999 * Drawback or need for corrective handling:
1000 * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and
1001 * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible
1002 * not only for compression but also for HTML/XML documents with following charset/encoding announcers.
1004 * How to achieve both?
1005 * - Only replace the result after an SDX or SCU?
1009 _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
1010 UErrorCode
*pErrorCode
) {
1013 const UChar
*source
, *sourceLimit
;
1015 int32_t targetCapacity
;
1018 UBool isSingleByteMode
;
1019 uint8_t dynamicWindow
;
1020 uint32_t currentOffset
;
1024 int32_t sourceIndex
, nextSourceIndex
;
1028 /* variables for compression heuristics */
1034 /* set up the local pointers */
1035 cnv
=pArgs
->converter
;
1036 scsu
=(SCSUData
*)cnv
->extraInfo
;
1038 /* set up the local pointers */
1039 source
=pArgs
->source
;
1040 sourceLimit
=pArgs
->sourceLimit
;
1041 target
=(uint8_t *)pArgs
->target
;
1042 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
1043 offsets
=pArgs
->offsets
;
1045 /* get the state machine state */
1046 isSingleByteMode
=scsu
->fromUIsSingleByteMode
;
1047 dynamicWindow
=scsu
->fromUDynamicWindow
;
1048 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1052 /* sourceIndex=-1 if the current character began in the previous buffer */
1053 sourceIndex
= c
==0 ? 0 : -1;
1056 /* similar conversion "loop" as in toUnicode */
1058 if(isSingleByteMode
) {
1059 if(c
!=0 && targetCapacity
>0) {
1060 goto getTrailSingle
;
1063 /* state machine for single-byte mode */
1064 /* singleByteMode: */
1065 while(source
<sourceLimit
) {
1066 if(targetCapacity
<=0) {
1067 /* target is full */
1068 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1074 if((c
-0x20)<=0x5f) {
1075 /* pass US-ASCII graphic character through */
1076 *target
++=(uint8_t)c
;
1078 *offsets
++=sourceIndex
;
1082 if((1UL<<c
)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1084 *target
++=(uint8_t)c
;
1086 *offsets
++=sourceIndex
;
1090 /* quote C0 control character */
1095 } else if((delta
=c
-currentOffset
)<=0x7f) {
1096 /* use the current dynamic window */
1097 *target
++=(uint8_t)(delta
|0x80);
1099 *offsets
++=sourceIndex
;
1102 } else if(U16_IS_SURROGATE(c
)) {
1103 if(U16_IS_SURROGATE_LEAD(c
)) {
1106 if(source
<sourceLimit
) {
1107 /* test the following code unit */
1109 if(U16_IS_TRAIL(trail
)) {
1112 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
1113 /* convert this surrogate code point */
1114 /* exit this condition tree */
1116 /* this is an unmatched lead code unit (1st surrogate) */
1117 /* callback(illegal) */
1118 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1126 /* this is an unmatched trail code unit (2nd surrogate) */
1127 /* callback(illegal) */
1128 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1132 /* compress supplementary character U+10000..U+10ffff */
1133 if((delta
=c
-currentOffset
)<=0x7f) {
1134 /* use the current dynamic window */
1135 *target
++=(uint8_t)(delta
|0x80);
1137 *offsets
++=sourceIndex
;
1140 } else if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1141 /* there is a dynamic window that contains this character, change to it */
1142 dynamicWindow
=window
;
1143 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1144 useDynamicWindow(scsu
, dynamicWindow
);
1145 c
=((uint32_t)(SC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1148 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1149 /* might check if there are more characters in this window to come */
1150 /* define an extended window with this character */
1152 dynamicWindow
=getNextDynamicWindow(scsu
);
1153 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1154 useDynamicWindow(scsu
, dynamicWindow
);
1155 c
=((uint32_t)SDX
<<24)|((uint32_t)dynamicWindow
<<21)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1159 /* change to Unicode mode and output this (lead, trail) pair */
1160 isSingleByteMode
=FALSE
;
1161 *target
++=(uint8_t)SCU
;
1163 *offsets
++=sourceIndex
;
1166 c
=((uint32_t)lead
<<16)|trail
;
1171 /* quote C1 control character */
1172 c
=(c
&0x7f)|(SQ0
+1)<<8; /* SQ0+1==SQ1 */
1175 } else if(c
==0xfeff || c
>=0xfff0) {
1176 /* quote signature character=byte order mark and specials */
1181 /* compress all other BMP characters */
1182 if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1183 /* there is a window defined that contains this character - switch to it or quote from it? */
1184 if(source
>=sourceLimit
|| isInOffsetWindowOrDirect(scsu
->fromUDynamicOffsets
[window
], *source
)) {
1185 /* change to dynamic window */
1186 dynamicWindow
=window
;
1187 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1188 useDynamicWindow(scsu
, dynamicWindow
);
1189 c
=((uint32_t)(SC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1193 /* quote from dynamic window */
1194 c
=((uint32_t)(SQ0
+window
)<<8)|(c
-scsu
->fromUDynamicOffsets
[window
])|0x80;
1198 } else if((window
=getWindow(staticOffsets
, c
))>=0) {
1199 /* quote from static window */
1200 c
=((uint32_t)(SQ0
+window
)<<8)|(c
-staticOffsets
[window
]);
1203 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1204 /* define a dynamic window with this character */
1205 dynamicWindow
=getNextDynamicWindow(scsu
);
1206 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1207 useDynamicWindow(scsu
, dynamicWindow
);
1208 c
=((uint32_t)(SD0
+dynamicWindow
)<<16)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1211 } else if((uint32_t)(c
-0x3400)<(0xd800-0x3400) &&
1212 (source
>=sourceLimit
|| (uint32_t)(*source
-0x3400)<(0xd800-0x3400))
1215 * this character is not compressible (a BMP ideograph or similar);
1216 * switch to Unicode mode if this is the last character in the block
1217 * or there is at least one more ideograph following immediately
1219 isSingleByteMode
=FALSE
;
1231 /* normal end of conversion: prepare for a new character */
1233 sourceIndex
=nextSourceIndex
;
1236 if(c
!=0 && targetCapacity
>0) {
1237 goto getTrailUnicode
;
1240 /* state machine for Unicode mode */
1241 /* unicodeByteMode: */
1242 while(source
<sourceLimit
) {
1243 if(targetCapacity
<=0) {
1244 /* target is full */
1245 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1251 if((uint32_t)(c
-0x3400)<(0xd800-0x3400)) {
1252 /* not compressible, write character directly */
1253 if(targetCapacity
>=2) {
1254 *target
++=(uint8_t)(c
>>8);
1255 *target
++=(uint8_t)c
;
1257 *offsets
++=sourceIndex
;
1258 *offsets
++=sourceIndex
;
1265 } else if((uint32_t)(c
-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
1266 /* compress BMP character if the following one is not an uncompressible ideograph */
1267 if(!(source
<sourceLimit
&& (uint32_t)(*source
-0x3400)<(0xd800-0x3400))) {
1268 if(((uint32_t)(c
-0x30)<10 || (uint32_t)(c
-0x61)<26 || (uint32_t)(c
-0x41)<26)) {
1269 /* ASCII digit or letter */
1270 isSingleByteMode
=TRUE
;
1271 c
|=((uint32_t)(UC0
+dynamicWindow
)<<8)|c
;
1274 } else if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1275 /* there is a dynamic window that contains this character, change to it */
1276 isSingleByteMode
=TRUE
;
1277 dynamicWindow
=window
;
1278 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1279 useDynamicWindow(scsu
, dynamicWindow
);
1280 c
=((uint32_t)(UC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1283 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1284 /* define a dynamic window with this character */
1285 isSingleByteMode
=TRUE
;
1286 dynamicWindow
=getNextDynamicWindow(scsu
);
1287 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1288 useDynamicWindow(scsu
, dynamicWindow
);
1289 c
=((uint32_t)(UD0
+dynamicWindow
)<<16)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1295 /* don't know how to compress this character, just write it directly */
1298 } else if(c
<0xe000) {
1299 /* c is a surrogate */
1300 if(U16_IS_SURROGATE_LEAD(c
)) {
1303 if(source
<sourceLimit
) {
1304 /* test the following code unit */
1306 if(U16_IS_TRAIL(trail
)) {
1309 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
1310 /* convert this surrogate code point */
1311 /* exit this condition tree */
1313 /* this is an unmatched lead code unit (1st surrogate) */
1314 /* callback(illegal) */
1315 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1323 /* this is an unmatched trail code unit (2nd surrogate) */
1324 /* callback(illegal) */
1325 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1329 /* compress supplementary character */
1330 if( (window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0 &&
1331 !(source
<sourceLimit
&& (uint32_t)(*source
-0x3400)<(0xd800-0x3400))
1334 * there is a dynamic window that contains this character and
1335 * the following character is not uncompressible,
1336 * change to the window
1338 isSingleByteMode
=TRUE
;
1339 dynamicWindow
=window
;
1340 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1341 useDynamicWindow(scsu
, dynamicWindow
);
1342 c
=((uint32_t)(UC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1345 } else if(source
<sourceLimit
&& lead
==*source
&& /* too lazy to check trail in same window as source[1] */
1346 (code
=getDynamicOffset(c
, &offset
))>=0
1348 /* two supplementary characters in (probably) the same window - define an extended one */
1349 isSingleByteMode
=TRUE
;
1351 dynamicWindow
=getNextDynamicWindow(scsu
);
1352 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1353 useDynamicWindow(scsu
, dynamicWindow
);
1354 c
=((uint32_t)UDX
<<24)|((uint32_t)dynamicWindow
<<21)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1358 /* don't know how to compress this character, just write it directly */
1359 c
=((uint32_t)lead
<<16)|trail
;
1363 } else /* 0xe000<=c<0xf300 */ {
1364 /* quote to avoid SCSU tags */
1370 /* normal end of conversion: prepare for a new character */
1372 sourceIndex
=nextSourceIndex
;
1377 /* set the converter state back into UConverter */
1378 scsu
->fromUIsSingleByteMode
=isSingleByteMode
;
1379 scsu
->fromUDynamicWindow
=dynamicWindow
;
1383 /* write back the updated pointers */
1384 pArgs
->source
=source
;
1385 pArgs
->target
=(char *)target
;
1386 pArgs
->offsets
=offsets
;
1390 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1391 /* from the first if in the loop we know that targetCapacity>0 */
1392 if(length
<=targetCapacity
) {
1395 /* each branch falls through to the next one */
1397 *target
++=(uint8_t)(c
>>24);
1400 *target
++=(uint8_t)(c
>>16);
1403 *target
++=(uint8_t)(c
>>8);
1406 *target
++=(uint8_t)c
;
1409 /* will never occur */
1414 /* each branch falls through to the next one */
1416 *target
++=(uint8_t)(c
>>24);
1417 *offsets
++=sourceIndex
;
1420 *target
++=(uint8_t)(c
>>16);
1421 *offsets
++=sourceIndex
;
1424 *target
++=(uint8_t)(c
>>8);
1425 *offsets
++=sourceIndex
;
1428 *target
++=(uint8_t)c
;
1429 *offsets
++=sourceIndex
;
1432 /* will never occur */
1436 targetCapacity
-=length
;
1438 /* normal end of conversion: prepare for a new character */
1440 sourceIndex
=nextSourceIndex
;
1446 * We actually do this backwards here:
1447 * In order to save an intermediate variable, we output
1448 * first to the overflow buffer what does not fit into the
1451 /* we know that 0<=targetCapacity<length<=4 */
1452 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
1453 length
-=targetCapacity
;
1454 p
=(uint8_t *)cnv
->charErrorBuffer
;
1456 /* each branch falls through to the next one */
1458 *p
++=(uint8_t)(c
>>24);
1461 *p
++=(uint8_t)(c
>>16);
1464 *p
++=(uint8_t)(c
>>8);
1470 /* will never occur */
1473 cnv
->charErrorBufferLength
=(int8_t)length
;
1475 /* now output what fits into the regular target */
1476 c
>>=8*length
; /* length was reduced by targetCapacity */
1477 switch(targetCapacity
) {
1478 /* each branch falls through to the next one */
1480 *target
++=(uint8_t)(c
>>16);
1482 *offsets
++=sourceIndex
;
1486 *target
++=(uint8_t)(c
>>8);
1488 *offsets
++=sourceIndex
;
1492 *target
++=(uint8_t)c
;
1494 *offsets
++=sourceIndex
;
1501 /* target overflow */
1503 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1510 * Identical to _SCSUFromUnicodeWithOffsets but without offset handling.
1511 * If a change is made in the original function, then either
1512 * change this function the same way or
1513 * re-copy the original function and remove the variables
1514 * offsets, sourceIndex, and nextSourceIndex.
/*
 * Convert UTF-16 source text to SCSU bytes without recording offsets.
 *
 * pArgs      supplies the converter, the source/sourceLimit pair and the
 *            target/targetLimit pair; source and target are written back
 *            with the updated positions before returning.
 * pErrorCode is set to U_BUFFER_OVERFLOW_ERROR when the target is full and
 *            to U_ILLEGAL_CHAR_FOUND for an unmatched lead or trail surrogate.
 *
 * The compression state (single-byte vs. Unicode mode, the active dynamic
 * window and its offset) is loaded from the SCSUData in cnv->extraInfo at
 * entry; the mode flag and active window are stored back at exit.
 * Output bytes are assembled in the integer c, most significant byte first.
 */
1517 _SCSUFromUnicode(UConverterFromUnicodeArgs
*pArgs
,
1518 UErrorCode
*pErrorCode
) {
1521 const UChar
*source
, *sourceLimit
;
1523 int32_t targetCapacity
;
1525 UBool isSingleByteMode
;
1526 uint8_t dynamicWindow
;
1527 uint32_t currentOffset
;
1533 /* variables for compression heuristics */
1539 /* set up the local pointers */
1540 cnv
=pArgs
->converter
;
1541 scsu
=(SCSUData
*)cnv
->extraInfo
;
1543 /* set up the local pointers */
1544 source
=pArgs
->source
;
1545 sourceLimit
=pArgs
->sourceLimit
;
1546 target
=(uint8_t *)pArgs
->target
;
1547 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
1549 /* get the state machine state */
1550 isSingleByteMode
=scsu
->fromUIsSingleByteMode
;
1551 dynamicWindow
=scsu
->fromUDynamicWindow
;
1552 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1556 /* similar conversion "loop" as in toUnicode */
/* NOTE(review): a non-zero c at entry presumably is a lead surrogate carried over from the previous call - confirm against where c is initialized */
1558 if(isSingleByteMode
) {
1559 if(c
!=0 && targetCapacity
>0) {
1560 goto getTrailSingle
;
1563 /* state machine for single-byte mode */
1564 /* singleByteMode: */
1565 while(source
<sourceLimit
) {
1566 if(targetCapacity
<=0) {
1567 /* target is full */
1568 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
/* NOTE(review): with an unsigned c this single compare covers 0x20..0x7f - confirm the declared type of c */
1573 if((c
-0x20)<=0x5f) {
1574 /* pass US-ASCII graphic character through */
1575 *target
++=(uint8_t)c
;
1578 if((1UL<<c
)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1580 *target
++=(uint8_t)c
;
1583 /* quote C0 control character */
/* delta is the distance of c from currentOffset, the start of the active window; a delta up to 0x7f is written as one byte with bit 7 set */
1588 } else if((delta
=c
-currentOffset
)<=0x7f) {
1589 /* use the current dynamic window */
1590 *target
++=(uint8_t)(delta
|0x80);
1592 } else if(U16_IS_SURROGATE(c
)) {
1593 if(U16_IS_SURROGATE_LEAD(c
)) {
1596 if(source
<sourceLimit
) {
1597 /* test the following code unit */
1599 if(U16_IS_TRAIL(trail
)) {
1601 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
1602 /* convert this surrogate code point */
1603 /* exit this condition tree */
1605 /* this is an unmatched lead code unit (1st surrogate) */
1606 /* callback(illegal) */
1607 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1615 /* this is an unmatched trail code unit (2nd surrogate) */
1616 /* callback(illegal) */
1617 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1621 /* compress supplementary character U+10000..U+10ffff */
1622 if((delta
=c
-currentOffset
)<=0x7f) {
1623 /* use the current dynamic window */
1624 *target
++=(uint8_t)(delta
|0x80);
1626 } else if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1627 /* there is a dynamic window that contains this character, change to it */
1628 dynamicWindow
=window
;
1629 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1630 useDynamicWindow(scsu
, dynamicWindow
);
/* two bytes packed into c: the tag SC0+window in bits 8..15, then the index within the window with bit 7 set */
1631 c
=((uint32_t)(SC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1634 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1635 /* might check if there are more characters in this window to come */
1636 /* define an extended window with this character */
1638 dynamicWindow
=getNextDynamicWindow(scsu
);
1639 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1640 useDynamicWindow(scsu
, dynamicWindow
);
/* four bytes packed into c: SDX in the top byte, the window number shifted to bit 21, code shifted to bit 8, and the index byte with bit 7 set at the bottom */
1641 c
=((uint32_t)SDX
<<24)|((uint32_t)dynamicWindow
<<21)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1645 /* change to Unicode mode and output this (lead, trail) pair */
1646 isSingleByteMode
=FALSE
;
1647 *target
++=(uint8_t)SCU
;
/* the SCU tag was already written above; c now holds the two surrogate code units, lead in the upper 16 bits */
1649 c
=((uint32_t)lead
<<16)|trail
;
1654 /* quote C1 control character */
1655 c
=(c
&0x7f)|(SQ0
+1)<<8; /* SQ0+1==SQ1 */
1658 } else if(c
==0xfeff || c
>=0xfff0) {
1659 /* quote signature character=byte order mark and specials */
1664 /* compress all other BMP characters */
1665 if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1666 /* there is a window defined that contains this character - switch to it or quote from it? */
/* switch when c is the last source unit or the next unit passes isInOffsetWindowOrDirect() for the same window; otherwise only quote */
1667 if(source
>=sourceLimit
|| isInOffsetWindowOrDirect(scsu
->fromUDynamicOffsets
[window
], *source
)) {
1668 /* change to dynamic window */
1669 dynamicWindow
=window
;
1670 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1671 useDynamicWindow(scsu
, dynamicWindow
);
1672 c
=((uint32_t)(SC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1676 /* quote from dynamic window */
1677 c
=((uint32_t)(SQ0
+window
)<<8)|(c
-scsu
->fromUDynamicOffsets
[window
])|0x80;
1681 } else if((window
=getWindow(staticOffsets
, c
))>=0) {
1682 /* quote from static window */
/* unlike the dynamic-window quote above, the index byte is not ORed with 0x80 here */
1683 c
=((uint32_t)(SQ0
+window
)<<8)|(c
-staticOffsets
[window
]);
1686 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1687 /* define a dynamic window with this character */
1688 dynamicWindow
=getNextDynamicWindow(scsu
);
1689 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1690 useDynamicWindow(scsu
, dynamicWindow
);
/* three bytes packed into c: the tag SD0+window, then code, then the index byte with bit 7 set */
1691 c
=((uint32_t)(SD0
+dynamicWindow
)<<16)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1694 } else if((uint32_t)(c
-0x3400)<(0xd800-0x3400) &&
1695 (source
>=sourceLimit
|| (uint32_t)(*source
-0x3400)<(0xd800-0x3400))
1698 * this character is not compressible (a BMP ideograph or similar);
1699 * switch to Unicode mode if this is the last character in the block
1700 * or there is at least one more ideograph following immediately
1702 isSingleByteMode
=FALSE
;
1714 /* normal end of conversion: prepare for a new character */
1718 if(c
!=0 && targetCapacity
>0) {
1719 goto getTrailUnicode
;
1722 /* state machine for Unicode mode */
1723 /* unicodeByteMode: */
1724 while(source
<sourceLimit
) {
1725 if(targetCapacity
<=0) {
1726 /* target is full */
1727 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
/* the unsigned subtract-and-compare tests 0x3400<=c<0xd800 in one step */
1732 if((uint32_t)(c
-0x3400)<(0xd800-0x3400)) {
1733 /* not compressible, write character directly */
/* with room for two bytes, both are stored here, high byte first */
1734 if(targetCapacity
>=2) {
1735 *target
++=(uint8_t)(c
>>8);
1736 *target
++=(uint8_t)c
;
1742 } else if((uint32_t)(c
-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
1743 /* compress BMP character if the following one is not an uncompressible ideograph */
1744 if(!(source
<sourceLimit
&& (uint32_t)(*source
-0x3400)<(0xd800-0x3400))) {
1745 if(((uint32_t)(c
-0x30)<10 || (uint32_t)(c
-0x61)<26 || (uint32_t)(c
-0x41)<26)) {
1746 /* ASCII digit or letter */
1747 isSingleByteMode
=TRUE
;
/* the compound |= ORs c into itself a second time, which is redundant: the result is the tag UC0+window in bits 8..15 with the ASCII value in the low byte */
1748 c
|=((uint32_t)(UC0
+dynamicWindow
)<<8)|c
;
1751 } else if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1752 /* there is a dynamic window that contains this character, change to it */
1753 isSingleByteMode
=TRUE
;
1754 dynamicWindow
=window
;
1755 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1756 useDynamicWindow(scsu
, dynamicWindow
);
1757 c
=((uint32_t)(UC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1760 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1761 /* define a dynamic window with this character */
1762 isSingleByteMode
=TRUE
;
1763 dynamicWindow
=getNextDynamicWindow(scsu
);
1764 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1765 useDynamicWindow(scsu
, dynamicWindow
);
1766 c
=((uint32_t)(UD0
+dynamicWindow
)<<16)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1772 /* don't know how to compress this character, just write it directly */
1775 } else if(c
<0xe000) {
1776 /* c is a surrogate */
1777 if(U16_IS_SURROGATE_LEAD(c
)) {
1780 if(source
<sourceLimit
) {
1781 /* test the following code unit */
1783 if(U16_IS_TRAIL(trail
)) {
1785 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
1786 /* convert this surrogate code point */
1787 /* exit this condition tree */
1789 /* this is an unmatched lead code unit (1st surrogate) */
1790 /* callback(illegal) */
1791 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1799 /* this is an unmatched trail code unit (2nd surrogate) */
1800 /* callback(illegal) */
1801 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1805 /* compress supplementary character */
1806 if( (window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0 &&
1807 !(source
<sourceLimit
&& (uint32_t)(*source
-0x3400)<(0xd800-0x3400))
1810 * there is a dynamic window that contains this character and
1811 * the following character is not uncompressible,
1812 * change to the window
1814 isSingleByteMode
=TRUE
;
1815 dynamicWindow
=window
;
1816 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1817 useDynamicWindow(scsu
, dynamicWindow
);
1818 c
=((uint32_t)(UC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
/* only the lead surrogate is compared with the next source unit; the trail is not checked, as the inline remark says */
1821 } else if(source
<sourceLimit
&& lead
==*source
&& /* too lazy to check trail in same window as source[1] */
1822 (code
=getDynamicOffset(c
, &offset
))>=0
1824 /* two supplementary characters in (probably) the same window - define an extended one */
1825 isSingleByteMode
=TRUE
;
1827 dynamicWindow
=getNextDynamicWindow(scsu
);
1828 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1829 useDynamicWindow(scsu
, dynamicWindow
);
1830 c
=((uint32_t)UDX
<<24)|((uint32_t)dynamicWindow
<<21)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1834 /* don't know how to compress this character, just write it directly */
1835 c
=((uint32_t)lead
<<16)|trail
;
1839 } else /* 0xe000<=c<0xf300 */ {
1840 /* quote to avoid SCSU tags */
1846 /* normal end of conversion: prepare for a new character */
1852 /* set the converter state back into UConverter */
1853 scsu
->fromUIsSingleByteMode
=isSingleByteMode
;
1854 scsu
->fromUDynamicWindow
=dynamicWindow
;
1858 /* write back the updated pointers */
1859 pArgs
->source
=source
;
1860 pArgs
->target
=(char *)target
;
1864 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1865 /* from the first if in the loop we know that targetCapacity>0 */
/* everything fits: the bytes of c go to the target from the highest shift (c>>24) down to the lowest */
1866 if(length
<=targetCapacity
) {
1868 /* each branch falls through to the next one */
1870 *target
++=(uint8_t)(c
>>24);
1873 *target
++=(uint8_t)(c
>>16);
1876 *target
++=(uint8_t)(c
>>8);
1879 *target
++=(uint8_t)c
;
1882 /* will never occur */
1885 targetCapacity
-=length
;
1887 /* normal end of conversion: prepare for a new character */
1894 * We actually do this backwards here:
1895 * In order to save an intermediate variable, we output
1896 * first to the overflow buffer what does not fit into the
1899 /* we know that 0<=targetCapacity<length<=4 */
1900 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
/* from here on length counts only the bytes that do not fit; they are stored in cnv->charErrorBuffer and counted in charErrorBufferLength */
1901 length
-=targetCapacity
;
1902 p
=(uint8_t *)cnv
->charErrorBuffer
;
1904 /* each branch falls through to the next one */
1906 *p
++=(uint8_t)(c
>>24);
1909 *p
++=(uint8_t)(c
>>16);
1912 *p
++=(uint8_t)(c
>>8);
1918 /* will never occur */
1921 cnv
->charErrorBufferLength
=(int8_t)length
;
1923 /* now output what fits into the regular target */
1924 c
>>=8*length
; /* length was reduced by targetCapacity */
1925 switch(targetCapacity
) {
1926 /* each branch falls through to the next one */
1928 *target
++=(uint8_t)(c
>>16);
1931 *target
++=(uint8_t)(c
>>8);
1934 *target
++=(uint8_t)c
;
1940 /* target overflow */
1942 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1948 /* miscellaneous ------------------------------------------------------------ */
/*
 * Return the name string for this converter.
 * The SCSUData stored in cnv->extraInfo is read and its locale field selects
 * the result; the one result visible here is "SCSU,locale=ja".
 */
1951 _SCSUGetName(const UConverter
*cnv
) {
1952 SCSUData
*scsu
=(SCSUData
*)cnv
->extraInfo
;
1954 switch(scsu
->locale
) {
1956 return "SCSU,locale=ja";
1962 /* structure for SafeClone calculations */
/* _SCSUSafeClone() accesses two members of this struct: cnv (the converter whose address is returned) and mydata (the copied SCSUData) */
1963 struct cloneSCSUStruct
/*
 * Set up the SCSU-specific part of a converter clone inside stackBuffer.
 *
 * *status is tested with U_FAILURE() first.
 * If *pBufferSize is 0 the call is a preflighting request: the required
 * size, sizeof(struct cloneSCSUStruct), is stored into *pBufferSize.
 * Otherwise stackBuffer is treated as a struct cloneSCSUStruct: the
 * SCSUData from cnv->extraInfo is copied into its mydata member, the
 * clone's extraInfo is pointed at that copy, isExtraLocal is set to TRUE,
 * and the address of the embedded converter is returned.
 */
1970 _SCSUSafeClone(const UConverter
*cnv
,
1972 int32_t *pBufferSize
,
1975 struct cloneSCSUStruct
* localClone
;
1976 int32_t bufferSizeNeeded
= sizeof(struct cloneSCSUStruct
);
1978 if (U_FAILURE(*status
)){
1982 if (*pBufferSize
== 0){ /* 'preflighting' request - set needed size into *pBufferSize */
1983 *pBufferSize
= bufferSizeNeeded
;
1987 localClone
= (struct cloneSCSUStruct
*)stackBuffer
;
1988 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
/* copy the SCSU state by value so that the clone's extraInfo refers to its own copy inside the same buffer */
1990 uprv_memcpy(&localClone
->mydata
, cnv
->extraInfo
, sizeof(SCSUData
));
1991 localClone
->cnv
.extraInfo
= &localClone
->mydata
;
1992 localClone
->cnv
.isExtraLocal
= TRUE
;
1994 return &localClone
->cnv
;
/*
 * Implementation table for the SCSU converter.
 * The entries visible here are the to-Unicode and from-Unicode conversion
 * functions (both the "WithOffsets" variants) and
 * ucnv_getCompleteUnicodeSet.
 */
1998 static const UConverterImpl _SCSUImpl
={
2009 _SCSUToUnicodeWithOffsets
,
2011 _SCSUFromUnicodeWithOffsets
,
2018 ucnv_getCompleteUnicodeSet
/*
 * Static descriptor for the SCSU converter: structure size, CCSID 1212,
 * platform UCNV_IBM, type UCNV_SCSU, the 1..3 bytes-per-UChar range and the
 * 3-byte substitution sequence 0x0e 0xff 0xfd.
 */
2021 static const UConverterStaticData _SCSUStaticData
={
2022 sizeof(UConverterStaticData
),
2024 1212, /* CCSID for SCSU */
2025 UCNV_IBM
, UCNV_SCSU
,
2026 1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */
2028 * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode
2029 * substitution string.
2031 { 0x0e, 0xff, 0xfd, 0 }, 3,
2035 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/* Non-static shared-data object for SCSU, initialized by UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER from _SCSUStaticData and _SCSUImpl */
2038 const UConverterSharedData _SCSUData
=
2039 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_SCSUStaticData
, &_SCSUImpl
);