2 ******************************************************************************
4 * Copyright (C) 2000-2015, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
8 * file name: ucnvscsu.c
10 * tab size: 8 (not used)
13 * created on: 2000nov18
14 * created by: Markus W. Scherer
16 * This is an implementation of the Standard Compression Scheme for Unicode
17 * as defined in http://www.unicode.org/unicode/reports/tr6/ .
18 * Reserved commands and window settings are treated as illegal sequences and
19 * will result in callback calls.
22 #include "unicode/utypes.h"
24 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
26 #include "unicode/ucnv.h"
27 #include "unicode/ucnv_cb.h"
28 #include "unicode/utf16.h"
33 /* SCSU definitions --------------------------------------------------------- */
35 /* SCSU command byte values */
/*
 * SCSU command (tag) byte values.
 * The first group (0x01..0x1F) is compared against input bytes in the
 * single-byte-mode branch of the to-Unicode functions (SQ0/SQ7, SC0, SD0);
 * the second group (0xE0..0xF2) is compared in their Unicode-mode branch
 * (UC0/UC7, UD0/UD7, Urs). Only the first and last member of each 0..7
 * command range is named; the code derives the window number by subtracting
 * the range start (e.g. b-SC0).
 * NOTE(review): the enclosing declaration header is not visible in this excerpt.
 */
37 SQ0
=0x01, /* Quote from window pair 0 */
38 SQ7
=0x08, /* Quote from window pair 7 */
39 SDX
=0x0B, /* Define a window as extended */
40 Srs
=0x0C, /* reserved */
41 SQU
=0x0E, /* Quote a single Unicode character */
42 SCU
=0x0F, /* Change to Unicode mode */
43 SC0
=0x10, /* Select window 0 */
44 SC7
=0x17, /* Select window 7 */
45 SD0
=0x18, /* Define and select window 0 */
46 SD7
=0x1F, /* Define and select window 7 */
48 UC0
=0xE0, /* Select window 0 */
49 UC7
=0xE7, /* Select window 7 */
50 UD0
=0xE8, /* Define and select window 0 */
51 UD7
=0xEF, /* Define and select window 7 */
52 UQU
=0xF0, /* Quote a single Unicode character */
53 UDX
=0xF1, /* Define a Window as extended */
54 Urs
=0xF2 /* reserved */
59 * Unicode code points from 3400 to E000 are not addressable by
60 * dynamic window, since in these areas no short run alphabets are
61 * found. Therefore add gapOffset to all values from gapThreshold.
66 /* values between reservedStart and fixedThreshold are reserved */
69 /* use table of predefined fixed offsets for values from fixedThreshold */
73 /* constant offsets for the 8 static windows */
/*
 * Start offsets of the eight static windows.
 * The to-Unicode functions index this table with quoteWindow and add the
 * input byte; the from-Unicode function searches it with getWindow() to
 * find a static window to quote from.
 */
74 static const uint32_t staticOffsets
[8]={
75 0x0000, /* ASCII for quoted tags */
76 0x0080, /* Latin - 1 Supplement (for access to punctuation) */
77 0x0100, /* Latin Extended-A */
78 0x0300, /* Combining Diacritical Marks */
79 0x2000, /* General Punctuation */
80 0x2080, /* Currency Symbols */
81 0x2100, /* Letterlike Symbols and Number Forms */
82 0x3000 /* CJK Symbols and punctuation */
85 /* initial offsets for the 8 dynamic (sliding) windows */
/*
 * Default start offsets of the dynamic windows.
 * _SCSUReset() copies this table (32 bytes) into both toUDynamicOffsets and
 * fromUDynamicOffsets.
 * NOTE(review): the array is declared with 8 elements but only six
 * initializers are visible in this excerpt - confirm the full list against
 * the complete file before relying on the index of any entry below.
 */
86 static const uint32_t initialDynamicOffsets
[8]={
88 0x00C0, /* Latin Extended A */
89 0x0400, /* Cyrillic */
91 0x0900, /* Devanagari */
92 0x3040, /* Hiragana */
93 0x30A0, /* Katakana */
94 0xFF00 /* Fullwidth ASCII */
97 /* Table of fixed predefined Offsets */
/*
 * Predefined window offsets selected by the highest window-definition byte
 * values. The to-Unicode functions index this table with b-fixedThreshold
 * when b>=fixedThreshold; getDynamicOffset() scans it to find a predefined
 * window containing a character. The per-entry comments give the
 * definition byte (0xF9..0xFF) that corresponds to each entry.
 */
98 static const uint32_t fixedOffsets
[]={
99 /* 0xF9 */ 0x00C0, /* Latin-1 Letters + half of Latin Extended A */
100 /* 0xFA */ 0x0250, /* IPA extensions */
101 /* 0xFB */ 0x0370, /* Greek */
102 /* 0xFC */ 0x0530, /* Armenian */
103 /* 0xFD */ 0x3040, /* Hiragana */
104 /* 0xFE */ 0x30A0, /* Katakana */
105 /* 0xFF */ 0xFF60 /* Halfwidth Katakana */
/*
 * Per-converter SCSU state, stored in UConverter.extraInfo (see _SCSUOpen).
 * It keeps separate dynamic-window offset tables and mode/window state for
 * the to-Unicode and from-Unicode directions, so the two directions can be
 * reset independently (see _SCSUReset).
 * NOTE(review): other code in this file also accesses scsu->toUState,
 * scsu->toUByteOne, scsu->locale and scsu->windowUse; those member
 * declarations and the end of the struct are not visible in this excerpt.
 */
119 typedef struct SCSUData
{
120 /* dynamic window offsets, initialized to default values from initialDynamicOffsets */
121 uint32_t toUDynamicOffsets
[8];
122 uint32_t fromUDynamicOffsets
[8];
124 /* state machine state - toUnicode */
125 UBool toUIsSingleByteMode
;
127 int8_t toUQuoteWindow
, toUDynamicWindow
;
129 uint8_t toUPadding
[3];
131 /* state machine state - fromUnicode */
132 UBool fromUIsSingleByteMode
;
133 int8_t fromUDynamicWindow
;
136 * windowUse[] keeps track of the use of the dynamic windows:
137 * At nextWindowUseIndex there is the least recently used window,
138 * and the following windows (in a wrapping manner) are more and more
140 * At nextWindowUseIndex-1 there is the most recently used window.
143 int8_t nextWindowUseIndex
;
/*
 * Initial contents for SCSUData.windowUse[] (8 window numbers each).
 * _SCSUReset() copies one of the two tables depending on scsu->locale:
 * initialWindowUse_ja in one switch branch, initialWindowUse in the other.
 * NOTE(review): presumably the _ja table is used for the l_ja locale value
 * set in _SCSUOpen - the case labels are not visible in this excerpt.
 */
147 static const int8_t initialWindowUse
[8]={ 7, 0, 3, 2, 4, 5, 6, 1 };
148 static const int8_t initialWindowUse_ja
[8]={ 3, 2, 4, 1, 0, 7, 5, 6 };
154 /* SCSU setup functions ----------------------------------------------------- */
/*
 * _SCSUReset: restore the SCSU state held in cnv->extraInfo to its initial
 * values.
 *   cnv    - converter whose extraInfo points at the SCSUData to reset
 *   choice - selects the direction(s) to reset
 * The to-Unicode state is reset when choice<=UCNV_RESET_TO_UNICODE, the
 * from-Unicode state when choice!=UCNV_RESET_TO_UNICODE; a choice that
 * satisfies both conditions resets both directions.
 * NOTE(review): the return-type line, the case labels of the locale switch
 * and the closing braces are not visible in this excerpt.
 */
157 _SCSUReset(UConverter
*cnv
, UConverterResetChoice choice
) {
158 SCSUData
*scsu
=(SCSUData
*)cnv
->extraInfo
;
160 if(choice
<=UCNV_RESET_TO_UNICODE
) {
161 /* reset toUnicode */
/* 32 bytes == the 8 uint32_t entries of the dynamic offset table */
162 uprv_memcpy(scsu
->toUDynamicOffsets
, initialDynamicOffsets
, 32);
164 scsu
->toUIsSingleByteMode
=TRUE
;
165 scsu
->toUState
=readCommand
;
166 scsu
->toUQuoteWindow
=scsu
->toUDynamicWindow
=0;
171 if(choice
!=UCNV_RESET_TO_UNICODE
) {
172 /* reset fromUnicode */
/* 32 bytes == the 8 uint32_t entries of the dynamic offset table */
173 uprv_memcpy(scsu
->fromUDynamicOffsets
, initialDynamicOffsets
, 32);
175 scsu
->fromUIsSingleByteMode
=TRUE
;
176 scsu
->fromUDynamicWindow
=0;
178 scsu
->nextWindowUseIndex
=0;
/* pick the initial window-use order by locale; 8 bytes == the 8 int8_t entries */
179 switch(scsu
->locale
) {
181 uprv_memcpy(scsu
->windowUse
, initialWindowUse_ja
, 8);
184 uprv_memcpy(scsu
->windowUse
, initialWindowUse
, 8);
/*
 * _SCSUOpen: set up a converter for SCSU.
 *   cnv        - converter being opened; receives the SCSUData in extraInfo
 *   pArgs      - load arguments; locale and onlyTestIsLoadable are read
 *   pErrorCode - set to U_MEMORY_ALLOCATION_ERROR on the allocation-failure path
 * Allocates an SCSUData with uprv_malloc(). When the allocation succeeds,
 * the locale member is set to l_ja if the locale string is "ja" followed by
 * either the terminator or '_', and to lGeneric otherwise; then both
 * directions are reset via _SCSUReset(cnv, UCNV_RESET_BOTH).
 * Finally the substitution character is set to U+FFFD.
 * NOTE(review): the body of the onlyTestIsLoadable check (presumably an
 * early return), the else keywords and the closing braces are not visible
 * in this excerpt.
 */
193 _SCSUOpen(UConverter
*cnv
,
194 UConverterLoadArgs
*pArgs
,
195 UErrorCode
*pErrorCode
) {
196 const char *locale
=pArgs
->locale
;
197 if(pArgs
->onlyTestIsLoadable
) {
200 cnv
->extraInfo
=uprv_malloc(sizeof(SCSUData
));
201 if(cnv
->extraInfo
!=NULL
) {
/* match exactly "ja" or a "ja_..." locale ID, not other IDs that merely start with "ja" */
202 if(locale
!=NULL
&& locale
[0]=='j' && locale
[1]=='a' && (locale
[2]==0 || locale
[2]=='_')) {
203 ((SCSUData
*)cnv
->extraInfo
)->locale
=l_ja
;
205 ((SCSUData
*)cnv
->extraInfo
)->locale
=lGeneric
;
207 _SCSUReset(cnv
, UCNV_RESET_BOTH
);
209 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
212 /* Set the substitution character U+fffd as a Unicode string. */
213 cnv
->subUChars
[0]=0xfffd;
/*
 * _SCSUClose: release the SCSUData attached to the converter.
 *   cnv - converter being closed
 * cnv->extraInfo is freed with uprv_free() only when it is non-NULL and
 * cnv->isExtraLocal is false.
 * NOTE(review): presumably isExtraLocal marks state that was not allocated
 * by _SCSUOpen's uprv_malloc() - confirm against the converter framework.
 * Any statements after the uprv_free() call are not visible in this excerpt.
 */
218 _SCSUClose(UConverter
*cnv
) {
219 if(cnv
->extraInfo
!=NULL
) {
220 if(!cnv
->isExtraLocal
) {
221 uprv_free(cnv
->extraInfo
);
227 /* SCSU-to-Unicode conversion functions ------------------------------------- */
/*
 * _SCSUToUnicodeWithOffsets: decode SCSU bytes to UTF-16 and record, for
 * each UChar written, the index of the source byte sequence it came from.
 *   pArgs      - conversion arguments; converter, source, sourceLimit,
 *                target, targetLimit and offsets are read, and source,
 *                target and offsets are written back at the end
 *   pErrorCode - set to U_BUFFER_OVERFLOW_ERROR when the target is full and
 *                to U_ILLEGAL_CHAR_FOUND on the illegal-sequence paths
 * The decoder state (mode, state-machine state, quote window, dynamic
 * window, first byte of a two-byte unit) is loaded from the SCSUData at
 * entry and stored back before returning, so a sequence may span buffers.
 * When only the lead surrogate of a pair fits, the trail surrogate is
 * placed in cnv->UCharErrorBuffer.
 * NOTE(review): this excerpt omits many lines of the function (declarations
 * of cnv/scsu/target/offsets/b, labels, gotos, switch cases, source
 * increments, braces and any guard around the offsets writes); the comments
 * here describe only what the visible statements do.
 */
230 _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
231 UErrorCode
*pErrorCode
) {
234 const uint8_t *source
, *sourceLimit
;
236 const UChar
*targetLimit
;
238 UBool isSingleByteMode
;
239 uint8_t state
, byteOne
;
240 int8_t quoteWindow
, dynamicWindow
;
242 int32_t sourceIndex
, nextSourceIndex
;
246 /* set up the local pointers */
247 cnv
=pArgs
->converter
;
248 scsu
=(SCSUData
*)cnv
->extraInfo
;
250 source
=(const uint8_t *)pArgs
->source
;
251 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
252 target
=pArgs
->target
;
253 targetLimit
=pArgs
->targetLimit
;
254 offsets
=pArgs
->offsets
;
256 /* get the state machine state */
257 isSingleByteMode
=scsu
->toUIsSingleByteMode
;
258 state
=scsu
->toUState
;
259 quoteWindow
=scsu
->toUQuoteWindow
;
260 dynamicWindow
=scsu
->toUDynamicWindow
;
261 byteOne
=scsu
->toUByteOne
;
263 /* sourceIndex=-1 if the current character began in the previous buffer */
264 sourceIndex
=state
==readCommand
? 0 : -1;
270 * For performance, this is not a normal C loop.
271 * Instead, there are two code blocks for the two SCSU modes.
272 * The function branches to either one, and a change of the mode is done with a goto to
275 * Each branch has two conventional loops:
276 * - a fast-path loop for the most common codes in the mode
277 * - a loop for all other codes in the mode
278 * When the fast-path runs into a code that it cannot handle, its loop ends and it
279 * runs into the following loop to handle the other codes.
280 * The end of the input or output buffer is also handled by the slower loop.
281 * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
283 * The callback handling is done by returning with an error code.
284 * The conversion framework actually calls the callback function.
286 if(isSingleByteMode
) {
287 /* fast path for single-byte mode */
288 if(state
==readCommand
) {
290 while(source
<sourceLimit
&& target
<targetLimit
&& (b
=*source
)>=0x20) {
294 /* write US-ASCII graphic character or DEL */
297 *offsets
++=sourceIndex
;
300 /* write from dynamic window */
301 uint32_t c
=scsu
->toUDynamicOffsets
[dynamicWindow
]+(b
&0x7f);
305 *offsets
++=sourceIndex
;
308 /* output surrogate pair */
/* 0xd7c0+(c>>10) equals 0xd800+((c-0x10000)>>10), i.e. the lead surrogate of c */
309 *target
++=(UChar
)(0xd7c0+(c
>>10));
310 if(target
<targetLimit
) {
311 *target
++=(UChar
)(0xdc00|(c
&0x3ff));
313 *offsets
++=sourceIndex
;
314 *offsets
++=sourceIndex
;
317 /* target overflow */
319 *offsets
++=sourceIndex
;
321 cnv
->UCharErrorBuffer
[0]=(UChar
)(0xdc00|(c
&0x3ff));
322 cnv
->UCharErrorBufferLength
=1;
323 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
328 sourceIndex
=nextSourceIndex
;
332 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
334 while(source
<sourceLimit
) {
335 if(target
>=targetLimit
) {
337 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
344 /* redundant conditions are commented out */
345 /* here: b<0x20 because otherwise we would be in fastSingle */
346 if((1UL<<b
)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
350 *offsets
++=sourceIndex
;
352 sourceIndex
=nextSourceIndex
;
356 dynamicWindow
=(int8_t)(b
-SC0
);
357 sourceIndex
=nextSourceIndex
;
359 } else /* if(SD0<=b && b<=SD7) */ {
360 dynamicWindow
=(int8_t)(b
-SD0
);
363 } else if(/* SQ0<=b && */ b
<=SQ7
) {
364 quoteWindow
=(int8_t)(b
-SQ0
);
371 sourceIndex
=nextSourceIndex
;
372 isSingleByteMode
=FALSE
;
375 /* callback(illegal) */
376 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
382 /* store the first byte of a multibyte sequence in toUBytes[] */
393 *target
++=(UChar
)((byteOne
<<8)|b
);
395 *offsets
++=sourceIndex
;
397 sourceIndex
=nextSourceIndex
;
402 /* all static offsets are in the BMP */
403 *target
++=(UChar
)(staticOffsets
[quoteWindow
]+b
);
405 *offsets
++=sourceIndex
;
408 /* write from dynamic window */
409 uint32_t c
=scsu
->toUDynamicOffsets
[quoteWindow
]+(b
&0x7f);
413 *offsets
++=sourceIndex
;
416 /* output surrogate pair */
417 *target
++=(UChar
)(0xd7c0+(c
>>10));
418 if(target
<targetLimit
) {
419 *target
++=(UChar
)(0xdc00|(c
&0x3ff));
421 *offsets
++=sourceIndex
;
422 *offsets
++=sourceIndex
;
425 /* target overflow */
427 *offsets
++=sourceIndex
;
429 cnv
->UCharErrorBuffer
[0]=(UChar
)(0xdc00|(c
&0x3ff));
430 cnv
->UCharErrorBufferLength
=1;
431 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
436 sourceIndex
=nextSourceIndex
;
/* extended window definition, first byte: top 3 bits select the window, low 5 bits are kept in byteOne */
440 dynamicWindow
=(int8_t)((b
>>5)&7);
441 byteOne
=(uint8_t)(b
&0x1f);
/* second byte: offset = 0x10000 + ((byteOne<<8 | b) << 7), i.e. a 13-bit index of 0x80-sized steps above the BMP */
447 scsu
->toUDynamicOffsets
[dynamicWindow
]=0x10000+(byteOne
<<15UL | b
<<7UL);
448 sourceIndex
=nextSourceIndex
;
453 /* callback(illegal): Reserved window offset value 0 */
457 } else if(b
<gapThreshold
) {
458 scsu
->toUDynamicOffsets
[dynamicWindow
]=b
<<7UL;
459 } else if((uint8_t)(b
-gapThreshold
)<(reservedStart
-gapThreshold
)) {
460 scsu
->toUDynamicOffsets
[dynamicWindow
]=(b
<<7UL)+gapOffset
;
461 } else if(b
>=fixedThreshold
) {
462 scsu
->toUDynamicOffsets
[dynamicWindow
]=fixedOffsets
[b
-fixedThreshold
];
464 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
469 sourceIndex
=nextSourceIndex
;
475 /* fast path for Unicode mode */
476 if(state
==readCommand
) {
478 while(source
+1<sourceLimit
&& target
<targetLimit
&& (uint8_t)((b
=*source
)-UC0
)>(Urs
-UC0
)) {
479 *target
++=(UChar
)((b
<<8)|source
[1]);
481 *offsets
++=sourceIndex
;
483 sourceIndex
=nextSourceIndex
;
489 /* normal state machine for Unicode mode */
490 /* unicodeByteMode: */
491 while(source
<sourceLimit
) {
492 if(target
>=targetLimit
) {
494 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
501 if((uint8_t)(b
-UC0
)>(Urs
-UC0
)) {
506 } else if(/* UC0<=b && */ b
<=UC7
) {
507 dynamicWindow
=(int8_t)(b
-UC0
);
508 sourceIndex
=nextSourceIndex
;
509 isSingleByteMode
=TRUE
;
511 } else if(/* UD0<=b && */ b
<=UD7
) {
512 dynamicWindow
=(int8_t)(b
-UD0
);
513 isSingleByteMode
=TRUE
;
519 isSingleByteMode
=TRUE
;
529 /* callback(illegal) */
530 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
543 *target
++=(UChar
)((byteOne
<<8)|b
);
545 *offsets
++=sourceIndex
;
547 sourceIndex
=nextSourceIndex
;
555 /* set the converter state back into UConverter */
556 if(U_FAILURE(*pErrorCode
) && *pErrorCode
!=U_BUFFER_OVERFLOW_ERROR
) {
557 /* reset to deal with the next character */
559 } else if(state
==readCommand
) {
560 /* not in a multi-byte sequence, reset toULength */
563 scsu
->toUIsSingleByteMode
=isSingleByteMode
;
564 scsu
->toUState
=state
;
565 scsu
->toUQuoteWindow
=quoteWindow
;
566 scsu
->toUDynamicWindow
=dynamicWindow
;
567 scsu
->toUByteOne
=byteOne
;
569 /* write back the updated pointers */
570 pArgs
->source
=(const char *)source
;
571 pArgs
->target
=target
;
572 pArgs
->offsets
=offsets
;
577 * Identical to _SCSUToUnicodeWithOffsets but without offset handling.
578 * If a change is made in the original function, then either
579 * change this function the same way or
580 * re-copy the original function and remove the variables
581 * offsets, sourceIndex, and nextSourceIndex.
/*
 * _SCSUToUnicode: decode SCSU bytes to UTF-16 without recording offsets.
 *   pArgs      - conversion arguments; converter, source, sourceLimit,
 *                target and targetLimit are read, and source and target are
 *                written back at the end
 *   pErrorCode - set to U_BUFFER_OVERFLOW_ERROR when the target is full and
 *                to U_ILLEGAL_CHAR_FOUND on the illegal-sequence paths
 * The visible statements mirror _SCSUToUnicodeWithOffsets minus the
 * offsets/sourceIndex bookkeeping, so the two functions must be kept in
 * sync when either one changes. The decoder state is loaded from the
 * SCSUData at entry and stored back before returning.
 * NOTE(review): this excerpt omits many lines of the function (declarations
 * of cnv/scsu/target/b, labels, gotos, switch cases, source increments and
 * braces); the comments here describe only what the visible statements do.
 */
584 _SCSUToUnicode(UConverterToUnicodeArgs
*pArgs
,
585 UErrorCode
*pErrorCode
) {
588 const uint8_t *source
, *sourceLimit
;
590 const UChar
*targetLimit
;
591 UBool isSingleByteMode
;
592 uint8_t state
, byteOne
;
593 int8_t quoteWindow
, dynamicWindow
;
597 /* set up the local pointers */
598 cnv
=pArgs
->converter
;
599 scsu
=(SCSUData
*)cnv
->extraInfo
;
601 source
=(const uint8_t *)pArgs
->source
;
602 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
603 target
=pArgs
->target
;
604 targetLimit
=pArgs
->targetLimit
;
606 /* get the state machine state */
607 isSingleByteMode
=scsu
->toUIsSingleByteMode
;
608 state
=scsu
->toUState
;
609 quoteWindow
=scsu
->toUQuoteWindow
;
610 dynamicWindow
=scsu
->toUDynamicWindow
;
611 byteOne
=scsu
->toUByteOne
;
616 * For performance, this is not a normal C loop.
617 * Instead, there are two code blocks for the two SCSU modes.
618 * The function branches to either one, and a change of the mode is done with a goto to
621 * Each branch has two conventional loops:
622 * - a fast-path loop for the most common codes in the mode
623 * - a loop for all other codes in the mode
624 * When the fast-path runs into a code that it cannot handle, its loop ends and it
625 * runs into the following loop to handle the other codes.
626 * The end of the input or output buffer is also handled by the slower loop.
627 * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
629 * The callback handling is done by returning with an error code.
630 * The conversion framework actually calls the callback function.
632 if(isSingleByteMode
) {
633 /* fast path for single-byte mode */
634 if(state
==readCommand
) {
636 while(source
<sourceLimit
&& target
<targetLimit
&& (b
=*source
)>=0x20) {
639 /* write US-ASCII graphic character or DEL */
642 /* write from dynamic window */
643 uint32_t c
=scsu
->toUDynamicOffsets
[dynamicWindow
]+(b
&0x7f);
647 /* output surrogate pair */
/* 0xd7c0+(c>>10) equals 0xd800+((c-0x10000)>>10), i.e. the lead surrogate of c */
648 *target
++=(UChar
)(0xd7c0+(c
>>10));
649 if(target
<targetLimit
) {
650 *target
++=(UChar
)(0xdc00|(c
&0x3ff));
652 /* target overflow */
653 cnv
->UCharErrorBuffer
[0]=(UChar
)(0xdc00|(c
&0x3ff));
654 cnv
->UCharErrorBufferLength
=1;
655 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
663 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
665 while(source
<sourceLimit
) {
666 if(target
>=targetLimit
) {
668 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
674 /* redundant conditions are commented out */
675 /* here: b<0x20 because otherwise we would be in fastSingle */
676 if((1UL<<b
)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
682 dynamicWindow
=(int8_t)(b
-SC0
);
684 } else /* if(SD0<=b && b<=SD7) */ {
685 dynamicWindow
=(int8_t)(b
-SD0
);
688 } else if(/* SQ0<=b && */ b
<=SQ7
) {
689 quoteWindow
=(int8_t)(b
-SQ0
);
696 isSingleByteMode
=FALSE
;
699 /* callback(illegal) */
700 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
706 /* store the first byte of a multibyte sequence in toUBytes[] */
717 *target
++=(UChar
)((byteOne
<<8)|b
);
722 /* all static offsets are in the BMP */
723 *target
++=(UChar
)(staticOffsets
[quoteWindow
]+b
);
725 /* write from dynamic window */
726 uint32_t c
=scsu
->toUDynamicOffsets
[quoteWindow
]+(b
&0x7f);
730 /* output surrogate pair */
731 *target
++=(UChar
)(0xd7c0+(c
>>10));
732 if(target
<targetLimit
) {
733 *target
++=(UChar
)(0xdc00|(c
&0x3ff));
735 /* target overflow */
736 cnv
->UCharErrorBuffer
[0]=(UChar
)(0xdc00|(c
&0x3ff));
737 cnv
->UCharErrorBufferLength
=1;
738 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
/* extended window definition, first byte: top 3 bits select the window, low 5 bits are kept in byteOne */
746 dynamicWindow
=(int8_t)((b
>>5)&7);
747 byteOne
=(uint8_t)(b
&0x1f);
/* second byte: offset = 0x10000 + ((byteOne<<8 | b) << 7), i.e. a 13-bit index of 0x80-sized steps above the BMP */
753 scsu
->toUDynamicOffsets
[dynamicWindow
]=0x10000+(byteOne
<<15UL | b
<<7UL);
758 /* callback(illegal): Reserved window offset value 0 */
762 } else if(b
<gapThreshold
) {
763 scsu
->toUDynamicOffsets
[dynamicWindow
]=b
<<7UL;
764 } else if((uint8_t)(b
-gapThreshold
)<(reservedStart
-gapThreshold
)) {
765 scsu
->toUDynamicOffsets
[dynamicWindow
]=(b
<<7UL)+gapOffset
;
766 } else if(b
>=fixedThreshold
) {
767 scsu
->toUDynamicOffsets
[dynamicWindow
]=fixedOffsets
[b
-fixedThreshold
];
769 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
779 /* fast path for Unicode mode */
780 if(state
==readCommand
) {
782 while(source
+1<sourceLimit
&& target
<targetLimit
&& (uint8_t)((b
=*source
)-UC0
)>(Urs
-UC0
)) {
783 *target
++=(UChar
)((b
<<8)|source
[1]);
788 /* normal state machine for Unicode mode */
789 /* unicodeByteMode: */
790 while(source
<sourceLimit
) {
791 if(target
>=targetLimit
) {
793 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
799 if((uint8_t)(b
-UC0
)>(Urs
-UC0
)) {
804 } else if(/* UC0<=b && */ b
<=UC7
) {
805 dynamicWindow
=(int8_t)(b
-UC0
);
806 isSingleByteMode
=TRUE
;
808 } else if(/* UD0<=b && */ b
<=UD7
) {
809 dynamicWindow
=(int8_t)(b
-UD0
);
810 isSingleByteMode
=TRUE
;
816 isSingleByteMode
=TRUE
;
826 /* callback(illegal) */
827 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
840 *target
++=(UChar
)((byteOne
<<8)|b
);
848 /* set the converter state back into UConverter */
849 if(U_FAILURE(*pErrorCode
) && *pErrorCode
!=U_BUFFER_OVERFLOW_ERROR
) {
850 /* reset to deal with the next character */
852 } else if(state
==readCommand
) {
853 /* not in a multi-byte sequence, reset toULength */
856 scsu
->toUIsSingleByteMode
=isSingleByteMode
;
857 scsu
->toUState
=state
;
858 scsu
->toUQuoteWindow
=quoteWindow
;
859 scsu
->toUDynamicWindow
=dynamicWindow
;
860 scsu
->toUByteOne
=byteOne
;
862 /* write back the updated pointers */
863 pArgs
->source
=(const char *)source
;
864 pArgs
->target
=target
;
868 /* SCSU-from-Unicode conversion functions ----------------------------------- */
871 * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve
872 * reasonable results. The lookahead is minimal.
873 * Many cases are simple:
874 * A character fits directly into the current mode, a dynamic or static window,
875 * or is not compressible. These cases are tested first.
876 * Real compression heuristics are applied to the rest, in code branches for
877 * single/Unicode mode and BMP/supplementary code points.
878 * The heuristics used here are extremely simple.
881 /* get the number of the window that this character is in, or -1 */
/*
 * getWindow: test whether c lies inside one of the windows whose start
 * offsets are given in offsets[].
 *   offsets - table of 8 window start offsets
 *   c       - code point to look up
 * A window matches when 0 <= c-offsets[i] <= 0x7f. The subtraction is done
 * in uint32_t so that c<offsets[i] wraps to a large value and a single
 * comparison covers both bounds.
 * NOTE(review): the loop header and the return statements are not visible
 * in this excerpt; callers in this file treat a result >=0 as a window index.
 */
883 getWindow(const uint32_t offsets
[8], uint32_t c
) {
886 if((uint32_t)(c
-offsets
[i
])<=0x7f) {
893 /* is the character in the dynamic window starting at the offset, or in the direct-encoded range? */
/*
 * isInOffsetWindowOrDirect: decide whether c can be written in single-byte
 * mode while the window starting at offset is the current one.
 *   offset - start offset of the window
 *   c      - code unit/code point to test
 * Returns true when c<=offset+0x7f and either
 *   - c>=offset (c is inside the window), or
 *   - c<=0x7f and c is 0x20 or above, or is one of the C0 controls selected
 *     by the bit mask 0x2601 (bits 0, 9, 10 and 13: NUL, TAB, LF, CR).
 * The shift 1UL<<c is only evaluated for c<0x20 because of the short-circuit
 * order, so the shift count stays in range.
 */
895 isInOffsetWindowOrDirect(uint32_t offset
, uint32_t c
) {
896 return (UBool
)(c
<=offset
+0x7f &&
897 (c
>=offset
|| (c
<=0x7f &&
898 (c
>=0x20 || (1UL<<c
)&0x2601))));
899 /* binary 0010 0110 0000 0001,
900 check for b==0xd || b==0xa || b==9 || b==0 */
904 * getNextDynamicWindow returns the next dynamic window to be redefined
/*
 * getNextDynamicWindow: pick the dynamic window to redefine next.
 *   scsu - converter state; nextWindowUseIndex is advanced
 * Reads the window number stored at windowUse[nextWindowUseIndex] and then
 * advances nextWindowUseIndex by one, wrapping from 8 back to 0.
 * NOTE(review): the return statement is not visible in this excerpt;
 * callers assign the result to dynamicWindow, so presumably the window read
 * on the first line is returned.
 */
907 getNextDynamicWindow(SCSUData
*scsu
) {
908 int8_t window
=scsu
->windowUse
[scsu
->nextWindowUseIndex
];
909 if(++scsu
->nextWindowUseIndex
==8) {
910 scsu
->nextWindowUseIndex
=0;
916 * useDynamicWindow() adjusts
917 * windowUse[] and nextWindowUseIndex for the algorithm to choose
918 * the next dynamic window to be defined;
919 * a subclass may override it and provide its own algorithm.
/*
 * useDynamicWindow: record that a dynamic window was just used by moving it
 * to the most-recently-used position of scsu->windowUse[].
 *   scsu   - converter state holding windowUse[] and nextWindowUseIndex
 *   window - number of the window that was just used
 * Visible steps: start at nextWindowUseIndex and search until the entry
 * equal to window is found; then copy following entries down over it until
 * the copy source index reaches nextWindowUseIndex; finally store window in
 * the last slot that was vacated.
 * NOTE(review): the declarations of i and j and the statements that step
 * and wrap the indices are not visible in this excerpt. The search loop has
 * no other visible exit, so window is presumably always present in
 * windowUse[] - confirm.
 */
922 useDynamicWindow(SCSUData
*scsu
, int8_t window
) {
924 * move the existing window, which just became the most recently used one,
925 * up in windowUse[] to nextWindowUseIndex-1
928 /* first, find the index of the window - backwards to favor the more recently used windows */
931 i
=scsu
->nextWindowUseIndex
;
936 } while(scsu
->windowUse
[i
]!=window
);
938 /* now copy each windowUse[i+1] to [i] */
943 while(j
!=scsu
->nextWindowUseIndex
) {
944 scsu
->windowUse
[i
]=scsu
->windowUse
[j
];
949 /* finally, set the window into the most recently used index */
950 scsu
->windowUse
[i
]=window
;
954 * calculate the offset and the code for a dynamic window that contains the character
955 * takes fixed offsets into account
956 * the offset of the window is stored in the offset variable,
957 * the code is returned
959 * return offset code: -1 none <=0xff code for SDn/UDn else code for SDX/UDX, subtract 0x200 to get the true code
/*
 * getDynamicOffset: find a dynamic-window definition that would contain c.
 *   c       - code point to cover
 *   pOffset - receives the start offset of the chosen window
 * Visible steps:
 *   1. If c lies within 0x7f of an entry of fixedOffsets[], that predefined
 *      offset is stored in *pOffset.
 *   2. Otherwise, for c below 0x3400, in 0x10000..0x13fff or in
 *      0x1d000..0x1ffff, the window is aligned down to a multiple of 0x80
 *      (c&0x7fffff80).
 *   3. Otherwise, for 0xe000<=c<0xfff0 except 0xfeff, the same alignment is
 *      used and the returned code is (c-gapOffset)>>7.
 * Callers treat a negative result as "no window available".
 * NOTE(review): the loop header, the US-ASCII test and all return statements
 * other than the one in step 3 are not visible in this excerpt.
 */
962 getDynamicOffset(uint32_t c
, uint32_t *pOffset
) {
966 if((uint32_t)(c
-fixedOffsets
[i
])<=0x7f) {
967 *pOffset
=fixedOffsets
[i
];
973 /* No dynamic window for US-ASCII. */
975 } else if(c
<0x3400 ||
976 (uint32_t)(c
-0x10000)<(0x14000-0x10000) ||
977 (uint32_t)(c
-0x1d000)<=(0x1ffff-0x1d000)
979 /* This character is in a code range for a "small", i.e., reasonably windowable, script. */
980 *pOffset
=c
&0x7fffff80;
982 } else if(0xe000<=c
&& c
!=0xfeff && c
<0xfff0) {
983 /* For these characters we need to take the gapOffset into account. */
984 *pOffset
=c
&0x7fffff80;
985 return (int)((c
-gapOffset
)>>7);
992 * Idea for compression:
993 * - save SCSUData and other state before really starting work
994 * - at endloop, see if compression could be better with just unicode mode
995 * - don't do this if a callback has been called
996 * - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning
997 * - different buffer handling!
999 * Drawback or need for corrective handling:
1000 * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and
1001 * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible
1002 * not only for compression but also for HTML/XML documents with following charset/encoding announcers.
1004 * How to achieve both?
1005 * - Only replace the result after an SDX or SCU?
1009 _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
1010 UErrorCode
*pErrorCode
) {
1013 const UChar
*source
, *sourceLimit
;
1015 int32_t targetCapacity
;
1018 UBool isSingleByteMode
;
1019 uint8_t dynamicWindow
;
1020 uint32_t currentOffset
;
1024 int32_t sourceIndex
, nextSourceIndex
;
1028 /* variables for compression heuristics */
1034 /* set up the local pointers */
1035 cnv
=pArgs
->converter
;
1036 scsu
=(SCSUData
*)cnv
->extraInfo
;
1038 /* set up the local pointers */
1039 source
=pArgs
->source
;
1040 sourceLimit
=pArgs
->sourceLimit
;
1041 target
=(uint8_t *)pArgs
->target
;
1042 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
1043 offsets
=pArgs
->offsets
;
1045 /* get the state machine state */
1046 isSingleByteMode
=scsu
->fromUIsSingleByteMode
;
1047 dynamicWindow
=scsu
->fromUDynamicWindow
;
1048 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1052 /* sourceIndex=-1 if the current character began in the previous buffer */
1053 sourceIndex
= c
==0 ? 0 : -1;
1056 /* similar conversion "loop" as in toUnicode */
1058 if(isSingleByteMode
) {
1059 if(c
!=0 && targetCapacity
>0) {
1060 goto getTrailSingle
;
1063 /* state machine for single-byte mode */
1064 /* singleByteMode: */
1065 while(source
<sourceLimit
) {
1066 if(targetCapacity
<=0) {
1067 /* target is full */
1068 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1074 if((c
-0x20)<=0x5f) {
1075 /* pass US-ASCII graphic character through */
1076 *target
++=(uint8_t)c
;
1078 *offsets
++=sourceIndex
;
1082 if((1UL<<c
)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1084 *target
++=(uint8_t)c
;
1086 *offsets
++=sourceIndex
;
1090 /* quote C0 control character */
1095 } else if((delta
=c
-currentOffset
)<=0x7f) {
1096 /* use the current dynamic window */
1097 *target
++=(uint8_t)(delta
|0x80);
1099 *offsets
++=sourceIndex
;
1102 } else if(U16_IS_SURROGATE(c
)) {
1103 if(U16_IS_SURROGATE_LEAD(c
)) {
1106 if(source
<sourceLimit
) {
1107 /* test the following code unit */
1109 if(U16_IS_TRAIL(trail
)) {
1112 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
1113 /* convert this surrogate code point */
1114 /* exit this condition tree */
1116 /* this is an unmatched lead code unit (1st surrogate) */
1117 /* callback(illegal) */
1118 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1126 /* this is an unmatched trail code unit (2nd surrogate) */
1127 /* callback(illegal) */
1128 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1132 /* compress supplementary character U+10000..U+10ffff */
1133 if((delta
=c
-currentOffset
)<=0x7f) {
1134 /* use the current dynamic window */
1135 *target
++=(uint8_t)(delta
|0x80);
1137 *offsets
++=sourceIndex
;
1140 } else if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1141 /* there is a dynamic window that contains this character, change to it */
1142 dynamicWindow
=window
;
1143 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1144 useDynamicWindow(scsu
, dynamicWindow
);
1145 c
=((uint32_t)(SC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1148 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1149 /* might check if there are more characters in this window to come */
1150 /* define an extended window with this character */
1152 dynamicWindow
=getNextDynamicWindow(scsu
);
1153 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1154 useDynamicWindow(scsu
, dynamicWindow
);
1155 c
=((uint32_t)SDX
<<24)|((uint32_t)dynamicWindow
<<21)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1159 /* change to Unicode mode and output this (lead, trail) pair */
1160 isSingleByteMode
=FALSE
;
1161 *target
++=(uint8_t)SCU
;
1163 *offsets
++=sourceIndex
;
1166 c
=((uint32_t)lead
<<16)|trail
;
1171 /* quote C1 control character */
1172 c
=(c
&0x7f)|(SQ0
+1)<<8; /* SQ0+1==SQ1 */
1175 } else if(c
==0xfeff || c
>=0xfff0) {
1176 /* quote signature character=byte order mark and specials */
1181 /* compress all other BMP characters */
1182 if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1183 /* there is a window defined that contains this character - switch to it or quote from it? */
1184 if(source
>=sourceLimit
|| isInOffsetWindowOrDirect(scsu
->fromUDynamicOffsets
[window
], *source
)) {
1185 /* change to dynamic window */
1186 dynamicWindow
=window
;
1187 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1188 useDynamicWindow(scsu
, dynamicWindow
);
1189 c
=((uint32_t)(SC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1193 /* quote from dynamic window */
1194 c
=((uint32_t)(SQ0
+window
)<<8)|(c
-scsu
->fromUDynamicOffsets
[window
])|0x80;
1198 } else if((window
=getWindow(staticOffsets
, c
))>=0) {
1199 /* quote from static window */
1200 c
=((uint32_t)(SQ0
+window
)<<8)|(c
-staticOffsets
[window
]);
1203 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1204 /* define a dynamic window with this character */
1205 dynamicWindow
=getNextDynamicWindow(scsu
);
1206 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1207 useDynamicWindow(scsu
, dynamicWindow
);
1208 c
=((uint32_t)(SD0
+dynamicWindow
)<<16)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1211 } else if((uint32_t)(c
-0x3400)<(0xd800-0x3400) &&
1212 (source
>=sourceLimit
|| (uint32_t)(*source
-0x3400)<(0xd800-0x3400))
1215 * this character is not compressible (a BMP ideograph or similar);
1216 * switch to Unicode mode if this is the last character in the block
1217 * or there is at least one more ideograph following immediately
1219 isSingleByteMode
=FALSE
;
1231 /* normal end of conversion: prepare for a new character */
1233 sourceIndex
=nextSourceIndex
;
1236 if(c
!=0 && targetCapacity
>0) {
1237 goto getTrailUnicode
;
1240 /* state machine for Unicode mode */
1241 /* unicodeByteMode: */
1242 while(source
<sourceLimit
) {
1243 if(targetCapacity
<=0) {
1244 /* target is full */
1245 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1251 if((uint32_t)(c
-0x3400)<(0xd800-0x3400)) {
1252 /* not compressible, write character directly */
1253 if(targetCapacity
>=2) {
1254 *target
++=(uint8_t)(c
>>8);
1255 *target
++=(uint8_t)c
;
1257 *offsets
++=sourceIndex
;
1258 *offsets
++=sourceIndex
;
1265 } else if((uint32_t)(c
-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
1266 /* compress BMP character if the following one is not an uncompressible ideograph */
1267 if(!(source
<sourceLimit
&& (uint32_t)(*source
-0x3400)<(0xd800-0x3400))) {
1268 if(((uint32_t)(c
-0x30)<10 || (uint32_t)(c
-0x61)<26 || (uint32_t)(c
-0x41)<26)) {
1269 /* ASCII digit or letter */
1270 isSingleByteMode
=TRUE
;
1271 c
|=((uint32_t)(UC0
+dynamicWindow
)<<8)|c
;
1274 } else if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1275 /* there is a dynamic window that contains this character, change to it */
1276 isSingleByteMode
=TRUE
;
1277 dynamicWindow
=window
;
1278 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1279 useDynamicWindow(scsu
, dynamicWindow
);
1280 c
=((uint32_t)(UC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1283 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1284 /* define a dynamic window with this character */
1285 isSingleByteMode
=TRUE
;
1286 dynamicWindow
=getNextDynamicWindow(scsu
);
1287 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1288 useDynamicWindow(scsu
, dynamicWindow
);
1289 c
=((uint32_t)(UD0
+dynamicWindow
)<<16)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1295 /* don't know how to compress this character, just write it directly */
1298 } else if(c
<0xe000) {
1299 /* c is a surrogate */
1300 if(U16_IS_SURROGATE_LEAD(c
)) {
1303 if(source
<sourceLimit
) {
1304 /* test the following code unit */
1306 if(U16_IS_TRAIL(trail
)) {
1309 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
1310 /* convert this surrogate code point */
1311 /* exit this condition tree */
1313 /* this is an unmatched lead code unit (1st surrogate) */
1314 /* callback(illegal) */
1315 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1323 /* this is an unmatched trail code unit (2nd surrogate) */
1324 /* callback(illegal) */
1325 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1329 /* compress supplementary character */
1330 if( (window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0 &&
1331 !(source
<sourceLimit
&& (uint32_t)(*source
-0x3400)<(0xd800-0x3400))
1334 * there is a dynamic window that contains this character and
1335 * the following character is not uncompressible,
1336 * change to the window
1338 isSingleByteMode
=TRUE
;
1339 dynamicWindow
=window
;
1340 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1341 useDynamicWindow(scsu
, dynamicWindow
);
1342 c
=((uint32_t)(UC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1345 } else if(source
<sourceLimit
&& lead
==*source
&& /* too lazy to check trail in same window as source[1] */
1346 (code
=getDynamicOffset(c
, &offset
))>=0
1348 /* two supplementary characters in (probably) the same window - define an extended one */
1349 isSingleByteMode
=TRUE
;
1351 dynamicWindow
=getNextDynamicWindow(scsu
);
1352 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1353 useDynamicWindow(scsu
, dynamicWindow
);
1354 c
=((uint32_t)UDX
<<24)|((uint32_t)dynamicWindow
<<21)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1358 /* don't know how to compress this character, just write it directly */
1359 c
=((uint32_t)lead
<<16)|trail
;
1363 } else /* 0xe000<=c<0xf300 */ {
1364 /* quote to avoid SCSU tags */
1370 /* normal end of conversion: prepare for a new character */
1372 sourceIndex
=nextSourceIndex
;
1377 /* set the converter state back into UConverter */
1378 scsu
->fromUIsSingleByteMode
=isSingleByteMode
;
1379 scsu
->fromUDynamicWindow
=dynamicWindow
;
1383 /* write back the updated pointers */
1384 pArgs
->source
=source
;
1385 pArgs
->target
=(char *)target
;
1386 pArgs
->offsets
=offsets
;
1390 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1391 /* from the first if in the loop we know that targetCapacity>0 */
1392 if(length
<=targetCapacity
) {
1395 /* each branch falls through to the next one */
1397 *target
++=(uint8_t)(c
>>24);
1398 case 3: /*fall through*/
1399 *target
++=(uint8_t)(c
>>16);
1400 case 2: /*fall through*/
1401 *target
++=(uint8_t)(c
>>8);
1402 case 1: /*fall through*/
1403 *target
++=(uint8_t)c
;
1405 /* will never occur */
1410 /* each branch falls through to the next one */
1412 *target
++=(uint8_t)(c
>>24);
1413 *offsets
++=sourceIndex
;
1414 case 3: /*fall through*/
1415 *target
++=(uint8_t)(c
>>16);
1416 *offsets
++=sourceIndex
;
1417 case 2: /*fall through*/
1418 *target
++=(uint8_t)(c
>>8);
1419 *offsets
++=sourceIndex
;
1420 case 1: /*fall through*/
1421 *target
++=(uint8_t)c
;
1422 *offsets
++=sourceIndex
;
1424 /* will never occur */
1428 targetCapacity
-=length
;
1430 /* normal end of conversion: prepare for a new character */
1432 sourceIndex
=nextSourceIndex
;
1438 * We actually do this backwards here:
1439 * In order to save an intermediate variable, we output
1440 * first to the overflow buffer what does not fit into the
1443 /* we know that 0<=targetCapacity<length<=4 */
1444 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
1445 length
-=targetCapacity
;
1446 p
=(uint8_t *)cnv
->charErrorBuffer
;
1448 /* each branch falls through to the next one */
1450 *p
++=(uint8_t)(c
>>24);
1451 case 3: /*fall through*/
1452 *p
++=(uint8_t)(c
>>16);
1453 case 2: /*fall through*/
1454 *p
++=(uint8_t)(c
>>8);
1455 case 1: /*fall through*/
1458 /* will never occur */
1461 cnv
->charErrorBufferLength
=(int8_t)length
;
1463 /* now output what fits into the regular target */
1464 c
>>=8*length
; /* length was reduced by targetCapacity */
1465 switch(targetCapacity
) {
1466 /* each branch falls through to the next one */
1468 *target
++=(uint8_t)(c
>>16);
1470 *offsets
++=sourceIndex
;
1472 case 2: /*fall through*/
1473 *target
++=(uint8_t)(c
>>8);
1475 *offsets
++=sourceIndex
;
1477 case 1: /*fall through*/
1478 *target
++=(uint8_t)c
;
1480 *offsets
++=sourceIndex
;
1486 /* target overflow */
1488 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1495 * Identical to _SCSUFromUnicodeWithOffsets but without offset handling.
1496 * If a change is made in the original function, then either
1497 * change this function the same way or
1498 * re-copy the original function and remove the variables
1499 * offsets, sourceIndex, and nextSourceIndex.
/*
 * Convert UTF-16 code units from pArgs->source into SCSU bytes written to
 * pArgs->target, without recording source offsets.
 *
 * The encoder state (single-byte vs. Unicode mode and the selected dynamic
 * window) is read from the SCSUData at cnv->extraInfo on entry and stored
 * back before returning; pArgs->source and pArgs->target are updated too.
 *
 * pArgs       conversion arguments (converter, source/target pointers and limits)
 * pErrorCode  set to U_BUFFER_OVERFLOW_ERROR when the target is full and to
 *             U_ILLEGAL_CHAR_FOUND for an unmatched surrogate code unit
 */
1502 _SCSUFromUnicode(UConverterFromUnicodeArgs
*pArgs
,
1503 UErrorCode
*pErrorCode
) {
1506 const UChar
*source
, *sourceLimit
;
1508 int32_t targetCapacity
;
1510 UBool isSingleByteMode
;
1511 uint8_t dynamicWindow
;
1512 uint32_t currentOffset
;
1518 /* variables for compression heuristics */
1524 /* set up the local pointers */
1525 cnv
=pArgs
->converter
;
1526 scsu
=(SCSUData
*)cnv
->extraInfo
;
1528 /* set up the local pointers */
1529 source
=pArgs
->source
;
1530 sourceLimit
=pArgs
->sourceLimit
;
1531 target
=(uint8_t *)pArgs
->target
;
/* number of bytes still free between the target pointer and targetLimit */
1532 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
1534 /* get the state machine state */
1535 isSingleByteMode
=scsu
->fromUIsSingleByteMode
;
1536 dynamicWindow
=scsu
->fromUDynamicWindow
;
/* cache the offset of the selected dynamic window; kept in sync whenever dynamicWindow changes below */
1537 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1541 /* similar conversion "loop" as in toUnicode */
1543 if(isSingleByteMode
) {
/* NOTE(review): a non-zero c presumably is a lead surrogate carried over from an earlier call - confirm where c is initialized */
1544 if(c
!=0 && targetCapacity
>0) {
1545 goto getTrailSingle
;
1548 /* state machine for single-byte mode */
1549 /* singleByteMode: */
1550 while(source
<sourceLimit
) {
1551 if(targetCapacity
<=0) {
1552 /* target is full */
1553 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
/* range test 0x20<=c<=0x7f in one comparison; presumably c is unsigned so smaller values wrap around - TODO confirm */
1558 if((c
-0x20)<=0x5f) {
1559 /* pass US-ASCII graphic character through */
1560 *target
++=(uint8_t)c
;
1563 if((1UL<<c
)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1565 *target
++=(uint8_t)c
;
1568 /* quote C0 control character */
/* delta<=0x7f means c lies within the 128 code points starting at currentOffset */
1573 } else if((delta
=c
-currentOffset
)<=0x7f) {
1574 /* use the current dynamic window */
1575 *target
++=(uint8_t)(delta
|0x80);
1577 } else if(U16_IS_SURROGATE(c
)) {
1578 if(U16_IS_SURROGATE_LEAD(c
)) {
1581 if(source
<sourceLimit
) {
1582 /* test the following code unit */
1584 if(U16_IS_TRAIL(trail
)) {
1586 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
1587 /* convert this surrogate code point */
1588 /* exit this condition tree */
1590 /* this is an unmatched lead code unit (1st surrogate) */
1591 /* callback(illegal) */
1592 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1600 /* this is an unmatched trail code unit (2nd surrogate) */
1601 /* callback(illegal) */
1602 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1606 /* compress supplementary character U+10000..U+10ffff */
1607 if((delta
=c
-currentOffset
)<=0x7f) {
1608 /* use the current dynamic window */
1609 *target
++=(uint8_t)(delta
|0x80);
1611 } else if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1612 /* there is a dynamic window that contains this character, change to it */
1613 dynamicWindow
=window
;
1614 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1615 useDynamicWindow(scsu
, dynamicWindow
);
/* pack the two output bytes into c: SCn command, then the window-relative byte with bit 7 set */
1616 c
=((uint32_t)(SC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1619 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1620 /* might check if there are more characters in this window to come */
1621 /* define an extended window with this character */
1623 dynamicWindow
=getNextDynamicWindow(scsu
);
1624 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1625 useDynamicWindow(scsu
, dynamicWindow
);
/* pack four bytes into c: SDX, the window number at bit 21, the offset code at bit 8, and the window-relative byte */
1626 c
=((uint32_t)SDX
<<24)|((uint32_t)dynamicWindow
<<21)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1630 /* change to Unicode mode and output this (lead, trail) pair */
1631 isSingleByteMode
=FALSE
;
1632 *target
++=(uint8_t)SCU
;
1634 c
=((uint32_t)lead
<<16)|trail
;
1639 /* quote C1 control character */
/* << binds tighter than |, so this is (c&0x7f) | ((SQ0+1)<<8) */
1640 c
=(c
&0x7f)|(SQ0
+1)<<8; /* SQ0+1==SQ1 */
1643 } else if(c
==0xfeff || c
>=0xfff0) {
1644 /* quote signature character=byte order mark and specials */
1649 /* compress all other BMP characters */
1650 if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1651 /* there is a window defined that contains this character - switch to it or quote from it? */
/* switch when this is the last code unit or when the next one passes isInOffsetWindowOrDirect() for this window */
1652 if(source
>=sourceLimit
|| isInOffsetWindowOrDirect(scsu
->fromUDynamicOffsets
[window
], *source
)) {
1653 /* change to dynamic window */
1654 dynamicWindow
=window
;
1655 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1656 useDynamicWindow(scsu
, dynamicWindow
);
1657 c
=((uint32_t)(SC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1661 /* quote from dynamic window */
/* quoting leaves dynamicWindow and currentOffset unchanged */
1662 c
=((uint32_t)(SQ0
+window
)<<8)|(c
-scsu
->fromUDynamicOffsets
[window
])|0x80;
1666 } else if((window
=getWindow(staticOffsets
, c
))>=0) {
1667 /* quote from static window */
/* unlike the dynamic-window quote above, 0x80 is not OR'ed into the data byte here */
1668 c
=((uint32_t)(SQ0
+window
)<<8)|(c
-staticOffsets
[window
]);
1671 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1672 /* define a dynamic window with this character */
1673 dynamicWindow
=getNextDynamicWindow(scsu
);
1674 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1675 useDynamicWindow(scsu
, dynamicWindow
);
/* pack three bytes into c: SDn command, offset code, window-relative byte with bit 7 set */
1676 c
=((uint32_t)(SD0
+dynamicWindow
)<<16)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
/* (uint32_t)(x-0x3400)<(0xd800-0x3400) tests 0x3400<=x<0xd800 with a single unsigned comparison */
1679 } else if((uint32_t)(c
-0x3400)<(0xd800-0x3400) &&
1680 (source
>=sourceLimit
|| (uint32_t)(*source
-0x3400)<(0xd800-0x3400))
1683 * this character is not compressible (a BMP ideograph or similar);
1684 * switch to Unicode mode if this is the last character in the block
1685 * or there is at least one more ideograph following immediately
1687 isSingleByteMode
=FALSE
;
1699 /* normal end of conversion: prepare for a new character */
/* Unicode-mode counterpart of the pending-character check made at the start of single-byte mode */
1703 if(c
!=0 && targetCapacity
>0) {
1704 goto getTrailUnicode
;
1707 /* state machine for Unicode mode */
1708 /* unicodeByteMode: */
1709 while(source
<sourceLimit
) {
1710 if(targetCapacity
<=0) {
1711 /* target is full */
1712 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1717 if((uint32_t)(c
-0x3400)<(0xd800-0x3400)) {
1718 /* not compressible, write character directly */
/* fast path: both bytes fit, write them high byte first */
1719 if(targetCapacity
>=2) {
1720 *target
++=(uint8_t)(c
>>8);
1721 *target
++=(uint8_t)c
;
1727 } else if((uint32_t)(c
-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
1728 /* compress BMP character if the following one is not an uncompressible ideograph */
1729 if(!(source
<sourceLimit
&& (uint32_t)(*source
-0x3400)<(0xd800-0x3400))) {
1730 if(((uint32_t)(c
-0x30)<10 || (uint32_t)(c
-0x61)<26 || (uint32_t)(c
-0x41)<26)) {
1731 /* ASCII digit or letter */
1732 isSingleByteMode
=TRUE
;
/* NOTE(review): "c |= ... | c" ORs c into itself; the trailing "|c" is redundant but harmless */
1733 c
|=((uint32_t)(UC0
+dynamicWindow
)<<8)|c
;
1736 } else if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1737 /* there is a dynamic window that contains this character, change to it */
1738 isSingleByteMode
=TRUE
;
1739 dynamicWindow
=window
;
1740 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1741 useDynamicWindow(scsu
, dynamicWindow
);
1742 c
=((uint32_t)(UC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1745 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1746 /* define a dynamic window with this character */
1747 isSingleByteMode
=TRUE
;
1748 dynamicWindow
=getNextDynamicWindow(scsu
);
1749 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1750 useDynamicWindow(scsu
, dynamicWindow
);
1751 c
=((uint32_t)(UD0
+dynamicWindow
)<<16)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1757 /* don't know how to compress this character, just write it directly */
1760 } else if(c
<0xe000) {
1761 /* c is a surrogate */
1762 if(U16_IS_SURROGATE_LEAD(c
)) {
1765 if(source
<sourceLimit
) {
1766 /* test the following code unit */
1768 if(U16_IS_TRAIL(trail
)) {
1770 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
1771 /* convert this surrogate code point */
1772 /* exit this condition tree */
1774 /* this is an unmatched lead code unit (1st surrogate) */
1775 /* callback(illegal) */
1776 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1784 /* this is an unmatched trail code unit (2nd surrogate) */
1785 /* callback(illegal) */
1786 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1790 /* compress supplementary character */
1791 if( (window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0 &&
1792 !(source
<sourceLimit
&& (uint32_t)(*source
-0x3400)<(0xd800-0x3400))
1795 * there is a dynamic window that contains this character and
1796 * the following character is not uncompressible,
1797 * change to the window
1799 isSingleByteMode
=TRUE
;
1800 dynamicWindow
=window
;
1801 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1802 useDynamicWindow(scsu
, dynamicWindow
);
1803 c
=((uint32_t)(UC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
/* heuristic: the next code unit equals this pair's lead surrogate, so another supplementary character from the same range is likely */
1806 } else if(source
<sourceLimit
&& lead
==*source
&& /* too lazy to check trail in same window as source[1] */
1807 (code
=getDynamicOffset(c
, &offset
))>=0
1809 /* two supplementary characters in (probably) the same window - define an extended one */
1810 isSingleByteMode
=TRUE
;
1812 dynamicWindow
=getNextDynamicWindow(scsu
);
1813 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1814 useDynamicWindow(scsu
, dynamicWindow
);
1815 c
=((uint32_t)UDX
<<24)|((uint32_t)dynamicWindow
<<21)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1819 /* don't know how to compress this character, just write it directly */
1820 c
=((uint32_t)lead
<<16)|trail
;
1824 } else /* 0xe000<=c<0xf300 */ {
1825 /* quote to avoid SCSU tags */
1831 /* normal end of conversion: prepare for a new character */
1837 /* set the converter state back into UConverter */
1838 scsu
->fromUIsSingleByteMode
=isSingleByteMode
;
1839 scsu
->fromUDynamicWindow
=dynamicWindow
;
1843 /* write back the updated pointers */
1844 pArgs
->source
=source
;
1845 pArgs
->target
=(char *)target
;
1849 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1850 /* from the first if in the loop we know that targetCapacity>0 */
/* everything fits: emit the bytes packed in c, most significant of the length bytes first */
1851 if(length
<=targetCapacity
) {
1853 /* each branch falls through to the next one */
1855 *target
++=(uint8_t)(c
>>24);
1856 case 3: /*fall through*/
1857 *target
++=(uint8_t)(c
>>16);
1858 case 2: /*fall through*/
1859 *target
++=(uint8_t)(c
>>8);
1860 case 1: /*fall through*/
1861 *target
++=(uint8_t)c
;
1863 /* will never occur */
1866 targetCapacity
-=length
;
1868 /* normal end of conversion: prepare for a new character */
1875 * We actually do this backwards here:
1876 * In order to save an intermediate variable, we output
1877 * first to the overflow buffer what does not fit into the
1880 /* we know that 0<=targetCapacity<length<=4 */
1881 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
/* from here on length counts only the bytes that do not fit and go to cnv->charErrorBuffer */
1882 length
-=targetCapacity
;
1883 p
=(uint8_t *)cnv
->charErrorBuffer
;
1885 /* each branch falls through to the next one */
1887 *p
++=(uint8_t)(c
>>24);
1888 case 3: /*fall through*/
1889 *p
++=(uint8_t)(c
>>16);
1890 case 2: /*fall through*/
1891 *p
++=(uint8_t)(c
>>8);
1892 case 1: /*fall through*/
1895 /* will never occur */
1898 cnv
->charErrorBufferLength
=(int8_t)length
;
1900 /* now output what fits into the regular target */
/* shift out the low bytes that were just stored in the overflow buffer */
1901 c
>>=8*length
; /* length was reduced by targetCapacity */
1902 switch(targetCapacity
) {
1903 /* each branch falls through to the next one */
1905 *target
++=(uint8_t)(c
>>16);
1906 case 2: /*fall through*/
1907 *target
++=(uint8_t)(c
>>8);
1908 case 1: /*fall through*/
1909 *target
++=(uint8_t)c
;
1914 /* target overflow */
1916 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1922 /* miscellaneous ------------------------------------------------------------ */
/*
 * Return the converter name for cnv as a string literal.
 * The name is chosen by switching on the locale stored in the SCSUData
 * at cnv->extraInfo.
 */
1925 _SCSUGetName(const UConverter
*cnv
) {
1926 SCSUData
*scsu
=(SCSUData
*)cnv
->extraInfo
;
/* the locale setting is part of the reported name */
1928 switch(scsu
->locale
) {
1930 return "SCSU,locale=ja";
1936 /* structure for SafeClone calculations */
/* _SCSUSafeClone() overlays this on the caller's buffer and uses its members cnv and mydata */
1937 struct cloneSCSUStruct
/*
 * Clone the SCSU-specific part of a converter into caller-provided memory.
 *
 * cnv          converter whose extraInfo (SCSUData) is copied
 * pBufferSize  size of the caller's buffer; when it is 0 the required size,
 *              sizeof(struct cloneSCSUStruct), is stored into it
 *
 * Returns a pointer to the UConverter embedded in the clone structure.
 */
1944 _SCSUSafeClone(const UConverter
*cnv
,
1946 int32_t *pBufferSize
,
1949 struct cloneSCSUStruct
* localClone
;
1950 int32_t bufferSizeNeeded
= sizeof(struct cloneSCSUStruct
);
1952 if (U_FAILURE(*status
)){
1956 if (*pBufferSize
== 0){ /* 'preflighting' request - set needed size into *pBufferSize */
1957 *pBufferSize
= bufferSizeNeeded
;
/* treat the caller's buffer as the clone structure */
1961 localClone
= (struct cloneSCSUStruct
*)stackBuffer
;
1962 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
/* copy the SCSU state by value so the clone does not share it with the original */
1964 uprv_memcpy(&localClone
->mydata
, cnv
->extraInfo
, sizeof(SCSUData
));
/* point the cloned converter at its own copy of the state */
1965 localClone
->cnv
.extraInfo
= &localClone
->mydata
;
/* NOTE(review): presumably marks extraInfo as living inside the clone's own memory so it is not freed separately - confirm against ucnv.c */
1966 localClone
->cnv
.isExtraLocal
= TRUE
;
1968 return &localClone
->cnv
;
/* implementation function table (UConverterImpl) for the SCSU converter; referenced by _SCSUData */
1972 static const UConverterImpl _SCSUImpl
={
1983 _SCSUToUnicodeWithOffsets
,
1985 _SCSUFromUnicodeWithOffsets
,
1992 ucnv_getCompleteUnicodeSet
/* static converter description (UConverterStaticData) for SCSU; referenced by _SCSUData */
1995 static const UConverterStaticData _SCSUStaticData
={
1996 sizeof(UConverterStaticData
),
1998 1212, /* CCSID for SCSU */
1999 UCNV_IBM
, UCNV_SCSU
,
2000 1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */
2002 * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode
2003 * substitution string.
/* 0x0e is the SQU command (quote a single Unicode character), followed by the two bytes of U+fffd; length 3 */
2005 { 0x0e, 0xff, 0xfd, 0 }, 3,
2009 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
2012 const UConverterSharedData _SCSUData
={
2013 sizeof(UConverterSharedData
), ~((uint32_t)0),
2014 NULL
, NULL
, &_SCSUStaticData
, FALSE
, &_SCSUImpl
,