2 ******************************************************************************
4 * Copyright (C) 2000-2006, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
8 * file name: ucnvscsu.c
10 * tab size: 8 (not used)
13 * created on: 2000nov18
14 * created by: Markus W. Scherer
16 * This is an implementation of the Standard Compression Scheme for Unicode
17 * as defined in http://www.unicode.org/unicode/reports/tr6/ .
18 * Reserved commands and window settings are treated as illegal sequences and
19 * will result in callback calls.
22 #include "unicode/utypes.h"
24 #if !UCONFIG_NO_CONVERSION
26 #include "unicode/ucnv.h"
27 #include "unicode/ucnv_cb.h"
32 /* SCSU definitions --------------------------------------------------------- */
34 /* SCSU command byte values */
36 SQ0
=0x01, /* Quote from window pair 0 */
37 SQ7
=0x08, /* Quote from window pair 7 */
38 SDX
=0x0B, /* Define a window as extended */
39 Srs
=0x0C, /* reserved */
40 SQU
=0x0E, /* Quote a single Unicode character */
41 SCU
=0x0F, /* Change to Unicode mode */
42 SC0
=0x10, /* Select window 0 */
43 SC7
=0x17, /* Select window 7 */
44 SD0
=0x18, /* Define and select window 0 */
45 SD7
=0x1F, /* Define and select window 7 */
47 UC0
=0xE0, /* Select window 0 */
48 UC7
=0xE7, /* Select window 7 */
49 UD0
=0xE8, /* Define and select window 0 */
50 UD7
=0xEF, /* Define and select window 7 */
51 UQU
=0xF0, /* Quote a single Unicode character */
52 UDX
=0xF1, /* Define a Window as extended */
53 Urs
=0xF2 /* reserved */
58 * Unicode code points from 3400 to E000 are not addressable by
59 * dynamic window, since in these areas no short run alphabets are
60 * found. Therefore add gapOffset to all values from gapThreshold.
65 /* values between reservedStart and fixedThreshold are reserved */
68 /* use table of predefined fixed offsets for values from fixedThreshold */
72 /* constant offsets for the 8 static windows */
73 static const uint32_t staticOffsets
[8]={
74 0x0000, /* ASCII for quoted tags */
75 0x0080, /* Latin - 1 Supplement (for access to punctuation) */
76 0x0100, /* Latin Extended-A */
77 0x0300, /* Combining Diacritical Marks */
78 0x2000, /* General Punctuation */
79 0x2080, /* Currency Symbols */
80 0x2100, /* Letterlike Symbols and Number Forms */
81 0x3000 /* CJK Symbols and punctuation */
84 /* initial offsets for the 8 dynamic (sliding) windows */
85 static const uint32_t initialDynamicOffsets
[8]={
87 0x00C0, /* Latin Extended A */
88 0x0400, /* Cyrillic */
90 0x0900, /* Devanagari */
91 0x3040, /* Hiragana */
92 0x30A0, /* Katakana */
93 0xFF00 /* Fullwidth ASCII */
96 /* Table of fixed predefined Offsets */
97 static const uint32_t fixedOffsets
[]={
98 /* 0xF9 */ 0x00C0, /* Latin-1 Letters + half of Latin Extended A */
99 /* 0xFA */ 0x0250, /* IPA extensions */
100 /* 0xFB */ 0x0370, /* Greek */
101 /* 0xFC */ 0x0530, /* Armenian */
102 /* 0xFD */ 0x3040, /* Hiragana */
103 /* 0xFE */ 0x30A0, /* Katakana */
104 /* 0xFF */ 0xFF60 /* Halfwidth Katakana */
118 typedef struct SCSUData
{
119 /* dynamic window offsets, initialize to default values from initialDynamicOffsets */
120 uint32_t toUDynamicOffsets
[8];
121 uint32_t fromUDynamicOffsets
[8];
123 /* state machine state - toUnicode */
124 UBool toUIsSingleByteMode
;
126 int8_t toUQuoteWindow
, toUDynamicWindow
;
128 uint8_t toUPadding
[3];
130 /* state machine state - fromUnicode */
131 UBool fromUIsSingleByteMode
;
132 int8_t fromUDynamicWindow
;
135 * windowUse[] keeps track of the use of the dynamic windows:
136 * At nextWindowUseIndex there is the least recently used window,
137 * and the following windows (in a wrapping manner) are more and more recently used.
139 * At nextWindowUseIndex-1 there is the most recently used window.
142 int8_t nextWindowUseIndex
;
146 static const int8_t initialWindowUse
[8]={ 7, 0, 3, 2, 4, 5, 6, 1 };
147 static const int8_t initialWindowUse_ja
[8]={ 3, 2, 4, 1, 0, 7, 5, 6 };
153 /* SCSU setup functions ----------------------------------------------------- */
/*
 * Reset the SCSU state kept in cnv->extraInfo to its initial values.
 *
 * cnv:    the converter whose extraInfo points to an SCSUData block.
 * choice: selects the direction(s) to reset. The toUnicode state is reset
 *         when choice<=UCNV_RESET_TO_UNICODE, the fromUnicode state when
 *         choice!=UCNV_RESET_TO_UNICODE.
 *
 * NOTE(review): the "<=" test relies on the declaration order of
 * UConverterResetChoice, which is not visible here - confirm that the
 * "reset both" value sorts before UCNV_RESET_TO_UNICODE.
 */
156 _SCSUReset(UConverter
*cnv
, UConverterResetChoice choice
) {
157 SCSUData
*scsu
=(SCSUData
*)cnv
->extraInfo
;
159 if(choice
<=UCNV_RESET_TO_UNICODE
) {
160 /* reset toUnicode */
/* 32 bytes: the 8 uint32_t entries of the dynamic offsets array */
161 uprv_memcpy(scsu
->toUDynamicOffsets
, initialDynamicOffsets
, 32);
/* decoding starts in single-byte mode, expecting a command/character byte */
163 scsu
->toUIsSingleByteMode
=TRUE
;
164 scsu
->toUState
=readCommand
;
165 scsu
->toUQuoteWindow
=scsu
->toUDynamicWindow
=0;
170 if(choice
!=UCNV_RESET_TO_UNICODE
) {
171 /* reset fromUnicode */
/* 32 bytes: the 8 uint32_t entries of the dynamic offsets array */
172 uprv_memcpy(scsu
->fromUDynamicOffsets
, initialDynamicOffsets
, 32);
/* encoding starts in single-byte mode with dynamic window 0 selected */
174 scsu
->fromUIsSingleByteMode
=TRUE
;
175 scsu
->fromUDynamicWindow
=0;
/* restart the window-reuse order; the initial order depends on scsu->locale */
177 scsu
->nextWindowUseIndex
=0;
178 switch(scsu
->locale
) {
/* 8 bytes: one int8_t entry per dynamic window */
180 uprv_memcpy(scsu
->windowUse
, initialWindowUse_ja
, 8);
/* 8 bytes: one int8_t entry per dynamic window */
183 uprv_memcpy(scsu
->windowUse
, initialWindowUse
, 8);
/*
 * Set up a newly opened SCSU converter.
 *
 * Allocates the SCSUData state block and stores it in cnv->extraInfo.
 * On success the locale variant is chosen (Japanese when locale is "ja" or
 * starts with "ja_", generic otherwise) and both conversion directions are
 * reset via _SCSUReset(). If the allocation fails, *pErrorCode is set to
 * U_MEMORY_ALLOCATION_ERROR.
 * The first substitution UChar of the converter is set to U+FFFD.
 */
192 _SCSUOpen(UConverter
*cnv
,
196 UErrorCode
*pErrorCode
) {
197 cnv
->extraInfo
=uprv_malloc(sizeof(SCSUData
));
198 if(cnv
->extraInfo
!=NULL
) {
/* match exactly "ja" or a "ja_..." prefix; locale[2] is only read after [0] and [1] matched */
199 if(locale
!=NULL
&& locale
[0]=='j' && locale
[1]=='a' && (locale
[2]==0 || locale
[2]=='_')) {
200 ((SCSUData
*)cnv
->extraInfo
)->locale
=l_ja
;
202 ((SCSUData
*)cnv
->extraInfo
)->locale
=lGeneric
;
/* the locale must be set before the reset because _SCSUReset() reads it */
204 _SCSUReset(cnv
, UCNV_RESET_BOTH
);
/* allocation failed: report it to the caller */
206 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
209 /* Set the substitution character U+fffd as a Unicode string. */
210 cnv
->subUChars
[0]=0xfffd;
215 _SCSUClose(UConverter
*cnv
) {
216 if(cnv
->extraInfo
!=NULL
) {
217 if(!cnv
->isExtraLocal
) {
218 uprv_free(cnv
->extraInfo
);
224 /* SCSU-to-Unicode conversion functions ------------------------------------- */
227 _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
228 UErrorCode
*pErrorCode
) {
231 const uint8_t *source
, *sourceLimit
;
233 const UChar
*targetLimit
;
235 UBool isSingleByteMode
;
236 uint8_t state
, byteOne
;
237 int8_t quoteWindow
, dynamicWindow
;
239 int32_t sourceIndex
, nextSourceIndex
;
243 /* set up the local pointers */
244 cnv
=pArgs
->converter
;
245 scsu
=(SCSUData
*)cnv
->extraInfo
;
247 source
=(const uint8_t *)pArgs
->source
;
248 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
249 target
=pArgs
->target
;
250 targetLimit
=pArgs
->targetLimit
;
251 offsets
=pArgs
->offsets
;
253 /* get the state machine state */
254 isSingleByteMode
=scsu
->toUIsSingleByteMode
;
255 state
=scsu
->toUState
;
256 quoteWindow
=scsu
->toUQuoteWindow
;
257 dynamicWindow
=scsu
->toUDynamicWindow
;
258 byteOne
=scsu
->toUByteOne
;
260 /* sourceIndex=-1 if the current character began in the previous buffer */
261 sourceIndex
=state
==readCommand
? 0 : -1;
267 * For performance, this is not a normal C loop.
268 * Instead, there are two code blocks for the two SCSU modes.
269 * The function branches to either one, and a change of the mode is done with a goto to the other branch.
272 * Each branch has two conventional loops:
273 * - a fast-path loop for the most common codes in the mode
274 * - a loop for all other codes in the mode
275 * When the fast-path runs into a code that it cannot handle, its loop ends and it
276 * runs into the following loop to handle the other codes.
277 * The end of the input or output buffer is also handled by the slower loop.
278 * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
280 * The callback handling is done by returning with an error code.
281 * The conversion framework actually calls the callback function.
283 if(isSingleByteMode
) {
284 /* fast path for single-byte mode */
285 if(state
==readCommand
) {
287 while(source
<sourceLimit
&& target
<targetLimit
&& (b
=*source
)>=0x20) {
291 /* write US-ASCII graphic character or DEL */
294 *offsets
++=sourceIndex
;
297 /* write from dynamic window */
298 uint32_t c
=scsu
->toUDynamicOffsets
[dynamicWindow
]+(b
&0x7f);
302 *offsets
++=sourceIndex
;
305 /* output surrogate pair */
306 *target
++=(UChar
)(0xd7c0+(c
>>10));
307 if(target
<targetLimit
) {
308 *target
++=(UChar
)(0xdc00|(c
&0x3ff));
310 *offsets
++=sourceIndex
;
311 *offsets
++=sourceIndex
;
314 /* target overflow */
316 *offsets
++=sourceIndex
;
318 cnv
->UCharErrorBuffer
[0]=(UChar
)(0xdc00|(c
&0x3ff));
319 cnv
->UCharErrorBufferLength
=1;
320 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
325 sourceIndex
=nextSourceIndex
;
329 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
331 while(source
<sourceLimit
) {
332 if(target
>=targetLimit
) {
334 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
341 /* redundant conditions are commented out */
342 /* here: b<0x20 because otherwise we would be in fastSingle */
343 if((1UL<<b
)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
347 *offsets
++=sourceIndex
;
349 sourceIndex
=nextSourceIndex
;
353 dynamicWindow
=(int8_t)(b
-SC0
);
354 sourceIndex
=nextSourceIndex
;
356 } else /* if(SD0<=b && b<=SD7) */ {
357 dynamicWindow
=(int8_t)(b
-SD0
);
360 } else if(/* SQ0<=b && */ b
<=SQ7
) {
361 quoteWindow
=(int8_t)(b
-SQ0
);
368 sourceIndex
=nextSourceIndex
;
369 isSingleByteMode
=FALSE
;
372 /* callback(illegal) */
373 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
379 /* store the first byte of a multibyte sequence in toUBytes[] */
390 *target
++=(UChar
)((byteOne
<<8)|b
);
392 *offsets
++=sourceIndex
;
394 sourceIndex
=nextSourceIndex
;
399 /* all static offsets are in the BMP */
400 *target
++=(UChar
)(staticOffsets
[quoteWindow
]+b
);
402 *offsets
++=sourceIndex
;
405 /* write from dynamic window */
406 uint32_t c
=scsu
->toUDynamicOffsets
[quoteWindow
]+(b
&0x7f);
410 *offsets
++=sourceIndex
;
413 /* output surrogate pair */
414 *target
++=(UChar
)(0xd7c0+(c
>>10));
415 if(target
<targetLimit
) {
416 *target
++=(UChar
)(0xdc00|(c
&0x3ff));
418 *offsets
++=sourceIndex
;
419 *offsets
++=sourceIndex
;
422 /* target overflow */
424 *offsets
++=sourceIndex
;
426 cnv
->UCharErrorBuffer
[0]=(UChar
)(0xdc00|(c
&0x3ff));
427 cnv
->UCharErrorBufferLength
=1;
428 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
433 sourceIndex
=nextSourceIndex
;
437 dynamicWindow
=(int8_t)((b
>>5)&7);
438 byteOne
=(uint8_t)(b
&0x1f);
444 scsu
->toUDynamicOffsets
[dynamicWindow
]=0x10000+(byteOne
<<15UL | b
<<7UL);
445 sourceIndex
=nextSourceIndex
;
450 /* callback(illegal): Reserved window offset value 0 */
454 } else if(b
<gapThreshold
) {
455 scsu
->toUDynamicOffsets
[dynamicWindow
]=b
<<7UL;
456 } else if((uint8_t)(b
-gapThreshold
)<(reservedStart
-gapThreshold
)) {
457 scsu
->toUDynamicOffsets
[dynamicWindow
]=(b
<<7UL)+gapOffset
;
458 } else if(b
>=fixedThreshold
) {
459 scsu
->toUDynamicOffsets
[dynamicWindow
]=fixedOffsets
[b
-fixedThreshold
];
461 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
466 sourceIndex
=nextSourceIndex
;
472 /* fast path for Unicode mode */
473 if(state
==readCommand
) {
475 while(source
+1<sourceLimit
&& target
<targetLimit
&& (uint8_t)((b
=*source
)-UC0
)>(Urs
-UC0
)) {
476 *target
++=(UChar
)((b
<<8)|source
[1]);
478 *offsets
++=sourceIndex
;
480 sourceIndex
=nextSourceIndex
;
486 /* normal state machine for Unicode mode */
487 /* unicodeByteMode: */
488 while(source
<sourceLimit
) {
489 if(target
>=targetLimit
) {
491 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
498 if((uint8_t)(b
-UC0
)>(Urs
-UC0
)) {
503 } else if(/* UC0<=b && */ b
<=UC7
) {
504 dynamicWindow
=(int8_t)(b
-UC0
);
505 sourceIndex
=nextSourceIndex
;
506 isSingleByteMode
=TRUE
;
508 } else if(/* UD0<=b && */ b
<=UD7
) {
509 dynamicWindow
=(int8_t)(b
-UD0
);
510 isSingleByteMode
=TRUE
;
516 isSingleByteMode
=TRUE
;
526 /* callback(illegal) */
527 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
540 *target
++=(UChar
)((byteOne
<<8)|b
);
542 *offsets
++=sourceIndex
;
544 sourceIndex
=nextSourceIndex
;
552 /* set the converter state back into UConverter */
553 if(U_FAILURE(*pErrorCode
) && *pErrorCode
!=U_BUFFER_OVERFLOW_ERROR
) {
554 /* reset to deal with the next character */
556 } else if(state
==readCommand
) {
557 /* not in a multi-byte sequence, reset toULength */
560 scsu
->toUIsSingleByteMode
=isSingleByteMode
;
561 scsu
->toUState
=state
;
562 scsu
->toUQuoteWindow
=quoteWindow
;
563 scsu
->toUDynamicWindow
=dynamicWindow
;
564 scsu
->toUByteOne
=byteOne
;
566 /* write back the updated pointers */
567 pArgs
->source
=(const char *)source
;
568 pArgs
->target
=target
;
569 pArgs
->offsets
=offsets
;
574 * Identical to _SCSUToUnicodeWithOffsets but without offset handling.
575 * If a change is made in the original function, then either
576 * change this function the same way or
577 * re-copy the original function and remove the variables
578 * offsets, sourceIndex, and nextSourceIndex.
581 _SCSUToUnicode(UConverterToUnicodeArgs
*pArgs
,
582 UErrorCode
*pErrorCode
) {
585 const uint8_t *source
, *sourceLimit
;
587 const UChar
*targetLimit
;
588 UBool isSingleByteMode
;
589 uint8_t state
, byteOne
;
590 int8_t quoteWindow
, dynamicWindow
;
594 /* set up the local pointers */
595 cnv
=pArgs
->converter
;
596 scsu
=(SCSUData
*)cnv
->extraInfo
;
598 source
=(const uint8_t *)pArgs
->source
;
599 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
600 target
=pArgs
->target
;
601 targetLimit
=pArgs
->targetLimit
;
603 /* get the state machine state */
604 isSingleByteMode
=scsu
->toUIsSingleByteMode
;
605 state
=scsu
->toUState
;
606 quoteWindow
=scsu
->toUQuoteWindow
;
607 dynamicWindow
=scsu
->toUDynamicWindow
;
608 byteOne
=scsu
->toUByteOne
;
613 * For performance, this is not a normal C loop.
614 * Instead, there are two code blocks for the two SCSU modes.
615 * The function branches to either one, and a change of the mode is done with a goto to the other branch.
618 * Each branch has two conventional loops:
619 * - a fast-path loop for the most common codes in the mode
620 * - a loop for all other codes in the mode
621 * When the fast-path runs into a code that it cannot handle, its loop ends and it
622 * runs into the following loop to handle the other codes.
623 * The end of the input or output buffer is also handled by the slower loop.
624 * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
626 * The callback handling is done by returning with an error code.
627 * The conversion framework actually calls the callback function.
629 if(isSingleByteMode
) {
630 /* fast path for single-byte mode */
631 if(state
==readCommand
) {
633 while(source
<sourceLimit
&& target
<targetLimit
&& (b
=*source
)>=0x20) {
636 /* write US-ASCII graphic character or DEL */
639 /* write from dynamic window */
640 uint32_t c
=scsu
->toUDynamicOffsets
[dynamicWindow
]+(b
&0x7f);
644 /* output surrogate pair */
645 *target
++=(UChar
)(0xd7c0+(c
>>10));
646 if(target
<targetLimit
) {
647 *target
++=(UChar
)(0xdc00|(c
&0x3ff));
649 /* target overflow */
650 cnv
->UCharErrorBuffer
[0]=(UChar
)(0xdc00|(c
&0x3ff));
651 cnv
->UCharErrorBufferLength
=1;
652 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
660 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
662 while(source
<sourceLimit
) {
663 if(target
>=targetLimit
) {
665 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
671 /* redundant conditions are commented out */
672 /* here: b<0x20 because otherwise we would be in fastSingle */
673 if((1UL<<b
)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
679 dynamicWindow
=(int8_t)(b
-SC0
);
681 } else /* if(SD0<=b && b<=SD7) */ {
682 dynamicWindow
=(int8_t)(b
-SD0
);
685 } else if(/* SQ0<=b && */ b
<=SQ7
) {
686 quoteWindow
=(int8_t)(b
-SQ0
);
693 isSingleByteMode
=FALSE
;
696 /* callback(illegal) */
697 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
703 /* store the first byte of a multibyte sequence in toUBytes[] */
714 *target
++=(UChar
)((byteOne
<<8)|b
);
719 /* all static offsets are in the BMP */
720 *target
++=(UChar
)(staticOffsets
[quoteWindow
]+b
);
722 /* write from dynamic window */
723 uint32_t c
=scsu
->toUDynamicOffsets
[quoteWindow
]+(b
&0x7f);
727 /* output surrogate pair */
728 *target
++=(UChar
)(0xd7c0+(c
>>10));
729 if(target
<targetLimit
) {
730 *target
++=(UChar
)(0xdc00|(c
&0x3ff));
732 /* target overflow */
733 cnv
->UCharErrorBuffer
[0]=(UChar
)(0xdc00|(c
&0x3ff));
734 cnv
->UCharErrorBufferLength
=1;
735 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
743 dynamicWindow
=(int8_t)((b
>>5)&7);
744 byteOne
=(uint8_t)(b
&0x1f);
750 scsu
->toUDynamicOffsets
[dynamicWindow
]=0x10000+(byteOne
<<15UL | b
<<7UL);
755 /* callback(illegal): Reserved window offset value 0 */
759 } else if(b
<gapThreshold
) {
760 scsu
->toUDynamicOffsets
[dynamicWindow
]=b
<<7UL;
761 } else if((uint8_t)(b
-gapThreshold
)<(reservedStart
-gapThreshold
)) {
762 scsu
->toUDynamicOffsets
[dynamicWindow
]=(b
<<7UL)+gapOffset
;
763 } else if(b
>=fixedThreshold
) {
764 scsu
->toUDynamicOffsets
[dynamicWindow
]=fixedOffsets
[b
-fixedThreshold
];
766 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
776 /* fast path for Unicode mode */
777 if(state
==readCommand
) {
779 while(source
+1<sourceLimit
&& target
<targetLimit
&& (uint8_t)((b
=*source
)-UC0
)>(Urs
-UC0
)) {
780 *target
++=(UChar
)((b
<<8)|source
[1]);
785 /* normal state machine for Unicode mode */
786 /* unicodeByteMode: */
787 while(source
<sourceLimit
) {
788 if(target
>=targetLimit
) {
790 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
796 if((uint8_t)(b
-UC0
)>(Urs
-UC0
)) {
801 } else if(/* UC0<=b && */ b
<=UC7
) {
802 dynamicWindow
=(int8_t)(b
-UC0
);
803 isSingleByteMode
=TRUE
;
805 } else if(/* UD0<=b && */ b
<=UD7
) {
806 dynamicWindow
=(int8_t)(b
-UD0
);
807 isSingleByteMode
=TRUE
;
813 isSingleByteMode
=TRUE
;
823 /* callback(illegal) */
824 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
837 *target
++=(UChar
)((byteOne
<<8)|b
);
845 /* set the converter state back into UConverter */
846 if(U_FAILURE(*pErrorCode
) && *pErrorCode
!=U_BUFFER_OVERFLOW_ERROR
) {
847 /* reset to deal with the next character */
849 } else if(state
==readCommand
) {
850 /* not in a multi-byte sequence, reset toULength */
853 scsu
->toUIsSingleByteMode
=isSingleByteMode
;
854 scsu
->toUState
=state
;
855 scsu
->toUQuoteWindow
=quoteWindow
;
856 scsu
->toUDynamicWindow
=dynamicWindow
;
857 scsu
->toUByteOne
=byteOne
;
859 /* write back the updated pointers */
860 pArgs
->source
=(const char *)source
;
861 pArgs
->target
=target
;
865 /* SCSU-from-Unicode conversion functions ----------------------------------- */
868 * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve
869 * reasonable results. The lookahead is minimal.
870 * Many cases are simple:
871 * A character fits directly into the current mode, a dynamic or static window,
872 * or is not compressible. These cases are tested first.
873 * Real compression heuristics are applied to the rest, in code branches for
874 * single/Unicode mode and BMP/supplementary code points.
875 * The heuristics used here are extremely simple.
/*
 * Get the number of the window that this character is in, or -1.
 *
 * offsets: the start offsets of 8 windows (dynamic or static).
 * c:       the code point to look up.
 *
 * Returns the index (0..7) of the first window i with
 * offsets[i] <= c <= offsets[i]+0x7f, or -1 if no window contains c.
 */
static int8_t
getWindow(const uint32_t offsets[8], uint32_t c) {
    int i;

    for(i=0; i<8; ++i) {
        /*
         * Unsigned subtraction wraps around for c<offsets[i], so this single
         * comparison checks the lower and the upper window bound at once.
         */
        if((uint32_t)(c-offsets[i])<=0x7f) {
            return (int8_t)i;
        }
    }
    return -1;
}
890 /* is the character in the dynamic window starting at the offset, or in the direct-encoded range? */
892 isInOffsetWindowOrDirect(uint32_t offset
, uint32_t c
) {
893 return (UBool
)(c
<=offset
+0x7f &&
894 (c
>=offset
|| (c
<=0x7f &&
895 (c
>=0x20 || (1UL<<c
)&0x2601))));
896 /* binary 0010 0110 0000 0001,
897 check for b==0xd || b==0xa || b==9 || b==0 */
901 * getNextDynamicWindow returns the next dynamic window to be redefined
904 getNextDynamicWindow(SCSUData
*scsu
) {
905 int8_t window
=scsu
->windowUse
[scsu
->nextWindowUseIndex
];
906 if(++scsu
->nextWindowUseIndex
==8) {
907 scsu
->nextWindowUseIndex
=0;
913 * useDynamicWindow() adjusts
914 * windowUse[] and nextWindowUseIndex for the algorithm to choose
915 * the next dynamic window to be defined;
916 * a subclass may override it and provide its own algorithm.
919 useDynamicWindow(SCSUData
*scsu
, int8_t window
) {
921 * move the existing window, which just became the most recently used one,
922 * up in windowUse[] to nextWindowUseIndex-1
925 /* first, find the index of the window - backwards to favor the more recently used windows */
928 i
=scsu
->nextWindowUseIndex
;
933 } while(scsu
->windowUse
[i
]!=window
);
935 /* now copy each windowUse[i+1] to [i] */
940 while(j
!=scsu
->nextWindowUseIndex
) {
941 scsu
->windowUse
[i
]=scsu
->windowUse
[j
];
946 /* finally, set the window into the most recently used index */
947 scsu
->windowUse
[i
]=window
;
951 * calculate the offset and the code for a dynamic window that contains the character
952 * takes fixed offsets into account
953 * the offset of the window is stored in the offset variable,
954 * the code is returned
956 * return offset code: -1 none <=0xff code for SDn/UDn else code for SDX/UDX, subtract 0x200 to get the true code
959 getDynamicOffset(uint32_t c
, uint32_t *pOffset
) {
963 if((uint32_t)(c
-fixedOffsets
[i
])<=0x7f) {
964 *pOffset
=fixedOffsets
[i
];
970 /* No dynamic window for US-ASCII. */
972 } else if(c
<0x3400 ||
973 (uint32_t)(c
-0x10000)<(0x14000-0x10000) ||
974 (uint32_t)(c
-0x1d000)<=(0x1ffff-0x1d000)
976 /* This character is in a code range for a "small", i.e., reasonably windowable, script. */
977 *pOffset
=c
&0x7fffff80;
979 } else if(0xe000<=c
&& c
!=0xfeff && c
<0xfff0) {
980 /* For these characters we need to take the gapOffset into account. */
981 *pOffset
=c
&0x7fffff80;
982 return (int)((c
-gapOffset
)>>7);
989 * Idea for compression:
990 * - save SCSUData and other state before really starting work
991 * - at endloop, see if compression could be better with just unicode mode
992 * - don't do this if a callback has been called
993 * - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning
994 * - different buffer handling!
996 * Drawback or need for corrective handling:
997 * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and
998 * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible
999 * not only for compression but also for HTML/XML documents with following charset/encoding announcers.
1001 * How to achieve both?
1002 * - Only replace the result after an SDX or SCU?
1006 _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
1007 UErrorCode
*pErrorCode
) {
1010 const UChar
*source
, *sourceLimit
;
1012 int32_t targetCapacity
;
1015 UBool isSingleByteMode
;
1016 uint8_t dynamicWindow
;
1017 uint32_t currentOffset
;
1021 int32_t sourceIndex
, nextSourceIndex
;
1025 /* variables for compression heuristics */
1031 /* set up the local pointers */
1032 cnv
=pArgs
->converter
;
1033 scsu
=(SCSUData
*)cnv
->extraInfo
;
1035 /* set up the local pointers */
1036 source
=pArgs
->source
;
1037 sourceLimit
=pArgs
->sourceLimit
;
1038 target
=(uint8_t *)pArgs
->target
;
1039 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
1040 offsets
=pArgs
->offsets
;
1042 /* get the state machine state */
1043 isSingleByteMode
=scsu
->fromUIsSingleByteMode
;
1044 dynamicWindow
=scsu
->fromUDynamicWindow
;
1045 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1049 /* sourceIndex=-1 if the current character began in the previous buffer */
1050 sourceIndex
= c
==0 ? 0 : -1;
1053 /* similar conversion "loop" as in toUnicode */
1055 if(isSingleByteMode
) {
1056 if(c
!=0 && targetCapacity
>0) {
1057 goto getTrailSingle
;
1060 /* state machine for single-byte mode */
1061 /* singleByteMode: */
1062 while(source
<sourceLimit
) {
1063 if(targetCapacity
<=0) {
1064 /* target is full */
1065 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1071 if((c
-0x20)<=0x5f) {
1072 /* pass US-ASCII graphic character through */
1073 *target
++=(uint8_t)c
;
1075 *offsets
++=sourceIndex
;
1079 if((1UL<<c
)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1081 *target
++=(uint8_t)c
;
1083 *offsets
++=sourceIndex
;
1087 /* quote C0 control character */
1092 } else if((delta
=c
-currentOffset
)<=0x7f) {
1093 /* use the current dynamic window */
1094 *target
++=(uint8_t)(delta
|0x80);
1096 *offsets
++=sourceIndex
;
1099 } else if(UTF_IS_SURROGATE(c
)) {
1100 if(UTF_IS_SURROGATE_FIRST(c
)) {
1103 if(source
<sourceLimit
) {
1104 /* test the following code unit */
1106 if(UTF_IS_SECOND_SURROGATE(trail
)) {
1109 c
=UTF16_GET_PAIR_VALUE(c
, trail
);
1110 /* convert this surrogate code point */
1111 /* exit this condition tree */
1113 /* this is an unmatched lead code unit (1st surrogate) */
1114 /* callback(illegal) */
1115 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1123 /* this is an unmatched trail code unit (2nd surrogate) */
1124 /* callback(illegal) */
1125 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1129 /* compress supplementary character U+10000..U+10ffff */
1130 if((delta
=c
-currentOffset
)<=0x7f) {
1131 /* use the current dynamic window */
1132 *target
++=(uint8_t)(delta
|0x80);
1134 *offsets
++=sourceIndex
;
1137 } else if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1138 /* there is a dynamic window that contains this character, change to it */
1139 dynamicWindow
=window
;
1140 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1141 useDynamicWindow(scsu
, dynamicWindow
);
1142 c
=((uint32_t)(SC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1145 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1146 /* might check if there are more characters in this window to come */
1147 /* define an extended window with this character */
1149 dynamicWindow
=getNextDynamicWindow(scsu
);
1150 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1151 useDynamicWindow(scsu
, dynamicWindow
);
1152 c
=((uint32_t)SDX
<<24)|((uint32_t)dynamicWindow
<<21)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1156 /* change to Unicode mode and output this (lead, trail) pair */
1157 isSingleByteMode
=FALSE
;
1158 *target
++=(uint8_t)SCU
;
1160 *offsets
++=sourceIndex
;
1163 c
=((uint32_t)lead
<<16)|trail
;
1168 /* quote C1 control character */
1169 c
=(c
&0x7f)|(SQ0
+1)<<8; /* SQ0+1==SQ1 */
1172 } else if(c
==0xfeff || c
>=0xfff0) {
1173 /* quote signature character=byte order mark and specials */
1178 /* compress all other BMP characters */
1179 if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1180 /* there is a window defined that contains this character - switch to it or quote from it? */
1181 if(source
>=sourceLimit
|| isInOffsetWindowOrDirect(scsu
->fromUDynamicOffsets
[window
], *source
)) {
1182 /* change to dynamic window */
1183 dynamicWindow
=window
;
1184 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1185 useDynamicWindow(scsu
, dynamicWindow
);
1186 c
=((uint32_t)(SC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1190 /* quote from dynamic window */
1191 c
=((uint32_t)(SQ0
+window
)<<8)|(c
-scsu
->fromUDynamicOffsets
[window
])|0x80;
1195 } else if((window
=getWindow(staticOffsets
, c
))>=0) {
1196 /* quote from static window */
1197 c
=((uint32_t)(SQ0
+window
)<<8)|(c
-staticOffsets
[window
]);
1200 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1201 /* define a dynamic window with this character */
1202 dynamicWindow
=getNextDynamicWindow(scsu
);
1203 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1204 useDynamicWindow(scsu
, dynamicWindow
);
1205 c
=((uint32_t)(SD0
+dynamicWindow
)<<16)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1208 } else if((uint32_t)(c
-0x3400)<(0xd800-0x3400) &&
1209 (source
>=sourceLimit
|| (uint32_t)(*source
-0x3400)<(0xd800-0x3400))
1212 * this character is not compressible (a BMP ideograph or similar);
1213 * switch to Unicode mode if this is the last character in the block
1214 * or there is at least one more ideograph following immediately
1216 isSingleByteMode
=FALSE
;
1228 /* normal end of conversion: prepare for a new character */
1230 sourceIndex
=nextSourceIndex
;
1233 if(c
!=0 && targetCapacity
>0) {
1234 goto getTrailUnicode
;
1237 /* state machine for Unicode mode */
1238 /* unicodeByteMode: */
1239 while(source
<sourceLimit
) {
1240 if(targetCapacity
<=0) {
1241 /* target is full */
1242 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1248 if((uint32_t)(c
-0x3400)<(0xd800-0x3400)) {
1249 /* not compressible, write character directly */
1250 if(targetCapacity
>=2) {
1251 *target
++=(uint8_t)(c
>>8);
1252 *target
++=(uint8_t)c
;
1254 *offsets
++=sourceIndex
;
1255 *offsets
++=sourceIndex
;
1262 } else if((uint32_t)(c
-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
1263 /* compress BMP character if the following one is not an uncompressible ideograph */
1264 if(!(source
<sourceLimit
&& (uint32_t)(*source
-0x3400)<(0xd800-0x3400))) {
1265 if(((uint32_t)(c
-0x30)<10 || (uint32_t)(c
-0x61)<26 || (uint32_t)(c
-0x41)<26)) {
1266 /* ASCII digit or letter */
1267 isSingleByteMode
=TRUE
;
1268 c
|=((uint32_t)(UC0
+dynamicWindow
)<<8)|c
;
1271 } else if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1272 /* there is a dynamic window that contains this character, change to it */
1273 isSingleByteMode
=TRUE
;
1274 dynamicWindow
=window
;
1275 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1276 useDynamicWindow(scsu
, dynamicWindow
);
1277 c
=((uint32_t)(UC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1280 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1281 /* define a dynamic window with this character */
1282 isSingleByteMode
=TRUE
;
1283 dynamicWindow
=getNextDynamicWindow(scsu
);
1284 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1285 useDynamicWindow(scsu
, dynamicWindow
);
1286 c
=((uint32_t)(UD0
+dynamicWindow
)<<16)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1292 /* don't know how to compress this character, just write it directly */
1295 } else if(c
<0xe000) {
1296 /* c is a surrogate */
1297 if(UTF_IS_SURROGATE_FIRST(c
)) {
1300 if(source
<sourceLimit
) {
1301 /* test the following code unit */
1303 if(UTF_IS_SECOND_SURROGATE(trail
)) {
1306 c
=UTF16_GET_PAIR_VALUE(c
, trail
);
1307 /* convert this surrogate code point */
1308 /* exit this condition tree */
1310 /* this is an unmatched lead code unit (1st surrogate) */
1311 /* callback(illegal) */
1312 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1320 /* this is an unmatched trail code unit (2nd surrogate) */
1321 /* callback(illegal) */
1322 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1326 /* compress supplementary character */
1327 if( (window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0 &&
1328 !(source
<sourceLimit
&& (uint32_t)(*source
-0x3400)<(0xd800-0x3400))
1331 * there is a dynamic window that contains this character and
1332 * the following character is not uncompressible,
1333 * change to the window
1335 isSingleByteMode
=TRUE
;
1336 dynamicWindow
=window
;
1337 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1338 useDynamicWindow(scsu
, dynamicWindow
);
1339 c
=((uint32_t)(UC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1342 } else if(source
<sourceLimit
&& lead
==*source
&& /* too lazy to check trail in same window as source[1] */
1343 (code
=getDynamicOffset(c
, &offset
))>=0
1345 /* two supplementary characters in (probably) the same window - define an extended one */
1346 isSingleByteMode
=TRUE
;
1348 dynamicWindow
=getNextDynamicWindow(scsu
);
1349 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1350 useDynamicWindow(scsu
, dynamicWindow
);
1351 c
=((uint32_t)UDX
<<24)|((uint32_t)dynamicWindow
<<21)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1355 /* don't know how to compress this character, just write it directly */
1356 c
=((uint32_t)lead
<<16)|trail
;
1360 } else /* 0xe000<=c<0xf300 */ {
1361 /* quote to avoid SCSU tags */
1367 /* normal end of conversion: prepare for a new character */
1369 sourceIndex
=nextSourceIndex
;
1374 /* set the converter state back into UConverter */
1375 scsu
->fromUIsSingleByteMode
=isSingleByteMode
;
1376 scsu
->fromUDynamicWindow
=dynamicWindow
;
1380 /* write back the updated pointers */
1381 pArgs
->source
=source
;
1382 pArgs
->target
=(char *)target
;
1383 pArgs
->offsets
=offsets
;
1387 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1388 /* from the first if in the loop we know that targetCapacity>0 */
1389 if(length
<=targetCapacity
) {
1392 /* each branch falls through to the next one */
1394 *target
++=(uint8_t)(c
>>24);
1396 *target
++=(uint8_t)(c
>>16);
1398 *target
++=(uint8_t)(c
>>8);
1400 *target
++=(uint8_t)c
;
1402 /* will never occur */
1407 /* each branch falls through to the next one */
1409 *target
++=(uint8_t)(c
>>24);
1410 *offsets
++=sourceIndex
;
1412 *target
++=(uint8_t)(c
>>16);
1413 *offsets
++=sourceIndex
;
1415 *target
++=(uint8_t)(c
>>8);
1416 *offsets
++=sourceIndex
;
1418 *target
++=(uint8_t)c
;
1419 *offsets
++=sourceIndex
;
1421 /* will never occur */
1425 targetCapacity
-=length
;
1427 /* normal end of conversion: prepare for a new character */
1429 sourceIndex
=nextSourceIndex
;
1435 * We actually do this backwards here:
1436 * In order to save an intermediate variable, we output
1437 * first to the overflow buffer what does not fit into the
1440 /* we know that 0<=targetCapacity<length<=4 */
1441 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
1442 length
-=targetCapacity
;
1443 p
=(uint8_t *)cnv
->charErrorBuffer
;
1445 /* each branch falls through to the next one */
1447 *p
++=(uint8_t)(c
>>24);
1449 *p
++=(uint8_t)(c
>>16);
1451 *p
++=(uint8_t)(c
>>8);
1455 /* will never occur */
1458 cnv
->charErrorBufferLength
=(int8_t)length
;
1460 /* now output what fits into the regular target */
1461 c
>>=8*length
; /* length was reduced by targetCapacity */
1462 switch(targetCapacity
) {
1463 /* each branch falls through to the next one */
1465 *target
++=(uint8_t)(c
>>16);
1467 *offsets
++=sourceIndex
;
1470 *target
++=(uint8_t)(c
>>8);
1472 *offsets
++=sourceIndex
;
1475 *target
++=(uint8_t)c
;
1477 *offsets
++=sourceIndex
;
1483 /* target overflow */
1485 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1492 * Identical to _SCSUFromUnicodeWithOffsets but without offset handling.
1493 * If a change is made in the original function, then either
1494 * change this function the same way or
1495 * re-copy the original function and remove the variables
1496 * offsets, sourceIndex, and nextSourceIndex.
/*
 * Convert UTF-16 code units from pArgs->source (up to pArgs->sourceLimit)
 * into SCSU bytes stored at pArgs->target, without recording offsets.
 *
 * The encoder state (single-byte vs. Unicode mode and the active dynamic
 * window) is loaded from the SCSUData in cnv->extraInfo and stored back
 * before returning; the advanced source and target pointers are written
 * back into pArgs.
 *
 * Errors are reported through *pErrorCode:
 *   U_BUFFER_OVERFLOW_ERROR - the target is full; bytes that did not fit
 *                             are parked in cnv->charErrorBuffer
 *   U_ILLEGAL_CHAR_FOUND    - an unmatched lead or trail surrogate was seen
 */
1499 _SCSUFromUnicode(UConverterFromUnicodeArgs
*pArgs
,
1500 UErrorCode
*pErrorCode
) {
1503 const UChar
*source
, *sourceLimit
;
1505 int32_t targetCapacity
;
1507 UBool isSingleByteMode
;
1508 uint8_t dynamicWindow
;
1509 uint32_t currentOffset
;
1515 /* variables for compression heuristics */
1521 /* set up the local pointers */
1522 cnv
=pArgs
->converter
;
1523 scsu
=(SCSUData
*)cnv
->extraInfo
;
1525 /* set up the local pointers */
1526 source
=pArgs
->source
;
1527 sourceLimit
=pArgs
->sourceLimit
;
1528 target
=(uint8_t *)pArgs
->target
;
1529 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
1531 /* get the state machine state */
1532 isSingleByteMode
=scsu
->fromUIsSingleByteMode
;
1533 dynamicWindow
=scsu
->fromUDynamicWindow
;
1534 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1538 /* similar conversion "loop" as in toUnicode */
1540 if(isSingleByteMode
) {
/*
 * c!=0 presumably means a lead surrogate is still pending from an earlier
 * call (the assignment to c is not visible in this excerpt - TODO confirm);
 * in that case resume at the trail-surrogate handling.
 */
1541 if(c
!=0 && targetCapacity
>0) {
1542 goto getTrailSingle
;
1545 /* state machine for single-byte mode */
1546 /* singleByteMode: */
1547 while(source
<sourceLimit
) {
1548 if(targetCapacity
<=0) {
1549 /* target is full */
1550 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
/* one comparison for 0x20<=c<=0x7f; relies on c being unsigned (declaration not shown here) */
1555 if((c
-0x20)<=0x5f) {
1556 /* pass US-ASCII graphic character through */
1557 *target
++=(uint8_t)c
;
1560 if((1UL<<c
)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1562 *target
++=(uint8_t)c
;
1565 /* quote C0 control character */
/* c is within 0x80 code points above currentOffset: emit the window-relative byte with its high bit set */
1570 } else if((delta
=c
-currentOffset
)<=0x7f) {
1571 /* use the current dynamic window */
1572 *target
++=(uint8_t)(delta
|0x80);
1574 } else if(UTF_IS_SURROGATE(c
)) {
1575 if(UTF_IS_SURROGATE_FIRST(c
)) {
1578 if(source
<sourceLimit
) {
1579 /* test the following code unit */
1581 if(UTF_IS_SECOND_SURROGATE(trail
)) {
1583 c
=UTF16_GET_PAIR_VALUE(c
, trail
);
1584 /* convert this surrogate code point */
1585 /* exit this condition tree */
1587 /* this is an unmatched lead code unit (1st surrogate) */
1588 /* callback(illegal) */
1589 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1597 /* this is an unmatched trail code unit (2nd surrogate) */
1598 /* callback(illegal) */
1599 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1603 /* compress supplementary character U+10000..U+10ffff */
1604 if((delta
=c
-currentOffset
)<=0x7f) {
1605 /* use the current dynamic window */
1606 *target
++=(uint8_t)(delta
|0x80);
1608 } else if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1609 /* there is a dynamic window that contains this character, change to it */
1610 dynamicWindow
=window
;
1611 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1612 useDynamicWindow(scsu
, dynamicWindow
);
/* pack two output bytes into c: the SCn select-window tag, then the window-relative byte */
1613 c
=((uint32_t)(SC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1616 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1617 /* might check if there are more characters in this window to come */
1618 /* define an extended window with this character */
1620 dynamicWindow
=getNextDynamicWindow(scsu
);
1621 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1622 useDynamicWindow(scsu
, dynamicWindow
);
/*
 * pack four output bytes into c: SDX tag in the top byte, the window number
 * at bit 21, the code from getDynamicOffset() at bit 8, and the
 * window-relative byte (high bit set) in the low byte
 */
1623 c
=((uint32_t)SDX
<<24)|((uint32_t)dynamicWindow
<<21)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1627 /* change to Unicode mode and output this (lead, trail) pair */
1628 isSingleByteMode
=FALSE
;
1629 *target
++=(uint8_t)SCU
;
1631 c
=((uint32_t)lead
<<16)|trail
;
1636 /* quote C1 control character */
/* << binds tighter than |, so this is (c&0x7f) | ((SQ0+1)<<8) */
1637 c
=(c
&0x7f)|(SQ0
+1)<<8; /* SQ0+1==SQ1 */
1640 } else if(c
==0xfeff || c
>=0xfff0) {
1641 /* quote signature character=byte order mark and specials */
1646 /* compress all other BMP characters */
1647 if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1648 /* there is a window defined that contains this character - switch to it or quote from it? */
1649 if(source
>=sourceLimit
|| isInOffsetWindowOrDirect(scsu
->fromUDynamicOffsets
[window
], *source
)) {
1650 /* change to dynamic window */
1651 dynamicWindow
=window
;
1652 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1653 useDynamicWindow(scsu
, dynamicWindow
);
1654 c
=((uint32_t)(SC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1658 /* quote from dynamic window */
1659 c
=((uint32_t)(SQ0
+window
)<<8)|(c
-scsu
->fromUDynamicOffsets
[window
])|0x80;
1663 } else if((window
=getWindow(staticOffsets
, c
))>=0) {
1664 /* quote from static window */
/* unlike the dynamic-window quote, the relative byte is stored without 0x80 */
1665 c
=((uint32_t)(SQ0
+window
)<<8)|(c
-staticOffsets
[window
]);
1668 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1669 /* define a dynamic window with this character */
1670 dynamicWindow
=getNextDynamicWindow(scsu
);
1671 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1672 useDynamicWindow(scsu
, dynamicWindow
);
/* pack three output bytes into c: SDn tag, the code from getDynamicOffset(), the window-relative byte */
1673 c
=((uint32_t)(SD0
+dynamicWindow
)<<16)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
/* each unsigned comparison below tests 0x3400<=x<0xd800 in one step */
1676 } else if((uint32_t)(c
-0x3400)<(0xd800-0x3400) &&
1677 (source
>=sourceLimit
|| (uint32_t)(*source
-0x3400)<(0xd800-0x3400))
1680 * this character is not compressible (a BMP ideograph or similar);
1681 * switch to Unicode mode if this is the last character in the block
1682 * or there is at least one more ideograph following immediately
1684 isSingleByteMode
=FALSE
;
1696 /* normal end of conversion: prepare for a new character */
/* same resume check as at the start of single-byte mode, here for Unicode mode */
1700 if(c
!=0 && targetCapacity
>0) {
1701 goto getTrailUnicode
;
1704 /* state machine for Unicode mode */
1705 /* unicodeByteMode: */
1706 while(source
<sourceLimit
) {
1707 if(targetCapacity
<=0) {
1708 /* target is full */
1709 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
/* single unsigned comparison for 0x3400<=c<0xd800 */
1714 if((uint32_t)(c
-0x3400)<(0xd800-0x3400)) {
1715 /* not compressible, write character directly */
1716 if(targetCapacity
>=2) {
1717 *target
++=(uint8_t)(c
>>8);
1718 *target
++=(uint8_t)c
;
1724 } else if((uint32_t)(c
-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
1725 /* compress BMP character if the following one is not an uncompressible ideograph */
1726 if(!(source
<sourceLimit
&& (uint32_t)(*source
-0x3400)<(0xd800-0x3400))) {
1727 if(((uint32_t)(c
-0x30)<10 || (uint32_t)(c
-0x61)<26 || (uint32_t)(c
-0x41)<26)) {
1728 /* ASCII digit or letter */
1729 isSingleByteMode
=TRUE
;
/*
 * NOTE(review): "c|=...|c" ORs c into itself; the result equals
 * ((UC0+dynamicWindow)<<8)|c, so a plain assignment would be equivalent.
 */
1730 c
|=((uint32_t)(UC0
+dynamicWindow
)<<8)|c
;
1733 } else if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1734 /* there is a dynamic window that contains this character, change to it */
1735 isSingleByteMode
=TRUE
;
1736 dynamicWindow
=window
;
1737 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1738 useDynamicWindow(scsu
, dynamicWindow
);
1739 c
=((uint32_t)(UC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1742 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1743 /* define a dynamic window with this character */
1744 isSingleByteMode
=TRUE
;
1745 dynamicWindow
=getNextDynamicWindow(scsu
);
1746 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1747 useDynamicWindow(scsu
, dynamicWindow
);
1748 c
=((uint32_t)(UD0
+dynamicWindow
)<<16)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1754 /* don't know how to compress this character, just write it directly */
1757 } else if(c
<0xe000) {
1758 /* c is a surrogate */
1759 if(UTF_IS_SURROGATE_FIRST(c
)) {
1762 if(source
<sourceLimit
) {
1763 /* test the following code unit */
1765 if(UTF_IS_SECOND_SURROGATE(trail
)) {
1767 c
=UTF16_GET_PAIR_VALUE(c
, trail
);
1768 /* convert this surrogate code point */
1769 /* exit this condition tree */
1771 /* this is an unmatched lead code unit (1st surrogate) */
1772 /* callback(illegal) */
1773 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1781 /* this is an unmatched trail code unit (2nd surrogate) */
1782 /* callback(illegal) */
1783 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1787 /* compress supplementary character */
1788 if( (window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0 &&
1789 !(source
<sourceLimit
&& (uint32_t)(*source
-0x3400)<(0xd800-0x3400))
1792 * there is a dynamic window that contains this character and
1793 * the following character is not uncompressible,
1794 * change to the window
1796 isSingleByteMode
=TRUE
;
1797 dynamicWindow
=window
;
1798 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1799 useDynamicWindow(scsu
, dynamicWindow
);
1800 c
=((uint32_t)(UC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1803 } else if(source
<sourceLimit
&& lead
==*source
&& /* too lazy to check trail in same window as source[1] */
1804 (code
=getDynamicOffset(c
, &offset
))>=0
1806 /* two supplementary characters in (probably) the same window - define an extended one */
1807 isSingleByteMode
=TRUE
;
1809 dynamicWindow
=getNextDynamicWindow(scsu
);
1810 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1811 useDynamicWindow(scsu
, dynamicWindow
);
/* same four-byte layout as the SDX case in single-byte mode, with the UDX tag */
1812 c
=((uint32_t)UDX
<<24)|((uint32_t)dynamicWindow
<<21)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1816 /* don't know how to compress this character, just write it directly */
1817 c
=((uint32_t)lead
<<16)|trail
;
1821 } else /* 0xe000<=c<0xf300 */ {
1822 /* quote to avoid SCSU tags */
1828 /* normal end of conversion: prepare for a new character */
1834 /* set the converter state back into UConverter */
1835 scsu
->fromUIsSingleByteMode
=isSingleByteMode
;
1836 scsu
->fromUDynamicWindow
=dynamicWindow
;
1840 /* write back the updated pointers */
1841 pArgs
->source
=source
;
1842 pArgs
->target
=(char *)target
;
1846 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1847 /* from the first if in the loop we know that targetCapacity>0 */
1848 if(length
<=targetCapacity
) {
1850 /* each branch falls through to the next one */
1852 *target
++=(uint8_t)(c
>>24);
1854 *target
++=(uint8_t)(c
>>16);
1856 *target
++=(uint8_t)(c
>>8);
1858 *target
++=(uint8_t)c
;
1860 /* will never occur */
1863 targetCapacity
-=length
;
1865 /* normal end of conversion: prepare for a new character */
1872 * We actually do this backwards here:
1873 * In order to save an intermediate variable, we output
1874 * first to the overflow buffer what does not fit into the
1877 /* we know that 0<=targetCapacity<length<=4 */
1878 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
/* from here on, length counts only the bytes that do not fit into the target */
1879 length
-=targetCapacity
;
1880 p
=(uint8_t *)cnv
->charErrorBuffer
;
1882 /* each branch falls through to the next one */
1884 *p
++=(uint8_t)(c
>>24);
1886 *p
++=(uint8_t)(c
>>16);
1888 *p
++=(uint8_t)(c
>>8);
1892 /* will never occur */
1895 cnv
->charErrorBufferLength
=(int8_t)length
;
1897 /* now output what fits into the regular target */
/* shift out the bytes counted by the reduced length; what remains in c is what fits into the target */
1898 c
>>=8*length
; /* length was reduced by targetCapacity */
1899 switch(targetCapacity
) {
1900 /* each branch falls through to the next one */
1902 *target
++=(uint8_t)(c
>>16);
1904 *target
++=(uint8_t)(c
>>8);
1906 *target
++=(uint8_t)c
;
1911 /* target overflow */
1913 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1919 /* miscellaneous ------------------------------------------------------------ */
/*
 * Return a name string for this converter, selected by the locale stored in
 * its SCSUData (cnv->extraInfo). The case visible here yields
 * "SCSU,locale=ja".
 */
1922 _SCSUGetName(const UConverter
*cnv
) {
1923 SCSUData
*scsu
=(SCSUData
*)cnv
->extraInfo
;
1925 switch(scsu
->locale
) {
1927 return "SCSU,locale=ja";
1933 /* structure for SafeClone calculations */
/* _SCSUSafeClone() overlays this struct on the caller's buffer and uses its cnv and mydata members */
1934 struct cloneSCSUStruct
/*
 * Set up the SCSU-specific part of a converter clone inside caller-provided
 * storage.
 *
 * cnv          converter being cloned; its extraInfo holds the SCSUData to copy
 * stackBuffer  storage that is treated as a struct cloneSCSUStruct
 * pBufferSize  if *pBufferSize is 0 this is a preflighting call: the required
 *              size, sizeof(struct cloneSCSUStruct), is stored into it
 * status       checked with U_FAILURE() before anything else is done
 *
 * On the normal path the function returns a pointer to the cnv member of the
 * clone struct. The values returned on the failure and preflighting paths are
 * not visible in this excerpt.
 */
1941 _SCSUSafeClone(const UConverter
*cnv
,
1943 int32_t *pBufferSize
,
1946 struct cloneSCSUStruct
* localClone
;
1947 int32_t bufferSizeNeeded
= sizeof(struct cloneSCSUStruct
);
1949 if (U_FAILURE(*status
)){
1953 if (*pBufferSize
== 0){ /* 'preflighting' request - set needed size into *pBufferSize */
1954 *pBufferSize
= bufferSizeNeeded
;
1958 localClone
= (struct cloneSCSUStruct
*)stackBuffer
;
1959 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
/* copy the SCSU state into the clone and point the clone's extraInfo at that private copy */
1961 uprv_memcpy(&localClone
->mydata
, cnv
->extraInfo
, sizeof(SCSUData
));
1962 localClone
->cnv
.extraInfo
= &localClone
->mydata
;
/* presumably marks extraInfo as living inside the clone's own storage - TODO confirm against ucnv.c */
1963 localClone
->cnv
.isExtraLocal
= TRUE
;
1965 return &localClone
->cnv
;
/*
 * Implementation function table for the SCSU converter. The entries visible
 * here are the offset-tracking toUnicode and fromUnicode functions and
 * ucnv_getCompleteUnicodeSet; the remaining entries are not shown in this
 * excerpt.
 */
1969 static const UConverterImpl _SCSUImpl
={
1980 _SCSUToUnicodeWithOffsets
,
1982 _SCSUFromUnicodeWithOffsets
,
1989 ucnv_getCompleteUnicodeSet
/* Static descriptive data for the SCSU converter; referenced from _SCSUData. */
1992 static const UConverterStaticData _SCSUStaticData
={
1993 sizeof(UConverterStaticData
),
1995 1212, /* CCSID for SCSU */
1996 UCNV_IBM
, UCNV_SCSU
,
1997 1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */
1999 * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode
2000 * substitution string.
/* 0x0e is the SQU tag (quote a single Unicode character), followed by ff fd; the 3 is presumably the byte count - TODO confirm */
2002 { 0x0e, 0xff, 0xfd, 0 }, 3,
2006 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
2009 const UConverterSharedData _SCSUData
={
2010 sizeof(UConverterSharedData
), ~((uint32_t)0),
2011 NULL
, NULL
, &_SCSUStaticData
, FALSE
, &_SCSUImpl
,