2 ******************************************************************************
4 * Copyright (C) 2000-2009, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
8 * file name: ucnvscsu.c
10 * tab size: 8 (not used)
13 * created on: 2000nov18
14 * created by: Markus W. Scherer
16 * This is an implementation of the Standard Compression Scheme for Unicode
17 * as defined in http://www.unicode.org/unicode/reports/tr6/ .
18 * Reserved commands and window settings are treated as illegal sequences and
19 * will result in callback calls.
22 #include "unicode/utypes.h"
24 #if !UCONFIG_NO_CONVERSION
26 #include "unicode/ucnv.h"
27 #include "unicode/ucnv_cb.h"
32 /* SCSU definitions --------------------------------------------------------- */
34 /* SCSU command byte values */
36 SQ0
=0x01, /* Quote from window pair 0 */
37 SQ7
=0x08, /* Quote from window pair 7 */
38 SDX
=0x0B, /* Define a window as extended */
39 Srs
=0x0C, /* reserved */
40 SQU
=0x0E, /* Quote a single Unicode character */
41 SCU
=0x0F, /* Change to Unicode mode */
42 SC0
=0x10, /* Select window 0 */
43 SC7
=0x17, /* Select window 7 */
44 SD0
=0x18, /* Define and select window 0 */
45 SD7
=0x1F, /* Define and select window 7 */
47 UC0
=0xE0, /* Select window 0 */
48 UC7
=0xE7, /* Select window 7 */
49 UD0
=0xE8, /* Define and select window 0 */
50 UD7
=0xEF, /* Define and select window 7 */
51 UQU
=0xF0, /* Quote a single Unicode character */
52 UDX
=0xF1, /* Define a Window as extended */
53 Urs
=0xF2 /* reserved */
58 * Unicode code points from 3400 to E000 are not addressable by
59 * dynamic window, since in these areas no short run alphabets are
60 * found. Therefore add gapOffset to all values from gapThreshold.
65 /* values between reservedStart and fixedThreshold are reserved */
68 /* use table of predefined fixed offsets for values from fixedThreshold */
72 /* constant offsets for the 8 static windows */
73 static const uint32_t staticOffsets
[8]={
74 0x0000, /* ASCII for quoted tags */
75 0x0080, /* Latin - 1 Supplement (for access to punctuation) */
76 0x0100, /* Latin Extended-A */
77 0x0300, /* Combining Diacritical Marks */
78 0x2000, /* General Punctuation */
79 0x2080, /* Currency Symbols */
80 0x2100, /* Letterlike Symbols and Number Forms */
81 0x3000 /* CJK Symbols and punctuation */
84 /* initial offsets for the 8 dynamic (sliding) windows */
85 static const uint32_t initialDynamicOffsets
[8]={
87 0x00C0, /* Latin Extended A */
88 0x0400, /* Cyrillic */
90 0x0900, /* Devanagari */
91 0x3040, /* Hiragana */
92 0x30A0, /* Katakana */
93 0xFF00 /* Fullwidth ASCII */
96 /* Table of fixed predefined Offsets */
97 static const uint32_t fixedOffsets
[]={
98 /* 0xF9 */ 0x00C0, /* Latin-1 Letters + half of Latin Extended A */
99 /* 0xFA */ 0x0250, /* IPA extensions */
100 /* 0xFB */ 0x0370, /* Greek */
101 /* 0xFC */ 0x0530, /* Armenian */
102 /* 0xFD */ 0x3040, /* Hiragana */
103 /* 0xFE */ 0x30A0, /* Katakana */
104 /* 0xFF */ 0xFF60 /* Halfwidth Katakana */
118 typedef struct SCSUData
{
119 /* dynamic window offsets, initialize to default values from initialDynamicOffsets */
120 uint32_t toUDynamicOffsets
[8];
121 uint32_t fromUDynamicOffsets
[8];
123 /* state machine state - toUnicode */
124 UBool toUIsSingleByteMode
;
126 int8_t toUQuoteWindow
, toUDynamicWindow
;
128 uint8_t toUPadding
[3];
130 /* state machine state - fromUnicode */
131 UBool fromUIsSingleByteMode
;
132 int8_t fromUDynamicWindow
;
135 * windowUse[] keeps track of the use of the dynamic windows:
136 * At nextWindowUseIndex there is the least recently used window,
137 * and the following windows (in a wrapping manner) are more and more
138 * recently used.
139 * At nextWindowUseIndex-1 there is the most recently used window.
142 int8_t nextWindowUseIndex
;
146 static const int8_t initialWindowUse
[8]={ 7, 0, 3, 2, 4, 5, 6, 1 };
147 static const int8_t initialWindowUse_ja
[8]={ 3, 2, 4, 1, 0, 7, 5, 6 };
153 /* SCSU setup functions ----------------------------------------------------- */
156 _SCSUReset(UConverter
*cnv
, UConverterResetChoice choice
) {
157 SCSUData
*scsu
=(SCSUData
*)cnv
->extraInfo
;
159 if(choice
<=UCNV_RESET_TO_UNICODE
) {
160 /* reset toUnicode */
161 uprv_memcpy(scsu
->toUDynamicOffsets
, initialDynamicOffsets
, 32);
163 scsu
->toUIsSingleByteMode
=TRUE
;
164 scsu
->toUState
=readCommand
;
165 scsu
->toUQuoteWindow
=scsu
->toUDynamicWindow
=0;
170 if(choice
!=UCNV_RESET_TO_UNICODE
) {
171 /* reset fromUnicode */
172 uprv_memcpy(scsu
->fromUDynamicOffsets
, initialDynamicOffsets
, 32);
174 scsu
->fromUIsSingleByteMode
=TRUE
;
175 scsu
->fromUDynamicWindow
=0;
177 scsu
->nextWindowUseIndex
=0;
178 switch(scsu
->locale
) {
180 uprv_memcpy(scsu
->windowUse
, initialWindowUse_ja
, 8);
183 uprv_memcpy(scsu
->windowUse
, initialWindowUse
, 8);
192 _SCSUOpen(UConverter
*cnv
,
193 UConverterLoadArgs
*pArgs
,
194 UErrorCode
*pErrorCode
) {
195 const char *locale
=pArgs
->locale
;
196 if(pArgs
->onlyTestIsLoadable
) {
199 cnv
->extraInfo
=uprv_malloc(sizeof(SCSUData
));
200 if(cnv
->extraInfo
!=NULL
) {
201 if(locale
!=NULL
&& locale
[0]=='j' && locale
[1]=='a' && (locale
[2]==0 || locale
[2]=='_')) {
202 ((SCSUData
*)cnv
->extraInfo
)->locale
=l_ja
;
204 ((SCSUData
*)cnv
->extraInfo
)->locale
=lGeneric
;
206 _SCSUReset(cnv
, UCNV_RESET_BOTH
);
208 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
211 /* Set the substitution character U+fffd as a Unicode string. */
212 cnv
->subUChars
[0]=0xfffd;
217 _SCSUClose(UConverter
*cnv
) {
218 if(cnv
->extraInfo
!=NULL
) {
219 if(!cnv
->isExtraLocal
) {
220 uprv_free(cnv
->extraInfo
);
226 /* SCSU-to-Unicode conversion functions ------------------------------------- */
229 _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
230 UErrorCode
*pErrorCode
) {
233 const uint8_t *source
, *sourceLimit
;
235 const UChar
*targetLimit
;
237 UBool isSingleByteMode
;
238 uint8_t state
, byteOne
;
239 int8_t quoteWindow
, dynamicWindow
;
241 int32_t sourceIndex
, nextSourceIndex
;
245 /* set up the local pointers */
246 cnv
=pArgs
->converter
;
247 scsu
=(SCSUData
*)cnv
->extraInfo
;
249 source
=(const uint8_t *)pArgs
->source
;
250 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
251 target
=pArgs
->target
;
252 targetLimit
=pArgs
->targetLimit
;
253 offsets
=pArgs
->offsets
;
255 /* get the state machine state */
256 isSingleByteMode
=scsu
->toUIsSingleByteMode
;
257 state
=scsu
->toUState
;
258 quoteWindow
=scsu
->toUQuoteWindow
;
259 dynamicWindow
=scsu
->toUDynamicWindow
;
260 byteOne
=scsu
->toUByteOne
;
262 /* sourceIndex=-1 if the current character began in the previous buffer */
263 sourceIndex
=state
==readCommand
? 0 : -1;
269 * For performance, this is not a normal C loop.
270 * Instead, there are two code blocks for the two SCSU modes.
271 * The function branches to either one, and a change of the mode is done with a goto to
272 * the other branch.
274 * Each branch has two conventional loops:
275 * - a fast-path loop for the most common codes in the mode
276 * - a loop for all other codes in the mode
277 * When the fast-path runs into a code that it cannot handle, its loop ends and it
278 * runs into the following loop to handle the other codes.
279 * The end of the input or output buffer is also handled by the slower loop.
280 * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
282 * The callback handling is done by returning with an error code.
283 * The conversion framework actually calls the callback function.
285 if(isSingleByteMode
) {
286 /* fast path for single-byte mode */
287 if(state
==readCommand
) {
289 while(source
<sourceLimit
&& target
<targetLimit
&& (b
=*source
)>=0x20) {
293 /* write US-ASCII graphic character or DEL */
296 *offsets
++=sourceIndex
;
299 /* write from dynamic window */
300 uint32_t c
=scsu
->toUDynamicOffsets
[dynamicWindow
]+(b
&0x7f);
304 *offsets
++=sourceIndex
;
307 /* output surrogate pair */
308 *target
++=(UChar
)(0xd7c0+(c
>>10));
309 if(target
<targetLimit
) {
310 *target
++=(UChar
)(0xdc00|(c
&0x3ff));
312 *offsets
++=sourceIndex
;
313 *offsets
++=sourceIndex
;
316 /* target overflow */
318 *offsets
++=sourceIndex
;
320 cnv
->UCharErrorBuffer
[0]=(UChar
)(0xdc00|(c
&0x3ff));
321 cnv
->UCharErrorBufferLength
=1;
322 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
327 sourceIndex
=nextSourceIndex
;
331 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
333 while(source
<sourceLimit
) {
334 if(target
>=targetLimit
) {
336 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
343 /* redundant conditions are commented out */
344 /* here: b<0x20 because otherwise we would be in fastSingle */
345 if((1UL<<b
)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
349 *offsets
++=sourceIndex
;
351 sourceIndex
=nextSourceIndex
;
355 dynamicWindow
=(int8_t)(b
-SC0
);
356 sourceIndex
=nextSourceIndex
;
358 } else /* if(SD0<=b && b<=SD7) */ {
359 dynamicWindow
=(int8_t)(b
-SD0
);
362 } else if(/* SQ0<=b && */ b
<=SQ7
) {
363 quoteWindow
=(int8_t)(b
-SQ0
);
370 sourceIndex
=nextSourceIndex
;
371 isSingleByteMode
=FALSE
;
374 /* callback(illegal) */
375 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
381 /* store the first byte of a multibyte sequence in toUBytes[] */
392 *target
++=(UChar
)((byteOne
<<8)|b
);
394 *offsets
++=sourceIndex
;
396 sourceIndex
=nextSourceIndex
;
401 /* all static offsets are in the BMP */
402 *target
++=(UChar
)(staticOffsets
[quoteWindow
]+b
);
404 *offsets
++=sourceIndex
;
407 /* write from dynamic window */
408 uint32_t c
=scsu
->toUDynamicOffsets
[quoteWindow
]+(b
&0x7f);
412 *offsets
++=sourceIndex
;
415 /* output surrogate pair */
416 *target
++=(UChar
)(0xd7c0+(c
>>10));
417 if(target
<targetLimit
) {
418 *target
++=(UChar
)(0xdc00|(c
&0x3ff));
420 *offsets
++=sourceIndex
;
421 *offsets
++=sourceIndex
;
424 /* target overflow */
426 *offsets
++=sourceIndex
;
428 cnv
->UCharErrorBuffer
[0]=(UChar
)(0xdc00|(c
&0x3ff));
429 cnv
->UCharErrorBufferLength
=1;
430 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
435 sourceIndex
=nextSourceIndex
;
439 dynamicWindow
=(int8_t)((b
>>5)&7);
440 byteOne
=(uint8_t)(b
&0x1f);
446 scsu
->toUDynamicOffsets
[dynamicWindow
]=0x10000+(byteOne
<<15UL | b
<<7UL);
447 sourceIndex
=nextSourceIndex
;
452 /* callback(illegal): Reserved window offset value 0 */
456 } else if(b
<gapThreshold
) {
457 scsu
->toUDynamicOffsets
[dynamicWindow
]=b
<<7UL;
458 } else if((uint8_t)(b
-gapThreshold
)<(reservedStart
-gapThreshold
)) {
459 scsu
->toUDynamicOffsets
[dynamicWindow
]=(b
<<7UL)+gapOffset
;
460 } else if(b
>=fixedThreshold
) {
461 scsu
->toUDynamicOffsets
[dynamicWindow
]=fixedOffsets
[b
-fixedThreshold
];
463 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
468 sourceIndex
=nextSourceIndex
;
474 /* fast path for Unicode mode */
475 if(state
==readCommand
) {
477 while(source
+1<sourceLimit
&& target
<targetLimit
&& (uint8_t)((b
=*source
)-UC0
)>(Urs
-UC0
)) {
478 *target
++=(UChar
)((b
<<8)|source
[1]);
480 *offsets
++=sourceIndex
;
482 sourceIndex
=nextSourceIndex
;
488 /* normal state machine for Unicode mode */
489 /* unicodeByteMode: */
490 while(source
<sourceLimit
) {
491 if(target
>=targetLimit
) {
493 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
500 if((uint8_t)(b
-UC0
)>(Urs
-UC0
)) {
505 } else if(/* UC0<=b && */ b
<=UC7
) {
506 dynamicWindow
=(int8_t)(b
-UC0
);
507 sourceIndex
=nextSourceIndex
;
508 isSingleByteMode
=TRUE
;
510 } else if(/* UD0<=b && */ b
<=UD7
) {
511 dynamicWindow
=(int8_t)(b
-UD0
);
512 isSingleByteMode
=TRUE
;
518 isSingleByteMode
=TRUE
;
528 /* callback(illegal) */
529 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
542 *target
++=(UChar
)((byteOne
<<8)|b
);
544 *offsets
++=sourceIndex
;
546 sourceIndex
=nextSourceIndex
;
554 /* set the converter state back into UConverter */
555 if(U_FAILURE(*pErrorCode
) && *pErrorCode
!=U_BUFFER_OVERFLOW_ERROR
) {
556 /* reset to deal with the next character */
558 } else if(state
==readCommand
) {
559 /* not in a multi-byte sequence, reset toULength */
562 scsu
->toUIsSingleByteMode
=isSingleByteMode
;
563 scsu
->toUState
=state
;
564 scsu
->toUQuoteWindow
=quoteWindow
;
565 scsu
->toUDynamicWindow
=dynamicWindow
;
566 scsu
->toUByteOne
=byteOne
;
568 /* write back the updated pointers */
569 pArgs
->source
=(const char *)source
;
570 pArgs
->target
=target
;
571 pArgs
->offsets
=offsets
;
576 * Identical to _SCSUToUnicodeWithOffsets but without offset handling.
577 * If a change is made in the original function, then either
578 * change this function the same way or
579 * re-copy the original function and remove the variables
580 * offsets, sourceIndex, and nextSourceIndex.
583 _SCSUToUnicode(UConverterToUnicodeArgs
*pArgs
,
584 UErrorCode
*pErrorCode
) {
587 const uint8_t *source
, *sourceLimit
;
589 const UChar
*targetLimit
;
590 UBool isSingleByteMode
;
591 uint8_t state
, byteOne
;
592 int8_t quoteWindow
, dynamicWindow
;
596 /* set up the local pointers */
597 cnv
=pArgs
->converter
;
598 scsu
=(SCSUData
*)cnv
->extraInfo
;
600 source
=(const uint8_t *)pArgs
->source
;
601 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
602 target
=pArgs
->target
;
603 targetLimit
=pArgs
->targetLimit
;
605 /* get the state machine state */
606 isSingleByteMode
=scsu
->toUIsSingleByteMode
;
607 state
=scsu
->toUState
;
608 quoteWindow
=scsu
->toUQuoteWindow
;
609 dynamicWindow
=scsu
->toUDynamicWindow
;
610 byteOne
=scsu
->toUByteOne
;
615 * For performance, this is not a normal C loop.
616 * Instead, there are two code blocks for the two SCSU modes.
617 * The function branches to either one, and a change of the mode is done with a goto to
618 * the other branch.
620 * Each branch has two conventional loops:
621 * - a fast-path loop for the most common codes in the mode
622 * - a loop for all other codes in the mode
623 * When the fast-path runs into a code that it cannot handle, its loop ends and it
624 * runs into the following loop to handle the other codes.
625 * The end of the input or output buffer is also handled by the slower loop.
626 * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
628 * The callback handling is done by returning with an error code.
629 * The conversion framework actually calls the callback function.
631 if(isSingleByteMode
) {
632 /* fast path for single-byte mode */
633 if(state
==readCommand
) {
635 while(source
<sourceLimit
&& target
<targetLimit
&& (b
=*source
)>=0x20) {
638 /* write US-ASCII graphic character or DEL */
641 /* write from dynamic window */
642 uint32_t c
=scsu
->toUDynamicOffsets
[dynamicWindow
]+(b
&0x7f);
646 /* output surrogate pair */
647 *target
++=(UChar
)(0xd7c0+(c
>>10));
648 if(target
<targetLimit
) {
649 *target
++=(UChar
)(0xdc00|(c
&0x3ff));
651 /* target overflow */
652 cnv
->UCharErrorBuffer
[0]=(UChar
)(0xdc00|(c
&0x3ff));
653 cnv
->UCharErrorBufferLength
=1;
654 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
662 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
664 while(source
<sourceLimit
) {
665 if(target
>=targetLimit
) {
667 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
673 /* redundant conditions are commented out */
674 /* here: b<0x20 because otherwise we would be in fastSingle */
675 if((1UL<<b
)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
681 dynamicWindow
=(int8_t)(b
-SC0
);
683 } else /* if(SD0<=b && b<=SD7) */ {
684 dynamicWindow
=(int8_t)(b
-SD0
);
687 } else if(/* SQ0<=b && */ b
<=SQ7
) {
688 quoteWindow
=(int8_t)(b
-SQ0
);
695 isSingleByteMode
=FALSE
;
698 /* callback(illegal) */
699 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
705 /* store the first byte of a multibyte sequence in toUBytes[] */
716 *target
++=(UChar
)((byteOne
<<8)|b
);
721 /* all static offsets are in the BMP */
722 *target
++=(UChar
)(staticOffsets
[quoteWindow
]+b
);
724 /* write from dynamic window */
725 uint32_t c
=scsu
->toUDynamicOffsets
[quoteWindow
]+(b
&0x7f);
729 /* output surrogate pair */
730 *target
++=(UChar
)(0xd7c0+(c
>>10));
731 if(target
<targetLimit
) {
732 *target
++=(UChar
)(0xdc00|(c
&0x3ff));
734 /* target overflow */
735 cnv
->UCharErrorBuffer
[0]=(UChar
)(0xdc00|(c
&0x3ff));
736 cnv
->UCharErrorBufferLength
=1;
737 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
745 dynamicWindow
=(int8_t)((b
>>5)&7);
746 byteOne
=(uint8_t)(b
&0x1f);
752 scsu
->toUDynamicOffsets
[dynamicWindow
]=0x10000+(byteOne
<<15UL | b
<<7UL);
757 /* callback(illegal): Reserved window offset value 0 */
761 } else if(b
<gapThreshold
) {
762 scsu
->toUDynamicOffsets
[dynamicWindow
]=b
<<7UL;
763 } else if((uint8_t)(b
-gapThreshold
)<(reservedStart
-gapThreshold
)) {
764 scsu
->toUDynamicOffsets
[dynamicWindow
]=(b
<<7UL)+gapOffset
;
765 } else if(b
>=fixedThreshold
) {
766 scsu
->toUDynamicOffsets
[dynamicWindow
]=fixedOffsets
[b
-fixedThreshold
];
768 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
778 /* fast path for Unicode mode */
779 if(state
==readCommand
) {
781 while(source
+1<sourceLimit
&& target
<targetLimit
&& (uint8_t)((b
=*source
)-UC0
)>(Urs
-UC0
)) {
782 *target
++=(UChar
)((b
<<8)|source
[1]);
787 /* normal state machine for Unicode mode */
788 /* unicodeByteMode: */
789 while(source
<sourceLimit
) {
790 if(target
>=targetLimit
) {
792 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
798 if((uint8_t)(b
-UC0
)>(Urs
-UC0
)) {
803 } else if(/* UC0<=b && */ b
<=UC7
) {
804 dynamicWindow
=(int8_t)(b
-UC0
);
805 isSingleByteMode
=TRUE
;
807 } else if(/* UD0<=b && */ b
<=UD7
) {
808 dynamicWindow
=(int8_t)(b
-UD0
);
809 isSingleByteMode
=TRUE
;
815 isSingleByteMode
=TRUE
;
825 /* callback(illegal) */
826 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
839 *target
++=(UChar
)((byteOne
<<8)|b
);
847 /* set the converter state back into UConverter */
848 if(U_FAILURE(*pErrorCode
) && *pErrorCode
!=U_BUFFER_OVERFLOW_ERROR
) {
849 /* reset to deal with the next character */
851 } else if(state
==readCommand
) {
852 /* not in a multi-byte sequence, reset toULength */
855 scsu
->toUIsSingleByteMode
=isSingleByteMode
;
856 scsu
->toUState
=state
;
857 scsu
->toUQuoteWindow
=quoteWindow
;
858 scsu
->toUDynamicWindow
=dynamicWindow
;
859 scsu
->toUByteOne
=byteOne
;
861 /* write back the updated pointers */
862 pArgs
->source
=(const char *)source
;
863 pArgs
->target
=target
;
867 /* SCSU-from-Unicode conversion functions ----------------------------------- */
870 * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve
871 * reasonable results. The lookahead is minimal.
872 * Many cases are simple:
873 * A character fits directly into the current mode, a dynamic or static window,
874 * or is not compressible. These cases are tested first.
875 * Real compression heuristics are applied to the rest, in code branches for
876 * single/Unicode mode and BMP/supplementary code points.
877 * The heuristics used here are extremely simple.
880 /* get the number of the window that this character is in, or -1 */
882 getWindow(const uint32_t offsets
[8], uint32_t c
) {
885 if((uint32_t)(c
-offsets
[i
])<=0x7f) {
892 /* is the character in the dynamic window starting at the offset, or in the direct-encoded range? */
894 isInOffsetWindowOrDirect(uint32_t offset
, uint32_t c
) {
895 return (UBool
)(c
<=offset
+0x7f &&
896 (c
>=offset
|| (c
<=0x7f &&
897 (c
>=0x20 || (1UL<<c
)&0x2601))));
898 /* binary 0010 0110 0000 0001,
899 check for b==0xd || b==0xa || b==9 || b==0 */
903 * getNextDynamicWindow returns the next dynamic window to be redefined
906 getNextDynamicWindow(SCSUData
*scsu
) {
907 int8_t window
=scsu
->windowUse
[scsu
->nextWindowUseIndex
];
908 if(++scsu
->nextWindowUseIndex
==8) {
909 scsu
->nextWindowUseIndex
=0;
915 * useDynamicWindow() adjusts
916 * windowUse[] and nextWindowUseIndex for the algorithm to choose
917 * the next dynamic window to be defined;
918 * a subclass may override it and provide its own algorithm.
921 useDynamicWindow(SCSUData
*scsu
, int8_t window
) {
923 * move the existing window, which just became the most recently used one,
924 * up in windowUse[] to nextWindowUseIndex-1
927 /* first, find the index of the window - backwards to favor the more recently used windows */
930 i
=scsu
->nextWindowUseIndex
;
935 } while(scsu
->windowUse
[i
]!=window
);
937 /* now copy each windowUse[i+1] to [i] */
942 while(j
!=scsu
->nextWindowUseIndex
) {
943 scsu
->windowUse
[i
]=scsu
->windowUse
[j
];
948 /* finally, set the window into the most recently used index */
949 scsu
->windowUse
[i
]=window
;
953 * calculate the offset and the code for a dynamic window that contains the character
954 * takes fixed offsets into account
955 * the offset of the window is stored in the offset variable,
956 * the code is returned
958 * return offset code: -1 none <=0xff code for SDn/UDn else code for SDX/UDX, subtract 0x200 to get the true code
961 getDynamicOffset(uint32_t c
, uint32_t *pOffset
) {
965 if((uint32_t)(c
-fixedOffsets
[i
])<=0x7f) {
966 *pOffset
=fixedOffsets
[i
];
972 /* No dynamic window for US-ASCII. */
974 } else if(c
<0x3400 ||
975 (uint32_t)(c
-0x10000)<(0x14000-0x10000) ||
976 (uint32_t)(c
-0x1d000)<=(0x1ffff-0x1d000)
978 /* This character is in a code range for a "small", i.e., reasonably windowable, script. */
979 *pOffset
=c
&0x7fffff80;
981 } else if(0xe000<=c
&& c
!=0xfeff && c
<0xfff0) {
982 /* For these characters we need to take the gapOffset into account. */
983 *pOffset
=c
&0x7fffff80;
984 return (int)((c
-gapOffset
)>>7);
991 * Idea for compression:
992 * - save SCSUData and other state before really starting work
993 * - at endloop, see if compression could be better with just unicode mode
994 * - don't do this if a callback has been called
995 * - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning
996 * - different buffer handling!
998 * Drawback or need for corrective handling:
999 * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and
1000 * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible
1001 * not only for compression but also for HTML/XML documents with following charset/encoding announcers.
1003 * How to achieve both?
1004 * - Only replace the result after an SDX or SCU?
1008 _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
1009 UErrorCode
*pErrorCode
) {
1012 const UChar
*source
, *sourceLimit
;
1014 int32_t targetCapacity
;
1017 UBool isSingleByteMode
;
1018 uint8_t dynamicWindow
;
1019 uint32_t currentOffset
;
1023 int32_t sourceIndex
, nextSourceIndex
;
1027 /* variables for compression heuristics */
1033 /* set up the local pointers */
1034 cnv
=pArgs
->converter
;
1035 scsu
=(SCSUData
*)cnv
->extraInfo
;
1037 /* set up the local pointers */
1038 source
=pArgs
->source
;
1039 sourceLimit
=pArgs
->sourceLimit
;
1040 target
=(uint8_t *)pArgs
->target
;
1041 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
1042 offsets
=pArgs
->offsets
;
1044 /* get the state machine state */
1045 isSingleByteMode
=scsu
->fromUIsSingleByteMode
;
1046 dynamicWindow
=scsu
->fromUDynamicWindow
;
1047 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1051 /* sourceIndex=-1 if the current character began in the previous buffer */
1052 sourceIndex
= c
==0 ? 0 : -1;
1055 /* similar conversion "loop" as in toUnicode */
1057 if(isSingleByteMode
) {
1058 if(c
!=0 && targetCapacity
>0) {
1059 goto getTrailSingle
;
1062 /* state machine for single-byte mode */
1063 /* singleByteMode: */
1064 while(source
<sourceLimit
) {
1065 if(targetCapacity
<=0) {
1066 /* target is full */
1067 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1073 if((c
-0x20)<=0x5f) {
1074 /* pass US-ASCII graphic character through */
1075 *target
++=(uint8_t)c
;
1077 *offsets
++=sourceIndex
;
1081 if((1UL<<c
)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1083 *target
++=(uint8_t)c
;
1085 *offsets
++=sourceIndex
;
1089 /* quote C0 control character */
1094 } else if((delta
=c
-currentOffset
)<=0x7f) {
1095 /* use the current dynamic window */
1096 *target
++=(uint8_t)(delta
|0x80);
1098 *offsets
++=sourceIndex
;
1101 } else if(UTF_IS_SURROGATE(c
)) {
1102 if(UTF_IS_SURROGATE_FIRST(c
)) {
1105 if(source
<sourceLimit
) {
1106 /* test the following code unit */
1108 if(UTF_IS_SECOND_SURROGATE(trail
)) {
1111 c
=UTF16_GET_PAIR_VALUE(c
, trail
);
1112 /* convert this surrogate code point */
1113 /* exit this condition tree */
1115 /* this is an unmatched lead code unit (1st surrogate) */
1116 /* callback(illegal) */
1117 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1125 /* this is an unmatched trail code unit (2nd surrogate) */
1126 /* callback(illegal) */
1127 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1131 /* compress supplementary character U+10000..U+10ffff */
1132 if((delta
=c
-currentOffset
)<=0x7f) {
1133 /* use the current dynamic window */
1134 *target
++=(uint8_t)(delta
|0x80);
1136 *offsets
++=sourceIndex
;
1139 } else if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1140 /* there is a dynamic window that contains this character, change to it */
1141 dynamicWindow
=window
;
1142 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1143 useDynamicWindow(scsu
, dynamicWindow
);
1144 c
=((uint32_t)(SC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1147 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1148 /* might check if there are more characters in this window to come */
1149 /* define an extended window with this character */
1151 dynamicWindow
=getNextDynamicWindow(scsu
);
1152 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1153 useDynamicWindow(scsu
, dynamicWindow
);
1154 c
=((uint32_t)SDX
<<24)|((uint32_t)dynamicWindow
<<21)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1158 /* change to Unicode mode and output this (lead, trail) pair */
1159 isSingleByteMode
=FALSE
;
1160 *target
++=(uint8_t)SCU
;
1162 *offsets
++=sourceIndex
;
1165 c
=((uint32_t)lead
<<16)|trail
;
1170 /* quote C1 control character */
1171 c
=(c
&0x7f)|(SQ0
+1)<<8; /* SQ0+1==SQ1 */
1174 } else if(c
==0xfeff || c
>=0xfff0) {
1175 /* quote signature character=byte order mark and specials */
1180 /* compress all other BMP characters */
1181 if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1182 /* there is a window defined that contains this character - switch to it or quote from it? */
1183 if(source
>=sourceLimit
|| isInOffsetWindowOrDirect(scsu
->fromUDynamicOffsets
[window
], *source
)) {
1184 /* change to dynamic window */
1185 dynamicWindow
=window
;
1186 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1187 useDynamicWindow(scsu
, dynamicWindow
);
1188 c
=((uint32_t)(SC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1192 /* quote from dynamic window */
1193 c
=((uint32_t)(SQ0
+window
)<<8)|(c
-scsu
->fromUDynamicOffsets
[window
])|0x80;
1197 } else if((window
=getWindow(staticOffsets
, c
))>=0) {
1198 /* quote from static window */
1199 c
=((uint32_t)(SQ0
+window
)<<8)|(c
-staticOffsets
[window
]);
1202 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1203 /* define a dynamic window with this character */
1204 dynamicWindow
=getNextDynamicWindow(scsu
);
1205 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1206 useDynamicWindow(scsu
, dynamicWindow
);
1207 c
=((uint32_t)(SD0
+dynamicWindow
)<<16)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1210 } else if((uint32_t)(c
-0x3400)<(0xd800-0x3400) &&
1211 (source
>=sourceLimit
|| (uint32_t)(*source
-0x3400)<(0xd800-0x3400))
1214 * this character is not compressible (a BMP ideograph or similar);
1215 * switch to Unicode mode if this is the last character in the block
1216 * or there is at least one more ideograph following immediately
1218 isSingleByteMode
=FALSE
;
1230 /* normal end of conversion: prepare for a new character */
1232 sourceIndex
=nextSourceIndex
;
1235 if(c
!=0 && targetCapacity
>0) {
1236 goto getTrailUnicode
;
1239 /* state machine for Unicode mode */
1240 /* unicodeByteMode: */
1241 while(source
<sourceLimit
) {
1242 if(targetCapacity
<=0) {
1243 /* target is full */
1244 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1250 if((uint32_t)(c
-0x3400)<(0xd800-0x3400)) {
1251 /* not compressible, write character directly */
1252 if(targetCapacity
>=2) {
1253 *target
++=(uint8_t)(c
>>8);
1254 *target
++=(uint8_t)c
;
1256 *offsets
++=sourceIndex
;
1257 *offsets
++=sourceIndex
;
1264 } else if((uint32_t)(c
-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
1265 /* compress BMP character if the following one is not an uncompressible ideograph */
1266 if(!(source
<sourceLimit
&& (uint32_t)(*source
-0x3400)<(0xd800-0x3400))) {
1267 if(((uint32_t)(c
-0x30)<10 || (uint32_t)(c
-0x61)<26 || (uint32_t)(c
-0x41)<26)) {
1268 /* ASCII digit or letter */
1269 isSingleByteMode
=TRUE
;
1270 c
|=((uint32_t)(UC0
+dynamicWindow
)<<8)|c
;
1273 } else if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1274 /* there is a dynamic window that contains this character, change to it */
1275 isSingleByteMode
=TRUE
;
1276 dynamicWindow
=window
;
1277 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1278 useDynamicWindow(scsu
, dynamicWindow
);
1279 c
=((uint32_t)(UC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1282 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1283 /* define a dynamic window with this character */
1284 isSingleByteMode
=TRUE
;
1285 dynamicWindow
=getNextDynamicWindow(scsu
);
1286 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1287 useDynamicWindow(scsu
, dynamicWindow
);
1288 c
=((uint32_t)(UD0
+dynamicWindow
)<<16)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1294 /* don't know how to compress this character, just write it directly */
1297 } else if(c
<0xe000) {
1298 /* c is a surrogate */
1299 if(UTF_IS_SURROGATE_FIRST(c
)) {
1302 if(source
<sourceLimit
) {
1303 /* test the following code unit */
1305 if(UTF_IS_SECOND_SURROGATE(trail
)) {
1308 c
=UTF16_GET_PAIR_VALUE(c
, trail
);
1309 /* convert this surrogate code point */
1310 /* exit this condition tree */
1312 /* this is an unmatched lead code unit (1st surrogate) */
1313 /* callback(illegal) */
1314 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1322 /* this is an unmatched trail code unit (2nd surrogate) */
1323 /* callback(illegal) */
1324 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1328 /* compress supplementary character */
1329 if( (window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0 &&
1330 !(source
<sourceLimit
&& (uint32_t)(*source
-0x3400)<(0xd800-0x3400))
1333 * there is a dynamic window that contains this character and
1334 * the following character is not uncompressible,
1335 * change to the window
1337 isSingleByteMode
=TRUE
;
1338 dynamicWindow
=window
;
1339 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1340 useDynamicWindow(scsu
, dynamicWindow
);
1341 c
=((uint32_t)(UC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1344 } else if(source
<sourceLimit
&& lead
==*source
&& /* too lazy to check trail in same window as source[1] */
1345 (code
=getDynamicOffset(c
, &offset
))>=0
1347 /* two supplementary characters in (probably) the same window - define an extended one */
1348 isSingleByteMode
=TRUE
;
1350 dynamicWindow
=getNextDynamicWindow(scsu
);
1351 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1352 useDynamicWindow(scsu
, dynamicWindow
);
1353 c
=((uint32_t)UDX
<<24)|((uint32_t)dynamicWindow
<<21)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1357 /* don't know how to compress this character, just write it directly */
1358 c
=((uint32_t)lead
<<16)|trail
;
1362 } else /* 0xe000<=c<0xf300 */ {
1363 /* quote to avoid SCSU tags */
1369 /* normal end of conversion: prepare for a new character */
1371 sourceIndex
=nextSourceIndex
;
1376 /* set the converter state back into UConverter */
1377 scsu
->fromUIsSingleByteMode
=isSingleByteMode
;
1378 scsu
->fromUDynamicWindow
=dynamicWindow
;
1382 /* write back the updated pointers */
1383 pArgs
->source
=source
;
1384 pArgs
->target
=(char *)target
;
1385 pArgs
->offsets
=offsets
;
1389 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1390 /* from the first if in the loop we know that targetCapacity>0 */
1391 if(length
<=targetCapacity
) {
1394 /* each branch falls through to the next one */
1396 *target
++=(uint8_t)(c
>>24);
1398 *target
++=(uint8_t)(c
>>16);
1400 *target
++=(uint8_t)(c
>>8);
1402 *target
++=(uint8_t)c
;
1404 /* will never occur */
1409 /* each branch falls through to the next one */
1411 *target
++=(uint8_t)(c
>>24);
1412 *offsets
++=sourceIndex
;
1414 *target
++=(uint8_t)(c
>>16);
1415 *offsets
++=sourceIndex
;
1417 *target
++=(uint8_t)(c
>>8);
1418 *offsets
++=sourceIndex
;
1420 *target
++=(uint8_t)c
;
1421 *offsets
++=sourceIndex
;
1423 /* will never occur */
1427 targetCapacity
-=length
;
1429 /* normal end of conversion: prepare for a new character */
1431 sourceIndex
=nextSourceIndex
;
1437 * We actually do this backwards here:
1438 * In order to save an intermediate variable, we output
1439 * first to the overflow buffer what does not fit into the
1442 /* we know that 0<=targetCapacity<length<=4 */
1443 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
1444 length
-=targetCapacity
;
1445 p
=(uint8_t *)cnv
->charErrorBuffer
;
1447 /* each branch falls through to the next one */
1449 *p
++=(uint8_t)(c
>>24);
1451 *p
++=(uint8_t)(c
>>16);
1453 *p
++=(uint8_t)(c
>>8);
1457 /* will never occur */
1460 cnv
->charErrorBufferLength
=(int8_t)length
;
1462 /* now output what fits into the regular target */
1463 c
>>=8*length
; /* length was reduced by targetCapacity */
1464 switch(targetCapacity
) {
1465 /* each branch falls through to the next one */
1467 *target
++=(uint8_t)(c
>>16);
1469 *offsets
++=sourceIndex
;
1472 *target
++=(uint8_t)(c
>>8);
1474 *offsets
++=sourceIndex
;
1477 *target
++=(uint8_t)c
;
1479 *offsets
++=sourceIndex
;
1485 /* target overflow */
1487 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1494 * Identical to _SCSUFromUnicodeWithOffsets but without offset handling.
1495 * If a change is made in the original function, then either
1496 * change this function the same way or
1497 * re-copy the original function and remove the variables
1498 * offsets, sourceIndex, and nextSourceIndex.
/*
 * Convert UTF-16 text from pArgs->source into SCSU bytes at pArgs->target,
 * without recording source offsets.
 *
 * pArgs      - supplies the converter plus the source/sourceLimit and
 *              target/targetLimit pointers; source and target are written
 *              back before the function finishes.
 * pErrorCode - set to U_BUFFER_OVERFLOW_ERROR when the target is full and to
 *              U_ILLEGAL_CHAR_FOUND for an unmatched surrogate code unit.
 *
 * The encoder state (single-byte vs. Unicode mode, active dynamic window) is
 * read from and stored back into the SCSUData behind cnv->extraInfo.
 * Output bytes that do not fit into the target are parked in
 * cnv->charErrorBuffer.
 *
 * NOTE(review): this listing omits several original lines (braces, labels,
 * length assignments, gotos); the comments below describe only what is visible.
 */
1501 _SCSUFromUnicode(UConverterFromUnicodeArgs
*pArgs
,
1502 UErrorCode
*pErrorCode
) {
1505 const UChar
*source
, *sourceLimit
;
1507 int32_t targetCapacity
;
1509 UBool isSingleByteMode
;
1510 uint8_t dynamicWindow
;
1511 uint32_t currentOffset
;
1517 /* variables for compression heuristics */
1523 /* set up the local converter and SCSU state pointers */
1524 cnv
=pArgs
->converter
;
1525 scsu
=(SCSUData
*)cnv
->extraInfo
;
1527 /* set up the local source and target pointers */
1528 source
=pArgs
->source
;
1529 sourceLimit
=pArgs
->sourceLimit
;
1530 target
=(uint8_t *)pArgs
->target
;
/* room left in the target buffer, counted in bytes */
1531 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
1533 /* get the state machine state */
1534 isSingleByteMode
=scsu
->fromUIsSingleByteMode
;
1535 dynamicWindow
=scsu
->fromUDynamicWindow
;
/* start offset of the currently selected dynamic window */
1536 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1540 /* similar conversion "loop" as in toUnicode */
1542 if(isSingleByteMode
) {
/*
 * NOTE(review): a nonzero c here presumably is a lead surrogate carried over
 * from an earlier call; the initialization of c is not visible in this listing.
 */
1543 if(c
!=0 && targetCapacity
>0) {
1544 goto getTrailSingle
;
1547 /* state machine for single-byte mode */
1548 /* singleByteMode: */
1549 while(source
<sourceLimit
) {
1550 if(targetCapacity
<=0) {
1551 /* target is full */
1552 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
/* true for c in 0x20..0x7f, provided c is unsigned (its declaration is not visible here) */
1557 if((c
-0x20)<=0x5f) {
1558 /* pass US-ASCII graphic character through */
1559 *target
++=(uint8_t)c
;
1562 if((1UL<<c
)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1564 *target
++=(uint8_t)c
;
1567 /* quote C0 control character */
/* delta<=0x7f: c lies in the 128 code points starting at currentOffset */
1572 } else if((delta
=c
-currentOffset
)<=0x7f) {
1573 /* use the current dynamic window: one byte with the high bit set */
1574 *target
++=(uint8_t)(delta
|0x80);
1576 } else if(UTF_IS_SURROGATE(c
)) {
1577 if(UTF_IS_SURROGATE_FIRST(c
)) {
1580 if(source
<sourceLimit
) {
1581 /* test the following code unit */
1583 if(UTF_IS_SECOND_SURROGATE(trail
)) {
/* combine the lead/trail pair into one supplementary code point */
1585 c
=UTF16_GET_PAIR_VALUE(c
, trail
);
1586 /* convert this surrogate code point */
1587 /* exit this condition tree */
1589 /* this is an unmatched lead code unit (1st surrogate) */
1590 /* callback(illegal) */
1591 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1599 /* this is an unmatched trail code unit (2nd surrogate) */
1600 /* callback(illegal) */
1601 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1605 /* compress supplementary character U+10000..U+10ffff */
1606 if((delta
=c
-currentOffset
)<=0x7f) {
1607 /* use the current dynamic window */
1608 *target
++=(uint8_t)(delta
|0x80);
1610 } else if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1611 /* there is a dynamic window that contains this character, change to it */
1612 dynamicWindow
=window
;
1613 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1614 useDynamicWindow(scsu
, dynamicWindow
);
/* pack two output bytes into c: the SCn window-select tag, then the window-relative byte */
1615 c
=((uint32_t)(SC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1618 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1619 /* might check if there are more characters in this window to come */
1620 /* define an extended window with this character */
1622 dynamicWindow
=getNextDynamicWindow(scsu
);
1623 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1624 useDynamicWindow(scsu
, dynamicWindow
);
/* pack four output bytes into c: SDX tag, window number in bits 21 and up combined with code, then the window-relative byte */
1625 c
=((uint32_t)SDX
<<24)|((uint32_t)dynamicWindow
<<21)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1629 /* change to Unicode mode and output this (lead, trail) pair */
1630 isSingleByteMode
=FALSE
;
1631 *target
++=(uint8_t)SCU
;
1633 c
=((uint32_t)lead
<<16)|trail
;
1638 /* quote C1 control character: SQ1 tag in the upper byte, low 7 bits of c in the lower byte */
1639 c
=(c
&0x7f)|(SQ0
+1)<<8; /* SQ0+1==SQ1 */
1642 } else if(c
==0xfeff || c
>=0xfff0) {
1643 /* quote signature character=byte order mark and specials */
1648 /* compress all other BMP characters */
1649 if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1650 /* there is a window defined that contains this character - switch to it or quote from it? */
/* switch when this is the last code unit or the next one passes isInOffsetWindowOrDirect() for the same window */
1651 if(source
>=sourceLimit
|| isInOffsetWindowOrDirect(scsu
->fromUDynamicOffsets
[window
], *source
)) {
1652 /* change to dynamic window */
1653 dynamicWindow
=window
;
1654 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1655 useDynamicWindow(scsu
, dynamicWindow
);
1656 c
=((uint32_t)(SC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1660 /* quote from dynamic window; the current window selection stays unchanged */
1661 c
=((uint32_t)(SQ0
+window
)<<8)|(c
-scsu
->fromUDynamicOffsets
[window
])|0x80;
1665 } else if((window
=getWindow(staticOffsets
, c
))>=0) {
1666 /* quote from static window (no 0x80 bit is set on the offset byte here) */
1667 c
=((uint32_t)(SQ0
+window
)<<8)|(c
-staticOffsets
[window
]);
1670 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1671 /* define a dynamic window with this character */
1672 dynamicWindow
=getNextDynamicWindow(scsu
);
1673 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1674 useDynamicWindow(scsu
, dynamicWindow
);
/* pack three output bytes into c: SDn tag, window-definition code, window-relative byte */
1675 c
=((uint32_t)(SD0
+dynamicWindow
)<<16)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
/* unsigned range test: 0x3400<=c<0xd800, and the same test for the next code unit */
1678 } else if((uint32_t)(c
-0x3400)<(0xd800-0x3400) &&
1679 (source
>=sourceLimit
|| (uint32_t)(*source
-0x3400)<(0xd800-0x3400))
1682 * this character is not compressible (a BMP ideograph or similar);
1683 * switch to Unicode mode if this is the last character in the block
1684 * or there is at least one more ideograph following immediately
1686 isSingleByteMode
=FALSE
;
1698 /* normal end of conversion: prepare for a new character */
1702 if(c
!=0 && targetCapacity
>0) {
1703 goto getTrailUnicode
;
1706 /* state machine for Unicode mode */
1707 /* unicodeByteMode: */
1708 while(source
<sourceLimit
) {
1709 if(targetCapacity
<=0) {
1710 /* target is full */
1711 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1716 if((uint32_t)(c
-0x3400)<(0xd800-0x3400)) {
1717 /* not compressible, write character directly as two bytes, high byte first */
1718 if(targetCapacity
>=2) {
1719 *target
++=(uint8_t)(c
>>8);
1720 *target
++=(uint8_t)c
;
1726 } else if((uint32_t)(c
-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
1727 /* compress BMP character if the following one is not an uncompressible ideograph */
1728 if(!(source
<sourceLimit
&& (uint32_t)(*source
-0x3400)<(0xd800-0x3400))) {
1729 if(((uint32_t)(c
-0x30)<10 || (uint32_t)(c
-0x61)<26 || (uint32_t)(c
-0x41)<26)) {
1730 /* ASCII digit or letter */
1731 isSingleByteMode
=TRUE
;
/* NOTE(review): the trailing "|c" is redundant with "|="; the result equals c|((UC0+dynamicWindow)<<8) */
1732 c
|=((uint32_t)(UC0
+dynamicWindow
)<<8)|c
;
1735 } else if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1736 /* there is a dynamic window that contains this character, change to it */
1737 isSingleByteMode
=TRUE
;
1738 dynamicWindow
=window
;
1739 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1740 useDynamicWindow(scsu
, dynamicWindow
);
/* pack two output bytes into c: the UCn window-select tag, then the window-relative byte */
1741 c
=((uint32_t)(UC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1744 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1745 /* define a dynamic window with this character */
1746 isSingleByteMode
=TRUE
;
1747 dynamicWindow
=getNextDynamicWindow(scsu
);
1748 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1749 useDynamicWindow(scsu
, dynamicWindow
);
/* pack three output bytes into c: UDn tag, window-definition code, window-relative byte */
1750 c
=((uint32_t)(UD0
+dynamicWindow
)<<16)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1756 /* don't know how to compress this character, just write it directly */
1759 } else if(c
<0xe000) {
1760 /* c is a surrogate */
1761 if(UTF_IS_SURROGATE_FIRST(c
)) {
1764 if(source
<sourceLimit
) {
1765 /* test the following code unit */
1767 if(UTF_IS_SECOND_SURROGATE(trail
)) {
1769 c
=UTF16_GET_PAIR_VALUE(c
, trail
);
1770 /* convert this surrogate code point */
1771 /* exit this condition tree */
1773 /* this is an unmatched lead code unit (1st surrogate) */
1774 /* callback(illegal) */
1775 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1783 /* this is an unmatched trail code unit (2nd surrogate) */
1784 /* callback(illegal) */
1785 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1789 /* compress supplementary character */
1790 if( (window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0 &&
1791 !(source
<sourceLimit
&& (uint32_t)(*source
-0x3400)<(0xd800-0x3400))
1794 * there is a dynamic window that contains this character and
1795 * the following character is not uncompressible,
1796 * change to the window
1798 isSingleByteMode
=TRUE
;
1799 dynamicWindow
=window
;
1800 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1801 useDynamicWindow(scsu
, dynamicWindow
);
1802 c
=((uint32_t)(UC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
/* the next code unit equals this pair's lead surrogate: treat it as another supplementary character nearby */
1805 } else if(source
<sourceLimit
&& lead
==*source
&& /* too lazy to check trail in same window as source[1] */
1806 (code
=getDynamicOffset(c
, &offset
))>=0
1808 /* two supplementary characters in (probably) the same window - define an extended one */
1809 isSingleByteMode
=TRUE
;
1811 dynamicWindow
=getNextDynamicWindow(scsu
);
1812 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1813 useDynamicWindow(scsu
, dynamicWindow
);
/* pack four output bytes into c: UDX tag, window number in bits 21 and up combined with code, then the window-relative byte */
1814 c
=((uint32_t)UDX
<<24)|((uint32_t)dynamicWindow
<<21)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1818 /* don't know how to compress this character, just write it directly */
1819 c
=((uint32_t)lead
<<16)|trail
;
1823 } else /* 0xe000<=c<0xf300 */ {
1824 /* quote to avoid SCSU tags */
1830 /* normal end of conversion: prepare for a new character */
1836 /* set the converter state back into UConverter */
1837 scsu
->fromUIsSingleByteMode
=isSingleByteMode
;
1838 scsu
->fromUDynamicWindow
=dynamicWindow
;
1842 /* write back the updated pointers */
1843 pArgs
->source
=source
;
1844 pArgs
->target
=(char *)target
;
1848 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1849 /* from the first if in the loop we know that targetCapacity>0 */
1850 if(length
<=targetCapacity
) {
1852 /* each branch falls through to the next one */
/* bytes are emitted from the most significant used byte of c down to the least significant */
1854 *target
++=(uint8_t)(c
>>24);
1856 *target
++=(uint8_t)(c
>>16);
1858 *target
++=(uint8_t)(c
>>8);
1860 *target
++=(uint8_t)c
;
1862 /* will never occur */
1865 targetCapacity
-=length
;
1867 /* normal end of conversion: prepare for a new character */
1874 * We actually do this backwards here:
1875 * In order to save an intermediate variable, we output
1876 * first to the overflow buffer what does not fit into the
1879 /* we know that 0<=targetCapacity<length<=4 */
1880 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
/* from here on, length counts only the bytes that must go to the overflow buffer */
1881 length
-=targetCapacity
;
1882 p
=(uint8_t *)cnv
->charErrorBuffer
;
1884 /* each branch falls through to the next one */
1886 *p
++=(uint8_t)(c
>>24);
1888 *p
++=(uint8_t)(c
>>16);
1890 *p
++=(uint8_t)(c
>>8);
1894 /* will never occur */
1897 cnv
->charErrorBufferLength
=(int8_t)length
;
1899 /* now output what fits into the regular target */
/* shift out the low bytes that were just stored in the overflow buffer */
1900 c
>>=8*length
; /* length was reduced by targetCapacity */
1901 switch(targetCapacity
) {
1902 /* each branch falls through to the next one */
1904 *target
++=(uint8_t)(c
>>16);
1906 *target
++=(uint8_t)(c
>>8);
1908 *target
++=(uint8_t)c
;
1913 /* target overflow */
1915 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1921 /* miscellaneous ------------------------------------------------------------ */
/*
 * Return the converter name as a string literal, chosen by the locale value
 * stored in the converter's SCSUData (cnv->extraInfo).
 * Only the branch returning "SCSU,locale=ja" is visible in this listing.
 */
1924 _SCSUGetName(const UConverter
*cnv
) {
1925 SCSUData
*scsu
=(SCSUData
*)cnv
->extraInfo
;
/* the name depends only on scsu->locale */
1927 switch(scsu
->locale
) {
1929 return "SCSU,locale=ja";
1935 /*
 * Structure for SafeClone calculations.
 * _SCSUSafeClone() overlays it on the caller-supplied buffer and uses its
 * cnv member as the cloned converter and its mydata member as the copy of
 * the SCSUData; sizeof(struct cloneSCSUStruct) is the required buffer size.
 */
1936 struct cloneSCSUStruct
/*
 * Clone the SCSU-specific state of cnv into a caller-supplied buffer.
 *
 * cnv         - converter whose extraInfo (an SCSUData) is copied.
 * pBufferSize - if *pBufferSize is 0 this is a preflighting request and the
 *               required size, sizeof(struct cloneSCSUStruct), is stored there.
 *
 * Does nothing further when *status already indicates failure.
 * On the normal path the buffer is treated as a struct cloneSCSUStruct, the
 * SCSUData is copied into its mydata member, the clone's extraInfo is pointed
 * at that copy, and the address of the embedded converter is returned.
 *
 * NOTE(review): the stackBuffer and status parameter lines and the early
 * returns are not visible in this listing. No check that *pBufferSize is at
 * least bufferSizeNeeded is visible either; presumably the caller
 * (ucnv_safeClone) validates the buffer - confirm.
 */
1943 _SCSUSafeClone(const UConverter
*cnv
,
1945 int32_t *pBufferSize
,
1948 struct cloneSCSUStruct
* localClone
;
1949 int32_t bufferSizeNeeded
= sizeof(struct cloneSCSUStruct
);
1951 if (U_FAILURE(*status
)){
1955 if (*pBufferSize
== 0){ /* 'preflighting' request - set needed size into *pBufferSize */
1956 *pBufferSize
= bufferSizeNeeded
;
/* reinterpret the caller's buffer as the combined converter + SCSUData struct */
1960 localClone
= (struct cloneSCSUStruct
*)stackBuffer
;
1961 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
/* copy the SCSU state and make the clone point at its own copy */
1963 uprv_memcpy(&localClone
->mydata
, cnv
->extraInfo
, sizeof(SCSUData
));
1964 localClone
->cnv
.extraInfo
= &localClone
->mydata
;
/* flag that extraInfo lives inside the clone's own buffer */
1965 localClone
->cnv
.isExtraLocal
= TRUE
;
1967 return &localClone
->cnv
;
/*
 * Implementation table for the SCSU converter.
 * Only some initializer entries are visible in this listing: the
 * with-offsets to-Unicode and from-Unicode functions and
 * ucnv_getCompleteUnicodeSet.
 */
1971 static const UConverterImpl _SCSUImpl
={
1982 _SCSUToUnicodeWithOffsets
,
1984 _SCSUFromUnicodeWithOffsets
,
1991 ucnv_getCompleteUnicodeSet
/*
 * Static description of the SCSU converter: structure size, CCSID, platform
 * and converter type, bytes-per-UChar limits and the substitution bytes.
 * Some initializer lines are not visible in this listing.
 */
1994 static const UConverterStaticData _SCSUStaticData
={
1995 sizeof(UConverterStaticData
),
1997 1212, /* CCSID for SCSU */
1998 UCNV_IBM
, UCNV_SCSU
,
1999 1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */
2001 * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode
2002 * substitution string.
/* 3 subchar bytes: 0x0e is the SQU tag (quote a single Unicode character), followed by 0xff 0xfd for U+fffd */
2004 { 0x0e, 0xff, 0xfd, 0 }, 3,
2008 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
2011 const UConverterSharedData _SCSUData
={
2012 sizeof(UConverterSharedData
), ~((uint32_t)0),
2013 NULL
, NULL
, &_SCSUStaticData
, FALSE
, &_SCSUImpl
,