1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 ******************************************************************************
6 * Copyright (C) 2000-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 ******************************************************************************
10 * file name: ucnvscsu.c
12 * tab size: 8 (not used)
15 * created on: 2000nov18
16 * created by: Markus W. Scherer
18 * This is an implementation of the Standard Compression Scheme for Unicode
19 * as defined in http://www.unicode.org/unicode/reports/tr6/ .
20 * Reserved commands and window settings are treated as illegal sequences and
21 * will result in callback calls.
24 #include "unicode/utypes.h"
26 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
28 #include "unicode/ucnv.h"
29 #include "unicode/ucnv_cb.h"
30 #include "unicode/utf16.h"
35 /* SCSU definitions --------------------------------------------------------- */
37 /* SCSU command byte values */
39 SQ0
=0x01, /* Quote from window pair 0 */
40 SQ7
=0x08, /* Quote from window pair 7 */
41 SDX
=0x0B, /* Define a window as extended */
42 Srs
=0x0C, /* reserved */
43 SQU
=0x0E, /* Quote a single Unicode character */
44 SCU
=0x0F, /* Change to Unicode mode */
45 SC0
=0x10, /* Select window 0 */
46 SC7
=0x17, /* Select window 7 */
47 SD0
=0x18, /* Define and select window 0 */
48 SD7
=0x1F, /* Define and select window 7 */
50 UC0
=0xE0, /* Select window 0 */
51 UC7
=0xE7, /* Select window 7 */
52 UD0
=0xE8, /* Define and select window 0 */
53 UD7
=0xEF, /* Define and select window 7 */
54 UQU
=0xF0, /* Quote a single Unicode character */
55 UDX
=0xF1, /* Define a Window as extended */
56 Urs
=0xF2 /* reserved */
61 * Unicode code points from 3400 to E000 are not addressable by
62 * dynamic window, since in these areas no short run alphabets are
63 * found. Therefore add gapOffset to all values from gapThreshold.
68 /* values between reservedStart and fixedThreshold are reserved */
71 /* use table of predefined fixed offsets for values from fixedThreshold */
75 /* constant offsets for the 8 static windows */
76 static const uint32_t staticOffsets
[8]={
77 0x0000, /* ASCII for quoted tags */
78 0x0080, /* Latin - 1 Supplement (for access to punctuation) */
79 0x0100, /* Latin Extended-A */
80 0x0300, /* Combining Diacritical Marks */
81 0x2000, /* General Punctuation */
82 0x2080, /* Currency Symbols */
83 0x2100, /* Letterlike Symbols and Number Forms */
84 0x3000 /* CJK Symbols and punctuation */
87 /* initial offsets for the 8 dynamic (sliding) windows */
88 static const uint32_t initialDynamicOffsets
[8]={
90 0x00C0, /* Latin Extended A */
91 0x0400, /* Cyrillic */
93 0x0900, /* Devanagari */
94 0x3040, /* Hiragana */
95 0x30A0, /* Katakana */
96 0xFF00 /* Fullwidth ASCII */
99 /* Table of fixed predefined Offsets */
100 static const uint32_t fixedOffsets
[]={
101 /* 0xF9 */ 0x00C0, /* Latin-1 Letters + half of Latin Extended A */
102 /* 0xFA */ 0x0250, /* IPA extensions */
103 /* 0xFB */ 0x0370, /* Greek */
104 /* 0xFC */ 0x0530, /* Armenian */
105 /* 0xFD */ 0x3040, /* Hiragana */
106 /* 0xFE */ 0x30A0, /* Katakana */
107 /* 0xFF */ 0xFF60 /* Halfwidth Katakana */
121 typedef struct SCSUData
{
122 /* dynamic window offsets, initialized to default values from initialDynamicOffsets */
123 uint32_t toUDynamicOffsets
[8];
124 uint32_t fromUDynamicOffsets
[8];
126 /* state machine state - toUnicode */
127 UBool toUIsSingleByteMode
;
129 int8_t toUQuoteWindow
, toUDynamicWindow
;
131 uint8_t toUPadding
[3];
133 /* state machine state - fromUnicode */
134 UBool fromUIsSingleByteMode
;
135 int8_t fromUDynamicWindow
;
138 * windowUse[] keeps track of the use of the dynamic windows:
139 * At nextWindowUseIndex there is the least recently used window,
140 * and the following windows (in a wrapping manner) are more and more
142 * At nextWindowUseIndex-1 there is the most recently used window.
145 int8_t nextWindowUseIndex
;
149 static const int8_t initialWindowUse
[8]={ 7, 0, 3, 2, 4, 5, 6, 1 };
150 static const int8_t initialWindowUse_ja
[8]={ 3, 2, 4, 1, 0, 7, 5, 6 };
156 /* SCSU setup functions ----------------------------------------------------- */
158 static void U_CALLCONV
159 _SCSUReset(UConverter
*cnv
, UConverterResetChoice choice
) {
160 SCSUData
*scsu
=(SCSUData
*)cnv
->extraInfo
;
162 if(choice
<=UCNV_RESET_TO_UNICODE
) {
163 /* reset toUnicode */
164 uprv_memcpy(scsu
->toUDynamicOffsets
, initialDynamicOffsets
, 32);
166 scsu
->toUIsSingleByteMode
=TRUE
;
167 scsu
->toUState
=readCommand
;
168 scsu
->toUQuoteWindow
=scsu
->toUDynamicWindow
=0;
173 if(choice
!=UCNV_RESET_TO_UNICODE
) {
174 /* reset fromUnicode */
175 uprv_memcpy(scsu
->fromUDynamicOffsets
, initialDynamicOffsets
, 32);
177 scsu
->fromUIsSingleByteMode
=TRUE
;
178 scsu
->fromUDynamicWindow
=0;
180 scsu
->nextWindowUseIndex
=0;
181 switch(scsu
->locale
) {
183 uprv_memcpy(scsu
->windowUse
, initialWindowUse_ja
, 8);
186 uprv_memcpy(scsu
->windowUse
, initialWindowUse
, 8);
194 static void U_CALLCONV
195 _SCSUOpen(UConverter
*cnv
,
196 UConverterLoadArgs
*pArgs
,
197 UErrorCode
*pErrorCode
) {
198 const char *locale
=pArgs
->locale
;
199 if(pArgs
->onlyTestIsLoadable
) {
202 cnv
->extraInfo
=uprv_malloc(sizeof(SCSUData
));
203 if(cnv
->extraInfo
!=NULL
) {
204 if(locale
!=NULL
&& locale
[0]=='j' && locale
[1]=='a' && (locale
[2]==0 || locale
[2]=='_')) {
205 ((SCSUData
*)cnv
->extraInfo
)->locale
=l_ja
;
207 ((SCSUData
*)cnv
->extraInfo
)->locale
=lGeneric
;
209 _SCSUReset(cnv
, UCNV_RESET_BOTH
);
211 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
214 /* Set the substitution character U+fffd as a Unicode string. */
215 cnv
->subUChars
[0]=0xfffd;
219 static void U_CALLCONV
220 _SCSUClose(UConverter
*cnv
) {
221 if(cnv
->extraInfo
!=NULL
) {
222 if(!cnv
->isExtraLocal
) {
223 uprv_free(cnv
->extraInfo
);
229 /* SCSU-to-Unicode conversion functions ------------------------------------- */
231 static void U_CALLCONV
232 _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
233 UErrorCode
*pErrorCode
) {
236 const uint8_t *source
, *sourceLimit
;
238 const UChar
*targetLimit
;
240 UBool isSingleByteMode
;
241 uint8_t state
, byteOne
;
242 int8_t quoteWindow
, dynamicWindow
;
244 int32_t sourceIndex
, nextSourceIndex
;
248 /* set up the local pointers */
249 cnv
=pArgs
->converter
;
250 scsu
=(SCSUData
*)cnv
->extraInfo
;
252 source
=(const uint8_t *)pArgs
->source
;
253 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
254 target
=pArgs
->target
;
255 targetLimit
=pArgs
->targetLimit
;
256 offsets
=pArgs
->offsets
;
258 /* get the state machine state */
259 isSingleByteMode
=scsu
->toUIsSingleByteMode
;
260 state
=scsu
->toUState
;
261 quoteWindow
=scsu
->toUQuoteWindow
;
262 dynamicWindow
=scsu
->toUDynamicWindow
;
263 byteOne
=scsu
->toUByteOne
;
265 /* sourceIndex=-1 if the current character began in the previous buffer */
266 sourceIndex
=state
==readCommand
? 0 : -1;
272 * For performance, this is not a normal C loop.
273 * Instead, there are two code blocks for the two SCSU modes.
274 * The function branches to either one, and a change of the mode is done with a goto to
277 * Each branch has two conventional loops:
278 * - a fast-path loop for the most common codes in the mode
279 * - a loop for all other codes in the mode
280 * When the fast-path runs into a code that it cannot handle, its loop ends and it
281 * runs into the following loop to handle the other codes.
282 * The end of the input or output buffer is also handled by the slower loop.
283 * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
285 * The callback handling is done by returning with an error code.
286 * The conversion framework actually calls the callback function.
288 if(isSingleByteMode
) {
289 /* fast path for single-byte mode */
290 if(state
==readCommand
) {
292 while(source
<sourceLimit
&& target
<targetLimit
&& (b
=*source
)>=0x20) {
296 /* write US-ASCII graphic character or DEL */
299 *offsets
++=sourceIndex
;
302 /* write from dynamic window */
303 uint32_t c
=scsu
->toUDynamicOffsets
[dynamicWindow
]+(b
&0x7f);
307 *offsets
++=sourceIndex
;
310 /* output surrogate pair */
311 *target
++=(UChar
)(0xd7c0+(c
>>10));
312 if(target
<targetLimit
) {
313 *target
++=(UChar
)(0xdc00|(c
&0x3ff));
315 *offsets
++=sourceIndex
;
316 *offsets
++=sourceIndex
;
319 /* target overflow */
321 *offsets
++=sourceIndex
;
323 cnv
->UCharErrorBuffer
[0]=(UChar
)(0xdc00|(c
&0x3ff));
324 cnv
->UCharErrorBufferLength
=1;
325 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
330 sourceIndex
=nextSourceIndex
;
334 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
336 while(source
<sourceLimit
) {
337 if(target
>=targetLimit
) {
339 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
346 /* redundant conditions are commented out */
347 /* here: b<0x20 because otherwise we would be in fastSingle */
348 if((1UL<<b
)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
352 *offsets
++=sourceIndex
;
354 sourceIndex
=nextSourceIndex
;
358 dynamicWindow
=(int8_t)(b
-SC0
);
359 sourceIndex
=nextSourceIndex
;
361 } else /* if(SD0<=b && b<=SD7) */ {
362 dynamicWindow
=(int8_t)(b
-SD0
);
365 } else if(/* SQ0<=b && */ b
<=SQ7
) {
366 quoteWindow
=(int8_t)(b
-SQ0
);
373 sourceIndex
=nextSourceIndex
;
374 isSingleByteMode
=FALSE
;
377 /* callback(illegal) */
378 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
384 /* store the first byte of a multibyte sequence in toUBytes[] */
395 *target
++=(UChar
)((byteOne
<<8)|b
);
397 *offsets
++=sourceIndex
;
399 sourceIndex
=nextSourceIndex
;
404 /* all static offsets are in the BMP */
405 *target
++=(UChar
)(staticOffsets
[quoteWindow
]+b
);
407 *offsets
++=sourceIndex
;
410 /* write from dynamic window */
411 uint32_t c
=scsu
->toUDynamicOffsets
[quoteWindow
]+(b
&0x7f);
415 *offsets
++=sourceIndex
;
418 /* output surrogate pair */
419 *target
++=(UChar
)(0xd7c0+(c
>>10));
420 if(target
<targetLimit
) {
421 *target
++=(UChar
)(0xdc00|(c
&0x3ff));
423 *offsets
++=sourceIndex
;
424 *offsets
++=sourceIndex
;
427 /* target overflow */
429 *offsets
++=sourceIndex
;
431 cnv
->UCharErrorBuffer
[0]=(UChar
)(0xdc00|(c
&0x3ff));
432 cnv
->UCharErrorBufferLength
=1;
433 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
438 sourceIndex
=nextSourceIndex
;
442 dynamicWindow
=(int8_t)((b
>>5)&7);
443 byteOne
=(uint8_t)(b
&0x1f);
449 scsu
->toUDynamicOffsets
[dynamicWindow
]=0x10000+(byteOne
<<15UL | b
<<7UL);
450 sourceIndex
=nextSourceIndex
;
455 /* callback(illegal): Reserved window offset value 0 */
459 } else if(b
<gapThreshold
) {
460 scsu
->toUDynamicOffsets
[dynamicWindow
]=b
<<7UL;
461 } else if((uint8_t)(b
-gapThreshold
)<(reservedStart
-gapThreshold
)) {
462 scsu
->toUDynamicOffsets
[dynamicWindow
]=(b
<<7UL)+gapOffset
;
463 } else if(b
>=fixedThreshold
) {
464 scsu
->toUDynamicOffsets
[dynamicWindow
]=fixedOffsets
[b
-fixedThreshold
];
466 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
471 sourceIndex
=nextSourceIndex
;
477 /* fast path for Unicode mode */
478 if(state
==readCommand
) {
480 while(source
+1<sourceLimit
&& target
<targetLimit
&& (uint8_t)((b
=*source
)-UC0
)>(Urs
-UC0
)) {
481 *target
++=(UChar
)((b
<<8)|source
[1]);
483 *offsets
++=sourceIndex
;
485 sourceIndex
=nextSourceIndex
;
491 /* normal state machine for Unicode mode */
492 /* unicodeByteMode: */
493 while(source
<sourceLimit
) {
494 if(target
>=targetLimit
) {
496 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
503 if((uint8_t)(b
-UC0
)>(Urs
-UC0
)) {
508 } else if(/* UC0<=b && */ b
<=UC7
) {
509 dynamicWindow
=(int8_t)(b
-UC0
);
510 sourceIndex
=nextSourceIndex
;
511 isSingleByteMode
=TRUE
;
513 } else if(/* UD0<=b && */ b
<=UD7
) {
514 dynamicWindow
=(int8_t)(b
-UD0
);
515 isSingleByteMode
=TRUE
;
521 isSingleByteMode
=TRUE
;
531 /* callback(illegal) */
532 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
545 *target
++=(UChar
)((byteOne
<<8)|b
);
547 *offsets
++=sourceIndex
;
549 sourceIndex
=nextSourceIndex
;
557 /* set the converter state back into UConverter */
558 if(U_FAILURE(*pErrorCode
) && *pErrorCode
!=U_BUFFER_OVERFLOW_ERROR
) {
559 /* reset to deal with the next character */
561 } else if(state
==readCommand
) {
562 /* not in a multi-byte sequence, reset toULength */
565 scsu
->toUIsSingleByteMode
=isSingleByteMode
;
566 scsu
->toUState
=state
;
567 scsu
->toUQuoteWindow
=quoteWindow
;
568 scsu
->toUDynamicWindow
=dynamicWindow
;
569 scsu
->toUByteOne
=byteOne
;
571 /* write back the updated pointers */
572 pArgs
->source
=(const char *)source
;
573 pArgs
->target
=target
;
574 pArgs
->offsets
=offsets
;
579 * Identical to _SCSUToUnicodeWithOffsets but without offset handling.
580 * If a change is made in the original function, then either
581 * change this function the same way or
582 * re-copy the original function and remove the variables
583 * offsets, sourceIndex, and nextSourceIndex.
585 static void U_CALLCONV
586 _SCSUToUnicode(UConverterToUnicodeArgs
*pArgs
,
587 UErrorCode
*pErrorCode
) {
590 const uint8_t *source
, *sourceLimit
;
592 const UChar
*targetLimit
;
593 UBool isSingleByteMode
;
594 uint8_t state
, byteOne
;
595 int8_t quoteWindow
, dynamicWindow
;
599 /* set up the local pointers */
600 cnv
=pArgs
->converter
;
601 scsu
=(SCSUData
*)cnv
->extraInfo
;
603 source
=(const uint8_t *)pArgs
->source
;
604 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
605 target
=pArgs
->target
;
606 targetLimit
=pArgs
->targetLimit
;
608 /* get the state machine state */
609 isSingleByteMode
=scsu
->toUIsSingleByteMode
;
610 state
=scsu
->toUState
;
611 quoteWindow
=scsu
->toUQuoteWindow
;
612 dynamicWindow
=scsu
->toUDynamicWindow
;
613 byteOne
=scsu
->toUByteOne
;
618 * For performance, this is not a normal C loop.
619 * Instead, there are two code blocks for the two SCSU modes.
620 * The function branches to either one, and a change of the mode is done with a goto to
623 * Each branch has two conventional loops:
624 * - a fast-path loop for the most common codes in the mode
625 * - a loop for all other codes in the mode
626 * When the fast-path runs into a code that it cannot handle, its loop ends and it
627 * runs into the following loop to handle the other codes.
628 * The end of the input or output buffer is also handled by the slower loop.
629 * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
631 * The callback handling is done by returning with an error code.
632 * The conversion framework actually calls the callback function.
634 if(isSingleByteMode
) {
635 /* fast path for single-byte mode */
636 if(state
==readCommand
) {
638 while(source
<sourceLimit
&& target
<targetLimit
&& (b
=*source
)>=0x20) {
641 /* write US-ASCII graphic character or DEL */
644 /* write from dynamic window */
645 uint32_t c
=scsu
->toUDynamicOffsets
[dynamicWindow
]+(b
&0x7f);
649 /* output surrogate pair */
650 *target
++=(UChar
)(0xd7c0+(c
>>10));
651 if(target
<targetLimit
) {
652 *target
++=(UChar
)(0xdc00|(c
&0x3ff));
654 /* target overflow */
655 cnv
->UCharErrorBuffer
[0]=(UChar
)(0xdc00|(c
&0x3ff));
656 cnv
->UCharErrorBufferLength
=1;
657 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
665 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
667 while(source
<sourceLimit
) {
668 if(target
>=targetLimit
) {
670 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
676 /* redundant conditions are commented out */
677 /* here: b<0x20 because otherwise we would be in fastSingle */
678 if((1UL<<b
)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
684 dynamicWindow
=(int8_t)(b
-SC0
);
686 } else /* if(SD0<=b && b<=SD7) */ {
687 dynamicWindow
=(int8_t)(b
-SD0
);
690 } else if(/* SQ0<=b && */ b
<=SQ7
) {
691 quoteWindow
=(int8_t)(b
-SQ0
);
698 isSingleByteMode
=FALSE
;
701 /* callback(illegal) */
702 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
708 /* store the first byte of a multibyte sequence in toUBytes[] */
719 *target
++=(UChar
)((byteOne
<<8)|b
);
724 /* all static offsets are in the BMP */
725 *target
++=(UChar
)(staticOffsets
[quoteWindow
]+b
);
727 /* write from dynamic window */
728 uint32_t c
=scsu
->toUDynamicOffsets
[quoteWindow
]+(b
&0x7f);
732 /* output surrogate pair */
733 *target
++=(UChar
)(0xd7c0+(c
>>10));
734 if(target
<targetLimit
) {
735 *target
++=(UChar
)(0xdc00|(c
&0x3ff));
737 /* target overflow */
738 cnv
->UCharErrorBuffer
[0]=(UChar
)(0xdc00|(c
&0x3ff));
739 cnv
->UCharErrorBufferLength
=1;
740 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
748 dynamicWindow
=(int8_t)((b
>>5)&7);
749 byteOne
=(uint8_t)(b
&0x1f);
755 scsu
->toUDynamicOffsets
[dynamicWindow
]=0x10000+(byteOne
<<15UL | b
<<7UL);
760 /* callback(illegal): Reserved window offset value 0 */
764 } else if(b
<gapThreshold
) {
765 scsu
->toUDynamicOffsets
[dynamicWindow
]=b
<<7UL;
766 } else if((uint8_t)(b
-gapThreshold
)<(reservedStart
-gapThreshold
)) {
767 scsu
->toUDynamicOffsets
[dynamicWindow
]=(b
<<7UL)+gapOffset
;
768 } else if(b
>=fixedThreshold
) {
769 scsu
->toUDynamicOffsets
[dynamicWindow
]=fixedOffsets
[b
-fixedThreshold
];
771 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
781 /* fast path for Unicode mode */
782 if(state
==readCommand
) {
784 while(source
+1<sourceLimit
&& target
<targetLimit
&& (uint8_t)((b
=*source
)-UC0
)>(Urs
-UC0
)) {
785 *target
++=(UChar
)((b
<<8)|source
[1]);
790 /* normal state machine for Unicode mode */
791 /* unicodeByteMode: */
792 while(source
<sourceLimit
) {
793 if(target
>=targetLimit
) {
795 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
801 if((uint8_t)(b
-UC0
)>(Urs
-UC0
)) {
806 } else if(/* UC0<=b && */ b
<=UC7
) {
807 dynamicWindow
=(int8_t)(b
-UC0
);
808 isSingleByteMode
=TRUE
;
810 } else if(/* UD0<=b && */ b
<=UD7
) {
811 dynamicWindow
=(int8_t)(b
-UD0
);
812 isSingleByteMode
=TRUE
;
818 isSingleByteMode
=TRUE
;
828 /* callback(illegal) */
829 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
842 *target
++=(UChar
)((byteOne
<<8)|b
);
850 /* set the converter state back into UConverter */
851 if(U_FAILURE(*pErrorCode
) && *pErrorCode
!=U_BUFFER_OVERFLOW_ERROR
) {
852 /* reset to deal with the next character */
854 } else if(state
==readCommand
) {
855 /* not in a multi-byte sequence, reset toULength */
858 scsu
->toUIsSingleByteMode
=isSingleByteMode
;
859 scsu
->toUState
=state
;
860 scsu
->toUQuoteWindow
=quoteWindow
;
861 scsu
->toUDynamicWindow
=dynamicWindow
;
862 scsu
->toUByteOne
=byteOne
;
864 /* write back the updated pointers */
865 pArgs
->source
=(const char *)source
;
866 pArgs
->target
=target
;
870 /* SCSU-from-Unicode conversion functions ----------------------------------- */
873 * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve
874 * reasonable results. The lookahead is minimal.
875 * Many cases are simple:
876 * A character fits directly into the current mode, a dynamic or static window,
877 * or is not compressible. These cases are tested first.
878 * Real compression heuristics are applied to the rest, in code branches for
879 * single/Unicode mode and BMP/supplementary code points.
880 * The heuristics used here are extremely simple.
883 /* get the number of the window that this character is in, or -1 */
885 getWindow(const uint32_t offsets
[8], uint32_t c
) {
888 if((uint32_t)(c
-offsets
[i
])<=0x7f) {
895 /* is the character in the dynamic window starting at the offset, or in the direct-encoded range? */
897 isInOffsetWindowOrDirect(uint32_t offset
, uint32_t c
) {
898 return (UBool
)(c
<=offset
+0x7f &&
899 (c
>=offset
|| (c
<=0x7f &&
900 (c
>=0x20 || (1UL<<c
)&0x2601))));
901 /* binary 0010 0110 0000 0001,
902 check for c==0xd || c==0xa || c==9 || c==0 */
906 * getNextDynamicWindow returns the next dynamic window to be redefined
909 getNextDynamicWindow(SCSUData
*scsu
) {
910 int8_t window
=scsu
->windowUse
[scsu
->nextWindowUseIndex
];
911 if(++scsu
->nextWindowUseIndex
==8) {
912 scsu
->nextWindowUseIndex
=0;
918 * useDynamicWindow() adjusts
919 * windowUse[] and nextWindowUseIndex for the algorithm to choose
920 * the next dynamic window to be defined;
921 * a subclass may override it and provide its own algorithm.
924 useDynamicWindow(SCSUData
*scsu
, int8_t window
) {
926 * move the existing window, which just became the most recently used one,
927 * up in windowUse[] to nextWindowUseIndex-1
930 /* first, find the index of the window - backwards to favor the more recently used windows */
933 i
=scsu
->nextWindowUseIndex
;
938 } while(scsu
->windowUse
[i
]!=window
);
940 /* now copy each windowUse[i+1] to [i] */
945 while(j
!=scsu
->nextWindowUseIndex
) {
946 scsu
->windowUse
[i
]=scsu
->windowUse
[j
];
951 /* finally, set the window into the most recently used index */
952 scsu
->windowUse
[i
]=window
;
956 * calculate the offset and the code for a dynamic window that contains the character
957 * takes fixed offsets into account
958 * the offset of the window is stored in the offset variable,
959 * the code is returned
961 * return offset code: -1 if none; a value <=0xff is the code for SDn/UDn; otherwise it is the code for SDX/UDX (subtract 0x200 to get the true code)
964 getDynamicOffset(uint32_t c
, uint32_t *pOffset
) {
968 if((uint32_t)(c
-fixedOffsets
[i
])<=0x7f) {
969 *pOffset
=fixedOffsets
[i
];
975 /* No dynamic window for US-ASCII. */
977 } else if(c
<0x3400 ||
978 (uint32_t)(c
-0x10000)<(0x14000-0x10000) ||
979 (uint32_t)(c
-0x1d000)<=(0x1ffff-0x1d000)
981 /* This character is in a code range for a "small", i.e., reasonably windowable, script. */
982 *pOffset
=c
&0x7fffff80;
984 } else if(0xe000<=c
&& c
!=0xfeff && c
<0xfff0) {
985 /* For these characters we need to take the gapOffset into account. */
986 *pOffset
=c
&0x7fffff80;
987 return (int)((c
-gapOffset
)>>7);
994 * Idea for compression:
995 * - save SCSUData and other state before really starting work
996 * - at endloop, see if compression could be better with just unicode mode
997 * - don't do this if a callback has been called
998 * - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning
999 * - different buffer handling!
1001 * Drawback or need for corrective handling:
1002 * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and
1003 * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible
1004 * not only for compression but also for HTML/XML documents with following charset/encoding announcers.
1006 * How to achieve both?
1007 * - Only replace the result after an SDX or SCU?
1010 static void U_CALLCONV
1011 _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
1012 UErrorCode
*pErrorCode
) {
1015 const UChar
*source
, *sourceLimit
;
1017 int32_t targetCapacity
;
1020 UBool isSingleByteMode
;
1021 uint8_t dynamicWindow
;
1022 uint32_t currentOffset
;
1026 int32_t sourceIndex
, nextSourceIndex
;
1030 /* variables for compression heuristics */
1036 /* set up the local pointers */
1037 cnv
=pArgs
->converter
;
1038 scsu
=(SCSUData
*)cnv
->extraInfo
;
1040 /* set up the local pointers */
1041 source
=pArgs
->source
;
1042 sourceLimit
=pArgs
->sourceLimit
;
1043 target
=(uint8_t *)pArgs
->target
;
1044 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
1045 offsets
=pArgs
->offsets
;
1047 /* get the state machine state */
1048 isSingleByteMode
=scsu
->fromUIsSingleByteMode
;
1049 dynamicWindow
=scsu
->fromUDynamicWindow
;
1050 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1054 /* sourceIndex=-1 if the current character began in the previous buffer */
1055 sourceIndex
= c
==0 ? 0 : -1;
1058 /* similar conversion "loop" as in toUnicode */
1060 if(isSingleByteMode
) {
1061 if(c
!=0 && targetCapacity
>0) {
1062 goto getTrailSingle
;
1065 /* state machine for single-byte mode */
1066 /* singleByteMode: */
1067 while(source
<sourceLimit
) {
1068 if(targetCapacity
<=0) {
1069 /* target is full */
1070 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1076 if((c
-0x20)<=0x5f) {
1077 /* pass US-ASCII graphic character through */
1078 *target
++=(uint8_t)c
;
1080 *offsets
++=sourceIndex
;
1084 if((1UL<<c
)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1086 *target
++=(uint8_t)c
;
1088 *offsets
++=sourceIndex
;
1092 /* quote C0 control character */
1097 } else if((delta
=c
-currentOffset
)<=0x7f) {
1098 /* use the current dynamic window */
1099 *target
++=(uint8_t)(delta
|0x80);
1101 *offsets
++=sourceIndex
;
1104 } else if(U16_IS_SURROGATE(c
)) {
1105 if(U16_IS_SURROGATE_LEAD(c
)) {
1108 if(source
<sourceLimit
) {
1109 /* test the following code unit */
1111 if(U16_IS_TRAIL(trail
)) {
1114 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
1115 /* convert this surrogate code point */
1116 /* exit this condition tree */
1118 /* this is an unmatched lead code unit (1st surrogate) */
1119 /* callback(illegal) */
1120 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1128 /* this is an unmatched trail code unit (2nd surrogate) */
1129 /* callback(illegal) */
1130 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1134 /* compress supplementary character U+10000..U+10ffff */
1135 if((delta
=c
-currentOffset
)<=0x7f) {
1136 /* use the current dynamic window */
1137 *target
++=(uint8_t)(delta
|0x80);
1139 *offsets
++=sourceIndex
;
1142 } else if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1143 /* there is a dynamic window that contains this character, change to it */
1144 dynamicWindow
=window
;
1145 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1146 useDynamicWindow(scsu
, dynamicWindow
);
1147 c
=((uint32_t)(SC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1150 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1151 /* might check if there are more characters in this window to come */
1152 /* define an extended window with this character */
1154 dynamicWindow
=getNextDynamicWindow(scsu
);
1155 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1156 useDynamicWindow(scsu
, dynamicWindow
);
1157 c
=((uint32_t)SDX
<<24)|((uint32_t)dynamicWindow
<<21)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1161 /* change to Unicode mode and output this (lead, trail) pair */
1162 isSingleByteMode
=FALSE
;
1163 *target
++=(uint8_t)SCU
;
1165 *offsets
++=sourceIndex
;
1168 c
=((uint32_t)lead
<<16)|trail
;
1173 /* quote C1 control character */
1174 c
=(c
&0x7f)|(SQ0
+1)<<8; /* SQ0+1==SQ1 */
1177 } else if(c
==0xfeff || c
>=0xfff0) {
1178 /* quote signature character=byte order mark and specials */
1183 /* compress all other BMP characters */
1184 if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1185 /* there is a window defined that contains this character - switch to it or quote from it? */
1186 if(source
>=sourceLimit
|| isInOffsetWindowOrDirect(scsu
->fromUDynamicOffsets
[window
], *source
)) {
1187 /* change to dynamic window */
1188 dynamicWindow
=window
;
1189 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1190 useDynamicWindow(scsu
, dynamicWindow
);
1191 c
=((uint32_t)(SC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1195 /* quote from dynamic window */
1196 c
=((uint32_t)(SQ0
+window
)<<8)|(c
-scsu
->fromUDynamicOffsets
[window
])|0x80;
1200 } else if((window
=getWindow(staticOffsets
, c
))>=0) {
1201 /* quote from static window */
1202 c
=((uint32_t)(SQ0
+window
)<<8)|(c
-staticOffsets
[window
]);
1205 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1206 /* define a dynamic window with this character */
1207 dynamicWindow
=getNextDynamicWindow(scsu
);
1208 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1209 useDynamicWindow(scsu
, dynamicWindow
);
1210 c
=((uint32_t)(SD0
+dynamicWindow
)<<16)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1213 } else if((uint32_t)(c
-0x3400)<(0xd800-0x3400) &&
1214 (source
>=sourceLimit
|| (uint32_t)(*source
-0x3400)<(0xd800-0x3400))
1217 * this character is not compressible (a BMP ideograph or similar);
1218 * switch to Unicode mode if this is the last character in the block
1219 * or there is at least one more ideograph following immediately
1221 isSingleByteMode
=FALSE
;
1233 /* normal end of conversion: prepare for a new character */
1235 sourceIndex
=nextSourceIndex
;
1238 if(c
!=0 && targetCapacity
>0) {
1239 goto getTrailUnicode
;
1242 /* state machine for Unicode mode */
1243 /* unicodeByteMode: */
1244 while(source
<sourceLimit
) {
1245 if(targetCapacity
<=0) {
1246 /* target is full */
1247 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1253 if((uint32_t)(c
-0x3400)<(0xd800-0x3400)) {
1254 /* not compressible, write character directly */
1255 if(targetCapacity
>=2) {
1256 *target
++=(uint8_t)(c
>>8);
1257 *target
++=(uint8_t)c
;
1259 *offsets
++=sourceIndex
;
1260 *offsets
++=sourceIndex
;
1267 } else if((uint32_t)(c
-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
1268 /* compress BMP character if the following one is not an uncompressible ideograph */
1269 if(!(source
<sourceLimit
&& (uint32_t)(*source
-0x3400)<(0xd800-0x3400))) {
1270 if(((uint32_t)(c
-0x30)<10 || (uint32_t)(c
-0x61)<26 || (uint32_t)(c
-0x41)<26)) {
1271 /* ASCII digit or letter */
1272 isSingleByteMode
=TRUE
;
1273 c
|=((uint32_t)(UC0
+dynamicWindow
)<<8)|c
;
1276 } else if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1277 /* there is a dynamic window that contains this character, change to it */
1278 isSingleByteMode
=TRUE
;
1279 dynamicWindow
=window
;
1280 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1281 useDynamicWindow(scsu
, dynamicWindow
);
1282 c
=((uint32_t)(UC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1285 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1286 /* define a dynamic window with this character */
1287 isSingleByteMode
=TRUE
;
1288 dynamicWindow
=getNextDynamicWindow(scsu
);
1289 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1290 useDynamicWindow(scsu
, dynamicWindow
);
1291 c
=((uint32_t)(UD0
+dynamicWindow
)<<16)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1297 /* don't know how to compress this character, just write it directly */
1300 } else if(c
<0xe000) {
1301 /* c is a surrogate */
1302 if(U16_IS_SURROGATE_LEAD(c
)) {
1305 if(source
<sourceLimit
) {
1306 /* test the following code unit */
1308 if(U16_IS_TRAIL(trail
)) {
1311 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
1312 /* convert this surrogate code point */
1313 /* exit this condition tree */
1315 /* this is an unmatched lead code unit (1st surrogate) */
1316 /* callback(illegal) */
1317 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1325 /* this is an unmatched trail code unit (2nd surrogate) */
1326 /* callback(illegal) */
1327 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1331 /* compress supplementary character */
1332 if( (window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0 &&
1333 !(source
<sourceLimit
&& (uint32_t)(*source
-0x3400)<(0xd800-0x3400))
1336 * there is a dynamic window that contains this character and
1337 * the following character is not uncompressible,
1338 * change to the window
1340 isSingleByteMode
=TRUE
;
1341 dynamicWindow
=window
;
1342 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1343 useDynamicWindow(scsu
, dynamicWindow
);
1344 c
=((uint32_t)(UC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1347 } else if(source
<sourceLimit
&& lead
==*source
&& /* too lazy to check trail in same window as source[1] */
1348 (code
=getDynamicOffset(c
, &offset
))>=0
1350 /* two supplementary characters in (probably) the same window - define an extended one */
1351 isSingleByteMode
=TRUE
;
1353 dynamicWindow
=getNextDynamicWindow(scsu
);
1354 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1355 useDynamicWindow(scsu
, dynamicWindow
);
1356 c
=((uint32_t)UDX
<<24)|((uint32_t)dynamicWindow
<<21)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1360 /* don't know how to compress this character, just write it directly */
1361 c
=((uint32_t)lead
<<16)|trail
;
1365 } else /* 0xe000<=c<0xf300 */ {
1366 /* quote to avoid SCSU tags */
1372 /* normal end of conversion: prepare for a new character */
1374 sourceIndex
=nextSourceIndex
;
1379 /* set the converter state back into UConverter */
1380 scsu
->fromUIsSingleByteMode
=isSingleByteMode
;
1381 scsu
->fromUDynamicWindow
=dynamicWindow
;
1385 /* write back the updated pointers */
1386 pArgs
->source
=source
;
1387 pArgs
->target
=(char *)target
;
1388 pArgs
->offsets
=offsets
;
1392 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1393 /* from the first if in the loop we know that targetCapacity>0 */
1394 if(length
<=targetCapacity
) {
1397 /* each branch falls through to the next one */
1399 *target
++=(uint8_t)(c
>>24);
1402 *target
++=(uint8_t)(c
>>16);
1405 *target
++=(uint8_t)(c
>>8);
1408 *target
++=(uint8_t)c
;
1411 /* will never occur */
1416 /* each branch falls through to the next one */
1418 *target
++=(uint8_t)(c
>>24);
1419 *offsets
++=sourceIndex
;
1422 *target
++=(uint8_t)(c
>>16);
1423 *offsets
++=sourceIndex
;
1426 *target
++=(uint8_t)(c
>>8);
1427 *offsets
++=sourceIndex
;
1430 *target
++=(uint8_t)c
;
1431 *offsets
++=sourceIndex
;
1434 /* will never occur */
1438 targetCapacity
-=length
;
1440 /* normal end of conversion: prepare for a new character */
1442 sourceIndex
=nextSourceIndex
;
1448 * We actually do this backwards here:
1449 * In order to save an intermediate variable, we output
1450 * first to the overflow buffer what does not fit into the
1453 /* we know that 0<=targetCapacity<length<=4 */
1454 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
1455 length
-=targetCapacity
;
1456 p
=(uint8_t *)cnv
->charErrorBuffer
;
1458 /* each branch falls through to the next one */
1460 *p
++=(uint8_t)(c
>>24);
1463 *p
++=(uint8_t)(c
>>16);
1466 *p
++=(uint8_t)(c
>>8);
1472 /* will never occur */
1475 cnv
->charErrorBufferLength
=(int8_t)length
;
1477 /* now output what fits into the regular target */
1478 c
>>=8*length
; /* length was reduced by targetCapacity */
1479 switch(targetCapacity
) {
1480 /* each branch falls through to the next one */
1482 *target
++=(uint8_t)(c
>>16);
1484 *offsets
++=sourceIndex
;
1488 *target
++=(uint8_t)(c
>>8);
1490 *offsets
++=sourceIndex
;
1494 *target
++=(uint8_t)c
;
1496 *offsets
++=sourceIndex
;
1503 /* target overflow */
1505 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1512 * Identical to _SCSUFromUnicodeWithOffsets but without offset handling.
1513 * If a change is made in the original function, then either
1514 * change this function the same way or
1515 * re-copy the original function and remove the variables
1516 * offsets, sourceIndex, and nextSourceIndex.
/*
 * Convert UTF-16 text from pArgs->source into SCSU bytes at pArgs->target.
 * Same conversion as _SCSUFromUnicodeWithOffsets, minus the offsets bookkeeping.
 *
 * The encoder mode (single-byte vs. Unicode mode) and the active dynamic
 * window are read from the SCSUData in pArgs->converter->extraInfo and are
 * stored back there after the conversion loops, together with the updated
 * source/target pointers in pArgs.
 *
 * *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR when the target is full and
 * to U_ILLEGAL_CHAR_FOUND for an unmatched lead or trail surrogate.
 * Output bytes that do not fit into the target are stored in
 * cnv->charErrorBuffer, with their count in cnv->charErrorBufferLength.
 */
1518 static void U_CALLCONV
1519 _SCSUFromUnicode(UConverterFromUnicodeArgs
*pArgs
,
1520 UErrorCode
*pErrorCode
) {
1523 const UChar
*source
, *sourceLimit
;
1525 int32_t targetCapacity
;
1527 UBool isSingleByteMode
;
1528 uint8_t dynamicWindow
;
1529 uint32_t currentOffset
;
1535 /* variables for compression heuristics */
1541 /* set up the local pointers */
1542 cnv
=pArgs
->converter
;
1543 scsu
=(SCSUData
*)cnv
->extraInfo
;
1545 /* set up the local pointers */
1546 source
=pArgs
->source
;
1547 sourceLimit
=pArgs
->sourceLimit
;
1548 target
=(uint8_t *)pArgs
->target
;
/* number of bytes still free in the target buffer */
1549 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
1551 /* get the state machine state */
1552 isSingleByteMode
=scsu
->fromUIsSingleByteMode
;
1553 dynamicWindow
=scsu
->fromUDynamicWindow
;
1554 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1558 /* similar conversion "loop" as in toUnicode */
1560 if(isSingleByteMode
) {
/*
 * NOTE(review): the initialization of c is not visible in this excerpt;
 * a non-zero c here is presumably a lead surrogate left pending by an
 * earlier call, which is why the code jumps straight to the trail handling - confirm.
 */
1561 if(c
!=0 && targetCapacity
>0) {
1562 goto getTrailSingle
;
1565 /* state machine for single-byte mode */
1566 /* singleByteMode: */
1567 while(source
<sourceLimit
) {
1568 if(targetCapacity
<=0) {
1569 /* target is full */
1570 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1575 if((c
-0x20)<=0x5f) {
1576 /* pass US-ASCII graphic character through */
1577 *target
++=(uint8_t)c
;
1580 if((1UL<<c
)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1582 *target
++=(uint8_t)c
;
1585 /* quote C0 control character */
1590 } else if((delta
=c
-currentOffset
)<=0x7f) {
1591 /* use the current dynamic window */
1592 *target
++=(uint8_t)(delta
|0x80);
1594 } else if(U16_IS_SURROGATE(c
)) {
1595 if(U16_IS_SURROGATE_LEAD(c
)) {
1598 if(source
<sourceLimit
) {
1599 /* test the following code unit */
1601 if(U16_IS_TRAIL(trail
)) {
1603 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
1604 /* convert this surrogate code point */
1605 /* exit this condition tree */
1607 /* this is an unmatched lead code unit (1st surrogate) */
1608 /* callback(illegal) */
1609 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1617 /* this is an unmatched trail code unit (2nd surrogate) */
1618 /* callback(illegal) */
1619 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1623 /* compress supplementary character U+10000..U+10ffff */
1624 if((delta
=c
-currentOffset
)<=0x7f) {
1625 /* use the current dynamic window */
1626 *target
++=(uint8_t)(delta
|0x80);
1628 } else if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1629 /* there is a dynamic window that contains this character, change to it */
1630 dynamicWindow
=window
;
1631 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1632 useDynamicWindow(scsu
, dynamicWindow
);
1633 c
=((uint32_t)(SC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1636 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1637 /* might check if there are more characters in this window to come */
1638 /* define an extended window with this character */
1640 dynamicWindow
=getNextDynamicWindow(scsu
);
1641 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1642 useDynamicWindow(scsu
, dynamicWindow
);
/*
 * packed output, high byte first: the SDX tag in bits 24..31, the window
 * number in bits 21..23 directly above the offset code (shifted to bit 8),
 * and in the low byte the position inside the new window with bit 7 set
 */
1643 c
=((uint32_t)SDX
<<24)|((uint32_t)dynamicWindow
<<21)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1647 /* change to Unicode mode and output this (lead, trail) pair */
1648 isSingleByteMode
=FALSE
;
1649 *target
++=(uint8_t)SCU
;
1651 c
=((uint32_t)lead
<<16)|trail
;
1656 /* quote C1 control character */
1657 c
=(c
&0x7f)|(SQ0
+1)<<8; /* SQ0+1==SQ1 */
1660 } else if(c
==0xfeff || c
>=0xfff0) {
1661 /* quote signature character=byte order mark and specials */
1666 /* compress all other BMP characters */
1667 if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1668 /* there is a window defined that contains this character - switch to it or quote from it? */
1669 if(source
>=sourceLimit
|| isInOffsetWindowOrDirect(scsu
->fromUDynamicOffsets
[window
], *source
)) {
1670 /* change to dynamic window */
1671 dynamicWindow
=window
;
1672 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1673 useDynamicWindow(scsu
, dynamicWindow
);
1674 c
=((uint32_t)(SC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1678 /* quote from dynamic window */
1679 c
=((uint32_t)(SQ0
+window
)<<8)|(c
-scsu
->fromUDynamicOffsets
[window
])|0x80;
1683 } else if((window
=getWindow(staticOffsets
, c
))>=0) {
1684 /* quote from static window */
1685 c
=((uint32_t)(SQ0
+window
)<<8)|(c
-staticOffsets
[window
]);
1688 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1689 /* define a dynamic window with this character */
1690 dynamicWindow
=getNextDynamicWindow(scsu
);
1691 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1692 useDynamicWindow(scsu
, dynamicWindow
);
1693 c
=((uint32_t)(SD0
+dynamicWindow
)<<16)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
/* the uint32_t cast turns each test below into a single comparison for 0x3400<=x<0xd800 */
1696 } else if((uint32_t)(c
-0x3400)<(0xd800-0x3400) &&
1697 (source
>=sourceLimit
|| (uint32_t)(*source
-0x3400)<(0xd800-0x3400))
1700 * this character is not compressible (a BMP ideograph or similar);
1701 * switch to Unicode mode if this is the last character in the block
1702 * or there is at least one more ideograph following immediately
1704 isSingleByteMode
=FALSE
;
1716 /* normal end of conversion: prepare for a new character */
1720 if(c
!=0 && targetCapacity
>0) {
1721 goto getTrailUnicode
;
1724 /* state machine for Unicode mode */
1725 /* unicodeByteMode: */
1726 while(source
<sourceLimit
) {
1727 if(targetCapacity
<=0) {
1728 /* target is full */
1729 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1734 if((uint32_t)(c
-0x3400)<(0xd800-0x3400)) {
1735 /* not compressible, write character directly */
1736 if(targetCapacity
>=2) {
1737 *target
++=(uint8_t)(c
>>8);
1738 *target
++=(uint8_t)c
;
1744 } else if((uint32_t)(c
-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
1745 /* compress BMP character if the following one is not an uncompressible ideograph */
1746 if(!(source
<sourceLimit
&& (uint32_t)(*source
-0x3400)<(0xd800-0x3400))) {
1747 if(((uint32_t)(c
-0x30)<10 || (uint32_t)(c
-0x61)<26 || (uint32_t)(c
-0x41)<26)) {
1748 /* ASCII digit or letter */
1749 isSingleByteMode
=TRUE
;
/*
 * c stays in the low byte and the UC0+dynamicWindow tag goes into the byte
 * above it; the leading "c|=" and the trailing "|c" are redundant with each
 * other, the result equals ((UC0+dynamicWindow)<<8)|c
 */
1750 c
|=((uint32_t)(UC0
+dynamicWindow
)<<8)|c
;
1753 } else if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1754 /* there is a dynamic window that contains this character, change to it */
1755 isSingleByteMode
=TRUE
;
1756 dynamicWindow
=window
;
1757 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1758 useDynamicWindow(scsu
, dynamicWindow
);
1759 c
=((uint32_t)(UC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1762 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1763 /* define a dynamic window with this character */
1764 isSingleByteMode
=TRUE
;
1765 dynamicWindow
=getNextDynamicWindow(scsu
);
1766 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1767 useDynamicWindow(scsu
, dynamicWindow
);
1768 c
=((uint32_t)(UD0
+dynamicWindow
)<<16)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1774 /* don't know how to compress this character, just write it directly */
1777 } else if(c
<0xe000) {
1778 /* c is a surrogate */
1779 if(U16_IS_SURROGATE_LEAD(c
)) {
1782 if(source
<sourceLimit
) {
1783 /* test the following code unit */
1785 if(U16_IS_TRAIL(trail
)) {
1787 c
=U16_GET_SUPPLEMENTARY(c
, trail
);
1788 /* convert this surrogate code point */
1789 /* exit this condition tree */
1791 /* this is an unmatched lead code unit (1st surrogate) */
1792 /* callback(illegal) */
1793 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1801 /* this is an unmatched trail code unit (2nd surrogate) */
1802 /* callback(illegal) */
1803 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1807 /* compress supplementary character */
1808 if( (window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0 &&
1809 !(source
<sourceLimit
&& (uint32_t)(*source
-0x3400)<(0xd800-0x3400))
1812 * there is a dynamic window that contains this character and
1813 * the following character is not uncompressible,
1814 * change to the window
1816 isSingleByteMode
=TRUE
;
1817 dynamicWindow
=window
;
1818 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1819 useDynamicWindow(scsu
, dynamicWindow
);
1820 c
=((uint32_t)(UC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1823 } else if(source
<sourceLimit
&& lead
==*source
&& /* too lazy to check trail in same window as source[1] */
1824 (code
=getDynamicOffset(c
, &offset
))>=0
1826 /* two supplementary characters in (probably) the same window - define an extended one */
1827 isSingleByteMode
=TRUE
;
1829 dynamicWindow
=getNextDynamicWindow(scsu
);
1830 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1831 useDynamicWindow(scsu
, dynamicWindow
);
1832 c
=((uint32_t)UDX
<<24)|((uint32_t)dynamicWindow
<<21)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1836 /* don't know how to compress this character, just write it directly */
1837 c
=((uint32_t)lead
<<16)|trail
;
1841 } else /* 0xe000<=c<0xf300 */ {
1842 /* quote to avoid SCSU tags */
1848 /* normal end of conversion: prepare for a new character */
1854 /* set the converter state back into UConverter */
1855 scsu
->fromUIsSingleByteMode
=isSingleByteMode
;
1856 scsu
->fromUDynamicWindow
=dynamicWindow
;
1860 /* write back the updated pointers */
1861 pArgs
->source
=source
;
1862 pArgs
->target
=(char *)target
;
1866 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1867 /* from the first if in the loop we know that targetCapacity>0 */
1868 if(length
<=targetCapacity
) {
1870 /* each branch falls through to the next one */
1872 *target
++=(uint8_t)(c
>>24);
1875 *target
++=(uint8_t)(c
>>16);
1878 *target
++=(uint8_t)(c
>>8);
1881 *target
++=(uint8_t)c
;
1884 /* will never occur */
1887 targetCapacity
-=length
;
1889 /* normal end of conversion: prepare for a new character */
1896 * We actually do this backwards here:
1897 * In order to save an intermediate variable, we output
1898 * first to the overflow buffer what does not fit into the
1901 /* we know that 0<=targetCapacity<length<=4 */
1902 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
/* from here on, length counts only the bytes that go into the overflow buffer */
1903 length
-=targetCapacity
;
1904 p
=(uint8_t *)cnv
->charErrorBuffer
;
1906 /* each branch falls through to the next one */
1908 *p
++=(uint8_t)(c
>>24);
1911 *p
++=(uint8_t)(c
>>16);
1914 *p
++=(uint8_t)(c
>>8);
1920 /* will never occur */
1923 cnv
->charErrorBufferLength
=(int8_t)length
;
1925 /* now output what fits into the regular target */
1926 c
>>=8*length
; /* length was reduced by targetCapacity */
1927 switch(targetCapacity
) {
1928 /* each branch falls through to the next one */
1930 *target
++=(uint8_t)(c
>>16);
1933 *target
++=(uint8_t)(c
>>8);
1936 *target
++=(uint8_t)c
;
1942 /* target overflow */
1944 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1950 /* miscellaneous ------------------------------------------------------------ */
/*
 * Return the converter name for this SCSU converter instance.
 * The name depends on the locale stored in the converter's SCSUData
 * (cnv->extraInfo); the branch visible here returns "SCSU,locale=ja".
 */
1952 static const char * U_CALLCONV
1953 _SCSUGetName(const UConverter
*cnv
) {
1954 SCSUData
*scsu
=(SCSUData
*)cnv
->extraInfo
;
/* pick the name variant that matches the locale this converter carries */
1956 switch(scsu
->locale
) {
1958 return "SCSU,locale=ja";
1964 /*
 * structure for SafeClone calculations:
 * _SCSUSafeClone() lays this structure over the caller's buffer and uses its
 * cnv member for the cloned converter and its mydata member for the copy of
 * the SCSUData, so sizeof(struct cloneSCSUStruct) is the buffer size needed.
 */
1965 struct cloneSCSUStruct
/*
 * Clone hook for SCSU converters.
 * Checks *status with U_FAILURE() first. If *pBufferSize is 0 the call is a
 * preflighting request and only stores sizeof(struct cloneSCSUStruct) into
 * *pBufferSize. Otherwise stackBuffer is treated as a struct cloneSCSUStruct,
 * the SCSUData from cnv->extraInfo is copied into it, and a pointer to the
 * converter embedded in that buffer is returned.
 */
1971 static UConverter
* U_CALLCONV
1972 _SCSUSafeClone(const UConverter
*cnv
,
1974 int32_t *pBufferSize
,
1977 struct cloneSCSUStruct
* localClone
;
1978 int32_t bufferSizeNeeded
= sizeof(struct cloneSCSUStruct
);
1980 if (U_FAILURE(*status
)){
1984 if (*pBufferSize
== 0){ /* 'preflighting' request - set needed size into *pBufferSize */
1985 *pBufferSize
= bufferSizeNeeded
;
1989 localClone
= (struct cloneSCSUStruct
*)stackBuffer
;
1990 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
/* copy the SCSU state and make the clone refer to its own copy rather than to the original's */
1992 uprv_memcpy(&localClone
->mydata
, cnv
->extraInfo
, sizeof(SCSUData
));
1993 localClone
->cnv
.extraInfo
= &localClone
->mydata
;
/* NOTE(review): presumably tells the framework that extraInfo is embedded in the clone and must not be freed separately - confirm */
1994 localClone
->cnv
.isExtraLocal
= TRUE
;
1996 return &localClone
->cnv
;
/*
 * Implementation function table for the SCSU converter.
 * The entries visible here are the with-offsets toUnicode and fromUnicode
 * functions and ucnv_getCompleteUnicodeSet.
 */
2000 static const UConverterImpl _SCSUImpl
={
2011 _SCSUToUnicodeWithOffsets
,
2013 _SCSUFromUnicodeWithOffsets
,
2020 ucnv_getCompleteUnicodeSet
,
/*
 * Static converter description for SCSU: structure size, CCSID, the
 * UCNV_IBM and UCNV_SCSU identifiers, the minimum/maximum output bytes
 * per UChar, and the substitution bytes.
 */
2025 static const UConverterStaticData _SCSUStaticData
={
2026 sizeof(UConverterStaticData
),
2028 1212, /* CCSID for SCSU */
2029 UCNV_IBM
, UCNV_SCSU
,
2030 1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */
2032 * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode
2033 * substitution string.
/* 0x0e is the SQU tag (quote a single Unicode character), followed by ff fd; the trailing 3 is the byte count */
2035 { 0x0e, 0xff, 0xfd, 0 }, 3,
2039 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/*
 * Shared data object for the SCSU converter (non-static, so it has external
 * linkage), initialized from the static description and the function table
 * defined in this file.
 */
2042 const UConverterSharedData _SCSUData
=
2043 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_SCSUStaticData
, &_SCSUImpl
);