2 ******************************************************************************
4 * Copyright (C) 2000-2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
8 * file name: ucnvscsu.c
10 * tab size: 8 (not used)
13 * created on: 2000nov18
14 * created by: Markus W. Scherer
16 * This is an implementation of the Standard Compression Scheme for Unicode
17 * as defined in http://www.unicode.org/unicode/reports/tr6/ .
18 * Reserved commands and window settings are treated as illegal sequences and
19 * will result in callback calls.
22 #include "unicode/utypes.h"
24 #if !UCONFIG_NO_CONVERSION
26 #include "unicode/ucnv.h"
27 #include "unicode/ucnv_cb.h"
32 /* SCSU definitions --------------------------------------------------------- */
34 /* SCSU command byte values */
/*
 * Tag bytes of the SCSU byte stream.
 * The S* tags are the ones handled in single-byte mode, the U* tags the ones
 * handled in Unicode mode. For the numbered families only the first and the
 * last member are named; the code computes the others as e.g. SQ0+n or UD0+n.
 */
enum {
    SQ0=0x01, /* Quote from window pair 0 */
    SQ7=0x08, /* Quote from window pair 7 */
    SDX=0x0B, /* Define a window as extended */
    Srs=0x0C, /* reserved */
    SQU=0x0E, /* Quote a single Unicode character */
    SCU=0x0F, /* Change to Unicode mode */
    SC0=0x10, /* Select window 0 */
    SC7=0x17, /* Select window 7 */
    SD0=0x18, /* Define and select window 0 */
    SD7=0x1F, /* Define and select window 7 */

    UC0=0xE0, /* Select window 0 */
    UC7=0xE7, /* Select window 7 */
    UD0=0xE8, /* Define and select window 0 */
    UD7=0xEF, /* Define and select window 7 */
    UQU=0xF0, /* Quote a single Unicode character */
    UDX=0xF1, /* Define a Window as extended */
    Urs=0xF2  /* reserved */
};
58 * Unicode code points from 3400 to E000 are not addressable by
59 * a dynamic window, since no short-run alphabets are found in these
60 * areas. Therefore add gapOffset to all values from gapThreshold.
65 /* values between reservedStart and fixedThreshold are reserved */
68 /* use table of predefined fixed offsets for values from fixedThreshold */
72 /* constant offsets for the 8 static windows */
/*
 * Start offsets of the 8 static windows, indexed by window number.
 * The quote commands (SQ0+n) add the quoted byte to these values.
 */
static const uint32_t staticOffsets[8]={
    0x0000, /* ASCII for quoted tags */
    0x0080, /* Latin - 1 Supplement (for access to punctuation) */
    0x0100, /* Latin Extended-A */
    0x0300, /* Combining Diacritical Marks */
    0x2000, /* General Punctuation */
    0x2080, /* Currency Symbols */
    0x2100, /* Letterlike Symbols and Number Forms */
    0x3000  /* CJK Symbols and punctuation */
};
84 /* initial offsets for the 8 dynamic (sliding) windows */
/*
 * Start offsets that the 8 dynamic (sliding) windows have after a reset,
 * indexed by window number. All eight entries must be listed: with fewer
 * initializers the remaining windows would silently start at offset 0 and
 * every later window would be shifted to the wrong script.
 * Values follow the initial dynamic window positions of UTS #6.
 */
static const uint32_t initialDynamicOffsets[8]={
    0x0080, /* Latin-1 Supplement */
    0x00C0, /* Latin-1 letters + half of Latin Extended-A */
    0x0400, /* Cyrillic */
    0x0600, /* Arabic */
    0x0900, /* Devanagari */
    0x3040, /* Hiragana */
    0x30A0, /* Katakana */
    0xFF00  /* Fullwidth ASCII */
};
96 /* Table of fixed predefined Offsets */
/*
 * Predefined window offsets that a define-window command selects with the
 * offset bytes 0xF9..0xFF; index 0 corresponds to byte 0xF9.
 */
static const uint32_t fixedOffsets[]={
    /* 0xF9 */ 0x00C0, /* Latin-1 Letters + half of Latin Extended A */
    /* 0xFA */ 0x0250, /* IPA extensions */
    /* 0xFB */ 0x0370, /* Greek */
    /* 0xFC */ 0x0530, /* Armenian */
    /* 0xFD */ 0x3040, /* Hiragana */
    /* 0xFE */ 0x30A0, /* Katakana */
    /* 0xFF */ 0xFF60  /* Halfwidth Katakana */
};
118 typedef struct SCSUData
{
119 /* dynamic window offsets, intitialize to default values from initialDynamicOffsets */
120 uint32_t toUDynamicOffsets
[8];
121 uint32_t fromUDynamicOffsets
[8];
123 /* state machine state - toUnicode */
124 UBool toUIsSingleByteMode
;
126 int8_t toUQuoteWindow
, toUDynamicWindow
;
128 uint8_t toUPadding
[3];
130 /* state machine state - fromUnicode */
131 UBool fromUIsSingleByteMode
;
132 int8_t fromUDynamicWindow
;
135 * windowUse[] keeps track of the use of the dynamic windows:
136 * At nextWindowUseIndex there is the least recently used window,
137 * and the following windows (in a wrapping manner) are more and more
139 * At nextWindowUseIndex-1 there is the most recently used window.
142 int8_t nextWindowUseIndex
;
/*
 * Initial contents of SCSUData.windowUse[] as copied by _SCSUReset():
 * dynamic window numbers ordered from the first one to be redefined
 * (index 0, where nextWindowUseIndex starts) to the last one.
 * The _ja variant is the order used when the converter locale is Japanese.
 */
static const int8_t initialWindowUse[8]={ 7, 0, 3, 2, 4, 5, 6, 1 };
static const int8_t initialWindowUse_ja[8]={ 3, 2, 4, 1, 0, 7, 5, 6 };
153 /* SCSU setup functions ----------------------------------------------------- */
156 _SCSUReset(UConverter
*cnv
, UConverterResetChoice choice
) {
157 SCSUData
*scsu
=(SCSUData
*)cnv
->extraInfo
;
159 if(choice
<=UCNV_RESET_TO_UNICODE
) {
160 /* reset toUnicode */
161 uprv_memcpy(scsu
->toUDynamicOffsets
, initialDynamicOffsets
, 32);
163 scsu
->toUIsSingleByteMode
=TRUE
;
164 scsu
->toUState
=readCommand
;
165 scsu
->toUQuoteWindow
=scsu
->toUDynamicWindow
=0;
170 if(choice
!=UCNV_RESET_TO_UNICODE
) {
171 /* reset fromUnicode */
172 uprv_memcpy(scsu
->fromUDynamicOffsets
, initialDynamicOffsets
, 32);
174 scsu
->fromUIsSingleByteMode
=TRUE
;
175 scsu
->fromUDynamicWindow
=0;
177 scsu
->nextWindowUseIndex
=0;
178 switch(scsu
->locale
) {
180 uprv_memcpy(scsu
->windowUse
, initialWindowUse_ja
, 8);
183 uprv_memcpy(scsu
->windowUse
, initialWindowUse
, 8);
/*
 * Open hook of the SCSU converter: allocates the SCSUData for cnv, records
 * whether the locale is Japanese, and resets both conversion directions.
 * If the allocation fails, *pErrorCode is set to U_MEMORY_ALLOCATION_ERROR.
 *
 * NOTE(review): the parameters between cnv and pErrorCode are not visible in
 * this listing; the body uses one named `locale`, which may be NULL.
 */
192 _SCSUOpen(UConverter
*cnv
,
196 UErrorCode
*pErrorCode
) {
197 cnv
->extraInfo
=uprv_malloc(sizeof(SCSUData
));
198 if(cnv
->extraInfo
!=NULL
) {
/* locale is "ja" exactly, or starts with "ja_" */
199 if(locale
!=NULL
&& locale
[0]=='j' && locale
[1]=='a' && (locale
[2]==0 || locale
[2]=='_')) {
200 ((SCSUData
*)cnv
->extraInfo
)->locale
=l_ja
;
202 ((SCSUData
*)cnv
->extraInfo
)->locale
=lGeneric
;
/* the locale must be stored before the reset, which reads it to pick windowUse[] */
204 _SCSUReset(cnv
, UCNV_RESET_BOTH
);
/* reached when uprv_malloc() returned NULL */
206 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
211 _SCSUClose(UConverter
*cnv
) {
212 if(cnv
->extraInfo
!=NULL
) {
213 if(!cnv
->isExtraLocal
) {
214 uprv_free(cnv
->extraInfo
);
220 /* SCSU-to-Unicode conversion functions ------------------------------------- */
/*
 * Convert SCSU bytes to UTF-16 and record, for each output unit, the index of
 * the source byte sequence it came from.
 *
 * pArgs      - supplies the converter, the source/sourceLimit byte range, the
 *              target/targetLimit UChar range and the offsets array; the
 *              advanced source, target and offsets pointers are written back
 * pErrorCode - set to U_BUFFER_OVERFLOW_ERROR when the target is full and to
 *              U_ILLEGAL_CHAR_FOUND for reserved/illegal byte values
 *
 * The decoder state (mode, state, quote window, dynamic window, first byte)
 * is loaded from the converter's SCSUData on entry and stored back on exit,
 * so a sequence may continue across calls.
 *
 * NOTE(review): a number of statements, labels and declarations of this
 * function are not visible in this listing.
 */
223 _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs
*pArgs
,
224 UErrorCode
*pErrorCode
) {
227 const uint8_t *source
, *sourceLimit
;
229 const UChar
*targetLimit
;
231 UBool isSingleByteMode
;
232 uint8_t state
, byteOne
;
233 int8_t quoteWindow
, dynamicWindow
;
235 int32_t sourceIndex
, nextSourceIndex
;
239 /* set up the local pointers */
240 cnv
=pArgs
->converter
;
241 scsu
=(SCSUData
*)cnv
->extraInfo
;
243 source
=(const uint8_t *)pArgs
->source
;
244 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
245 target
=pArgs
->target
;
246 targetLimit
=pArgs
->targetLimit
;
247 offsets
=pArgs
->offsets
;
249 /* get the state machine state */
250 isSingleByteMode
=scsu
->toUIsSingleByteMode
;
251 state
=scsu
->toUState
;
252 quoteWindow
=scsu
->toUQuoteWindow
;
253 dynamicWindow
=scsu
->toUDynamicWindow
;
254 byteOne
=scsu
->toUByteOne
;
256 /* sourceIndex=-1 if the current character began in the previous buffer */
257 sourceIndex
=state
==readCommand
? 0 : -1;
263 * For performance, this is not a normal C loop.
264 * Instead, there are two code blocks for the two SCSU modes.
265 * The function branches to either one, and a change of the mode is done with a goto to
268 * Each branch has two conventional loops:
269 * - a fast-path loop for the most common codes in the mode
270 * - a loop for all other codes in the mode
271 * When the fast-path runs into a code that it cannot handle, its loop ends and it
272 * runs into the following loop to handle the other codes.
273 * The end of the input or output buffer is also handled by the slower loop.
274 * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
276 * The callback handling is done by returning with an error code.
277 * The conversion framework actually calls the callback function.
279 if(isSingleByteMode
) {
280 /* fast path for single-byte mode */
281 if(state
==readCommand
) {
/* the fast path consumes bytes >=0x20 only; smaller bytes end the loop */
283 while(source
<sourceLimit
&& target
<targetLimit
&& (b
=*source
)>=0x20) {
287 /* write US-ASCII graphic character or DEL */
290 *offsets
++=sourceIndex
;
293 /* write from dynamic window */
294 uint32_t c
=scsu
->toUDynamicOffsets
[dynamicWindow
]+(b
&0x7f);
298 *offsets
++=sourceIndex
;
301 /* output surrogate pair */
302 *target
++=(UChar
)(0xd7c0+(c
>>10));
303 if(target
<targetLimit
) {
304 *target
++=(UChar
)(0xdc00|(c
&0x3ff));
306 *offsets
++=sourceIndex
;
307 *offsets
++=sourceIndex
;
310 /* target overflow */
312 *offsets
++=sourceIndex
;
/* the trail surrogate that did not fit is parked in the converter's overflow buffer */
314 cnv
->UCharErrorBuffer
[0]=(UChar
)(0xdc00|(c
&0x3ff));
315 cnv
->UCharErrorBufferLength
=1;
316 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
321 sourceIndex
=nextSourceIndex
;
325 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
327 while(source
<sourceLimit
) {
328 if(target
>=targetLimit
) {
330 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
337 /* redundant conditions are commented out */
338 /* here: b<0x20 because otherwise we would be in fastSingle */
339 if((1UL<<b
)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
343 *offsets
++=sourceIndex
;
345 sourceIndex
=nextSourceIndex
;
349 dynamicWindow
=(int8_t)(b
-SC0
);
350 sourceIndex
=nextSourceIndex
;
352 } else /* if(SD0<=b && b<=SD7) */ {
353 dynamicWindow
=(int8_t)(b
-SD0
);
356 } else if(/* SQ0<=b && */ b
<=SQ7
) {
357 quoteWindow
=(int8_t)(b
-SQ0
);
364 sourceIndex
=nextSourceIndex
;
365 isSingleByteMode
=FALSE
;
368 /* callback(illegal) */
369 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
375 /* store the first byte of a multibyte sequence in toUBytes[] */
/* two-byte quoted unit: byteOne is the high byte, b the low byte */
386 *target
++=(UChar
)((byteOne
<<8)|b
);
388 *offsets
++=sourceIndex
;
390 sourceIndex
=nextSourceIndex
;
395 /* all static offsets are in the BMP */
396 *target
++=(UChar
)(staticOffsets
[quoteWindow
]+b
);
398 *offsets
++=sourceIndex
;
401 /* write from dynamic window */
402 uint32_t c
=scsu
->toUDynamicOffsets
[quoteWindow
]+(b
&0x7f);
406 *offsets
++=sourceIndex
;
409 /* output surrogate pair */
410 *target
++=(UChar
)(0xd7c0+(c
>>10));
411 if(target
<targetLimit
) {
412 *target
++=(UChar
)(0xdc00|(c
&0x3ff));
414 *offsets
++=sourceIndex
;
415 *offsets
++=sourceIndex
;
418 /* target overflow */
420 *offsets
++=sourceIndex
;
422 cnv
->UCharErrorBuffer
[0]=(UChar
)(0xdc00|(c
&0x3ff));
423 cnv
->UCharErrorBufferLength
=1;
424 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
429 sourceIndex
=nextSourceIndex
;
/* extended window definition: top 3 bits of b are the window number, low 5 bits the high part of the offset */
433 dynamicWindow
=(int8_t)((b
>>5)&7);
434 byteOne
=(uint8_t)(b
&0x1f);
440 scsu
->toUDynamicOffsets
[dynamicWindow
]=0x10000+(byteOne
<<15UL | b
<<7UL);
441 sourceIndex
=nextSourceIndex
;
446 /* callback(illegal): Reserved window offset value 0 */
450 } else if(b
<gapThreshold
) {
451 scsu
->toUDynamicOffsets
[dynamicWindow
]=b
<<7UL;
452 } else if((uint8_t)(b
-gapThreshold
)<(reservedStart
-gapThreshold
)) {
453 scsu
->toUDynamicOffsets
[dynamicWindow
]=(b
<<7UL)+gapOffset
;
454 } else if(b
>=fixedThreshold
) {
455 scsu
->toUDynamicOffsets
[dynamicWindow
]=fixedOffsets
[b
-fixedThreshold
];
457 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
462 sourceIndex
=nextSourceIndex
;
468 /* fast path for Unicode mode */
469 if(state
==readCommand
) {
/* needs two source bytes per unit; stops at any byte in the tag range UC0..Urs */
471 while(source
+1<sourceLimit
&& target
<targetLimit
&& (uint8_t)((b
=*source
)-UC0
)>(Urs
-UC0
)) {
472 *target
++=(UChar
)((b
<<8)|source
[1]);
474 *offsets
++=sourceIndex
;
476 sourceIndex
=nextSourceIndex
;
482 /* normal state machine for Unicode mode */
483 /* unicodeByteMode: */
484 while(source
<sourceLimit
) {
485 if(target
>=targetLimit
) {
487 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
494 if((uint8_t)(b
-UC0
)>(Urs
-UC0
)) {
499 } else if(/* UC0<=b && */ b
<=UC7
) {
500 dynamicWindow
=(int8_t)(b
-UC0
);
501 sourceIndex
=nextSourceIndex
;
502 isSingleByteMode
=TRUE
;
504 } else if(/* UD0<=b && */ b
<=UD7
) {
505 dynamicWindow
=(int8_t)(b
-UD0
);
506 isSingleByteMode
=TRUE
;
512 isSingleByteMode
=TRUE
;
522 /* callback(illegal) */
523 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
536 *target
++=(UChar
)((byteOne
<<8)|b
);
538 *offsets
++=sourceIndex
;
540 sourceIndex
=nextSourceIndex
;
548 /* set the converter state back into UConverter */
549 if(U_FAILURE(*pErrorCode
) && *pErrorCode
!=U_BUFFER_OVERFLOW_ERROR
) {
550 /* reset to deal with the next character */
552 } else if(state
==readCommand
) {
553 /* not in a multi-byte sequence, reset toULength */
556 scsu
->toUIsSingleByteMode
=isSingleByteMode
;
557 scsu
->toUState
=state
;
558 scsu
->toUQuoteWindow
=quoteWindow
;
559 scsu
->toUDynamicWindow
=dynamicWindow
;
560 scsu
->toUByteOne
=byteOne
;
562 /* write back the updated pointers */
563 pArgs
->source
=(const char *)source
;
564 pArgs
->target
=target
;
565 pArgs
->offsets
=offsets
;
570 * Identical to _SCSUToUnicodeWithOffsets but without offset handling.
571 * If a change is made in the original function, then either
572 * change this function the same way or
573 * re-copy the original function and remove the variables
574 * offsets, sourceIndex, and nextSourceIndex.
/*
 * Convert SCSU bytes to UTF-16 without producing offsets.
 *
 * pArgs      - supplies the converter, the source/sourceLimit byte range and
 *              the target/targetLimit UChar range; the advanced source and
 *              target pointers are written back
 * pErrorCode - set to U_BUFFER_OVERFLOW_ERROR when the target is full and to
 *              U_ILLEGAL_CHAR_FOUND for reserved/illegal byte values
 *
 * The decoder state is loaded from and stored back into the converter's
 * SCSUData, exactly as in the variant with offsets.
 *
 * NOTE(review): a number of statements, labels and declarations of this
 * function are not visible in this listing.
 */
577 _SCSUToUnicode(UConverterToUnicodeArgs
*pArgs
,
578 UErrorCode
*pErrorCode
) {
581 const uint8_t *source
, *sourceLimit
;
583 const UChar
*targetLimit
;
584 UBool isSingleByteMode
;
585 uint8_t state
, byteOne
;
586 int8_t quoteWindow
, dynamicWindow
;
590 /* set up the local pointers */
591 cnv
=pArgs
->converter
;
592 scsu
=(SCSUData
*)cnv
->extraInfo
;
594 source
=(const uint8_t *)pArgs
->source
;
595 sourceLimit
=(const uint8_t *)pArgs
->sourceLimit
;
596 target
=pArgs
->target
;
597 targetLimit
=pArgs
->targetLimit
;
599 /* get the state machine state */
600 isSingleByteMode
=scsu
->toUIsSingleByteMode
;
601 state
=scsu
->toUState
;
602 quoteWindow
=scsu
->toUQuoteWindow
;
603 dynamicWindow
=scsu
->toUDynamicWindow
;
604 byteOne
=scsu
->toUByteOne
;
609 * For performance, this is not a normal C loop.
610 * Instead, there are two code blocks for the two SCSU modes.
611 * The function branches to either one, and a change of the mode is done with a goto to
614 * Each branch has two conventional loops:
615 * - a fast-path loop for the most common codes in the mode
616 * - a loop for all other codes in the mode
617 * When the fast-path runs into a code that it cannot handle, its loop ends and it
618 * runs into the following loop to handle the other codes.
619 * The end of the input or output buffer is also handled by the slower loop.
620 * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
622 * The callback handling is done by returning with an error code.
623 * The conversion framework actually calls the callback function.
625 if(isSingleByteMode
) {
626 /* fast path for single-byte mode */
627 if(state
==readCommand
) {
/* the fast path consumes bytes >=0x20 only; smaller bytes end the loop */
629 while(source
<sourceLimit
&& target
<targetLimit
&& (b
=*source
)>=0x20) {
632 /* write US-ASCII graphic character or DEL */
635 /* write from dynamic window */
636 uint32_t c
=scsu
->toUDynamicOffsets
[dynamicWindow
]+(b
&0x7f);
640 /* output surrogate pair */
641 *target
++=(UChar
)(0xd7c0+(c
>>10));
642 if(target
<targetLimit
) {
643 *target
++=(UChar
)(0xdc00|(c
&0x3ff));
645 /* target overflow */
/* the trail surrogate that did not fit is parked in the converter's overflow buffer */
646 cnv
->UCharErrorBuffer
[0]=(UChar
)(0xdc00|(c
&0x3ff));
647 cnv
->UCharErrorBufferLength
=1;
648 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
656 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
658 while(source
<sourceLimit
) {
659 if(target
>=targetLimit
) {
661 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
667 /* redundant conditions are commented out */
668 /* here: b<0x20 because otherwise we would be in fastSingle */
669 if((1UL<<b
)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
675 dynamicWindow
=(int8_t)(b
-SC0
);
677 } else /* if(SD0<=b && b<=SD7) */ {
678 dynamicWindow
=(int8_t)(b
-SD0
);
681 } else if(/* SQ0<=b && */ b
<=SQ7
) {
682 quoteWindow
=(int8_t)(b
-SQ0
);
689 isSingleByteMode
=FALSE
;
692 /* callback(illegal) */
693 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
699 /* store the first byte of a multibyte sequence in toUBytes[] */
/* two-byte quoted unit: byteOne is the high byte, b the low byte */
710 *target
++=(UChar
)((byteOne
<<8)|b
);
715 /* all static offsets are in the BMP */
716 *target
++=(UChar
)(staticOffsets
[quoteWindow
]+b
);
718 /* write from dynamic window */
719 uint32_t c
=scsu
->toUDynamicOffsets
[quoteWindow
]+(b
&0x7f);
723 /* output surrogate pair */
724 *target
++=(UChar
)(0xd7c0+(c
>>10));
725 if(target
<targetLimit
) {
726 *target
++=(UChar
)(0xdc00|(c
&0x3ff));
728 /* target overflow */
729 cnv
->UCharErrorBuffer
[0]=(UChar
)(0xdc00|(c
&0x3ff));
730 cnv
->UCharErrorBufferLength
=1;
731 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
/* extended window definition: top 3 bits of b are the window number, low 5 bits the high part of the offset */
739 dynamicWindow
=(int8_t)((b
>>5)&7);
740 byteOne
=(uint8_t)(b
&0x1f);
746 scsu
->toUDynamicOffsets
[dynamicWindow
]=0x10000+(byteOne
<<15UL | b
<<7UL);
751 /* callback(illegal): Reserved window offset value 0 */
755 } else if(b
<gapThreshold
) {
756 scsu
->toUDynamicOffsets
[dynamicWindow
]=b
<<7UL;
757 } else if((uint8_t)(b
-gapThreshold
)<(reservedStart
-gapThreshold
)) {
758 scsu
->toUDynamicOffsets
[dynamicWindow
]=(b
<<7UL)+gapOffset
;
759 } else if(b
>=fixedThreshold
) {
760 scsu
->toUDynamicOffsets
[dynamicWindow
]=fixedOffsets
[b
-fixedThreshold
];
762 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
772 /* fast path for Unicode mode */
773 if(state
==readCommand
) {
/* needs two source bytes per unit; stops at any byte in the tag range UC0..Urs */
775 while(source
+1<sourceLimit
&& target
<targetLimit
&& (uint8_t)((b
=*source
)-UC0
)>(Urs
-UC0
)) {
776 *target
++=(UChar
)((b
<<8)|source
[1]);
781 /* normal state machine for Unicode mode */
782 /* unicodeByteMode: */
783 while(source
<sourceLimit
) {
784 if(target
>=targetLimit
) {
786 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
792 if((uint8_t)(b
-UC0
)>(Urs
-UC0
)) {
797 } else if(/* UC0<=b && */ b
<=UC7
) {
798 dynamicWindow
=(int8_t)(b
-UC0
);
799 isSingleByteMode
=TRUE
;
801 } else if(/* UD0<=b && */ b
<=UD7
) {
802 dynamicWindow
=(int8_t)(b
-UD0
);
803 isSingleByteMode
=TRUE
;
809 isSingleByteMode
=TRUE
;
819 /* callback(illegal) */
820 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
833 *target
++=(UChar
)((byteOne
<<8)|b
);
841 /* set the converter state back into UConverter */
842 if(U_FAILURE(*pErrorCode
) && *pErrorCode
!=U_BUFFER_OVERFLOW_ERROR
) {
843 /* reset to deal with the next character */
845 } else if(state
==readCommand
) {
846 /* not in a multi-byte sequence, reset toULength */
849 scsu
->toUIsSingleByteMode
=isSingleByteMode
;
850 scsu
->toUState
=state
;
851 scsu
->toUQuoteWindow
=quoteWindow
;
852 scsu
->toUDynamicWindow
=dynamicWindow
;
853 scsu
->toUByteOne
=byteOne
;
855 /* write back the updated pointers */
856 pArgs
->source
=(const char *)source
;
857 pArgs
->target
=target
;
861 /* SCSU-from-Unicode conversion functions ----------------------------------- */
864 * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve
865 * reasonable results. The lookahead is minimal.
866 * Many cases are simple:
867 * A character fits directly into the current mode, a dynamic or static window,
868 * or is not compressible. These cases are tested first.
869 * Real compression heuristics are applied to the rest, in code branches for
870 * single/Unicode mode and BMP/supplementary code points.
871 * The heuristics used here are extremely simple.
874 /* get the number of the window that this character is in, or -1 */
/*
 * Find the window that contains a code point.
 *
 * offsets - start offsets of 8 windows (static or dynamic)
 * c       - code point to look up
 *
 * Returns the index (0..7) of the first window whose 128-code-point range
 * [offset, offset+0x7f] contains c, or -1 if none does.
 */
static int8_t
getWindow(const uint32_t offsets[8], uint32_t c) {
    int i;

    for(i=0; i<8; ++i) {
        /* unsigned subtraction: c below the offset wraps around to a huge value and fails the test */
        if((uint32_t)(c-offsets[i])<=0x7f) {
            return (int8_t)i;
        }
    }
    return -1;
}
886 /* is the character in the dynamic window starting at the offset, or in the direct-encoded range? */
888 isInOffsetWindowOrDirect(uint32_t offset
, uint32_t c
) {
889 return (UBool
)(c
<=offset
+0x7f &&
890 (c
>=offset
|| (c
<=0x7f &&
891 (c
>=0x20 || (1UL<<c
)&0x2601))));
892 /* binary 0010 0110 0000 0001,
893 check for b==0xd || b==0xa || b==9 || b==0 */
897 * getNextDynamicWindow returns the next dynamic window to be redefined
900 getNextDynamicWindow(SCSUData
*scsu
) {
901 int8_t window
=scsu
->windowUse
[scsu
->nextWindowUseIndex
];
902 if(++scsu
->nextWindowUseIndex
==8) {
903 scsu
->nextWindowUseIndex
=0;
909 * useDynamicWindow() adjusts
910 * windowUse[] and nextWindowUseIndex for the algorithm to choose
911 * the next dynamic window to be defined;
912 * a subclass may override it and provide its own algorithm.
915 useDynamicWindow(SCSUData
*scsu
, int8_t window
) {
917 * move the existing window, which just became the most recently used one,
918 * up in windowUse[] to nextWindowUseIndex-1
921 /* first, find the index of the window - backwards to favor the more recently used windows */
924 i
=scsu
->nextWindowUseIndex
;
929 } while(scsu
->windowUse
[i
]!=window
);
931 /* now copy each windowUse[i+1] to [i] */
936 while(j
!=scsu
->nextWindowUseIndex
) {
937 scsu
->windowUse
[i
]=scsu
->windowUse
[j
];
942 /* finally, set the window into the most recently used index */
943 scsu
->windowUse
[i
]=window
;
947 * calculate the offset and the code for a dynamic window that contains the character
948 * takes fixed offsets into account
949 * the offset of the window is stored in the offset variable,
950 * the code is returned
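952 * return value ("offset code"): -1 if no window can be defined for c;
952 * a value <=0xff is the offset byte for SDn/UDn; a larger value is the
952 * offset code for SDX/UDX (subtract 0x200 to get the true code)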
955 getDynamicOffset(uint32_t c
, uint32_t *pOffset
) {
959 if((uint32_t)(c
-fixedOffsets
[i
])<=0x7f) {
960 *pOffset
=fixedOffsets
[i
];
966 /* No dynamic window for US-ASCII. */
968 } else if(c
<0x3400 ||
969 (uint32_t)(c
-0x10000)<(0x14000-0x10000) ||
970 (uint32_t)(c
-0x1d000)<=(0x1ffff-0x1d000)
972 /* This character is in a code range for a "small", i.e., reasonably windowable, script. */
973 *pOffset
=c
&0x7fffff80;
975 } else if(0xe000<=c
&& c
!=0xfeff && c
<0xfff0) {
976 /* For these characters we need to take the gapOffset into account. */
977 *pOffset
=c
&0x7fffff80;
978 return (int)((c
-gapOffset
)>>7);
985 * Idea for compression:
986 * - save SCSUData and other state before really starting work
987 * - at endloop, see if compression could be better with just unicode mode
988 * - don't do this if a callback has been called
989 * - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning
990 * - different buffer handling!
992 * Drawback or need for corrective handling:
993 * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and
994 * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible
995 * not only for compression but also for HTML/XML documents with following charset/encoding announcers.
997 * How to achieve both?
998 * - Only replace the result after an SDX or SCU?
/*
 * Convert UTF-16 to SCSU bytes and record, for each output byte, the index of
 * the source code unit it came from.
 *
 * pArgs      - supplies the converter, the source/sourceLimit UChar range, the
 *              target/targetLimit byte range and the offsets array; the
 *              advanced source, target and offsets pointers are written back
 * pErrorCode - set to U_BUFFER_OVERFLOW_ERROR when the target is full and to
 *              U_ILLEGAL_CHAR_FOUND for unmatched surrogates
 *
 * The encoder state (mode and current dynamic window) is loaded from the
 * converter's SCSUData on entry and stored back on exit. Each encoded
 * character is assembled in c as up to four bytes, most significant first,
 * and written by the common output code at the end; bytes that do not fit
 * into the target go to cnv->charErrorBuffer.
 *
 * NOTE(review): a number of statements, labels and declarations of this
 * function are not visible in this listing.
 */
1002 _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs
*pArgs
,
1003 UErrorCode
*pErrorCode
) {
1006 const UChar
*source
, *sourceLimit
;
1008 int32_t targetCapacity
;
1011 UBool isSingleByteMode
;
1012 uint8_t dynamicWindow
;
1013 uint32_t currentOffset
;
1017 int32_t sourceIndex
, nextSourceIndex
;
1021 /* variables for compression heuristics */
1027 /* set up the local pointers */
1028 cnv
=pArgs
->converter
;
1029 scsu
=(SCSUData
*)cnv
->extraInfo
;
1031 /* set up the local pointers */
1032 source
=pArgs
->source
;
1033 sourceLimit
=pArgs
->sourceLimit
;
1034 target
=(uint8_t *)pArgs
->target
;
1035 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
1036 offsets
=pArgs
->offsets
;
1038 /* get the state machine state */
1039 isSingleByteMode
=scsu
->fromUIsSingleByteMode
;
1040 dynamicWindow
=scsu
->fromUDynamicWindow
;
1041 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1045 /* sourceIndex=-1 if the current character began in the previous buffer */
1046 sourceIndex
= c
==0 ? 0 : -1;
1049 /* similar conversion "loop" as in toUnicode */
1051 if(isSingleByteMode
) {
/* c!=0 on entry means a code unit is pending; continue with its trail surrogate */
1052 if(c
!=0 && targetCapacity
>0) {
1053 goto getTrailSingle
;
1056 /* state machine for single-byte mode */
1057 /* singleByteMode: */
1058 while(source
<sourceLimit
) {
1059 if(targetCapacity
<=0) {
1060 /* target is full */
1061 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1067 if((c
-0x20)<=0x5f) {
1068 /* pass US-ASCII graphic character through */
1069 *target
++=(uint8_t)c
;
1071 *offsets
++=sourceIndex
;
1075 if((1UL<<c
)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1077 *target
++=(uint8_t)c
;
1079 *offsets
++=sourceIndex
;
1083 /* quote C0 control character */
1088 } else if((delta
=c
-currentOffset
)<=0x7f) {
1089 /* use the current dynamic window */
1090 *target
++=(uint8_t)(delta
|0x80);
1092 *offsets
++=sourceIndex
;
1095 } else if(UTF_IS_SURROGATE(c
)) {
1096 if(UTF_IS_SURROGATE_FIRST(c
)) {
1099 if(source
<sourceLimit
) {
1100 /* test the following code unit */
1102 if(UTF_IS_SECOND_SURROGATE(trail
)) {
1105 c
=UTF16_GET_PAIR_VALUE(c
, trail
);
1106 /* convert this surrogate code point */
1107 /* exit this condition tree */
1109 /* this is an unmatched lead code unit (1st surrogate) */
1110 /* callback(illegal) */
1111 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1119 /* this is an unmatched trail code unit (2nd surrogate) */
1120 /* callback(illegal) */
1121 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1125 /* compress supplementary character U+10000..U+10ffff */
1126 if((delta
=c
-currentOffset
)<=0x7f) {
1127 /* use the current dynamic window */
1128 *target
++=(uint8_t)(delta
|0x80);
1130 *offsets
++=sourceIndex
;
1133 } else if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1134 /* there is a dynamic window that contains this character, change to it */
1135 dynamicWindow
=window
;
1136 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1137 useDynamicWindow(scsu
, dynamicWindow
);
/* two output bytes packed into c: the SCn tag, then the window-relative byte */
1138 c
=((uint32_t)(SC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1141 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1142 /* might check if there are more characters in this window to come */
1143 /* define an extended window with this character */
1145 dynamicWindow
=getNextDynamicWindow(scsu
);
1146 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1147 useDynamicWindow(scsu
, dynamicWindow
);
/* four output bytes packed into c: SDX, window number and offset code, then the character byte */
1148 c
=((uint32_t)SDX
<<24)|((uint32_t)dynamicWindow
<<21)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1152 /* change to Unicode mode and output this (lead, trail) pair */
1153 isSingleByteMode
=FALSE
;
1154 *target
++=(uint8_t)SCU
;
1156 *offsets
++=sourceIndex
;
1159 c
=((uint32_t)lead
<<16)|trail
;
1164 /* quote C1 control character */
1165 c
=(c
&0x7f)|(SQ0
+1)<<8; /* SQ0+1==SQ1 */
1168 } else if(c
==0xfeff || c
>=0xfff0) {
1169 /* quote signature character=byte order mark and specials */
1174 /* compress all other BMP characters */
1175 if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1176 /* there is a window defined that contains this character - switch to it or quote from it? */
1177 if(source
>=sourceLimit
|| isInOffsetWindowOrDirect(scsu
->fromUDynamicOffsets
[window
], *source
)) {
1178 /* change to dynamic window */
1179 dynamicWindow
=window
;
1180 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1181 useDynamicWindow(scsu
, dynamicWindow
);
1182 c
=((uint32_t)(SC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1186 /* quote from dynamic window */
1187 c
=((uint32_t)(SQ0
+window
)<<8)|(c
-scsu
->fromUDynamicOffsets
[window
])|0x80;
1191 } else if((window
=getWindow(staticOffsets
, c
))>=0) {
1192 /* quote from static window */
1193 c
=((uint32_t)(SQ0
+window
)<<8)|(c
-staticOffsets
[window
]);
1196 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1197 /* define a dynamic window with this character */
1198 dynamicWindow
=getNextDynamicWindow(scsu
);
1199 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1200 useDynamicWindow(scsu
, dynamicWindow
);
1201 c
=((uint32_t)(SD0
+dynamicWindow
)<<16)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1204 } else if((uint32_t)(c
-0x3400)<(0xd800-0x3400) &&
1205 (source
>=sourceLimit
|| (uint32_t)(*source
-0x3400)<(0xd800-0x3400))
1208 * this character is not compressible (a BMP ideograph or similar);
1209 * switch to Unicode mode if this is the last character in the block
1210 * or there is at least one more ideograph following immediately
1212 isSingleByteMode
=FALSE
;
1224 /* normal end of conversion: prepare for a new character */
1226 sourceIndex
=nextSourceIndex
;
1229 if(c
!=0 && targetCapacity
>0) {
1230 goto getTrailUnicode
;
1233 /* state machine for Unicode mode */
1234 /* unicodeByteMode: */
1235 while(source
<sourceLimit
) {
1236 if(targetCapacity
<=0) {
1237 /* target is full */
1238 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1244 if((uint32_t)(c
-0x3400)<(0xd800-0x3400)) {
1245 /* not compressible, write character directly */
1246 if(targetCapacity
>=2) {
1247 *target
++=(uint8_t)(c
>>8);
1248 *target
++=(uint8_t)c
;
1250 *offsets
++=sourceIndex
;
1251 *offsets
++=sourceIndex
;
1258 } else if((uint32_t)(c
-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
1259 /* compress BMP character if the following one is not an uncompressible ideograph */
1260 if(!(source
<sourceLimit
&& (uint32_t)(*source
-0x3400)<(0xd800-0x3400))) {
1261 if(((uint32_t)(c
-0x30)<10 || (uint32_t)(c
-0x61)<26 || (uint32_t)(c
-0x41)<26)) {
1262 /* ASCII digit or letter */
1263 isSingleByteMode
=TRUE
;
1264 c
|=((uint32_t)(UC0
+dynamicWindow
)<<8)|c
;
1267 } else if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1268 /* there is a dynamic window that contains this character, change to it */
1269 isSingleByteMode
=TRUE
;
1270 dynamicWindow
=window
;
1271 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1272 useDynamicWindow(scsu
, dynamicWindow
);
1273 c
=((uint32_t)(UC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1276 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1277 /* define a dynamic window with this character */
1278 isSingleByteMode
=TRUE
;
1279 dynamicWindow
=getNextDynamicWindow(scsu
);
1280 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1281 useDynamicWindow(scsu
, dynamicWindow
);
1282 c
=((uint32_t)(UD0
+dynamicWindow
)<<16)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1288 /* don't know how to compress this character, just write it directly */
1291 } else if(c
<0xe000) {
1292 /* c is a surrogate */
1293 if(UTF_IS_SURROGATE_FIRST(c
)) {
1296 if(source
<sourceLimit
) {
1297 /* test the following code unit */
1299 if(UTF_IS_SECOND_SURROGATE(trail
)) {
1302 c
=UTF16_GET_PAIR_VALUE(c
, trail
);
1303 /* convert this surrogate code point */
1304 /* exit this condition tree */
1306 /* this is an unmatched lead code unit (1st surrogate) */
1307 /* callback(illegal) */
1308 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1316 /* this is an unmatched trail code unit (2nd surrogate) */
1317 /* callback(illegal) */
1318 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1322 /* compress supplementary character */
1323 if( (window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0 &&
1324 !(source
<sourceLimit
&& (uint32_t)(*source
-0x3400)<(0xd800-0x3400))
1327 * there is a dynamic window that contains this character and
1328 * the following character is not uncompressible,
1329 * change to the window
1331 isSingleByteMode
=TRUE
;
1332 dynamicWindow
=window
;
1333 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1334 useDynamicWindow(scsu
, dynamicWindow
);
1335 c
=((uint32_t)(UC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1338 } else if(source
<sourceLimit
&& lead
==*source
&& /* too lazy to check trail in same window as source[1] */
1339 (code
=getDynamicOffset(c
, &offset
))>=0
1341 /* two supplementary characters in (probably) the same window - define an extended one */
1342 isSingleByteMode
=TRUE
;
1344 dynamicWindow
=getNextDynamicWindow(scsu
);
1345 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1346 useDynamicWindow(scsu
, dynamicWindow
);
1347 c
=((uint32_t)UDX
<<24)|((uint32_t)dynamicWindow
<<21)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1351 /* don't know how to compress this character, just write it directly */
1352 c
=((uint32_t)lead
<<16)|trail
;
1356 } else /* 0xe000<=c<0xf300 */ {
1357 /* quote to avoid SCSU tags */
1363 /* normal end of conversion: prepare for a new character */
1365 sourceIndex
=nextSourceIndex
;
1370 /* set the converter state back into UConverter */
1371 scsu
->fromUIsSingleByteMode
=isSingleByteMode
;
1372 scsu
->fromUDynamicWindow
=dynamicWindow
;
1376 /* write back the updated pointers */
1377 pArgs
->source
=source
;
1378 pArgs
->target
=(char *)target
;
1379 pArgs
->offsets
=offsets
;
1383 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1384 /* from the first if in the loop we know that targetCapacity>0 */
1385 if(length
<=targetCapacity
) {
1388 /* each branch falls through to the next one */
1390 *target
++=(uint8_t)(c
>>24);
1392 *target
++=(uint8_t)(c
>>16);
1394 *target
++=(uint8_t)(c
>>8);
1396 *target
++=(uint8_t)c
;
1398 /* will never occur */
1403 /* each branch falls through to the next one */
1405 *target
++=(uint8_t)(c
>>24);
1406 *offsets
++=sourceIndex
;
1408 *target
++=(uint8_t)(c
>>16);
1409 *offsets
++=sourceIndex
;
1411 *target
++=(uint8_t)(c
>>8);
1412 *offsets
++=sourceIndex
;
1414 *target
++=(uint8_t)c
;
1415 *offsets
++=sourceIndex
;
1417 /* will never occur */
1421 targetCapacity
-=length
;
1423 /* normal end of conversion: prepare for a new character */
1425 sourceIndex
=nextSourceIndex
;
1431 * We actually do this backwards here:
1432 * In order to save an intermediate variable, we output
1433 * first to the overflow buffer what does not fit into the
1436 /* we know that 1<=targetCapacity<length<=4 */
/* from here on, length counts only the bytes that overflow */
1437 length
-=targetCapacity
;
1438 p
=(uint8_t *)cnv
->charErrorBuffer
;
1440 /* each branch falls through to the next one */
1442 *p
++=(uint8_t)(c
>>16);
1444 *p
++=(uint8_t)(c
>>8);
1448 /* will never occur */
1451 cnv
->charErrorBufferLength
=(int8_t)length
;
1453 /* now output what fits into the regular target */
1454 c
>>=8*length
; /* length was reduced by targetCapacity */
1455 switch(targetCapacity
) {
1456 /* each branch falls through to the next one */
1458 *target
++=(uint8_t)(c
>>16);
1460 *offsets
++=sourceIndex
;
1463 *target
++=(uint8_t)(c
>>8);
1465 *offsets
++=sourceIndex
;
1468 *target
++=(uint8_t)c
;
1470 *offsets
++=sourceIndex
;
1473 /* will never occur */
1477 /* target overflow */
1479 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1486 * Identical to _SCSUFromUnicodeWithOffsets but without offset handling.
1487 * If a change is made in the original function, then either
1488 * change this function the same way or
1489 * re-copy the original function and remove the variables
1490 * offsets, sourceIndex, and nextSourceIndex.
/*
 * _SCSUFromUnicode: fromUnicode conversion for the SCSU converter, without
 * writing an offsets array.
 *
 * pArgs      - supplies the converter plus source/sourceLimit and
 *              target/targetLimit; source and target are written back to
 *              pArgs before the function ends.
 * pErrorCode - set to U_BUFFER_OVERFLOW_ERROR when the target is full and to
 *              U_ILLEGAL_CHAR_FOUND for an unmatched surrogate code unit.
 *
 * The encoder state (single-byte vs. Unicode mode and the current dynamic
 * window) is loaded from the SCSUData in cnv->extraInfo at entry and stored
 * back into it at the end.
 *
 * Multi-byte output sequences are assembled in c, most significant byte
 * first, and written by the shared output code near the end of the function.
 */
1493 _SCSUFromUnicode(UConverterFromUnicodeArgs
*pArgs
,
1494 UErrorCode
*pErrorCode
) {
1497 const UChar
*source
, *sourceLimit
;
1499 int32_t targetCapacity
;
1501 UBool isSingleByteMode
;
1502 uint8_t dynamicWindow
;
1503 uint32_t currentOffset
;
1509 /* variables for compression heuristics */
1515 /* set up the local pointers */
1516 cnv
=pArgs
->converter
;
1517 scsu
=(SCSUData
*)cnv
->extraInfo
;
1519 /* set up the local pointers */
1520 source
=pArgs
->source
;
1521 sourceLimit
=pArgs
->sourceLimit
;
1522 target
=(uint8_t *)pArgs
->target
;
1523 targetCapacity
=(int32_t)(pArgs
->targetLimit
-pArgs
->target
);
1525 /* get the state machine state */
1526 isSingleByteMode
=scsu
->fromUIsSingleByteMode
;
1527 dynamicWindow
=scsu
->fromUDynamicWindow
;
1528 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1532 /* similar conversion "loop" as in toUnicode */
1534 if(isSingleByteMode
) {
/*
 * NOTE(review): the initial assignment to c is not visible in this listing;
 * presumably a non-zero c is a code unit carried over from an earlier call.
 * In that case processing resumes at the getTrailSingle label.
 */
1535 if(c
!=0 && targetCapacity
>0) {
1536 goto getTrailSingle
;
1539 /* state machine for single-byte mode */
1540 /* singleByteMode: */
1541 while(source
<sourceLimit
) {
1542 if(targetCapacity
<=0) {
1543 /* target is full */
1544 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
/* one comparison selects 0x20..0x7f, assuming c is unsigned (its declaration is not visible here) */
1549 if((c
-0x20)<=0x5f) {
1550 /* pass US-ASCII graphic character through */
1551 *target
++=(uint8_t)c
;
/* mask 0x2601 has bits 0, 9, 10 and 13 set: NUL, TAB, LF and CR are written unquoted */
1554 if((1UL<<c
)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1556 *target
++=(uint8_t)c
;
1559 /* quote C0 control character */
1564 } else if((delta
=c
-currentOffset
)<=0x7f) {
1565 /* use the current dynamic window */
1566 *target
++=(uint8_t)(delta
|0x80);
1568 } else if(UTF_IS_SURROGATE(c
)) {
1569 if(UTF_IS_SURROGATE_FIRST(c
)) {
1572 if(source
<sourceLimit
) {
1573 /* test the following code unit */
1575 if(UTF_IS_SECOND_SURROGATE(trail
)) {
1577 c
=UTF16_GET_PAIR_VALUE(c
, trail
);
1578 /* convert this surrogate code point */
1579 /* exit this condition tree */
1581 /* this is an unmatched lead code unit (1st surrogate) */
1582 /* callback(illegal) */
1583 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1591 /* this is an unmatched trail code unit (2nd surrogate) */
1592 /* callback(illegal) */
1593 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1597 /* compress supplementary character U+10000..U+10ffff */
1598 if((delta
=c
-currentOffset
)<=0x7f) {
1599 /* use the current dynamic window */
1600 *target
++=(uint8_t)(delta
|0x80);
1602 } else if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1603 /* there is a dynamic window that contains this character, change to it */
1604 dynamicWindow
=window
;
1605 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1606 useDynamicWindow(scsu
, dynamicWindow
);
1607 c
=((uint32_t)(SC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1610 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1611 /* might check if there are more characters in this window to come */
1612 /* define an extended window with this character */
1614 dynamicWindow
=getNextDynamicWindow(scsu
);
1615 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1616 useDynamicWindow(scsu
, dynamicWindow
);
/*
 * four bytes packed into c, most significant first: the SDX tag, then the
 * window number (shifted to bit 21) combined with code (shifted to bit 8),
 * then the window-relative value with bit 7 set
 */
1617 c
=((uint32_t)SDX
<<24)|((uint32_t)dynamicWindow
<<21)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1621 /* change to Unicode mode and output this (lead, trail) pair */
1622 isSingleByteMode
=FALSE
;
1623 *target
++=(uint8_t)SCU
;
/* the SCU tag was written directly above; c now holds lead in its upper 16 bits and trail in its lower 16 bits */
1625 c
=((uint32_t)lead
<<16)|trail
;
1630 /* quote C1 control character */
1631 c
=(c
&0x7f)|(SQ0
+1)<<8; /* SQ0+1==SQ1 */
1634 } else if(c
==0xfeff || c
>=0xfff0) {
1635 /* quote signature character=byte order mark and specials */
1640 /* compress all other BMP characters */
1641 if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1642 /* there is a window defined that contains this character - switch to it or quote from it? */
1643 if(source
>=sourceLimit
|| isInOffsetWindowOrDirect(scsu
->fromUDynamicOffsets
[window
], *source
)) {
1644 /* change to dynamic window */
1645 dynamicWindow
=window
;
1646 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1647 useDynamicWindow(scsu
, dynamicWindow
);
1648 c
=((uint32_t)(SC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1652 /* quote from dynamic window */
1653 c
=((uint32_t)(SQ0
+window
)<<8)|(c
-scsu
->fromUDynamicOffsets
[window
])|0x80;
1657 } else if((window
=getWindow(staticOffsets
, c
))>=0) {
1658 /* quote from static window */
1659 c
=((uint32_t)(SQ0
+window
)<<8)|(c
-staticOffsets
[window
]);
1662 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1663 /* define a dynamic window with this character */
1664 dynamicWindow
=getNextDynamicWindow(scsu
);
1665 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1666 useDynamicWindow(scsu
, dynamicWindow
);
1667 c
=((uint32_t)(SD0
+dynamicWindow
)<<16)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
/* (uint32_t)(x-0x3400)<(0xd800-0x3400) is a single-comparison test for 0x3400<=x<0xd800 */
1670 } else if((uint32_t)(c
-0x3400)<(0xd800-0x3400) &&
1671 (source
>=sourceLimit
|| (uint32_t)(*source
-0x3400)<(0xd800-0x3400))
1674 * this character is not compressible (a BMP ideograph or similar);
1675 * switch to Unicode mode if this is the last character in the block
1676 * or there is at least one more ideograph following immediately
1678 isSingleByteMode
=FALSE
;
1690 /* normal end of conversion: prepare for a new character */
/* Unicode-mode counterpart of the resume check made at the top of the single-byte branch */
1694 if(c
!=0 && targetCapacity
>0) {
1695 goto getTrailUnicode
;
1698 /* state machine for Unicode mode */
1699 /* unicodeByteMode: */
1700 while(source
<sourceLimit
) {
1701 if(targetCapacity
<=0) {
1702 /* target is full */
1703 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1708 if((uint32_t)(c
-0x3400)<(0xd800-0x3400)) {
1709 /* not compressible, write character directly */
1710 if(targetCapacity
>=2) {
1711 *target
++=(uint8_t)(c
>>8);
1712 *target
++=(uint8_t)c
;
1718 } else if((uint32_t)(c
-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
1719 /* compress BMP character if the following one is not an uncompressible ideograph */
1720 if(!(source
<sourceLimit
&& (uint32_t)(*source
-0x3400)<(0xd800-0x3400))) {
1721 if(((uint32_t)(c
-0x30)<10 || (uint32_t)(c
-0x61)<26 || (uint32_t)(c
-0x41)<26)) {
1722 /* ASCII digit or letter */
1723 isSingleByteMode
=TRUE
;
/* c is OR-ed with itself here, which changes nothing; the effective result is ((UC0+dynamicWindow)<<8)|c */
1724 c
|=((uint32_t)(UC0
+dynamicWindow
)<<8)|c
;
1727 } else if((window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0) {
1728 /* there is a dynamic window that contains this character, change to it */
1729 isSingleByteMode
=TRUE
;
1730 dynamicWindow
=window
;
1731 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1732 useDynamicWindow(scsu
, dynamicWindow
);
1733 c
=((uint32_t)(UC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1736 } else if((code
=getDynamicOffset(c
, &offset
))>=0) {
1737 /* define a dynamic window with this character */
1738 isSingleByteMode
=TRUE
;
1739 dynamicWindow
=getNextDynamicWindow(scsu
);
1740 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1741 useDynamicWindow(scsu
, dynamicWindow
);
1742 c
=((uint32_t)(UD0
+dynamicWindow
)<<16)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1748 /* don't know how to compress this character, just write it directly */
1751 } else if(c
<0xe000) {
1752 /* c is a surrogate */
1753 if(UTF_IS_SURROGATE_FIRST(c
)) {
1756 if(source
<sourceLimit
) {
1757 /* test the following code unit */
1759 if(UTF_IS_SECOND_SURROGATE(trail
)) {
1761 c
=UTF16_GET_PAIR_VALUE(c
, trail
);
1762 /* convert this surrogate code point */
1763 /* exit this condition tree */
1765 /* this is an unmatched lead code unit (1st surrogate) */
1766 /* callback(illegal) */
1767 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1775 /* this is an unmatched trail code unit (2nd surrogate) */
1776 /* callback(illegal) */
1777 *pErrorCode
=U_ILLEGAL_CHAR_FOUND
;
1781 /* compress supplementary character */
1782 if( (window
=getWindow(scsu
->fromUDynamicOffsets
, c
))>=0 &&
1783 !(source
<sourceLimit
&& (uint32_t)(*source
-0x3400)<(0xd800-0x3400))
1786 * there is a dynamic window that contains this character and
1787 * the following character is not uncompressible,
1788 * change to the window
1790 isSingleByteMode
=TRUE
;
1791 dynamicWindow
=window
;
1792 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
];
1793 useDynamicWindow(scsu
, dynamicWindow
);
1794 c
=((uint32_t)(UC0
+dynamicWindow
)<<8)|(c
-currentOffset
)|0x80;
1797 } else if(source
<sourceLimit
&& lead
==*source
&& /* too lazy to check trail in same window as source[1] */
1798 (code
=getDynamicOffset(c
, &offset
))>=0
1800 /* two supplementary characters in (probably) the same window - define an extended one */
1801 isSingleByteMode
=TRUE
;
1803 dynamicWindow
=getNextDynamicWindow(scsu
);
1804 currentOffset
=scsu
->fromUDynamicOffsets
[dynamicWindow
]=offset
;
1805 useDynamicWindow(scsu
, dynamicWindow
);
1806 c
=((uint32_t)UDX
<<24)|((uint32_t)dynamicWindow
<<21)|((uint32_t)code
<<8)|(c
-currentOffset
)|0x80;
1810 /* don't know how to compress this character, just write it directly */
1811 c
=((uint32_t)lead
<<16)|trail
;
1815 } else /* 0xe000<=c<0xf300 */ {
1816 /* quote to avoid SCSU tags */
1822 /* normal end of conversion: prepare for a new character */
1828 /* set the converter state back into UConverter */
1829 scsu
->fromUIsSingleByteMode
=isSingleByteMode
;
1830 scsu
->fromUDynamicWindow
=dynamicWindow
;
1834 /* write back the updated pointers */
1835 pArgs
->source
=source
;
1836 pArgs
->target
=(char *)target
;
1840 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1841 /* from the first if in the loop we know that targetCapacity>0 */
/* NOTE(review): length is presumably the number of bytes held in c; it is set on lines not visible in this listing */
1842 if(length
<=targetCapacity
) {
1844 /* each branch falls through to the next one */
1846 *target
++=(uint8_t)(c
>>24);
1848 *target
++=(uint8_t)(c
>>16);
1850 *target
++=(uint8_t)(c
>>8);
1852 *target
++=(uint8_t)c
;
1854 /* will never occur */
1857 targetCapacity
-=length
;
1859 /* normal end of conversion: prepare for a new character */
1866 * We actually do this backwards here:
1867 * In order to save an intermediate variable, we output
1868 * first to the overflow buffer what does not fit into the
1871 /* we know that 1<=targetCapacity<length<=4 */
1872 length
-=targetCapacity
;
/* the bytes that do not fit go to cnv->charErrorBuffer; their count is stored in charErrorBufferLength below */
1873 p
=(uint8_t *)cnv
->charErrorBuffer
;
1875 /* each branch falls through to the next one */
1877 *p
++=(uint8_t)(c
>>16);
1879 *p
++=(uint8_t)(c
>>8);
1883 /* will never occur */
1886 cnv
->charErrorBufferLength
=(int8_t)length
;
1888 /* now output what fits into the regular target */
1889 c
>>=8*length
; /* length was reduced by targetCapacity */
1890 switch(targetCapacity
) {
1891 /* each branch falls through to the next one */
1893 *target
++=(uint8_t)(c
>>16);
1895 *target
++=(uint8_t)(c
>>8);
1897 *target
++=(uint8_t)c
;
1899 /* will never occur */
1903 /* target overflow */
1905 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
1911 /* miscellaneous ------------------------------------------------------------ */
/*
 * _SCSUGetName: returns a converter name string chosen by the locale value
 * stored in the converter's SCSUData (cnv->extraInfo).
 * NOTE(review): the case labels and any other return statements of the switch
 * are not visible in this listing; only the "SCSU,locale=ja" result is shown.
 */
1914 _SCSUGetName(const UConverter
*cnv
) {
1915 SCSUData
*scsu
=(SCSUData
*)cnv
->extraInfo
;
1917 switch(scsu
->locale
) {
1919 return "SCSU,locale=ja";
/*
 * _SCSUWriteSub: writes the substitution character U+fffd through
 * ucnv_cbFromUWriteBytes(), choosing the byte form by the converter's current
 * mode (fromUIsSingleByteMode in the SCSUData).
 *
 * pArgs       - fromUnicode arguments; pArgs->converter supplies the SCSU state
 * offsetIndex - passed through unchanged to ucnv_cbFromUWriteBytes()
 * pErrorCode  - passed through unchanged to ucnv_cbFromUWriteBytes()
 *
 * NOTE(review): the source/length argument lines of both calls are not visible
 * in this listing; presumably the single-byte call passes all of squ_fffd and
 * the Unicode-mode call passes only its last two bytes - confirm.
 */
1926 _SCSUWriteSub(UConverterFromUnicodeArgs
*pArgs
,
1927 int32_t offsetIndex
,
1928 UErrorCode
*pErrorCode
) {
/* the SQU tag byte followed by the two bytes ff fd of U+fffd */
1929 static const char squ_fffd
[]={ (char)SQU
, (char)0xffu
, (char)0xfdu
};
1932 * The substitution character is U+fffd={ ff, fd }.
1933 * If the SCSU converter is in Unicode mode, then these two bytes just need to
1934 * be written. Otherwise, this character is quoted.
1936 if(((SCSUData
*)pArgs
->converter
->extraInfo
)->fromUIsSingleByteMode
) {
1937 /* single-byte mode: quote Unicode */
1938 ucnv_cbFromUWriteBytes(pArgs
,
1940 offsetIndex
, pErrorCode
);
1942 /* Unicode mode: just write U+fffd */
1943 ucnv_cbFromUWriteBytes(pArgs
,
1945 offsetIndex
, pErrorCode
);
1949 /* structure for SafeClone calculations */
/*
 * NOTE(review): the member list is not visible in this listing;
 * _SCSUSafeClone() accesses members named cnv and mydata.
 */
1950 struct cloneSCSUStruct
/*
 * _SCSUSafeClone: completes a clone of an SCSU converter inside a
 * caller-provided buffer laid out as struct cloneSCSUStruct.
 *
 * cnv         - converter being cloned; its extraInfo (SCSUData) is copied
 * pBufferSize - a value of 0 is a preflighting request: the required size,
 *               sizeof(struct cloneSCSUStruct), is stored into *pBufferSize
 * Returns the address of the clone's embedded UConverter (&localClone->cnv).
 *
 * The clone gets its own copy of the SCSUData, its extraInfo points at that
 * copy, and isExtraLocal is set to TRUE.
 *
 * NOTE(review): the stackBuffer and status parameter declarations and the
 * early-return statements are not visible in this listing. The visible code
 * does not compare *pBufferSize with bufferSizeNeeded before using
 * stackBuffer - confirm that the caller guarantees a large enough buffer.
 */
1957 _SCSUSafeClone(const UConverter
*cnv
,
1959 int32_t *pBufferSize
,
1962 struct cloneSCSUStruct
* localClone
;
1963 int32_t bufferSizeNeeded
= sizeof(struct cloneSCSUStruct
);
1965 if (U_FAILURE(*status
)){
1969 if (*pBufferSize
== 0){ /* 'preflighting' request - set needed size into *pBufferSize */
1970 *pBufferSize
= bufferSizeNeeded
;
1974 localClone
= (struct cloneSCSUStruct
*)stackBuffer
;
1975 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
/* copy the SCSU state by value, then point the clone at its own copy */
1977 uprv_memcpy(&localClone
->mydata
, cnv
->extraInfo
, sizeof(SCSUData
));
1978 localClone
->cnv
.extraInfo
= &localClone
->mydata
;
1979 localClone
->cnv
.isExtraLocal
= TRUE
;
1981 return &localClone
->cnv
;
/*
 * Implementation function table for the SCSU converter; _SCSUData points to it.
 * NOTE(review): only some entries are visible in this listing - the
 * with-offsets toUnicode/fromUnicode functions and ucnv_getCompleteUnicodeSet.
 */
1985 static const UConverterImpl _SCSUImpl
={
1996 _SCSUToUnicodeWithOffsets
,
1998 _SCSUFromUnicodeWithOffsets
,
2005 ucnv_getCompleteUnicodeSet
/*
 * Static converter description for SCSU; _SCSUData points to it.
 * The substitution bytes { 0x0e, 0xff, 0xfd } with length 3 are the SQU tag
 * (0x0E) followed by U+fffd, the same bytes as squ_fffd in _SCSUWriteSub().
 */
2008 static const UConverterStaticData _SCSUStaticData
={
2009 sizeof(UConverterStaticData
),
2011 0, /* CCSID for SCSU */
2012 UCNV_IBM
, UCNV_SCSU
,
2013 1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */
2015 * ### TODO the subchar really must be written by an SCSU function
2016 * however, currently SCSU's fromUnicode() never causes errors, therefore
2017 * no callbacks will be called and no subchars written
2018 * See Jitterbug 2837 - RFE: forbid converting surrogate code points in all charsets
2020 { 0x0e, 0xff, 0xfd, 0 }, 3,
2024 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
2027 const UConverterSharedData _SCSUData
={
2028 sizeof(UConverterSharedData
), ~((uint32_t)0),
2029 NULL
, NULL
, &_SCSUStaticData
, FALSE
, &_SCSUImpl
,