]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
/*
******************************************************************************
*
*   Copyright (C) 2000-2001, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
******************************************************************************
*   file name:  ucnvmbcs.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2000jul07
*   created by: Markus W. Scherer
*/
16 | ||
17 | #ifndef __UCNVMBCS_H__ | |
18 | #define __UCNVMBCS_H__ | |
19 | ||
20 | #include "unicode/utypes.h" | |
21 | #include "unicode/ucnv.h" | |
22 | #include "ucnv_bld.h" | |
23 | ||
/* MBCS converter data and state -------------------------------------------- */
25 | ||
/**
 * Action codes for MBCS conversions to Unicode.
 * A state table entry carries one of these codes in its bits 23..20.
 * The values are written out explicitly because they are what is stored
 * in those bits; they must not be renumbered.
 */
enum {
    MBCS_STATE_VALID_DIRECT_16    = 0,
    MBCS_STATE_VALID_DIRECT_20    = 1,

    MBCS_STATE_FALLBACK_DIRECT_16 = 2,
    MBCS_STATE_FALLBACK_DIRECT_20 = 3,

    MBCS_STATE_VALID_16           = 4,
    MBCS_STATE_VALID_16_PAIR      = 5,

    MBCS_STATE_UNASSIGNED         = 6,
    MBCS_STATE_ILLEGAL            = 7,

    MBCS_STATE_CHANGE_ONLY        = 8
};
45 | ||
/*
 * Macros for building and modifying state table entries.
 *
 * Entry layout as used by these macros (32-bit signed value):
 *   bit  31      set for a final entry, clear for a transition entry
 *   bits 30..24  state
 *   transition entries: bits 23..0  offset
 *   final entries:      bits 23..20 action code, bits 19..0 value
 *
 * None of the macros mask their state/action/offset/value arguments;
 * the caller must pass values that fit into the respective bit field.
 * Every replacement list is fully parenthesized so that an expansion can be
 * used safely as an operand of sizeof or of a postfix operator.
 */

/* Build a transition entry from a state and an offset. */
#define MBCS_ENTRY_TRANSITION(state, offset) ((int32_t)(((int32_t)(state)<<24L)|(offset)))
/* Replace the low 24 bits of an entry with offset, keeping bits 31..24. */
#define MBCS_ENTRY_TRANSITION_SET_OFFSET(entry, offset) ((int32_t)(((entry)&0xff000000)|(offset)))
/* Add offset to the entry value (no overflow check into the state bits). */
#define MBCS_ENTRY_TRANSITION_ADD_OFFSET(entry, offset) ((int32_t)((entry)+(offset)))

/* Build a final entry from a state, an action code and a value. */
#define MBCS_ENTRY_FINAL(state, action, value) ((int32_t)(0x80000000|((int32_t)(state)<<24L)|((int32_t)(action)<<20L)|(value)))
/* Mark an entry as final by setting bit 31. */
#define MBCS_ENTRY_SET_FINAL(entry) ((int32_t)((entry)|0x80000000))
/* Replace the action code (bits 23..20), keeping everything else. */
#define MBCS_ENTRY_FINAL_SET_ACTION(entry, action) ((int32_t)(((entry)&0xff0fffff)|((int32_t)(action)<<20L)))
/* Replace the value (bits 19..0), keeping bits 31..20. */
#define MBCS_ENTRY_FINAL_SET_VALUE(entry, value) ((int32_t)(((entry)&0xfff00000)|(value)))
/* Replace both action code and value, keeping bits 31..24. */
#define MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, action, value) ((int32_t)(((entry)&0xff000000)|((int32_t)(action)<<20L)|(value)))

/* Replace the state (bits 30..24), keeping bit 31 and bits 23..0. */
#define MBCS_ENTRY_SET_STATE(entry, state) ((int32_t)(((entry)&0x80ffffff)|((int32_t)(state)<<24L)))
58 | ||
/*
 * Macros for reading state table entries.
 * Bit 31 of an entry distinguishes final (set, i.e. negative) from
 * transition (clear, i.e. non-negative) entries; bits 30..24 hold the state.
 * Every replacement list is fully parenthesized.
 */

/* State (bits 30..24) of any entry, final or transition. */
#define MBCS_ENTRY_STATE(entry) (((entry)>>24)&0x7f)

/* An entry is a transition if bit 31 is clear, final if it is set. */
#define MBCS_ENTRY_IS_TRANSITION(entry) ((entry)>=0)
#define MBCS_ENTRY_IS_FINAL(entry) ((entry)<0)

/* State of a transition entry; unmasked, so only valid when bit 31 is clear. */
#define MBCS_ENTRY_TRANSITION_STATE(entry) ((entry)>>24)
/* Offset (bits 23..0) of a transition entry. */
#define MBCS_ENTRY_TRANSITION_OFFSET(entry) ((entry)&0xffffff)

/* State (bits 30..24) of a final entry. */
#define MBCS_ENTRY_FINAL_STATE(entry) (((entry)>>24)&0x7f)
/*
 * True exactly for entries in the range 0x80000000..0x800fffff, that is,
 * final entries with state 0 and action code 0 (MBCS_STATE_VALID_DIRECT_16).
 */
#define MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry) ((entry)<(int32_t)0x80100000)
/* Action code (bits 23..20) of a final entry. */
#define MBCS_ENTRY_FINAL_ACTION(entry) (((entry)>>20)&0xf)
/* Value (bits 19..0) of a final entry. */
#define MBCS_ENTRY_FINAL_VALUE(entry) ((entry)&0xfffff)
/* Low 16 bits of a final entry, as a uint16_t. */
#define MBCS_ENTRY_FINAL_VALUE_16(entry) ((uint16_t)(entry))
72 | ||
/*
 * fromUnicode lookup macros.
 * Each macro indexes with pieces of the code point c:
 *   (c)>>10          selects the first-level index,
 *   ((c)>>4)&0x3f    the second-level index,
 *   (c)&0xf          the position inside a block of 16.
 * All of them evaluate c (and most evaluate table) more than once,
 * so arguments must be free of side effects.
 */

/* single-byte fromUnicode: fetch the 16-bit result word for c */
#define MBCS_SINGLE_RESULT_FROM_U(table, results, c) \
    ((results)[ (table)[ (table)[(c)>>10] + (((c)>>4)&0x3f) ] + ((c)&0xf) ])

/* multi-byte fromUnicode: fetch the 32-bit stage 2 entry for c */
#define MBCS_STAGE_2_FROM_U(table, c) \
    (((const uint32_t *)(table))[ (table)[(c)>>10] + (((c)>>4)&0x3f) ])

/* test the per-code-point flag in bit 16+((c)&0xf) of a stage 2 entry */
#define MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) \
    ( ((stage2Entry) & ((uint32_t)1 << (16 + ((c)&0xf)))) != 0 )

/*
 * Result access for 2- and 4-byte units: the low 16 bits of the stage 2
 * entry select a block of 16 units in bytes, (c)&0xf the unit in that block.
 */
#define MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c) \
    (((uint16_t *)(bytes))[ 16*(uint32_t)(uint16_t)(stage2Entry) + ((c)&0xf) ])
#define MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c) \
    (((uint32_t *)(bytes))[ 16*(uint32_t)(uint16_t)(stage2Entry) + ((c)&0xf) ])

/* Same indexing for 3-byte units; yields a pointer to the first of the 3 bytes. */
#define MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c) \
    ((bytes) + (16*(uint32_t)(uint16_t)(stage2Entry) + ((c)&0xf)) * 3)
84 | ||
85 | ||
/**
 * Output types for MBCS conversions from Unicode.
 * The type is a per-converter property; it determines how stage 3 of the
 * lookup table is stored, mostly how many bytes each entry occupies.
 */
enum {
    MBCS_OUTPUT_1      = 0x0,
    MBCS_OUTPUT_2      = 0x1,
    MBCS_OUTPUT_3      = 0x2,
    MBCS_OUTPUT_4      = 0x3,

    MBCS_OUTPUT_3_EUC  = 0x8,
    MBCS_OUTPUT_4_EUC  = 0x9,

    MBCS_OUTPUT_2_SISO = 0xc,
    MBCS_OUTPUT_2_HZ   = 0xd
};
103 | ||
104 | /** | |
105 | * Fallbacks to Unicode are stored outside the normal state table and code point structures | |
106 | * in a vector of items of this type. They are sorted by offset. | |
107 | */ | |
108 | typedef struct { | |
109 | uint32_t offset; | |
110 | UChar32 codePoint; | |
111 | } _MBCSToUFallback; | |
112 | ||
113 | /** | |
114 | * This is the MBCS part of the UConverterTable union (a runtime data structure). | |
115 | * It keeps all the per-converter data and points into the loaded mapping tables. | |
116 | */ | |
117 | typedef struct UConverterMBCSTable { | |
118 | /* toUnicode */ | |
119 | uint8_t countStates; | |
120 | uint32_t countToUFallbacks; | |
121 | ||
122 | const int32_t (*stateTable)/*[countStates]*/[256]; | |
123 | int32_t (*swapLFNLStateTable)/*[countStates]*/[256]; /* for swaplfnl */ | |
124 | const uint16_t *unicodeCodeUnits/*[countUnicodeResults]*/; | |
125 | const _MBCSToUFallback *toUFallbacks; | |
126 | ||
127 | /* fromUnicode */ | |
128 | const uint16_t *fromUnicodeTable; | |
129 | const uint8_t *fromUnicodeBytes; | |
130 | uint8_t *swapLFNLFromUnicodeBytes; /* for swaplfnl */ | |
131 | uint32_t fromUBytesLength; | |
132 | uint8_t outputType, unicodeMask; | |
133 | ||
134 | /* converter name for swaplfnl */ | |
135 | char *swapLFNLName; | |
136 | } UConverterMBCSTable; | |
137 | ||
138 | /** | |
139 | * MBCS data structure as part of a .cnv file: | |
140 | * | |
141 | * uint32_t [8]; -- 8 values: | |
142 | * 0 MBCS version in UVersionInfo format (1.0.0.0) | |
143 | * 1 countStates | |
144 | * 2 countToUFallbacks | |
145 | * 3 offsetToUCodeUnits (offsets are counted from the beginning of this header structure) | |
146 | * 4 offsetFromUTable | |
147 | * 5 offsetFromUBytes | |
148 | * 6 flags, bits: | |
149 | * 31.. 8 reserved | |
150 | * 7.. 0 outputType | |
151 | * 7 fromUBytesLength -- header.version 4.1 (ICU 2.4) and higher | |
152 | * | |
153 | * stateTable[countStates][256]; | |
154 | * | |
155 | * struct { (fallbacks are sorted by offset) | |
156 | * uint32_t offset; | |
157 | * UChar32 codePoint; | |
158 | * } toUFallbacks[countToUFallbacks]; | |
159 | * | |
160 | * uint16_t unicodeCodeUnits[?]; (even number of units or padded) | |
161 | * | |
162 | * uint16_t fromUTable[0x440+?]; (32-bit-aligned) | |
163 | * | |
164 | * uint8_t fromUBytes[?]; | |
165 | */ | |
166 | typedef struct { | |
167 | UVersionInfo version; | |
168 | uint32_t countStates, | |
169 | countToUFallbacks, | |
170 | offsetToUCodeUnits, | |
171 | offsetFromUTable, | |
172 | offsetFromUBytes, | |
173 | flags, | |
174 | fromUBytesLength; | |
175 | } _MBCSHeader; | |
176 | ||
177 | /** | |
178 | * This is a simple version of _MBCSGetNextUChar() that is used | |
179 | * by other converter implementations. | |
180 | * It does not use state from the converter, nor error codes. | |
181 | * It does not handle the EBCDIC swaplfnl option (set in UConverter). | |
182 | * | |
183 | * Return value: | |
184 | * U+fffe unassigned | |
185 | * U+ffff illegal | |
186 | * otherwise the Unicode code point | |
187 | */ | |
188 | U_CFUNC UChar32 | |
189 | _MBCSSimpleGetNextUChar(UConverterSharedData *sharedData, | |
190 | const char **pSource, const char *sourceLimit, | |
191 | UBool useFallback); | |
192 | ||
193 | /** | |
194 | * This version of _MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages. | |
195 | * It does not handle the EBCDIC swaplfnl option (set in UConverter). | |
196 | */ | |
197 | U_CFUNC UChar32 | |
198 | _MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData, | |
199 | uint8_t b, UBool useFallback); | |
200 | ||
/**
 * Macro version of _MBCSSingleSimpleGetNextUChar(): gets a code point from a byte.
 * It works for single-byte, single-state codepages that only map
 * to and from BMP code points, and it always returns fallback values.
 *
 * It reads the entry for byte b from row 0 of the state table and yields the
 * low 16 bits of that entry as a UChar. The expansion is fully parenthesized
 * so that it can be used as an operand of any operator.
 */
#define _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(sharedData, b) \
    ((UChar)MBCS_ENTRY_FINAL_VALUE_16((sharedData)->table->mbcs.stateTable[0][(uint8_t)(b)]))
209 | ||
210 | /** | |
211 | * This is an internal function that allows other converter implementations | |
212 | * to check whether a byte is a lead byte. | |
213 | */ | |
214 | U_CFUNC UBool | |
215 | _MBCSIsLeadByte(UConverterSharedData *sharedData, char byte); | |
216 | ||
/**
 * Macro version of _MBCSIsLeadByte().
 * A byte counts as a lead byte when its entry in row 0 of the state table
 * is a transition entry. The expansion is fully parenthesized so that it
 * can be used as an operand of any operator.
 */
#define _MBCS_IS_LEAD_BYTE(sharedData, byte) \
    ((UBool)MBCS_ENTRY_IS_TRANSITION((sharedData)->table->mbcs.stateTable[0][(uint8_t)(byte)]))
220 | ||
221 | /** | |
222 | * This is another simple conversion function for internal use by other | |
223 | * conversion implementations. | |
224 | * It does not use the converter state nor call callbacks. | |
225 | * It does not handle the EBCDIC swaplfnl option (set in UConverter). | |
226 | * | |
227 | * It converts one single Unicode code point into codepage bytes, encoded | |
228 | * as one 32-bit value. The function returns the number of bytes in *pValue: | |
229 | * 1..4 the number of bytes in *pValue | |
230 | * 0 unassigned (*pValue undefined) | |
231 | * -1 illegal (currently not used, *pValue undefined) | |
232 | * | |
233 | * *pValue will contain the resulting bytes with the last byte in bits 7..0, | |
234 | * the second to last byte in bits 15..8, etc. | |
235 | * Currently, the function assumes but does not check that 0<=c<=0x10ffff. | |
236 | */ | |
237 | U_CFUNC int32_t | |
238 | _MBCSFromUChar32(UConverterSharedData *sharedData, | |
239 | UChar32 c, uint32_t *pValue, | |
240 | UBool useFallback); | |
241 | ||
242 | /** | |
243 | * This version of _MBCSFromUChar32() is optimized for single-byte codepages. | |
244 | * It does not handle the EBCDIC swaplfnl option (set in UConverter). | |
245 | * | |
246 | * It returns the codepage byte for the code point, or -1 if it is unassigned. | |
247 | */ | |
248 | U_CFUNC int32_t | |
249 | _MBCSSingleFromUChar32(UConverterSharedData *sharedData, | |
250 | UChar32 c, | |
251 | UBool useFallback); | |
252 | ||
253 | /** | |
254 | * SBCS, DBCS, and EBCDIC_STATEFUL are replaced by MBCS, but | |
255 | * we cheat a little about the type, returning the old types if appropriate. | |
256 | */ | |
257 | U_CFUNC UConverterType | |
258 | _MBCSGetType(const UConverter* converter); | |
259 | ||
260 | U_CFUNC void | |
261 | _MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, | |
262 | UErrorCode *pErrorCode); | |
263 | U_CFUNC void | |
264 | _MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, | |
265 | UErrorCode *pErrorCode); | |
266 | ||
267 | ||
268 | #endif |