]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | ****************************************************************************** | |
3 | * | |
4 | * Copyright (C) 2000-2004, International Business Machines | |
5 | * Corporation and others. All Rights Reserved. | |
6 | * | |
7 | ****************************************************************************** | |
8 | * file name: ucnvmbcs.h | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2000jul07 | |
14 | * created by: Markus W. Scherer | |
15 | */ | |
16 | ||
17 | #ifndef __UCNVMBCS_H__ | |
18 | #define __UCNVMBCS_H__ | |
19 | ||
20 | #include "unicode/utypes.h" | |
21 | ||
22 | #if !UCONFIG_NO_CONVERSION | |
23 | ||
24 | #include "unicode/ucnv.h" | |
25 | #include "ucnv_cnv.h" | |
26 | ||
27 | /** | |
28 | * ICU conversion (.cnv) data file structure, following the usual UDataInfo | |
29 | * header. | |
30 | * | |
31 | * Format version: 6.2 | |
32 | * | |
33 | * struct UConverterStaticData -- struct containing the converter name, IBM CCSID, | |
34 | * min/max bytes per character, etc. | |
35 | * see ucnv_bld.h | |
36 | * | |
37 | * -------------------- | |
38 | * | |
39 | * The static data is followed by conversionType-specific data structures. | |
40 | * At the moment, there are only variations of MBCS converters. They all have | |
41 | * the same toUnicode structures, while the fromUnicode structures for SBCS | |
42 | * differ from those for other MBCS-style converters. | |
43 | * | |
44 | * _MBCSHeader.version 4.2 adds an optional conversion extension data structure. | |
45 | * If it is present, then an ICU version reading header versions 4.0 or 4.1 | |
46 | * will be able to use the base table and ignore the extension. | |
47 | * | |
48 | * The unicodeMask in the static data is part of the base table data structure. | |
49 | * Especially, the UCNV_HAS_SUPPLEMENTARY flag determines the length of the | |
50 | * fromUnicode stage 1 array. | |
51 | * The static data unicodeMask refers only to the base table's properties if | |
52 | * a base table is included. | |
53 | * In an extension-only file, the static data unicodeMask is 0. | |
54 | * The extension data indexes have a separate field with the unicodeMask flags. | |
55 | * | |
56 | * MBCS-style data structure following the static data. | |
57 | * Offsets are counted in bytes from the beginning of the MBCS header structure. | |
58 | * Details about usage in comments in ucnvmbcs.c. | |
59 | * | |
60 | * struct _MBCSHeader (see the definition in this header file below) | |
61 | * contains 32-bit fields as follows: | |
62 | * 8 values: | |
63 | * 0 uint8_t[4] MBCS version in UVersionInfo format (currently 4.2.0.0) | |
64 | * 1 uint32_t countStates | |
65 | * 2 uint32_t countToUFallbacks | |
66 | * 3 uint32_t offsetToUCodeUnits | |
67 | * 4 uint32_t offsetFromUTable | |
68 | * 5 uint32_t offsetFromUBytes | |
69 | * 6 uint32_t flags, bits: | |
70 | * 31.. 8 offsetExtension -- _MBCSHeader.version 4.2 (ICU 2.8) and higher | |
71 | * 0 for older versions and if | |
72 | * there is no extension structure | |
73 | * 7.. 0 outputType | |
74 | * 7 uint32_t fromUBytesLength -- _MBCSHeader.version 4.1 (ICU 2.4) and higher | |
75 | * counts bytes in fromUBytes[] | |
76 | * | |
77 | * if(outputType==MBCS_OUTPUT_EXT_ONLY) { | |
78 | * -- base table name for extension-only table | |
79 | * char baseTableName[variable]; -- with NUL plus padding for 4-alignment | |
80 | * | |
81 | * -- all _MBCSHeader fields except for version and flags are 0 | |
82 | * } else { | |
83 | * -- normal base table with optional extension | |
84 | * | |
85 | * int32_t stateTable[countStates][256]; | |
86 | * | |
87 | * struct _MBCSToUFallback { (fallbacks are sorted by offset) | |
88 | * uint32_t offset; | |
89 | * UChar32 codePoint; | |
90 | * } toUFallbacks[countToUFallbacks]; | |
91 | * | |
92 | * uint16_t unicodeCodeUnits[(offsetFromUTable-offsetToUCodeUnits)/2]; | |
93 | * (padded to an even number of units) | |
94 | * | |
95 | * -- stage 1 tables | |
96 | * if(staticData.unicodeMask&UCNV_HAS_SUPPLEMENTARY) { | |
97 | * -- stage 1 table for all of Unicode | |
98 | * uint16_t fromUTable[0x440]; (32-bit-aligned) | |
99 | * } else { | |
100 | * -- BMP-only tables have a smaller stage 1 table | |
101 | * uint16_t fromUTable[0x40]; (32-bit-aligned) | |
102 | * } | |
103 | * | |
104 | * -- stage 2 tables | |
105 | * length determined by top of stage 1 and bottom of stage 3 tables | |
106 | * if(outputType==MBCS_OUTPUT_1) { | |
107 | * -- SBCS: pure indexes | |
108 | * uint16_t stage 2 indexes[?]; | |
109 | * } else { | |
110 | * -- DBCS, MBCS, EBCDIC_STATEFUL, ...: roundtrip flags and indexes | |
111 | * uint32_t stage 2 flags and indexes[?]; | |
112 | * } | |
113 | * | |
114 | * -- stage 3 tables with byte results | |
115 | * if(outputType==MBCS_OUTPUT_1) { | |
116 | * -- SBCS: each 16-bit result contains flags and the result byte, see ucnvmbcs.c | |
117 | * uint16_t fromUBytes[fromUBytesLength/2]; | |
118 | * } else { | |
119 | * -- DBCS, MBCS, EBCDIC_STATEFUL, ... 2/3/4 bytes result, see ucnvmbcs.c | |
120 | * uint8_t fromUBytes[fromUBytesLength]; or | |
121 | * uint16_t fromUBytes[fromUBytesLength/2]; or | |
122 | * uint32_t fromUBytes[fromUBytesLength/4]; | |
123 | * } | |
124 | * } | |
125 | * | |
126 | * -- extension table, details see ucnv_ext.h | |
127 | * int32_t indexes[>=32]; ... | |
128 | */ | |
129 | ||
130 | /* MBCS converter data and state -------------------------------------------- */ | |
131 | ||
/**
 * Maximum number of states in an MBCS state table.
 * (The state field extracted by MBCS_ENTRY_STATE() is 7 bits wide.)
 */
enum { MBCS_MAX_STATE_COUNT = 128 };
135 | ||
/**
 * Action codes for conversions to Unicode.
 *
 * A final state table entry carries one of these values in bits 23..20
 * (see MBCS_ENTRY_FINAL_ACTION()). The numeric values are spelled out
 * because they are stored in state table entries; in particular
 * MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16() relies on
 * MBCS_STATE_VALID_DIRECT_16 being 0.
 */
enum {
    MBCS_STATE_VALID_DIRECT_16    = 0,
    MBCS_STATE_VALID_DIRECT_20    = 1,

    MBCS_STATE_FALLBACK_DIRECT_16 = 2,
    MBCS_STATE_FALLBACK_DIRECT_20 = 3,

    MBCS_STATE_VALID_16           = 4,
    MBCS_STATE_VALID_16_PAIR      = 5,

    MBCS_STATE_UNASSIGNED         = 6,
    MBCS_STATE_ILLEGAL            = 7,

    MBCS_STATE_CHANGE_ONLY        = 8
};
155 | ||
/*
 * Macros for state table entries.
 *
 * Layout of a 32-bit entry, as implied by the shifts and masks below:
 *   bit  31       0 for a transition entry, 1 for a final entry
 *   bits 30..24   state
 *   transition:   bits 23..0  offset
 *   final:        bits 23..20 action code, bits 19..0 value
 *
 * Every expansion is wrapped in an outer pair of parentheses so that it
 * remains a single operand in any context (for example after sizeof or
 * in front of a postfix operator).
 */

/* Builders and modifiers for transition entries (bit 31 clear). */
#define MBCS_ENTRY_TRANSITION(state, offset) ((int32_t)(((int32_t)(state)<<24L)|(offset)))
#define MBCS_ENTRY_TRANSITION_SET_OFFSET(entry, offset) ((int32_t)(((entry)&0xff000000)|(offset)))
#define MBCS_ENTRY_TRANSITION_ADD_OFFSET(entry, offset) ((int32_t)((entry)+(offset)))

/*
 * Builders and modifiers for final entries (bit 31 set).
 * The action is cast to int32_t before shifting, like in the
 * MBCS_ENTRY_FINAL_SET_ACTION*() macros, so that the shift does not depend
 * on the type of the argument.
 */
#define MBCS_ENTRY_FINAL(state, action, value) ((int32_t)(0x80000000|((int32_t)(state)<<24L)|((int32_t)(action)<<20L)|(value)))
#define MBCS_ENTRY_SET_FINAL(entry) ((int32_t)((entry)|0x80000000))
#define MBCS_ENTRY_FINAL_SET_ACTION(entry, action) ((int32_t)(((entry)&0xff0fffff)|((int32_t)(action)<<20L)))
#define MBCS_ENTRY_FINAL_SET_VALUE(entry, value) ((int32_t)(((entry)&0xfff00000)|(value)))
#define MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, action, value) ((int32_t)(((entry)&0xff000000)|((int32_t)(action)<<20L)|(value)))

/* Replace the state bits 30..24, keeping bit 31 and bits 23..0. */
#define MBCS_ENTRY_SET_STATE(entry, state) ((int32_t)(((entry)&0x80ffffff)|((int32_t)(state)<<24L)))

/* State bits 30..24 of any entry. */
#define MBCS_ENTRY_STATE(entry) (((entry)>>24)&0x7f)

/* The sign bit distinguishes the two kinds of entries. */
#define MBCS_ENTRY_IS_TRANSITION(entry) ((entry)>=0)
#define MBCS_ENTRY_IS_FINAL(entry) ((entry)<0)

/* Accessors for transition entries; these assume bit 31 is clear. */
#define MBCS_ENTRY_TRANSITION_STATE(entry) ((entry)>>24)
#define MBCS_ENTRY_TRANSITION_OFFSET(entry) ((entry)&0xffffff)

/*
 * Accessors for final entries.
 * MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16() is a single signed comparison that
 * is true only for final entries whose state bits and action bits are all 0,
 * that is, entries in the range 0x80000000..0x800fffff.
 */
#define MBCS_ENTRY_FINAL_STATE(entry) (((entry)>>24)&0x7f)
#define MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry) ((entry)<(int32_t)0x80100000)
#define MBCS_ENTRY_FINAL_ACTION(entry) (((entry)>>20)&0xf)
#define MBCS_ENTRY_FINAL_VALUE(entry) ((entry)&0xfffff)
#define MBCS_ENTRY_FINAL_VALUE_16(entry) ((uint16_t)(entry))
182 | ||
/*
 * fromUnicode lookup macros.
 *
 * The code point c is split into three parts:
 *   c>>10          index into stage 1
 *   (c>>4)&0x3f    index into the stage 2 block selected by stage 1
 *   c&0xf          index into the stage 3 block selected by stage 2
 *
 * The expansions are array element accesses, so they can also be used on
 * the left side of an assignment.
 */

/* single-byte fromUnicode: get the 16-bit result word */
#define MBCS_SINGLE_RESULT_FROM_U(table, results, c) \
    ((results)[ (table)[ (table)[(c)>>10] +(((c)>>4)&0x3f) ] +((c)&0xf) ])

/* multi-byte fromUnicode: get the 32-bit stage 2 entry */
#define MBCS_STAGE_2_FROM_U(table, c) \
    (((const uint32_t *)(table))[ (table)[(c)>>10] +(((c)>>4)&0x3f) ])

/* test the per-code point flag in bits 31..16 of a stage 2 entry */
#define MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) \
    ( ((stage2Entry) & ((uint32_t)1<< (16+((c)&0xf)) )) !=0)

/* stage 3 access: the lower 16 bits of the stage 2 entry select a block of 16 results */
#define MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c) \
    (((uint16_t *)(bytes))[16*(uint32_t)(uint16_t)(stage2Entry)+((c)&0xf)])
#define MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c) \
    (((uint32_t *)(bytes))[16*(uint32_t)(uint16_t)(stage2Entry)+((c)&0xf)])

/* 3-byte results: pointer to the first of the 3 bytes */
#define MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c) \
    ((bytes)+(16*(uint32_t)(uint16_t)(stage2Entry)+((c)&0xf))*3)
194 | ||
195 | ||
/**
 * Output types for conversions from Unicode.
 *
 * The type is a per-converter property. It selects the storage method used
 * in stage 3 of the fromUnicode lookup table, mostly the number of bytes
 * stored per entry. The values are part of the data format (bits 7..0 of
 * _MBCSHeader.flags), hence they are written out explicitly.
 */
enum {
    MBCS_OUTPUT_1        = 0x0,
    MBCS_OUTPUT_2        = 0x1,
    MBCS_OUTPUT_3        = 0x2,
    MBCS_OUTPUT_4        = 0x3,

    MBCS_OUTPUT_3_EUC    = 0x8,
    MBCS_OUTPUT_4_EUC    = 0x9,

    MBCS_OUTPUT_2_SISO   = 0xc,
    MBCS_OUTPUT_2_HZ     = 0xd,

    /* extension-only table */
    MBCS_OUTPUT_EXT_ONLY = 0xe,

    /* one more than the preceding enumerator */
    MBCS_OUTPUT_COUNT,

    /* runtime-only type for DBCS-only handling of SISO tables */
    MBCS_OUTPUT_DBCS_ONLY = 0xdb
};
219 | ||
220 | /** | |
221 | * Fallbacks to Unicode are stored outside the normal state table and code point structures | |
222 | * in a vector of items of this type. They are sorted by offset. | |
223 | */ | |
224 | typedef struct { | |
225 | uint32_t offset; | |
226 | UChar32 codePoint; | |
227 | } _MBCSToUFallback; | |
228 | ||
229 | /** | |
230 | * This is the MBCS part of the UConverterTable union (a runtime data structure). | |
231 | * It keeps all the per-converter data and points into the loaded mapping tables. | |
232 | */ | |
233 | typedef struct UConverterMBCSTable { | |
234 | /* toUnicode */ | |
235 | uint8_t countStates, dbcsOnlyState, stateTableOwned; | |
236 | uint32_t countToUFallbacks; | |
237 | ||
238 | const int32_t (*stateTable)/*[countStates]*/[256]; | |
239 | int32_t (*swapLFNLStateTable)/*[countStates]*/[256]; /* for swaplfnl */ | |
240 | const uint16_t *unicodeCodeUnits/*[countUnicodeResults]*/; | |
241 | const _MBCSToUFallback *toUFallbacks; | |
242 | ||
243 | /* fromUnicode */ | |
244 | const uint16_t *fromUnicodeTable; | |
245 | const uint8_t *fromUnicodeBytes; | |
246 | uint8_t *swapLFNLFromUnicodeBytes; /* for swaplfnl */ | |
247 | uint32_t fromUBytesLength; | |
248 | uint8_t outputType, unicodeMask; | |
249 | ||
250 | /* converter name for swaplfnl */ | |
251 | char *swapLFNLName; | |
252 | ||
253 | /* extension data */ | |
254 | struct UConverterSharedData *baseSharedData; | |
255 | const int32_t *extIndexes; | |
256 | } UConverterMBCSTable; | |
257 | ||
258 | /** | |
259 | * MBCS data header. See data format description above. | |
260 | */ | |
261 | typedef struct { | |
262 | UVersionInfo version; | |
263 | uint32_t countStates, | |
264 | countToUFallbacks, | |
265 | offsetToUCodeUnits, | |
266 | offsetFromUTable, | |
267 | offsetFromUBytes, | |
268 | flags, | |
269 | fromUBytesLength; | |
270 | } _MBCSHeader; | |
271 | ||
272 | /* | |
273 | * This is a simple version of _MBCSGetNextUChar() that is used | |
274 | * by other converter implementations. | |
275 | * It only returns an "assigned" result if it consumes the entire input. | |
276 | * It does not use state from the converter, nor error codes. | |
277 | * It does not handle the EBCDIC swaplfnl option (set in UConverter). | |
278 | * It handles conversion extensions but not GB 18030. | |
279 | * | |
280 | * Return value: | |
281 | * U+fffe unassigned | |
282 | * U+ffff illegal | |
283 | * otherwise the Unicode code point | |
284 | */ | |
285 | U_CFUNC UChar32 | |
286 | ucnv_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData, | |
287 | const char *source, int32_t length, | |
288 | UBool useFallback); | |
289 | ||
290 | /** | |
291 | * This version of _MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages. | |
292 | * It does not handle the EBCDIC swaplfnl option (set in UConverter). | |
293 | * It does not handle conversion extensions (_extToU()). | |
294 | */ | |
295 | U_CFUNC UChar32 | |
296 | ucnv_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData, | |
297 | uint8_t b, UBool useFallback); | |
298 | ||
/**
 * Macro counterpart of ucnv_MBCSSingleSimpleGetNextUChar(): gets a code
 * point from a byte by reading the lower 16 bits of the byte's entry in
 * row 0 of the state table.
 *
 * Only usable with single-byte, single-state codepages that map
 * exclusively to and from BMP code points. Fallback values are always
 * returned.
 */
#define _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(sharedData, b) \
    ((UChar)MBCS_ENTRY_FINAL_VALUE_16( \
        (sharedData)->mbcs.stateTable[0][(uint8_t)(b)]))
307 | ||
308 | /** | |
309 | * This is an internal function that allows other converter implementations | |
310 | * to check whether a byte is a lead byte. | |
311 | */ | |
312 | U_CFUNC UBool | |
313 | ucnv_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte); | |
314 | ||
/**
 * Macro version of ucnv_MBCSIsLeadByte(): a byte counts as a lead byte
 * if its entry in row 0 of the state table is a transition entry.
 */
#define _MBCS_IS_LEAD_BYTE(sharedData, byte) \
    ((UBool)MBCS_ENTRY_IS_TRANSITION( \
        (sharedData)->mbcs.stateTable[0][(uint8_t)(byte)]))
318 | ||
319 | /* | |
320 | * This is another simple conversion function for internal use by other | |
321 | * conversion implementations. | |
322 | * It does not use the converter state nor call callbacks. | |
323 | * It does not handle the EBCDIC swaplfnl option (set in UConverter). | |
324 | * It handles conversion extensions but not GB 18030. | |
325 | * | |
326 | * It converts one single Unicode code point into codepage bytes, encoded | |
327 | * as one 32-bit value. The function returns the number of bytes in *pValue: | |
328 | * 1..4 the number of bytes in *pValue | |
329 | * 0 unassigned (*pValue undefined) | |
330 | * -1 illegal (currently not used, *pValue undefined) | |
331 | * | |
332 | * *pValue will contain the resulting bytes with the last byte in bits 7..0, | |
333 | * the second to last byte in bits 15..8, etc. | |
334 | * Currently, the function assumes but does not check that 0<=c<=0x10ffff. | |
335 | */ | |
336 | U_CFUNC int32_t | |
337 | ucnv_MBCSFromUChar32(UConverterSharedData *sharedData, | |
338 | UChar32 c, uint32_t *pValue, | |
339 | UBool useFallback); | |
340 | ||
341 | /** | |
342 | * This version of _MBCSFromUChar32() is optimized for single-byte codepages. | |
343 | * It does not handle the EBCDIC swaplfnl option (set in UConverter). | |
344 | * | |
345 | * It returns the codepage byte for the code point, or -1 if it is unassigned. | |
346 | */ | |
347 | U_CFUNC int32_t | |
348 | ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData, | |
349 | UChar32 c, | |
350 | UBool useFallback); | |
351 | ||
352 | /** | |
353 | * SBCS, DBCS, and EBCDIC_STATEFUL are replaced by MBCS, but | |
354 | * we cheat a little about the type, returning the old types if appropriate. | |
355 | */ | |
356 | U_CFUNC UConverterType | |
357 | ucnv_MBCSGetType(const UConverter* converter); | |
358 | ||
359 | U_CFUNC void | |
360 | ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, | |
361 | UErrorCode *pErrorCode); | |
362 | U_CFUNC void | |
363 | ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, | |
364 | UErrorCode *pErrorCode); | |
365 | ||
366 | /* | |
367 | * Internal function returning a UnicodeSet for toUnicode() conversion. | |
368 | * Currently only used for ISO-2022-CN, and only handles roundtrip mappings. | |
369 | * In the future, if we add support for reverse-fallback sets, this function | |
370 | * needs to be updated, and called for each initial state. | |
371 | * Does not currently handle extensions. | |
372 | * Does not empty the set first. | |
373 | */ | |
374 | U_CFUNC void | |
375 | ucnv_MBCSGetUnicodeSetForBytes(const UConverterSharedData *sharedData, | |
376 | const USetAdder *sa, | |
377 | UConverterUnicodeSet which, | |
378 | uint8_t state, int32_t lowByte, int32_t highByte, | |
379 | UErrorCode *pErrorCode); | |
380 | ||
381 | /* | |
382 | * Internal function returning a UnicodeSet for toUnicode() conversion. | |
383 | * Currently only used for ISO-2022-CN, and only handles roundtrip mappings. | |
384 | * In the future, if we add support for fallback sets, this function | |
385 | * needs to be updated. | |
386 | * Handles extensions. | |
387 | * Does not empty the set first. | |
388 | */ | |
389 | U_CFUNC void | |
390 | ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, | |
391 | const USetAdder *sa, | |
392 | UConverterUnicodeSet which, | |
393 | UErrorCode *pErrorCode); | |
394 | ||
395 | #endif | |
396 | ||
397 | #endif |