]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/ucnvmbcs.h
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / common / ucnvmbcs.h
CommitLineData
b75a7d8f
A
1/*
2******************************************************************************
3*
4* Copyright (C) 2000-2001, International Business Machines
5* Corporation and others. All Rights Reserved.
6*
7******************************************************************************
8* file name: ucnvmbcs.h
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2000jul07
14* created by: Markus W. Scherer
15*/
16
17#ifndef __UCNVMBCS_H__
18#define __UCNVMBCS_H__
19
20#include "unicode/utypes.h"
21#include "unicode/ucnv.h"
22#include "ucnv_bld.h"
23
24/* MBCS converter data and state -------------------------------------------- */
25
/**
 * MBCS action codes for conversions to Unicode.
 * These values are in bits 23..20 of the state table entries.
 *
 * The enumerators are numbered implicitly (0..8), so the value of each
 * code is determined by its position in this list.
 */
enum {
    MBCS_STATE_VALID_DIRECT_16,     /* 0 */
    MBCS_STATE_VALID_DIRECT_20,     /* 1 */

    MBCS_STATE_FALLBACK_DIRECT_16,  /* 2 */
    MBCS_STATE_FALLBACK_DIRECT_20,  /* 3 */

    MBCS_STATE_VALID_16,            /* 4 */
    MBCS_STATE_VALID_16_PAIR,       /* 5 */

    MBCS_STATE_UNASSIGNED,          /* 6 */
    MBCS_STATE_ILLEGAL,             /* 7 */

    MBCS_STATE_CHANGE_ONLY          /* 8 */
};
45
/*
 * Macros for state table entries.
 *
 * Layout of a 32-bit (int32_t) entry, as implemented by the macros below:
 *   bit  31       1 for a final entry, 0 for a transition entry
 *                 (so final entries are negative, transition entries are >=0)
 *   bits 30..24   state number
 *   transition entries:  bits 23..0   offset
 *   final entries:       bits 23..20  action code
 *                        bits 19..0   value
 *
 * All of these are function-like macros: an argument may be evaluated more
 * than once, so do not pass expressions with side effects.
 */

/* Build and modify transition entries. */
#define MBCS_ENTRY_TRANSITION(state, offset) ((int32_t)(((int32_t)(state)<<24L)|(offset)))
#define MBCS_ENTRY_TRANSITION_SET_OFFSET(entry, offset) ((int32_t)(((entry)&0xff000000)|(offset)))
#define MBCS_ENTRY_TRANSITION_ADD_OFFSET(entry, offset) ((int32_t)((entry)+(offset)))

/* Build and modify final entries. */
#define MBCS_ENTRY_FINAL(state, action, value) ((int32_t)(0x80000000|((int32_t)(state)<<24L)|((int32_t)(action)<<20L)|(value)))
#define MBCS_ENTRY_SET_FINAL(entry) ((int32_t)((entry)|0x80000000))
#define MBCS_ENTRY_FINAL_SET_ACTION(entry, action) ((int32_t)(((entry)&0xff0fffff)|((int32_t)(action)<<20L)))
#define MBCS_ENTRY_FINAL_SET_VALUE(entry, value) ((int32_t)(((entry)&0xfff00000)|(value)))
#define MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, action, value) ((int32_t)(((entry)&0xff000000)|((int32_t)(action)<<20L)|(value)))

/* Replace the state number, keeping the final/transition bit and bits 23..0. */
#define MBCS_ENTRY_SET_STATE(entry, state) ((int32_t)(((entry)&0x80ffffff)|((int32_t)(state)<<24L)))

/* State number of either kind of entry. */
#define MBCS_ENTRY_STATE(entry) (((entry)>>24)&0x7f)

/* The sign of the entry distinguishes the two kinds. */
#define MBCS_ENTRY_IS_TRANSITION(entry) ((entry)>=0)
#define MBCS_ENTRY_IS_FINAL(entry) ((entry)<0)

/* Unmasked shift: only meaningful for transition (non-negative) entries. */
#define MBCS_ENTRY_TRANSITION_STATE(entry) ((entry)>>24)
#define MBCS_ENTRY_TRANSITION_OFFSET(entry) ((entry)&0xffffff)

#define MBCS_ENTRY_FINAL_STATE(entry) (((entry)>>24)&0x7f)
/*
 * Signed comparison: true only when bit 31 is set and bits 30..20
 * (state and action) are all 0.
 */
#define MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry) ((entry)<(int32_t)0x80100000)
#define MBCS_ENTRY_FINAL_ACTION(entry) (((entry)>>20)&0xf)
#define MBCS_ENTRY_FINAL_VALUE(entry) ((entry)&0xfffff)
/* Low 16 bits of the entry. */
#define MBCS_ENTRY_FINAL_VALUE_16(entry) ((uint16_t)(entry))
72
/*
 * fromUnicode lookups. The code point c is split into three indexes:
 *   stage 1: c>>10, stage 2: (c>>4)&0x3f, stage 3: c&0xf.
 * Arguments may be evaluated more than once; do not pass expressions with
 * side effects.
 */

/* single-byte fromUnicode: get the 16-bit result word */
#define MBCS_SINGLE_RESULT_FROM_U(table, results, c) ((results)[ (table)[ (table)[(c)>>10] +(((c)>>4)&0x3f) ] +((c)&0xf) ])

/*
 * multi-byte fromUnicode: get the 32-bit stage 2 entry
 * (table is indexed as 16-bit units for stage 1 and as 32-bit units for stage 2)
 */
#define MBCS_STAGE_2_FROM_U(table, c) (((const uint32_t *)(table))[ (table)[(c)>>10] +(((c)>>4)&0x3f) ])
/* test the per-code point flag for c in bits 31..16 of the stage 2 entry */
#define MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ( ((stage2Entry) & ((uint32_t)1<< (16+((c)&0xf)) )) !=0)

/*
 * stage 3 access: bits 15..0 of the stage 2 entry select a block of 16 results;
 * bytes is reinterpreted as an array of 2-byte or 4-byte results
 */
#define MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c) (((uint16_t *)(bytes))[16*(uint32_t)(uint16_t)(stage2Entry)+((c)&0xf)])
#define MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c) (((uint32_t *)(bytes))[16*(uint32_t)(uint16_t)(stage2Entry)+((c)&0xf)])

/* 3-byte results: yields a pointer to the first of the three bytes */
#define MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c) ((bytes)+(16*(uint32_t)(uint16_t)(stage2Entry)+((c)&0xf))*3)
84
85
/**
 * MBCS output types for conversions from Unicode.
 * These per-converter types determine the storage method in stage 3 of the lookup table,
 * mostly how many bytes are stored per entry.
 *
 * The numeric values are given in hexadecimal in the trailing comments.
 */
enum {
    MBCS_OUTPUT_1,          /* 0 */
    MBCS_OUTPUT_2,          /* 1 */
    MBCS_OUTPUT_3,          /* 2 */
    MBCS_OUTPUT_4,          /* 3 */

    MBCS_OUTPUT_3_EUC=8,    /* 8 */
    MBCS_OUTPUT_4_EUC,      /* 9 */

    MBCS_OUTPUT_2_SISO=12,  /* c */
    MBCS_OUTPUT_2_HZ        /* d */
};
103
104/**
105 * Fallbacks to Unicode are stored outside the normal state table and code point structures
106 * in a vector of items of this type. They are sorted by offset.
107 */
108typedef struct {
109 uint32_t offset;
110 UChar32 codePoint;
111} _MBCSToUFallback;
112
113/**
114 * This is the MBCS part of the UConverterTable union (a runtime data structure).
115 * It keeps all the per-converter data and points into the loaded mapping tables.
116 */
117typedef struct UConverterMBCSTable {
118 /* toUnicode */
119 uint8_t countStates;
120 uint32_t countToUFallbacks;
121
122 const int32_t (*stateTable)/*[countStates]*/[256];
123 int32_t (*swapLFNLStateTable)/*[countStates]*/[256]; /* for swaplfnl */
124 const uint16_t *unicodeCodeUnits/*[countUnicodeResults]*/;
125 const _MBCSToUFallback *toUFallbacks;
126
127 /* fromUnicode */
128 const uint16_t *fromUnicodeTable;
129 const uint8_t *fromUnicodeBytes;
130 uint8_t *swapLFNLFromUnicodeBytes; /* for swaplfnl */
131 uint32_t fromUBytesLength;
132 uint8_t outputType, unicodeMask;
133
134 /* converter name for swaplfnl */
135 char *swapLFNLName;
136} UConverterMBCSTable;
137
138/**
139 * MBCS data structure as part of a .cnv file:
140 *
141 * uint32_t [8]; -- 8 values:
142 * 0 MBCS version in UVersionInfo format (1.0.0.0)
143 * 1 countStates
144 * 2 countToUFallbacks
145 * 3 offsetToUCodeUnits (offsets are counted from the beginning of this header structure)
146 * 4 offsetFromUTable
147 * 5 offsetFromUBytes
148 * 6 flags, bits:
149 * 31.. 8 reserved
150 * 7.. 0 outputType
151 * 7 fromUBytesLength -- header.version 4.1 (ICU 2.4) and higher
152 *
153 * stateTable[countStates][256];
154 *
155 * struct { (fallbacks are sorted by offset)
156 * uint32_t offset;
157 * UChar32 codePoint;
158 * } toUFallbacks[countToUFallbacks];
159 *
160 * uint16_t unicodeCodeUnits[?]; (even number of units or padded)
161 *
162 * uint16_t fromUTable[0x440+?]; (32-bit-aligned)
163 *
164 * uint8_t fromUBytes[?];
165 */
166typedef struct {
167 UVersionInfo version;
168 uint32_t countStates,
169 countToUFallbacks,
170 offsetToUCodeUnits,
171 offsetFromUTable,
172 offsetFromUBytes,
173 flags,
174 fromUBytesLength;
175} _MBCSHeader;
176
177/**
178 * This is a simple version of _MBCSGetNextUChar() that is used
179 * by other converter implementations.
180 * It does not use state from the converter, nor error codes.
181 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
182 *
183 * Return value:
184 * U+fffe unassigned
185 * U+ffff illegal
186 * otherwise the Unicode code point
187 */
188U_CFUNC UChar32
189_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
190 const char **pSource, const char *sourceLimit,
191 UBool useFallback);
192
193/**
194 * This version of _MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages.
195 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
196 */
197U_CFUNC UChar32
198_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
199 uint8_t b, UBool useFallback);
200
/**
 * This macro version of _MBCSSingleSimpleGetNextUChar() gets a code point from a byte.
 * It works for single-byte, single-state codepages that only map
 * to and from BMP code points, and it always
 * returns fallback values.
 *
 * It reads the entry for byte b from row 0 of the state table and yields
 * the low 16 bits of that entry; the entry's action code is not checked.
 */
#define _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(sharedData, b) \
    ((UChar)MBCS_ENTRY_FINAL_VALUE_16((sharedData)->table->mbcs.stateTable[0][(uint8_t)(b)]))
209
210/**
211 * This is an internal function that allows other converter implementations
212 * to check whether a byte is a lead byte.
213 */
214U_CFUNC UBool
215_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte);
216
/**
 * This is a macro version of _MBCSIsLeadByte().
 * It is true when the entry for the byte in row 0 of the state table
 * is a transition entry.
 */
#define _MBCS_IS_LEAD_BYTE(sharedData, byte) \
    ((UBool)MBCS_ENTRY_IS_TRANSITION((sharedData)->table->mbcs.stateTable[0][(uint8_t)(byte)]))
220
221/**
222 * This is another simple conversion function for internal use by other
223 * conversion implementations.
224 * It does not use the converter state nor call callbacks.
225 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
226 *
227 * It converts one single Unicode code point into codepage bytes, encoded
228 * as one 32-bit value. The function returns the number of bytes in *pValue:
229 * 1..4 the number of bytes in *pValue
230 * 0 unassigned (*pValue undefined)
231 * -1 illegal (currently not used, *pValue undefined)
232 *
233 * *pValue will contain the resulting bytes with the last byte in bits 7..0,
234 * the second to last byte in bits 15..8, etc.
235 * Currently, the function assumes but does not check that 0<=c<=0x10ffff.
236 */
237U_CFUNC int32_t
238_MBCSFromUChar32(UConverterSharedData *sharedData,
239 UChar32 c, uint32_t *pValue,
240 UBool useFallback);
241
242/**
243 * This version of _MBCSFromUChar32() is optimized for single-byte codepages.
244 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
245 *
246 * It returns the codepage byte for the code point, or -1 if it is unassigned.
247 */
248U_CFUNC int32_t
249_MBCSSingleFromUChar32(UConverterSharedData *sharedData,
250 UChar32 c,
251 UBool useFallback);
252
253/**
254 * SBCS, DBCS, and EBCDIC_STATEFUL are replaced by MBCS, but
255 * we cheat a little about the type, returning the old types if appropriate.
256 */
257U_CFUNC UConverterType
258_MBCSGetType(const UConverter* converter);
259
260U_CFUNC void
261_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
262 UErrorCode *pErrorCode);
263U_CFUNC void
264_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
265 UErrorCode *pErrorCode);
266
267
268#endif