]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
374ca955 | 3 | /* |
46f4442e | 4 | ******************************************************************************* |
51004dcb | 5 | * Copyright (C) 2003-2013, International Business Machines |
46f4442e A |
6 | * Corporation and others. All Rights Reserved. |
7 | ******************************************************************************* | |
8 | * file name: ucm.h | |
f3c0d7a5 | 9 | * encoding: UTF-8 |
46f4442e A |
10 | * tab size: 8 (not used) |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2003jun20 | |
14 | * created by: Markus W. Scherer | |
15 | * | |
16 | * Definitions for the .ucm file parser and handler module ucm.c. | |
17 | */ | |
374ca955 A |
18 | |
19 | #ifndef __UCM_H__ | |
20 | #define __UCM_H__ | |
21 | ||
22 | #include "unicode/utypes.h" | |
23 | #include "ucnvmbcs.h" | |
24 | #include "ucnv_ext.h" | |
25 | #include "filestrm.h" | |
26 | #include <stdio.h> | |
27 | ||
73c04bcf A |
28 | #if !UCONFIG_NO_CONVERSION |
29 | ||
374ca955 A |
30 | U_CDECL_BEGIN |
31 | ||
46f4442e A |
32 | /* constants for UCMapping.moveFlag */ |
33 | enum { | |
34 | UCM_MOVE_TO_EXT=1, | |
35 | UCM_REMOVE_MAPPING=2 | |
36 | }; | |
37 | ||
374ca955 A |
38 | /* |
39 | * Per-mapping data structure | |
40 | * | |
41 | * u if uLen==1: Unicode code point | |
42 | * else index to uLen code points | |
43 | * b if bLen<=4: up to 4 bytes | |
44 | * else index to bLen bytes | |
45 | * uLen number of code points | |
46 | * bLen number of bytes | |
47 | * bIsMultipleChars indicates that the bytes contain more than one sequence | |
48 | * according to the state table | |
49 | * f flag for roundtrip (0), fallback (1), sub mapping (2), reverse fallback (3) | |
51004dcb A |
50 | * or "good one-way" mapping (4). |
51 | * Same values as in the source file after | | |
374ca955 A |
52 | */ |
53 | typedef struct UCMapping { | |
54 | UChar32 u; /* code point if uLen==1, else index into UCMTable.codePoints (see UCM_GET_CODE_POINTS) */ | |
55 | union { | |
729e4ab9 | 56 | uint32_t idx; /* used when bLen>4: index into UCMTable.bytes (see UCM_GET_BYTES) */ |
374ca955 A |
57 | uint8_t bytes[4]; /* used when bLen<=4: the bytes are stored inline */ |
58 | } b; | |
59 | int8_t uLen, bLen, f, moveFlag; /* f: precision flag 0..4 as described above; moveFlag: UCM_MOVE_TO_EXT and/or UCM_REMOVE_MAPPING */ | |
60 | } UCMapping; | |
61 | ||
46f4442e | 62 | /* constants for UCMTable.flagsType */ |
374ca955 A |
63 | enum { |
64 | UCM_FLAGS_INITIAL, /* no mappings parsed yet */ | |
65 | UCM_FLAGS_EXPLICIT, /* .ucm file has mappings with | fallback indicators */ | |
66 | UCM_FLAGS_IMPLICIT, /* .ucm file has mappings without | fallback indicators, later wins */ | |
67 | UCM_FLAGS_MIXED /* both implicit and explicit */ | |
68 | }; | |
69 | ||
70 | typedef struct UCMTable { | |
71 | UCMapping *mappings; /* array of per-mapping records */ | |
72 | int32_t mappingsCapacity, mappingsLength; /* presumably allocated vs. used element counts - confirm in ucm.c */ | |
73 | ||
74 | UChar32 *codePoints; /* storage for mappings with uLen!=1; indexed by UCMapping.u (see UCM_GET_CODE_POINTS) */ | |
75 | int32_t codePointsCapacity, codePointsLength; | |
76 | ||
77 | uint8_t *bytes; /* storage for mappings with bLen>4; indexed by UCMapping.b.idx (see UCM_GET_BYTES) */ | |
78 | int32_t bytesCapacity, bytesLength; | |
79 | ||
80 | /* index map for mapping by bytes first */ | |
81 | int32_t *reverseMap; | |
82 | ||
83 | uint8_t unicodeMask; | |
84 | int8_t flagsType; /* UCM_FLAGS_INITIAL etc. */ | |
85 | UBool isSorted; | |
86 | } UCMTable; | |
87 | ||
88 | enum { | |
89 | MBCS_STATE_FLAG_DIRECT=1, | |
90 | MBCS_STATE_FLAG_SURROGATES, | |
91 | ||
92 | MBCS_STATE_FLAG_READY=16 | |
93 | }; | |
94 | ||
95 | typedef struct UCMStates { | |
96 | int32_t stateTable[MBCS_MAX_STATE_COUNT][256]; | |
97 | uint32_t stateFlags[MBCS_MAX_STATE_COUNT], | |
98 | stateOffsetSum[MBCS_MAX_STATE_COUNT]; | |
99 | ||
100 | int32_t countStates, minCharLength, maxCharLength, countToUCodeUnits; | |
101 | int8_t conversionType, outputType; | |
102 | } UCMStates; | |
103 | ||
104 | typedef struct UCMFile { | |
105 | UCMTable *base, *ext; | |
106 | UCMStates states; | |
107 | ||
108 | char baseName[UCNV_MAX_CONVERTER_NAME_LENGTH]; | |
109 | } UCMFile; | |
110 | ||
111 | /* simple accesses ---------------------------------------------------------- */ | |
112 | ||
113 | #define UCM_GET_CODE_POINTS(t, m) \ | |
114 | (((m)->uLen==1) ? &(m)->u : (t)->codePoints+(m)->u) | |
115 | ||
116 | #define UCM_GET_BYTES(t, m) \ | |
729e4ab9 | 117 | (((m)->bLen<=4) ? (m)->b.bytes : (t)->bytes+(m)->b.idx) |
374ca955 A |
118 | |
119 | /* APIs --------------------------------------------------------------------- */ | |
120 | ||
121 | U_CAPI UCMFile * U_EXPORT2 | |
122 | ucm_open(void); | |
123 | ||
124 | U_CAPI void U_EXPORT2 | |
125 | ucm_close(UCMFile *ucm); | |
126 | ||
127 | U_CAPI UBool U_EXPORT2 | |
128 | ucm_parseHeaderLine(UCMFile *ucm, | |
129 | char *line, char **pKey, char **pValue); | |
130 | ||
131 | /* @return -1: illegal bytes; 0: suitable for base table; 1: needs to go into extension table */ | |
132 | U_CAPI int32_t U_EXPORT2 | |
133 | ucm_mappingType(UCMStates *baseStates, | |
134 | UCMapping *m, | |
135 | UChar32 codePoints[UCNV_EXT_MAX_UCHARS], | |
136 | uint8_t bytes[UCNV_EXT_MAX_BYTES]); | |
137 | ||
138 | /* add a mapping to the base or extension table as appropriate */ | |
139 | U_CAPI UBool U_EXPORT2 | |
140 | ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates, | |
141 | UCMapping *m, | |
142 | UChar32 codePoints[UCNV_EXT_MAX_UCHARS], | |
143 | uint8_t bytes[UCNV_EXT_MAX_BYTES]); | |
144 | ||
145 | U_CAPI UBool U_EXPORT2 | |
146 | ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates); | |
147 | ||
148 | ||
149 | U_CAPI UCMTable * U_EXPORT2 | |
150 | ucm_openTable(void); | |
151 | ||
152 | U_CAPI void U_EXPORT2 | |
153 | ucm_closeTable(UCMTable *table); | |
154 | ||
155 | U_CAPI void U_EXPORT2 | |
156 | ucm_resetTable(UCMTable *table); | |
157 | ||
158 | U_CAPI void U_EXPORT2 | |
159 | ucm_sortTable(UCMTable *t); | |
160 | ||
46f4442e A |
161 | /* |
162 | * Remove mappings with their move flag set from the base table | |
163 | * and move some of them (with UCM_MOVE_TO_EXT) to the extension table. | |
164 | */ | |
165 | U_CAPI void U_EXPORT2 | |
166 | ucm_moveMappings(UCMTable *base, UCMTable *ext); | |
167 | ||
374ca955 A |
168 | /** |
169 | * Read a table from a .ucm file, starting after the CHARMAP line | |
170 | * and continuing up to and including the END CHARMAP line. | |
171 | */ | |
172 | U_CAPI void U_EXPORT2 | |
173 | ucm_readTable(UCMFile *ucm, FileStream* convFile, | |
174 | UBool forBase, UCMStates *baseStates, | |
175 | UErrorCode *pErrorCode); | |
176 | ||
177 | /** | |
178 | * Check the validity of mappings against a base table's states; | |
179 | * necessary for extension-only tables that were read before their base tables. | |
180 | */ | |
181 | U_CAPI UBool U_EXPORT2 | |
182 | ucm_checkValidity(UCMTable *ext, UCMStates *baseStates); | |
183 | ||
184 | /** | |
185 | * Check a base table against an extension table. | |
186 | * Set moveTarget!=NULL if it is possible to move mappings from the base. | |
187 | * This is the case where base and extension tables are parsed from a single file | |
188 | * (moveTarget==ext) | |
189 | * or when delta file mappings are subtracted from a base table. | |
190 | * | |
191 | * When a base table cannot be modified because a delta file is parsed in makeconv, | |
192 | * then set moveTarget=NULL. | |
193 | * | |
194 | * If intersectBase is nonzero, then mappings that exist in the base table but not in | |
195 | * the extension table are moved to moveTarget instead of showing an error. | |
196 | * | |
197 | * Special mode: | |
198 | * If intersectBase==2 for a DBCS extension table, then SBCS mappings are | |
199 | * not moved out of the base unless their Unicode input requires it. | |
200 | * This helps ucmkbase generate base tables for DBCS-only extension .cnv files. | |
201 | * | |
202 | * For both tables in the same file, the extension table is automatically | |
203 | * built. | |
46f4442e | 204 | * For separate files, the extension file can use a complete mapping table (.ucm file), |
374ca955 A |
205 | * so that common mappings need not be stripped out manually. |
206 | * | |
207 | * | |
208 | * Sort both tables, and then for each mapping direction: | |
209 | * | |
210 | * If intersectBase is TRUE and the base table contains a mapping | |
211 | * that does not exist in the extension table, then this mapping is moved | |
212 | * to moveTarget. | |
213 | * | |
214 | * - otherwise - | |
215 | * | |
216 | * If the base table contains a mapping for which the input sequence is | |
217 | * the same as the extension input, then | |
218 | * - if the output is the same: remove the extension mapping | |
219 | * - else: error | |
220 | * | |
221 | * If the base table contains a mapping for which the input sequence is | |
222 | * a prefix of the extension input, then | |
223 | * - if moveTarget!=NULL: move the base mapping to the moveTarget table | |
224 | * - else: error | |
225 | * | |
226 | * @return FALSE in case of an irreparable error | |
227 | */ | |
228 | U_CAPI UBool U_EXPORT2 | |
229 | ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext, | |
230 | UCMTable *moveTarget, UBool intersectBase); | |
231 | ||
232 | U_CAPI void U_EXPORT2 | |
233 | ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode); | |
234 | ||
235 | U_CAPI void U_EXPORT2 | |
236 | ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f); | |
237 | ||
238 | ||
239 | U_CAPI void U_EXPORT2 | |
240 | ucm_addState(UCMStates *states, const char *s); | |
241 | ||
242 | U_CAPI void U_EXPORT2 | |
729e4ab9 | 243 | ucm_processStates(UCMStates *states, UBool ignoreSISOCheck); |
374ca955 A |
244 | |
245 | U_CAPI int32_t U_EXPORT2 | |
246 | ucm_countChars(UCMStates *states, | |
247 | const uint8_t *bytes, int32_t length); | |
248 | ||
249 | ||
250 | U_CAPI int8_t U_EXPORT2 | |
251 | ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps); | |
252 | ||
253 | U_CAPI UBool U_EXPORT2 | |
254 | ucm_parseMappingLine(UCMapping *m, | |
255 | UChar32 codePoints[UCNV_EXT_MAX_UCHARS], | |
256 | uint8_t bytes[UCNV_EXT_MAX_BYTES], | |
257 | const char *line); | |
258 | ||
259 | U_CAPI void U_EXPORT2 | |
260 | ucm_addMapping(UCMTable *table, | |
261 | UCMapping *m, | |
262 | UChar32 codePoints[UCNV_EXT_MAX_UCHARS], | |
263 | uint8_t bytes[UCNV_EXT_MAX_BYTES]); | |
264 | ||
265 | /* very makeconv-specific functions ----------------------------------------- */ | |
266 | ||
267 | /* finalize and optimize states after the toUnicode mappings are processed */ | |
268 | U_CAPI void U_EXPORT2 | |
269 | ucm_optimizeStates(UCMStates *states, | |
270 | uint16_t **pUnicodeCodeUnits, | |
271 | _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, | |
272 | UBool verbose); | |
273 | ||
274 | /* moved here because it is used inside ucmstate.c */ | |
275 | U_CAPI int32_t U_EXPORT2 | |
276 | ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, | |
277 | uint32_t offset); | |
278 | ||
279 | /* very rptp2ucm-specific functions ----------------------------------------- */ | |
280 | ||
281 | /* | |
282 | * Input: Separate tables with mappings from/to Unicode, | |
283 | * subchar and subchar1 (0 if none). | |
284 | * All mappings must have flag 0. | |
285 | * | |
286 | * Output: fromUTable will contain the union of mappings with the correct | |
287 | * precision flags, and be sorted. | |
288 | */ | |
289 | U_CAPI void U_EXPORT2 | |
290 | ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable, | |
291 | const uint8_t *subchar, int32_t subcharLength, | |
292 | uint8_t subchar1); | |
293 | ||
294 | U_CAPI UBool U_EXPORT2 | |
295 | ucm_separateMappings(UCMFile *ucm, UBool isSISO); | |
296 | ||
297 | U_CDECL_END | |
298 | ||
299 | #endif | |
73c04bcf A |
300 | |
301 | #endif | |
302 |