2 *******************************************************************************
3 * Copyright (C) 2003-2013, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
8 * tab size: 8 (not used)
11 * created on: 2003jun20
12 * created by: Markus W. Scherer
14 * Definitions for the .ucm file parser and handler module ucm.c.
20 #include "unicode/utypes.h"
26 #if !UCONFIG_NO_CONVERSION
30 /* constants for UCMapping.moveFlag */
37 * Per-mapping data structure
39 * u if uLen==1: Unicode code point
40 * else index to uLen code points
41 * b if bLen<=4: up to 4 bytes
42 * else index to bLen bytes
43 * uLen number of code points
44 * bLen number of bytes
45 * bIsMultipleChars indicates that the bytes contain more than one sequence
46 * according to the state table
47 * f flag for roundtrip (0), fallback (1), sub mapping (2), reverse fallback (3)
48 * or "good one-way" mapping (4).
49 * Same values as in the source file after |
/*
 * UCMapping: one mapping record between Unicode code points and codepage bytes.
 * NOTE(review): the embedded numbering jumps from 51 to 57, so the members
 * declared before the int8_t group (the accessor macros in this file read
 * m->u and m->b) and the closing of the typedef are not visible in this
 * excerpt. The field list comment also names bIsMultipleChars, which is not
 * among the members visible here -- confirm against the full header.
 */
51 typedef struct UCMapping
{
/*
 * uLen:     number of code points; when it is 1 the code point is held in u.
 * bLen:     length of the byte sequence; values <=4 mean the bytes are held
 *           inline in b, larger values mean b is an index.
 * f:        precision flag 0..4, same values as written after '|' in the
 *           source file.
 * moveFlag: takes the "constants for UCMapping.moveFlag"; those constants
 *           are not visible in this excerpt.
 */
57 int8_t uLen
, bLen
, f
, moveFlag
;
60 /* constants for UCMTable.flagsType */
/*
 * These record whether the mappings parsed so far carried explicit '|'
 * fallback indicators, none at all, or a mixture of both.
 * NOTE(review): the enclosing enum header and closing brace are not visible
 * in this excerpt.
 */
62 UCM_FLAGS_INITIAL
, /* no mappings parsed yet */
63 UCM_FLAGS_EXPLICIT
, /* .ucm file has mappings with | fallback indicators */
64 UCM_FLAGS_IMPLICIT
, /* .ucm file has mappings without | fallback indicators, later wins */
65 UCM_FLAGS_MIXED
/* both implicit and explicit */
/*
 * UCMTable: container for a set of mappings.
 * Each Capacity/Length pair below presumably tracks allocated versus used
 * entries of a growable array.
 * NOTE(review): the arrays themselves (the accessor macros in this file read
 * t->codePoints and t->bytes) and the closing of the typedef are not visible
 * in this excerpt; confirm against the full header.
 */
68 typedef struct UCMTable
{
70 int32_t mappingsCapacity
, mappingsLength
;
73 int32_t codePointsCapacity
, codePointsLength
;
76 int32_t bytesCapacity
, bytesLength
;
78 /* index map for mapping by bytes first */
/* NOTE(review): the member that the comment above describes is not visible here. */
82 int8_t flagsType
; /* UCM_FLAGS_INITIAL etc. */
/*
 * State flag constants (MBCS_STATE_FLAG_*).
 * DIRECT is explicitly 1 and READY is explicitly 16; SURROGATES has no
 * initializer, so as an enumerator it would take the value following DIRECT.
 * NOTE(review): the enum header and closing brace, and any constant between
 * SURROGATES and READY (the embedded numbering skips 89), are not visible in
 * this excerpt.
 */
87 MBCS_STATE_FLAG_DIRECT
=1,
88 MBCS_STATE_FLAG_SURROGATES
,
90 MBCS_STATE_FLAG_READY
=16
/*
 * UCMStates: state-table data for a codepage.
 * NOTE(review): the closing of the typedef is not visible in this excerpt.
 */
93 typedef struct UCMStates
{
/*
 * One row of 256 int32_t entries per state, up to MBCS_MAX_STATE_COUNT rows.
 * Presumably the 256 columns are indexed by input byte value -- confirm.
 */
94 int32_t stateTable
[MBCS_MAX_STATE_COUNT
][256];
/* Per-state arrays, one uint32_t per state for each of the two names. */
95 uint32_t stateFlags
[MBCS_MAX_STATE_COUNT
],
96 stateOffsetSum
[MBCS_MAX_STATE_COUNT
];
/*
 * Counters and character-length bounds. Presumably countStates is the number
 * of rows of stateTable in use -- confirm against ucm.c.
 */
98 int32_t countStates
, minCharLength
, maxCharLength
, countToUCodeUnits
;
/* NOTE(review): the value sets for these two fields are not visible here. */
99 int8_t conversionType
, outputType
;
/*
 * UCMFile: groups a base table and an extension table plus a base name.
 * NOTE(review): the embedded numbering jumps from 103 to 106, so further
 * members and the closing of the typedef may be missing from this excerpt.
 */
102 typedef struct UCMFile
{
/* base: base mapping table; ext: extension mapping table. */
103 UCMTable
*base
, *ext
;
/* Fixed-size buffer of UCNV_MAX_CONVERTER_NAME_LENGTH chars for the base name. */
106 char baseName
[UCNV_MAX_CONVERTER_NAME_LENGTH
];
109 /* simple accesses ---------------------------------------------------------- */
/*
 * UCM_GET_CODE_POINTS(t, m): pointer to the code point(s) of mapping m.
 * When m->uLen is 1 the single code point is stored in m->u itself, so the
 * address of that member is used; otherwise m->u is an offset into
 * t->codePoints.
 * m is evaluated more than once, so do not pass an argument with side effects.
 */
111 #define UCM_GET_CODE_POINTS(t, m) \
112 (((m)->uLen==1) ? &(m)->u : (t)->codePoints+(m)->u)
/*
 * UCM_GET_BYTES(t, m): pointer to the bytes of mapping m.
 * When m->bLen is at most 4 the bytes are stored inline in m->b.bytes;
 * otherwise m->b.idx is an offset into t->bytes.
 * m is evaluated more than once, so do not pass an argument with side effects.
 */
114 #define UCM_GET_BYTES(t, m) \
115 (((m)->bLen<=4) ? (m)->b.bytes : (t)->bytes+(m)->b.idx)
117 /* APIs --------------------------------------------------------------------- */
/*
 * Declaration fragment returning a UCMFile pointer.
 * NOTE(review): the function name and parameter list are not visible in this
 * excerpt (the embedded numbering jumps from 119 to 122). Presumably this is
 * the constructor paired with ucm_close -- confirm against the full header.
 */
119 U_CAPI UCMFile
* U_EXPORT2
/*
 * ucm_close: takes a UCMFile pointer and returns nothing.
 * NOTE(review): presumably releases the UCMFile and what it owns; the
 * implementation is not visible here, so NULL handling is unconfirmed.
 */
122 U_CAPI
void U_EXPORT2
123 ucm_close(UCMFile
*ucm
);
/*
 * ucm_parseHeaderLine: processes one header line of a .ucm file for ucm.
 *
 * line is a non-const char pointer, so the callee is allowed to modify the
 * buffer. pKey and pValue are pointer-to-pointer outputs.
 * NOTE(review): presumably *pKey and *pValue are set to the key and value
 * found in line, and the UBool result tells whether the line was a header
 * line -- confirm against ucm.c.
 */
125 U_CAPI UBool U_EXPORT2
126 ucm_parseHeaderLine(UCMFile
*ucm
,
127 char *line
, char **pKey
, char **pValue
);
129 /* @return -1 illegal bytes 0 suitable for base table 1 needs to go into extension table */
/*
 * ucm_mappingType: classifies a mapping against the states in baseStates.
 * codePoints holds up to UCNV_EXT_MAX_UCHARS code points and bytes holds up
 * to UCNV_EXT_MAX_BYTES bytes.
 * NOTE(review): the embedded numbering jumps from 131 to 133, so a parameter
 * line appears to be missing between baseStates and codePoints; confirm the
 * full parameter list against the upstream header.
 */
130 U_CAPI
int32_t U_EXPORT2
131 ucm_mappingType(UCMStates
*baseStates
,
133 UChar32 codePoints
[UCNV_EXT_MAX_UCHARS
],
134 uint8_t bytes
[UCNV_EXT_MAX_BYTES
]);
136 /* add a mapping to the base or extension table as appropriate */
/*
 * ucm_addMappingAuto: the target tables are the ones held by ucm; forBase and
 * baseStates are inputs to the base-versus-extension decision.
 * Returns a UBool; presumably FALSE signals that the mapping was rejected --
 * confirm against ucm.c.
 * NOTE(review): the embedded numbering jumps from 138 to 140, so a parameter
 * line appears to be missing between baseStates and codePoints.
 */
137 U_CAPI UBool U_EXPORT2
138 ucm_addMappingAuto(UCMFile
*ucm
, UBool forBase
, UCMStates
*baseStates
,
140 UChar32 codePoints
[UCNV_EXT_MAX_UCHARS
],
141 uint8_t bytes
[UCNV_EXT_MAX_BYTES
]);
/*
 * ucm_addMappingFromLine: takes one text line (read-only) plus the same
 * ucm/forBase/baseStates arguments as ucm_addMappingAuto.
 * NOTE(review): presumably parses the line as a mapping and adds it to ucm,
 * returning FALSE on a parse or validity error -- confirm against ucm.c.
 */
143 U_CAPI UBool U_EXPORT2
144 ucm_addMappingFromLine(UCMFile
*ucm
, const char *line
, UBool forBase
, UCMStates
*baseStates
);
/*
 * Declaration fragment returning a UCMTable pointer.
 * NOTE(review): the function name and parameter list are not visible in this
 * excerpt (the embedded numbering jumps from 147 to 150). Presumably this is
 * the constructor paired with ucm_closeTable -- confirm against the full
 * header.
 */
147 U_CAPI UCMTable
* U_EXPORT2
/*
 * ucm_closeTable: takes a UCMTable pointer and returns nothing.
 * NOTE(review): presumably releases the table and its arrays; the
 * implementation is not visible here.
 */
150 U_CAPI
void U_EXPORT2
151 ucm_closeTable(UCMTable
*table
);
/*
 * ucm_resetTable: takes a UCMTable pointer and returns nothing.
 * NOTE(review): presumably empties the table so it can be reused; whether
 * allocated storage is kept is not visible here -- confirm against ucm.c.
 */
153 U_CAPI
void U_EXPORT2
154 ucm_resetTable(UCMTable
*table
);
/*
 * ucm_sortTable: takes a UCMTable pointer and returns nothing.
 * NOTE(review): the sort key and ordering are not visible in this header;
 * see ucm.c.
 */
156 U_CAPI
void U_EXPORT2
157 ucm_sortTable(UCMTable
*t
);
160 * Remove mappings with their move flag set from the base table
161 * and move some of them (with UCM_MOVE_TO_EXT) to the extension table.
/*
 * ucm_moveMappings: removes from base the mappings whose move flag is set,
 * and moves those flagged with UCM_MOVE_TO_EXT into ext.
 * Returns nothing.
 */
163 U_CAPI
void U_EXPORT2
164 ucm_moveMappings(UCMTable
*base
, UCMTable
*ext
);
167 * Read a table from a .ucm file, from after the CHARMAP line to
168 * including the END CHARMAP line.
/*
 * ucm_readTable: reads a mapping table from convFile into ucm, starting after
 * the CHARMAP line and continuing through the END CHARMAP line.
 * forBase and baseStates are passed along with each table read.
 * Returns nothing; pErrorCode is the only error channel in the signature.
 * NOTE(review): presumably follows the usual in/out UErrorCode convention --
 * confirm against ucm.c.
 */
170 U_CAPI
void U_EXPORT2
171 ucm_readTable(UCMFile
*ucm
, FileStream
* convFile
,
172 UBool forBase
, UCMStates
*baseStates
,
173 UErrorCode
*pErrorCode
);
176 * Check the validity of mappings against a base table's states;
177 * necessary for extension-only tables that were read before their base tables.
/*
 * ucm_checkValidity: checks the mappings in ext against the states of a base
 * table (baseStates). Needed for extension-only tables that were read before
 * their base tables.
 * NOTE(review): presumably returns FALSE when an invalid mapping is found --
 * confirm against ucm.c.
 */
179 U_CAPI UBool U_EXPORT2
180 ucm_checkValidity(UCMTable
*ext
, UCMStates
*baseStates
);
183 * Check a base table against an extension table.
184 * Set the moveTarget!=NULL if it is possible to move mappings from the base.
185 * This is the case where base and extension tables are parsed from a single file
187 * or when delta file mappings are subtracted from a base table.
189 * When a base table cannot be modified because a delta file is parsed in makeconv,
190 * then set moveTarget=NULL.
192 * if(intersectBase) then mappings that exist in the base table but not in
193 * the extension table are moved to moveTarget instead of showing an error.
196 * If intersectBase==2 for a DBCS extension table, then SBCS mappings are
197 * not moved out of the base unless their Unicode input requires it.
198 * This helps ucmkbase generate base tables for DBCS-only extension .cnv files.
200 * For both tables in the same file, the extension table is automatically built.
202 * For separate files, the extension file can use a complete mapping table (.ucm file),
203 * so that common mappings need not be stripped out manually.
206 * Sort both tables, and then for each mapping direction:
208 * If intersectBase is TRUE and the base table contains a mapping
209 * that does not exist in the extension table, then this mapping is moved to moveTarget.
214 * If the base table contains a mapping for which the input sequence is
215 * the same as the extension input, then
216 * - if the output is the same: remove the extension mapping
219 * If the base table contains a mapping for which the input sequence is
220 * a prefix of the extension input, then
221 * - if moveTarget!=NULL: move the base mapping to the moveTarget table
224 * @return FALSE in case of an irreparable error
/*
 * ucm_checkBaseExt: checks the base table against the ext table.
 *
 * moveTarget:    non-NULL when mappings may be moved out of base; NULL when
 *                base must not be modified.
 * intersectBase: when set, mappings present in base but absent from ext are
 *                moved to moveTarget instead of being reported as errors;
 *                the value 2 selects the special DBCS handling described in
 *                the preceding comment.
 * Returns FALSE in case of an irreparable error.
 */
226 U_CAPI UBool U_EXPORT2
227 ucm_checkBaseExt(UCMStates
*baseStates
, UCMTable
*base
, UCMTable
*ext
,
228 UCMTable
*moveTarget
, UBool intersectBase
);
/*
 * ucm_printTable: takes a table, a stdio stream f and a byUnicode switch;
 * returns nothing.
 * NOTE(review): presumably writes every mapping of table to f, with
 * byUnicode choosing the ordering -- confirm against ucm.c.
 */
230 U_CAPI
void U_EXPORT2
231 ucm_printTable(UCMTable
*table
, FILE *f
, UBool byUnicode
);
/*
 * ucm_printMapping: takes a table, one mapping m and a stdio stream f;
 * returns nothing.
 * NOTE(review): presumably writes the single mapping m to f; table is needed
 * because a mapping may only hold indexes into the table's code point and
 * byte arrays (see UCM_GET_CODE_POINTS and UCM_GET_BYTES).
 */
233 U_CAPI
void U_EXPORT2
234 ucm_printMapping(UCMTable
*table
, UCMapping
*m
, FILE *f
);
/*
 * ucm_addState: takes the states object and a read-only string s; returns
 * nothing.
 * NOTE(review): presumably parses s as one state-table row description and
 * appends it to states -- the syntax of s is not visible here.
 */
237 U_CAPI
void U_EXPORT2
238 ucm_addState(UCMStates
*states
, const char *s
);
/*
 * ucm_processStates: takes the states object and an ignoreSISOCheck switch;
 * returns nothing.
 * NOTE(review): what processing is done, and which check ignoreSISOCheck
 * disables, is not visible in this header -- see the implementation.
 */
240 U_CAPI
void U_EXPORT2
241 ucm_processStates(UCMStates
*states
, UBool ignoreSISOCheck
);
/*
 * ucm_countChars: takes the states object and a read-only byte buffer of the
 * given length; returns an int32_t.
 * NOTE(review): presumably returns the number of characters that the bytes
 * form according to the state table, with a negative value for illegal
 * input -- confirm against the implementation.
 */
243 U_CAPI
int32_t U_EXPORT2
244 ucm_countChars(UCMStates
*states
,
245 const uint8_t *bytes
, int32_t length
);
/*
 * ucm_parseBytes: bytes is an output buffer of UCNV_EXT_MAX_BYTES entries,
 * line is read-only text, and ps is a pointer-to-pointer into const text.
 * Returns an int8_t.
 * NOTE(review): presumably parses byte notation starting at *ps, advances
 * *ps past it, and returns the number of bytes stored; line is presumably
 * only used for error reporting -- confirm against ucm.c.
 */
248 U_CAPI
int8_t U_EXPORT2
249 ucm_parseBytes(uint8_t bytes
[UCNV_EXT_MAX_BYTES
], const char *line
, const char **ps
);
/*
 * ucm_parseMappingLine: fills the mapping m together with the caller-provided
 * codePoints and bytes buffers; returns a UBool.
 * NOTE(review): this declaration is truncated in this excerpt -- it ends on
 * a comma after the bytes parameter and the embedded numbering jumps from
 * 254 to 257, so the remaining parameter(s) and the closing parenthesis are
 * missing. Confirm against the upstream header.
 */
251 U_CAPI UBool U_EXPORT2
252 ucm_parseMappingLine(UCMapping
*m
,
253 UChar32 codePoints
[UCNV_EXT_MAX_UCHARS
],
254 uint8_t bytes
[UCNV_EXT_MAX_BYTES
],
/*
 * ucm_addMapping: takes the target table plus codePoints and bytes buffers;
 * returns nothing.
 * NOTE(review): the embedded numbering jumps from 258 to 260, so a parameter
 * line appears to be missing between table and codePoints. Presumably it adds
 * one mapping to table -- confirm against the upstream header.
 */
257 U_CAPI
void U_EXPORT2
258 ucm_addMapping(UCMTable
*table
,
260 UChar32 codePoints
[UCNV_EXT_MAX_UCHARS
],
261 uint8_t bytes
[UCNV_EXT_MAX_BYTES
]);
263 /* very makeconv-specific functions ----------------------------------------- */
265 /* finalize and optimize states after the toUnicode mappings are processed */
/*
 * ucm_optimizeStates: pUnicodeCodeUnits is a pointer to a uint16_t pointer,
 * and toUFallbacks/countToUFallbacks describe an array of _MBCSToUFallback
 * entries and its length.
 * NOTE(review): this declaration is truncated in this excerpt -- it ends on
 * a comma after countToUFallbacks, so the remaining parameter(s) and the
 * closing parenthesis are missing. Confirm against the upstream header.
 */
266 U_CAPI
void U_EXPORT2
267 ucm_optimizeStates(UCMStates
*states
,
268 uint16_t **pUnicodeCodeUnits
,
269 _MBCSToUFallback
*toUFallbacks
, int32_t countToUFallbacks
,
272 /* moved here because it is used inside ucmstate.c */
/*
 * ucm_findFallback: searches an array of _MBCSToUFallback entries of length
 * countToUFallbacks; returns an int32_t.
 * NOTE(review): this declaration is truncated in this excerpt -- it ends on
 * a comma after countToUFallbacks, so the search key parameter and the
 * closing parenthesis are missing. The meaning of the return value
 * (presumably an index, negative if not found) is unconfirmed.
 */
273 U_CAPI
int32_t U_EXPORT2
274 ucm_findFallback(_MBCSToUFallback
*toUFallbacks
, int32_t countToUFallbacks
,
277 /* very rptp2ucm-specific functions ----------------------------------------- */
280 * Input: Separate tables with mappings from/to Unicode,
281 * subchar and subchar1 (0 if none).
282 * All mappings must have flag 0.
284 * Output: fromUTable will contain the union of mappings with the correct
285 * precision flags, and be sorted.
/*
 * ucm_mergeTables: inputs are separate from-Unicode and to-Unicode tables
 * whose mappings all have flag 0, plus subchar bytes; on output fromUTable
 * holds the sorted union of the mappings with the correct precision flags.
 * NOTE(review): this declaration is truncated in this excerpt -- it ends on
 * a comma after subcharLength, so the remaining parameter(s) (the preceding
 * comment mentions subchar1) and the closing parenthesis are missing.
 */
287 U_CAPI
void U_EXPORT2
288 ucm_mergeTables(UCMTable
*fromUTable
, UCMTable
*toUTable
,
289 const uint8_t *subchar
, int32_t subcharLength
,
/*
 * ucm_separateMappings: takes a UCMFile and an isSISO switch; returns a
 * UBool.
 * NOTE(review): presumably splits the mappings of ucm between its base and
 * ext tables and returns FALSE on error -- the criteria and the effect of
 * isSISO are not visible in this header; see the implementation.
 */
292 U_CAPI UBool U_EXPORT2
293 ucm_separateMappings(UCMFile
*ucm
, UBool isSISO
);