]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
57a6839d A |
3 | /* |
4 | ******************************************************************************* | |
b331163b | 5 | * Copyright (C) 2013-2015, International Business Machines |
57a6839d A |
6 | * Corporation and others. All Rights Reserved. |
7 | ******************************************************************************* | |
8 | * collationdatareader.h | |
9 | * | |
10 | * created on: 2013feb07 | |
11 | * created by: Markus W. Scherer | |
12 | */ | |
13 | ||
14 | #ifndef __COLLATIONDATAREADER_H__ | |
15 | #define __COLLATIONDATAREADER_H__ | |
16 | ||
17 | #include "unicode/utypes.h" | |
18 | ||
19 | #if !UCONFIG_NO_COLLATION | |
20 | ||
21 | #include "unicode/udata.h" | |
22 | ||
23 | struct UDataMemory; | |
24 | ||
25 | U_NAMESPACE_BEGIN | |
26 | ||
27 | struct CollationTailoring; | |
28 | ||
29 | /** | |
30 | * Collation binary data reader: all-static entry points plus the enum that names the slots of the int32_t indexes[] at the start of the data. | |
31 | */ | |
32 | struct U_I18N_API CollationDataReader /* all static */ { | |
33 | // The following constants are also copied into source/common/ucol_swp.cpp. | |
34 | // Keep them in sync! | |
35 | enum { | |
36 | /** | |
37 | * Number of int32_t indexes. | |
38 | * | |
39 | * Can be 2 if there are only options. | |
40 | * Can be 7 or 8 if there are only options and a script reordering. | |
41 | * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0. | |
42 | */ | |
43 | IX_INDEXES_LENGTH, // 0 | |
44 | /** | |
45 | * Bits 31..24: numericPrimary, for numeric collation | |
46 | * 23..16: fast Latin format version (0 = no fast Latin table) | |
47 | * 15.. 0: options bit set | |
48 | */ | |
49 | IX_OPTIONS, | |
50 | IX_RESERVED2, | |
51 | IX_RESERVED3, | |
52 | ||
53 | /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */ | |
54 | IX_JAMO_CE32S_START, // 4 | |
55 | ||
56 | // Byte offsets from the start of the data, after the generic header. | |
57 | // The indexes[] are at byte offset 0, other data follows. | |
58 | // Each data item is aligned properly. | |
59 | // The data items should be in descending order of unit size, | |
60 | // to minimize the need for padding. | |
61 | // Each item's byte length is given by the difference between its offset and | |
62 | // the next index/offset value. | |
63 | /** Byte offset to int32_t reorderCodes[] (empty in root). */ | |
64 | IX_REORDER_CODES_OFFSET, | |
65 | /** | |
66 | * Byte offset to uint8_t reorderTable[]. | |
67 | * Empty table if <256 bytes (padding only). | |
68 | * Otherwise 256 bytes or more (with padding). | |
69 | */ | |
70 | IX_REORDER_TABLE_OFFSET, | |
71 | /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */ | |
72 | IX_TRIE_OFFSET, | |
73 | ||
74 | IX_RESERVED8_OFFSET, // 8 | |
75 | /** Byte offset to int64_t ces[] (64-bit CEs and expansions). */ | |
76 | IX_CES_OFFSET, | |
77 | IX_RESERVED10_OFFSET, | |
78 | /** Byte offset to uint32_t ce32s[]. */ | |
79 | IX_CE32S_OFFSET, | |
80 | ||
81 | /** Byte offset to uint32_t rootElements[] (empty in all tailorings). */ | |
82 | IX_ROOT_ELEMENTS_OFFSET, // 12 | |
83 | /** Byte offset to UChar *contexts[] (serialized UCharsTrie structures). */ | |
84 | IX_CONTEXTS_OFFSET, | |
85 | /** Byte offset to uint16_t [] with serialized unsafeBackwardSet. */ | |
86 | IX_UNSAFE_BWD_OFFSET, | |
87 | /** Byte offset to uint16_t fastLatinTable[] (optional). */ | |
88 | IX_FAST_LATIN_TABLE_OFFSET, | |
89 | ||
90 | /** Byte offset to uint16_t scripts[] (empty in all tailorings). */ | |
91 | IX_SCRIPTS_OFFSET, // 16 | |
92 | /** | |
93 | * Byte offset to UBool compressibleBytes[]. | |
94 | * Empty table if <256 bytes (padding only). | |
95 | * Otherwise 256 bytes or more (with padding). | |
96 | */ | |
97 | IX_COMPRESSIBLE_BYTES_OFFSET, | |
98 | IX_RESERVED18_OFFSET, | |
99 | IX_TOTAL_SIZE | |
100 | }; | |
101 | ||
102 | static void read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength, | |
103 | CollationTailoring &tailoring, UErrorCode &errorCode); | |
104 | ||
105 | static UBool U_CALLCONV | |
106 | isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo); | |
107 | ||
108 | private: | |
109 | CollationDataReader(); // private: this all-static struct is not meant to be instantiated | |
110 | }; | |
111 | ||
112 | /* | |
113 | * Format of collation data (ucadata.icu, binary data in coll/ *.res files). | |
b331163b | 114 | * Format version 5. |
57a6839d A |
115 | * |
116 | * The root collation data is stored in the ucadata.icu file. | |
117 | * Tailorings are stored inside .res resource bundle files, with a complete file header. | |
118 | * | |
119 | * Collation data begins with a standard ICU data file header | |
120 | * (DataHeader, see ucmndata.h and unicode/udata.h). | |
121 | * The UDataInfo.dataVersion field contains the UCA and other version numbers, | |
122 | * see the comments for CollationTailoring.version. | |
123 | * | |
124 | * After the header, the file contains the following parts. | |
125 | * Constants are defined as enum values of the CollationDataReader class. | |
126 | * See also the Collation class. | |
127 | * | |
128 | * int32_t indexes[indexesLength]; | |
129 | * The indexes array has variable length. | |
130 | * Some tailorings only need the length and the options, | |
131 | * others only add reorderCodes and the reorderTable, | |
132 | * some need to store mappings. | |
133 | * Only as many indexes are stored as needed to read all of the data. | |
134 | * | |
135 | * Index 0: indexesLength | |
136 | * Index 1: numericPrimary, CollationFastLatin::VERSION, and options: see IX_OPTIONS | |
137 | * Index 2..3: Unused/reserved/0. | |
138 | * Index 4: Index into the ce32s array where the CE32s of the conjoining Jamo | |
139 | * are stored in a short, contiguous part of the ce32s array. | |
140 | * | |
141 | * Indexes 5..19 are byte offsets in ascending order. | |
142 | * Each byte offset marks the start of the next part in the data file, | |
143 | * and the end of the previous one. | |
144 | * When two consecutive byte offsets are the same (or too short), | |
145 | * then the corresponding part is empty. | |
146 | * Byte offsets are offsets from after the header, | |
147 | * that is, from the beginning of the indexes[]. | |
148 | * Each part starts at an offset with proper alignment for its data. | |
149 | * If necessary, the previous part may include padding bytes to achieve this alignment. | |
150 | * The last byte offset that is stored in the indexes indicates the total size of the data | |
151 | * (starting with the indexes). | |
152 | * | |
153 | * int32_t reorderCodes[]; -- empty in root | |
154 | * The list of script and reordering codes. | |
155 | * | |
b331163b A |
156 | * Beginning with format version 5, this array may optionally |
157 | * have trailing entries with a full list of reorder ranges | |
158 | * as described for CollationSettings::reorderRanges. | |
159 | * | |
160 | * Script or reorder codes are first and do not exceed 16-bit values. | |
161 | * Range limits are stored in the upper 16 bits, and are never 0. | |
162 | * Split this array into reorder codes and ranges at the first entry | |
163 | * with non-zero upper 16 bits. | |
164 | * | |
165 | * If the ranges are missing but needed for split-reordered primary lead bytes, | |
166 | * then they are regenerated at load time. | |
167 | * | |
57a6839d A |
168 | * uint8_t reorderTable[256]; -- empty in root; can be longer to include padding bytes |
169 | * Primary-weight lead byte permutation table. | |
170 | * Normally present when the reorderCodes are, but can be built at load time. | |
171 | * | |
b331163b A |
172 | * Beginning with format version 5, a 0 entry at a non-zero index |
173 | * (which is otherwise an illegal value) | |
174 | * means that the primary lead byte is "split" | |
175 | * (there are different offsets for primaries that share that lead byte) | |
176 | * and the reordering offset must be determined via the reorder ranges | |
177 | * that are either stored as part of the reorderCodes array | |
178 | * or regenerated at load time. | |
179 | * | |
57a6839d A |
180 | * UTrie2 trie; -- see utrie2_impl.h and utrie2.h |
181 | * The trie holds the main collation data. Each code point is mapped to a 32-bit value. | |
182 | * It encodes a simple collation element (CE) in compact form, unless bits 7..6 are both set, | |
183 | * in which case it is a special CE32 and contains a 4-bit tag and further data. | |
184 | * See the Collation class for details. | |
185 | * | |
186 | * The trie has a value for each lead surrogate code unit with some bits encoding | |
187 | * collective properties of the 1024 supplementary characters whose UTF-16 form starts with | |
188 | * the lead surrogate. See Collation::LEAD_SURROGATE_TAG. | |
189 | * | |
190 | * int64_t ces[]; | |
191 | * 64-bit CEs and expansions that cannot be stored in a more compact form. | |
192 | * | |
193 | * uint32_t ce32s[]; | |
194 | * CE32s for expansions in compact form, and for characters whose trie values | |
195 | * contain special data. | |
196 | * | |
197 | * uint32_t rootElements[]; -- empty in all tailorings | |
198 | * Compact storage for all of the CEs that occur in the root collation. | |
199 | * See the CollationRootElements class. | |
200 | * | |
201 | * UChar *contexts[]; | |
202 | * Serialized UCharsTrie structures with prefix (pre-context) and contraction mappings. | |
203 | * | |
204 | * uint16_t unsafeBackwardSet[]; -- see UnicodeSet::serialize() | |
205 | * Serialized form of characters that are unsafe when iterating backwards, | |
206 | * and at the end of an identical string prefix. | |
207 | * Back up to a safe character. | |
208 | * Lead surrogates are "unsafe" when any of their corresponding supplementary | |
209 | * code points are unsafe. | |
210 | * Does not include [:^lccc=0:][:^tccc=0:]. | |
211 | * For each tailoring, the root unsafeBackwardSet is subtracted. | |
212 | * (As a result, in many tailorings no set needs to be stored.) | |
213 | * | |
214 | * uint16_t fastLatinTable[]; | |
215 | * Optional optimization for Latin text. | |
216 | * See the CollationFastLatin class. | |
217 | * | |
218 | * uint16_t scripts[]; -- empty in all tailorings | |
b331163b A |
219 | * Format version 5: |
220 | * uint16_t numScripts; | |
221 | * uint16_t scriptsIndex[numScripts+16]; | |
222 | * uint16_t scriptStarts[]; | |
223 | * See CollationData::numScripts etc. | |
224 | * | |
225 | * Format version 4: | |
57a6839d A |
226 | * Table of the reordering groups with their first and last lead bytes, |
227 | * and their script and reordering codes. | |
228 | * See CollationData::scripts. | |
229 | * | |
230 | * UBool compressibleBytes[]; -- empty in all tailorings | |
231 | * Flag for getSortKey(), indicating primary weight lead bytes that are compressible. | |
b331163b A |
232 | * |
233 | * ----------------- | |
234 | * Changes for formatVersion 5 (ICU 55) | |
235 | * | |
236 | * Reordering moves single scripts, not groups of scripts. | |
237 | * Reorder ranges are optionally appended to the reorderCodes, | |
238 | * and a 0 entry in the reorderTable indicates a split lead byte. | |
239 | * The scripts data has a new format. | |
240 | * | |
241 | * The rootElements may contain secondary and tertiary weights below common=05. | |
242 | * (Used for small Hiragana letters.) | |
243 | * Where it occurs, there is also an explicit unit with common secondary & tertiary weights. | |
244 | * There are no other data structure changes, but builder code needs to be able to handle such data. | |
245 | * | |
246 | * The collation element for the merge separator code point U+FFFE | |
247 | * does not necessarily have special, unique secondary/tertiary weights any more. | |
57a6839d A |
248 | */ |
249 | ||
250 | U_NAMESPACE_END | |
251 | ||
252 | #endif // !UCONFIG_NO_COLLATION | |
253 | #endif // __COLLATIONDATAREADER_H__ |