2 *******************************************************************************
3 * Copyright (C) 2013-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * collationdatareader.h
8 * created on: 2013feb07
9 * created by: Markus W. Scherer
12 #ifndef __COLLATIONDATAREADER_H__
13 #define __COLLATIONDATAREADER_H__
15 #include "unicode/utypes.h"
17 #if !UCONFIG_NO_COLLATION
19 #include "unicode/udata.h"
// Forward declaration only: this header refers to CollationTailoring solely
// through a pointer and a reference (the parameters of
// CollationDataReader::read()), so the full definition is not needed here.
25 struct CollationTailoring
;
28 * Collation binary data reader.
30 struct U_I18N_API CollationDataReader
/* all static */ {
31 // The following constants are also copied into source/common/ucol_swp.cpp.
35 * Number of int32_t indexes.
37 * Can be 2 if there are only options.
38 * Can be 7 or 8 if there are only options and a script reordering.
39 * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0.
41 IX_INDEXES_LENGTH
, // 0
43 * Bits 31..24: numericPrimary, for numeric collation
44 * 23..16: fast Latin format version (0 = no fast Latin table)
45 * 15.. 0: options bit set
51 /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */
52 IX_JAMO_CE32S_START
, // 4
54 // Byte offsets from the start of the data, after the generic header.
55 // The indexes[] are at byte offset 0, other data follows.
56 // Each data item is aligned properly.
57 // The data items should be in descending order of unit size,
58 // to minimize the need for padding.
59 // Each item's byte length is given by the difference between its offset and
60 // the next index/offset value.
61 /** Byte offset to int32_t reorderCodes[]. */
62 IX_REORDER_CODES_OFFSET
,
64 * Byte offset to uint8_t reorderTable[].
65 * Empty table if <256 bytes (padding only).
66 * Otherwise 256 bytes or more (with padding).
68 IX_REORDER_TABLE_OFFSET
,
69 /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */
72 IX_RESERVED8_OFFSET
, // 8
73 /** Byte offset to int64_t ces[]. */
76 /** Byte offset to uint32_t ce32s[]. */
79 /** Byte offset to uint32_t rootElements[]. */
80 IX_ROOT_ELEMENTS_OFFSET
, // 12
81 /** Byte offset to UChar *contexts[]. */
83 /** Byte offset to uint16_t [] with serialized unsafeBackwardSet. */
85 /** Byte offset to uint16_t fastLatinTable[]. */
86 IX_FAST_LATIN_TABLE_OFFSET
,
88 /** Byte offset to uint16_t scripts[]. */
89 IX_SCRIPTS_OFFSET
, // 16
91 * Byte offset to UBool compressibleBytes[].
92 * Empty table if <256 bytes (padding only).
93 * Otherwise 256 bytes or more (with padding).
95 IX_COMPRESSIBLE_BYTES_OFFSET
,
/**
 * Reads collation binary data from a byte buffer into a tailoring.
 * The layout of the bytes is described by the data format comment in this header.
 *
 * @param base      Tailoring that the data being read builds on.
 *                  NOTE(review): presumably the root tailoring, and possibly
 *                  NULL when the root data itself is read -- confirm in the implementation.
 * @param inBytes   The binary collation data.
 *                  NOTE(review): the format comment gives byte offsets relative to the
 *                  indexes[] after the file header; whether inBytes includes that
 *                  header is not visible here -- confirm in the implementation.
 * @param inLength  NOTE(review): presumably the number of bytes available at inBytes;
 *                  whether a negative value means "unknown length" is not visible
 *                  here -- confirm in the implementation.
 * @param tailoring Passed by non-const reference; presumably receives the data
 *                  that was read -- confirm in the implementation.
 * @param errorCode ICU error code, passed by reference.
 *                  NOTE(review): presumably follows the usual ICU in/out convention
 *                  (no-op if already a failure on entry) -- confirm.
 */
100 static void read(const CollationTailoring
*base
, const uint8_t *inBytes
, int32_t inLength
,
101 CollationTailoring
&tailoring
, UErrorCode
&errorCode
);
/**
 * Acceptance check for a candidate data item, given its UDataInfo.
 * NOTE(review): the signature matches ICU's UDataMemoryIsAcceptable callback
 * type from unicode/udata.h, so this is presumably handed to udata_openChoice()
 * when loading collation data -- confirm in the implementation.
 *
 * @param context Opaque caller-supplied pointer; how it is interpreted is not
 *                visible in this header.
 * @param type    Data item type string.
 * @param name    Data item name string.
 * @param pInfo   UDataInfo of the candidate data item.
 * @return UBool result of the check; presumably TRUE when the data item is
 *         acceptable -- confirm in the implementation.
 */
103 static UBool U_CALLCONV
104 isAcceptable(void *context
, const char *type
, const char *name
, const UDataInfo
*pInfo
);
107 CollationDataReader(); // no constructor
111 * Format of collation data (ucadata.icu, binary data in coll/ *.res files).
112 * Format version 4.0.
114 * The root collation data is stored in the ucadata.icu file.
115 * Tailorings are stored inside .res resource bundle files, with a complete file header.
117 * Collation data begins with a standard ICU data file header
118 * (DataHeader, see ucmndata.h and unicode/udata.h).
119 * The UDataInfo.dataVersion field contains the UCA and other version numbers,
120 * see the comments for CollationTailoring.version.
122 * After the header, the file contains the following parts.
123 * Constants are defined as enum values of the CollationDataReader class.
124 * See also the Collation class.
126 * int32_t indexes[indexesLength];
127 * The indexes array has variable length.
128 * Some tailorings only need the length and the options,
129 * others only add reorderCodes and the reorderTable,
130 * some need to store mappings.
131 * Only as many indexes are stored as needed to read all of the data.
133 * Index 0: indexesLength
134 * Index 1: numericPrimary, CollationFastLatin::VERSION, and options: see IX_OPTIONS
135 * Index 2..3: Unused/reserved/0.
136 * Index 4: Index into the ce32s array where the CE32s of the conjoining Jamo
137 * are stored in a short, contiguous part of the ce32s array.
139 * Indexes 5..19 are byte offsets in ascending order.
140 * Each byte offset marks the start of the next part in the data file,
141 * and the end of the previous one.
142 * When two consecutive byte offsets are the same (or differ by too little to hold the part's data),
143 * then the corresponding part is empty.
144 * Byte offsets are offsets from after the header,
145 * that is, from the beginning of the indexes[].
146 * Each part starts at an offset with proper alignment for its data.
147 * If necessary, the previous part may include padding bytes to achieve this alignment.
148 * The last byte offset that is stored in the indexes indicates the total size of the data
149 * (starting with the indexes).
151 * int32_t reorderCodes[]; -- empty in root
152 * The list of script and reordering codes.
154 * uint8_t reorderTable[256]; -- empty in root; can be longer to include padding bytes
155 * Primary-weight lead byte permutation table.
156 * Normally present when the reorderCodes are, but can be built at load time.
158 * UTrie2 trie; -- see utrie2_impl.h and utrie2.h
159 * The trie holds the main collation data. Each code point is mapped to a 32-bit value.
160 * It encodes a simple collation element (CE) in compact form, unless bits 7..6 are both set,
161 * in which case it is a special CE32 and contains a 4-bit tag and further data.
162 * See the Collation class for details.
164 * The trie has a value for each lead surrogate code unit with some bits encoding
165 * collective properties of the 1024 supplementary characters whose UTF-16 form starts with
166 * the lead surrogate. See Collation::LEAD_SURROGATE_TAG.
169 * 64-bit CEs and expansions that cannot be stored in a more compact form.
172 * CE32s for expansions in compact form, and for characters whose trie values
173 * contain special data.
175 * uint32_t rootElements[]; -- empty in all tailorings
176 * Compact storage for all of the CEs that occur in the root collation.
177 * See the CollationRootElements class.
180 * Serialized UCharsTrie structures with prefix (pre-context) and contraction mappings.
182 * uint16_t unsafeBackwardSet[]; -- see UnicodeSet::serialize()
183 * Serialized form of characters that are unsafe when iterating backwards,
184 * and at the end of an identical string prefix.
185 * Back up to a safe character.
186 * Lead surrogates are "unsafe" when any of their corresponding supplementary
187 * code points are unsafe.
188 * Does not include [:^lccc=0:][:^tccc=0:].
189 * For each tailoring, the root unsafeBackwardSet is subtracted.
190 * (As a result, in many tailorings no set needs to be stored.)
192 * uint16_t fastLatinTable[];
193 * Optional optimization for Latin text.
194 * See the CollationFastLatin class.
196 * uint16_t scripts[]; -- empty in all tailorings
197 * Table of the reordering groups with their first and last lead bytes,
198 * and their script and reordering codes.
199 * See CollationData::scripts.
201 * UBool compressibleBytes[]; -- empty in all tailorings
202 * Flag for getSortKey(), indicating primary weight lead bytes that are compressible.
207 #endif // !UCONFIG_NO_COLLATION
208 #endif // __COLLATIONDATAREADER_H__