]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/collationdata.h
ICU-57166.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / collationdata.h
1 /*
2 *******************************************************************************
3 * Copyright (C) 2010-2015, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * collationdata.h
7 *
8 * created on: 2010oct27
9 * created by: Markus W. Scherer
10 */
11
12 #ifndef __COLLATIONDATA_H__
13 #define __COLLATIONDATA_H__
14
15 #include "unicode/utypes.h"
16
17 #if !UCONFIG_NO_COLLATION
18
19 #include "unicode/ucol.h"
20 #include "unicode/uniset.h"
21 #include "collation.h"
22 #include "normalizer2impl.h"
23 #include "utrie2.h"
24
25 struct UDataMemory;
26
27 U_NAMESPACE_BEGIN
28
29 class UVector32;
30
31 /**
32 * Collation data container.
33 * Immutable data created by a CollationDataBuilder, or loaded from a file,
34 * or deserialized from API-provided binary data.
35 *
36 * Includes data for the collation base (root/default), aliased if this is not the base.
37 */
38 struct U_I18N_API CollationData : public UMemory {
39 // Note: The ucadata.icu loader could discover the reserved ranges by setting an array
40 // parallel with the ranges, and resetting ranges that are indexed.
41 // The reordering builder code could clone the resulting template array.
42 enum {
43 REORDER_RESERVED_BEFORE_LATIN = UCOL_REORDER_CODE_FIRST + 14,
44 REORDER_RESERVED_AFTER_LATIN
45 };
46
47 enum {
48 MAX_NUM_SPECIAL_REORDER_CODES = 8,
49 /** C++ only, data reader check scriptStartsLength. */
50 MAX_NUM_SCRIPT_RANGES = 256
51 };
52
53 CollationData(const Normalizer2Impl &nfc)
54 : trie(NULL),
55 ce32s(NULL), ces(NULL), contexts(NULL), base(NULL),
56 jamoCE32s(NULL),
57 nfcImpl(nfc),
58 numericPrimary(0x12000000),
59 ce32sLength(0), cesLength(0), contextsLength(0),
60 compressibleBytes(NULL),
61 unsafeBackwardSet(NULL),
62 fastLatinTable(NULL), fastLatinTableLength(0),
63 numScripts(0), scriptsIndex(NULL), scriptStarts(NULL), scriptStartsLength(0),
64 rootElements(NULL), rootElementsLength(0) {}
65
66 uint32_t getCE32(UChar32 c) const {
67 return UTRIE2_GET32(trie, c);
68 }
69
70 uint32_t getCE32FromSupplementary(UChar32 c) const {
71 return UTRIE2_GET32_FROM_SUPP(trie, c);
72 }
73
74 UBool isDigit(UChar32 c) const {
75 return c < 0x660 ? c <= 0x39 && 0x30 <= c :
76 Collation::hasCE32Tag(getCE32(c), Collation::DIGIT_TAG);
77 }
78
79 UBool isUnsafeBackward(UChar32 c, UBool numeric) const {
80 return unsafeBackwardSet->contains(c) || (numeric && isDigit(c));
81 }
82
83 UBool isCompressibleLeadByte(uint32_t b) const {
84 return compressibleBytes[b];
85 }
86
87 inline UBool isCompressiblePrimary(uint32_t p) const {
88 return isCompressibleLeadByte(p >> 24);
89 }
90
91 /**
92 * Returns the CE32 from two contexts words.
93 * Access to the defaultCE32 for contraction and prefix matching.
94 */
95 static uint32_t readCE32(const UChar *p) {
96 return ((uint32_t)p[0] << 16) | p[1];
97 }
98
99 /**
100 * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG).
101 * Requires that ce32 is special.
102 */
103 uint32_t getIndirectCE32(uint32_t ce32) const;
104 /**
105 * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG),
106 * if ce32 is special.
107 */
108 uint32_t getFinalCE32(uint32_t ce32) const;
109
110 /**
111 * Computes a CE from c's ce32 which has the OFFSET_TAG.
112 */
113 int64_t getCEFromOffsetCE32(UChar32 c, uint32_t ce32) const {
114 int64_t dataCE = ces[Collation::indexFromCE32(ce32)];
115 return Collation::makeCE(Collation::getThreeBytePrimaryForOffsetData(c, dataCE));
116 }
117
118 /**
119 * Returns the single CE that c maps to.
120 * Sets U_UNSUPPORTED_ERROR if c does not map to a single CE.
121 */
122 int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const;
123
124 /**
125 * Returns the FCD16 value for code point c. c must be >= 0.
126 */
127 uint16_t getFCD16(UChar32 c) const {
128 return nfcImpl.getFCD16(c);
129 }
130
131 /**
132 * Returns the first primary for the script's reordering group.
133 * @return the primary with only the first primary lead byte of the group
134 * (not necessarily an actual root collator primary weight),
135 * or 0 if the script is unknown
136 */
137 uint32_t getFirstPrimaryForGroup(int32_t script) const;
138
139 /**
140 * Returns the last primary for the script's reordering group.
141 * @return the last primary of the group
142 * (not an actual root collator primary weight),
143 * or 0 if the script is unknown
144 */
145 uint32_t getLastPrimaryForGroup(int32_t script) const;
146
147 /**
148 * Finds the reordering group which contains the primary weight.
149 * @return the first script of the group, or -1 if the weight is beyond the last group
150 */
151 int32_t getGroupForPrimary(uint32_t p) const;
152
153 int32_t getEquivalentScripts(int32_t script,
154 int32_t dest[], int32_t capacity, UErrorCode &errorCode) const;
155
156 /**
157 * Writes the permutation of primary-weight ranges
158 * for the given reordering of scripts and groups.
159 * The caller checks for illegal arguments and
160 * takes care of [DEFAULT] and memory allocation.
161 *
162 * Each list element will be a (limit, offset) pair as described
163 * for the CollationSettings::reorderRanges.
164 * The list will be empty if no ranges are reordered.
165 */
166 void makeReorderRanges(const int32_t *reorder, int32_t length,
167 UVector32 &ranges, UErrorCode &errorCode) const;
168
169 /** @see jamoCE32s */
170 static const int32_t JAMO_CE32S_LENGTH = 19 + 21 + 27;
171
172 /** Main lookup trie. */
173 const UTrie2 *trie;
174 /**
175 * Array of CE32 values.
176 * At index 0 there must be CE32(U+0000)
177 * to support U+0000's special-tag for NUL-termination handling.
178 */
179 const uint32_t *ce32s;
180 /** Array of CE values for expansions and OFFSET_TAG. */
181 const int64_t *ces;
182 /** Array of prefix and contraction-suffix matching data. */
183 const UChar *contexts;
184 /** Base collation data, or NULL if this data itself is a base. */
185 const CollationData *base;
186 /**
187 * Simple array of JAMO_CE32S_LENGTH=19+21+27 CE32s, one per canonical Jamo L/V/T.
188 * They are normally simple CE32s, rarely expansions.
189 * For fast handling of HANGUL_TAG.
190 */
191 const uint32_t *jamoCE32s;
192 const Normalizer2Impl &nfcImpl;
193 /** The single-byte primary weight (xx000000) for numeric collation. */
194 uint32_t numericPrimary;
195
196 int32_t ce32sLength;
197 int32_t cesLength;
198 int32_t contextsLength;
199
200 /** 256 flags for which primary-weight lead bytes are compressible. */
201 const UBool *compressibleBytes;
202 /**
203 * Set of code points that are unsafe for starting string comparison after an identical prefix,
204 * or in backwards CE iteration.
205 */
206 const UnicodeSet *unsafeBackwardSet;
207
208 /**
209 * Fast Latin table for common-Latin-text string comparisons.
210 * Data structure see class CollationFastLatin.
211 */
212 const uint16_t *fastLatinTable;
213 int32_t fastLatinTableLength;
214
215 /**
216 * Data for scripts and reordering groups.
217 * Uses include building a reordering permutation table and
218 * providing script boundaries to AlphabeticIndex.
219 */
220 int32_t numScripts;
221 /**
222 * The length of scriptsIndex is numScripts+16.
223 * It maps from a UScriptCode or a special reorder code to an entry in scriptStarts.
224 * 16 special reorder codes (not all used) are mapped starting at numScripts.
225 * Up to MAX_NUM_SPECIAL_REORDER_CODES are codes for special groups like space/punct/digit.
226 * There are special codes at the end for reorder-reserved primary ranges.
227 *
228 * Multiple scripts may share a range and index, for example Hira & Kana.
229 */
230 const uint16_t *scriptsIndex;
231 /**
232 * Start primary weight (top 16 bits only) for a group/script/reserved range
233 * indexed by scriptsIndex.
234 * The first range (separators & terminators) and the last range (trailing weights)
235 * are not reorderable, and no scriptsIndex entry points to them.
236 */
237 const uint16_t *scriptStarts;
238 int32_t scriptStartsLength;
239
240 /**
241 * Collation elements in the root collator.
242 * Used by the CollationRootElements class. The data structure is described there.
243 * NULL in a tailoring.
244 */
245 const uint32_t *rootElements;
246 int32_t rootElementsLength;
247
248 private:
249 int32_t getScriptIndex(int32_t script) const;
250 void makeReorderRanges(const int32_t *reorder, int32_t length,
251 UBool latinMustMove,
252 UVector32 &ranges, UErrorCode &errorCode) const;
253 int32_t addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const;
254 int32_t addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const;
255 };
256
257 U_NAMESPACE_END
258
259 #endif // !UCONFIG_NO_COLLATION
260 #endif // __COLLATIONDATA_H__