]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/collationdata.h
ICU-531.31.tar.gz
[apple/icu.git] / icuSources / i18n / collationdata.h
1 /*
2 *******************************************************************************
3 * Copyright (C) 2010-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * collationdata.h
7 *
8 * created on: 2010oct27
9 * created by: Markus W. Scherer
10 */
11
12 #ifndef __COLLATIONDATA_H__
13 #define __COLLATIONDATA_H__
14
15 #include "unicode/utypes.h"
16
17 #if !UCONFIG_NO_COLLATION
18
19 #include "unicode/uniset.h"
20 #include "collation.h"
21 #include "normalizer2impl.h"
22 #include "utrie2.h"
23
24 struct UDataMemory;
25
26 U_NAMESPACE_BEGIN
27
28 /**
29 * Collation data container.
30 * Immutable data created by a CollationDataBuilder, or loaded from a file,
31 * or deserialized from API-provided binary data.
32 *
33 * Includes data for the collation base (root/default), aliased if this is not the base.
34 */
35 struct U_I18N_API CollationData : public UMemory {
36 CollationData(const Normalizer2Impl &nfc)
37 : trie(NULL),
38 ce32s(NULL), ces(NULL), contexts(NULL), base(NULL),
39 jamoCE32s(NULL),
40 nfcImpl(nfc),
41 numericPrimary(0x12000000),
42 ce32sLength(0), cesLength(0), contextsLength(0),
43 compressibleBytes(NULL),
44 unsafeBackwardSet(NULL),
45 fastLatinTable(NULL), fastLatinTableLength(0),
46 scripts(NULL), scriptsLength(0),
47 rootElements(NULL), rootElementsLength(0) {}
48
49 uint32_t getCE32(UChar32 c) const {
50 return UTRIE2_GET32(trie, c);
51 }
52
53 uint32_t getCE32FromSupplementary(UChar32 c) const {
54 return UTRIE2_GET32_FROM_SUPP(trie, c);
55 }
56
57 UBool isDigit(UChar32 c) const {
58 return c < 0x660 ? c <= 0x39 && 0x30 <= c :
59 Collation::hasCE32Tag(getCE32(c), Collation::DIGIT_TAG);
60 }
61
62 UBool isUnsafeBackward(UChar32 c, UBool numeric) const {
63 return unsafeBackwardSet->contains(c) || (numeric && isDigit(c));
64 }
65
66 UBool isCompressibleLeadByte(uint32_t b) const {
67 return compressibleBytes[b];
68 }
69
70 inline UBool isCompressiblePrimary(uint32_t p) const {
71 return isCompressibleLeadByte(p >> 24);
72 }
73
74 /**
75 * Returns the CE32 from two contexts words.
76 * Access to the defaultCE32 for contraction and prefix matching.
77 */
78 static uint32_t readCE32(const UChar *p) {
79 return ((uint32_t)p[0] << 16) | p[1];
80 }
81
82 /**
83 * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG).
84 * Requires that ce32 is special.
85 */
86 uint32_t getIndirectCE32(uint32_t ce32) const;
87 /**
88 * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG),
89 * if ce32 is special.
90 */
91 uint32_t getFinalCE32(uint32_t ce32) const;
92
93 /**
94 * Computes a CE from c's ce32 which has the OFFSET_TAG.
95 */
96 int64_t getCEFromOffsetCE32(UChar32 c, uint32_t ce32) const {
97 int64_t dataCE = ces[Collation::indexFromCE32(ce32)];
98 return Collation::makeCE(Collation::getThreeBytePrimaryForOffsetData(c, dataCE));
99 }
100
101 /**
102 * Returns the FCD16 value for code point c. c must be >= 0.
103 */
104 uint16_t getFCD16(UChar32 c) const {
105 return nfcImpl.getFCD16(c);
106 }
107
108 /**
109 * Returns the first primary for the script's reordering group.
110 * @return the primary with only the first primary lead byte of the group
111 * (not necessarily an actual root collator primary weight),
112 * or 0 if the script is unknown
113 */
114 uint32_t getFirstPrimaryForGroup(int32_t script) const;
115
116 /**
117 * Returns the last primary for the script's reordering group.
118 * @return the last primary of the group
119 * (not an actual root collator primary weight),
120 * or 0 if the script is unknown
121 */
122 uint32_t getLastPrimaryForGroup(int32_t script) const;
123
124 /**
125 * Finds the reordering group which contains the primary weight.
126 * @return the first script of the group, or -1 if the weight is beyond the last group
127 */
128 int32_t getGroupForPrimary(uint32_t p) const;
129
130 int32_t getEquivalentScripts(int32_t script,
131 int32_t dest[], int32_t capacity, UErrorCode &errorCode) const;
132
133 /**
134 * Writes the permutation table for the given reordering of scripts and groups,
135 * mapping from default-order primary-weight lead bytes to reordered lead bytes.
136 * The caller checks for illegal arguments and
137 * takes care of [DEFAULT] and memory allocation.
138 */
139 void makeReorderTable(const int32_t *reorder, int32_t length,
140 uint8_t table[256], UErrorCode &errorCode) const;
141
142 /** @see jamoCE32s */
143 static const int32_t JAMO_CE32S_LENGTH = 19 + 21 + 27;
144
145 /** Main lookup trie. */
146 const UTrie2 *trie;
147 /**
148 * Array of CE32 values.
149 * At index 0 there must be CE32(U+0000)
150 * to support U+0000's special-tag for NUL-termination handling.
151 */
152 const uint32_t *ce32s;
153 /** Array of CE values for expansions and OFFSET_TAG. */
154 const int64_t *ces;
155 /** Array of prefix and contraction-suffix matching data. */
156 const UChar *contexts;
157 /** Base collation data, or NULL if this data itself is a base. */
158 const CollationData *base;
159 /**
160 * Simple array of JAMO_CE32S_LENGTH=19+21+27 CE32s, one per canonical Jamo L/V/T.
161 * They are normally simple CE32s, rarely expansions.
162 * For fast handling of HANGUL_TAG.
163 */
164 const uint32_t *jamoCE32s;
165 const Normalizer2Impl &nfcImpl;
166 /** The single-byte primary weight (xx000000) for numeric collation. */
167 uint32_t numericPrimary;
168
169 int32_t ce32sLength;
170 int32_t cesLength;
171 int32_t contextsLength;
172
173 /** 256 flags for which primary-weight lead bytes are compressible. */
174 const UBool *compressibleBytes;
175 /**
176 * Set of code points that are unsafe for starting string comparison after an identical prefix,
177 * or in backwards CE iteration.
178 */
179 const UnicodeSet *unsafeBackwardSet;
180
181 /**
182 * Fast Latin table for common-Latin-text string comparisons.
183 * Data structure see class CollationFastLatin.
184 */
185 const uint16_t *fastLatinTable;
186 int32_t fastLatinTableLength;
187
188 /**
189 * Data for scripts and reordering groups.
190 * Uses include building a reordering permutation table and
191 * providing script boundaries to AlphabeticIndex.
192 *
193 * This data is a sorted list of primary-weight lead byte ranges (reordering groups),
194 * each with a list of pairs sorted in base collation order;
195 * each pair contains a script/reorder code and the lowest primary weight for that script.
196 *
197 * Data structure:
198 * - Each reordering group is encoded in n+2 16-bit integers.
199 * - First integer:
200 * Bits 15..8: First byte of the reordering group's range.
201 * Bits 7..0: Last byte of the reordering group's range.
202 * - Second integer:
203 * Length n of the list of script/reordering codes.
204 * - Each further integer is a script or reordering code.
205 */
206 const uint16_t *scripts;
207 int32_t scriptsLength;
208
209 /**
210 * Collation elements in the root collator.
211 * Used by the CollationRootElements class. The data structure is described there.
212 * NULL in a tailoring.
213 */
214 const uint32_t *rootElements;
215 int32_t rootElementsLength;
216
217 private:
218 int32_t findScript(int32_t script) const;
219 };
220
221 U_NAMESPACE_END
222
223 #endif // !UCONFIG_NO_COLLATION
224 #endif // __COLLATIONDATA_H__