]>
Commit | Line | Data |
---|---|---|
57a6839d A |
1 | /* |
2 | ******************************************************************************* | |
b331163b | 3 | * Copyright (C) 2013-2015, International Business Machines |
57a6839d A |
4 | * Corporation and others. All Rights Reserved. |
5 | ******************************************************************************* | |
6 | * collationsettings.h | |
7 | * | |
8 | * created on: 2013feb07 | |
9 | * created by: Markus W. Scherer | |
10 | */ | |
11 | ||
12 | #ifndef __COLLATIONSETTINGS_H__ | |
13 | #define __COLLATIONSETTINGS_H__ | |
14 | ||
15 | #include "unicode/utypes.h" | |
16 | ||
17 | #if !UCONFIG_NO_COLLATION | |
18 | ||
19 | #include "unicode/ucol.h" | |
20 | #include "collation.h" | |
21 | #include "sharedobject.h" | |
22 | #include "umutex.h" | |
23 | ||
24 | U_NAMESPACE_BEGIN | |
25 | ||
b331163b A |
26 | struct CollationData; |
27 | ||
57a6839d A |
28 | /** |
29 | * Collation settings/options/attributes. | |
30 | * These are the values that can be changed via API. | |
31 | */ | |
32 | struct U_I18N_API CollationSettings : public SharedObject { | |
33 | /** | |
34 | * Options bit 0: Perform the FCD check on the input text and deliver normalized text. | |
35 | */ | |
36 | static const int32_t CHECK_FCD = 1; | |
37 | /** | |
38 | * Options bit 1: Numeric collation. | |
39 | * Also known as CODAN = COllate Digits As Numbers. | |
40 | * | |
41 | * Treat digit sequences as numbers with CE sequences in numeric order, | |
42 | * rather than returning a normal CE for each digit. | |
43 | */ | |
44 | static const int32_t NUMERIC = 2; | |
45 | /** | |
46 | * "Shifted" alternate handling, see ALTERNATE_MASK. | |
47 | */ | |
48 | static const int32_t SHIFTED = 4; | |
49 | /** | |
50 | * Options bits 3..2: Alternate-handling mask. 0 for non-ignorable. | |
51 | * Reserve values 8 and 0xc for shift-trimmed and blanked. | |
52 | */ | |
53 | static const int32_t ALTERNATE_MASK = 0xc; | |
54 | /** | |
55 | * Options bits 6..4: The 3-bit maxVariable value bit field is shifted by this value. | |
56 | */ | |
57 | static const int32_t MAX_VARIABLE_SHIFT = 4; | |
58 | /** maxVariable options bit mask before shifting. */ | |
59 | static const int32_t MAX_VARIABLE_MASK = 0x70; | |
60 | /** Options bit 7: Reserved/unused/0. */ | |
61 | /** | |
62 | * Options bit 8: Sort uppercase first if caseLevel or caseFirst is on. | |
63 | */ | |
64 | static const int32_t UPPER_FIRST = 0x100; | |
65 | /** | |
66 | * Options bit 9: Keep the case bits in the tertiary weight (they trump other tertiary values) | |
67 | * unless case level is on (when they are *moved* into the separate case level). | |
68 | * By default, the case bits are removed from the tertiary weight (ignored). | |
69 | * | |
70 | * When CASE_FIRST is off, UPPER_FIRST must be off too, corresponding to | |
71 | * the tri-value UCOL_CASE_FIRST attribute: UCOL_OFF vs. UCOL_LOWER_FIRST vs. UCOL_UPPER_FIRST. | |
72 | */ | |
73 | static const int32_t CASE_FIRST = 0x200; | |
74 | /** | |
75 | * Options bit mask for caseFirst and upperFirst, before shifting. | |
76 | * Same value as caseFirst==upperFirst. | |
77 | */ | |
78 | static const int32_t CASE_FIRST_AND_UPPER_MASK = CASE_FIRST | UPPER_FIRST; | |
79 | /** | |
80 | * Options bit 10: Insert the case level between the secondary and tertiary levels. | |
81 | */ | |
82 | static const int32_t CASE_LEVEL = 0x400; | |
83 | /** | |
84 | * Options bit 11: Compare secondary weights backwards. ("French secondary") | |
85 | */ | |
86 | static const int32_t BACKWARD_SECONDARY = 0x800; | |
87 | /** | |
88 | * Options bits 15..12: The 4-bit strength value bit field is shifted by this value. | |
89 | * It is the top used bit field in the options. (No need to mask after shifting.) | |
90 | */ | |
91 | static const int32_t STRENGTH_SHIFT = 12; | |
92 | /** Strength options bit mask before shifting. */ | |
93 | static const int32_t STRENGTH_MASK = 0xf000; | |
94 | ||
95 | /** maxVariable values */ | |
96 | enum MaxVariable { | |
97 | MAX_VAR_SPACE, | |
98 | MAX_VAR_PUNCT, | |
99 | MAX_VAR_SYMBOL, | |
100 | MAX_VAR_CURRENCY | |
101 | }; | |
102 | ||
103 | CollationSettings() | |
104 | : options((UCOL_DEFAULT_STRENGTH << STRENGTH_SHIFT) | | |
105 | (MAX_VAR_PUNCT << MAX_VARIABLE_SHIFT)), | |
106 | variableTop(0), | |
107 | reorderTable(NULL), | |
b331163b A |
108 | minHighNoReorder(0), |
109 | reorderRanges(NULL), reorderRangesLength(0), | |
57a6839d A |
110 | reorderCodes(NULL), reorderCodesLength(0), reorderCodesCapacity(0), |
111 | fastLatinOptions(-1) {} | |
112 | ||
113 | CollationSettings(const CollationSettings &other); | |
114 | virtual ~CollationSettings(); | |
115 | ||
116 | UBool operator==(const CollationSettings &other) const; | |
117 | ||
118 | inline UBool operator!=(const CollationSettings &other) const { | |
119 | return !operator==(other); | |
120 | } | |
121 | ||
122 | int32_t hashCode() const; | |
123 | ||
124 | void resetReordering(); | |
b331163b A |
125 | void aliasReordering(const CollationData &data, const int32_t *codes, int32_t length, |
126 | const uint32_t *ranges, int32_t rangesLength, | |
127 | const uint8_t *table, UErrorCode &errorCode); | |
128 | void setReordering(const CollationData &data, const int32_t *codes, int32_t codesLength, | |
129 | UErrorCode &errorCode); | |
130 | void copyReorderingFrom(const CollationSettings &other, UErrorCode &errorCode); | |
131 | ||
132 | inline UBool hasReordering() const { return reorderTable != NULL; } | |
133 | static UBool reorderTableHasSplitBytes(const uint8_t table[256]); | |
134 | inline uint32_t reorder(uint32_t p) const { | |
135 | uint8_t b = reorderTable[p >> 24]; | |
136 | if(b != 0 || p <= Collation::NO_CE_PRIMARY) { | |
137 | return ((uint32_t)b << 24) | (p & 0xffffff); | |
138 | } else { | |
139 | return reorderEx(p); | |
140 | } | |
141 | } | |
57a6839d A |
142 | |
143 | void setStrength(int32_t value, int32_t defaultOptions, UErrorCode &errorCode); | |
144 | ||
145 | static int32_t getStrength(int32_t options) { | |
146 | return options >> STRENGTH_SHIFT; | |
147 | } | |
148 | ||
149 | int32_t getStrength() const { | |
150 | return getStrength(options); | |
151 | } | |
152 | ||
153 | /** Sets the options bit for an on/off attribute. */ | |
154 | void setFlag(int32_t bit, UColAttributeValue value, | |
155 | int32_t defaultOptions, UErrorCode &errorCode); | |
156 | ||
157 | UColAttributeValue getFlag(int32_t bit) const { | |
158 | return ((options & bit) != 0) ? UCOL_ON : UCOL_OFF; | |
159 | } | |
160 | ||
161 | void setCaseFirst(UColAttributeValue value, int32_t defaultOptions, UErrorCode &errorCode); | |
162 | ||
163 | UColAttributeValue getCaseFirst() const { | |
164 | int32_t option = options & CASE_FIRST_AND_UPPER_MASK; | |
165 | return (option == 0) ? UCOL_OFF : | |
166 | (option == CASE_FIRST) ? UCOL_LOWER_FIRST : UCOL_UPPER_FIRST; | |
167 | } | |
168 | ||
169 | void setAlternateHandling(UColAttributeValue value, | |
170 | int32_t defaultOptions, UErrorCode &errorCode); | |
171 | ||
172 | UColAttributeValue getAlternateHandling() const { | |
173 | return ((options & ALTERNATE_MASK) == 0) ? UCOL_NON_IGNORABLE : UCOL_SHIFTED; | |
174 | } | |
175 | ||
176 | void setMaxVariable(int32_t value, int32_t defaultOptions, UErrorCode &errorCode); | |
177 | ||
178 | MaxVariable getMaxVariable() const { | |
179 | return (MaxVariable)((options & MAX_VARIABLE_MASK) >> MAX_VARIABLE_SHIFT); | |
180 | } | |
181 | ||
182 | /** | |
183 | * Include case bits in the tertiary level if caseLevel=off and caseFirst!=off. | |
184 | */ | |
185 | static inline UBool isTertiaryWithCaseBits(int32_t options) { | |
186 | return (options & (CASE_LEVEL | CASE_FIRST)) == CASE_FIRST; | |
187 | } | |
188 | static uint32_t getTertiaryMask(int32_t options) { | |
189 | // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. | |
190 | return isTertiaryWithCaseBits(options) ? | |
191 | Collation::CASE_AND_TERTIARY_MASK : Collation::ONLY_TERTIARY_MASK; | |
192 | } | |
193 | ||
194 | static UBool sortsTertiaryUpperCaseFirst(int32_t options) { | |
195 | // On tertiary level, consider case bits and sort uppercase first | |
196 | // if caseLevel is off and caseFirst==upperFirst. | |
197 | return (options & (CASE_LEVEL | CASE_FIRST_AND_UPPER_MASK)) == CASE_FIRST_AND_UPPER_MASK; | |
198 | } | |
199 | ||
200 | inline UBool dontCheckFCD() const { | |
201 | return (options & CHECK_FCD) == 0; | |
202 | } | |
203 | ||
204 | inline UBool hasBackwardSecondary() const { | |
205 | return (options & BACKWARD_SECONDARY) != 0; | |
206 | } | |
207 | ||
208 | inline UBool isNumeric() const { | |
209 | return (options & NUMERIC) != 0; | |
210 | } | |
211 | ||
212 | /** CHECK_FCD etc. */ | |
213 | int32_t options; | |
214 | /** Variable-top primary weight. */ | |
215 | uint32_t variableTop; | |
b331163b A |
216 | /** |
217 | * 256-byte table for reordering permutation of primary lead bytes; NULL if no reordering. | |
218 | * A 0 entry at a non-zero index means that the primary lead byte is "split" | |
219 | * (there are different offsets for primaries that share that lead byte) | |
220 | * and the reordering offset must be determined via the reorderRanges. | |
221 | */ | |
57a6839d | 222 | const uint8_t *reorderTable; |
b331163b A |
223 | /** Limit of last reordered range. 0 if no reordering or no split bytes. */ |
224 | uint32_t minHighNoReorder; | |
225 | /** | |
226 | * Primary-weight ranges for script reordering, | |
227 | * to be used by reorder(p) for split-reordered primary lead bytes. | |
228 | * | |
229 | * Each entry is a (limit, offset) pair. | |
230 | * The upper 16 bits of the entry are the upper 16 bits of the | |
231 | * exclusive primary limit of a range. | |
232 | * Primaries between the previous limit and this one have their lead bytes | |
233 | * modified by the signed offset (-0xff..+0xff) stored in the lower 16 bits. | |
234 | * | |
235 | * CollationData::makeReorderRanges() writes a full list where the first range | |
236 | * (at least for terminators and separators) has a 0 offset. | |
237 | * The last range has a non-zero offset. | |
238 | * minHighNoReorder is set to the limit of that last range. | |
239 | * | |
240 | * In the settings object, the initial ranges before the first split lead byte | |
241 | * are omitted for efficiency; they are handled by reorder(p) via the reorderTable. | |
242 | * If there are no split-reordered lead bytes, then no ranges are needed. | |
243 | */ | |
244 | const uint32_t *reorderRanges; | |
245 | int32_t reorderRangesLength; | |
57a6839d A |
246 | /** Array of reorder codes; ignored if reorderCodesLength == 0. */ |
247 | const int32_t *reorderCodes; | |
248 | /** Number of reorder codes; 0 if no reordering. */ | |
249 | int32_t reorderCodesLength; | |
250 | /** | |
251 | * Capacity of reorderCodes. | |
b331163b | 252 | * If 0, then the codes, the ranges, and the table are aliases. |
57a6839d | 253 | * Otherwise, this object owns the memory via the reorderCodes pointer; |
b331163b | 254 | * the codes, the ranges, and the table are in the same memory block, in that order. |
57a6839d A |
255 | */ |
256 | int32_t reorderCodesCapacity; | |
257 | ||
258 | /** Options for CollationFastLatin. Negative if disabled. */ | |
259 | int32_t fastLatinOptions; | |
260 | uint16_t fastLatinPrimaries[0x180]; | |
b331163b A |
261 | |
262 | private: | |
263 | void setReorderArrays(const int32_t *codes, int32_t codesLength, | |
264 | const uint32_t *ranges, int32_t rangesLength, | |
265 | const uint8_t *table, UErrorCode &errorCode); | |
266 | uint32_t reorderEx(uint32_t p) const; | |
57a6839d A |
267 | }; |
268 | ||
269 | U_NAMESPACE_END | |
270 | ||
271 | #endif // !UCONFIG_NO_COLLATION | |
272 | #endif // __COLLATIONSETTINGS_H__ |