2 *******************************************************************************
3 * Copyright (C) 2013-2015, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
8 * created on: 2013feb07
9 * created by: Markus W. Scherer
12 #ifndef __COLLATIONSETTINGS_H__
13 #define __COLLATIONSETTINGS_H__
15 #include "unicode/utypes.h"
17 #if !UCONFIG_NO_COLLATION
19 #include "unicode/ucol.h"
20 #include "collation.h"
21 #include "sharedobject.h"
29 * Collation settings/options/attributes.
30 * These are the values that can be changed via API.
32 struct U_I18N_API CollationSettings
: public SharedObject
{
/**
 * Options bit 0: Perform the FCD check on the input text and deliver normalized text.
 */
static const int32_t CHECK_FCD = 1;
/**
 * Options bit 1: Numeric collation.
 * Also known as CODAN = COllate Digits As Numbers.
 *
 * Treat digit sequences as numbers with CE sequences in numeric order,
 * rather than returning a normal CE for each digit.
 */
static const int32_t NUMERIC = 2;
/**
 * "Shifted" alternate handling, see ALTERNATE_MASK.
 */
static const int32_t SHIFTED = 4;
/**
 * Options bits 3..2: Alternate-handling mask. 0 for non-ignorable.
 * Reserve values 8 and 0xc for shift-trimmed and blanked.
 */
static const int32_t ALTERNATE_MASK = 0xc;
/**
 * Options bits 6..4: The 3-bit maxVariable value bit field is shifted by this value.
 */
static const int32_t MAX_VARIABLE_SHIFT = 4;
/** maxVariable options bit mask before shifting. */
static const int32_t MAX_VARIABLE_MASK = 0x70;
/** Options bit 7: Reserved/unused/0. */
/**
 * Options bit 8: Sort uppercase first if caseLevel or caseFirst is on.
 */
static const int32_t UPPER_FIRST = 0x100;
/**
 * Options bit 9: Keep the case bits in the tertiary weight (they trump other tertiary values)
 * unless case level is on (when they are *moved* into the separate case level).
 * By default, the case bits are removed from the tertiary weight (ignored).
 *
 * When CASE_FIRST is off, UPPER_FIRST must be off too, corresponding to
 * the tri-value UCOL_CASE_FIRST attribute: UCOL_OFF vs. UCOL_LOWER_FIRST vs. UCOL_UPPER_FIRST.
 */
static const int32_t CASE_FIRST = 0x200;
/**
 * Options bit mask for caseFirst and upperFirst, before shifting.
 * Same value as caseFirst==upperFirst.
 */
static const int32_t CASE_FIRST_AND_UPPER_MASK = CASE_FIRST | UPPER_FIRST;
/**
 * Options bit 10: Insert the case level between the secondary and tertiary levels.
 */
static const int32_t CASE_LEVEL = 0x400;
/**
 * Options bit 11: Compare secondary weights backwards. ("French secondary")
 */
static const int32_t BACKWARD_SECONDARY = 0x800;
/**
 * Options bits 15..12: The 4-bit strength value bit field is shifted by this value.
 * It is the top used bit field in the options. (No need to mask after shifting.)
 */
static const int32_t STRENGTH_SHIFT = 12;
/** Strength options bit mask before shifting. */
static const int32_t STRENGTH_MASK = 0xf000;
95 /** maxVariable values */
// Member-initializer list and empty body of a constructor whose declarator line
// is not visible in this excerpt.
// options starts as UCOL_DEFAULT_STRENGTH placed in the strength bit field combined
// with MAX_VAR_PUNCT placed in the maxVariable bit field; no other option bits are set here.
104 : options((UCOL_DEFAULT_STRENGTH
<< STRENGTH_SHIFT
) |
105 (MAX_VAR_PUNCT
<< MAX_VARIABLE_SHIFT
)),
// The reordering members shown below start out empty (NULL pointers, zero lengths and capacity).
// NOTE(review): initializers between options and reorderRanges are not visible here -- confirm
// against the full header that every remaining member is initialized.
109 reorderRanges(NULL
), reorderRangesLength(0),
110 reorderCodes(NULL
), reorderCodesLength(0), reorderCodesCapacity(0),
// A negative fastLatinOptions value means the CollationFastLatin options are disabled.
111 fastLatinOptions(-1) {}
113 CollationSettings(const CollationSettings
&other
);
114 virtual ~CollationSettings();
116 UBool
operator==(const CollationSettings
&other
) const;
118 inline UBool
operator!=(const CollationSettings
&other
) const {
119 return !operator==(other
);
122 int32_t hashCode() const;
/**
 * Declared here only; the definition is not part of this header.
 * NOTE(review): by its name, presumably returns the reordering members
 * (reorderTable, reorderRanges, reorderCodes) to the "no reordering" state -- confirm.
 */
void resetReordering();
125 void aliasReordering(const CollationData
&data
, const int32_t *codes
, int32_t length
,
126 const uint32_t *ranges
, int32_t rangesLength
,
127 const uint8_t *table
, UErrorCode
&errorCode
);
128 void setReordering(const CollationData
&data
, const int32_t *codes
, int32_t codesLength
,
129 UErrorCode
&errorCode
);
130 void copyReorderingFrom(const CollationSettings
&other
, UErrorCode
&errorCode
);
132 inline UBool
hasReordering() const { return reorderTable
!= NULL
; }
133 static UBool
reorderTableHasSplitBytes(const uint8_t table
[256]);
// Applies the reordering table to the primary weight p.
134 inline uint32_t reorder(uint32_t p
) const {
// b is the reorderTable entry for the lead byte (top 8 bits) of p.
// NOTE(review): reorderTable is dereferenced without a NULL check; presumably callers
// test hasReordering() first -- confirm.
135 uint8_t b
= reorderTable
[p
>> 24];
// Direct case: a non-zero table entry, or p no greater than Collation::NO_CE_PRIMARY.
136 if(b
!= 0 || p
<= Collation::NO_CE_PRIMARY
) {
// Substitute b for the lead byte and keep the lower 24 bits of p unchanged.
137 return ((uint32_t)b
<< 24) | (p
& 0xffffff);
// NOTE(review): the rest of this function (the path for b == 0 with a larger p) is not
// visible in this excerpt; presumably it hands split lead bytes to reorderEx(p) -- confirm
// against the full header.
143 void setStrength(int32_t value
, int32_t defaultOptions
, UErrorCode
&errorCode
);
145 static int32_t getStrength(int32_t options
) {
146 return options
>> STRENGTH_SHIFT
;
149 int32_t getStrength() const {
150 return getStrength(options
);
153 /** Sets the options bit for an on/off attribute. */
154 void setFlag(int32_t bit
, UColAttributeValue value
,
155 int32_t defaultOptions
, UErrorCode
&errorCode
);
157 UColAttributeValue
getFlag(int32_t bit
) const {
158 return ((options
& bit
) != 0) ? UCOL_ON
: UCOL_OFF
;
161 void setCaseFirst(UColAttributeValue value
, int32_t defaultOptions
, UErrorCode
&errorCode
);
163 UColAttributeValue
getCaseFirst() const {
164 int32_t option
= options
& CASE_FIRST_AND_UPPER_MASK
;
165 return (option
== 0) ? UCOL_OFF
:
166 (option
== CASE_FIRST
) ? UCOL_LOWER_FIRST
: UCOL_UPPER_FIRST
;
169 void setAlternateHandling(UColAttributeValue value
,
170 int32_t defaultOptions
, UErrorCode
&errorCode
);
172 UColAttributeValue
getAlternateHandling() const {
173 return ((options
& ALTERNATE_MASK
) == 0) ? UCOL_NON_IGNORABLE
: UCOL_SHIFTED
;
176 void setMaxVariable(int32_t value
, int32_t defaultOptions
, UErrorCode
&errorCode
);
178 MaxVariable
getMaxVariable() const {
179 return (MaxVariable
)((options
& MAX_VARIABLE_MASK
) >> MAX_VARIABLE_SHIFT
);
183 * Include case bits in the tertiary level if caseLevel=off and caseFirst!=off.
185 static inline UBool
isTertiaryWithCaseBits(int32_t options
) {
186 return (options
& (CASE_LEVEL
| CASE_FIRST
)) == CASE_FIRST
;
188 static uint32_t getTertiaryMask(int32_t options
) {
189 // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
190 return isTertiaryWithCaseBits(options
) ?
191 Collation::CASE_AND_TERTIARY_MASK
: Collation::ONLY_TERTIARY_MASK
;
194 static UBool
sortsTertiaryUpperCaseFirst(int32_t options
) {
195 // On tertiary level, consider case bits and sort uppercase first
196 // if caseLevel is off and caseFirst==upperFirst.
197 return (options
& (CASE_LEVEL
| CASE_FIRST_AND_UPPER_MASK
)) == CASE_FIRST_AND_UPPER_MASK
;
200 inline UBool
dontCheckFCD() const {
201 return (options
& CHECK_FCD
) == 0;
204 inline UBool
hasBackwardSecondary() const {
205 return (options
& BACKWARD_SECONDARY
) != 0;
208 inline UBool
isNumeric() const {
209 return (options
& NUMERIC
) != 0;
212 /** CHECK_FCD etc. */
/** Variable-top primary weight. */
uint32_t variableTop;
/**
 * 256-byte table for reordering permutation of primary lead bytes; NULL if no reordering.
 * A 0 entry at a non-zero index means that the primary lead byte is "split"
 * (there are different offsets for primaries that share that lead byte)
 * and the reordering offset must be determined via the reorderRanges.
 */
const uint8_t *reorderTable;
/** Limit of last reordered range. 0 if no reordering or no split bytes. */
uint32_t minHighNoReorder;
/**
 * Primary-weight ranges for script reordering,
 * to be used by reorder(p) for split-reordered primary lead bytes.
 *
 * Each entry is a (limit, offset) pair.
 * The upper 16 bits of the entry are the upper 16 bits of the
 * exclusive primary limit of a range.
 * Primaries between the previous limit and this one have their lead bytes
 * modified by the signed offset (-0xff..+0xff) stored in the lower 16 bits.
 *
 * CollationData::makeReorderRanges() writes a full list where the first range
 * (at least for terminators and separators) has a 0 offset.
 * The last range has a non-zero offset.
 * minHighNoReorder is set to the limit of that last range.
 *
 * In the settings object, the initial ranges before the first split lead byte
 * are omitted for efficiency; they are handled by reorder(p) via the reorderTable.
 * If there are no split-reordered lead bytes, then no ranges are needed.
 */
const uint32_t *reorderRanges;
/** Number of entries in reorderRanges. */
int32_t reorderRangesLength;
/** Array of reorder codes; ignored if reorderCodesLength == 0. */
const int32_t *reorderCodes;
/** Number of reorder codes; 0 if no reordering. */
int32_t reorderCodesLength;
/**
 * Capacity of reorderCodes.
 * If 0, then the codes, the ranges, and the table are aliases.
 * Otherwise, this object owns the memory via the reorderCodes pointer;
 * the codes, the ranges, and the table are in the same memory block, in that order.
 */
int32_t reorderCodesCapacity;

/** Options for CollationFastLatin. Negative if disabled. */
int32_t fastLatinOptions;
/**
 * Array of 0x180 16-bit values used together with fastLatinOptions.
 * NOTE(review): presumably per-character primary data for CollationFastLatin,
 * meaningful only while fastLatinOptions is non-negative -- confirm.
 */
uint16_t fastLatinPrimaries[0x180];
263 void setReorderArrays(const int32_t *codes
, int32_t codesLength
,
264 const uint32_t *ranges
, int32_t rangesLength
,
265 const uint8_t *table
, UErrorCode
&errorCode
);
266 uint32_t reorderEx(uint32_t p
) const;
271 #endif // !UCONFIG_NO_COLLATION
272 #endif // __COLLATIONSETTINGS_H__