// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2010-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*
* created on: 2010oct27
* created by: Markus W. Scherer
*/
14 #ifndef __COLLATIONDATA_H__
15 #define __COLLATIONDATA_H__
17 #include "unicode/utypes.h"
19 #if !UCONFIG_NO_COLLATION
21 #include "unicode/ucol.h"
22 #include "unicode/uniset.h"
23 #include "collation.h"
24 #include "normalizer2impl.h"
/**
 * Collation data container.
 * Immutable data created by a CollationDataBuilder, or loaded from a file,
 * or deserialized from API-provided binary data.
 *
 * Includes data for the collation base (root/default), aliased if this is not the base.
 */
40 struct U_I18N_API CollationData
: public UMemory
{
41 // Note: The ucadata.icu loader could discover the reserved ranges by setting an array
42 // parallel with the ranges, and resetting ranges that are indexed.
43 // The reordering builder code could clone the resulting template array.
45 REORDER_RESERVED_BEFORE_LATIN
= UCOL_REORDER_CODE_FIRST
+ 14,
46 REORDER_RESERVED_AFTER_LATIN
50 MAX_NUM_SPECIAL_REORDER_CODES
= 8,
51 /** C++ only, data reader check scriptStartsLength. */
52 MAX_NUM_SCRIPT_RANGES
= 256
55 CollationData(const Normalizer2Impl
&nfc
)
57 ce32s(NULL
), ces(NULL
), contexts(NULL
), base(NULL
),
60 numericPrimary(0x12000000),
61 ce32sLength(0), cesLength(0), contextsLength(0),
62 compressibleBytes(NULL
),
63 unsafeBackwardSet(NULL
),
64 fastLatinTable(NULL
), fastLatinTableLength(0),
65 numScripts(0), scriptsIndex(NULL
), scriptStarts(NULL
), scriptStartsLength(0),
66 rootElements(NULL
), rootElementsLength(0) {}
68 uint32_t getCE32(UChar32 c
) const {
69 return UTRIE2_GET32(trie
, c
);
72 uint32_t getCE32FromSupplementary(UChar32 c
) const {
73 return UTRIE2_GET32_FROM_SUPP(trie
, c
);
76 UBool
isDigit(UChar32 c
) const {
77 return c
< 0x660 ? c
<= 0x39 && 0x30 <= c
:
78 Collation::hasCE32Tag(getCE32(c
), Collation::DIGIT_TAG
);
81 UBool
isUnsafeBackward(UChar32 c
, UBool numeric
) const {
82 return unsafeBackwardSet
->contains(c
) || (numeric
&& isDigit(c
));
85 UBool
isCompressibleLeadByte(uint32_t b
) const {
86 return compressibleBytes
[b
];
89 inline UBool
isCompressiblePrimary(uint32_t p
) const {
90 return isCompressibleLeadByte(p
>> 24);
94 * Returns the CE32 from two contexts words.
95 * Access to the defaultCE32 for contraction and prefix matching.
97 static uint32_t readCE32(const UChar
*p
) {
98 return ((uint32_t)p
[0] << 16) | p
[1];
102 * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG).
103 * Requires that ce32 is special.
105 uint32_t getIndirectCE32(uint32_t ce32
) const;
107 * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG),
108 * if ce32 is special.
110 uint32_t getFinalCE32(uint32_t ce32
) const;
113 * Computes a CE from c's ce32 which has the OFFSET_TAG.
115 int64_t getCEFromOffsetCE32(UChar32 c
, uint32_t ce32
) const {
116 int64_t dataCE
= ces
[Collation::indexFromCE32(ce32
)];
117 return Collation::makeCE(Collation::getThreeBytePrimaryForOffsetData(c
, dataCE
));
121 * Returns the single CE that c maps to.
122 * Sets U_UNSUPPORTED_ERROR if c does not map to a single CE.
124 int64_t getSingleCE(UChar32 c
, UErrorCode
&errorCode
) const;
127 * Returns the FCD16 value for code point c. c must be >= 0.
129 uint16_t getFCD16(UChar32 c
) const {
130 return nfcImpl
.getFCD16(c
);
134 * Returns the first primary for the script's reordering group.
135 * @return the primary with only the first primary lead byte of the group
136 * (not necessarily an actual root collator primary weight),
137 * or 0 if the script is unknown
139 uint32_t getFirstPrimaryForGroup(int32_t script
) const;
142 * Returns the last primary for the script's reordering group.
143 * @return the last primary of the group
144 * (not an actual root collator primary weight),
145 * or 0 if the script is unknown
147 uint32_t getLastPrimaryForGroup(int32_t script
) const;
150 * Finds the reordering group which contains the primary weight.
151 * @return the first script of the group, or -1 if the weight is beyond the last group
153 int32_t getGroupForPrimary(uint32_t p
) const;
155 int32_t getEquivalentScripts(int32_t script
,
156 int32_t dest
[], int32_t capacity
, UErrorCode
&errorCode
) const;
159 * Writes the permutation of primary-weight ranges
160 * for the given reordering of scripts and groups.
161 * The caller checks for illegal arguments and
162 * takes care of [DEFAULT] and memory allocation.
164 * Each list element will be a (limit, offset) pair as described
165 * for the CollationSettings::reorderRanges.
166 * The list will be empty if no ranges are reordered.
168 void makeReorderRanges(const int32_t *reorder
, int32_t length
,
169 UVector32
&ranges
, UErrorCode
&errorCode
) const;
/**
 * Number of canonical Jamo: 19 L + 21 V + 27 T.
 * @see jamoCE32s
 */
static const int32_t JAMO_CE32S_LENGTH = 19 + 21 + 27;
174 /** Main lookup trie. */
177 * Array of CE32 values.
178 * At index 0 there must be CE32(U+0000)
179 * to support U+0000's special-tag for NUL-termination handling.
181 const uint32_t *ce32s
;
182 /** Array of CE values for expansions and OFFSET_TAG. */
184 /** Array of prefix and contraction-suffix matching data. */
185 const UChar
*contexts
;
186 /** Base collation data, or NULL if this data itself is a base. */
187 const CollationData
*base
;
189 * Simple array of JAMO_CE32S_LENGTH=19+21+27 CE32s, one per canonical Jamo L/V/T.
190 * They are normally simple CE32s, rarely expansions.
191 * For fast handling of HANGUL_TAG.
193 const uint32_t *jamoCE32s
;
194 const Normalizer2Impl
&nfcImpl
;
195 /** The single-byte primary weight (xx000000) for numeric collation. */
196 uint32_t numericPrimary
;
200 int32_t contextsLength
;
202 /** 256 flags for which primary-weight lead bytes are compressible. */
203 const UBool
*compressibleBytes
;
205 * Set of code points that are unsafe for starting string comparison after an identical prefix,
206 * or in backwards CE iteration.
208 const UnicodeSet
*unsafeBackwardSet
;
211 * Fast Latin table for common-Latin-text string comparisons.
212 * Data structure see class CollationFastLatin.
214 const uint16_t *fastLatinTable
;
215 int32_t fastLatinTableLength
;
218 * Data for scripts and reordering groups.
219 * Uses include building a reordering permutation table and
220 * providing script boundaries to AlphabeticIndex.
224 * The length of scriptsIndex is numScripts+16.
225 * It maps from a UScriptCode or a special reorder code to an entry in scriptStarts.
226 * 16 special reorder codes (not all used) are mapped starting at numScripts.
227 * Up to MAX_NUM_SPECIAL_REORDER_CODES are codes for special groups like space/punct/digit.
228 * There are special codes at the end for reorder-reserved primary ranges.
230 * Multiple scripts may share a range and index, for example Hira & Kana.
232 const uint16_t *scriptsIndex
;
234 * Start primary weight (top 16 bits only) for a group/script/reserved range
235 * indexed by scriptsIndex.
236 * The first range (separators & terminators) and the last range (trailing weights)
237 * are not reorderable, and no scriptsIndex entry points to them.
239 const uint16_t *scriptStarts
;
240 int32_t scriptStartsLength
;
243 * Collation elements in the root collator.
244 * Used by the CollationRootElements class. The data structure is described there.
245 * NULL in a tailoring.
247 const uint32_t *rootElements
;
248 int32_t rootElementsLength
;
251 int32_t getScriptIndex(int32_t script
) const;
252 void makeReorderRanges(const int32_t *reorder
, int32_t length
,
254 UVector32
&ranges
, UErrorCode
&errorCode
) const;
255 int32_t addLowScriptRange(uint8_t table
[], int32_t index
, int32_t lowStart
) const;
256 int32_t addHighScriptRange(uint8_t table
[], int32_t index
, int32_t highLimit
) const;
261 #endif // !UCONFIG_NO_COLLATION
262 #endif // __COLLATIONDATA_H__