2 *******************************************************************************
3 * Copyright (C) 2010-2015, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
8 * created on: 2010oct27
9 * created by: Markus W. Scherer
12 #ifndef __COLLATIONDATA_H__
13 #define __COLLATIONDATA_H__
15 #include "unicode/utypes.h"
17 #if !UCONFIG_NO_COLLATION
19 #include "unicode/ucol.h"
20 #include "unicode/uniset.h"
21 #include "collation.h"
22 #include "normalizer2impl.h"
32 * Collation data container.
33 * Immutable data created by a CollationDataBuilder, or loaded from a file,
34 * or deserialized from API-provided binary data.
36 * Includes data for the collation base (root/default), aliased if this is not the base.
38 struct U_I18N_API CollationData
: public UMemory
{
39 // Note: The ucadata.icu loader could discover the reserved ranges by setting an array
40 // parallel with the ranges, and resetting ranges that are indexed.
41 // The reordering builder code could clone the resulting template array.
43 REORDER_RESERVED_BEFORE_LATIN
= UCOL_REORDER_CODE_FIRST
+ 14,
44 REORDER_RESERVED_AFTER_LATIN
48 MAX_NUM_SPECIAL_REORDER_CODES
= 8,
49 /** C++ only, data reader check scriptStartsLength. */
50 MAX_NUM_SCRIPT_RANGES
= 256
53 CollationData(const Normalizer2Impl
&nfc
)
55 ce32s(NULL
), ces(NULL
), contexts(NULL
), base(NULL
),
58 numericPrimary(0x12000000),
59 ce32sLength(0), cesLength(0), contextsLength(0),
60 compressibleBytes(NULL
),
61 unsafeBackwardSet(NULL
),
62 fastLatinTable(NULL
), fastLatinTableLength(0),
63 numScripts(0), scriptsIndex(NULL
), scriptStarts(NULL
), scriptStartsLength(0),
64 rootElements(NULL
), rootElementsLength(0) {}
66 uint32_t getCE32(UChar32 c
) const {
67 return UTRIE2_GET32(trie
, c
);
70 uint32_t getCE32FromSupplementary(UChar32 c
) const {
71 return UTRIE2_GET32_FROM_SUPP(trie
, c
);
74 UBool
isDigit(UChar32 c
) const {
75 return c
< 0x660 ? c
<= 0x39 && 0x30 <= c
:
76 Collation::hasCE32Tag(getCE32(c
), Collation::DIGIT_TAG
);
79 UBool
isUnsafeBackward(UChar32 c
, UBool numeric
) const {
80 return unsafeBackwardSet
->contains(c
) || (numeric
&& isDigit(c
));
83 UBool
isCompressibleLeadByte(uint32_t b
) const {
84 return compressibleBytes
[b
];
87 inline UBool
isCompressiblePrimary(uint32_t p
) const {
88 return isCompressibleLeadByte(p
>> 24);
92 * Returns the CE32 from two contexts words.
93 * Access to the defaultCE32 for contraction and prefix matching.
95 static uint32_t readCE32(const UChar
*p
) {
96 return ((uint32_t)p
[0] << 16) | p
[1];
100 * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG).
101 * Requires that ce32 is special.
103 uint32_t getIndirectCE32(uint32_t ce32
) const;
105 * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG),
106 * if ce32 is special.
108 uint32_t getFinalCE32(uint32_t ce32
) const;
111 * Computes a CE from c's ce32 which has the OFFSET_TAG.
113 int64_t getCEFromOffsetCE32(UChar32 c
, uint32_t ce32
) const {
114 int64_t dataCE
= ces
[Collation::indexFromCE32(ce32
)];
115 return Collation::makeCE(Collation::getThreeBytePrimaryForOffsetData(c
, dataCE
));
119 * Returns the single CE that c maps to.
120 * Sets U_UNSUPPORTED_ERROR if c does not map to a single CE.
122 int64_t getSingleCE(UChar32 c
, UErrorCode
&errorCode
) const;
125 * Returns the FCD16 value for code point c. c must be >= 0.
127 uint16_t getFCD16(UChar32 c
) const {
128 return nfcImpl
.getFCD16(c
);
132 * Returns the first primary for the script's reordering group.
133 * @return the primary with only the first primary lead byte of the group
134 * (not necessarily an actual root collator primary weight),
135 * or 0 if the script is unknown
137 uint32_t getFirstPrimaryForGroup(int32_t script
) const;
140 * Returns the last primary for the script's reordering group.
141 * @return the last primary of the group
142 * (not an actual root collator primary weight),
143 * or 0 if the script is unknown
145 uint32_t getLastPrimaryForGroup(int32_t script
) const;
148 * Finds the reordering group which contains the primary weight.
149 * @return the first script of the group, or -1 if the weight is beyond the last group
151 int32_t getGroupForPrimary(uint32_t p
) const;
153 int32_t getEquivalentScripts(int32_t script
,
154 int32_t dest
[], int32_t capacity
, UErrorCode
&errorCode
) const;
157 * Writes the permutation of primary-weight ranges
158 * for the given reordering of scripts and groups.
159 * The caller checks for illegal arguments and
160 * takes care of [DEFAULT] and memory allocation.
162 * Each list element will be a (limit, offset) pair as described
163 * for the CollationSettings::reorderRanges.
164 * The list will be empty if no ranges are reordered.
166 void makeReorderRanges(const int32_t *reorder
, int32_t length
,
167 UVector32
&ranges
, UErrorCode
&errorCode
) const;
169 /** @see jamoCE32s */
170 static const int32_t JAMO_CE32S_LENGTH
= 19 + 21 + 27;
172 /** Main lookup trie. */
175 * Array of CE32 values.
176 * At index 0 there must be CE32(U+0000)
177 * to support U+0000's special-tag for NUL-termination handling.
179 const uint32_t *ce32s
;
180 /** Array of CE values for expansions and OFFSET_TAG. */
182 /** Array of prefix and contraction-suffix matching data. */
183 const UChar
*contexts
;
184 /** Base collation data, or NULL if this data itself is a base. */
185 const CollationData
*base
;
187 * Simple array of JAMO_CE32S_LENGTH=19+21+27 CE32s, one per canonical Jamo L/V/T.
188 * They are normally simple CE32s, rarely expansions.
189 * For fast handling of HANGUL_TAG.
191 const uint32_t *jamoCE32s
;
192 const Normalizer2Impl
&nfcImpl
;
193 /** The single-byte primary weight (xx000000) for numeric collation. */
194 uint32_t numericPrimary
;
198 int32_t contextsLength
;
200 /** 256 flags for which primary-weight lead bytes are compressible. */
201 const UBool
*compressibleBytes
;
203 * Set of code points that are unsafe for starting string comparison after an identical prefix,
204 * or in backwards CE iteration.
206 const UnicodeSet
*unsafeBackwardSet
;
209 * Fast Latin table for common-Latin-text string comparisons.
210 * Data structure see class CollationFastLatin.
212 const uint16_t *fastLatinTable
;
213 int32_t fastLatinTableLength
;
216 * Data for scripts and reordering groups.
217 * Uses include building a reordering permutation table and
218 * providing script boundaries to AlphabeticIndex.
222 * The length of scriptsIndex is numScripts+16.
223 * It maps from a UScriptCode or a special reorder code to an entry in scriptStarts.
224 * 16 special reorder codes (not all used) are mapped starting at numScripts.
225 * Up to MAX_NUM_SPECIAL_REORDER_CODES are codes for special groups like space/punct/digit.
226 * There are special codes at the end for reorder-reserved primary ranges.
228 * Multiple scripts may share a range and index, for example Hira & Kana.
230 const uint16_t *scriptsIndex
;
232 * Start primary weight (top 16 bits only) for a group/script/reserved range
233 * indexed by scriptsIndex.
234 * The first range (separators & terminators) and the last range (trailing weights)
235 * are not reorderable, and no scriptsIndex entry points to them.
237 const uint16_t *scriptStarts
;
238 int32_t scriptStartsLength
;
241 * Collation elements in the root collator.
242 * Used by the CollationRootElements class. The data structure is described there.
243 * NULL in a tailoring.
245 const uint32_t *rootElements
;
246 int32_t rootElementsLength
;
249 int32_t getScriptIndex(int32_t script
) const;
250 void makeReorderRanges(const int32_t *reorder
, int32_t length
,
252 UVector32
&ranges
, UErrorCode
&errorCode
) const;
253 int32_t addLowScriptRange(uint8_t table
[], int32_t index
, int32_t lowStart
) const;
254 int32_t addHighScriptRange(uint8_t table
[], int32_t index
, int32_t highLimit
) const;
259 #endif // !UCONFIG_NO_COLLATION
260 #endif // __COLLATIONDATA_H__