/*
 * Listing source: git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/collationdata.h
 *******************************************************************************
 * Copyright (C) 2010-2014, International Business Machines
 * Corporation and others. All Rights Reserved.
 *******************************************************************************
 *
 * created on: 2010oct27
 * created by: Markus W. Scherer
 */
12 #ifndef __COLLATIONDATA_H__
13 #define __COLLATIONDATA_H__
15 #include "unicode/utypes.h"
17 #if !UCONFIG_NO_COLLATION
19 #include "unicode/uniset.h"
20 #include "collation.h"
21 #include "normalizer2impl.h"
29 * Collation data container.
30 * Immutable data created by a CollationDataBuilder, or loaded from a file,
31 * or deserialized from API-provided binary data.
33 * Includes data for the collation base (root/default), aliased if this is not the base.
35 struct U_I18N_API CollationData
: public UMemory
{
36 CollationData(const Normalizer2Impl
&nfc
)
38 ce32s(NULL
), ces(NULL
), contexts(NULL
), base(NULL
),
41 numericPrimary(0x12000000),
42 ce32sLength(0), cesLength(0), contextsLength(0),
43 compressibleBytes(NULL
),
44 unsafeBackwardSet(NULL
),
45 fastLatinTable(NULL
), fastLatinTableLength(0),
46 scripts(NULL
), scriptsLength(0),
47 rootElements(NULL
), rootElementsLength(0) {}
49 uint32_t getCE32(UChar32 c
) const {
50 return UTRIE2_GET32(trie
, c
);
53 uint32_t getCE32FromSupplementary(UChar32 c
) const {
54 return UTRIE2_GET32_FROM_SUPP(trie
, c
);
57 UBool
isDigit(UChar32 c
) const {
58 return c
< 0x660 ? c
<= 0x39 && 0x30 <= c
:
59 Collation::hasCE32Tag(getCE32(c
), Collation::DIGIT_TAG
);
62 UBool
isUnsafeBackward(UChar32 c
, UBool numeric
) const {
63 return unsafeBackwardSet
->contains(c
) || (numeric
&& isDigit(c
));
66 UBool
isCompressibleLeadByte(uint32_t b
) const {
67 return compressibleBytes
[b
];
70 inline UBool
isCompressiblePrimary(uint32_t p
) const {
71 return isCompressibleLeadByte(p
>> 24);
75 * Returns the CE32 from two contexts words.
76 * Access to the defaultCE32 for contraction and prefix matching.
78 static uint32_t readCE32(const UChar
*p
) {
79 return ((uint32_t)p
[0] << 16) | p
[1];
83 * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG).
84 * Requires that ce32 is special.
86 uint32_t getIndirectCE32(uint32_t ce32
) const;
88 * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG),
91 uint32_t getFinalCE32(uint32_t ce32
) const;
94 * Computes a CE from c's ce32 which has the OFFSET_TAG.
96 int64_t getCEFromOffsetCE32(UChar32 c
, uint32_t ce32
) const {
97 int64_t dataCE
= ces
[Collation::indexFromCE32(ce32
)];
98 return Collation::makeCE(Collation::getThreeBytePrimaryForOffsetData(c
, dataCE
));
102 * Returns the FCD16 value for code point c. c must be >= 0.
104 uint16_t getFCD16(UChar32 c
) const {
105 return nfcImpl
.getFCD16(c
);
109 * Returns the first primary for the script's reordering group.
110 * @return the primary with only the first primary lead byte of the group
111 * (not necessarily an actual root collator primary weight),
112 * or 0 if the script is unknown
114 uint32_t getFirstPrimaryForGroup(int32_t script
) const;
117 * Returns the last primary for the script's reordering group.
118 * @return the last primary of the group
119 * (not an actual root collator primary weight),
120 * or 0 if the script is unknown
122 uint32_t getLastPrimaryForGroup(int32_t script
) const;
125 * Finds the reordering group which contains the primary weight.
126 * @return the first script of the group, or -1 if the weight is beyond the last group
128 int32_t getGroupForPrimary(uint32_t p
) const;
130 int32_t getEquivalentScripts(int32_t script
,
131 int32_t dest
[], int32_t capacity
, UErrorCode
&errorCode
) const;
134 * Writes the permutation table for the given reordering of scripts and groups,
135 * mapping from default-order primary-weight lead bytes to reordered lead bytes.
136 * The caller checks for illegal arguments and
137 * takes care of [DEFAULT] and memory allocation.
139 void makeReorderTable(const int32_t *reorder
, int32_t length
,
140 uint8_t table
[256], UErrorCode
&errorCode
) const;
142 /** @see jamoCE32s */
143 static const int32_t JAMO_CE32S_LENGTH
= 19 + 21 + 27;
145 /** Main lookup trie. */
148 * Array of CE32 values.
149 * At index 0 there must be CE32(U+0000)
150 * to support U+0000's special-tag for NUL-termination handling.
152 const uint32_t *ce32s
;
153 /** Array of CE values for expansions and OFFSET_TAG. */
155 /** Array of prefix and contraction-suffix matching data. */
156 const UChar
*contexts
;
157 /** Base collation data, or NULL if this data itself is a base. */
158 const CollationData
*base
;
160 * Simple array of JAMO_CE32S_LENGTH=19+21+27 CE32s, one per canonical Jamo L/V/T.
161 * They are normally simple CE32s, rarely expansions.
162 * For fast handling of HANGUL_TAG.
164 const uint32_t *jamoCE32s
;
165 const Normalizer2Impl
&nfcImpl
;
166 /** The single-byte primary weight (xx000000) for numeric collation. */
167 uint32_t numericPrimary
;
171 int32_t contextsLength
;
173 /** 256 flags for which primary-weight lead bytes are compressible. */
174 const UBool
*compressibleBytes
;
176 * Set of code points that are unsafe for starting string comparison after an identical prefix,
177 * or in backwards CE iteration.
179 const UnicodeSet
*unsafeBackwardSet
;
182 * Fast Latin table for common-Latin-text string comparisons.
183 * Data structure see class CollationFastLatin.
185 const uint16_t *fastLatinTable
;
186 int32_t fastLatinTableLength
;
189 * Data for scripts and reordering groups.
190 * Uses include building a reordering permutation table and
191 * providing script boundaries to AlphabeticIndex.
193 * This data is a sorted list of primary-weight lead byte ranges (reordering groups),
194 * each with a list of pairs sorted in base collation order;
195 * each pair contains a script/reorder code and the lowest primary weight for that script.
198 * - Each reordering group is encoded in n+2 16-bit integers.
200 * Bits 15..8: First byte of the reordering group's range.
201 * Bits 7..0: Last byte of the reordering group's range.
203 * Length n of the list of script/reordering codes.
204 * - Each further integer is a script or reordering code.
206 const uint16_t *scripts
;
207 int32_t scriptsLength
;
210 * Collation elements in the root collator.
211 * Used by the CollationRootElements class. The data structure is described there.
212 * NULL in a tailoring.
214 const uint32_t *rootElements
;
215 int32_t rootElementsLength
;
218 int32_t findScript(int32_t script
) const;
223 #endif // !UCONFIG_NO_COLLATION
224 #endif // __COLLATIONDATA_H__