]>
Commit | Line | Data |
---|---|---|
57a6839d A |
1 | /* |
2 | ******************************************************************************* | |
3 | * Copyright (C) 2012-2014, International Business Machines | |
4 | * Corporation and others. All Rights Reserved. | |
5 | ******************************************************************************* | |
6 | * collationdatabuilder.h | |
7 | * | |
8 | * created on: 2012apr01 | |
9 | * created by: Markus W. Scherer | |
10 | */ | |
11 | ||
12 | #ifndef __COLLATIONDATABUILDER_H__ | |
13 | #define __COLLATIONDATABUILDER_H__ | |
14 | ||
15 | #include "unicode/utypes.h" | |
16 | ||
17 | #if !UCONFIG_NO_COLLATION | |
18 | ||
19 | #include "unicode/uniset.h" | |
20 | #include "unicode/unistr.h" | |
21 | #include "unicode/uversion.h" | |
22 | #include "collation.h" | |
23 | #include "collationdata.h" | |
24 | #include "collationsettings.h" | |
25 | #include "normalizer2impl.h" | |
26 | #include "utrie2.h" | |
27 | #include "uvectr32.h" | |
28 | #include "uvectr64.h" | |
29 | #include "uvector.h" | |
30 | ||
31 | U_NAMESPACE_BEGIN | |
32 | ||
33 | struct ConditionalCE32; | |
34 | /* Forward declarations: this header only uses these types through pointers, references and friend declarations, so their full definitions are not needed here. */ ||
35 | class CollationFastLatinBuilder; | |
36 | class CopyHelper; | |
37 | class DataBuilderCollationIterator; | |
38 | class UCharsTrieBuilder; | |
39 | ||
40 | /** | |
41 | * Low-level CollationData builder. | |
42 | * Takes (character, CE) pairs and builds them into runtime data structures. | |
43 | * Supports characters with context prefixes and contraction suffixes. | |
44 | */ | |
45 | class U_I18N_API CollationDataBuilder : public UObject { | |
46 | public: | |
47 | /** | |
48 | * Collation element modifier. Interface class for a modifier | |
49 | * that changes a tailoring builder's temporary CEs to final CEs. | |
50 | * Called for every non-special CE32 and every expansion CE. | |
51 | */ | |
52 | class CEModifier : public UObject { | |
53 | public: | |
54 | virtual ~CEModifier(); | |
55 | /** Returns a new CE to replace the non-special input CE32, or else Collation::NO_CE. */ | |
56 | virtual int64_t modifyCE32(uint32_t ce32) const = 0; | |
57 | /** Returns a new CE to replace the input CE, or else Collation::NO_CE. */ | |
58 | virtual int64_t modifyCE(int64_t ce) const = 0; | |
59 | }; | |
60 | /** Constructs a builder. @param errorCode ICU in/out error code */ ||
61 | CollationDataBuilder(UErrorCode &errorCode); | |
62 | /** Virtual destructor; the class also declares the virtual methods isCompressibleLeadByte(), encodeCEs() and build(). */ ||
63 | virtual ~CollationDataBuilder(); | |
64 | /** NOTE(review): presumably prepares this builder for tailoring on top of the base data b; confirm against the implementation file. */ ||
65 | void initForTailoring(const CollationData *b, UErrorCode &errorCode); | |
66 | /** Virtual query for a lead byte b. isCompressiblePrimary() passes it the top 8 bits of a 32-bit primary. */ ||
67 | virtual UBool isCompressibleLeadByte(uint32_t b) const; | |
68 | /** Returns isCompressibleLeadByte() for the most significant byte (p >> 24) of p. */ ||
69 | inline UBool isCompressiblePrimary(uint32_t p) const { | |
70 | return isCompressibleLeadByte(p >> 24); | |
71 | } | |
72 | ||
73 | /** | |
74 | * @return TRUE if this builder has mappings (e.g., add() has been called) | |
75 | */ | |
76 | UBool hasMappings() const { return modified; } | |
77 | ||
78 | /** | |
79 | * @return TRUE if c has CEs in this builder | |
80 | */ | |
81 | UBool isAssigned(UChar32 c) const; | |
82 | ||
83 | /** | |
84 | * @return the three-byte primary if c maps to a single such CE and has no context data, | |
85 | * otherwise returns 0. | |
86 | */ | |
87 | uint32_t getLongPrimaryIfSingleCE(UChar32 c) const; | |
88 | ||
89 | /** | |
90 | * @return the single CE for c. | |
91 | * Sets an error code if c does not have a single CE. | |
92 | */ | |
93 | int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const; | |
94 | /** Adds a mapping from CEs. Per the note on encodeCEs() below, add(p, s, ces, cesLength) is the same as addCE32(p, s, encodeCEs(ces, cesLength)). */ ||
95 | void add(const UnicodeString &prefix, const UnicodeString &s, | |
96 | const int64_t ces[], int32_t cesLength, | |
97 | UErrorCode &errorCode); | |
98 | ||
99 | /** | |
100 | * Encodes the ces as either the returned ce32 by itself, | |
101 | * or by storing an expansion, with the returned ce32 referring to that. | |
102 | * | |
103 | * add(p, s, ces, cesLength) = addCE32(p, s, encodeCEs(ces, cesLength)) | |
104 | */ | |
105 | virtual uint32_t encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode); | |
106 | void addCE32(const UnicodeString &prefix, const UnicodeString &s, | |
107 | uint32_t ce32, UErrorCode &errorCode); | |
108 | ||
109 | /** | |
110 | * Sets three-byte-primary CEs for a range of code points in code point order, | |
111 | * if it is worth doing; otherwise no change is made. | |
112 | * None of the code points in the range should have complex mappings so far | |
113 | * (expansions/contractions/prefixes). | |
114 | * @param start first code point | |
115 | * @param end last code point (inclusive) | |
116 | * @param primary primary weight for 'start' | |
117 | * @param step per-code point primary-weight increment | |
118 | * @param errorCode ICU in/out error code | |
119 | * @return TRUE if an OFFSET_TAG range was used for start..end | |
120 | */ | |
121 | UBool maybeSetPrimaryRange(UChar32 start, UChar32 end, | |
122 | uint32_t primary, int32_t step, | |
123 | UErrorCode &errorCode); | |
124 | ||
125 | /** | |
126 | * Sets three-byte-primary CEs for a range of code points in code point order. | |
127 | * Sets range values if that is worth doing, or else individual values. | |
128 | * None of the code points in the range should have complex mappings so far | |
129 | * (expansions/contractions/prefixes). | |
130 | * @param start first code point | |
131 | * @param end last code point (inclusive) | |
132 | * @param primary primary weight for 'start' | |
133 | * @param step per-code point primary-weight increment | |
134 | * @param errorCode ICU in/out error code | |
135 | * @return the next primary after 'end': start primary incremented by ((end-start)+1)*step | |
136 | */ | |
137 | uint32_t setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end, | |
138 | uint32_t primary, int32_t step, | |
139 | UErrorCode &errorCode); | |
140 | ||
141 | /** | |
142 | * Copies all mappings from the src builder, with modifications. | |
143 | * This builder here must not be built yet, and should be empty. | |
144 | */ | |
145 | void copyFrom(const CollationDataBuilder &src, const CEModifier &modifier, | |
146 | UErrorCode &errorCode); | |
147 | /** NOTE(review): optimize() and suppressContractions() each take a UnicodeSet; what they do with it is defined in the implementation file and is not visible in this header. */ ||
148 | void optimize(const UnicodeSet &set, UErrorCode &errorCode); | |
149 | void suppressContractions(const UnicodeSet &set, UErrorCode &errorCode); | |
150 | /** enableFastLatin() only sets the fastLatinEnabled flag. build() is virtual and takes a CollationData by non-const reference (presumably the output; confirm in the implementation file). */ ||
151 | void enableFastLatin() { fastLatinEnabled = TRUE; } | |
152 | virtual void build(CollationData &data, UErrorCode &errorCode); | |
153 | ||
154 | /** | |
155 | * Looks up CEs for s and appends them to the ces array. | |
156 | * Does not handle normalization: s should be in FCD form. | |
157 | * | |
158 | * Does not write completely ignorable CEs. | |
159 | * Does not write beyond Collation::MAX_EXPANSION_LENGTH. | |
160 | * | |
161 | * @return incremented cesLength | |
162 | */ | |
163 | int32_t getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength); | |
164 | int32_t getCEs(const UnicodeString &prefix, const UnicodeString &s, | |
165 | int64_t ces[], int32_t cesLength); | |
166 | /* Everything below is protected; the friend classes CopyHelper and DataBuilderCollationIterator are granted access as well. */ ||
167 | protected: | |
168 | friend class CopyHelper; | |
169 | friend class DataBuilderCollationIterator; | |
170 | ||
171 | uint32_t getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const; | |
172 | /** NOTE(review): the int32_t results are presumably indexes into ce64s, ce32s and conditionalCE32s respectively; confirm in the implementation file. */ ||
173 | int32_t addCE(int64_t ce, UErrorCode &errorCode); | |
174 | int32_t addCE32(uint32_t ce32, UErrorCode &errorCode); | |
175 | int32_t addConditionalCE32(const UnicodeString &context, uint32_t ce32, UErrorCode &errorCode); | |
176 | /** Accessors for conditionalCE32s: by vector index, or by the index that Collation::indexFromCE32() extracts from ce32. These functions do not validate the index themselves. */ ||
177 | inline ConditionalCE32 *getConditionalCE32(int32_t index) const { | |
178 | return static_cast<ConditionalCE32 *>(conditionalCE32s[index]); | |
179 | } | |
180 | inline ConditionalCE32 *getConditionalCE32ForCE32(uint32_t ce32) const { | |
181 | return getConditionalCE32(Collation::indexFromCE32(ce32)); | |
182 | } | |
183 | /** Create and recognize CE32 values that carry Collation::BUILDER_DATA_TAG, using Collation::makeCE32FromTagAndIndex() and Collation::hasCE32Tag(). */ ||
184 | static uint32_t makeBuilderContextCE32(int32_t index) { | |
185 | return Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, index); | |
186 | } | |
187 | static inline UBool isBuilderContextCE32(uint32_t ce32) { | |
188 | return Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG); | |
189 | } | |
190 | ||
191 | static uint32_t encodeOneCEAsCE32(int64_t ce); | |
192 | uint32_t encodeOneCE(int64_t ce, UErrorCode &errorCode); | |
193 | uint32_t encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode); | |
194 | uint32_t encodeExpansion32(const int32_t newCE32s[], int32_t length, UErrorCode &errorCode); | |
195 | ||
196 | uint32_t copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext, UErrorCode &errorCode); | |
197 | /** | |
198 | * Copies base contractions to a list of ConditionalCE32. | |
199 | * Sets cond->next to the index of the first new item | |
200 | * and returns the index of the last new item. | |
201 | */ | |
202 | int32_t copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32, | |
203 | ConditionalCE32 *cond, UErrorCode &errorCode); | |
204 | ||
205 | UBool getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode); | |
206 | void setDigitTags(UErrorCode &errorCode); | |
207 | void setLeadSurrogates(UErrorCode &errorCode); | |
208 | ||
209 | void buildMappings(CollationData &data, UErrorCode &errorCode); | |
210 | ||
211 | void clearContexts(); | |
212 | void buildContexts(UErrorCode &errorCode); | |
213 | uint32_t buildContext(ConditionalCE32 *head, UErrorCode &errorCode); | |
214 | int32_t addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder, | |
215 | UErrorCode &errorCode); | |
216 | ||
217 | void buildFastLatinTable(CollationData &data, UErrorCode &errorCode); | |
218 | /* Protected overload of the public getCEs() functions with an extra start parameter (NOTE(review): presumably a start offset into s; confirm in the implementation file). */ ||
219 | int32_t getCEs(const UnicodeString &s, int32_t start, int64_t ces[], int32_t cesLength); | |
220 | /** Maps an index to a Jamo code point: the first Hangul::JAMO_L_COUNT indexes map to JAMO_L_BASE + offset, the next JAMO_V_COUNT to JAMO_V_BASE + offset, and the remainder to JAMO_T_BASE + 1 + offset, where offset is the position within its group. The range of i is not checked here. */ ||
221 | static UChar32 jamoCpFromIndex(int32_t i) { | |
222 | // 0 <= i < CollationData::JAMO_CE32S_LENGTH = 19 + 21 + 27 | |
223 | if(i < Hangul::JAMO_L_COUNT) { return Hangul::JAMO_L_BASE + i; } | |
224 | i -= Hangul::JAMO_L_COUNT; | |
225 | if(i < Hangul::JAMO_V_COUNT) { return Hangul::JAMO_V_BASE + i; } | |
226 | i -= Hangul::JAMO_V_COUNT; | |
227 | // i < 27 | |
228 | return Hangul::JAMO_T_BASE + 1 + i; | |
229 | } | |
230 | ||
231 | /** @see Collation::BUILDER_DATA_TAG */ | |
232 | static const uint32_t IS_BUILDER_JAMO_CE32 = 0x100; | |
233 | /* Data members. nfcImpl is a reference member, so it has to be bound when the object is constructed. The raw pointers below (base, baseSettings, trie, fastLatinBuilder, collIter) carry no ownership information in this header; NOTE(review): confirm ownership in the implementation file. */ ||
234 | const Normalizer2Impl &nfcImpl; | |
235 | const CollationData *base; | |
236 | const CollationSettings *baseSettings; | |
237 | UTrie2 *trie; | |
238 | UVector32 ce32s; | |
239 | UVector64 ce64s; | |
240 | UVector conditionalCE32s; // vector of ConditionalCE32 pointers (elements are cast to ConditionalCE32 * in getConditionalCE32()) | |
241 | // Characters that have context (prefixes or contraction suffixes). | |
242 | UnicodeSet contextChars; | |
243 | // Serialized UCharsTrie structures for finalized contexts. | |
244 | UnicodeString contexts; | |
245 | UnicodeSet unsafeBackwardSet; | |
246 | UBool modified; | |
247 | ||
248 | UBool fastLatinEnabled; | |
249 | CollationFastLatinBuilder *fastLatinBuilder; | |
250 | ||
251 | DataBuilderCollationIterator *collIter; | |
252 | }; | |
253 | ||
254 | U_NAMESPACE_END | |
255 | ||
256 | #endif // !UCONFIG_NO_COLLATION | |
257 | #endif // __COLLATIONDATABUILDER_H__ |