// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2012-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationdatabuilder.h
*
* created on: 2012apr01
* created by: Markus W. Scherer
*/
14 #ifndef __COLLATIONDATABUILDER_H__
15 #define __COLLATIONDATABUILDER_H__
17 #include "unicode/utypes.h"
19 #if !UCONFIG_NO_COLLATION
21 #include "unicode/uniset.h"
22 #include "unicode/unistr.h"
23 #include "unicode/uversion.h"
24 #include "collation.h"
25 #include "collationdata.h"
26 #include "collationsettings.h"
27 #include "normalizer2impl.h"
// Forward declarations: this header refers to these types only through
// pointers, references, or friend declarations.
struct ConditionalCE32;
class CollationFastLatinBuilder;
class DataBuilderCollationIterator;
class UCharsTrieBuilder;
/**
 * Low-level CollationData builder.
 * Takes (character, CE) pairs and builds them into runtime data structures.
 * Supports characters with context prefixes and contraction suffixes.
 */
47 class U_I18N_API CollationDataBuilder
: public UObject
{
50 * Collation element modifier. Interface class for a modifier
51 * that changes a tailoring builder's temporary CEs to final CEs.
52 * Called for every non-special CE32 and every expansion CE.
54 class CEModifier
: public UObject
{
56 virtual ~CEModifier();
57 /** Returns a new CE to replace the non-special input CE32, or else Collation::NO_CE. */
58 virtual int64_t modifyCE32(uint32_t ce32
) const = 0;
59 /** Returns a new CE to replace the input CE, or else Collation::NO_CE. */
60 virtual int64_t modifyCE(int64_t ce
) const = 0;
63 CollationDataBuilder(UErrorCode
&errorCode
);
65 virtual ~CollationDataBuilder();
67 void initForTailoring(const CollationData
*b
, UErrorCode
&errorCode
);
69 virtual UBool
isCompressibleLeadByte(uint32_t b
) const;
71 inline UBool
isCompressiblePrimary(uint32_t p
) const {
72 return isCompressibleLeadByte(p
>> 24);
76 * @return TRUE if this builder has mappings (e.g., add() has been called)
78 UBool
hasMappings() const { return modified
; }
81 * @return TRUE if c has CEs in this builder
83 UBool
isAssigned(UChar32 c
) const;
86 * @return the three-byte primary if c maps to a single such CE and has no context data,
87 * otherwise returns 0.
89 uint32_t getLongPrimaryIfSingleCE(UChar32 c
) const;
92 * @return the single CE for c.
93 * Sets an error code if c does not have a single CE.
95 int64_t getSingleCE(UChar32 c
, UErrorCode
&errorCode
) const;
97 void add(const UnicodeString
&prefix
, const UnicodeString
&s
,
98 const int64_t ces
[], int32_t cesLength
,
99 UErrorCode
&errorCode
);
102 * Encodes the ces as either the returned ce32 by itself,
103 * or by storing an expansion, with the returned ce32 referring to that.
105 * add(p, s, ces, cesLength) = addCE32(p, s, encodeCEs(ces, cesLength))
107 virtual uint32_t encodeCEs(const int64_t ces
[], int32_t cesLength
, UErrorCode
&errorCode
);
108 void addCE32(const UnicodeString
&prefix
, const UnicodeString
&s
,
109 uint32_t ce32
, UErrorCode
&errorCode
);
112 * Sets three-byte-primary CEs for a range of code points in code point order,
113 * if it is worth doing; otherwise no change is made.
114 * None of the code points in the range should have complex mappings so far
115 * (expansions/contractions/prefixes).
116 * @param start first code point
117 * @param end last code point (inclusive)
118 * @param primary primary weight for 'start'
119 * @param step per-code point primary-weight increment
120 * @param errorCode ICU in/out error code
121 * @return TRUE if an OFFSET_TAG range was used for start..end
123 UBool
maybeSetPrimaryRange(UChar32 start
, UChar32 end
,
124 uint32_t primary
, int32_t step
,
125 UErrorCode
&errorCode
);
128 * Sets three-byte-primary CEs for a range of code points in code point order.
129 * Sets range values if that is worth doing, or else individual values.
130 * None of the code points in the range should have complex mappings so far
131 * (expansions/contractions/prefixes).
132 * @param start first code point
133 * @param end last code point (inclusive)
134 * @param primary primary weight for 'start'
135 * @param step per-code point primary-weight increment
136 * @param errorCode ICU in/out error code
137 * @return the next primary after 'end': start primary incremented by ((end-start)+1)*step
139 uint32_t setPrimaryRangeAndReturnNext(UChar32 start
, UChar32 end
,
140 uint32_t primary
, int32_t step
,
141 UErrorCode
&errorCode
);
144 * Copies all mappings from the src builder, with modifications.
145 * This builder here must not be built yet, and should be empty.
147 void copyFrom(const CollationDataBuilder
&src
, const CEModifier
&modifier
,
148 UErrorCode
&errorCode
);
150 void optimize(const UnicodeSet
&set
, UErrorCode
&errorCode
);
151 void suppressContractions(const UnicodeSet
&set
, UErrorCode
&errorCode
);
153 void enableFastLatin() { fastLatinEnabled
= TRUE
; }
154 virtual void build(CollationData
&data
, UErrorCode
&errorCode
);
157 * Looks up CEs for s and appends them to the ces array.
158 * Does not handle normalization: s should be in FCD form.
160 * Does not write completely ignorable CEs.
161 * Does not write beyond Collation::MAX_EXPANSION_LENGTH.
163 * @return incremented cesLength
165 int32_t getCEs(const UnicodeString
&s
, int64_t ces
[], int32_t cesLength
);
166 int32_t getCEs(const UnicodeString
&prefix
, const UnicodeString
&s
,
167 int64_t ces
[], int32_t cesLength
);
170 friend class CopyHelper
;
171 friend class DataBuilderCollationIterator
;
173 uint32_t getCE32FromOffsetCE32(UBool fromBase
, UChar32 c
, uint32_t ce32
) const;
175 int32_t addCE(int64_t ce
, UErrorCode
&errorCode
);
176 int32_t addCE32(uint32_t ce32
, UErrorCode
&errorCode
);
177 int32_t addConditionalCE32(const UnicodeString
&context
, uint32_t ce32
, UErrorCode
&errorCode
);
179 inline ConditionalCE32
*getConditionalCE32(int32_t index
) const {
180 return static_cast<ConditionalCE32
*>(conditionalCE32s
[index
]);
182 inline ConditionalCE32
*getConditionalCE32ForCE32(uint32_t ce32
) const {
183 return getConditionalCE32(Collation::indexFromCE32(ce32
));
186 static uint32_t makeBuilderContextCE32(int32_t index
) {
187 return Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG
, index
);
189 static inline UBool
isBuilderContextCE32(uint32_t ce32
) {
190 return Collation::hasCE32Tag(ce32
, Collation::BUILDER_DATA_TAG
);
193 static uint32_t encodeOneCEAsCE32(int64_t ce
);
194 uint32_t encodeOneCE(int64_t ce
, UErrorCode
&errorCode
);
195 uint32_t encodeExpansion(const int64_t ces
[], int32_t length
, UErrorCode
&errorCode
);
196 uint32_t encodeExpansion32(const int32_t newCE32s
[], int32_t length
, UErrorCode
&errorCode
);
198 uint32_t copyFromBaseCE32(UChar32 c
, uint32_t ce32
, UBool withContext
, UErrorCode
&errorCode
);
200 * Copies base contractions to a list of ConditionalCE32.
201 * Sets cond->next to the index of the first new item
202 * and returns the index of the last new item.
204 int32_t copyContractionsFromBaseCE32(UnicodeString
&context
, UChar32 c
, uint32_t ce32
,
205 ConditionalCE32
*cond
, UErrorCode
&errorCode
);
207 UBool
getJamoCE32s(uint32_t jamoCE32s
[], UErrorCode
&errorCode
);
208 void setDigitTags(UErrorCode
&errorCode
);
209 void setLeadSurrogates(UErrorCode
&errorCode
);
211 void buildMappings(CollationData
&data
, UErrorCode
&errorCode
);
213 void clearContexts();
214 void buildContexts(UErrorCode
&errorCode
);
215 uint32_t buildContext(ConditionalCE32
*head
, UErrorCode
&errorCode
);
216 int32_t addContextTrie(uint32_t defaultCE32
, UCharsTrieBuilder
&trieBuilder
,
217 UErrorCode
&errorCode
);
219 void buildFastLatinTable(CollationData
&data
, UErrorCode
&errorCode
);
221 int32_t getCEs(const UnicodeString
&s
, int32_t start
, int64_t ces
[], int32_t cesLength
);
223 static UChar32
jamoCpFromIndex(int32_t i
) {
224 // 0 <= i < CollationData::JAMO_CE32S_LENGTH = 19 + 21 + 27
225 if(i
< Hangul::JAMO_L_COUNT
) { return Hangul::JAMO_L_BASE
+ i
; }
226 i
-= Hangul::JAMO_L_COUNT
;
227 if(i
< Hangul::JAMO_V_COUNT
) { return Hangul::JAMO_V_BASE
+ i
; }
228 i
-= Hangul::JAMO_V_COUNT
;
230 return Hangul::JAMO_T_BASE
+ 1 + i
;
/**
 * Flag bit (0x100) used with builder-data CE32s.
 * @see Collation::BUILDER_DATA_TAG
 */
static const uint32_t IS_BUILDER_JAMO_CE32 = 0x100;
236 const Normalizer2Impl
&nfcImpl
;
237 const CollationData
*base
;
238 const CollationSettings
*baseSettings
;
242 UVector conditionalCE32s
; // vector of ConditionalCE32
243 // Characters that have context (prefixes or contraction suffixes).
244 UnicodeSet contextChars
;
245 // Serialized UCharsTrie structures for finalized contexts.
246 UnicodeString contexts
;
247 UnicodeSet unsafeBackwardSet
;
250 UBool fastLatinEnabled
;
251 CollationFastLatinBuilder
*fastLatinBuilder
;
253 DataBuilderCollationIterator
*collIter
;
258 #endif // !UCONFIG_NO_COLLATION
259 #endif // __COLLATIONDATABUILDER_H__