1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
5 * Copyright (C) 2010-2012, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: ucharstriebuilder.h
10 * tab size: 8 (not used)
13 * created on: 2010nov14
14 * created by: Markus W. Scherer
17 #include "unicode/utypes.h"
18 #include "unicode/ucharstrie.h"
19 #include "unicode/ucharstriebuilder.h"
20 #include "unicode/unistr.h"
21 #include "unicode/ustring.h"
31 * Note: This builder implementation stores (string, value) pairs with full copies
32 * of the 16-bit-unit sequences, until the UCharsTrie is built.
33 * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
36 class UCharsTrieElement
: public UMemory
{
38 // Use compiler's default constructor, initializes nothing.
40 void setTo(const UnicodeString
&s
, int32_t val
, UnicodeString
&strings
, UErrorCode
&errorCode
);
42 UnicodeString
getString(const UnicodeString
&strings
) const {
43 int32_t length
=strings
[stringOffset
];
44 return strings
.tempSubString(stringOffset
+1, length
);
46 int32_t getStringLength(const UnicodeString
&strings
) const {
47 return strings
[stringOffset
];
50 UChar
charAt(int32_t index
, const UnicodeString
&strings
) const {
51 return strings
[stringOffset
+1+index
];
54 int32_t getValue() const { return value
; }
56 int32_t compareStringTo(const UCharsTrieElement
&o
, const UnicodeString
&strings
) const;
59 // The first strings unit contains the string length.
60 // (Compared with a stringLength field here, this saves 2 bytes per string.)
66 UCharsTrieElement::setTo(const UnicodeString
&s
, int32_t val
,
67 UnicodeString
&strings
, UErrorCode
&errorCode
) {
68 if(U_FAILURE(errorCode
)) {
71 int32_t length
=s
.length();
73 // Too long: We store the length in 1 unit.
74 errorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
77 stringOffset
=strings
.length();
78 strings
.append((UChar
)length
);
84 UCharsTrieElement::compareStringTo(const UCharsTrieElement
&other
, const UnicodeString
&strings
) const {
85 return getString(strings
).compare(other
.getString(strings
));
88 UCharsTrieBuilder::UCharsTrieBuilder(UErrorCode
& /*errorCode*/)
89 : elements(NULL
), elementsCapacity(0), elementsLength(0),
90 uchars(NULL
), ucharsCapacity(0), ucharsLength(0) {}
92 UCharsTrieBuilder::~UCharsTrieBuilder() {
98 UCharsTrieBuilder::add(const UnicodeString
&s
, int32_t value
, UErrorCode
&errorCode
) {
99 if(U_FAILURE(errorCode
)) {
103 // Cannot add elements after building.
104 errorCode
=U_NO_WRITE_PERMISSION
;
107 if(elementsLength
==elementsCapacity
) {
109 if(elementsCapacity
==0) {
112 newCapacity
=4*elementsCapacity
;
114 UCharsTrieElement
*newElements
=new UCharsTrieElement
[newCapacity
];
115 if(newElements
==NULL
) {
116 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
119 if(elementsLength
>0) {
120 uprv_memcpy(newElements
, elements
, (size_t)elementsLength
*sizeof(UCharsTrieElement
));
123 elements
=newElements
;
124 elementsCapacity
=newCapacity
;
126 elements
[elementsLength
++].setTo(s
, value
, strings
, errorCode
);
127 if(U_SUCCESS(errorCode
) && strings
.isBogus()) {
128 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
135 static int32_t U_CALLCONV
136 compareElementStrings(const void *context
, const void *left
, const void *right
) {
137 const UnicodeString
*strings
=static_cast<const UnicodeString
*>(context
);
138 const UCharsTrieElement
*leftElement
=static_cast<const UCharsTrieElement
*>(left
);
139 const UCharsTrieElement
*rightElement
=static_cast<const UCharsTrieElement
*>(right
);
140 return leftElement
->compareStringTo(*rightElement
, *strings
);
146 UCharsTrieBuilder::build(UStringTrieBuildOption buildOption
, UErrorCode
&errorCode
) {
147 buildUChars(buildOption
, errorCode
);
148 UCharsTrie
*newTrie
=NULL
;
149 if(U_SUCCESS(errorCode
)) {
150 newTrie
=new UCharsTrie(uchars
, uchars
+(ucharsCapacity
-ucharsLength
));
152 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
154 uchars
=NULL
; // The new trie now owns the array.
162 UCharsTrieBuilder::buildUnicodeString(UStringTrieBuildOption buildOption
, UnicodeString
&result
,
163 UErrorCode
&errorCode
) {
164 buildUChars(buildOption
, errorCode
);
165 if(U_SUCCESS(errorCode
)) {
166 result
.setTo(FALSE
, uchars
+(ucharsCapacity
-ucharsLength
), ucharsLength
);
172 UCharsTrieBuilder::buildUChars(UStringTrieBuildOption buildOption
, UErrorCode
&errorCode
) {
173 if(U_FAILURE(errorCode
)) {
176 if(uchars
!=NULL
&& ucharsLength
>0) {
180 if(ucharsLength
==0) {
181 if(elementsLength
==0) {
182 errorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
185 if(strings
.isBogus()) {
186 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
189 uprv_sortArray(elements
, elementsLength
, (int32_t)sizeof(UCharsTrieElement
),
190 compareElementStrings
, &strings
,
191 FALSE
, // need not be a stable sort
193 if(U_FAILURE(errorCode
)) {
196 // Duplicate strings are not allowed.
197 UnicodeString prev
=elements
[0].getString(strings
);
198 for(int32_t i
=1; i
<elementsLength
; ++i
) {
199 UnicodeString current
=elements
[i
].getString(strings
);
201 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
204 prev
.fastCopyFrom(current
);
207 // Create and UChar-serialize the trie for the elements.
209 int32_t capacity
=strings
.length();
213 if(ucharsCapacity
<capacity
) {
215 uchars
=static_cast<UChar
*>(uprv_malloc(capacity
*2));
217 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
221 ucharsCapacity
=capacity
;
223 StringTrieBuilder::build(buildOption
, elementsLength
, errorCode
);
225 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
230 UCharsTrieBuilder::getElementStringLength(int32_t i
) const {
231 return elements
[i
].getStringLength(strings
);
235 UCharsTrieBuilder::getElementUnit(int32_t i
, int32_t unitIndex
) const {
236 return elements
[i
].charAt(unitIndex
, strings
);
240 UCharsTrieBuilder::getElementValue(int32_t i
) const {
241 return elements
[i
].getValue();
245 UCharsTrieBuilder::getLimitOfLinearMatch(int32_t first
, int32_t last
, int32_t unitIndex
) const {
246 const UCharsTrieElement
&firstElement
=elements
[first
];
247 const UCharsTrieElement
&lastElement
=elements
[last
];
248 int32_t minStringLength
=firstElement
.getStringLength(strings
);
249 while(++unitIndex
<minStringLength
&&
250 firstElement
.charAt(unitIndex
, strings
)==
251 lastElement
.charAt(unitIndex
, strings
)) {}
256 UCharsTrieBuilder::countElementUnits(int32_t start
, int32_t limit
, int32_t unitIndex
) const {
257 int32_t length
=0; // Number of different units at unitIndex.
260 UChar unit
=elements
[i
++].charAt(unitIndex
, strings
);
261 while(i
<limit
&& unit
==elements
[i
].charAt(unitIndex
, strings
)) {
270 UCharsTrieBuilder::skipElementsBySomeUnits(int32_t i
, int32_t unitIndex
, int32_t count
) const {
272 UChar unit
=elements
[i
++].charAt(unitIndex
, strings
);
273 while(unit
==elements
[i
].charAt(unitIndex
, strings
)) {
281 UCharsTrieBuilder::indexOfElementWithNextUnit(int32_t i
, int32_t unitIndex
, UChar unit
) const {
282 while(unit
==elements
[i
].charAt(unitIndex
, strings
)) {
288 UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const UChar
*units
, int32_t len
, Node
*nextNode
)
289 : LinearMatchNode(len
, nextNode
), s(units
) {
290 hash
=hash
*37u+ustr_hashUCharsN(units
, len
);
294 UCharsTrieBuilder::UCTLinearMatchNode::operator==(const Node
&other
) const {
298 if(!LinearMatchNode::operator==(other
)) {
301 const UCTLinearMatchNode
&o
=(const UCTLinearMatchNode
&)other
;
302 return 0==u_memcmp(s
, o
.s
, length
);
306 UCharsTrieBuilder::UCTLinearMatchNode::write(StringTrieBuilder
&builder
) {
307 UCharsTrieBuilder
&b
=(UCharsTrieBuilder
&)builder
;
308 next
->write(builder
);
310 offset
=b
.writeValueAndType(hasValue
, value
, b
.getMinLinearMatch()+length
-1);
313 StringTrieBuilder::Node
*
314 UCharsTrieBuilder::createLinearMatchNode(int32_t i
, int32_t unitIndex
, int32_t length
,
315 Node
*nextNode
) const {
316 return new UCTLinearMatchNode(
317 elements
[i
].getString(strings
).getBuffer()+unitIndex
,
323 UCharsTrieBuilder::ensureCapacity(int32_t length
) {
325 return FALSE
; // previous memory allocation had failed
327 if(length
>ucharsCapacity
) {
328 int32_t newCapacity
=ucharsCapacity
;
331 } while(newCapacity
<=length
);
332 UChar
*newUChars
=static_cast<UChar
*>(uprv_malloc(newCapacity
*2));
333 if(newUChars
==NULL
) {
334 // unable to allocate memory
340 u_memcpy(newUChars
+(newCapacity
-ucharsLength
),
341 uchars
+(ucharsCapacity
-ucharsLength
), ucharsLength
);
344 ucharsCapacity
=newCapacity
;
350 UCharsTrieBuilder::write(int32_t unit
) {
351 int32_t newLength
=ucharsLength
+1;
352 if(ensureCapacity(newLength
)) {
353 ucharsLength
=newLength
;
354 uchars
[ucharsCapacity
-ucharsLength
]=(UChar
)unit
;
360 UCharsTrieBuilder::write(const UChar
*s
, int32_t length
) {
361 int32_t newLength
=ucharsLength
+length
;
362 if(ensureCapacity(newLength
)) {
363 ucharsLength
=newLength
;
364 u_memcpy(uchars
+(ucharsCapacity
-ucharsLength
), s
, length
);
370 UCharsTrieBuilder::writeElementUnits(int32_t i
, int32_t unitIndex
, int32_t length
) {
371 return write(elements
[i
].getString(strings
).getBuffer()+unitIndex
, length
);
375 UCharsTrieBuilder::writeValueAndFinal(int32_t i
, UBool isFinal
) {
376 if(0<=i
&& i
<=UCharsTrie::kMaxOneUnitValue
) {
377 return write(i
|(isFinal
<<15));
381 if(i
<0 || i
>UCharsTrie::kMaxTwoUnitValue
) {
382 intUnits
[0]=(UChar
)(UCharsTrie::kThreeUnitValueLead
);
383 intUnits
[1]=(UChar
)((uint32_t)i
>>16);
384 intUnits
[2]=(UChar
)i
;
386 // } else if(i<=UCharsTrie::kMaxOneUnitValue) {
387 // intUnits[0]=(UChar)(i);
390 intUnits
[0]=(UChar
)(UCharsTrie::kMinTwoUnitValueLead
+(i
>>16));
391 intUnits
[1]=(UChar
)i
;
394 intUnits
[0]=(UChar
)(intUnits
[0]|(isFinal
<<15));
395 return write(intUnits
, length
);
399 UCharsTrieBuilder::writeValueAndType(UBool hasValue
, int32_t value
, int32_t node
) {
405 if(value
<0 || value
>UCharsTrie::kMaxTwoUnitNodeValue
) {
406 intUnits
[0]=(UChar
)(UCharsTrie::kThreeUnitNodeValueLead
);
407 intUnits
[1]=(UChar
)((uint32_t)value
>>16);
408 intUnits
[2]=(UChar
)value
;
410 } else if(value
<=UCharsTrie::kMaxOneUnitNodeValue
) {
411 intUnits
[0]=(UChar
)((value
+1)<<6);
414 intUnits
[0]=(UChar
)(UCharsTrie::kMinTwoUnitNodeValueLead
+((value
>>10)&0x7fc0));
415 intUnits
[1]=(UChar
)value
;
418 intUnits
[0]|=(UChar
)node
;
419 return write(intUnits
, length
);
423 UCharsTrieBuilder::writeDeltaTo(int32_t jumpTarget
) {
424 int32_t i
=ucharsLength
-jumpTarget
;
426 if(i
<=UCharsTrie::kMaxOneUnitDelta
) {
431 if(i
<=UCharsTrie::kMaxTwoUnitDelta
) {
432 intUnits
[0]=(UChar
)(UCharsTrie::kMinTwoUnitDeltaLead
+(i
>>16));
435 intUnits
[0]=(UChar
)(UCharsTrie::kThreeUnitDeltaLead
);
436 intUnits
[1]=(UChar
)(i
>>16);
439 intUnits
[length
++]=(UChar
)i
;
440 return write(intUnits
, length
);