1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
5 * Copyright (C) 2010-2012, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: bytestriebuilder.cpp
10 * tab size: 8 (not used)
13 * created on: 2010sep25
14 * created by: Markus W. Scherer
17 #include "unicode/utypes.h"
18 #include "unicode/bytestrie.h"
19 #include "unicode/bytestriebuilder.h"
20 #include "unicode/stringpiece.h"
31 * Note: This builder implementation stores (bytes, value) pairs with full copies
32 * of the byte sequences, until the BytesTrie is built.
33 * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
36 class BytesTrieElement
: public UMemory
{
38 // Use compiler's default constructor, initializes nothing.
40 void setTo(StringPiece s
, int32_t val
, CharString
&strings
, UErrorCode
&errorCode
);
42 StringPiece
getString(const CharString
&strings
) const {
43 int32_t offset
=stringOffset
;
46 length
=(uint8_t)strings
[offset
++];
49 length
=((int32_t)(uint8_t)strings
[offset
]<<8)|(uint8_t)strings
[offset
+1];
52 return StringPiece(strings
.data()+offset
, length
);
54 int32_t getStringLength(const CharString
&strings
) const {
55 int32_t offset
=stringOffset
;
57 return (uint8_t)strings
[offset
];
60 return ((int32_t)(uint8_t)strings
[offset
]<<8)|(uint8_t)strings
[offset
+1];
64 char charAt(int32_t index
, const CharString
&strings
) const { return data(strings
)[index
]; }
66 int32_t getValue() const { return value
; }
68 int32_t compareStringTo(const BytesTrieElement
&o
, const CharString
&strings
) const;
71 const char *data(const CharString
&strings
) const {
72 int32_t offset
=stringOffset
;
78 return strings
.data()+offset
;
81 // If the stringOffset is non-negative, then the first strings byte contains
83 // If the stringOffset is negative, then the first two strings bytes contain
84 // the string length (big-endian), and the offset needs to be bit-inverted.
85 // (Compared with a stringLength field here, this saves 3 bytes per string for most strings.)
91 BytesTrieElement::setTo(StringPiece s
, int32_t val
,
92 CharString
&strings
, UErrorCode
&errorCode
) {
93 if(U_FAILURE(errorCode
)) {
96 int32_t length
=s
.length();
98 // Too long: We store the length in 1 or 2 bytes.
99 errorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
102 int32_t offset
=strings
.length();
105 strings
.append((char)(length
>>8), errorCode
);
107 strings
.append((char)length
, errorCode
);
110 strings
.append(s
, errorCode
);
114 BytesTrieElement::compareStringTo(const BytesTrieElement
&other
, const CharString
&strings
) const {
115 // TODO: add StringPiece::compare(), see ticket #8187
116 StringPiece thisString
=getString(strings
);
117 StringPiece otherString
=other
.getString(strings
);
118 int32_t lengthDiff
=thisString
.length()-otherString
.length();
119 int32_t commonLength
;
121 commonLength
=thisString
.length();
123 commonLength
=otherString
.length();
125 int32_t diff
=uprv_memcmp(thisString
.data(), otherString
.data(), commonLength
);
126 return diff
!=0 ? diff
: lengthDiff
;
129 BytesTrieBuilder::BytesTrieBuilder(UErrorCode
&errorCode
)
130 : strings(NULL
), elements(NULL
), elementsCapacity(0), elementsLength(0),
131 bytes(NULL
), bytesCapacity(0), bytesLength(0) {
132 if(U_FAILURE(errorCode
)) {
135 strings
=new CharString();
137 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
141 BytesTrieBuilder::~BytesTrieBuilder() {
148 BytesTrieBuilder::add(StringPiece s
, int32_t value
, UErrorCode
&errorCode
) {
149 if(U_FAILURE(errorCode
)) {
153 // Cannot add elements after building.
154 errorCode
=U_NO_WRITE_PERMISSION
;
157 if(elementsLength
==elementsCapacity
) {
159 if(elementsCapacity
==0) {
162 newCapacity
=4*elementsCapacity
;
164 BytesTrieElement
*newElements
=new BytesTrieElement
[newCapacity
];
165 if(newElements
==NULL
) {
166 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
167 return *this; // error instead of dereferencing null
169 if(elementsLength
>0) {
170 uprv_memcpy(newElements
, elements
, (size_t)elementsLength
*sizeof(BytesTrieElement
));
173 elements
=newElements
;
174 elementsCapacity
=newCapacity
;
176 elements
[elementsLength
++].setTo(s
, value
, *strings
, errorCode
);
182 static int32_t U_CALLCONV
183 compareElementStrings(const void *context
, const void *left
, const void *right
) {
184 const CharString
*strings
=static_cast<const CharString
*>(context
);
185 const BytesTrieElement
*leftElement
=static_cast<const BytesTrieElement
*>(left
);
186 const BytesTrieElement
*rightElement
=static_cast<const BytesTrieElement
*>(right
);
187 return leftElement
->compareStringTo(*rightElement
, *strings
);
193 BytesTrieBuilder::build(UStringTrieBuildOption buildOption
, UErrorCode
&errorCode
) {
194 buildBytes(buildOption
, errorCode
);
195 BytesTrie
*newTrie
=NULL
;
196 if(U_SUCCESS(errorCode
)) {
197 newTrie
=new BytesTrie(bytes
, bytes
+(bytesCapacity
-bytesLength
));
199 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
201 bytes
=NULL
; // The new trie now owns the array.
209 BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption
, UErrorCode
&errorCode
) {
210 buildBytes(buildOption
, errorCode
);
212 if(U_SUCCESS(errorCode
)) {
213 result
.set(bytes
+(bytesCapacity
-bytesLength
), bytesLength
);
219 BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption
, UErrorCode
&errorCode
) {
220 if(U_FAILURE(errorCode
)) {
223 if(bytes
!=NULL
&& bytesLength
>0) {
228 if(elementsLength
==0) {
229 errorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
232 uprv_sortArray(elements
, elementsLength
, (int32_t)sizeof(BytesTrieElement
),
233 compareElementStrings
, strings
,
234 FALSE
, // need not be a stable sort
236 if(U_FAILURE(errorCode
)) {
239 // Duplicate strings are not allowed.
240 StringPiece prev
=elements
[0].getString(*strings
);
241 for(int32_t i
=1; i
<elementsLength
; ++i
) {
242 StringPiece current
=elements
[i
].getString(*strings
);
244 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
250 // Create and byte-serialize the trie for the elements.
252 int32_t capacity
=strings
->length();
256 if(bytesCapacity
<capacity
) {
258 bytes
=static_cast<char *>(uprv_malloc(capacity
));
260 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
264 bytesCapacity
=capacity
;
266 StringTrieBuilder::build(buildOption
, elementsLength
, errorCode
);
268 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
273 BytesTrieBuilder::clear() {
281 BytesTrieBuilder::getElementStringLength(int32_t i
) const {
282 return elements
[i
].getStringLength(*strings
);
286 BytesTrieBuilder::getElementUnit(int32_t i
, int32_t byteIndex
) const {
287 return (uint8_t)elements
[i
].charAt(byteIndex
, *strings
);
291 BytesTrieBuilder::getElementValue(int32_t i
) const {
292 return elements
[i
].getValue();
296 BytesTrieBuilder::getLimitOfLinearMatch(int32_t first
, int32_t last
, int32_t byteIndex
) const {
297 const BytesTrieElement
&firstElement
=elements
[first
];
298 const BytesTrieElement
&lastElement
=elements
[last
];
299 int32_t minStringLength
=firstElement
.getStringLength(*strings
);
300 while(++byteIndex
<minStringLength
&&
301 firstElement
.charAt(byteIndex
, *strings
)==
302 lastElement
.charAt(byteIndex
, *strings
)) {}
307 BytesTrieBuilder::countElementUnits(int32_t start
, int32_t limit
, int32_t byteIndex
) const {
308 int32_t length
=0; // Number of different bytes at byteIndex.
311 char byte
=elements
[i
++].charAt(byteIndex
, *strings
);
312 while(i
<limit
&& byte
==elements
[i
].charAt(byteIndex
, *strings
)) {
321 BytesTrieBuilder::skipElementsBySomeUnits(int32_t i
, int32_t byteIndex
, int32_t count
) const {
323 char byte
=elements
[i
++].charAt(byteIndex
, *strings
);
324 while(byte
==elements
[i
].charAt(byteIndex
, *strings
)) {
332 BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i
, int32_t byteIndex
, UChar byte
) const {
334 while(b
==elements
[i
].charAt(byteIndex
, *strings
)) {
340 BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes
, int32_t len
, Node
*nextNode
)
341 : LinearMatchNode(len
, nextNode
), s(bytes
) {
342 hash
=static_cast<int32_t>(
343 static_cast<uint32_t>(hash
)*37u + static_cast<uint32_t>(ustr_hashCharsN(bytes
, len
)));
347 BytesTrieBuilder::BTLinearMatchNode::operator==(const Node
&other
) const {
351 if(!LinearMatchNode::operator==(other
)) {
354 const BTLinearMatchNode
&o
=(const BTLinearMatchNode
&)other
;
355 return 0==uprv_memcmp(s
, o
.s
, length
);
359 BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder
&builder
) {
360 BytesTrieBuilder
&b
=(BytesTrieBuilder
&)builder
;
361 next
->write(builder
);
363 offset
=b
.write(b
.getMinLinearMatch()+length
-1);
366 StringTrieBuilder::Node
*
367 BytesTrieBuilder::createLinearMatchNode(int32_t i
, int32_t byteIndex
, int32_t length
,
368 Node
*nextNode
) const {
369 return new BTLinearMatchNode(
370 elements
[i
].getString(*strings
).data()+byteIndex
,
376 BytesTrieBuilder::ensureCapacity(int32_t length
) {
378 return FALSE
; // previous memory allocation had failed
380 if(length
>bytesCapacity
) {
381 int32_t newCapacity
=bytesCapacity
;
384 } while(newCapacity
<=length
);
385 char *newBytes
=static_cast<char *>(uprv_malloc(newCapacity
));
387 // unable to allocate memory
393 uprv_memcpy(newBytes
+(newCapacity
-bytesLength
),
394 bytes
+(bytesCapacity
-bytesLength
), bytesLength
);
397 bytesCapacity
=newCapacity
;
403 BytesTrieBuilder::write(int32_t byte
) {
404 int32_t newLength
=bytesLength
+1;
405 if(ensureCapacity(newLength
)) {
406 bytesLength
=newLength
;
407 bytes
[bytesCapacity
-bytesLength
]=(char)byte
;
413 BytesTrieBuilder::write(const char *b
, int32_t length
) {
414 int32_t newLength
=bytesLength
+length
;
415 if(ensureCapacity(newLength
)) {
416 bytesLength
=newLength
;
417 uprv_memcpy(bytes
+(bytesCapacity
-bytesLength
), b
, length
);
423 BytesTrieBuilder::writeElementUnits(int32_t i
, int32_t byteIndex
, int32_t length
) {
424 return write(elements
[i
].getString(*strings
).data()+byteIndex
, length
);
428 BytesTrieBuilder::writeValueAndFinal(int32_t i
, UBool isFinal
) {
429 if(0<=i
&& i
<=BytesTrie::kMaxOneByteValue
) {
430 return write(((BytesTrie::kMinOneByteValueLead
+i
)<<1)|isFinal
);
434 if(i
<0 || i
>0xffffff) {
435 intBytes
[0]=(char)BytesTrie::kFiveByteValueLead
;
436 intBytes
[1]=(char)((uint32_t)i
>>24);
437 intBytes
[2]=(char)((uint32_t)i
>>16);
438 intBytes
[3]=(char)((uint32_t)i
>>8);
441 // } else if(i<=BytesTrie::kMaxOneByteValue) {
442 // intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i);
444 if(i
<=BytesTrie::kMaxTwoByteValue
) {
445 intBytes
[0]=(char)(BytesTrie::kMinTwoByteValueLead
+(i
>>8));
447 if(i
<=BytesTrie::kMaxThreeByteValue
) {
448 intBytes
[0]=(char)(BytesTrie::kMinThreeByteValueLead
+(i
>>16));
450 intBytes
[0]=(char)BytesTrie::kFourByteValueLead
;
451 intBytes
[1]=(char)(i
>>16);
454 intBytes
[length
++]=(char)(i
>>8);
456 intBytes
[length
++]=(char)i
;
458 intBytes
[0]=(char)((intBytes
[0]<<1)|isFinal
);
459 return write(intBytes
, length
);
463 BytesTrieBuilder::writeValueAndType(UBool hasValue
, int32_t value
, int32_t node
) {
464 int32_t offset
=write(node
);
466 offset
=writeValueAndFinal(value
, FALSE
);
472 BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget
) {
473 int32_t i
=bytesLength
-jumpTarget
;
475 if(i
<=BytesTrie::kMaxOneByteDelta
) {
480 if(i
<=BytesTrie::kMaxTwoByteDelta
) {
481 intBytes
[0]=(char)(BytesTrie::kMinTwoByteDeltaLead
+(i
>>8));
484 if(i
<=BytesTrie::kMaxThreeByteDelta
) {
485 intBytes
[0]=(char)(BytesTrie::kMinThreeByteDeltaLead
+(i
>>16));
489 intBytes
[0]=(char)BytesTrie::kFourByteDeltaLead
;
492 intBytes
[0]=(char)BytesTrie::kFiveByteDeltaLead
;
493 intBytes
[1]=(char)(i
>>24);
496 intBytes
[1]=(char)(i
>>16);
498 intBytes
[1]=(char)(i
>>8);
500 intBytes
[length
++]=(char)i
;
501 return write(intBytes
, length
);