2 *******************************************************************************
3 * Copyright (C) 2010-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: ucharstriebuilder.h
8 * tab size: 8 (not used)
11 * created on: 2010nov14
12 * created by: Markus W. Scherer
15 #include "unicode/utypes.h"
16 #include "unicode/ucharstrie.h"
17 #include "unicode/ucharstriebuilder.h"
18 #include "unicode/unistr.h"
19 #include "unicode/ustring.h"
29 * Note: This builder implementation stores (string, value) pairs with full copies
30 * of the 16-bit-unit sequences, until the UCharsTrie is built.
31 * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
34 class UCharsTrieElement
: public UMemory
{
36 // Use compiler's default constructor, initializes nothing.
38 void setTo(const UnicodeString
&s
, int32_t val
, UnicodeString
&strings
, UErrorCode
&errorCode
);
40 UnicodeString
getString(const UnicodeString
&strings
) const {
41 int32_t length
=strings
[stringOffset
];
42 return strings
.tempSubString(stringOffset
+1, length
);
44 int32_t getStringLength(const UnicodeString
&strings
) const {
45 return strings
[stringOffset
];
48 UChar
charAt(int32_t index
, const UnicodeString
&strings
) const {
49 return strings
[stringOffset
+1+index
];
52 int32_t getValue() const { return value
; }
54 int32_t compareStringTo(const UCharsTrieElement
&o
, const UnicodeString
&strings
) const;
57 // The first strings unit contains the string length.
58 // (Compared with a stringLength field here, this saves 2 bytes per string.)
64 UCharsTrieElement::setTo(const UnicodeString
&s
, int32_t val
,
65 UnicodeString
&strings
, UErrorCode
&errorCode
) {
66 if(U_FAILURE(errorCode
)) {
69 int32_t length
=s
.length();
71 // Too long: We store the length in 1 unit.
72 errorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
75 stringOffset
=strings
.length();
76 strings
.append((UChar
)length
);
82 UCharsTrieElement::compareStringTo(const UCharsTrieElement
&other
, const UnicodeString
&strings
) const {
83 return getString(strings
).compare(other
.getString(strings
));
86 UCharsTrieBuilder::UCharsTrieBuilder(UErrorCode
& /*errorCode*/)
87 : elements(NULL
), elementsCapacity(0), elementsLength(0),
88 uchars(NULL
), ucharsCapacity(0), ucharsLength(0) {}
90 UCharsTrieBuilder::~UCharsTrieBuilder() {
96 UCharsTrieBuilder::add(const UnicodeString
&s
, int32_t value
, UErrorCode
&errorCode
) {
97 if(U_FAILURE(errorCode
)) {
101 // Cannot add elements after building.
102 errorCode
=U_NO_WRITE_PERMISSION
;
105 if(elementsLength
==elementsCapacity
) {
107 if(elementsCapacity
==0) {
110 newCapacity
=4*elementsCapacity
;
112 UCharsTrieElement
*newElements
=new UCharsTrieElement
[newCapacity
];
113 if(newElements
==NULL
) {
114 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
117 if(elementsLength
>0) {
118 uprv_memcpy(newElements
, elements
, elementsLength
*sizeof(UCharsTrieElement
));
121 elements
=newElements
;
122 elementsCapacity
=newCapacity
;
124 elements
[elementsLength
++].setTo(s
, value
, strings
, errorCode
);
125 if(U_SUCCESS(errorCode
) && strings
.isBogus()) {
126 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
133 static int32_t U_CALLCONV
134 compareElementStrings(const void *context
, const void *left
, const void *right
) {
135 const UnicodeString
*strings
=static_cast<const UnicodeString
*>(context
);
136 const UCharsTrieElement
*leftElement
=static_cast<const UCharsTrieElement
*>(left
);
137 const UCharsTrieElement
*rightElement
=static_cast<const UCharsTrieElement
*>(right
);
138 return leftElement
->compareStringTo(*rightElement
, *strings
);
144 UCharsTrieBuilder::build(UStringTrieBuildOption buildOption
, UErrorCode
&errorCode
) {
145 buildUChars(buildOption
, errorCode
);
146 UCharsTrie
*newTrie
=NULL
;
147 if(U_SUCCESS(errorCode
)) {
148 newTrie
=new UCharsTrie(uchars
, uchars
+(ucharsCapacity
-ucharsLength
));
150 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
152 uchars
=NULL
; // The new trie now owns the array.
160 UCharsTrieBuilder::buildUnicodeString(UStringTrieBuildOption buildOption
, UnicodeString
&result
,
161 UErrorCode
&errorCode
) {
162 buildUChars(buildOption
, errorCode
);
163 if(U_SUCCESS(errorCode
)) {
164 result
.setTo(FALSE
, uchars
+(ucharsCapacity
-ucharsLength
), ucharsLength
);
170 UCharsTrieBuilder::buildUChars(UStringTrieBuildOption buildOption
, UErrorCode
&errorCode
) {
171 if(U_FAILURE(errorCode
)) {
174 if(uchars
!=NULL
&& ucharsLength
>0) {
178 if(ucharsLength
==0) {
179 if(elementsLength
==0) {
180 errorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
183 if(strings
.isBogus()) {
184 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
187 uprv_sortArray(elements
, elementsLength
, (int32_t)sizeof(UCharsTrieElement
),
188 compareElementStrings
, &strings
,
189 FALSE
, // need not be a stable sort
191 if(U_FAILURE(errorCode
)) {
194 // Duplicate strings are not allowed.
195 UnicodeString prev
=elements
[0].getString(strings
);
196 for(int32_t i
=1; i
<elementsLength
; ++i
) {
197 UnicodeString current
=elements
[i
].getString(strings
);
199 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
202 prev
.fastCopyFrom(current
);
205 // Create and UChar-serialize the trie for the elements.
207 int32_t capacity
=strings
.length();
211 if(ucharsCapacity
<capacity
) {
213 uchars
=static_cast<UChar
*>(uprv_malloc(capacity
*2));
215 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
219 ucharsCapacity
=capacity
;
221 StringTrieBuilder::build(buildOption
, elementsLength
, errorCode
);
223 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
228 UCharsTrieBuilder::getElementStringLength(int32_t i
) const {
229 return elements
[i
].getStringLength(strings
);
233 UCharsTrieBuilder::getElementUnit(int32_t i
, int32_t unitIndex
) const {
234 return elements
[i
].charAt(unitIndex
, strings
);
238 UCharsTrieBuilder::getElementValue(int32_t i
) const {
239 return elements
[i
].getValue();
243 UCharsTrieBuilder::getLimitOfLinearMatch(int32_t first
, int32_t last
, int32_t unitIndex
) const {
244 const UCharsTrieElement
&firstElement
=elements
[first
];
245 const UCharsTrieElement
&lastElement
=elements
[last
];
246 int32_t minStringLength
=firstElement
.getStringLength(strings
);
247 while(++unitIndex
<minStringLength
&&
248 firstElement
.charAt(unitIndex
, strings
)==
249 lastElement
.charAt(unitIndex
, strings
)) {}
254 UCharsTrieBuilder::countElementUnits(int32_t start
, int32_t limit
, int32_t unitIndex
) const {
255 int32_t length
=0; // Number of different units at unitIndex.
258 UChar unit
=elements
[i
++].charAt(unitIndex
, strings
);
259 while(i
<limit
&& unit
==elements
[i
].charAt(unitIndex
, strings
)) {
268 UCharsTrieBuilder::skipElementsBySomeUnits(int32_t i
, int32_t unitIndex
, int32_t count
) const {
270 UChar unit
=elements
[i
++].charAt(unitIndex
, strings
);
271 while(unit
==elements
[i
].charAt(unitIndex
, strings
)) {
279 UCharsTrieBuilder::indexOfElementWithNextUnit(int32_t i
, int32_t unitIndex
, UChar unit
) const {
280 while(unit
==elements
[i
].charAt(unitIndex
, strings
)) {
286 UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const UChar
*units
, int32_t len
, Node
*nextNode
)
287 : LinearMatchNode(len
, nextNode
), s(units
) {
288 hash
=hash
*37+ustr_hashUCharsN(units
, len
);
292 UCharsTrieBuilder::UCTLinearMatchNode::operator==(const Node
&other
) const {
296 if(!LinearMatchNode::operator==(other
)) {
299 const UCTLinearMatchNode
&o
=(const UCTLinearMatchNode
&)other
;
300 return 0==u_memcmp(s
, o
.s
, length
);
304 UCharsTrieBuilder::UCTLinearMatchNode::write(StringTrieBuilder
&builder
) {
305 UCharsTrieBuilder
&b
=(UCharsTrieBuilder
&)builder
;
306 next
->write(builder
);
308 offset
=b
.writeValueAndType(hasValue
, value
, b
.getMinLinearMatch()+length
-1);
311 StringTrieBuilder::Node
*
312 UCharsTrieBuilder::createLinearMatchNode(int32_t i
, int32_t unitIndex
, int32_t length
,
313 Node
*nextNode
) const {
314 return new UCTLinearMatchNode(
315 elements
[i
].getString(strings
).getBuffer()+unitIndex
,
321 UCharsTrieBuilder::ensureCapacity(int32_t length
) {
323 return FALSE
; // previous memory allocation had failed
325 if(length
>ucharsCapacity
) {
326 int32_t newCapacity
=ucharsCapacity
;
329 } while(newCapacity
<=length
);
330 UChar
*newUChars
=static_cast<UChar
*>(uprv_malloc(newCapacity
*2));
331 if(newUChars
==NULL
) {
332 // unable to allocate memory
338 u_memcpy(newUChars
+(newCapacity
-ucharsLength
),
339 uchars
+(ucharsCapacity
-ucharsLength
), ucharsLength
);
342 ucharsCapacity
=newCapacity
;
348 UCharsTrieBuilder::write(int32_t unit
) {
349 int32_t newLength
=ucharsLength
+1;
350 if(ensureCapacity(newLength
)) {
351 ucharsLength
=newLength
;
352 uchars
[ucharsCapacity
-ucharsLength
]=(UChar
)unit
;
358 UCharsTrieBuilder::write(const UChar
*s
, int32_t length
) {
359 int32_t newLength
=ucharsLength
+length
;
360 if(ensureCapacity(newLength
)) {
361 ucharsLength
=newLength
;
362 u_memcpy(uchars
+(ucharsCapacity
-ucharsLength
), s
, length
);
368 UCharsTrieBuilder::writeElementUnits(int32_t i
, int32_t unitIndex
, int32_t length
) {
369 return write(elements
[i
].getString(strings
).getBuffer()+unitIndex
, length
);
373 UCharsTrieBuilder::writeValueAndFinal(int32_t i
, UBool isFinal
) {
374 if(0<=i
&& i
<=UCharsTrie::kMaxOneUnitValue
) {
375 return write(i
|(isFinal
<<15));
379 if(i
<0 || i
>UCharsTrie::kMaxTwoUnitValue
) {
380 intUnits
[0]=(UChar
)(UCharsTrie::kThreeUnitValueLead
);
381 intUnits
[1]=(UChar
)((uint32_t)i
>>16);
382 intUnits
[2]=(UChar
)i
;
384 // } else if(i<=UCharsTrie::kMaxOneUnitValue) {
385 // intUnits[0]=(UChar)(i);
388 intUnits
[0]=(UChar
)(UCharsTrie::kMinTwoUnitValueLead
+(i
>>16));
389 intUnits
[1]=(UChar
)i
;
392 intUnits
[0]=(UChar
)(intUnits
[0]|(isFinal
<<15));
393 return write(intUnits
, length
);
397 UCharsTrieBuilder::writeValueAndType(UBool hasValue
, int32_t value
, int32_t node
) {
403 if(value
<0 || value
>UCharsTrie::kMaxTwoUnitNodeValue
) {
404 intUnits
[0]=(UChar
)(UCharsTrie::kThreeUnitNodeValueLead
);
405 intUnits
[1]=(UChar
)((uint32_t)value
>>16);
406 intUnits
[2]=(UChar
)value
;
408 } else if(value
<=UCharsTrie::kMaxOneUnitNodeValue
) {
409 intUnits
[0]=(UChar
)((value
+1)<<6);
412 intUnits
[0]=(UChar
)(UCharsTrie::kMinTwoUnitNodeValueLead
+((value
>>10)&0x7fc0));
413 intUnits
[1]=(UChar
)value
;
416 intUnits
[0]|=(UChar
)node
;
417 return write(intUnits
, length
);
421 UCharsTrieBuilder::writeDeltaTo(int32_t jumpTarget
) {
422 int32_t i
=ucharsLength
-jumpTarget
;
424 if(i
<=UCharsTrie::kMaxOneUnitDelta
) {
429 if(i
<=UCharsTrie::kMaxTwoUnitDelta
) {
430 intUnits
[0]=(UChar
)(UCharsTrie::kMinTwoUnitDeltaLead
+(i
>>16));
433 intUnits
[0]=(UChar
)(UCharsTrie::kThreeUnitDeltaLead
);
434 intUnits
[1]=(UChar
)(i
>>16);
437 intUnits
[length
++]=(UChar
)i
;
438 return write(intUnits
, length
);