]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/bytestriebuilder.cpp
ICU-57132.0.1.tar.gz
[apple/icu.git] / icuSources / common / bytestriebuilder.cpp
CommitLineData
4388f060
A
1/*
2*******************************************************************************
51004dcb 3* Copyright (C) 2010-2012, International Business Machines
4388f060
A
4* Corporation and others. All Rights Reserved.
5*******************************************************************************
6* file name: bytestriebuilder.cpp
7* encoding: US-ASCII
8* tab size: 8 (not used)
9* indentation:4
10*
11* created on: 2010sep25
12* created by: Markus W. Scherer
13*/
14
15#include "unicode/utypes.h"
16#include "unicode/bytestrie.h"
17#include "unicode/bytestriebuilder.h"
18#include "unicode/stringpiece.h"
19#include "charstr.h"
20#include "cmemory.h"
21#include "uhash.h"
22#include "uarrsort.h"
23#include "uassert.h"
24#include "ustr_imp.h"
25
26U_NAMESPACE_BEGIN
27
28/*
29 * Note: This builder implementation stores (bytes, value) pairs with full copies
30 * of the byte sequences, until the BytesTrie is built.
31 * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
32 */
33
34class BytesTrieElement : public UMemory {
35public:
36 // Use compiler's default constructor, initializes nothing.
37
38 void setTo(const StringPiece &s, int32_t val, CharString &strings, UErrorCode &errorCode);
39
40 StringPiece getString(const CharString &strings) const {
41 int32_t offset=stringOffset;
42 int32_t length;
43 if(offset>=0) {
44 length=(uint8_t)strings[offset++];
45 } else {
46 offset=~offset;
47 length=((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
48 offset+=2;
49 }
50 return StringPiece(strings.data()+offset, length);
51 }
52 int32_t getStringLength(const CharString &strings) const {
53 int32_t offset=stringOffset;
54 if(offset>=0) {
55 return (uint8_t)strings[offset];
56 } else {
57 offset=~offset;
58 return ((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
59 }
60 }
61
62 char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; }
63
64 int32_t getValue() const { return value; }
65
66 int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const;
67
68private:
69 const char *data(const CharString &strings) const {
70 int32_t offset=stringOffset;
71 if(offset>=0) {
72 ++offset;
73 } else {
74 offset=~offset+2;
75 }
76 return strings.data()+offset;
77 }
78
79 // If the stringOffset is non-negative, then the first strings byte contains
80 // the string length.
81 // If the stringOffset is negative, then the first two strings bytes contain
82 // the string length (big-endian), and the offset needs to be bit-inverted.
83 // (Compared with a stringLength field here, this saves 3 bytes per string for most strings.)
84 int32_t stringOffset;
85 int32_t value;
86};
87
88void
89BytesTrieElement::setTo(const StringPiece &s, int32_t val,
90 CharString &strings, UErrorCode &errorCode) {
91 if(U_FAILURE(errorCode)) {
92 return;
93 }
94 int32_t length=s.length();
95 if(length>0xffff) {
96 // Too long: We store the length in 1 or 2 bytes.
97 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
98 return;
99 }
100 int32_t offset=strings.length();
101 if(length>0xff) {
102 offset=~offset;
103 strings.append((char)(length>>8), errorCode);
104 }
105 strings.append((char)length, errorCode);
106 stringOffset=offset;
107 value=val;
108 strings.append(s, errorCode);
109}
110
111int32_t
112BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const {
113 // TODO: add StringPiece::compare(), see ticket #8187
114 StringPiece thisString=getString(strings);
115 StringPiece otherString=other.getString(strings);
116 int32_t lengthDiff=thisString.length()-otherString.length();
117 int32_t commonLength;
118 if(lengthDiff<=0) {
119 commonLength=thisString.length();
120 } else {
121 commonLength=otherString.length();
122 }
123 int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength);
124 return diff!=0 ? diff : lengthDiff;
125}
126
127BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode)
128 : strings(NULL), elements(NULL), elementsCapacity(0), elementsLength(0),
129 bytes(NULL), bytesCapacity(0), bytesLength(0) {
130 if(U_FAILURE(errorCode)) {
131 return;
132 }
133 strings=new CharString();
134 if(strings==NULL) {
135 errorCode=U_MEMORY_ALLOCATION_ERROR;
136 }
137}
138
139BytesTrieBuilder::~BytesTrieBuilder() {
140 delete strings;
141 delete[] elements;
142 uprv_free(bytes);
143}
144
145BytesTrieBuilder &
146BytesTrieBuilder::add(const StringPiece &s, int32_t value, UErrorCode &errorCode) {
147 if(U_FAILURE(errorCode)) {
148 return *this;
149 }
150 if(bytesLength>0) {
151 // Cannot add elements after building.
152 errorCode=U_NO_WRITE_PERMISSION;
153 return *this;
154 }
155 if(elementsLength==elementsCapacity) {
156 int32_t newCapacity;
157 if(elementsCapacity==0) {
158 newCapacity=1024;
159 } else {
160 newCapacity=4*elementsCapacity;
161 }
162 BytesTrieElement *newElements=new BytesTrieElement[newCapacity];
163 if(newElements==NULL) {
164 errorCode=U_MEMORY_ALLOCATION_ERROR;
165 return *this; // error instead of dereferencing null
166 }
167 if(elementsLength>0) {
168 uprv_memcpy(newElements, elements, elementsLength*sizeof(BytesTrieElement));
169 }
170 delete[] elements;
171 elements=newElements;
172 elementsCapacity=newCapacity;
173 }
174 elements[elementsLength++].setTo(s, value, *strings, errorCode);
175 return *this;
176}
177
178U_CDECL_BEGIN
179
180static int32_t U_CALLCONV
181compareElementStrings(const void *context, const void *left, const void *right) {
51004dcb
A
182 const CharString *strings=static_cast<const CharString *>(context);
183 const BytesTrieElement *leftElement=static_cast<const BytesTrieElement *>(left);
184 const BytesTrieElement *rightElement=static_cast<const BytesTrieElement *>(right);
4388f060
A
185 return leftElement->compareStringTo(*rightElement, *strings);
186}
187
188U_CDECL_END
189
190BytesTrie *
191BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
192 buildBytes(buildOption, errorCode);
193 BytesTrie *newTrie=NULL;
194 if(U_SUCCESS(errorCode)) {
195 newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength));
196 if(newTrie==NULL) {
197 errorCode=U_MEMORY_ALLOCATION_ERROR;
198 } else {
199 bytes=NULL; // The new trie now owns the array.
200 bytesCapacity=0;
201 }
202 }
203 return newTrie;
204}
205
206StringPiece
207BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
208 buildBytes(buildOption, errorCode);
209 StringPiece result;
210 if(U_SUCCESS(errorCode)) {
211 result.set(bytes+(bytesCapacity-bytesLength), bytesLength);
212 }
213 return result;
214}
215
216void
217BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
218 if(U_FAILURE(errorCode)) {
219 return;
220 }
221 if(bytes!=NULL && bytesLength>0) {
222 // Already built.
223 return;
224 }
225 if(bytesLength==0) {
226 if(elementsLength==0) {
227 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
228 return;
229 }
230 uprv_sortArray(elements, elementsLength, (int32_t)sizeof(BytesTrieElement),
231 compareElementStrings, strings,
232 FALSE, // need not be a stable sort
233 &errorCode);
234 if(U_FAILURE(errorCode)) {
235 return;
236 }
237 // Duplicate strings are not allowed.
238 StringPiece prev=elements[0].getString(*strings);
239 for(int32_t i=1; i<elementsLength; ++i) {
240 StringPiece current=elements[i].getString(*strings);
241 if(prev==current) {
242 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
243 return;
244 }
245 prev=current;
246 }
247 }
248 // Create and byte-serialize the trie for the elements.
249 bytesLength=0;
250 int32_t capacity=strings->length();
251 if(capacity<1024) {
252 capacity=1024;
253 }
254 if(bytesCapacity<capacity) {
255 uprv_free(bytes);
51004dcb 256 bytes=static_cast<char *>(uprv_malloc(capacity));
4388f060
A
257 if(bytes==NULL) {
258 errorCode=U_MEMORY_ALLOCATION_ERROR;
259 bytesCapacity=0;
260 return;
261 }
262 bytesCapacity=capacity;
263 }
264 StringTrieBuilder::build(buildOption, elementsLength, errorCode);
265 if(bytes==NULL) {
266 errorCode=U_MEMORY_ALLOCATION_ERROR;
267 }
268}
269
270BytesTrieBuilder &
271BytesTrieBuilder::clear() {
272 strings->clear();
273 elementsLength=0;
274 bytesLength=0;
275 return *this;
276}
277
278int32_t
279BytesTrieBuilder::getElementStringLength(int32_t i) const {
280 return elements[i].getStringLength(*strings);
281}
282
283UChar
284BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const {
285 return (uint8_t)elements[i].charAt(byteIndex, *strings);
286}
287
288int32_t
289BytesTrieBuilder::getElementValue(int32_t i) const {
290 return elements[i].getValue();
291}
292
293int32_t
294BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const {
295 const BytesTrieElement &firstElement=elements[first];
296 const BytesTrieElement &lastElement=elements[last];
297 int32_t minStringLength=firstElement.getStringLength(*strings);
298 while(++byteIndex<minStringLength &&
299 firstElement.charAt(byteIndex, *strings)==
300 lastElement.charAt(byteIndex, *strings)) {}
301 return byteIndex;
302}
303
304int32_t
305BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const {
306 int32_t length=0; // Number of different bytes at byteIndex.
307 int32_t i=start;
308 do {
309 char byte=elements[i++].charAt(byteIndex, *strings);
310 while(i<limit && byte==elements[i].charAt(byteIndex, *strings)) {
311 ++i;
312 }
313 ++length;
314 } while(i<limit);
315 return length;
316}
317
318int32_t
319BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const {
320 do {
321 char byte=elements[i++].charAt(byteIndex, *strings);
322 while(byte==elements[i].charAt(byteIndex, *strings)) {
323 ++i;
324 }
325 } while(--count>0);
326 return i;
327}
328
329int32_t
330BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, UChar byte) const {
331 char b=(char)byte;
332 while(b==elements[i].charAt(byteIndex, *strings)) {
333 ++i;
334 }
335 return i;
336}
337
338BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode)
339 : LinearMatchNode(len, nextNode), s(bytes) {
340 hash=hash*37+ustr_hashCharsN(bytes, len);
341}
342
343UBool
344BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const {
345 if(this==&other) {
346 return TRUE;
347 }
348 if(!LinearMatchNode::operator==(other)) {
349 return FALSE;
350 }
351 const BTLinearMatchNode &o=(const BTLinearMatchNode &)other;
352 return 0==uprv_memcmp(s, o.s, length);
353}
354
355void
356BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) {
357 BytesTrieBuilder &b=(BytesTrieBuilder &)builder;
358 next->write(builder);
359 b.write(s, length);
360 offset=b.write(b.getMinLinearMatch()+length-1);
361}
362
363StringTrieBuilder::Node *
364BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length,
365 Node *nextNode) const {
366 return new BTLinearMatchNode(
367 elements[i].getString(*strings).data()+byteIndex,
368 length,
369 nextNode);
370}
371
372UBool
373BytesTrieBuilder::ensureCapacity(int32_t length) {
374 if(bytes==NULL) {
375 return FALSE; // previous memory allocation had failed
376 }
377 if(length>bytesCapacity) {
378 int32_t newCapacity=bytesCapacity;
379 do {
380 newCapacity*=2;
381 } while(newCapacity<=length);
51004dcb 382 char *newBytes=static_cast<char *>(uprv_malloc(newCapacity));
4388f060
A
383 if(newBytes==NULL) {
384 // unable to allocate memory
385 uprv_free(bytes);
386 bytes=NULL;
387 bytesCapacity=0;
388 return FALSE;
389 }
390 uprv_memcpy(newBytes+(newCapacity-bytesLength),
391 bytes+(bytesCapacity-bytesLength), bytesLength);
392 uprv_free(bytes);
393 bytes=newBytes;
394 bytesCapacity=newCapacity;
395 }
396 return TRUE;
397}
398
399int32_t
400BytesTrieBuilder::write(int32_t byte) {
401 int32_t newLength=bytesLength+1;
402 if(ensureCapacity(newLength)) {
403 bytesLength=newLength;
404 bytes[bytesCapacity-bytesLength]=(char)byte;
405 }
406 return bytesLength;
407}
408
409int32_t
410BytesTrieBuilder::write(const char *b, int32_t length) {
411 int32_t newLength=bytesLength+length;
412 if(ensureCapacity(newLength)) {
413 bytesLength=newLength;
414 uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length);
415 }
416 return bytesLength;
417}
418
419int32_t
420BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) {
421 return write(elements[i].getString(*strings).data()+byteIndex, length);
422}
423
424int32_t
425BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
426 if(0<=i && i<=BytesTrie::kMaxOneByteValue) {
427 return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal);
428 }
429 char intBytes[5];
430 int32_t length=1;
431 if(i<0 || i>0xffffff) {
432 intBytes[0]=(char)BytesTrie::kFiveByteValueLead;
51004dcb
A
433 intBytes[1]=(char)((uint32_t)i>>24);
434 intBytes[2]=(char)((uint32_t)i>>16);
435 intBytes[3]=(char)((uint32_t)i>>8);
4388f060
A
436 intBytes[4]=(char)i;
437 length=5;
438 // } else if(i<=BytesTrie::kMaxOneByteValue) {
439 // intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i);
440 } else {
441 if(i<=BytesTrie::kMaxTwoByteValue) {
442 intBytes[0]=(char)(BytesTrie::kMinTwoByteValueLead+(i>>8));
443 } else {
444 if(i<=BytesTrie::kMaxThreeByteValue) {
445 intBytes[0]=(char)(BytesTrie::kMinThreeByteValueLead+(i>>16));
446 } else {
447 intBytes[0]=(char)BytesTrie::kFourByteValueLead;
448 intBytes[1]=(char)(i>>16);
449 length=2;
450 }
451 intBytes[length++]=(char)(i>>8);
452 }
453 intBytes[length++]=(char)i;
454 }
455 intBytes[0]=(char)((intBytes[0]<<1)|isFinal);
456 return write(intBytes, length);
457}
458
459int32_t
460BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
461 int32_t offset=write(node);
462 if(hasValue) {
463 offset=writeValueAndFinal(value, FALSE);
464 }
465 return offset;
466}
467
468int32_t
469BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
470 int32_t i=bytesLength-jumpTarget;
471 U_ASSERT(i>=0);
472 if(i<=BytesTrie::kMaxOneByteDelta) {
473 return write(i);
474 }
475 char intBytes[5];
476 int32_t length;
477 if(i<=BytesTrie::kMaxTwoByteDelta) {
478 intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8));
479 length=1;
480 } else {
481 if(i<=BytesTrie::kMaxThreeByteDelta) {
482 intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16));
483 length=2;
484 } else {
485 if(i<=0xffffff) {
486 intBytes[0]=(char)BytesTrie::kFourByteDeltaLead;
487 length=3;
488 } else {
489 intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead;
490 intBytes[1]=(char)(i>>24);
491 length=4;
492 }
493 intBytes[1]=(char)(i>>16);
494 }
495 intBytes[1]=(char)(i>>8);
496 }
497 intBytes[length++]=(char)i;
498 return write(intBytes, length);
499}
500
501U_NAMESPACE_END